90 files changed, 1630 insertions, 628 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index a6605ca921b6..24f8c81fc48d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -588,16 +588,6 @@ out:
 }
 /**
- * acct_init_pacct - initialize a new pacct_struct
- * @pacct: per-process accounting info struct to initialize
- */
-void acct_init_pacct(struct pacct_struct *pacct)
-{
-        memset(pacct, 0, sizeof(struct pacct_struct));
-        pacct->ac_utime = pacct->ac_stime = cputime_zero;
-}
-/**
 * acct_collect - collect accounting information into pacct_struct
 * @exitcode: task exit code
 * @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/async.c b/kernel/async.c
index 27235f5de198..15319d6c18fe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -56,6 +56,7 @@ asynchronous and synchronous parts of the kernel.
 #include <linux/init.h>
 #include <linux/kthread.h>
 #include <linux/delay.h>
+#include <linux/slab.h>
 #include <asm/atomic.h>
 static async_cookie_t next_cookie = 1;
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..c71bd26631a2 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,7 @@
 #include <asm/atomic.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/kthread.h>
@@ -398,7 +399,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
        skb_get(skb);
        err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
        if (err < 0) {
-                BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
+                BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
                printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
                audit_log_lost("auditd dissapeared\n");
                audit_pid = 0;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 028e85663f27..46a57b57a335 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,6 +3,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/kthread.h>
+#include <linux/slab.h>
 struct audit_tree;
 struct audit_chunk;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index cc7e87936cbc..8df43696f4ba 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -27,6 +27,7 @@
 #include <linux/namei.h>
 #include <linux/netlink.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/inotify.h>
 #include <linux/security.h>
 #include "audit.h"
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a70604047f3c..ce08041f578d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,6 +27,7 @@
 #include <linux/namei.h>
 #include <linux/netlink.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/security.h>
 #include "audit.h"
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f3a461c0970a..3828ad5fb8f1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -49,6 +49,7 @@
 #include <linux/namei.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/socket.h>
 #include <linux/mqueue.h>
@@ -1893,7 +1894,7 @@ static int audit_inc_name_count(struct audit_context *context,
 {
        if (context->name_count >= AUDIT_NAMES) {
                if (inode)
-                        printk(KERN_DEBUG "name_count maxed, losing inode data: "
+                        printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
                               "dev=%02x:%02x, inode=%lu\n",
                               MAJOR(inode->i_sb->s_dev),
                               MINOR(inode->i_sb->s_dev),
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4fd90e129772..e2769e13980c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
+ *  Notifications support
+ *  Copyright (C) 2009 Nokia Corporation
+ *  Author: Kirill A. Shutemov
+ *
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
@@ -23,7 +27,6 @@
 */
 #include <linux/cgroup.h>
-#include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -44,6 +47,7 @@
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
+#include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
@@ -52,15 +56,21 @@
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+#include <linux/eventfd.h>
+#include <linux/poll.h>
 #include <asm/atomic.h>
 static DEFINE_MUTEX(cgroup_mutex);
-/* Generate an array of cgroup subsystem pointers */
+/*
+ * Generate an array of cgroup subsystem pointers. At boot time, this is
+ * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
+ * registered after that. The mutable section of this array is protected by
+ * cgroup_mutex.
+ */
 #define SUBSYS(_x) &_x ## _subsys,
+static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
-static struct cgroup_subsys *subsys[] = {
 #include <linux/cgroup_subsys.h>
 };
@@ -147,6 +157,35 @@ struct css_id {
        unsigned short stack[0]; /* Array of Length (depth+1) */
 };
+/*
+ * cgroup_event represents events which userspace wants to receive.
+ */
+struct cgroup_event {
+        /*
+         * Cgroup which the event belongs to.
+         */
+        struct cgroup *cgrp;
+        /*
+         * Control file which the event is associated with.
+         */
+        struct cftype *cft;
+        /*
+         * eventfd to signal userspace about the event.
+         */
+        struct eventfd_ctx *eventfd;
+        /*
+         * Each of these is stored in a list by the cgroup.
+         */
+        struct list_head list;
+        /*
+         * All fields below are needed to unregister the event when
+         * userspace closes the eventfd.
+         */
+        poll_table pt; /* poll table whose queue proc attaches 'wait' */
+        wait_queue_head_t *wqh; /* wait queue head recorded by the queue proc */
+        wait_queue_t wait; /* entry woken through cgroup_event_wake() */
+        struct work_struct remove; /* runs cgroup_event_remove() */
+};
 /* The list of hierarchy roots */
@@ -250,7 +289,8 @@ struct cg_cgroup_link {
 static struct css_set init_css_set;
 static struct cg_cgroup_link init_css_set_link;
-static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+static int cgroup_init_idr(struct cgroup_subsys *ss,
+                           struct cgroup_subsys_state *css);
 /* css_set_lock protects the list of css_set objects, and the
 * chain of tasks off each css_set.  Nests outside task->alloc_lock
@@ -448,8 +488,11 @@ static struct css_set *find_existing_css_set(
        struct hlist_node *node;
        struct css_set *cg;
-        /* Built the set of subsystem state objects that we want to
+        /*
-         * see in the new css_set */
+         * Build the set of subsystem state objects that we want to see in the
+         * new css_set. While subsystems can change globally, the entries here
+         * won't change, so no need for locking.
+         */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                if (root->subsys_bits & (1UL << i)) {
                        /* Subsystem is in this hierarchy. So we want
@@ -696,6 +739,7 @@ void cgroup_lock(void)
 {
        mutex_lock(&cgroup_mutex);
 }
+EXPORT_SYMBOL_GPL(cgroup_lock);
 /**
 * cgroup_unlock - release lock on cgroup changes
@@ -706,6 +750,7 @@ void cgroup_unlock(void)
 {
        mutex_unlock(&cgroup_mutex);
 }
+EXPORT_SYMBOL_GPL(cgroup_unlock);
 /*
 * A couple of forward declarations required, due to cyclic reference loop:
@@ -757,6 +802,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
                        if (ret)
                                break;
                }
        return ret;
 }
@@ -884,7 +930,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
        css_put(css);
 }
+/*
+ * Call with cgroup_mutex held. Drops reference counts on modules, including
+ * any duplicate ones that parse_cgroupfs_options took. If this function
+ * returns an error, no reference counts are touched.
+ */
 static int rebind_subsystems(struct cgroupfs_root *root,
                              unsigned long final_bits)
 {
@@ -892,6 +942,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
        struct cgroup *cgrp = &root->top_cgroup;
        int i;
+        BUG_ON(!mutex_is_locked(&cgroup_mutex));
        removed_bits = root->actual_subsys_bits & ~final_bits;
        added_bits = final_bits & ~root->actual_subsys_bits;
        /* Check that any added subsystems are currently free */
@@ -900,6 +952,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                struct cgroup_subsys *ss = subsys[i];
                if (!(bit & added_bits))
                        continue;
+                /*
+                 * Nobody should tell us to do a subsys that doesn't exist:
+                 * parse_cgroupfs_options should catch that case and refcounts
+                 * ensure that subsystems won't disappear once selected.
+                 */
+                BUG_ON(ss == NULL);
                if (ss->root != &rootnode) {
                        /* Subsystem isn't free */
                        return -EBUSY;
@@ -919,6 +977,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                unsigned long bit = 1UL << i;
                if (bit & added_bits) {
                        /* We're binding this subsystem to this hierarchy */
+                        BUG_ON(ss == NULL);
                        BUG_ON(cgrp->subsys[i]);
                        BUG_ON(!dummytop->subsys[i]);
                        BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -930,8 +989,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                        if (ss->bind)
                                ss->bind(ss, cgrp);
                        mutex_unlock(&ss->hierarchy_mutex);
+                        /* refcount was already taken, and we're keeping it */
                } else if (bit & removed_bits) {
                        /* We're removing this subsystem */
+                        BUG_ON(ss == NULL);
                        BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
                        BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
                        mutex_lock(&ss->hierarchy_mutex);
@@ -942,9 +1003,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                        subsys[i]->root = &rootnode;
                        list_move(&ss->sibling, &rootnode.subsys_list);
                        mutex_unlock(&ss->hierarchy_mutex);
+                        /* subsystem is now free - drop reference on module */
+                        module_put(ss->module);
                } else if (bit & final_bits) {
                        /* Subsystem state should already exist */
+                        BUG_ON(ss == NULL);
                        BUG_ON(!cgrp->subsys[i]);
+                        /*
+                         * a refcount was taken, but we already had one, so
+                         * drop the extra reference.
+                         */
+                        module_put(ss->module);
+#ifdef CONFIG_MODULE_UNLOAD
+                        BUG_ON(ss->module && !module_refcount(ss->module));
+#endif
                } else {
                        /* Subsystem state shouldn't exist */
                        BUG_ON(cgrp->subsys[i]);
@@ -986,13 +1058,20 @@ struct cgroup_sb_opts {
 };
-/* Convert a hierarchy specifier into a bitmask of subsystems and
+/*
- * flags. */
+ * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
-static int parse_cgroupfs_options(char *data,
+ * with cgroup_mutex held to protect the subsys[] array. This function takes
-                                     struct cgroup_sb_opts *opts)
+ * refcounts on subsystems to be used, unless it returns error, in which case
+ * no refcounts are taken.
+ */
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
        char *token, *o = data ?: "all";
        unsigned long mask = (unsigned long)-1;
+        int i;
+        bool module_pin_failed = false;
+        BUG_ON(!mutex_is_locked(&cgroup_mutex));
 #ifdef CONFIG_CPUSETS
        mask = ~(1UL << cpuset_subsys_id);
@@ -1005,10 +1084,11 @@ static int parse_cgroupfs_options(char *data,
                        return -EINVAL;
                if (!strcmp(token, "all")) {
                        /* Add all non-disabled subsystems */
-                        int i;
                        opts->subsys_bits = 0;
                        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                                struct cgroup_subsys *ss = subsys[i];
+                                if (ss == NULL)
+                                        continue;
                                if (!ss->disabled)
                                        opts->subsys_bits |= 1ul << i;
                        }
@@ -1026,7 +1106,6 @@ static int parse_cgroupfs_options(char *data,
                        if (!opts->release_agent)
                                return -ENOMEM;
                } else if (!strncmp(token, "name=", 5)) {
-                        int i;
                        const char *name = token + 5;
                        /* Can't specify an empty name */
                        if (!strlen(name))
@@ -1050,9 +1129,10 @@ static int parse_cgroupfs_options(char *data,
                                return -ENOMEM;
                } else {
                        struct cgroup_subsys *ss;
-                        int i;
                        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                                ss = subsys[i];
+                                if (ss == NULL)
+                                        continue;
                                if (!strcmp(token, ss->name)) {
                                        if (!ss->disabled)
                                                set_bit(i, &opts->subsys_bits);
@@ -1087,9 +1167,54 @@ static int parse_cgroupfs_options(char *data,
        if (!opts->subsys_bits && !opts->name)
                return -EINVAL;
+        /*
+         * Grab references on all the modules we'll need, so the subsystems
+         * don't dance around before rebind_subsystems attaches them. This may
+         * take duplicate reference counts on a subsystem that's already used,
+         * but rebind_subsystems handles this case.
+         */
+        for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+                unsigned long bit = 1UL << i;
+                if (!(bit & opts->subsys_bits))
+                        continue;
+                if (!try_module_get(subsys[i]->module)) {
+                        module_pin_failed = true;
+                        break;
+                }
+        }
+        if (module_pin_failed) {
+                /*
+                 * oops, one of the modules was going away. this means that we
+                 * raced with a delete_module call, and to the user this is
+                 * essentially a "subsystem doesn't exist" case.
+                 */
+                for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
+                        /* drop refcounts only on the ones we took */
+                        unsigned long bit = 1UL << i;
+                        if (!(bit & opts->subsys_bits))
+                                continue;
+                        module_put(subsys[i]->module);
+                }
+                return -ENOENT;
+        }
        return 0;
 }
+/*
+ * Release the module references that parse_cgroupfs_options() took for the
+ * subsystems selected in @subsys_bits. Only indices from
+ * CGROUP_BUILTIN_SUBSYS_COUNT upwards are visited, matching the range that
+ * parse_cgroupfs_options() pins.
+ */
+static void drop_parsed_module_refcounts(unsigned long subsys_bits)
+{
+        int idx = CGROUP_BUILTIN_SUBSYS_COUNT;
+        while (idx < CGROUP_SUBSYS_COUNT) {
+                /* a reference exists only where the subsystem bit is set */
+                if (subsys_bits & (1UL << idx))
+                        module_put(subsys[idx]->module);
+                idx++;
+        }
+}
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 {
        int ret = 0;
@@ -1106,21 +1231,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
        if (ret)
                goto out_unlock;
-        /* Don't allow flags to change at remount */
+        /* Don't allow flags or name to change at remount */
-        if (opts.flags != root->flags) {
+        if (opts.flags != root->flags ||
-                ret = -EINVAL;
+            (opts.name && strcmp(opts.name, root->name))) {
-                goto out_unlock;
-        }
-        /* Don't allow name to change at remount */
-        if (opts.name && strcmp(opts.name, root->name)) {
                ret = -EINVAL;
+                drop_parsed_module_refcounts(opts.subsys_bits);
                goto out_unlock;
        }
        ret = rebind_subsystems(root, opts.subsys_bits);
-        if (ret)
+        if (ret) {
+                drop_parsed_module_refcounts(opts.subsys_bits);
                goto out_unlock;
+        }
        /* (re)populate subsystem files */
        cgroup_populate_dir(cgrp);
@@ -1151,6 +1274,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
        INIT_LIST_HEAD(&cgrp->release_list);
        INIT_LIST_HEAD(&cgrp->pidlists);
        mutex_init(&cgrp->pidlist_mutex);
+        INIT_LIST_HEAD(&cgrp->event_list);
+        spin_lock_init(&cgrp->event_list_lock);
 }
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1306,7 +1431,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
        struct cgroupfs_root *new_root;
        /* First find the desired set of subsystems */
+        mutex_lock(&cgroup_mutex);
        ret = parse_cgroupfs_options(data, &opts);
+        mutex_unlock(&cgroup_mutex);
        if (ret)
                goto out_err;
@@ -1317,7 +1444,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
        new_root = cgroup_root_from_opts(&opts);
        if (IS_ERR(new_root)) {
                ret = PTR_ERR(new_root);
-                goto out_err;
+                goto drop_modules;
        }
        opts.new_root = new_root;
@@ -1326,7 +1453,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
        if (IS_ERR(sb)) {
                ret = PTR_ERR(sb);
                cgroup_drop_root(opts.new_root);
-                goto out_err;
+                goto drop_modules;
        }
        root = sb->s_fs_info;
@@ -1382,6 +1509,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                        free_cg_links(&tmp_cg_links);
                        goto drop_new_super;
                }
+                /*
+                 * There must be no failure case after here, since rebinding
+                 * takes care of subsystems' refcounts, which are explicitly
+                 * dropped in the failure exit path.
+                 */
                /* EBUSY should be the only error here */
                BUG_ON(ret);
@@ -1420,6 +1552,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                 * any) is not needed
                 */
                cgroup_drop_root(opts.new_root);
+                /* no subsys rebinding, so refcounts don't change */
+                drop_parsed_module_refcounts(opts.subsys_bits);
        }
        simple_set_mnt(mnt, sb);
@@ -1429,6 +1563,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 drop_new_super:
        deactivate_locked_super(sb);
+ drop_modules:
+        drop_parsed_module_refcounts(opts.subsys_bits);
 out_err:
        kfree(opts.release_agent);
        kfree(opts.name);
@@ -1542,6 +1678,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
        memmove(buf, start, buf + buflen - start);
        return 0;
 }
+EXPORT_SYMBOL_GPL(cgroup_path);
 /**
 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1554,7 +1691,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
        int retval = 0;
-        struct cgroup_subsys *ss;
+        struct cgroup_subsys *ss, *failed_ss = NULL;
        struct cgroup *oldcgrp;
        struct css_set *cg;
        struct css_set *newcg;
@@ -1568,8 +1705,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
        for_each_subsys(root, ss) {
                if (ss->can_attach) {
                        retval = ss->can_attach(ss, cgrp, tsk, false);
-                        if (retval)
+                        if (retval) {
-                                return retval;
+                                /*
+                                 * Remember on which subsystem the can_attach()
+                                 * failed, so that we only call cancel_attach()
+                                 * against the subsystems whose can_attach()
+                                 * succeeded. (See below)
+                                 */
+                                failed_ss = ss;
+                                goto out;
+                        }
                }
        }
@@ -1583,14 +1728,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
         */
        newcg = find_css_set(cg, cgrp);
        put_css_set(cg);
-        if (!newcg)
+        if (!newcg) {
-                return -ENOMEM;
+                retval = -ENOMEM;
+                goto out;
+        }
        task_lock(tsk);
        if (tsk->flags & PF_EXITING) {
                task_unlock(tsk);
                put_css_set(newcg);
-                return -ESRCH;
+                retval = -ESRCH;
+                goto out;
        }
        rcu_assign_pointer(tsk->cgroups, newcg);
        task_unlock(tsk);
@@ -1616,7 +1764,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
         * is no longer empty.
         */
        cgroup_wakeup_rmdir_waiter(cgrp);
-        return 0;
+out:
+        if (retval) {
+                for_each_subsys(root, ss) {
+                        if (ss == failed_ss)
+                                /*
+                                 * This subsystem was the one that failed the
+                                 * can_attach() check earlier, so we don't need
+                                 * to call cancel_attach() against it or any
+                                 * remaining subsystems.
+                                 */
+                                break;
+                        if (ss->cancel_attach)
+                                ss->cancel_attach(ss, cgrp, tsk, false);
+                }
+        }
+        return retval;
 }
 /*
@@ -1682,6 +1845,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp)
        }
        return true;
 }
+EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
                                      const char *buffer)
@@ -1950,6 +2114,16 @@ static const struct inode_operations cgroup_dir_inode_operations = {
        .rename = cgroup_rename,
 };
+/*
+ * Map an open file to its cftype.
+ *
+ * Returns the cftype looked up from the file's dentry, or ERR_PTR(-EINVAL)
+ * when the file's inode does not use cgroup_file_operations (i.e. it is not
+ * a cgroup control file).
+ */
+static inline struct cftype *__file_cft(struct file *file)
+{
+        struct dentry *dentry = file->f_dentry;
+        if (dentry->d_inode->i_fop == &cgroup_file_operations)
+                return __d_cft(dentry);
+        return ERR_PTR(-EINVAL);
+}
 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
                                struct super_block *sb)
 {
@@ -2069,6 +2243,7 @@ int cgroup_add_file(struct cgroup *cgrp,
                error = PTR_ERR(dentry);
        return error;
 }
+EXPORT_SYMBOL_GPL(cgroup_add_file);
 int cgroup_add_files(struct cgroup *cgrp,
                        struct cgroup_subsys *subsys,
@@ -2083,6 +2258,7 @@ int cgroup_add_files(struct cgroup *cgrp,
        }
        return 0;
 }
+EXPORT_SYMBOL_GPL(cgroup_add_files);
 /**
 * cgroup_task_count - count the number of tasks in a cgroup.
@@ -2468,7 +2644,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 {
        struct cgroup_pidlist *l;
        /* don't need task_nsproxy() if we're looking at ourself */
-        struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
+        struct pid_namespace *ns = current->nsproxy->pid_ns;
        /*
         * We can't drop the pidlist_mutex before taking the l->mutex in case
         * the last ref-holder is trying to remove l from the list at the same
@@ -2478,8 +2655,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
        mutex_lock(&cgrp->pidlist_mutex);
        list_for_each_entry(l, &cgrp->pidlists, links) {
                if (l->key.type == type && l->key.ns == ns) {
-                        /* found a matching list - drop the extra refcount */
-                        put_pid_ns(ns);
                        /* make sure l doesn't vanish out from under us */
                        down_write(&l->mutex);
                        mutex_unlock(&cgrp->pidlist_mutex);
@@ -2490,13 +2665,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
        l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
        if (!l) {
                mutex_unlock(&cgrp->pidlist_mutex);
-                put_pid_ns(ns);
                return l;
        }
        init_rwsem(&l->mutex);
        down_write(&l->mutex);
        l->key.type = type;
-        l->key.ns = ns;
+        l->key.ns = get_pid_ns(ns);
        l->use_count = 0; /* don't increment here */
        l->list = NULL;
        l->owner = cgrp;
@@ -2804,6 +2978,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
 }
 /*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void cgroup_event_remove(struct work_struct *work)
+{
+        struct cgroup_event *event = container_of(work, struct cgroup_event,
+                        remove);
+        struct cgroup *cgrp = event->cgrp; /* saved: event is freed before dput() */
+        /* TODO: check return code */
+        event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+        eventfd_ctx_put(event->eventfd);
+        kfree(event);
+        dput(cgrp->dentry); /* pairs with dget() taken at registration */
+}
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ *
+ * This is the wake function installed on event->wait. A wakeup whose key
+ * does not carry POLLHUP is ignored. On POLLHUP the event is detached from
+ * the wait queue and from the cgroup's event list, and the actual teardown
+ * is deferred to cgroup_event_remove() via the workqueue. Always returns 0.
+ */
+static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
+                int sync, void *key)
+{
+        struct cgroup_event *event = container_of(wait,
+                        struct cgroup_event, wait);
+        struct cgroup *cgrp = event->cgrp;
+        unsigned long flags = (unsigned long)key; /* key is read as a poll event mask */
+        if (flags & POLLHUP) {
+                remove_wait_queue_locked(event->wqh, &event->wait);
+                spin_lock(&cgrp->event_list_lock);
+                list_del(&event->list);
+                spin_unlock(&cgrp->event_list_lock);
+                /*
+                 * We are in atomic context, but cgroup_event_remove() may
+                 * sleep, so we have to call it in workqueue.
+                 */
+                schedule_work(&event->remove);
+        }
+        return 0;
+}
+/*
+ * poll_table queue callback: remember the wait queue head handed in by the
+ * polled file and hook the event's wait entry onto it.
+ */
+static void cgroup_event_ptable_queue_proc(struct file *file,
+                wait_queue_head_t *wqh, poll_table *pt)
+{
+        struct cgroup_event *ev;
+        ev = container_of(pt, struct cgroup_event, pt);
+        ev->wqh = wqh;
+        add_wait_queue(wqh, &ev->wait);
+}
+/*
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ * <args> may be omitted, in which case the control file's register_event()
+ * receives an empty string.
+ *
+ * @cgrp:   cgroup the event is registered against
+ * @cft:    cftype of the file being written; not used here, the target
+ *          control file is looked up from <control_fd>
+ * @buffer: NUL-terminated input string
+ *
+ * Returns 0 on success, and also when the eventfd already reports POLLHUP
+ * (the event is then unregistered again and nothing is kept). Returns a
+ * negative errno on malformed input, bad descriptors, missing read
+ * permission on the control file, a control file without event support, or
+ * a register_event() failure.
+ */
+static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
+                                      const char *buffer)
+{
+        struct cgroup_event *event = NULL;
+        unsigned int efd, cfd;
+        struct file *efile = NULL;
+        struct file *cfile = NULL;
+        char *endp;
+        int ret;
+        efd = simple_strtoul(buffer, &endp, 10);
+        if (*endp != ' ')
+                return -EINVAL;
+        buffer = endp + 1;
+        cfd = simple_strtoul(buffer, &endp, 10);
+        if ((*endp != ' ') && (*endp != '\0'))
+                return -EINVAL;
+        /*
+         * Step over the separator only if there is one. When the input ends
+         * right after <control_fd>, stay on the terminating NUL so that
+         * register_event() sees an empty args string rather than a pointer
+         * past the end of the input.
+         */
+        buffer = (*endp == ' ') ? endp + 1 : endp;
+        event = kzalloc(sizeof(*event), GFP_KERNEL);
+        if (!event)
+                return -ENOMEM;
+        event->cgrp = cgrp;
+        INIT_LIST_HEAD(&event->list);
+        init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
+        init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
+        INIT_WORK(&event->remove, cgroup_event_remove);
+        efile = eventfd_fget(efd);
+        if (IS_ERR(efile)) {
+                ret = PTR_ERR(efile);
+                goto fail;
+        }
+        event->eventfd = eventfd_ctx_fileget(efile);
+        if (IS_ERR(event->eventfd)) {
+                ret = PTR_ERR(event->eventfd);
+                goto fail;
+        }
+        cfile = fget(cfd);
+        if (!cfile) {
+                ret = -EBADF;
+                goto fail;
+        }
+        /* the process needs read permission on the control file */
+        ret = file_permission(cfile, MAY_READ);
+        if (ret < 0)
+                goto fail;
+        event->cft = __file_cft(cfile);
+        if (IS_ERR(event->cft)) {
+                ret = PTR_ERR(event->cft);
+                goto fail;
+        }
+        if (!event->cft->register_event || !event->cft->unregister_event) {
+                ret = -EINVAL;
+                goto fail;
+        }
+        ret = event->cft->register_event(cgrp, event->cft,
+                        event->eventfd, buffer);
+        if (ret)
+                goto fail;
+        /*
+         * Polling the eventfd runs cgroup_event_ptable_queue_proc(), which
+         * attaches event->wait. If the eventfd already reports POLLHUP, undo
+         * the registration and return success without keeping the event.
+         */
+        if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
+                event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+                ret = 0;
+                goto fail;
+        }
+        /*
+         * Events should be removed after rmdir of cgroup directory, but before
+         * destroying subsystem state objects. Let's take reference to cgroup
+         * directory dentry to do that.
+         */
+        dget(cgrp->dentry);
+        spin_lock(&cgrp->event_list_lock);
+        list_add(&event->list, &cgrp->event_list);
+        spin_unlock(&cgrp->event_list_lock);
+        /* the file references were only needed while setting up */
+        fput(cfile);
+        fput(efile);
+        return 0;
+fail:
+        /* release whatever was acquired before the failure */
+        if (cfile)
+                fput(cfile);
+        if (event && event->eventfd && !IS_ERR(event->eventfd))
+                eventfd_ctx_put(event->eventfd);
+        if (!IS_ERR_OR_NULL(efile))
+                fput(efile);
+        kfree(event);
+        return ret;
+}
+/*
 * for the common functions, 'private' gives the type of file
 */
 /* for hysterical raisins, we can't put this on the older files */
@@ -2828,6 +3170,11 @@ static struct cftype files[] = {
                .read_u64 = cgroup_read_notify_on_release,
                .write_u64 = cgroup_write_notify_on_release,
        },
+        {
+                .name = CGROUP_FILE_GENERIC_PREFIX "event_control",
+                .write_string = cgroup_write_event_control,
+                .mode = S_IWUGO,
+        },
 };
 static struct cftype cft_release_agent = {
@@ -2892,8 +3239,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
        /* We need to take each hierarchy_mutex in a consistent order */
        int i;
+        /*
+         * No worry about a race with rebind_subsystems that might mess up the
+         * locking order, since both parties are under cgroup_mutex.
+         */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
+                if (ss == NULL)
+                        continue;
                if (ss->root == root)
                        mutex_lock(&ss->hierarchy_mutex);
        }
@@ -2905,6 +3258,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
+                if (ss == NULL)
+                        continue;
                if (ss->root == root)
                        mutex_unlock(&ss->hierarchy_mutex);
        }
@@ -3028,11 +3383,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
         * synchronization other than RCU, and the subsystem linked
         * list isn't RCU-safe */
        int i;
+        /*
+         * We won't need to lock the subsys array, because the subsystems
+         * we're concerned about aren't going anywhere since our cgroup root
+         * has a reference on them.
+         */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
                struct cgroup_subsys_state *css;
-                /* Skip subsystems not in this hierarchy */
+                /* Skip subsystems not present or not in this hierarchy */
-                if (ss->root != cgrp->root)
+                if (ss == NULL || ss->root != cgrp->root)
                        continue;
                css = cgrp->subsys[ss->subsys_id];
                /* When called from check_for_release() it's possible
@@ -3106,6 +3466,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
        struct dentry *d;
        struct cgroup *parent;
        DEFINE_WAIT(wait);
+        struct cgroup_event *event, *tmp;
        int ret;
        /* the vfs holds both inode->i_mutex already */
@@ -3189,6 +3550,20 @@ again:
        set_bit(CGRP_RELEASABLE, &parent->flags);
        check_for_release(parent);
+        /*
+         * Unregister events and notify userspace.
+         * Notify userspace about the cgroup's removal only after rmdir of the
+         * cgroup directory, to avoid a race between userspace and kernelspace.
+         */
+        spin_lock(&cgrp->event_list_lock);
+        list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
+                list_del(&event->list);
+                remove_wait_queue(event->wqh, &event->wait);
+                eventfd_signal(event->eventfd, 1);
+                schedule_work(&event->remove);
+        }
+        spin_unlock(&cgrp->event_list_lock);
        mutex_unlock(&cgroup_mutex);
        return 0;
 }
@@ -3223,9 +3598,198 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
        mutex_init(&ss->hierarchy_mutex);
        lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
        ss->active = 1;
+        /* this function shouldn't be used with modular subsystems, since they
+         * need to register a subsys_id, among other things */
+        BUG_ON(ss->module);
 }
 /**
+ * cgroup_load_subsys: load and register a modular subsystem at runtime
+ * @ss: the subsystem to load
+ *
+ * This function should be called in a modular subsystem's initcall. If the
+ * subsystem is built as a module, it will be assigned a new subsys_id and set
+ * up for use. If the subsystem is built-in anyway, nothing is done here, since
+ * cgroup_init_subsys will have already taken care of it.
+ *
+ * Returns 0 on success; -EINVAL if @ss has no name, a name longer than
+ * MAX_CGROUP_TYPE_NAMELEN, no create/destroy callback, or a fork/exit
+ * callback; -EBUSY if every modular slot in subsys[] is taken; otherwise the
+ * error reported by ss->create() or cgroup_init_idr().
+ */
+int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
+{
+        int i;
+        struct cgroup_subsys_state *css;
+        /* check name and function validity */
+        if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
+            ss->create == NULL || ss->destroy == NULL)
+                return -EINVAL;
+        /*
+         * we don't support callbacks in modular subsystems. this check is
+         * before the ss->module check for consistency; a subsystem that could
+         * be a module should still have no callbacks even if the user isn't
+         * compiling it as one.
+         */
+        if (ss->fork || ss->exit)
+                return -EINVAL;
+        /*
+         * an optionally modular subsystem is built-in: we want to do nothing,
+         * since cgroup_init_subsys will have already taken care of it.
+         */
+        if (ss->module == NULL) {
+                /* a few sanity checks */
+                BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT);
+                BUG_ON(subsys[ss->subsys_id] != ss);
+                return 0;
+        }
+        /*
+         * need to register a subsys id before anything else - for example,
+         * init_cgroup_css needs it.
+         */
+        mutex_lock(&cgroup_mutex);
+        /* find the first empty slot in the array */
+        for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+                if (subsys[i] == NULL)
+                        break;
+        }
+        if (i == CGROUP_SUBSYS_COUNT) {
+                /* maximum number of subsystems already registered! */
+                mutex_unlock(&cgroup_mutex);
+                return -EBUSY;
+        }
+        /* assign ourselves the subsys_id */
+        ss->subsys_id = i;
+        subsys[i] = ss;
+        /*
+         * no ss->create seems to need anything important in the ss struct, so
+         * this can happen first (i.e. before the rootnode attachment).
+         */
+        css = ss->create(ss, dummytop);
+        if (IS_ERR(css)) {
+                /* failure case - need to deassign the subsys[] slot. */
+                subsys[i] = NULL;
+                mutex_unlock(&cgroup_mutex);
+                return PTR_ERR(css);
+        }
+        list_add(&ss->sibling, &rootnode.subsys_list);
+        ss->root = &rootnode;
+        /* our new subsystem will be attached to the dummy hierarchy. */
+        init_cgroup_css(css, ss, dummytop);
+        /* init_idr must be after init_cgroup_css because it sets css->id. */
+        if (ss->use_id) {
+                int ret = cgroup_init_idr(ss, css);
+                if (ret) {
+                        /*
+                         * unwind everything done so far. ss was already
+                         * linked into rootnode's subsystem list above, so it
+                         * must be unlinked again or the list keeps pointing
+                         * at a subsystem that failed to load. as in
+                         * cgroup_unload_subsys, ss->destroy has to run while
+                         * dummytop->subsys[] still points at the css, because
+                         * destroy uses that pointer to find its state.
+                         */
+                        list_del(&ss->sibling);
+                        ss->destroy(ss, dummytop);
+                        dummytop->subsys[ss->subsys_id] = NULL;
+                        subsys[i] = NULL;
+                        mutex_unlock(&cgroup_mutex);
+                        return ret;
+                }
+        }
+        /*
+         * Now we need to entangle the css into the existing css_sets. unlike
+         * in cgroup_init_subsys, there are now multiple css_sets, so each one
+         * will need a new pointer to it; done by iterating the css_set_table.
+         * furthermore, modifying the existing css_sets will corrupt the hash
+         * table state, so each changed css_set will need its hash recomputed.
+         * this is all done under the css_set_lock.
+         */
+        write_lock(&css_set_lock);
+        for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
+                struct css_set *cg;
+                struct hlist_node *node, *tmp;
+                struct hlist_head *bucket = &css_set_table[i], *new_bucket;
+                hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
+                        /* skip entries that we already rehashed */
+                        if (cg->subsys[ss->subsys_id])
+                                continue;
+                        /* remove existing entry */
+                        hlist_del(&cg->hlist);
+                        /* set new value */
+                        cg->subsys[ss->subsys_id] = css;
+                        /* recompute hash and restore entry */
+                        new_bucket = css_set_hash(cg->subsys);
+                        hlist_add_head(&cg->hlist, new_bucket);
+                }
+        }
+        write_unlock(&css_set_lock);
+        mutex_init(&ss->hierarchy_mutex);
+        lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
+        ss->active = 1;
+        /* success! */
+        mutex_unlock(&cgroup_mutex);
+        return 0;
+}
+EXPORT_SYMBOL_GPL(cgroup_load_subsys);
+/**
+ * cgroup_unload_subsys: unload a modular subsystem
+ * @ss: the subsystem to unload
+ *
+ * This function should be called in a modular subsystem's exitcall. When this
+ * function is invoked, the refcount on the subsystem's module will be 0, so
+ * the subsystem will not be attached to any hierarchy.
+ *
+ * Takes cgroup_mutex (and css_set_lock while rehashing) internally. Triggers
+ * BUG() if @ss is not a module, is attached to a root other than rootnode, or
+ * holds a subsys_id from the builtin range.
+ */
+void cgroup_unload_subsys(struct cgroup_subsys *ss)
+{
+        struct cg_cgroup_link *link;
+        struct hlist_head *hhead;
+        BUG_ON(ss->module == NULL);
+        /*
+         * we shouldn't be called if the subsystem is in use, and the use of
+         * try_module_get in parse_cgroupfs_options should ensure that it
+         * doesn't start being used while we're killing it off.
+         */
+        BUG_ON(ss->root != &rootnode);
+        mutex_lock(&cgroup_mutex);
+        /* deassign the subsys_id, freeing its slot in the subsys array */
+        BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);
+        subsys[ss->subsys_id] = NULL;
+        /* remove subsystem from rootnode's list of subsystems */
+        list_del(&ss->sibling);
+        /*
+         * disentangle the css from all css_sets attached to the dummytop. as
+         * in loading, the hash bucket is computed from cg->subsys, so each
+         * css_set is unhashed, has its pointer for this subsystem cleared,
+         * and is then re-added to the bucket matching its new contents.
+         */
+        write_lock(&css_set_lock);
+        list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
+                struct css_set *cg = link->cg;
+                hlist_del(&cg->hlist);
+                BUG_ON(!cg->subsys[ss->subsys_id]);
+                cg->subsys[ss->subsys_id] = NULL;
+                hhead = css_set_hash(cg->subsys);
+                hlist_add_head(&cg->hlist, hhead);
+        }
+        write_unlock(&css_set_lock);
+        /*
+         * remove subsystem's css from the dummytop and free it - need to free
+         * before marking as null because ss->destroy needs the cgrp->subsys
+         * pointer to find their state. note that this also takes care of
+         * freeing the css_id.
+         */
+        ss->destroy(ss, dummytop);
+        dummytop->subsys[ss->subsys_id] = NULL;
+        mutex_unlock(&cgroup_mutex);
+}
+EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
+/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
@@ -3253,7 +3817,8 @@ int __init cgroup_init_early(void)
        for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
                INIT_HLIST_HEAD(&css_set_table[i]);
-        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+        /* at bootup time, we don't worry about modular subsystems */
+        for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
                BUG_ON(!ss->name);
@@ -3288,12 +3853,13 @@ int __init cgroup_init(void)
        if (err)
                return err;
-        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+        /* at bootup time, we don't worry about modular subsystems */
+        for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
                if (!ss->early_init)
                        cgroup_init_subsys(ss);
                if (ss->use_id)
-                        cgroup_subsys_init_idr(ss);
+                        cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
        }
        /* Add init_css_set to the hash table */
@@ -3397,9 +3963,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
        int i;
        seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+        /*
+         * ideally we don't want subsystems moving around while we do this.
+         * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+         * subsys/hierarchy state.
+         */
        mutex_lock(&cgroup_mutex);
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
+                if (ss == NULL)
+                        continue;
                seq_printf(m, "%s\t%d\t%d\t%d\n",
                           ss->name, ss->root->hierarchy_id,
                           ss->root->number_of_cgroups, !ss->disabled);
@@ -3457,7 +4030,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
 {
        if (need_forkexit_callback) {
                int i;
-                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+                /*
+                 * forkexit callbacks are only supported for builtin
+                 * subsystems, and the builtin section of the subsys array is
+                 * immutable, so we don't need to lock the subsys array here.
+                 */
+                for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                        struct cgroup_subsys *ss = subsys[i];
                        if (ss->fork)
                                ss->fork(ss, child);
@@ -3526,7 +4104,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
        struct css_set *cg;
        if (run_callbacks && need_forkexit_callback) {
-                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+                /*
+                 * modular subsystems can't use callbacks, so no need to lock
+                 * the subsys array
+                 */
+                for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                        struct cgroup_subsys *ss = subsys[i];
                        if (ss->exit)
                                ss->exit(ss, tsk);
@@ -3720,12 +4302,13 @@ static void check_for_release(struct cgroup *cgrp)
        }
 }
-void __css_put(struct cgroup_subsys_state *css)
+/* Caller must verify that the css is not for the root cgroup */
+void __css_put(struct cgroup_subsys_state *css, int count)
 {
        struct cgroup *cgrp = css->cgroup;
        int val;
        rcu_read_lock();
-        val = atomic_dec_return(&css->refcnt);
+        val = atomic_sub_return(count, &css->refcnt);
        if (val == 1) {
                if (notify_on_release(cgrp)) {
                        set_bit(CGRP_RELEASABLE, &cgrp->flags);
@@ -3736,6 +4319,7 @@ void __css_put(struct cgroup_subsys_state *css)
        rcu_read_unlock();
        WARN_ON_ONCE(val < 1);
 }
+EXPORT_SYMBOL_GPL(__css_put);
 /*
 * Notify userspace when a cgroup is released, by running the
@@ -3817,8 +4401,11 @@ static int __init cgroup_disable(char *str)
        while ((token = strsep(&str, ",")) != NULL) {
                if (!*token)
                        continue;
+                /*
-                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+                 * cgroup_disable, being at boot time, can't know about module
+                 * subsystems, so we don't worry about them.
+                 */
+                for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                        struct cgroup_subsys *ss = subsys[i];
                        if (!strcmp(token, ss->name)) {
@@ -3848,6 +4435,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
                return cssid->id;
        return 0;
 }
+EXPORT_SYMBOL_GPL(css_id);
 unsigned short css_depth(struct cgroup_subsys_state *css)
 {
@@ -3857,6 +4445,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
                return cssid->depth;
        return 0;
 }
+EXPORT_SYMBOL_GPL(css_depth);
 bool css_is_ancestor(struct cgroup_subsys_state *child,
                    const struct cgroup_subsys_state *root)
@@ -3893,6 +4482,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
        spin_unlock(&ss->id_lock);
        call_rcu(&id->rcu_head, __free_css_id_cb);
 }
+EXPORT_SYMBOL_GPL(free_css_id);
 /*
 * This is called by init or create(). Then, calls to this function are
@@ -3942,15 +4532,14 @@ err_out:
 }
-static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
+static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
+                                            struct cgroup_subsys_state *rootcss)
 {
        struct css_id *newid;
-        struct cgroup_subsys_state *rootcss;
        spin_lock_init(&ss->id_lock);
        idr_init(&ss->idr);
-        rootcss = init_css_set.subsys[ss->subsys_id];
        newid = get_new_cssid(ss, 0);
        if (IS_ERR(newid))
                return PTR_ERR(newid);
@@ -4010,6 +4599,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
        return rcu_dereference(cssid->css);
 }
+EXPORT_SYMBOL_GPL(css_lookup);
 /**
 * css_get_next - lookup next cgroup under specified hierarchy.
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 59e9ef6aab40..e5c0244962b0 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -15,6 +15,7 @@
 */
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/cgroup.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
@@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task)
                            struct freezer, css);
 }
-int cgroup_frozen(struct task_struct *task)
+int cgroup_freezing_or_frozen(struct task_struct *task)
 {
        struct freezer *freezer;
        enum freezer_state state;
        task_lock(task);
        freezer = task_freezer(task);
-        state = freezer->state;
+        if (!freezer->css.cgroup->parent)
+                state = CGROUP_THAWED; /* root cgroup can't be frozen */
+        else
+                state = freezer->state;
        task_unlock(task);
-        return state == CGROUP_FROZEN;
+        return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
 }
 /*
@@ -201,9 +205,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
         * No lock is needed, since the task isn't on tasklist yet,
         * so it can't be moved to another cgroup, which means the
         * freezer won't be removed and will be valid during this
-         * function call.
+         * function call.  Nevertheless, apply RCU read-side critical
+         * section to suppress RCU lockdep false positives.
         */
+        rcu_read_lock();
        freezer = task_freezer(task);
+        rcu_read_unlock();
        /*
         * The root cgroup is non-freezable, so we can skip the
diff --git a/kernel/compat.c b/kernel/compat.c
index f6c204f07ea6..7f40e9275fd9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -25,6 +25,7 @@
 #include <linux/posix-timers.h>
 #include <linux/times.h>
 #include <linux/ptrace.h>
+#include <linux/gfp.h>
 #include <asm/uaccess.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f8cced2692b3..25bba73b1be3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,6 +14,7 @@
 #include <linux/kthread.h>
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
+#include <linux/gfp.h>
 #ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba401fab459f..d10946748ec2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 *    call to guarantee_online_mems(), as we know no one is changing
 *    our task's cpuset.
 *
- *    Hold callback_mutex around the two modifications of our tasks
- *    mems_allowed to synchronize with cpuset_mems_allowed().
- *
 *    While the mm_struct we are migrating is typically from some
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
@@ -973,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p,
        struct cpuset *cs;
        int migrate;
        const nodemask_t *oldmem = scan->data;
-        nodemask_t newmems;
+        NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
+        if (!newmems)
+                return;
        cs = cgroup_cs(scan->cg);
-        guarantee_online_mems(cs, &newmems);
+        guarantee_online_mems(cs, newmems);
        task_lock(p);
-        cpuset_change_task_nodemask(p, &newmems);
+        cpuset_change_task_nodemask(p, newmems);
        task_unlock(p);
+        NODEMASK_FREE(newmems);
        mm = get_task_mm(p);
        if (!mm)
                return;
@@ -1051,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
                           const char *buf)
 {
-        nodemask_t oldmem;
+        NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
        int retval;
        struct ptr_heap heap;
+        if (!oldmem)
+                return -ENOMEM;
        /*
         * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
         * it's read-only
         */
-        if (cs == &top_cpuset)
+        if (cs == &top_cpuset) {
-                return -EACCES;
+                retval = -EACCES;
+                goto done;
+        }
        /*
         * An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1076,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
                        goto done;
                if (!nodes_subset(trialcs->mems_allowed,
-                                node_states[N_HIGH_MEMORY]))
+                                node_states[N_HIGH_MEMORY])) {
-                        return -EINVAL;
+                        retval =  -EINVAL;
+                        goto done;
+                }
        }
-        oldmem = cs->mems_allowed;
+        *oldmem = cs->mems_allowed;
-        if (nodes_equal(oldmem, trialcs->mems_allowed)) {
+        if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
                retval = 0;             /* Too easy - nothing to do */
                goto done;
        }
@@ -1096,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
        cs->mems_allowed = trialcs->mems_allowed;
        mutex_unlock(&callback_mutex);
-        update_tasks_nodemask(cs, &oldmem, &heap);
+        update_tasks_nodemask(cs, oldmem, &heap);
        heap_free(&heap);
 done:
+        NODEMASK_FREE(oldmem);
        return retval;
 }
@@ -1384,40 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
                          struct cgroup *oldcont, struct task_struct *tsk,
                          bool threadgroup)
 {
-        nodemask_t from, to;
        struct mm_struct *mm;
        struct cpuset *cs = cgroup_cs(cont);
        struct cpuset *oldcs = cgroup_cs(oldcont);
+        NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
+        NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
+        if (from == NULL || to == NULL)
+                goto alloc_fail;
        if (cs == &top_cpuset) {
                cpumask_copy(cpus_attach, cpu_possible_mask);
-                to = node_possible_map;
        } else {
                guarantee_online_cpus(cs, cpus_attach);
-                guarantee_online_mems(cs, &to);
        }
+        guarantee_online_mems(cs, to);
        /* do per-task migration stuff possibly for each in the threadgroup */
-        cpuset_attach_task(tsk, &to, cs);
+        cpuset_attach_task(tsk, to, cs);
        if (threadgroup) {
                struct task_struct *c;
                rcu_read_lock();
                list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-                        cpuset_attach_task(c, &to, cs);
+                        cpuset_attach_task(c, to, cs);
                }
                rcu_read_unlock();
        }
        /* change mm; only needs to be done once even if threadgroup */
-        from = oldcs->mems_allowed;
+        *from = oldcs->mems_allowed;
-        to = cs->mems_allowed;
+        *to = cs->mems_allowed;
        mm = get_task_mm(tsk);
        if (mm) {
-                mpol_rebind_mm(mm, &to);
+                mpol_rebind_mm(mm, to);
                if (is_memory_migrate(cs))
-                        cpuset_migrate_mm(mm, &from, &to);
+                        cpuset_migrate_mm(mm, from, to);
                mmput(mm);
        }
+alloc_fail:
+        NODEMASK_FREE(from);
+        NODEMASK_FREE(to);
 }
 /* The various types of files and directories in a cpuset file system */
@@ -1562,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
-        nodemask_t mask;
+        NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
+        int retval;
+        if (mask == NULL)
+                return -ENOMEM;
        mutex_lock(&callback_mutex);
-        mask = cs->mems_allowed;
+        *mask = cs->mems_allowed;
        mutex_unlock(&callback_mutex);
-        return nodelist_scnprintf(page, PAGE_SIZE, mask);
+        retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
+        NODEMASK_FREE(mask);
+        return retval;
 }
 static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1997,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
        struct cpuset *cp;      /* scans cpusets being updated */
        struct cpuset *child;   /* scans child cpusets of cp */
        struct cgroup *cont;
-        nodemask_t oldmems;
+        NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
+        if (oldmems == NULL)
+                return;
        list_add_tail((struct list_head *)&root->stack_list, &queue);
@@ -2014,7 +2042,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
                        continue;
-                oldmems = cp->mems_allowed;
+                *oldmems = cp->mems_allowed;
                /* Remove offline cpus and mems from this cpuset. */
                mutex_lock(&callback_mutex);
@@ -2030,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                        remove_tasks_in_empty_cpuset(cp);
                else {
                        update_tasks_cpumask(cp, NULL);
-                        update_tasks_nodemask(cp, &oldmems, NULL);
+                        update_tasks_nodemask(cp, oldmems, NULL);
                }
        }
+        NODEMASK_FREE(oldmems);
 }
 /*
@@ -2090,20 +2119,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 static int cpuset_track_online_nodes(struct notifier_block *self,
                                unsigned long action, void *arg)
 {
+        NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
+        if (oldmems == NULL)
+                return NOTIFY_DONE;
        cgroup_lock();
        switch (action) {
        case MEM_ONLINE:
-        case MEM_OFFLINE:
+                *oldmems = top_cpuset.mems_allowed;
                mutex_lock(&callback_mutex);
                top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
                mutex_unlock(&callback_mutex);
-                if (action == MEM_OFFLINE)
+                update_tasks_nodemask(&top_cpuset, oldmems, NULL);
-                        scan_for_empty_cpusets(&top_cpuset);
+                break;
+        case MEM_OFFLINE:
+                /*
+                 * needn't update top_cpuset.mems_allowed explicitly because
+                 * scan_for_empty_cpusets() will update it.
+                 */
+                scan_for_empty_cpusets(&top_cpuset);
                break;
        default:
                break;
        }
        cgroup_unlock();
+        NODEMASK_FREE(oldmems);
        return NOTIFY_OK;
 }
 #endif
diff --git a/kernel/cred.c b/kernel/cred.c
index 1ed8ca18790c..62af1816c235 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -10,6 +10,7 @@
 */
 #include <linux/module.h>
 #include <linux/cred.h>
+#include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/key.h>
 #include <linux/keyctl.h>
@@ -364,7 +365,7 @@ struct cred *prepare_usermodehelper_creds(void)
        new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
        if (!new)
-                return NULL;
+                goto free_tgcred;
        kdebug("prepare_usermodehelper_creds() alloc %p", new);
@@ -398,6 +399,12 @@ struct cred *prepare_usermodehelper_creds(void)
 error:
        put_cred(new);
        return NULL;
+free_tgcred:
+#ifdef CONFIG_KEYS
+        kfree(tgcred);
+#endif
+        return NULL;
 }
 /*
@@ -786,8 +793,6 @@ bool creds_are_invalid(const struct cred *cred)
 {
        if (cred->magic != CRED_MAGIC)
                return true;
-        if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
-                return true;
 #ifdef CONFIG_SECURITY_SELINUX
        if (selinux_is_enabled()) {
                if ((unsigned long) cred->security < PAGE_SIZE)
diff --git a/kernel/early_res.c b/kernel/early_res.c
index 3cb2c661bb78..31aa9332ef3f 100644
--- a/kernel/early_res.c
+++ b/kernel/early_res.c
@@ -333,6 +333,12 @@ void __init free_early_partial(u64 start, u64 end)
        struct early_res *r;
        int i;
+        if (start == end)
+                return;
+        if (WARN_ONCE(start > end, "  wrong range [%#llx, %#llx]\n", start, end))
+                return;
 try_next:
        i = find_overlapped_early(start, end);
        if (i >= max_early_res)
diff --git a/kernel/exit.c b/kernel/exit.c
index ce1e48c2d93d..7f2683a10ac4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -87,7 +87,7 @@ static void __exit_signal(struct task_struct *tsk)
        sighand = rcu_dereference_check(tsk->sighand,
                                        rcu_read_lock_held() ||
-                                        lockdep_is_held(&tasklist_lock));
+                                        lockdep_tasklist_lock_is_held());
        spin_lock(&sighand->siglock);
        posix_cpu_timers_exit(tsk);
@@ -953,7 +953,8 @@ NORET_TYPE void do_exit(long code)
        acct_update_integrals(tsk);
        /* sync mm's RSS info before statistics gathering */
-        sync_mm_rss(tsk, tsk->mm);
+        if (tsk->mm)
+                sync_mm_rss(tsk, tsk->mm);
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
                hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/kernel/fork.c b/kernel/fork.c
index b0ec34abc0bb..44b0791b0a2e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -86,7 +86,14 @@ int max_threads;		/* tunable limit on nr_threads */
 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
-EXPORT_SYMBOL_GPL(tasklist_lock);
+#ifdef CONFIG_PROVE_RCU
+int lockdep_tasklist_lock_is_held(void)
+{
+        return lockdep_is_held(&tasklist_lock);
+}
+EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
+#endif /* #ifdef CONFIG_PROVE_RCU */
 int nr_processes(void)
 {
@@ -833,17 +840,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
        /* Thread group counters. */
        thread_group_cputime_init(sig);
-        /* Expiration times and increments. */
-        sig->it[CPUCLOCK_PROF].expires = cputime_zero;
-        sig->it[CPUCLOCK_PROF].incr = cputime_zero;
-        sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
-        sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
-        /* Cached expiration times. */
-        sig->cputime_expires.prof_exp = cputime_zero;
-        sig->cputime_expires.virt_exp = cputime_zero;
-        sig->cputime_expires.sched_exp = 0;
        cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
        if (cpu_limit != RLIM_INFINITY) {
                sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
@@ -863,7 +859,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        if (clone_flags & CLONE_THREAD)
                return 0;
-        sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
+        sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
        tsk->signal = sig;
        if (!sig)
                return -ENOMEM;
@@ -871,46 +867,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
        atomic_set(&sig->count, 1);
        atomic_set(&sig->live, 1);
        init_waitqueue_head(&sig->wait_chldexit);
-        sig->flags = 0;
        if (clone_flags & CLONE_NEWPID)
                sig->flags |= SIGNAL_UNKILLABLE;
-        sig->group_exit_code = 0;
-        sig->group_exit_task = NULL;
-        sig->group_stop_count = 0;
        sig->curr_target = tsk;
        init_sigpending(&sig->shared_pending);
        INIT_LIST_HEAD(&sig->posix_timers);
        hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-        sig->it_real_incr.tv64 = 0;
        sig->real_timer.function = it_real_fn;
-        sig->leader = 0;        /* session leadership doesn't inherit */
-        sig->tty_old_pgrp = NULL;
-        sig->tty = NULL;
-        sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
-        sig->gtime = cputime_zero;
-        sig->cgtime = cputime_zero;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-        sig->prev_utime = sig->prev_stime = cputime_zero;
-#endif
-        sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
-        sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
-        sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
-        sig->maxrss = sig->cmaxrss = 0;
-        task_io_accounting_init(&sig->ioac);
-        sig->sum_sched_runtime = 0;
-        taskstats_tgid_init(sig);
        task_lock(current->group_leader);
        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
        task_unlock(current->group_leader);
        posix_cpu_timers_init_group(sig);
-        acct_init_pacct(&sig->pacct);
        tty_audit_fork(sig);
        sig->oom_adj = current->signal->oom_adj;
@@ -1081,6 +1052,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        p->prev_utime = cputime_zero;
        p->prev_stime = cputime_zero;
 #endif
+#if defined(SPLIT_RSS_COUNTING)
+        memset(&p->rss_stat, 0, sizeof(p->rss_stat));
+#endif
        p->default_timer_slack_ns = current->timer_slack_ns;
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 967e66143e11..03808ed342a6 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -413,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
 *
 * @return a set of per_cpu pointers to perf events
 */
-struct perf_event **
+struct perf_event * __percpu *
 register_wide_hw_breakpoint(struct perf_event_attr *attr,
                            perf_overflow_handler_t triggered)
 {
-        struct perf_event **cpu_events, **pevent, *bp;
+        struct perf_event * __percpu *cpu_events, **pevent, *bp;
        long err;
        int cpu;
        cpu_events = alloc_percpu(typeof(*cpu_events));
        if (!cpu_events)
-                return ERR_PTR(-ENOMEM);
+                return (void __percpu __force *)ERR_PTR(-ENOMEM);
        get_online_cpus();
        for_each_online_cpu(cpu) {
@@ -451,7 +451,7 @@ fail:
        put_online_cpus();
        free_percpu(cpu_events);
-        return ERR_PTR(err);
+        return (void __percpu __force *)ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
@@ -459,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
 * @cpu_events: the per cpu set of events to unregister
 */
-void unregister_wide_hw_breakpoint(struct perf_event **cpu_events)
+void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
 {
        int cpu;
        struct perf_event **pevent;
@@ -489,5 +489,4 @@ struct pmu perf_ops_bp = {
        .enable         = arch_install_hw_breakpoint,
        .disable        = arch_uninstall_hw_breakpoint,
        .read           = hw_breakpoint_pmu_read,
-        .unthrottle     = hw_breakpoint_pmu_unthrottle
 };
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index d70394f12ee9..b7091d5ca2f8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
                if (desc->chip->ack)
                        desc->chip->ack(irq);
        }
+        desc->status |= IRQ_MASKED;
+}
+/*
+ * Mask @irq through the chip's ->mask callback and record the fact by
+ * setting IRQ_MASKED in desc->status. When the chip provides no ->mask
+ * callback nothing is done and the status is left untouched.
+ */
+static inline void mask_irq(struct irq_desc *desc, int irq)
+{
+        if (desc->chip->mask) {
+                desc->chip->mask(irq);
+                desc->status |= IRQ_MASKED;
+        }
+}
+/*
+ * Unmask @irq through the chip's ->unmask callback and clear
+ * IRQ_MASKED in desc->status. When the chip provides no ->unmask
+ * callback nothing is done and the status is left untouched.
+ */
+static inline void unmask_irq(struct irq_desc *desc, int irq)
+{
+        if (desc->chip->unmask) {
+                desc->chip->unmask(irq);
+                desc->status &= ~IRQ_MASKED;
+        }
 }
 /*
@@ -484,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
        raw_spin_lock(&desc->lock);
        desc->status &= ~IRQ_INPROGRESS;
-        if (unlikely(desc->status & IRQ_ONESHOT))
+        if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
-                desc->status |= IRQ_MASKED;
+                unmask_irq(desc, irq);
-        else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
-                desc->chip->unmask(irq);
 out_unlock:
        raw_spin_unlock(&desc->lock);
 }
@@ -524,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
        action = desc->action;
        if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
                desc->status |= IRQ_PENDING;
-                if (desc->chip->mask)
+                mask_irq(desc, irq);
-                        desc->chip->mask(irq);
                goto out;
        }
@@ -554,7 +568,7 @@ out:
 *      signal. The occurence is latched into the irq controller hardware
 *      and must be acked in order to be reenabled. After the ack another
 *      interrupt can happen on the same source even before the first one
- *      is handled by the assosiacted event handler. If this happens it
+ *      is handled by the associated event handler. If this happens it
 *      might be necessary to disable (mask) the interrupt depending on the
 *      controller hardware. This requires to reenable the interrupt inside
 *      of the loop which handles the interrupts which have arrived while
@@ -593,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
                irqreturn_t action_ret;
                if (unlikely(!action)) {
-                        desc->chip->mask(irq);
+                        mask_irq(desc, irq);
                        goto out_unlock;
                }
@@ -605,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
                if (unlikely((desc->status &
                               (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
                              (IRQ_PENDING | IRQ_MASKED))) {
-                        desc->chip->unmask(irq);
+                        unmask_irq(desc, irq);
-                        desc->status &= ~IRQ_MASKED;
                }
                desc->status &= ~IRQ_PENDING;
@@ -716,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
        __set_irq_handler(irq, handle, 0, name);
 }
-void __init set_irq_noprobe(unsigned int irq)
+void set_irq_noprobe(unsigned int irq)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned long flags;
@@ -731,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq)
        raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
-void __init set_irq_probe(unsigned int irq)
+void set_irq_probe(unsigned int irq)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned long flags;
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index d06df9c41cba..1ef4ffcdfa55 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
 *      automatically freed on driver detach.
 *
 *      If an IRQ allocated with this function needs to be freed
- *      separately, dev_free_irq() must be used.
+ *      separately, devm_free_irq() must be used.
 */
 int devm_request_threaded_irq(struct device *dev, unsigned int irq,
                              irq_handler_t handler, irq_handler_t thread_fn,
@@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq);
 *      Except for the extra @dev argument, this function takes the
 *      same arguments and performs the same function as free_irq().
 *      This function instead of free_irq() should be used to manually
- *      free IRQs allocated with dev_request_irq().
+ *      free IRQs allocated with devm_request_irq().
 */
 void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id)
 {
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index eb6078ca60c7..704e488730a5 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -382,6 +382,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        struct irqaction *action;
+        unsigned long flags;
        if (!desc)
                return 0;
@@ -389,11 +390,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
        if (desc->status & IRQ_NOREQUEST)
                return 0;
+        raw_spin_lock_irqsave(&desc->lock, flags);
        action = desc->action;
        if (action)
                if (irqflags & action->flags & IRQF_SHARED)
                        action = NULL;
+        raw_spin_unlock_irqrestore(&desc->lock, flags);
        return !action;
 }
@@ -483,8 +487,26 @@ static int irq_wait_for_interrupt(struct irqaction *action)
 */
 static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
 {
+again:
        chip_bus_lock(irq, desc);
        raw_spin_lock_irq(&desc->lock);
+        /*
+         * Implausible though it may be, we need to protect ourselves
+         * against the following scenario:
+         *
+         * The thread finishes before the hard interrupt handler on
+         * the other CPU does. If we unmask the irq line now, the
+         * interrupt can come in again, mask the line and bail out
+         * due to IRQ_INPROGRESS, leaving the irq line masked forever.
+         */
+        if (unlikely(desc->status & IRQ_INPROGRESS)) {
+                raw_spin_unlock_irq(&desc->lock);
+                chip_bus_sync_unlock(irq, desc);
+                cpu_relax();
+                goto again;
+        }
        if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
                desc->status &= ~IRQ_MASKED;
                desc->chip->unmask(irq);
@@ -735,6 +757,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
                if (new->flags & IRQF_ONESHOT)
                        desc->status |= IRQ_ONESHOT;
+                /*
+                 * Force MSI interrupts to run with interrupts
+                 * disabled. The multi vector cards can cause stack
+                 * overflows due to nested interrupts when enough of
+                 * them are directed to a core and fire at the same
+                 * time.
+                 */
+                if (desc->msi_desc)
+                        new->flags |= IRQF_DISABLED;
                if (!(desc->status & IRQ_NOAUTOEN)) {
                        desc->depth = 0;
                        desc->status &= ~IRQ_DISABLED;
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 963559dbd858..65d3845665ac 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -6,6 +6,7 @@
 */
 #include <linux/irq.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/interrupt.h>
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6f50eccc79c0..7a6eb04ef6b5 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -7,6 +7,7 @@
 */
 #include <linux/irq.h>
+#include <linux/gfp.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/interrupt.h>
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8e5288a8a355..13aff293f4de 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -21,6 +21,7 @@
 #include <linux/sched.h>        /* for cond_resched */
 #include <linux/mm.h>
 #include <linux/ctype.h>
+#include <linux/slab.h>
 #include <asm/sections.h>
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 761fdd2b3034..11f3515ca83f 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -69,9 +69,16 @@ struct kgdb_state {
        struct pt_regs          *linux_regs;
 };
+/* Exception state values */
+#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
+#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
+#define DCPU_IS_SLAVE    0x4 /* Slave cpu enter exception */
+#define DCPU_SSTEP       0x8 /* CPU is single stepping */
 static struct debuggerinfo_struct {
        void                    *debuggerinfo;
        struct task_struct      *task;
+        int                     exception_state;
 } kgdb_info[NR_CPUS];
 /**
@@ -391,27 +398,22 @@ int kgdb_mem2hex(char *mem, char *buf, int count)
 /*
 * Copy the binary array pointed to by buf into mem.  Fix $, #, and
- * 0x7d escaped with 0x7d.  Return a pointer to the character after
+ * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
- * the last byte written.
+ * The input buf is overwritten with the result to write to mem.
 */
 static int kgdb_ebin2mem(char *buf, char *mem, int count)
 {
-        int err = 0;
+        int size = 0;
-        char c;
+        char *c = buf;
        while (count-- > 0) {
-                c = *buf++;
+                c[size] = *buf++;
-                if (c == 0x7d)
+                if (c[size] == 0x7d)
-                        c = *buf++ ^ 0x20;
+                        c[size] = *buf++ ^ 0x20;
+                size++;
-                err = probe_kernel_write(mem, &c, 1);
-                if (err)
-                        break;
-                mem++;
        }
-        return err;
+        return probe_kernel_write(mem, c, size);
 }
 /*
@@ -563,49 +565,6 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
 }
 /*
- * CPU debug state control:
- */
-#ifdef CONFIG_SMP
-static void kgdb_wait(struct pt_regs *regs)
-{
-        unsigned long flags;
-        int cpu;
-        local_irq_save(flags);
-        cpu = raw_smp_processor_id();
-        kgdb_info[cpu].debuggerinfo = regs;
-        kgdb_info[cpu].task = current;
-        /*
-         * Make sure the above info reaches the primary CPU before
-         * our cpu_in_kgdb[] flag setting does:
-         */
-        smp_wmb();
-        atomic_set(&cpu_in_kgdb[cpu], 1);
-        /* Disable any cpu specific hw breakpoints */
-        kgdb_disable_hw_debug(regs);
-        /* Wait till primary CPU is done with debugging */
-        while (atomic_read(&passive_cpu_wait[cpu]))
-                cpu_relax();
-        kgdb_info[cpu].debuggerinfo = NULL;
-        kgdb_info[cpu].task = NULL;
-        /* fix up hardware debug registers on local cpu */
-        if (arch_kgdb_ops.correct_hw_break)
-                arch_kgdb_ops.correct_hw_break();
-        /* Signal the primary CPU that we are done: */
-        atomic_set(&cpu_in_kgdb[cpu], 0);
-        touch_softlockup_watchdog_sync();
-        clocksource_touch_watchdog();
-        local_irq_restore(flags);
-}
-#endif
-/*
 * Some architectures need cache flushes when we set/clear a
 * breakpoint:
 */
@@ -1400,34 +1359,13 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
        return 1;
 }
-/*
+static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
- * kgdb_handle_exception() - main entry point from a kernel exception
- *
- * Locking hierarchy:
- *      interface locks, if any (begin_session)
- *      kgdb lock (kgdb_active)
- */
-int
-kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
 {
-        struct kgdb_state kgdb_var;
-        struct kgdb_state *ks = &kgdb_var;
        unsigned long flags;
        int sstep_tries = 100;
        int error = 0;
        int i, cpu;
+        int trace_on = 0;
-        ks->cpu                 = raw_smp_processor_id();
-        ks->ex_vector           = evector;
-        ks->signo               = signo;
-        ks->ex_vector           = evector;
-        ks->err_code            = ecode;
-        ks->kgdb_usethreadid    = 0;
-        ks->linux_regs          = regs;
-        if (kgdb_reenter_check(ks))
-                return 0; /* Ouch, double exception ! */
 acquirelock:
        /*
         * Interrupts will be restored by the 'trap return' code, except when
@@ -1435,13 +1373,43 @@ acquirelock:
         */
        local_irq_save(flags);
-        cpu = raw_smp_processor_id();
+        cpu = ks->cpu;
+        kgdb_info[cpu].debuggerinfo = regs;
+        kgdb_info[cpu].task = current;
+        /*
+         * Make sure the above info reaches the primary CPU before
+         * our cpu_in_kgdb[] flag setting does:
+         */
+        atomic_inc(&cpu_in_kgdb[cpu]);
        /*
-         * Acquire the kgdb_active lock:
+         * CPU will loop if it is a slave or request to become a kgdb
+         * master cpu and acquire the kgdb_active lock:
         */
-        while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1)
+        while (1) {
+                if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
+                        if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
+                                break;
+                } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
+                        if (!atomic_read(&passive_cpu_wait[cpu]))
+                                goto return_normal;
+                } else {
+return_normal:
+                        /* Return to normal operation by executing any
+                         * hw breakpoint fixup.
+                         */
+                        if (arch_kgdb_ops.correct_hw_break)
+                                arch_kgdb_ops.correct_hw_break();
+                        if (trace_on)
+                                tracing_on();
+                        atomic_dec(&cpu_in_kgdb[cpu]);
+                        touch_softlockup_watchdog_sync();
+                        clocksource_touch_watchdog();
+                        local_irq_restore(flags);
+                        return 0;
+                }
                cpu_relax();
+        }
        /*
         * For single stepping, try to only enter on the processor
@@ -1475,9 +1443,6 @@ acquirelock:
        if (kgdb_io_ops->pre_exception)
                kgdb_io_ops->pre_exception();
-        kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs;
-        kgdb_info[ks->cpu].task = current;
        kgdb_disable_hw_debug(ks->linux_regs);
        /*
@@ -1486,15 +1451,9 @@ acquirelock:
         */
        if (!kgdb_single_step) {
                for (i = 0; i < NR_CPUS; i++)
-                        atomic_set(&passive_cpu_wait[i], 1);
+                        atomic_inc(&passive_cpu_wait[i]);
        }
-        /*
-         * spin_lock code is good enough as a barrier so we don't
-         * need one here:
-         */
-        atomic_set(&cpu_in_kgdb[ks->cpu], 1);
 #ifdef CONFIG_SMP
        /* Signal the other CPUs to enter kgdb_wait() */
        if ((!kgdb_single_step) && kgdb_do_roundup)
@@ -1518,6 +1477,9 @@ acquirelock:
        kgdb_single_step = 0;
        kgdb_contthread = current;
        exception_level = 0;
+        trace_on = tracing_is_on();
+        if (trace_on)
+                tracing_off();
        /* Talk to debugger with gdbserial protocol */
        error = gdb_serial_stub(ks);
@@ -1526,13 +1488,11 @@ acquirelock:
        if (kgdb_io_ops->post_exception)
                kgdb_io_ops->post_exception();
-        kgdb_info[ks->cpu].debuggerinfo = NULL;
+        atomic_dec(&cpu_in_kgdb[ks->cpu]);
-        kgdb_info[ks->cpu].task = NULL;
-        atomic_set(&cpu_in_kgdb[ks->cpu], 0);
        if (!kgdb_single_step) {
                for (i = NR_CPUS-1; i >= 0; i--)
-                        atomic_set(&passive_cpu_wait[i], 0);
+                        atomic_dec(&passive_cpu_wait[i]);
                /*
                 * Wait till all the CPUs have quit
                 * from the debugger.
@@ -1551,6 +1511,8 @@ kgdb_restore:
                else
                        kgdb_sstep_pid = 0;
        }
+        if (trace_on)
+                tracing_on();
        /* Free kgdb_active */
        atomic_set(&kgdb_active, -1);
        touch_softlockup_watchdog_sync();
@@ -1560,13 +1522,52 @@ kgdb_restore:
        return error;
 }
+/*
+ * kgdb_handle_exception() - main entry point from a kernel exception
+ *
+ * Fills in a kgdb_state for the current CPU, flags that CPU as wanting
+ * to become the kgdb master (DCPU_WANT_MASTER) for the duration of
+ * kgdb_cpu_enter() and returns that function's result. Returns 0
+ * without entering the debugger when kgdb_reenter_check() reports a
+ * double exception.
+ *
+ * Locking hierarchy:
+ *      interface locks, if any (begin_session)
+ *      kgdb lock (kgdb_active)
+ */
+int
+kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
+{
+        struct kgdb_state kgdb_var;
+        struct kgdb_state *ks = &kgdb_var;
+        int ret;
+        ks->cpu                 = raw_smp_processor_id();
+        ks->ex_vector           = evector;
+        ks->signo               = signo;
+        ks->err_code            = ecode;
+        ks->kgdb_usethreadid    = 0;
+        ks->linux_regs          = regs;
+        if (kgdb_reenter_check(ks))
+                return 0; /* Ouch, double exception ! */
+        /* The flag is only set while this CPU is inside kgdb_cpu_enter(). */
+        kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
+        ret = kgdb_cpu_enter(ks, regs);
+        kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER;
+        return ret;
+}
 int kgdb_nmicallback(int cpu, void *regs)
 {
 #ifdef CONFIG_SMP
+        struct kgdb_state kgdb_var;
+        struct kgdb_state *ks = &kgdb_var;
+        memset(ks, 0, sizeof(struct kgdb_state));
+        ks->cpu                 = cpu;
+        ks->linux_regs          = regs;
        if (!atomic_read(&cpu_in_kgdb[cpu]) &&
-                        atomic_read(&kgdb_active) != cpu &&
+            atomic_read(&kgdb_active) != -1 &&
-                        atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) {
+            atomic_read(&kgdb_active) != cpu) {
-                kgdb_wait((struct pt_regs *)regs);
+                kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
+                kgdb_cpu_enter(ks, regs);
+                kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
                return 0;
        }
 #endif
@@ -1742,11 +1743,11 @@ EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
 */
 void kgdb_breakpoint(void)
 {
-        atomic_set(&kgdb_setting_breakpoint, 1);
+        atomic_inc(&kgdb_setting_breakpoint);
        wmb(); /* Sync point before breakpoint */
        arch_kgdb_breakpoint();
        wmb(); /* Sync point after breakpoint */
-        atomic_set(&kgdb_setting_breakpoint, 0);
+        atomic_dec(&kgdb_setting_breakpoint);
 }
 EXPORT_SYMBOL_GPL(kgdb_breakpoint);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index fa034d29cf73..0ed46f3e51e9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -259,7 +259,8 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
        struct kprobe_insn_page *kip;
        list_for_each_entry(kip, &c->pages, list) {
-                long idx = ((long)slot - (long)kip->insns) / c->insn_size;
+                long idx = ((long)slot - (long)kip->insns) /
+                                (c->insn_size * sizeof(kprobe_opcode_t));
                if (idx >= 0 && idx < slots_per_page(c)) {
                        WARN_ON(kip->slot_used[idx] != SLOT_USED);
                        if (dirty) {
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6b1ccc3f0205..21fe3c426948 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
 }
 KERNEL_ATTR_RO(uevent_seqnum);
-/* uevent helper program, used during early boo */
+/* uevent helper program, used during early boot */
 static ssize_t uevent_helper_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
 {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 82ed0ea15194..83911c780175 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -219,7 +219,7 @@ int kthreadd(void *unused)
        set_task_comm(tsk, "kthreadd");
        ignore_signals(tsk);
        set_cpus_allowed_ptr(tsk, cpu_all_mask);
-        set_mems_allowed(node_possible_map);
+        set_mems_allowed(node_states[N_HIGH_MEMORY]);
        current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ca07c5c0c914..877fb306d415 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -56,7 +56,6 @@
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/list.h>
-#include <linux/slab.h>
 #include <linux/stacktrace.h>
 static DEFINE_SPINLOCK(latency_lock);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0c30d0455de1..2594e1ce41cb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,6 +43,7 @@
 #include <linux/ftrace.h>
 #include <linux/stringify.h>
 #include <linux/bitops.h>
+#include <linux/gfp.h>
 #include <asm/sections.h>
@@ -582,9 +583,6 @@ static int static_obj(void *obj)
        unsigned long start = (unsigned long) &_stext,
                      end   = (unsigned long) &_end,
                      addr  = (unsigned long) obj;
-#ifdef CONFIG_SMP
-        int i;
-#endif
        /*
         * static variable?
@@ -595,24 +593,16 @@ static int static_obj(void *obj)
        if (arch_is_kernel_data(addr))
                return 1;
-#ifdef CONFIG_SMP
        /*
-         * percpu var?
+         * in-kernel percpu var?
         */
-        for_each_possible_cpu(i) {
+        if (is_kernel_percpu_address(addr))
-                start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
+                return 1;
-                end   = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
-                                        + per_cpu_offset(i);
-                if ((addr >= start) && (addr < end))
-                        return 1;
-        }
-#endif
        /*
-         * module var?
+         * module static or percpu var?
         */
-        return is_module_address(addr);
+        return is_module_address(addr) || is_module_percpu_address(addr);
 }
 /*
@@ -3211,8 +3201,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 {
        unsigned long flags;
-        trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
        if (unlikely(current->lockdep_recursion))
                return;
@@ -3220,6 +3208,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
        check_flags(flags);
        current->lockdep_recursion = 1;
+        trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
        __lock_acquire(lock, subclass, trylock, read, check,
                       irqs_disabled_flags(flags), nest_lock, ip, 0);
        current->lockdep_recursion = 0;
@@ -3232,14 +3221,13 @@ void lock_release(struct lockdep_map *lock, int nested,
 {
        unsigned long flags;
-        trace_lock_release(lock, nested, ip);
        if (unlikely(current->lockdep_recursion))
                return;
        raw_local_irq_save(flags);
        check_flags(flags);
        current->lockdep_recursion = 1;
+        trace_lock_release(lock, nested, ip);
        __lock_release(lock, nested, ip);
        current->lockdep_recursion = 0;
        raw_local_irq_restore(flags);
@@ -3413,8 +3401,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 {
        unsigned long flags;
-        trace_lock_contended(lock, ip);
        if (unlikely(!lock_stat))
                return;
@@ -3424,6 +3410,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
        raw_local_irq_save(flags);
        check_flags(flags);
        current->lockdep_recursion = 1;
+        trace_lock_contended(lock, ip);
        __lock_contended(lock, ip);
        current->lockdep_recursion = 0;
        raw_local_irq_restore(flags);
@@ -3822,6 +3809,7 @@ void lockdep_rcu_dereference(const char *file, const int line)
        printk("%s:%d invoked rcu_dereference_check() without protection!\n",
                        file, line);
        printk("\nother info that might help us debug this:\n\n");
+        printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
        lockdep_print_held_locks(curr);
        printk("\nstack backtrace:\n");
        dump_stack();
diff --git a/kernel/module.c b/kernel/module.c
index c968d3606dca..1016b75b026a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -370,27 +370,33 @@ EXPORT_SYMBOL_GPL(find_module);
 #ifdef CONFIG_SMP
-static void *percpu_modalloc(unsigned long size, unsigned long align,
+static inline void __percpu *mod_percpu(struct module *mod)
-                             const char *name)
 {
-        void *ptr;
+        return mod->percpu;
+}
+static int percpu_modalloc(struct module *mod,
+                           unsigned long size, unsigned long align)
+{
        if (align > PAGE_SIZE) {
                printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
-                       name, align, PAGE_SIZE);
+                       mod->name, align, PAGE_SIZE);
                align = PAGE_SIZE;
        }
-        ptr = __alloc_reserved_percpu(size, align);
+        mod->percpu = __alloc_reserved_percpu(size, align);
-        if (!ptr)
+        if (!mod->percpu) {
                printk(KERN_WARNING
                       "Could not allocate %lu bytes percpu data\n", size);
-        return ptr;
+                return -ENOMEM;
+        }
+        mod->percpu_size = size;
+        return 0;
 }
-static void percpu_modfree(void *freeme)
+static void percpu_modfree(struct module *mod)
 {
-        free_percpu(freeme);
+        free_percpu(mod->percpu);
 }
 static unsigned int find_pcpusec(Elf_Ehdr *hdr,
@@ -400,24 +406,62 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
        return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
 }
-static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+static void percpu_modcopy(struct module *mod,
+                           const void *from, unsigned long size)
 {
        int cpu;
        for_each_possible_cpu(cpu)
-                memcpy(pcpudest + per_cpu_offset(cpu), from, size);
+                memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
+}
+/**
+ * is_module_percpu_address - test whether address is from module static percpu
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to module static percpu area.
+ *
+ * RETURNS:
+ * %true if @addr is from module static percpu area
+ */
+bool is_module_percpu_address(unsigned long addr)
+{
+        struct module *mod;
+        unsigned int cpu;
+        /* Preemption stays disabled for the whole walk of the modules list. */
+        preempt_disable();
+        list_for_each_entry_rcu(mod, &modules, list) {
+                /* Skip modules with no recorded percpu area size. */
+                if (!mod->percpu_size)
+                        continue;
+                /* Check @addr against every possible CPU's copy of the area. */
+                for_each_possible_cpu(cpu) {
+                        void *start = per_cpu_ptr(mod->percpu, cpu);
+                        if ((void *)addr >= start &&
+                            (void *)addr < start + mod->percpu_size) {
+                                preempt_enable();
+                                return true;
+                        }
+                }
+        }
+        preempt_enable();
+        return false;
 }
 #else /* ... !CONFIG_SMP */
-static inline void *percpu_modalloc(unsigned long size, unsigned long align,
+static inline void __percpu *mod_percpu(struct module *mod)
-                                    const char *name)
 {
        return NULL;
 }
-static inline void percpu_modfree(void *pcpuptr)
+static inline int percpu_modalloc(struct module *mod,
+                                  unsigned long size, unsigned long align)
+{
+        return -ENOMEM;
+}
+static inline void percpu_modfree(struct module *mod)
 {
-        BUG();
 }
 static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
                                        Elf_Shdr *sechdrs,
@@ -425,12 +469,16 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
 {
        return 0;
 }
-static inline void percpu_modcopy(void *pcpudst, const void *src,
+static inline void percpu_modcopy(struct module *mod,
-                                  unsigned long size)
+                                  const void *from, unsigned long size)
 {
        /* pcpusec should be 0, and size of that section should be 0. */
        BUG_ON(size != 0);
 }
+bool is_module_percpu_address(unsigned long addr)
+{
+        return false;
+}
 #endif /* CONFIG_SMP */
@@ -473,11 +521,13 @@ static void module_unload_init(struct module *mod)
        int cpu;
        INIT_LIST_HEAD(&mod->modules_which_use_me);
-        for_each_possible_cpu(cpu)
+        for_each_possible_cpu(cpu) {
-                per_cpu_ptr(mod->refptr, cpu)->count = 0;
+                per_cpu_ptr(mod->refptr, cpu)->incs = 0;
+                per_cpu_ptr(mod->refptr, cpu)->decs = 0;
+        }
        /* Hold reference count during initialization. */
-        __this_cpu_write(mod->refptr->count, 1);
+        __this_cpu_write(mod->refptr->incs, 1);
        /* Backwards compatibility macros put refcount during init. */
        mod->waiter = current;
 }
@@ -616,12 +666,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
 unsigned int module_refcount(struct module *mod)
 {
-        unsigned int total = 0;
+        unsigned int incs = 0, decs = 0;
        int cpu;
        for_each_possible_cpu(cpu)
-                total += per_cpu_ptr(mod->refptr, cpu)->count;
+                decs += per_cpu_ptr(mod->refptr, cpu)->decs;
-        return total;
+        /*
+         * ensure the incs are added up after the decs.
+         * module_put ensures incs are visible before decs with smp_wmb.
+         *
+         * This 2-count scheme avoids the situation where the refcount
+         * for CPU0 is read, then CPU0 increments the module refcount,
+         * then CPU1 drops that refcount, then the refcount for CPU1 is
+         * read. We would record a decrement but not its corresponding
+         * increment so we would see a low count (disaster).
+         *
+         * Rare situation? But module_refcount can be preempted, and we
+         * might be tallying up 4096+ CPUs. So it is not impossible.
+         */
+        smp_rmb();
+        for_each_possible_cpu(cpu)
+                incs += per_cpu_ptr(mod->refptr, cpu)->incs;
+        return incs - decs;
 }
 EXPORT_SYMBOL(module_refcount);
@@ -798,10 +864,11 @@ void module_put(struct module *module)
 {
        if (module) {
                preempt_disable();
-                __this_cpu_dec(module->refptr->count);
+                smp_wmb(); /* see comment in module_refcount */
+                __this_cpu_inc(module->refptr->decs);
                trace_module_put(module, _RET_IP_,
-                                 __this_cpu_read(module->refptr->count));
+                                 __this_cpu_read(module->refptr->decs));
                /* Maybe they're waiting for us to drop reference? */
                if (unlikely(!module_is_live(module)))
                        wake_up_process(module->waiter);
@@ -1400,8 +1467,7 @@ static void free_module(struct module *mod)
        /* This may be NULL, but that's OK */
        module_free(mod, mod->module_init);
        kfree(mod->args);
-        if (mod->percpu)
+        percpu_modfree(mod);
-                percpu_modfree(mod->percpu);
 #if defined(CONFIG_MODULE_UNLOAD)
        if (mod->refptr)
                free_percpu(mod->refptr);
@@ -1520,7 +1586,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
                default:
                        /* Divert to percpu allocation if a percpu var. */
                        if (sym[i].st_shndx == pcpuindex)
-                                secbase = (unsigned long)mod->percpu;
+                                secbase = (unsigned long)mod_percpu(mod);
                        else
                                secbase = sechdrs[sym[i].st_shndx].sh_addr;
                        sym[i].st_value += secbase;
@@ -1954,7 +2020,7 @@ static noinline struct module *load_module(void __user *umod,
        unsigned int modindex, versindex, infoindex, pcpuindex;
        struct module *mod;
        long err = 0;
-        void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
+        void *ptr = NULL; /* Stops spurious gcc warning */
        unsigned long symoffs, stroffs, *strmap;
        mm_segment_t old_fs;
@@ -2094,15 +2160,11 @@ static noinline struct module *load_module(void __user *umod,
        if (pcpuindex) {
                /* We have a special allocation for this section. */
-                percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
+                err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size,
-                                         sechdrs[pcpuindex].sh_addralign,
+                                      sechdrs[pcpuindex].sh_addralign);
-                                         mod->name);
+                if (err)
-                if (!percpu) {
-                        err = -ENOMEM;
                        goto free_mod;
-                }
                sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
-                mod->percpu = percpu;
        }
        /* Determine total sizes, and put offsets in sh_entsize.  For now
@@ -2317,7 +2379,7 @@ static noinline struct module *load_module(void __user *umod,
        sort_extable(mod->extable, mod->extable + mod->num_exentries);
        /* Finally, copy percpu area over. */
-        percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
+        percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
                       sechdrs[pcpuindex].sh_size);
        add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
@@ -2409,8 +2471,7 @@ static noinline struct module *load_module(void __user *umod,
        module_free(mod, mod->module_core);
        /* mod will be freed with core. Don't access it beyond this line! */
 free_percpu:
-        if (percpu)
+        percpu_modfree(mod);
-                percpu_modfree(percpu);
 free_mod:
        kfree(args);
        kfree(strmap);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9711b2..f74e6c00e26d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -13,6 +13,7 @@
 *             Pavel Emelianov <xemul@openvz.org>
 */
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/nsproxy.h>
 #include <linux/init_task.h>
@@ -24,7 +25,18 @@
 static struct kmem_cache *nsproxy_cachep;
-struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
+struct nsproxy init_nsproxy = {
+        .count  = ATOMIC_INIT(1),
+        .uts_ns = &init_uts_ns,
+#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
+        .ipc_ns = &init_ipc_ns,
+#endif
+        .mnt_ns = NULL,
+        .pid_ns = &init_pid_ns,
+#ifdef CONFIG_NET
+        .net_ns = &init_net,
+#endif
+};
 static inline struct nsproxy *create_nsproxy(void)
 {
diff --git a/kernel/padata.c b/kernel/padata.c
index 93caf65ff57c..fd03513c7327 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -25,6 +25,7 @@
 #include <linux/padata.h>
 #include <linux/mutex.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/rcupdate.h>
 #define MAX_SEQ_NR INT_MAX - NR_CPUS
diff --git a/kernel/params.c b/kernel/params.c
index d55a53ec9234..0b30ecd53a52 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -401,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
 }
 /* sysfs output in /sys/modules/XYZ/parameters/ */
-#define to_module_attr(n) container_of(n, struct module_attribute, attr);
+#define to_module_attr(n) container_of(n, struct module_attribute, attr)
-#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
+#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
 extern struct kernel_param __start___param[], __stop___param[];
@@ -420,7 +420,7 @@ struct module_param_attrs
 };
 #ifdef CONFIG_SYSFS
-#define to_param_attr(n) container_of(n, struct param_attribute, mattr);
+#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
 static ssize_t param_attr_show(struct module_attribute *mattr,
                               struct module *mod, char *buf)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f40560b86544..3d1552d3c12b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -15,6 +15,7 @@
 #include <linux/smp.h>
 #include <linux/file.h>
 #include <linux/poll.h>
+#include <linux/slab.h>
 #include <linux/sysfs.h>
 #include <linux/dcache.h>
 #include <linux/percpu.h>
@@ -56,21 +57,6 @@ static atomic_t nr_task_events __read_mostly;
 */
 int sysctl_perf_event_paranoid __read_mostly = 1;
-static inline bool perf_paranoid_tracepoint_raw(void)
-{
-        return sysctl_perf_event_paranoid > -1;
-}
-static inline bool perf_paranoid_cpu(void)
-{
-        return sysctl_perf_event_paranoid > 0;
-}
-static inline bool perf_paranoid_kernel(void)
-{
-        return sysctl_perf_event_paranoid > 1;
-}
 int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
 /*
@@ -96,10 +82,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
 void __weak hw_perf_disable(void)               { barrier(); }
 void __weak hw_perf_enable(void)                { barrier(); }
-void __weak hw_perf_event_setup(int cpu)        { barrier(); }
-void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
-void __weak hw_perf_event_setup_offline(int cpu)        { barrier(); }
 int __weak
 hw_perf_group_sched_in(struct perf_event *group_leader,
               struct perf_cpu_context *cpuctx,
@@ -112,25 +94,15 @@ void __weak perf_event_print_debug(void)	{ }
 static DEFINE_PER_CPU(int, perf_disable_count);
-void __perf_disable(void)
-{
-        __get_cpu_var(perf_disable_count)++;
-}
-bool __perf_enable(void)
-{
-        return !--__get_cpu_var(perf_disable_count);
-}
 void perf_disable(void)
 {
-        __perf_disable();
+        if (!__get_cpu_var(perf_disable_count)++)
-        hw_perf_disable();
+                hw_perf_disable();
 }
 void perf_enable(void)
 {
-        if (__perf_enable())
+        if (!--__get_cpu_var(perf_disable_count))
                hw_perf_enable();
 }
@@ -1193,11 +1165,9 @@ void perf_event_task_sched_out(struct task_struct *task,
        struct perf_event_context *ctx = task->perf_event_ctxp;
        struct perf_event_context *next_ctx;
        struct perf_event_context *parent;
-        struct pt_regs *regs;
        int do_switch = 1;
-        regs = task_pt_regs(task);
+        perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
-        perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
        if (likely(!ctx || !cpuctx->task_ctx))
                return;
@@ -1553,12 +1523,15 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
                 */
                if (interrupts == MAX_INTERRUPTS) {
                        perf_log_throttle(event, 1);
+                        perf_disable();
                        event->pmu->unthrottle(event);
+                        perf_enable();
                }
                if (!event->attr.freq || !event->attr.sample_freq)
                        continue;
+                perf_disable();
                event->pmu->read(event);
                now = atomic64_read(&event->count);
                delta = now - hwc->freq_count_stamp;
@@ -1566,6 +1539,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
                if (delta > 0)
                        perf_adjust_period(event, TICK_NSEC, delta);
+                perf_enable();
        }
        raw_spin_unlock(&ctx->lock);
 }
@@ -1575,9 +1549,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 */
 static void rotate_ctx(struct perf_event_context *ctx)
 {
-        if (!ctx->nr_events)
-                return;
        raw_spin_lock(&ctx->lock);
        /* Rotate the first entry last of non-pinned groups */
@@ -1590,19 +1561,28 @@ void perf_event_task_tick(struct task_struct *curr)
 {
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
+        int rotate = 0;
        if (!atomic_read(&nr_events))
                return;
        cpuctx = &__get_cpu_var(perf_cpu_context);
-        ctx = curr->perf_event_ctxp;
+        if (cpuctx->ctx.nr_events &&
+            cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+                rotate = 1;
-        perf_disable();
+        ctx = curr->perf_event_ctxp;
+        if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
+                rotate = 1;
        perf_ctx_adjust_freq(&cpuctx->ctx);
        if (ctx)
                perf_ctx_adjust_freq(ctx);
+        if (!rotate)
+                return;
+        perf_disable();
        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        if (ctx)
                task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
@@ -1614,7 +1594,6 @@ void perf_event_task_tick(struct task_struct *curr)
        cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
        if (ctx)
                task_ctx_sched_in(curr, EVENT_FLEXIBLE);
        perf_enable();
 }
@@ -2806,6 +2785,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
        return NULL;
 }
+__weak
+void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
+{
+}
 /*
 * Output
 */
@@ -3391,15 +3376,23 @@ static void perf_event_task_output(struct perf_event *event,
                                     struct perf_task_event *task_event)
 {
        struct perf_output_handle handle;
-        int size;
        struct task_struct *task = task_event->task;
-        int ret;
+        unsigned long flags;
+        int size, ret;
+        /*
+         * If this CPU attempts to acquire an rq lock held by a CPU spinning
+         * in perf_output_lock() from interrupt context, it's game over.
+         */
+        local_irq_save(flags);
        size  = task_event->event_id.header.size;
        ret = perf_output_begin(&handle, event, size, 0, 0);
-        if (ret)
+        if (ret) {
+                local_irq_restore(flags);
                return;
+        }
        task_event->event_id.pid = perf_event_pid(event, task);
        task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3410,6 +3403,7 @@ static void perf_event_task_output(struct perf_event *event,
        perf_output_put(&handle, task_event->event_id);
        perf_output_end(&handle);
+        local_irq_restore(flags);
 }
 static int perf_event_task_match(struct perf_event *event)
@@ -4123,8 +4117,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
        if (rctx < 0)
                return;
-        data.addr = addr;
+        perf_sample_data_init(&data, addr);
-        data.raw  = NULL;
        do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
@@ -4169,11 +4162,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
        struct perf_event *event;
        u64 period;
-        event   = container_of(hrtimer, struct perf_event, hw.hrtimer);
+        event = container_of(hrtimer, struct perf_event, hw.hrtimer);
        event->pmu->read(event);
-        data.addr = 0;
+        perf_sample_data_init(&data, 0);
-        data.raw = NULL;
        data.period = event->hw.last_period;
        regs = get_irq_regs();
        /*
@@ -4335,26 +4327,20 @@ static const struct pmu perf_ops_task_clock = {
 #ifdef CONFIG_EVENT_TRACING
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-                          int entry_size)
+                   int entry_size, struct pt_regs *regs)
 {
+        struct perf_sample_data data;
        struct perf_raw_record raw = {
                .size = entry_size,
                .data = record,
        };
-        struct perf_sample_data data = {
+        perf_sample_data_init(&data, addr);
-                .addr = addr,
+        data.raw = &raw;
-                .raw = &raw,
-        };
-        struct pt_regs *regs = get_irq_regs();
-        if (!regs)
-                regs = task_pt_regs(current);
        /* Trace events already protected against recursion */
        do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
-                                &data, regs);
+                         &data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
@@ -4370,7 +4356,7 @@ static int perf_tp_event_match(struct perf_event *event,
 static void tp_perf_event_destroy(struct perf_event *event)
 {
-        ftrace_profile_disable(event->attr.config);
+        perf_trace_disable(event->attr.config);
 }
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4384,7 +4370,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
                        !capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
-        if (ftrace_profile_enable(event->attr.config))
+        if (perf_trace_enable(event->attr.config))
                return NULL;
        event->destroy = tp_perf_event_destroy;
@@ -4463,8 +4449,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
        struct perf_sample_data sample;
        struct pt_regs *regs = data;
-        sample.raw = NULL;
+        perf_sample_data_init(&sample, bp->attr.bp_addr);
-        sample.addr = bp->attr.bp_addr;
        if (!perf_exclude_event(bp, regs))
                perf_swevent_add(bp, 1, 1, &sample, regs);
@@ -4912,7 +4897,7 @@ err_fput_free_put_context:
 err_free_put_context:
        if (err < 0)
-                kfree(event);
+                free_event(event);
 err_put_context:
        if (err < 0)
@@ -5392,18 +5377,26 @@ int perf_event_init_task(struct task_struct *child)
        return ret;
 }
+static void __init perf_event_init_all_cpus(void)
+{
+        int cpu;
+        struct perf_cpu_context *cpuctx;
+        for_each_possible_cpu(cpu) {
+                cpuctx = &per_cpu(perf_cpu_context, cpu);
+                __perf_event_init_context(&cpuctx->ctx, NULL);
+        }
+}
 static void __cpuinit perf_event_init_cpu(int cpu)
 {
        struct perf_cpu_context *cpuctx;
        cpuctx = &per_cpu(perf_cpu_context, cpu);
-        __perf_event_init_context(&cpuctx->ctx, NULL);
        spin_lock(&perf_resource_lock);
        cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
        spin_unlock(&perf_resource_lock);
-        hw_perf_event_setup(cpu);
 }
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5443,20 +5436,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
                perf_event_init_cpu(cpu);
                break;
-        case CPU_ONLINE:
-        case CPU_ONLINE_FROZEN:
-                hw_perf_event_setup_online(cpu);
-                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                perf_event_exit_cpu(cpu);
                break;
-        case CPU_DEAD:
-                hw_perf_event_setup_offline(cpu);
-                break;
        default:
                break;
        }
@@ -5474,6 +5458,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
 void __init perf_event_init(void)
 {
+        perf_event_init_all_cpus();
        perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
                        (void *)(long)smp_processor_id());
        perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
diff --git a/kernel/pid.c b/kernel/pid.c
index 86b296943e5f..aebb30d9c233 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
        struct task_struct *result = NULL;
        if (pid) {
                struct hlist_node *first;
-                first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock));
+                first = rcu_dereference_check(pid->tasks[type].first,
+                                              rcu_read_lock_held() ||
+                                              lockdep_tasklist_lock_is_held());
                if (first)
                        result = hlist_entry(first, struct task_struct, pids[(type)].node);
        }
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 86b3796b0436..a5aff94e1f0b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -13,6 +13,7 @@
 #include <linux/syscalls.h>
 #include <linux/err.h>
 #include <linux/acct.h>
+#include <linux/slab.h>
 #define BITS_PER_PAGE           (PAGE_SIZE*8)
@@ -161,13 +162,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
                rcu_read_lock();
                /*
-                 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring
+                 * Any nested-container's init processes won't ignore the
-                 * any nested-container's init processes don't ignore the
+                 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
-                 * signal
                 */
                task = pid_task(find_vpid(nr), PIDTYPE_PID);
                if (task)
-                        force_sig(SIGKILL, task);
+                        send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
                rcu_read_unlock();
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 1a22dfd42df9..bc7704b3a443 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1061,9 +1061,9 @@ static void check_thread_timers(struct task_struct *tsk,
        }
 }
-static void stop_process_timers(struct task_struct *tsk)
+static void stop_process_timers(struct signal_struct *sig)
 {
-        struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+        struct thread_group_cputimer *cputimer = &sig->cputimer;
        unsigned long flags;
        if (!cputimer->running)
@@ -1072,6 +1072,10 @@ static void stop_process_timers(struct task_struct *tsk)
        spin_lock_irqsave(&cputimer->lock, flags);
        cputimer->running = 0;
        spin_unlock_irqrestore(&cputimer->lock, flags);
+        sig->cputime_expires.prof_exp = cputime_zero;
+        sig->cputime_expires.virt_exp = cputime_zero;
+        sig->cputime_expires.sched_exp = 0;
 }
 static u32 onecputick;
@@ -1133,7 +1137,7 @@ static void check_process_timers(struct task_struct *tsk,
            list_empty(&timers[CPUCLOCK_VIRT]) &&
            cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
            list_empty(&timers[CPUCLOCK_SCHED])) {
-                stop_process_timers(tsk);
+                stop_process_timers(sig);
                return;
        }
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index da5288ec2392..aa9e916da4d5 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -22,6 +22,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <linux/gfp.h>
 #include <scsi/scsi_scan.h>
 #include <asm/suspend.h>
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
index 39ac698ef836..fdcad9ed5a7b 100644
--- a/kernel/power/hibernate_nvs.c
+++ b/kernel/power/hibernate_nvs.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/suspend.h>
 /*
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5ade1bdcf366..71ae29052ab6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -88,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only)
                printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
                                "(%d tasks refusing to freeze):\n",
                                elapsed_csecs / 100, elapsed_csecs % 100, todo);
-                show_state();
                read_lock(&tasklist_lock);
                do_each_thread(g, p) {
                        task_lock(p);
                        if (freezing(p) && !freezer_should_skip(p))
-                                printk(KERN_ERR " %s\n", p->comm);
+                                sched_show_task(p);
                        cancel_freezing(p);
                        task_unlock(p);
                } while_each_thread(g, p);
@@ -145,7 +144,7 @@ static void thaw_tasks(bool nosig_only)
                if (nosig_only && should_send_signal(p))
                        continue;
-                if (cgroup_frozen(p))
+                if (cgroup_freezing_or_frozen(p))
                        continue;
                thaw_process(p);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 830cadecbdfc..be861c26dda7 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -26,6 +26,7 @@
 #include <linux/console.h>
 #include <linux/highmem.h>
 #include <linux/list.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 44cce10b582d..56e7dbb8b996 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -15,6 +15,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/gfp.h>
 #include "power.h"
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1d575733d4e1..66824d71983a 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -23,6 +23,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/pm.h>
+#include <linux/slab.h>
 #include "power.h"
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 4d2289626a84..a8c96212bc1b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -420,7 +420,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
                         * User space encodes device types as two-byte values,
                         * so we need to recode them
                         */
-                        swdev = old_decode_dev(swap_area.dev);
+                        swdev = new_decode_dev(swap_area.dev);
                        if (swdev) {
                                offset = swap_area.offset;
                                data->swap = swap_type_of(swdev, offset, NULL);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f1125c1a6321..03a7ea1579f6 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
 #include <linux/mutex.h>
 #include <linux/module.h>
 #include <linux/kernel_stat.h>
+#include <linux/hardirq.h>
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
@@ -66,6 +67,35 @@ EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
 int rcu_scheduler_active __read_mostly;
 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+int debug_lockdep_rcu_enabled(void)
+{
+        return rcu_scheduler_active && debug_locks &&
+               current->lockdep_recursion == 0;
+}
+EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
+/**
+ * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
+ *
+ * Check for bottom half being disabled, which covers both the
+ * CONFIG_PROVE_RCU and !CONFIG_PROVE_RCU cases.  Note that if someone uses
+ * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
+ * will show the situation.
+ *
+ * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
+ *
+ * Returns 1 whenever debug_lockdep_rcu_enabled() is false (the check is
+ * then skipped), otherwise the value of in_softirq().
+ */
+int rcu_read_lock_bh_held(void)
+{
+        if (!debug_lockdep_rcu_enabled()) /* checking is off: report "held" */
+                return 1;
+        return in_softirq();
+}
+EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
+#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 /*
 * This function is invoked towards the end of the scheduler's initialization
 * process.  Before this is called, the idle task might contain
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1439eb504c22..4a525a30e08e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -246,12 +246,21 @@ struct rcu_data {
 #define RCU_JIFFIES_TILL_FORCE_QS        3      /* for rsp->jiffies_force_qs */
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-#define RCU_SECONDS_TILL_STALL_CHECK   (10 * HZ)  /* for rsp->jiffies_stall */
-#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ)  /* for rsp->jiffies_stall */
+#ifdef CONFIG_PROVE_RCU
-#define RCU_STALL_RAT_DELAY             2         /* Allow other CPUs time */
+#define RCU_STALL_DELAY_DELTA          (5 * HZ)
-                                                  /*  to take at least one */
+#else
-                                                  /*  scheduling clock irq */
+#define RCU_STALL_DELAY_DELTA          0
-                                                  /*  before ratting on them. */
+#endif
+#define RCU_SECONDS_TILL_STALL_CHECK   (10 * HZ + RCU_STALL_DELAY_DELTA)
+                                                /* for rsp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA)
+                                                /* for rsp->jiffies_stall */
+#define RCU_STALL_RAT_DELAY             2       /* Allow other CPUs time */
+                                                /*  to take at least one */
+                                                /*  scheduling clock irq */
+                                                /*  before ratting on them. */
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 464ad2cdee00..79b53bda8943 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1010,6 +1010,10 @@ int rcu_needs_cpu(int cpu)
        int c = 0;
        int thatcpu;
+        /* Check for being in the holdoff period. */
+        if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
+                return rcu_needs_cpu_quick_check(cpu);
        /* Don't bother unless we are the last non-dyntick-idle CPU. */
        for_each_cpu_not(thatcpu, nohz_cpu_mask)
                if (thatcpu != cpu) {
@@ -1041,10 +1045,8 @@ int rcu_needs_cpu(int cpu)
        }
        /* If RCU callbacks are still pending, RCU still needs this CPU. */
-        if (c) {
+        if (c)
                raise_softirq(RCU_SOFTIRQ);
-                per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
-        }
        return c;
 }
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bcdabf37c40b..c7eaa37a768b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,7 +10,6 @@
 #include <linux/types.h>
 #include <linux/parser.h>
 #include <linux/fs.h>
-#include <linux/slab.h>
 #include <linux/res_counter.h>
 #include <linux/uaccess.h>
 #include <linux/mm.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index 2d5be5d9bf5f..9c358e263534 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -219,19 +219,34 @@ void release_child_resources(struct resource *r)
 }
 /**
- * request_resource - request and reserve an I/O or memory resource
+ * request_resource_conflict - request and reserve an I/O or memory resource
 * @root: root resource descriptor
 * @new: resource descriptor desired by caller
 *
- * Returns 0 for success, negative error code on error.
+ * Returns NULL for success, or the conflicting resource on error.
 */
-int request_resource(struct resource *root, struct resource *new)
+struct resource *request_resource_conflict(struct resource *root, struct resource *new)
 {
        struct resource *conflict;
        write_lock(&resource_lock);
        conflict = __request_resource(root, new);
        write_unlock(&resource_lock);
+        return conflict;
+}
+/**
+ * request_resource - request and reserve an I/O or memory resource
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ *
+ * Wrapper around request_resource_conflict() for callers that only need a
+ * status code and not the conflicting resource itself.
+ *
+ * Returns 0 for success, -EBUSY if request_resource_conflict() reported a
+ * conflicting resource.
+ */
+int request_resource(struct resource *root, struct resource *new)
+{
+        struct resource *conflict;
+        conflict = request_resource_conflict(root, new);
         return conflict ? -EBUSY : 0;
 }
@@ -474,25 +489,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
 }
 /**
- * insert_resource - Inserts a resource in the resource tree
+ * insert_resource_conflict - Inserts a resource in the resource tree
 * @parent: parent of the new resource
 * @new: new resource to insert
 *
- * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ * Returns NULL on success, the conflicting resource if the resource can't be inserted.
 *
- * This function is equivalent to request_resource when no conflict
+ * This function is equivalent to request_resource_conflict when no conflict
 * happens. If a conflict happens, and the conflicting resources
 * entirely fit within the range of the new resource, then the new
 * resource is inserted and the conflicting resources become children of
 * the new resource.
 */
-int insert_resource(struct resource *parent, struct resource *new)
+struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
 {
        struct resource *conflict;
        write_lock(&resource_lock);
        conflict = __insert_resource(parent, new);
        write_unlock(&resource_lock);
+        return conflict;
+}
+/**
+ * insert_resource - Inserts a resource in the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ */
+int insert_resource(struct resource *parent, struct resource *new)
+{
+        struct resource *conflict;
+        conflict = insert_resource_conflict(parent, new);
        return conflict ? -EBUSY : 0;
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 150b6988de49..3c2a54f70ffe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
+#include <linux/slab.h>
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -322,6 +323,15 @@ static inline struct task_group *task_group(struct task_struct *p)
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 {
+        /*
+         * Strictly speaking this rcu_read_lock() is not needed since the
+         * task_group is tied to the cgroup, which in turn can never go away
+         * as long as there are tasks attached to it.
+         *
+         * However since task_group() uses task_subsys_state() which is an
+         * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
+         */
+        rcu_read_lock();
 #ifdef CONFIG_FAIR_GROUP_SCHED
        p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
        p->se.parent = task_group(p)->se[cpu];
@@ -331,6 +341,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
        p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
        p->rt.parent = task_group(p)->rt_se[cpu];
 #endif
+        rcu_read_unlock();
 }
 #else
@@ -2359,7 +2370,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 {
        int cpu, orig_cpu, this_cpu, success = 0;
        unsigned long flags;
-        struct rq *rq, *orig_rq;
+        struct rq *rq;
        if (!sched_feat(SYNC_WAKEUPS))
                wake_flags &= ~WF_SYNC;
@@ -2367,7 +2378,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
        this_cpu = get_cpu();
        smp_wmb();
-        rq = orig_rq = task_rq_lock(p, &flags);
+        rq = task_rq_lock(p, &flags);
        update_rq_clock(rq);
        if (!(p->state & state))
                goto out;
@@ -2650,7 +2661,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 {
        unsigned long flags;
        struct rq *rq;
-        int cpu = get_cpu();
+        int cpu __maybe_unused = get_cpu();
 #ifdef CONFIG_SMP
        /*
@@ -3779,7 +3790,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
         * the mutex owner just released it and exited.
         */
        if (probe_kernel_address(&owner->cpu, cpu))
-                goto out;
+                return 0;
 #else
        cpu = owner->cpu;
 #endif
@@ -3789,14 +3800,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
         * the cpu field may no longer be valid.
         */
        if (cpu >= nr_cpumask_bits)
-                goto out;
+                return 0;
        /*
         * We need to validate that we can do a
         * get_cpu() and that we have the percpu area.
         */
        if (!cpu_online(cpu))
-                goto out;
+                return 0;
        rq = cpu_rq(cpu);
@@ -3815,7 +3826,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
                cpu_relax();
        }
-out:
        return 1;
 }
 #endif
@@ -4902,7 +4913,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
        int ret;
        cpumask_var_t mask;
-        if (len < cpumask_size())
+        if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+                return -EINVAL;
+        if (len & (sizeof(unsigned long)-1))
                return -EINVAL;
        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@ -4910,10 +4923,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
        ret = sched_getaffinity(pid, mask);
        if (ret == 0) {
-                if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+                size_t retlen = min_t(size_t, len, cpumask_size());
+                if (copy_to_user(user_mask_ptr, mask, retlen))
                        ret = -EFAULT;
                else
-                        ret = cpumask_size();
+                        ret = retlen;
        }
        free_cpumask_var(mask);
@@ -5383,7 +5398,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
                get_task_struct(mt);
                task_rq_unlock(rq, &flags);
-                wake_up_process(rq->migration_thread);
+                wake_up_process(mt);
                put_task_struct(mt);
                wait_for_completion(&req.done);
                tlb_migrate_finish(p->mm);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 82095bf2099f..e6871cb3fc83 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -27,6 +27,7 @@
 *  of the License.
 */
+#include <linux/gfp.h>
 #include "sched_cpupri.h"
 /* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -56,7 +57,7 @@ static int convert_prio(int prio)
 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
 *
 * Note: This function returns the recommended CPUs as calculated during the
- * current invokation.  By the time the call returns, the CPUs may have in
+ * current invocation.  By the time the call returns, the CPUs may have in
 * fact changed priorities any number of times.  While not ideal, it is not
 * an issue of correctness since the normal rebalancer logic will correct
 * any discrepancies created by racing against the uncertainty of the current
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 67f95aada4b9..9b49db144037 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -518,8 +518,4 @@ void proc_sched_set_task(struct task_struct *p)
        p->se.nr_wakeups_idle                   = 0;
        p->sched_info.bkl_count                 = 0;
 #endif
-        p->se.sum_exec_runtime                  = 0;
-        p->se.prev_sum_exec_runtime             = 0;
-        p->nvcsw                                = 0;
-        p->nivcsw                               = 0;
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3e1fd96c6cf9..5a5ea2cd924f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3476,7 +3476,7 @@ static void run_rebalance_domains(struct softirq_action *h)
 static inline int on_null_domain(int cpu)
 {
-        return !rcu_dereference(cpu_rq(cpu)->sd);
+        return !rcu_dereference_sched(cpu_rq(cpu)->sd);
 }
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 5a6ed1f0990a..b5b920ae2ea7 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1146,7 +1146,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
                if (next && next->prio < idx)
                        continue;
                list_for_each_entry(rt_se, array->queue + idx, run_list) {
-                        struct task_struct *p = rt_task_of(rt_se);
+                        struct task_struct *p;
+                        if (!rt_entity_is_task(rt_se))
+                                continue;
+                        p = rt_task_of(rt_se);
                        if (pick_rt_task(rq, p, cpu)) {
                                next = p;
                                break;
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 7494bbf5a270..7d3f4fa9ef4f 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
                        goto cancelled;
                /* the timer holds a reference whilst it is pending */
-                ret = work->ops->get_ref(work);
+                ret = slow_work_get_ref(work);
                if (ret < 0)
                        goto cant_get_ref;
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 321f3c59d732..a29ebd1ef41d 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
 */
 static inline void slow_work_set_thread_pid(int id, pid_t pid)
 {
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
        slow_work_pids[id] = pid;
 #endif
 }
 static inline void slow_work_mark_time(struct slow_work *work)
 {
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
        work->mark = CURRENT_TIME;
 #endif
 }
 static inline void slow_work_begin_exec(int id, struct slow_work *work)
 {
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
        slow_work_execs[id] = work;
 #endif
 }
 static inline void slow_work_end_exec(int id, struct slow_work *work)
 {
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
        write_lock(&slow_work_execs_lock);
        slow_work_execs[id] = NULL;
        write_unlock(&slow_work_execs_lock);
diff --git a/kernel/smp.c b/kernel/smp.c
index 9867b6bfefce..3fc697336183 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
+#include <linux/gfp.h>
 #include <linux/smp.h>
 #include <linux/cpu.h>
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 0d4c7898ab80..4b493f67dcb5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -155,11 +155,11 @@ void softlockup_tick(void)
         * Wake up the high-prio watchdog task twice per
         * threshold timespan.
         */
-        if (now > touch_ts + softlockup_thresh/2)
+        if (time_after(now - softlockup_thresh/2, touch_ts))
                wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
        /* Warn about unreasonable delays: */
-        if (now <= (touch_ts + softlockup_thresh))
+        if (time_before_eq(now - softlockup_thresh, touch_ts))
                return;
        per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
diff --git a/kernel/srcu.c b/kernel/srcu.c
index bde4295774c8..2980da3fd509 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -30,7 +30,6 @@
 #include <linux/preempt.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/srcu.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index 9814e43fb23b..7cb426a58965 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,8 +33,10 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/seccomp.h>
 #include <linux/cpu.h>
+#include <linux/personality.h>
 #include <linux/ptrace.h>
 #include <linux/fs_struct.h>
+#include <linux/gfp.h>
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -1114,6 +1116,15 @@ out:
 DECLARE_RWSEM(uts_sem);
+#ifdef COMPAT_UTS_MACHINE
+#define override_architecture(name) \
+        (personality(current->personality) == PER_LINUX32 && \
+         copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
+                      sizeof(COMPAT_UTS_MACHINE)))
+#else
+#define override_architecture(name)     0
+#endif
 SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
 {
        int errno = 0;
@@ -1122,9 +1133,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
        if (copy_to_user(name, utsname(), sizeof *name))
                errno = -EFAULT;
        up_read(&uts_sem);
+        if (!errno && override_architecture(name))
+                errno = -EFAULT;
        return errno;
 }
+#ifdef __ARCH_WANT_SYS_OLD_UNAME
+/*
+ * Old cruft
+ */
+SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
+{
+        int error = 0;
+        if (!name)
+                return -EFAULT;
+        down_read(&uts_sem);
+        if (copy_to_user(name, utsname(), sizeof(*name)))
+                error = -EFAULT;
+        up_read(&uts_sem);
+        if (!error && override_architecture(name))
+                error = -EFAULT;
+        return error;
+}
+SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
+{
+        int error;
+        if (!name)
+                return -EFAULT;
+        if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
+                return -EFAULT;
+        down_read(&uts_sem);
+        error = __copy_to_user(&name->sysname, &utsname()->sysname,
+                               __OLD_UTS_LEN);
+        error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
+        error |= __copy_to_user(&name->nodename, &utsname()->nodename,
+                                __OLD_UTS_LEN);
+        error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
+        error |= __copy_to_user(&name->release, &utsname()->release,
+                                __OLD_UTS_LEN);
+        error |= __put_user(0, name->release + __OLD_UTS_LEN);
+        error |= __copy_to_user(&name->version, &utsname()->version,
+                                __OLD_UTS_LEN);
+        error |= __put_user(0, name->version + __OLD_UTS_LEN);
+        error |= __copy_to_user(&name->machine, &utsname()->machine,
+                                __OLD_UTS_LEN);
+        error |= __put_user(0, name->machine + __OLD_UTS_LEN);
+        up_read(&uts_sem);
+        if (!error && override_architecture(name))
+                error = -EFAULT;
+        return error ? -EFAULT : 0;
+}
+#endif
 SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
 {
        int errno;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 695384f12a7d..70f2ea758ffe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16);
 cond_syscall(sys_setuid16);
 cond_syscall(sys_vm86old);
 cond_syscall(sys_vm86);
+cond_syscall(sys_ipc);
 cond_syscall(compat_sys_ipc);
 cond_syscall(compat_sys_sysctl);
 cond_syscall(sys_flock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0ef19c614f6d..8686b0f5fc12 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
 #include <linux/swap.h>
 #include <linux/slab.h>
 #include <linux/sysctl.h>
+#include <linux/signal.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/ctype.h>
@@ -60,13 +61,23 @@
 #include <asm/stacktrace.h>
 #include <asm/io.h>
 #endif
+#ifdef CONFIG_BSD_PROCESS_ACCT
+#include <linux/acct.h>
+#endif
+#ifdef CONFIG_RT_MUTEXES
+#include <linux/rtmutex.h>
+#endif
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT)
+#include <linux/lockdep.h>
+#endif
+#ifdef CONFIG_CHR_DEV_SG
+#include <scsi/sg.h>
+#endif
 #if defined(CONFIG_SYSCTL)
 /* External variables not in a header file. */
-extern int C_A_D;
-extern int print_fatal_signals;
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern int sysctl_panic_on_oom;
@@ -88,9 +99,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
 #ifndef CONFIG_MMU
 extern int sysctl_nr_trim_pages;
 #endif
-#ifdef CONFIG_RCU_TORTURE_TEST
-extern int rcutorture_runnable;
-#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 #ifdef CONFIG_BLOCK
 extern int blk_iopoll_enabled;
 #endif
@@ -120,14 +128,6 @@ static int min_percpu_pagelist_fract = 8;
 static int ngroups_max = NGROUPS_MAX;
-#ifdef CONFIG_MODULES
-extern char modprobe_path[];
-extern int modules_disabled;
-#endif
-#ifdef CONFIG_CHR_DEV_SG
-extern int sg_big_buff;
-#endif
 #ifdef CONFIG_SPARC
 #include <asm/system.h>
 #endif
@@ -149,10 +149,6 @@ extern int sysctl_userprocess_debug;
 extern int spin_retry;
 #endif
-#ifdef CONFIG_BSD_PROCESS_ACCT
-extern int acct_parm[];
-#endif
 #ifdef CONFIG_IA64
 extern int no_unaligned_warning;
 extern int unaligned_dump_stack;
@@ -160,10 +156,6 @@ extern int unaligned_dump_stack;
 extern struct ratelimit_state printk_ratelimit_state;
-#ifdef CONFIG_RT_MUTEXES
-extern int max_lock_depth;
-#endif
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -202,9 +194,6 @@ extern struct ctl_table epoll_table[];
 int sysctl_legacy_va_layout;
 #endif
-extern int prove_locking;
-extern int lock_stat;
 /* The default sysctl tables: */
 static struct ctl_table root_table[] = {
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 8cd50d8f9bde..59030570f5ca 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,7 @@
 #include <linux/file.h>
 #include <linux/ctype.h>
 #include <linux/netdevice.h>
+#include <linux/slab.h>
 #ifdef CONFIG_SYSCTL_SYSCALL
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 899ca51be5e8..11281d5792bd 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -22,6 +22,7 @@
 #include <linux/delayacct.h>
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
+#include <linux/slab.h>
 #include <linux/cgroupstats.h>
 #include <linux/cgroup.h>
 #include <linux/fs.h>
diff --git a/kernel/time.c b/kernel/time.c
index 804798005d19..656dccfe1cbb 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,7 +35,6 @@
 #include <linux/syscalls.h>
 #include <linux/security.h>
 #include <linux/fs.h>
-#include <linux/slab.h>
 #include <linux/math64.h>
 #include <linux/ptrace.h>
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1f663d23e85e..1f5dde637457 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -592,6 +592,10 @@ static inline void clocksource_select(void) { }
 */
 static int __init clocksource_done_booting(void)
 {
+        mutex_lock(&clocksource_mutex);
+        curr_clocksource = clocksource_default_clock();
+        mutex_unlock(&clocksource_mutex);
        finished_booting = 1;
        /*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 0a8a213016f0..aada0e52680a 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -22,6 +22,29 @@
 #include "tick-internal.h"
+/* Limit min_delta to a jiffy */
+#define MIN_DELTA_LIMIT         (NSEC_PER_SEC / HZ)
+static int tick_increase_min_delta(struct clock_event_device *dev)
+{
+        /* Nothing to do if we already reached the limit */
+        if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
+                return -ETIME;
+        if (dev->min_delta_ns < 5000)
+                dev->min_delta_ns = 5000;
+        else
+                dev->min_delta_ns += dev->min_delta_ns >> 1;
+        if (dev->min_delta_ns > MIN_DELTA_LIMIT)
+                dev->min_delta_ns = MIN_DELTA_LIMIT;
+        printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
+               dev->name ? dev->name : "?",
+               (unsigned long long) dev->min_delta_ns);
+        return 0;
+}
 /**
 * tick_program_event internal worker function
 */
@@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
                if (!ret || !force)
                        return ret;
+                dev->retries++;
                /*
-                 * We tried 2 times to program the device with the given
+                 * We tried 3 times to program the device with the given
-                 * min_delta_ns. If that's not working then we double it
+                 * min_delta_ns. If that's not working then we increase it
                 * and emit a warning.
                 */
                if (++i > 2) {
                        /* Increase the min. delta and try again */
-                        if (!dev->min_delta_ns)
+                        if (tick_increase_min_delta(dev)) {
-                                dev->min_delta_ns = 5000;
+                                /*
-                        else
+                                 * Get out of the loop if min_delta_ns
-                                dev->min_delta_ns += dev->min_delta_ns >> 1;
+                                 * hit the limit already. That's
+                                 * better than staying here forever.
-                        printk(KERN_WARNING
+                                 *
-                               "CE: %s increasing min_delta_ns to %llu nsec\n",
+                                 * We clear next_event so we have a
-                               dev->name ? dev->name : "?",
+                                 * chance that the box survives.
-                               (unsigned long long) dev->min_delta_ns << 1);
+                                 */
+                                printk(KERN_WARNING
+                                       "CE: Reprogramming failure. Giving up\n");
+                                dev->next_event.tv64 = KTIME_MAX;
+                                return -ETIME;
+                        }
                        i = 0;
                }
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 12f5c55090be..ac38fbb176cc 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -19,6 +19,7 @@
 #include <linux/timecompare.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/math64.h>
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 16736379a9ca..39f6177fafac 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -818,7 +818,8 @@ void update_wall_time(void)
        shift = min(shift, maxshift);
        while (offset >= timekeeper.cycle_interval) {
                offset = logarithmic_accumulation(offset, shift);
-                shift--;
+                if(offset < timekeeper.cycle_interval<<shift)
+                        shift--;
        }
        /* correct the clock when NTP error is too big */
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index bdfb8dd1050c..1a4a7dd78777 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -228,6 +228,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
        SEQ_printf(m, " event_handler:  ");
        print_name_offset(m, dev->event_handler);
        SEQ_printf(m, "\n");
+        SEQ_printf(m, " retries:        %lu\n", dev->retries);
 }
 static void timer_list_show_tickdevices(struct seq_file *m)
@@ -257,7 +258,7 @@ static int timer_list_show(struct seq_file *m, void *v)
        u64 now = ktime_to_ns(ktime_get());
        int cpu;
-        SEQ_printf(m, "Timer List Version: v0.5\n");
+        SEQ_printf(m, "Timer List Version: v0.6\n");
        SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
        SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
diff --git a/kernel/timer.c b/kernel/timer.c
index c61a7949387f..aeb6a54f2771 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
 #include <linux/kallsyms.h>
 #include <linux/perf_event.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -880,6 +881,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
        if (base->running_timer == timer)
                goto out;
+        timer_stats_timer_clear_start_info(timer);
        ret = 0;
        if (timer_pending(timer)) {
                detach_timer(timer, 1);
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d00c6fe23f54..78edc6490038 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 ifeq ($(CONFIG_PERF_EVENTS),y)
-obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
+obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 07f945a99430..b3bc91a3f510 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -21,6 +21,7 @@
 #include <linux/percpu.h>
 #include <linux/init.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <linux/debugfs.h>
 #include <linux/smp_lock.h>
 #include <linux/time.h>
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 83783579378f..2404b59b3097 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -24,9 +24,11 @@
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
 #include <linux/sysctl.h>
+#include <linux/slab.h>
 #include <linux/ctype.h>
 #include <linux/list.h>
 #include <linux/hash.h>
+#include <linux/rcupdate.h>
 #include <trace/events/sched.h>
@@ -84,22 +86,22 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
 ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
 ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/*
-static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
+ * Traverse the ftrace_list, invoking all entries.  The reason that we
-#endif
+ * can use rcu_dereference_raw() is that elements removed from this list
+ * are simply leaked, so there is no need to interact with a grace-period
+ * mechanism.  The rcu_dereference_raw() calls are needed to handle
+ * concurrent insertions into the ftrace_list.
+ *
+ * Silly Alpha and silly pointer-speculation compiler optimizations!
+ */
 static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 {
-        struct ftrace_ops *op = ftrace_list;
+        struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
-        /* in case someone actually ports this to alpha! */
-        read_barrier_depends();
        while (op != &ftrace_list_end) {
-                /* silly alpha */
-                read_barrier_depends();
                op->func(ip, parent_ip);
-                op = op->next;
+                op = rcu_dereference_raw(op->next); /*see above*/
        };
 }
@@ -154,8 +156,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
         * the ops->next pointer is valid before another CPU sees
         * the ops pointer included into the ftrace_list.
         */
-        smp_wmb();
+        rcu_assign_pointer(ftrace_list, ops);
-        ftrace_list = ops;
        if (ftrace_enabled) {
                ftrace_func_t func;
@@ -2276,6 +2277,8 @@ __setup("ftrace_filter=", set_ftrace_filter);
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
+static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
 static int __init set_graph_function(char *str)
 {
        strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -3351,6 +3354,7 @@ void ftrace_graph_init_task(struct task_struct *t)
 {
        /* Make sure we do not use the parent ret_stack */
        t->ret_stack = NULL;
+        t->curr_ret_stack = -1;
        if (ftrace_graph_active) {
                struct ftrace_ret_stack *ret_stack;
@@ -3360,7 +3364,6 @@ void ftrace_graph_init_task(struct task_struct *t)
                                GFP_KERNEL);
                if (!ret_stack)
                        return;
-                t->curr_ret_stack = -1;
                atomic_set(&t->tracing_graph_pause, 0);
                atomic_set(&t->trace_overrun, 0);
                t->ftrace_timestamp = 0;
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 9f4f565b01e6..a22582a06161 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -9,7 +9,6 @@
 #include <linux/workqueue.h>
 #include <linux/sched.h>
 #include <linux/module.h>
-#include <linux/slab.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/power.h>
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0287f9f52f5a..41ca394feb22 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
+#include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/list.h>
@@ -207,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
 #define RB_MAX_SMALL_DATA       (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 #define RB_EVNT_MIN_SIZE        8U      /* two 32bit words */
+#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+# define RB_FORCE_8BYTE_ALIGNMENT       0
+# define RB_ARCH_ALIGNMENT              RB_ALIGNMENT
+#else
+# define RB_FORCE_8BYTE_ALIGNMENT       1
+# define RB_ARCH_ALIGNMENT              8U
+#endif
 /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
 #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -1201,18 +1210,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
        for (i = 0; i < nr_pages; i++) {
                if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
-                        return;
+                        goto out;
                p = cpu_buffer->pages->next;
                bpage = list_entry(p, struct buffer_page, list);
                list_del_init(&bpage->list);
                free_buffer_page(bpage);
        }
        if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
-                return;
+                goto out;
        rb_reset_cpu(cpu_buffer);
        rb_check_pages(cpu_buffer);
+out:
        spin_unlock_irq(&cpu_buffer->reader_lock);
 }
@@ -1229,7 +1239,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
        for (i = 0; i < nr_pages; i++) {
                if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
-                        return;
+                        goto out;
                p = pages->next;
                bpage = list_entry(p, struct buffer_page, list);
                list_del_init(&bpage->list);
@@ -1238,6 +1248,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
        rb_reset_cpu(cpu_buffer);
        rb_check_pages(cpu_buffer);
+out:
        spin_unlock_irq(&cpu_buffer->reader_lock);
 }
@@ -1547,7 +1558,7 @@ rb_update_event(struct ring_buffer_event *event,
        case 0:
                length -= RB_EVNT_HDR_SIZE;
-                if (length > RB_MAX_SMALL_DATA)
+                if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
                        event->array[0] = length;
                else
                        event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
@@ -1722,11 +1733,11 @@ static unsigned rb_calculate_event_length(unsigned length)
        if (!length)
                length = 1;
-        if (length > RB_MAX_SMALL_DATA)
+        if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
                length += sizeof(event.array[0]);
        length += RB_EVNT_HDR_SIZE;
-        length = ALIGN(length, RB_ALIGNMENT);
+        length = ALIGN(length, RB_ARCH_ALIGNMENT);
        return length;
 }
@@ -2233,12 +2244,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
        if (ring_buffer_flags != RB_BUFFERS_ON)
                return NULL;
-        if (atomic_read(&buffer->record_disabled))
-                return NULL;
        /* If we are tracing schedule, we don't want to recurse */
        resched = ftrace_preempt_disable();
+        if (atomic_read(&buffer->record_disabled))
+                goto out_nocheck;
        if (trace_recursive_lock())
                goto out_nocheck;
@@ -2470,11 +2481,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
        if (ring_buffer_flags != RB_BUFFERS_ON)
                return -EBUSY;
-        if (atomic_read(&buffer->record_disabled))
-                return -EBUSY;
        resched = ftrace_preempt_disable();
+        if (atomic_read(&buffer->record_disabled))
+                goto out;
        cpu = raw_smp_processor_id();
        if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -2542,7 +2553,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
 * @buffer: The ring buffer to enable writes
 *
 * Note, multiple disables will need the same number of enables
- * to truely enable the writing (much like preempt_disable).
+ * to truly enable the writing (much like preempt_disable).
 */
 void ring_buffer_record_enable(struct ring_buffer *buffer)
 {
@@ -2578,7 +2589,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
 * @cpu: The CPU to enable.
 *
 * Note, multiple disables will need the same number of enables
- * to truely enable the writing (much like preempt_disable).
+ * to truly enable the writing (much like preempt_disable).
 */
 void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
 {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ed01fdba4a55..44f916a04065 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -33,10 +33,10 @@
 #include <linux/kdebug.h>
 #include <linux/string.h>
 #include <linux/rwsem.h>
+#include <linux/slab.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/poll.h>
-#include <linux/gfp.h>
 #include <linux/fs.h>
 #include "trace.h"
@@ -374,6 +374,21 @@ static int __init set_buf_size(char *str)
 }
 __setup("trace_buf_size=", set_buf_size);
+/*
+ * Boot-time handler for the "tracing_thresh=" kernel parameter.
+ *
+ * Parses @str as an unsigned long (base auto-detected) and stores the
+ * value multiplied by 1000 in tracing_thresh.
+ * NOTE(review): the factor of 1000 presumably converts microseconds on
+ * the command line to nanoseconds internally - confirm.
+ *
+ * Returns 1 when the parameter was consumed, 0 when @str is NULL or
+ * does not parse.
+ */
+static int __init set_tracing_thresh(char *str)
+{
+        unsigned long thresh;
+        /* A missing or malformed value leaves tracing_thresh untouched. */
+        if (!str || strict_strtoul(str, 0, &thresh) < 0)
+                return 0;
+        tracing_thresh = thresh * 1000;
+        return 1;
+}
+__setup("tracing_thresh=", set_tracing_thresh);
 unsigned long nsecs_to_usecs(unsigned long nsecs)
 {
        return nsecs / 1000;
@@ -579,9 +594,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 static arch_spinlock_t ftrace_max_lock =
        (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+unsigned long __read_mostly     tracing_thresh;
 #ifdef CONFIG_TRACER_MAX_TRACE
 unsigned long __read_mostly     tracing_max_latency;
-unsigned long __read_mostly     tracing_thresh;
 /*
 * Copy the new maximum trace into the separate maximum-trace
@@ -592,7 +608,7 @@ static void
 __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 {
        struct trace_array_cpu *data = tr->data[cpu];
-        struct trace_array_cpu *max_data = tr->data[cpu];
+        struct trace_array_cpu *max_data;
        max_tr.cpu = cpu;
        max_tr.time_start = data->preempt_timestamp;
@@ -602,7 +618,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
        max_data->critical_start = data->critical_start;
        max_data->critical_end = data->critical_end;
-        memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
+        memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
        max_data->pid = tsk->pid;
        max_data->uid = task_uid(tsk);
        max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
@@ -824,10 +840,10 @@ out:
        mutex_unlock(&trace_types_lock);
 }
-static void __tracing_reset(struct trace_array *tr, int cpu)
+static void __tracing_reset(struct ring_buffer *buffer, int cpu)
 {
        ftrace_disable_cpu();
-        ring_buffer_reset_cpu(tr->buffer, cpu);
+        ring_buffer_reset_cpu(buffer, cpu);
        ftrace_enable_cpu();
 }
@@ -839,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu)
        /* Make sure all commits have finished */
        synchronize_sched();
-        __tracing_reset(tr, cpu);
+        __tracing_reset(buffer, cpu);
        ring_buffer_record_enable(buffer);
 }
@@ -857,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
        tr->time_start = ftrace_now(tr->cpu);
        for_each_online_cpu(cpu)
-                __tracing_reset(tr, cpu);
+                __tracing_reset(buffer, cpu);
        ring_buffer_record_enable(buffer);
 }
@@ -934,6 +950,8 @@ void tracing_start(void)
                goto out;
        }
+        /* Prevent the buffers from switching */
+        arch_spin_lock(&ftrace_max_lock);
        buffer = global_trace.buffer;
        if (buffer)
@@ -943,6 +961,8 @@ void tracing_start(void)
        if (buffer)
                ring_buffer_record_enable(buffer);
+        arch_spin_unlock(&ftrace_max_lock);
        ftrace_start();
 out:
        spin_unlock_irqrestore(&tracing_start_lock, flags);
@@ -964,6 +984,9 @@ void tracing_stop(void)
        if (trace_stop_count++)
                goto out;
+        /* Prevent the buffers from switching */
+        arch_spin_lock(&ftrace_max_lock);
        buffer = global_trace.buffer;
        if (buffer)
                ring_buffer_record_disable(buffer);
@@ -972,6 +995,8 @@ void tracing_stop(void)
        if (buffer)
                ring_buffer_record_disable(buffer);
+        arch_spin_unlock(&ftrace_max_lock);
 out:
        spin_unlock_irqrestore(&tracing_start_lock, flags);
 }
@@ -1259,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
        if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
                return;
+        /*
+         * NMIs can not handle page faults, even with fix ups.
+         * The save user stack can (and often does) fault.
+         */
+        if (unlikely(in_nmi()))
+                return;
        event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
                                          sizeof(*entry), flags, pc);
        if (!event)
@@ -1703,6 +1735,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
                ftrace_enable_cpu();
+                iter->leftover = 0;
                for (p = iter; p && l < *pos; p = s_next(m, p, &l))
                        ;
@@ -4248,10 +4281,10 @@ static __init int tracer_init_debugfs(void)
 #ifdef CONFIG_TRACER_MAX_TRACE
        trace_create_file("tracing_max_latency", 0644, d_tracer,
                        &tracing_max_latency, &tracing_max_lat_fops);
+#endif
        trace_create_file("tracing_thresh", 0644, d_tracer,
                        &tracing_thresh, &tracing_max_lat_fops);
-#endif
        trace_create_file("README", 0444, d_tracer,
                        NULL, &tracing_readme_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fd05bcaf91b0..2825ef2c0b15 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -396,9 +396,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
 extern unsigned long nsecs_to_usecs(unsigned long nsecs);
+extern unsigned long tracing_thresh;
 #ifdef CONFIG_TRACER_MAX_TRACE
 extern unsigned long tracing_max_latency;
-extern unsigned long tracing_thresh;
 void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
 void update_max_tr_single(struct trace_array *tr,
@@ -550,7 +551,7 @@ static inline int ftrace_trace_task(struct task_struct *task)
 * struct trace_parser - servers for reading the user input separated by spaces
 * @cont: set if the input is not complete - no final space char was found
 * @buffer: holds the parsed user input
- * @idx: user input lenght
+ * @idx: user input length
 * @size: buffer size
 */
 struct trace_parser {
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 84a3a7ba072a..9d589d8dcd1a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -13,6 +13,7 @@
 * Tracer plugins will chose a default from these clocks.
 */
 #include <linux/spinlock.h>
+#include <linux/irqflags.h>
 #include <linux/hardirq.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
@@ -83,7 +84,7 @@ u64 notrace trace_clock_global(void)
        int this_cpu;
        u64 now;
-        raw_local_irq_save(flags);
+        local_irq_save(flags);
        this_cpu = raw_smp_processor_id();
        now = cpu_clock(this_cpu);
@@ -109,7 +110,7 @@ u64 notrace trace_clock_global(void)
        arch_spin_unlock(&trace_clock_struct.lock);
 out:
-        raw_local_irq_restore(flags);
+        local_irq_restore(flags);
        return now;
 }
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_perf.c
index f0d693005075..0565bb42566f 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_perf.c
@@ -1,32 +1,41 @@
 /*
- * trace event based perf counter profiling
+ * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
- *
+ * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include "trace.h"
+DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
+EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
+EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
 static char *perf_trace_buf;
 static char *perf_trace_buf_nmi;
-typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
+/*
+ * Force it to be aligned to unsigned long to avoid misaligned access
+ * surprises
+ */
+typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
+        perf_trace_t;
 /* Count the events in use (per event id, not per instance) */
-static int      total_profile_count;
+static int      total_ref_count;
-static int ftrace_profile_enable_event(struct ftrace_event_call *event)
+static int perf_trace_event_enable(struct ftrace_event_call *event)
 {
        char *buf;
        int ret = -ENOMEM;
-        if (event->profile_count++ > 0)
+        if (event->perf_refcount++ > 0)
                return 0;
-        if (!total_profile_count) {
+        if (!total_ref_count) {
                buf = (char *)alloc_percpu(perf_trace_t);
                if (!buf)
                        goto fail_buf;
@@ -40,35 +49,35 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
                rcu_assign_pointer(perf_trace_buf_nmi, buf);
        }
-        ret = event->profile_enable(event);
+        ret = event->perf_event_enable(event);
        if (!ret) {
-                total_profile_count++;
+                total_ref_count++;
                return 0;
        }
 fail_buf_nmi:
-        if (!total_profile_count) {
+        if (!total_ref_count) {
                free_percpu(perf_trace_buf_nmi);
                free_percpu(perf_trace_buf);
                perf_trace_buf_nmi = NULL;
                perf_trace_buf = NULL;
        }
 fail_buf:
-        event->profile_count--;
+        event->perf_refcount--;
        return ret;
 }
-int ftrace_profile_enable(int event_id)
+int perf_trace_enable(int event_id)
 {
        struct ftrace_event_call *event;
        int ret = -EINVAL;
        mutex_lock(&event_mutex);
        list_for_each_entry(event, &ftrace_events, list) {
-                if (event->id == event_id && event->profile_enable &&
+                if (event->id == event_id && event->perf_event_enable &&
                    try_module_get(event->mod)) {
-                        ret = ftrace_profile_enable_event(event);
+                        ret = perf_trace_event_enable(event);
                        break;
                }
        }
@@ -77,16 +86,16 @@ int ftrace_profile_enable(int event_id)
        return ret;
 }
-static void ftrace_profile_disable_event(struct ftrace_event_call *event)
+static void perf_trace_event_disable(struct ftrace_event_call *event)
 {
        char *buf, *nmi_buf;
-        if (--event->profile_count > 0)
+        if (--event->perf_refcount > 0)
                return;
-        event->profile_disable(event);
+        event->perf_event_disable(event);
-        if (!--total_profile_count) {
+        if (!--total_ref_count) {
                buf = perf_trace_buf;
                rcu_assign_pointer(perf_trace_buf, NULL);
@@ -104,14 +113,14 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
        }
 }
-void ftrace_profile_disable(int event_id)
+void perf_trace_disable(int event_id)
 {
        struct ftrace_event_call *event;
        mutex_lock(&event_mutex);
        list_for_each_entry(event, &ftrace_events, list) {
                if (event->id == event_id) {
-                        ftrace_profile_disable_event(event);
+                        perf_trace_event_disable(event);
                        module_put(event->mod);
                        break;
                }
@@ -119,13 +128,15 @@ void ftrace_profile_disable(int event_id)
        mutex_unlock(&event_mutex);
 }
-__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
+__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
-                                        int *rctxp, unsigned long *irq_flags)
+                                       int *rctxp, unsigned long *irq_flags)
 {
        struct trace_entry *entry;
        char *trace_buf, *raw_data;
        int pc, cpu;
+        BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
        pc = preempt_count();
        /* Protect the per cpu buffer, begin the rcu read side */
@@ -138,9 +149,9 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
        cpu = smp_processor_id();
        if (in_nmi())
-                trace_buf = rcu_dereference(perf_trace_buf_nmi);
+                trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
        else
-                trace_buf = rcu_dereference(perf_trace_buf);
+                trace_buf = rcu_dereference_sched(perf_trace_buf);
        if (!trace_buf)
                goto err;
@@ -148,7 +159,7 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
        raw_data = per_cpu_ptr(trace_buf, cpu);
        /* zero the dead bytes from align to not leak stack to user */
-        *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+        memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
        entry = (struct trace_entry *)raw_data;
        tracing_generic_entry_update(entry, *irq_flags, pc);
@@ -161,4 +172,4 @@ err_recursion:
        local_irq_restore(*irq_flags);
        return NULL;
 }
-EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
+EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3f972ad98d04..c697c7043349 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,6 +15,7 @@
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
+#include <linux/slab.h>
 #include <linux/delay.h>
 #include <asm/setup.h>
@@ -938,7 +939,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
                trace_create_file("enable", 0644, call->dir, call,
                                  enable);
-        if (call->id && call->profile_enable)
+        if (call->id && call->perf_event_enable)
                trace_create_file("id", 0444, call->dir, call,
                                  id);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 4615f62a04f1..88c0b6dbd7fe 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,6 +22,7 @@
 #include <linux/ctype.h>
 #include <linux/mutex.h>
 #include <linux/perf_event.h>
+#include <linux/slab.h>
 #include "trace.h"
 #include "trace_output.h"
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3fc2a575664f..9aed1a5cf553 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -9,6 +9,7 @@
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include "trace.h"
@@ -237,6 +238,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
        return ret;
 }
+/*
+ * Function-graph entry callback used when a threshold may be in effect.
+ *
+ * While tracing_thresh is non-zero this returns 1 without calling
+ * trace_graph_entry(), so nothing is recorded at entry time here.
+ * With no threshold it defers to trace_graph_entry() and returns its
+ * result.
+ */
+int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
+{
+        return tracing_thresh ? 1 : trace_graph_entry(trace);
+}
 static void __trace_graph_return(struct trace_array *tr,
                                struct ftrace_graph_ret *trace,
                                unsigned long flags,
@@ -290,13 +299,26 @@ void set_graph_array(struct trace_array *tr)
        smp_mb();
 }
+/*
+ * Function-graph return callback that filters by duration.
+ *
+ * The return is forwarded to trace_graph_return() when no threshold is
+ * set, or when the elapsed time (rettime - calltime) is at least
+ * tracing_thresh. Shorter calls are dropped.
+ */
+void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
+{
+        if (!tracing_thresh ||
+            (trace->rettime - trace->calltime >= tracing_thresh))
+                trace_graph_return(trace);
+}
 static int graph_trace_init(struct trace_array *tr)
 {
        int ret;
        set_graph_array(tr);
-        ret = register_ftrace_graph(&trace_graph_return,
+        if (tracing_thresh)
-                                    &trace_graph_entry);
+                ret = register_ftrace_graph(&trace_graph_thresh_return,
+                                            &trace_graph_thresh_entry);
+        else
+                ret = register_ftrace_graph(&trace_graph_return,
+                                            &trace_graph_entry);
        if (ret)
                return ret;
        tracing_start_cmdline_record();
@@ -920,7 +942,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
                if (!ret)
                        return TRACE_TYPE_PARTIAL_LINE;
        } else {
-                ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func);
+                ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func);
                if (!ret)
                        return TRACE_TYPE_PARTIAL_LINE;
        }
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 505c92273b1a..1251e367bae9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1214,7 +1214,7 @@ static int set_print_fmt(struct trace_probe *tp)
 #ifdef CONFIG_PERF_EVENTS
 /* Kprobe profile handler */
-static __kprobes void kprobe_profile_func(struct kprobe *kp,
+static __kprobes void kprobe_perf_func(struct kprobe *kp,
                                         struct pt_regs *regs)
 {
        struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
@@ -1227,11 +1227,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
        __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
        size = ALIGN(__size + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);
-        if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
                     "profile buffer not large enough"))
                return;
-        entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+        entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
        if (!entry)
                return;
@@ -1240,11 +1240,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp,
        for (i = 0; i < tp->nr_args; i++)
                entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
-        ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
+        perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
 }
 /* Kretprobe profile handler */
-static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
                                            struct pt_regs *regs)
 {
        struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1257,11 +1257,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
        __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
        size = ALIGN(__size + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);
-        if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
                     "profile buffer not large enough"))
                return;
-        entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+        entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags);
        if (!entry)
                return;
@@ -1271,10 +1271,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
        for (i = 0; i < tp->nr_args; i++)
                entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
-        ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
+        perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
+                               irq_flags, regs);
 }
-static int probe_profile_enable(struct ftrace_event_call *call)
+static int probe_perf_enable(struct ftrace_event_call *call)
 {
        struct trace_probe *tp = (struct trace_probe *)call->data;
@@ -1286,7 +1287,7 @@ static int probe_profile_enable(struct ftrace_event_call *call)
                return enable_kprobe(&tp->rp.kp);
 }
-static void probe_profile_disable(struct ftrace_event_call *call)
+static void probe_perf_disable(struct ftrace_event_call *call)
 {
        struct trace_probe *tp = (struct trace_probe *)call->data;
@@ -1311,7 +1312,7 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
                kprobe_trace_func(kp, regs);
 #ifdef CONFIG_PERF_EVENTS
        if (tp->flags & TP_FLAG_PROFILE)
-                kprobe_profile_func(kp, regs);
+                kprobe_perf_func(kp, regs);
 #endif
        return 0;       /* We don't tweek kernel, so just return 0 */
 }
@@ -1325,7 +1326,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
                kretprobe_trace_func(ri, regs);
 #ifdef CONFIG_PERF_EVENTS
        if (tp->flags & TP_FLAG_PROFILE)
-                kretprobe_profile_func(ri, regs);
+                kretprobe_perf_func(ri, regs);
 #endif
        return 0;       /* We don't tweek kernel, so just return 0 */
 }
@@ -1358,8 +1359,8 @@ static int register_probe_event(struct trace_probe *tp)
        call->unregfunc = probe_event_disable;
 #ifdef CONFIG_PERF_EVENTS
-        call->profile_enable = probe_profile_enable;
+        call->perf_event_enable = probe_perf_enable;
-        call->profile_disable = probe_profile_disable;
+        call->perf_event_disable = probe_perf_disable;
 #endif
        call->data = tp;
        ret = trace_add_event_call(call);
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 94103cdcf9d8..d59cd6879477 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -23,6 +23,7 @@
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/fs.h>
 #include "trace_output.h"
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0acd834659ed..017fa376505d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/mmiotrace.h>
 #include <linux/pci.h>
+#include <linux/slab.h>
 #include <linux/time.h>
 #include <asm/atomic.h>
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 280fea470d67..81003b4d617f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -3,6 +3,7 @@
 #include <linux/stringify.h>
 #include <linux/kthread.h>
 #include <linux/delay.h>
+#include <linux/slab.h>
 static inline int trace_valid_entry(struct trace_entry *entry)
 {
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index a4bb239eb987..96cffb269e73 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -10,6 +10,7 @@
 #include <linux/list.h>
+#include <linux/slab.h>
 #include <linux/rbtree.h>
 #include <linux/debugfs.h>
 #include "trace_stat.h"
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index cba47d7935cc..4d6d711717f2 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
 #include <trace/syscall.h>
 #include <trace/events/syscalls.h>
+#include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/ftrace.h>
 #include <linux/perf_event.h>
@@ -428,12 +429,12 @@ core_initcall(init_ftrace_syscalls);
 #ifdef CONFIG_PERF_EVENTS
-static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
+static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
-static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
+static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
-static int sys_prof_refcount_enter;
+static int sys_perf_refcount_enter;
-static int sys_prof_refcount_exit;
+static int sys_perf_refcount_exit;
-static void prof_syscall_enter(struct pt_regs *regs, long id)
+static void perf_syscall_enter(struct pt_regs *regs, long id)
 {
        struct syscall_metadata *sys_data;
        struct syscall_trace_enter *rec;
@@ -443,7 +444,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
        int size;
        syscall_nr = syscall_get_nr(current, regs);
-        if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
+        if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
                return;
        sys_data = syscall_nr_to_meta(syscall_nr);
@@ -455,11 +456,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
        size = ALIGN(size + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);
-        if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
-                      "profile buffer not large enough"))
+                      "perf buffer not large enough"))
                return;
-        rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
+        rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
                                sys_data->enter_event->id, &rctx, &flags);
        if (!rec)
                return;
@@ -467,10 +468,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
        rec->nr = syscall_nr;
        syscall_get_arguments(current, regs, 0, sys_data->nb_args,
                               (unsigned long *)&rec->args);
-        ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
+        perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
 }
-int prof_sysenter_enable(struct ftrace_event_call *call)
+int perf_sysenter_enable(struct ftrace_event_call *call)
 {
        int ret = 0;
        int num;
@@ -478,34 +479,34 @@ int prof_sysenter_enable(struct ftrace_event_call *call)
        num = ((struct syscall_metadata *)call->data)->syscall_nr;
        mutex_lock(&syscall_trace_lock);
-        if (!sys_prof_refcount_enter)
+        if (!sys_perf_refcount_enter)
-                ret = register_trace_sys_enter(prof_syscall_enter);
+                ret = register_trace_sys_enter(perf_syscall_enter);
        if (ret) {
                pr_info("event trace: Could not activate"
                                "syscall entry trace point");
        } else {
-                set_bit(num, enabled_prof_enter_syscalls);
+                set_bit(num, enabled_perf_enter_syscalls);
-                sys_prof_refcount_enter++;
+                sys_perf_refcount_enter++;
        }
        mutex_unlock(&syscall_trace_lock);
        return ret;
 }
-void prof_sysenter_disable(struct ftrace_event_call *call)
+void perf_sysenter_disable(struct ftrace_event_call *call)
 {
        int num;
        num = ((struct syscall_metadata *)call->data)->syscall_nr;
        mutex_lock(&syscall_trace_lock);
-        sys_prof_refcount_enter--;
+        sys_perf_refcount_enter--;
-        clear_bit(num, enabled_prof_enter_syscalls);
+        clear_bit(num, enabled_perf_enter_syscalls);
-        if (!sys_prof_refcount_enter)
+        if (!sys_perf_refcount_enter)
-                unregister_trace_sys_enter(prof_syscall_enter);
+                unregister_trace_sys_enter(perf_syscall_enter);
        mutex_unlock(&syscall_trace_lock);
 }
-static void prof_syscall_exit(struct pt_regs *regs, long ret)
+static void perf_syscall_exit(struct pt_regs *regs, long ret)
 {
        struct syscall_metadata *sys_data;
        struct syscall_trace_exit *rec;
@@ -515,7 +516,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
        int size;
        syscall_nr = syscall_get_nr(current, regs);
-        if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
+        if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
                return;
        sys_data = syscall_nr_to_meta(syscall_nr);
@@ -530,11 +531,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
         * Impossible, but be paranoid with the future
         * How to put this check outside runtime?
         */
-        if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
-                "exit event has grown above profile buffer size"))
+                "exit event has grown above perf buffer size"))
                return;
-        rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
+        rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
                                sys_data->exit_event->id, &rctx, &flags);
        if (!rec)
                return;
@@ -542,10 +543,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
        rec->nr = syscall_nr;
        rec->ret = syscall_get_return_value(current, regs);
-        ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
+        perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
 }
-int prof_sysexit_enable(struct ftrace_event_call *call)
+int perf_sysexit_enable(struct ftrace_event_call *call)
 {
        int ret = 0;
        int num;
@@ -553,30 +554,30 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
        num = ((struct syscall_metadata *)call->data)->syscall_nr;
        mutex_lock(&syscall_trace_lock);
-        if (!sys_prof_refcount_exit)
+        if (!sys_perf_refcount_exit)
-                ret = register_trace_sys_exit(prof_syscall_exit);
+                ret = register_trace_sys_exit(perf_syscall_exit);
        if (ret) {
                pr_info("event trace: Could not activate"
                                "syscall exit trace point");
        } else {
-                set_bit(num, enabled_prof_exit_syscalls);
+                set_bit(num, enabled_perf_exit_syscalls);
-                sys_prof_refcount_exit++;
+                sys_perf_refcount_exit++;
        }
        mutex_unlock(&syscall_trace_lock);
        return ret;
 }
-void prof_sysexit_disable(struct ftrace_event_call *call)
+void perf_sysexit_disable(struct ftrace_event_call *call)
 {
        int num;
        num = ((struct syscall_metadata *)call->data)->syscall_nr;
        mutex_lock(&syscall_trace_lock);
-        sys_prof_refcount_exit--;
+        sys_perf_refcount_exit--;
-        clear_bit(num, enabled_prof_exit_syscalls);
+        clear_bit(num, enabled_perf_exit_syscalls);
-        if (!sys_prof_refcount_exit)
+        if (!sys_perf_refcount_exit)
-                unregister_trace_sys_exit(prof_syscall_exit);
+                unregister_trace_sys_exit(perf_syscall_exit);
        mutex_unlock(&syscall_trace_lock);
 }
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 40cafb07dffd..cc2d2faa7d9e 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
 #include <trace/events/workqueue.h>
 #include <linux/list.h>
 #include <linux/percpu.h>
+#include <linux/slab.h>
 #include <linux/kref.h>
 #include "trace_stat.h"
 #include "trace.h"