Diffstat (limited to 'kernel')
52 files changed, 2413 insertions, 501 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 6aebdeb2aa34..a987aa1676b5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-	    async.o
+	    async.o range.o
+obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o
 obj-y += groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
@@ -90,6 +91,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
diff --git a/kernel/acct.c b/kernel/acct.c
index a6605ca921b6..24f8c81fc48d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -588,16 +588,6 @@ out:
 }
 
 /**
- * acct_init_pacct - initialize a new pacct_struct
- * @pacct: per-process accounting info struct to initialize
- */
-void acct_init_pacct(struct pacct_struct *pacct)
-{
-	memset(pacct, 0, sizeof(struct pacct_struct));
-	pacct->ac_utime = pacct->ac_stime = cputime_zero;
-}
-
-/**
  * acct_collect - collect accounting information into pacct_struct
  * @exitcode: task exit code
  * @group_dead: not 0, if this thread is the last one in the process.
diff --git a/kernel/audit.c b/kernel/audit.c
index 5feed232be9d..78f7f86aa238 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -398,7 +398,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
 	skb_get(skb);
 	err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
 	if (err < 0) {
-		BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
+		BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
 		printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
 		audit_log_lost("auditd dissapeared\n");
 		audit_pid = 0;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 4b05bd9479db..028e85663f27 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -548,6 +548,11 @@ int audit_remove_tree_rule(struct audit_krule *rule)
 	return 0;
 }
 
+static int compare_root(struct vfsmount *mnt, void *arg)
+{
+	return mnt->mnt_root->d_inode == arg;
+}
+
 void audit_trim_trees(void)
 {
 	struct list_head cursor;
@@ -559,7 +564,6 @@ void audit_trim_trees(void)
 		struct path path;
 		struct vfsmount *root_mnt;
 		struct node *node;
-		struct list_head list;
 		int err;
 
 		tree = container_of(cursor.next, struct audit_tree, list);
@@ -577,24 +581,16 @@
 		if (!root_mnt)
 			goto skip_it;
 
-		list_add_tail(&list, &root_mnt->mnt_list);
 		spin_lock(&hash_lock);
 		list_for_each_entry(node, &tree->chunks, list) {
-			struct audit_chunk *chunk = find_chunk(node);
-			struct inode *inode = chunk->watch.inode;
-			struct vfsmount *mnt;
+			struct inode *inode = find_chunk(node)->watch.inode;
 			node->index |= 1U<<31;
-			list_for_each_entry(mnt, &list, mnt_list) {
-				if (mnt->mnt_root->d_inode == inode) {
-					node->index &= ~(1U<<31);
-					break;
-				}
-			}
+			if (iterate_mounts(compare_root, inode, root_mnt))
+				node->index &= ~(1U<<31);
 		}
 		spin_unlock(&hash_lock);
 		trim_marked(tree);
 		put_tree(tree);
-		list_del_init(&list);
 		drop_collected_mounts(root_mnt);
 skip_it:
 		mutex_lock(&audit_filter_mutex);
@@ -603,22 +599,6 @@ skip_it:
 	mutex_unlock(&audit_filter_mutex);
 }
 
-static int is_under(struct vfsmount *mnt, struct dentry *dentry,
-		    struct path *path)
-{
-	if (mnt != path->mnt) {
-		for (;;) {
-			if (mnt->mnt_parent == mnt)
-				return 0;
-			if (mnt->mnt_parent == path->mnt)
-				break;
-			mnt = mnt->mnt_parent;
-		}
-		dentry = mnt->mnt_mountpoint;
-	}
-	return is_subdir(dentry, path->dentry);
-}
-
 int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
 {
 
@@ -638,13 +618,17 @@ void audit_put_tree(struct audit_tree *tree)
 	put_tree(tree);
 }
 
+static int tag_mount(struct vfsmount *mnt, void *arg)
+{
+	return tag_chunk(mnt->mnt_root->d_inode, arg);
+}
+
 /* called with audit_filter_mutex */
 int audit_add_tree_rule(struct audit_krule *rule)
 {
 	struct audit_tree *seed = rule->tree, *tree;
 	struct path path;
-	struct vfsmount *mnt, *p;
-	struct list_head list;
+	struct vfsmount *mnt;
 	int err;
 
 	list_for_each_entry(tree, &tree_list, list) {
@@ -670,16 +654,9 @@ int audit_add_tree_rule(struct audit_krule *rule)
 		err = -ENOMEM;
 		goto Err;
 	}
-	list_add_tail(&list, &mnt->mnt_list);
 
 	get_tree(tree);
-	list_for_each_entry(p, &list, mnt_list) {
-		err = tag_chunk(p->mnt_root->d_inode, tree);
-		if (err)
-			break;
-	}
-
-	list_del(&list);
+	err = iterate_mounts(tag_mount, tree, mnt);
 	drop_collected_mounts(mnt);
 
 	if (!err) {
@@ -714,31 +691,23 @@ int audit_tag_tree(char *old, char *new)
 {
 	struct list_head cursor, barrier;
 	int failed = 0;
-	struct path path;
+	struct path path1, path2;
 	struct vfsmount *tagged;
-	struct list_head list;
-	struct vfsmount *mnt;
-	struct dentry *dentry;
 	int err;
 
-	err = kern_path(new, 0, &path);
+	err = kern_path(new, 0, &path2);
 	if (err)
 		return err;
-	tagged = collect_mounts(&path);
-	path_put(&path);
+	tagged = collect_mounts(&path2);
+	path_put(&path2);
 	if (!tagged)
 		return -ENOMEM;
 
-	err = kern_path(old, 0, &path);
+	err = kern_path(old, 0, &path1);
 	if (err) {
 		drop_collected_mounts(tagged);
 		return err;
 	}
-	mnt = mntget(path.mnt);
-	dentry = dget(path.dentry);
-	path_put(&path);
-
-	list_add_tail(&list, &tagged->mnt_list);
 
 	mutex_lock(&audit_filter_mutex);
 	list_add(&barrier, &tree_list);
@@ -746,7 +715,7 @@ int audit_tag_tree(char *old, char *new)
 
 	while (cursor.next != &tree_list) {
 		struct audit_tree *tree;
-		struct vfsmount *p;
+		int good_one = 0;
 
 		tree = container_of(cursor.next, struct audit_tree, list);
 		get_tree(tree);
@@ -754,30 +723,19 @@ int audit_tag_tree(char *old, char *new)
 		list_add(&cursor, &tree->list);
 		mutex_unlock(&audit_filter_mutex);
 
-		err = kern_path(tree->pathname, 0, &path);
-		if (err) {
-			put_tree(tree);
-			mutex_lock(&audit_filter_mutex);
-			continue;
+		err = kern_path(tree->pathname, 0, &path2);
+		if (!err) {
+			good_one = path_is_under(&path1, &path2);
+			path_put(&path2);
 		}
 
-		spin_lock(&vfsmount_lock);
-		if (!is_under(mnt, dentry, &path)) {
-			spin_unlock(&vfsmount_lock);
-			path_put(&path);
+		if (!good_one) {
 			put_tree(tree);
 			mutex_lock(&audit_filter_mutex);
 			continue;
 		}
-		spin_unlock(&vfsmount_lock);
-		path_put(&path);
-
-		list_for_each_entry(p, &list, mnt_list) {
-			failed = tag_chunk(p->mnt_root->d_inode, tree);
-			if (failed)
-				break;
-		}
 
+		failed = iterate_mounts(tag_mount, tree, tagged);
 		if (failed) {
 			put_tree(tree);
 			mutex_lock(&audit_filter_mutex);
@@ -818,10 +776,8 @@ int audit_tag_tree(char *old, char *new)
 	}
 	list_del(&barrier);
 	list_del(&cursor);
-	list_del(&list);
 	mutex_unlock(&audit_filter_mutex);
-	dput(dentry);
-	mntput(mnt);
+	path_put(&path1);
 	drop_collected_mounts(tagged);
 	return failed;
 }
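The audit_tree.c changes above replace three open-coded walks of a collected mount list with the VFS helper iterate_mounts(), which applies a callback to a collected mount and everything mounted below it, stopping at the first non-zero return. A minimal sketch of that callback pattern; count_mounts() and count_collected() are illustrative names, not part of this patch:

/* Sketch of the iterate_mounts() callback pattern used above.
 * count_mounts() and count_collected() are illustrative only. */
#include <linux/mount.h>

static int count_mounts(struct vfsmount *mnt, void *arg)
{
	int *count = arg;

	(*count)++;
	return 0;	/* returning non-zero would stop the walk */
}

static int count_collected(struct vfsmount *root)
{
	int count = 0;

	/* visits 'root' and every mount collected below it */
	iterate_mounts(count_mounts, &count, root);
	return count;
}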
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fc0f928167e7..f3a461c0970a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1988,7 +1988,6 @@ void __audit_inode(const char *name, const struct dentry *dentry)
 
 /**
  * audit_inode_child - collect inode info for created/removed objects
- * @dname: inode's dentry name
  * @dentry: dentry being audited
  * @parent: inode of dentry parent
  *
@@ -2000,13 +1999,14 @@ void __audit_inode(const char *name, const struct dentry *dentry)
  * must be hooked prior, in order to capture the target inode during
  * unsuccessful attempts.
  */
-void __audit_inode_child(const char *dname, const struct dentry *dentry,
+void __audit_inode_child(const struct dentry *dentry,
 			 const struct inode *parent)
 {
 	int idx;
 	struct audit_context *context = current->audit_context;
 	const char *found_parent = NULL, *found_child = NULL;
 	const struct inode *inode = dentry->d_inode;
+	const char *dname = dentry->d_name.name;
 	int dirlen = 0;
 
 	if (!context->in_syscall)
@@ -2014,9 +2014,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
 
 	if (inode)
 		handle_one(inode);
-	/* determine matching parent */
-	if (!dname)
-		goto add_names;
 
 	/* parent is more likely, look for it first */
 	for (idx = 0; idx < context->name_count; idx++) {
diff --git a/kernel/capability.c b/kernel/capability.c
index 7f876e60521f..9e4697e9b276 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 	if (pid && (pid != task_pid_vnr(current))) {
 		struct task_struct *target;
 
-		read_lock(&tasklist_lock);
+		rcu_read_lock();
 
 		target = find_task_by_vpid(pid);
 		if (!target)
@@ -143,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
 		else
 			ret = security_capget(target, pEp, pIp, pPp);
 
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 	} else
 		ret = security_capget(current, pEp, pIp, pPp);
 
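find_task_by_vpid() only requires the RCU read lock, so the capability change above swaps the tasklist_lock reader for an RCU read-side critical section. The general shape of such a lookup, as a sketch (inspect_task() is an illustrative placeholder):

/* Sketch of the RCU-protected task lookup pattern adopted above. */
#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static int inspect_task(pid_t pid)
{
	struct task_struct *task;
	int ret = -ESRCH;

	rcu_read_lock();
	task = find_task_by_vpid(pid);
	if (task)
		ret = 0;	/* look at 'task' only inside the RCU section */
	rcu_read_unlock();	/* 'task' must not be dereferenced past here
				 * unless an extra reference was taken */
	return ret;
}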
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4fd90e129772..ef909a329750 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4,6 +4,10 @@
  * Based originally on the cpuset system, extracted by Paul Menage
  * Copyright (C) 2006 Google, Inc
  *
+ * Notifications support
+ * Copyright (C) 2009 Nokia Corporation
+ * Author: Kirill A. Shutemov
+ *
  * Copyright notices from the original cpuset code:
  * --------------------------------------------------
  * Copyright (C) 2003 BULL SA.
@@ -44,6 +48,7 @@
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
+#include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
@@ -52,15 +57,21 @@
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
+#include <linux/eventfd.h>
+#include <linux/poll.h>
 
 #include <asm/atomic.h>
 
 static DEFINE_MUTEX(cgroup_mutex);
 
-/* Generate an array of cgroup subsystem pointers */
+/*
+ * Generate an array of cgroup subsystem pointers. At boot time, this is
+ * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
+ * registered after that. The mutable section of this array is protected by
+ * cgroup_mutex.
+ */
 #define SUBSYS(_x) &_x ## _subsys,
-
-static struct cgroup_subsys *subsys[] = {
+static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };
 
@@ -147,6 +158,35 @@ struct css_id {
 	unsigned short stack[0]; /* Array of Length (depth+1) */
 };
 
+/*
+ * cgroup_event represents events which userspace want to recieve.
+ */
+struct cgroup_event {
+	/*
+	 * Cgroup which the event belongs to.
+	 */
+	struct cgroup *cgrp;
+	/*
+	 * Control file which the event associated.
+	 */
+	struct cftype *cft;
+	/*
+	 * eventfd to signal userspace about the event.
+	 */
+	struct eventfd_ctx *eventfd;
+	/*
+	 * Each of these stored in a list by the cgroup.
+	 */
+	struct list_head list;
+	/*
+	 * All fields below needed to unregister event when
+	 * userspace closes eventfd.
+	 */
+	poll_table pt;
+	wait_queue_head_t *wqh;
+	wait_queue_t wait;
+	struct work_struct remove;
+};
 
 /* The list of hierarchy roots */
 
@@ -250,7 +290,8 @@ struct cg_cgroup_link {
 static struct css_set init_css_set;
 static struct cg_cgroup_link init_css_set_link;
 
-static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+static int cgroup_init_idr(struct cgroup_subsys *ss,
+			   struct cgroup_subsys_state *css);
 
 /* css_set_lock protects the list of css_set objects, and the
  * chain of tasks off each css_set. Nests outside task->alloc_lock
@@ -448,8 +489,11 @@ static struct css_set *find_existing_css_set(
 	struct hlist_node *node;
 	struct css_set *cg;
 
-	/* Built the set of subsystem state objects that we want to
-	 * see in the new css_set */
+	/*
+	 * Build the set of subsystem state objects that we want to see in the
+	 * new css_set. while subsystems can change globally, the entries here
+	 * won't change, so no need for locking.
+	 */
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		if (root->subsys_bits & (1UL << i)) {
 			/* Subsystem is in this hierarchy. So we want
@@ -696,6 +740,7 @@ void cgroup_lock(void)
 {
 	mutex_lock(&cgroup_mutex);
 }
+EXPORT_SYMBOL_GPL(cgroup_lock);
 
 /**
  * cgroup_unlock - release lock on cgroup changes
@@ -706,6 +751,7 @@ void cgroup_unlock(void)
 {
 	mutex_unlock(&cgroup_mutex);
 }
+EXPORT_SYMBOL_GPL(cgroup_unlock);
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
@@ -757,6 +803,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 		if (ret)
 			break;
 	}
+
 	return ret;
 }
 
@@ -884,7 +931,11 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
 	css_put(css);
 }
 
-
+/*
+ * Call with cgroup_mutex held. Drops reference counts on modules, including
+ * any duplicate ones that parse_cgroupfs_options took. If this function
+ * returns an error, no reference counts are touched.
+ */
 static int rebind_subsystems(struct cgroupfs_root *root,
 			     unsigned long final_bits)
 {
@@ -892,6 +943,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 	struct cgroup *cgrp = &root->top_cgroup;
 	int i;
 
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+
 	removed_bits = root->actual_subsys_bits & ~final_bits;
 	added_bits = final_bits & ~root->actual_subsys_bits;
 	/* Check that any added subsystems are currently free */
@@ -900,6 +953,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		struct cgroup_subsys *ss = subsys[i];
 		if (!(bit & added_bits))
 			continue;
+		/*
+		 * Nobody should tell us to do a subsys that doesn't exist:
+		 * parse_cgroupfs_options should catch that case and refcounts
+		 * ensure that subsystems won't disappear once selected.
+		 */
+		BUG_ON(ss == NULL);
 		if (ss->root != &rootnode) {
 			/* Subsystem isn't free */
 			return -EBUSY;
@@ -919,6 +978,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		unsigned long bit = 1UL << i;
 		if (bit & added_bits) {
 			/* We're binding this subsystem to this hierarchy */
+			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i]);
 			BUG_ON(!dummytop->subsys[i]);
 			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -930,8 +990,10 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			if (ss->bind)
 				ss->bind(ss, cgrp);
 			mutex_unlock(&ss->hierarchy_mutex);
+			/* refcount was already taken, and we're keeping it */
 		} else if (bit & removed_bits) {
 			/* We're removing this subsystem */
+			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
 			mutex_lock(&ss->hierarchy_mutex);
@@ -942,9 +1004,20 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			subsys[i]->root = &rootnode;
 			list_move(&ss->sibling, &rootnode.subsys_list);
 			mutex_unlock(&ss->hierarchy_mutex);
+			/* subsystem is now free - drop reference on module */
+			module_put(ss->module);
 		} else if (bit & final_bits) {
 			/* Subsystem state should already exist */
+			BUG_ON(ss == NULL);
 			BUG_ON(!cgrp->subsys[i]);
+			/*
+			 * a refcount was taken, but we already had one, so
+			 * drop the extra reference.
+			 */
+			module_put(ss->module);
+#ifdef CONFIG_MODULE_UNLOAD
+			BUG_ON(ss->module && !module_refcount(ss->module));
+#endif
 		} else {
 			/* Subsystem state shouldn't exist */
 			BUG_ON(cgrp->subsys[i]);
@@ -986,13 +1059,20 @@ struct cgroup_sb_opts {
 
 };
 
-/* Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. */
-static int parse_cgroupfs_options(char *data,
-				  struct cgroup_sb_opts *opts)
+/*
+ * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
+ * with cgroup_mutex held to protect the subsys[] array. This function takes
+ * refcounts on subsystems to be used, unless it returns error, in which case
+ * no refcounts are taken.
+ */
+static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
 	char *token, *o = data ?: "all";
 	unsigned long mask = (unsigned long)-1;
+	int i;
+	bool module_pin_failed = false;
+
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 
 #ifdef CONFIG_CPUSETS
 	mask = ~(1UL << cpuset_subsys_id);
@@ -1005,10 +1085,11 @@ static int parse_cgroupfs_options(char *data,
 		return -EINVAL;
 	if (!strcmp(token, "all")) {
 		/* Add all non-disabled subsystems */
-		int i;
 		opts->subsys_bits = 0;
 		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
+			if (ss == NULL)
+				continue;
 			if (!ss->disabled)
 				opts->subsys_bits |= 1ul << i;
 		}
@@ -1026,7 +1107,6 @@ static int parse_cgroupfs_options(char *data,
 		if (!opts->release_agent)
 			return -ENOMEM;
 	} else if (!strncmp(token, "name=", 5)) {
-		int i;
 		const char *name = token + 5;
 		/* Can't specify an empty name */
 		if (!strlen(name))
@@ -1050,9 +1130,10 @@ static int parse_cgroupfs_options(char *data,
 			return -ENOMEM;
 	} else {
 		struct cgroup_subsys *ss;
-		int i;
 		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			ss = subsys[i];
+			if (ss == NULL)
+				continue;
 			if (!strcmp(token, ss->name)) {
 				if (!ss->disabled)
 					set_bit(i, &opts->subsys_bits);
@@ -1087,9 +1168,54 @@ static int parse_cgroupfs_options(char *data,
 	if (!opts->subsys_bits && !opts->name)
 		return -EINVAL;
 
+	/*
+	 * Grab references on all the modules we'll need, so the subsystems
+	 * don't dance around before rebind_subsystems attaches them. This may
+	 * take duplicate reference counts on a subsystem that's already used,
+	 * but rebind_subsystems handles this case.
+	 */
+	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+		unsigned long bit = 1UL << i;
+
+		if (!(bit & opts->subsys_bits))
+			continue;
+		if (!try_module_get(subsys[i]->module)) {
+			module_pin_failed = true;
+			break;
+		}
+	}
+	if (module_pin_failed) {
+		/*
+		 * oops, one of the modules was going away. this means that we
+		 * raced with a module_delete call, and to the user this is
+		 * essentially a "subsystem doesn't exist" case.
+		 */
+		for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) {
+			/* drop refcounts only on the ones we took */
+			unsigned long bit = 1UL << i;
+
+			if (!(bit & opts->subsys_bits))
+				continue;
+			module_put(subsys[i]->module);
+		}
+		return -ENOENT;
+	}
+
 	return 0;
 }
 
+static void drop_parsed_module_refcounts(unsigned long subsys_bits)
+{
+	int i;
+	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) {
+		unsigned long bit = 1UL << i;
+
+		if (!(bit & subsys_bits))
+			continue;
+		module_put(subsys[i]->module);
+	}
+}
+
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 {
 	int ret = 0;
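parse_cgroupfs_options() now pins every selected modular subsystem with try_module_get() and rolls the references back if any module is mid-unload, while drop_parsed_module_refcounts() is the matching bulk release. The same pin-all-or-unwind idiom in isolation; pin_modules() and the mods array are hypothetical, not part of the patch:

/* Generic sketch of the try_module_get()/module_put() pin-or-unwind
 * idiom used by parse_cgroupfs_options() above. */
#include <linux/errno.h>
#include <linux/module.h>

static int pin_modules(struct module **mods, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (!try_module_get(mods[i])) {
			/* raced with unload: drop only the refs we took */
			while (--i >= 0)
				module_put(mods[i]);
			return -ENOENT;
		}
	}
	return 0;	/* caller must module_put() each entry later */
}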
@@ -1106,21 +1232,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
-	/* Don't allow flags to change at remount */
-	if (opts.flags != root->flags) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-
-	/* Don't allow name to change at remount */
-	if (opts.name && strcmp(opts.name, root->name)) {
+	/* Don't allow flags or name to change at remount */
+	if (opts.flags != root->flags ||
+	    (opts.name && strcmp(opts.name, root->name))) {
 		ret = -EINVAL;
+		drop_parsed_module_refcounts(opts.subsys_bits);
 		goto out_unlock;
 	}
 
 	ret = rebind_subsystems(root, opts.subsys_bits);
-	if (ret)
+	if (ret) {
+		drop_parsed_module_refcounts(opts.subsys_bits);
 		goto out_unlock;
+	}
 
 	/* (re)populate subsystem files */
 	cgroup_populate_dir(cgrp);
@@ -1151,6 +1275,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
+	INIT_LIST_HEAD(&cgrp->event_list);
+	spin_lock_init(&cgrp->event_list_lock);
 }
 
 static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1306,7 +1432,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	struct cgroupfs_root *new_root;
 
 	/* First find the desired set of subsystems */
+	mutex_lock(&cgroup_mutex);
 	ret = parse_cgroupfs_options(data, &opts);
+	mutex_unlock(&cgroup_mutex);
 	if (ret)
 		goto out_err;
 
@@ -1317,7 +1445,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	new_root = cgroup_root_from_opts(&opts);
 	if (IS_ERR(new_root)) {
 		ret = PTR_ERR(new_root);
-		goto out_err;
+		goto drop_modules;
 	}
 	opts.new_root = new_root;
 
@@ -1326,7 +1454,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
 		cgroup_drop_root(opts.new_root);
-		goto out_err;
+		goto drop_modules;
 	}
 
 	root = sb->s_fs_info;
@@ -1382,6 +1510,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 			free_cg_links(&tmp_cg_links);
 			goto drop_new_super;
 		}
+		/*
+		 * There must be no failure case after here, since rebinding
+		 * takes care of subsystems' refcounts, which are explicitly
+		 * dropped in the failure exit path.
+		 */
 
 		/* EBUSY should be the only error here */
 		BUG_ON(ret);
@@ -1420,6 +1553,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		 * any) is not needed
 		 */
 		cgroup_drop_root(opts.new_root);
+		/* no subsys rebinding, so refcounts don't change */
+		drop_parsed_module_refcounts(opts.subsys_bits);
 	}
 
 	simple_set_mnt(mnt, sb);
@@ -1429,6 +1564,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 
 drop_new_super:
 	deactivate_locked_super(sb);
+drop_modules:
+	drop_parsed_module_refcounts(opts.subsys_bits);
 out_err:
 	kfree(opts.release_agent);
 	kfree(opts.name);
@@ -1542,6 +1679,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	memmove(buf, start, buf + buflen - start);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(cgroup_path);
 
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
@@ -1554,7 +1692,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
 	int retval = 0;
-	struct cgroup_subsys *ss;
+	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
 	struct css_set *cg;
 	struct css_set *newcg;
@@ -1568,8 +1706,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
 			retval = ss->can_attach(ss, cgrp, tsk, false);
-			if (retval)
-				return retval;
+			if (retval) {
+				/*
+				 * Remember on which subsystem the can_attach()
+				 * failed, so that we only call cancel_attach()
+				 * against the subsystems whose can_attach()
+				 * succeeded. (See below)
+				 */
+				failed_ss = ss;
+				goto out;
+			}
 		}
 	}
 
@@ -1583,14 +1729,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	 */
 	newcg = find_css_set(cg, cgrp);
 	put_css_set(cg);
-	if (!newcg)
-		return -ENOMEM;
+	if (!newcg) {
+		retval = -ENOMEM;
+		goto out;
+	}
 
 	task_lock(tsk);
 	if (tsk->flags & PF_EXITING) {
 		task_unlock(tsk);
 		put_css_set(newcg);
-		return -ESRCH;
+		retval = -ESRCH;
+		goto out;
 	}
 	rcu_assign_pointer(tsk->cgroups, newcg);
 	task_unlock(tsk);
@@ -1616,7 +1765,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	 * is no longer empty.
 	 */
 	cgroup_wakeup_rmdir_waiter(cgrp);
-	return 0;
+out:
+	if (retval) {
+		for_each_subsys(root, ss) {
+			if (ss == failed_ss)
+				/*
+				 * This subsystem was the one that failed the
+				 * can_attach() check earlier, so we don't need
+				 * to call cancel_attach() against it or any
+				 * remaining subsystems.
+				 */
+				break;
+			if (ss->cancel_attach)
+				ss->cancel_attach(ss, cgrp, tsk, false);
+		}
+	}
+	return retval;
 }
 
 /*
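cgroup_attach_task() now implements a two-phase protocol: every subsystem's can_attach() must pass before the task is moved, and on failure cancel_attach() is called only on the subsystems that had already agreed. A subsystem participating in this protocol would look roughly like the following; the foo_* helpers are placeholders, not kernel APIs:

/* Hypothetical subsystem illustrating the can_attach()/cancel_attach()
 * pairing that the rollback loop above depends on. */
#include <linux/cgroup.h>

static bool foo_has_room(struct cgroup *cgrp) { return true; }	/* stub */
static void foo_reserve(struct cgroup *cgrp) { }		/* stub */
static void foo_unreserve(struct cgroup *cgrp) { }		/* stub */

static int foo_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			  struct task_struct *tsk, bool threadgroup)
{
	if (!foo_has_room(cgrp))	/* placeholder admission check */
		return -ENOSPC;		/* vetoes the whole attach */
	foo_reserve(cgrp);		/* speculative charge */
	return 0;
}

static void foo_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			      struct task_struct *tsk, bool threadgroup)
{
	foo_unreserve(cgrp);		/* undo the speculative charge */
}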
| @@ -1682,6 +1846,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp) | |||
| 1682 | } | 1846 | } |
| 1683 | return true; | 1847 | return true; |
| 1684 | } | 1848 | } |
| 1849 | EXPORT_SYMBOL_GPL(cgroup_lock_live_group); | ||
| 1685 | 1850 | ||
| 1686 | static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, | 1851 | static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, |
| 1687 | const char *buffer) | 1852 | const char *buffer) |
| @@ -1950,6 +2115,16 @@ static const struct inode_operations cgroup_dir_inode_operations = { | |||
| 1950 | .rename = cgroup_rename, | 2115 | .rename = cgroup_rename, |
| 1951 | }; | 2116 | }; |
| 1952 | 2117 | ||
| 2118 | /* | ||
| 2119 | * Check if a file is a control file | ||
| 2120 | */ | ||
| 2121 | static inline struct cftype *__file_cft(struct file *file) | ||
| 2122 | { | ||
| 2123 | if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) | ||
| 2124 | return ERR_PTR(-EINVAL); | ||
| 2125 | return __d_cft(file->f_dentry); | ||
| 2126 | } | ||
| 2127 | |||
| 1953 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, | 2128 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, |
| 1954 | struct super_block *sb) | 2129 | struct super_block *sb) |
| 1955 | { | 2130 | { |
| @@ -2069,6 +2244,7 @@ int cgroup_add_file(struct cgroup *cgrp, | |||
| 2069 | error = PTR_ERR(dentry); | 2244 | error = PTR_ERR(dentry); |
| 2070 | return error; | 2245 | return error; |
| 2071 | } | 2246 | } |
| 2247 | EXPORT_SYMBOL_GPL(cgroup_add_file); | ||
| 2072 | 2248 | ||
| 2073 | int cgroup_add_files(struct cgroup *cgrp, | 2249 | int cgroup_add_files(struct cgroup *cgrp, |
| 2074 | struct cgroup_subsys *subsys, | 2250 | struct cgroup_subsys *subsys, |
| @@ -2083,6 +2259,7 @@ int cgroup_add_files(struct cgroup *cgrp, | |||
| 2083 | } | 2259 | } |
| 2084 | return 0; | 2260 | return 0; |
| 2085 | } | 2261 | } |
| 2262 | EXPORT_SYMBOL_GPL(cgroup_add_files); | ||
| 2086 | 2263 | ||
| 2087 | /** | 2264 | /** |
| 2088 | * cgroup_task_count - count the number of tasks in a cgroup. | 2265 | * cgroup_task_count - count the number of tasks in a cgroup. |
| @@ -2468,7 +2645,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
| 2468 | { | 2645 | { |
| 2469 | struct cgroup_pidlist *l; | 2646 | struct cgroup_pidlist *l; |
| 2470 | /* don't need task_nsproxy() if we're looking at ourself */ | 2647 | /* don't need task_nsproxy() if we're looking at ourself */ |
| 2471 | struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); | 2648 | struct pid_namespace *ns = current->nsproxy->pid_ns; |
| 2649 | |||
| 2472 | /* | 2650 | /* |
| 2473 | * We can't drop the pidlist_mutex before taking the l->mutex in case | 2651 | * We can't drop the pidlist_mutex before taking the l->mutex in case |
| 2474 | * the last ref-holder is trying to remove l from the list at the same | 2652 | * the last ref-holder is trying to remove l from the list at the same |
| @@ -2478,8 +2656,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
| 2478 | mutex_lock(&cgrp->pidlist_mutex); | 2656 | mutex_lock(&cgrp->pidlist_mutex); |
| 2479 | list_for_each_entry(l, &cgrp->pidlists, links) { | 2657 | list_for_each_entry(l, &cgrp->pidlists, links) { |
| 2480 | if (l->key.type == type && l->key.ns == ns) { | 2658 | if (l->key.type == type && l->key.ns == ns) { |
| 2481 | /* found a matching list - drop the extra refcount */ | ||
| 2482 | put_pid_ns(ns); | ||
| 2483 | /* make sure l doesn't vanish out from under us */ | 2659 | /* make sure l doesn't vanish out from under us */ |
| 2484 | down_write(&l->mutex); | 2660 | down_write(&l->mutex); |
| 2485 | mutex_unlock(&cgrp->pidlist_mutex); | 2661 | mutex_unlock(&cgrp->pidlist_mutex); |
| @@ -2490,13 +2666,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, | |||
| 2490 | l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); | 2666 | l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); |
| 2491 | if (!l) { | 2667 | if (!l) { |
| 2492 | mutex_unlock(&cgrp->pidlist_mutex); | 2668 | mutex_unlock(&cgrp->pidlist_mutex); |
| 2493 | put_pid_ns(ns); | ||
| 2494 | return l; | 2669 | return l; |
| 2495 | } | 2670 | } |
| 2496 | init_rwsem(&l->mutex); | 2671 | init_rwsem(&l->mutex); |
| 2497 | down_write(&l->mutex); | 2672 | down_write(&l->mutex); |
| 2498 | l->key.type = type; | 2673 | l->key.type = type; |
| 2499 | l->key.ns = ns; | 2674 | l->key.ns = get_pid_ns(ns); |
| 2500 | l->use_count = 0; /* don't increment here */ | 2675 | l->use_count = 0; /* don't increment here */ |
| 2501 | l->list = NULL; | 2676 | l->list = NULL; |
| 2502 | l->owner = cgrp; | 2677 | l->owner = cgrp; |
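The pidlist change above fixes a reference imbalance by taking the pid-namespace reference only where a newly created list actually stores the pointer; lookups that hit an existing list no longer grab and then drop a spare reference. The underlying rule, as a sketch (struct pidlist_like is illustrative only):

/* Sketch of the refcount placement the fix above applies: acquire the
 * reference where the pointer is stored, release it where it is dropped. */
#include <linux/pid_namespace.h>

struct pidlist_like {
	struct pid_namespace *ns;
};

static void pidlist_like_init(struct pidlist_like *l, struct pid_namespace *ns)
{
	l->ns = get_pid_ns(ns);		/* one get per stored pointer */
}

static void pidlist_like_destroy(struct pidlist_like *l)
{
	put_pid_ns(l->ns);		/* balanced put on teardown */
}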
@@ -2804,6 +2979,174 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
 }
 
 /*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void cgroup_event_remove(struct work_struct *work)
+{
+	struct cgroup_event *event = container_of(work, struct cgroup_event,
+			remove);
+	struct cgroup *cgrp = event->cgrp;
+
+	/* TODO: check return code */
+	event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+
+	eventfd_ctx_put(event->eventfd);
+	kfree(event);
+	dput(cgrp->dentry);
+}
+
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
+		int sync, void *key)
+{
+	struct cgroup_event *event = container_of(wait,
+			struct cgroup_event, wait);
+	struct cgroup *cgrp = event->cgrp;
+	unsigned long flags = (unsigned long)key;
+
+	if (flags & POLLHUP) {
+		remove_wait_queue_locked(event->wqh, &event->wait);
+		spin_lock(&cgrp->event_list_lock);
+		list_del(&event->list);
+		spin_unlock(&cgrp->event_list_lock);
+		/*
+		 * We are in atomic context, but cgroup_event_remove() may
+		 * sleep, so we have to call it in workqueue.
+		 */
+		schedule_work(&event->remove);
+	}
+
+	return 0;
+}
+
+static void cgroup_event_ptable_queue_proc(struct file *file,
+		wait_queue_head_t *wqh, poll_table *pt)
+{
+	struct cgroup_event *event = container_of(pt,
+			struct cgroup_event, pt);
+
+	event->wqh = wqh;
+	add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
+				      const char *buffer)
+{
+	struct cgroup_event *event = NULL;
+	unsigned int efd, cfd;
+	struct file *efile = NULL;
+	struct file *cfile = NULL;
+	char *endp;
+	int ret;
+
+	efd = simple_strtoul(buffer, &endp, 10);
+	if (*endp != ' ')
+		return -EINVAL;
+	buffer = endp + 1;
+
+	cfd = simple_strtoul(buffer, &endp, 10);
+	if ((*endp != ' ') && (*endp != '\0'))
+		return -EINVAL;
+	buffer = endp + 1;
+
+	event = kzalloc(sizeof(*event), GFP_KERNEL);
+	if (!event)
+		return -ENOMEM;
+	event->cgrp = cgrp;
+	INIT_LIST_HEAD(&event->list);
+	init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
+	init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
+	INIT_WORK(&event->remove, cgroup_event_remove);
+
+	efile = eventfd_fget(efd);
+	if (IS_ERR(efile)) {
+		ret = PTR_ERR(efile);
+		goto fail;
+	}
+
+	event->eventfd = eventfd_ctx_fileget(efile);
+	if (IS_ERR(event->eventfd)) {
+		ret = PTR_ERR(event->eventfd);
+		goto fail;
+	}
+
+	cfile = fget(cfd);
+	if (!cfile) {
+		ret = -EBADF;
+		goto fail;
+	}
+
+	/* the process need read permission on control file */
+	ret = file_permission(cfile, MAY_READ);
+	if (ret < 0)
+		goto fail;
+
+	event->cft = __file_cft(cfile);
+	if (IS_ERR(event->cft)) {
+		ret = PTR_ERR(event->cft);
+		goto fail;
+	}
+
+	if (!event->cft->register_event || !event->cft->unregister_event) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	ret = event->cft->register_event(cgrp, event->cft,
+			event->eventfd, buffer);
+	if (ret)
+		goto fail;
+
+	if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
+		event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+		ret = 0;
+		goto fail;
+	}
+
+	/*
+	 * Events should be removed after rmdir of cgroup directory, but before
+	 * destroying subsystem state objects. Let's take reference to cgroup
+	 * directory dentry to do that.
+	 */
+	dget(cgrp->dentry);
+
+	spin_lock(&cgrp->event_list_lock);
+	list_add(&event->list, &cgrp->event_list);
+	spin_unlock(&cgrp->event_list_lock);
+
+	fput(cfile);
+	fput(efile);
+
+	return 0;
+
+fail:
+	if (cfile)
+		fput(cfile);
+
+	if (event && event->eventfd && !IS_ERR(event->eventfd))
+		eventfd_ctx_put(event->eventfd);
+
+	if (!IS_ERR_OR_NULL(efile))
+		fput(efile);
+
+	kfree(event);
+
+	return ret;
+}
+
+/*
  * for the common functions, 'private' gives the type of file
  */
 /* for hysterical raisins, we can't put this on the older files */
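From userspace, the cgroup.event_control file defined above is driven by writing '<event_fd> <control_fd> <args>' and then blocking on the eventfd; the kernel signals the eventfd when the event fires or when the cgroup is removed. A sketch of a client follows; the mount point, the watched memory.usage_in_bytes file, and the threshold argument are assumptions about a controller that implements register_event(), not part of this patch:

/* Userspace sketch of the cgroup.event_control protocol defined above. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	int cfd = open("/cgroup/foo/memory.usage_in_bytes", O_RDONLY);
	int ecfd = open("/cgroup/foo/cgroup.event_control", O_WRONLY);
	char buf[64];
	uint64_t ticks;

	if (efd < 0 || cfd < 0 || ecfd < 0)
		return 1;
	/* "<event_fd> <control_fd> <args>"; <args> go to the controller */
	snprintf(buf, sizeof(buf), "%d %d 4194304", efd, cfd);
	if (write(ecfd, buf, strlen(buf)) < 0)
		return 1;
	if (read(efd, &ticks, sizeof(ticks)) != sizeof(ticks))
		return 1;
	/* woke up: threshold crossed or the cgroup was removed */
	return 0;
}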
| @@ -2828,6 +3171,11 @@ static struct cftype files[] = { | |||
| 2828 | .read_u64 = cgroup_read_notify_on_release, | 3171 | .read_u64 = cgroup_read_notify_on_release, |
| 2829 | .write_u64 = cgroup_write_notify_on_release, | 3172 | .write_u64 = cgroup_write_notify_on_release, |
| 2830 | }, | 3173 | }, |
| 3174 | { | ||
| 3175 | .name = CGROUP_FILE_GENERIC_PREFIX "event_control", | ||
| 3176 | .write_string = cgroup_write_event_control, | ||
| 3177 | .mode = S_IWUGO, | ||
| 3178 | }, | ||
| 2831 | }; | 3179 | }; |
| 2832 | 3180 | ||
| 2833 | static struct cftype cft_release_agent = { | 3181 | static struct cftype cft_release_agent = { |
| @@ -2892,8 +3240,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root) | |||
| 2892 | /* We need to take each hierarchy_mutex in a consistent order */ | 3240 | /* We need to take each hierarchy_mutex in a consistent order */ |
| 2893 | int i; | 3241 | int i; |
| 2894 | 3242 | ||
| 3243 | /* | ||
| 3244 | * No worry about a race with rebind_subsystems that might mess up the | ||
| 3245 | * locking order, since both parties are under cgroup_mutex. | ||
| 3246 | */ | ||
| 2895 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3247 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 2896 | struct cgroup_subsys *ss = subsys[i]; | 3248 | struct cgroup_subsys *ss = subsys[i]; |
| 3249 | if (ss == NULL) | ||
| 3250 | continue; | ||
| 2897 | if (ss->root == root) | 3251 | if (ss->root == root) |
| 2898 | mutex_lock(&ss->hierarchy_mutex); | 3252 | mutex_lock(&ss->hierarchy_mutex); |
| 2899 | } | 3253 | } |
| @@ -2905,6 +3259,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) | |||
| 2905 | 3259 | ||
| 2906 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3260 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 2907 | struct cgroup_subsys *ss = subsys[i]; | 3261 | struct cgroup_subsys *ss = subsys[i]; |
| 3262 | if (ss == NULL) | ||
| 3263 | continue; | ||
| 2908 | if (ss->root == root) | 3264 | if (ss->root == root) |
| 2909 | mutex_unlock(&ss->hierarchy_mutex); | 3265 | mutex_unlock(&ss->hierarchy_mutex); |
| 2910 | } | 3266 | } |
| @@ -3028,11 +3384,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) | |||
| 3028 | * synchronization other than RCU, and the subsystem linked | 3384 | * synchronization other than RCU, and the subsystem linked |
| 3029 | * list isn't RCU-safe */ | 3385 | * list isn't RCU-safe */ |
| 3030 | int i; | 3386 | int i; |
| 3387 | /* | ||
| 3388 | * We won't need to lock the subsys array, because the subsystems | ||
| 3389 | * we're concerned about aren't going anywhere since our cgroup root | ||
| 3390 | * has a reference on them. | ||
| 3391 | */ | ||
| 3031 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3392 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 3032 | struct cgroup_subsys *ss = subsys[i]; | 3393 | struct cgroup_subsys *ss = subsys[i]; |
| 3033 | struct cgroup_subsys_state *css; | 3394 | struct cgroup_subsys_state *css; |
| 3034 | /* Skip subsystems not in this hierarchy */ | 3395 | /* Skip subsystems not present or not in this hierarchy */ |
| 3035 | if (ss->root != cgrp->root) | 3396 | if (ss == NULL || ss->root != cgrp->root) |
| 3036 | continue; | 3397 | continue; |
| 3037 | css = cgrp->subsys[ss->subsys_id]; | 3398 | css = cgrp->subsys[ss->subsys_id]; |
| 3038 | /* When called from check_for_release() it's possible | 3399 | /* When called from check_for_release() it's possible |
| @@ -3106,6 +3467,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 3106 | struct dentry *d; | 3467 | struct dentry *d; |
| 3107 | struct cgroup *parent; | 3468 | struct cgroup *parent; |
| 3108 | DEFINE_WAIT(wait); | 3469 | DEFINE_WAIT(wait); |
| 3470 | struct cgroup_event *event, *tmp; | ||
| 3109 | int ret; | 3471 | int ret; |
| 3110 | 3472 | ||
| 3111 | /* the vfs holds both inode->i_mutex already */ | 3473 | /* the vfs holds both inode->i_mutex already */ |
| @@ -3189,6 +3551,20 @@ again: | |||
| 3189 | set_bit(CGRP_RELEASABLE, &parent->flags); | 3551 | set_bit(CGRP_RELEASABLE, &parent->flags); |
| 3190 | check_for_release(parent); | 3552 | check_for_release(parent); |
| 3191 | 3553 | ||
| 3554 | /* | ||
| 3555 | * Unregister events and notify userspace. | ||
| 3556 | * Notify userspace about cgroup removal only after rmdir of the cgroup | ||
| 3557 | * directory, to avoid a race between userspace and kernel space | ||
| 3558 | */ | ||
| 3559 | spin_lock(&cgrp->event_list_lock); | ||
| 3560 | list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { | ||
| 3561 | list_del(&event->list); | ||
| 3562 | remove_wait_queue(event->wqh, &event->wait); | ||
| 3563 | eventfd_signal(event->eventfd, 1); | ||
| 3564 | schedule_work(&event->remove); | ||
| 3565 | } | ||
| 3566 | spin_unlock(&cgrp->event_list_lock); | ||
| 3567 | |||
| 3192 | mutex_unlock(&cgroup_mutex); | 3568 | mutex_unlock(&cgroup_mutex); |
| 3193 | return 0; | 3569 | return 0; |
| 3194 | } | 3570 | } |
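Taken together, the event_control file added above and this rmdir hook give userspace an eventfd-based notification channel. As a rough orientation sketch (not part of the patch), a client could look like the following; the memory.usage_in_bytes threshold file, the /cgroup/foo mount path, and the threshold argument are assumptions borrowed from the memcg thresholds built on this API:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	char buf[64];
	uint64_t cnt;
	int efd = eventfd(0, 0);
	int cfd = open("/cgroup/foo/memory.usage_in_bytes", O_RDONLY);
	int ecfd = open("/cgroup/foo/cgroup.event_control", O_WRONLY);

	if (efd < 0 || cfd < 0 || ecfd < 0)
		return 1;

	/* registration format: "<event_fd> <fd of control file> <args>" */
	snprintf(buf, sizeof(buf), "%d %d 4194304", efd, cfd);
	if (write(ecfd, buf, strlen(buf)) < 0)
		return 1;

	/* blocks until the threshold fires; the rmdir path above also
	 * signals the eventfd, so waiters see the cgroup go away */
	if (read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))
		printf("event fired, count %llu\n", (unsigned long long)cnt);
	return 0;
}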
| @@ -3223,7 +3599,196 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) | |||
| 3223 | mutex_init(&ss->hierarchy_mutex); | 3599 | mutex_init(&ss->hierarchy_mutex); |
| 3224 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); | 3600 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); |
| 3225 | ss->active = 1; | 3601 | ss->active = 1; |
| 3602 | |||
| 3603 | /* this function shouldn't be used with modular subsystems, since they | ||
| 3604 | * need to register a subsys_id, among other things */ | ||
| 3605 | BUG_ON(ss->module); | ||
| 3606 | } | ||
| 3607 | |||
| 3608 | /** | ||
| 3609 | * cgroup_load_subsys: load and register a modular subsystem at runtime | ||
| 3610 | * @ss: the subsystem to load | ||
| 3611 | * | ||
| 3612 | * This function should be called in a modular subsystem's initcall. If the | ||
| 3613 | * subsystem is built as a module, it will be assigned a new subsys_id and set | ||
| 3614 | * up for use. If the subsystem is built-in anyway, work is delegated to the | ||
| 3615 | * simpler cgroup_init_subsys. | ||
| 3616 | */ | ||
| 3617 | int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | ||
| 3618 | { | ||
| 3619 | int i; | ||
| 3620 | struct cgroup_subsys_state *css; | ||
| 3621 | |||
| 3622 | /* check name and function validity */ | ||
| 3623 | if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || | ||
| 3624 | ss->create == NULL || ss->destroy == NULL) | ||
| 3625 | return -EINVAL; | ||
| 3626 | |||
| 3627 | /* | ||
| 3628 | * we don't support callbacks in modular subsystems. this check is | ||
| 3629 | * before the ss->module check for consistency; a subsystem that could | ||
| 3630 | * be a module should still have no callbacks even if the user isn't | ||
| 3631 | * compiling it as one. | ||
| 3632 | */ | ||
| 3633 | if (ss->fork || ss->exit) | ||
| 3634 | return -EINVAL; | ||
| 3635 | |||
| 3636 | /* | ||
| 3637 | * an optionally modular subsystem is built-in: we want to do nothing, | ||
| 3638 | * since cgroup_init_subsys will have already taken care of it. | ||
| 3639 | */ | ||
| 3640 | if (ss->module == NULL) { | ||
| 3641 | /* a few sanity checks */ | ||
| 3642 | BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); | ||
| 3643 | BUG_ON(subsys[ss->subsys_id] != ss); | ||
| 3644 | return 0; | ||
| 3645 | } | ||
| 3646 | |||
| 3647 | /* | ||
| 3648 | * need to register a subsys id before anything else - for example, | ||
| 3649 | * init_cgroup_css needs it. | ||
| 3650 | */ | ||
| 3651 | mutex_lock(&cgroup_mutex); | ||
| 3652 | /* find the first empty slot in the array */ | ||
| 3653 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { | ||
| 3654 | if (subsys[i] == NULL) | ||
| 3655 | break; | ||
| 3656 | } | ||
| 3657 | if (i == CGROUP_SUBSYS_COUNT) { | ||
| 3658 | /* maximum number of subsystems already registered! */ | ||
| 3659 | mutex_unlock(&cgroup_mutex); | ||
| 3660 | return -EBUSY; | ||
| 3661 | } | ||
| 3662 | /* assign ourselves the subsys_id */ | ||
| 3663 | ss->subsys_id = i; | ||
| 3664 | subsys[i] = ss; | ||
| 3665 | |||
| 3666 | /* | ||
| 3667 | * no ss->create seems to need anything important in the ss struct, so | ||
| 3668 | * this can happen first (i.e. before the rootnode attachment). | ||
| 3669 | */ | ||
| 3670 | css = ss->create(ss, dummytop); | ||
| 3671 | if (IS_ERR(css)) { | ||
| 3672 | /* failure case - need to deassign the subsys[] slot. */ | ||
| 3673 | subsys[i] = NULL; | ||
| 3674 | mutex_unlock(&cgroup_mutex); | ||
| 3675 | return PTR_ERR(css); | ||
| 3676 | } | ||
| 3677 | |||
| 3678 | list_add(&ss->sibling, &rootnode.subsys_list); | ||
| 3679 | ss->root = &rootnode; | ||
| 3680 | |||
| 3681 | /* our new subsystem will be attached to the dummy hierarchy. */ | ||
| 3682 | init_cgroup_css(css, ss, dummytop); | ||
| 3683 | /* init_idr must be after init_cgroup_css because it sets css->id. */ | ||
| 3684 | if (ss->use_id) { | ||
| 3685 | int ret = cgroup_init_idr(ss, css); | ||
| 3686 | if (ret) { | ||
| 3687 | dummytop->subsys[ss->subsys_id] = NULL; | ||
| 3688 | ss->destroy(ss, dummytop); | ||
| 3689 | subsys[i] = NULL; | ||
| 3690 | mutex_unlock(&cgroup_mutex); | ||
| 3691 | return ret; | ||
| 3692 | } | ||
| 3693 | } | ||
| 3694 | |||
| 3695 | /* | ||
| 3696 | * Now we need to entangle the css into the existing css_sets. unlike | ||
| 3697 | * in cgroup_init_subsys, there are now multiple css_sets, so each one | ||
| 3698 | * will need a new pointer to it; done by iterating the css_set_table. | ||
| 3699 | * furthermore, modifying the existing css_sets will corrupt the hash | ||
| 3700 | * table state, so each changed css_set will need its hash recomputed. | ||
| 3701 | * this is all done under the css_set_lock. | ||
| 3702 | */ | ||
| 3703 | write_lock(&css_set_lock); | ||
| 3704 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { | ||
| 3705 | struct css_set *cg; | ||
| 3706 | struct hlist_node *node, *tmp; | ||
| 3707 | struct hlist_head *bucket = &css_set_table[i], *new_bucket; | ||
| 3708 | |||
| 3709 | hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { | ||
| 3710 | /* skip entries that we already rehashed */ | ||
| 3711 | if (cg->subsys[ss->subsys_id]) | ||
| 3712 | continue; | ||
| 3713 | /* remove existing entry */ | ||
| 3714 | hlist_del(&cg->hlist); | ||
| 3715 | /* set new value */ | ||
| 3716 | cg->subsys[ss->subsys_id] = css; | ||
| 3717 | /* recompute hash and restore entry */ | ||
| 3718 | new_bucket = css_set_hash(cg->subsys); | ||
| 3719 | hlist_add_head(&cg->hlist, new_bucket); | ||
| 3720 | } | ||
| 3721 | } | ||
| 3722 | write_unlock(&css_set_lock); | ||
| 3723 | |||
| 3724 | mutex_init(&ss->hierarchy_mutex); | ||
| 3725 | lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); | ||
| 3726 | ss->active = 1; | ||
| 3727 | |||
| 3728 | /* success! */ | ||
| 3729 | mutex_unlock(&cgroup_mutex); | ||
| 3730 | return 0; | ||
| 3226 | } | 3731 | } |
| 3732 | EXPORT_SYMBOL_GPL(cgroup_load_subsys); | ||
| 3733 | |||
| 3734 | /** | ||
| 3735 | * cgroup_unload_subsys: unload a modular subsystem | ||
| 3736 | * @ss: the subsystem to unload | ||
| 3737 | * | ||
| 3738 | * This function should be called in a modular subsystem's exitcall. When this | ||
| 3739 | * function is invoked, the refcount on the subsystem's module will be 0, so | ||
| 3740 | * the subsystem will not be attached to any hierarchy. | ||
| 3741 | */ | ||
| 3742 | void cgroup_unload_subsys(struct cgroup_subsys *ss) | ||
| 3743 | { | ||
| 3744 | struct cg_cgroup_link *link; | ||
| 3745 | struct hlist_head *hhead; | ||
| 3746 | |||
| 3747 | BUG_ON(ss->module == NULL); | ||
| 3748 | |||
| 3749 | /* | ||
| 3750 | * we shouldn't be called if the subsystem is in use, and the use of | ||
| 3751 | * try_module_get in parse_cgroupfs_options should ensure that it | ||
| 3752 | * doesn't start being used while we're killing it off. | ||
| 3753 | */ | ||
| 3754 | BUG_ON(ss->root != &rootnode); | ||
| 3755 | |||
| 3756 | mutex_lock(&cgroup_mutex); | ||
| 3757 | /* deassign the subsys_id */ | ||
| 3758 | BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); | ||
| 3759 | subsys[ss->subsys_id] = NULL; | ||
| 3760 | |||
| 3761 | /* remove subsystem from rootnode's list of subsystems */ | ||
| 3762 | list_del(&ss->sibling); | ||
| 3763 | |||
| 3764 | /* | ||
| 3765 | * disentangle the css from all css_sets attached to the dummytop. as | ||
| 3766 | * in loading, we need to pay our respects to the hashtable gods. | ||
| 3767 | */ | ||
| 3768 | write_lock(&css_set_lock); | ||
| 3769 | list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { | ||
| 3770 | struct css_set *cg = link->cg; | ||
| 3771 | |||
| 3772 | hlist_del(&cg->hlist); | ||
| 3773 | BUG_ON(!cg->subsys[ss->subsys_id]); | ||
| 3774 | cg->subsys[ss->subsys_id] = NULL; | ||
| 3775 | hhead = css_set_hash(cg->subsys); | ||
| 3776 | hlist_add_head(&cg->hlist, hhead); | ||
| 3777 | } | ||
| 3778 | write_unlock(&css_set_lock); | ||
| 3779 | |||
| 3780 | /* | ||
| 3781 | * remove subsystem's css from the dummytop and free it - need to free | ||
| 3782 | * before marking as null because ss->destroy needs the cgrp->subsys | ||
| 3783 | * pointer to find their state. note that this also takes care of | ||
| 3784 | * freeing the css_id. | ||
| 3785 | */ | ||
| 3786 | ss->destroy(ss, dummytop); | ||
| 3787 | dummytop->subsys[ss->subsys_id] = NULL; | ||
| 3788 | |||
| 3789 | mutex_unlock(&cgroup_mutex); | ||
| 3790 | } | ||
| 3791 | EXPORT_SYMBOL_GPL(cgroup_unload_subsys); | ||
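To make the load/unload pair concrete, here is a minimal sketch of what a modular subsystem's glue could look like under the rules enforced above (create/destroy mandatory, no fork/exit callbacks). The "foo" subsystem, its bare css allocation, and the error handling are illustrative assumptions, not code from this patch:

#include <linux/module.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>

static struct cgroup_subsys_state *foo_create(struct cgroup_subsys *ss,
					      struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	return css ? css : ERR_PTR(-ENOMEM);
}

static void foo_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	kfree(cgrp->subsys[ss->subsys_id]);
}

struct cgroup_subsys foo_subsys = {
	.name		= "foo",
	.create		= foo_create,
	.destroy	= foo_destroy,
	/* a non-NULL ->module is what routes us through the dynamic
	 * subsys_id assignment in cgroup_load_subsys() above */
	.module		= THIS_MODULE,
};

static int __init foo_init(void)
{
	return cgroup_load_subsys(&foo_subsys);
}
module_init(foo_init);

static void __exit foo_exit(void)
{
	cgroup_unload_subsys(&foo_subsys);
}
module_exit(foo_exit);

MODULE_LICENSE("GPL");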
| 3227 | 3792 | ||
| 3228 | /** | 3793 | /** |
| 3229 | * cgroup_init_early - cgroup initialization at system boot | 3794 | * cgroup_init_early - cgroup initialization at system boot |
| @@ -3253,7 +3818,8 @@ int __init cgroup_init_early(void) | |||
| 3253 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) | 3818 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) |
| 3254 | INIT_HLIST_HEAD(&css_set_table[i]); | 3819 | INIT_HLIST_HEAD(&css_set_table[i]); |
| 3255 | 3820 | ||
| 3256 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3821 | /* at bootup time, we don't worry about modular subsystems */ |
| 3822 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
| 3257 | struct cgroup_subsys *ss = subsys[i]; | 3823 | struct cgroup_subsys *ss = subsys[i]; |
| 3258 | 3824 | ||
| 3259 | BUG_ON(!ss->name); | 3825 | BUG_ON(!ss->name); |
| @@ -3288,12 +3854,13 @@ int __init cgroup_init(void) | |||
| 3288 | if (err) | 3854 | if (err) |
| 3289 | return err; | 3855 | return err; |
| 3290 | 3856 | ||
| 3291 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3857 | /* at bootup time, we don't worry about modular subsystems */ |
| 3858 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
| 3292 | struct cgroup_subsys *ss = subsys[i]; | 3859 | struct cgroup_subsys *ss = subsys[i]; |
| 3293 | if (!ss->early_init) | 3860 | if (!ss->early_init) |
| 3294 | cgroup_init_subsys(ss); | 3861 | cgroup_init_subsys(ss); |
| 3295 | if (ss->use_id) | 3862 | if (ss->use_id) |
| 3296 | cgroup_subsys_init_idr(ss); | 3863 | cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); |
| 3297 | } | 3864 | } |
| 3298 | 3865 | ||
| 3299 | /* Add init_css_set to the hash table */ | 3866 | /* Add init_css_set to the hash table */ |
| @@ -3397,9 +3964,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) | |||
| 3397 | int i; | 3964 | int i; |
| 3398 | 3965 | ||
| 3399 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); | 3966 | seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); |
| 3967 | /* | ||
| 3968 | * ideally we don't want subsystems moving around while we do this. | ||
| 3969 | * cgroup_mutex is also necessary to guarantee an atomic snapshot of | ||
| 3970 | * subsys/hierarchy state. | ||
| 3971 | */ | ||
| 3400 | mutex_lock(&cgroup_mutex); | 3972 | mutex_lock(&cgroup_mutex); |
| 3401 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 3973 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 3402 | struct cgroup_subsys *ss = subsys[i]; | 3974 | struct cgroup_subsys *ss = subsys[i]; |
| 3975 | if (ss == NULL) | ||
| 3976 | continue; | ||
| 3403 | seq_printf(m, "%s\t%d\t%d\t%d\n", | 3977 | seq_printf(m, "%s\t%d\t%d\t%d\n", |
| 3404 | ss->name, ss->root->hierarchy_id, | 3978 | ss->name, ss->root->hierarchy_id, |
| 3405 | ss->root->number_of_cgroups, !ss->disabled); | 3979 | ss->root->number_of_cgroups, !ss->disabled); |
| @@ -3457,7 +4031,12 @@ void cgroup_fork_callbacks(struct task_struct *child) | |||
| 3457 | { | 4031 | { |
| 3458 | if (need_forkexit_callback) { | 4032 | if (need_forkexit_callback) { |
| 3459 | int i; | 4033 | int i; |
| 3460 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4034 | /* |
| 4035 | * forkexit callbacks are only supported for builtin | ||
| 4036 | * subsystems, and the builtin section of the subsys array is | ||
| 4037 | * immutable, so we don't need to lock the subsys array here. | ||
| 4038 | */ | ||
| 4039 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
| 3461 | struct cgroup_subsys *ss = subsys[i]; | 4040 | struct cgroup_subsys *ss = subsys[i]; |
| 3462 | if (ss->fork) | 4041 | if (ss->fork) |
| 3463 | ss->fork(ss, child); | 4042 | ss->fork(ss, child); |
| @@ -3526,7 +4105,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
| 3526 | struct css_set *cg; | 4105 | struct css_set *cg; |
| 3527 | 4106 | ||
| 3528 | if (run_callbacks && need_forkexit_callback) { | 4107 | if (run_callbacks && need_forkexit_callback) { |
| 3529 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4108 | /* |
| 4109 | * modular subsystems can't use callbacks, so no need to lock | ||
| 4110 | * the subsys array | ||
| 4111 | */ | ||
| 4112 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
| 3530 | struct cgroup_subsys *ss = subsys[i]; | 4113 | struct cgroup_subsys *ss = subsys[i]; |
| 3531 | if (ss->exit) | 4114 | if (ss->exit) |
| 3532 | ss->exit(ss, tsk); | 4115 | ss->exit(ss, tsk); |
| @@ -3720,12 +4303,13 @@ static void check_for_release(struct cgroup *cgrp) | |||
| 3720 | } | 4303 | } |
| 3721 | } | 4304 | } |
| 3722 | 4305 | ||
| 3723 | void __css_put(struct cgroup_subsys_state *css) | 4306 | /* Caller must verify that the css is not for root cgroup */ |
| 4307 | void __css_put(struct cgroup_subsys_state *css, int count) | ||
| 3724 | { | 4308 | { |
| 3725 | struct cgroup *cgrp = css->cgroup; | 4309 | struct cgroup *cgrp = css->cgroup; |
| 3726 | int val; | 4310 | int val; |
| 3727 | rcu_read_lock(); | 4311 | rcu_read_lock(); |
| 3728 | val = atomic_dec_return(&css->refcnt); | 4312 | val = atomic_sub_return(count, &css->refcnt); |
| 3729 | if (val == 1) { | 4313 | if (val == 1) { |
| 3730 | if (notify_on_release(cgrp)) { | 4314 | if (notify_on_release(cgrp)) { |
| 3731 | set_bit(CGRP_RELEASABLE, &cgrp->flags); | 4315 | set_bit(CGRP_RELEASABLE, &cgrp->flags); |
| @@ -3736,6 +4320,7 @@ void __css_put(struct cgroup_subsys_state *css) | |||
| 3736 | rcu_read_unlock(); | 4320 | rcu_read_unlock(); |
| 3737 | WARN_ON_ONCE(val < 1); | 4321 | WARN_ON_ONCE(val < 1); |
| 3738 | } | 4322 | } |
| 4323 | EXPORT_SYMBOL_GPL(__css_put); | ||
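A minimal sketch of what the new count parameter buys a caller, assuming the caller already holds nr references and, per the comment above, that css does not belong to the root cgroup:

static void foo_drop_refs(struct cgroup_subsys_state *css, int nr)
{
	/* one atomic_sub_return() instead of nr separate decrements */
	__css_put(css, nr);
}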
| 3739 | 4324 | ||
| 3740 | /* | 4325 | /* |
| 3741 | * Notify userspace when a cgroup is released, by running the | 4326 | * Notify userspace when a cgroup is released, by running the |
| @@ -3817,8 +4402,11 @@ static int __init cgroup_disable(char *str) | |||
| 3817 | while ((token = strsep(&str, ",")) != NULL) { | 4402 | while ((token = strsep(&str, ",")) != NULL) { |
| 3818 | if (!*token) | 4403 | if (!*token) |
| 3819 | continue; | 4404 | continue; |
| 3820 | 4405 | /* | |
| 3821 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 4406 | * cgroup_disable, being at boot time, can't know about module |
| 4407 | * subsystems, so we don't worry about them. | ||
| 4408 | */ | ||
| 4409 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
| 3822 | struct cgroup_subsys *ss = subsys[i]; | 4410 | struct cgroup_subsys *ss = subsys[i]; |
| 3823 | 4411 | ||
| 3824 | if (!strcmp(token, ss->name)) { | 4412 | if (!strcmp(token, ss->name)) { |
| @@ -3848,6 +4436,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
| 3848 | return cssid->id; | 4436 | return cssid->id; |
| 3849 | return 0; | 4437 | return 0; |
| 3850 | } | 4438 | } |
| 4439 | EXPORT_SYMBOL_GPL(css_id); | ||
| 3851 | 4440 | ||
| 3852 | unsigned short css_depth(struct cgroup_subsys_state *css) | 4441 | unsigned short css_depth(struct cgroup_subsys_state *css) |
| 3853 | { | 4442 | { |
| @@ -3857,6 +4446,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) | |||
| 3857 | return cssid->depth; | 4446 | return cssid->depth; |
| 3858 | return 0; | 4447 | return 0; |
| 3859 | } | 4448 | } |
| 4449 | EXPORT_SYMBOL_GPL(css_depth); | ||
| 3860 | 4450 | ||
| 3861 | bool css_is_ancestor(struct cgroup_subsys_state *child, | 4451 | bool css_is_ancestor(struct cgroup_subsys_state *child, |
| 3862 | const struct cgroup_subsys_state *root) | 4452 | const struct cgroup_subsys_state *root) |
| @@ -3893,6 +4483,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | |||
| 3893 | spin_unlock(&ss->id_lock); | 4483 | spin_unlock(&ss->id_lock); |
| 3894 | call_rcu(&id->rcu_head, __free_css_id_cb); | 4484 | call_rcu(&id->rcu_head, __free_css_id_cb); |
| 3895 | } | 4485 | } |
| 4486 | EXPORT_SYMBOL_GPL(free_css_id); | ||
| 3896 | 4487 | ||
| 3897 | /* | 4488 | /* |
| 3898 | * This is called by init or create(). Then, calls to this function are | 4489 | * This is called by init or create(). Then, calls to this function are |
| @@ -3942,15 +4533,14 @@ err_out: | |||
| 3942 | 4533 | ||
| 3943 | } | 4534 | } |
| 3944 | 4535 | ||
| 3945 | static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) | 4536 | static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, |
| 4537 | struct cgroup_subsys_state *rootcss) | ||
| 3946 | { | 4538 | { |
| 3947 | struct css_id *newid; | 4539 | struct css_id *newid; |
| 3948 | struct cgroup_subsys_state *rootcss; | ||
| 3949 | 4540 | ||
| 3950 | spin_lock_init(&ss->id_lock); | 4541 | spin_lock_init(&ss->id_lock); |
| 3951 | idr_init(&ss->idr); | 4542 | idr_init(&ss->idr); |
| 3952 | 4543 | ||
| 3953 | rootcss = init_css_set.subsys[ss->subsys_id]; | ||
| 3954 | newid = get_new_cssid(ss, 0); | 4544 | newid = get_new_cssid(ss, 0); |
| 3955 | if (IS_ERR(newid)) | 4545 | if (IS_ERR(newid)) |
| 3956 | return PTR_ERR(newid); | 4546 | return PTR_ERR(newid); |
| @@ -4010,6 +4600,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) | |||
| 4010 | 4600 | ||
| 4011 | return rcu_dereference(cssid->css); | 4601 | return rcu_dereference(cssid->css); |
| 4012 | } | 4602 | } |
| 4603 | EXPORT_SYMBOL_GPL(css_lookup); | ||
| 4013 | 4604 | ||
| 4014 | /** | 4605 | /** |
| 4015 | * css_get_next - lookup next cgroup under specified hierarchy. | 4606 | * css_get_next - lookup next cgroup under specified hierarchy. |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 677f25376a38..f8cced2692b3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -338,7 +338,7 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
| 338 | if (!cpu_possible(cpu)) { | 338 | if (!cpu_possible(cpu)) { |
| 339 | printk(KERN_ERR "can't online cpu %d because it is not " | 339 | printk(KERN_ERR "can't online cpu %d because it is not " |
| 340 | "configured as may-hotadd at boot time\n", cpu); | 340 | "configured as may-hotadd at boot time\n", cpu); |
| 341 | #if defined(CONFIG_IA64) || defined(CONFIG_X86_64) | 341 | #if defined(CONFIG_IA64) |
| 342 | printk(KERN_ERR "please check additional_cpus= boot " | 342 | printk(KERN_ERR "please check additional_cpus= boot " |
| 343 | "parameter\n"); | 343 | "parameter\n"); |
| 344 | #endif | 344 | #endif |
diff --git a/kernel/early_res.c b/kernel/early_res.c new file mode 100644 index 000000000000..3cb2c661bb78 --- /dev/null +++ b/kernel/early_res.c | |||
| @@ -0,0 +1,578 @@ | |||
| 1 | /* | ||
| 2 | * early_res, could be used to replace bootmem | ||
| 3 | */ | ||
| 4 | #include <linux/kernel.h> | ||
| 5 | #include <linux/types.h> | ||
| 6 | #include <linux/init.h> | ||
| 7 | #include <linux/bootmem.h> | ||
| 8 | #include <linux/mm.h> | ||
| 9 | #include <linux/early_res.h> | ||
| 10 | |||
| 11 | /* | ||
| 12 | * Early reserved memory areas. | ||
| 13 | */ | ||
| 14 | /* | ||
| 15 | * need to make sure this one is big enough before | ||
| 16 | * find_fw_memmap_area can be used | ||
| 17 | */ | ||
| 18 | #define MAX_EARLY_RES_X 32 | ||
| 19 | |||
| 20 | struct early_res { | ||
| 21 | u64 start, end; | ||
| 22 | char name[15]; | ||
| 23 | char overlap_ok; | ||
| 24 | }; | ||
| 25 | static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; | ||
| 26 | |||
| 27 | static int max_early_res __initdata = MAX_EARLY_RES_X; | ||
| 28 | static struct early_res *early_res __initdata = &early_res_x[0]; | ||
| 29 | static int early_res_count __initdata; | ||
| 30 | |||
| 31 | static int __init find_overlapped_early(u64 start, u64 end) | ||
| 32 | { | ||
| 33 | int i; | ||
| 34 | struct early_res *r; | ||
| 35 | |||
| 36 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
| 37 | r = &early_res[i]; | ||
| 38 | if (end > r->start && start < r->end) | ||
| 39 | break; | ||
| 40 | } | ||
| 41 | |||
| 42 | return i; | ||
| 43 | } | ||
| 44 | |||
| 45 | /* | ||
| 46 | * Drop the i-th range from the early reservation map, | ||
| 47 | * by copying any higher ranges down one over it, and | ||
| 48 | * clearing what had been the last slot. | ||
| 49 | */ | ||
| 50 | static void __init drop_range(int i) | ||
| 51 | { | ||
| 52 | int j; | ||
| 53 | |||
| 54 | for (j = i + 1; j < max_early_res && early_res[j].end; j++) | ||
| 55 | ; | ||
| 56 | |||
| 57 | memmove(&early_res[i], &early_res[i + 1], | ||
| 58 | (j - 1 - i) * sizeof(struct early_res)); | ||
| 59 | |||
| 60 | early_res[j - 1].end = 0; | ||
| 61 | early_res_count--; | ||
| 62 | } | ||
| 63 | |||
| 64 | static void __init drop_range_partial(int i, u64 start, u64 end) | ||
| 65 | { | ||
| 66 | u64 common_start, common_end; | ||
| 67 | u64 old_start, old_end; | ||
| 68 | |||
| 69 | old_start = early_res[i].start; | ||
| 70 | old_end = early_res[i].end; | ||
| 71 | common_start = max(old_start, start); | ||
| 72 | common_end = min(old_end, end); | ||
| 73 | |||
| 74 | /* no overlap ? */ | ||
| 75 | if (common_start >= common_end) | ||
| 76 | return; | ||
| 77 | |||
| 78 | if (old_start < common_start) { | ||
| 79 | /* make head segment */ | ||
| 80 | early_res[i].end = common_start; | ||
| 81 | if (old_end > common_end) { | ||
| 82 | char name[15]; | ||
| 83 | |||
| 84 | /* | ||
| 85 | * Save a local copy of the name, since the | ||
| 86 | * early_res array could get resized inside | ||
| 87 | * reserve_early_without_check() -> | ||
| 88 | * __check_and_double_early_res(), which would | ||
| 89 | * make the current name pointer invalid. | ||
| 90 | */ | ||
| 91 | strncpy(name, early_res[i].name, | ||
| 92 | sizeof(early_res[i].name) - 1); | ||
| 93 | /* add another for left over on tail */ | ||
| 94 | reserve_early_without_check(common_end, old_end, name); | ||
| 95 | } | ||
| 96 | return; | ||
| 97 | } else { | ||
| 98 | if (old_end > common_end) { | ||
| 99 | /* reuse the entry for tail left */ | ||
| 100 | early_res[i].start = common_end; | ||
| 101 | return; | ||
| 102 | } | ||
| 103 | /* all covered */ | ||
| 104 | drop_range(i); | ||
| 105 | } | ||
| 106 | } | ||
| 107 | |||
| 108 | /* | ||
| 109 | * Split any existing ranges that: | ||
| 110 | * 1) are marked 'overlap_ok', and | ||
| 111 | * 2) overlap with the stated range [start, end) | ||
| 112 | * into whatever portion (if any) of the existing range is entirely | ||
| 113 | * below or entirely above the stated range. Drop the portion | ||
| 114 | * of the existing range that overlaps with the stated range, | ||
| 115 | * which will allow the caller of this routine to then add that | ||
| 116 | * stated range without conflicting with any existing range. | ||
| 117 | */ | ||
| 118 | static void __init drop_overlaps_that_are_ok(u64 start, u64 end) | ||
| 119 | { | ||
| 120 | int i; | ||
| 121 | struct early_res *r; | ||
| 122 | u64 lower_start, lower_end; | ||
| 123 | u64 upper_start, upper_end; | ||
| 124 | char name[15]; | ||
| 125 | |||
| 126 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
| 127 | r = &early_res[i]; | ||
| 128 | |||
| 129 | /* Continue past non-overlapping ranges */ | ||
| 130 | if (end <= r->start || start >= r->end) | ||
| 131 | continue; | ||
| 132 | |||
| 133 | /* | ||
| 134 | * Leave non-ok overlaps as is; let caller | ||
| 135 | * panic "Overlapping early reservations" | ||
| 136 | * when it hits this overlap. | ||
| 137 | */ | ||
| 138 | if (!r->overlap_ok) | ||
| 139 | return; | ||
| 140 | |||
| 141 | /* | ||
| 142 | * We have an ok overlap. We will drop it from the early | ||
| 143 | * reservation map, and add back in any non-overlapping | ||
| 144 | * portions (lower or upper) as separate, overlap_ok, | ||
| 145 | * non-overlapping ranges. | ||
| 146 | */ | ||
| 147 | |||
| 148 | /* 1. Note any non-overlapping (lower or upper) ranges. */ | ||
| 149 | strncpy(name, r->name, sizeof(name) - 1); | ||
| 150 | |||
| 151 | lower_start = lower_end = 0; | ||
| 152 | upper_start = upper_end = 0; | ||
| 153 | if (r->start < start) { | ||
| 154 | lower_start = r->start; | ||
| 155 | lower_end = start; | ||
| 156 | } | ||
| 157 | if (r->end > end) { | ||
| 158 | upper_start = end; | ||
| 159 | upper_end = r->end; | ||
| 160 | } | ||
| 161 | |||
| 162 | /* 2. Drop the original ok overlapping range */ | ||
| 163 | drop_range(i); | ||
| 164 | |||
| 165 | i--; /* resume for-loop on copied down entry */ | ||
| 166 | |||
| 167 | /* 3. Add back in any non-overlapping ranges. */ | ||
| 168 | if (lower_end) | ||
| 169 | reserve_early_overlap_ok(lower_start, lower_end, name); | ||
| 170 | if (upper_end) | ||
| 171 | reserve_early_overlap_ok(upper_start, upper_end, name); | ||
| 172 | } | ||
| 173 | } | ||
| 174 | |||
| 175 | static void __init __reserve_early(u64 start, u64 end, char *name, | ||
| 176 | int overlap_ok) | ||
| 177 | { | ||
| 178 | int i; | ||
| 179 | struct early_res *r; | ||
| 180 | |||
| 181 | i = find_overlapped_early(start, end); | ||
| 182 | if (i >= max_early_res) | ||
| 183 | panic("Too many early reservations"); | ||
| 184 | r = &early_res[i]; | ||
| 185 | if (r->end) | ||
| 186 | panic("Overlapping early reservations " | ||
| 187 | "%llx-%llx %s to %llx-%llx %s\n", | ||
| 188 | start, end - 1, name ? name : "", r->start, | ||
| 189 | r->end - 1, r->name); | ||
| 190 | r->start = start; | ||
| 191 | r->end = end; | ||
| 192 | r->overlap_ok = overlap_ok; | ||
| 193 | if (name) | ||
| 194 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
| 195 | early_res_count++; | ||
| 196 | } | ||
| 197 | |||
| 198 | /* | ||
| 199 | * A few early reservations come here. | ||
| 200 | * | ||
| 201 | * The 'overlap_ok' in the name of this routine does -not- mean it | ||
| 202 | * is ok for these reservations to overlap an earlier reservation. | ||
| 203 | * Rather it means that it is ok for subsequent reservations to | ||
| 204 | * overlap this one. | ||
| 205 | * | ||
| 206 | * Use this entry point to reserve early ranges when you are doing | ||
| 207 | * so out of "Paranoia", reserving perhaps more memory than you need, | ||
| 208 | * just in case, and don't mind a subsequent overlapping reservation | ||
| 209 | * that is known to be needed. | ||
| 210 | * | ||
| 211 | * The drop_overlaps_that_are_ok() call here isn't really needed. | ||
| 212 | * It would be needed if we had two colliding 'overlap_ok' | ||
| 213 | * reservations, so that the second such would not panic on the | ||
| 214 | * overlap with the first. We don't have any such as of this | ||
| 215 | * writing, but might as well tolerate such if it happens in | ||
| 216 | * the future. | ||
| 217 | */ | ||
| 218 | void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) | ||
| 219 | { | ||
| 220 | drop_overlaps_that_are_ok(start, end); | ||
| 221 | __reserve_early(start, end, name, 1); | ||
| 222 | } | ||
| 223 | |||
| 224 | static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) | ||
| 225 | { | ||
| 226 | u64 start, end, size, mem; | ||
| 227 | struct early_res *new; | ||
| 228 | |||
| 229 | /* do we have enough slots left ? */ | ||
| 230 | if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) | ||
| 231 | return; | ||
| 232 | |||
| 233 | /* double it */ | ||
| 234 | mem = -1ULL; | ||
| 235 | size = sizeof(struct early_res) * max_early_res * 2; | ||
| 236 | if (early_res == early_res_x) | ||
| 237 | start = 0; | ||
| 238 | else | ||
| 239 | start = early_res[0].end; | ||
| 240 | end = ex_start; | ||
| 241 | if (start + size < end) | ||
| 242 | mem = find_fw_memmap_area(start, end, size, | ||
| 243 | sizeof(struct early_res)); | ||
| 244 | if (mem == -1ULL) { | ||
| 245 | start = ex_end; | ||
| 246 | end = get_max_mapped(); | ||
| 247 | if (start + size < end) | ||
| 248 | mem = find_fw_memmap_area(start, end, size, | ||
| 249 | sizeof(struct early_res)); | ||
| 250 | } | ||
| 251 | if (mem == -1ULL) | ||
| 252 | panic("can not find more space for early_res array"); | ||
| 253 | |||
| 254 | new = __va(mem); | ||
| 255 | /* save the first one for own */ | ||
| 256 | new[0].start = mem; | ||
| 257 | new[0].end = mem + size; | ||
| 258 | new[0].overlap_ok = 0; | ||
| 259 | /* copy old to new */ | ||
| 260 | if (early_res == early_res_x) { | ||
| 261 | memcpy(&new[1], &early_res[0], | ||
| 262 | sizeof(struct early_res) * max_early_res); | ||
| 263 | memset(&new[max_early_res+1], 0, | ||
| 264 | sizeof(struct early_res) * (max_early_res - 1)); | ||
| 265 | early_res_count++; | ||
| 266 | } else { | ||
| 267 | memcpy(&new[1], &early_res[1], | ||
| 268 | sizeof(struct early_res) * (max_early_res - 1)); | ||
| 269 | memset(&new[max_early_res], 0, | ||
| 270 | sizeof(struct early_res) * max_early_res); | ||
| 271 | } | ||
| 272 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
| 273 | early_res = new; | ||
| 274 | max_early_res *= 2; | ||
| 275 | printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", | ||
| 276 | max_early_res, mem, mem + size - 1); | ||
| 277 | } | ||
| 278 | |||
| 279 | /* | ||
| 280 | * Most early reservations come here. | ||
| 281 | * | ||
| 282 | * We first have drop_overlaps_that_are_ok() drop any pre-existing | ||
| 283 | * 'overlap_ok' ranges, so that we can then reserve this memory | ||
| 284 | * range without risk of panic'ing on an overlapping overlap_ok | ||
| 285 | * early reservation. | ||
| 286 | */ | ||
| 287 | void __init reserve_early(u64 start, u64 end, char *name) | ||
| 288 | { | ||
| 289 | if (start >= end) | ||
| 290 | return; | ||
| 291 | |||
| 292 | __check_and_double_early_res(start, end); | ||
| 293 | |||
| 294 | drop_overlaps_that_are_ok(start, end); | ||
| 295 | __reserve_early(start, end, name, 0); | ||
| 296 | } | ||
| 297 | |||
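A hedged usage sketch of the reservation entry points above; the addresses and names are invented, but the ordering matters: the overlap_ok range must exist before the hard reservation that carves into it, otherwise __reserve_early() panics on the overlap:

void __init foo_reserve_firmware(void)
{
	/* paranoia reservation: later reservations may overlap it */
	reserve_early_overlap_ok(0x90000, 0xa0000, "BIOS DATA");

	/* hard reservation: silently trims the overlap_ok range down
	 * to [0x90000, 0x9d000) before inserting itself */
	reserve_early(0x9d000, 0xa0000, "FW TABLES");

	/* hand back a piece once the exact need is known; the entry
	 * is reused as the tail [0x9e000, 0xa0000) */
	free_early_partial(0x9d000, 0x9e000);
}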
| 298 | void __init reserve_early_without_check(u64 start, u64 end, char *name) | ||
| 299 | { | ||
| 300 | struct early_res *r; | ||
| 301 | |||
| 302 | if (start >= end) | ||
| 303 | return; | ||
| 304 | |||
| 305 | __check_and_double_early_res(start, end); | ||
| 306 | |||
| 307 | r = &early_res[early_res_count]; | ||
| 308 | |||
| 309 | r->start = start; | ||
| 310 | r->end = end; | ||
| 311 | r->overlap_ok = 0; | ||
| 312 | if (name) | ||
| 313 | strncpy(r->name, name, sizeof(r->name) - 1); | ||
| 314 | early_res_count++; | ||
| 315 | } | ||
| 316 | |||
| 317 | void __init free_early(u64 start, u64 end) | ||
| 318 | { | ||
| 319 | struct early_res *r; | ||
| 320 | int i; | ||
| 321 | |||
| 322 | i = find_overlapped_early(start, end); | ||
| 323 | r = &early_res[i]; | ||
| 324 | if (i >= max_early_res || r->end != end || r->start != start) | ||
| 325 | panic("free_early on not reserved area: %llx-%llx!", | ||
| 326 | start, end - 1); | ||
| 327 | |||
| 328 | drop_range(i); | ||
| 329 | } | ||
| 330 | |||
| 331 | void __init free_early_partial(u64 start, u64 end) | ||
| 332 | { | ||
| 333 | struct early_res *r; | ||
| 334 | int i; | ||
| 335 | |||
| 336 | try_next: | ||
| 337 | i = find_overlapped_early(start, end); | ||
| 338 | if (i >= max_early_res) | ||
| 339 | return; | ||
| 340 | |||
| 341 | r = &early_res[i]; | ||
| 342 | /* hole ? */ | ||
| 343 | if (r->end >= end && r->start <= start) { | ||
| 344 | drop_range_partial(i, start, end); | ||
| 345 | return; | ||
| 346 | } | ||
| 347 | |||
| 348 | drop_range_partial(i, start, end); | ||
| 349 | goto try_next; | ||
| 350 | } | ||
| 351 | |||
| 352 | #ifdef CONFIG_NO_BOOTMEM | ||
| 353 | static void __init subtract_early_res(struct range *range, int az) | ||
| 354 | { | ||
| 355 | int i, count; | ||
| 356 | u64 final_start, final_end; | ||
| 357 | int idx = 0; | ||
| 358 | |||
| 359 | count = 0; | ||
| 360 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
| 361 | count++; | ||
| 362 | |||
| 363 | /* need to skip first one? */ | ||
| 364 | if (early_res != early_res_x) | ||
| 365 | idx = 1; | ||
| 366 | |||
| 367 | #define DEBUG_PRINT_EARLY_RES 1 | ||
| 368 | |||
| 369 | #if DEBUG_PRINT_EARLY_RES | ||
| 370 | printk(KERN_INFO "Subtract (%d early reservations)\n", count); | ||
| 371 | #endif | ||
| 372 | for (i = idx; i < count; i++) { | ||
| 373 | struct early_res *r = &early_res[i]; | ||
| 374 | #if DEBUG_PRINT_EARLY_RES | ||
| 375 | printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, | ||
| 376 | r->start, r->end, r->name); | ||
| 377 | #endif | ||
| 378 | final_start = PFN_DOWN(r->start); | ||
| 379 | final_end = PFN_UP(r->end); | ||
| 380 | if (final_start >= final_end) | ||
| 381 | continue; | ||
| 382 | subtract_range(range, az, final_start, final_end); | ||
| 383 | } | ||
| 384 | |||
| 385 | } | ||
| 386 | |||
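subtract_early_res() leans on subtract_range() from kernel/range.c, added elsewhere in this series (see the Makefile hunk). As a hedged sketch of the simple cases only, punching [start, end) out of every entry; the real version also splits an entry that fully contains the hole:

static void __init subtract_range_sketch(struct range *range, int az,
					 u64 start, u64 end)
{
	int i;

	for (i = 0; i < az; i++) {
		if (!range[i].end)
			continue;			/* empty slot */
		if (end <= range[i].start || start >= range[i].end)
			continue;			/* no overlap */
		if (start <= range[i].start && end >= range[i].end)
			range[i].start = range[i].end = 0; /* swallowed */
		else if (start <= range[i].start)
			range[i].start = end;		/* clip the front */
		else if (end >= range[i].end)
			range[i].end = start;		/* clip the back */
		/* else: hole strictly inside the entry; the real
		 * implementation splits it into two entries */
	}
}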
| 387 | int __init get_free_all_memory_range(struct range **rangep, int nodeid) | ||
| 388 | { | ||
| 389 | int i, count; | ||
| 390 | u64 start = 0, end; | ||
| 391 | u64 size; | ||
| 392 | u64 mem; | ||
| 393 | struct range *range; | ||
| 394 | int nr_range; | ||
| 395 | |||
| 396 | count = 0; | ||
| 397 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
| 398 | count++; | ||
| 399 | |||
| 400 | count *= 2; | ||
| 401 | |||
| 402 | size = sizeof(struct range) * count; | ||
| 403 | end = get_max_mapped(); | ||
| 404 | #ifdef MAX_DMA32_PFN | ||
| 405 | if (end > (MAX_DMA32_PFN << PAGE_SHIFT)) | ||
| 406 | start = MAX_DMA32_PFN << PAGE_SHIFT; | ||
| 407 | #endif | ||
| 408 | mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); | ||
| 409 | if (mem == -1ULL) | ||
| 410 | panic("can not find more space for range free"); | ||
| 411 | |||
| 412 | range = __va(mem); | ||
| 413 | /* use early_node_map[] and early_res to get range array at first */ | ||
| 414 | memset(range, 0, size); | ||
| 415 | nr_range = 0; | ||
| 416 | |||
| 417 | /* need to go over early_node_map to find out good range for node */ | ||
| 418 | nr_range = add_from_early_node_map(range, count, nr_range, nodeid); | ||
| 419 | #ifdef CONFIG_X86_32 | ||
| 420 | subtract_range(range, count, max_low_pfn, -1ULL); | ||
| 421 | #endif | ||
| 422 | subtract_early_res(range, count); | ||
| 423 | nr_range = clean_sort_range(range, count); | ||
| 424 | |||
| 425 | /* need to clear it ? */ | ||
| 426 | if (nodeid == MAX_NUMNODES) { | ||
| 427 | memset(&early_res[0], 0, | ||
| 428 | sizeof(struct early_res) * max_early_res); | ||
| 429 | early_res = NULL; | ||
| 430 | max_early_res = 0; | ||
| 431 | } | ||
| 432 | |||
| 433 | *rangep = range; | ||
| 434 | return nr_range; | ||
| 435 | } | ||
| 436 | #else | ||
| 437 | void __init early_res_to_bootmem(u64 start, u64 end) | ||
| 438 | { | ||
| 439 | int i, count; | ||
| 440 | u64 final_start, final_end; | ||
| 441 | int idx = 0; | ||
| 442 | |||
| 443 | count = 0; | ||
| 444 | for (i = 0; i < max_early_res && early_res[i].end; i++) | ||
| 445 | count++; | ||
| 446 | |||
| 447 | /* need to skip first one? */ | ||
| 448 | if (early_res != early_res_x) | ||
| 449 | idx = 1; | ||
| 450 | |||
| 451 | printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", | ||
| 452 | count - idx, max_early_res, start, end); | ||
| 453 | for (i = idx; i < count; i++) { | ||
| 454 | struct early_res *r = &early_res[i]; | ||
| 455 | printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, | ||
| 456 | r->start, r->end, r->name); | ||
| 457 | final_start = max(start, r->start); | ||
| 458 | final_end = min(end, r->end); | ||
| 459 | if (final_start >= final_end) { | ||
| 460 | printk(KERN_CONT "\n"); | ||
| 461 | continue; | ||
| 462 | } | ||
| 463 | printk(KERN_CONT " ==> [%010llx - %010llx]\n", | ||
| 464 | final_start, final_end); | ||
| 465 | reserve_bootmem_generic(final_start, final_end - final_start, | ||
| 466 | BOOTMEM_DEFAULT); | ||
| 467 | } | ||
| 468 | /* clear them */ | ||
| 469 | memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); | ||
| 470 | early_res = NULL; | ||
| 471 | max_early_res = 0; | ||
| 472 | early_res_count = 0; | ||
| 473 | } | ||
| 474 | #endif | ||
| 475 | |||
| 476 | /* Check for already reserved areas */ | ||
| 477 | static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) | ||
| 478 | { | ||
| 479 | int i; | ||
| 480 | u64 addr = *addrp; | ||
| 481 | int changed = 0; | ||
| 482 | struct early_res *r; | ||
| 483 | again: | ||
| 484 | i = find_overlapped_early(addr, addr + size); | ||
| 485 | r = &early_res[i]; | ||
| 486 | if (i < max_early_res && r->end) { | ||
| 487 | *addrp = addr = round_up(r->end, align); | ||
| 488 | changed = 1; | ||
| 489 | goto again; | ||
| 490 | } | ||
| 491 | return changed; | ||
| 492 | } | ||
| 493 | |||
| 494 | /* Check for already reserved areas */ | ||
| 495 | static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) | ||
| 496 | { | ||
| 497 | int i; | ||
| 498 | u64 addr = *addrp, last; | ||
| 499 | u64 size = *sizep; | ||
| 500 | int changed = 0; | ||
| 501 | again: | ||
| 502 | last = addr + size; | ||
| 503 | for (i = 0; i < max_early_res && early_res[i].end; i++) { | ||
| 504 | struct early_res *r = &early_res[i]; | ||
| 505 | if (last > r->start && addr < r->start) { | ||
| 506 | size = r->start - addr; | ||
| 507 | changed = 1; | ||
| 508 | goto again; | ||
| 509 | } | ||
| 510 | if (last > r->end && addr < r->end) { | ||
| 511 | addr = round_up(r->end, align); | ||
| 512 | size = last - addr; | ||
| 513 | changed = 1; | ||
| 514 | goto again; | ||
| 515 | } | ||
| 516 | if (last <= r->end && addr >= r->start) { | ||
| 517 | (*sizep)++; | ||
| 518 | return 0; | ||
| 519 | } | ||
| 520 | } | ||
| 521 | if (changed) { | ||
| 522 | *addrp = addr; | ||
| 523 | *sizep = size; | ||
| 524 | } | ||
| 525 | return changed; | ||
| 526 | } | ||
| 527 | |||
| 528 | /* | ||
| 529 | * Find a free area with specified alignment in a specific range. | ||
| 530 | * Only the area between start and end that is an active range from | ||
| 531 | * early_node_map is used, so it is known to be usable RAM. | ||
| 532 | */ | ||
| 533 | u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, | ||
| 534 | u64 size, u64 align) | ||
| 535 | { | ||
| 536 | u64 addr, last; | ||
| 537 | |||
| 538 | addr = round_up(ei_start, align); | ||
| 539 | if (addr < start) | ||
| 540 | addr = round_up(start, align); | ||
| 541 | if (addr >= ei_last) | ||
| 542 | goto out; | ||
| 543 | while (bad_addr(&addr, size, align) && addr+size <= ei_last) | ||
| 544 | ; | ||
| 545 | last = addr + size; | ||
| 546 | if (last > ei_last) | ||
| 547 | goto out; | ||
| 548 | if (last > end) | ||
| 549 | goto out; | ||
| 550 | |||
| 551 | return addr; | ||
| 552 | |||
| 553 | out: | ||
| 554 | return -1ULL; | ||
| 555 | } | ||
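A short usage sketch, with invented ranges: callers probe with find_early_area() and then pin the result with reserve_early(), since the search itself reserves nothing:

u64 __init foo_alloc_early(u64 node_start, u64 node_end)
{
	/* 16KiB block, 4KiB aligned, anywhere inside the node's range */
	u64 addr = find_early_area(node_start, node_end,
				   node_start, node_end,
				   16 << 10, 4 << 10);

	if (addr == -1ULL)
		panic("foo: no early space");
	/* the search reserves nothing; pin the result ourselves */
	reserve_early(addr, addr + (16 << 10), "FOO");
	return addr;
}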
| 556 | |||
| 557 | u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, | ||
| 558 | u64 *sizep, u64 align) | ||
| 559 | { | ||
| 560 | u64 addr, last; | ||
| 561 | |||
| 562 | addr = round_up(ei_start, align); | ||
| 563 | if (addr < start) | ||
| 564 | addr = round_up(start, align); | ||
| 565 | if (addr >= ei_last) | ||
| 566 | goto out; | ||
| 567 | *sizep = ei_last - addr; | ||
| 568 | while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) | ||
| 569 | ; | ||
| 570 | last = addr + *sizep; | ||
| 571 | if (last > ei_last) | ||
| 572 | goto out; | ||
| 573 | |||
| 574 | return addr; | ||
| 575 | |||
| 576 | out: | ||
| 577 | return -1ULL; | ||
| 578 | } | ||
diff --git a/kernel/elfcore.c b/kernel/elfcore.c new file mode 100644 index 000000000000..ff915efef66d --- /dev/null +++ b/kernel/elfcore.c | |||
| @@ -0,0 +1,28 @@ | |||
| 1 | #include <linux/elf.h> | ||
| 2 | #include <linux/fs.h> | ||
| 3 | #include <linux/mm.h> | ||
| 4 | |||
| 5 | #include <asm/elf.h> | ||
| 6 | |||
| 7 | |||
| 8 | Elf_Half __weak elf_core_extra_phdrs(void) | ||
| 9 | { | ||
| 10 | return 0; | ||
| 11 | } | ||
| 12 | |||
| 13 | int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, | ||
| 14 | unsigned long limit) | ||
| 15 | { | ||
| 16 | return 1; | ||
| 17 | } | ||
| 18 | |||
| 19 | int __weak elf_core_write_extra_data(struct file *file, size_t *size, | ||
| 20 | unsigned long limit) | ||
| 21 | { | ||
| 22 | return 1; | ||
| 23 | } | ||
| 24 | |||
| 25 | size_t __weak elf_core_extra_data_size(void) | ||
| 26 | { | ||
| 27 | return 0; | ||
| 28 | } | ||
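These __weak stubs are only defaults: an architecture overrides one simply by providing a strong definition with the same signature, which the linker then prefers. An illustrative (not arch-accurate) override:

#include <linux/elf.h>

/* strong definition: the linker picks this over the __weak stub */
Elf_Half elf_core_extra_phdrs(void)
{
	return 1;	/* e.g. one extra phdr for a gate/vDSO page */
}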
diff --git a/kernel/exit.c b/kernel/exit.c index 45ed043b8bf5..ce1e48c2d93d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -952,7 +952,8 @@ NORET_TYPE void do_exit(long code) | |||
| 952 | preempt_count()); | 952 | preempt_count()); |
| 953 | 953 | ||
| 954 | acct_update_integrals(tsk); | 954 | acct_update_integrals(tsk); |
| 955 | 955 | /* sync mm's RSS info before statistics gathering */ | |
| 956 | sync_mm_rss(tsk, tsk->mm); | ||
| 956 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 957 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
| 957 | if (group_dead) { | 958 | if (group_dead) { |
| 958 | hrtimer_cancel(&tsk->signal->real_timer); | 959 | hrtimer_cancel(&tsk->signal->real_timer); |
| @@ -1188,7 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1188 | 1189 | ||
| 1189 | if (unlikely(wo->wo_flags & WNOWAIT)) { | 1190 | if (unlikely(wo->wo_flags & WNOWAIT)) { |
| 1190 | int exit_code = p->exit_code; | 1191 | int exit_code = p->exit_code; |
| 1191 | int why, status; | 1192 | int why; |
| 1192 | 1193 | ||
| 1193 | get_task_struct(p); | 1194 | get_task_struct(p); |
| 1194 | read_unlock(&tasklist_lock); | 1195 | read_unlock(&tasklist_lock); |
diff --git a/kernel/fork.c b/kernel/fork.c index 17bbf093356d..1beb6c303c41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -329,15 +329,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 329 | if (!tmp) | 329 | if (!tmp) |
| 330 | goto fail_nomem; | 330 | goto fail_nomem; |
| 331 | *tmp = *mpnt; | 331 | *tmp = *mpnt; |
| 332 | INIT_LIST_HEAD(&tmp->anon_vma_chain); | ||
| 332 | pol = mpol_dup(vma_policy(mpnt)); | 333 | pol = mpol_dup(vma_policy(mpnt)); |
| 333 | retval = PTR_ERR(pol); | 334 | retval = PTR_ERR(pol); |
| 334 | if (IS_ERR(pol)) | 335 | if (IS_ERR(pol)) |
| 335 | goto fail_nomem_policy; | 336 | goto fail_nomem_policy; |
| 336 | vma_set_policy(tmp, pol); | 337 | vma_set_policy(tmp, pol); |
| 338 | if (anon_vma_fork(tmp, mpnt)) | ||
| 339 | goto fail_nomem_anon_vma_fork; | ||
| 337 | tmp->vm_flags &= ~VM_LOCKED; | 340 | tmp->vm_flags &= ~VM_LOCKED; |
| 338 | tmp->vm_mm = mm; | 341 | tmp->vm_mm = mm; |
| 339 | tmp->vm_next = NULL; | 342 | tmp->vm_next = NULL; |
| 340 | anon_vma_link(tmp); | ||
| 341 | file = tmp->vm_file; | 343 | file = tmp->vm_file; |
| 342 | if (file) { | 344 | if (file) { |
| 343 | struct inode *inode = file->f_path.dentry->d_inode; | 345 | struct inode *inode = file->f_path.dentry->d_inode; |
| @@ -392,6 +394,8 @@ out: | |||
| 392 | flush_tlb_mm(oldmm); | 394 | flush_tlb_mm(oldmm); |
| 393 | up_write(&oldmm->mmap_sem); | 395 | up_write(&oldmm->mmap_sem); |
| 394 | return retval; | 396 | return retval; |
| 397 | fail_nomem_anon_vma_fork: | ||
| 398 | mpol_put(pol); | ||
| 395 | fail_nomem_policy: | 399 | fail_nomem_policy: |
| 396 | kmem_cache_free(vm_area_cachep, tmp); | 400 | kmem_cache_free(vm_area_cachep, tmp); |
| 397 | fail_nomem: | 401 | fail_nomem: |
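The new fail_nomem_anon_vma_fork label follows the usual goto-unwind discipline: each label frees exactly what was live before the jump, so the anon_vma_fork() failure must release the policy first. A self-contained restatement of the idiom, with placeholder allocations:

#include <linux/slab.h>
#include <linux/errno.h>

static int foo_setup(void **a, void **b)
{
	*a = kmalloc(32, GFP_KERNEL);
	if (!*a)
		return -ENOMEM;		/* nothing to undo yet */
	*b = kmalloc(32, GFP_KERNEL);
	if (!*b)
		goto fail_b;		/* must free *a, and only *a */
	return 0;

fail_b:
	kfree(*a);
	*a = NULL;
	return -ENOMEM;
}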
| @@ -455,8 +459,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
| 455 | (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; | 459 | (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; |
| 456 | mm->core_state = NULL; | 460 | mm->core_state = NULL; |
| 457 | mm->nr_ptes = 0; | 461 | mm->nr_ptes = 0; |
| 458 | set_mm_counter(mm, file_rss, 0); | 462 | memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); |
| 459 | set_mm_counter(mm, anon_rss, 0); | ||
| 460 | spin_lock_init(&mm->page_table_lock); | 463 | spin_lock_init(&mm->page_table_lock); |
| 461 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 464 | mm->free_area_cache = TASK_UNMAPPED_BASE; |
| 462 | mm->cached_hole_size = ~0UL; | 465 | mm->cached_hole_size = ~0UL; |
| @@ -825,23 +828,14 @@ void __cleanup_sighand(struct sighand_struct *sighand) | |||
| 825 | */ | 828 | */ |
| 826 | static void posix_cpu_timers_init_group(struct signal_struct *sig) | 829 | static void posix_cpu_timers_init_group(struct signal_struct *sig) |
| 827 | { | 830 | { |
| 831 | unsigned long cpu_limit; | ||
| 832 | |||
| 828 | /* Thread group counters. */ | 833 | /* Thread group counters. */ |
| 829 | thread_group_cputime_init(sig); | 834 | thread_group_cputime_init(sig); |
| 830 | 835 | ||
| 831 | /* Expiration times and increments. */ | 836 | cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); |
| 832 | sig->it[CPUCLOCK_PROF].expires = cputime_zero; | 837 | if (cpu_limit != RLIM_INFINITY) { |
| 833 | sig->it[CPUCLOCK_PROF].incr = cputime_zero; | 838 | sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); |
| 834 | sig->it[CPUCLOCK_VIRT].expires = cputime_zero; | ||
| 835 | sig->it[CPUCLOCK_VIRT].incr = cputime_zero; | ||
| 836 | |||
| 837 | /* Cached expiration times. */ | ||
| 838 | sig->cputime_expires.prof_exp = cputime_zero; | ||
| 839 | sig->cputime_expires.virt_exp = cputime_zero; | ||
| 840 | sig->cputime_expires.sched_exp = 0; | ||
| 841 | |||
| 842 | if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { | ||
| 843 | sig->cputime_expires.prof_exp = | ||
| 844 | secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); | ||
| 845 | sig->cputimer.running = 1; | 839 | sig->cputimer.running = 1; |
| 846 | } | 840 | } |
| 847 | 841 | ||
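The ACCESS_ONCE() snapshot is the point of this rewrite: the limit is loaded once, so the RLIM_INFINITY test and the secs_to_cputime() conversion cannot see two different values if setrlimit() races in. A minimal sketch of the same idiom (illustrative helper, not from the patch):

static int over_cpu_limit(struct signal_struct *sig, unsigned long secs)
{
	/* one load: test and use agree even if rlim_cur changes under us */
	unsigned long limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);

	return limit != RLIM_INFINITY && secs >= limit;
}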
| @@ -858,7 +852,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
| 858 | if (clone_flags & CLONE_THREAD) | 852 | if (clone_flags & CLONE_THREAD) |
| 859 | return 0; | 853 | return 0; |
| 860 | 854 | ||
| 861 | sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); | 855 | sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); |
| 862 | tsk->signal = sig; | 856 | tsk->signal = sig; |
| 863 | if (!sig) | 857 | if (!sig) |
| 864 | return -ENOMEM; | 858 | return -ENOMEM; |
| @@ -866,46 +860,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
| 866 | atomic_set(&sig->count, 1); | 860 | atomic_set(&sig->count, 1); |
| 867 | atomic_set(&sig->live, 1); | 861 | atomic_set(&sig->live, 1); |
| 868 | init_waitqueue_head(&sig->wait_chldexit); | 862 | init_waitqueue_head(&sig->wait_chldexit); |
| 869 | sig->flags = 0; | ||
| 870 | if (clone_flags & CLONE_NEWPID) | 863 | if (clone_flags & CLONE_NEWPID) |
| 871 | sig->flags |= SIGNAL_UNKILLABLE; | 864 | sig->flags |= SIGNAL_UNKILLABLE; |
| 872 | sig->group_exit_code = 0; | ||
| 873 | sig->group_exit_task = NULL; | ||
| 874 | sig->group_stop_count = 0; | ||
| 875 | sig->curr_target = tsk; | 865 | sig->curr_target = tsk; |
| 876 | init_sigpending(&sig->shared_pending); | 866 | init_sigpending(&sig->shared_pending); |
| 877 | INIT_LIST_HEAD(&sig->posix_timers); | 867 | INIT_LIST_HEAD(&sig->posix_timers); |
| 878 | 868 | ||
| 879 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 869 | hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| 880 | sig->it_real_incr.tv64 = 0; | ||
| 881 | sig->real_timer.function = it_real_fn; | 870 | sig->real_timer.function = it_real_fn; |
| 882 | 871 | ||
| 883 | sig->leader = 0; /* session leadership doesn't inherit */ | ||
| 884 | sig->tty_old_pgrp = NULL; | ||
| 885 | sig->tty = NULL; | ||
| 886 | |||
| 887 | sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; | ||
| 888 | sig->gtime = cputime_zero; | ||
| 889 | sig->cgtime = cputime_zero; | ||
| 890 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 891 | sig->prev_utime = sig->prev_stime = cputime_zero; | ||
| 892 | #endif | ||
| 893 | sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; | ||
| 894 | sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; | ||
| 895 | sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; | ||
| 896 | sig->maxrss = sig->cmaxrss = 0; | ||
| 897 | task_io_accounting_init(&sig->ioac); | ||
| 898 | sig->sum_sched_runtime = 0; | ||
| 899 | taskstats_tgid_init(sig); | ||
| 900 | |||
| 901 | task_lock(current->group_leader); | 872 | task_lock(current->group_leader); |
| 902 | memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); | 873 | memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); |
| 903 | task_unlock(current->group_leader); | 874 | task_unlock(current->group_leader); |
| 904 | 875 | ||
| 905 | posix_cpu_timers_init_group(sig); | 876 | posix_cpu_timers_init_group(sig); |
| 906 | 877 | ||
| 907 | acct_init_pacct(&sig->pacct); | ||
| 908 | |||
| 909 | tty_audit_fork(sig); | 878 | tty_audit_fork(sig); |
| 910 | 879 | ||
| 911 | sig->oom_adj = current->signal->oom_adj; | 880 | sig->oom_adj = current->signal->oom_adj; |
| @@ -1034,7 +1003,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1034 | #endif | 1003 | #endif |
| 1035 | retval = -EAGAIN; | 1004 | retval = -EAGAIN; |
| 1036 | if (atomic_read(&p->real_cred->user->processes) >= | 1005 | if (atomic_read(&p->real_cred->user->processes) >= |
| 1037 | p->signal->rlim[RLIMIT_NPROC].rlim_cur) { | 1006 | task_rlimit(p, RLIMIT_NPROC)) { |
| 1038 | if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && | 1007 | if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && |
| 1039 | p->real_cred->user != INIT_USER) | 1008 | p->real_cred->user != INIT_USER) |
| 1040 | goto bad_fork_free; | 1009 | goto bad_fork_free; |
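task_rlimit() replaces the open-coded rlim_cur access here; as a hedged restatement from memory, the helper introduced alongside this series in <linux/sched.h> amounts to roughly:

static inline unsigned long task_rlimit(const struct task_struct *tsk,
					unsigned int limit)
{
	return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
}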
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ecc3fa28f666..42ec11b2af8a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -18,11 +18,7 @@ | |||
| 18 | 18 | ||
| 19 | #include "internals.h" | 19 | #include "internals.h" |
| 20 | 20 | ||
| 21 | /** | 21 | static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) |
| 22 | * dynamic_irq_init - initialize a dynamically allocated irq | ||
| 23 | * @irq: irq number to initialize | ||
| 24 | */ | ||
| 25 | void dynamic_irq_init(unsigned int irq) | ||
| 26 | { | 22 | { |
| 27 | struct irq_desc *desc; | 23 | struct irq_desc *desc; |
| 28 | unsigned long flags; | 24 | unsigned long flags; |
| @@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq) | |||
| 41 | desc->depth = 1; | 37 | desc->depth = 1; |
| 42 | desc->msi_desc = NULL; | 38 | desc->msi_desc = NULL; |
| 43 | desc->handler_data = NULL; | 39 | desc->handler_data = NULL; |
| 44 | desc->chip_data = NULL; | 40 | if (!keep_chip_data) |
| 41 | desc->chip_data = NULL; | ||
| 45 | desc->action = NULL; | 42 | desc->action = NULL; |
| 46 | desc->irq_count = 0; | 43 | desc->irq_count = 0; |
| 47 | desc->irqs_unhandled = 0; | 44 | desc->irqs_unhandled = 0; |
| @@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq) | |||
| 55 | } | 52 | } |
| 56 | 53 | ||
| 57 | /** | 54 | /** |
| 58 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | 55 | * dynamic_irq_init - initialize a dynamically allocated irq |
| 59 | * @irq: irq number to initialize | 56 | * @irq: irq number to initialize |
| 60 | */ | 57 | */ |
| 61 | void dynamic_irq_cleanup(unsigned int irq) | 58 | void dynamic_irq_init(unsigned int irq) |
| 59 | { | ||
| 60 | dynamic_irq_init_x(irq, false); | ||
| 61 | } | ||
| 62 | |||
| 63 | /** | ||
| 64 | * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq | ||
| 65 | * @irq: irq number to initialize | ||
| 66 | * | ||
| 67 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
| 68 | */ | ||
| 69 | void dynamic_irq_init_keep_chip_data(unsigned int irq) | ||
| 70 | { | ||
| 71 | dynamic_irq_init_x(irq, true); | ||
| 72 | } | ||
| 73 | |||
| 74 | static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) | ||
| 62 | { | 75 | { |
| 63 | struct irq_desc *desc = irq_to_desc(irq); | 76 | struct irq_desc *desc = irq_to_desc(irq); |
| 64 | unsigned long flags; | 77 | unsigned long flags; |
| @@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq) | |||
| 77 | } | 90 | } |
| 78 | desc->msi_desc = NULL; | 91 | desc->msi_desc = NULL; |
| 79 | desc->handler_data = NULL; | 92 | desc->handler_data = NULL; |
| 80 | desc->chip_data = NULL; | 93 | if (!keep_chip_data) |
| 94 | desc->chip_data = NULL; | ||
| 81 | desc->handle_irq = handle_bad_irq; | 95 | desc->handle_irq = handle_bad_irq; |
| 82 | desc->chip = &no_irq_chip; | 96 | desc->chip = &no_irq_chip; |
| 83 | desc->name = NULL; | 97 | desc->name = NULL; |
| @@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq) | |||
| 85 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 99 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 86 | } | 100 | } |
| 87 | 101 | ||
| 102 | /** | ||
| 103 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | ||
| 104 | * @irq: irq number to clean up | ||
| 105 | */ | ||
| 106 | void dynamic_irq_cleanup(unsigned int irq) | ||
| 107 | { | ||
| 108 | dynamic_irq_cleanup_x(irq, false); | ||
| 109 | } | ||
| 110 | |||
| 111 | /** | ||
| 112 | * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq | ||
| 113 | * @irq: irq number to clean up | ||
| 114 | * | ||
| 115 | * does not set irq_to_desc(irq)->chip_data to NULL | ||
| 116 | */ | ||
| 117 | void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) | ||
| 118 | { | ||
| 119 | dynamic_irq_cleanup_x(irq, true); | ||
| 120 | } | ||
| 121 | |||
| 88 | 122 | ||
| 89 | /** | 123 | /** |
| 90 | * set_irq_chip - set the irq chip for an irq | 124 | * set_irq_chip - set the irq chip for an irq |
| @@ -520,7 +554,7 @@ out: | |||
| 520 | * signal. The occurrence is latched into the irq controller hardware | 554 | * signal. The occurrence is latched into the irq controller hardware |
| 521 | * and must be acked in order to be reenabled. After the ack another | 555 | * and must be acked in order to be reenabled. After the ack another |
| 522 | * interrupt can happen on the same source even before the first one | 556 | * interrupt can happen on the same source even before the first one |
| 523 | * is handled by the assosiacted event handler. If this happens it | 557 | * is handled by the associated event handler. If this happens it |
| 524 | * might be necessary to disable (mask) the interrupt depending on the | 558 | * might be necessary to disable (mask) the interrupt depending on the |
| 525 | * controller hardware. This requires to reenable the interrupt inside | 559 | * controller hardware. This requires to reenable the interrupt inside |
| 526 | * of the loop which handles the interrupts which have arrived while | 560 | * of the loop which handles the interrupts which have arrived while |
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index d06df9c41cba..1ef4ffcdfa55 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c | |||
| @@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data) | |||
| 42 | * automatically freed on driver detach. | 42 | * automatically freed on driver detach. |
| 43 | * | 43 | * |
| 44 | * If an IRQ allocated with this function needs to be freed | 44 | * If an IRQ allocated with this function needs to be freed |
| 45 | * separately, dev_free_irq() must be used. | 45 | * separately, devm_free_irq() must be used. |
| 46 | */ | 46 | */ |
| 47 | int devm_request_threaded_irq(struct device *dev, unsigned int irq, | 47 | int devm_request_threaded_irq(struct device *dev, unsigned int irq, |
| 48 | irq_handler_t handler, irq_handler_t thread_fn, | 48 | irq_handler_t handler, irq_handler_t thread_fn, |
| @@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq); | |||
| 81 | * Except for the extra @dev argument, this function takes the | 81 | * Except for the extra @dev argument, this function takes the |
| 82 | * same arguments and performs the same function as free_irq(). | 82 | * same arguments and performs the same function as free_irq(). |
| 83 | * This function instead of free_irq() should be used to manually | 83 | * This function instead of free_irq() should be used to manually |
| 84 | * free IRQs allocated with dev_request_irq(). | 84 | * free IRQs allocated with devm_request_irq(). |
| 85 | */ | 85 | */ |
| 86 | void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) | 86 | void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) |
| 87 | { | 87 | { |
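The devres.c changes only correct the function names in the kernel-doc (dev_* should have read devm_*). For reference, a hypothetical probe using the managed API might look like the sketch below; the "mydev" name and handler are invented, and error handling is trimmed:

    #include <linux/interrupt.h>
    #include <linux/platform_device.h>

    static irqreturn_t mydev_isr(int irq, void *dev_id)
    {
            /* acknowledge the hardware here */
            return IRQ_HANDLED;
    }

    static int mydev_probe(struct platform_device *pdev)
    {
            int irq = platform_get_irq(pdev, 0);

            if (irq < 0)
                    return irq;
            /* Freed automatically on driver detach; call devm_free_irq()
             * only if the IRQ must be released earlier than that. */
            return devm_request_irq(&pdev->dev, irq, mydev_isr, 0,
                                    "mydev", NULL);
    }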
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 814940e7f485..76d5a671bfe1 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
| @@ -19,7 +19,7 @@ | |||
| 19 | #include <linux/kernel_stat.h> | 19 | #include <linux/kernel_stat.h> |
| 20 | #include <linux/rculist.h> | 20 | #include <linux/rculist.h> |
| 21 | #include <linux/hash.h> | 21 | #include <linux/hash.h> |
| 22 | #include <linux/bootmem.h> | 22 | #include <linux/radix-tree.h> |
| 23 | #include <trace/events/irq.h> | 23 | #include <trace/events/irq.h> |
| 24 | 24 | ||
| 25 | #include "internals.h" | 25 | #include "internals.h" |
| @@ -87,12 +87,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) | |||
| 87 | { | 87 | { |
| 88 | void *ptr; | 88 | void *ptr; |
| 89 | 89 | ||
| 90 | if (slab_is_available()) | 90 | ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), |
| 91 | ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), | 91 | GFP_ATOMIC, node); |
| 92 | GFP_ATOMIC, node); | ||
| 93 | else | ||
| 94 | ptr = alloc_bootmem_node(NODE_DATA(node), | ||
| 95 | nr * sizeof(*desc->kstat_irqs)); | ||
| 96 | 92 | ||
| 97 | /* | 93 | /* |
| 98 | * don't overwrite if we can not get a new one | 94 | * don't overwrite if we can not get a new one |
| @@ -132,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) | |||
| 132 | */ | 128 | */ |
| 133 | DEFINE_RAW_SPINLOCK(sparse_irq_lock); | 129 | DEFINE_RAW_SPINLOCK(sparse_irq_lock); |
| 134 | 130 | ||
| 135 | struct irq_desc **irq_desc_ptrs __read_mostly; | 131 | static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); |
| 132 | |||
| 133 | static void set_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
| 134 | { | ||
| 135 | radix_tree_insert(&irq_desc_tree, irq, desc); | ||
| 136 | } | ||
| 137 | |||
| 138 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
| 139 | { | ||
| 140 | return radix_tree_lookup(&irq_desc_tree, irq); | ||
| 141 | } | ||
| 142 | |||
| 143 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc) | ||
| 144 | { | ||
| 145 | void **ptr; | ||
| 146 | |||
| 147 | ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); | ||
| 148 | if (ptr) | ||
| 149 | radix_tree_replace_slot(ptr, desc); | ||
| 150 | } | ||
| 136 | 151 | ||
| 137 | static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { | 152 | static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { |
| 138 | [0 ... NR_IRQS_LEGACY-1] = { | 153 | [0 ... NR_IRQS_LEGACY-1] = { |
| @@ -164,9 +179,6 @@ int __init early_irq_init(void) | |||
| 164 | legacy_count = ARRAY_SIZE(irq_desc_legacy); | 179 | legacy_count = ARRAY_SIZE(irq_desc_legacy); |
| 165 | node = first_online_node; | 180 | node = first_online_node; |
| 166 | 181 | ||
| 167 | /* allocate irq_desc_ptrs array based on nr_irqs */ | ||
| 168 | irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); | ||
| 169 | |||
| 170 | /* allocate based on nr_cpu_ids */ | 182 | /* allocate based on nr_cpu_ids */ |
| 171 | kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * | 183 | kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * |
| 172 | sizeof(int), GFP_NOWAIT, node); | 184 | sizeof(int), GFP_NOWAIT, node); |
| @@ -180,23 +192,12 @@ int __init early_irq_init(void) | |||
| 180 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 192 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
| 181 | alloc_desc_masks(&desc[i], node, true); | 193 | alloc_desc_masks(&desc[i], node, true); |
| 182 | init_desc_masks(&desc[i]); | 194 | init_desc_masks(&desc[i]); |
| 183 | irq_desc_ptrs[i] = desc + i; | 195 | set_irq_desc(i, &desc[i]); |
| 184 | } | 196 | } |
| 185 | 197 | ||
| 186 | for (i = legacy_count; i < nr_irqs; i++) | ||
| 187 | irq_desc_ptrs[i] = NULL; | ||
| 188 | |||
| 189 | return arch_early_irq_init(); | 198 | return arch_early_irq_init(); |
| 190 | } | 199 | } |
| 191 | 200 | ||
| 192 | struct irq_desc *irq_to_desc(unsigned int irq) | ||
| 193 | { | ||
| 194 | if (irq_desc_ptrs && irq < nr_irqs) | ||
| 195 | return irq_desc_ptrs[irq]; | ||
| 196 | |||
| 197 | return NULL; | ||
| 198 | } | ||
| 199 | |||
| 200 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | 201 | struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) |
| 201 | { | 202 | { |
| 202 | struct irq_desc *desc; | 203 | struct irq_desc *desc; |
| @@ -208,21 +209,18 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | |||
| 208 | return NULL; | 209 | return NULL; |
| 209 | } | 210 | } |
| 210 | 211 | ||
| 211 | desc = irq_desc_ptrs[irq]; | 212 | desc = irq_to_desc(irq); |
| 212 | if (desc) | 213 | if (desc) |
| 213 | return desc; | 214 | return desc; |
| 214 | 215 | ||
| 215 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | 216 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); |
| 216 | 217 | ||
| 217 | /* We have to check it to avoid races with another CPU */ | 218 | /* We have to check it to avoid races with another CPU */ |
| 218 | desc = irq_desc_ptrs[irq]; | 219 | desc = irq_to_desc(irq); |
| 219 | if (desc) | 220 | if (desc) |
| 220 | goto out_unlock; | 221 | goto out_unlock; |
| 221 | 222 | ||
| 222 | if (slab_is_available()) | 223 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); |
| 223 | desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); | ||
| 224 | else | ||
| 225 | desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); | ||
| 226 | 224 | ||
| 227 | printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); | 225 | printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); |
| 228 | if (!desc) { | 226 | if (!desc) { |
| @@ -231,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) | |||
| 231 | } | 229 | } |
| 232 | init_one_irq_desc(irq, desc, node); | 230 | init_one_irq_desc(irq, desc, node); |
| 233 | 231 | ||
| 234 | irq_desc_ptrs[irq] = desc; | 232 | set_irq_desc(irq, desc); |
| 235 | 233 | ||
| 236 | out_unlock: | 234 | out_unlock: |
| 237 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | 235 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); |
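The handle.c conversion swaps the boot-allocated irq_desc_ptrs array for a radix tree, so lookups no longer need a nr_irqs bounds check and the mapping can grow on demand. Stripped of the sparse_irq_lock, the three primitives used above behave as in this sketch (note that, as in the patch itself, radix_tree_insert()'s -ENOMEM return is ignored):

    #include <linux/radix-tree.h>
    #include <linux/irq.h>

    static RADIX_TREE(demo_tree, GFP_ATOMIC);       /* statically initialized, empty */

    static void demo(unsigned int irq, struct irq_desc *desc,
                     struct irq_desc *new_desc)
    {
            void **slot;

            radix_tree_insert(&demo_tree, irq, desc);   /* may fail with -ENOMEM */
            desc = radix_tree_lookup(&demo_tree, irq);  /* NULL when nothing stored */

            slot = radix_tree_lookup_slot(&demo_tree, irq);
            if (slot)                                   /* in-place replacement */
                    radix_tree_replace_slot(slot, new_desc);
    }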
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b2821f070a3d..c63f3bc88f0b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc); | |||
| 21 | extern raw_spinlock_t sparse_irq_lock; | 21 | extern raw_spinlock_t sparse_irq_lock; |
| 22 | 22 | ||
| 23 | #ifdef CONFIG_SPARSE_IRQ | 23 | #ifdef CONFIG_SPARSE_IRQ |
| 24 | /* irq_desc_ptrs allocated at boot time */ | 24 | void replace_irq_desc(unsigned int irq, struct irq_desc *desc); |
| 25 | extern struct irq_desc **irq_desc_ptrs; | ||
| 26 | #else | ||
| 27 | /* irq_desc_ptrs is a fixed size array */ | ||
| 28 | extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; | ||
| 29 | #endif | 25 | #endif |
| 30 | 26 | ||
| 31 | #ifdef CONFIG_PROC_FS | 27 | #ifdef CONFIG_PROC_FS |
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 26bac9d8f860..963559dbd858 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c | |||
| @@ -70,7 +70,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | |||
| 70 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); | 70 | raw_spin_lock_irqsave(&sparse_irq_lock, flags); |
| 71 | 71 | ||
| 72 | /* We have to check it to avoid races with another CPU */ | 72 | /* We have to check it to avoid races with another CPU */ |
| 73 | desc = irq_desc_ptrs[irq]; | 73 | desc = irq_to_desc(irq); |
| 74 | 74 | ||
| 75 | if (desc && old_desc != desc) | 75 | if (desc && old_desc != desc) |
| 76 | goto out_unlock; | 76 | goto out_unlock; |
| @@ -90,7 +90,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, | |||
| 90 | goto out_unlock; | 90 | goto out_unlock; |
| 91 | } | 91 | } |
| 92 | 92 | ||
| 93 | irq_desc_ptrs[irq] = desc; | 93 | replace_irq_desc(irq, desc); |
| 94 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); | 94 | raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); |
| 95 | 95 | ||
| 96 | /* free the old one */ | 96 | /* free the old one */ |
diff --git a/kernel/kexec.c b/kernel/kexec.c index ef077fb73155..87ebe8adc474 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -41,7 +41,7 @@ | |||
| 41 | #include <asm/sections.h> | 41 | #include <asm/sections.h> |
| 42 | 42 | ||
| 43 | /* Per cpu memory for storing cpu states in case of system crash. */ | 43 | /* Per cpu memory for storing cpu states in case of system crash. */ |
| 44 | note_buf_t* crash_notes; | 44 | note_buf_t __percpu *crash_notes; |
| 45 | 45 | ||
| 46 | /* vmcoreinfo stuff */ | 46 | /* vmcoreinfo stuff */ |
| 47 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; | 47 | static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; |
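The kexec.c line adds only the __percpu address-space annotation for sparse; the pointer itself still comes from the per-cpu allocator. A generic sketch of how such a pointer is allocated and dereferenced (demo_buf_t is a stand-in, not the real note_buf_t):

    #include <linux/percpu.h>
    #include <linux/types.h>
    #include <linux/errno.h>

    typedef u32 demo_buf_t[64];             /* stand-in for note_buf_t */

    static demo_buf_t __percpu *demo_notes;

    static int demo_setup(void)
    {
            int cpu;

            demo_notes = alloc_percpu(demo_buf_t);
            if (!demo_notes)
                    return -ENOMEM;
            for_each_possible_cpu(cpu) {
                    u32 *buf = (u32 *)per_cpu_ptr(demo_notes, cpu);
                    buf[0] = 0;             /* touch this CPU's copy */
            }
            return 0;
    }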
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ccec774c716d..fa034d29cf73 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -42,9 +42,11 @@ | |||
| 42 | #include <linux/freezer.h> | 42 | #include <linux/freezer.h> |
| 43 | #include <linux/seq_file.h> | 43 | #include <linux/seq_file.h> |
| 44 | #include <linux/debugfs.h> | 44 | #include <linux/debugfs.h> |
| 45 | #include <linux/sysctl.h> | ||
| 45 | #include <linux/kdebug.h> | 46 | #include <linux/kdebug.h> |
| 46 | #include <linux/memory.h> | 47 | #include <linux/memory.h> |
| 47 | #include <linux/ftrace.h> | 48 | #include <linux/ftrace.h> |
| 49 | #include <linux/cpu.h> | ||
| 48 | 50 | ||
| 49 | #include <asm-generic/sections.h> | 51 | #include <asm-generic/sections.h> |
| 50 | #include <asm/cacheflush.h> | 52 | #include <asm/cacheflush.h> |
| @@ -105,57 +107,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { | |||
| 105 | * stepping on the instruction on a vmalloced/kmalloced/data page | 107 | * stepping on the instruction on a vmalloced/kmalloced/data page |
| 106 | * is a recipe for disaster | 108 | * is a recipe for disaster |
| 107 | */ | 109 | */ |
| 108 | #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) | ||
| 109 | |||
| 110 | struct kprobe_insn_page { | 110 | struct kprobe_insn_page { |
| 111 | struct list_head list; | 111 | struct list_head list; |
| 112 | kprobe_opcode_t *insns; /* Page of instruction slots */ | 112 | kprobe_opcode_t *insns; /* Page of instruction slots */ |
| 113 | char slot_used[INSNS_PER_PAGE]; | ||
| 114 | int nused; | 113 | int nused; |
| 115 | int ngarbage; | 114 | int ngarbage; |
| 115 | char slot_used[]; | ||
| 116 | }; | ||
| 117 | |||
| 118 | #define KPROBE_INSN_PAGE_SIZE(slots) \ | ||
| 119 | (offsetof(struct kprobe_insn_page, slot_used) + \ | ||
| 120 | (sizeof(char) * (slots))) | ||
| 121 | |||
| 122 | struct kprobe_insn_cache { | ||
| 123 | struct list_head pages; /* list of kprobe_insn_page */ | ||
| 124 | size_t insn_size; /* size of instruction slot */ | ||
| 125 | int nr_garbage; | ||
| 116 | }; | 126 | }; |
| 117 | 127 | ||
| 128 | static int slots_per_page(struct kprobe_insn_cache *c) | ||
| 129 | { | ||
| 130 | return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); | ||
| 131 | } | ||
| 132 | |||
| 118 | enum kprobe_slot_state { | 133 | enum kprobe_slot_state { |
| 119 | SLOT_CLEAN = 0, | 134 | SLOT_CLEAN = 0, |
| 120 | SLOT_DIRTY = 1, | 135 | SLOT_DIRTY = 1, |
| 121 | SLOT_USED = 2, | 136 | SLOT_USED = 2, |
| 122 | }; | 137 | }; |
| 123 | 138 | ||
| 124 | static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ | 139 | static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ |
| 125 | static LIST_HEAD(kprobe_insn_pages); | 140 | static struct kprobe_insn_cache kprobe_insn_slots = { |
| 126 | static int kprobe_garbage_slots; | 141 | .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), |
| 127 | static int collect_garbage_slots(void); | 142 | .insn_size = MAX_INSN_SIZE, |
| 143 | .nr_garbage = 0, | ||
| 144 | }; | ||
| 145 | static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); | ||
| 128 | 146 | ||
| 129 | /** | 147 | /** |
| 130 | * __get_insn_slot() - Find a slot on an executable page for an instruction. | 148 | * __get_insn_slot() - Find a slot on an executable page for an instruction. |
| 131 | * We allocate an executable page if there's no room on existing ones. | 149 | * We allocate an executable page if there's no room on existing ones. |
| 132 | */ | 150 | */ |
| 133 | static kprobe_opcode_t __kprobes *__get_insn_slot(void) | 151 | static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) |
| 134 | { | 152 | { |
| 135 | struct kprobe_insn_page *kip; | 153 | struct kprobe_insn_page *kip; |
| 136 | 154 | ||
| 137 | retry: | 155 | retry: |
| 138 | list_for_each_entry(kip, &kprobe_insn_pages, list) { | 156 | list_for_each_entry(kip, &c->pages, list) { |
| 139 | if (kip->nused < INSNS_PER_PAGE) { | 157 | if (kip->nused < slots_per_page(c)) { |
| 140 | int i; | 158 | int i; |
| 141 | for (i = 0; i < INSNS_PER_PAGE; i++) { | 159 | for (i = 0; i < slots_per_page(c); i++) { |
| 142 | if (kip->slot_used[i] == SLOT_CLEAN) { | 160 | if (kip->slot_used[i] == SLOT_CLEAN) { |
| 143 | kip->slot_used[i] = SLOT_USED; | 161 | kip->slot_used[i] = SLOT_USED; |
| 144 | kip->nused++; | 162 | kip->nused++; |
| 145 | return kip->insns + (i * MAX_INSN_SIZE); | 163 | return kip->insns + (i * c->insn_size); |
| 146 | } | 164 | } |
| 147 | } | 165 | } |
| 148 | /* Surprise! No unused slots. Fix kip->nused. */ | 166 | /* kip->nused is broken. Fix it. */ |
| 149 | kip->nused = INSNS_PER_PAGE; | 167 | kip->nused = slots_per_page(c); |
| 168 | WARN_ON(1); | ||
| 150 | } | 169 | } |
| 151 | } | 170 | } |
| 152 | 171 | ||
| 153 | /* If there are any garbage slots, collect them and try again. */ | 172 | /* If there are any garbage slots, collect them and try again. */ |
| 154 | if (kprobe_garbage_slots && collect_garbage_slots() == 0) { | 173 | if (c->nr_garbage && collect_garbage_slots(c) == 0) |
| 155 | goto retry; | 174 | goto retry; |
| 156 | } | 175 | |
| 157 | /* All out of space. Need to allocate a new page. Use slot 0. */ | 176 | /* All out of space. Need to allocate a new page. */ |
| 158 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); | 177 | kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); |
| 159 | if (!kip) | 178 | if (!kip) |
| 160 | return NULL; | 179 | return NULL; |
| 161 | 180 | ||
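The slot_used[] change above is the standard C flexible-array-member idiom: the per-page bookkeeping array moves to the end of the struct, and the allocation is sized with offsetof(), exactly as KPROBE_INSN_PAGE_SIZE() does. A freestanding sketch of the idiom:

    #include <stddef.h>
    #include <stdlib.h>
    #include <string.h>

    struct page_hdr {
            int nused;
            char slot_used[];               /* flexible array member */
    };

    static struct page_hdr *alloc_page_hdr(int slots)
    {
            /* one allocation covers the header plus 'slots' status bytes */
            struct page_hdr *p =
                    malloc(offsetof(struct page_hdr, slot_used) + slots);

            if (p) {
                    p->nused = 0;
                    memset(p->slot_used, 0, slots);
            }
            return p;
    }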
| @@ -170,20 +189,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void) | |||
| 170 | return NULL; | 189 | return NULL; |
| 171 | } | 190 | } |
| 172 | INIT_LIST_HEAD(&kip->list); | 191 | INIT_LIST_HEAD(&kip->list); |
| 173 | list_add(&kip->list, &kprobe_insn_pages); | 192 | memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); |
| 174 | memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); | ||
| 175 | kip->slot_used[0] = SLOT_USED; | 193 | kip->slot_used[0] = SLOT_USED; |
| 176 | kip->nused = 1; | 194 | kip->nused = 1; |
| 177 | kip->ngarbage = 0; | 195 | kip->ngarbage = 0; |
| 196 | list_add(&kip->list, &c->pages); | ||
| 178 | return kip->insns; | 197 | return kip->insns; |
| 179 | } | 198 | } |
| 180 | 199 | ||
| 200 | |||
| 181 | kprobe_opcode_t __kprobes *get_insn_slot(void) | 201 | kprobe_opcode_t __kprobes *get_insn_slot(void) |
| 182 | { | 202 | { |
| 183 | kprobe_opcode_t *ret; | 203 | kprobe_opcode_t *ret = NULL; |
| 204 | |||
| 184 | mutex_lock(&kprobe_insn_mutex); | 205 | mutex_lock(&kprobe_insn_mutex); |
| 185 | ret = __get_insn_slot(); | 206 | ret = __get_insn_slot(&kprobe_insn_slots); |
| 186 | mutex_unlock(&kprobe_insn_mutex); | 207 | mutex_unlock(&kprobe_insn_mutex); |
| 208 | |||
| 187 | return ret; | 209 | return ret; |
| 188 | } | 210 | } |
| 189 | 211 | ||
| @@ -199,7 +221,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | |||
| 199 | * so as not to have to set it up again the | 221 | * so as not to have to set it up again the |
| 200 | * next time somebody inserts a probe. | 222 | * next time somebody inserts a probe. |
| 201 | */ | 223 | */ |
| 202 | if (!list_is_singular(&kprobe_insn_pages)) { | 224 | if (!list_is_singular(&kip->list)) { |
| 203 | list_del(&kip->list); | 225 | list_del(&kip->list); |
| 204 | module_free(NULL, kip->insns); | 226 | module_free(NULL, kip->insns); |
| 205 | kfree(kip); | 227 | kfree(kip); |
| @@ -209,51 +231,84 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | |||
| 209 | return 0; | 231 | return 0; |
| 210 | } | 232 | } |
| 211 | 233 | ||
| 212 | static int __kprobes collect_garbage_slots(void) | 234 | static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) |
| 213 | { | 235 | { |
| 214 | struct kprobe_insn_page *kip, *next; | 236 | struct kprobe_insn_page *kip, *next; |
| 215 | 237 | ||
| 216 | /* Ensure no one is still executing in the garbage slots */ | 238 | /* Ensure no one is still executing in the garbage slots */ |
| 217 | synchronize_sched(); | 239 | synchronize_sched(); |
| 218 | 240 | ||
| 219 | list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { | 241 | list_for_each_entry_safe(kip, next, &c->pages, list) { |
| 220 | int i; | 242 | int i; |
| 221 | if (kip->ngarbage == 0) | 243 | if (kip->ngarbage == 0) |
| 222 | continue; | 244 | continue; |
| 223 | kip->ngarbage = 0; /* we will collect all the garbage */ | 245 | kip->ngarbage = 0; /* we will collect all the garbage */ |
| 224 | for (i = 0; i < INSNS_PER_PAGE; i++) { | 246 | for (i = 0; i < slots_per_page(c); i++) { |
| 225 | if (kip->slot_used[i] == SLOT_DIRTY && | 247 | if (kip->slot_used[i] == SLOT_DIRTY && |
| 226 | collect_one_slot(kip, i)) | 248 | collect_one_slot(kip, i)) |
| 227 | break; | 249 | break; |
| 228 | } | 250 | } |
| 229 | } | 251 | } |
| 230 | kprobe_garbage_slots = 0; | 252 | c->nr_garbage = 0; |
| 231 | return 0; | 253 | return 0; |
| 232 | } | 254 | } |
| 233 | 255 | ||
| 234 | void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) | 256 | static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, |
| 257 | kprobe_opcode_t *slot, int dirty) | ||
| 235 | { | 258 | { |
| 236 | struct kprobe_insn_page *kip; | 259 | struct kprobe_insn_page *kip; |
| 237 | 260 | ||
| 238 | mutex_lock(&kprobe_insn_mutex); | 261 | list_for_each_entry(kip, &c->pages, list) { |
| 239 | list_for_each_entry(kip, &kprobe_insn_pages, list) { | 262 | long idx = ((long)slot - (long)kip->insns) / c->insn_size; |
| 240 | if (kip->insns <= slot && | 263 | if (idx >= 0 && idx < slots_per_page(c)) { |
| 241 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { | 264 | WARN_ON(kip->slot_used[idx] != SLOT_USED); |
| 242 | int i = (slot - kip->insns) / MAX_INSN_SIZE; | ||
| 243 | if (dirty) { | 265 | if (dirty) { |
| 244 | kip->slot_used[i] = SLOT_DIRTY; | 266 | kip->slot_used[idx] = SLOT_DIRTY; |
| 245 | kip->ngarbage++; | 267 | kip->ngarbage++; |
| 268 | if (++c->nr_garbage > slots_per_page(c)) | ||
| 269 | collect_garbage_slots(c); | ||
| 246 | } else | 270 | } else |
| 247 | collect_one_slot(kip, i); | 271 | collect_one_slot(kip, idx); |
| 248 | break; | 272 | return; |
| 249 | } | 273 | } |
| 250 | } | 274 | } |
| 275 | /* Could not free this slot. */ | ||
| 276 | WARN_ON(1); | ||
| 277 | } | ||
| 251 | 278 | ||
| 252 | if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) | 279 | void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) |
| 253 | collect_garbage_slots(); | 280 | { |
| 254 | 281 | mutex_lock(&kprobe_insn_mutex); | |
| 282 | __free_insn_slot(&kprobe_insn_slots, slot, dirty); | ||
| 255 | mutex_unlock(&kprobe_insn_mutex); | 283 | mutex_unlock(&kprobe_insn_mutex); |
| 256 | } | 284 | } |
| 285 | #ifdef CONFIG_OPTPROBES | ||
| 286 | /* For optimized_kprobe buffer */ | ||
| 287 | static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ | ||
| 288 | static struct kprobe_insn_cache kprobe_optinsn_slots = { | ||
| 289 | .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), | ||
| 290 | /* .insn_size is initialized later */ | ||
| 291 | .nr_garbage = 0, | ||
| 292 | }; | ||
| 293 | /* Get a slot for optimized_kprobe buffer */ | ||
| 294 | kprobe_opcode_t __kprobes *get_optinsn_slot(void) | ||
| 295 | { | ||
| 296 | kprobe_opcode_t *ret = NULL; | ||
| 297 | |||
| 298 | mutex_lock(&kprobe_optinsn_mutex); | ||
| 299 | ret = __get_insn_slot(&kprobe_optinsn_slots); | ||
| 300 | mutex_unlock(&kprobe_optinsn_mutex); | ||
| 301 | |||
| 302 | return ret; | ||
| 303 | } | ||
| 304 | |||
| 305 | void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) | ||
| 306 | { | ||
| 307 | mutex_lock(&kprobe_optinsn_mutex); | ||
| 308 | __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); | ||
| 309 | mutex_unlock(&kprobe_optinsn_mutex); | ||
| 310 | } | ||
| 311 | #endif | ||
| 257 | #endif | 312 | #endif |
| 258 | 313 | ||
| 259 | /* We have preemption disabled, so it is safe to use __ versions */ | 314 | /* We have preemption disabled, so it is safe to use __ versions */ |
| @@ -284,23 +339,401 @@ struct kprobe __kprobes *get_kprobe(void *addr) | |||
| 284 | if (p->addr == addr) | 339 | if (p->addr == addr) |
| 285 | return p; | 340 | return p; |
| 286 | } | 341 | } |
| 342 | |||
| 343 | return NULL; | ||
| 344 | } | ||
| 345 | |||
| 346 | static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); | ||
| 347 | |||
| 348 | /* Return true if the kprobe is an aggregator */ | ||
| 349 | static inline int kprobe_aggrprobe(struct kprobe *p) | ||
| 350 | { | ||
| 351 | return p->pre_handler == aggr_pre_handler; | ||
| 352 | } | ||
| 353 | |||
| 354 | /* | ||
| 355 | * Keep all fields in the kprobe consistent | ||
| 356 | */ | ||
| 357 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | ||
| 358 | { | ||
| 359 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | ||
| 360 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | ||
| 361 | } | ||
| 362 | |||
| 363 | #ifdef CONFIG_OPTPROBES | ||
| 364 | /* NOTE: change this value only with kprobe_mutex held */ | ||
| 365 | static bool kprobes_allow_optimization; | ||
| 366 | |||
| 367 | /* | ||
| 368 | * Call all pre_handlers on the list, but ignore their return values. | ||
| 369 | * This must be called from the arch-dependent optimized caller. | ||
| 370 | */ | ||
| 371 | void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) | ||
| 372 | { | ||
| 373 | struct kprobe *kp; | ||
| 374 | |||
| 375 | list_for_each_entry_rcu(kp, &p->list, list) { | ||
| 376 | if (kp->pre_handler && likely(!kprobe_disabled(kp))) { | ||
| 377 | set_kprobe_instance(kp); | ||
| 378 | kp->pre_handler(kp, regs); | ||
| 379 | } | ||
| 380 | reset_kprobe_instance(); | ||
| 381 | } | ||
| 382 | } | ||
| 383 | |||
| 384 | /* Return true(!0) if the kprobe is ready for optimization. */ | ||
| 385 | static inline int kprobe_optready(struct kprobe *p) | ||
| 386 | { | ||
| 387 | struct optimized_kprobe *op; | ||
| 388 | |||
| 389 | if (kprobe_aggrprobe(p)) { | ||
| 390 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 391 | return arch_prepared_optinsn(&op->optinsn); | ||
| 392 | } | ||
| 393 | |||
| 394 | return 0; | ||
| 395 | } | ||
| 396 | |||
| 397 | /* | ||
| 398 | * Return an optimized kprobe whose optimizing code replaces | ||
| 399 | * instructions including addr (excluding the breakpoint itself). | ||
| 400 | */ | ||
| 401 | struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) | ||
| 402 | { | ||
| 403 | int i; | ||
| 404 | struct kprobe *p = NULL; | ||
| 405 | struct optimized_kprobe *op; | ||
| 406 | |||
| 407 | /* Don't check i == 0, since that is a breakpoint case. */ | ||
| 408 | for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++) | ||
| 409 | p = get_kprobe((void *)(addr - i)); | ||
| 410 | |||
| 411 | if (p && kprobe_optready(p)) { | ||
| 412 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 413 | if (arch_within_optimized_kprobe(op, addr)) | ||
| 414 | return p; | ||
| 415 | } | ||
| 416 | |||
| 287 | return NULL; | 417 | return NULL; |
| 288 | } | 418 | } |
| 289 | 419 | ||
| 420 | /* Optimization staging list, protected by kprobe_mutex */ | ||
| 421 | static LIST_HEAD(optimizing_list); | ||
| 422 | |||
| 423 | static void kprobe_optimizer(struct work_struct *work); | ||
| 424 | static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | ||
| 425 | #define OPTIMIZE_DELAY 5 | ||
| 426 | |||
| 427 | /* Kprobe jump optimizer */ | ||
| 428 | static __kprobes void kprobe_optimizer(struct work_struct *work) | ||
| 429 | { | ||
| 430 | struct optimized_kprobe *op, *tmp; | ||
| 431 | |||
| 432 | /* Lock modules while optimizing kprobes */ | ||
| 433 | mutex_lock(&module_mutex); | ||
| 434 | mutex_lock(&kprobe_mutex); | ||
| 435 | if (kprobes_all_disarmed || !kprobes_allow_optimization) | ||
| 436 | goto end; | ||
| 437 | |||
| 438 | /* | ||
| 439 | * Wait for a quiescence period to ensure all running interrupts | ||
| 440 | * are done. Because an optprobe may modify multiple instructions, | ||
| 441 | * there is a chance that the Nth instruction is interrupted. In that | ||
| 442 | * case, the interrupted context could return into the 2nd-Nth byte of | ||
| 443 | * the jump instruction. This wait avoids that. | ||
| 444 | */ | ||
| 445 | synchronize_sched(); | ||
| 446 | |||
| 447 | /* | ||
| 448 | * The optimization/unoptimization refers to online_cpus via | ||
| 449 | * stop_machine(), and cpu-hotplug modifies online_cpus. At the | ||
| 450 | * same time, text_mutex is held both in cpu-hotplug and here. | ||
| 451 | * This combination can cause a deadlock (cpu-hotplug tries to lock | ||
| 452 | * text_mutex, but stop_machine() can not proceed because | ||
| 453 | * online_cpus has been changed). | ||
| 454 | * To avoid this deadlock, we call get_online_cpus(), which keeps | ||
| 455 | * cpu-hotplug from running while text_mutex is held. | ||
| 456 | */ | ||
| 457 | get_online_cpus(); | ||
| 458 | mutex_lock(&text_mutex); | ||
| 459 | list_for_each_entry_safe(op, tmp, &optimizing_list, list) { | ||
| 460 | WARN_ON(kprobe_disabled(&op->kp)); | ||
| 461 | if (arch_optimize_kprobe(op) < 0) | ||
| 462 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
| 463 | list_del_init(&op->list); | ||
| 464 | } | ||
| 465 | mutex_unlock(&text_mutex); | ||
| 466 | put_online_cpus(); | ||
| 467 | end: | ||
| 468 | mutex_unlock(&kprobe_mutex); | ||
| 469 | mutex_unlock(&module_mutex); | ||
| 470 | } | ||
| 471 | |||
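kprobe_optimizer() above runs from a single delayed work item: optimize_kprobe() queues probes on optimizing_list and only schedules the work when it is not already pending, so a burst of registrations is batched into one pass after OPTIMIZE_DELAY jiffies. The bare mechanism, as a generic sketch:

    #include <linux/workqueue.h>

    static void batch_worker(struct work_struct *work);
    static DECLARE_DELAYED_WORK(batch_work, batch_worker);

    static void batch_worker(struct work_struct *work)
    {
            /* drain everything queued since the last run */
    }

    static void kick_batch(void)
    {
            if (!delayed_work_pending(&batch_work))
                    schedule_delayed_work(&batch_work, 5); /* delay in jiffies */
    }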
| 472 | /* Optimize kprobe if p is ready to be optimized */ | ||
| 473 | static __kprobes void optimize_kprobe(struct kprobe *p) | ||
| 474 | { | ||
| 475 | struct optimized_kprobe *op; | ||
| 476 | |||
| 477 | /* Check if the kprobe is disabled or not ready for optimization. */ | ||
| 478 | if (!kprobe_optready(p) || !kprobes_allow_optimization || | ||
| 479 | (kprobe_disabled(p) || kprobes_all_disarmed)) | ||
| 480 | return; | ||
| 481 | |||
| 482 | /* Neither break_handler nor post_handler is supported. */ | ||
| 483 | if (p->break_handler || p->post_handler) | ||
| 484 | return; | ||
| 485 | |||
| 486 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 487 | |||
| 488 | /* Check that there are no other kprobes at the optimized instructions */ | ||
| 489 | if (arch_check_optimized_kprobe(op) < 0) | ||
| 490 | return; | ||
| 491 | |||
| 492 | /* Check if it is already optimized. */ | ||
| 493 | if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) | ||
| 494 | return; | ||
| 495 | |||
| 496 | op->kp.flags |= KPROBE_FLAG_OPTIMIZED; | ||
| 497 | list_add(&op->list, &optimizing_list); | ||
| 498 | if (!delayed_work_pending(&optimizing_work)) | ||
| 499 | schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); | ||
| 500 | } | ||
| 501 | |||
| 502 | /* Unoptimize a kprobe if p is optimized */ | ||
| 503 | static __kprobes void unoptimize_kprobe(struct kprobe *p) | ||
| 504 | { | ||
| 505 | struct optimized_kprobe *op; | ||
| 506 | |||
| 507 | if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { | ||
| 508 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 509 | if (!list_empty(&op->list)) | ||
| 510 | /* Dequeue from the optimization queue */ | ||
| 511 | list_del_init(&op->list); | ||
| 512 | else | ||
| 513 | /* Replace jump with break */ | ||
| 514 | arch_unoptimize_kprobe(op); | ||
| 515 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
| 516 | } | ||
| 517 | } | ||
| 518 | |||
| 519 | /* Remove optimized instructions */ | ||
| 520 | static void __kprobes kill_optimized_kprobe(struct kprobe *p) | ||
| 521 | { | ||
| 522 | struct optimized_kprobe *op; | ||
| 523 | |||
| 524 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 525 | if (!list_empty(&op->list)) { | ||
| 526 | /* Dequeue from the optimization queue */ | ||
| 527 | list_del_init(&op->list); | ||
| 528 | op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; | ||
| 529 | } | ||
| 530 | /* Don't unoptimize, because the target code will be freed. */ | ||
| 531 | arch_remove_optimized_kprobe(op); | ||
| 532 | } | ||
| 533 | |||
| 534 | /* Try to prepare optimized instructions */ | ||
| 535 | static __kprobes void prepare_optimized_kprobe(struct kprobe *p) | ||
| 536 | { | ||
| 537 | struct optimized_kprobe *op; | ||
| 538 | |||
| 539 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 540 | arch_prepare_optimized_kprobe(op); | ||
| 541 | } | ||
| 542 | |||
| 543 | /* Free optimized instructions and optimized_kprobe */ | ||
| 544 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
| 545 | { | ||
| 546 | struct optimized_kprobe *op; | ||
| 547 | |||
| 548 | op = container_of(p, struct optimized_kprobe, kp); | ||
| 549 | arch_remove_optimized_kprobe(op); | ||
| 550 | kfree(op); | ||
| 551 | } | ||
| 552 | |||
| 553 | /* Allocate new optimized_kprobe and try to prepare optimized instructions */ | ||
| 554 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | ||
| 555 | { | ||
| 556 | struct optimized_kprobe *op; | ||
| 557 | |||
| 558 | op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL); | ||
| 559 | if (!op) | ||
| 560 | return NULL; | ||
| 561 | |||
| 562 | INIT_LIST_HEAD(&op->list); | ||
| 563 | op->kp.addr = p->addr; | ||
| 564 | arch_prepare_optimized_kprobe(op); | ||
| 565 | |||
| 566 | return &op->kp; | ||
| 567 | } | ||
| 568 | |||
| 569 | static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); | ||
| 570 | |||
| 571 | /* | ||
| 572 | * Prepare an optimized_kprobe and optimize it | ||
| 573 | * NOTE: p must be a normal registered kprobe | ||
| 574 | */ | ||
| 575 | static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | ||
| 576 | { | ||
| 577 | struct kprobe *ap; | ||
| 578 | struct optimized_kprobe *op; | ||
| 579 | |||
| 580 | ap = alloc_aggr_kprobe(p); | ||
| 581 | if (!ap) | ||
| 582 | return; | ||
| 583 | |||
| 584 | op = container_of(ap, struct optimized_kprobe, kp); | ||
| 585 | if (!arch_prepared_optinsn(&op->optinsn)) { | ||
| 586 | /* If optimization setup failed, fall back to a normal kprobe */ | ||
| 587 | free_aggr_kprobe(ap); | ||
| 588 | return; | ||
| 589 | } | ||
| 590 | |||
| 591 | init_aggr_kprobe(ap, p); | ||
| 592 | optimize_kprobe(ap); | ||
| 593 | } | ||
| 594 | |||
| 595 | #ifdef CONFIG_SYSCTL | ||
| 596 | static void __kprobes optimize_all_kprobes(void) | ||
| 597 | { | ||
| 598 | struct hlist_head *head; | ||
| 599 | struct hlist_node *node; | ||
| 600 | struct kprobe *p; | ||
| 601 | unsigned int i; | ||
| 602 | |||
| 603 | /* If optimization is already allowed, just return */ | ||
| 604 | if (kprobes_allow_optimization) | ||
| 605 | return; | ||
| 606 | |||
| 607 | kprobes_allow_optimization = true; | ||
| 608 | mutex_lock(&text_mutex); | ||
| 609 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | ||
| 610 | head = &kprobe_table[i]; | ||
| 611 | hlist_for_each_entry_rcu(p, node, head, hlist) | ||
| 612 | if (!kprobe_disabled(p)) | ||
| 613 | optimize_kprobe(p); | ||
| 614 | } | ||
| 615 | mutex_unlock(&text_mutex); | ||
| 616 | printk(KERN_INFO "Kprobes globally optimized\n"); | ||
| 617 | } | ||
| 618 | |||
| 619 | static void __kprobes unoptimize_all_kprobes(void) | ||
| 620 | { | ||
| 621 | struct hlist_head *head; | ||
| 622 | struct hlist_node *node; | ||
| 623 | struct kprobe *p; | ||
| 624 | unsigned int i; | ||
| 625 | |||
| 626 | /* If optimization is already prohibited, just return */ | ||
| 627 | if (!kprobes_allow_optimization) | ||
| 628 | return; | ||
| 629 | |||
| 630 | kprobes_allow_optimization = false; | ||
| 631 | printk(KERN_INFO "Kprobes globally unoptimized\n"); | ||
| 632 | get_online_cpus(); /* To avoid a text_mutex deadlock */ | ||
| 633 | mutex_lock(&text_mutex); | ||
| 634 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | ||
| 635 | head = &kprobe_table[i]; | ||
| 636 | hlist_for_each_entry_rcu(p, node, head, hlist) { | ||
| 637 | if (!kprobe_disabled(p)) | ||
| 638 | unoptimize_kprobe(p); | ||
| 639 | } | ||
| 640 | } | ||
| 641 | |||
| 642 | mutex_unlock(&text_mutex); | ||
| 643 | put_online_cpus(); | ||
| 644 | /* Allow all currently running kprobes to complete */ | ||
| 645 | synchronize_sched(); | ||
| 646 | } | ||
| 647 | |||
| 648 | int sysctl_kprobes_optimization; | ||
| 649 | int proc_kprobes_optimization_handler(struct ctl_table *table, int write, | ||
| 650 | void __user *buffer, size_t *length, | ||
| 651 | loff_t *ppos) | ||
| 652 | { | ||
| 653 | int ret; | ||
| 654 | |||
| 655 | mutex_lock(&kprobe_mutex); | ||
| 656 | sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; | ||
| 657 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
| 658 | |||
| 659 | if (sysctl_kprobes_optimization) | ||
| 660 | optimize_all_kprobes(); | ||
| 661 | else | ||
| 662 | unoptimize_all_kprobes(); | ||
| 663 | mutex_unlock(&kprobe_mutex); | ||
| 664 | |||
| 665 | return ret; | ||
| 666 | } | ||
| 667 | #endif /* CONFIG_SYSCTL */ | ||
| 668 | |||
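proc_kprobes_optimization_handler() above is meant to be wired up from a ctl_table entry elsewhere (the hookup itself lives outside this hunk, in kernel/sysctl.c). Roughly, such an entry would look like the sketch below — the table and bound-variable names here are assumptions, not quoted from the patch:

    static int demo_zero, demo_one = 1;   /* assumed clamp bounds for the 0/1 knob */

    static struct ctl_table demo_table[] = {
            {
                    .procname     = "kprobes-optimization",
                    .data         = &sysctl_kprobes_optimization,
                    .maxlen       = sizeof(int),
                    .mode         = 0644,
                    .proc_handler = proc_kprobes_optimization_handler,
                    .extra1       = &demo_zero,
                    .extra2       = &demo_one,
            },
            { }
    };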
| 669 | static void __kprobes __arm_kprobe(struct kprobe *p) | ||
| 670 | { | ||
| 671 | struct kprobe *old_p; | ||
| 672 | |||
| 673 | /* Check collision with other optimized kprobes */ | ||
| 674 | old_p = get_optimized_kprobe((unsigned long)p->addr); | ||
| 675 | if (unlikely(old_p)) | ||
| 676 | unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ | ||
| 677 | |||
| 678 | arch_arm_kprobe(p); | ||
| 679 | optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ | ||
| 680 | } | ||
| 681 | |||
| 682 | static void __kprobes __disarm_kprobe(struct kprobe *p) | ||
| 683 | { | ||
| 684 | struct kprobe *old_p; | ||
| 685 | |||
| 686 | unoptimize_kprobe(p); /* Try to unoptimize */ | ||
| 687 | arch_disarm_kprobe(p); | ||
| 688 | |||
| 689 | /* If another kprobe was blocked, optimize it. */ | ||
| 690 | old_p = get_optimized_kprobe((unsigned long)p->addr); | ||
| 691 | if (unlikely(old_p)) | ||
| 692 | optimize_kprobe(old_p); | ||
| 693 | } | ||
| 694 | |||
| 695 | #else /* !CONFIG_OPTPROBES */ | ||
| 696 | |||
| 697 | #define optimize_kprobe(p) do {} while (0) | ||
| 698 | #define unoptimize_kprobe(p) do {} while (0) | ||
| 699 | #define kill_optimized_kprobe(p) do {} while (0) | ||
| 700 | #define prepare_optimized_kprobe(p) do {} while (0) | ||
| 701 | #define try_to_optimize_kprobe(p) do {} while (0) | ||
| 702 | #define __arm_kprobe(p) arch_arm_kprobe(p) | ||
| 703 | #define __disarm_kprobe(p) arch_disarm_kprobe(p) | ||
| 704 | |||
| 705 | static __kprobes void free_aggr_kprobe(struct kprobe *p) | ||
| 706 | { | ||
| 707 | kfree(p); | ||
| 708 | } | ||
| 709 | |||
| 710 | static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | ||
| 711 | { | ||
| 712 | return kzalloc(sizeof(struct kprobe), GFP_KERNEL); | ||
| 713 | } | ||
| 714 | #endif /* CONFIG_OPTPROBES */ | ||
| 715 | |||
| 290 | /* Arm a kprobe with text_mutex */ | 716 | /* Arm a kprobe with text_mutex */ |
| 291 | static void __kprobes arm_kprobe(struct kprobe *kp) | 717 | static void __kprobes arm_kprobe(struct kprobe *kp) |
| 292 | { | 718 | { |
| 719 | /* | ||
| 720 | * Here, since __arm_kprobe() doesn't use stop_machine(), | ||
| 721 | * this doesn't cause a deadlock on text_mutex, so we don't | ||
| 722 | * need get_online_cpus(). | ||
| 723 | */ | ||
| 293 | mutex_lock(&text_mutex); | 724 | mutex_lock(&text_mutex); |
| 294 | arch_arm_kprobe(kp); | 725 | __arm_kprobe(kp); |
| 295 | mutex_unlock(&text_mutex); | 726 | mutex_unlock(&text_mutex); |
| 296 | } | 727 | } |
| 297 | 728 | ||
| 298 | /* Disarm a kprobe with text_mutex */ | 729 | /* Disarm a kprobe with text_mutex */ |
| 299 | static void __kprobes disarm_kprobe(struct kprobe *kp) | 730 | static void __kprobes disarm_kprobe(struct kprobe *kp) |
| 300 | { | 731 | { |
| 732 | get_online_cpus(); /* To avoid a text_mutex deadlock */ | ||
| 301 | mutex_lock(&text_mutex); | 733 | mutex_lock(&text_mutex); |
| 302 | arch_disarm_kprobe(kp); | 734 | __disarm_kprobe(kp); |
| 303 | mutex_unlock(&text_mutex); | 735 | mutex_unlock(&text_mutex); |
| 736 | put_online_cpus(); | ||
| 304 | } | 737 | } |
| 305 | 738 | ||
| 306 | /* | 739 | /* |
| @@ -369,7 +802,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | |||
| 369 | void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) | 802 | void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) |
| 370 | { | 803 | { |
| 371 | struct kprobe *kp; | 804 | struct kprobe *kp; |
| 372 | if (p->pre_handler != aggr_pre_handler) { | 805 | if (!kprobe_aggrprobe(p)) { |
| 373 | p->nmissed++; | 806 | p->nmissed++; |
| 374 | } else { | 807 | } else { |
| 375 | list_for_each_entry_rcu(kp, &p->list, list) | 808 | list_for_each_entry_rcu(kp, &p->list, list) |
| @@ -493,21 +926,16 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) | |||
| 493 | } | 926 | } |
| 494 | 927 | ||
| 495 | /* | 928 | /* |
| 496 | * Keep all fields in the kprobe consistent | ||
| 497 | */ | ||
| 498 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | ||
| 499 | { | ||
| 500 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | ||
| 501 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | ||
| 502 | } | ||
| 503 | |||
| 504 | /* | ||
| 505 | * Add the new probe to ap->list. Fail if this is the | 929 | * Add the new probe to ap->list. Fail if this is the |
| 506 | * second jprobe at the address - two jprobes can't coexist | 930 | * second jprobe at the address - two jprobes can't coexist |
| 507 | */ | 931 | */ |
| 508 | static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | 932 | static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) |
| 509 | { | 933 | { |
| 510 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); | 934 | BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); |
| 935 | |||
| 936 | if (p->break_handler || p->post_handler) | ||
| 937 | unoptimize_kprobe(ap); /* Fall back to normal kprobe */ | ||
| 938 | |||
| 511 | if (p->break_handler) { | 939 | if (p->break_handler) { |
| 512 | if (ap->break_handler) | 940 | if (ap->break_handler) |
| 513 | return -EEXIST; | 941 | return -EEXIST; |
| @@ -522,7 +950,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
| 522 | ap->flags &= ~KPROBE_FLAG_DISABLED; | 950 | ap->flags &= ~KPROBE_FLAG_DISABLED; |
| 523 | if (!kprobes_all_disarmed) | 951 | if (!kprobes_all_disarmed) |
| 524 | /* Arm the breakpoint again. */ | 952 | /* Arm the breakpoint again. */ |
| 525 | arm_kprobe(ap); | 953 | __arm_kprobe(ap); |
| 526 | } | 954 | } |
| 527 | return 0; | 955 | return 0; |
| 528 | } | 956 | } |
| @@ -531,12 +959,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
| 531 | * Fill in the required fields of the "manager kprobe". Replace the | 959 | * Fill in the required fields of the "manager kprobe". Replace the |
| 532 | * earlier kprobe in the hlist with the manager kprobe | 960 | * earlier kprobe in the hlist with the manager kprobe |
| 533 | */ | 961 | */ |
| 534 | static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | 962 | static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) |
| 535 | { | 963 | { |
| 964 | /* Copy p's insn slot to ap */ | ||
| 536 | copy_kprobe(p, ap); | 965 | copy_kprobe(p, ap); |
| 537 | flush_insn_slot(ap); | 966 | flush_insn_slot(ap); |
| 538 | ap->addr = p->addr; | 967 | ap->addr = p->addr; |
| 539 | ap->flags = p->flags; | 968 | ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED; |
| 540 | ap->pre_handler = aggr_pre_handler; | 969 | ap->pre_handler = aggr_pre_handler; |
| 541 | ap->fault_handler = aggr_fault_handler; | 970 | ap->fault_handler = aggr_fault_handler; |
| 542 | /* We don't care about a kprobe which has gone. */ | 971 | /* We don't care about a kprobe which has gone. */ |
| @@ -546,8 +975,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
| 546 | ap->break_handler = aggr_break_handler; | 975 | ap->break_handler = aggr_break_handler; |
| 547 | 976 | ||
| 548 | INIT_LIST_HEAD(&ap->list); | 977 | INIT_LIST_HEAD(&ap->list); |
| 549 | list_add_rcu(&p->list, &ap->list); | 978 | INIT_HLIST_NODE(&ap->hlist); |
| 550 | 979 | ||
| 980 | list_add_rcu(&p->list, &ap->list); | ||
| 551 | hlist_replace_rcu(&p->hlist, &ap->hlist); | 981 | hlist_replace_rcu(&p->hlist, &ap->hlist); |
| 552 | } | 982 | } |
| 553 | 983 | ||
| @@ -561,12 +991,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
| 561 | int ret = 0; | 991 | int ret = 0; |
| 562 | struct kprobe *ap = old_p; | 992 | struct kprobe *ap = old_p; |
| 563 | 993 | ||
| 564 | if (old_p->pre_handler != aggr_pre_handler) { | 994 | if (!kprobe_aggrprobe(old_p)) { |
| 565 | /* If old_p is not an aggr_probe, create new aggr_kprobe. */ | 995 | /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ |
| 566 | ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); | 996 | ap = alloc_aggr_kprobe(old_p); |
| 567 | if (!ap) | 997 | if (!ap) |
| 568 | return -ENOMEM; | 998 | return -ENOMEM; |
| 569 | add_aggr_kprobe(ap, old_p); | 999 | init_aggr_kprobe(ap, old_p); |
| 570 | } | 1000 | } |
| 571 | 1001 | ||
| 572 | if (kprobe_gone(ap)) { | 1002 | if (kprobe_gone(ap)) { |
| @@ -585,6 +1015,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
| 585 | */ | 1015 | */ |
| 586 | return ret; | 1016 | return ret; |
| 587 | 1017 | ||
| 1018 | /* Prepare optimized instructions if possible. */ | ||
| 1019 | prepare_optimized_kprobe(ap); | ||
| 1020 | |||
| 588 | /* | 1021 | /* |
| 589 | * Clear gone flag to prevent allocating new slot again, and | 1022 | * Clear gone flag to prevent allocating new slot again, and |
| 590 | * set disabled flag because it is not armed yet. | 1023 | * set disabled flag because it is not armed yet. |
| @@ -593,6 +1026,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
| 593 | | KPROBE_FLAG_DISABLED; | 1026 | | KPROBE_FLAG_DISABLED; |
| 594 | } | 1027 | } |
| 595 | 1028 | ||
| 1029 | /* Copy ap's insn slot to p */ | ||
| 596 | copy_kprobe(ap, p); | 1030 | copy_kprobe(ap, p); |
| 597 | return add_new_kprobe(ap, p); | 1031 | return add_new_kprobe(ap, p); |
| 598 | } | 1032 | } |
| @@ -743,27 +1177,34 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
| 743 | p->nmissed = 0; | 1177 | p->nmissed = 0; |
| 744 | INIT_LIST_HEAD(&p->list); | 1178 | INIT_LIST_HEAD(&p->list); |
| 745 | mutex_lock(&kprobe_mutex); | 1179 | mutex_lock(&kprobe_mutex); |
| 1180 | |||
| 1181 | get_online_cpus(); /* To avoid a text_mutex deadlock. */ | ||
| 1182 | mutex_lock(&text_mutex); | ||
| 1183 | |||
| 746 | old_p = get_kprobe(p->addr); | 1184 | old_p = get_kprobe(p->addr); |
| 747 | if (old_p) { | 1185 | if (old_p) { |
| 1186 | /* Since this may unoptimize old_p, we hold text_mutex here. */ | ||
| 748 | ret = register_aggr_kprobe(old_p, p); | 1187 | ret = register_aggr_kprobe(old_p, p); |
| 749 | goto out; | 1188 | goto out; |
| 750 | } | 1189 | } |
| 751 | 1190 | ||
| 752 | mutex_lock(&text_mutex); | ||
| 753 | ret = arch_prepare_kprobe(p); | 1191 | ret = arch_prepare_kprobe(p); |
| 754 | if (ret) | 1192 | if (ret) |
| 755 | goto out_unlock_text; | 1193 | goto out; |
| 756 | 1194 | ||
| 757 | INIT_HLIST_NODE(&p->hlist); | 1195 | INIT_HLIST_NODE(&p->hlist); |
| 758 | hlist_add_head_rcu(&p->hlist, | 1196 | hlist_add_head_rcu(&p->hlist, |
| 759 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 1197 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
| 760 | 1198 | ||
| 761 | if (!kprobes_all_disarmed && !kprobe_disabled(p)) | 1199 | if (!kprobes_all_disarmed && !kprobe_disabled(p)) |
| 762 | arch_arm_kprobe(p); | 1200 | __arm_kprobe(p); |
| 1201 | |||
| 1202 | /* Try to optimize kprobe */ | ||
| 1203 | try_to_optimize_kprobe(p); | ||
| 763 | 1204 | ||
| 764 | out_unlock_text: | ||
| 765 | mutex_unlock(&text_mutex); | ||
| 766 | out: | 1205 | out: |
| 1206 | mutex_unlock(&text_mutex); | ||
| 1207 | put_online_cpus(); | ||
| 767 | mutex_unlock(&kprobe_mutex); | 1208 | mutex_unlock(&kprobe_mutex); |
| 768 | 1209 | ||
| 769 | if (probed_mod) | 1210 | if (probed_mod) |
| @@ -785,7 +1226,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) | |||
| 785 | return -EINVAL; | 1226 | return -EINVAL; |
| 786 | 1227 | ||
| 787 | if (old_p == p || | 1228 | if (old_p == p || |
| 788 | (old_p->pre_handler == aggr_pre_handler && | 1229 | (kprobe_aggrprobe(old_p) && |
| 789 | list_is_singular(&old_p->list))) { | 1230 | list_is_singular(&old_p->list))) { |
| 790 | /* | 1231 | /* |
| 791 | * Only probe on the hash list. Disarm only if kprobes are | 1232 | * Only probe on the hash list. Disarm only if kprobes are |
| @@ -793,7 +1234,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) | |||
| 793 | * already have been removed. We save on flushing icache. | 1234 | * already have been removed. We save on flushing icache. |
| 794 | */ | 1235 | */ |
| 795 | if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) | 1236 | if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) |
| 796 | disarm_kprobe(p); | 1237 | disarm_kprobe(old_p); |
| 797 | hlist_del_rcu(&old_p->hlist); | 1238 | hlist_del_rcu(&old_p->hlist); |
| 798 | } else { | 1239 | } else { |
| 799 | if (p->break_handler && !kprobe_gone(p)) | 1240 | if (p->break_handler && !kprobe_gone(p)) |
| @@ -809,8 +1250,13 @@ noclean: | |||
| 809 | list_del_rcu(&p->list); | 1250 | list_del_rcu(&p->list); |
| 810 | if (!kprobe_disabled(old_p)) { | 1251 | if (!kprobe_disabled(old_p)) { |
| 811 | try_to_disable_aggr_kprobe(old_p); | 1252 | try_to_disable_aggr_kprobe(old_p); |
| 812 | if (!kprobes_all_disarmed && kprobe_disabled(old_p)) | 1253 | if (!kprobes_all_disarmed) { |
| 813 | disarm_kprobe(old_p); | 1254 | if (kprobe_disabled(old_p)) |
| 1255 | disarm_kprobe(old_p); | ||
| 1256 | else | ||
| 1257 | /* Try to optimize this probe again */ | ||
| 1258 | optimize_kprobe(old_p); | ||
| 1259 | } | ||
| 814 | } | 1260 | } |
| 815 | } | 1261 | } |
| 816 | return 0; | 1262 | return 0; |
| @@ -827,7 +1273,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) | |||
| 827 | old_p = list_entry(p->list.next, struct kprobe, list); | 1273 | old_p = list_entry(p->list.next, struct kprobe, list); |
| 828 | list_del(&p->list); | 1274 | list_del(&p->list); |
| 829 | arch_remove_kprobe(old_p); | 1275 | arch_remove_kprobe(old_p); |
| 830 | kfree(old_p); | 1276 | free_aggr_kprobe(old_p); |
| 831 | } | 1277 | } |
| 832 | } | 1278 | } |
| 833 | 1279 | ||
| @@ -1123,7 +1569,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
| 1123 | struct kprobe *kp; | 1569 | struct kprobe *kp; |
| 1124 | 1570 | ||
| 1125 | p->flags |= KPROBE_FLAG_GONE; | 1571 | p->flags |= KPROBE_FLAG_GONE; |
| 1126 | if (p->pre_handler == aggr_pre_handler) { | 1572 | if (kprobe_aggrprobe(p)) { |
| 1127 | /* | 1573 | /* |
| 1128 | * If this is an aggr_kprobe, we have to list all the | 1574 | * If this is an aggr_kprobe, we have to list all the |
| 1129 | * chained probes and mark them GONE. | 1575 | * chained probes and mark them GONE. |
| @@ -1132,6 +1578,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) | |||
| 1132 | kp->flags |= KPROBE_FLAG_GONE; | 1578 | kp->flags |= KPROBE_FLAG_GONE; |
| 1133 | p->post_handler = NULL; | 1579 | p->post_handler = NULL; |
| 1134 | p->break_handler = NULL; | 1580 | p->break_handler = NULL; |
| 1581 | kill_optimized_kprobe(p); | ||
| 1135 | } | 1582 | } |
| 1136 | /* | 1583 | /* |
| 1137 | * Here, we can remove insn_slot safely, because no thread calls | 1584 | * Here, we can remove insn_slot safely, because no thread calls |
| @@ -1241,6 +1688,15 @@ static int __init init_kprobes(void) | |||
| 1241 | } | 1688 | } |
| 1242 | } | 1689 | } |
| 1243 | 1690 | ||
| 1691 | #if defined(CONFIG_OPTPROBES) | ||
| 1692 | #if defined(__ARCH_WANT_KPROBES_INSN_SLOT) | ||
| 1693 | /* Init kprobe_optinsn_slots */ | ||
| 1694 | kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; | ||
| 1695 | #endif | ||
| 1696 | /* By default, kprobes can be optimized */ | ||
| 1697 | kprobes_allow_optimization = true; | ||
| 1698 | #endif | ||
| 1699 | |||
| 1244 | /* By default, kprobes are armed */ | 1700 | /* By default, kprobes are armed */ |
| 1245 | kprobes_all_disarmed = false; | 1701 | kprobes_all_disarmed = false; |
| 1246 | 1702 | ||
| @@ -1259,7 +1715,7 @@ static int __init init_kprobes(void) | |||
| 1259 | 1715 | ||
| 1260 | #ifdef CONFIG_DEBUG_FS | 1716 | #ifdef CONFIG_DEBUG_FS |
| 1261 | static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | 1717 | static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, |
| 1262 | const char *sym, int offset,char *modname) | 1718 | const char *sym, int offset, char *modname, struct kprobe *pp) |
| 1263 | { | 1719 | { |
| 1264 | char *kprobe_type; | 1720 | char *kprobe_type; |
| 1265 | 1721 | ||
| @@ -1269,19 +1725,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | |||
| 1269 | kprobe_type = "j"; | 1725 | kprobe_type = "j"; |
| 1270 | else | 1726 | else |
| 1271 | kprobe_type = "k"; | 1727 | kprobe_type = "k"; |
| 1728 | |||
| 1272 | if (sym) | 1729 | if (sym) |
| 1273 | seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", | 1730 | seq_printf(pi, "%p %s %s+0x%x %s ", |
| 1274 | p->addr, kprobe_type, sym, offset, | 1731 | p->addr, kprobe_type, sym, offset, |
| 1275 | (modname ? modname : " "), | 1732 | (modname ? modname : " ")); |
| 1276 | (kprobe_gone(p) ? "[GONE]" : ""), | ||
| 1277 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? | ||
| 1278 | "[DISABLED]" : "")); | ||
| 1279 | else | 1733 | else |
| 1280 | seq_printf(pi, "%p %s %p %s%s\n", | 1734 | seq_printf(pi, "%p %s %p ", |
| 1281 | p->addr, kprobe_type, p->addr, | 1735 | p->addr, kprobe_type, p->addr); |
| 1282 | (kprobe_gone(p) ? "[GONE]" : ""), | 1736 | |
| 1283 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? | 1737 | if (!pp) |
| 1284 | "[DISABLED]" : "")); | 1738 | pp = p; |
| 1739 | seq_printf(pi, "%s%s%s\n", | ||
| 1740 | (kprobe_gone(p) ? "[GONE]" : ""), | ||
| 1741 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), | ||
| 1742 | (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); | ||
| 1285 | } | 1743 | } |
| 1286 | 1744 | ||
| 1287 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) | 1745 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) |
| @@ -1317,11 +1775,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) | |||
| 1317 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 1775 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
| 1318 | sym = kallsyms_lookup((unsigned long)p->addr, NULL, | 1776 | sym = kallsyms_lookup((unsigned long)p->addr, NULL, |
| 1319 | &offset, &modname, namebuf); | 1777 | &offset, &modname, namebuf); |
| 1320 | if (p->pre_handler == aggr_pre_handler) { | 1778 | if (kprobe_aggrprobe(p)) { |
| 1321 | list_for_each_entry_rcu(kp, &p->list, list) | 1779 | list_for_each_entry_rcu(kp, &p->list, list) |
| 1322 | report_probe(pi, kp, sym, offset, modname); | 1780 | report_probe(pi, kp, sym, offset, modname, p); |
| 1323 | } else | 1781 | } else |
| 1324 | report_probe(pi, p, sym, offset, modname); | 1782 | report_probe(pi, p, sym, offset, modname, NULL); |
| 1325 | } | 1783 | } |
| 1326 | preempt_enable(); | 1784 | preempt_enable(); |
| 1327 | return 0; | 1785 | return 0; |
| @@ -1399,12 +1857,13 @@ int __kprobes enable_kprobe(struct kprobe *kp) | |||
| 1399 | goto out; | 1857 | goto out; |
| 1400 | } | 1858 | } |
| 1401 | 1859 | ||
| 1402 | if (!kprobes_all_disarmed && kprobe_disabled(p)) | ||
| 1403 | arm_kprobe(p); | ||
| 1404 | |||
| 1405 | p->flags &= ~KPROBE_FLAG_DISABLED; | ||
| 1406 | if (p != kp) | 1860 | if (p != kp) |
| 1407 | kp->flags &= ~KPROBE_FLAG_DISABLED; | 1861 | kp->flags &= ~KPROBE_FLAG_DISABLED; |
| 1862 | |||
| 1863 | if (!kprobes_all_disarmed && kprobe_disabled(p)) { | ||
| 1864 | p->flags &= ~KPROBE_FLAG_DISABLED; | ||
| 1865 | arm_kprobe(p); | ||
| 1866 | } | ||
| 1408 | out: | 1867 | out: |
| 1409 | mutex_unlock(&kprobe_mutex); | 1868 | mutex_unlock(&kprobe_mutex); |
| 1410 | return ret; | 1869 | return ret; |
| @@ -1424,12 +1883,13 @@ static void __kprobes arm_all_kprobes(void) | |||
| 1424 | if (!kprobes_all_disarmed) | 1883 | if (!kprobes_all_disarmed) |
| 1425 | goto already_enabled; | 1884 | goto already_enabled; |
| 1426 | 1885 | ||
| 1886 | /* Arming a kprobe doesn't optimize the kprobe itself */ | ||
| 1427 | mutex_lock(&text_mutex); | 1887 | mutex_lock(&text_mutex); |
| 1428 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 1888 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 1429 | head = &kprobe_table[i]; | 1889 | head = &kprobe_table[i]; |
| 1430 | hlist_for_each_entry_rcu(p, node, head, hlist) | 1890 | hlist_for_each_entry_rcu(p, node, head, hlist) |
| 1431 | if (!kprobe_disabled(p)) | 1891 | if (!kprobe_disabled(p)) |
| 1432 | arch_arm_kprobe(p); | 1892 | __arm_kprobe(p); |
| 1433 | } | 1893 | } |
| 1434 | mutex_unlock(&text_mutex); | 1894 | mutex_unlock(&text_mutex); |
| 1435 | 1895 | ||
| @@ -1456,16 +1916,23 @@ static void __kprobes disarm_all_kprobes(void) | |||
| 1456 | 1916 | ||
| 1457 | kprobes_all_disarmed = true; | 1917 | kprobes_all_disarmed = true; |
| 1458 | printk(KERN_INFO "Kprobes globally disabled\n"); | 1918 | printk(KERN_INFO "Kprobes globally disabled\n"); |
| 1919 | |||
| 1920 | /* | ||
| 1921 | * Here we call get_online_cpus() to avoid a text_mutex deadlock, | ||
| 1922 | * because disarming may also unoptimize kprobes. | ||
| 1923 | */ | ||
| 1924 | get_online_cpus(); | ||
| 1459 | mutex_lock(&text_mutex); | 1925 | mutex_lock(&text_mutex); |
| 1460 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 1926 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 1461 | head = &kprobe_table[i]; | 1927 | head = &kprobe_table[i]; |
| 1462 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 1928 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
| 1463 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) | 1929 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) |
| 1464 | arch_disarm_kprobe(p); | 1930 | __disarm_kprobe(p); |
| 1465 | } | 1931 | } |
| 1466 | } | 1932 | } |
| 1467 | 1933 | ||
| 1468 | mutex_unlock(&text_mutex); | 1934 | mutex_unlock(&text_mutex); |
| 1935 | put_online_cpus(); | ||
| 1469 | mutex_unlock(&kprobe_mutex); | 1936 | mutex_unlock(&kprobe_mutex); |
| 1470 | /* Allow all currently running kprobes to complete */ | 1937 | /* Allow all currently running kprobes to complete */ |
| 1471 | synchronize_sched(); | 1938 | synchronize_sched(); |
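A pattern worth noting across disarm_kprobe(), register_kprobe(), unoptimize_all_kprobes() and disarm_all_kprobes() above: whenever an operation may patch text through stop_machine(), CPU hotplug is pinned before text_mutex is taken, and both are released in reverse order. Condensed to the ordering alone (a sketch):

    static void patch_text_safely(void)
    {
            get_online_cpus();             /* pin CPU hotplug first ... */
            mutex_lock(&text_mutex);       /* ... then take the text lock */

            /* ... modify kernel text ... */

            mutex_unlock(&text_mutex);
            put_online_cpus();             /* release in reverse order */
    }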
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6b1ccc3f0205..21fe3c426948 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj, | |||
| 33 | } | 33 | } |
| 34 | KERNEL_ATTR_RO(uevent_seqnum); | 34 | KERNEL_ATTR_RO(uevent_seqnum); |
| 35 | 35 | ||
| 36 | /* uevent helper program, used during early boo */ | 36 | /* uevent helper program, used during early boot */ |
| 37 | static ssize_t uevent_helper_show(struct kobject *kobj, | 37 | static ssize_t uevent_helper_show(struct kobject *kobj, |
| 38 | struct kobj_attribute *attr, char *buf) | 38 | struct kobj_attribute *attr, char *buf) |
| 39 | { | 39 | { |
diff --git a/kernel/module.c b/kernel/module.c index f82386bd9ee9..c968d3606dca 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -474,9 +474,10 @@ static void module_unload_init(struct module *mod) | |||
| 474 | 474 | ||
| 475 | INIT_LIST_HEAD(&mod->modules_which_use_me); | 475 | INIT_LIST_HEAD(&mod->modules_which_use_me); |
| 476 | for_each_possible_cpu(cpu) | 476 | for_each_possible_cpu(cpu) |
| 477 | local_set(__module_ref_addr(mod, cpu), 0); | 477 | per_cpu_ptr(mod->refptr, cpu)->count = 0; |
| 478 | |||
| 478 | /* Hold reference count during initialization. */ | 479 | /* Hold reference count during initialization. */ |
| 479 | local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); | 480 | __this_cpu_write(mod->refptr->count, 1); |
| 480 | /* Backwards compatibility macros put refcount during init. */ | 481 | /* Backwards compatibility macros put refcount during init. */ |
| 481 | mod->waiter = current; | 482 | mod->waiter = current; |
| 482 | } | 483 | } |
| @@ -619,7 +620,7 @@ unsigned int module_refcount(struct module *mod) | |||
| 619 | int cpu; | 620 | int cpu; |
| 620 | 621 | ||
| 621 | for_each_possible_cpu(cpu) | 622 | for_each_possible_cpu(cpu) |
| 622 | total += local_read(__module_ref_addr(mod, cpu)); | 623 | total += per_cpu_ptr(mod->refptr, cpu)->count; |
| 623 | return total; | 624 | return total; |
| 624 | } | 625 | } |
| 625 | EXPORT_SYMBOL(module_refcount); | 626 | EXPORT_SYMBOL(module_refcount); |
| @@ -796,14 +797,15 @@ static struct module_attribute refcnt = { | |||
| 796 | void module_put(struct module *module) | 797 | void module_put(struct module *module) |
| 797 | { | 798 | { |
| 798 | if (module) { | 799 | if (module) { |
| 799 | unsigned int cpu = get_cpu(); | 800 | preempt_disable(); |
| 800 | local_dec(__module_ref_addr(module, cpu)); | 801 | __this_cpu_dec(module->refptr->count); |
| 802 | |||
| 801 | trace_module_put(module, _RET_IP_, | 803 | trace_module_put(module, _RET_IP_, |
| 802 | local_read(__module_ref_addr(module, cpu))); | 804 | __this_cpu_read(module->refptr->count)); |
| 803 | /* Maybe they're waiting for us to drop reference? */ | 805 | /* Maybe they're waiting for us to drop reference? */ |
| 804 | if (unlikely(!module_is_live(module))) | 806 | if (unlikely(!module_is_live(module))) |
| 805 | wake_up_process(module->waiter); | 807 | wake_up_process(module->waiter); |
| 806 | put_cpu(); | 808 | preempt_enable(); |
| 807 | } | 809 | } |
| 808 | } | 810 | } |
| 809 | EXPORT_SYMBOL(module_put); | 811 | EXPORT_SYMBOL(module_put); |
| @@ -1083,6 +1085,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, | |||
| 1083 | if (sattr->name == NULL) | 1085 | if (sattr->name == NULL) |
| 1084 | goto out; | 1086 | goto out; |
| 1085 | sect_attrs->nsections++; | 1087 | sect_attrs->nsections++; |
| 1088 | sysfs_attr_init(&sattr->mattr.attr); | ||
| 1086 | sattr->mattr.show = module_sect_show; | 1089 | sattr->mattr.show = module_sect_show; |
| 1087 | sattr->mattr.store = NULL; | 1090 | sattr->mattr.store = NULL; |
| 1088 | sattr->mattr.attr.name = sattr->name; | 1091 | sattr->mattr.attr.name = sattr->name; |
| @@ -1178,6 +1181,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, | |||
| 1178 | if (sect_empty(&sechdrs[i])) | 1181 | if (sect_empty(&sechdrs[i])) |
| 1179 | continue; | 1182 | continue; |
| 1180 | if (sechdrs[i].sh_type == SHT_NOTE) { | 1183 | if (sechdrs[i].sh_type == SHT_NOTE) { |
| 1184 | sysfs_bin_attr_init(nattr); | ||
| 1181 | nattr->attr.name = mod->sect_attrs->attrs[loaded].name; | 1185 | nattr->attr.name = mod->sect_attrs->attrs[loaded].name; |
| 1182 | nattr->attr.mode = S_IRUGO; | 1186 | nattr->attr.mode = S_IRUGO; |
| 1183 | nattr->size = sechdrs[i].sh_size; | 1187 | nattr->size = sechdrs[i].sh_size; |
| @@ -1250,6 +1254,7 @@ int module_add_modinfo_attrs(struct module *mod) | |||
| 1250 | if (!attr->test || | 1254 | if (!attr->test || |
| 1251 | (attr->test && attr->test(mod))) { | 1255 | (attr->test && attr->test(mod))) { |
| 1252 | memcpy(temp_attr, attr, sizeof(*temp_attr)); | 1256 | memcpy(temp_attr, attr, sizeof(*temp_attr)); |
| 1257 | sysfs_attr_init(&temp_attr->attr); | ||
| 1253 | error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); | 1258 | error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); |
| 1254 | ++temp_attr; | 1259 | ++temp_attr; |
| 1255 | } | 1260 | } |
| @@ -1397,9 +1402,9 @@ static void free_module(struct module *mod) | |||
| 1397 | kfree(mod->args); | 1402 | kfree(mod->args); |
| 1398 | if (mod->percpu) | 1403 | if (mod->percpu) |
| 1399 | percpu_modfree(mod->percpu); | 1404 | percpu_modfree(mod->percpu); |
| 1400 | #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) | 1405 | #if defined(CONFIG_MODULE_UNLOAD) |
| 1401 | if (mod->refptr) | 1406 | if (mod->refptr) |
| 1402 | percpu_modfree(mod->refptr); | 1407 | free_percpu(mod->refptr); |
| 1403 | #endif | 1408 | #endif |
| 1404 | /* Free lock-classes: */ | 1409 | /* Free lock-classes: */ |
| 1405 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1410 | lockdep_free_key_range(mod->module_core, mod->core_size); |
| @@ -2162,9 +2167,8 @@ static noinline struct module *load_module(void __user *umod, | |||
| 2162 | mod = (void *)sechdrs[modindex].sh_addr; | 2167 | mod = (void *)sechdrs[modindex].sh_addr; |
| 2163 | kmemleak_load_module(mod, hdr, sechdrs, secstrings); | 2168 | kmemleak_load_module(mod, hdr, sechdrs, secstrings); |
| 2164 | 2169 | ||
| 2165 | #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) | 2170 | #if defined(CONFIG_MODULE_UNLOAD) |
| 2166 | mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), | 2171 | mod->refptr = alloc_percpu(struct module_ref); |
| 2167 | mod->name); | ||
| 2168 | if (!mod->refptr) { | 2172 | if (!mod->refptr) { |
| 2169 | err = -ENOMEM; | 2173 | err = -ENOMEM; |
| 2170 | goto free_init; | 2174 | goto free_init; |
| @@ -2396,8 +2400,8 @@ static noinline struct module *load_module(void __user *umod, | |||
| 2396 | kobject_put(&mod->mkobj.kobj); | 2400 | kobject_put(&mod->mkobj.kobj); |
| 2397 | free_unload: | 2401 | free_unload: |
| 2398 | module_unload_free(mod); | 2402 | module_unload_free(mod); |
| 2399 | #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) | 2403 | #if defined(CONFIG_MODULE_UNLOAD) |
| 2400 | percpu_modfree(mod->refptr); | 2404 | free_percpu(mod->refptr); |
| 2401 | free_init: | 2405 | free_init: |
| 2402 | #endif | 2406 | #endif |
| 2403 | module_free(mod, mod->module_init); | 2407 | module_free(mod, mod->module_init); |
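The module.c hunks above replace the old __module_ref_addr()/local_t scheme with the generic per-cpu allocator, dropping the CONFIG_SMP special case. A minimal sketch of the allocation pattern the new code relies on ("struct foo_ref" and "ref" are hypothetical names, not from this patch):

    /* Sketch only: a per-cpu counter in the style module_unload_init()
     * now uses; names are illustrative. */
    struct foo_ref { int count; };
    struct foo_ref __percpu *ref = alloc_percpu(struct foo_ref);
    int cpu, total = 0;

    if (!ref)
            return -ENOMEM;

    preempt_disable();
    __this_cpu_inc(ref->count);        /* fast path: only this CPU's copy */
    preempt_enable();

    for_each_possible_cpu(cpu)         /* slow path: sum every CPU's copy */
            total += per_cpu_ptr(ref, cpu)->count;

    free_percpu(ref);

The preempt_disable()/preempt_enable() pair mirrors the module_put() change above: the __this_cpu_* accessors assume preemption is already disabled.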
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 09b4ff9711b2..2ab67233ee8f 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
| @@ -24,7 +24,18 @@ | |||
| 24 | 24 | ||
| 25 | static struct kmem_cache *nsproxy_cachep; | 25 | static struct kmem_cache *nsproxy_cachep; |
| 26 | 26 | ||
| 27 | struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); | 27 | struct nsproxy init_nsproxy = { |
| 28 | .count = ATOMIC_INIT(1), | ||
| 29 | .uts_ns = &init_uts_ns, | ||
| 30 | #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) | ||
| 31 | .ipc_ns = &init_ipc_ns, | ||
| 32 | #endif | ||
| 33 | .mnt_ns = NULL, | ||
| 34 | .pid_ns = &init_pid_ns, | ||
| 35 | #ifdef CONFIG_NET | ||
| 36 | .net_ns = &init_net, | ||
| 37 | #endif | ||
| 38 | }; | ||
| 28 | 39 | ||
| 29 | static inline struct nsproxy *create_nsproxy(void) | 40 | static inline struct nsproxy *create_nsproxy(void) |
| 30 | { | 41 | { |
diff --git a/kernel/padata.c b/kernel/padata.c index 6f9bcb8313d6..93caf65ff57c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
| @@ -642,6 +642,9 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask, | |||
| 642 | if (!pd) | 642 | if (!pd) |
| 643 | goto err_free_inst; | 643 | goto err_free_inst; |
| 644 | 644 | ||
| 645 | if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) | ||
| 646 | goto err_free_pd; | ||
| 647 | |||
| 645 | rcu_assign_pointer(pinst->pd, pd); | 648 | rcu_assign_pointer(pinst->pd, pd); |
| 646 | 649 | ||
| 647 | pinst->wq = wq; | 650 | pinst->wq = wq; |
| @@ -654,12 +657,14 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask, | |||
| 654 | pinst->cpu_notifier.priority = 0; | 657 | pinst->cpu_notifier.priority = 0; |
| 655 | err = register_hotcpu_notifier(&pinst->cpu_notifier); | 658 | err = register_hotcpu_notifier(&pinst->cpu_notifier); |
| 656 | if (err) | 659 | if (err) |
| 657 | goto err_free_pd; | 660 | goto err_free_cpumask; |
| 658 | 661 | ||
| 659 | mutex_init(&pinst->lock); | 662 | mutex_init(&pinst->lock); |
| 660 | 663 | ||
| 661 | return pinst; | 664 | return pinst; |
| 662 | 665 | ||
| 666 | err_free_cpumask: | ||
| 667 | free_cpumask_var(pinst->cpumask); | ||
| 663 | err_free_pd: | 668 | err_free_pd: |
| 664 | padata_free_pd(pd); | 669 | padata_free_pd(pd); |
| 665 | err_free_inst: | 670 | err_free_inst: |
| @@ -685,6 +690,7 @@ void padata_free(struct padata_instance *pinst) | |||
| 685 | 690 | ||
| 686 | unregister_hotcpu_notifier(&pinst->cpu_notifier); | 691 | unregister_hotcpu_notifier(&pinst->cpu_notifier); |
| 687 | padata_free_pd(pinst->pd); | 692 | padata_free_pd(pinst->pd); |
| 693 | free_cpumask_var(pinst->cpumask); | ||
| 688 | kfree(pinst); | 694 | kfree(pinst); |
| 689 | } | 695 | } |
| 690 | EXPORT_SYMBOL(padata_free); | 696 | EXPORT_SYMBOL(padata_free); |
diff --git a/kernel/panic.c b/kernel/panic.c index c787333282b8..13d966b4c14a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); | |||
| 36 | 36 | ||
| 37 | EXPORT_SYMBOL(panic_notifier_list); | 37 | EXPORT_SYMBOL(panic_notifier_list); |
| 38 | 38 | ||
| 39 | static long no_blink(long time) | ||
| 40 | { | ||
| 41 | return 0; | ||
| 42 | } | ||
| 43 | |||
| 44 | /* Returns how long it waited in ms */ | 39 | /* Returns how long it waited in ms */ |
| 45 | long (*panic_blink)(long time); | 40 | long (*panic_blink)(long time); |
| 46 | EXPORT_SYMBOL(panic_blink); | 41 | EXPORT_SYMBOL(panic_blink); |
| 47 | 42 | ||
| 43 | static void panic_blink_one_second(void) | ||
| 44 | { | ||
| 45 | static long i = 0, end; | ||
| 46 | |||
| 47 | if (panic_blink) { | ||
| 48 | end = i + MSEC_PER_SEC; | ||
| 49 | |||
| 50 | while (i < end) { | ||
| 51 | i += panic_blink(i); | ||
| 52 | mdelay(1); | ||
| 53 | i++; | ||
| 54 | } | ||
| 55 | } else { | ||
| 56 | /* | ||
| 57 | * When running under a hypervisor, a small mdelay may get | ||
| 58 | * rounded up to the hypervisor timeslice. For example, with | ||
| 59 | * a 1ms in 10ms hypervisor timeslice we might inflate an | ||
| 60 | * mdelay(1) loop by 10x. | ||
| 61 | * | ||
| 62 | * If we have nothing to blink, spin on 1 second calls to | ||
| 63 | * mdelay to avoid this. | ||
| 64 | */ | ||
| 65 | mdelay(MSEC_PER_SEC); | ||
| 66 | } | ||
| 67 | } | ||
| 68 | |||
| 48 | /** | 69 | /** |
| 49 | * panic - halt the system | 70 | * panic - halt the system |
| 50 | * @fmt: The text string to print | 71 | * @fmt: The text string to print |
| @@ -95,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
| 95 | 116 | ||
| 96 | bust_spinlocks(0); | 117 | bust_spinlocks(0); |
| 97 | 118 | ||
| 98 | if (!panic_blink) | ||
| 99 | panic_blink = no_blink; | ||
| 100 | |||
| 101 | if (panic_timeout > 0) { | 119 | if (panic_timeout > 0) { |
| 102 | /* | 120 | /* |
| 103 | * Delay timeout seconds before rebooting the machine. | 121 | * Delay timeout seconds before rebooting the machine. |
| @@ -105,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
| 105 | */ | 123 | */ |
| 106 | printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); | 124 | printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); |
| 107 | 125 | ||
| 108 | for (i = 0; i < panic_timeout*1000; ) { | 126 | for (i = 0; i < panic_timeout; i++) { |
| 109 | touch_nmi_watchdog(); | 127 | touch_nmi_watchdog(); |
| 110 | i += panic_blink(i); | 128 | panic_blink_one_second(); |
| 111 | mdelay(1); | ||
| 112 | i++; | ||
| 113 | } | 129 | } |
| 114 | /* | 130 | /* |
| 115 | * This will not be a clean reboot, with everything | 131 | * This will not be a clean reboot, with everything |
| @@ -135,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
| 135 | } | 151 | } |
| 136 | #endif | 152 | #endif |
| 137 | local_irq_enable(); | 153 | local_irq_enable(); |
| 138 | for (i = 0; ; ) { | 154 | while (1) { |
| 139 | touch_softlockup_watchdog(); | 155 | touch_softlockup_watchdog(); |
| 140 | i += panic_blink(i); | 156 | panic_blink_one_second(); |
| 141 | mdelay(1); | ||
| 142 | i++; | ||
| 143 | } | 157 | } |
| 144 | } | 158 | } |
| 145 | 159 | ||
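A worked example of the inflation the new comment describes (numbers illustrative): with panic_timeout = 30, the old loop issued roughly 30 * 1000 mdelay(1) calls, and a hypervisor that rounds each 1 ms delay up to a 10 ms timeslice stretches that to about 300 s of wall time. One mdelay(MSEC_PER_SEC) per iteration keeps the wait at about 30 s regardless of timeslice rounding.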
diff --git a/kernel/params.c b/kernel/params.c index cf1b69183127..0b30ecd53a52 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -24,7 +24,6 @@ | |||
| 24 | #include <linux/err.h> | 24 | #include <linux/err.h> |
| 25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
| 26 | #include <linux/ctype.h> | 26 | #include <linux/ctype.h> |
| 27 | #include <linux/string.h> | ||
| 28 | 27 | ||
| 29 | #if 0 | 28 | #if 0 |
| 30 | #define DEBUGP printk | 29 | #define DEBUGP printk |
| @@ -402,8 +401,8 @@ int param_get_string(char *buffer, struct kernel_param *kp) | |||
| 402 | } | 401 | } |
| 403 | 402 | ||
| 404 | /* sysfs output in /sys/modules/XYZ/parameters/ */ | 403 | /* sysfs output in /sys/modules/XYZ/parameters/ */ |
| 405 | #define to_module_attr(n) container_of(n, struct module_attribute, attr); | 404 | #define to_module_attr(n) container_of(n, struct module_attribute, attr) |
| 406 | #define to_module_kobject(n) container_of(n, struct module_kobject, kobj); | 405 | #define to_module_kobject(n) container_of(n, struct module_kobject, kobj) |
| 407 | 406 | ||
| 408 | extern struct kernel_param __start___param[], __stop___param[]; | 407 | extern struct kernel_param __start___param[], __stop___param[]; |
| 409 | 408 | ||
| @@ -421,7 +420,7 @@ struct module_param_attrs | |||
| 421 | }; | 420 | }; |
| 422 | 421 | ||
| 423 | #ifdef CONFIG_SYSFS | 422 | #ifdef CONFIG_SYSFS |
| 424 | #define to_param_attr(n) container_of(n, struct param_attribute, mattr); | 423 | #define to_param_attr(n) container_of(n, struct param_attribute, mattr) |
| 425 | 424 | ||
| 426 | static ssize_t param_attr_show(struct module_attribute *mattr, | 425 | static ssize_t param_attr_show(struct module_attribute *mattr, |
| 427 | struct module *mod, char *buf) | 426 | struct module *mod, char *buf) |
| @@ -517,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, | |||
| 517 | new->grp.attrs = attrs; | 516 | new->grp.attrs = attrs; |
| 518 | 517 | ||
| 519 | /* Tack new one on the end. */ | 518 | /* Tack new one on the end. */ |
| 519 | sysfs_attr_init(&new->attrs[num].mattr.attr); | ||
| 520 | new->attrs[num].param = kp; | 520 | new->attrs[num].param = kp; |
| 521 | new->attrs[num].mattr.show = param_attr_show; | 521 | new->attrs[num].mattr.show = param_attr_show; |
| 522 | new->attrs[num].mattr.store = param_attr_store; | 522 | new->attrs[num].mattr.store = param_attr_store; |
| @@ -723,7 +723,7 @@ static ssize_t module_attr_store(struct kobject *kobj, | |||
| 723 | return ret; | 723 | return ret; |
| 724 | } | 724 | } |
| 725 | 725 | ||
| 726 | static struct sysfs_ops module_sysfs_ops = { | 726 | static const struct sysfs_ops module_sysfs_ops = { |
| 727 | .show = module_attr_show, | 727 | .show = module_attr_show, |
| 728 | .store = module_attr_store, | 728 | .store = module_attr_store, |
| 729 | }; | 729 | }; |
| @@ -737,7 +737,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) | |||
| 737 | return 0; | 737 | return 0; |
| 738 | } | 738 | } |
| 739 | 739 | ||
| 740 | static struct kset_uevent_ops module_uevent_ops = { | 740 | static const struct kset_uevent_ops module_uevent_ops = { |
| 741 | .filter = uevent_filter, | 741 | .filter = uevent_filter, |
| 742 | }; | 742 | }; |
| 743 | 743 | ||
diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a661e7991865..f40560b86544 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c | |||
| @@ -2610,7 +2610,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 2610 | if (user_locked > user_lock_limit) | 2610 | if (user_locked > user_lock_limit) |
| 2611 | extra = user_locked - user_lock_limit; | 2611 | extra = user_locked - user_lock_limit; |
| 2612 | 2612 | ||
| 2613 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 2613 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 2614 | lock_limit >>= PAGE_SHIFT; | 2614 | lock_limit >>= PAGE_SHIFT; |
| 2615 | locked = vma->vm_mm->locked_vm + extra; | 2615 | locked = vma->vm_mm->locked_vm + extra; |
| 2616 | 2616 | ||
| @@ -5481,13 +5481,16 @@ void __init perf_event_init(void) | |||
| 5481 | register_cpu_notifier(&perf_cpu_nb); | 5481 | register_cpu_notifier(&perf_cpu_nb); |
| 5482 | } | 5482 | } |
| 5483 | 5483 | ||
| 5484 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) | 5484 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, |
| 5485 | struct sysdev_class_attribute *attr, | ||
| 5486 | char *buf) | ||
| 5485 | { | 5487 | { |
| 5486 | return sprintf(buf, "%d\n", perf_reserved_percpu); | 5488 | return sprintf(buf, "%d\n", perf_reserved_percpu); |
| 5487 | } | 5489 | } |
| 5488 | 5490 | ||
| 5489 | static ssize_t | 5491 | static ssize_t |
| 5490 | perf_set_reserve_percpu(struct sysdev_class *class, | 5492 | perf_set_reserve_percpu(struct sysdev_class *class, |
| 5493 | struct sysdev_class_attribute *attr, | ||
| 5491 | const char *buf, | 5494 | const char *buf, |
| 5492 | size_t count) | 5495 | size_t count) |
| 5493 | { | 5496 | { |
| @@ -5516,13 +5519,17 @@ perf_set_reserve_percpu(struct sysdev_class *class, | |||
| 5516 | return count; | 5519 | return count; |
| 5517 | } | 5520 | } |
| 5518 | 5521 | ||
| 5519 | static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) | 5522 | static ssize_t perf_show_overcommit(struct sysdev_class *class, |
| 5523 | struct sysdev_class_attribute *attr, | ||
| 5524 | char *buf) | ||
| 5520 | { | 5525 | { |
| 5521 | return sprintf(buf, "%d\n", perf_overcommit); | 5526 | return sprintf(buf, "%d\n", perf_overcommit); |
| 5522 | } | 5527 | } |
| 5523 | 5528 | ||
| 5524 | static ssize_t | 5529 | static ssize_t |
| 5525 | perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) | 5530 | perf_set_overcommit(struct sysdev_class *class, |
| 5531 | struct sysdev_class_attribute *attr, | ||
| 5532 | const char *buf, size_t count) | ||
| 5526 | { | 5533 | { |
| 5527 | unsigned long val; | 5534 | unsigned long val; |
| 5528 | int err; | 5535 | int err; |
diff --git a/kernel/pid.c b/kernel/pid.c index b08e697cd83f..86b296943e5f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -376,7 +376,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) | |||
| 376 | EXPORT_SYMBOL(pid_task); | 376 | EXPORT_SYMBOL(pid_task); |
| 377 | 377 | ||
| 378 | /* | 378 | /* |
| 379 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. | 379 | * Must be called under rcu_read_lock(). |
| 380 | */ | 380 | */ |
| 381 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) | 381 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) |
| 382 | { | 382 | { |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 86b3796b0436..79aac93acf99 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
| @@ -161,13 +161,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
| 161 | rcu_read_lock(); | 161 | rcu_read_lock(); |
| 162 | 162 | ||
| 163 | /* | 163 | /* |
| 164 | * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring | 164 | * Any nested-container's init processes won't ignore the |
| 165 | * any nested-container's init processes don't ignore the | 165 | * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). |
| 166 | * signal | ||
| 167 | */ | 166 | */ |
| 168 | task = pid_task(find_vpid(nr), PIDTYPE_PID); | 167 | task = pid_task(find_vpid(nr), PIDTYPE_PID); |
| 169 | if (task) | 168 | if (task) |
| 170 | force_sig(SIGKILL, task); | 169 | send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); |
| 171 | 170 | ||
| 172 | rcu_read_unlock(); | 171 | rcu_read_unlock(); |
| 173 | 172 | ||
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 438ff4523513..1a22dfd42df9 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
| @@ -982,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
| 982 | int maxfire; | 982 | int maxfire; |
| 983 | struct list_head *timers = tsk->cpu_timers; | 983 | struct list_head *timers = tsk->cpu_timers; |
| 984 | struct signal_struct *const sig = tsk->signal; | 984 | struct signal_struct *const sig = tsk->signal; |
| 985 | unsigned long soft; | ||
| 985 | 986 | ||
| 986 | maxfire = 20; | 987 | maxfire = 20; |
| 987 | tsk->cputime_expires.prof_exp = cputime_zero; | 988 | tsk->cputime_expires.prof_exp = cputime_zero; |
| @@ -1030,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk, | |||
| 1030 | /* | 1031 | /* |
| 1031 | * Check for the special case thread timers. | 1032 | * Check for the special case thread timers. |
| 1032 | */ | 1033 | */ |
| 1033 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { | 1034 | soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); |
| 1034 | unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; | 1035 | if (soft != RLIM_INFINITY) { |
| 1035 | unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; | 1036 | unsigned long hard = |
| 1037 | ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); | ||
| 1036 | 1038 | ||
| 1037 | if (hard != RLIM_INFINITY && | 1039 | if (hard != RLIM_INFINITY && |
| 1038 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { | 1040 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { |
| @@ -1043,14 +1045,13 @@ static void check_thread_timers(struct task_struct *tsk, | |||
| 1043 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | 1045 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); |
| 1044 | return; | 1046 | return; |
| 1045 | } | 1047 | } |
| 1046 | if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { | 1048 | if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { |
| 1047 | /* | 1049 | /* |
| 1048 | * At the soft limit, send a SIGXCPU every second. | 1050 | * At the soft limit, send a SIGXCPU every second. |
| 1049 | */ | 1051 | */ |
| 1050 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur | 1052 | if (soft < hard) { |
| 1051 | < sig->rlim[RLIMIT_RTTIME].rlim_max) { | 1053 | soft += USEC_PER_SEC; |
| 1052 | sig->rlim[RLIMIT_RTTIME].rlim_cur += | 1054 | sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; |
| 1053 | USEC_PER_SEC; | ||
| 1054 | } | 1055 | } |
| 1055 | printk(KERN_INFO | 1056 | printk(KERN_INFO |
| 1056 | "RT Watchdog Timeout: %s[%d]\n", | 1057 | "RT Watchdog Timeout: %s[%d]\n", |
| @@ -1121,6 +1122,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
| 1121 | unsigned long long sum_sched_runtime, sched_expires; | 1122 | unsigned long long sum_sched_runtime, sched_expires; |
| 1122 | struct list_head *timers = sig->cpu_timers; | 1123 | struct list_head *timers = sig->cpu_timers; |
| 1123 | struct task_cputime cputime; | 1124 | struct task_cputime cputime; |
| 1125 | unsigned long soft; | ||
| 1124 | 1126 | ||
| 1125 | /* | 1127 | /* |
| 1126 | * Don't sample the current process CPU clocks if there are no timers. | 1128 | * Don't sample the current process CPU clocks if there are no timers. |
| @@ -1193,11 +1195,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
| 1193 | SIGPROF); | 1195 | SIGPROF); |
| 1194 | check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, | 1196 | check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, |
| 1195 | SIGVTALRM); | 1197 | SIGVTALRM); |
| 1196 | 1198 | soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); | |
| 1197 | if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { | 1199 | if (soft != RLIM_INFINITY) { |
| 1198 | unsigned long psecs = cputime_to_secs(ptime); | 1200 | unsigned long psecs = cputime_to_secs(ptime); |
| 1201 | unsigned long hard = | ||
| 1202 | ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); | ||
| 1199 | cputime_t x; | 1203 | cputime_t x; |
| 1200 | if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { | 1204 | if (psecs >= hard) { |
| 1201 | /* | 1205 | /* |
| 1202 | * At the hard limit, we just die. | 1206 | * At the hard limit, we just die. |
| 1203 | * No need to calculate anything else now. | 1207 | * No need to calculate anything else now. |
| @@ -1205,17 +1209,17 @@ static void check_process_timers(struct task_struct *tsk, | |||
| 1205 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | 1209 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); |
| 1206 | return; | 1210 | return; |
| 1207 | } | 1211 | } |
| 1208 | if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { | 1212 | if (psecs >= soft) { |
| 1209 | /* | 1213 | /* |
| 1210 | * At the soft limit, send a SIGXCPU every second. | 1214 | * At the soft limit, send a SIGXCPU every second. |
| 1211 | */ | 1215 | */ |
| 1212 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | 1216 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); |
| 1213 | if (sig->rlim[RLIMIT_CPU].rlim_cur | 1217 | if (soft < hard) { |
| 1214 | < sig->rlim[RLIMIT_CPU].rlim_max) { | 1218 | soft++; |
| 1215 | sig->rlim[RLIMIT_CPU].rlim_cur++; | 1219 | sig->rlim[RLIMIT_CPU].rlim_cur = soft; |
| 1216 | } | 1220 | } |
| 1217 | } | 1221 | } |
| 1218 | x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); | 1222 | x = secs_to_cputime(soft); |
| 1219 | if (cputime_eq(prof_expires, cputime_zero) || | 1223 | if (cputime_eq(prof_expires, cputime_zero) || |
| 1220 | cputime_lt(x, prof_expires)) { | 1224 | cputime_lt(x, prof_expires)) { |
| 1221 | prof_expires = x; | 1225 | prof_expires = x; |
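The ACCESS_ONCE() conversions above make each limit check operate on a single snapshot of the rlimit. A sketch of the hazard being closed (send_sigxcpu() is a hypothetical stand-in, not a kernel function):

    /* Without the snapshot, the compiler may reload rlim_cur for each
     * use, so a concurrent setrlimit() can make the INFINITY test and
     * the comparison below see different values. */
    unsigned long soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);

    if (soft != RLIM_INFINITY && psecs >= soft)
            send_sigxcpu();    /* hypothetical helper; both tests share one read */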
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index bbfe472d7524..da5288ec2392 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -323,6 +323,7 @@ static int create_image(int platform_mode) | |||
| 323 | int hibernation_snapshot(int platform_mode) | 323 | int hibernation_snapshot(int platform_mode) |
| 324 | { | 324 | { |
| 325 | int error; | 325 | int error; |
| 326 | gfp_t saved_mask; | ||
| 326 | 327 | ||
| 327 | error = platform_begin(platform_mode); | 328 | error = platform_begin(platform_mode); |
| 328 | if (error) | 329 | if (error) |
| @@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mode) | |||
| 334 | goto Close; | 335 | goto Close; |
| 335 | 336 | ||
| 336 | suspend_console(); | 337 | suspend_console(); |
| 338 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
| 337 | error = dpm_suspend_start(PMSG_FREEZE); | 339 | error = dpm_suspend_start(PMSG_FREEZE); |
| 338 | if (error) | 340 | if (error) |
| 339 | goto Recover_platform; | 341 | goto Recover_platform; |
| @@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mode) | |||
| 351 | 353 | ||
| 352 | dpm_resume_end(in_suspend ? | 354 | dpm_resume_end(in_suspend ? |
| 353 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); | 355 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); |
| 356 | set_gfp_allowed_mask(saved_mask); | ||
| 354 | resume_console(); | 357 | resume_console(); |
| 355 | Close: | 358 | Close: |
| 356 | platform_end(platform_mode); | 359 | platform_end(platform_mode); |
| @@ -445,14 +448,17 @@ static int resume_target_kernel(bool platform_mode) | |||
| 445 | int hibernation_restore(int platform_mode) | 448 | int hibernation_restore(int platform_mode) |
| 446 | { | 449 | { |
| 447 | int error; | 450 | int error; |
| 451 | gfp_t saved_mask; | ||
| 448 | 452 | ||
| 449 | pm_prepare_console(); | 453 | pm_prepare_console(); |
| 450 | suspend_console(); | 454 | suspend_console(); |
| 455 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
| 451 | error = dpm_suspend_start(PMSG_QUIESCE); | 456 | error = dpm_suspend_start(PMSG_QUIESCE); |
| 452 | if (!error) { | 457 | if (!error) { |
| 453 | error = resume_target_kernel(platform_mode); | 458 | error = resume_target_kernel(platform_mode); |
| 454 | dpm_resume_end(PMSG_RECOVER); | 459 | dpm_resume_end(PMSG_RECOVER); |
| 455 | } | 460 | } |
| 461 | set_gfp_allowed_mask(saved_mask); | ||
| 456 | resume_console(); | 462 | resume_console(); |
| 457 | pm_restore_console(); | 463 | pm_restore_console(); |
| 458 | return error; | 464 | return error; |
| @@ -466,6 +472,7 @@ int hibernation_restore(int platform_mode) | |||
| 466 | int hibernation_platform_enter(void) | 472 | int hibernation_platform_enter(void) |
| 467 | { | 473 | { |
| 468 | int error; | 474 | int error; |
| 475 | gfp_t saved_mask; | ||
| 469 | 476 | ||
| 470 | if (!hibernation_ops) | 477 | if (!hibernation_ops) |
| 471 | return -ENOSYS; | 478 | return -ENOSYS; |
| @@ -481,6 +488,7 @@ int hibernation_platform_enter(void) | |||
| 481 | 488 | ||
| 482 | entering_platform_hibernation = true; | 489 | entering_platform_hibernation = true; |
| 483 | suspend_console(); | 490 | suspend_console(); |
| 491 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
| 484 | error = dpm_suspend_start(PMSG_HIBERNATE); | 492 | error = dpm_suspend_start(PMSG_HIBERNATE); |
| 485 | if (error) { | 493 | if (error) { |
| 486 | if (hibernation_ops->recover) | 494 | if (hibernation_ops->recover) |
| @@ -518,6 +526,7 @@ int hibernation_platform_enter(void) | |||
| 518 | Resume_devices: | 526 | Resume_devices: |
| 519 | entering_platform_hibernation = false; | 527 | entering_platform_hibernation = false; |
| 520 | dpm_resume_end(PMSG_RESTORE); | 528 | dpm_resume_end(PMSG_RESTORE); |
| 529 | set_gfp_allowed_mask(saved_mask); | ||
| 521 | resume_console(); | 530 | resume_console(); |
| 522 | 531 | ||
| 523 | Close: | 532 | Close: |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6f10dfc2d3e9..44cce10b582d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -189,6 +189,7 @@ static int suspend_enter(suspend_state_t state) | |||
| 189 | int suspend_devices_and_enter(suspend_state_t state) | 189 | int suspend_devices_and_enter(suspend_state_t state) |
| 190 | { | 190 | { |
| 191 | int error; | 191 | int error; |
| 192 | gfp_t saved_mask; | ||
| 192 | 193 | ||
| 193 | if (!suspend_ops) | 194 | if (!suspend_ops) |
| 194 | return -ENOSYS; | 195 | return -ENOSYS; |
| @@ -199,6 +200,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 199 | goto Close; | 200 | goto Close; |
| 200 | } | 201 | } |
| 201 | suspend_console(); | 202 | suspend_console(); |
| 203 | saved_mask = clear_gfp_allowed_mask(GFP_IOFS); | ||
| 202 | suspend_test_start(); | 204 | suspend_test_start(); |
| 203 | error = dpm_suspend_start(PMSG_SUSPEND); | 205 | error = dpm_suspend_start(PMSG_SUSPEND); |
| 204 | if (error) { | 206 | if (error) { |
| @@ -215,6 +217,7 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 215 | suspend_test_start(); | 217 | suspend_test_start(); |
| 216 | dpm_resume_end(PMSG_RESUME); | 218 | dpm_resume_end(PMSG_RESUME); |
| 217 | suspend_test_finish("resume devices"); | 219 | suspend_test_finish("resume devices"); |
| 220 | set_gfp_allowed_mask(saved_mask); | ||
| 218 | resume_console(); | 221 | resume_console(); |
| 219 | Close: | 222 | Close: |
| 220 | if (suspend_ops->end) | 223 | if (suspend_ops->end) |
diff --git a/kernel/printk.c b/kernel/printk.c index 1751c456b71f..75077ad0b537 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -35,6 +35,7 @@ | |||
| 35 | #include <linux/kexec.h> | 35 | #include <linux/kexec.h> |
| 36 | #include <linux/ratelimit.h> | 36 | #include <linux/ratelimit.h> |
| 37 | #include <linux/kmsg_dump.h> | 37 | #include <linux/kmsg_dump.h> |
| 38 | #include <linux/syslog.h> | ||
| 38 | 39 | ||
| 39 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
| 40 | 41 | ||
| @@ -69,8 +70,6 @@ int console_printk[4] = { | |||
| 69 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ | 70 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ |
| 70 | }; | 71 | }; |
| 71 | 72 | ||
| 72 | static int saved_console_loglevel = -1; | ||
| 73 | |||
| 74 | /* | 73 | /* |
| 75 | * Low level drivers may need that to know if they can schedule in | 74 | * Low level drivers may need that to know if they can schedule in |
| 76 | * their unblank() callback or not. So let's export it. | 75 | * their unblank() callback or not. So let's export it. |
| @@ -145,6 +144,7 @@ static char __log_buf[__LOG_BUF_LEN]; | |||
| 145 | static char *log_buf = __log_buf; | 144 | static char *log_buf = __log_buf; |
| 146 | static int log_buf_len = __LOG_BUF_LEN; | 145 | static int log_buf_len = __LOG_BUF_LEN; |
| 147 | static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ | 146 | static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ |
| 147 | static int saved_console_loglevel = -1; | ||
| 148 | 148 | ||
| 149 | #ifdef CONFIG_KEXEC | 149 | #ifdef CONFIG_KEXEC |
| 150 | /* | 150 | /* |
| @@ -258,38 +258,23 @@ static inline void boot_delay_msec(void) | |||
| 258 | } | 258 | } |
| 259 | #endif | 259 | #endif |
| 260 | 260 | ||
| 261 | /* | 261 | int do_syslog(int type, char __user *buf, int len, bool from_file) |
| 262 | * Commands to do_syslog: | ||
| 263 | * | ||
| 264 | * 0 -- Close the log. Currently a NOP. | ||
| 265 | * 1 -- Open the log. Currently a NOP. | ||
| 266 | * 2 -- Read from the log. | ||
| 267 | * 3 -- Read all messages remaining in the ring buffer. | ||
| 268 | * 4 -- Read and clear all messages remaining in the ring buffer | ||
| 269 | * 5 -- Clear ring buffer. | ||
| 270 | * 6 -- Disable printk's to console | ||
| 271 | * 7 -- Enable printk's to console | ||
| 272 | * 8 -- Set level of messages printed to console | ||
| 273 | * 9 -- Return number of unread characters in the log buffer | ||
| 274 | * 10 -- Return size of the log buffer | ||
| 275 | */ | ||
| 276 | int do_syslog(int type, char __user *buf, int len) | ||
| 277 | { | 262 | { |
| 278 | unsigned i, j, limit, count; | 263 | unsigned i, j, limit, count; |
| 279 | int do_clear = 0; | 264 | int do_clear = 0; |
| 280 | char c; | 265 | char c; |
| 281 | int error = 0; | 266 | int error = 0; |
| 282 | 267 | ||
| 283 | error = security_syslog(type); | 268 | error = security_syslog(type, from_file); |
| 284 | if (error) | 269 | if (error) |
| 285 | return error; | 270 | return error; |
| 286 | 271 | ||
| 287 | switch (type) { | 272 | switch (type) { |
| 288 | case 0: /* Close log */ | 273 | case SYSLOG_ACTION_CLOSE: /* Close log */ |
| 289 | break; | 274 | break; |
| 290 | case 1: /* Open log */ | 275 | case SYSLOG_ACTION_OPEN: /* Open log */ |
| 291 | break; | 276 | break; |
| 292 | case 2: /* Read from log */ | 277 | case SYSLOG_ACTION_READ: /* Read from log */ |
| 293 | error = -EINVAL; | 278 | error = -EINVAL; |
| 294 | if (!buf || len < 0) | 279 | if (!buf || len < 0) |
| 295 | goto out; | 280 | goto out; |
| @@ -320,10 +305,12 @@ int do_syslog(int type, char __user *buf, int len) | |||
| 320 | if (!error) | 305 | if (!error) |
| 321 | error = i; | 306 | error = i; |
| 322 | break; | 307 | break; |
| 323 | case 4: /* Read/clear last kernel messages */ | 308 | /* Read/clear last kernel messages */ |
| 309 | case SYSLOG_ACTION_READ_CLEAR: | ||
| 324 | do_clear = 1; | 310 | do_clear = 1; |
| 325 | /* FALL THRU */ | 311 | /* FALL THRU */ |
| 326 | case 3: /* Read last kernel messages */ | 312 | /* Read last kernel messages */ |
| 313 | case SYSLOG_ACTION_READ_ALL: | ||
| 327 | error = -EINVAL; | 314 | error = -EINVAL; |
| 328 | if (!buf || len < 0) | 315 | if (!buf || len < 0) |
| 329 | goto out; | 316 | goto out; |
| @@ -376,21 +363,25 @@ int do_syslog(int type, char __user *buf, int len) | |||
| 376 | } | 363 | } |
| 377 | } | 364 | } |
| 378 | break; | 365 | break; |
| 379 | case 5: /* Clear ring buffer */ | 366 | /* Clear ring buffer */ |
| 367 | case SYSLOG_ACTION_CLEAR: | ||
| 380 | logged_chars = 0; | 368 | logged_chars = 0; |
| 381 | break; | 369 | break; |
| 382 | case 6: /* Disable logging to console */ | 370 | /* Disable logging to console */ |
| 371 | case SYSLOG_ACTION_CONSOLE_OFF: | ||
| 383 | if (saved_console_loglevel == -1) | 372 | if (saved_console_loglevel == -1) |
| 384 | saved_console_loglevel = console_loglevel; | 373 | saved_console_loglevel = console_loglevel; |
| 385 | console_loglevel = minimum_console_loglevel; | 374 | console_loglevel = minimum_console_loglevel; |
| 386 | break; | 375 | break; |
| 387 | case 7: /* Enable logging to console */ | 376 | /* Enable logging to console */ |
| 377 | case SYSLOG_ACTION_CONSOLE_ON: | ||
| 388 | if (saved_console_loglevel != -1) { | 378 | if (saved_console_loglevel != -1) { |
| 389 | console_loglevel = saved_console_loglevel; | 379 | console_loglevel = saved_console_loglevel; |
| 390 | saved_console_loglevel = -1; | 380 | saved_console_loglevel = -1; |
| 391 | } | 381 | } |
| 392 | break; | 382 | break; |
| 393 | case 8: /* Set level of messages printed to console */ | 383 | /* Set level of messages printed to console */ |
| 384 | case SYSLOG_ACTION_CONSOLE_LEVEL: | ||
| 394 | error = -EINVAL; | 385 | error = -EINVAL; |
| 395 | if (len < 1 || len > 8) | 386 | if (len < 1 || len > 8) |
| 396 | goto out; | 387 | goto out; |
| @@ -401,10 +392,12 @@ int do_syslog(int type, char __user *buf, int len) | |||
| 401 | saved_console_loglevel = -1; | 392 | saved_console_loglevel = -1; |
| 402 | error = 0; | 393 | error = 0; |
| 403 | break; | 394 | break; |
| 404 | case 9: /* Number of chars in the log buffer */ | 395 | /* Number of chars in the log buffer */ |
| 396 | case SYSLOG_ACTION_SIZE_UNREAD: | ||
| 405 | error = log_end - log_start; | 397 | error = log_end - log_start; |
| 406 | break; | 398 | break; |
| 407 | case 10: /* Size of the log buffer */ | 399 | /* Size of the log buffer */ |
| 400 | case SYSLOG_ACTION_SIZE_BUFFER: | ||
| 408 | error = log_buf_len; | 401 | error = log_buf_len; |
| 409 | break; | 402 | break; |
| 410 | default: | 403 | default: |
| @@ -417,7 +410,7 @@ out: | |||
| 417 | 410 | ||
| 418 | SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) | 411 | SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) |
| 419 | { | 412 | { |
| 420 | return do_syslog(type, buf, len); | 413 | return do_syslog(type, buf, len, SYSLOG_FROM_CALL); |
| 421 | } | 414 | } |
| 422 | 415 | ||
| 423 | /* | 416 | /* |
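The do_syslog() rewrite above replaces the numbered cases with SYSLOG_ACTION_* constants from the new <linux/syslog.h>; the numeric values are unchanged, so existing callers keep working. From userspace the same commands are reachable through glibc's klogctl(3) — a minimal sketch:

    #include <stdio.h>
    #include <sys/klog.h>              /* klogctl() */

    int main(void)
    {
            /* 10 == SYSLOG_ACTION_SIZE_BUFFER: size of the kernel log buffer */
            int len = klogctl(10, NULL, 0);

            if (len < 0)
                    perror("klogctl");
            else
                    printf("log_buf_len = %d\n", len);
            return 0;
    }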
diff --git a/kernel/range.c b/kernel/range.c new file mode 100644 index 000000000000..74e2e6114927 --- /dev/null +++ b/kernel/range.c | |||
| @@ -0,0 +1,163 @@ | |||
| 1 | /* | ||
| 2 | * Range add and subtract | ||
| 3 | */ | ||
| 4 | #include <linux/module.h> | ||
| 5 | #include <linux/init.h> | ||
| 6 | #include <linux/sort.h> | ||
| 7 | |||
| 8 | #include <linux/range.h> | ||
| 9 | |||
| 10 | #ifndef ARRAY_SIZE | ||
| 11 | #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) | ||
| 12 | #endif | ||
| 13 | |||
| 14 | int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) | ||
| 15 | { | ||
| 16 | if (start >= end) | ||
| 17 | return nr_range; | ||
| 18 | |||
| 19 | /* Out of slots: */ | ||
| 20 | if (nr_range >= az) | ||
| 21 | return nr_range; | ||
| 22 | |||
| 23 | range[nr_range].start = start; | ||
| 24 | range[nr_range].end = end; | ||
| 25 | |||
| 26 | nr_range++; | ||
| 27 | |||
| 28 | return nr_range; | ||
| 29 | } | ||
| 30 | |||
| 31 | int add_range_with_merge(struct range *range, int az, int nr_range, | ||
| 32 | u64 start, u64 end) | ||
| 33 | { | ||
| 34 | int i; | ||
| 35 | |||
| 36 | if (start >= end) | ||
| 37 | return nr_range; | ||
| 38 | |||
| 39 | /* Try to merge it with old one: */ | ||
| 40 | for (i = 0; i < nr_range; i++) { | ||
| 41 | u64 final_start, final_end; | ||
| 42 | u64 common_start, common_end; | ||
| 43 | |||
| 44 | if (!range[i].end) | ||
| 45 | continue; | ||
| 46 | |||
| 47 | common_start = max(range[i].start, start); | ||
| 48 | common_end = min(range[i].end, end); | ||
| 49 | if (common_start > common_end) | ||
| 50 | continue; | ||
| 51 | |||
| 52 | final_start = min(range[i].start, start); | ||
| 53 | final_end = max(range[i].end, end); | ||
| 54 | |||
| 55 | range[i].start = final_start; | ||
| 56 | range[i].end = final_end; | ||
| 57 | return nr_range; | ||
| 58 | } | ||
| 59 | |||
| 60 | /* Need to add it: */ | ||
| 61 | return add_range(range, az, nr_range, start, end); | ||
| 62 | } | ||
| 63 | |||
| 64 | void subtract_range(struct range *range, int az, u64 start, u64 end) | ||
| 65 | { | ||
| 66 | int i, j; | ||
| 67 | |||
| 68 | if (start >= end) | ||
| 69 | return; | ||
| 70 | |||
| 71 | for (j = 0; j < az; j++) { | ||
| 72 | if (!range[j].end) | ||
| 73 | continue; | ||
| 74 | |||
| 75 | if (start <= range[j].start && end >= range[j].end) { | ||
| 76 | range[j].start = 0; | ||
| 77 | range[j].end = 0; | ||
| 78 | continue; | ||
| 79 | } | ||
| 80 | |||
| 81 | if (start <= range[j].start && end < range[j].end && | ||
| 82 | range[j].start < end) { | ||
| 83 | range[j].start = end; | ||
| 84 | continue; | ||
| 85 | } | ||
| 86 | |||
| 87 | |||
| 88 | if (start > range[j].start && end >= range[j].end && | ||
| 89 | range[j].end > start) { | ||
| 90 | range[j].end = start; | ||
| 91 | continue; | ||
| 92 | } | ||
| 93 | |||
| 94 | if (start > range[j].start && end < range[j].end) { | ||
| 95 | /* Find the new spare: */ | ||
| 96 | for (i = 0; i < az; i++) { | ||
| 97 | if (range[i].end == 0) | ||
| 98 | break; | ||
| 99 | } | ||
| 100 | if (i < az) { | ||
| 101 | range[i].end = range[j].end; | ||
| 102 | range[i].start = end; | ||
| 103 | } else { | ||
| 104 | printk(KERN_ERR "run out of slots in ranges\n"); | ||
| 105 | } | ||
| 106 | range[j].end = start; | ||
| 107 | continue; | ||
| 108 | } | ||
| 109 | } | ||
| 110 | } | ||
| 111 | |||
| 112 | static int cmp_range(const void *x1, const void *x2) | ||
| 113 | { | ||
| 114 | const struct range *r1 = x1; | ||
| 115 | const struct range *r2 = x2; | ||
| 116 | s64 start1, start2; | ||
| 117 | |||
| 118 | start1 = r1->start; | ||
| 119 | start2 = r2->start; | ||
| 120 | |||
| 121 | return start1 - start2; | ||
| 122 | } | ||
| 123 | |||
| 124 | int clean_sort_range(struct range *range, int az) | ||
| 125 | { | ||
| 126 | int i, j, k = az - 1, nr_range = 0; | ||
| 127 | |||
| 128 | for (i = 0; i < k; i++) { | ||
| 129 | if (range[i].end) | ||
| 130 | continue; | ||
| 131 | for (j = k; j > i; j--) { | ||
| 132 | if (range[j].end) { | ||
| 133 | k = j; | ||
| 134 | break; | ||
| 135 | } | ||
| 136 | } | ||
| 137 | if (j == i) | ||
| 138 | break; | ||
| 139 | range[i].start = range[k].start; | ||
| 140 | range[i].end = range[k].end; | ||
| 141 | range[k].start = 0; | ||
| 142 | range[k].end = 0; | ||
| 143 | k--; | ||
| 144 | } | ||
| 145 | /* count it */ | ||
| 146 | for (i = 0; i < az; i++) { | ||
| 147 | if (!range[i].end) { | ||
| 148 | nr_range = i; | ||
| 149 | break; | ||
| 150 | } | ||
| 151 | } | ||
| 152 | |||
| 153 | /* sort them */ | ||
| 154 | sort(range, nr_range, sizeof(struct range), cmp_range, NULL); | ||
| 155 | |||
| 156 | return nr_range; | ||
| 157 | } | ||
| 158 | |||
| 159 | void sort_range(struct range *range, int nr_range) | ||
| 160 | { | ||
| 161 | /* sort them */ | ||
| 162 | sort(range, nr_range, sizeof(struct range), cmp_range, NULL); | ||
| 163 | } | ||
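The new kernel/range.c stores ranges in a caller-supplied fixed-size array, merging overlaps on add and splitting on subtract; a cleared slot is marked by end == 0. A usage sketch (the values and RANGE_MAX are illustrative):

    #define RANGE_MAX 8
    static struct range ranges[RANGE_MAX];
    int nr = 0;

    nr = add_range_with_merge(ranges, RANGE_MAX, nr, 0x1000, 0x3000);
    /* Overlaps the first range, so it merges to [0x1000, 0x5000): */
    nr = add_range_with_merge(ranges, RANGE_MAX, nr, 0x2000, 0x5000);
    /* Punching a hole splits it into [0x1000, 0x2000) and [0x3000, 0x5000): */
    subtract_range(ranges, RANGE_MAX, 0x2000, 0x3000);
    /* Compacts the cleared slots, sorts by start, returns 2: */
    nr = clean_sort_range(ranges, RANGE_MAX);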
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 258cdf0a91eb..58df55bf83ed 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -818,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused) | |||
| 818 | /* Should not happen, but... */ | 818 | /* Should not happen, but... */ |
| 819 | pipe_count = RCU_TORTURE_PIPE_LEN; | 819 | pipe_count = RCU_TORTURE_PIPE_LEN; |
| 820 | } | 820 | } |
| 821 | __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); | 821 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
| 822 | completed = cur_ops->completed() - completed; | 822 | completed = cur_ops->completed() - completed; |
| 823 | if (completed > RCU_TORTURE_PIPE_LEN) { | 823 | if (completed > RCU_TORTURE_PIPE_LEN) { |
| 824 | /* Should not happen, but... */ | 824 | /* Should not happen, but... */ |
| 825 | completed = RCU_TORTURE_PIPE_LEN; | 825 | completed = RCU_TORTURE_PIPE_LEN; |
| 826 | } | 826 | } |
| 827 | __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); | 827 | __this_cpu_inc(rcu_torture_batch[completed]); |
| 828 | preempt_enable(); | 828 | preempt_enable(); |
| 829 | cur_ops->readunlock(idx); | 829 | cur_ops->readunlock(idx); |
| 830 | } | 830 | } |
| @@ -877,13 +877,13 @@ rcu_torture_reader(void *arg) | |||
| 877 | /* Should not happen, but... */ | 877 | /* Should not happen, but... */ |
| 878 | pipe_count = RCU_TORTURE_PIPE_LEN; | 878 | pipe_count = RCU_TORTURE_PIPE_LEN; |
| 879 | } | 879 | } |
| 880 | __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); | 880 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
| 881 | completed = cur_ops->completed() - completed; | 881 | completed = cur_ops->completed() - completed; |
| 882 | if (completed > RCU_TORTURE_PIPE_LEN) { | 882 | if (completed > RCU_TORTURE_PIPE_LEN) { |
| 883 | /* Should not happen, but... */ | 883 | /* Should not happen, but... */ |
| 884 | completed = RCU_TORTURE_PIPE_LEN; | 884 | completed = RCU_TORTURE_PIPE_LEN; |
| 885 | } | 885 | } |
| 886 | __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); | 886 | __this_cpu_inc(rcu_torture_batch[completed]); |
| 887 | preempt_enable(); | 887 | preempt_enable(); |
| 888 | cur_ops->readunlock(idx); | 888 | cur_ops->readunlock(idx); |
| 889 | schedule(); | 889 | schedule(); |
diff --git a/kernel/relay.c b/kernel/relay.c index c705a41b4ba3..3d97f2821611 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
| @@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) | |||
| 1215 | /* | 1215 | /* |
| 1216 | * subbuf_splice_actor - splice up to one subbuf's worth of data | 1216 | * subbuf_splice_actor - splice up to one subbuf's worth of data |
| 1217 | */ | 1217 | */ |
| 1218 | static int subbuf_splice_actor(struct file *in, | 1218 | static ssize_t subbuf_splice_actor(struct file *in, |
| 1219 | loff_t *ppos, | 1219 | loff_t *ppos, |
| 1220 | struct pipe_inode_info *pipe, | 1220 | struct pipe_inode_info *pipe, |
| 1221 | size_t len, | 1221 | size_t len, |
| 1222 | unsigned int flags, | 1222 | unsigned int flags, |
| 1223 | int *nonpad_ret) | 1223 | int *nonpad_ret) |
| 1224 | { | 1224 | { |
| 1225 | unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; | 1225 | unsigned int pidx, poff, total_len, subbuf_pages, nr_pages; |
| 1226 | struct rchan_buf *rbuf = in->private_data; | 1226 | struct rchan_buf *rbuf = in->private_data; |
| 1227 | unsigned int subbuf_size = rbuf->chan->subbuf_size; | 1227 | unsigned int subbuf_size = rbuf->chan->subbuf_size; |
| 1228 | uint64_t pos = (uint64_t) *ppos; | 1228 | uint64_t pos = (uint64_t) *ppos; |
| @@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in, | |||
| 1241 | .ops = &relay_pipe_buf_ops, | 1241 | .ops = &relay_pipe_buf_ops, |
| 1242 | .spd_release = relay_page_release, | 1242 | .spd_release = relay_page_release, |
| 1243 | }; | 1243 | }; |
| 1244 | ssize_t ret; | ||
| 1244 | 1245 | ||
| 1245 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) | 1246 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) |
| 1246 | return 0; | 1247 | return 0; |
diff --git a/kernel/resource.c b/kernel/resource.c index 4e9d87fd7bc5..2d5be5d9bf5f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -304,7 +304,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
| 304 | void *arg, int (*func)(unsigned long, unsigned long, void *)) | 304 | void *arg, int (*func)(unsigned long, unsigned long, void *)) |
| 305 | { | 305 | { |
| 306 | struct resource res; | 306 | struct resource res; |
| 307 | unsigned long pfn, len; | 307 | unsigned long pfn, end_pfn; |
| 308 | u64 orig_end; | 308 | u64 orig_end; |
| 309 | int ret = -1; | 309 | int ret = -1; |
| 310 | 310 | ||
| @@ -314,9 +314,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
| 314 | orig_end = res.end; | 314 | orig_end = res.end; |
| 315 | while ((res.start < res.end) && | 315 | while ((res.start < res.end) && |
| 316 | (find_next_system_ram(&res, "System RAM") >= 0)) { | 316 | (find_next_system_ram(&res, "System RAM") >= 0)) { |
| 317 | pfn = (unsigned long)(res.start >> PAGE_SHIFT); | 317 | pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; |
| 318 | len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); | 318 | end_pfn = (res.end + 1) >> PAGE_SHIFT; |
| 319 | ret = (*func)(pfn, len, arg); | 319 | if (end_pfn > pfn) |
| 320 | ret = (*func)(pfn, end_pfn - pfn, arg); | ||
| 320 | if (ret) | 321 | if (ret) |
| 321 | break; | 322 | break; |
| 322 | res.start = res.end + 1; | 323 | res.start = res.end + 1; |
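The walk_system_ram_range() fix rounds the start pfn up and the end pfn down, so only pages fully inside the resource are reported, and a resource spanning no complete page is skipped. With PAGE_SHIFT = 12 and an unaligned resource (addresses illustrative):

    /* res.start = 0x10800, res.end = 0x11fff:
     *   old: pfn = 0x10800 >> 12 = 0x10, len = 0x1800 >> 12 = 1
     *        -> reports page 0x10, which is only half covered by RAM
     *   new: pfn     = (0x10800 + 0xfff) >> 12 = 0x11
     *        end_pfn = (0x11fff + 1)     >> 12 = 0x12
     *        -> reports exactly the one fully covered page, [0x11, 0x12)
     */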
diff --git a/kernel/sched.c b/kernel/sched.c index 6a212c97f523..150b6988de49 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -1521,7 +1521,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
| 1521 | 1521 | ||
| 1522 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1522 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1523 | 1523 | ||
| 1524 | static __read_mostly unsigned long *update_shares_data; | 1524 | static __read_mostly unsigned long __percpu *update_shares_data; |
| 1525 | 1525 | ||
| 1526 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1526 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
| 1527 | 1527 | ||
| @@ -4353,7 +4353,7 @@ int can_nice(const struct task_struct *p, const int nice) | |||
| 4353 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 4353 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
| 4354 | int nice_rlim = 20 - nice; | 4354 | int nice_rlim = 20 - nice; |
| 4355 | 4355 | ||
| 4356 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || | 4356 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || |
| 4357 | capable(CAP_SYS_NICE)); | 4357 | capable(CAP_SYS_NICE)); |
| 4358 | } | 4358 | } |
| 4359 | 4359 | ||
| @@ -4530,7 +4530,7 @@ recheck: | |||
| 4530 | 4530 | ||
| 4531 | if (!lock_task_sighand(p, &flags)) | 4531 | if (!lock_task_sighand(p, &flags)) |
| 4532 | return -ESRCH; | 4532 | return -ESRCH; |
| 4533 | rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; | 4533 | rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); |
| 4534 | unlock_task_sighand(p, &flags); | 4534 | unlock_task_sighand(p, &flags); |
| 4535 | 4535 | ||
| 4536 | /* can't set/change the rt policy */ | 4536 | /* can't set/change the rt policy */ |
| @@ -7406,11 +7406,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
| 7406 | 7406 | ||
| 7407 | #ifdef CONFIG_SCHED_MC | 7407 | #ifdef CONFIG_SCHED_MC |
| 7408 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, | 7408 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, |
| 7409 | struct sysdev_class_attribute *attr, | ||
| 7409 | char *page) | 7410 | char *page) |
| 7410 | { | 7411 | { |
| 7411 | return sprintf(page, "%u\n", sched_mc_power_savings); | 7412 | return sprintf(page, "%u\n", sched_mc_power_savings); |
| 7412 | } | 7413 | } |
| 7413 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, | 7414 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, |
| 7415 | struct sysdev_class_attribute *attr, | ||
| 7414 | const char *buf, size_t count) | 7416 | const char *buf, size_t count) |
| 7415 | { | 7417 | { |
| 7416 | return sched_power_savings_store(buf, count, 0); | 7418 | return sched_power_savings_store(buf, count, 0); |
| @@ -7422,11 +7424,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, | |||
| 7422 | 7424 | ||
| 7423 | #ifdef CONFIG_SCHED_SMT | 7425 | #ifdef CONFIG_SCHED_SMT |
| 7424 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, | 7426 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, |
| 7427 | struct sysdev_class_attribute *attr, | ||
| 7425 | char *page) | 7428 | char *page) |
| 7426 | { | 7429 | { |
| 7427 | return sprintf(page, "%u\n", sched_smt_power_savings); | 7430 | return sprintf(page, "%u\n", sched_smt_power_savings); |
| 7428 | } | 7431 | } |
| 7429 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, | 7432 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, |
| 7433 | struct sysdev_class_attribute *attr, | ||
| 7430 | const char *buf, size_t count) | 7434 | const char *buf, size_t count) |
| 7431 | { | 7435 | { |
| 7432 | return sched_power_savings_store(buf, count, 1); | 7436 | return sched_power_savings_store(buf, count, 1); |
| @@ -8813,7 +8817,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
| 8813 | struct cpuacct { | 8817 | struct cpuacct { |
| 8814 | struct cgroup_subsys_state css; | 8818 | struct cgroup_subsys_state css; |
| 8815 | /* cpuusage holds pointer to a u64-type object on every cpu */ | 8819 | /* cpuusage holds pointer to a u64-type object on every cpu */ |
| 8816 | u64 *cpuusage; | 8820 | u64 __percpu *cpuusage; |
| 8817 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; | 8821 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; |
| 8818 | struct cpuacct *parent; | 8822 | struct cpuacct *parent; |
| 8819 | }; | 8823 | }; |
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index eeb3506c4834..fccf9fbb0d7b 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c | |||
| @@ -47,7 +47,7 @@ static int convert_prio(int prio) | |||
| 47 | } | 47 | } |
| 48 | 48 | ||
| 49 | #define for_each_cpupri_active(array, idx) \ | 49 | #define for_each_cpupri_active(array, idx) \ |
| 50 | for_each_bit(idx, array, CPUPRI_NR_PRIORITIES) | 50 | for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) |
| 51 | 51 | ||
| 52 | /** | 52 | /** |
| 53 | * cpupri_find - find the best (lowest-pri) CPU in the system | 53 | * cpupri_find - find the best (lowest-pri) CPU in the system |
| @@ -56,7 +56,7 @@ static int convert_prio(int prio) | |||
| 56 | * @lowest_mask: A mask to fill in with selected CPUs (or NULL) | 56 | * @lowest_mask: A mask to fill in with selected CPUs (or NULL) |
| 57 | * | 57 | * |
| 58 | * Note: This function returns the recommended CPUs as calculated during the | 58 | * Note: This function returns the recommended CPUs as calculated during the |
| 59 | * current invokation. By the time the call returns, the CPUs may have in | 59 | * current invocation. By the time the call returns, the CPUs may have in |
| 60 | * fact changed priorities any number of times. While not ideal, it is not | 60 | * fact changed priorities any number of times. While not ideal, it is not |
| 61 | * an issue of correctness since the normal rebalancer logic will correct | 61 | * an issue of correctness since the normal rebalancer logic will correct |
| 62 | * any discrepancies created by racing against the uncertainty of the current | 62 | * any discrepancies created by racing against the uncertainty of the current |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bf3e38fdbe6d..5a6ed1f0990a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -1662,8 +1662,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
| 1662 | if (!p->signal) | 1662 | if (!p->signal) |
| 1663 | return; | 1663 | return; |
| 1664 | 1664 | ||
| 1665 | soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; | 1665 | /* max may change after cur was read; this will be fixed next tick */ |
| 1666 | hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; | 1666 | soft = task_rlimit(p, RLIMIT_RTTIME); |
| 1667 | hard = task_rlimit_max(p, RLIMIT_RTTIME); | ||
| 1667 | 1668 | ||
| 1668 | if (soft != RLIM_INFINITY) { | 1669 | if (soft != RLIM_INFINITY) { |
| 1669 | unsigned long next; | 1670 | unsigned long next; |
diff --git a/kernel/signal.c b/kernel/signal.c index 934ae5e687b9..dbd7fe073c55 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -159,6 +159,10 @@ void recalc_sigpending(void) | |||
| 159 | 159 | ||
| 160 | /* Given the mask, find the first available signal that should be serviced. */ | 160 | /* Given the mask, find the first available signal that should be serviced. */ |
| 161 | 161 | ||
| 162 | #define SYNCHRONOUS_MASK \ | ||
| 163 | (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ | ||
| 164 | sigmask(SIGTRAP) | sigmask(SIGFPE)) | ||
| 165 | |||
| 162 | int next_signal(struct sigpending *pending, sigset_t *mask) | 166 | int next_signal(struct sigpending *pending, sigset_t *mask) |
| 163 | { | 167 | { |
| 164 | unsigned long i, *s, *m, x; | 168 | unsigned long i, *s, *m, x; |
| @@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask) | |||
| 166 | 170 | ||
| 167 | s = pending->signal.sig; | 171 | s = pending->signal.sig; |
| 168 | m = mask->sig; | 172 | m = mask->sig; |
| 173 | |||
| 174 | /* | ||
| 175 | * Handle the first word specially: it contains the | ||
| 176 | * synchronous signals that need to be dequeued first. | ||
| 177 | */ | ||
| 178 | x = *s &~ *m; | ||
| 179 | if (x) { | ||
| 180 | if (x & SYNCHRONOUS_MASK) | ||
| 181 | x &= SYNCHRONOUS_MASK; | ||
| 182 | sig = ffz(~x) + 1; | ||
| 183 | return sig; | ||
| 184 | } | ||
| 185 | |||
| 169 | switch (_NSIG_WORDS) { | 186 | switch (_NSIG_WORDS) { |
| 170 | default: | 187 | default: |
| 171 | for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) | 188 | for (i = 1; i < _NSIG_WORDS; ++i) { |
| 172 | if ((x = *s &~ *m) != 0) { | 189 | x = *++s &~ *++m; |
| 173 | sig = ffz(~x) + i*_NSIG_BPW + 1; | 190 | if (!x) |
| 174 | break; | 191 | continue; |
| 175 | } | 192 | sig = ffz(~x) + i*_NSIG_BPW + 1; |
| 193 | break; | ||
| 194 | } | ||
| 176 | break; | 195 | break; |
| 177 | 196 | ||
| 178 | case 2: if ((x = s[0] &~ m[0]) != 0) | 197 | case 2: |
| 179 | sig = 1; | 198 | x = s[1] &~ m[1]; |
| 180 | else if ((x = s[1] &~ m[1]) != 0) | 199 | if (!x) |
| 181 | sig = _NSIG_BPW + 1; | ||
| 182 | else | ||
| 183 | break; | 200 | break; |
| 184 | sig += ffz(~x); | 201 | sig = ffz(~x) + _NSIG_BPW + 1; |
| 185 | break; | 202 | break; |
| 186 | 203 | ||
| 187 | case 1: if ((x = *s &~ *m) != 0) | 204 | case 1: |
| 188 | sig = ffz(~x) + 1; | 205 | /* Nothing to do */ |
| 189 | break; | 206 | break; |
| 190 | } | 207 | } |
| 191 | 208 | ||
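The practical effect: when a fault-style signal is pending alongside a lower-numbered asynchronous one, the synchronous signal is now dequeued first. A standalone userspace illustration of the first-word logic (not kernel code; __builtin_ctzl(x) + 1 plays the role of ffz(~x) + 1):

#include <stdio.h>

#define SIGMASK(sig)	(1UL << ((sig) - 1))
/* SIGILL=4, SIGTRAP=5, SIGBUS=7, SIGFPE=8, SIGSEGV=11 on x86 */
#define SYNC_MASK	(SIGMASK(4) | SIGMASK(5) | SIGMASK(7) | \
			 SIGMASK(8) | SIGMASK(11))

int main(void)
{
	unsigned long x = SIGMASK(1) | SIGMASK(11);	/* SIGHUP and SIGSEGV pending */

	if (x & SYNC_MASK)
		x &= SYNC_MASK;		/* prefer the synchronous subset */
	printf("dequeue signal %d\n", (int)(__builtin_ctzl(x) + 1));	/* 11, not 1 */
	return 0;
}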
| @@ -228,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi | |||
| 228 | 245 | ||
| 229 | if (override_rlimit || | 246 | if (override_rlimit || |
| 230 | atomic_read(&user->sigpending) <= | 247 | atomic_read(&user->sigpending) <= |
| 231 | t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { | 248 | task_rlimit(t, RLIMIT_SIGPENDING)) { |
| 232 | q = kmem_cache_alloc(sigqueue_cachep, flags); | 249 | q = kmem_cache_alloc(sigqueue_cachep, flags); |
| 233 | } else { | 250 | } else { |
| 234 | print_dropped_signal(sig); | 251 | print_dropped_signal(sig); |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 912823e2a11b..9bb9fb1bd79c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -45,7 +45,7 @@ static int refcount; | |||
| 45 | static struct workqueue_struct *stop_machine_wq; | 45 | static struct workqueue_struct *stop_machine_wq; |
| 46 | static struct stop_machine_data active, idle; | 46 | static struct stop_machine_data active, idle; |
| 47 | static const struct cpumask *active_cpus; | 47 | static const struct cpumask *active_cpus; |
| 48 | static void *stop_machine_work; | 48 | static void __percpu *stop_machine_work; |
| 49 | 49 | ||
| 50 | static void set_state(enum stopmachine_state newstate) | 50 | static void set_state(enum stopmachine_state newstate) |
| 51 | { | 51 | { |
diff --git a/kernel/sys.c b/kernel/sys.c index 877fe4f8e05e..8298878f4f71 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -33,6 +33,7 @@ | |||
| 33 | #include <linux/task_io_accounting_ops.h> | 33 | #include <linux/task_io_accounting_ops.h> |
| 34 | #include <linux/seccomp.h> | 34 | #include <linux/seccomp.h> |
| 35 | #include <linux/cpu.h> | 35 | #include <linux/cpu.h> |
| 36 | #include <linux/personality.h> | ||
| 36 | #include <linux/ptrace.h> | 37 | #include <linux/ptrace.h> |
| 37 | #include <linux/fs_struct.h> | 38 | #include <linux/fs_struct.h> |
| 38 | 39 | ||
| @@ -571,8 +572,7 @@ static int set_user(struct cred *new) | |||
| 571 | if (!new_user) | 572 | if (!new_user) |
| 572 | return -EAGAIN; | 573 | return -EAGAIN; |
| 573 | 574 | ||
| 574 | if (atomic_read(&new_user->processes) >= | 575 | if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && |
| 575 | current->signal->rlim[RLIMIT_NPROC].rlim_cur && | ||
| 576 | new_user != INIT_USER) { | 576 | new_user != INIT_USER) { |
| 577 | free_uid(new_user); | 577 | free_uid(new_user); |
| 578 | return -EAGAIN; | 578 | return -EAGAIN; |
| @@ -1115,6 +1115,15 @@ out: | |||
| 1115 | 1115 | ||
| 1116 | DECLARE_RWSEM(uts_sem); | 1116 | DECLARE_RWSEM(uts_sem); |
| 1117 | 1117 | ||
| 1118 | #ifdef COMPAT_UTS_MACHINE | ||
| 1119 | #define override_architecture(name) \ | ||
| 1120 | (current->personality == PER_LINUX32 && \ | ||
| 1121 | copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ | ||
| 1122 | sizeof(COMPAT_UTS_MACHINE))) | ||
| 1123 | #else | ||
| 1124 | #define override_architecture(name) 0 | ||
| 1125 | #endif | ||
| 1126 | |||
| 1118 | SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) | 1127 | SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) |
| 1119 | { | 1128 | { |
| 1120 | int errno = 0; | 1129 | int errno = 0; |
| @@ -1123,9 +1132,66 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) | |||
| 1123 | if (copy_to_user(name, utsname(), sizeof *name)) | 1132 | if (copy_to_user(name, utsname(), sizeof *name)) |
| 1124 | errno = -EFAULT; | 1133 | errno = -EFAULT; |
| 1125 | up_read(&uts_sem); | 1134 | up_read(&uts_sem); |
| 1135 | |||
| 1136 | if (!errno && override_architecture(name)) | ||
| 1137 | errno = -EFAULT; | ||
| 1126 | return errno; | 1138 | return errno; |
| 1127 | } | 1139 | } |
| 1128 | 1140 | ||
| 1141 | #ifdef __ARCH_WANT_SYS_OLD_UNAME | ||
| 1142 | /* | ||
| 1143 | * Old cruft | ||
| 1144 | */ | ||
| 1145 | SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) | ||
| 1146 | { | ||
| 1147 | int error = 0; | ||
| 1148 | |||
| 1149 | if (!name) | ||
| 1150 | return -EFAULT; | ||
| 1151 | |||
| 1152 | down_read(&uts_sem); | ||
| 1153 | if (copy_to_user(name, utsname(), sizeof(*name))) | ||
| 1154 | error = -EFAULT; | ||
| 1155 | up_read(&uts_sem); | ||
| 1156 | |||
| 1157 | if (!error && override_architecture(name)) | ||
| 1158 | error = -EFAULT; | ||
| 1159 | return error; | ||
| 1160 | } | ||
| 1161 | |||
| 1162 | SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) | ||
| 1163 | { | ||
| 1164 | int error; | ||
| 1165 | |||
| 1166 | if (!name) | ||
| 1167 | return -EFAULT; | ||
| 1168 | if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) | ||
| 1169 | return -EFAULT; | ||
| 1170 | |||
| 1171 | down_read(&uts_sem); | ||
| 1172 | error = __copy_to_user(&name->sysname, &utsname()->sysname, | ||
| 1173 | __OLD_UTS_LEN); | ||
| 1174 | error |= __put_user(0, name->sysname + __OLD_UTS_LEN); | ||
| 1175 | error |= __copy_to_user(&name->nodename, &utsname()->nodename, | ||
| 1176 | __OLD_UTS_LEN); | ||
| 1177 | error |= __put_user(0, name->nodename + __OLD_UTS_LEN); | ||
| 1178 | error |= __copy_to_user(&name->release, &utsname()->release, | ||
| 1179 | __OLD_UTS_LEN); | ||
| 1180 | error |= __put_user(0, name->release + __OLD_UTS_LEN); | ||
| 1181 | error |= __copy_to_user(&name->version, &utsname()->version, | ||
| 1182 | __OLD_UTS_LEN); | ||
| 1183 | error |= __put_user(0, name->version + __OLD_UTS_LEN); | ||
| 1184 | error |= __copy_to_user(&name->machine, &utsname()->machine, | ||
| 1185 | __OLD_UTS_LEN); | ||
| 1186 | error |= __put_user(0, name->machine + __OLD_UTS_LEN); | ||
| 1187 | up_read(&uts_sem); | ||
| 1188 | |||
| 1189 | if (!error && override_architecture(name)) | ||
| 1190 | error = -EFAULT; | ||
| 1191 | return error ? -EFAULT : 0; | ||
| 1192 | } | ||
| 1193 | #endif | ||
| 1194 | |||
| 1129 | SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) | 1195 | SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) |
| 1130 | { | 1196 | { |
| 1131 | int errno; | 1197 | int errno; |
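override_architecture() makes all three uname variants report the 32-bit machine string for tasks running under the PER_LINUX32 personality. A hedged userspace illustration of the observable behavior (not part of the patch; the reported string is architecture-dependent):

#include <stdio.h>
#include <sys/personality.h>
#include <sys/utsname.h>

int main(void)
{
	struct utsname u;

	personality(PER_LINUX32);	/* e.g. x86_64 then reports "i686" */
	uname(&u);
	printf("machine: %s\n", u.machine);
	return 0;
}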
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 695384f12a7d..70f2ea758ffe 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16); | |||
| 126 | cond_syscall(sys_setuid16); | 126 | cond_syscall(sys_setuid16); |
| 127 | cond_syscall(sys_vm86old); | 127 | cond_syscall(sys_vm86old); |
| 128 | cond_syscall(sys_vm86); | 128 | cond_syscall(sys_vm86); |
| 129 | cond_syscall(sys_ipc); | ||
| 129 | cond_syscall(compat_sys_ipc); | 130 | cond_syscall(compat_sys_ipc); |
| 130 | cond_syscall(compat_sys_sysctl); | 131 | cond_syscall(compat_sys_sysctl); |
| 131 | cond_syscall(sys_flock); | 132 | cond_syscall(sys_flock); |
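cond_syscall() lets architectures keep sys_ipc in their syscall tables even when the unified sys_ipc() is not built; the reference then resolves to sys_ni_syscall through a weak alias. The generic fallback is roughly the following (hedged; several architectures override it):

/* simplified form of the fallback in <linux/syscalls.h> */
#ifndef cond_syscall
#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
#endif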
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a68b2448468..8686b0f5fc12 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
| 24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
| 25 | #include <linux/sysctl.h> | 25 | #include <linux/sysctl.h> |
| 26 | #include <linux/signal.h> | ||
| 26 | #include <linux/proc_fs.h> | 27 | #include <linux/proc_fs.h> |
| 27 | #include <linux/security.h> | 28 | #include <linux/security.h> |
| 28 | #include <linux/ctype.h> | 29 | #include <linux/ctype.h> |
| @@ -50,6 +51,7 @@ | |||
| 50 | #include <linux/ftrace.h> | 51 | #include <linux/ftrace.h> |
| 51 | #include <linux/slow-work.h> | 52 | #include <linux/slow-work.h> |
| 52 | #include <linux/perf_event.h> | 53 | #include <linux/perf_event.h> |
| 54 | #include <linux/kprobes.h> | ||
| 53 | 55 | ||
| 54 | #include <asm/uaccess.h> | 56 | #include <asm/uaccess.h> |
| 55 | #include <asm/processor.h> | 57 | #include <asm/processor.h> |
| @@ -59,13 +61,23 @@ | |||
| 59 | #include <asm/stacktrace.h> | 61 | #include <asm/stacktrace.h> |
| 60 | #include <asm/io.h> | 62 | #include <asm/io.h> |
| 61 | #endif | 63 | #endif |
| 64 | #ifdef CONFIG_BSD_PROCESS_ACCT | ||
| 65 | #include <linux/acct.h> | ||
| 66 | #endif | ||
| 67 | #ifdef CONFIG_RT_MUTEXES | ||
| 68 | #include <linux/rtmutex.h> | ||
| 69 | #endif | ||
| 70 | #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) | ||
| 71 | #include <linux/lockdep.h> | ||
| 72 | #endif | ||
| 73 | #ifdef CONFIG_CHR_DEV_SG | ||
| 74 | #include <scsi/sg.h> | ||
| 75 | #endif | ||
| 62 | 76 | ||
| 63 | 77 | ||
| 64 | #if defined(CONFIG_SYSCTL) | 78 | #if defined(CONFIG_SYSCTL) |
| 65 | 79 | ||
| 66 | /* External variables not in a header file. */ | 80 | /* External variables not in a header file. */ |
| 67 | extern int C_A_D; | ||
| 68 | extern int print_fatal_signals; | ||
| 69 | extern int sysctl_overcommit_memory; | 81 | extern int sysctl_overcommit_memory; |
| 70 | extern int sysctl_overcommit_ratio; | 82 | extern int sysctl_overcommit_ratio; |
| 71 | extern int sysctl_panic_on_oom; | 83 | extern int sysctl_panic_on_oom; |
| @@ -87,9 +99,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; | |||
| 87 | #ifndef CONFIG_MMU | 99 | #ifndef CONFIG_MMU |
| 88 | extern int sysctl_nr_trim_pages; | 100 | extern int sysctl_nr_trim_pages; |
| 89 | #endif | 101 | #endif |
| 90 | #ifdef CONFIG_RCU_TORTURE_TEST | ||
| 91 | extern int rcutorture_runnable; | ||
| 92 | #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ | ||
| 93 | #ifdef CONFIG_BLOCK | 102 | #ifdef CONFIG_BLOCK |
| 94 | extern int blk_iopoll_enabled; | 103 | extern int blk_iopoll_enabled; |
| 95 | #endif | 104 | #endif |
| @@ -119,14 +128,6 @@ static int min_percpu_pagelist_fract = 8; | |||
| 119 | 128 | ||
| 120 | static int ngroups_max = NGROUPS_MAX; | 129 | static int ngroups_max = NGROUPS_MAX; |
| 121 | 130 | ||
| 122 | #ifdef CONFIG_MODULES | ||
| 123 | extern char modprobe_path[]; | ||
| 124 | extern int modules_disabled; | ||
| 125 | #endif | ||
| 126 | #ifdef CONFIG_CHR_DEV_SG | ||
| 127 | extern int sg_big_buff; | ||
| 128 | #endif | ||
| 129 | |||
| 130 | #ifdef CONFIG_SPARC | 131 | #ifdef CONFIG_SPARC |
| 131 | #include <asm/system.h> | 132 | #include <asm/system.h> |
| 132 | #endif | 133 | #endif |
| @@ -148,10 +149,6 @@ extern int sysctl_userprocess_debug; | |||
| 148 | extern int spin_retry; | 149 | extern int spin_retry; |
| 149 | #endif | 150 | #endif |
| 150 | 151 | ||
| 151 | #ifdef CONFIG_BSD_PROCESS_ACCT | ||
| 152 | extern int acct_parm[]; | ||
| 153 | #endif | ||
| 154 | |||
| 155 | #ifdef CONFIG_IA64 | 152 | #ifdef CONFIG_IA64 |
| 156 | extern int no_unaligned_warning; | 153 | extern int no_unaligned_warning; |
| 157 | extern int unaligned_dump_stack; | 154 | extern int unaligned_dump_stack; |
| @@ -159,10 +156,6 @@ extern int unaligned_dump_stack; | |||
| 159 | 156 | ||
| 160 | extern struct ratelimit_state printk_ratelimit_state; | 157 | extern struct ratelimit_state printk_ratelimit_state; |
| 161 | 158 | ||
| 162 | #ifdef CONFIG_RT_MUTEXES | ||
| 163 | extern int max_lock_depth; | ||
| 164 | #endif | ||
| 165 | |||
| 166 | #ifdef CONFIG_PROC_SYSCTL | 159 | #ifdef CONFIG_PROC_SYSCTL |
| 167 | static int proc_do_cad_pid(struct ctl_table *table, int write, | 160 | static int proc_do_cad_pid(struct ctl_table *table, int write, |
| 168 | void __user *buffer, size_t *lenp, loff_t *ppos); | 161 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| @@ -201,9 +194,6 @@ extern struct ctl_table epoll_table[]; | |||
| 201 | int sysctl_legacy_va_layout; | 194 | int sysctl_legacy_va_layout; |
| 202 | #endif | 195 | #endif |
| 203 | 196 | ||
| 204 | extern int prove_locking; | ||
| 205 | extern int lock_stat; | ||
| 206 | |||
| 207 | /* The default sysctl tables: */ | 197 | /* The default sysctl tables: */ |
| 208 | 198 | ||
| 209 | static struct ctl_table root_table[] = { | 199 | static struct ctl_table root_table[] = { |
| @@ -1441,7 +1431,7 @@ static struct ctl_table fs_table[] = { | |||
| 1441 | }; | 1431 | }; |
| 1442 | 1432 | ||
| 1443 | static struct ctl_table debug_table[] = { | 1433 | static struct ctl_table debug_table[] = { |
| 1444 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) | 1434 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) |
| 1445 | { | 1435 | { |
| 1446 | .procname = "exception-trace", | 1436 | .procname = "exception-trace", |
| 1447 | .data = &show_unhandled_signals, | 1437 | .data = &show_unhandled_signals, |
| @@ -1450,6 +1440,17 @@ static struct ctl_table debug_table[] = { | |||
| 1450 | .proc_handler = proc_dointvec | 1440 | .proc_handler = proc_dointvec |
| 1451 | }, | 1441 | }, |
| 1452 | #endif | 1442 | #endif |
| 1443 | #if defined(CONFIG_OPTPROBES) | ||
| 1444 | { | ||
| 1445 | .procname = "kprobes-optimization", | ||
| 1446 | .data = &sysctl_kprobes_optimization, | ||
| 1447 | .maxlen = sizeof(int), | ||
| 1448 | .mode = 0644, | ||
| 1449 | .proc_handler = proc_kprobes_optimization_handler, | ||
| 1450 | .extra1 = &zero, | ||
| 1451 | .extra2 = &one, | ||
| 1452 | }, | ||
| 1453 | #endif | ||
| 1453 | { } | 1454 | { } |
| 1454 | }; | 1455 | }; |
| 1455 | 1456 | ||
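In the new debug_table entry, extra1/extra2 bound the value writable through /proc/sys/debug/kprobes-optimization to 0 or 1. The same pattern serves any bounded integer knob; a hedged sketch with illustrative names (proc_dointvec_minmax rejects writes outside [extra1, extra2]):

static int demo_flag;
static int demo_min = 0, demo_max = 1;

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo-flag",
		.data		= &demo_flag,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &demo_min,
		.extra2		= &demo_max,
	},
	{ }
};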
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 8f5d16e0707a..8cd50d8f9bde 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
| @@ -1331,7 +1331,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
| 1331 | ssize_t result; | 1331 | ssize_t result; |
| 1332 | char *pathname; | 1332 | char *pathname; |
| 1333 | int flags; | 1333 | int flags; |
| 1334 | int acc_mode, fmode; | 1334 | int acc_mode; |
| 1335 | 1335 | ||
| 1336 | pathname = sysctl_getname(name, nlen, &table); | 1336 | pathname = sysctl_getname(name, nlen, &table); |
| 1337 | result = PTR_ERR(pathname); | 1337 | result = PTR_ERR(pathname); |
| @@ -1342,15 +1342,12 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
| 1342 | if (oldval && oldlen && newval && newlen) { | 1342 | if (oldval && oldlen && newval && newlen) { |
| 1343 | flags = O_RDWR; | 1343 | flags = O_RDWR; |
| 1344 | acc_mode = MAY_READ | MAY_WRITE; | 1344 | acc_mode = MAY_READ | MAY_WRITE; |
| 1345 | fmode = FMODE_READ | FMODE_WRITE; | ||
| 1346 | } else if (newval && newlen) { | 1345 | } else if (newval && newlen) { |
| 1347 | flags = O_WRONLY; | 1346 | flags = O_WRONLY; |
| 1348 | acc_mode = MAY_WRITE; | 1347 | acc_mode = MAY_WRITE; |
| 1349 | fmode = FMODE_WRITE; | ||
| 1350 | } else if (oldval && oldlen) { | 1348 | } else if (oldval && oldlen) { |
| 1351 | flags = O_RDONLY; | 1349 | flags = O_RDONLY; |
| 1352 | acc_mode = MAY_READ; | 1350 | acc_mode = MAY_READ; |
| 1353 | fmode = FMODE_READ; | ||
| 1354 | } else { | 1351 | } else { |
| 1355 | result = 0; | 1352 | result = 0; |
| 1356 | goto out_putname; | 1353 | goto out_putname; |
| @@ -1361,7 +1358,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
| 1361 | if (result) | 1358 | if (result) |
| 1362 | goto out_putname; | 1359 | goto out_putname; |
| 1363 | 1360 | ||
| 1364 | result = may_open(&nd.path, acc_mode, fmode); | 1361 | result = may_open(&nd.path, acc_mode, flags); |
| 1365 | if (result) | 1362 | if (result) |
| 1366 | goto out_putpath; | 1363 | goto out_putpath; |
| 1367 | 1364 | ||
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ea8384d3caa7..899ca51be5e8 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
| @@ -46,15 +46,13 @@ static struct genl_family family = { | |||
| 46 | .maxattr = TASKSTATS_CMD_ATTR_MAX, | 46 | .maxattr = TASKSTATS_CMD_ATTR_MAX, |
| 47 | }; | 47 | }; |
| 48 | 48 | ||
| 49 | static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] | 49 | static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { |
| 50 | __read_mostly = { | ||
| 51 | [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, | 50 | [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, |
| 52 | [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, | 51 | [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, |
| 53 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, | 52 | [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, |
| 54 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; | 53 | [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; |
| 55 | 54 | ||
| 56 | static struct nla_policy | 55 | static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { |
| 57 | cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = { | ||
| 58 | [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, | 56 | [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, |
| 59 | }; | 57 | }; |
| 60 | 58 | ||
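Constifying the policies moves them into .rodata, which also makes the old __read_mostly annotation redundant. The general shape of a netlink attribute policy, with illustrative DEMO_* names:

static const struct nla_policy demo_policy[DEMO_ATTR_MAX + 1] = {
	[DEMO_ATTR_PID]	 = { .type = NLA_U32 },
	[DEMO_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = 31 },
};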
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8c1b2d290718..a2f0fe951831 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/cpu.h> | 20 | #include <linux/cpu.h> |
| 21 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
| 22 | 22 | ||
| 23 | #include <asm/local.h> | ||
| 23 | #include "trace.h" | 24 | #include "trace.h" |
| 24 | 25 | ||
| 25 | /* | 26 | /* |
| @@ -2541,7 +2542,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable); | |||
| 2541 | * @buffer: The ring buffer to enable writes | 2542 | * @buffer: The ring buffer to enable writes |
| 2542 | * | 2543 | * |
| 2543 | * Note, multiple disables will need the same number of enables | 2544 | * Note, multiple disables will need the same number of enables |
| 2544 | * to truely enable the writing (much like preempt_disable). | 2545 | * to truly enable the writing (much like preempt_disable). |
| 2545 | */ | 2546 | */ |
| 2546 | void ring_buffer_record_enable(struct ring_buffer *buffer) | 2547 | void ring_buffer_record_enable(struct ring_buffer *buffer) |
| 2547 | { | 2548 | { |
| @@ -2577,7 +2578,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); | |||
| 2577 | * @cpu: The CPU to enable. | 2578 | * @cpu: The CPU to enable. |
| 2578 | * | 2579 | * |
| 2579 | * Note, multiple disables will need the same number of enables | 2580 | * Note, multiple disables will need the same number of enables |
| 2580 | * to truely enable the writing (much like preempt_disable). | 2581 | * to truly enable the writing (much like preempt_disable). |
| 2581 | */ | 2582 | */ |
| 2582 | void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) | 2583 | void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) |
| 2583 | { | 2584 | { |
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index b2477caf09c2..df74c7982255 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | #include <linux/kthread.h> | 8 | #include <linux/kthread.h> |
| 9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
| 10 | #include <linux/time.h> | 10 | #include <linux/time.h> |
| 11 | #include <asm/local.h> | ||
| 11 | 12 | ||
| 12 | struct rb_page { | 13 | struct rb_page { |
| 13 | u64 ts; | 14 | u64 ts; |
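Both ring-buffer files now include <asm/local.h> directly rather than picking it up indirectly. local_t is a CPU-local atomic counter: safe against interrupts on its own CPU, but without the cross-CPU ordering cost of a full atomic_t. A minimal hedged sketch:

#include <asm/local.h>

static local_t demo_hits = LOCAL_INIT(0);

static void demo_count(void)
{
	local_inc(&demo_hits);		/* cheap, interrupt-safe on this CPU */
}

static long demo_total(void)
{
	return local_read(&demo_hits);
}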
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 032c57ca6502..ed01fdba4a55 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -92,12 +92,12 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled); | |||
| 92 | static inline void ftrace_disable_cpu(void) | 92 | static inline void ftrace_disable_cpu(void) |
| 93 | { | 93 | { |
| 94 | preempt_disable(); | 94 | preempt_disable(); |
| 95 | __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); | 95 | __this_cpu_inc(ftrace_cpu_disabled); |
| 96 | } | 96 | } |
| 97 | 97 | ||
| 98 | static inline void ftrace_enable_cpu(void) | 98 | static inline void ftrace_enable_cpu(void) |
| 99 | { | 99 | { |
| 100 | __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); | 100 | __this_cpu_dec(ftrace_cpu_disabled); |
| 101 | preempt_enable(); | 101 | preempt_enable(); |
| 102 | } | 102 | } |
| 103 | 103 | ||
| @@ -1166,7 +1166,7 @@ trace_function(struct trace_array *tr, | |||
| 1166 | struct ftrace_entry *entry; | 1166 | struct ftrace_entry *entry; |
| 1167 | 1167 | ||
| 1168 | /* If we are reading the ring buffer, don't trace */ | 1168 | /* If we are reading the ring buffer, don't trace */ |
| 1169 | if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) | 1169 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
| 1170 | return; | 1170 | return; |
| 1171 | 1171 | ||
| 1172 | event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), | 1172 | event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), |
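With the special per-cpu symbol prefix gone from this kernel, the __this_cpu_*() operations take the per-cpu variable name directly and the per_cpu_var() wrapper disappears. A usage sketch mirroring the ftrace_disable_cpu()/ftrace_enable_cpu() pair (demo name illustrative):

#include <linux/percpu.h>

DEFINE_PER_CPU(int, demo_disabled);

static void demo_disable(void)
{
	preempt_disable();		/* stay on this CPU */
	__this_cpu_inc(demo_disabled);
}

static void demo_enable(void)
{
	__this_cpu_dec(demo_disabled);
	preempt_enable();
}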
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fd05bcaf91b0..09b39112a5e2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -550,7 +550,7 @@ static inline int ftrace_trace_task(struct task_struct *task) | |||
| 550 | * struct trace_parser - serves for reading the user input separated by spaces | 550 | * struct trace_parser - serves for reading the user input separated by spaces |
| 551 | * @cont: set if the input is not complete - no final space char was found | 551 | * @cont: set if the input is not complete - no final space char was found |
| 552 | * @buffer: holds the parsed user input | 552 | * @buffer: holds the parsed user input |
| 553 | * @idx: user input lenght | 553 | * @idx: user input length |
| 554 | * @size: buffer size | 554 | * @size: buffer size |
| 555 | */ | 555 | */ |
| 556 | struct trace_parser { | 556 | struct trace_parser { |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e998a824e9db..3fc2a575664f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -188,7 +188,7 @@ static int __trace_graph_entry(struct trace_array *tr, | |||
| 188 | struct ring_buffer *buffer = tr->buffer; | 188 | struct ring_buffer *buffer = tr->buffer; |
| 189 | struct ftrace_graph_ent_entry *entry; | 189 | struct ftrace_graph_ent_entry *entry; |
| 190 | 190 | ||
| 191 | if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) | 191 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
| 192 | return 0; | 192 | return 0; |
| 193 | 193 | ||
| 194 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, | 194 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, |
| @@ -247,7 +247,7 @@ static void __trace_graph_return(struct trace_array *tr, | |||
| 247 | struct ring_buffer *buffer = tr->buffer; | 247 | struct ring_buffer *buffer = tr->buffer; |
| 248 | struct ftrace_graph_ret_entry *entry; | 248 | struct ftrace_graph_ret_entry *entry; |
| 249 | 249 | ||
| 250 | if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) | 250 | if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) |
| 251 | return; | 251 | return; |
| 252 | 252 | ||
| 253 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, | 253 | event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 00d59d048edf..0a67e041edf8 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/tsacct_kern.h> | 21 | #include <linux/tsacct_kern.h> |
| 22 | #include <linux/acct.h> | 22 | #include <linux/acct.h> |
| 23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
| 24 | #include <linux/mm.h> | ||
| 24 | 25 | ||
| 25 | /* | 26 | /* |
| 26 | * fill in basic accounting fields | 27 | * fill in basic accounting fields |
