Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile               1
-rw-r--r--  kernel/cgroup.c             587
-rw-r--r--  kernel/cgroup_freezer.c      26
-rw-r--r--  kernel/cpuset.c             107
-rw-r--r--  kernel/cred.c                 2
-rw-r--r--  kernel/events/core.c         30
-rw-r--r--  kernel/fork.c               110
-rw-r--r--  kernel/irq/handle.c           6
-rw-r--r--  kernel/irq/irqdesc.c         14
-rw-r--r--  kernel/irq/manage.c          24
-rw-r--r--  kernel/irq/proc.c             1
-rw-r--r--  kernel/irq/spurious.c        31
-rw-r--r--  kernel/jump_label.c          18
-rw-r--r--  kernel/kthread.c              4
-rw-r--r--  kernel/lockdep.c              2
-rw-r--r--  kernel/ns_cgroup.c          118
-rw-r--r--  kernel/nsproxy.c             46
-rw-r--r--  kernel/pm_qos_params.c       70
-rw-r--r--  kernel/power/hibernate.c    220
-rw-r--r--  kernel/profile.c             16
-rw-r--r--  kernel/rcutree.c            208
-rw-r--r--  kernel/rcutree.h             30
-rw-r--r--  kernel/rcutree_plugin.h      33
-rw-r--r--  kernel/rcutree_trace.c       12
-rw-r--r--  kernel/sched.c              127
-rw-r--r--  kernel/sched_fair.c           5
-rw-r--r--  kernel/sched_rt.c            10
-rw-r--r--  kernel/sched_stats.h          4
-rw-r--r--  kernel/sysctl.c               6
-rw-r--r--  kernel/time/clockevents.c     5
-rw-r--r--  kernel/timer.c               15
-rw-r--r--  kernel/trace/ftrace.c        31
-rw-r--r--  kernel/trace/ring_buffer.c   10
-rw-r--r--  kernel/trace/trace.h         15
-rw-r--r--  kernel/trace/trace_events.c   7
-rw-r--r--  kernel/trace/trace_output.c  27
-rw-r--r--  kernel/utsname.c             39
-rw-r--r--  kernel/watchdog.c             9
38 files changed, 1204 insertions, 822 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e9cf19155b46..2d64cfcc8b42 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -61,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
-obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 909a35510af5..2731d115d725 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,7 @@
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/eventfd.h>
 #include <linux/poll.h>
+#include <linux/flex_array.h> /* used in cgroup_attach_proc */
 
 #include <asm/atomic.h>
 
@@ -1735,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+			       struct task_struct *tsk, bool guarantee)
+{
+	struct css_set *oldcg;
+	struct css_set *newcg;
+
+	/*
+	 * get old css_set. we need to take task_lock and refcount it, because
+	 * an exiting task can change its css_set to init_css_set and drop its
+	 * old one without taking cgroup_mutex.
+	 */
+	task_lock(tsk);
+	oldcg = tsk->cgroups;
+	get_css_set(oldcg);
+	task_unlock(tsk);
+
+	/* locate or allocate a new css_set for this task. */
+	if (guarantee) {
+		/* we know the css_set we want already exists. */
+		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+		read_lock(&css_set_lock);
+		newcg = find_existing_css_set(oldcg, cgrp, template);
+		BUG_ON(!newcg);
+		get_css_set(newcg);
+		read_unlock(&css_set_lock);
+	} else {
+		might_sleep();
+		/* find_css_set will give us newcg already referenced. */
+		newcg = find_css_set(oldcg, cgrp);
+		if (!newcg) {
+			put_css_set(oldcg);
+			return -ENOMEM;
+		}
+	}
+	put_css_set(oldcg);
+
+	/* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+	task_lock(tsk);
+	if (tsk->flags & PF_EXITING) {
+		task_unlock(tsk);
+		put_css_set(newcg);
+		return -ESRCH;
+	}
+	rcu_assign_pointer(tsk->cgroups, newcg);
+	task_unlock(tsk);
+
+	/* Update the css_set linked lists if we're using them */
+	write_lock(&css_set_lock);
+	if (!list_empty(&tsk->cg_list))
+		list_move(&tsk->cg_list, &newcg->tasks);
+	write_unlock(&css_set_lock);
+
+	/*
+	 * We just gained a reference on oldcg by taking it from the task. As
+	 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+	 * it here; it will be freed under RCU.
+	 */
+	put_css_set(oldcg);
+
+	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+	return 0;
+}
+
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
  * @cgrp: the cgroup the task is attaching to
@@ -1745,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
  */
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
-	int retval = 0;
+	int retval;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
-	struct css_set *cg;
-	struct css_set *newcg;
 	struct cgroupfs_root *root = cgrp->root;
 
 	/* Nothing to do if the task is already in that cgroup */
@@ -1759,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, tsk, false);
+			retval = ss->can_attach(ss, cgrp, tsk);
 			if (retval) {
 				/*
 				 * Remember on which subsystem the can_attach()
@@ -1771,46 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 				goto out;
 			}
 		}
+		if (ss->can_attach_task) {
+			retval = ss->can_attach_task(cgrp, tsk);
+			if (retval) {
+				failed_ss = ss;
+				goto out;
+			}
+		}
 	}
 
-	task_lock(tsk);
-	cg = tsk->cgroups;
-	get_css_set(cg);
-	task_unlock(tsk);
-	/*
-	 * Locate or allocate a new css_set for this task,
-	 * based on its final set of cgroups
-	 */
-	newcg = find_css_set(cg, cgrp);
-	put_css_set(cg);
-	if (!newcg) {
-		retval = -ENOMEM;
-		goto out;
-	}
-
-	task_lock(tsk);
-	if (tsk->flags & PF_EXITING) {
-		task_unlock(tsk);
-		put_css_set(newcg);
-		retval = -ESRCH;
+	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+	if (retval)
 		goto out;
-	}
-	rcu_assign_pointer(tsk->cgroups, newcg);
-	task_unlock(tsk);
-
-	/* Update the css_set linked lists if we're using them */
-	write_lock(&css_set_lock);
-	if (!list_empty(&tsk->cg_list))
-		list_move(&tsk->cg_list, &newcg->tasks);
-	write_unlock(&css_set_lock);
 
 	for_each_subsys(root, ss) {
+		if (ss->pre_attach)
+			ss->pre_attach(cgrp);
+		if (ss->attach_task)
+			ss->attach_task(cgrp, tsk);
 		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, tsk, false);
+			ss->attach(ss, cgrp, oldcgrp, tsk);
 	}
-	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
 	synchronize_rcu();
-	put_css_set(cg);
 
 	/*
 	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1829,7 +1881,7 @@ out:
 				 */
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(ss, cgrp, tsk, false);
+				ss->cancel_attach(ss, cgrp, tsk);
 		}
 	}
 	return retval;
@@ -1860,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
 /*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+	struct css_set *cg;
+	struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+				  struct task_struct *tsk, struct css_set *cg,
+				  struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+	read_lock(&css_set_lock);
+	newcg = find_existing_css_set(cg, cgrp, template);
+	if (newcg)
+		get_css_set(newcg);
+	read_unlock(&css_set_lock);
+
+	/* doesn't exist at all? */
+	if (!newcg)
+		return false;
+	/* see if it's already in the list */
+	list_for_each_entry(cg_entry, newcg_list, links) {
+		if (cg_entry->cg == newcg) {
+			put_css_set(newcg);
+			return true;
+		}
+	}
+
+	/* not found */
+	put_css_set(newcg);
+	return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+			    struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+
+	/* ensure a new css_set will exist for this thread */
+	newcg = find_css_set(cg, cgrp);
+	if (!newcg)
+		return -ENOMEM;
+	/* add it to the list */
+	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+	if (!cg_entry) {
+		put_css_set(newcg);
+		return -ENOMEM;
+	}
+	cg_entry->cg = newcg;
+	list_add(&cg_entry->links, newcg_list);
+	return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+	int retval, i, group_size;
+	struct cgroup_subsys *ss, *failed_ss = NULL;
+	bool cancel_failed_ss = false;
+	/* guaranteed to be initialized later, but the compiler needs this */
+	struct cgroup *oldcgrp = NULL;
+	struct css_set *oldcg;
+	struct cgroupfs_root *root = cgrp->root;
+	/* threadgroup list cursor and array */
+	struct task_struct *tsk;
+	struct flex_array *group;
+	/*
+	 * we need to make sure we have css_sets for all the tasks we're
+	 * going to move -before- we actually start moving them, so that in
+	 * case we get an ENOMEM we can bail out before making any changes.
+	 */
+	struct list_head newcg_list;
+	struct cg_list_entry *cg_entry, *temp_nobe;
+
+	/*
+	 * step 0: in order to do expensive, possibly blocking operations for
+	 * every thread, we cannot iterate the thread group list, since it needs
+	 * rcu or tasklist locked. instead, build an array of all threads in the
+	 * group - threadgroup_fork_lock prevents new threads from appearing,
+	 * and if threads exit, this will just be an over-estimate.
+	 */
+	group_size = get_nr_threads(leader);
+	/* flex_array supports very large thread-groups better than kmalloc. */
+	group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+				 GFP_KERNEL);
+	if (!group)
+		return -ENOMEM;
+	/* pre-allocate to guarantee space while iterating in rcu read-side. */
+	retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+	if (retval)
+		goto out_free_group_list;
+
+	/* prevent changes to the threadgroup list while we take a snapshot. */
+	rcu_read_lock();
+	if (!thread_group_leader(leader)) {
+		/*
+		 * a race with de_thread from another thread's exec() may strip
+		 * us of our leadership, making while_each_thread unsafe to use
+		 * on this task. if this happens, there is no choice but to
+		 * throw this task away and try again (from cgroup_procs_write);
+		 * this is "double-double-toil-and-trouble-check locking".
+		 */
+		rcu_read_unlock();
+		retval = -EAGAIN;
+		goto out_free_group_list;
+	}
+	/* take a reference on each task in the group to go in the array. */
+	tsk = leader;
+	i = 0;
+	do {
+		/* as per above, nr_threads may decrease, but not increase. */
+		BUG_ON(i >= group_size);
+		get_task_struct(tsk);
+		/*
+		 * saying GFP_ATOMIC has no effect here because we did prealloc
+		 * earlier, but it's good form to communicate our expectations.
+		 */
+		retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+		BUG_ON(retval != 0);
+		i++;
+	} while_each_thread(leader, tsk);
+	/* remember the number of threads in the array for later. */
+	group_size = i;
+	rcu_read_unlock();
+
+	/*
+	 * step 1: check that we can legitimately attach to the cgroup.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->can_attach) {
+			retval = ss->can_attach(ss, cgrp, leader);
+			if (retval) {
+				failed_ss = ss;
+				goto out_cancel_attach;
+			}
+		}
+		/* a callback to be run on every thread in the threadgroup. */
+		if (ss->can_attach_task) {
+			/* run on each task in the threadgroup. */
+			for (i = 0; i < group_size; i++) {
+				tsk = flex_array_get_ptr(group, i);
+				retval = ss->can_attach_task(cgrp, tsk);
+				if (retval) {
+					failed_ss = ss;
+					cancel_failed_ss = true;
+					goto out_cancel_attach;
+				}
+			}
+		}
+	}
+
+	/*
+	 * step 2: make sure css_sets exist for all threads to be migrated.
+	 * we use find_css_set, which allocates a new one if necessary.
+	 */
+	INIT_LIST_HEAD(&newcg_list);
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		/* nothing to do if this task is already in the cgroup */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* get old css_set pointer */
+		task_lock(tsk);
+		if (tsk->flags & PF_EXITING) {
+			/* ignore this task if it's going away */
+			task_unlock(tsk);
+			continue;
+		}
+		oldcg = tsk->cgroups;
+		get_css_set(oldcg);
+		task_unlock(tsk);
+		/* see if the new one for us is already in the list? */
+		if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+			/* was already there, nothing to do. */
+			put_css_set(oldcg);
+		} else {
+			/* we don't already have it. get new one. */
+			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+			put_css_set(oldcg);
+			if (retval)
+				goto out_list_teardown;
+		}
+	}
+
+	/*
+	 * step 3: now that we're guaranteed success wrt the css_sets, proceed
+	 * to move all tasks to the new cgroup, calling ss->attach_task for each
+	 * one along the way. there are no failure cases after here, so this is
+	 * the commit point.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->pre_attach)
+			ss->pre_attach(cgrp);
+	}
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		/* leave current thread as it is if it's already there */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* attach each task to each subsystem */
+		for_each_subsys(root, ss) {
+			if (ss->attach_task)
+				ss->attach_task(cgrp, tsk);
+		}
+		/* if the thread is PF_EXITING, it can just get skipped. */
+		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+		BUG_ON(retval != 0 && retval != -ESRCH);
+	}
+	/* nothing is sensitive to fork() after this point. */
+
+	/*
+	 * step 4: do expensive, non-thread-specific subsystem callbacks.
+	 * TODO: if ever a subsystem needs to know the oldcgrp for each task
+	 * being moved, this call will need to be reworked to communicate that.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->attach)
+			ss->attach(ss, cgrp, oldcgrp, leader);
+	}
+
+	/*
+	 * step 5: success! and cleanup
+	 */
+	synchronize_rcu();
+	cgroup_wakeup_rmdir_waiter(cgrp);
+	retval = 0;
+out_list_teardown:
+	/* clean up the list of prefetched css_sets. */
+	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+		list_del(&cg_entry->links);
+		put_css_set(cg_entry->cg);
+		kfree(cg_entry);
+	}
+out_cancel_attach:
+	/* same deal as in cgroup_attach_task */
+	if (retval) {
+		for_each_subsys(root, ss) {
+			if (ss == failed_ss) {
+				if (cancel_failed_ss && ss->cancel_attach)
+					ss->cancel_attach(ss, cgrp, leader);
+				break;
+			}
+			if (ss->cancel_attach)
+				ss->cancel_attach(ss, cgrp, leader);
+		}
+	}
+	/* clean up the array of referenced threads in the group. */
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		put_task_struct(tsk);
+	}
+out_free_group_list:
+	flex_array_free(group);
+	return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
  */
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 {
 	struct task_struct *tsk;
 	const struct cred *cred = current_cred(), *tcred;
 	int ret;
 
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+
 	if (pid) {
 		rcu_read_lock();
 		tsk = find_task_by_vpid(pid);
-		if (!tsk || tsk->flags & PF_EXITING) {
+		if (!tsk) {
 			rcu_read_unlock();
+			cgroup_unlock();
+			return -ESRCH;
+		}
+		if (threadgroup) {
+			/*
+			 * RCU protects this access, since tsk was found in the
+			 * tid map. a race with de_thread may cause group_leader
+			 * to stop being the leader, but cgroup_attach_proc will
+			 * detect it later.
+			 */
+			tsk = tsk->group_leader;
+		} else if (tsk->flags & PF_EXITING) {
+			/* optimization for the single-task-only case */
+			rcu_read_unlock();
+			cgroup_unlock();
 			return -ESRCH;
 		}
 
+		/*
+		 * even if we're attaching all tasks in the thread group, we
+		 * only need to check permissions on one of them.
+		 */
 		tcred = __task_cred(tsk);
 		if (cred->euid &&
 		    cred->euid != tcred->uid &&
 		    cred->euid != tcred->suid) {
 			rcu_read_unlock();
+			cgroup_unlock();
 			return -EACCES;
 		}
 		get_task_struct(tsk);
 		rcu_read_unlock();
 	} else {
-		tsk = current;
+		if (threadgroup)
+			tsk = current->group_leader;
+		else
+			tsk = current;
 		get_task_struct(tsk);
 	}
 
-	ret = cgroup_attach_task(cgrp, tsk);
+	if (threadgroup) {
+		threadgroup_fork_write_lock(tsk);
+		ret = cgroup_attach_proc(cgrp, tsk);
+		threadgroup_fork_write_unlock(tsk);
+	} else {
+		ret = cgroup_attach_task(cgrp, tsk);
+	}
 	put_task_struct(tsk);
+	cgroup_unlock();
 	return ret;
 }
 
 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
 {
+	return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
 	int ret;
-	if (!cgroup_lock_live_group(cgrp))
-		return -ENODEV;
-	ret = attach_task_by_pid(cgrp, pid);
-	cgroup_unlock();
+	do {
+		/*
+		 * attach_proc fails with -EAGAIN if threadgroup leadership
+		 * changes in the middle of the operation, in which case we need
+		 * to find the task_struct for the new leader and start over.
+		 */
+		ret = attach_task_by_pid(cgrp, tgid, true);
+	} while (ret == -EAGAIN);
 	return ret;
 }
 
@@ -3259,9 +3632,9 @@ static struct cftype files[] = {
 	{
 		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
 		.open = cgroup_procs_open,
-		/* .write_u64 = cgroup_procs_write, TODO */
+		.write_u64 = cgroup_procs_write,
 		.release = cgroup_pidlist_release,
-		.mode = S_IRUGO,
+		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "notify_on_release",
@@ -4257,122 +4630,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 }
 
 /**
- * cgroup_clone - clone the cgroup the given subsystem is attached to
- * @tsk: the task to be moved
- * @subsys: the given subsystem
- * @nodename: the name for the new cgroup
- *
- * Duplicate the current cgroup in the hierarchy that the given
- * subsystem is attached to, and move this task into the new
- * child.
- */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
-		 char *nodename)
-{
-	struct dentry *dentry;
-	int ret = 0;
-	struct cgroup *parent, *child;
-	struct inode *inode;
-	struct css_set *cg;
-	struct cgroupfs_root *root;
-	struct cgroup_subsys *ss;
-
-	/* We shouldn't be called by an unregistered subsystem */
-	BUG_ON(!subsys->active);
-
-	/* First figure out what hierarchy and cgroup we're dealing
-	 * with, and pin them so we can drop cgroup_mutex */
-	mutex_lock(&cgroup_mutex);
- again:
-	root = subsys->root;
-	if (root == &rootnode) {
-		mutex_unlock(&cgroup_mutex);
-		return 0;
-	}
-
-	/* Pin the hierarchy */
-	if (!atomic_inc_not_zero(&root->sb->s_active)) {
-		/* We race with the final deactivate_super() */
-		mutex_unlock(&cgroup_mutex);
-		return 0;
-	}
-
-	/* Keep the cgroup alive */
-	task_lock(tsk);
-	parent = task_cgroup(tsk, subsys->subsys_id);
-	cg = tsk->cgroups;
-	get_css_set(cg);
-	task_unlock(tsk);
-
-	mutex_unlock(&cgroup_mutex);
-
-	/* Now do the VFS work to create a cgroup */
-	inode = parent->dentry->d_inode;
-
-	/* Hold the parent directory mutex across this operation to
-	 * stop anyone else deleting the new cgroup */
-	mutex_lock(&inode->i_mutex);
-	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
-	if (IS_ERR(dentry)) {
-		printk(KERN_INFO
-		       "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
-		       PTR_ERR(dentry));
-		ret = PTR_ERR(dentry);
-		goto out_release;
-	}
-
-	/* Create the cgroup directory, which also creates the cgroup */
-	ret = vfs_mkdir(inode, dentry, 0755);
-	child = __d_cgrp(dentry);
-	dput(dentry);
-	if (ret) {
-		printk(KERN_INFO
-		       "Failed to create cgroup %s: %d\n", nodename,
-		       ret);
-		goto out_release;
-	}
-
-	/* The cgroup now exists. Retake cgroup_mutex and check
-	 * that we're still in the same state that we thought we
-	 * were. */
-	mutex_lock(&cgroup_mutex);
-	if ((root != subsys->root) ||
-	    (parent != task_cgroup(tsk, subsys->subsys_id))) {
-		/* Aargh, we raced ... */
-		mutex_unlock(&inode->i_mutex);
-		put_css_set(cg);
-
-		deactivate_super(root->sb);
-		/* The cgroup is still accessible in the VFS, but
-		 * we're not going to try to rmdir() it at this
-		 * point. */
-		printk(KERN_INFO
-		       "Race in cgroup_clone() - leaking cgroup %s\n",
-		       nodename);
-		goto again;
-	}
-
-	/* do any required auto-setup */
-	for_each_subsys(root, ss) {
-		if (ss->post_clone)
-			ss->post_clone(ss, child);
-	}
-
-	/* All seems fine. Finish by moving the task into the new cgroup */
-	ret = cgroup_attach_task(child, tsk);
-	mutex_unlock(&cgroup_mutex);
-
- out_release:
-	mutex_unlock(&inode->i_mutex);
-
-	mutex_lock(&cgroup_mutex);
-	put_css_set(cg);
-	mutex_unlock(&cgroup_mutex);
-	deactivate_super(root->sb);
-	return ret;
-}
-
-/**
  * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
  * @cgrp: the cgroup in question
  * @task: the task in question
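
For orientation, the cgroup.c changes above split attachment into whole-group and per-thread hooks that run in a fixed order: can_attach (once), can_attach_task (per thread), then pre_attach (once), attach_task (per thread), and attach (once). A minimal sketch of a hypothetical controller wiring up the new hooks; the "demo" names are illustrative and not part of this patch, and registration details are omitted:

#include <linux/cgroup.h>
#include <linux/sched.h>

/* Hypothetical controller: veto or observe per-thread moves. */
static int demo_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	/* May sleep. Failing here aborts the migration before any thread
	 * has moved; cancel_attach() then runs for subsystems that had
	 * already agreed to the attach. */
	return 0;
}

static void demo_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	/* Runs for every thread during the commit phase; must not fail. */
}

struct cgroup_subsys demo_subsys = {
	.name		 = "demo",
	.can_attach_task = demo_can_attach_task,
	.pre_attach	 = NULL,
	.attach_task	 = demo_attach_task,
};
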
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7c6c38..e691818d7e45 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
  */
 static int freezer_can_attach(struct cgroup_subsys *ss,
 			      struct cgroup *new_cgroup,
-			      struct task_struct *task, bool threadgroup)
+			      struct task_struct *task)
 {
 	struct freezer *freezer;
 
@@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
 	if (freezer->state != CGROUP_THAWED)
 		return -EBUSY;
 
+	return 0;
+}
+
+static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
 	rcu_read_lock();
-	if (__cgroup_freezing_or_frozen(task)) {
+	if (__cgroup_freezing_or_frozen(tsk)) {
 		rcu_read_unlock();
 		return -EBUSY;
 	}
 	rcu_read_unlock();
-
-	if (threadgroup) {
-		struct task_struct *c;
-
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
-			if (__cgroup_freezing_or_frozen(c)) {
-				rcu_read_unlock();
-				return -EBUSY;
-			}
-		}
-		rcu_read_unlock();
-	}
-
 	return 0;
 }
 
@@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
 	.populate	= freezer_populate,
 	.subsys_id	= freezer_subsys_id,
 	.can_attach	= freezer_can_attach,
+	.can_attach_task = freezer_can_attach_task,
+	.pre_attach	= NULL,
+	.attach_task	= NULL,
 	.attach		= NULL,
 	.fork		= freezer_fork,
 	.exit		= NULL,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2bb8c2e98fff..9c9b7545c810 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
 	return val;
 }
 
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
-
 /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
 static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
-			     struct task_struct *tsk, bool threadgroup)
+			     struct task_struct *tsk)
 {
-	int ret;
 	struct cpuset *cs = cgroup_cs(cont);
 
 	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 	if (tsk->flags & PF_THREAD_BOUND)
 		return -EINVAL;
 
-	ret = security_task_setscheduler(tsk);
-	if (ret)
-		return ret;
-	if (threadgroup) {
-		struct task_struct *c;
-
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-			ret = security_task_setscheduler(c);
-			if (ret) {
-				rcu_read_unlock();
-				return ret;
-			}
-		}
-		rcu_read_unlock();
-	}
 	return 0;
 }
 
-static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
-			       struct cpuset *cs)
+static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
+{
+	return security_task_setscheduler(task);
+}
+
+/*
+ * Protected by cgroup_lock. The nodemasks must be stored globally because
+ * dynamically allocating them is not allowed in pre_attach, and they must
+ * persist among pre_attach, attach_task, and attach.
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_from;
+static nodemask_t cpuset_attach_nodemask_to;
+
+/* Set-up work for before attaching each task. */
+static void cpuset_pre_attach(struct cgroup *cont)
+{
+	struct cpuset *cs = cgroup_cs(cont);
+
+	if (cs == &top_cpuset)
+		cpumask_copy(cpus_attach, cpu_possible_mask);
+	else
+		guarantee_online_cpus(cs, cpus_attach);
+
+	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+}
+
+/* Per-thread attachment work. */
+static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
 {
 	int err;
+	struct cpuset *cs = cgroup_cs(cont);
+
 	/*
 	 * can_attach beforehand should guarantee that this doesn't fail.
 	 * TODO: have a better way to handle failure here
@@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	WARN_ON_ONCE(err);
 
-	cpuset_change_task_nodemask(tsk, to);
+	cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
 	cpuset_update_task_spread_flag(cs, tsk);
-
 }
 
 static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
-			  struct cgroup *oldcont, struct task_struct *tsk,
-			  bool threadgroup)
+			  struct cgroup *oldcont, struct task_struct *tsk)
 {
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
-	static nodemask_t to;		/* protected by cgroup_mutex */
 
-	if (cs == &top_cpuset) {
-		cpumask_copy(cpus_attach, cpu_possible_mask);
-	} else {
-		guarantee_online_cpus(cs, cpus_attach);
-	}
-	guarantee_online_mems(cs, &to);
-
-	/* do per-task migration stuff possibly for each in the threadgroup */
-	cpuset_attach_task(tsk, &to, cs);
-	if (threadgroup) {
-		struct task_struct *c;
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-			cpuset_attach_task(c, &to, cs);
-		}
-		rcu_read_unlock();
-	}
-
-	/* change mm; only needs to be done once even if threadgroup */
-	to = cs->mems_allowed;
+	/*
+	 * Change mm, possibly for multiple threads in a threadgroup. This is
+	 * expensive and may sleep.
+	 */
+	cpuset_attach_nodemask_from = oldcs->mems_allowed;
+	cpuset_attach_nodemask_to = cs->mems_allowed;
 	mm = get_task_mm(tsk);
 	if (mm) {
-		mpol_rebind_mm(mm, &to);
+		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
 		if (is_memory_migrate(cs))
-			cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
+			cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+					  &cpuset_attach_nodemask_to);
 		mmput(mm);
 	}
 }
@@ -1809,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 }
 
 /*
- * post_clone() is called at the end of cgroup_clone().
- * 'cgroup' was just created automatically as a result of
- * a cgroup_clone(), and the current task is about to
- * be moved into 'cgroup'.
+ * post_clone() is called during cgroup_create() when the
+ * clone_children mount argument was specified. The cgroup
+ * can not yet have any tasks.
  *
  * Currently we refuse to set up the cgroup - thereby
  * refusing the task to be entered, and as a result refusing
@@ -1911,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
 	.create = cpuset_create,
 	.destroy = cpuset_destroy,
 	.can_attach = cpuset_can_attach,
+	.can_attach_task = cpuset_can_attach_task,
+	.pre_attach = cpuset_pre_attach,
+	.attach_task = cpuset_attach_task,
 	.attach = cpuset_attach,
 	.populate = cpuset_populate,
 	.post_clone = cpuset_post_clone,
@@ -2195,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	rcu_read_lock();
 	cs = task_cs(tsk);
 	if (cs)
-		cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+		do_set_cpus_allowed(tsk, cs->cpus_allowed);
 	rcu_read_unlock();
 
 	/*
@@ -2222,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	 * Like above we can temporary set any mask and rely on
 	 * set_cpus_allowed_ptr() as synchronization point.
 	 */
-	cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+	do_set_cpus_allowed(tsk, cpu_possible_mask);
 	cpu = cpumask_any(cpu_active_mask);
 	}
 
diff --git a/kernel/cred.c b/kernel/cred.c
index e12c8af793f8..174fa84eca30 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
-/* Task credentials management - see Documentation/credentials.txt
+/* Task credentials management - see Documentation/security/credentials.txt
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c09767f7db3e..9efe7108ccaf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5028,6 +5028,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 	else
 		perf_event_output(event, nmi, data, regs);
 
+	if (event->fasync && event->pending_kill) {
+		if (nmi) {
+			event->pending_wakeup = 1;
+			irq_work_queue(&event->pending);
+		} else
+			perf_event_wakeup(event);
+	}
+
 	return ret;
 }
 
@@ -7394,26 +7402,12 @@ static int __perf_cgroup_move(void *info)
 	return 0;
 }
 
-static void perf_cgroup_move(struct task_struct *task)
+static void
+perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
 {
 	task_function_call(task, __perf_cgroup_move, task);
 }
 
-static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-			       struct cgroup *old_cgrp, struct task_struct *task,
-			       bool threadgroup)
-{
-	perf_cgroup_move(task);
-	if (threadgroup) {
-		struct task_struct *c;
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
-			perf_cgroup_move(c);
-		}
-		rcu_read_unlock();
-	}
-}
-
 static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
 			     struct cgroup *old_cgrp, struct task_struct *task)
 {
@@ -7425,7 +7419,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	if (!(task->flags & PF_EXITING))
 		return;
 
-	perf_cgroup_move(task);
+	perf_cgroup_attach_task(cgrp, task);
 }
 
 struct cgroup_subsys perf_subsys = {
@@ -7434,6 +7428,6 @@ struct cgroup_subsys perf_subsys = {
 	.create		= perf_cgroup_create,
 	.destroy	= perf_cgroup_destroy,
 	.exit		= perf_cgroup_exit,
-	.attach		= perf_cgroup_attach,
+	.attach_task	= perf_cgroup_attach_task,
 };
 #endif /* CONFIG_CGROUP_PERF */
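
The __perf_event_overflow() hunk above makes counter overflow deliver a signal to any fasync subscriber on the perf fd. A userspace sketch of how that is typically consumed, assuming a descriptor already obtained from perf_event_open(2) (setup and error handling omitted):

#include <fcntl.h>
#include <unistd.h>

/* perf_fd: a file descriptor from perf_event_open(2) (not shown here) */
static void subscribe_sigio(int perf_fd)
{
	fcntl(perf_fd, F_SETOWN, getpid());	/* route SIGIO to this process */
	fcntl(perf_fd, F_SETFL,
	      fcntl(perf_fd, F_GETFL) | O_ASYNC);	/* enable fasync delivery */
}
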
diff --git a/kernel/fork.c b/kernel/fork.c
index 8e7e135d0817..0276c30401a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -59,7 +59,6 @@
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
 #include <linux/tty.h>
-#include <linux/proc_fs.h>
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
@@ -485,20 +484,6 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
-int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL))
-		return -ENOMEM;
-
-	if (oldmm)
-		cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm));
-	else
-		memset(mm_cpumask(mm), 0, cpumask_size());
-#endif
-	return 0;
-}
-
 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
 	atomic_set(&mm->mm_users, 1);
@@ -539,17 +524,8 @@ struct mm_struct * mm_alloc(void)
 		return NULL;
 
 	memset(mm, 0, sizeof(*mm));
-	mm = mm_init(mm, current);
-	if (!mm)
-		return NULL;
-
-	if (mm_init_cpumask(mm, NULL)) {
-		mm_free_pgd(mm);
-		free_mm(mm);
-		return NULL;
-	}
-
-	return mm;
+	mm_init_cpumask(mm);
+	return mm_init(mm, current);
 }
 
 /*
@@ -560,7 +536,6 @@ struct mm_struct * mm_alloc(void)
 void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
-	free_cpumask_var(mm->cpu_vm_mask_var);
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
@@ -597,6 +572,57 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+/*
+ * We added or removed a vma mapping the executable. The vmas are only mapped
+ * during exec and are not mapped with the mmap system call.
+ * Callers must hold down_write() on the mm's mmap_sem for these
+ */
+void added_exe_file_vma(struct mm_struct *mm)
+{
+	mm->num_exe_file_vmas++;
+}
+
+void removed_exe_file_vma(struct mm_struct *mm)
+{
+	mm->num_exe_file_vmas--;
+	if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
+		fput(mm->exe_file);
+		mm->exe_file = NULL;
+	}
+
+}
+
+void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+	if (new_exe_file)
+		get_file(new_exe_file);
+	if (mm->exe_file)
+		fput(mm->exe_file);
+	mm->exe_file = new_exe_file;
+	mm->num_exe_file_vmas = 0;
+}
+
+struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+	struct file *exe_file;
+
+	/* We need mmap_sem to protect against races with removal of
+	 * VM_EXECUTABLE vmas */
+	down_read(&mm->mmap_sem);
+	exe_file = mm->exe_file;
+	if (exe_file)
+		get_file(exe_file);
+	up_read(&mm->mmap_sem);
+	return exe_file;
+}
+
+static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+	/* It's safe to write the exe_file pointer without exe_file_lock because
+	 * this is called during fork when the task is not yet in /proc */
+	newmm->exe_file = get_mm_exe_file(oldmm);
+}
+
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
@@ -703,6 +729,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 		goto fail_nomem;
 
 	memcpy(mm, oldmm, sizeof(*mm));
+	mm_init_cpumask(mm);
 
 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
@@ -715,9 +742,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
-	if (mm_init_cpumask(mm, oldmm))
-		goto fail_nocpumask;
-
 	if (init_new_context(tsk, mm))
 		goto fail_nocontext;
 
@@ -744,9 +768,6 @@ fail_nomem:
 	return NULL;
 
 fail_nocontext:
-	free_cpumask_var(mm->cpu_vm_mask_var);
-
-fail_nocpumask:
 	/*
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
@@ -957,6 +978,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	tty_audit_fork(sig);
 	sched_autogroup_fork(sig);
 
+#ifdef CONFIG_CGROUPS
+	init_rwsem(&sig->threadgroup_fork_lock);
+#endif
+
 	sig->oom_adj = current->signal->oom_adj;
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1138,6 +1163,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	monotonic_to_bootbased(&p->real_start_time);
 	p->io_context = NULL;
 	p->audit_context = NULL;
+	if (clone_flags & CLONE_THREAD)
+		threadgroup_fork_read_lock(current);
 	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_dup(p->mempolicy);
@@ -1223,12 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (clone_flags & CLONE_THREAD)
 		p->tgid = current->tgid;
 
-	if (current->nsproxy != p->nsproxy) {
-		retval = ns_cgroup_clone(p, pid);
-		if (retval)
-			goto bad_fork_free_pid;
-	}
-
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
 	 * Clear TID on mm_release()?
@@ -1342,6 +1363,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
+	if (clone_flags & CLONE_THREAD)
+		threadgroup_fork_read_unlock(current);
 	perf_event_fork(p);
 	return p;
 
@@ -1380,6 +1403,8 @@ bad_fork_cleanup_policy:
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_cgroup:
 #endif
+	if (clone_flags & CLONE_THREAD)
+		threadgroup_fork_read_unlock(current);
 	cgroup_exit(p, cgroup_callbacks_done);
 	delayacct_tsk_free(p);
 	module_put(task_thread_info(p)->exec_domain->module);
@@ -1537,6 +1562,13 @@ void __init proc_caches_init(void)
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+	/*
+	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
+	 * whole struct cpumask for the OFFSTACK case. We could change
+	 * this to *only* allocate as much of it as required by the
+	 * maximum number of CPU's we can ever have. The cpumask_allocation
+	 * is at the end of the structure, exactly for that reason.
+	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 90cb55f6d7eb..470d08c82bbe 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,12 +133,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 		switch (res) {
 		case IRQ_WAKE_THREAD:
 			/*
-			 * Set result to handled so the spurious check
-			 * does not trigger.
-			 */
-			res = IRQ_HANDLED;
-
-			/*
 			 * Catch drivers which return WAKE_THREAD but
 			 * did not set up a thread function
 			 */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 886e80347b32..4c60a50e66b2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -257,13 +257,11 @@ int __init early_irq_init(void)
 	count = ARRAY_SIZE(irq_desc);
 
 	for (i = 0; i < count; i++) {
-		desc[i].irq_data.irq = i;
-		desc[i].irq_data.chip = &no_irq_chip;
 		desc[i].kstat_irqs = alloc_percpu(unsigned int);
-		irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
-		alloc_masks(desc + i, GFP_KERNEL, node);
-		desc_smp_init(desc + i, node);
+		alloc_masks(&desc[i], GFP_KERNEL, node);
+		raw_spin_lock_init(&desc[i].lock);
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+		desc_set_defaults(i, &desc[i], node);
 	}
 	return arch_early_irq_init();
 }
@@ -346,6 +344,12 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
 	if (!cnt)
 		return -EINVAL;
 
+	if (irq >= 0) {
+		if (from > irq)
+			return -EINVAL;
+		from = irq;
+	}
+
 	mutex_lock(&sparse_irq_lock);
 
 	start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f7ce0021e1c4..d64bafb1afd0 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -723,13 +723,16 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
  * context. So we need to disable bh here to avoid deadlocks and other
  * side effects.
  */
-static void
+static irqreturn_t
 irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
 {
+	irqreturn_t ret;
+
 	local_bh_disable();
-	action->thread_fn(action->irq, action->dev_id);
+	ret = action->thread_fn(action->irq, action->dev_id);
 	irq_finalize_oneshot(desc, action, false);
 	local_bh_enable();
+	return ret;
 }
 
 /*
@@ -737,10 +740,14 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
  * preemtible - many of them need to sleep and wait for slow busses to
  * complete.
  */
-static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
+static irqreturn_t irq_thread_fn(struct irq_desc *desc,
+				 struct irqaction *action)
 {
-	action->thread_fn(action->irq, action->dev_id);
+	irqreturn_t ret;
+
+	ret = action->thread_fn(action->irq, action->dev_id);
 	irq_finalize_oneshot(desc, action, false);
+	return ret;
 }
 
 /*
@@ -753,7 +760,8 @@ static int irq_thread(void *data)
 	};
 	struct irqaction *action = data;
 	struct irq_desc *desc = irq_to_desc(action->irq);
-	void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
+	irqreturn_t (*handler_fn)(struct irq_desc *desc,
+				  struct irqaction *action);
 	int wake;
 
 	if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
@@ -783,8 +791,12 @@ static int irq_thread(void *data)
 			desc->istate |= IRQS_PENDING;
 			raw_spin_unlock_irq(&desc->lock);
 		} else {
+			irqreturn_t action_ret;
+
 			raw_spin_unlock_irq(&desc->lock);
-			handler_fn(desc, action);
+			action_ret = handler_fn(desc, action);
+			if (!noirqdebug)
+				note_interrupt(action->irq, desc, action_ret);
 		}
 
 		wake = atomic_dec_and_test(&desc->threads_active);
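
With the manage.c changes above, the value a threaded handler returns is now fed to note_interrupt(), so a thread that keeps returning IRQ_NONE contributes to spurious-IRQ accounting just like a hardirq handler. An illustrative driver sketch (the demo_* names and demo_device_asserted() are hypothetical, not from this patch):

#include <linux/interrupt.h>

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	/* quick check in hard interrupt context, then defer */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	if (!demo_device_asserted(dev_id))	/* hypothetical helper */
		return IRQ_NONE;	/* now counted by note_interrupt() */
	/* ... service the device, may sleep ... */
	return IRQ_HANDLED;
}

A driver would register the pair with something like request_threaded_irq(irq, demo_hardirq, demo_thread_fn, IRQF_ONESHOT, "demo", dev).
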
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 64e3df6ab1ef..4bd4faa6323a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -352,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
 #ifdef CONFIG_SMP
 	remove_proc_entry("smp_affinity", desc->dir);
 	remove_proc_entry("affinity_hint", desc->dir);
+	remove_proc_entry("smp_affinity_list", desc->dir);
 	remove_proc_entry("node", desc->dir);
 #endif
 	remove_proc_entry("spurious", desc->dir);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dfbd550401b2..aa57d5da18c1 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -167,6 +167,13 @@ out:
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
 }
 
+static inline int bad_action_ret(irqreturn_t action_ret)
+{
+	if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
+		return 0;
+	return 1;
+}
+
 /*
  * If 99,900 of the previous 100,000 interrupts have not been handled
  * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -182,7 +189,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
 	struct irqaction *action;
 	unsigned long flags;
 
-	if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
+	if (bad_action_ret(action_ret)) {
 		printk(KERN_ERR "irq event %d: bogus return value %x\n",
 				irq, action_ret);
 	} else {
@@ -201,10 +208,11 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
 	raw_spin_lock_irqsave(&desc->lock, flags);
 	action = desc->action;
 	while (action) {
-		printk(KERN_ERR "[<%p>]", action->handler);
-		print_symbol(" (%s)",
-			(unsigned long)action->handler);
-		printk("\n");
+		printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
+		if (action->thread_fn)
+			printk(KERN_CONT " threaded [<%p>] %pf",
+					action->thread_fn, action->thread_fn);
+		printk(KERN_CONT "\n");
 		action = action->next;
 	}
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -262,7 +270,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 	if (desc->istate & IRQS_POLL_INPROGRESS)
 		return;
 
-	if (unlikely(action_ret != IRQ_HANDLED)) {
+	/* we get here again via the threaded handler */
+	if (action_ret == IRQ_WAKE_THREAD)
+		return;
+
+	if (bad_action_ret(action_ret)) {
+		report_bad_irq(irq, desc, action_ret);
+		return;
+	}
+
+	if (unlikely(action_ret == IRQ_NONE)) {
 		/*
 		 * If we are seeing only the odd spurious IRQ caused by
 		 * bus asynchronicity then don't eventually trigger an error,
@@ -274,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		else
 			desc->irqs_unhandled++;
 		desc->last_unhandled = jiffies;
-		if (unlikely(action_ret != IRQ_NONE))
-			report_bad_irq(irq, desc, action_ret);
 	}
 
 	if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
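
The "<=" test in bad_action_ret() above works because of the bit values behind irqreturn_t; as defined in <linux/irqreturn.h> of this era they are, roughly:

enum irqreturn {
	IRQ_NONE	= (0 << 0),	/* interrupt was not from this device */
	IRQ_HANDLED	= (1 << 0),	/* interrupt was handled */
	IRQ_WAKE_THREAD	= (1 << 1),	/* handler requests the irq thread */
};
/* any OR of the above is <= (IRQ_HANDLED | IRQ_WAKE_THREAD), i.e. 3;
 * anything larger is a bogus return value from a handler. */
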
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 74d1c099fbd1..fa27e750dbc0 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -105,9 +105,12 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
105} 105}
106 106
107static void __jump_label_update(struct jump_label_key *key, 107static void __jump_label_update(struct jump_label_key *key,
108 struct jump_entry *entry, int enable) 108 struct jump_entry *entry,
109 struct jump_entry *stop, int enable)
109{ 110{
110 for (; entry->key == (jump_label_t)(unsigned long)key; entry++) { 111 for (; (entry < stop) &&
112 (entry->key == (jump_label_t)(unsigned long)key);
113 entry++) {
111 /* 114 /*
112 * entry->code set to 0 invalidates module init text sections 115 * entry->code set to 0 invalidates module init text sections
113 * kernel_text_address() verifies we are not in core kernel 116 * kernel_text_address() verifies we are not in core kernel
@@ -181,7 +184,11 @@ static void __jump_label_mod_update(struct jump_label_key *key, int enable)
181 struct jump_label_mod *mod = key->next; 184 struct jump_label_mod *mod = key->next;
182 185
183 while (mod) { 186 while (mod) {
184 __jump_label_update(key, mod->entries, enable); 187 struct module *m = mod->mod;
188
189 __jump_label_update(key, mod->entries,
190 m->jump_entries + m->num_jump_entries,
191 enable);
185 mod = mod->next; 192 mod = mod->next;
186 } 193 }
187} 194}
@@ -245,7 +252,8 @@ static int jump_label_add_module(struct module *mod)
245 key->next = jlm; 252 key->next = jlm;
246 253
247 if (jump_label_enabled(key)) 254 if (jump_label_enabled(key))
248 __jump_label_update(key, iter, JUMP_LABEL_ENABLE); 255 __jump_label_update(key, iter, iter_stop,
256 JUMP_LABEL_ENABLE);
249 } 257 }
250 258
251 return 0; 259 return 0;
@@ -371,7 +379,7 @@ static void jump_label_update(struct jump_label_key *key, int enable)
371 379
372 /* if there are no users, entry can be NULL */ 380 /* if there are no users, entry can be NULL */
373 if (entry) 381 if (entry)
374 __jump_label_update(key, entry, enable); 382 __jump_label_update(key, entry, __stop___jump_table, enable);
375 383
376#ifdef CONFIG_MODULES 384#ifdef CONFIG_MODULES
377 __jump_label_mod_update(key, enable); 385 __jump_label_mod_update(key, enable);
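
The jump_label fix bounds the table walk with an explicit stop pointer: entries for one key are contiguous in the sorted jump table, but the last key in a module's table would otherwise iterate past the end of that table into foreign memory. A userspace sketch of the bounded walk, using an illustrative entry layout rather than the kernel's struct jump_entry:

#include <stdio.h>

struct entry {
        unsigned long key;
        const char *site;
};

static void update_entries(unsigned long key, struct entry *entry,
                           struct entry *stop)
{
        /* stop both at the table end and at the first foreign key */
        for (; (entry < stop) && (entry->key == key); entry++)
                printf("patching %s\n", entry->site);
}

int main(void)
{
        struct entry table[] = {
                { 1, "site_a" }, { 1, "site_b" }, { 2, "site_c" },
        };

        /* without the entry < stop bound, key 2 would walk past table[2] */
        update_entries(2, table + 2, table + 3);
        return 0;
}
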
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3b34d2732bce..4ba7cccb4994 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -202,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
202 return; 202 return;
203 } 203 }
204 204
205 p->cpus_allowed = cpumask_of_cpu(cpu); 205 /* It's safe because the task is inactive. */
206 p->rt.nr_cpus_allowed = 1; 206 do_set_cpus_allowed(p, cpumask_of(cpu));
207 p->flags |= PF_THREAD_BOUND; 207 p->flags |= PF_THREAD_BOUND;
208} 208}
209EXPORT_SYMBOL(kthread_bind); 209EXPORT_SYMBOL(kthread_bind);
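
The point of routing kthread_bind() through do_set_cpus_allowed() is that the affinity mask and the scheduler's cached CPU count must change together, and the scheduling class gets a chance to hook the change, rather than two raw field writes drifting apart. A simplified userspace sketch of the keep-derived-state-in-sync idea (types and names are stand-ins, not the kernel's):

#include <stdio.h>

struct task {
        unsigned long cpus_allowed;     /* one bit per CPU */
        int nr_cpus_allowed;            /* derived, must stay in sync */
};

static void do_set_cpus_allowed(struct task *p, unsigned long mask)
{
        p->cpus_allowed = mask;
        p->nr_cpus_allowed = __builtin_popcountl(mask);
}

int main(void)
{
        struct task t;

        do_set_cpus_allowed(&t, 1UL << 3);      /* bind to CPU 3 */
        printf("%d cpu(s) allowed\n", t.nr_cpus_allowed);
        return 0;
}
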
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 63437d065ac8..298c9276dfdb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3426,7 +3426,7 @@ int lock_is_held(struct lockdep_map *lock)
3426 int ret = 0; 3426 int ret = 0;
3427 3427
3428 if (unlikely(current->lockdep_recursion)) 3428 if (unlikely(current->lockdep_recursion))
3429 return ret; 3429 return 1; /* avoid false negative lockdep_assert_held() */
3430 3430
3431 raw_local_irq_save(flags); 3431 raw_local_irq_save(flags);
3432 check_flags(flags); 3432 check_flags(flags);
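
The one-line lockdep change matters because lock_is_held() feeds assertions of the form WARN_ON(!lock_is_held(...)): while lockdep is recursing it cannot evaluate the dependency state, and answering 0 would make such an assertion fire even though the lock is in fact held, so answering 1 errs on the quiet side. A small sketch of that assertion pattern:

#include <assert.h>
#include <stdbool.h>

static _Thread_local int lockdep_recursion;

static bool lock_is_held(void)
{
        if (lockdep_recursion)
                return true;    /* cannot check: never claim "not held" */
        /* ... real dependency-graph lookup elided ... */
        return true;
}

#define lockdep_assert_held() assert(lock_is_held())

int main(void)
{
        lockdep_recursion = 1;
        lockdep_assert_held();  /* must not trip while recursing */
        return 0;
}
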
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
deleted file mode 100644
index 2c98ad94ba0e..000000000000
--- a/kernel/ns_cgroup.c
+++ /dev/null
@@ -1,118 +0,0 @@
1/*
2 * ns_cgroup.c - namespace cgroup subsystem
3 *
4 * Copyright 2006, 2007 IBM Corp
5 */
6
7#include <linux/module.h>
8#include <linux/cgroup.h>
9#include <linux/fs.h>
10#include <linux/proc_fs.h>
11#include <linux/slab.h>
12#include <linux/nsproxy.h>
13
14struct ns_cgroup {
15 struct cgroup_subsys_state css;
16};
17
18struct cgroup_subsys ns_subsys;
19
20static inline struct ns_cgroup *cgroup_to_ns(
21 struct cgroup *cgroup)
22{
23 return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
24 struct ns_cgroup, css);
25}
26
27int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
28{
29 char name[PROC_NUMBUF];
30
31 snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
32 return cgroup_clone(task, &ns_subsys, name);
33}
34
35/*
36 * Rules:
37 * 1. you can only enter a cgroup which is a descendant of your current
38 * cgroup
39 * 2. you can only place another process into a cgroup if
40 * a. you have CAP_SYS_ADMIN
41 * b. your cgroup is an ancestor of task's destination cgroup
42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof)
44 */
45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct task_struct *task, bool threadgroup)
47{
48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN))
50 return -EPERM;
51
52 if (!cgroup_is_descendant(new_cgroup, current))
53 return -EPERM;
54 }
55
56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM;
58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
71 return 0;
72}
73
74/*
75 * Rules: you can only create a cgroup if
76 * 1. you are capable(CAP_SYS_ADMIN)
77 * 2. the target cgroup is a descendant of your own cgroup
78 */
79static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
80 struct cgroup *cgroup)
81{
82 struct ns_cgroup *ns_cgroup;
83
84 if (!capable(CAP_SYS_ADMIN))
85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM);
88 if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
89 printk("ns_cgroup can't be created with parent "
90 "'clone_children' set.\n");
91 return ERR_PTR(-EINVAL);
92 }
93
94 printk_once("ns_cgroup deprecated: consider using the "
95 "'clone_children' flag without the ns_cgroup.\n");
96
97 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
98 if (!ns_cgroup)
99 return ERR_PTR(-ENOMEM);
100 return &ns_cgroup->css;
101}
102
103static void ns_destroy(struct cgroup_subsys *ss,
104 struct cgroup *cgroup)
105{
106 struct ns_cgroup *ns_cgroup;
107
108 ns_cgroup = cgroup_to_ns(cgroup);
109 kfree(ns_cgroup);
110}
111
112struct cgroup_subsys ns_subsys = {
113 .name = "ns",
114 .can_attach = ns_can_attach,
115 .create = ns_create,
116 .destroy = ns_destroy,
117 .subsys_id = ns_subsys_id,
118};
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a05d191ffdd9..d6a00f3de15d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h> 24#include <linux/ipc_namespace.h>
25#include <linux/proc_fs.h>
26#include <linux/file.h>
27#include <linux/syscalls.h>
25 28
26static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
27 30
@@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
198 goto out; 201 goto out;
199 } 202 }
200 203
201 err = ns_cgroup_clone(current, task_pid(current));
202 if (err)
203 put_nsproxy(*new_nsp);
204
205out: 204out:
206 return err; 205 return err;
207} 206}
@@ -233,6 +232,45 @@ void exit_task_namespaces(struct task_struct *p)
233 switch_task_namespaces(p, NULL); 232 switch_task_namespaces(p, NULL);
234} 233}
235 234
235SYSCALL_DEFINE2(setns, int, fd, int, nstype)
236{
237 const struct proc_ns_operations *ops;
238 struct task_struct *tsk = current;
239 struct nsproxy *new_nsproxy;
240 struct proc_inode *ei;
241 struct file *file;
242 int err;
243
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd);
248 if (IS_ERR(file))
249 return PTR_ERR(file);
250
251 err = -EINVAL;
252 ei = PROC_I(file->f_dentry->d_inode);
253 ops = ei->ns_ops;
254 if (nstype && (ops->type != nstype))
255 goto out;
256
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
258 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy);
260 goto out;
261 }
262
263 err = ops->install(new_nsproxy, ei->ns);
264 if (err) {
265 free_nsproxy(new_nsproxy);
266 goto out;
267 }
268 switch_task_namespaces(tsk, new_nsproxy);
269out:
270 fput(file);
271 return err;
272}
273
236static int __init nsproxy_cache_init(void) 274static int __init nsproxy_cache_init(void)
237{ 275{
238 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
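
The new setns() system call takes a file descriptor obtained from a /proc/<pid>/ns/* file and, after the capability and type checks above, installs that namespace into a freshly built nsproxy for the caller. A userspace sketch of joining another process's network namespace, assuming kernel headers that define __NR_setns (there was no glibc wrapper when the call was introduced) and CAP_SYS_ADMIN in the caller:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(int argc, char **argv)
{
        char path[64];
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }
        snprintf(path, sizeof(path), "/proc/%s/ns/net", argv[1]);
        fd = open(path, O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* nstype 0 accepts any namespace; pass CLONE_NEWNET to insist */
        if (syscall(__NR_setns, fd, 0) < 0) {
                perror("setns");
                return 1;
        }
        close(fd);
        execlp("ip", "ip", "link", (char *)NULL); /* runs in that netns */
        perror("execlp");
        return 1;
}
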
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index beb184689af9..6824ca7d4d0c 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -40,6 +40,7 @@
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/platform_device.h> 41#include <linux/platform_device.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/kernel.h>
43 44
44#include <linux/uaccess.h> 45#include <linux/uaccess.h>
45 46
@@ -53,11 +54,17 @@ enum pm_qos_type {
53 PM_QOS_MIN /* return the smallest value */ 54 PM_QOS_MIN /* return the smallest value */
54}; 55};
55 56
57/*
58 * Note: The lockless read path depends on the CPU accessing
59 * target_value atomically. Atomic access is only guaranteed on all CPU
60 * types linux supports for 32 bit quantites
61 */
56struct pm_qos_object { 62struct pm_qos_object {
57 struct plist_head requests; 63 struct plist_head requests;
58 struct blocking_notifier_head *notifiers; 64 struct blocking_notifier_head *notifiers;
59 struct miscdevice pm_qos_power_miscdev; 65 struct miscdevice pm_qos_power_miscdev;
60 char *name; 66 char *name;
67 s32 target_value; /* Do not change to 64 bit */
61 s32 default_value; 68 s32 default_value;
62 enum pm_qos_type type; 69 enum pm_qos_type type;
63}; 70};
@@ -70,7 +77,8 @@ static struct pm_qos_object cpu_dma_pm_qos = {
70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), 77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
71 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
72 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
73 .default_value = 2000 * USEC_PER_SEC, 80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
81 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
74 .type = PM_QOS_MIN, 82 .type = PM_QOS_MIN,
75}; 83};
76 84
@@ -79,7 +87,8 @@ static struct pm_qos_object network_lat_pm_qos = {
79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), 87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
80 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
81 .name = "network_latency", 89 .name = "network_latency",
82 .default_value = 2000 * USEC_PER_SEC, 90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
91 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
83 .type = PM_QOS_MIN 92 .type = PM_QOS_MIN
84}; 93};
85 94
@@ -89,7 +98,8 @@ static struct pm_qos_object network_throughput_pm_qos = {
89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), 98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
90 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
91 .name = "network_throughput", 100 .name = "network_throughput",
92 .default_value = 0, 101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
102 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
93 .type = PM_QOS_MAX, 103 .type = PM_QOS_MAX,
94}; 104};
95 105
@@ -135,6 +145,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
135 } 145 }
136} 146}
137 147
148static inline s32 pm_qos_read_value(struct pm_qos_object *o)
149{
150 return o->target_value;
151}
152
153static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
154{
155 o->target_value = value;
156}
157
138static void update_target(struct pm_qos_object *o, struct plist_node *node, 158static void update_target(struct pm_qos_object *o, struct plist_node *node,
139 int del, int value) 159 int del, int value)
140{ 160{
@@ -159,6 +179,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
159 plist_add(node, &o->requests); 179 plist_add(node, &o->requests);
160 } 180 }
161 curr_value = pm_qos_get_value(o); 181 curr_value = pm_qos_get_value(o);
182 pm_qos_set_value(o, curr_value);
162 spin_unlock_irqrestore(&pm_qos_lock, flags); 183 spin_unlock_irqrestore(&pm_qos_lock, flags);
163 184
164 if (prev_value != curr_value) 185 if (prev_value != curr_value)
@@ -193,18 +214,11 @@ static int find_pm_qos_object_by_minor(int minor)
193 * pm_qos_request - returns current system wide qos expectation 214 * pm_qos_request - returns current system wide qos expectation
194 * @pm_qos_class: identification of which qos value is requested 215 * @pm_qos_class: identification of which qos value is requested
195 * 216 *
196 * This function returns the current target value in an atomic manner. 217 * This function returns the current target value.
197 */ 218 */
198int pm_qos_request(int pm_qos_class) 219int pm_qos_request(int pm_qos_class)
199{ 220{
200 unsigned long flags; 221 return pm_qos_read_value(pm_qos_array[pm_qos_class]);
201 int value;
202
203 spin_lock_irqsave(&pm_qos_lock, flags);
204 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
205 spin_unlock_irqrestore(&pm_qos_lock, flags);
206
207 return value;
208} 222}
209EXPORT_SYMBOL_GPL(pm_qos_request); 223EXPORT_SYMBOL_GPL(pm_qos_request);
210 224
@@ -404,24 +418,36 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
404 size_t count, loff_t *f_pos) 418 size_t count, loff_t *f_pos)
405{ 419{
406 s32 value; 420 s32 value;
407 int x;
408 char ascii_value[11];
409 struct pm_qos_request_list *pm_qos_req; 421 struct pm_qos_request_list *pm_qos_req;
410 422
411 if (count == sizeof(s32)) { 423 if (count == sizeof(s32)) {
412 if (copy_from_user(&value, buf, sizeof(s32))) 424 if (copy_from_user(&value, buf, sizeof(s32)))
413 return -EFAULT; 425 return -EFAULT;
414 } else if (count == 11) { /* len('0x12345678/0') */ 426 } else if (count <= 11) { /* ASCII perhaps? */
415 if (copy_from_user(ascii_value, buf, 11)) 427 char ascii_value[11];
428 unsigned long int ulval;
429 int ret;
430
431 if (copy_from_user(ascii_value, buf, count))
416 return -EFAULT; 432 return -EFAULT;
417 if (strlen(ascii_value) != 10) 433
418 return -EINVAL; 434 if (count > 10) {
419 x = sscanf(ascii_value, "%x", &value); 435 if (ascii_value[10] == '\n')
420 if (x != 1) 436 ascii_value[10] = '\0';
437 else
438 return -EINVAL;
439 } else {
440 ascii_value[count] = '\0';
441 }
442 ret = strict_strtoul(ascii_value, 16, &ulval);
443 if (ret) {
444 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
421 return -EINVAL; 445 return -EINVAL;
422 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); 446 }
423 } else 447 value = (s32)lower_32_bits(ulval);
448 } else {
424 return -EINVAL; 449 return -EINVAL;
450 }
425 451
426 pm_qos_req = filp->private_data; 452 pm_qos_req = filp->private_data;
427 pm_qos_update_request(pm_qos_req, value); 453 pm_qos_update_request(pm_qos_req, value);
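
Two independent improvements are visible above: reads of the aggregated constraint become lock-free by caching it in a 32-bit target_value that writers refresh under pm_qos_lock, and the ASCII write path accepts any reasonable-length hex string via strict_strtoul() instead of demanding exactly ten characters. A userspace sketch of the cached lock-free read; C11 atomics make explicit what the kernel gets from naturally atomic aligned 32-bit accesses:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int target_value;        /* the kernel uses a plain s32 */

static void update_target(int new_value)
{
        /* in the kernel this runs with pm_qos_lock held ... */
        atomic_store_explicit(&target_value, new_value,
                              memory_order_relaxed);
}

static int pm_qos_request(void)
{
        /* ... while this is the hot, lock-free read path */
        return atomic_load_explicit(&target_value, memory_order_relaxed);
}

int main(void)
{
        update_target(100);
        printf("target = %d\n", pm_qos_request());
        return 0;
}
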
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index f9bec56d8825..8f7b1db1ece1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -25,7 +25,6 @@
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <linux/syscore_ops.h> 26#include <linux/syscore_ops.h>
27#include <scsi/scsi_scan.h> 27#include <scsi/scsi_scan.h>
28#include <asm/suspend.h>
29 28
30#include "power.h" 29#include "power.h"
31 30
@@ -55,10 +54,9 @@ static int hibernation_mode = HIBERNATION_SHUTDOWN;
55static const struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
56 55
57/** 56/**
58 * hibernation_set_ops - set the global hibernate operations 57 * hibernation_set_ops - Set the global hibernate operations.
59 * @ops: the hibernation operations to use in subsequent hibernation transitions 58 * @ops: Hibernation operations to use in subsequent hibernation transitions.
60 */ 59 */
61
62void hibernation_set_ops(const struct platform_hibernation_ops *ops) 60void hibernation_set_ops(const struct platform_hibernation_ops *ops)
63{ 61{
64 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 62 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
@@ -115,10 +113,9 @@ static int hibernation_test(int level) { return 0; }
115#endif /* !CONFIG_PM_DEBUG */ 113#endif /* !CONFIG_PM_DEBUG */
116 114
117/** 115/**
118 * platform_begin - tell the platform driver that we're starting 116 * platform_begin - Call platform to start hibernation.
119 * hibernation 117 * @platform_mode: Whether or not to use the platform driver.
120 */ 118 */
121
122static int platform_begin(int platform_mode) 119static int platform_begin(int platform_mode)
123{ 120{
124 return (platform_mode && hibernation_ops) ? 121 return (platform_mode && hibernation_ops) ?
@@ -126,10 +123,9 @@ static int platform_begin(int platform_mode)
126} 123}
127 124
128/** 125/**
129 * platform_end - tell the platform driver that we've entered the 126 * platform_end - Call platform to finish transition to the working state.
130 * working state 127 * @platform_mode: Whether or not to use the platform driver.
131 */ 128 */
132
133static void platform_end(int platform_mode) 129static void platform_end(int platform_mode)
134{ 130{
135 if (platform_mode && hibernation_ops) 131 if (platform_mode && hibernation_ops)
@@ -137,8 +133,11 @@ static void platform_end(int platform_mode)
137} 133}
138 134
139/** 135/**
140 * platform_pre_snapshot - prepare the machine for hibernation using the 136 * platform_pre_snapshot - Call platform to prepare the machine for hibernation.
141 * platform driver if so configured and return an error code if it fails 137 * @platform_mode: Whether or not to use the platform driver.
138 *
139 * Use the platform driver to prepare the system for creating a hibernate image,
140 * if so configured, and return an error code if that fails.
142 */ 141 */
143 142
144static int platform_pre_snapshot(int platform_mode) 143static int platform_pre_snapshot(int platform_mode)
@@ -148,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode)
148} 147}
149 148
150/** 149/**
151 * platform_leave - prepare the machine for switching to the normal mode 150 * platform_leave - Call platform to prepare a transition to the working state.
152 * of operation using the platform driver (called with interrupts disabled) 151 * @platform_mode: Whether or not to use the platform driver.
152 *
153 * Use the platform driver to prepare the machine for switching to the
154 * normal mode of operation.
155 *
156 * This routine is called on one CPU with interrupts disabled.
153 */ 157 */
154
155static void platform_leave(int platform_mode) 158static void platform_leave(int platform_mode)
156{ 159{
157 if (platform_mode && hibernation_ops) 160 if (platform_mode && hibernation_ops)
@@ -159,10 +162,14 @@ static void platform_leave(int platform_mode)
159} 162}
160 163
161/** 164/**
162 * platform_finish - switch the machine to the normal mode of operation 165 * platform_finish - Call platform to switch the system to the working state.
163 * using the platform driver (must be called after platform_prepare()) 166 * @platform_mode: Whether or not to use the platform driver.
167 *
168 * Use the platform driver to switch the machine to the normal mode of
169 * operation.
170 *
171 * This routine must be called after platform_prepare().
164 */ 172 */
165
166static void platform_finish(int platform_mode) 173static void platform_finish(int platform_mode)
167{ 174{
168 if (platform_mode && hibernation_ops) 175 if (platform_mode && hibernation_ops)
@@ -170,11 +177,15 @@ static void platform_finish(int platform_mode)
170} 177}
171 178
172/** 179/**
173 * platform_pre_restore - prepare the platform for the restoration from a 180 * platform_pre_restore - Prepare for hibernate image restoration.
174 * hibernation image. If the restore fails after this function has been 181 * @platform_mode: Whether or not to use the platform driver.
175 * called, platform_restore_cleanup() must be called. 182 *
183 * Use the platform driver to prepare the system for resume from a hibernation
184 * image.
185 *
186 * If the restore fails after this function has been called,
187 * platform_restore_cleanup() must be called.
176 */ 188 */
177
178static int platform_pre_restore(int platform_mode) 189static int platform_pre_restore(int platform_mode)
179{ 190{
180 return (platform_mode && hibernation_ops) ? 191 return (platform_mode && hibernation_ops) ?
@@ -182,12 +193,16 @@ static int platform_pre_restore(int platform_mode)
182} 193}
183 194
184/** 195/**
185 * platform_restore_cleanup - switch the platform to the normal mode of 196 * platform_restore_cleanup - Switch to the working state after failing restore.
186 * operation after a failing restore. If platform_pre_restore() has been 197 * @platform_mode: Whether or not to use the platform driver.
187 * called before the failing restore, this function must be called too, 198 *
188 * regardless of the result of platform_pre_restore(). 199 * Use the platform driver to switch the system to the normal mode of operation
200 * after a failing restore.
201 *
202 * If platform_pre_restore() has been called before the failing restore, this
203 * function must be called too, regardless of the result of
204 * platform_pre_restore().
189 */ 205 */
190
191static void platform_restore_cleanup(int platform_mode) 206static void platform_restore_cleanup(int platform_mode)
192{ 207{
193 if (platform_mode && hibernation_ops) 208 if (platform_mode && hibernation_ops)
@@ -195,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode)
195} 210}
196 211
197/** 212/**
198 * platform_recover - recover the platform from a failure to suspend 213 * platform_recover - Recover from a failure to suspend devices.
199 * devices. 214 * @platform_mode: Whether or not to use the platform driver.
200 */ 215 */
201
202static void platform_recover(int platform_mode) 216static void platform_recover(int platform_mode)
203{ 217{
204 if (platform_mode && hibernation_ops && hibernation_ops->recover) 218 if (platform_mode && hibernation_ops && hibernation_ops->recover)
@@ -206,13 +220,12 @@ static void platform_recover(int platform_mode)
206} 220}
207 221
208/** 222/**
209 * swsusp_show_speed - print the time elapsed between two events. 223 * swsusp_show_speed - Print time elapsed between two events during hibernation.
210 * @start: Starting event. 224 * @start: Starting event.
211 * @stop: Final event. 225 * @stop: Final event.
212 * @nr_pages - number of pages processed between @start and @stop 226 * @nr_pages: Number of memory pages processed between @start and @stop.
213 * @msg - introductory message to print 227 * @msg: Additional diagnostic message to print.
214 */ 228 */
215
216void swsusp_show_speed(struct timeval *start, struct timeval *stop, 229void swsusp_show_speed(struct timeval *start, struct timeval *stop,
217 unsigned nr_pages, char *msg) 230 unsigned nr_pages, char *msg)
218{ 231{
@@ -235,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
235} 248}
236 249
237/** 250/**
238 * create_image - freeze devices that need to be frozen with interrupts 251 * create_image - Create a hibernation image.
239 * off, create the hibernation image and thaw those devices. Control 252 * @platform_mode: Whether or not to use the platform driver.
240 * reappears in this routine after a restore. 253 *
254 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
255 * and execute the drivers' .thaw_noirq() callbacks.
256 *
257 * Control reappears in this routine after the subsequent restore.
241 */ 258 */
242
243static int create_image(int platform_mode) 259static int create_image(int platform_mode)
244{ 260{
245 int error; 261 int error;
246 262
247 error = arch_prepare_suspend();
248 if (error)
249 return error;
250
251 /* At this point, dpm_suspend_start() has been called, but *not*
252 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
253 * Otherwise, drivers for some devices (e.g. interrupt controllers)
254 * become desynchronized with the actual state of the hardware
255 * at resume time, and evil weirdness ensues.
256 */
257 error = dpm_suspend_noirq(PMSG_FREEZE); 263 error = dpm_suspend_noirq(PMSG_FREEZE);
258 if (error) { 264 if (error) {
259 printk(KERN_ERR "PM: Some devices failed to power down, " 265 printk(KERN_ERR "PM: Some devices failed to power down, "
@@ -297,9 +303,6 @@ static int create_image(int platform_mode)
297 303
298 Power_up: 304 Power_up:
299 syscore_resume(); 305 syscore_resume();
300 /* NOTE: dpm_resume_noirq() is just a resume() for devices
301 * that suspended with irqs off ... no overall powerup.
302 */
303 306
304 Enable_irqs: 307 Enable_irqs:
305 local_irq_enable(); 308 local_irq_enable();
@@ -317,14 +320,11 @@ static int create_image(int platform_mode)
317} 320}
318 321
319/** 322/**
320 * hibernation_snapshot - quiesce devices and create the hibernation 323 * hibernation_snapshot - Quiesce devices and create a hibernation image.
321 * snapshot image. 324 * @platform_mode: If set, use platform driver to prepare for the transition.
322 * @platform_mode - if set, use the platform driver, if available, to
323 * prepare the platform firmware for the power transition.
324 * 325 *
325 * Must be called with pm_mutex held 326 * This routine must be called with pm_mutex held.
326 */ 327 */
327
328int hibernation_snapshot(int platform_mode) 328int hibernation_snapshot(int platform_mode)
329{ 329{
330 pm_message_t msg = PMSG_RECOVER; 330 pm_message_t msg = PMSG_RECOVER;
@@ -384,13 +384,14 @@ int hibernation_snapshot(int platform_mode)
384} 384}
385 385
386/** 386/**
387 * resume_target_kernel - prepare devices that need to be suspended with 387 * resume_target_kernel - Restore system state from a hibernation image.
388 * interrupts off, restore the contents of highmem that have not been 388 * @platform_mode: Whether or not to use the platform driver.
389 * restored yet from the image and run the low level code that will restore 389 *
390 * the remaining contents of memory and switch to the just restored target 390 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
391 * kernel. 391 * highmem that have not been restored yet from the image and run the low-level
392 * code that will restore the remaining contents of memory and switch to the
393 * just restored target kernel.
392 */ 394 */
393
394static int resume_target_kernel(bool platform_mode) 395static int resume_target_kernel(bool platform_mode)
395{ 396{
396 int error; 397 int error;
@@ -416,24 +417,26 @@ static int resume_target_kernel(bool platform_mode)
416 if (error) 417 if (error)
417 goto Enable_irqs; 418 goto Enable_irqs;
418 419
419 /* We'll ignore saved state, but this gets preempt count (etc) right */
420 save_processor_state(); 420 save_processor_state();
421 error = restore_highmem(); 421 error = restore_highmem();
422 if (!error) { 422 if (!error) {
423 error = swsusp_arch_resume(); 423 error = swsusp_arch_resume();
424 /* 424 /*
425 * The code below is only ever reached in case of a failure. 425 * The code below is only ever reached in case of a failure.
426 * Otherwise execution continues at place where 426 * Otherwise, execution continues at the place where
427 * swsusp_arch_suspend() was called 427 * swsusp_arch_suspend() was called.
428 */ 428 */
429 BUG_ON(!error); 429 BUG_ON(!error);
430 /* This call to restore_highmem() undos the previous one */ 430 /*
431 * This call to restore_highmem() reverts the changes made by
432 * the previous one.
433 */
431 restore_highmem(); 434 restore_highmem();
432 } 435 }
433 /* 436 /*
434 * The only reason why swsusp_arch_resume() can fail is memory being 437 * The only reason why swsusp_arch_resume() can fail is memory being
435 * very tight, so we have to free it as soon as we can to avoid 438 * very tight, so we have to free it as soon as we can to avoid
436 * subsequent failures 439 * subsequent failures.
437 */ 440 */
438 swsusp_free(); 441 swsusp_free();
439 restore_processor_state(); 442 restore_processor_state();
@@ -456,14 +459,12 @@ static int resume_target_kernel(bool platform_mode)
456} 459}
457 460
458/** 461/**
459 * hibernation_restore - quiesce devices and restore the hibernation 462 * hibernation_restore - Quiesce devices and restore from a hibernation image.
460 * snapshot image. If successful, control returns in hibernation_snaphot() 463 * @platform_mode: If set, use platform driver to prepare for the transition.
461 * @platform_mode - if set, use the platform driver, if available, to
462 * prepare the platform firmware for the transition.
463 * 464 *
464 * Must be called with pm_mutex held 465 * This routine must be called with pm_mutex held. If it is successful, control
466 * reappears in the restored target kernel in hibernation_snaphot().
465 */ 467 */
466
467int hibernation_restore(int platform_mode) 468int hibernation_restore(int platform_mode)
468{ 469{
469 int error; 470 int error;
@@ -483,10 +484,8 @@ int hibernation_restore(int platform_mode)
483} 484}
484 485
485/** 486/**
486 * hibernation_platform_enter - enter the hibernation state using the 487 * hibernation_platform_enter - Power off the system using the platform driver.
487 * platform driver (if available)
488 */ 488 */
489
490int hibernation_platform_enter(void) 489int hibernation_platform_enter(void)
491{ 490{
492 int error; 491 int error;
@@ -557,12 +556,12 @@ int hibernation_platform_enter(void)
557} 556}
558 557
559/** 558/**
560 * power_down - Shut the machine down for hibernation. 559 * power_down - Shut the machine down for hibernation.
561 * 560 *
562 * Use the platform driver, if configured so; otherwise try 561 * Use the platform driver, if configured, to put the system into the sleep
563 * to power off or reboot. 562 * state corresponding to hibernation, or try to power it off or reboot,
563 * depending on the value of hibernation_mode.
564 */ 564 */
565
566static void power_down(void) 565static void power_down(void)
567{ 566{
568 switch (hibernation_mode) { 567 switch (hibernation_mode) {
@@ -599,9 +598,8 @@ static int prepare_processes(void)
599} 598}
600 599
601/** 600/**
602 * hibernate - The granpappy of the built-in hibernation management 601 * hibernate - Carry out system hibernation, including saving the image.
603 */ 602 */
604
605int hibernate(void) 603int hibernate(void)
606{ 604{
607 int error; 605 int error;
@@ -679,17 +677,20 @@ int hibernate(void)
679 677
680 678
681/** 679/**
682 * software_resume - Resume from a saved image. 680 * software_resume - Resume from a saved hibernation image.
683 * 681 *
684 * Called as a late_initcall (so all devices are discovered and 682 * This routine is called as a late initcall, when all devices have been
685 * initialized), we call swsusp to see if we have a saved image or not. 683 * discovered and initialized already.
686 * If so, we quiesce devices, the restore the saved image. We will
687 * return above (in hibernate() ) if everything goes well.
688 * Otherwise, we fail gracefully and return to the normally
689 * scheduled program.
690 * 684 *
685 * The image reading code is called to see if there is a hibernation image
686 * available for reading. If that is the case, devices are quiesced and the
687 * contents of memory are restored from the saved image.
688 *
689 * If this is successful, control reappears in the restored target kernel in
690 * hibernation_snapshot(), which returns to hibernate(). Otherwise, the routine
691 * attempts to recover gracefully and make the kernel return to the normal mode
692 * of operation.
691 */ 693 */
692
693static int software_resume(void) 694static int software_resume(void)
694{ 695{
695 int error; 696 int error;
@@ -819,21 +820,17 @@ static const char * const hibernation_modes[] = {
819 [HIBERNATION_TESTPROC] = "testproc", 820 [HIBERNATION_TESTPROC] = "testproc",
820}; 821};
821 822
822/** 823/*
823 * disk - Control hibernation mode 824 * /sys/power/disk - Control hibernation mode.
824 *
825 * Suspend-to-disk can be handled in several ways. We have a few options
826 * for putting the system to sleep - using the platform driver (e.g. ACPI
827 * or other hibernation_ops), powering off the system or rebooting the
828 * system (for testing) as well as the two test modes.
829 * 825 *
830 * The system can support 'platform', and that is known a priori (and 826 * Hibernation can be handled in several ways. There are a few different ways
831 * encoded by the presence of hibernation_ops). However, the user may 827 * to put the system into the sleep state: using the platform driver (e.g. ACPI
832 * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the 828 * or other hibernation_ops), powering it off or rebooting it (for testing
833 * test modes, 'test' or 'testproc'. 829 * mostly), or using one of the two available test modes.
834 * 830 *
835 * show() will display what the mode is currently set to. 831 * The sysfs file /sys/power/disk provides an interface for selecting the
836 * store() will accept one of 832 * hibernation mode to use. Reading from this file causes the available modes
833 * to be printed. There are 5 modes that can be supported:
837 * 834 *
838 * 'platform' 835 * 'platform'
839 * 'shutdown' 836 * 'shutdown'
@@ -841,8 +838,14 @@ static const char * const hibernation_modes[] = {
841 * 'test' 838 * 'test'
842 * 'testproc' 839 * 'testproc'
843 * 840 *
844 * It will only change to 'platform' if the system 841 * If a platform hibernation driver is in use, 'platform' will be supported
845 * supports it (as determined by having hibernation_ops). 842 * and will be used by default. Otherwise, 'shutdown' will be used by default.
843 * The selected option (i.e. the one corresponding to the current value of
844 * hibernation_mode) is enclosed in square brackets.
845 *
846 * To select a given hibernation mode it is necessary to write the mode's
847 * string representation (as returned by reading from /sys/power/disk) back
848 * into /sys/power/disk.
846 */ 849 */
847 850
848static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, 851static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -875,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
875 return buf-start; 878 return buf-start;
876} 879}
877 880
878
879static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, 881static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
880 const char *buf, size_t n) 882 const char *buf, size_t n)
881{ 883{
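
A short userspace sketch of the /sys/power/disk protocol described in the reworked comment: reading the file lists the supported modes with the active one in square brackets, and writing one of the mode strings back selects it (root required; "shutdown" is only an example mode):

#include <stdio.h>

int main(void)
{
        char modes[128];
        FILE *f = fopen("/sys/power/disk", "r+");

        if (!f) {
                perror("/sys/power/disk");
                return 1;
        }
        if (fgets(modes, sizeof(modes), f))
                printf("available: %s", modes);
        rewind(f);
        fputs("shutdown", f);   /* select the shutdown mode */
        fclose(f);
        return 0;
}
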
diff --git a/kernel/profile.c b/kernel/profile.c
index 14c9f87b9fc9..961b389fe52f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -303,14 +303,12 @@ static void profile_discard_flip_buffers(void)
303 mutex_unlock(&profile_flip_mutex); 303 mutex_unlock(&profile_flip_mutex);
304} 304}
305 305
306void profile_hits(int type, void *__pc, unsigned int nr_hits) 306static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
307{ 307{
308 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 308 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
309 int i, j, cpu; 309 int i, j, cpu;
310 struct profile_hit *hits; 310 struct profile_hit *hits;
311 311
312 if (prof_on != type || !prof_buffer)
313 return;
314 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); 312 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
315 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 313 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
316 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 314 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
@@ -417,16 +415,20 @@ out_free:
417#define profile_discard_flip_buffers() do { } while (0) 415#define profile_discard_flip_buffers() do { } while (0)
418#define profile_cpu_callback NULL 416#define profile_cpu_callback NULL
419 417
420void profile_hits(int type, void *__pc, unsigned int nr_hits) 418static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
421{ 419{
422 unsigned long pc; 420 unsigned long pc;
423
424 if (prof_on != type || !prof_buffer)
425 return;
426 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 421 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
427 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); 422 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
428} 423}
429#endif /* !CONFIG_SMP */ 424#endif /* !CONFIG_SMP */
425
426void profile_hits(int type, void *__pc, unsigned int nr_hits)
427{
428 if (prof_on != type || !prof_buffer)
429 return;
430 do_profile_hits(type, __pc, nr_hits);
431}
430EXPORT_SYMBOL_GPL(profile_hits); 432EXPORT_SYMBOL_GPL(profile_hits);
431 433
432void profile_tick(int type) 434void profile_tick(int type)
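
The profile.c change is a pure factoring: the prof_on/prof_buffer guard that was duplicated in the SMP and non-SMP variants of profile_hits() moves into a single exported wrapper, and the two variants become static helpers that may assume the precondition. The shape of the refactoring, with stand-in names and a trivial hit buffer:

#include <stdio.h>

static int prof_on = 1;
static int prof_buf[16];
static int *prof_buffer = prof_buf;

static void do_profile_hits(int type, void *pc, unsigned int nr_hits)
{
        /* may assume prof_on == type && prof_buffer != NULL */
        prof_buffer[0] += nr_hits;
}

void profile_hits(int type, void *pc, unsigned int nr_hits)
{
        if (prof_on != type || !prof_buffer)
                return;                 /* the guard lives in one place */
        do_profile_hits(type, pc, nr_hits);
}

int main(void)
{
        profile_hits(1, NULL, 3);
        printf("%d\n", prof_buffer[0]);
        return 0;
}
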
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f07d2f03181a..89419ff92e99 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -36,7 +36,7 @@
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h> 38#include <linux/nmi.h>
39#include <asm/atomic.h> 39#include <linux/atomic.h>
40#include <linux/bitops.h> 40#include <linux/bitops.h>
41#include <linux/module.h> 41#include <linux/module.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
@@ -95,7 +95,6 @@ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
95DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 95DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
96DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); 96DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
97DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); 97DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
98static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq);
99DEFINE_PER_CPU(char, rcu_cpu_has_work); 98DEFINE_PER_CPU(char, rcu_cpu_has_work);
100static char rcu_kthreads_spawnable; 99static char rcu_kthreads_spawnable;
101 100
@@ -163,7 +162,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
163#ifdef CONFIG_NO_HZ 162#ifdef CONFIG_NO_HZ
164DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 163DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
165 .dynticks_nesting = 1, 164 .dynticks_nesting = 1,
166 .dynticks = 1, 165 .dynticks = ATOMIC_INIT(1),
167}; 166};
168#endif /* #ifdef CONFIG_NO_HZ */ 167#endif /* #ifdef CONFIG_NO_HZ */
169 168
@@ -322,13 +321,25 @@ void rcu_enter_nohz(void)
322 unsigned long flags; 321 unsigned long flags;
323 struct rcu_dynticks *rdtp; 322 struct rcu_dynticks *rdtp;
324 323
325 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
326 local_irq_save(flags); 324 local_irq_save(flags);
327 rdtp = &__get_cpu_var(rcu_dynticks); 325 rdtp = &__get_cpu_var(rcu_dynticks);
328 rdtp->dynticks++; 326 if (--rdtp->dynticks_nesting) {
329 rdtp->dynticks_nesting--; 327 local_irq_restore(flags);
330 WARN_ON_ONCE(rdtp->dynticks & 0x1); 328 return;
329 }
330 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
331 smp_mb__before_atomic_inc(); /* See above. */
332 atomic_inc(&rdtp->dynticks);
333 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
334 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
331 local_irq_restore(flags); 335 local_irq_restore(flags);
336
337 /* If the interrupt queued a callback, get out of dyntick mode. */
338 if (in_irq() &&
339 (__get_cpu_var(rcu_sched_data).nxtlist ||
340 __get_cpu_var(rcu_bh_data).nxtlist ||
341 rcu_preempt_needs_cpu(smp_processor_id())))
342 set_need_resched();
332} 343}
333 344
334/* 345/*
@@ -344,11 +355,16 @@ void rcu_exit_nohz(void)
344 355
345 local_irq_save(flags); 356 local_irq_save(flags);
346 rdtp = &__get_cpu_var(rcu_dynticks); 357 rdtp = &__get_cpu_var(rcu_dynticks);
347 rdtp->dynticks++; 358 if (rdtp->dynticks_nesting++) {
348 rdtp->dynticks_nesting++; 359 local_irq_restore(flags);
349 WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); 360 return;
361 }
362 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
363 atomic_inc(&rdtp->dynticks);
364 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
365 smp_mb__after_atomic_inc(); /* See above. */
366 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
350 local_irq_restore(flags); 367 local_irq_restore(flags);
351 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
352} 368}
353 369
354/** 370/**
@@ -362,11 +378,15 @@ void rcu_nmi_enter(void)
362{ 378{
363 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 379 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
364 380
365 if (rdtp->dynticks & 0x1) 381 if (rdtp->dynticks_nmi_nesting == 0 &&
382 (atomic_read(&rdtp->dynticks) & 0x1))
366 return; 383 return;
367 rdtp->dynticks_nmi++; 384 rdtp->dynticks_nmi_nesting++;
368 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); 385 smp_mb__before_atomic_inc(); /* Force delay from prior write. */
369 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 386 atomic_inc(&rdtp->dynticks);
387 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
388 smp_mb__after_atomic_inc(); /* See above. */
389 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
370} 390}
371 391
372/** 392/**
@@ -380,11 +400,14 @@ void rcu_nmi_exit(void)
380{ 400{
381 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 401 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
382 402
383 if (rdtp->dynticks & 0x1) 403 if (rdtp->dynticks_nmi_nesting == 0 ||
404 --rdtp->dynticks_nmi_nesting != 0)
384 return; 405 return;
385 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 406 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
386 rdtp->dynticks_nmi++; 407 smp_mb__before_atomic_inc(); /* See above. */
387 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); 408 atomic_inc(&rdtp->dynticks);
409 smp_mb__after_atomic_inc(); /* Force delay to next write. */
410 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
388} 411}
389 412
390/** 413/**
@@ -395,13 +418,7 @@ void rcu_nmi_exit(void)
395 */ 418 */
396void rcu_irq_enter(void) 419void rcu_irq_enter(void)
397{ 420{
398 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 421 rcu_exit_nohz();
399
400 if (rdtp->dynticks_nesting++)
401 return;
402 rdtp->dynticks++;
403 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
404 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
405} 422}
406 423
407/** 424/**
@@ -413,18 +430,7 @@ void rcu_irq_enter(void)
413 */ 430 */
414void rcu_irq_exit(void) 431void rcu_irq_exit(void)
415{ 432{
416 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 433 rcu_enter_nohz();
417
418 if (--rdtp->dynticks_nesting)
419 return;
420 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
421 rdtp->dynticks++;
422 WARN_ON_ONCE(rdtp->dynticks & 0x1);
423
424 /* If the interrupt queued a callback, get out of dyntick mode. */
425 if (__this_cpu_read(rcu_sched_data.nxtlist) ||
426 __this_cpu_read(rcu_bh_data.nxtlist))
427 set_need_resched();
428} 434}
429 435
430#ifdef CONFIG_SMP 436#ifdef CONFIG_SMP
@@ -436,19 +442,8 @@ void rcu_irq_exit(void)
436 */ 442 */
437static int dyntick_save_progress_counter(struct rcu_data *rdp) 443static int dyntick_save_progress_counter(struct rcu_data *rdp)
438{ 444{
439 int ret; 445 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
440 int snap; 446 return 0;
441 int snap_nmi;
442
443 snap = rdp->dynticks->dynticks;
444 snap_nmi = rdp->dynticks->dynticks_nmi;
445 smp_mb(); /* Order sampling of snap with end of grace period. */
446 rdp->dynticks_snap = snap;
447 rdp->dynticks_nmi_snap = snap_nmi;
448 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
449 if (ret)
450 rdp->dynticks_fqs++;
451 return ret;
452} 447}
453 448
454/* 449/*
@@ -459,16 +454,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
459 */ 454 */
460static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 455static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
461{ 456{
462 long curr; 457 unsigned long curr;
463 long curr_nmi; 458 unsigned long snap;
464 long snap;
465 long snap_nmi;
466 459
467 curr = rdp->dynticks->dynticks; 460 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
468 snap = rdp->dynticks_snap; 461 snap = (unsigned long)rdp->dynticks_snap;
469 curr_nmi = rdp->dynticks->dynticks_nmi;
470 snap_nmi = rdp->dynticks_nmi_snap;
471 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
472 462
473 /* 463 /*
474 * If the CPU passed through or entered a dynticks idle phase with 464 * If the CPU passed through or entered a dynticks idle phase with
@@ -478,8 +468,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
478 * read-side critical section that started before the beginning 468 * read-side critical section that started before the beginning
479 * of the current RCU grace period. 469 * of the current RCU grace period.
480 */ 470 */
481 if ((curr != snap || (curr & 0x1) == 0) && 471 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
482 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
483 rdp->dynticks_fqs++; 472 rdp->dynticks_fqs++;
484 return 1; 473 return 1;
485 } 474 }
@@ -908,6 +897,12 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
908 unsigned long gp_duration; 897 unsigned long gp_duration;
909 898
910 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 899 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
900
901 /*
902 * Ensure that all grace-period and pre-grace-period activity
903 * is seen before the assignment to rsp->completed.
904 */
905 smp_mb(); /* See above block comment. */
911 gp_duration = jiffies - rsp->gp_start; 906 gp_duration = jiffies - rsp->gp_start;
912 if (gp_duration > rsp->gp_max) 907 if (gp_duration > rsp->gp_max)
913 rsp->gp_max = gp_duration; 908 rsp->gp_max = gp_duration;
@@ -1455,25 +1450,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1455 */ 1450 */
1456static void rcu_process_callbacks(void) 1451static void rcu_process_callbacks(void)
1457{ 1452{
1458 /*
1459 * Memory references from any prior RCU read-side critical sections
1460 * executed by the interrupted code must be seen before any RCU
1461 * grace-period manipulations below.
1462 */
1463 smp_mb(); /* See above block comment. */
1464
1465 __rcu_process_callbacks(&rcu_sched_state, 1453 __rcu_process_callbacks(&rcu_sched_state,
1466 &__get_cpu_var(rcu_sched_data)); 1454 &__get_cpu_var(rcu_sched_data));
1467 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1455 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1468 rcu_preempt_process_callbacks(); 1456 rcu_preempt_process_callbacks();
1469 1457
1470 /*
1471 * Memory references from any later RCU read-side critical sections
1472 * executed by the interrupted code must be seen after any RCU
1473 * grace-period manipulations above.
1474 */
1475 smp_mb(); /* See above block comment. */
1476
1477 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ 1458 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1478 rcu_needs_cpu_flush(); 1459 rcu_needs_cpu_flush();
1479} 1460}
@@ -1494,7 +1475,7 @@ static void invoke_rcu_cpu_kthread(void)
1494 local_irq_restore(flags); 1475 local_irq_restore(flags);
1495 return; 1476 return;
1496 } 1477 }
1497 wake_up(&__get_cpu_var(rcu_cpu_wq)); 1478 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1498 local_irq_restore(flags); 1479 local_irq_restore(flags);
1499} 1480}
1500 1481
@@ -1544,13 +1525,10 @@ static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1544 */ 1525 */
1545static void rcu_cpu_kthread_timer(unsigned long arg) 1526static void rcu_cpu_kthread_timer(unsigned long arg)
1546{ 1527{
1547 unsigned long flags;
1548 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); 1528 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1549 struct rcu_node *rnp = rdp->mynode; 1529 struct rcu_node *rnp = rdp->mynode;
1550 1530
1551 raw_spin_lock_irqsave(&rnp->lock, flags); 1531 atomic_or(rdp->grpmask, &rnp->wakemask);
1552 rnp->wakemask |= rdp->grpmask;
1553 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1554 invoke_rcu_node_kthread(rnp); 1532 invoke_rcu_node_kthread(rnp);
1555} 1533}
1556 1534
@@ -1617,14 +1595,12 @@ static int rcu_cpu_kthread(void *arg)
1617 unsigned long flags; 1595 unsigned long flags;
1618 int spincnt = 0; 1596 int spincnt = 0;
1619 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); 1597 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1620 wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu);
1621 char work; 1598 char work;
1622 char *workp = &per_cpu(rcu_cpu_has_work, cpu); 1599 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1623 1600
1624 for (;;) { 1601 for (;;) {
1625 *statusp = RCU_KTHREAD_WAITING; 1602 *statusp = RCU_KTHREAD_WAITING;
1626 wait_event_interruptible(*wqp, 1603 rcu_wait(*workp != 0 || kthread_should_stop());
1627 *workp != 0 || kthread_should_stop());
1628 local_bh_disable(); 1604 local_bh_disable();
1629 if (rcu_cpu_kthread_should_stop(cpu)) { 1605 if (rcu_cpu_kthread_should_stop(cpu)) {
1630 local_bh_enable(); 1606 local_bh_enable();
@@ -1675,7 +1651,6 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1675 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; 1651 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1676 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); 1652 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1677 per_cpu(rcu_cpu_kthread_task, cpu) = t; 1653 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1678 wake_up_process(t);
1679 sp.sched_priority = RCU_KTHREAD_PRIO; 1654 sp.sched_priority = RCU_KTHREAD_PRIO;
1680 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1655 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1681 return 0; 1656 return 0;
@@ -1698,11 +1673,10 @@ static int rcu_node_kthread(void *arg)
1698 1673
1699 for (;;) { 1674 for (;;) {
1700 rnp->node_kthread_status = RCU_KTHREAD_WAITING; 1675 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1701 wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0); 1676 rcu_wait(atomic_read(&rnp->wakemask) != 0);
1702 rnp->node_kthread_status = RCU_KTHREAD_RUNNING; 1677 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1703 raw_spin_lock_irqsave(&rnp->lock, flags); 1678 raw_spin_lock_irqsave(&rnp->lock, flags);
1704 mask = rnp->wakemask; 1679 mask = atomic_xchg(&rnp->wakemask, 0);
1705 rnp->wakemask = 0;
1706 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ 1680 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1707 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { 1681 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1708 if ((mask & 0x1) == 0) 1682 if ((mask & 0x1) == 0)
@@ -1783,13 +1757,14 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1783 raw_spin_lock_irqsave(&rnp->lock, flags); 1757 raw_spin_lock_irqsave(&rnp->lock, flags);
1784 rnp->node_kthread_task = t; 1758 rnp->node_kthread_task = t;
1785 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1759 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1786 wake_up_process(t);
1787 sp.sched_priority = 99; 1760 sp.sched_priority = 99;
1788 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1761 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1789 } 1762 }
1790 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); 1763 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1791} 1764}
1792 1765
1766static void rcu_wake_one_boost_kthread(struct rcu_node *rnp);
1767
1793/* 1768/*
1794 * Spawn all kthreads -- called as soon as the scheduler is running. 1769 * Spawn all kthreads -- called as soon as the scheduler is running.
1795 */ 1770 */
@@ -1797,24 +1772,31 @@ static int __init rcu_spawn_kthreads(void)
1797{ 1772{
1798 int cpu; 1773 int cpu;
1799 struct rcu_node *rnp; 1774 struct rcu_node *rnp;
1775 struct task_struct *t;
1800 1776
1801 rcu_kthreads_spawnable = 1; 1777 rcu_kthreads_spawnable = 1;
1802 for_each_possible_cpu(cpu) { 1778 for_each_possible_cpu(cpu) {
1803 init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu));
1804 per_cpu(rcu_cpu_has_work, cpu) = 0; 1779 per_cpu(rcu_cpu_has_work, cpu) = 0;
1805 if (cpu_online(cpu)) 1780 if (cpu_online(cpu)) {
1806 (void)rcu_spawn_one_cpu_kthread(cpu); 1781 (void)rcu_spawn_one_cpu_kthread(cpu);
1782 t = per_cpu(rcu_cpu_kthread_task, cpu);
1783 if (t)
1784 wake_up_process(t);
1785 }
1807 } 1786 }
1808 rnp = rcu_get_root(rcu_state); 1787 rnp = rcu_get_root(rcu_state);
1809 init_waitqueue_head(&rnp->node_wq);
1810 rcu_init_boost_waitqueue(rnp);
1811 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1788 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1812 if (NUM_RCU_NODES > 1) 1789 if (rnp->node_kthread_task)
1790 wake_up_process(rnp->node_kthread_task);
1791 if (NUM_RCU_NODES > 1) {
1813 rcu_for_each_leaf_node(rcu_state, rnp) { 1792 rcu_for_each_leaf_node(rcu_state, rnp) {
1814 init_waitqueue_head(&rnp->node_wq);
1815 rcu_init_boost_waitqueue(rnp);
1816 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1793 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1794 t = rnp->node_kthread_task;
1795 if (t)
1796 wake_up_process(t);
1797 rcu_wake_one_boost_kthread(rnp);
1817 } 1798 }
1799 }
1818 return 0; 1800 return 0;
1819} 1801}
1820early_initcall(rcu_spawn_kthreads); 1802early_initcall(rcu_spawn_kthreads);
@@ -2218,14 +2200,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2218 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2200 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2219} 2201}
2220 2202
2221static void __cpuinit rcu_online_cpu(int cpu) 2203static void __cpuinit rcu_prepare_cpu(int cpu)
2222{ 2204{
2223 rcu_init_percpu_data(cpu, &rcu_sched_state, 0); 2205 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
2224 rcu_init_percpu_data(cpu, &rcu_bh_state, 0); 2206 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
2225 rcu_preempt_init_percpu_data(cpu); 2207 rcu_preempt_init_percpu_data(cpu);
2226} 2208}
2227 2209
2228static void __cpuinit rcu_online_kthreads(int cpu) 2210static void __cpuinit rcu_prepare_kthreads(int cpu)
2229{ 2211{
2230 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2212 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2231 struct rcu_node *rnp = rdp->mynode; 2213 struct rcu_node *rnp = rdp->mynode;
@@ -2239,6 +2221,31 @@ static void __cpuinit rcu_online_kthreads(int cpu)
2239} 2221}
2240 2222
2241/* 2223/*
2224 * kthread_create() creates threads in TASK_UNINTERRUPTIBLE state,
2225 * but the RCU threads are woken on demand, and if demand is low this
2226 * could be a while, triggering the hung task watchdog.
2227 *
2228 * In order to avoid this, poke all tasks once the CPU is fully
2229 * up and running.
2230 */
2231static void __cpuinit rcu_online_kthreads(int cpu)
2232{
2233 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2234 struct rcu_node *rnp = rdp->mynode;
2235 struct task_struct *t;
2236
2237 t = per_cpu(rcu_cpu_kthread_task, cpu);
2238 if (t)
2239 wake_up_process(t);
2240
2241 t = rnp->node_kthread_task;
2242 if (t)
2243 wake_up_process(t);
2244
2245 rcu_wake_one_boost_kthread(rnp);
2246}
2247
2248/*
2242 * Handle CPU online/offline notification events. 2249 * Handle CPU online/offline notification events.
2243 */ 2250 */
2244static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 2251static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
@@ -2251,10 +2258,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2251 switch (action) { 2258 switch (action) {
2252 case CPU_UP_PREPARE: 2259 case CPU_UP_PREPARE:
2253 case CPU_UP_PREPARE_FROZEN: 2260 case CPU_UP_PREPARE_FROZEN:
2254 rcu_online_cpu(cpu); 2261 rcu_prepare_cpu(cpu);
2255 rcu_online_kthreads(cpu); 2262 rcu_prepare_kthreads(cpu);
2256 break; 2263 break;
2257 case CPU_ONLINE: 2264 case CPU_ONLINE:
2265 rcu_online_kthreads(cpu);
2258 case CPU_DOWN_FAILED: 2266 case CPU_DOWN_FAILED:
2259 rcu_node_kthread_setaffinity(rnp, -1); 2267 rcu_node_kthread_setaffinity(rnp, -1);
2260 rcu_cpu_kthread_setrt(cpu, 1); 2268 rcu_cpu_kthread_setrt(cpu, 1);
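
The rcutree.c rework collapses the separate dynticks and dynticks_nmi counters into one atomic_t with an even/odd protocol: the counter is odd while the CPU is non-idle and even while it is idle, and atomic_add_return(0, ...) serves as a fully ordered read. A grace period may treat a CPU as quiescent if its counter is even, or has advanced by at least two since the snapshot, because the CPU must then have passed through idle in between. A userspace sketch of the snapshot-and-compare logic, with seq_cst atomics standing in for the kernel's barrier pairs:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned int dynticks = 1;       /* odd: starts non-idle */

static void enter_idle(void) { atomic_fetch_add(&dynticks, 1); } /* -> even */
static void exit_idle(void)  { atomic_fetch_add(&dynticks, 1); } /* -> odd */

static bool in_quiescent_state(unsigned int snap)
{
        unsigned int curr = atomic_load(&dynticks);

        /* even now, or passed through idle since the snapshot */
        return (curr & 0x1) == 0 || curr - snap >= 2;
}

int main(void)
{
        unsigned int snap = atomic_load(&dynticks);

        printf("%d\n", in_quiescent_state(snap));       /* 0: still running */
        enter_idle();
        exit_idle();
        printf("%d\n", in_quiescent_state(snap));       /* 1: passed idle */
        return 0;
}
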
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 257664815d5d..7b9a08b4aaea 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,11 +84,9 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track nesting level, sort of. */ 87 int dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks; /* Even value for dynticks-idle, else odd. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
89 int dynticks_nmi; /* Even value for either dynticks-idle or */ 89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
90 /* not in nmi handler, else odd. So this */
91 /* remains even for nmi from irq handler. */
92}; 90};
93 91
94/* RCU's kthread states for tracing. */ 92/* RCU's kthread states for tracing. */
@@ -121,7 +119,9 @@ struct rcu_node {
121 /* elements that need to drain to allow the */ 119 /* elements that need to drain to allow the */
122 /* current expedited grace period to */ 120 /* current expedited grace period to */
123 /* complete (only for TREE_PREEMPT_RCU). */ 121 /* complete (only for TREE_PREEMPT_RCU). */
124 unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */ 122 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
123 /* Since this has meaning only for leaf */
124 /* rcu_node structures, 32 bits suffices. */
125 unsigned long qsmaskinit; 125 unsigned long qsmaskinit;
126 /* Per-GP initial value for qsmask & expmask. */ 126 /* Per-GP initial value for qsmask & expmask. */
127 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 127 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -159,9 +159,6 @@ struct rcu_node {
159 struct task_struct *boost_kthread_task; 159 struct task_struct *boost_kthread_task;
160 /* kthread that takes care of priority */ 160 /* kthread that takes care of priority */
161 /* boosting for this rcu_node structure. */ 161 /* boosting for this rcu_node structure. */
162 wait_queue_head_t boost_wq;
163 /* Wait queue on which to park the boost */
164 /* kthread. */
165 unsigned int boost_kthread_status; 162 unsigned int boost_kthread_status;
166 /* State of boost_kthread_task for tracing. */ 163 /* State of boost_kthread_task for tracing. */
167 unsigned long n_tasks_boosted; 164 unsigned long n_tasks_boosted;
@@ -188,9 +185,6 @@ struct rcu_node {
188 /* kthread that takes care of this rcu_node */ 185 /* kthread that takes care of this rcu_node */
189 /* structure, for example, awakening the */ 186 /* structure, for example, awakening the */
190 /* per-CPU kthreads as needed. */ 187 /* per-CPU kthreads as needed. */
191 wait_queue_head_t node_wq;
192 /* Wait queue on which to park the per-node */
193 /* kthread. */
194 unsigned int node_kthread_status; 188 unsigned int node_kthread_status;
195 /* State of node_kthread_task for tracing. */ 189 /* State of node_kthread_task for tracing. */
196} ____cacheline_internodealigned_in_smp; 190} ____cacheline_internodealigned_in_smp;
@@ -284,7 +278,6 @@ struct rcu_data {
284 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
285 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
286 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
287 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
288#endif /* #ifdef CONFIG_NO_HZ */ 281#endif /* #ifdef CONFIG_NO_HZ */
289 282
290 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
@@ -337,6 +330,16 @@ struct rcu_data {
337 /* scheduling clock irq */ 330 /* scheduling clock irq */
338 /* before ratting on them. */ 331 /* before ratting on them. */
339 332
333#define rcu_wait(cond) \
334do { \
335 for (;;) { \
336 set_current_state(TASK_INTERRUPTIBLE); \
337 if (cond) \
338 break; \
339 schedule(); \
340 } \
341 __set_current_state(TASK_RUNNING); \
342} while (0)
340 343
341/* 344/*
342 * RCU global state, including node hierarchy. This hierarchy is 345 * RCU global state, including node hierarchy. This hierarchy is
@@ -446,7 +449,6 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
446static void rcu_preempt_send_cbs_to_online(void); 449static void rcu_preempt_send_cbs_to_online(void);
447static void __init __rcu_init_preempt(void); 450static void __init __rcu_init_preempt(void);
448static void rcu_needs_cpu_flush(void); 451static void rcu_needs_cpu_flush(void);
449static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp);
450static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 452static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
451static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, 453static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
452 cpumask_var_t cm); 454 cpumask_var_t cm);
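The new rcu_wait() macro above is what lets the boost_wq/node_wq waitqueues be deleted: each kthread is woken directly with wake_up_process(), so a bare prepare-to-sleep loop suffices and no wait_queue_head_t needs to live in struct rcu_node. A hedged sketch of a kthread body using the same loop (demo names are hypothetical):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <asm/atomic.h>

/* The same shape rcu_wait(cond) expands to, inside a kthread. */
static int demo_waiter(void *arg)
{
        atomic_t *work = arg;

        while (!kthread_should_stop()) {
                for (;;) {
                        /* Publish the intent to sleep first... */
                        set_current_state(TASK_INTERRUPTIBLE);
                        /* ...then re-check, so a wake_up_process() that
                         * lands between the check and schedule() is not
                         * lost. */
                        if (atomic_read(work) || kthread_should_stop())
                                break;
                        schedule();
                }
                __set_current_state(TASK_RUNNING);
                atomic_set(work, 0);    /* consume the work */
        }
        return 0;
}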
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3f6559a5f5cd..c8bff3099a89 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1196,8 +1196,7 @@ static int rcu_boost_kthread(void *arg)
1196 1196
1197 for (;;) { 1197 for (;;) {
1198 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1198 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1199 wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks || 1199 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1200 rnp->exp_tasks);
1201 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1200 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1202 more2boost = rcu_boost(rnp); 1201 more2boost = rcu_boost(rnp);
1203 if (more2boost) 1202 if (more2boost)
@@ -1275,14 +1274,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1275} 1274}
1276 1275
1277/* 1276/*
1278 * Initialize the RCU-boost waitqueue.
1279 */
1280static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
1281{
1282 init_waitqueue_head(&rnp->boost_wq);
1283}
1284
1285/*
1286 * Create an RCU-boost kthread for the specified node if one does not 1277 * Create an RCU-boost kthread for the specified node if one does not
1287 * already exist. We only create this kthread for preemptible RCU. 1278 * already exist. We only create this kthread for preemptible RCU.
1288 * Returns zero if all is well, a negated errno otherwise. 1279 * Returns zero if all is well, a negated errno otherwise.
@@ -1306,12 +1297,17 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1306 raw_spin_lock_irqsave(&rnp->lock, flags); 1297 raw_spin_lock_irqsave(&rnp->lock, flags);
1307 rnp->boost_kthread_task = t; 1298 rnp->boost_kthread_task = t;
1308 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1299 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1309 wake_up_process(t);
1310 sp.sched_priority = RCU_KTHREAD_PRIO; 1300 sp.sched_priority = RCU_KTHREAD_PRIO;
1311 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1301 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1312 return 0; 1302 return 0;
1313} 1303}
1314 1304
1305static void __cpuinit rcu_wake_one_boost_kthread(struct rcu_node *rnp)
1306{
1307 if (rnp->boost_kthread_task)
1308 wake_up_process(rnp->boost_kthread_task);
1309}
1310
1315#else /* #ifdef CONFIG_RCU_BOOST */ 1311#else /* #ifdef CONFIG_RCU_BOOST */
1316 1312
1317static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1313static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1328,10 +1324,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1328{ 1324{
1329} 1325}
1330 1326
1331static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
1332{
1333}
1334
1335static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1327static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1336 struct rcu_node *rnp, 1328 struct rcu_node *rnp,
1337 int rnp_index) 1329 int rnp_index)
@@ -1339,6 +1331,10 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1339 return 0; 1331 return 0;
1340} 1332}
1341 1333
1334static void __cpuinit rcu_wake_one_boost_kthread(struct rcu_node *rnp)
1335{
1336}
1337
1342#endif /* #else #ifdef CONFIG_RCU_BOOST */ 1338#endif /* #else #ifdef CONFIG_RCU_BOOST */
1343 1339
1344#ifndef CONFIG_SMP 1340#ifndef CONFIG_SMP
@@ -1520,7 +1516,6 @@ int rcu_needs_cpu(int cpu)
1520{ 1516{
1521 int c = 0; 1517 int c = 0;
1522 int snap; 1518 int snap;
1523 int snap_nmi;
1524 int thatcpu; 1519 int thatcpu;
1525 1520
1526 /* Check for being in the holdoff period. */ 1521 /* Check for being in the holdoff period. */
@@ -1531,10 +1526,10 @@ int rcu_needs_cpu(int cpu)
1531 for_each_online_cpu(thatcpu) { 1526 for_each_online_cpu(thatcpu) {
1532 if (thatcpu == cpu) 1527 if (thatcpu == cpu)
1533 continue; 1528 continue;
1534 snap = per_cpu(rcu_dynticks, thatcpu).dynticks; 1529 snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
1535 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; 1530 thatcpu).dynticks);
1536 smp_mb(); /* Order sampling of snap with end of grace period. */ 1531 smp_mb(); /* Order sampling of snap with end of grace period. */
1537 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { 1532 if ((snap & 0x1) != 0) {
1538 per_cpu(rcu_dyntick_drain, cpu) = 0; 1533 per_cpu(rcu_dyntick_drain, cpu) = 0;
1539 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1534 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1540 return rcu_needs_cpu_quick_check(cpu); 1535 return rcu_needs_cpu_quick_check(cpu);
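Folding dynticks_nmi into a single atomic_t keeps the even/odd convention: the counter is bumped on every idle entry and exit, so an odd snapshot means the CPU is not in dynticks-idle, whether it got there via irq or NMI. The atomic_add_return(0, ...) above is simply a fully ordered read. A userspace C11 model of the convention, for illustration only:

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int dynticks;     /* even: dynticks-idle, odd: active */

static void exit_idle(void)  { atomic_fetch_add(&dynticks, 1); }  /* -> odd */
static void enter_idle(void) { atomic_fetch_add(&dynticks, 1); }  /* -> even */

/* What rcu_needs_cpu() does per remote CPU: snapshot, then test parity. */
static bool cpu_is_active(void)
{
        /* seq_cst load models the fully ordered atomic_add_return(0, ...) */
        return atomic_load(&dynticks) & 0x1;
}

int main(void)
{
        exit_idle();
        assert(cpu_is_active());
        enter_idle();
        assert(!cpu_is_active());
        return 0;
}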
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index aa0fd72b4bc7..9678cc3650f5 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -69,10 +69,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
69 rdp->passed_quiesc, rdp->passed_quiesc_completed, 69 rdp->passed_quiesc, rdp->passed_quiesc_completed,
70 rdp->qs_pending); 70 rdp->qs_pending);
71#ifdef CONFIG_NO_HZ 71#ifdef CONFIG_NO_HZ
72 seq_printf(m, " dt=%d/%d dn=%d df=%lu", 72 seq_printf(m, " dt=%d/%d/%d df=%lu",
73 rdp->dynticks->dynticks, 73 atomic_read(&rdp->dynticks->dynticks),
74 rdp->dynticks->dynticks_nesting, 74 rdp->dynticks->dynticks_nesting,
75 rdp->dynticks->dynticks_nmi, 75 rdp->dynticks->dynticks_nmi_nesting,
76 rdp->dynticks_fqs); 76 rdp->dynticks_fqs);
77#endif /* #ifdef CONFIG_NO_HZ */ 77#endif /* #ifdef CONFIG_NO_HZ */
78 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 78 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
@@ -141,9 +141,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
141 rdp->qs_pending); 141 rdp->qs_pending);
142#ifdef CONFIG_NO_HZ 142#ifdef CONFIG_NO_HZ
143 seq_printf(m, ",%d,%d,%d,%lu", 143 seq_printf(m, ",%d,%d,%d,%lu",
144 rdp->dynticks->dynticks, 144 atomic_read(&rdp->dynticks->dynticks),
145 rdp->dynticks->dynticks_nesting, 145 rdp->dynticks->dynticks_nesting,
146 rdp->dynticks->dynticks_nmi, 146 rdp->dynticks->dynticks_nmi_nesting,
147 rdp->dynticks_fqs); 147 rdp->dynticks_fqs);
148#endif /* #ifdef CONFIG_NO_HZ */ 148#endif /* #ifdef CONFIG_NO_HZ */
149 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 149 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
@@ -167,7 +167,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
167{ 167{
168 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); 168 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
169#ifdef CONFIG_NO_HZ 169#ifdef CONFIG_NO_HZ
170 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
171#endif /* #ifdef CONFIG_NO_HZ */ 171#endif /* #ifdef CONFIG_NO_HZ */
172 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); 172 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
173#ifdef CONFIG_TREE_PREEMPT_RCU 173#ifdef CONFIG_TREE_PREEMPT_RCU
diff --git a/kernel/sched.c b/kernel/sched.c
index 2d12893b8b0f..3f2e502d609b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -605,10 +605,10 @@ static inline int cpu_of(struct rq *rq)
605/* 605/*
606 * Return the group to which this task belongs. 606 * Return the group to which this task belongs.
607 * 607 *
608 * We use task_subsys_state_check() and extend the RCU verification 608 * We use task_subsys_state_check() and extend the RCU verification with
609 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() 609 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
610 * holds that lock for each task it moves into the cgroup. Therefore 610 * task it moves into the cgroup. Therefore by holding either of those locks,
611 * by holding that lock, we pin the task to the current cgroup. 611 * we pin the task to the current cgroup.
612 */ 612 */
613static inline struct task_group *task_group(struct task_struct *p) 613static inline struct task_group *task_group(struct task_struct *p)
614{ 614{
@@ -616,7 +616,8 @@ static inline struct task_group *task_group(struct task_struct *p)
616 struct cgroup_subsys_state *css; 616 struct cgroup_subsys_state *css;
617 617
618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
619 lockdep_is_held(&p->pi_lock)); 619 lockdep_is_held(&p->pi_lock) ||
620 lockdep_is_held(&task_rq(p)->lock));
620 tg = container_of(css, struct task_group, css); 621 tg = container_of(css, struct task_group, css);
621 622
622 return autogroup_task_group(p, tg); 623 return autogroup_task_group(p, tg);
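task_subsys_state_check() is built on rcu_dereference_check(), which accepts extra "or this lock is held" conditions, so either an RCU read-side critical section or one of the pinning locks legitimizes the dereference under lockdep. A minimal sketch of that idiom on a hypothetical structure:

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct demo_state;

struct demo {
        struct demo_state __rcu *state;
        spinlock_t lock;        /* writers hold this while swapping ->state */
};

static struct demo_state *demo_state(struct demo *d)
{
        /* Legal under rcu_read_lock() OR while holding d->lock, the
         * same dual justification the task_group() hunk above adds
         * for p->pi_lock and rq->lock. */
        return rcu_dereference_check(d->state, lockdep_is_held(&d->lock));
}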
@@ -2200,6 +2201,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2200 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2201 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2201 2202
2202#ifdef CONFIG_LOCKDEP 2203#ifdef CONFIG_LOCKDEP
2204 /*
2205 * The caller should hold either p->pi_lock or rq->lock, when changing
2206 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2207 *
2208 * sched_move_task() holds both and thus holding either pins the cgroup,
2209 * see set_task_rq().
2210 *
2211 * Furthermore, all task_rq users should acquire both locks, see
2212 * task_rq_lock().
2213 */
2203 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 2214 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2204 lockdep_is_held(&task_rq(p)->lock))); 2215 lockdep_is_held(&task_rq(p)->lock)));
2205#endif 2216#endif
@@ -2447,6 +2458,10 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2447 } 2458 }
2448 rcu_read_unlock(); 2459 rcu_read_unlock();
2449 } 2460 }
2461
2462 if (wake_flags & WF_MIGRATED)
2463 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2464
2450#endif /* CONFIG_SMP */ 2465#endif /* CONFIG_SMP */
2451 2466
2452 schedstat_inc(rq, ttwu_count); 2467 schedstat_inc(rq, ttwu_count);
@@ -2455,9 +2470,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2455 if (wake_flags & WF_SYNC) 2470 if (wake_flags & WF_SYNC)
2456 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2471 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2457 2472
2458 if (cpu != task_cpu(p))
2459 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2460
2461#endif /* CONFIG_SCHEDSTATS */ 2473#endif /* CONFIG_SCHEDSTATS */
2462} 2474}
2463 2475
@@ -2573,7 +2585,26 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
2573 if (!next) 2585 if (!next)
2574 smp_send_reschedule(cpu); 2586 smp_send_reschedule(cpu);
2575} 2587}
2576#endif 2588
2589#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2590static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2591{
2592 struct rq *rq;
2593 int ret = 0;
2594
2595 rq = __task_rq_lock(p);
2596 if (p->on_cpu) {
2597 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2598 ttwu_do_wakeup(rq, p, wake_flags);
2599 ret = 1;
2600 }
2601 __task_rq_unlock(rq);
2602
2603 return ret;
2604
2605}
2606#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2607#endif /* CONFIG_SMP */
2577 2608
2578static void ttwu_queue(struct task_struct *p, int cpu) 2609static void ttwu_queue(struct task_struct *p, int cpu)
2579{ 2610{
@@ -2581,6 +2612,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
2581 2612
2582#if defined(CONFIG_SMP) 2613#if defined(CONFIG_SMP)
2583 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 2614 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2615 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2584 ttwu_queue_remote(p, cpu); 2616 ttwu_queue_remote(p, cpu);
2585 return; 2617 return;
2586 } 2618 }
@@ -2631,17 +2663,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2631 while (p->on_cpu) { 2663 while (p->on_cpu) {
2632#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2664#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2633 /* 2665 /*
2634 * If called from interrupt context we could have landed in the 2666 * In case the architecture enables interrupts in
2635 * middle of schedule(), in this case we should take care not 2667 * context_switch(), we cannot busy wait, since that
2636 * to spin on ->on_cpu if p is current, since that would 2668 * would lead to deadlocks when an interrupt hits and
2637 * deadlock. 2669 * tries to wake up @prev. So bail and do a complete
2670 * remote wakeup.
2638 */ 2671 */
2639 if (p == current) { 2672 if (ttwu_activate_remote(p, wake_flags))
2640 ttwu_queue(p, cpu);
2641 goto stat; 2673 goto stat;
2642 } 2674#else
2643#endif
2644 cpu_relax(); 2675 cpu_relax();
2676#endif
2645 } 2677 }
2646 /* 2678 /*
2647 * Pairs with the smp_wmb() in finish_lock_switch(). 2679 * Pairs with the smp_wmb() in finish_lock_switch().
@@ -2655,8 +2687,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2655 p->sched_class->task_waking(p); 2687 p->sched_class->task_waking(p);
2656 2688
2657 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2689 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2658 if (task_cpu(p) != cpu) 2690 if (task_cpu(p) != cpu) {
2691 wake_flags |= WF_MIGRATED;
2659 set_task_cpu(p, cpu); 2692 set_task_cpu(p, cpu);
2693 }
2660#endif /* CONFIG_SMP */ 2694#endif /* CONFIG_SMP */
2661 2695
2662 ttwu_queue(p, cpu); 2696 ttwu_queue(p, cpu);
@@ -5841,7 +5875,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5841 idle->state = TASK_RUNNING; 5875 idle->state = TASK_RUNNING;
5842 idle->se.exec_start = sched_clock(); 5876 idle->se.exec_start = sched_clock();
5843 5877
5844 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5878 do_set_cpus_allowed(idle, cpumask_of(cpu));
5845 /* 5879 /*
5846 * We're having a chicken and egg problem, even though we are 5880 * We're having a chicken and egg problem, even though we are
5847 * holding rq->lock, the cpu isn't yet set to this cpu so the 5881 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5929,6 +5963,16 @@ static inline void sched_init_granularity(void)
5929} 5963}
5930 5964
5931#ifdef CONFIG_SMP 5965#ifdef CONFIG_SMP
5966void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
5967{
5968 if (p->sched_class && p->sched_class->set_cpus_allowed)
5969 p->sched_class->set_cpus_allowed(p, new_mask);
5970 else {
5971 cpumask_copy(&p->cpus_allowed, new_mask);
5972 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5973 }
5974}
5975
5932/* 5976/*
5933 * This is how migration works: 5977 * This is how migration works:
5934 * 5978 *
@@ -5974,12 +6018,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5974 goto out; 6018 goto out;
5975 } 6019 }
5976 6020
5977 if (p->sched_class->set_cpus_allowed) 6021 do_set_cpus_allowed(p, new_mask);
5978 p->sched_class->set_cpus_allowed(p, new_mask);
5979 else {
5980 cpumask_copy(&p->cpus_allowed, new_mask);
5981 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5982 }
5983 6022
5984 /* Can the task run on the task's current CPU? If so, we're done */ 6023 /* Can the task run on the task's current CPU? If so, we're done */
5985 if (cpumask_test_cpu(task_cpu(p), new_mask)) 6024 if (cpumask_test_cpu(task_cpu(p), new_mask))
@@ -8764,42 +8803,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8764 return 0; 8803 return 0;
8765} 8804}
8766 8805
8767static int
8768cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8769 struct task_struct *tsk, bool threadgroup)
8770{
8771 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
8772 if (retval)
8773 return retval;
8774 if (threadgroup) {
8775 struct task_struct *c;
8776 rcu_read_lock();
8777 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8778 retval = cpu_cgroup_can_attach_task(cgrp, c);
8779 if (retval) {
8780 rcu_read_unlock();
8781 return retval;
8782 }
8783 }
8784 rcu_read_unlock();
8785 }
8786 return 0;
8787}
8788
8789static void 8806static void
8790cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8807cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8791 struct cgroup *old_cont, struct task_struct *tsk,
8792 bool threadgroup)
8793{ 8808{
8794 sched_move_task(tsk); 8809 sched_move_task(tsk);
8795 if (threadgroup) {
8796 struct task_struct *c;
8797 rcu_read_lock();
8798 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8799 sched_move_task(c);
8800 }
8801 rcu_read_unlock();
8802 }
8803} 8810}
8804 8811
8805static void 8812static void
@@ -8887,8 +8894,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8887 .name = "cpu", 8894 .name = "cpu",
8888 .create = cpu_cgroup_create, 8895 .create = cpu_cgroup_create,
8889 .destroy = cpu_cgroup_destroy, 8896 .destroy = cpu_cgroup_destroy,
8890 .can_attach = cpu_cgroup_can_attach, 8897 .can_attach_task = cpu_cgroup_can_attach_task,
8891 .attach = cpu_cgroup_attach, 8898 .attach_task = cpu_cgroup_attach_task,
8892 .exit = cpu_cgroup_exit, 8899 .exit = cpu_cgroup_exit,
8893 .populate = cpu_cgroup_populate, 8900 .populate = cpu_cgroup_populate,
8894 .subsys_id = cpu_cgroup_subsys_id, 8901 .subsys_id = cpu_cgroup_subsys_id,
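The conversion above drops the open-coded threadgroup loops: with the new .can_attach_task/.attach_task hooks, the cgroup core walks the threadgroup itself and invokes the subsystem once per task. A hedged sketch of a subsystem using the per-task hooks; the field names and signatures come from this hunk, while the demo_* names are hypothetical:

#include <linux/cgroup.h>

static int demo_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
        /* Veto a single task; no threadgroup iteration needed here. */
        return 0;
}

static void demo_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
        /* Move exactly one task; the core calls this once per thread. */
}

struct cgroup_subsys demo_subsys = {
        .name            = "demo",
        .can_attach_task = demo_can_attach_task,
        .attach_task     = demo_attach_task,
};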
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e32a9b70ee9c..433491c2dc8f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1076,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1076 se->on_rq = 0; 1076 se->on_rq = 0;
1077 update_cfs_load(cfs_rq, 0); 1077 update_cfs_load(cfs_rq, 0);
1078 account_entity_dequeue(cfs_rq, se); 1078 account_entity_dequeue(cfs_rq, se);
1079 update_min_vruntime(cfs_rq);
1080 update_cfs_shares(cfs_rq);
1081 1079
1082 /* 1080 /*
1083 * Normalize the entity after updating the min_vruntime because the 1081 * Normalize the entity after updating the min_vruntime because the
@@ -1086,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1086 */ 1084 */
1087 if (!(flags & DEQUEUE_SLEEP)) 1085 if (!(flags & DEQUEUE_SLEEP))
1088 se->vruntime -= cfs_rq->min_vruntime; 1086 se->vruntime -= cfs_rq->min_vruntime;
1087
1088 update_min_vruntime(cfs_rq);
1089 update_cfs_shares(cfs_rq);
1089} 1090}
1090 1091
1091/* 1092/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 64b2a37c07d0..88725c939e0b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1263,6 +1263,7 @@ static int find_lowest_rq(struct task_struct *task)
1263 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1263 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1264 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1264 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1265 1265
1266 rcu_read_lock();
1266 for_each_domain(cpu, sd) { 1267 for_each_domain(cpu, sd) {
1267 if (sd->flags & SD_WAKE_AFFINE) { 1268 if (sd->flags & SD_WAKE_AFFINE) {
1268 int best_cpu; 1269 int best_cpu;
@@ -1272,15 +1273,20 @@ static int find_lowest_rq(struct task_struct *task)
1272 * remote processor. 1273 * remote processor.
1273 */ 1274 */
1274 if (this_cpu != -1 && 1275 if (this_cpu != -1 &&
1275 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) 1276 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1277 rcu_read_unlock();
1276 return this_cpu; 1278 return this_cpu;
1279 }
1277 1280
1278 best_cpu = cpumask_first_and(lowest_mask, 1281 best_cpu = cpumask_first_and(lowest_mask,
1279 sched_domain_span(sd)); 1282 sched_domain_span(sd));
1280 if (best_cpu < nr_cpu_ids) 1283 if (best_cpu < nr_cpu_ids) {
1284 rcu_read_unlock();
1281 return best_cpu; 1285 return best_cpu;
1286 }
1282 } 1287 }
1283 } 1288 }
1289 rcu_read_unlock();
1284 1290
1285 /* 1291 /*
1286 * And finally, if there were no matches within the domains 1292 * And finally, if there were no matches within the domains
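The sched-domain tree is RCU-protected, so the lookup above must run inside rcu_read_lock(), and every early return has to drop the lock on its way out (sched_stats.h below gets the same conversion, replacing preempt_disable()). The shape of the pattern, sketched on a generic RCU-protected list:

#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>

struct item {
        int key;
        struct list_head node;
};

static bool find_key(struct list_head *head, int key)
{
        struct item *it;

        rcu_read_lock();
        list_for_each_entry_rcu(it, head, node) {
                if (it->key == key) {
                        rcu_read_unlock();      /* pair on the early path too */
                        return true;
                }
        }
        rcu_read_unlock();
        return false;
}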
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 48ddf431db0e..331e01bcd026 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
37 37
38#ifdef CONFIG_SMP 38#ifdef CONFIG_SMP
39 /* domain-specific stats */ 39 /* domain-specific stats */
40 preempt_disable(); 40 rcu_read_lock();
41 for_each_domain(cpu, sd) { 41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype; 42 enum cpu_idle_type itype;
43 43
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
64 sd->ttwu_wake_remote, sd->ttwu_move_affine, 64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance); 65 sd->ttwu_move_balance);
66 } 66 }
67 preempt_enable(); 67 rcu_read_unlock();
68#endif 68#endif
69 } 69 }
70 kfree(mask_str); 70 kfree(mask_str);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4fc92445a29c..f175d98bd355 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -938,6 +938,12 @@ static struct ctl_table kern_table[] = {
938 }, 938 },
939#endif 939#endif
940#ifdef CONFIG_PERF_EVENTS 940#ifdef CONFIG_PERF_EVENTS
941 /*
942 * User-space scripts rely on the existence of this file
943 * as a feature check for perf_events being enabled.
944 *
945 * So it's an ABI, do not remove!
946 */
941 { 947 {
942 .procname = "perf_event_paranoid", 948 .procname = "perf_event_paranoid",
943 .data = &sysctl_perf_event_paranoid, 949 .data = &sysctl_perf_event_paranoid,
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index c027d4f602f1..e4c699dfa4e8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -182,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev)
182 unsigned long flags; 182 unsigned long flags;
183 183
184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
185 BUG_ON(!dev->cpumask); 185 if (!dev->cpumask) {
186 WARN_ON(num_possible_cpus() > 1);
187 dev->cpumask = cpumask_of(smp_processor_id());
188 }
186 189
187 raw_spin_lock_irqsave(&clockevents_lock, flags); 190 raw_spin_lock_irqsave(&clockevents_lock, flags);
188 191
diff --git a/kernel/timer.c b/kernel/timer.c
index fd6198692b57..8cff36119e4d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -749,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
749 unsigned long expires_limit, mask; 749 unsigned long expires_limit, mask;
750 int bit; 750 int bit;
751 751
752 expires_limit = expires;
753
754 if (timer->slack >= 0) { 752 if (timer->slack >= 0) {
755 expires_limit = expires + timer->slack; 753 expires_limit = expires + timer->slack;
756 } else { 754 } else {
757 unsigned long now = jiffies; 755 long delta = expires - jiffies;
756
757 if (delta < 256)
758 return expires;
758 759
759 /* No slack, if already expired else auto slack 0.4% */ 760 expires_limit = expires + delta / 256;
760 if (time_after(expires, now))
761 expires_limit = expires + (expires - now)/256;
762 } 761 }
763 mask = expires ^ expires_limit; 762 mask = expires ^ expires_limit;
764 if (mask == 0) 763 if (mask == 0)
@@ -795,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
795 */ 794 */
796int mod_timer(struct timer_list *timer, unsigned long expires) 795int mod_timer(struct timer_list *timer, unsigned long expires)
797{ 796{
797 expires = apply_slack(timer, expires);
798
798 /* 799 /*
799 * This is a common optimization triggered by the 800 * This is a common optimization triggered by the
800 * networking code - if the timer is re-modified 801 * networking code - if the timer is re-modified
@@ -803,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
803 if (timer_pending(timer) && timer->expires == expires) 804 if (timer_pending(timer) && timer->expires == expires)
804 return 1; 805 return 1;
805 806
806 expires = apply_slack(timer, expires);
807
808 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 807 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
809} 808}
810EXPORT_SYMBOL(mod_timer); 809EXPORT_SYMBOL(mod_timer);
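Two behavioral changes land here: timers closer than 256 jiffies now get no automatic slack at all, and mod_timer() applies slack before the pending-timer short-circuit, so a re-armed timer whose rounded deadline is unchanged returns early. A userspace model of the ~0.4% (delta/256) slack-and-round arithmetic, assuming 64-bit longs:

#include <stdio.h>

static unsigned long apply_slack_model(unsigned long expires, unsigned long now)
{
        long delta = (long)(expires - now);
        unsigned long expires_limit, mask;
        int bit;

        if (delta < 256)                /* too close: fire exactly on time */
                return expires;

        expires_limit = expires + delta / 256;  /* ~0.4% slack */
        mask = expires ^ expires_limit;         /* bits free to vary */
        if (mask == 0)
                return expires;

        bit = 63 - __builtin_clzl(mask);        /* highest differing bit */
        mask = (1UL << bit) - 1;
        return expires_limit & ~mask;           /* round down: coalesce */
}

int main(void)
{
        /* delta = 10000, limit = 100039, rounds down to the 64-jiffy
         * boundary 100032, where nearby timers can batch up. */
        printf("%lu\n", apply_slack_model(100000, 90000));
        return 0;
}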
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d017c2c82c44..1ee417fcbfa5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -109,12 +109,18 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
109static void ftrace_global_list_func(unsigned long ip, 109static void ftrace_global_list_func(unsigned long ip,
110 unsigned long parent_ip) 110 unsigned long parent_ip)
111{ 111{
112 struct ftrace_ops *op = rcu_dereference_raw(ftrace_global_list); /*see above*/ 112 struct ftrace_ops *op;
113
114 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
115 return;
113 116
117 trace_recursion_set(TRACE_GLOBAL_BIT);
118 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
114 while (op != &ftrace_list_end) { 119 while (op != &ftrace_list_end) {
115 op->func(ip, parent_ip); 120 op->func(ip, parent_ip);
116 op = rcu_dereference_raw(op->next); /*see above*/ 121 op = rcu_dereference_raw(op->next); /*see above*/
117 }; 122 };
123 trace_recursion_clear(TRACE_GLOBAL_BIT);
118} 124}
119 125
120static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) 126static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
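TRACE_GLOBAL_BIT guards ftrace_global_list_func() against self-recursion: if a callback's own code lands back in the tracer, the nested call sees the bit already set and bails instead of recursing until the stack overflows. A userspace model of the guard, with a __thread variable standing in for current->trace_recursion:

#include <stdio.h>

static __thread unsigned long trace_recursion;
#define GLOBAL_BIT      (1UL << 12)

static void traced_function(void);

static void tracer_list_func(void)
{
        if (trace_recursion & GLOBAL_BIT)       /* nested hit: drop it */
                return;
        trace_recursion |= GLOBAL_BIT;
        traced_function();      /* the callback may hit traced code itself */
        trace_recursion &= ~GLOBAL_BIT;
}

static void traced_function(void)
{
        tracer_list_func();     /* re-entry is refused, not recursed */
}

int main(void)
{
        tracer_list_func();
        puts("returned without unbounded recursion");
        return 0;
}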
@@ -1638,12 +1644,12 @@ static void ftrace_startup_enable(int command)
1638 ftrace_run_update_code(command); 1644 ftrace_run_update_code(command);
1639} 1645}
1640 1646
1641static void ftrace_startup(struct ftrace_ops *ops, int command) 1647static int ftrace_startup(struct ftrace_ops *ops, int command)
1642{ 1648{
1643 bool hash_enable = true; 1649 bool hash_enable = true;
1644 1650
1645 if (unlikely(ftrace_disabled)) 1651 if (unlikely(ftrace_disabled))
1646 return; 1652 return -ENODEV;
1647 1653
1648 ftrace_start_up++; 1654 ftrace_start_up++;
1649 command |= FTRACE_ENABLE_CALLS; 1655 command |= FTRACE_ENABLE_CALLS;
@@ -1662,6 +1668,8 @@ static void ftrace_startup(struct ftrace_ops *ops, int command)
1662 ftrace_hash_rec_enable(ops, 1); 1668 ftrace_hash_rec_enable(ops, 1);
1663 1669
1664 ftrace_startup_enable(command); 1670 ftrace_startup_enable(command);
1671
1672 return 0;
1665} 1673}
1666 1674
1667static void ftrace_shutdown(struct ftrace_ops *ops, int command) 1675static void ftrace_shutdown(struct ftrace_ops *ops, int command)
@@ -2501,7 +2509,7 @@ static void __enable_ftrace_function_probe(void)
2501 2509
2502 ret = __register_ftrace_function(&trace_probe_ops); 2510 ret = __register_ftrace_function(&trace_probe_ops);
2503 if (!ret) 2511 if (!ret)
2504 ftrace_startup(&trace_probe_ops, 0); 2512 ret = ftrace_startup(&trace_probe_ops, 0);
2505 2513
2506 ftrace_probe_registered = 1; 2514 ftrace_probe_registered = 1;
2507} 2515}
@@ -3466,7 +3474,11 @@ device_initcall(ftrace_nodyn_init);
3466static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 3474static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
3467static inline void ftrace_startup_enable(int command) { } 3475static inline void ftrace_startup_enable(int command) { }
3468/* Keep as macros so we do not need to define the commands */ 3476/* Keep as macros so we do not need to define the commands */
3469# define ftrace_startup(ops, command) do { } while (0) 3477# define ftrace_startup(ops, command) \
3478 ({ \
3479 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
3480 0; \
3481 })
3470# define ftrace_shutdown(ops, command) do { } while (0) 3482# define ftrace_shutdown(ops, command) do { } while (0)
3471# define ftrace_startup_sysctl() do { } while (0) 3483# define ftrace_startup_sysctl() do { } while (0)
3472# define ftrace_shutdown_sysctl() do { } while (0) 3484# define ftrace_shutdown_sysctl() do { } while (0)
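Because ftrace_startup() now reports failure, the !CONFIG_DYNAMIC_FTRACE stub can no longer be a do/while(0) macro: callers assign its result. The replacement uses a GNU statement expression, whose value is its final expression. A small userspace demo of the same construct (demo_* names are hypothetical):

#include <stdio.h>

struct demo_ops {
        unsigned long flags;
};
#define DEMO_ENABLED    0x1UL

/* Evaluates to 0, like the stub above, while keeping the side effect. */
#define demo_startup(ops)                       \
        ({                                      \
                (ops)->flags |= DEMO_ENABLED;   \
                0;                              \
        })

int main(void)
{
        struct demo_ops ops = { 0 };
        int ret = demo_startup(&ops);   /* assignable, unlike do {} while (0) */

        printf("ret=%d flags=%#lx\n", ret, ops.flags);
        return 0;
}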
@@ -3484,6 +3496,10 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3484{ 3496{
3485 struct ftrace_ops *op; 3497 struct ftrace_ops *op;
3486 3498
3499 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
3500 return;
3501
3502 trace_recursion_set(TRACE_INTERNAL_BIT);
3487 /* 3503 /*
3488 * Some of the ops may be dynamically allocated, 3504 * Some of the ops may be dynamically allocated,
3489 * they must be freed after a synchronize_sched(). 3505 * they must be freed after a synchronize_sched().
@@ -3496,6 +3512,7 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3496 op = rcu_dereference_raw(op->next); 3512 op = rcu_dereference_raw(op->next);
3497 }; 3513 };
3498 preempt_enable_notrace(); 3514 preempt_enable_notrace();
3515 trace_recursion_clear(TRACE_INTERNAL_BIT);
3499} 3516}
3500 3517
3501static void clear_ftrace_swapper(void) 3518static void clear_ftrace_swapper(void)
@@ -3799,7 +3816,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
3799 3816
3800 ret = __register_ftrace_function(ops); 3817 ret = __register_ftrace_function(ops);
3801 if (!ret) 3818 if (!ret)
3802 ftrace_startup(ops, 0); 3819 ret = ftrace_startup(ops, 0);
3803 3820
3804 3821
3805 out_unlock: 3822 out_unlock:
@@ -4045,7 +4062,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
4045 ftrace_graph_return = retfunc; 4062 ftrace_graph_return = retfunc;
4046 ftrace_graph_entry = entryfunc; 4063 ftrace_graph_entry = entryfunc;
4047 4064
4048 ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); 4065 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
4049 4066
4050out: 4067out:
4051 mutex_unlock(&ftrace_lock); 4068 mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0ef7b4b2a1f7..b0c7aa407943 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2216,7 +2216,7 @@ static noinline void trace_recursive_fail(void)
2216 2216
2217 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" 2217 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
2218 "HC[%lu]:SC[%lu]:NMI[%lu]\n", 2218 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
2219 current->trace_recursion, 2219 trace_recursion_buffer(),
2220 hardirq_count() >> HARDIRQ_SHIFT, 2220 hardirq_count() >> HARDIRQ_SHIFT,
2221 softirq_count() >> SOFTIRQ_SHIFT, 2221 softirq_count() >> SOFTIRQ_SHIFT,
2222 in_nmi()); 2222 in_nmi());
@@ -2226,9 +2226,9 @@ static noinline void trace_recursive_fail(void)
2226 2226
2227static inline int trace_recursive_lock(void) 2227static inline int trace_recursive_lock(void)
2228{ 2228{
2229 current->trace_recursion++; 2229 trace_recursion_inc();
2230 2230
2231 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) 2231 if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
2232 return 0; 2232 return 0;
2233 2233
2234 trace_recursive_fail(); 2234 trace_recursive_fail();
@@ -2238,9 +2238,9 @@ static inline int trace_recursive_lock(void)
2238 2238
2239static inline void trace_recursive_unlock(void) 2239static inline void trace_recursive_unlock(void)
2240{ 2240{
2241 WARN_ON_ONCE(!current->trace_recursion); 2241 WARN_ON_ONCE(!trace_recursion_buffer());
2242 2242
2243 current->trace_recursion--; 2243 trace_recursion_dec();
2244} 2244}
2245 2245
2246#else 2246#else
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6b69c4bd306f..229f8591f61d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -784,4 +784,19 @@ extern const char *__stop___trace_bprintk_fmt[];
784 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 784 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
785#include "trace_entries.h" 785#include "trace_entries.h"
786 786
787/* Only current can touch trace_recursion */
788#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
789#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
790
791/* Ring buffer has the 10 LSB bits to count */
792#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
793
794/* for function tracing recursion */
795#define TRACE_INTERNAL_BIT (1<<11)
796#define TRACE_GLOBAL_BIT (1<<12)
797
798#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
799#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
800#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
801
787#endif /* _LINUX_KERNEL_TRACE_H */ 802#endif /* _LINUX_KERNEL_TRACE_H */
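The new layout packs everything into one per-task word: a 10-bit ring-buffer depth counter in the low bits, with the function-tracing flag bits above it, so incrementing one never disturbs the other. A quick userspace check of that invariant, using the constants from the hunk above:

#include <assert.h>

#define BUF_MASK        0x3ffUL         /* trace_recursion_buffer() */
#define INTERNAL_BIT    (1UL << 11)
#define GLOBAL_BIT      (1UL << 12)

int main(void)
{
        unsigned long rec = 0;

        rec++;                          /* trace_recursion_inc() */
        rec |= GLOBAL_BIT;              /* trace_recursion_set() */
        assert((rec & BUF_MASK) == 1);  /* counter untouched by the flag */
        assert(rec & GLOBAL_BIT);

        rec &= ~GLOBAL_BIT;             /* trace_recursion_clear() */
        rec--;                          /* trace_recursion_dec() */
        assert(rec == 0);
        return 0;
}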
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2fe110341359..686ec399f2a8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1657,7 +1657,12 @@ static struct ftrace_ops trace_ops __initdata =
1657 1657
1658static __init void event_trace_self_test_with_function(void) 1658static __init void event_trace_self_test_with_function(void)
1659{ 1659{
1660 register_ftrace_function(&trace_ops); 1660 int ret;
1661 ret = register_ftrace_function(&trace_ops);
1662 if (WARN_ON(ret < 0)) {
1663 pr_info("Failed to enable function tracer for event tests\n");
1664 return;
1665 }
1661 pr_info("Running tests again, along with the function tracer\n"); 1666 pr_info("Running tests again, along with the function tracer\n");
1662 event_trace_self_tests(); 1667 event_trace_self_tests();
1663 unregister_ftrace_function(&trace_ops); 1668 unregister_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index cf535ccedc86..e37de492a9e1 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
353} 353}
354EXPORT_SYMBOL(ftrace_print_symbols_seq); 354EXPORT_SYMBOL(ftrace_print_symbols_seq);
355 355
356#if BITS_PER_LONG == 32
357const char *
358ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
359 const struct trace_print_flags_u64 *symbol_array)
360{
361 int i;
362 const char *ret = p->buffer + p->len;
363
364 for (i = 0; symbol_array[i].name; i++) {
365
366 if (val != symbol_array[i].mask)
367 continue;
368
369 trace_seq_puts(p, symbol_array[i].name);
370 break;
371 }
372
373 if (!p->len)
374 trace_seq_printf(p, "0x%llx", val);
375
376 trace_seq_putc(p, 0);
377
378 return ret;
379}
380EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
381#endif
382
356const char * 383const char *
357ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 384ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
358{ 385{
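On 32-bit kernels an unsigned long cannot hold a 64-bit value, hence the _u64 variant above. A hedged usage sketch: the NULL .name terminator matches the loop's exit test in the function body, and the table is assumed to share the { mask, name } layout of struct trace_print_flags; the demo names are hypothetical:

#include <linux/ftrace_event.h>

static const struct trace_print_flags_u64 demo_symbols[] = {
        { 0x100000000ULL, "BIG" },
        { 0x300000000ULL, "BIGGER" },
        { 0, NULL }                     /* .name == NULL ends the scan */
};

/* Called from a print handler; falls back to "0x%llx" on no match. */
static const char *demo_show(struct trace_seq *p, unsigned long long val)
{
        return ftrace_print_symbols_seq_u64(p, val, demo_symbols);
}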
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 44646179eaba..bff131b9510a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,7 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/proc_fs.h>
18 19
19static struct uts_namespace *create_uts_ns(void) 20static struct uts_namespace *create_uts_ns(void)
20{ 21{
@@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref)
79 put_user_ns(ns->user_ns); 80 put_user_ns(ns->user_ns);
80 kfree(ns); 81 kfree(ns);
81} 82}
83
84static void *utsns_get(struct task_struct *task)
85{
86 struct uts_namespace *ns = NULL;
87 struct nsproxy *nsproxy;
88
89 rcu_read_lock();
90 nsproxy = task_nsproxy(task);
91 if (nsproxy) {
92 ns = nsproxy->uts_ns;
93 get_uts_ns(ns);
94 }
95 rcu_read_unlock();
96
97 return ns;
98}
99
100static void utsns_put(void *ns)
101{
102 put_uts_ns(ns);
103}
104
105static int utsns_install(struct nsproxy *nsproxy, void *ns)
106{
107 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns;
110 return 0;
111}
112
113const struct proc_ns_operations utsns_operations = {
114 .name = "uts",
115 .type = CLONE_NEWUTS,
116 .get = utsns_get,
117 .put = utsns_put,
118 .install = utsns_install,
119};
120
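The utsns_operations table above implements a reference-counting contract: .get pins the namespace for an open /proc/<pid>/ns/uts file, .put drops that pin, and .install takes its own reference before releasing the nsproxy's old one. A hedged sketch of the same contract on a hypothetical kref-counted object:

#include <linux/kref.h>
#include <linux/slab.h>

struct demo_ns {
        struct kref kref;
};

static void demo_ns_free(struct kref *kref)
{
        kfree(container_of(kref, struct demo_ns, kref));
}

static void *demo_get(struct demo_ns *ns)
{
        kref_get(&ns->kref);    /* pin for the caller */
        return ns;
}

static void demo_put(void *ns)
{
        kref_put(&((struct demo_ns *)ns)->kref, demo_ns_free);
}

static int demo_install(struct demo_ns **slot, struct demo_ns *newns)
{
        demo_get(newns);        /* reference now owned by the slot */
        demo_put(*slot);        /* release the previous occupant */
        *slot = newns;
        return 0;
}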
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7daa4b072e9f..3d0c56ad4792 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -415,15 +415,13 @@ static void watchdog_nmi_disable(int cpu) { return; }
415#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 415#endif /* CONFIG_HARDLOCKUP_DETECTOR */
416 416
417/* prepare/enable/disable routines */ 417/* prepare/enable/disable routines */
418static int watchdog_prepare_cpu(int cpu) 418static void watchdog_prepare_cpu(int cpu)
419{ 419{
420 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); 420 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
421 421
422 WARN_ON(per_cpu(softlockup_watchdog, cpu)); 422 WARN_ON(per_cpu(softlockup_watchdog, cpu));
423 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 423 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
424 hrtimer->function = watchdog_timer_fn; 424 hrtimer->function = watchdog_timer_fn;
425
426 return 0;
427} 425}
428 426
429static int watchdog_enable(int cpu) 427static int watchdog_enable(int cpu)
@@ -542,17 +540,16 @@ static int __cpuinit
542cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 540cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
543{ 541{
544 int hotcpu = (unsigned long)hcpu; 542 int hotcpu = (unsigned long)hcpu;
545 int err = 0;
546 543
547 switch (action) { 544 switch (action) {
548 case CPU_UP_PREPARE: 545 case CPU_UP_PREPARE:
549 case CPU_UP_PREPARE_FROZEN: 546 case CPU_UP_PREPARE_FROZEN:
550 err = watchdog_prepare_cpu(hotcpu); 547 watchdog_prepare_cpu(hotcpu);
551 break; 548 break;
552 case CPU_ONLINE: 549 case CPU_ONLINE:
553 case CPU_ONLINE_FROZEN: 550 case CPU_ONLINE_FROZEN:
554 if (watchdog_enabled) 551 if (watchdog_enabled)
555 err = watchdog_enable(hotcpu); 552 watchdog_enable(hotcpu);
556 break; 553 break;
557#ifdef CONFIG_HOTPLUG_CPU 554#ifdef CONFIG_HOTPLUG_CPU
558 case CPU_UP_CANCELED: 555 case CPU_UP_CANCELED: