Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 15
-rw-r--r--  kernel/audit.c | 5
-rw-r--r--  kernel/bounds.c | 4
-rw-r--r--  kernel/capability.c | 13
-rw-r--r--  kernel/cgroup.c | 1903
-rw-r--r--  kernel/cgroup_freezer.c | 155
-rw-r--r--  kernel/context_tracking.c | 139
-rw-r--r--  kernel/cpu.c | 26
-rw-r--r--  kernel/cpu/idle.c | 16
-rw-r--r--  kernel/cpuset.c | 337
-rw-r--r--  kernel/debug/debug_core.c | 32
-rw-r--r--  kernel/debug/debug_core.h | 3
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 5
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 3
-rw-r--r--  kernel/events/callchain.c | 3
-rw-r--r--  kernel/events/core.c | 609
-rw-r--r--  kernel/events/internal.h | 35
-rw-r--r--  kernel/events/ring_buffer.c | 126
-rw-r--r--  kernel/events/uprobes.c | 227
-rw-r--r--  kernel/extable.c | 2
-rw-r--r--  kernel/fork.c | 54
-rw-r--r--  kernel/gcov/fs.c | 2
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hung_task.c | 13
-rw-r--r--  kernel/irq/Kconfig | 12
-rw-r--r--  kernel/irq/irqdomain.c | 13
-rw-r--r--  kernel/irq/manage.c | 2
-rw-r--r--  kernel/jump_label.c | 1
-rw-r--r--  kernel/kexec.c | 5
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kprobes.c | 95
-rw-r--r--  kernel/ksysfs.c | 2
-rw-r--r--  kernel/lglock.c | 12
-rw-r--r--  kernel/lockdep.c | 4
-rw-r--r--  kernel/modsign_pubkey.c | 6
-rw-r--r--  kernel/module.c | 30
-rw-r--r--  kernel/mutex.c | 79
-rw-r--r--  kernel/nsproxy.c | 63
-rw-r--r--  kernel/padata.c | 32
-rw-r--r--  kernel/panic.c | 8
-rw-r--r--  kernel/params.c | 34
-rw-r--r--  kernel/pid.c | 6
-rw-r--r--  kernel/pid_namespace.c | 6
-rw-r--r--  kernel/power/hibernate.c | 49
-rw-r--r--  kernel/power/qos.c | 20
-rw-r--r--  kernel/power/snapshot.c | 17
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/power/user.c | 32
-rw-r--r--  kernel/printk/printk.c | 7
-rw-r--r--  kernel/ptrace.c | 2
-rw-r--r--  kernel/rcu/Makefile | 6
-rw-r--r--  kernel/rcu/rcu.h (renamed from kernel/rcu.h) | 19
-rw-r--r--  kernel/rcu/srcu.c (renamed from kernel/srcu.c) | 0
-rw-r--r--  kernel/rcu/tiny.c (renamed from kernel/rcutiny.c) | 39
-rw-r--r--  kernel/rcu/tiny_plugin.h (renamed from kernel/rcutiny_plugin.h) | 2
-rw-r--r--  kernel/rcu/torture.c (renamed from kernel/rcutorture.c) | 400
-rw-r--r--  kernel/rcu/tree.c (renamed from kernel/rcutree.c) | 439
-rw-r--r--  kernel/rcu/tree.h (renamed from kernel/rcutree.h) | 21
-rw-r--r--  kernel/rcu/tree_plugin.h (renamed from kernel/rcutree_plugin.h) | 542
-rw-r--r--  kernel/rcu/tree_trace.c (renamed from kernel/rcutree_trace.c) | 2
-rw-r--r--  kernel/rcu/update.c (renamed from kernel/rcupdate.c) | 114
-rw-r--r--  kernel/reboot.c | 9
-rw-r--r--  kernel/res_counter.c | 25
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/completion.c | 299
-rw-r--r--  kernel/sched/core.c | 907
-rw-r--r--  kernel/sched/cpuacct.c | 51
-rw-r--r--  kernel/sched/cpupri.c | 4
-rw-r--r--  kernel/sched/cputime.c | 74
-rw-r--r--  kernel/sched/debug.c | 74
-rw-r--r--  kernel/sched/fair.c | 1912
-rw-r--r--  kernel/sched/features.h | 19
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 22
-rw-r--r--  kernel/sched/sched.h | 68
-rw-r--r--  kernel/sched/stats.h | 51
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/sched/wait.c (renamed from kernel/wait.c) | 130
-rw-r--r--  kernel/signal.c | 4
-rw-r--r--  kernel/smp.c | 39
-rw-r--r--  kernel/softirq.c | 58
-rw-r--r--  kernel/spinlock.c | 14
-rw-r--r--  kernel/stop_machine.c | 303
-rw-r--r--  kernel/sys.c | 20
-rw-r--r--  kernel/sysctl.c | 32
-rw-r--r--  kernel/task_work.c | 40
-rw-r--r--  kernel/time/Kconfig | 53
-rw-r--r--  kernel/time/alarmtimer.c | 4
-rw-r--r--  kernel/time/clockevents.c | 67
-rw-r--r--  kernel/time/clocksource.c | 52
-rw-r--r--  kernel/time/ntp.c | 9
-rw-r--r--  kernel/time/sched_clock.c | 114
-rw-r--r--  kernel/time/tick-broadcast.c | 1
-rw-r--r--  kernel/time/tick-internal.h | 2
-rw-r--r--  kernel/time/tick-sched.c | 60
-rw-r--r--  kernel/time/timekeeping.c | 5
-rw-r--r--  kernel/time/timer_list.c | 41
-rw-r--r--  kernel/time/timer_stats.c | 8
-rw-r--r--  kernel/timer.c | 8
-rw-r--r--  kernel/trace/ftrace.c | 17
-rw-r--r--  kernel/trace/trace.c | 40
-rw-r--r--  kernel/trace/trace.h | 5
-rw-r--r--  kernel/trace/trace_event_perf.c | 2
-rw-r--r--  kernel/trace/trace_events.c | 207
-rw-r--r--  kernel/trace/trace_output.c | 19
-rw-r--r--  kernel/trace/trace_printk.c | 19
-rw-r--r--  kernel/trace/trace_syscalls.c | 10
-rw-r--r--  kernel/uid16.c | 2
-rw-r--r--  kernel/up.c | 58
-rw-r--r--  kernel/user.c | 2
-rw-r--r--  kernel/user_namespace.c | 2
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/watchdog.c | 68
-rw-r--r--  kernel/workqueue.c | 157
114 files changed, 6839 insertions(+), 4213 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 35ef1185e359..a4d1aa8da9bc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,9 +6,9 @@ obj-y = fork.o exec_domain.o panic.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 extable.o params.o posix-timers.o \
10 kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o semaphore.o \
12 notifier.o ksysfs.o cred.o reboot.o \ 12 notifier.o ksysfs.o cred.o reboot.o \
13 async.o range.o groups.o lglock.o smpboot.o 13 async.o range.o groups.o lglock.o smpboot.o
14 14
@@ -26,6 +26,8 @@ obj-y += sched/
26obj-y += power/ 26obj-y += power/
27obj-y += printk/ 27obj-y += printk/
28obj-y += cpu/ 28obj-y += cpu/
29obj-y += irq/
30obj-y += rcu/
29 31
30obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o 32obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
31obj-$(CONFIG_FREEZER) += freezer.o 33obj-$(CONFIG_FREEZER) += freezer.o
@@ -79,14 +81,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
79obj-$(CONFIG_KGDB) += debug/ 81obj-$(CONFIG_KGDB) += debug/
80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 82obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
81obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o 83obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
82obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
83obj-$(CONFIG_SECCOMP) += seccomp.o 84obj-$(CONFIG_SECCOMP) += seccomp.o
84obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
85obj-$(CONFIG_TREE_RCU) += rcutree.o
86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
88obj-$(CONFIG_TINY_RCU) += rcutiny.o
89obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
90obj-$(CONFIG_RELAY) += relay.o 85obj-$(CONFIG_RELAY) += relay.o
91obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 86obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
92obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 87obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 91e53d04b6a9..7b0e23a740ce 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1117,9 +1117,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1117 1117
1118 sleep_time = timeout_start + audit_backlog_wait_time - 1118 sleep_time = timeout_start + audit_backlog_wait_time -
1119 jiffies; 1119 jiffies;
1120 if ((long)sleep_time > 0) 1120 if ((long)sleep_time > 0) {
1121 wait_for_auditd(sleep_time); 1121 wait_for_auditd(sleep_time);
1122 continue; 1122 continue;
1123 }
1123 } 1124 }
1124 if (audit_rate_check() && printk_ratelimit()) 1125 if (audit_rate_check() && printk_ratelimit())
1125 printk(KERN_WARNING 1126 printk(KERN_WARNING
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b2..e8ca97b5c386 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,7 @@
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/kbuild.h> 11#include <linux/kbuild.h>
12#include <linux/page_cgroup.h> 12#include <linux/page_cgroup.h>
13#include <linux/log2.h>
13 14
14void foo(void) 15void foo(void)
15{ 16{
@@ -17,5 +18,8 @@ void foo(void)
17 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 18 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
18 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 19 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
19 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); 20 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
21#ifdef CONFIG_SMP
22 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
23#endif
20 /* End of constants */ 24 /* End of constants */
21} 25}
diff --git a/kernel/capability.c b/kernel/capability.c
index f6c2ce5701e1..4e66bf9275b0 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -433,18 +433,6 @@ bool capable(int cap)
433EXPORT_SYMBOL(capable); 433EXPORT_SYMBOL(capable);
434 434
435/** 435/**
436 * nsown_capable - Check superior capability to one's own user_ns
437 * @cap: The capability in question
438 *
439 * Return true if the current task has the given superior capability
440 * targeted at its own user namespace.
441 */
442bool nsown_capable(int cap)
443{
444 return ns_capable(current_user_ns(), cap);
445}
446
447/**
448 * inode_capable - Check superior capability over inode 436 * inode_capable - Check superior capability over inode
449 * @inode: The inode in question 437 * @inode: The inode in question
450 * @cap: The capability in question 438 * @cap: The capability in question
@@ -464,3 +452,4 @@ bool inode_capable(const struct inode *inode, int cap)
464 452
465 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); 453 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid);
466} 454}
455EXPORT_SYMBOL(inode_capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 781845a013ab..e0839bcd48c8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@
60#include <linux/poll.h> 60#include <linux/poll.h>
61#include <linux/flex_array.h> /* used in cgroup_attach_task */ 61#include <linux/flex_array.h> /* used in cgroup_attach_task */
62#include <linux/kthread.h> 62#include <linux/kthread.h>
63#include <linux/file.h>
63 64
64#include <linux/atomic.h> 65#include <linux/atomic.h>
65 66
@@ -81,7 +82,7 @@
81 */ 82 */
82#ifdef CONFIG_PROVE_RCU 83#ifdef CONFIG_PROVE_RCU
83DEFINE_MUTEX(cgroup_mutex); 84DEFINE_MUTEX(cgroup_mutex);
84EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ 85EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
85#else 86#else
86static DEFINE_MUTEX(cgroup_mutex); 87static DEFINE_MUTEX(cgroup_mutex);
87#endif 88#endif
@@ -117,51 +118,20 @@ struct cfent {
117 struct list_head node; 118 struct list_head node;
118 struct dentry *dentry; 119 struct dentry *dentry;
119 struct cftype *type; 120 struct cftype *type;
121 struct cgroup_subsys_state *css;
120 122
121 /* file xattrs */ 123 /* file xattrs */
122 struct simple_xattrs xattrs; 124 struct simple_xattrs xattrs;
123}; 125};
124 126
125/* 127/*
126 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
127 * cgroup_subsys->use_id != 0.
128 */
129#define CSS_ID_MAX (65535)
130struct css_id {
131 /*
132 * The css to which this ID points. This pointer is set to valid value
133 * after cgroup is populated. If cgroup is removed, this will be NULL.
134 * This pointer is expected to be RCU-safe because destroy()
135 * is called after synchronize_rcu(). But for safe use, css_tryget()
136 * should be used for avoiding race.
137 */
138 struct cgroup_subsys_state __rcu *css;
139 /*
140 * ID of this css.
141 */
142 unsigned short id;
143 /*
144 * Depth in hierarchy which this ID belongs to.
145 */
146 unsigned short depth;
147 /*
148 * ID is freed by RCU. (and lookup routine is RCU safe.)
149 */
150 struct rcu_head rcu_head;
151 /*
152 * Hierarchy of CSS ID belongs to.
153 */
154 unsigned short stack[0]; /* Array of Length (depth+1) */
155};
156
157/*
158 * cgroup_event represents events which userspace want to receive. 128 * cgroup_event represents events which userspace want to receive.
159 */ 129 */
160struct cgroup_event { 130struct cgroup_event {
161 /* 131 /*
162 * Cgroup which the event belongs to. 132 * css which the event belongs to.
163 */ 133 */
164 struct cgroup *cgrp; 134 struct cgroup_subsys_state *css;
165 /* 135 /*
166 * Control file which the event associated. 136 * Control file which the event associated.
167 */ 137 */
@@ -215,10 +185,33 @@ static u64 cgroup_serial_nr_next = 1;
215 */ 185 */
216static int need_forkexit_callback __read_mostly; 186static int need_forkexit_callback __read_mostly;
217 187
218static void cgroup_offline_fn(struct work_struct *work); 188static struct cftype cgroup_base_files[];
189
190static void cgroup_destroy_css_killed(struct cgroup *cgrp);
219static int cgroup_destroy_locked(struct cgroup *cgrp); 191static int cgroup_destroy_locked(struct cgroup *cgrp);
220static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
221 struct cftype cfts[], bool is_add); 193 bool is_add);
194
195/**
196 * cgroup_css - obtain a cgroup's css for the specified subsystem
197 * @cgrp: the cgroup of interest
198 * @ss: the subsystem of interest (%NULL returns the dummy_css)
199 *
200 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
201 * function must be called either under cgroup_mutex or rcu_read_lock() and
202 * the caller is responsible for pinning the returned css if it wants to
203 * keep accessing it outside the said locks. This function may return
204 * %NULL if @cgrp doesn't have @subsys_id enabled.
205 */
206static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
207 struct cgroup_subsys *ss)
208{
209 if (ss)
210 return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
211 lockdep_is_held(&cgroup_mutex));
212 else
213 return &cgrp->dummy_css;
214}
222 215
223/* convenient tests for these bits */ 216/* convenient tests for these bits */
224static inline bool cgroup_is_dead(const struct cgroup *cgrp) 217static inline bool cgroup_is_dead(const struct cgroup *cgrp)
@@ -362,12 +355,11 @@ struct cgrp_cset_link {
362static struct css_set init_css_set; 355static struct css_set init_css_set;
363static struct cgrp_cset_link init_cgrp_cset_link; 356static struct cgrp_cset_link init_cgrp_cset_link;
364 357
365static int cgroup_init_idr(struct cgroup_subsys *ss, 358/*
366 struct cgroup_subsys_state *css); 359 * css_set_lock protects the list of css_set objects, and the chain of
367 360 * tasks off each css_set. Nests outside task->alloc_lock due to
368/* css_set_lock protects the list of css_set objects, and the 361 * css_task_iter_start().
369 * chain of tasks off each css_set. Nests outside task->alloc_lock 362 */
370 * due to cgroup_iter_start() */
371static DEFINE_RWLOCK(css_set_lock); 363static DEFINE_RWLOCK(css_set_lock);
372static int css_set_count; 364static int css_set_count;
373 365
@@ -392,10 +384,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
392 return key; 384 return key;
393} 385}
394 386
395/* We don't maintain the lists running through each css_set to its 387/*
396 * task until after the first call to cgroup_iter_start(). This 388 * We don't maintain the lists running through each css_set to its task
397 * reduces the fork()/exit() overhead for people who have cgroups 389 * until after the first call to css_task_iter_start(). This reduces the
398 * compiled into their kernel but not actually in use */ 390 * fork()/exit() overhead for people who have cgroups compiled into their
391 * kernel but not actually in use.
392 */
399static int use_task_css_set_links __read_mostly; 393static int use_task_css_set_links __read_mostly;
400 394
401static void __put_css_set(struct css_set *cset, int taskexit) 395static void __put_css_set(struct css_set *cset, int taskexit)
@@ -464,7 +458,7 @@ static inline void put_css_set_taskexit(struct css_set *cset)
464 * @new_cgrp: cgroup that's being entered by the task 458 * @new_cgrp: cgroup that's being entered by the task
465 * @template: desired set of css pointers in css_set (pre-calculated) 459 * @template: desired set of css pointers in css_set (pre-calculated)
466 * 460 *
467 * Returns true if "cg" matches "old_cg" except for the hierarchy 461 * Returns true if "cset" matches "old_cset" except for the hierarchy
468 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 462 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
469 */ 463 */
470static bool compare_css_sets(struct css_set *cset, 464static bool compare_css_sets(struct css_set *cset,
@@ -555,7 +549,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
555 /* Subsystem is in this hierarchy. So we want 549 /* Subsystem is in this hierarchy. So we want
556 * the subsystem state from the new 550 * the subsystem state from the new
557 * cgroup */ 551 * cgroup */
558 template[i] = cgrp->subsys[i]; 552 template[i] = cgroup_css(cgrp, ss);
559 } else { 553 } else {
560 /* Subsystem is not in this hierarchy, so we 554 /* Subsystem is not in this hierarchy, so we
561 * don't want to change the subsystem state */ 555 * don't want to change the subsystem state */
@@ -803,8 +797,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
803 797
804static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 798static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
805static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 799static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
806static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 800static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
807 unsigned long subsys_mask);
808static const struct inode_operations cgroup_dir_inode_operations; 801static const struct inode_operations cgroup_dir_inode_operations;
809static const struct file_operations proc_cgroupstats_operations; 802static const struct file_operations proc_cgroupstats_operations;
810 803
@@ -813,9 +806,6 @@ static struct backing_dev_info cgroup_backing_dev_info = {
813 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 806 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
814}; 807};
815 808
816static int alloc_css_id(struct cgroup_subsys *ss,
817 struct cgroup *parent, struct cgroup *child);
818
819static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) 809static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
820{ 810{
821 struct inode *inode = new_inode(sb); 811 struct inode *inode = new_inode(sb);
@@ -845,15 +835,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
845static void cgroup_free_fn(struct work_struct *work) 835static void cgroup_free_fn(struct work_struct *work)
846{ 836{
847 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 837 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
848 struct cgroup_subsys *ss;
849 838
850 mutex_lock(&cgroup_mutex); 839 mutex_lock(&cgroup_mutex);
851 /*
852 * Release the subsystem state objects.
853 */
854 for_each_root_subsys(cgrp->root, ss)
855 ss->css_free(cgrp);
856
857 cgrp->root->number_of_cgroups--; 840 cgrp->root->number_of_cgroups--;
858 mutex_unlock(&cgroup_mutex); 841 mutex_unlock(&cgroup_mutex);
859 842
@@ -864,8 +847,6 @@ static void cgroup_free_fn(struct work_struct *work)
864 */ 847 */
865 dput(cgrp->parent->dentry); 848 dput(cgrp->parent->dentry);
866 849
867 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
868
869 /* 850 /*
870 * Drop the active superblock reference that we took when we 851 * Drop the active superblock reference that we took when we
871 * created the cgroup. This will free cgrp->root, if we are 852 * created the cgroup. This will free cgrp->root, if we are
@@ -956,27 +937,22 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
956} 937}
957 938
958/** 939/**
959 * cgroup_clear_directory - selective removal of base and subsystem files 940 * cgroup_clear_dir - remove subsys files in a cgroup directory
960 * @dir: directory containing the files 941 * @cgrp: target cgroup
961 * @base_files: true if the base files should be removed
962 * @subsys_mask: mask of the subsystem ids whose files should be removed 942 * @subsys_mask: mask of the subsystem ids whose files should be removed
963 */ 943 */
964static void cgroup_clear_directory(struct dentry *dir, bool base_files, 944static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
965 unsigned long subsys_mask)
966{ 945{
967 struct cgroup *cgrp = __d_cgrp(dir);
968 struct cgroup_subsys *ss; 946 struct cgroup_subsys *ss;
947 int i;
969 948
970 for_each_root_subsys(cgrp->root, ss) { 949 for_each_subsys(ss, i) {
971 struct cftype_set *set; 950 struct cftype_set *set;
972 if (!test_bit(ss->subsys_id, &subsys_mask)) 951
952 if (!test_bit(i, &subsys_mask))
973 continue; 953 continue;
974 list_for_each_entry(set, &ss->cftsets, node) 954 list_for_each_entry(set, &ss->cftsets, node)
975 cgroup_addrm_files(cgrp, NULL, set->cfts, false); 955 cgroup_addrm_files(cgrp, set->cfts, false);
976 }
977 if (base_files) {
978 while (!list_empty(&cgrp->files))
979 cgroup_rm_file(cgrp, NULL);
980 } 956 }
981} 957}
982 958
@@ -986,9 +962,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
986static void cgroup_d_remove_dir(struct dentry *dentry) 962static void cgroup_d_remove_dir(struct dentry *dentry)
987{ 963{
988 struct dentry *parent; 964 struct dentry *parent;
989 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
990
991 cgroup_clear_directory(dentry, true, root->subsys_mask);
992 965
993 parent = dentry->d_parent; 966 parent = dentry->d_parent;
994 spin_lock(&parent->d_lock); 967 spin_lock(&parent->d_lock);
@@ -1009,79 +982,84 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1009{ 982{
1010 struct cgroup *cgrp = &root->top_cgroup; 983 struct cgroup *cgrp = &root->top_cgroup;
1011 struct cgroup_subsys *ss; 984 struct cgroup_subsys *ss;
1012 int i; 985 unsigned long pinned = 0;
986 int i, ret;
1013 987
1014 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 988 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1015 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 989 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
1016 990
1017 /* Check that any added subsystems are currently free */ 991 /* Check that any added subsystems are currently free */
1018 for_each_subsys(ss, i) { 992 for_each_subsys(ss, i) {
1019 unsigned long bit = 1UL << i; 993 if (!(added_mask & (1 << i)))
1020
1021 if (!(bit & added_mask))
1022 continue; 994 continue;
1023 995
996 /* is the subsystem mounted elsewhere? */
1024 if (ss->root != &cgroup_dummy_root) { 997 if (ss->root != &cgroup_dummy_root) {
1025 /* Subsystem isn't free */ 998 ret = -EBUSY;
1026 return -EBUSY; 999 goto out_put;
1000 }
1001
1002 /* pin the module */
1003 if (!try_module_get(ss->module)) {
1004 ret = -ENOENT;
1005 goto out_put;
1027 } 1006 }
1007 pinned |= 1 << i;
1028 } 1008 }
1029 1009
1030 /* Currently we don't handle adding/removing subsystems when 1010 /* subsys could be missing if unloaded between parsing and here */
1031 * any child cgroups exist. This is theoretically supportable 1011 if (added_mask != pinned) {
1032 * but involves complex error handling, so it's being left until 1012 ret = -ENOENT;
1033 * later */ 1013 goto out_put;
1034 if (root->number_of_cgroups > 1) 1014 }
1035 return -EBUSY; 1015
1016 ret = cgroup_populate_dir(cgrp, added_mask);
1017 if (ret)
1018 goto out_put;
1019
1020 /*
1021 * Nothing can fail from this point on. Remove files for the
1022 * removed subsystems and rebind each subsystem.
1023 */
1024 cgroup_clear_dir(cgrp, removed_mask);
1036 1025
1037 /* Process each subsystem */
1038 for_each_subsys(ss, i) { 1026 for_each_subsys(ss, i) {
1039 unsigned long bit = 1UL << i; 1027 unsigned long bit = 1UL << i;
1040 1028
1041 if (bit & added_mask) { 1029 if (bit & added_mask) {
1042 /* We're binding this subsystem to this hierarchy */ 1030 /* We're binding this subsystem to this hierarchy */
1043 BUG_ON(cgrp->subsys[i]); 1031 BUG_ON(cgroup_css(cgrp, ss));
1044 BUG_ON(!cgroup_dummy_top->subsys[i]); 1032 BUG_ON(!cgroup_css(cgroup_dummy_top, ss));
1045 BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); 1033 BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top);
1034
1035 rcu_assign_pointer(cgrp->subsys[i],
1036 cgroup_css(cgroup_dummy_top, ss));
1037 cgroup_css(cgrp, ss)->cgroup = cgrp;
1046 1038
1047 cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
1048 cgrp->subsys[i]->cgroup = cgrp;
1049 list_move(&ss->sibling, &root->subsys_list); 1039 list_move(&ss->sibling, &root->subsys_list);
1050 ss->root = root; 1040 ss->root = root;
1051 if (ss->bind) 1041 if (ss->bind)
1052 ss->bind(cgrp); 1042 ss->bind(cgroup_css(cgrp, ss));
1053 1043
1054 /* refcount was already taken, and we're keeping it */ 1044 /* refcount was already taken, and we're keeping it */
1055 root->subsys_mask |= bit; 1045 root->subsys_mask |= bit;
1056 } else if (bit & removed_mask) { 1046 } else if (bit & removed_mask) {
1057 /* We're removing this subsystem */ 1047 /* We're removing this subsystem */
1058 BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); 1048 BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss));
1059 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1049 BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp);
1060 1050
1061 if (ss->bind) 1051 if (ss->bind)
1062 ss->bind(cgroup_dummy_top); 1052 ss->bind(cgroup_css(cgroup_dummy_top, ss));
1063 cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; 1053
1064 cgrp->subsys[i] = NULL; 1054 cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top;
1055 RCU_INIT_POINTER(cgrp->subsys[i], NULL);
1056
1065 cgroup_subsys[i]->root = &cgroup_dummy_root; 1057 cgroup_subsys[i]->root = &cgroup_dummy_root;
1066 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); 1058 list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
1067 1059
1068 /* subsystem is now free - drop reference on module */ 1060 /* subsystem is now free - drop reference on module */
1069 module_put(ss->module); 1061 module_put(ss->module);
1070 root->subsys_mask &= ~bit; 1062 root->subsys_mask &= ~bit;
1071 } else if (bit & root->subsys_mask) {
1072 /* Subsystem state should already exist */
1073 BUG_ON(!cgrp->subsys[i]);
1074 /*
1075 * a refcount was taken, but we already had one, so
1076 * drop the extra reference.
1077 */
1078 module_put(ss->module);
1079#ifdef CONFIG_MODULE_UNLOAD
1080 BUG_ON(ss->module && !module_refcount(ss->module));
1081#endif
1082 } else {
1083 /* Subsystem state shouldn't exist */
1084 BUG_ON(cgrp->subsys[i]);
1085 } 1063 }
1086 } 1064 }
1087 1065
@@ -1092,6 +1070,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1092 root->flags |= CGRP_ROOT_SUBSYS_BOUND; 1070 root->flags |= CGRP_ROOT_SUBSYS_BOUND;
1093 1071
1094 return 0; 1072 return 0;
1073
1074out_put:
1075 for_each_subsys(ss, i)
1076 if (pinned & (1 << i))
1077 module_put(ss->module);
1078 return ret;
1095} 1079}
1096 1080
1097static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1081static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1142,7 +1126,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1142 char *token, *o = data; 1126 char *token, *o = data;
1143 bool all_ss = false, one_ss = false; 1127 bool all_ss = false, one_ss = false;
1144 unsigned long mask = (unsigned long)-1; 1128 unsigned long mask = (unsigned long)-1;
1145 bool module_pin_failed = false;
1146 struct cgroup_subsys *ss; 1129 struct cgroup_subsys *ss;
1147 int i; 1130 int i;
1148 1131
@@ -1285,52 +1268,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1285 if (!opts->subsys_mask && !opts->name) 1268 if (!opts->subsys_mask && !opts->name)
1286 return -EINVAL; 1269 return -EINVAL;
1287 1270
1288 /*
1289 * Grab references on all the modules we'll need, so the subsystems
1290 * don't dance around before rebind_subsystems attaches them. This may
1291 * take duplicate reference counts on a subsystem that's already used,
1292 * but rebind_subsystems handles this case.
1293 */
1294 for_each_subsys(ss, i) {
1295 if (!(opts->subsys_mask & (1UL << i)))
1296 continue;
1297 if (!try_module_get(cgroup_subsys[i]->module)) {
1298 module_pin_failed = true;
1299 break;
1300 }
1301 }
1302 if (module_pin_failed) {
1303 /*
1304 * oops, one of the modules was going away. this means that we
1305 * raced with a module_delete call, and to the user this is
1306 * essentially a "subsystem doesn't exist" case.
1307 */
1308 for (i--; i >= 0; i--) {
1309 /* drop refcounts only on the ones we took */
1310 unsigned long bit = 1UL << i;
1311
1312 if (!(bit & opts->subsys_mask))
1313 continue;
1314 module_put(cgroup_subsys[i]->module);
1315 }
1316 return -ENOENT;
1317 }
1318
1319 return 0; 1271 return 0;
1320} 1272}
1321 1273
1322static void drop_parsed_module_refcounts(unsigned long subsys_mask)
1323{
1324 struct cgroup_subsys *ss;
1325 int i;
1326
1327 mutex_lock(&cgroup_mutex);
1328 for_each_subsys(ss, i)
1329 if (subsys_mask & (1UL << i))
1330 module_put(cgroup_subsys[i]->module);
1331 mutex_unlock(&cgroup_mutex);
1332}
1333
1334static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1274static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1335{ 1275{
1336 int ret = 0; 1276 int ret = 0;
@@ -1370,22 +1310,15 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1370 goto out_unlock; 1310 goto out_unlock;
1371 } 1311 }
1372 1312
1373 /* 1313 /* remounting is not allowed for populated hierarchies */
1374 * Clear out the files of subsystems that should be removed, do 1314 if (root->number_of_cgroups > 1) {
1375 * this before rebind_subsystems, since rebind_subsystems may 1315 ret = -EBUSY;
1376 * change this hierarchy's subsys_list.
1377 */
1378 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1379
1380 ret = rebind_subsystems(root, added_mask, removed_mask);
1381 if (ret) {
1382 /* rebind_subsystems failed, re-populate the removed files */
1383 cgroup_populate_dir(cgrp, false, removed_mask);
1384 goto out_unlock; 1316 goto out_unlock;
1385 } 1317 }
1386 1318
1387 /* re-populate subsystem files */ 1319 ret = rebind_subsystems(root, added_mask, removed_mask);
1388 cgroup_populate_dir(cgrp, false, added_mask); 1320 if (ret)
1321 goto out_unlock;
1389 1322
1390 if (opts.release_agent) 1323 if (opts.release_agent)
1391 strcpy(root->release_agent_path, opts.release_agent); 1324 strcpy(root->release_agent_path, opts.release_agent);
@@ -1395,8 +1328,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1395 mutex_unlock(&cgroup_root_mutex); 1328 mutex_unlock(&cgroup_root_mutex);
1396 mutex_unlock(&cgroup_mutex); 1329 mutex_unlock(&cgroup_mutex);
1397 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1330 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1398 if (ret)
1399 drop_parsed_module_refcounts(opts.subsys_mask);
1400 return ret; 1331 return ret;
1401} 1332}
1402 1333
@@ -1416,6 +1347,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1416 INIT_LIST_HEAD(&cgrp->release_list); 1347 INIT_LIST_HEAD(&cgrp->release_list);
1417 INIT_LIST_HEAD(&cgrp->pidlists); 1348 INIT_LIST_HEAD(&cgrp->pidlists);
1418 mutex_init(&cgrp->pidlist_mutex); 1349 mutex_init(&cgrp->pidlist_mutex);
1350 cgrp->dummy_css.cgroup = cgrp;
1419 INIT_LIST_HEAD(&cgrp->event_list); 1351 INIT_LIST_HEAD(&cgrp->event_list);
1420 spin_lock_init(&cgrp->event_list_lock); 1352 spin_lock_init(&cgrp->event_list_lock);
1421 simple_xattrs_init(&cgrp->xattrs); 1353 simple_xattrs_init(&cgrp->xattrs);
@@ -1431,6 +1363,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1431 cgrp->root = root; 1363 cgrp->root = root;
1432 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); 1364 RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
1433 init_cgroup_housekeeping(cgrp); 1365 init_cgroup_housekeeping(cgrp);
1366 idr_init(&root->cgroup_idr);
1434} 1367}
1435 1368
1436static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) 1369static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
@@ -1503,7 +1436,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1503 */ 1436 */
1504 root->subsys_mask = opts->subsys_mask; 1437 root->subsys_mask = opts->subsys_mask;
1505 root->flags = opts->flags; 1438 root->flags = opts->flags;
1506 ida_init(&root->cgroup_ida);
1507 if (opts->release_agent) 1439 if (opts->release_agent)
1508 strcpy(root->release_agent_path, opts->release_agent); 1440 strcpy(root->release_agent_path, opts->release_agent);
1509 if (opts->name) 1441 if (opts->name)
@@ -1519,7 +1451,7 @@ static void cgroup_free_root(struct cgroupfs_root *root)
1519 /* hierarhcy ID shoulid already have been released */ 1451 /* hierarhcy ID shoulid already have been released */
1520 WARN_ON_ONCE(root->hierarchy_id); 1452 WARN_ON_ONCE(root->hierarchy_id);
1521 1453
1522 ida_destroy(&root->cgroup_ida); 1454 idr_destroy(&root->cgroup_idr);
1523 kfree(root); 1455 kfree(root);
1524 } 1456 }
1525} 1457}
@@ -1584,7 +1516,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1584 int ret = 0; 1516 int ret = 0;
1585 struct super_block *sb; 1517 struct super_block *sb;
1586 struct cgroupfs_root *new_root; 1518 struct cgroupfs_root *new_root;
1519 struct list_head tmp_links;
1587 struct inode *inode; 1520 struct inode *inode;
1521 const struct cred *cred;
1588 1522
1589 /* First find the desired set of subsystems */ 1523 /* First find the desired set of subsystems */
1590 mutex_lock(&cgroup_mutex); 1524 mutex_lock(&cgroup_mutex);
@@ -1600,7 +1534,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1600 new_root = cgroup_root_from_opts(&opts); 1534 new_root = cgroup_root_from_opts(&opts);
1601 if (IS_ERR(new_root)) { 1535 if (IS_ERR(new_root)) {
1602 ret = PTR_ERR(new_root); 1536 ret = PTR_ERR(new_root);
1603 goto drop_modules; 1537 goto out_err;
1604 } 1538 }
1605 opts.new_root = new_root; 1539 opts.new_root = new_root;
1606 1540
@@ -1609,17 +1543,15 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1609 if (IS_ERR(sb)) { 1543 if (IS_ERR(sb)) {
1610 ret = PTR_ERR(sb); 1544 ret = PTR_ERR(sb);
1611 cgroup_free_root(opts.new_root); 1545 cgroup_free_root(opts.new_root);
1612 goto drop_modules; 1546 goto out_err;
1613 } 1547 }
1614 1548
1615 root = sb->s_fs_info; 1549 root = sb->s_fs_info;
1616 BUG_ON(!root); 1550 BUG_ON(!root);
1617 if (root == opts.new_root) { 1551 if (root == opts.new_root) {
1618 /* We used the new root structure, so this is a new hierarchy */ 1552 /* We used the new root structure, so this is a new hierarchy */
1619 struct list_head tmp_links;
1620 struct cgroup *root_cgrp = &root->top_cgroup; 1553 struct cgroup *root_cgrp = &root->top_cgroup;
1621 struct cgroupfs_root *existing_root; 1554 struct cgroupfs_root *existing_root;
1622 const struct cred *cred;
1623 int i; 1555 int i;
1624 struct css_set *cset; 1556 struct css_set *cset;
1625 1557
@@ -1634,6 +1566,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1634 mutex_lock(&cgroup_mutex); 1566 mutex_lock(&cgroup_mutex);
1635 mutex_lock(&cgroup_root_mutex); 1567 mutex_lock(&cgroup_root_mutex);
1636 1568
1569 root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
1570 0, 1, GFP_KERNEL);
1571 if (root_cgrp->id < 0)
1572 goto unlock_drop;
1573
1637 /* Check for name clashes with existing mounts */ 1574 /* Check for name clashes with existing mounts */
1638 ret = -EBUSY; 1575 ret = -EBUSY;
1639 if (strlen(root->name)) 1576 if (strlen(root->name))
@@ -1657,26 +1594,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1657 if (ret) 1594 if (ret)
1658 goto unlock_drop; 1595 goto unlock_drop;
1659 1596
1597 sb->s_root->d_fsdata = root_cgrp;
1598 root_cgrp->dentry = sb->s_root;
1599
1600 /*
1601 * We're inside get_sb() and will call lookup_one_len() to
1602 * create the root files, which doesn't work if SELinux is
1603 * in use. The following cred dancing somehow works around
1604 * it. See 2ce9738ba ("cgroupfs: use init_cred when
1605 * populating new cgroupfs mount") for more details.
1606 */
1607 cred = override_creds(&init_cred);
1608
1609 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1610 if (ret)
1611 goto rm_base_files;
1612
1660 ret = rebind_subsystems(root, root->subsys_mask, 0); 1613 ret = rebind_subsystems(root, root->subsys_mask, 0);
1661 if (ret == -EBUSY) { 1614 if (ret)
1662 free_cgrp_cset_links(&tmp_links); 1615 goto rm_base_files;
1663 goto unlock_drop; 1616
1664 } 1617 revert_creds(cred);
1618
1665 /* 1619 /*
1666 * There must be no failure case after here, since rebinding 1620 * There must be no failure case after here, since rebinding
1667 * takes care of subsystems' refcounts, which are explicitly 1621 * takes care of subsystems' refcounts, which are explicitly
1668 * dropped in the failure exit path. 1622 * dropped in the failure exit path.
1669 */ 1623 */
1670 1624
1671 /* EBUSY should be the only error here */
1672 BUG_ON(ret);
1673
1674 list_add(&root->root_list, &cgroup_roots); 1625 list_add(&root->root_list, &cgroup_roots);
1675 cgroup_root_count++; 1626 cgroup_root_count++;
1676 1627
1677 sb->s_root->d_fsdata = root_cgrp;
1678 root->top_cgroup.dentry = sb->s_root;
1679
1680 /* Link the top cgroup in this hierarchy into all 1628 /* Link the top cgroup in this hierarchy into all
1681 * the css_set objects */ 1629 * the css_set objects */
1682 write_lock(&css_set_lock); 1630 write_lock(&css_set_lock);
@@ -1689,9 +1637,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1689 BUG_ON(!list_empty(&root_cgrp->children)); 1637 BUG_ON(!list_empty(&root_cgrp->children));
1690 BUG_ON(root->number_of_cgroups != 1); 1638 BUG_ON(root->number_of_cgroups != 1);
1691 1639
1692 cred = override_creds(&init_cred);
1693 cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
1694 revert_creds(cred);
1695 mutex_unlock(&cgroup_root_mutex); 1640 mutex_unlock(&cgroup_root_mutex);
1696 mutex_unlock(&cgroup_mutex); 1641 mutex_unlock(&cgroup_mutex);
1697 mutex_unlock(&inode->i_mutex); 1642 mutex_unlock(&inode->i_mutex);
@@ -1711,15 +1656,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1711 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1656 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1712 } 1657 }
1713 } 1658 }
1714
1715 /* no subsys rebinding, so refcounts don't change */
1716 drop_parsed_module_refcounts(opts.subsys_mask);
1717 } 1659 }
1718 1660
1719 kfree(opts.release_agent); 1661 kfree(opts.release_agent);
1720 kfree(opts.name); 1662 kfree(opts.name);
1721 return dget(sb->s_root); 1663 return dget(sb->s_root);
1722 1664
1665 rm_base_files:
1666 free_cgrp_cset_links(&tmp_links);
1667 cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
1668 revert_creds(cred);
1723 unlock_drop: 1669 unlock_drop:
1724 cgroup_exit_root_id(root); 1670 cgroup_exit_root_id(root);
1725 mutex_unlock(&cgroup_root_mutex); 1671 mutex_unlock(&cgroup_root_mutex);
@@ -1727,8 +1673,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1727 mutex_unlock(&inode->i_mutex); 1673 mutex_unlock(&inode->i_mutex);
1728 drop_new_super: 1674 drop_new_super:
1729 deactivate_locked_super(sb); 1675 deactivate_locked_super(sb);
1730 drop_modules:
1731 drop_parsed_module_refcounts(opts.subsys_mask);
1732 out_err: 1676 out_err:
1733 kfree(opts.release_agent); 1677 kfree(opts.release_agent);
1734 kfree(opts.name); 1678 kfree(opts.name);
@@ -1746,6 +1690,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1746 BUG_ON(root->number_of_cgroups != 1); 1690 BUG_ON(root->number_of_cgroups != 1);
1747 BUG_ON(!list_empty(&cgrp->children)); 1691 BUG_ON(!list_empty(&cgrp->children));
1748 1692
1693 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1749 mutex_lock(&cgroup_mutex); 1694 mutex_lock(&cgroup_mutex);
1750 mutex_lock(&cgroup_root_mutex); 1695 mutex_lock(&cgroup_root_mutex);
1751 1696
@@ -1778,6 +1723,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1778 1723
1779 mutex_unlock(&cgroup_root_mutex); 1724 mutex_unlock(&cgroup_root_mutex);
1780 mutex_unlock(&cgroup_mutex); 1725 mutex_unlock(&cgroup_mutex);
1726 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1781 1727
1782 simple_xattrs_free(&cgrp->xattrs); 1728 simple_xattrs_free(&cgrp->xattrs);
1783 1729
@@ -1889,7 +1835,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
1889struct task_and_cgroup { 1835struct task_and_cgroup {
1890 struct task_struct *task; 1836 struct task_struct *task;
1891 struct cgroup *cgrp; 1837 struct cgroup *cgrp;
1892 struct css_set *cg; 1838 struct css_set *cset;
1893}; 1839};
1894 1840
1895struct cgroup_taskset { 1841struct cgroup_taskset {
@@ -1939,18 +1885,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1939EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1885EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1940 1886
1941/** 1887/**
1942 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task 1888 * cgroup_taskset_cur_css - return the matching css for the current task
1943 * @tset: taskset of interest 1889 * @tset: taskset of interest
1890 * @subsys_id: the ID of the target subsystem
1944 * 1891 *
1945 * Return the cgroup for the current (last returned) task of @tset. This 1892 * Return the css for the current (last returned) task of @tset for
1946 * function must be preceded by either cgroup_taskset_first() or 1893 * subsystem specified by @subsys_id. This function must be preceded by
1947 * cgroup_taskset_next(). 1894 * either cgroup_taskset_first() or cgroup_taskset_next().
1948 */ 1895 */
1949struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) 1896struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset,
1897 int subsys_id)
1950{ 1898{
1951 return tset->cur_cgrp; 1899 return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]);
1952} 1900}
1953EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); 1901EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css);
1954 1902
1955/** 1903/**
1956 * cgroup_taskset_size - return the number of tasks in taskset 1904 * cgroup_taskset_size - return the number of tasks in taskset
@@ -2054,7 +2002,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2054 2002
2055 /* @tsk either already exited or can't exit until the end */ 2003 /* @tsk either already exited or can't exit until the end */
2056 if (tsk->flags & PF_EXITING) 2004 if (tsk->flags & PF_EXITING)
2057 continue; 2005 goto next;
2058 2006
2059 /* as per above, nr_threads may decrease, but not increase. */ 2007 /* as per above, nr_threads may decrease, but not increase. */
2060 BUG_ON(i >= group_size); 2008 BUG_ON(i >= group_size);
@@ -2062,7 +2010,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2062 ent.cgrp = task_cgroup_from_root(tsk, root); 2010 ent.cgrp = task_cgroup_from_root(tsk, root);
2063 /* nothing to do if this task is already in the cgroup */ 2011 /* nothing to do if this task is already in the cgroup */
2064 if (ent.cgrp == cgrp) 2012 if (ent.cgrp == cgrp)
2065 continue; 2013 goto next;
2066 /* 2014 /*
2067 * saying GFP_ATOMIC has no effect here because we did prealloc 2015 * saying GFP_ATOMIC has no effect here because we did prealloc
2068 * earlier, but it's good form to communicate our expectations. 2016 * earlier, but it's good form to communicate our expectations.
@@ -2070,7 +2018,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2070 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2018 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2071 BUG_ON(retval != 0); 2019 BUG_ON(retval != 0);
2072 i++; 2020 i++;
2073 2021 next:
2074 if (!threadgroup) 2022 if (!threadgroup)
2075 break; 2023 break;
2076 } while_each_thread(leader, tsk); 2024 } while_each_thread(leader, tsk);
@@ -2089,8 +2037,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2089 * step 1: check that we can legitimately attach to the cgroup. 2037 * step 1: check that we can legitimately attach to the cgroup.
2090 */ 2038 */
2091 for_each_root_subsys(root, ss) { 2039 for_each_root_subsys(root, ss) {
2040 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2041
2092 if (ss->can_attach) { 2042 if (ss->can_attach) {
2093 retval = ss->can_attach(cgrp, &tset); 2043 retval = ss->can_attach(css, &tset);
2094 if (retval) { 2044 if (retval) {
2095 failed_ss = ss; 2045 failed_ss = ss;
2096 goto out_cancel_attach; 2046 goto out_cancel_attach;
@@ -2107,8 +2057,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2107 2057
2108 tc = flex_array_get(group, i); 2058 tc = flex_array_get(group, i);
2109 old_cset = task_css_set(tc->task); 2059 old_cset = task_css_set(tc->task);
2110 tc->cg = find_css_set(old_cset, cgrp); 2060 tc->cset = find_css_set(old_cset, cgrp);
2111 if (!tc->cg) { 2061 if (!tc->cset) {
2112 retval = -ENOMEM; 2062 retval = -ENOMEM;
2113 goto out_put_css_set_refs; 2063 goto out_put_css_set_refs;
2114 } 2064 }
@@ -2121,7 +2071,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2121 */ 2071 */
2122 for (i = 0; i < group_size; i++) { 2072 for (i = 0; i < group_size; i++) {
2123 tc = flex_array_get(group, i); 2073 tc = flex_array_get(group, i);
2124 cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); 2074 cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
2125 } 2075 }
2126 /* nothing is sensitive to fork() after this point. */ 2076 /* nothing is sensitive to fork() after this point. */
2127 2077
@@ -2129,8 +2079,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
2129 * step 4: do subsystem attach callbacks. 2079 * step 4: do subsystem attach callbacks.
2130 */ 2080 */
2131 for_each_root_subsys(root, ss) { 2081 for_each_root_subsys(root, ss) {
2082 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2083
2132 if (ss->attach) 2084 if (ss->attach)
2133 ss->attach(cgrp, &tset); 2085 ss->attach(css, &tset);
2134 } 2086 }
2135 2087
2136 /* 2088 /*
@@ -2141,18 +2093,20 @@ out_put_css_set_refs:
2141 if (retval) { 2093 if (retval) {
2142 for (i = 0; i < group_size; i++) { 2094 for (i = 0; i < group_size; i++) {
2143 tc = flex_array_get(group, i); 2095 tc = flex_array_get(group, i);
2144 if (!tc->cg) 2096 if (!tc->cset)
2145 break; 2097 break;
2146 put_css_set(tc->cg); 2098 put_css_set(tc->cset);
2147 } 2099 }
2148 } 2100 }
2149out_cancel_attach: 2101out_cancel_attach:
2150 if (retval) { 2102 if (retval) {
2151 for_each_root_subsys(root, ss) { 2103 for_each_root_subsys(root, ss) {
2104 struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
2105
2152 if (ss == failed_ss) 2106 if (ss == failed_ss)
2153 break; 2107 break;
2154 if (ss->cancel_attach) 2108 if (ss->cancel_attach)
2155 ss->cancel_attach(cgrp, &tset); 2109 ss->cancel_attach(css, &tset);
2156 } 2110 }
2157 } 2111 }
2158out_free_group_list: 2112out_free_group_list:
@@ -2253,9 +2207,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2253 2207
2254 mutex_lock(&cgroup_mutex); 2208 mutex_lock(&cgroup_mutex);
2255 for_each_active_root(root) { 2209 for_each_active_root(root) {
2256 struct cgroup *from_cg = task_cgroup_from_root(from, root); 2210 struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
2257 2211
2258 retval = cgroup_attach_task(from_cg, tsk, false); 2212 retval = cgroup_attach_task(from_cgrp, tsk, false);
2259 if (retval) 2213 if (retval)
2260 break; 2214 break;
2261 } 2215 }
@@ -2265,34 +2219,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2265} 2219}
2266EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2220EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2267 2221
2268static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2222static int cgroup_tasks_write(struct cgroup_subsys_state *css,
2223 struct cftype *cft, u64 pid)
2269{ 2224{
2270 return attach_task_by_pid(cgrp, pid, false); 2225 return attach_task_by_pid(css->cgroup, pid, false);
2271} 2226}
2272 2227
2273static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2228static int cgroup_procs_write(struct cgroup_subsys_state *css,
2229 struct cftype *cft, u64 tgid)
2274{ 2230{
2275 return attach_task_by_pid(cgrp, tgid, true); 2231 return attach_task_by_pid(css->cgroup, tgid, true);
2276} 2232}
2277 2233
2278static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2234static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2279 const char *buffer) 2235 struct cftype *cft, const char *buffer)
2280{ 2236{
2281 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 2237 BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX);
2282 if (strlen(buffer) >= PATH_MAX) 2238 if (strlen(buffer) >= PATH_MAX)
2283 return -EINVAL; 2239 return -EINVAL;
2284 if (!cgroup_lock_live_group(cgrp)) 2240 if (!cgroup_lock_live_group(css->cgroup))
2285 return -ENODEV; 2241 return -ENODEV;
2286 mutex_lock(&cgroup_root_mutex); 2242 mutex_lock(&cgroup_root_mutex);
2287 strcpy(cgrp->root->release_agent_path, buffer); 2243 strcpy(css->cgroup->root->release_agent_path, buffer);
2288 mutex_unlock(&cgroup_root_mutex); 2244 mutex_unlock(&cgroup_root_mutex);
2289 mutex_unlock(&cgroup_mutex); 2245 mutex_unlock(&cgroup_mutex);
2290 return 0; 2246 return 0;
2291} 2247}
2292 2248
2293static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, 2249static int cgroup_release_agent_show(struct cgroup_subsys_state *css,
2294 struct seq_file *seq) 2250 struct cftype *cft, struct seq_file *seq)
2295{ 2251{
2252 struct cgroup *cgrp = css->cgroup;
2253
2296 if (!cgroup_lock_live_group(cgrp)) 2254 if (!cgroup_lock_live_group(cgrp))
2297 return -ENODEV; 2255 return -ENODEV;
2298 seq_puts(seq, cgrp->root->release_agent_path); 2256 seq_puts(seq, cgrp->root->release_agent_path);
@@ -2301,20 +2259,20 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2301 return 0; 2259 return 0;
2302} 2260}
2303 2261
2304static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, 2262static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css,
2305 struct seq_file *seq) 2263 struct cftype *cft, struct seq_file *seq)
2306{ 2264{
2307 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); 2265 seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup));
2308 return 0; 2266 return 0;
2309} 2267}
2310 2268
2311/* A buffer size big enough for numbers or short strings */ 2269/* A buffer size big enough for numbers or short strings */
2312#define CGROUP_LOCAL_BUFFER_SIZE 64 2270#define CGROUP_LOCAL_BUFFER_SIZE 64
2313 2271
2314static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, 2272static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css,
2315 struct file *file, 2273 struct cftype *cft, struct file *file,
2316 const char __user *userbuf, 2274 const char __user *userbuf, size_t nbytes,
2317 size_t nbytes, loff_t *unused_ppos) 2275 loff_t *unused_ppos)
2318{ 2276{
2319 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2277 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
2320 int retval = 0; 2278 int retval = 0;
@@ -2332,22 +2290,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
2332 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2290 u64 val = simple_strtoull(strstrip(buffer), &end, 0);
2333 if (*end) 2291 if (*end)
2334 return -EINVAL; 2292 return -EINVAL;
2335 retval = cft->write_u64(cgrp, cft, val); 2293 retval = cft->write_u64(css, cft, val);
2336 } else { 2294 } else {
2337 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2295 s64 val = simple_strtoll(strstrip(buffer), &end, 0);
2338 if (*end) 2296 if (*end)
2339 return -EINVAL; 2297 return -EINVAL;
2340 retval = cft->write_s64(cgrp, cft, val); 2298 retval = cft->write_s64(css, cft, val);
2341 } 2299 }
2342 if (!retval) 2300 if (!retval)
2343 retval = nbytes; 2301 retval = nbytes;
2344 return retval; 2302 return retval;
2345} 2303}
2346 2304
2347static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, 2305static ssize_t cgroup_write_string(struct cgroup_subsys_state *css,
2348 struct file *file, 2306 struct cftype *cft, struct file *file,
2349 const char __user *userbuf, 2307 const char __user *userbuf, size_t nbytes,
2350 size_t nbytes, loff_t *unused_ppos) 2308 loff_t *unused_ppos)
2351{ 2309{
2352 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2310 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
2353 int retval = 0; 2311 int retval = 0;
@@ -2370,7 +2328,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
2370 } 2328 }
2371 2329
2372 buffer[nbytes] = 0; /* nul-terminate */ 2330 buffer[nbytes] = 0; /* nul-terminate */
2373 retval = cft->write_string(cgrp, cft, strstrip(buffer)); 2331 retval = cft->write_string(css, cft, strstrip(buffer));
2374 if (!retval) 2332 if (!retval)
2375 retval = nbytes; 2333 retval = nbytes;
2376out: 2334out:
@@ -2380,65 +2338,60 @@ out:
2380} 2338}
2381 2339
2382static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2340static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
2383 size_t nbytes, loff_t *ppos) 2341 size_t nbytes, loff_t *ppos)
2384{ 2342{
2343 struct cfent *cfe = __d_cfe(file->f_dentry);
2385 struct cftype *cft = __d_cft(file->f_dentry); 2344 struct cftype *cft = __d_cft(file->f_dentry);
2386 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2345 struct cgroup_subsys_state *css = cfe->css;
2387 2346
2388 if (cgroup_is_dead(cgrp))
2389 return -ENODEV;
2390 if (cft->write) 2347 if (cft->write)
2391 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2348 return cft->write(css, cft, file, buf, nbytes, ppos);
2392 if (cft->write_u64 || cft->write_s64) 2349 if (cft->write_u64 || cft->write_s64)
2393 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); 2350 return cgroup_write_X64(css, cft, file, buf, nbytes, ppos);
2394 if (cft->write_string) 2351 if (cft->write_string)
2395 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); 2352 return cgroup_write_string(css, cft, file, buf, nbytes, ppos);
2396 if (cft->trigger) { 2353 if (cft->trigger) {
2397 int ret = cft->trigger(cgrp, (unsigned int)cft->private); 2354 int ret = cft->trigger(css, (unsigned int)cft->private);
2398 return ret ? ret : nbytes; 2355 return ret ? ret : nbytes;
2399 } 2356 }
2400 return -EINVAL; 2357 return -EINVAL;
2401} 2358}
2402 2359
2403static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, 2360static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css,
2404 struct file *file, 2361 struct cftype *cft, struct file *file,
2405 char __user *buf, size_t nbytes, 2362 char __user *buf, size_t nbytes, loff_t *ppos)
2406 loff_t *ppos)
2407{ 2363{
2408 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2364 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2409 u64 val = cft->read_u64(cgrp, cft); 2365 u64 val = cft->read_u64(css, cft);
2410 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 2366 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
2411 2367
2412 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2368 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2413} 2369}
2414 2370
2415static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, 2371static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css,
2416 struct file *file, 2372 struct cftype *cft, struct file *file,
2417 char __user *buf, size_t nbytes, 2373 char __user *buf, size_t nbytes, loff_t *ppos)
2418 loff_t *ppos)
2419{ 2374{
2420 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2375 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
2421 s64 val = cft->read_s64(cgrp, cft); 2376 s64 val = cft->read_s64(css, cft);
2422 int len = sprintf(tmp, "%lld\n", (long long) val); 2377 int len = sprintf(tmp, "%lld\n", (long long) val);
2423 2378
2424 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2379 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
2425} 2380}
2426 2381
2427static ssize_t cgroup_file_read(struct file *file, char __user *buf, 2382static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2428 size_t nbytes, loff_t *ppos) 2383 size_t nbytes, loff_t *ppos)
2429{ 2384{
2385 struct cfent *cfe = __d_cfe(file->f_dentry);
2430 struct cftype *cft = __d_cft(file->f_dentry); 2386 struct cftype *cft = __d_cft(file->f_dentry);
2431 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2387 struct cgroup_subsys_state *css = cfe->css;
2432
2433 if (cgroup_is_dead(cgrp))
2434 return -ENODEV;
2435 2388
2436 if (cft->read) 2389 if (cft->read)
2437 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 2390 return cft->read(css, cft, file, buf, nbytes, ppos);
2438 if (cft->read_u64) 2391 if (cft->read_u64)
2439 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); 2392 return cgroup_read_u64(css, cft, file, buf, nbytes, ppos);
2440 if (cft->read_s64) 2393 if (cft->read_s64)
2441 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); 2394 return cgroup_read_s64(css, cft, file, buf, nbytes, ppos);
2442 return -EINVAL; 2395 return -EINVAL;
2443} 2396}
2444 2397
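The dispatchers above now hand every handler a struct cgroup_subsys_state instead of a struct cgroup; a handler migrates by taking @css and reaching the cgroup through css->cgroup only where it is still needed. A minimal sketch of a read_u64/write_u64 pair under the new prototypes (the names and the stored value are invented for illustration):

static u64 demo_weight_read(struct cgroup_subsys_state *css,
			    struct cftype *cft)
{
	/* real code would derive per-controller state from @css */
	return 100;
}

static int demo_weight_write(struct cgroup_subsys_state *css,
			     struct cftype *cft, u64 val)
{
	if (!val || val > 1000)
		return -EINVAL;
	/* store @val in the state derived from @css */
	return 0;
}
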
@@ -2447,11 +2400,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
2447 * supports string->u64 maps, but can be extended in future. 2400 * supports string->u64 maps, but can be extended in future.
2448 */ 2401 */
2449 2402
2450struct cgroup_seqfile_state {
2451 struct cftype *cft;
2452 struct cgroup *cgroup;
2453};
2454
2455static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) 2403static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2456{ 2404{
2457 struct seq_file *sf = cb->state; 2405 struct seq_file *sf = cb->state;
@@ -2460,69 +2408,86 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
2460 2408
2461static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2409static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2462{ 2410{
2463 struct cgroup_seqfile_state *state = m->private; 2411 struct cfent *cfe = m->private;
2464 struct cftype *cft = state->cft; 2412 struct cftype *cft = cfe->type;
2413 struct cgroup_subsys_state *css = cfe->css;
2414
2465 if (cft->read_map) { 2415 if (cft->read_map) {
2466 struct cgroup_map_cb cb = { 2416 struct cgroup_map_cb cb = {
2467 .fill = cgroup_map_add, 2417 .fill = cgroup_map_add,
2468 .state = m, 2418 .state = m,
2469 }; 2419 };
2470 return cft->read_map(state->cgroup, cft, &cb); 2420 return cft->read_map(css, cft, &cb);
2471 } 2421 }
2472 return cft->read_seq_string(state->cgroup, cft, m); 2422 return cft->read_seq_string(css, cft, m);
2473}
2474
2475static int cgroup_seqfile_release(struct inode *inode, struct file *file)
2476{
2477 struct seq_file *seq = file->private_data;
2478 kfree(seq->private);
2479 return single_release(inode, file);
2480} 2423}
2481 2424
2482static const struct file_operations cgroup_seqfile_operations = { 2425static const struct file_operations cgroup_seqfile_operations = {
2483 .read = seq_read, 2426 .read = seq_read,
2484 .write = cgroup_file_write, 2427 .write = cgroup_file_write,
2485 .llseek = seq_lseek, 2428 .llseek = seq_lseek,
2486 .release = cgroup_seqfile_release, 2429 .release = single_release,
2487}; 2430};
2488 2431
2489static int cgroup_file_open(struct inode *inode, struct file *file) 2432static int cgroup_file_open(struct inode *inode, struct file *file)
2490{ 2433{
2434 struct cfent *cfe = __d_cfe(file->f_dentry);
2435 struct cftype *cft = __d_cft(file->f_dentry);
2436 struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);
2437 struct cgroup_subsys_state *css;
2491 int err; 2438 int err;
2492 struct cftype *cft;
2493 2439
2494 err = generic_file_open(inode, file); 2440 err = generic_file_open(inode, file);
2495 if (err) 2441 if (err)
2496 return err; 2442 return err;
2497 cft = __d_cft(file->f_dentry);
2498 2443
2499 if (cft->read_map || cft->read_seq_string) { 2444 /*
2500 struct cgroup_seqfile_state *state; 2445 * If the file belongs to a subsystem, pin the css. Will be
2446 * unpinned either on open failure or release. This ensures that
2447 * @css stays alive for all file operations.
2448 */
2449 rcu_read_lock();
2450 css = cgroup_css(cgrp, cft->ss);
2451 if (cft->ss && !css_tryget(css))
2452 css = NULL;
2453 rcu_read_unlock();
2501 2454
2502 state = kzalloc(sizeof(*state), GFP_USER); 2455 if (!css)
2503 if (!state) 2456 return -ENODEV;
2504 return -ENOMEM;
2505 2457
2506 state->cft = cft; 2458 /*
2507 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 2459 * @cfe->css is used by read/write/close to determine the
2460 * associated css. @file->private_data would be a better place but
2461 * that's already used by seqfile. Multiple accessors may use it
2462 * simultaneously which is okay as the association never changes.
2463 */
2464 WARN_ON_ONCE(cfe->css && cfe->css != css);
2465 cfe->css = css;
2466
2467 if (cft->read_map || cft->read_seq_string) {
2508 file->f_op = &cgroup_seqfile_operations; 2468 file->f_op = &cgroup_seqfile_operations;
2509 err = single_open(file, cgroup_seqfile_show, state); 2469 err = single_open(file, cgroup_seqfile_show, cfe);
2510 if (err < 0) 2470 } else if (cft->open) {
2511 kfree(state);
2512 } else if (cft->open)
2513 err = cft->open(inode, file); 2471 err = cft->open(inode, file);
2514 else 2472 }
2515 err = 0;
2516 2473
2474 if (css->ss && err)
2475 css_put(css);
2517 return err; 2476 return err;
2518} 2477}
2519 2478
2520static int cgroup_file_release(struct inode *inode, struct file *file) 2479static int cgroup_file_release(struct inode *inode, struct file *file)
2521{ 2480{
2481 struct cfent *cfe = __d_cfe(file->f_dentry);
2522 struct cftype *cft = __d_cft(file->f_dentry); 2482 struct cftype *cft = __d_cft(file->f_dentry);
2483 struct cgroup_subsys_state *css = cfe->css;
2484 int ret = 0;
2485
2523 if (cft->release) 2486 if (cft->release)
2524 return cft->release(inode, file); 2487 ret = cft->release(inode, file);
2525 return 0; 2488 if (css->ss)
2489 css_put(css);
2490 return ret;
2526} 2491}
2527 2492
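cgroup_file_open() and cgroup_file_release() now bracket every file operation with a css reference instead of leaning on the dentry. The pin-under-RCU idiom they rely on is shown standalone below as a sketch; the wrapper name is invented, css_tryget()/css_put() are the real primitives:

/* Returns a pinned css or NULL; a non-NULL result is paired with css_put(). */
static struct cgroup_subsys_state *
demo_pin_css(struct cgroup_subsys_state *css)
{
	rcu_read_lock();
	if (css && !css_tryget(css))
		css = NULL;	/* the css is already being killed */
	rcu_read_unlock();
	return css;
}
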
2528/* 2493/*
@@ -2736,8 +2701,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
2736 return mode; 2701 return mode;
2737} 2702}
2738 2703
2739static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2704static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2740 struct cftype *cft)
2741{ 2705{
2742 struct dentry *dir = cgrp->dentry; 2706 struct dentry *dir = cgrp->dentry;
2743 struct cgroup *parent = __d_cgrp(dir); 2707 struct cgroup *parent = __d_cgrp(dir);
@@ -2747,8 +2711,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2747 umode_t mode; 2711 umode_t mode;
2748 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2712 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2749 2713
2750 if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { 2714 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
2751 strcpy(name, subsys->name); 2715 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2716 strcpy(name, cft->ss->name);
2752 strcat(name, "."); 2717 strcat(name, ".");
2753 } 2718 }
2754 strcat(name, cft->name); 2719 strcat(name, cft->name);
@@ -2782,11 +2747,25 @@ out:
2782 return error; 2747 return error;
2783} 2748}
2784 2749
2785static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2750/**
2786 struct cftype cfts[], bool is_add) 2751 * cgroup_addrm_files - add or remove files to a cgroup directory
2752 * @cgrp: the target cgroup
2753 * @cfts: array of cftypes to be added
2754 * @is_add: whether to add or remove
2755 *
2756 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2757 * For removals, this function never fails. If addition fails, this
2758 * function doesn't remove files already added. The caller is responsible
2759 * for cleaning up.
2760 */
2761static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2762 bool is_add)
2787{ 2763{
2788 struct cftype *cft; 2764 struct cftype *cft;
2789 int err, ret = 0; 2765 int ret;
2766
2767 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
2768 lockdep_assert_held(&cgroup_mutex);
2790 2769
2791 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2770 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2792 /* does cft->flags tell us to skip this file on @cgrp? */ 2771 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2798,16 +2777,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2798 continue; 2777 continue;
2799 2778
2800 if (is_add) { 2779 if (is_add) {
2801 err = cgroup_add_file(cgrp, subsys, cft); 2780 ret = cgroup_add_file(cgrp, cft);
2802 if (err) 2781 if (ret) {
2803 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2782 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2804 cft->name, err); 2783 cft->name, ret);
2805 ret = err; 2784 return ret;
2785 }
2806 } else { 2786 } else {
2807 cgroup_rm_file(cgrp, cft); 2787 cgroup_rm_file(cgrp, cft);
2808 } 2788 }
2809 } 2789 }
2810 return ret; 2790 return 0;
2811} 2791}
2812 2792
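Because addition stops at the first failure without rolling back, and removal never fails, a caller wanting all-or-nothing behaviour simply removes the whole array on error. A sketch of that pattern, assuming the caller already holds the i_mutex and cgroup_mutex asserted above:

	ret = cgroup_addrm_files(cgrp, cfts, true);
	if (ret)
		/* partial additions are not rolled back automatically */
		cgroup_addrm_files(cgrp, cfts, false);
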
2813static void cgroup_cfts_prepare(void) 2793static void cgroup_cfts_prepare(void)
@@ -2816,28 +2796,30 @@ static void cgroup_cfts_prepare(void)
2816 /* 2796 /*
2817 * Thanks to the entanglement with vfs inode locking, we can't walk 2797 * Thanks to the entanglement with vfs inode locking, we can't walk
2818 * the existing cgroups under cgroup_mutex and create files. 2798 * the existing cgroups under cgroup_mutex and create files.
2819 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU 2799 * Instead, we use css_for_each_descendant_pre() and drop RCU read
2820 * read lock before calling cgroup_addrm_files(). 2800 * lock before calling cgroup_addrm_files().
2821 */ 2801 */
2822 mutex_lock(&cgroup_mutex); 2802 mutex_lock(&cgroup_mutex);
2823} 2803}
2824 2804
2825static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2805static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
2826 struct cftype *cfts, bool is_add)
2827 __releases(&cgroup_mutex) 2806 __releases(&cgroup_mutex)
2828{ 2807{
2829 LIST_HEAD(pending); 2808 LIST_HEAD(pending);
2830 struct cgroup *cgrp, *root = &ss->root->top_cgroup; 2809 struct cgroup_subsys *ss = cfts[0].ss;
2810 struct cgroup *root = &ss->root->top_cgroup;
2831 struct super_block *sb = ss->root->sb; 2811 struct super_block *sb = ss->root->sb;
2832 struct dentry *prev = NULL; 2812 struct dentry *prev = NULL;
2833 struct inode *inode; 2813 struct inode *inode;
2814 struct cgroup_subsys_state *css;
2834 u64 update_before; 2815 u64 update_before;
2816 int ret = 0;
2835 2817
2836 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2818 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
2837 if (!cfts || ss->root == &cgroup_dummy_root || 2819 if (!cfts || ss->root == &cgroup_dummy_root ||
2838 !atomic_inc_not_zero(&sb->s_active)) { 2820 !atomic_inc_not_zero(&sb->s_active)) {
2839 mutex_unlock(&cgroup_mutex); 2821 mutex_unlock(&cgroup_mutex);
2840 return; 2822 return 0;
2841 } 2823 }
2842 2824
2843 /* 2825 /*
@@ -2849,17 +2831,11 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2849 2831
2850 mutex_unlock(&cgroup_mutex); 2832 mutex_unlock(&cgroup_mutex);
2851 2833
2852 /* @root always needs to be updated */
2853 inode = root->dentry->d_inode;
2854 mutex_lock(&inode->i_mutex);
2855 mutex_lock(&cgroup_mutex);
2856 cgroup_addrm_files(root, ss, cfts, is_add);
2857 mutex_unlock(&cgroup_mutex);
2858 mutex_unlock(&inode->i_mutex);
2859
2860 /* add/rm files for all cgroups created before */ 2834 /* add/rm files for all cgroups created before */
2861 rcu_read_lock(); 2835 rcu_read_lock();
2862 cgroup_for_each_descendant_pre(cgrp, root) { 2836 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2837 struct cgroup *cgrp = css->cgroup;
2838
2863 if (cgroup_is_dead(cgrp)) 2839 if (cgroup_is_dead(cgrp))
2864 continue; 2840 continue;
2865 2841
@@ -2873,15 +2849,18 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2873 mutex_lock(&inode->i_mutex); 2849 mutex_lock(&inode->i_mutex);
2874 mutex_lock(&cgroup_mutex); 2850 mutex_lock(&cgroup_mutex);
2875 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) 2851 if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
2876 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2852 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2877 mutex_unlock(&cgroup_mutex); 2853 mutex_unlock(&cgroup_mutex);
2878 mutex_unlock(&inode->i_mutex); 2854 mutex_unlock(&inode->i_mutex);
2879 2855
2880 rcu_read_lock(); 2856 rcu_read_lock();
2857 if (ret)
2858 break;
2881 } 2859 }
2882 rcu_read_unlock(); 2860 rcu_read_unlock();
2883 dput(prev); 2861 dput(prev);
2884 deactivate_super(sb); 2862 deactivate_super(sb);
2863 return ret;
2885} 2864}
2886 2865
2887/** 2866/**
@@ -2901,49 +2880,56 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
2901int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2880int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2902{ 2881{
2903 struct cftype_set *set; 2882 struct cftype_set *set;
2883 struct cftype *cft;
2884 int ret;
2904 2885
2905 set = kzalloc(sizeof(*set), GFP_KERNEL); 2886 set = kzalloc(sizeof(*set), GFP_KERNEL);
2906 if (!set) 2887 if (!set)
2907 return -ENOMEM; 2888 return -ENOMEM;
2908 2889
2890 for (cft = cfts; cft->name[0] != '\0'; cft++)
2891 cft->ss = ss;
2892
2909 cgroup_cfts_prepare(); 2893 cgroup_cfts_prepare();
2910 set->cfts = cfts; 2894 set->cfts = cfts;
2911 list_add_tail(&set->node, &ss->cftsets); 2895 list_add_tail(&set->node, &ss->cftsets);
2912 cgroup_cfts_commit(ss, cfts, true); 2896 ret = cgroup_cfts_commit(cfts, true);
2913 2897 if (ret)
2914 return 0; 2898 cgroup_rm_cftypes(cfts);
2899 return ret;
2915} 2900}
2916EXPORT_SYMBOL_GPL(cgroup_add_cftypes); 2901EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
2917 2902
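With ->ss recorded in every cftype at registration time, a controller only describes its files and hands the array over. A hedged sketch of a minimal registration, reusing the hypothetical handlers from the earlier sketch (demo_subsys stands in for a real struct cgroup_subsys):

static struct cftype demo_files[] = {
	{
		.name = "weight",
		.read_u64 = demo_weight_read,
		.write_u64 = demo_weight_write,
	},
	{ }	/* zero-length name terminates the array */
};

static int __init demo_cgroup_init(void)
{
	return cgroup_add_cftypes(&demo_subsys, demo_files);
}

The matching teardown is simply cgroup_rm_cftypes(demo_files); the subsystem argument is gone because cfts[0].ss now identifies it.
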
2918/** 2903/**
2919 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2904 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2920 * @ss: target cgroup subsystem
2921 * @cfts: zero-length name terminated array of cftypes 2905 * @cfts: zero-length name terminated array of cftypes
2922 * 2906 *
2923 * Unregister @cfts from @ss. Files described by @cfts are removed from 2907 * Unregister @cfts. Files described by @cfts are removed from all
2924 * all existing cgroups to which @ss is attached and all future cgroups 2908 * existing cgroups and all future cgroups won't have them either. This
2925 * won't have them either. This function can be called anytime whether @ss 2909 * function can be called anytime whether @cfts' subsys is attached or not.
2926 * is attached or not.
2927 * 2910 *
2928 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2911 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2929 * registered with @ss. 2912 * registered.
2930 */ 2913 */
2931int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2914int cgroup_rm_cftypes(struct cftype *cfts)
2932{ 2915{
2933 struct cftype_set *set; 2916 struct cftype_set *set;
2934 2917
2918 if (!cfts || !cfts[0].ss)
2919 return -ENOENT;
2920
2935 cgroup_cfts_prepare(); 2921 cgroup_cfts_prepare();
2936 2922
2937 list_for_each_entry(set, &ss->cftsets, node) { 2923 list_for_each_entry(set, &cfts[0].ss->cftsets, node) {
2938 if (set->cfts == cfts) { 2924 if (set->cfts == cfts) {
2939 list_del(&set->node); 2925 list_del(&set->node);
2940 kfree(set); 2926 kfree(set);
2941 cgroup_cfts_commit(ss, cfts, false); 2927 cgroup_cfts_commit(cfts, false);
2942 return 0; 2928 return 0;
2943 } 2929 }
2944 } 2930 }
2945 2931
2946 cgroup_cfts_commit(ss, NULL, false); 2932 cgroup_cfts_commit(NULL, false);
2947 return -ENOENT; 2933 return -ENOENT;
2948} 2934}
2949 2935
@@ -2966,34 +2952,10 @@ int cgroup_task_count(const struct cgroup *cgrp)
2966} 2952}
2967 2953
2968/* 2954/*
2969 * Advance a list_head iterator. The iterator should be positioned at 2955 * To reduce the fork() overhead for systems that are not actually using
2970 * the start of a css_set 2956 * their cgroups capability, we don't maintain the lists running through
2971 */ 2957 * each css_set to its tasks until we see the list actually used - in other
2972static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) 2958 * words after the first call to css_task_iter_start().
2973{
2974 struct list_head *l = it->cset_link;
2975 struct cgrp_cset_link *link;
2976 struct css_set *cset;
2977
2978 /* Advance to the next non-empty css_set */
2979 do {
2980 l = l->next;
2981 if (l == &cgrp->cset_links) {
2982 it->cset_link = NULL;
2983 return;
2984 }
2985 link = list_entry(l, struct cgrp_cset_link, cset_link);
2986 cset = link->cset;
2987 } while (list_empty(&cset->tasks));
2988 it->cset_link = l;
2989 it->task = cset->tasks.next;
2990}
2991
2992/*
2993 * To reduce the fork() overhead for systems that are not actually
2994 * using their cgroups capability, we don't maintain the lists running
2995 * through each css_set to its tasks until we see the list actually
2996 * used - in other words after the first call to cgroup_iter_start().
2997 */ 2959 */
2998static void cgroup_enable_task_cg_lists(void) 2960static void cgroup_enable_task_cg_lists(void)
2999{ 2961{
@@ -3024,16 +2986,21 @@ static void cgroup_enable_task_cg_lists(void)
3024} 2986}
3025 2987
3026/** 2988/**
3027 * cgroup_next_sibling - find the next sibling of a given cgroup 2989 * css_next_child - find the next child of a given css
3028 * @pos: the current cgroup 2990 * @pos_css: the current position (%NULL to initiate traversal)
2991 * @parent_css: css whose children to walk
3029 * 2992 *
3030 * This function returns the next sibling of @pos and should be called 2993 * This function returns the next child of @parent_css and should be called
3031 * under RCU read lock. The only requirement is that @pos is accessible. 2994 * under RCU read lock. The only requirement is that @parent_css and
3032 * The next sibling is guaranteed to be returned regardless of @pos's 2995 * @pos_css are accessible. The next sibling is guaranteed to be returned
3033 * state. 2996 * regardless of their states.
3034 */ 2997 */
3035struct cgroup *cgroup_next_sibling(struct cgroup *pos) 2998struct cgroup_subsys_state *
2999css_next_child(struct cgroup_subsys_state *pos_css,
3000 struct cgroup_subsys_state *parent_css)
3036{ 3001{
3002 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
3003 struct cgroup *cgrp = parent_css->cgroup;
3037 struct cgroup *next; 3004 struct cgroup *next;
3038 3005
3039 WARN_ON_ONCE(!rcu_read_lock_held()); 3006 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -3048,78 +3015,81 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos)
3048 * safe to dereference from this RCU critical section. If 3015 * safe to dereference from this RCU critical section. If
3049 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3016 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
3050 * to be visible as %true here. 3017 * to be visible as %true here.
3018 *
3019 * If @pos is dead, its next pointer can't be dereferenced;
3020 * however, as each cgroup is given a monotonically increasing
3021 * unique serial number and always appended to the sibling list,
3022 * the next one can be found by walking the parent's children until
3023 * we see a cgroup with higher serial number than @pos's. While
3024 * this path can be slower, it's taken only when either the current
3025 * cgroup is removed or iteration and removal race.
3051 */ 3026 */
3052 if (likely(!cgroup_is_dead(pos))) { 3027 if (!pos) {
3028 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
3029 } else if (likely(!cgroup_is_dead(pos))) {
3053 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3030 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3054 if (&next->sibling != &pos->parent->children) 3031 } else {
3055 return next; 3032 list_for_each_entry_rcu(next, &cgrp->children, sibling)
3056 return NULL; 3033 if (next->serial_nr > pos->serial_nr)
3034 break;
3057 } 3035 }
3058 3036
3059 /* 3037 if (&next->sibling == &cgrp->children)
3060 * Can't dereference the next pointer. Each cgroup is given a 3038 return NULL;
3061 * monotonically increasing unique serial number and always 3039
3062 * appended to the sibling list, so the next one can be found by 3040 return cgroup_css(next, parent_css->ss);
3063 * walking the parent's children until we see a cgroup with higher
3064 * serial number than @pos's.
3065 *
3066 * While this path can be slow, it's taken only when either the
3067 * current cgroup is removed or iteration and removal race.
3068 */
3069 list_for_each_entry_rcu(next, &pos->parent->children, sibling)
3070 if (next->serial_nr > pos->serial_nr)
3071 return next;
3072 return NULL;
3073} 3041}
3074EXPORT_SYMBOL_GPL(cgroup_next_sibling); 3042EXPORT_SYMBOL_GPL(css_next_child);
3075 3043
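A child walk is done under RCU and, thanks to the serial-number fallback, survives removal of the current position. A sketch using the css_for_each_child() wrapper (the loop body is illustrative only):

	struct cgroup_subsys_state *child;
	int nr_children = 0;

	rcu_read_lock();
	css_for_each_child(child, parent_css)
		nr_children++;		/* e.g. count the live children of @parent_css */
	rcu_read_unlock();
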
3076/** 3044/**
3077 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 3045 * css_next_descendant_pre - find the next descendant for pre-order walk
3078 * @pos: the current position (%NULL to initiate traversal) 3046 * @pos: the current position (%NULL to initiate traversal)
3079 * @cgroup: cgroup whose descendants to walk 3047 * @root: css whose descendants to walk
3080 * 3048 *
3081 * To be used by cgroup_for_each_descendant_pre(). Find the next 3049 * To be used by css_for_each_descendant_pre(). Find the next descendant
3082 * descendant to visit for pre-order traversal of @cgroup's descendants. 3050 * to visit for pre-order traversal of @root's descendants. @root is
3051 * included in the iteration and the first node to be visited.
3083 * 3052 *
3084 * While this function requires RCU read locking, it doesn't require the 3053 * While this function requires RCU read locking, it doesn't require the
3085 * whole traversal to be contained in a single RCU critical section. This 3054 * whole traversal to be contained in a single RCU critical section. This
3086 * function will return the correct next descendant as long as both @pos 3055 * function will return the correct next descendant as long as both @pos
3087 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3056 * and @root are accessible and @pos is a descendant of @root.
3088 */ 3057 */
3089struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 3058struct cgroup_subsys_state *
3090 struct cgroup *cgroup) 3059css_next_descendant_pre(struct cgroup_subsys_state *pos,
3060 struct cgroup_subsys_state *root)
3091{ 3061{
3092 struct cgroup *next; 3062 struct cgroup_subsys_state *next;
3093 3063
3094 WARN_ON_ONCE(!rcu_read_lock_held()); 3064 WARN_ON_ONCE(!rcu_read_lock_held());
3095 3065
3096 /* if first iteration, pretend we just visited @cgroup */ 3066 /* if first iteration, visit @root */
3097 if (!pos) 3067 if (!pos)
3098 pos = cgroup; 3068 return root;
3099 3069
3100 /* visit the first child if exists */ 3070 /* visit the first child if exists */
3101 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); 3071 next = css_next_child(NULL, pos);
3102 if (next) 3072 if (next)
3103 return next; 3073 return next;
3104 3074
3105 /* no child, visit my or the closest ancestor's next sibling */ 3075 /* no child, visit my or the closest ancestor's next sibling */
3106 while (pos != cgroup) { 3076 while (pos != root) {
3107 next = cgroup_next_sibling(pos); 3077 next = css_next_child(pos, css_parent(pos));
3108 if (next) 3078 if (next)
3109 return next; 3079 return next;
3110 pos = pos->parent; 3080 pos = css_parent(pos);
3111 } 3081 }
3112 3082
3113 return NULL; 3083 return NULL;
3114} 3084}
3115EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3085EXPORT_SYMBOL_GPL(css_next_descendant_pre);
3116 3086
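One behavioural change to note: unlike the old cgroup iterator, @root itself is now the first node visited. A sketch of a pre-order walk that still wants descendants only:

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root_css) {
		if (pos == root_css)
			continue;	/* skip @root_css if only true descendants matter */
		/* visit @pos */
	}
	rcu_read_unlock();
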
3117/** 3087/**
3118 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup 3088 * css_rightmost_descendant - return the rightmost descendant of a css
3119 * @pos: cgroup of interest 3089 * @pos: css of interest
3120 * 3090 *
3121 * Return the rightmost descendant of @pos. If there's no descendant, 3091 * Return the rightmost descendant of @pos. If there's no descendant, @pos
3122 * @pos is returned. This can be used during pre-order traversal to skip 3092 * is returned. This can be used during pre-order traversal to skip
3123 * subtree of @pos. 3093 * subtree of @pos.
3124 * 3094 *
3125 * While this function requires RCU read locking, it doesn't require the 3095 * While this function requires RCU read locking, it doesn't require the
@@ -3127,9 +3097,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3127 * function will return the correct rightmost descendant as long as @pos is 3097 * function will return the correct rightmost descendant as long as @pos is
3128 * accessible. 3098 * accessible.
3129 */ 3099 */
3130struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3100struct cgroup_subsys_state *
3101css_rightmost_descendant(struct cgroup_subsys_state *pos)
3131{ 3102{
3132 struct cgroup *last, *tmp; 3103 struct cgroup_subsys_state *last, *tmp;
3133 3104
3134 WARN_ON_ONCE(!rcu_read_lock_held()); 3105 WARN_ON_ONCE(!rcu_read_lock_held());
3135 3106
@@ -3137,82 +3108,136 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3137 last = pos; 3108 last = pos;
3138 /* ->prev isn't RCU safe, walk ->next till the end */ 3109 /* ->prev isn't RCU safe, walk ->next till the end */
3139 pos = NULL; 3110 pos = NULL;
3140 list_for_each_entry_rcu(tmp, &last->children, sibling) 3111 css_for_each_child(tmp, last)
3141 pos = tmp; 3112 pos = tmp;
3142 } while (pos); 3113 } while (pos);
3143 3114
3144 return last; 3115 return last;
3145} 3116}
3146EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); 3117EXPORT_SYMBOL_GPL(css_rightmost_descendant);
3147 3118
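During a pre-order walk the usual way to prune a subtree is to jump to its rightmost descendant before asking for the next node; the iterator then resumes at the next sibling. A sketch (the pruning predicate is invented):

	struct cgroup_subsys_state *pos = NULL;

	rcu_read_lock();
	while ((pos = css_next_descendant_pre(pos, root_css))) {
		if (!demo_subtree_wanted(pos)) {
			/* skip @pos and everything below it */
			pos = css_rightmost_descendant(pos);
			continue;
		}
		/* visit @pos */
	}
	rcu_read_unlock();
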
3148static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3119static struct cgroup_subsys_state *
3120css_leftmost_descendant(struct cgroup_subsys_state *pos)
3149{ 3121{
3150 struct cgroup *last; 3122 struct cgroup_subsys_state *last;
3151 3123
3152 do { 3124 do {
3153 last = pos; 3125 last = pos;
3154 pos = list_first_or_null_rcu(&pos->children, struct cgroup, 3126 pos = css_next_child(NULL, pos);
3155 sibling);
3156 } while (pos); 3127 } while (pos);
3157 3128
3158 return last; 3129 return last;
3159} 3130}
3160 3131
3161/** 3132/**
3162 * cgroup_next_descendant_post - find the next descendant for post-order walk 3133 * css_next_descendant_post - find the next descendant for post-order walk
3163 * @pos: the current position (%NULL to initiate traversal) 3134 * @pos: the current position (%NULL to initiate traversal)
3164 * @cgroup: cgroup whose descendants to walk 3135 * @root: css whose descendants to walk
3165 * 3136 *
3166 * To be used by cgroup_for_each_descendant_post(). Find the next 3137 * To be used by css_for_each_descendant_post(). Find the next descendant
3167 * descendant to visit for post-order traversal of @cgroup's descendants. 3138 * to visit for post-order traversal of @root's descendants. @root is
3139 * included in the iteration and the last node to be visited.
3168 * 3140 *
3169 * While this function requires RCU read locking, it doesn't require the 3141 * While this function requires RCU read locking, it doesn't require the
3170 * whole traversal to be contained in a single RCU critical section. This 3142 * whole traversal to be contained in a single RCU critical section. This
3171 * function will return the correct next descendant as long as both @pos 3143 * function will return the correct next descendant as long as both @pos
3172 * and @cgroup are accessible and @pos is a descendant of @cgroup. 3144 * and @root are accessible and @pos is a descendant of @root.
3173 */ 3145 */
3174struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3146struct cgroup_subsys_state *
3175 struct cgroup *cgroup) 3147css_next_descendant_post(struct cgroup_subsys_state *pos,
3148 struct cgroup_subsys_state *root)
3176{ 3149{
3177 struct cgroup *next; 3150 struct cgroup_subsys_state *next;
3178 3151
3179 WARN_ON_ONCE(!rcu_read_lock_held()); 3152 WARN_ON_ONCE(!rcu_read_lock_held());
3180 3153
3181 /* if first iteration, visit the leftmost descendant */ 3154 /* if first iteration, visit leftmost descendant which may be @root */
3182 if (!pos) { 3155 if (!pos)
3183 next = cgroup_leftmost_descendant(cgroup); 3156 return css_leftmost_descendant(root);
3184 return next != cgroup ? next : NULL; 3157
3185 } 3158 /* if we visited @root, we're done */
3159 if (pos == root)
3160 return NULL;
3186 3161
3187 /* if there's an unvisited sibling, visit its leftmost descendant */ 3162 /* if there's an unvisited sibling, visit its leftmost descendant */
3188 next = cgroup_next_sibling(pos); 3163 next = css_next_child(pos, css_parent(pos));
3189 if (next) 3164 if (next)
3190 return cgroup_leftmost_descendant(next); 3165 return css_leftmost_descendant(next);
3191 3166
3192 /* no sibling left, visit parent */ 3167 /* no sibling left, visit parent */
3193 next = pos->parent; 3168 return css_parent(pos);
3194 return next != cgroup ? next : NULL; 3169}
3170EXPORT_SYMBOL_GPL(css_next_descendant_post);
3171
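Post-order is the natural shape for teardown-style work where children must be handled before their parents; @root is again part of the walk and comes last. A sketch:

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_post(pos, root_css) {
		/* every child of @pos has already been visited; @root_css comes last */
	}
	rcu_read_unlock();
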
3172/**
3173 * css_advance_task_iter - advance a task iterator to the next css_set
3174 * @it: the iterator to advance
3175 *
3176 * Advance @it to the next css_set to walk.
3177 */
3178static void css_advance_task_iter(struct css_task_iter *it)
3179{
3180 struct list_head *l = it->cset_link;
3181 struct cgrp_cset_link *link;
3182 struct css_set *cset;
3183
3184 /* Advance to the next non-empty css_set */
3185 do {
3186 l = l->next;
3187 if (l == &it->origin_css->cgroup->cset_links) {
3188 it->cset_link = NULL;
3189 return;
3190 }
3191 link = list_entry(l, struct cgrp_cset_link, cset_link);
3192 cset = link->cset;
3193 } while (list_empty(&cset->tasks));
3194 it->cset_link = l;
3195 it->task = cset->tasks.next;
3195} 3196}
3196EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
3197 3197
3198void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3198/**
3199 * css_task_iter_start - initiate task iteration
3200 * @css: the css to walk tasks of
3201 * @it: the task iterator to use
3202 *
3203 * Initiate iteration through the tasks of @css. The caller can call
3204 * css_task_iter_next() to walk through the tasks until the function
3205 * returns NULL. On completion of iteration, css_task_iter_end() must be
3206 * called.
3207 *
3208 * Note that this function acquires a lock which is released when the
3209 * iteration finishes. The caller can't sleep while iteration is in
3210 * progress.
3211 */
3212void css_task_iter_start(struct cgroup_subsys_state *css,
3213 struct css_task_iter *it)
3199 __acquires(css_set_lock) 3214 __acquires(css_set_lock)
3200{ 3215{
3201 /* 3216 /*
3202 * The first time anyone tries to iterate across a cgroup, 3217 * The first time anyone tries to iterate across a css, we need to
3203 * we need to enable the list linking each css_set to its 3218 * enable the list linking each css_set to its tasks, and fix up
3204 * tasks, and fix up all existing tasks. 3219 * all existing tasks.
3205 */ 3220 */
3206 if (!use_task_css_set_links) 3221 if (!use_task_css_set_links)
3207 cgroup_enable_task_cg_lists(); 3222 cgroup_enable_task_cg_lists();
3208 3223
3209 read_lock(&css_set_lock); 3224 read_lock(&css_set_lock);
3210 it->cset_link = &cgrp->cset_links; 3225
3211 cgroup_advance_iter(cgrp, it); 3226 it->origin_css = css;
3227 it->cset_link = &css->cgroup->cset_links;
3228
3229 css_advance_task_iter(it);
3212} 3230}
3213 3231
3214struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 3232/**
3215 struct cgroup_iter *it) 3233 * css_task_iter_next - return the next task for the iterator
3234 * @it: the task iterator being iterated
3235 *
3236 * The "next" function for task iteration. @it should have been
3237 * initialized via css_task_iter_start(). Returns NULL when the iteration
3238 * reaches the end.
3239 */
3240struct task_struct *css_task_iter_next(struct css_task_iter *it)
3216{ 3241{
3217 struct task_struct *res; 3242 struct task_struct *res;
3218 struct list_head *l = it->task; 3243 struct list_head *l = it->task;
@@ -3226,16 +3251,24 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
3226 l = l->next; 3251 l = l->next;
3227 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); 3252 link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
3228 if (l == &link->cset->tasks) { 3253 if (l == &link->cset->tasks) {
3229 /* We reached the end of this task list - move on to 3254 /*
3230 * the next cg_cgroup_link */ 3255 * We reached the end of this task list - move on to the
3231 cgroup_advance_iter(cgrp, it); 3256 * next cgrp_cset_link.
3257 */
3258 css_advance_task_iter(it);
3232 } else { 3259 } else {
3233 it->task = l; 3260 it->task = l;
3234 } 3261 }
3235 return res; 3262 return res;
3236} 3263}
3237 3264
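Putting the three calls together, walking every task attached to a css is a plain start/next/end loop; css_set_lock stays read-held for the duration, so the body must not sleep. A sketch:

	struct css_task_iter it;
	struct task_struct *task;
	int nr_tasks = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		nr_tasks++;		/* must not sleep inside this loop */
	css_task_iter_end(&it);
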
3238void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 3265/**
3266 * css_task_iter_end - finish task iteration
3267 * @it: the task iterator to finish
3268 *
3269 * Finish task iteration started by css_task_iter_start().
3270 */
3271void css_task_iter_end(struct css_task_iter *it)
3239 __releases(css_set_lock) 3272 __releases(css_set_lock)
3240{ 3273{
3241 read_unlock(&css_set_lock); 3274 read_unlock(&css_set_lock);
@@ -3276,46 +3309,49 @@ static inline int started_after(void *p1, void *p2)
3276} 3309}
3277 3310
3278/** 3311/**
3279 * cgroup_scan_tasks - iterate through all the tasks in a cgroup 3312 * css_scan_tasks - iterate through all the tasks in a css
3280 * @scan: struct cgroup_scanner containing arguments for the scan 3313 * @css: the css to iterate tasks of
3314 * @test: optional test callback
3315 * @process: process callback
3316 * @data: data passed to @test and @process
3317 * @heap: optional pre-allocated heap used for task iteration
3318 *
3319 * Iterate through all the tasks in @css, calling @test for each, and if it
3320 * returns %true, call @process for it also.
3281 * 3321 *
3282 * Arguments include pointers to callback functions test_task() and 3322 * @test may be NULL, meaning always true (select all tasks), which
3283 * process_task(). 3323 * effectively duplicates css_task_iter_{start,next,end}() but does not
3284 * Iterate through all the tasks in a cgroup, calling test_task() for each, 3324 * lock css_set_lock for the call to @process.
3285 * and if it returns true, call process_task() for it also.
3286 * The test_task pointer may be NULL, meaning always true (select all tasks).
3287 * Effectively duplicates cgroup_iter_{start,next,end}()
3288 * but does not lock css_set_lock for the call to process_task().
3289 * The struct cgroup_scanner may be embedded in any structure of the caller's
3290 * creation.
3291 * It is guaranteed that process_task() will act on every task that
3292 * is a member of the cgroup for the duration of this call. This
3293 * function may or may not call process_task() for tasks that exit
3294 * or move to a different cgroup during the call, or are forked or
3295 * move into the cgroup during the call.
3296 * 3325 *
3297 * Note that test_task() may be called with locks held, and may in some 3326 * It is guaranteed that @process will act on every task that is a member
3298 * situations be called multiple times for the same task, so it should 3327 * of @css for the duration of this call. This function may or may not
3299 * be cheap. 3328 * call @process for tasks that exit or move to a different css during the
3300 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 3329 * call, or are forked or move into the css during the call.
3301 * pre-allocated and will be used for heap operations (and its "gt" member will 3330 *
3302 * be overwritten), else a temporary heap will be used (allocation of which 3331 * Note that @test may be called with locks held, and may in some
3303 * may cause this function to fail). 3332 * situations be called multiple times for the same task, so it should be
3333 * cheap.
3334 *
3335 * If @heap is non-NULL, a heap has been pre-allocated and will be used for
3336 * heap operations (and its "gt" member will be overwritten), else a
3337 * temporary heap will be used (allocation of which may cause this function
3338 * to fail).
3304 */ 3339 */
3305int cgroup_scan_tasks(struct cgroup_scanner *scan) 3340int css_scan_tasks(struct cgroup_subsys_state *css,
3341 bool (*test)(struct task_struct *, void *),
3342 void (*process)(struct task_struct *, void *),
3343 void *data, struct ptr_heap *heap)
3306{ 3344{
3307 int retval, i; 3345 int retval, i;
3308 struct cgroup_iter it; 3346 struct css_task_iter it;
3309 struct task_struct *p, *dropped; 3347 struct task_struct *p, *dropped;
3310 /* Never dereference latest_task, since it's not refcounted */ 3348 /* Never dereference latest_task, since it's not refcounted */
3311 struct task_struct *latest_task = NULL; 3349 struct task_struct *latest_task = NULL;
3312 struct ptr_heap tmp_heap; 3350 struct ptr_heap tmp_heap;
3313 struct ptr_heap *heap;
3314 struct timespec latest_time = { 0, 0 }; 3351 struct timespec latest_time = { 0, 0 };
3315 3352
3316 if (scan->heap) { 3353 if (heap) {
3317 /* The caller supplied our heap and pre-allocated its memory */ 3354 /* The caller supplied our heap and pre-allocated its memory */
3318 heap = scan->heap;
3319 heap->gt = &started_after; 3355 heap->gt = &started_after;
3320 } else { 3356 } else {
3321 /* We need to allocate our own heap memory */ 3357 /* We need to allocate our own heap memory */
@@ -3328,25 +3364,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3328 3364
3329 again: 3365 again:
3330 /* 3366 /*
3331 * Scan tasks in the cgroup, using the scanner's "test_task" callback 3367 * Scan tasks in the css, using the @test callback to determine
3332 * to determine which are of interest, and using the scanner's 3368 * which are of interest, and invoking @process callback on the
3333 * "process_task" callback to process any of them that need an update. 3369 * ones which need an update. Since we don't want to hold any
3334 * Since we don't want to hold any locks during the task updates, 3370 * locks during the task updates, gather tasks to be processed in a
3335 * gather tasks to be processed in a heap structure. 3371 * heap structure. The heap is sorted by descending task start
3336 * The heap is sorted by descending task start time. 3372 * time. If the statically-sized heap fills up, we overflow tasks
3337 * If the statically-sized heap fills up, we overflow tasks that 3373 * that started later, and in future iterations only consider tasks
3338 * started later, and in future iterations only consider tasks that 3374 * that started after the latest task in the previous pass. This
3339 * started after the latest task in the previous pass. This
3340 * guarantees forward progress and that we don't miss any tasks. 3375 * guarantees forward progress and that we don't miss any tasks.
3341 */ 3376 */
3342 heap->size = 0; 3377 heap->size = 0;
3343 cgroup_iter_start(scan->cg, &it); 3378 css_task_iter_start(css, &it);
3344 while ((p = cgroup_iter_next(scan->cg, &it))) { 3379 while ((p = css_task_iter_next(&it))) {
3345 /* 3380 /*
3346 * Only affect tasks that qualify per the caller's callback, 3381 * Only affect tasks that qualify per the caller's callback,
3347 * if one was provided 3382 * if one was provided
3348 */ 3383 */
3349 if (scan->test_task && !scan->test_task(p, scan)) 3384 if (test && !test(p, data))
3350 continue; 3385 continue;
3351 /* 3386 /*
3352 * Only process tasks that started after the last task 3387 * Only process tasks that started after the last task
@@ -3374,7 +3409,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3374 * the heap and wasn't inserted 3409 * the heap and wasn't inserted
3375 */ 3410 */
3376 } 3411 }
3377 cgroup_iter_end(scan->cg, &it); 3412 css_task_iter_end(&it);
3378 3413
3379 if (heap->size) { 3414 if (heap->size) {
3380 for (i = 0; i < heap->size; i++) { 3415 for (i = 0; i < heap->size; i++) {
@@ -3384,7 +3419,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3384 latest_task = q; 3419 latest_task = q;
3385 } 3420 }
3386 /* Process the task per the caller's callback */ 3421 /* Process the task per the caller's callback */
3387 scan->process_task(q, scan); 3422 process(q, data);
3388 put_task_struct(q); 3423 put_task_struct(q);
3389 } 3424 }
3390 /* 3425 /*
@@ -3401,10 +3436,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3401 return 0; 3436 return 0;
3402} 3437}
3403 3438
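cgroup_transfer_tasks() below is the simplest caller, passing a NULL @test to select every task. When filtering is wanted, the callback pair looks like this sketch (predicate and action are invented; @process runs without css_set_lock held):

static bool demo_is_userspace(struct task_struct *task, void *data)
{
	return !(task->flags & PF_KTHREAD);	/* skip kernel threads */
}

static void demo_count_task(struct task_struct *task, void *data)
{
	atomic_inc((atomic_t *)data);
}

	/* ... */
	atomic_t nr = ATOMIC_INIT(0);

	css_scan_tasks(css, demo_is_userspace, demo_count_task, &nr, NULL);
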
3404static void cgroup_transfer_one_task(struct task_struct *task, 3439static void cgroup_transfer_one_task(struct task_struct *task, void *data)
3405 struct cgroup_scanner *scan)
3406{ 3440{
3407 struct cgroup *new_cgroup = scan->data; 3441 struct cgroup *new_cgroup = data;
3408 3442
3409 mutex_lock(&cgroup_mutex); 3443 mutex_lock(&cgroup_mutex);
3410 cgroup_attach_task(new_cgroup, task, false); 3444 cgroup_attach_task(new_cgroup, task, false);
@@ -3418,15 +3452,8 @@ static void cgroup_transfer_one_task(struct task_struct *task,
3418 */ 3452 */
3419int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) 3453int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3420{ 3454{
3421 struct cgroup_scanner scan; 3455 return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
3422 3456 to, NULL);
3423 scan.cg = from;
3424 scan.test_task = NULL; /* select all tasks in cgroup */
3425 scan.process_task = cgroup_transfer_one_task;
3426 scan.heap = NULL;
3427 scan.data = to;
3428
3429 return cgroup_scan_tasks(&scan);
3430} 3457}
3431 3458
3432/* 3459/*
@@ -3468,7 +3495,7 @@ struct cgroup_pidlist {
3468 /* pointer to the cgroup we belong to, for list removal purposes */ 3495 /* pointer to the cgroup we belong to, for list removal purposes */
3469 struct cgroup *owner; 3496 struct cgroup *owner;
3470 /* protects the other fields */ 3497 /* protects the other fields */
3471 struct rw_semaphore mutex; 3498 struct rw_semaphore rwsem;
3472}; 3499};
3473 3500
3474/* 3501/*
@@ -3541,7 +3568,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3541 struct pid_namespace *ns = task_active_pid_ns(current); 3568 struct pid_namespace *ns = task_active_pid_ns(current);
3542 3569
3543 /* 3570 /*
3544 * We can't drop the pidlist_mutex before taking the l->mutex in case 3571 * We can't drop the pidlist_mutex before taking the l->rwsem in case
3545 * the last ref-holder is trying to remove l from the list at the same 3572 * the last ref-holder is trying to remove l from the list at the same
3546 * time. Holding the pidlist_mutex precludes somebody taking whichever 3573 * time. Holding the pidlist_mutex precludes somebody taking whichever
3547 * list we find out from under us - compare release_pid_array(). 3574 * list we find out from under us - compare release_pid_array().
@@ -3550,7 +3577,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3550 list_for_each_entry(l, &cgrp->pidlists, links) { 3577 list_for_each_entry(l, &cgrp->pidlists, links) {
3551 if (l->key.type == type && l->key.ns == ns) { 3578 if (l->key.type == type && l->key.ns == ns) {
3552 /* make sure l doesn't vanish out from under us */ 3579 /* make sure l doesn't vanish out from under us */
3553 down_write(&l->mutex); 3580 down_write(&l->rwsem);
3554 mutex_unlock(&cgrp->pidlist_mutex); 3581 mutex_unlock(&cgrp->pidlist_mutex);
3555 return l; 3582 return l;
3556 } 3583 }
@@ -3561,8 +3588,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3561 mutex_unlock(&cgrp->pidlist_mutex); 3588 mutex_unlock(&cgrp->pidlist_mutex);
3562 return l; 3589 return l;
3563 } 3590 }
3564 init_rwsem(&l->mutex); 3591 init_rwsem(&l->rwsem);
3565 down_write(&l->mutex); 3592 down_write(&l->rwsem);
3566 l->key.type = type; 3593 l->key.type = type;
3567 l->key.ns = get_pid_ns(ns); 3594 l->key.ns = get_pid_ns(ns);
3568 l->owner = cgrp; 3595 l->owner = cgrp;
@@ -3580,7 +3607,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3580 pid_t *array; 3607 pid_t *array;
3581 int length; 3608 int length;
3582 int pid, n = 0; /* used for populating the array */ 3609 int pid, n = 0; /* used for populating the array */
3583 struct cgroup_iter it; 3610 struct css_task_iter it;
3584 struct task_struct *tsk; 3611 struct task_struct *tsk;
3585 struct cgroup_pidlist *l; 3612 struct cgroup_pidlist *l;
3586 3613
@@ -3595,8 +3622,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3595 if (!array) 3622 if (!array)
3596 return -ENOMEM; 3623 return -ENOMEM;
3597 /* now, populate the array */ 3624 /* now, populate the array */
3598 cgroup_iter_start(cgrp, &it); 3625 css_task_iter_start(&cgrp->dummy_css, &it);
3599 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3626 while ((tsk = css_task_iter_next(&it))) {
3600 if (unlikely(n == length)) 3627 if (unlikely(n == length))
3601 break; 3628 break;
3602 /* get tgid or pid for procs or tasks file respectively */ 3629 /* get tgid or pid for procs or tasks file respectively */
@@ -3607,7 +3634,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3607 if (pid > 0) /* make sure to only use valid results */ 3634 if (pid > 0) /* make sure to only use valid results */
3608 array[n++] = pid; 3635 array[n++] = pid;
3609 } 3636 }
3610 cgroup_iter_end(cgrp, &it); 3637 css_task_iter_end(&it);
3611 length = n; 3638 length = n;
3612 /* now sort & (if procs) strip out duplicates */ 3639 /* now sort & (if procs) strip out duplicates */
3613 sort(array, length, sizeof(pid_t), cmppid, NULL); 3640 sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -3623,7 +3650,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3623 l->list = array; 3650 l->list = array;
3624 l->length = length; 3651 l->length = length;
3625 l->use_count++; 3652 l->use_count++;
3626 up_write(&l->mutex); 3653 up_write(&l->rwsem);
3627 *lp = l; 3654 *lp = l;
3628 return 0; 3655 return 0;
3629} 3656}
@@ -3641,7 +3668,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3641{ 3668{
3642 int ret = -EINVAL; 3669 int ret = -EINVAL;
3643 struct cgroup *cgrp; 3670 struct cgroup *cgrp;
3644 struct cgroup_iter it; 3671 struct css_task_iter it;
3645 struct task_struct *tsk; 3672 struct task_struct *tsk;
3646 3673
3647 /* 3674 /*
@@ -3655,8 +3682,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3655 ret = 0; 3682 ret = 0;
3656 cgrp = dentry->d_fsdata; 3683 cgrp = dentry->d_fsdata;
3657 3684
3658 cgroup_iter_start(cgrp, &it); 3685 css_task_iter_start(&cgrp->dummy_css, &it);
3659 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3686 while ((tsk = css_task_iter_next(&it))) {
3660 switch (tsk->state) { 3687 switch (tsk->state) {
3661 case TASK_RUNNING: 3688 case TASK_RUNNING:
3662 stats->nr_running++; 3689 stats->nr_running++;
@@ -3676,7 +3703,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3676 break; 3703 break;
3677 } 3704 }
3678 } 3705 }
3679 cgroup_iter_end(cgrp, &it); 3706 css_task_iter_end(&it);
3680 3707
3681err: 3708err:
3682 return ret; 3709 return ret;
@@ -3701,7 +3728,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3701 int index = 0, pid = *pos; 3728 int index = 0, pid = *pos;
3702 int *iter; 3729 int *iter;
3703 3730
3704 down_read(&l->mutex); 3731 down_read(&l->rwsem);
3705 if (pid) { 3732 if (pid) {
3706 int end = l->length; 3733 int end = l->length;
3707 3734
@@ -3728,7 +3755,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3728static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3755static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3729{ 3756{
3730 struct cgroup_pidlist *l = s->private; 3757 struct cgroup_pidlist *l = s->private;
3731 up_read(&l->mutex); 3758 up_read(&l->rwsem);
3732} 3759}
3733 3760
3734static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3761static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
@@ -3774,7 +3801,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3774 * pidlist_mutex, we have to take pidlist_mutex first. 3801 * pidlist_mutex, we have to take pidlist_mutex first.
3775 */ 3802 */
3776 mutex_lock(&l->owner->pidlist_mutex); 3803 mutex_lock(&l->owner->pidlist_mutex);
3777 down_write(&l->mutex); 3804 down_write(&l->rwsem);
3778 BUG_ON(!l->use_count); 3805 BUG_ON(!l->use_count);
3779 if (!--l->use_count) { 3806 if (!--l->use_count) {
3780 /* we're the last user if refcount is 0; remove and free */ 3807 /* we're the last user if refcount is 0; remove and free */
@@ -3782,12 +3809,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l)
3782 mutex_unlock(&l->owner->pidlist_mutex); 3809 mutex_unlock(&l->owner->pidlist_mutex);
3783 pidlist_free(l->list); 3810 pidlist_free(l->list);
3784 put_pid_ns(l->key.ns); 3811 put_pid_ns(l->key.ns);
3785 up_write(&l->mutex); 3812 up_write(&l->rwsem);
3786 kfree(l); 3813 kfree(l);
3787 return; 3814 return;
3788 } 3815 }
3789 mutex_unlock(&l->owner->pidlist_mutex); 3816 mutex_unlock(&l->owner->pidlist_mutex);
3790 up_write(&l->mutex); 3817 up_write(&l->rwsem);
3791} 3818}
3792 3819
3793static int cgroup_pidlist_release(struct inode *inode, struct file *file) 3820static int cgroup_pidlist_release(struct inode *inode, struct file *file)
@@ -3851,21 +3878,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file)
3851 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); 3878 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
3852} 3879}
3853 3880
3854static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 3881static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3855 struct cftype *cft) 3882 struct cftype *cft)
3856{ 3883{
3857 return notify_on_release(cgrp); 3884 return notify_on_release(css->cgroup);
3858} 3885}
3859 3886
3860static int cgroup_write_notify_on_release(struct cgroup *cgrp, 3887static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3861 struct cftype *cft, 3888 struct cftype *cft, u64 val)
3862 u64 val)
3863{ 3889{
3864 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 3890 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3865 if (val) 3891 if (val)
3866 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3892 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3867 else 3893 else
3868 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3894 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3869 return 0; 3895 return 0;
3870} 3896}
3871 3897
@@ -3895,18 +3921,18 @@ static void cgroup_event_remove(struct work_struct *work)
3895{ 3921{
3896 struct cgroup_event *event = container_of(work, struct cgroup_event, 3922 struct cgroup_event *event = container_of(work, struct cgroup_event,
3897 remove); 3923 remove);
3898 struct cgroup *cgrp = event->cgrp; 3924 struct cgroup_subsys_state *css = event->css;
3899 3925
3900 remove_wait_queue(event->wqh, &event->wait); 3926 remove_wait_queue(event->wqh, &event->wait);
3901 3927
3902 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3928 event->cft->unregister_event(css, event->cft, event->eventfd);
3903 3929
3904 /* Notify userspace the event is going away. */ 3930 /* Notify userspace the event is going away. */
3905 eventfd_signal(event->eventfd, 1); 3931 eventfd_signal(event->eventfd, 1);
3906 3932
3907 eventfd_ctx_put(event->eventfd); 3933 eventfd_ctx_put(event->eventfd);
3908 kfree(event); 3934 kfree(event);
3909 cgroup_dput(cgrp); 3935 css_put(css);
3910} 3936}
3911 3937
3912/* 3938/*
@@ -3919,7 +3945,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3919{ 3945{
3920 struct cgroup_event *event = container_of(wait, 3946 struct cgroup_event *event = container_of(wait,
3921 struct cgroup_event, wait); 3947 struct cgroup_event, wait);
3922 struct cgroup *cgrp = event->cgrp; 3948 struct cgroup *cgrp = event->css->cgroup;
3923 unsigned long flags = (unsigned long)key; 3949 unsigned long flags = (unsigned long)key;
3924 3950
3925 if (flags & POLLHUP) { 3951 if (flags & POLLHUP) {
@@ -3963,14 +3989,15 @@ static void cgroup_event_ptable_queue_proc(struct file *file,
3963 * Input must be in format '<event_fd> <control_fd> <args>'. 3989 * Input must be in format '<event_fd> <control_fd> <args>'.
3964 * Interpretation of args is defined by control file implementation. 3990 * Interpretation of args is defined by control file implementation.
3965 */ 3991 */
3966static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, 3992static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
3967 const char *buffer) 3993 struct cftype *cft, const char *buffer)
3968{ 3994{
3969 struct cgroup_event *event = NULL; 3995 struct cgroup *cgrp = dummy_css->cgroup;
3970 struct cgroup *cgrp_cfile; 3996 struct cgroup_event *event;
3997 struct cgroup_subsys_state *cfile_css;
3971 unsigned int efd, cfd; 3998 unsigned int efd, cfd;
3972 struct file *efile = NULL; 3999 struct fd efile;
3973 struct file *cfile = NULL; 4000 struct fd cfile;
3974 char *endp; 4001 char *endp;
3975 int ret; 4002 int ret;
3976 4003
@@ -3987,109 +4014,113 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3987 event = kzalloc(sizeof(*event), GFP_KERNEL); 4014 event = kzalloc(sizeof(*event), GFP_KERNEL);
3988 if (!event) 4015 if (!event)
3989 return -ENOMEM; 4016 return -ENOMEM;
3990 event->cgrp = cgrp; 4017
3991 INIT_LIST_HEAD(&event->list); 4018 INIT_LIST_HEAD(&event->list);
3992 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 4019 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
3993 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 4020 init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
3994 INIT_WORK(&event->remove, cgroup_event_remove); 4021 INIT_WORK(&event->remove, cgroup_event_remove);
3995 4022
3996 efile = eventfd_fget(efd); 4023 efile = fdget(efd);
3997 if (IS_ERR(efile)) { 4024 if (!efile.file) {
3998 ret = PTR_ERR(efile); 4025 ret = -EBADF;
3999 goto fail; 4026 goto out_kfree;
4000 } 4027 }
4001 4028
4002 event->eventfd = eventfd_ctx_fileget(efile); 4029 event->eventfd = eventfd_ctx_fileget(efile.file);
4003 if (IS_ERR(event->eventfd)) { 4030 if (IS_ERR(event->eventfd)) {
4004 ret = PTR_ERR(event->eventfd); 4031 ret = PTR_ERR(event->eventfd);
4005 goto fail; 4032 goto out_put_efile;
4006 } 4033 }
4007 4034
4008 cfile = fget(cfd); 4035 cfile = fdget(cfd);
4009 if (!cfile) { 4036 if (!cfile.file) {
4010 ret = -EBADF; 4037 ret = -EBADF;
4011 goto fail; 4038 goto out_put_eventfd;
4012 } 4039 }
4013 4040
4014 /* the process needs read permission on the control file */ 4041 /* the process needs read permission on the control file */
4015 /* AV: shouldn't we check that it's been opened for read instead? */ 4042 /* AV: shouldn't we check that it's been opened for read instead? */
4016 ret = inode_permission(file_inode(cfile), MAY_READ); 4043 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4017 if (ret < 0) 4044 if (ret < 0)
4018 goto fail; 4045 goto out_put_cfile;
4019 4046
4020 event->cft = __file_cft(cfile); 4047 event->cft = __file_cft(cfile.file);
4021 if (IS_ERR(event->cft)) { 4048 if (IS_ERR(event->cft)) {
4022 ret = PTR_ERR(event->cft); 4049 ret = PTR_ERR(event->cft);
4023 goto fail; 4050 goto out_put_cfile;
4051 }
4052
4053 if (!event->cft->ss) {
4054 ret = -EBADF;
4055 goto out_put_cfile;
4024 } 4056 }
4025 4057
4026 /* 4058 /*
4027 * The file to be monitored must be in the same cgroup as 4059 * Determine the css of @cfile, verify it belongs to the same
4028 * cgroup.event_control is. 4060 * cgroup as cgroup.event_control, and associate @event with it.
4061 * Remaining events are automatically removed on cgroup destruction
4062 * but the removal is asynchronous, so take an extra ref.
4029 */ 4063 */
4030 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); 4064 rcu_read_lock();
4031 if (cgrp_cfile != cgrp) { 4065
4032 ret = -EINVAL; 4066 ret = -EINVAL;
4033 goto fail; 4067 event->css = cgroup_css(cgrp, event->cft->ss);
4034 } 4068 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
4069 if (event->css && event->css == cfile_css && css_tryget(event->css))
4070 ret = 0;
4071
4072 rcu_read_unlock();
4073 if (ret)
4074 goto out_put_cfile;
4035 4075
4036 if (!event->cft->register_event || !event->cft->unregister_event) { 4076 if (!event->cft->register_event || !event->cft->unregister_event) {
4037 ret = -EINVAL; 4077 ret = -EINVAL;
4038 goto fail; 4078 goto out_put_css;
4039 } 4079 }
4040 4080
4041 ret = event->cft->register_event(cgrp, event->cft, 4081 ret = event->cft->register_event(event->css, event->cft,
4042 event->eventfd, buffer); 4082 event->eventfd, buffer);
4043 if (ret) 4083 if (ret)
4044 goto fail; 4084 goto out_put_css;
4045
4046 efile->f_op->poll(efile, &event->pt);
4047 4085
4048 /* 4086 efile.file->f_op->poll(efile.file, &event->pt);
4049 * Events should be removed after rmdir of cgroup directory, but before
4050 * destroying subsystem state objects. Let's take reference to cgroup
4051 * directory dentry to do that.
4052 */
4053 dget(cgrp->dentry);
4054 4087
4055 spin_lock(&cgrp->event_list_lock); 4088 spin_lock(&cgrp->event_list_lock);
4056 list_add(&event->list, &cgrp->event_list); 4089 list_add(&event->list, &cgrp->event_list);
4057 spin_unlock(&cgrp->event_list_lock); 4090 spin_unlock(&cgrp->event_list_lock);
4058 4091
4059 fput(cfile); 4092 fdput(cfile);
4060 fput(efile); 4093 fdput(efile);
4061 4094
4062 return 0; 4095 return 0;
4063 4096
4064fail: 4097out_put_css:
4065 if (cfile) 4098 css_put(event->css);
4066 fput(cfile); 4099out_put_cfile:
4067 4100 fdput(cfile);
4068 if (event && event->eventfd && !IS_ERR(event->eventfd)) 4101out_put_eventfd:
4069 eventfd_ctx_put(event->eventfd); 4102 eventfd_ctx_put(event->eventfd);
4070 4103out_put_efile:
4071 if (!IS_ERR_OR_NULL(efile)) 4104 fdput(efile);
4072 fput(efile); 4105out_kfree:
4073
4074 kfree(event); 4106 kfree(event);
4075 4107
4076 return ret; 4108 return ret;
4077} 4109}
4078 4110
4079static u64 cgroup_clone_children_read(struct cgroup *cgrp, 4111static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
4080 struct cftype *cft) 4112 struct cftype *cft)
4081{ 4113{
4082 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4114 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4083} 4115}
4084 4116
4085static int cgroup_clone_children_write(struct cgroup *cgrp, 4117static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4086 struct cftype *cft, 4118 struct cftype *cft, u64 val)
4087 u64 val)
4088{ 4119{
4089 if (val) 4120 if (val)
4090 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4121 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4091 else 4122 else
4092 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4123 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
4093 return 0; 4124 return 0;
4094} 4125}
4095 4126
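The event-control code above is the lookup-and-pin idiom this patch standardizes on: a css found under rcu_read_lock() may already be on its way out, so it has to be nailed down with css_tryget() before the read-side critical section ends. A minimal sketch of that idiom as a stand-alone helper (pin_css() is an invented name; the sketch assumes it lives in kernel/cgroup.c where the file-local cgroup_css() accessor used throughout this patch is visible):

static struct cgroup_subsys_state *pin_css(struct cgroup *cgrp,
					   struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = cgroup_css(cgrp, ss);	/* RCU-protected lookup */
	if (css && !css_tryget(css))	/* fails once the css is being killed */
		css = NULL;
	rcu_read_unlock();

	return css;	/* caller drops the reference with css_put() */
}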
@@ -4148,56 +4179,82 @@ static struct cftype cgroup_base_files[] = {
4148}; 4179};
4149 4180
4150/** 4181/**
4151 * cgroup_populate_dir - selectively creation of files in a directory 4182 * cgroup_populate_dir - create subsys files in a cgroup directory
4152 * @cgrp: target cgroup 4183 * @cgrp: target cgroup
4153 * @base_files: true if the base files should be added
4154 * @subsys_mask: mask of the subsystem ids whose files should be added 4184 * @subsys_mask: mask of the subsystem ids whose files should be added
4185 *
4186 * On failure, no file is added.
4155 */ 4187 */
4156static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 4188static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4157 unsigned long subsys_mask)
4158{ 4189{
4159 int err;
4160 struct cgroup_subsys *ss; 4190 struct cgroup_subsys *ss;
4161 4191 int i, ret = 0;
4162 if (base_files) {
4163 err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
4164 if (err < 0)
4165 return err;
4166 }
4167 4192
4168 /* process cftsets of each subsystem */ 4193 /* process cftsets of each subsystem */
4169 for_each_root_subsys(cgrp->root, ss) { 4194 for_each_subsys(ss, i) {
4170 struct cftype_set *set; 4195 struct cftype_set *set;
4171 if (!test_bit(ss->subsys_id, &subsys_mask)) 4196
4197 if (!test_bit(i, &subsys_mask))
4172 continue; 4198 continue;
4173 4199
4174 list_for_each_entry(set, &ss->cftsets, node) 4200 list_for_each_entry(set, &ss->cftsets, node) {
4175 cgroup_addrm_files(cgrp, ss, set->cfts, true); 4201 ret = cgroup_addrm_files(cgrp, set->cfts, true);
4202 if (ret < 0)
4203 goto err;
4204 }
4176 } 4205 }
4206 return 0;
4207err:
4208 cgroup_clear_dir(cgrp, subsys_mask);
4209 return ret;
4210}
4177 4211
4178 /* This cgroup is ready now */ 4212/*
4179 for_each_root_subsys(cgrp->root, ss) { 4213 * css destruction is a four-stage process.
4180 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4214 *
4181 struct css_id *id = rcu_dereference_protected(css->id, true); 4215 * 1. Destruction starts. Killing of the percpu_ref is initiated.
4216 * Implemented in kill_css().
4217 *
4218 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
4219 * and thus css_tryget() is guaranteed to fail, the css can be offlined
4220 * by invoking offline_css(). After offlining, the base ref is put.
4221 * Implemented in css_killed_work_fn().
4222 *
4223 * 3. When the percpu_ref reaches zero, the only possible remaining
4224 * accessors are inside RCU read sections. css_release() schedules the
4225 * RCU callback.
4226 *
4227 * 4. After the grace period, the css can be freed. Implemented in
4228 * css_free_work_fn().
4229 *
 4230 * It is actually hairier because both steps 2 and 4 require process context
 4231 * and thus involve punting to css->destroy_work, adding two additional
 4232 * steps to the already complex sequence.
4233 */
4234static void css_free_work_fn(struct work_struct *work)
4235{
4236 struct cgroup_subsys_state *css =
4237 container_of(work, struct cgroup_subsys_state, destroy_work);
4238 struct cgroup *cgrp = css->cgroup;
4182 4239
4183 /* 4240 if (css->parent)
4184 * Update id->css pointer and make this css visible from 4241 css_put(css->parent);
4185 * CSS ID functions. This pointer will be dereferened
4186 * from RCU-read-side without locks.
4187 */
4188 if (id)
4189 rcu_assign_pointer(id->css, css);
4190 }
4191 4242
4192 return 0; 4243 css->ss->css_free(css);
4244 cgroup_dput(cgrp);
4193} 4245}
4194 4246
4195static void css_dput_fn(struct work_struct *work) 4247static void css_free_rcu_fn(struct rcu_head *rcu_head)
4196{ 4248{
4197 struct cgroup_subsys_state *css = 4249 struct cgroup_subsys_state *css =
4198 container_of(work, struct cgroup_subsys_state, dput_work); 4250 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
4199 4251
4200 cgroup_dput(css->cgroup); 4252 /*
4253 * css holds an extra ref to @cgrp->dentry which is put on the last
4254 * css_put(). dput() requires process context which we don't have.
4255 */
4256 INIT_WORK(&css->destroy_work, css_free_work_fn);
4257 schedule_work(&css->destroy_work);
4201} 4258}
4202 4259
4203static void css_release(struct percpu_ref *ref) 4260static void css_release(struct percpu_ref *ref)
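css_release(), css_free_rcu_fn() and css_free_work_fn() above form a reusable shape: the percpu_ref release callback runs in atomic context and can only queue an RCU callback, the RCU callback still cannot sleep, and only the work item it schedules runs in process context where sleeping operations such as dput() are allowed. A stripped-down sketch of the same two-step deferral on a hypothetical refcounted object (struct obj and the obj_* functions are invented, not part of the patch):

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct obj {
	struct percpu_ref refcnt;
	struct rcu_head rcu_head;
	struct work_struct destroy_work;
};

static void obj_free_work_fn(struct work_struct *work)
{
	struct obj *o = container_of(work, struct obj, destroy_work);

	/* process context at last: sleeping operations such as dput() are fine */
	kfree(o);
}

static void obj_free_rcu_fn(struct rcu_head *head)
{
	struct obj *o = container_of(head, struct obj, rcu_head);

	/* RCU callback context cannot sleep either, so punt to a work item */
	INIT_WORK(&o->destroy_work, obj_free_work_fn);
	schedule_work(&o->destroy_work);
}

static void obj_release(struct percpu_ref *ref)
{
	struct obj *o = container_of(ref, struct obj, refcnt);

	/* last reference gone; wait out RCU readers before touching @o */
	call_rcu(&o->rcu_head, obj_free_rcu_fn);
}

static int obj_init(struct obj *o)
{
	/* obj_release() runs once the final reference is put */
	return percpu_ref_init(&o->refcnt, obj_release);
}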
@@ -4205,49 +4262,46 @@ static void css_release(struct percpu_ref *ref)
4205 struct cgroup_subsys_state *css = 4262 struct cgroup_subsys_state *css =
4206 container_of(ref, struct cgroup_subsys_state, refcnt); 4263 container_of(ref, struct cgroup_subsys_state, refcnt);
4207 4264
4208 schedule_work(&css->dput_work); 4265 call_rcu(&css->rcu_head, css_free_rcu_fn);
4209} 4266}
4210 4267
4211static void init_cgroup_css(struct cgroup_subsys_state *css, 4268static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
4212 struct cgroup_subsys *ss, 4269 struct cgroup *cgrp)
4213 struct cgroup *cgrp)
4214{ 4270{
4215 css->cgroup = cgrp; 4271 css->cgroup = cgrp;
4272 css->ss = ss;
4216 css->flags = 0; 4273 css->flags = 0;
4217 css->id = NULL; 4274
4218 if (cgrp == cgroup_dummy_top) 4275 if (cgrp->parent)
4276 css->parent = cgroup_css(cgrp->parent, ss);
4277 else
4219 css->flags |= CSS_ROOT; 4278 css->flags |= CSS_ROOT;
4220 BUG_ON(cgrp->subsys[ss->subsys_id]);
4221 cgrp->subsys[ss->subsys_id] = css;
4222 4279
4223 /* 4280 BUG_ON(cgroup_css(cgrp, ss));
4224 * css holds an extra ref to @cgrp->dentry which is put on the last
4225 * css_put(). dput() requires process context, which css_put() may
4226 * be called without. @css->dput_work will be used to invoke
4227 * dput() asynchronously from css_put().
4228 */
4229 INIT_WORK(&css->dput_work, css_dput_fn);
4230} 4281}
4231 4282
4232/* invoke ->post_create() on a new CSS and mark it online if successful */ 4283/* invoke ->css_online() on a new CSS and mark it online if successful */
4233static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4284static int online_css(struct cgroup_subsys_state *css)
4234{ 4285{
4286 struct cgroup_subsys *ss = css->ss;
4235 int ret = 0; 4287 int ret = 0;
4236 4288
4237 lockdep_assert_held(&cgroup_mutex); 4289 lockdep_assert_held(&cgroup_mutex);
4238 4290
4239 if (ss->css_online) 4291 if (ss->css_online)
4240 ret = ss->css_online(cgrp); 4292 ret = ss->css_online(css);
4241 if (!ret) 4293 if (!ret) {
4242 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; 4294 css->flags |= CSS_ONLINE;
4295 css->cgroup->nr_css++;
4296 rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
4297 }
4243 return ret; 4298 return ret;
4244} 4299}
4245 4300
4246/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ 4301/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4247static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4302static void offline_css(struct cgroup_subsys_state *css)
4248 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4249{ 4303{
4250 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4304 struct cgroup_subsys *ss = css->ss;
4251 4305
4252 lockdep_assert_held(&cgroup_mutex); 4306 lockdep_assert_held(&cgroup_mutex);
4253 4307
@@ -4255,9 +4309,11 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4255 return; 4309 return;
4256 4310
4257 if (ss->css_offline) 4311 if (ss->css_offline)
4258 ss->css_offline(cgrp); 4312 ss->css_offline(css);
4259 4313
4260 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4314 css->flags &= ~CSS_ONLINE;
4315 css->cgroup->nr_css--;
4316 RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css);
4261} 4317}
4262 4318
4263/* 4319/*
@@ -4271,6 +4327,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4271static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4327static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4272 umode_t mode) 4328 umode_t mode)
4273{ 4329{
4330 struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
4274 struct cgroup *cgrp; 4331 struct cgroup *cgrp;
4275 struct cgroup_name *name; 4332 struct cgroup_name *name;
4276 struct cgroupfs_root *root = parent->root; 4333 struct cgroupfs_root *root = parent->root;
@@ -4288,7 +4345,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4288 goto err_free_cgrp; 4345 goto err_free_cgrp;
4289 rcu_assign_pointer(cgrp->name, name); 4346 rcu_assign_pointer(cgrp->name, name);
4290 4347
4291 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4348 /*
4349 * Temporarily set the pointer to NULL, so idr_find() won't return
4350 * a half-baked cgroup.
4351 */
4352 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
4292 if (cgrp->id < 0) 4353 if (cgrp->id < 0)
4293 goto err_free_name; 4354 goto err_free_name;
4294 4355
@@ -4317,6 +4378,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4317 cgrp->dentry = dentry; 4378 cgrp->dentry = dentry;
4318 4379
4319 cgrp->parent = parent; 4380 cgrp->parent = parent;
4381 cgrp->dummy_css.parent = &parent->dummy_css;
4320 cgrp->root = parent->root; 4382 cgrp->root = parent->root;
4321 4383
4322 if (notify_on_release(parent)) 4384 if (notify_on_release(parent))
@@ -4328,25 +4390,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4328 for_each_root_subsys(root, ss) { 4390 for_each_root_subsys(root, ss) {
4329 struct cgroup_subsys_state *css; 4391 struct cgroup_subsys_state *css;
4330 4392
4331 css = ss->css_alloc(cgrp); 4393 css = ss->css_alloc(cgroup_css(parent, ss));
4332 if (IS_ERR(css)) { 4394 if (IS_ERR(css)) {
4333 err = PTR_ERR(css); 4395 err = PTR_ERR(css);
4334 goto err_free_all; 4396 goto err_free_all;
4335 } 4397 }
4398 css_ar[ss->subsys_id] = css;
4336 4399
4337 err = percpu_ref_init(&css->refcnt, css_release); 4400 err = percpu_ref_init(&css->refcnt, css_release);
4338 if (err) { 4401 if (err)
4339 ss->css_free(cgrp);
4340 goto err_free_all; 4402 goto err_free_all;
4341 }
4342
4343 init_cgroup_css(css, ss, cgrp);
4344 4403
4345 if (ss->use_id) { 4404 init_css(css, ss, cgrp);
4346 err = alloc_css_id(ss, parent, cgrp);
4347 if (err)
4348 goto err_free_all;
4349 }
4350 } 4405 }
4351 4406
4352 /* 4407 /*
@@ -4365,16 +4420,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4365 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4420 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4366 root->number_of_cgroups++; 4421 root->number_of_cgroups++;
4367 4422
4368 /* each css holds a ref to the cgroup's dentry */ 4423 /* each css holds a ref to the cgroup's dentry and the parent css */
4369 for_each_root_subsys(root, ss) 4424 for_each_root_subsys(root, ss) {
4425 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4426
4370 dget(dentry); 4427 dget(dentry);
4428 css_get(css->parent);
4429 }
4371 4430
4372 /* hold a ref to the parent's dentry */ 4431 /* hold a ref to the parent's dentry */
4373 dget(parent->dentry); 4432 dget(parent->dentry);
4374 4433
4375 /* creation succeeded, notify subsystems */ 4434 /* creation succeeded, notify subsystems */
4376 for_each_root_subsys(root, ss) { 4435 for_each_root_subsys(root, ss) {
4377 err = online_css(ss, cgrp); 4436 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4437
4438 err = online_css(css);
4378 if (err) 4439 if (err)
4379 goto err_destroy; 4440 goto err_destroy;
4380 4441
@@ -4388,7 +4449,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4388 } 4449 }
4389 } 4450 }
4390 4451
4391 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4452 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
4453
4454 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
4455 if (err)
4456 goto err_destroy;
4457
4458 err = cgroup_populate_dir(cgrp, root->subsys_mask);
4392 if (err) 4459 if (err)
4393 goto err_destroy; 4460 goto err_destroy;
4394 4461
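cgroup_create() above publishes the new cgroup in two idr steps: idr_alloc() reserves the id with a NULL pointer so a concurrent idr_find() can never hand out a half-built cgroup, and idr_replace() installs the real pointer only after setup succeeds; teardown removes the id before the object goes away. The same reserve-then-publish pattern on a hypothetical registry (struct entry, entry_idr and the entry_* helpers are invented):

#include <linux/gfp.h>
#include <linux/idr.h>

struct entry {
	int id;
	/* ... controller-specific fields ... */
};

static DEFINE_IDR(entry_idr);

static int entry_register(struct entry *e)
{
	int id;

	/* reserve an id but publish nothing; idr_find() still returns NULL */
	id = idr_alloc(&entry_idr, NULL, 1, 0, GFP_KERNEL);
	if (id < 0)
		return id;
	e->id = id;

	/* ... finish initializing @e ... */

	/* only now make @e visible to lookups */
	idr_replace(&entry_idr, e, e->id);
	return 0;
}

static void entry_unregister(struct entry *e)
{
	/* stop lookups before @e is torn down */
	idr_remove(&entry_idr, e->id);
	e->id = -1;
}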
@@ -4399,18 +4466,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4399 4466
4400err_free_all: 4467err_free_all:
4401 for_each_root_subsys(root, ss) { 4468 for_each_root_subsys(root, ss) {
4402 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4469 struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
4403 4470
4404 if (css) { 4471 if (css) {
4405 percpu_ref_cancel_init(&css->refcnt); 4472 percpu_ref_cancel_init(&css->refcnt);
4406 ss->css_free(cgrp); 4473 ss->css_free(css);
4407 } 4474 }
4408 } 4475 }
4409 mutex_unlock(&cgroup_mutex); 4476 mutex_unlock(&cgroup_mutex);
4410 /* Release the reference count that we took on the superblock */ 4477 /* Release the reference count that we took on the superblock */
4411 deactivate_super(sb); 4478 deactivate_super(sb);
4412err_free_id: 4479err_free_id:
4413 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4480 idr_remove(&root->cgroup_idr, cgrp->id);
4414err_free_name: 4481err_free_name:
4415 kfree(rcu_dereference_raw(cgrp->name)); 4482 kfree(rcu_dereference_raw(cgrp->name));
4416err_free_cgrp: 4483err_free_cgrp:
@@ -4432,22 +4499,84 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4432 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4499 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4433} 4500}
4434 4501
4435static void cgroup_css_killed(struct cgroup *cgrp) 4502/*
4503 * This is called when the refcnt of a css is confirmed to be killed.
4504 * css_tryget() is now guaranteed to fail.
4505 */
4506static void css_killed_work_fn(struct work_struct *work)
4436{ 4507{
4437 if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) 4508 struct cgroup_subsys_state *css =
4438 return; 4509 container_of(work, struct cgroup_subsys_state, destroy_work);
4510 struct cgroup *cgrp = css->cgroup;
4439 4511
4440 /* percpu ref's of all css's are killed, kick off the next step */ 4512 mutex_lock(&cgroup_mutex);
4441 INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); 4513
4442 schedule_work(&cgrp->destroy_work); 4514 /*
4515 * css_tryget() is guaranteed to fail now. Tell subsystems to
 4516 * initiate destruction.
4517 */
4518 offline_css(css);
4519
4520 /*
4521 * If @cgrp is marked dead, it's waiting for refs of all css's to
4522 * be disabled before proceeding to the second phase of cgroup
4523 * destruction. If we are the last one, kick it off.
4524 */
4525 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
4526 cgroup_destroy_css_killed(cgrp);
4527
4528 mutex_unlock(&cgroup_mutex);
4529
4530 /*
4531 * Put the css refs from kill_css(). Each css holds an extra
4532 * reference to the cgroup's dentry and cgroup removal proceeds
4533 * regardless of css refs. On the last put of each css, whenever
4534 * that may be, the extra dentry ref is put so that dentry
4535 * destruction happens only after all css's are released.
4536 */
4537 css_put(css);
4443} 4538}
4444 4539
4445static void css_ref_killed_fn(struct percpu_ref *ref) 4540/* css kill confirmation processing requires process context, bounce */
4541static void css_killed_ref_fn(struct percpu_ref *ref)
4446{ 4542{
4447 struct cgroup_subsys_state *css = 4543 struct cgroup_subsys_state *css =
4448 container_of(ref, struct cgroup_subsys_state, refcnt); 4544 container_of(ref, struct cgroup_subsys_state, refcnt);
4449 4545
4450 cgroup_css_killed(css->cgroup); 4546 INIT_WORK(&css->destroy_work, css_killed_work_fn);
4547 schedule_work(&css->destroy_work);
4548}
4549
4550/**
4551 * kill_css - destroy a css
4552 * @css: css to destroy
4553 *
4554 * This function initiates destruction of @css by removing cgroup interface
4555 * files and putting its base reference. ->css_offline() will be invoked
4556 * asynchronously once css_tryget() is guaranteed to fail and when the
4557 * reference count reaches zero, @css will be released.
4558 */
4559static void kill_css(struct cgroup_subsys_state *css)
4560{
4561 cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
4562
4563 /*
4564 * Killing would put the base ref, but we need to keep it alive
4565 * until after ->css_offline().
4566 */
4567 css_get(css);
4568
4569 /*
4570 * cgroup core guarantees that, by the time ->css_offline() is
4571 * invoked, no new css reference will be given out via
4572 * css_tryget(). We can't simply call percpu_ref_kill() and
4573 * proceed to offlining css's because percpu_ref_kill() doesn't
4574 * guarantee that the ref is seen as killed on all CPUs on return.
4575 *
4576 * Use percpu_ref_kill_and_confirm() to get notifications as each
4577 * css is confirmed to be seen as killed on all CPUs.
4578 */
4579 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4451} 4580}
4452 4581
4453/** 4582/**
@@ -4480,6 +4609,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4480 struct dentry *d = cgrp->dentry; 4609 struct dentry *d = cgrp->dentry;
4481 struct cgroup_event *event, *tmp; 4610 struct cgroup_event *event, *tmp;
4482 struct cgroup_subsys *ss; 4611 struct cgroup_subsys *ss;
4612 struct cgroup *child;
4483 bool empty; 4613 bool empty;
4484 4614
4485 lockdep_assert_held(&d->d_inode->i_mutex); 4615 lockdep_assert_held(&d->d_inode->i_mutex);
@@ -4490,47 +4620,41 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4490 * @cgrp from being removed while __put_css_set() is in progress. 4620 * @cgrp from being removed while __put_css_set() is in progress.
4491 */ 4621 */
4492 read_lock(&css_set_lock); 4622 read_lock(&css_set_lock);
4493 empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children); 4623 empty = list_empty(&cgrp->cset_links);
4494 read_unlock(&css_set_lock); 4624 read_unlock(&css_set_lock);
4495 if (!empty) 4625 if (!empty)
4496 return -EBUSY; 4626 return -EBUSY;
4497 4627
4498 /* 4628 /*
4499 * Block new css_tryget() by killing css refcnts. cgroup core 4629 * Make sure there's no live children. We can't test ->children
4500 * guarantees that, by the time ->css_offline() is invoked, no new 4630 * emptiness as dead children linger on it while being destroyed;
4501 * css reference will be given out via css_tryget(). We can't 4631 * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
4502 * simply call percpu_ref_kill() and proceed to offlining css's
4503 * because percpu_ref_kill() doesn't guarantee that the ref is seen
4504 * as killed on all CPUs on return.
4505 *
4506 * Use percpu_ref_kill_and_confirm() to get notifications as each
4507 * css is confirmed to be seen as killed on all CPUs. The
4508 * notification callback keeps track of the number of css's to be
4509 * killed and schedules cgroup_offline_fn() to perform the rest of
4510 * destruction once the percpu refs of all css's are confirmed to
4511 * be killed.
4512 */ 4632 */
4513 atomic_set(&cgrp->css_kill_cnt, 1); 4633 empty = true;
4514 for_each_root_subsys(cgrp->root, ss) { 4634 rcu_read_lock();
4515 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4635 list_for_each_entry_rcu(child, &cgrp->children, sibling) {
4516 4636 empty = cgroup_is_dead(child);
4517 /* 4637 if (!empty)
4518 * Killing would put the base ref, but we need to keep it 4638 break;
4519 * alive until after ->css_offline.
4520 */
4521 percpu_ref_get(&css->refcnt);
4522
4523 atomic_inc(&cgrp->css_kill_cnt);
4524 percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
4525 } 4639 }
4526 cgroup_css_killed(cgrp); 4640 rcu_read_unlock();
4641 if (!empty)
4642 return -EBUSY;
4643
4644 /*
4645 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4646 * will be invoked to perform the rest of destruction once the
4647 * percpu refs of all css's are confirmed to be killed.
4648 */
4649 for_each_root_subsys(cgrp->root, ss)
4650 kill_css(cgroup_css(cgrp, ss));
4527 4651
4528 /* 4652 /*
4529 * Mark @cgrp dead. This prevents further task migration and child 4653 * Mark @cgrp dead. This prevents further task migration and child
4530 * creation by disabling cgroup_lock_live_group(). Note that 4654 * creation by disabling cgroup_lock_live_group(). Note that
4531 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to 4655 * CGRP_DEAD assertion is depended upon by css_next_child() to
4532 * resume iteration after dropping RCU read lock. See 4656 * resume iteration after dropping RCU read lock. See
4533 * cgroup_next_sibling() for details. 4657 * css_next_child() for details.
4534 */ 4658 */
4535 set_bit(CGRP_DEAD, &cgrp->flags); 4659 set_bit(CGRP_DEAD, &cgrp->flags);
4536 4660
@@ -4541,9 +4665,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4541 raw_spin_unlock(&release_list_lock); 4665 raw_spin_unlock(&release_list_lock);
4542 4666
4543 /* 4667 /*
4544 * Remove @cgrp directory. The removal puts the base ref but we 4668 * If @cgrp has css's attached, the second stage of cgroup
4545 * aren't quite done with @cgrp yet, so hold onto it. 4669 * destruction is kicked off from css_killed_work_fn() after the
4670 * refs of all attached css's are killed. If @cgrp doesn't have
4671 * any css, we kick it off here.
4546 */ 4672 */
4673 if (!cgrp->nr_css)
4674 cgroup_destroy_css_killed(cgrp);
4675
4676 /*
4677 * Clear the base files and remove @cgrp directory. The removal
4678 * puts the base ref but we aren't quite done with @cgrp yet, so
4679 * hold onto it.
4680 */
4681 cgroup_addrm_files(cgrp, cgroup_base_files, false);
4547 dget(d); 4682 dget(d);
4548 cgroup_d_remove_dir(d); 4683 cgroup_d_remove_dir(d);
4549 4684
@@ -4563,50 +4698,36 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4563}; 4698};
4564 4699
4565/** 4700/**
4566 * cgroup_offline_fn - the second step of cgroup destruction 4701 * cgroup_destroy_css_killed - the second step of cgroup destruction
4567 * @work: cgroup->destroy_free_work 4702 * @cgrp: cgroup whose css's have been offlined
4568 * 4703 *
4569 * This function is invoked from a work item for a cgroup which is being 4704 * This function is invoked from a work item for a cgroup which is being
4570 * destroyed after the percpu refcnts of all css's are guaranteed to be 4705 * destroyed after all css's are offlined and performs the rest of
4571 * seen as killed on all CPUs, and performs the rest of destruction. This 4706 * destruction. This is the second step of destruction described in the
4572 * is the second step of destruction described in the comment above 4707 * comment above cgroup_destroy_locked().
4573 * cgroup_destroy_locked().
4574 */ 4708 */
4575static void cgroup_offline_fn(struct work_struct *work) 4709static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4576{ 4710{
4577 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
4578 struct cgroup *parent = cgrp->parent; 4711 struct cgroup *parent = cgrp->parent;
4579 struct dentry *d = cgrp->dentry; 4712 struct dentry *d = cgrp->dentry;
4580 struct cgroup_subsys *ss;
4581 4713
4582 mutex_lock(&cgroup_mutex); 4714 lockdep_assert_held(&cgroup_mutex);
4583 4715
4584 /* 4716 /* delete this cgroup from parent->children */
4585 * css_tryget() is guaranteed to fail now. Tell subsystems to 4717 list_del_rcu(&cgrp->sibling);
4586 * initate destruction.
4587 */
4588 for_each_root_subsys(cgrp->root, ss)
4589 offline_css(ss, cgrp);
4590 4718
4591 /* 4719 /*
4592 * Put the css refs from cgroup_destroy_locked(). Each css holds 4720 * We should remove the cgroup object from idr before its grace
4593 * an extra reference to the cgroup's dentry and cgroup removal 4721 * period starts, so we won't be looking up a cgroup while the
4594 * proceeds regardless of css refs. On the last put of each css, 4722 * cgroup is being freed.
4595 * whenever that may be, the extra dentry ref is put so that dentry
4596 * destruction happens only after all css's are released.
4597 */ 4723 */
4598 for_each_root_subsys(cgrp->root, ss) 4724 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4599 css_put(cgrp->subsys[ss->subsys_id]); 4725 cgrp->id = -1;
4600
4601 /* delete this cgroup from parent->children */
4602 list_del_rcu(&cgrp->sibling);
4603 4726
4604 dput(d); 4727 dput(d);
4605 4728
4606 set_bit(CGRP_RELEASABLE, &parent->flags); 4729 set_bit(CGRP_RELEASABLE, &parent->flags);
4607 check_for_release(parent); 4730 check_for_release(parent);
4608
4609 mutex_unlock(&cgroup_mutex);
4610} 4731}
4611 4732
4612static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4733static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4629,6 +4750,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4629 * deregistration. 4750 * deregistration.
4630 */ 4751 */
4631 if (ss->base_cftypes) { 4752 if (ss->base_cftypes) {
4753 struct cftype *cft;
4754
4755 for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
4756 cft->ss = ss;
4757
4632 ss->base_cftset.cfts = ss->base_cftypes; 4758 ss->base_cftset.cfts = ss->base_cftypes;
4633 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4759 list_add_tail(&ss->base_cftset.node, &ss->cftsets);
4634 } 4760 }
@@ -4648,10 +4774,10 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4648 /* Create the top cgroup state for this subsystem */ 4774 /* Create the top cgroup state for this subsystem */
4649 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); 4775 list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
4650 ss->root = &cgroup_dummy_root; 4776 ss->root = &cgroup_dummy_root;
4651 css = ss->css_alloc(cgroup_dummy_top); 4777 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4652 /* We don't handle early failures gracefully */ 4778 /* We don't handle early failures gracefully */
4653 BUG_ON(IS_ERR(css)); 4779 BUG_ON(IS_ERR(css));
4654 init_cgroup_css(css, ss, cgroup_dummy_top); 4780 init_css(css, ss, cgroup_dummy_top);
4655 4781
4656 /* Update the init_css_set to contain a subsys 4782 /* Update the init_css_set to contain a subsys
4657 * pointer to this state - since the subsystem is 4783 * pointer to this state - since the subsystem is
@@ -4666,7 +4792,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4666 * need to invoke fork callbacks here. */ 4792 * need to invoke fork callbacks here. */
4667 BUG_ON(!list_empty(&init_task.tasks)); 4793 BUG_ON(!list_empty(&init_task.tasks));
4668 4794
4669 BUG_ON(online_css(ss, cgroup_dummy_top)); 4795 BUG_ON(online_css(css));
4670 4796
4671 mutex_unlock(&cgroup_mutex); 4797 mutex_unlock(&cgroup_mutex);
4672 4798
@@ -4727,7 +4853,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4727 * struct, so this can happen first (i.e. before the dummy root 4853 * struct, so this can happen first (i.e. before the dummy root
4728 * attachment). 4854 * attachment).
4729 */ 4855 */
4730 css = ss->css_alloc(cgroup_dummy_top); 4856 css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss));
4731 if (IS_ERR(css)) { 4857 if (IS_ERR(css)) {
4732 /* failure case - need to deassign the cgroup_subsys[] slot. */ 4858 /* failure case - need to deassign the cgroup_subsys[] slot. */
4733 cgroup_subsys[ss->subsys_id] = NULL; 4859 cgroup_subsys[ss->subsys_id] = NULL;
@@ -4739,13 +4865,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4739 ss->root = &cgroup_dummy_root; 4865 ss->root = &cgroup_dummy_root;
4740 4866
4741 /* our new subsystem will be attached to the dummy hierarchy. */ 4867 /* our new subsystem will be attached to the dummy hierarchy. */
4742 init_cgroup_css(css, ss, cgroup_dummy_top); 4868 init_css(css, ss, cgroup_dummy_top);
4743 /* init_idr must be after init_cgroup_css because it sets css->id. */
4744 if (ss->use_id) {
4745 ret = cgroup_init_idr(ss, css);
4746 if (ret)
4747 goto err_unload;
4748 }
4749 4869
4750 /* 4870 /*
4751 * Now we need to entangle the css into the existing css_sets. unlike 4871 * Now we need to entangle the css into the existing css_sets. unlike
@@ -4770,7 +4890,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4770 } 4890 }
4771 write_unlock(&css_set_lock); 4891 write_unlock(&css_set_lock);
4772 4892
4773 ret = online_css(ss, cgroup_dummy_top); 4893 ret = online_css(css);
4774 if (ret) 4894 if (ret)
4775 goto err_unload; 4895 goto err_unload;
4776 4896
@@ -4802,17 +4922,14 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4802 4922
4803 /* 4923 /*
4804 * we shouldn't be called if the subsystem is in use, and the use of 4924 * we shouldn't be called if the subsystem is in use, and the use of
4805 * try_module_get in parse_cgroupfs_options should ensure that it 4925 * try_module_get() in rebind_subsystems() should ensure that it
4806 * doesn't start being used while we're killing it off. 4926 * doesn't start being used while we're killing it off.
4807 */ 4927 */
4808 BUG_ON(ss->root != &cgroup_dummy_root); 4928 BUG_ON(ss->root != &cgroup_dummy_root);
4809 4929
4810 mutex_lock(&cgroup_mutex); 4930 mutex_lock(&cgroup_mutex);
4811 4931
4812 offline_css(ss, cgroup_dummy_top); 4932 offline_css(cgroup_css(cgroup_dummy_top, ss));
4813
4814 if (ss->use_id)
4815 idr_destroy(&ss->idr);
4816 4933
4817 /* deassign the subsys_id */ 4934 /* deassign the subsys_id */
4818 cgroup_subsys[ss->subsys_id] = NULL; 4935 cgroup_subsys[ss->subsys_id] = NULL;
@@ -4840,11 +4957,10 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4840 /* 4957 /*
4841 * remove subsystem's css from the cgroup_dummy_top and free it - 4958 * remove subsystem's css from the cgroup_dummy_top and free it -
4842 * need to free before marking as null because ss->css_free needs 4959 * need to free before marking as null because ss->css_free needs
4843 * the cgrp->subsys pointer to find their state. note that this 4960 * the cgrp->subsys pointer to find their state.
4844 * also takes care of freeing the css_id.
4845 */ 4961 */
4846 ss->css_free(cgroup_dummy_top); 4962 ss->css_free(cgroup_css(cgroup_dummy_top, ss));
4847 cgroup_dummy_top->subsys[ss->subsys_id] = NULL; 4963 RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
4848 4964
4849 mutex_unlock(&cgroup_mutex); 4965 mutex_unlock(&cgroup_mutex);
4850} 4966}
@@ -4912,8 +5028,6 @@ int __init cgroup_init(void)
4912 for_each_builtin_subsys(ss, i) { 5028 for_each_builtin_subsys(ss, i) {
4913 if (!ss->early_init) 5029 if (!ss->early_init)
4914 cgroup_init_subsys(ss); 5030 cgroup_init_subsys(ss);
4915 if (ss->use_id)
4916 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
4917 } 5031 }
4918 5032
4919 /* allocate id for the dummy hierarchy */ 5033 /* allocate id for the dummy hierarchy */
@@ -4926,6 +5040,10 @@ int __init cgroup_init(void)
4926 5040
4927 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); 5041 BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
4928 5042
5043 err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
5044 0, 1, GFP_KERNEL);
5045 BUG_ON(err < 0);
5046
4929 mutex_unlock(&cgroup_root_mutex); 5047 mutex_unlock(&cgroup_root_mutex);
4930 mutex_unlock(&cgroup_mutex); 5048 mutex_unlock(&cgroup_mutex);
4931 5049
@@ -5082,7 +5200,7 @@ void cgroup_fork(struct task_struct *child)
5082 * Adds the task to the list running through its css_set if necessary and 5200 * Adds the task to the list running through its css_set if necessary and
5083 * call the subsystem fork() callbacks. Has to be after the task is 5201 * call the subsystem fork() callbacks. Has to be after the task is
5084 * visible on the task list in case we race with the first call to 5202 * visible on the task list in case we race with the first call to
5085 * cgroup_iter_start() - to guarantee that the new task ends up on its 5203 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5086 * list. 5204 * list.
5087 */ 5205 */
5088void cgroup_post_fork(struct task_struct *child) 5206void cgroup_post_fork(struct task_struct *child)
@@ -5195,10 +5313,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5195 */ 5313 */
5196 for_each_builtin_subsys(ss, i) { 5314 for_each_builtin_subsys(ss, i) {
5197 if (ss->exit) { 5315 if (ss->exit) {
5198 struct cgroup *old_cgrp = cset->subsys[i]->cgroup; 5316 struct cgroup_subsys_state *old_css = cset->subsys[i];
5199 struct cgroup *cgrp = task_cgroup(tsk, i); 5317 struct cgroup_subsys_state *css = task_css(tsk, i);
5200 5318
5201 ss->exit(cgrp, old_cgrp, tsk); 5319 ss->exit(css, old_css, tsk);
5202 } 5320 }
5203 } 5321 }
5204 } 5322 }
@@ -5329,210 +5447,56 @@ static int __init cgroup_disable(char *str)
5329} 5447}
5330__setup("cgroup_disable=", cgroup_disable); 5448__setup("cgroup_disable=", cgroup_disable);
5331 5449
5332/*
5333 * Functons for CSS ID.
5334 */
5335
5336/* to get ID other than 0, this should be called when !cgroup_is_dead() */
5337unsigned short css_id(struct cgroup_subsys_state *css)
5338{
5339 struct css_id *cssid;
5340
5341 /*
5342 * This css_id() can return correct value when somone has refcnt
5343 * on this or this is under rcu_read_lock(). Once css->id is allocated,
5344 * it's unchanged until freed.
5345 */
5346 cssid = rcu_dereference_raw(css->id);
5347
5348 if (cssid)
5349 return cssid->id;
5350 return 0;
5351}
5352EXPORT_SYMBOL_GPL(css_id);
5353
5354/** 5450/**
5355 * css_is_ancestor - test "root" css is an ancestor of "child" 5451 * css_from_dir - get corresponding css from the dentry of a cgroup dir
5356 * @child: the css to be tested. 5452 * @dentry: directory dentry of interest
5357 * @root: the css supporsed to be an ancestor of the child. 5453 * @ss: subsystem of interest
5358 * 5454 *
5359 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because 5455 * Must be called under RCU read lock. The caller is responsible for
5360 * this function reads css->id, the caller must hold rcu_read_lock(). 5456 * pinning the returned css if it needs to be accessed outside the RCU
5361 * But, considering usual usage, the csses should be valid objects after test. 5457 * critical section.
5362 * Assuming that the caller will do some action to the child if this returns
5363 * returns true, the caller must take "child";s reference count.
5364 * If "child" is valid object and this returns true, "root" is valid, too.
5365 */
5366
5367bool css_is_ancestor(struct cgroup_subsys_state *child,
5368 const struct cgroup_subsys_state *root)
5369{
5370 struct css_id *child_id;
5371 struct css_id *root_id;
5372
5373 child_id = rcu_dereference(child->id);
5374 if (!child_id)
5375 return false;
5376 root_id = rcu_dereference(root->id);
5377 if (!root_id)
5378 return false;
5379 if (child_id->depth < root_id->depth)
5380 return false;
5381 if (child_id->stack[root_id->depth] != root_id->id)
5382 return false;
5383 return true;
5384}
5385
5386void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
5387{
5388 struct css_id *id = rcu_dereference_protected(css->id, true);
5389
5390 /* When this is called before css_id initialization, id can be NULL */
5391 if (!id)
5392 return;
5393
5394 BUG_ON(!ss->use_id);
5395
5396 rcu_assign_pointer(id->css, NULL);
5397 rcu_assign_pointer(css->id, NULL);
5398 spin_lock(&ss->id_lock);
5399 idr_remove(&ss->idr, id->id);
5400 spin_unlock(&ss->id_lock);
5401 kfree_rcu(id, rcu_head);
5402}
5403EXPORT_SYMBOL_GPL(free_css_id);
5404
5405/*
5406 * This is called by init or create(). Then, calls to this function are
5407 * always serialized (By cgroup_mutex() at create()).
5408 */ 5458 */
5409 5459struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
5410static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) 5460 struct cgroup_subsys *ss)
5411{ 5461{
5412 struct css_id *newid; 5462 struct cgroup *cgrp;
5413 int ret, size;
5414
5415 BUG_ON(!ss->use_id);
5416
5417 size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
5418 newid = kzalloc(size, GFP_KERNEL);
5419 if (!newid)
5420 return ERR_PTR(-ENOMEM);
5421
5422 idr_preload(GFP_KERNEL);
5423 spin_lock(&ss->id_lock);
5424 /* Don't use 0. allocates an ID of 1-65535 */
5425 ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
5426 spin_unlock(&ss->id_lock);
5427 idr_preload_end();
5428
5429 /* Returns error when there are no free spaces for new ID.*/
5430 if (ret < 0)
5431 goto err_out;
5432
5433 newid->id = ret;
5434 newid->depth = depth;
5435 return newid;
5436err_out:
5437 kfree(newid);
5438 return ERR_PTR(ret);
5439
5440}
5441
5442static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
5443 struct cgroup_subsys_state *rootcss)
5444{
5445 struct css_id *newid;
5446
5447 spin_lock_init(&ss->id_lock);
5448 idr_init(&ss->idr);
5449
5450 newid = get_new_cssid(ss, 0);
5451 if (IS_ERR(newid))
5452 return PTR_ERR(newid);
5453
5454 newid->stack[0] = newid->id;
5455 RCU_INIT_POINTER(newid->css, rootcss);
5456 RCU_INIT_POINTER(rootcss->id, newid);
5457 return 0;
5458}
5459
5460static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
5461 struct cgroup *child)
5462{
5463 int subsys_id, i, depth = 0;
5464 struct cgroup_subsys_state *parent_css, *child_css;
5465 struct css_id *child_id, *parent_id;
5466
5467 subsys_id = ss->subsys_id;
5468 parent_css = parent->subsys[subsys_id];
5469 child_css = child->subsys[subsys_id];
5470 parent_id = rcu_dereference_protected(parent_css->id, true);
5471 depth = parent_id->depth + 1;
5472 5463
5473 child_id = get_new_cssid(ss, depth); 5464 WARN_ON_ONCE(!rcu_read_lock_held());
5474 if (IS_ERR(child_id))
5475 return PTR_ERR(child_id);
5476 5465
5477 for (i = 0; i < depth; i++) 5466 /* is @dentry a cgroup dir? */
5478 child_id->stack[i] = parent_id->stack[i]; 5467 if (!dentry->d_inode ||
5479 child_id->stack[depth] = child_id->id; 5468 dentry->d_inode->i_op != &cgroup_dir_inode_operations)
5480 /* 5469 return ERR_PTR(-EBADF);
5481 * child_id->css pointer will be set after this cgroup is available
5482 * see cgroup_populate_dir()
5483 */
5484 rcu_assign_pointer(child_css->id, child_id);
5485 5470
5486 return 0; 5471 cgrp = __d_cgrp(dentry);
5472 return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT);
5487} 5473}
5488 5474
5489/** 5475/**
5490 * css_lookup - lookup css by id 5476 * css_from_id - lookup css by id
5491 * @ss: cgroup subsys to be looked into. 5477 * @id: the cgroup id
5492 * @id: the id 5478 * @ss: cgroup subsys to be looked into
5493 * 5479 *
5494 * Returns pointer to cgroup_subsys_state if there is valid one with id. 5480 * Returns the css if there's a valid one with @id, otherwise returns NULL.
5495 * NULL if not. Should be called under rcu_read_lock() 5481 * Should be called under rcu_read_lock().
5496 */ 5482 */
5497struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) 5483struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
5498{
5499 struct css_id *cssid = NULL;
5500
5501 BUG_ON(!ss->use_id);
5502 cssid = idr_find(&ss->idr, id);
5503
5504 if (unlikely(!cssid))
5505 return NULL;
5506
5507 return rcu_dereference(cssid->css);
5508}
5509EXPORT_SYMBOL_GPL(css_lookup);
5510
5511/*
5512 * get corresponding css from file open on cgroupfs directory
5513 */
5514struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5515{ 5484{
5516 struct cgroup *cgrp; 5485 struct cgroup *cgrp;
5517 struct inode *inode;
5518 struct cgroup_subsys_state *css;
5519 5486
5520 inode = file_inode(f); 5487 rcu_lockdep_assert(rcu_read_lock_held() ||
5521 /* check in cgroup filesystem dir */ 5488 lockdep_is_held(&cgroup_mutex),
5522 if (inode->i_op != &cgroup_dir_inode_operations) 5489 "css_from_id() needs proper protection");
5523 return ERR_PTR(-EBADF);
5524 5490
5525 if (id < 0 || id >= CGROUP_SUBSYS_COUNT) 5491 cgrp = idr_find(&ss->root->cgroup_idr, id);
5526 return ERR_PTR(-EINVAL); 5492 if (cgrp)
5527 5493 return cgroup_css(cgrp, ss);
5528 /* get cgroup */ 5494 return NULL;
5529 cgrp = __d_cgrp(f->f_dentry);
5530 css = cgrp->subsys[id];
5531 return css ? css : ERR_PTR(-ENOENT);
5532} 5495}
5533 5496
5534#ifdef CONFIG_CGROUP_DEBUG 5497#ifdef CONFIG_CGROUP_DEBUG
5535static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) 5498static struct cgroup_subsys_state *
5499debug_css_alloc(struct cgroup_subsys_state *parent_css)
5536{ 5500{
5537 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5501 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5538 5502
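css_from_id() above replaces css_lookup(): ids now come from the per-hierarchy cgroup_idr instead of the removed css_id machinery, and the caller is expected to hold rcu_read_lock() and pin the result itself. A sketch of how a caller might resolve and pin a css from a stored id (get_css_by_id() is an invented wrapper, not part of the patch):

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

static struct cgroup_subsys_state *get_css_by_id(int id, struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, ss);	/* NULL if no cgroup currently has @id */
	if (css && !css_tryget(css))	/* pin it before leaving the RCU section */
		css = NULL;
	rcu_read_unlock();

	return css;	/* release with css_put() when done */
}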
@@ -5542,22 +5506,24 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
5542 return css; 5506 return css;
5543} 5507}
5544 5508
5545static void debug_css_free(struct cgroup *cgrp) 5509static void debug_css_free(struct cgroup_subsys_state *css)
5546{ 5510{
5547 kfree(cgrp->subsys[debug_subsys_id]); 5511 kfree(css);
5548} 5512}
5549 5513
5550static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) 5514static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
5515 struct cftype *cft)
5551{ 5516{
5552 return cgroup_task_count(cgrp); 5517 return cgroup_task_count(css->cgroup);
5553} 5518}
5554 5519
5555static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) 5520static u64 current_css_set_read(struct cgroup_subsys_state *css,
5521 struct cftype *cft)
5556{ 5522{
5557 return (u64)(unsigned long)current->cgroups; 5523 return (u64)(unsigned long)current->cgroups;
5558} 5524}
5559 5525
5560static u64 current_css_set_refcount_read(struct cgroup *cgrp, 5526static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
5561 struct cftype *cft) 5527 struct cftype *cft)
5562{ 5528{
5563 u64 count; 5529 u64 count;
@@ -5568,7 +5534,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp,
5568 return count; 5534 return count;
5569} 5535}
5570 5536
5571static int current_css_set_cg_links_read(struct cgroup *cgrp, 5537static int current_css_set_cg_links_read(struct cgroup_subsys_state *css,
5572 struct cftype *cft, 5538 struct cftype *cft,
5573 struct seq_file *seq) 5539 struct seq_file *seq)
5574{ 5540{
@@ -5595,14 +5561,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp,
5595} 5561}
5596 5562
5597#define MAX_TASKS_SHOWN_PER_CSS 25 5563#define MAX_TASKS_SHOWN_PER_CSS 25
5598static int cgroup_css_links_read(struct cgroup *cgrp, 5564static int cgroup_css_links_read(struct cgroup_subsys_state *css,
5599 struct cftype *cft, 5565 struct cftype *cft, struct seq_file *seq)
5600 struct seq_file *seq)
5601{ 5566{
5602 struct cgrp_cset_link *link; 5567 struct cgrp_cset_link *link;
5603 5568
5604 read_lock(&css_set_lock); 5569 read_lock(&css_set_lock);
5605 list_for_each_entry(link, &cgrp->cset_links, cset_link) { 5570 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5606 struct css_set *cset = link->cset; 5571 struct css_set *cset = link->cset;
5607 struct task_struct *task; 5572 struct task_struct *task;
5608 int count = 0; 5573 int count = 0;
@@ -5621,9 +5586,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp,
5621 return 0; 5586 return 0;
5622} 5587}
5623 5588
5624static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) 5589static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5625{ 5590{
5626 return test_bit(CGRP_RELEASABLE, &cgrp->flags); 5591 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
5627} 5592}
5628 5593
5629static struct cftype debug_files[] = { 5594static struct cftype debug_files[] = {
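The debug handlers above show the calling convention every cftype method ends up with after this series: the first argument is the cgroup_subsys_state, and a controller recovers its private state by embedding the css in its own structure and using container_of(). A minimal sketch of that embedding for a made-up controller (struct foo_state, foo_css(), foo_weight_read() and foo_files are invented names; the read_u64 signature matches the converted handlers above):

#include <linux/cgroup.h>
#include <linux/kernel.h>

struct foo_state {
	struct cgroup_subsys_state css;	/* embedded so container_of() works */
	u64 weight;
};

static inline struct foo_state *foo_css(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct foo_state, css) : NULL;
}

/* same read_u64 shape as the converted debug handlers above */
static u64 foo_weight_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	return foo_css(css)->weight;
}

static struct cftype foo_files[] = {
	{
		.name = "weight",
		.read_u64 = foo_weight_read,
	},
	{ }	/* empty name terminates the array */
};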
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 75dda1ea5026..f0ff64d0ebaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -45,25 +45,19 @@ struct freezer {
45 spinlock_t lock; 45 spinlock_t lock;
46}; 46};
47 47
48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) 48static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
49{ 49{
50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), 50 return css ? container_of(css, struct freezer, css) : NULL;
51 struct freezer, css);
52} 51}
53 52
54static inline struct freezer *task_freezer(struct task_struct *task) 53static inline struct freezer *task_freezer(struct task_struct *task)
55{ 54{
56 return container_of(task_subsys_state(task, freezer_subsys_id), 55 return css_freezer(task_css(task, freezer_subsys_id));
57 struct freezer, css);
58} 56}
59 57
60static struct freezer *parent_freezer(struct freezer *freezer) 58static struct freezer *parent_freezer(struct freezer *freezer)
61{ 59{
62 struct cgroup *pcg = freezer->css.cgroup->parent; 60 return css_freezer(css_parent(&freezer->css));
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67} 61}
68 62
69bool cgroup_freezing(struct task_struct *task) 63bool cgroup_freezing(struct task_struct *task)
@@ -92,7 +86,8 @@ static const char *freezer_state_strs(unsigned int state)
92 86
93struct cgroup_subsys freezer_subsys; 87struct cgroup_subsys freezer_subsys;
94 88
95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) 89static struct cgroup_subsys_state *
90freezer_css_alloc(struct cgroup_subsys_state *parent_css)
96{ 91{
97 struct freezer *freezer; 92 struct freezer *freezer;
98 93
@@ -105,22 +100,22 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
105} 100}
106 101
107/** 102/**
108 * freezer_css_online - commit creation of a freezer cgroup 103 * freezer_css_online - commit creation of a freezer css
109 * @cgroup: cgroup being created 104 * @css: css being created
110 * 105 *
111 * We're committing to creation of @cgroup. Mark it online and inherit 106 * We're committing to creation of @css. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our 107 * parent's freezing state while holding both parent's and our
113 * freezer->lock. 108 * freezer->lock.
114 */ 109 */
115static int freezer_css_online(struct cgroup *cgroup) 110static int freezer_css_online(struct cgroup_subsys_state *css)
116{ 111{
117 struct freezer *freezer = cgroup_freezer(cgroup); 112 struct freezer *freezer = css_freezer(css);
118 struct freezer *parent = parent_freezer(freezer); 113 struct freezer *parent = parent_freezer(freezer);
119 114
120 /* 115 /*
121 * The following double locking and freezing state inheritance 116 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing 117 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details. 118 * states. See css_for_each_descendant_pre() for details.
124 */ 119 */
125 if (parent) 120 if (parent)
126 spin_lock_irq(&parent->lock); 121 spin_lock_irq(&parent->lock);
@@ -141,15 +136,15 @@ static int freezer_css_online(struct cgroup *cgroup)
141} 136}
142 137
143/** 138/**
144 * freezer_css_offline - initiate destruction of @cgroup 139 * freezer_css_offline - initiate destruction of a freezer css
145 * @cgroup: cgroup being destroyed 140 * @css: css being destroyed
146 * 141 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count 142 * @css is going away. Mark it dead and decrement system_freezing_count if
148 * if it was holding one. 143 * it was holding one.
149 */ 144 */
150static void freezer_css_offline(struct cgroup *cgroup) 145static void freezer_css_offline(struct cgroup_subsys_state *css)
151{ 146{
152 struct freezer *freezer = cgroup_freezer(cgroup); 147 struct freezer *freezer = css_freezer(css);
153 148
154 spin_lock_irq(&freezer->lock); 149 spin_lock_irq(&freezer->lock);
155 150
@@ -161,9 +156,9 @@ static void freezer_css_offline(struct cgroup *cgroup)
161 spin_unlock_irq(&freezer->lock); 156 spin_unlock_irq(&freezer->lock);
162} 157}
163 158
164static void freezer_css_free(struct cgroup *cgroup) 159static void freezer_css_free(struct cgroup_subsys_state *css)
165{ 160{
166 kfree(cgroup_freezer(cgroup)); 161 kfree(css_freezer(css));
167} 162}
168 163
169/* 164/*
@@ -175,25 +170,26 @@ static void freezer_css_free(struct cgroup *cgroup)
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the 170 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks. 171 * current state and all following state changes can see the new tasks.
177 */ 172 */
178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) 173static void freezer_attach(struct cgroup_subsys_state *new_css,
174 struct cgroup_taskset *tset)
179{ 175{
180 struct freezer *freezer = cgroup_freezer(new_cgrp); 176 struct freezer *freezer = css_freezer(new_css);
181 struct task_struct *task; 177 struct task_struct *task;
182 bool clear_frozen = false; 178 bool clear_frozen = false;
183 179
184 spin_lock_irq(&freezer->lock); 180 spin_lock_irq(&freezer->lock);
185 181
186 /* 182 /*
187 * Make the new tasks conform to the current state of @new_cgrp. 183 * Make the new tasks conform to the current state of @new_css.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we 184 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the 185 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later. 186 * correct state later.
191 * 187 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its 188 * Tasks in @tset are on @new_css but may not conform to its
193 * current state before executing the following - !frozen tasks may 189 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 190 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
195 */ 191 */
196 cgroup_taskset_for_each(task, new_cgrp, tset) { 192 cgroup_taskset_for_each(task, new_css, tset) {
197 if (!(freezer->state & CGROUP_FREEZING)) { 193 if (!(freezer->state & CGROUP_FREEZING)) {
198 __thaw_task(task); 194 __thaw_task(task);
199 } else { 195 } else {
@@ -231,7 +227,7 @@ static void freezer_fork(struct task_struct *task)
231 * The root cgroup is non-freezable, so we can skip the 227 * The root cgroup is non-freezable, so we can skip the
232 * following check. 228 * following check.
233 */ 229 */
234 if (!freezer->css.cgroup->parent) 230 if (!parent_freezer(freezer))
235 goto out; 231 goto out;
236 232
237 spin_lock_irq(&freezer->lock); 233 spin_lock_irq(&freezer->lock);
@@ -244,7 +240,7 @@ out:
244 240
245/** 241/**
246 * update_if_frozen - update whether a cgroup finished freezing 242 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest 243 * @css: css of interest
248 * 244 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by 245 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN, 246 * calling this function. If the current state is FREEZING but not FROZEN,
@@ -255,14 +251,14 @@ out:
255 * update_if_frozen() on all descendants prior to invoking this function. 251 * update_if_frozen() on all descendants prior to invoking this function.
256 * 252 *
257 * Task states and freezer state might disagree while tasks are being 253 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against 254 * migrated into or out of @css, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details. 255 * @freezer state here. See freezer_attach() for details.
260 */ 256 */
261static void update_if_frozen(struct cgroup *cgroup) 257static void update_if_frozen(struct cgroup_subsys_state *css)
262{ 258{
263 struct freezer *freezer = cgroup_freezer(cgroup); 259 struct freezer *freezer = css_freezer(css);
264 struct cgroup *pos; 260 struct cgroup_subsys_state *pos;
265 struct cgroup_iter it; 261 struct css_task_iter it;
266 struct task_struct *task; 262 struct task_struct *task;
267 263
268 WARN_ON_ONCE(!rcu_read_lock_held()); 264 WARN_ON_ONCE(!rcu_read_lock_held());
@@ -274,8 +270,8 @@ static void update_if_frozen(struct cgroup *cgroup)
274 goto out_unlock; 270 goto out_unlock;
275 271
276 /* are all (live) children frozen? */ 272 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) { 273 css_for_each_child(pos, css) {
278 struct freezer *child = cgroup_freezer(pos); 274 struct freezer *child = css_freezer(pos);
279 275
280 if ((child->state & CGROUP_FREEZER_ONLINE) && 276 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN)) 277 !(child->state & CGROUP_FROZEN))
@@ -283,9 +279,9 @@ static void update_if_frozen(struct cgroup *cgroup)
283 } 279 }
284 280
285 /* are all tasks frozen? */ 281 /* are all tasks frozen? */
286 cgroup_iter_start(cgroup, &it); 282 css_task_iter_start(css, &it);
287 283
288 while ((task = cgroup_iter_next(cgroup, &it))) { 284 while ((task = css_task_iter_next(&it))) {
289 if (freezing(task)) { 285 if (freezing(task)) {
290 /* 286 /*
291 * freezer_should_skip() indicates that the task 287 * freezer_should_skip() indicates that the task
@@ -300,52 +296,49 @@ static void update_if_frozen(struct cgroup *cgroup)
300 296
301 freezer->state |= CGROUP_FROZEN; 297 freezer->state |= CGROUP_FROZEN;
302out_iter_end: 298out_iter_end:
303 cgroup_iter_end(cgroup, &it); 299 css_task_iter_end(&it);
304out_unlock: 300out_unlock:
305 spin_unlock_irq(&freezer->lock); 301 spin_unlock_irq(&freezer->lock);
306} 302}
307 303
308static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 304static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft,
309 struct seq_file *m) 305 struct seq_file *m)
310{ 306{
311 struct cgroup *pos; 307 struct cgroup_subsys_state *pos;
312 308
313 rcu_read_lock(); 309 rcu_read_lock();
314 310
315 /* update states bottom-up */ 311 /* update states bottom-up */
316 cgroup_for_each_descendant_post(pos, cgroup) 312 css_for_each_descendant_post(pos, css)
317 update_if_frozen(pos); 313 update_if_frozen(pos);
318 update_if_frozen(cgroup);
319 314
320 rcu_read_unlock(); 315 rcu_read_unlock();
321 316
322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); 317 seq_puts(m, freezer_state_strs(css_freezer(css)->state));
323 seq_putc(m, '\n'); 318 seq_putc(m, '\n');
324 return 0; 319 return 0;
325} 320}
326 321
327static void freeze_cgroup(struct freezer *freezer) 322static void freeze_cgroup(struct freezer *freezer)
328{ 323{
329 struct cgroup *cgroup = freezer->css.cgroup; 324 struct css_task_iter it;
330 struct cgroup_iter it;
331 struct task_struct *task; 325 struct task_struct *task;
332 326
333 cgroup_iter_start(cgroup, &it); 327 css_task_iter_start(&freezer->css, &it);
334 while ((task = cgroup_iter_next(cgroup, &it))) 328 while ((task = css_task_iter_next(&it)))
335 freeze_task(task); 329 freeze_task(task);
336 cgroup_iter_end(cgroup, &it); 330 css_task_iter_end(&it);
337} 331}
338 332
339static void unfreeze_cgroup(struct freezer *freezer) 333static void unfreeze_cgroup(struct freezer *freezer)
340{ 334{
341 struct cgroup *cgroup = freezer->css.cgroup; 335 struct css_task_iter it;
342 struct cgroup_iter it;
343 struct task_struct *task; 336 struct task_struct *task;
344 337
345 cgroup_iter_start(cgroup, &it); 338 css_task_iter_start(&freezer->css, &it);
346 while ((task = cgroup_iter_next(cgroup, &it))) 339 while ((task = css_task_iter_next(&it)))
347 __thaw_task(task); 340 __thaw_task(task);
348 cgroup_iter_end(cgroup, &it); 341 css_task_iter_end(&it);
349} 342}
350 343
351/** 344/**
@@ -395,12 +388,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
395 */ 388 */
396static void freezer_change_state(struct freezer *freezer, bool freeze) 389static void freezer_change_state(struct freezer *freezer, bool freeze)
397{ 390{
398 struct cgroup *pos; 391 struct cgroup_subsys_state *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
403 spin_unlock_irq(&freezer->lock);
404 392
405 /* 393 /*
406 * Update all its descendants in pre-order traversal. Each 394 * Update all its descendants in pre-order traversal. Each
@@ -408,24 +396,33 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
408 * CGROUP_FREEZING_PARENT. 396 * CGROUP_FREEZING_PARENT.
409 */ 397 */
410 rcu_read_lock(); 398 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { 399 css_for_each_descendant_pre(pos, &freezer->css) {
412 struct freezer *pos_f = cgroup_freezer(pos); 400 struct freezer *pos_f = css_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f); 401 struct freezer *parent = parent_freezer(pos_f);
414 402
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
418 * synchronization, see freezer_post_create().
419 */
420 spin_lock_irq(&pos_f->lock); 403 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, 404
422 CGROUP_FREEZING_PARENT); 405 if (pos_f == freezer) {
406 freezer_apply_state(pos_f, freeze,
407 CGROUP_FREEZING_SELF);
408 } else {
409 /*
410 * Our update to @parent->state is already visible
411 * which is all we need. No need to lock @parent.
412 * For more info on synchronization, see
413 * freezer_post_create().
414 */
415 freezer_apply_state(pos_f,
416 parent->state & CGROUP_FREEZING,
417 CGROUP_FREEZING_PARENT);
418 }
419
423 spin_unlock_irq(&pos_f->lock); 420 spin_unlock_irq(&pos_f->lock);
424 } 421 }
425 rcu_read_unlock(); 422 rcu_read_unlock();
426} 423}
427 424
428static int freezer_write(struct cgroup *cgroup, struct cftype *cft, 425static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
429 const char *buffer) 426 const char *buffer)
430{ 427{
431 bool freeze; 428 bool freeze;
@@ -437,20 +434,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
437 else 434 else
438 return -EINVAL; 435 return -EINVAL;
439 436
440 freezer_change_state(cgroup_freezer(cgroup), freeze); 437 freezer_change_state(css_freezer(css), freeze);
441 return 0; 438 return 0;
442} 439}
443 440
444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) 441static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
442 struct cftype *cft)
445{ 443{
446 struct freezer *freezer = cgroup_freezer(cgroup); 444 struct freezer *freezer = css_freezer(css);
447 445
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF); 446 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449} 447}
450 448
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) 449static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
450 struct cftype *cft)
452{ 451{
453 struct freezer *freezer = cgroup_freezer(cgroup); 452 struct freezer *freezer = css_freezer(css);
454 453
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT); 454 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
456} 455}
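
The freezer_change_state() rework above drops the separate self-update and handles both cases inside one pre-order descendant walk: the freezer being written to gets CGROUP_FREEZING_SELF, and every descendant inherits CGROUP_FREEZING_PARENT from the state its parent was given earlier in the same pass. A minimal user-space sketch of that propagation follows; plain recursion and a fixed toy tree stand in for the css iterator, the per-freezer spinlocks and RCU, and all names other than the flag macros are invented.

/*
 * User-space model of the one-pass freezer state propagation: apply
 * FREEZING_SELF at the written-to node, FREEZING_PARENT below it, based
 * on the parent state updated earlier in the same pre-order walk.
 */
#include <stdbool.h>
#include <stdio.h>

#define CGROUP_FREEZING_SELF   0x1
#define CGROUP_FREEZING_PARENT 0x2
#define CGROUP_FREEZING        (CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT)

struct freezer {
	const char *name;
	unsigned int state;
	struct freezer *parent;
	struct freezer *children[4];
};

static void freezer_apply_state(struct freezer *f, bool freeze, unsigned int bit)
{
	if (freeze)
		f->state |= bit;
	else
		f->state &= ~bit;
}

/* Pre-order walk: visit the node itself first, then each subtree. */
static void walk(struct freezer *pos, struct freezer *target, bool freeze)
{
	if (pos == target)
		freezer_apply_state(pos, freeze, CGROUP_FREEZING_SELF);
	else
		freezer_apply_state(pos, pos->parent->state & CGROUP_FREEZING,
				    CGROUP_FREEZING_PARENT);

	for (int i = 0; i < 4 && pos->children[i]; i++)
		walk(pos->children[i], target, freeze);
}

int main(void)
{
	struct freezer root = { .name = "root" };
	struct freezer a = { .name = "a", .parent = &root };
	struct freezer a1 = { .name = "a/1", .parent = &a };

	root.children[0] = &a;
	a.children[0] = &a1;

	walk(&root, &root, true);	/* echo FROZEN > root/freezer.state */

	printf("%-5s state=%#x\n", root.name, root.state);
	printf("%-5s state=%#x\n", a.name, a.state);
	printf("%-5s state=%#x\n", a1.name, a1.state);
	return 0;
}

Because pre-order visits a parent before its children, each child reads a parent->state that has already been updated in the same pass, which is the property the hunk above relies on when it folds the self-update into the loop.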
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 383f8231e436..e5f3917aa05b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -20,26 +20,46 @@
20#include <linux/hardirq.h> 20#include <linux/hardirq.h>
21#include <linux/export.h> 21#include <linux/export.h>
22 22
23DEFINE_PER_CPU(struct context_tracking, context_tracking) = { 23#define CREATE_TRACE_POINTS
24#ifdef CONFIG_CONTEXT_TRACKING_FORCE 24#include <trace/events/context_tracking.h>
25 .active = true, 25
26#endif 26struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE;
27}; 27EXPORT_SYMBOL_GPL(context_tracking_enabled);
28
29DEFINE_PER_CPU(struct context_tracking, context_tracking);
30EXPORT_SYMBOL_GPL(context_tracking);
31
32void context_tracking_cpu_set(int cpu)
33{
34 if (!per_cpu(context_tracking.active, cpu)) {
35 per_cpu(context_tracking.active, cpu) = true;
36 static_key_slow_inc(&context_tracking_enabled);
37 }
38}
28 39
29/** 40/**
30 * user_enter - Inform the context tracking that the CPU is going to 41 * context_tracking_user_enter - Inform the context tracking that the CPU is going to
31 * enter userspace mode. 42 * enter userspace mode.
32 * 43 *
33 * This function must be called right before we switch from the kernel 44 * This function must be called right before we switch from the kernel
34 * to userspace, when it's guaranteed the remaining kernel instructions 45 * to userspace, when it's guaranteed the remaining kernel instructions
35 * to execute won't use any RCU read side critical section because this 46 * to execute won't use any RCU read side critical section because this
36 * function sets RCU in extended quiescent state. 47 * function sets RCU in extended quiescent state.
37 */ 48 */
38void user_enter(void) 49void context_tracking_user_enter(void)
39{ 50{
40 unsigned long flags; 51 unsigned long flags;
41 52
42 /* 53 /*
54 * Repeat the user_enter() check here because some archs may be calling
55 * this from asm and if no CPU needs context tracking, they shouldn't
56 * go further. Repeat the check here until they support the static key
57 * check.
58 */
59 if (!static_key_false(&context_tracking_enabled))
60 return;
61
62 /*
 43 * Some contexts may involve an exception occurring in an irq, 63 * Some contexts may involve an exception occurring in an irq,

44 * leading to that nesting: 64 * leading to that nesting:
45 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() 65 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
@@ -54,17 +74,32 @@ void user_enter(void)
54 WARN_ON_ONCE(!current->mm); 74 WARN_ON_ONCE(!current->mm);
55 75
56 local_irq_save(flags); 76 local_irq_save(flags);
57 if (__this_cpu_read(context_tracking.active) && 77 if ( __this_cpu_read(context_tracking.state) != IN_USER) {
58 __this_cpu_read(context_tracking.state) != IN_USER) { 78 if (__this_cpu_read(context_tracking.active)) {
79 trace_user_enter(0);
80 /*
81 * At this stage, only low level arch entry code remains and
82 * then we'll run in userspace. We can assume there won't be
83 * any RCU read-side critical section until the next call to
84 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
85 * on the tick.
86 */
87 vtime_user_enter(current);
88 rcu_user_enter();
89 }
59 /* 90 /*
60 * At this stage, only low level arch entry code remains and 91 * Even if context tracking is disabled on this CPU, because it's outside
61 * then we'll run in userspace. We can assume there won't be 92 * the full dynticks mask for example, we still have to keep track of the
62 * any RCU read-side critical section until the next call to 93 * context transitions and states to prevent inconsistency on those of
63 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency 94 * other CPUs.
 64 * on the tick. 95 * If a task triggers an exception in userspace, sleeps in the exception
 96 * handler and then migrates to another CPU, that new CPU must know where
97 * the exception returns by the time we call exception_exit().
98 * This information can only be provided by the previous CPU when it called
99 * exception_enter().
100 * OTOH we can spare the calls to vtime and RCU when context_tracking.active
101 * is false because we know that CPU is not tickless.
65 */ 102 */
66 vtime_user_enter(current);
67 rcu_user_enter();
68 __this_cpu_write(context_tracking.state, IN_USER); 103 __this_cpu_write(context_tracking.state, IN_USER);
69 } 104 }
70 local_irq_restore(flags); 105 local_irq_restore(flags);
@@ -85,12 +120,11 @@ void user_enter(void)
85 * instead of preempt_schedule() to exit user context if needed before 120 * instead of preempt_schedule() to exit user context if needed before
86 * calling the scheduler. 121 * calling the scheduler.
87 */ 122 */
88void __sched notrace preempt_schedule_context(void) 123asmlinkage void __sched notrace preempt_schedule_context(void)
89{ 124{
90 struct thread_info *ti = current_thread_info();
91 enum ctx_state prev_ctx; 125 enum ctx_state prev_ctx;
92 126
93 if (likely(ti->preempt_count || irqs_disabled())) 127 if (likely(!preemptible()))
94 return; 128 return;
95 129
96 /* 130 /*
@@ -112,8 +146,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
112#endif /* CONFIG_PREEMPT */ 146#endif /* CONFIG_PREEMPT */
113 147
114/** 148/**
115 * user_exit - Inform the context tracking that the CPU is 149 * context_tracking_user_exit - Inform the context tracking that the CPU is
116 * exiting userspace mode and entering the kernel. 150 * exiting userspace mode and entering the kernel.
117 * 151 *
118 * This function must be called after we entered the kernel from userspace 152 * This function must be called after we entered the kernel from userspace
 119 * before any use of RCU read side critical section. This potentially includes 153 * before any use of RCU read side critical section. This potentially includes
@@ -122,47 +156,34 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context);
122 * This call supports re-entrancy. This way it can be called from any exception 156 * This call supports re-entrancy. This way it can be called from any exception
123 * handler without needing to know if we came from userspace or not. 157 * handler without needing to know if we came from userspace or not.
124 */ 158 */
125void user_exit(void) 159void context_tracking_user_exit(void)
126{ 160{
127 unsigned long flags; 161 unsigned long flags;
128 162
163 if (!static_key_false(&context_tracking_enabled))
164 return;
165
129 if (in_interrupt()) 166 if (in_interrupt())
130 return; 167 return;
131 168
132 local_irq_save(flags); 169 local_irq_save(flags);
133 if (__this_cpu_read(context_tracking.state) == IN_USER) { 170 if (__this_cpu_read(context_tracking.state) == IN_USER) {
134 /* 171 if (__this_cpu_read(context_tracking.active)) {
135 * We are going to run code that may use RCU. Inform 172 /*
136 * RCU core about that (ie: we may need the tick again). 173 * We are going to run code that may use RCU. Inform
137 */ 174 * RCU core about that (ie: we may need the tick again).
138 rcu_user_exit(); 175 */
139 vtime_user_exit(current); 176 rcu_user_exit();
177 vtime_user_exit(current);
178 trace_user_exit(0);
179 }
140 __this_cpu_write(context_tracking.state, IN_KERNEL); 180 __this_cpu_write(context_tracking.state, IN_KERNEL);
141 } 181 }
142 local_irq_restore(flags); 182 local_irq_restore(flags);
143} 183}
144 184
145void guest_enter(void)
146{
147 if (vtime_accounting_enabled())
148 vtime_guest_enter(current);
149 else
150 __guest_enter();
151}
152EXPORT_SYMBOL_GPL(guest_enter);
153
154void guest_exit(void)
155{
156 if (vtime_accounting_enabled())
157 vtime_guest_exit(current);
158 else
159 __guest_exit();
160}
161EXPORT_SYMBOL_GPL(guest_exit);
162
163
164/** 185/**
165 * context_tracking_task_switch - context switch the syscall callbacks 186 * __context_tracking_task_switch - context switch the syscall callbacks
166 * @prev: the task that is being switched out 187 * @prev: the task that is being switched out
167 * @next: the task that is being switched in 188 * @next: the task that is being switched in
168 * 189 *
@@ -174,11 +195,19 @@ EXPORT_SYMBOL_GPL(guest_exit);
174 * migrate to some CPU that doesn't do the context tracking. As such the TIF 195 * migrate to some CPU that doesn't do the context tracking. As such the TIF
175 * flag may not be desired there. 196 * flag may not be desired there.
176 */ 197 */
177void context_tracking_task_switch(struct task_struct *prev, 198void __context_tracking_task_switch(struct task_struct *prev,
178 struct task_struct *next) 199 struct task_struct *next)
179{ 200{
180 if (__this_cpu_read(context_tracking.active)) { 201 clear_tsk_thread_flag(prev, TIF_NOHZ);
181 clear_tsk_thread_flag(prev, TIF_NOHZ); 202 set_tsk_thread_flag(next, TIF_NOHZ);
182 set_tsk_thread_flag(next, TIF_NOHZ);
183 }
184} 203}
204
205#ifdef CONFIG_CONTEXT_TRACKING_FORCE
206void __init context_tracking_init(void)
207{
208 int cpu;
209
210 for_each_possible_cpu(cpu)
211 context_tracking_cpu_set(cpu);
212}
213#endif
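
Two things change in context_tracking.c above: a global enable (a static key in the kernel, so the disabled case costs only a patched-out branch) gates all per-CPU work, and the IN_USER/IN_KERNEL transitions are now recorded even on CPUs whose context_tracking.active is false, so a task that takes an exception and migrates still sees a consistent state. Below is a rough user-space model of that split; a plain bool stands in for the static key, an array for the per-CPU data, and the bodies just print what the kernel would do, so this is only an illustration.

#include <stdbool.h>
#include <stdio.h>

enum ctx_state { IN_KERNEL = 0, IN_USER };

struct context_tracking {
	bool active;		/* does this CPU need vtime/RCU hooks? */
	enum ctx_state state;	/* always maintained, even if !active */
};

#define NR_CPUS 4
static bool context_tracking_enabled;		/* static key stand-in */
static struct context_tracking ct[NR_CPUS];

static void context_tracking_cpu_set(int cpu)
{
	if (!ct[cpu].active) {
		ct[cpu].active = true;
		context_tracking_enabled = true;	/* static_key_slow_inc() */
	}
}

static void user_enter(int cpu)
{
	if (!context_tracking_enabled)		/* cheap check for everyone */
		return;
	if (ct[cpu].state != IN_USER) {
		if (ct[cpu].active)
			printf("cpu%d: vtime_user_enter + rcu_user_enter\n", cpu);
		/* state is tracked either way so a migrated task stays consistent */
		ct[cpu].state = IN_USER;
	}
}

static void user_exit(int cpu)
{
	if (!context_tracking_enabled)
		return;
	if (ct[cpu].state == IN_USER) {
		if (ct[cpu].active)
			printf("cpu%d: rcu_user_exit + vtime_user_exit\n", cpu);
		ct[cpu].state = IN_KERNEL;
	}
}

int main(void)
{
	context_tracking_cpu_set(1);	/* only cpu1 is "full dynticks" */

	user_enter(0);			/* transition recorded, no hooks run */
	user_enter(1);			/* transition recorded and hooks run */
	user_exit(0);
	user_exit(1);

	printf("cpu0 state=%d cpu1 state=%d\n", ct[0].state, ct[1].state);
	return 0;
}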
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b2b227b82123..63aa50d7ce1e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
113 * get_online_cpus() not an api which is called all that often. 113 * get_online_cpus() not an api which is called all that often.
114 * 114 *
115 */ 115 */
116static void cpu_hotplug_begin(void) 116void cpu_hotplug_begin(void)
117{ 117{
118 cpu_hotplug.active_writer = current; 118 cpu_hotplug.active_writer = current;
119 119
@@ -127,7 +127,7 @@ static void cpu_hotplug_begin(void)
127 } 127 }
128} 128}
129 129
130static void cpu_hotplug_done(void) 130void cpu_hotplug_done(void)
131{ 131{
132 cpu_hotplug.active_writer = NULL; 132 cpu_hotplug.active_writer = NULL;
133 mutex_unlock(&cpu_hotplug.lock); 133 mutex_unlock(&cpu_hotplug.lock);
@@ -154,10 +154,7 @@ void cpu_hotplug_enable(void)
154 cpu_maps_update_done(); 154 cpu_maps_update_done();
155} 155}
156 156
157#else /* #if CONFIG_HOTPLUG_CPU */ 157#endif /* CONFIG_HOTPLUG_CPU */
158static void cpu_hotplug_begin(void) {}
159static void cpu_hotplug_done(void) {}
160#endif /* #else #if CONFIG_HOTPLUG_CPU */
161 158
162/* Need to know about CPUs going up/down? */ 159/* Need to know about CPUs going up/down? */
163int __ref register_cpu_notifier(struct notifier_block *nb) 160int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -311,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
311 } 308 }
312 smpboot_park_threads(cpu); 309 smpboot_park_threads(cpu);
313 310
311 /*
312 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
313 * and RCU users of this state to go away such that all new such users
314 * will observe it.
315 *
316 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
317 * not imply sync_sched(), so explicitly call both.
318 */
319#ifdef CONFIG_PREEMPT
320 synchronize_sched();
321#endif
322 synchronize_rcu();
323
324 /*
325 * So now all preempt/rcu users must observe !cpu_active().
326 */
327
314 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 328 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
315 if (err) { 329 if (err) {
316 /* CPU didn't die: tell everyone. Can't complain. */ 330 /* CPU didn't die: tell everyone. Can't complain. */
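
The synchronize_sched()/synchronize_rcu() pair added to _cpu_down() above is a grace-period wait: once it returns, every preempt-disabled or RCU read-side section that might have sampled the old cpu_active_mask has finished, so the subsequent __stop_machine() only runs against readers that already observe the CPU as inactive. Userspace RCU exposes the same primitive; the sketch below shows the generic publish-then-wait-then-retire pattern rather than the hotplug code itself. It assumes liburcu is installed (build with something like cc sketch.c -lurcu -lpthread; newer releases may want urcu/urcu-memb.h and -lurcu-memb), and all struct and variable names are invented.

#include <urcu.h>		/* userspace RCU, default (memb) flavour */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct cpu_view {
	int active;
};

static struct cpu_view *view;

static void *reader(void *arg)
{
	rcu_register_thread();
	for (int i = 0; i < 100000; i++) {
		rcu_read_lock();
		struct cpu_view *v = rcu_dereference(view);
		if (v->active) {
			/* ... act on the "active" CPU ... */
		}
		rcu_read_unlock();
	}
	rcu_unregister_thread();
	return NULL;
}

int main(void)
{
	pthread_t tid;

	view = calloc(1, sizeof(*view));
	view->active = 1;

	pthread_create(&tid, NULL, reader, NULL);
	usleep(1000);

	/* Publish the "inactive" view. */
	struct cpu_view *newv = calloc(1, sizeof(*newv));
	newv->active = 0;
	struct cpu_view *old = view;
	rcu_assign_pointer(view, newv);

	/*
	 * Wait out every reader that might still hold the old view.  After
	 * this returns, all readers observe active == 0 -- the same kind of
	 * guarantee _cpu_down() wants about !cpu_active() before it calls
	 * __stop_machine().
	 */
	synchronize_rcu();
	free(old);

	pthread_join(tid, NULL);
	free(newv);
	return 0;
}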
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a0bcb5..988573a9a387 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
44 rcu_idle_enter(); 44 rcu_idle_enter();
45 trace_cpu_idle_rcuidle(0, smp_processor_id()); 45 trace_cpu_idle_rcuidle(0, smp_processor_id());
46 local_irq_enable(); 46 local_irq_enable();
47 while (!need_resched()) 47 while (!tif_need_resched())
48 cpu_relax(); 48 cpu_relax();
49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
50 rcu_idle_exit(); 50 rcu_idle_exit();
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void)
92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) { 92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
93 cpu_idle_poll(); 93 cpu_idle_poll();
94 } else { 94 } else {
95 current_clr_polling(); 95 if (!current_clr_polling_and_test()) {
96 if (!need_resched()) {
97 stop_critical_timings(); 96 stop_critical_timings();
98 rcu_idle_enter(); 97 rcu_idle_enter();
99 arch_cpu_idle(); 98 arch_cpu_idle();
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void)
103 } else { 102 } else {
104 local_irq_enable(); 103 local_irq_enable();
105 } 104 }
106 current_set_polling(); 105 __current_set_polling();
107 } 106 }
108 arch_cpu_idle_exit(); 107 arch_cpu_idle_exit();
108 /*
109 * We need to test and propagate the TIF_NEED_RESCHED
 110 * bit here because we might not have sent the
111 * reschedule IPI to idle tasks.
112 */
113 if (tif_need_resched())
114 set_preempt_need_resched();
109 } 115 }
110 tick_nohz_idle_exit(); 116 tick_nohz_idle_exit();
111 schedule_preempt_disabled(); 117 schedule_preempt_disabled();
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state)
129 */ 135 */
130 boot_init_stack_canary(); 136 boot_init_stack_canary();
131#endif 137#endif
132 current_set_polling(); 138 __current_set_polling();
133 arch_cpu_idle_prepare(); 139 arch_cpu_idle_prepare();
134 cpu_idle_loop(); 140 cpu_idle_loop();
135} 141}
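
The idle-loop changes revolve around TIF_POLLING_NRFLAG: a polling idle CPU notices TIF_NEED_RESCHED by itself, so the waker can skip the reschedule IPI, but the CPU must clear the polling bit and re-check the flag (current_clr_polling_and_test(), tif_need_resched()) before committing to a non-polling idle state, otherwise a request racing with the clear could be lost. The C11-atomics model below shows that handshake with two threads standing in for two CPUs; the flag word and helper names mirror the kernel ones, but everything else is invented.

/* build with: cc -pthread idle_poll.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TIF_POLLING_NRFLAG (1u << 0)
#define TIF_NEED_RESCHED   (1u << 1)

static atomic_uint thread_flags;
static atomic_int  ipis_sent;

/* Model of current_clr_polling_and_test(): clear POLLING, then re-check
 * NEED_RESCHED so a request that raced with the clear is not lost. */
static bool clr_polling_and_test(void)
{
	atomic_fetch_and(&thread_flags, ~TIF_POLLING_NRFLAG);
	return atomic_load(&thread_flags) & TIF_NEED_RESCHED;
}

static void *remote_wakeup(void *arg)
{
	/* Model of a resched request: only "send the IPI" if the target was
	 * not polling, because a polling CPU will notice the flag itself. */
	unsigned int old = atomic_fetch_or(&thread_flags, TIF_NEED_RESCHED);
	if (!(old & TIF_POLLING_NRFLAG))
		atomic_fetch_add(&ipis_sent, 1);
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* Idle entry: advertise that we are polling on the flag word. */
	atomic_fetch_or(&thread_flags, TIF_POLLING_NRFLAG);

	pthread_create(&t, NULL, remote_wakeup, NULL);

	/* Poll until someone asks for a reschedule (cpu_idle_poll()). */
	while (!(atomic_load(&thread_flags) & TIF_NEED_RESCHED))
		;

	/* Before a non-polling idle state, clear POLLING and re-test,
	 * which is what the patch's helper does. */
	if (clr_polling_and_test())
		printf("need_resched set, skip deep idle and schedule\n");

	pthread_join(t, NULL);
	printf("IPIs sent: %d (0 expected, we were polling)\n",
	       atomic_load(&ipis_sent));
	return 0;
}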
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e5657788fedd..6bf981e13c43 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -68,10 +68,6 @@
68 */ 68 */
69int number_of_cpusets __read_mostly; 69int number_of_cpusets __read_mostly;
70 70
71/* Forward declare cgroup structures */
72struct cgroup_subsys cpuset_subsys;
73struct cpuset;
74
75/* See "Frequency meter" comments, below. */ 71/* See "Frequency meter" comments, below. */
76 72
77struct fmeter { 73struct fmeter {
@@ -115,27 +111,20 @@ struct cpuset {
115 int relax_domain_level; 111 int relax_domain_level;
116}; 112};
117 113
118/* Retrieve the cpuset for a cgroup */ 114static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
119static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
120{ 115{
121 return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), 116 return css ? container_of(css, struct cpuset, css) : NULL;
122 struct cpuset, css);
123} 117}
124 118
125/* Retrieve the cpuset for a task */ 119/* Retrieve the cpuset for a task */
126static inline struct cpuset *task_cs(struct task_struct *task) 120static inline struct cpuset *task_cs(struct task_struct *task)
127{ 121{
128 return container_of(task_subsys_state(task, cpuset_subsys_id), 122 return css_cs(task_css(task, cpuset_subsys_id));
129 struct cpuset, css);
130} 123}
131 124
132static inline struct cpuset *parent_cs(const struct cpuset *cs) 125static inline struct cpuset *parent_cs(struct cpuset *cs)
133{ 126{
134 struct cgroup *pcgrp = cs->css.cgroup->parent; 127 return css_cs(css_parent(&cs->css));
135
136 if (pcgrp)
137 return cgroup_cs(pcgrp);
138 return NULL;
139} 128}
140 129
141#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
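
css_cs() and the one-line parent_cs() above work because struct cpuset embeds its cgroup_subsys_state, so container_of() plus a NULL check is the whole conversion. Below is a stand-alone illustration of that embedding idiom with toy structs and a simplified container_of that omits the kernel's type checking.

#include <stddef.h>
#include <stdio.h>

/* Same idea as the kernel's container_of(): recover the enclosing
 * structure from a pointer to one of its members. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct css {			/* stand-in for cgroup_subsys_state */
	struct css *parent;
};

struct cpuset {
	int id;			/* arbitrary payload */
	struct css css;		/* embedded, as in the kernel */
};

/* NULL-safe conversion, mirroring css_cs() in the hunk above. */
static struct cpuset *css_cs(struct css *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

static struct cpuset *parent_cs(struct cpuset *cs)
{
	return css_cs(cs->css.parent);
}

int main(void)
{
	struct cpuset top = { .id = 1 };
	struct cpuset child = { .id = 2, .css.parent = &top.css };

	printf("child's parent id: %d\n", parent_cs(&child)->id);
	printf("top's parent: %p\n", (void *)parent_cs(&top));
	return 0;
}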
@@ -212,29 +201,30 @@ static struct cpuset top_cpuset = {
212/** 201/**
213 * cpuset_for_each_child - traverse online children of a cpuset 202 * cpuset_for_each_child - traverse online children of a cpuset
214 * @child_cs: loop cursor pointing to the current child 203 * @child_cs: loop cursor pointing to the current child
215 * @pos_cgrp: used for iteration 204 * @pos_css: used for iteration
216 * @parent_cs: target cpuset to walk children of 205 * @parent_cs: target cpuset to walk children of
217 * 206 *
218 * Walk @child_cs through the online children of @parent_cs. Must be used 207 * Walk @child_cs through the online children of @parent_cs. Must be used
219 * with RCU read locked. 208 * with RCU read locked.
220 */ 209 */
221#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ 210#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
222 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ 211 css_for_each_child((pos_css), &(parent_cs)->css) \
223 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) 212 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
224 213
225/** 214/**
226 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants 215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
227 * @des_cs: loop cursor pointing to the current descendant 216 * @des_cs: loop cursor pointing to the current descendant
228 * @pos_cgrp: used for iteration 217 * @pos_css: used for iteration
 229 * @root_cs: target cpuset to walk descendants of 218 * @root_cs: target cpuset to walk descendants of
230 * 219 *
231 * Walk @des_cs through the online descendants of @root_cs. Must be used 220 * Walk @des_cs through the online descendants of @root_cs. Must be used
232 * with RCU read locked. The caller may modify @pos_cgrp by calling 221 * with RCU read locked. The caller may modify @pos_css by calling
233 * cgroup_rightmost_descendant() to skip subtree. 222 * css_rightmost_descendant() to skip subtree. @root_cs is included in the
223 * iteration and the first node to be visited.
234 */ 224 */
235#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ 225#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
236 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ 226 css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
237 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) 227 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
238 228
239/* 229/*
240 * There are two global mutexes guarding cpuset structures - cpuset_mutex 230 * There are two global mutexes guarding cpuset structures - cpuset_mutex
@@ -320,8 +310,7 @@ static struct file_system_type cpuset_fs_type = {
320 * 310 *
321 * Call with callback_mutex held. 311 * Call with callback_mutex held.
322 */ 312 */
323static void guarantee_online_cpus(const struct cpuset *cs, 313static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
324 struct cpumask *pmask)
325{ 314{
326 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 315 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
327 cs = parent_cs(cs); 316 cs = parent_cs(cs);
@@ -339,7 +328,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
339 * 328 *
340 * Call with callback_mutex held. 329 * Call with callback_mutex held.
341 */ 330 */
342static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 331static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
343{ 332{
344 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) 333 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
345 cs = parent_cs(cs); 334 cs = parent_cs(cs);
@@ -384,7 +373,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
384 * alloc_trial_cpuset - allocate a trial cpuset 373 * alloc_trial_cpuset - allocate a trial cpuset
385 * @cs: the cpuset that the trial cpuset duplicates 374 * @cs: the cpuset that the trial cpuset duplicates
386 */ 375 */
387static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) 376static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
388{ 377{
389 struct cpuset *trial; 378 struct cpuset *trial;
390 379
@@ -431,9 +420,9 @@ static void free_trial_cpuset(struct cpuset *trial)
431 * Return 0 if valid, -errno if not. 420 * Return 0 if valid, -errno if not.
432 */ 421 */
433 422
434static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 423static int validate_change(struct cpuset *cur, struct cpuset *trial)
435{ 424{
436 struct cgroup *cgrp; 425 struct cgroup_subsys_state *css;
437 struct cpuset *c, *par; 426 struct cpuset *c, *par;
438 int ret; 427 int ret;
439 428
@@ -441,7 +430,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
441 430
442 /* Each of our child cpusets must be a subset of us */ 431 /* Each of our child cpusets must be a subset of us */
443 ret = -EBUSY; 432 ret = -EBUSY;
444 cpuset_for_each_child(c, cgrp, cur) 433 cpuset_for_each_child(c, css, cur)
445 if (!is_cpuset_subset(c, trial)) 434 if (!is_cpuset_subset(c, trial))
446 goto out; 435 goto out;
447 436
@@ -462,7 +451,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
462 * overlap 451 * overlap
463 */ 452 */
464 ret = -EINVAL; 453 ret = -EINVAL;
465 cpuset_for_each_child(c, cgrp, par) { 454 cpuset_for_each_child(c, css, par) {
466 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 455 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
467 c != cur && 456 c != cur &&
468 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 457 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -475,13 +464,17 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
475 464
476 /* 465 /*
477 * Cpusets with tasks - existing or newly being attached - can't 466 * Cpusets with tasks - existing or newly being attached - can't
478 * have empty cpus_allowed or mems_allowed. 467 * be changed to have empty cpus_allowed or mems_allowed.
479 */ 468 */
480 ret = -ENOSPC; 469 ret = -ENOSPC;
481 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && 470 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) {
482 (cpumask_empty(trial->cpus_allowed) && 471 if (!cpumask_empty(cur->cpus_allowed) &&
483 nodes_empty(trial->mems_allowed))) 472 cpumask_empty(trial->cpus_allowed))
484 goto out; 473 goto out;
474 if (!nodes_empty(cur->mems_allowed) &&
475 nodes_empty(trial->mems_allowed))
476 goto out;
477 }
485 478
486 ret = 0; 479 ret = 0;
487out: 480out:
@@ -511,13 +504,16 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
511 struct cpuset *root_cs) 504 struct cpuset *root_cs)
512{ 505{
513 struct cpuset *cp; 506 struct cpuset *cp;
514 struct cgroup *pos_cgrp; 507 struct cgroup_subsys_state *pos_css;
515 508
516 rcu_read_lock(); 509 rcu_read_lock();
517 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 510 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
511 if (cp == root_cs)
512 continue;
513
518 /* skip the whole subtree if @cp doesn't have any CPU */ 514 /* skip the whole subtree if @cp doesn't have any CPU */
519 if (cpumask_empty(cp->cpus_allowed)) { 515 if (cpumask_empty(cp->cpus_allowed)) {
520 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 516 pos_css = css_rightmost_descendant(pos_css);
521 continue; 517 continue;
522 } 518 }
523 519
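
The explicit if (cp == root_cs) continue; added here, and in the other pre-order walks in this file, exists because css_for_each_descendant_pre() now visits @root_cs itself as the first node, while subtree pruning still works by skipping past the rightmost descendant. The toy iterative pre-order walk below has both properties; an explicit stack replaces the css iterator, and the tree, names and has_cpus flag are invented.

#include <stdbool.h>
#include <stdio.h>

struct node {
	const char *name;
	bool has_cpus;		/* stand-in for !cpumask_empty() */
	struct node *child[4];
};

static void walk_pre(struct node *root, struct node *skip_node)
{
	struct node *stack[32];
	int top = 0;

	stack[top++] = root;
	while (top) {
		struct node *pos = stack[--top];

		if (pos != skip_node) {
			if (!pos->has_cpus) {
				/* css_rightmost_descendant(): prune the subtree */
				printf("%s: empty, pruning subtree\n", pos->name);
				continue;	/* children are not pushed */
			}
			printf("%s: visit\n", pos->name);
		}

		/* push children right-to-left so they pop in order */
		for (int i = 3; i >= 0; i--)
			if (pos->child[i])
				stack[top++] = pos->child[i];
	}
}

int main(void)
{
	struct node a1   = { "root/a/1", true };
	struct node a    = { "root/a", false, { &a1 } };
	struct node b    = { "root/b", true };
	struct node root = { "root", true, { &a, &b } };

	walk_pre(&root, &root);	/* skip the root, like the hunks above */
	return 0;
}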
@@ -592,7 +588,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
592 struct sched_domain_attr *dattr; /* attributes for custom domains */ 588 struct sched_domain_attr *dattr; /* attributes for custom domains */
593 int ndoms = 0; /* number of sched domains in result */ 589 int ndoms = 0; /* number of sched domains in result */
594 int nslot; /* next empty doms[] struct cpumask slot */ 590 int nslot; /* next empty doms[] struct cpumask slot */
595 struct cgroup *pos_cgrp; 591 struct cgroup_subsys_state *pos_css;
596 592
597 doms = NULL; 593 doms = NULL;
598 dattr = NULL; 594 dattr = NULL;
@@ -621,7 +617,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
621 csn = 0; 617 csn = 0;
622 618
623 rcu_read_lock(); 619 rcu_read_lock();
624 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { 620 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
621 if (cp == &top_cpuset)
622 continue;
625 /* 623 /*
626 * Continue traversing beyond @cp iff @cp has some CPUs and 624 * Continue traversing beyond @cp iff @cp has some CPUs and
627 * isn't load balancing. The former is obvious. The 625 * isn't load balancing. The former is obvious. The
@@ -638,7 +636,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
638 csa[csn++] = cp; 636 csa[csn++] = cp;
639 637
640 /* skip @cp's subtree */ 638 /* skip @cp's subtree */
641 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 639 pos_css = css_rightmost_descendant(pos_css);
642 } 640 }
643 rcu_read_unlock(); 641 rcu_read_unlock();
644 642
@@ -833,52 +831,45 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
833/** 831/**
834 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's 832 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
835 * @tsk: task to test 833 * @tsk: task to test
 836 * @scan: struct cgroup_scanner containing the cgroup of the task 834 * @data: cpuset @tsk belongs to
837 * 835 *
838 * Called by cgroup_scan_tasks() for each task in a cgroup whose 836 * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed
839 * cpus_allowed mask needs to be changed. 837 * mask needs to be changed.
840 * 838 *
841 * We don't need to re-check for the cgroup/cpuset membership, since we're 839 * We don't need to re-check for the cgroup/cpuset membership, since we're
842 * holding cpuset_mutex at this point. 840 * holding cpuset_mutex at this point.
843 */ 841 */
844static void cpuset_change_cpumask(struct task_struct *tsk, 842static void cpuset_change_cpumask(struct task_struct *tsk, void *data)
845 struct cgroup_scanner *scan)
846{ 843{
847 struct cpuset *cpus_cs; 844 struct cpuset *cs = data;
845 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
848 846
849 cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
850 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); 847 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
851} 848}
852 849
853/** 850/**
854 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 851 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
855 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 852 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
856 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 853 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
857 * 854 *
858 * Called with cpuset_mutex held 855 * Called with cpuset_mutex held
859 * 856 *
860 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 857 * The css_scan_tasks() function will scan all the tasks in a cgroup,
861 * calling callback functions for each. 858 * calling callback functions for each.
862 * 859 *
863 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 860 * No return value. It's guaranteed that css_scan_tasks() always returns 0
864 * if @heap != NULL. 861 * if @heap != NULL.
865 */ 862 */
866static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) 863static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
867{ 864{
868 struct cgroup_scanner scan; 865 css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap);
869
870 scan.cg = cs->css.cgroup;
871 scan.test_task = NULL;
872 scan.process_task = cpuset_change_cpumask;
873 scan.heap = heap;
874 cgroup_scan_tasks(&scan);
875} 866}
876 867
877/* 868/*
878 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 869 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
879 * @root_cs: the root cpuset of the hierarchy 870 * @root_cs: the root cpuset of the hierarchy
880 * @update_root: update root cpuset or not? 871 * @update_root: update root cpuset or not?
881 * @heap: the heap used by cgroup_scan_tasks() 872 * @heap: the heap used by css_scan_tasks()
882 * 873 *
883 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 874 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
884 * which take on cpumask of @root_cs. 875 * which take on cpumask of @root_cs.
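
update_tasks_cpumask() above collapses to a single css_scan_tasks() call because the per-task callback now receives an opaque void *data instead of fishing the cpuset out of a cgroup_scanner. That is the ordinary C callback-plus-context idiom; a toy version for reference, in which the scanner, task array and names are all invented:

#include <stdio.h>

struct task {
	const char *comm;
	int cpus_allowed;	/* toy "cpumask" */
};

struct cpuset {
	int cpus_allowed;
};

/*
 * Generic scanner in the spirit of css_scan_tasks(css, test, process,
 * data, heap): call @test/@process for each task, handing @data through
 * untouched.
 */
static void scan_tasks(struct task *tasks, int nr,
		       int (*test)(struct task *, void *),
		       void (*process)(struct task *, void *),
		       void *data)
{
	for (int i = 0; i < nr; i++) {
		if (test && !test(&tasks[i], data))
			continue;
		process(&tasks[i], data);
	}
}

/* Per-task callback: recover the typed context from the void pointer. */
static void change_cpumask(struct task *tsk, void *data)
{
	struct cpuset *cs = data;

	tsk->cpus_allowed = cs->cpus_allowed;
	printf("%s -> cpus %#x\n", tsk->comm, tsk->cpus_allowed);
}

int main(void)
{
	struct task tasks[] = { { "init", 0xf }, { "kworker", 0xf } };
	struct cpuset cs = { .cpus_allowed = 0x3 };

	/* analogous to css_scan_tasks(&cs->css, NULL, change_cpumask, cs, heap) */
	scan_tasks(tasks, 2, NULL, change_cpumask, &cs);
	return 0;
}

When several values must travel through the same void pointer, the cpuset_change_nodemask() hunk further down shows the natural extension: bundle them in a small on-stack struct and pass its address as @data.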
@@ -889,17 +880,19 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs,
889 bool update_root, struct ptr_heap *heap) 880 bool update_root, struct ptr_heap *heap)
890{ 881{
891 struct cpuset *cp; 882 struct cpuset *cp;
892 struct cgroup *pos_cgrp; 883 struct cgroup_subsys_state *pos_css;
893
894 if (update_root)
895 update_tasks_cpumask(root_cs, heap);
896 884
897 rcu_read_lock(); 885 rcu_read_lock();
898 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 886 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
 899 /* skip the whole subtree if @cp has some CPU */ 887 if (cp == root_cs) {
900 if (!cpumask_empty(cp->cpus_allowed)) { 888 if (!update_root)
901 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 889 continue;
902 continue; 890 } else {
 891 /* skip the whole subtree if @cp has some CPU */
892 if (!cpumask_empty(cp->cpus_allowed)) {
893 pos_css = css_rightmost_descendant(pos_css);
894 continue;
895 }
903 } 896 }
904 if (!css_tryget(&cp->css)) 897 if (!css_tryget(&cp->css))
905 continue; 898 continue;
@@ -1055,20 +1048,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1055 task_unlock(tsk); 1048 task_unlock(tsk);
1056} 1049}
1057 1050
1051struct cpuset_change_nodemask_arg {
1052 struct cpuset *cs;
1053 nodemask_t *newmems;
1054};
1055
1058/* 1056/*
1059 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1057 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1060 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1058 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1061 * memory_migrate flag is set. Called with cpuset_mutex held. 1059 * memory_migrate flag is set. Called with cpuset_mutex held.
1062 */ 1060 */
1063static void cpuset_change_nodemask(struct task_struct *p, 1061static void cpuset_change_nodemask(struct task_struct *p, void *data)
1064 struct cgroup_scanner *scan)
1065{ 1062{
1066 struct cpuset *cs = cgroup_cs(scan->cg); 1063 struct cpuset_change_nodemask_arg *arg = data;
1064 struct cpuset *cs = arg->cs;
1067 struct mm_struct *mm; 1065 struct mm_struct *mm;
1068 int migrate; 1066 int migrate;
1069 nodemask_t *newmems = scan->data;
1070 1067
1071 cpuset_change_task_nodemask(p, newmems); 1068 cpuset_change_task_nodemask(p, arg->newmems);
1072 1069
1073 mm = get_task_mm(p); 1070 mm = get_task_mm(p);
1074 if (!mm) 1071 if (!mm)
@@ -1078,7 +1075,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1078 1075
1079 mpol_rebind_mm(mm, &cs->mems_allowed); 1076 mpol_rebind_mm(mm, &cs->mems_allowed);
1080 if (migrate) 1077 if (migrate)
1081 cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); 1078 cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems);
1082 mmput(mm); 1079 mmput(mm);
1083} 1080}
1084 1081
@@ -1087,28 +1084,22 @@ static void *cpuset_being_rebound;
1087/** 1084/**
1088 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1085 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1089 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1086 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1090 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1087 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1091 * 1088 *
1092 * Called with cpuset_mutex held 1089 * Called with cpuset_mutex held. No return value. It's guaranteed that
1093 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1090 * css_scan_tasks() always returns 0 if @heap != NULL.
1094 * if @heap != NULL.
1095 */ 1091 */
1096static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) 1092static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1097{ 1093{
1098 static nodemask_t newmems; /* protected by cpuset_mutex */ 1094 static nodemask_t newmems; /* protected by cpuset_mutex */
1099 struct cgroup_scanner scan;
1100 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1095 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1096 struct cpuset_change_nodemask_arg arg = { .cs = cs,
1097 .newmems = &newmems };
1101 1098
1102 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1099 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1103 1100
1104 guarantee_online_mems(mems_cs, &newmems); 1101 guarantee_online_mems(mems_cs, &newmems);
1105 1102
1106 scan.cg = cs->css.cgroup;
1107 scan.test_task = NULL;
1108 scan.process_task = cpuset_change_nodemask;
1109 scan.heap = heap;
1110 scan.data = &newmems;
1111
1112 /* 1103 /*
1113 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1104 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1114 * take while holding tasklist_lock. Forks can happen - the 1105 * take while holding tasklist_lock. Forks can happen - the
@@ -1119,7 +1110,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1119 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1110 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1120 * is idempotent. Also migrate pages in each mm to new nodes. 1111 * is idempotent. Also migrate pages in each mm to new nodes.
1121 */ 1112 */
1122 cgroup_scan_tasks(&scan); 1113 css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap);
1123 1114
1124 /* 1115 /*
1125 * All the tasks' nodemasks have been updated, update 1116 * All the tasks' nodemasks have been updated, update
@@ -1135,7 +1126,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1135 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1126 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1136 * @cs: the root cpuset of the hierarchy 1127 * @cs: the root cpuset of the hierarchy
1137 * @update_root: update the root cpuset or not? 1128 * @update_root: update the root cpuset or not?
1138 * @heap: the heap used by cgroup_scan_tasks() 1129 * @heap: the heap used by css_scan_tasks()
1139 * 1130 *
1140 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1131 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1141 * which take on nodemask of @root_cs. 1132 * which take on nodemask of @root_cs.
@@ -1146,17 +1137,19 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1146 bool update_root, struct ptr_heap *heap) 1137 bool update_root, struct ptr_heap *heap)
1147{ 1138{
1148 struct cpuset *cp; 1139 struct cpuset *cp;
1149 struct cgroup *pos_cgrp; 1140 struct cgroup_subsys_state *pos_css;
1150
1151 if (update_root)
1152 update_tasks_nodemask(root_cs, heap);
1153 1141
1154 rcu_read_lock(); 1142 rcu_read_lock();
1155 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { 1143 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
 1156 /* skip the whole subtree if @cp has some CPU */ 1144 if (cp == root_cs) {
1157 if (!nodes_empty(cp->mems_allowed)) { 1145 if (!update_root)
1158 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); 1146 continue;
1159 continue; 1147 } else {
 1148 /* skip the whole subtree if @cp has some mems */
1149 if (!nodes_empty(cp->mems_allowed)) {
1150 pos_css = css_rightmost_descendant(pos_css);
1151 continue;
1152 }
1160 } 1153 }
1161 if (!css_tryget(&cp->css)) 1154 if (!css_tryget(&cp->css))
1162 continue; 1155 continue;
@@ -1263,44 +1256,39 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1263 return 0; 1256 return 0;
1264} 1257}
1265 1258
1266/* 1259/**
1267 * cpuset_change_flag - make a task's spread flags the same as its cpuset's 1260 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1268 * @tsk: task to be updated 1261 * @tsk: task to be updated
 1269 * @scan: struct cgroup_scanner containing the cgroup of the task 1262 * @data: cpuset @tsk belongs to
1270 * 1263 *
1271 * Called by cgroup_scan_tasks() for each task in a cgroup. 1264 * Called by css_scan_tasks() for each task in a cgroup.
1272 * 1265 *
1273 * We don't need to re-check for the cgroup/cpuset membership, since we're 1266 * We don't need to re-check for the cgroup/cpuset membership, since we're
1274 * holding cpuset_mutex at this point. 1267 * holding cpuset_mutex at this point.
1275 */ 1268 */
1276static void cpuset_change_flag(struct task_struct *tsk, 1269static void cpuset_change_flag(struct task_struct *tsk, void *data)
1277 struct cgroup_scanner *scan)
1278{ 1270{
1279 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); 1271 struct cpuset *cs = data;
1272
1273 cpuset_update_task_spread_flag(cs, tsk);
1280} 1274}
1281 1275
1282/* 1276/**
1283 * update_tasks_flags - update the spread flags of tasks in the cpuset. 1277 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1284 * @cs: the cpuset in which each task's spread flags needs to be changed 1278 * @cs: the cpuset in which each task's spread flags needs to be changed
1285 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1279 * @heap: if NULL, defer allocating heap memory to css_scan_tasks()
1286 * 1280 *
1287 * Called with cpuset_mutex held 1281 * Called with cpuset_mutex held
1288 * 1282 *
1289 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1283 * The css_scan_tasks() function will scan all the tasks in a cgroup,
1290 * calling callback functions for each. 1284 * calling callback functions for each.
1291 * 1285 *
1292 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1286 * No return value. It's guaranteed that css_scan_tasks() always returns 0
1293 * if @heap != NULL. 1287 * if @heap != NULL.
1294 */ 1288 */
1295static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) 1289static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1296{ 1290{
1297 struct cgroup_scanner scan; 1291 css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap);
1298
1299 scan.cg = cs->css.cgroup;
1300 scan.test_task = NULL;
1301 scan.process_task = cpuset_change_flag;
1302 scan.heap = heap;
1303 cgroup_scan_tasks(&scan);
1304} 1292}
1305 1293
1306/* 1294/*
@@ -1458,9 +1446,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1458} 1446}
1459 1447
1460/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 1448/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1461static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1449static int cpuset_can_attach(struct cgroup_subsys_state *css,
1450 struct cgroup_taskset *tset)
1462{ 1451{
1463 struct cpuset *cs = cgroup_cs(cgrp); 1452 struct cpuset *cs = css_cs(css);
1464 struct task_struct *task; 1453 struct task_struct *task;
1465 int ret; 1454 int ret;
1466 1455
@@ -1471,11 +1460,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1471 * flag is set. 1460 * flag is set.
1472 */ 1461 */
1473 ret = -ENOSPC; 1462 ret = -ENOSPC;
1474 if (!cgroup_sane_behavior(cgrp) && 1463 if (!cgroup_sane_behavior(css->cgroup) &&
1475 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1464 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1476 goto out_unlock; 1465 goto out_unlock;
1477 1466
1478 cgroup_taskset_for_each(task, cgrp, tset) { 1467 cgroup_taskset_for_each(task, css, tset) {
1479 /* 1468 /*
1480 * Kthreads which disallow setaffinity shouldn't be moved 1469 * Kthreads which disallow setaffinity shouldn't be moved
1481 * to a new cpuset; we don't want to change their cpu 1470 * to a new cpuset; we don't want to change their cpu
@@ -1504,11 +1493,11 @@ out_unlock:
1504 return ret; 1493 return ret;
1505} 1494}
1506 1495
1507static void cpuset_cancel_attach(struct cgroup *cgrp, 1496static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
1508 struct cgroup_taskset *tset) 1497 struct cgroup_taskset *tset)
1509{ 1498{
1510 mutex_lock(&cpuset_mutex); 1499 mutex_lock(&cpuset_mutex);
1511 cgroup_cs(cgrp)->attach_in_progress--; 1500 css_cs(css)->attach_in_progress--;
1512 mutex_unlock(&cpuset_mutex); 1501 mutex_unlock(&cpuset_mutex);
1513} 1502}
1514 1503
@@ -1519,16 +1508,18 @@ static void cpuset_cancel_attach(struct cgroup *cgrp,
1519 */ 1508 */
1520static cpumask_var_t cpus_attach; 1509static cpumask_var_t cpus_attach;
1521 1510
1522static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1511static void cpuset_attach(struct cgroup_subsys_state *css,
1512 struct cgroup_taskset *tset)
1523{ 1513{
1524 /* static buf protected by cpuset_mutex */ 1514 /* static buf protected by cpuset_mutex */
1525 static nodemask_t cpuset_attach_nodemask_to; 1515 static nodemask_t cpuset_attach_nodemask_to;
1526 struct mm_struct *mm; 1516 struct mm_struct *mm;
1527 struct task_struct *task; 1517 struct task_struct *task;
1528 struct task_struct *leader = cgroup_taskset_first(tset); 1518 struct task_struct *leader = cgroup_taskset_first(tset);
1529 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1519 struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset,
1530 struct cpuset *cs = cgroup_cs(cgrp); 1520 cpuset_subsys_id);
1531 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1521 struct cpuset *cs = css_cs(css);
1522 struct cpuset *oldcs = css_cs(oldcss);
1532 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1523 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1533 struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1524 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1534 1525
@@ -1542,7 +1533,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1542 1533
1543 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1534 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1544 1535
1545 cgroup_taskset_for_each(task, cgrp, tset) { 1536 cgroup_taskset_for_each(task, css, tset) {
1546 /* 1537 /*
1547 * can_attach beforehand should guarantee that this doesn't 1538 * can_attach beforehand should guarantee that this doesn't
1548 * fail. TODO: have a better way to handle failure here 1539 * fail. TODO: have a better way to handle failure here
@@ -1604,15 +1595,18 @@ typedef enum {
1604 FILE_SPREAD_SLAB, 1595 FILE_SPREAD_SLAB,
1605} cpuset_filetype_t; 1596} cpuset_filetype_t;
1606 1597
1607static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1598static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1599 u64 val)
1608{ 1600{
1609 struct cpuset *cs = cgroup_cs(cgrp); 1601 struct cpuset *cs = css_cs(css);
1610 cpuset_filetype_t type = cft->private; 1602 cpuset_filetype_t type = cft->private;
1611 int retval = -ENODEV; 1603 int retval = 0;
1612 1604
1613 mutex_lock(&cpuset_mutex); 1605 mutex_lock(&cpuset_mutex);
1614 if (!is_cpuset_online(cs)) 1606 if (!is_cpuset_online(cs)) {
1607 retval = -ENODEV;
1615 goto out_unlock; 1608 goto out_unlock;
1609 }
1616 1610
1617 switch (type) { 1611 switch (type) {
1618 case FILE_CPU_EXCLUSIVE: 1612 case FILE_CPU_EXCLUSIVE:
@@ -1651,9 +1645,10 @@ out_unlock:
1651 return retval; 1645 return retval;
1652} 1646}
1653 1647
1654static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1648static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
1649 s64 val)
1655{ 1650{
1656 struct cpuset *cs = cgroup_cs(cgrp); 1651 struct cpuset *cs = css_cs(css);
1657 cpuset_filetype_t type = cft->private; 1652 cpuset_filetype_t type = cft->private;
1658 int retval = -ENODEV; 1653 int retval = -ENODEV;
1659 1654
@@ -1677,10 +1672,10 @@ out_unlock:
1677/* 1672/*
1678 * Common handling for a write to a "cpus" or "mems" file. 1673 * Common handling for a write to a "cpus" or "mems" file.
1679 */ 1674 */
1680static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1675static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1681 const char *buf) 1676 struct cftype *cft, const char *buf)
1682{ 1677{
1683 struct cpuset *cs = cgroup_cs(cgrp); 1678 struct cpuset *cs = css_cs(css);
1684 struct cpuset *trialcs; 1679 struct cpuset *trialcs;
1685 int retval = -ENODEV; 1680 int retval = -ENODEV;
1686 1681
@@ -1759,13 +1754,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1759 return count; 1754 return count;
1760} 1755}
1761 1756
1762static ssize_t cpuset_common_file_read(struct cgroup *cgrp, 1757static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css,
1763 struct cftype *cft, 1758 struct cftype *cft, struct file *file,
1764 struct file *file, 1759 char __user *buf, size_t nbytes,
1765 char __user *buf, 1760 loff_t *ppos)
1766 size_t nbytes, loff_t *ppos)
1767{ 1761{
1768 struct cpuset *cs = cgroup_cs(cgrp); 1762 struct cpuset *cs = css_cs(css);
1769 cpuset_filetype_t type = cft->private; 1763 cpuset_filetype_t type = cft->private;
1770 char *page; 1764 char *page;
1771 ssize_t retval = 0; 1765 ssize_t retval = 0;
@@ -1795,9 +1789,9 @@ out:
1795 return retval; 1789 return retval;
1796} 1790}
1797 1791
1798static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) 1792static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
1799{ 1793{
1800 struct cpuset *cs = cgroup_cs(cgrp); 1794 struct cpuset *cs = css_cs(css);
1801 cpuset_filetype_t type = cft->private; 1795 cpuset_filetype_t type = cft->private;
1802 switch (type) { 1796 switch (type) {
1803 case FILE_CPU_EXCLUSIVE: 1797 case FILE_CPU_EXCLUSIVE:
@@ -1826,9 +1820,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
1826 return 0; 1820 return 0;
1827} 1821}
1828 1822
1829static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) 1823static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
1830{ 1824{
1831 struct cpuset *cs = cgroup_cs(cgrp); 1825 struct cpuset *cs = css_cs(css);
1832 cpuset_filetype_t type = cft->private; 1826 cpuset_filetype_t type = cft->private;
1833 switch (type) { 1827 switch (type) {
1834 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1828 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1943,11 +1937,12 @@ static struct cftype files[] = {
1943 * cgrp: control group that the new cpuset will be part of 1937 * cgrp: control group that the new cpuset will be part of
1944 */ 1938 */
1945 1939
1946static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) 1940static struct cgroup_subsys_state *
1941cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1947{ 1942{
1948 struct cpuset *cs; 1943 struct cpuset *cs;
1949 1944
1950 if (!cgrp->parent) 1945 if (!parent_css)
1951 return &top_cpuset.css; 1946 return &top_cpuset.css;
1952 1947
1953 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1948 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1967,12 +1962,12 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
1967 return &cs->css; 1962 return &cs->css;
1968} 1963}
1969 1964
1970static int cpuset_css_online(struct cgroup *cgrp) 1965static int cpuset_css_online(struct cgroup_subsys_state *css)
1971{ 1966{
1972 struct cpuset *cs = cgroup_cs(cgrp); 1967 struct cpuset *cs = css_cs(css);
1973 struct cpuset *parent = parent_cs(cs); 1968 struct cpuset *parent = parent_cs(cs);
1974 struct cpuset *tmp_cs; 1969 struct cpuset *tmp_cs;
1975 struct cgroup *pos_cg; 1970 struct cgroup_subsys_state *pos_css;
1976 1971
1977 if (!parent) 1972 if (!parent)
1978 return 0; 1973 return 0;
@@ -1987,7 +1982,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
1987 1982
1988 number_of_cpusets++; 1983 number_of_cpusets++;
1989 1984
1990 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) 1985 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1991 goto out_unlock; 1986 goto out_unlock;
1992 1987
1993 /* 1988 /*
@@ -2004,7 +1999,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
2004 * (and likewise for mems) to the new cgroup. 1999 * (and likewise for mems) to the new cgroup.
2005 */ 2000 */
2006 rcu_read_lock(); 2001 rcu_read_lock();
2007 cpuset_for_each_child(tmp_cs, pos_cg, parent) { 2002 cpuset_for_each_child(tmp_cs, pos_css, parent) {
2008 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 2003 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2009 rcu_read_unlock(); 2004 rcu_read_unlock();
2010 goto out_unlock; 2005 goto out_unlock;
@@ -2021,9 +2016,15 @@ out_unlock:
2021 return 0; 2016 return 0;
2022} 2017}
2023 2018
2024static void cpuset_css_offline(struct cgroup *cgrp) 2019/*
2020 * If the cpuset being removed has its flag 'sched_load_balance'
2021 * enabled, then simulate turning sched_load_balance off, which
2022 * will call rebuild_sched_domains_locked().
2023 */
2024
2025static void cpuset_css_offline(struct cgroup_subsys_state *css)
2025{ 2026{
2026 struct cpuset *cs = cgroup_cs(cgrp); 2027 struct cpuset *cs = css_cs(css);
2027 2028
2028 mutex_lock(&cpuset_mutex); 2029 mutex_lock(&cpuset_mutex);
2029 2030
@@ -2036,15 +2037,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
2036 mutex_unlock(&cpuset_mutex); 2037 mutex_unlock(&cpuset_mutex);
2037} 2038}
2038 2039
2039/* 2040static void cpuset_css_free(struct cgroup_subsys_state *css)
2040 * If the cpuset being removed has its flag 'sched_load_balance'
2041 * enabled, then simulate turning sched_load_balance off, which
2042 * will call rebuild_sched_domains_locked().
2043 */
2044
2045static void cpuset_css_free(struct cgroup *cgrp)
2046{ 2041{
2047 struct cpuset *cs = cgroup_cs(cgrp); 2042 struct cpuset *cs = css_cs(css);
2048 2043
2049 free_cpumask_var(cs->cpus_allowed); 2044 free_cpumask_var(cs->cpus_allowed);
2050 kfree(cs); 2045 kfree(cs);
@@ -2251,11 +2246,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2251 /* if cpus or mems changed, we need to propagate to descendants */ 2246 /* if cpus or mems changed, we need to propagate to descendants */
2252 if (cpus_updated || mems_updated) { 2247 if (cpus_updated || mems_updated) {
2253 struct cpuset *cs; 2248 struct cpuset *cs;
2254 struct cgroup *pos_cgrp; 2249 struct cgroup_subsys_state *pos_css;
2255 2250
2256 rcu_read_lock(); 2251 rcu_read_lock();
2257 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { 2252 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2258 if (!css_tryget(&cs->css)) 2253 if (cs == &top_cpuset || !css_tryget(&cs->css))
2259 continue; 2254 continue;
2260 rcu_read_unlock(); 2255 rcu_read_unlock();
2261 2256
@@ -2344,7 +2339,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2344 2339
2345void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2340void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2346{ 2341{
2347 const struct cpuset *cpus_cs; 2342 struct cpuset *cpus_cs;
2348 2343
2349 rcu_read_lock(); 2344 rcu_read_lock();
2350 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2345 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
@@ -2417,7 +2412,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2417 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall 2412 * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
2418 * (an unusual configuration), then returns the root cpuset. 2413 * (an unusual configuration), then returns the root cpuset.
2419 */ 2414 */
2420static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2415static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
2421{ 2416{
2422 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) 2417 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2423 cs = parent_cs(cs); 2418 cs = parent_cs(cs);
@@ -2487,7 +2482,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2487 */ 2482 */
2488int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) 2483int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2489{ 2484{
2490 const struct cpuset *cs; /* current cpuset ancestors */ 2485 struct cpuset *cs; /* current cpuset ancestors */
2491 int allowed; /* is allocation in zone z allowed? */ 2486 int allowed; /* is allocation in zone z allowed? */
2492 2487
2493 if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) 2488 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
@@ -2725,7 +2720,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2725 goto out_free; 2720 goto out_free;
2726 2721
2727 rcu_read_lock(); 2722 rcu_read_lock();
2728 css = task_subsys_state(tsk, cpuset_subsys_id); 2723 css = task_css(tsk, cpuset_subsys_id);
2729 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2724 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2730 rcu_read_unlock(); 2725 rcu_read_unlock();
2731 if (retval < 0) 2726 if (retval < 0)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0506d447aed2..7d2f35e5df2f 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -575,8 +575,12 @@ return_normal:
575 raw_spin_lock(&dbg_slave_lock); 575 raw_spin_lock(&dbg_slave_lock);
576 576
577#ifdef CONFIG_SMP 577#ifdef CONFIG_SMP
578 /* If send_ready set, slaves are already waiting */
579 if (ks->send_ready)
580 atomic_set(ks->send_ready, 1);
581
578 /* Signal the other CPUs to enter kgdb_wait() */ 582 /* Signal the other CPUs to enter kgdb_wait() */
579 if ((!kgdb_single_step) && kgdb_do_roundup) 583 else if ((!kgdb_single_step) && kgdb_do_roundup)
580 kgdb_roundup_cpus(flags); 584 kgdb_roundup_cpus(flags);
581#endif 585#endif
582 586
@@ -678,11 +682,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
678 if (arch_kgdb_ops.enable_nmi) 682 if (arch_kgdb_ops.enable_nmi)
679 arch_kgdb_ops.enable_nmi(0); 683 arch_kgdb_ops.enable_nmi(0);
680 684
685 memset(ks, 0, sizeof(struct kgdb_state));
681 ks->cpu = raw_smp_processor_id(); 686 ks->cpu = raw_smp_processor_id();
682 ks->ex_vector = evector; 687 ks->ex_vector = evector;
683 ks->signo = signo; 688 ks->signo = signo;
684 ks->err_code = ecode; 689 ks->err_code = ecode;
685 ks->kgdb_usethreadid = 0;
686 ks->linux_regs = regs; 690 ks->linux_regs = regs;
687 691
688 if (kgdb_reenter_check(ks)) 692 if (kgdb_reenter_check(ks))
@@ -732,6 +736,30 @@ int kgdb_nmicallback(int cpu, void *regs)
732 return 1; 736 return 1;
733} 737}
734 738
739int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready)
740{
741#ifdef CONFIG_SMP
742 if (!kgdb_io_ready(0) || !send_ready)
743 return 1;
744
745 if (kgdb_info[cpu].enter_kgdb == 0) {
746 struct kgdb_state kgdb_var;
747 struct kgdb_state *ks = &kgdb_var;
748
749 memset(ks, 0, sizeof(struct kgdb_state));
750 ks->cpu = cpu;
751 ks->ex_vector = trapnr;
752 ks->signo = SIGTRAP;
753 ks->err_code = KGDB_KDB_REASON_SYSTEM_NMI;
754 ks->linux_regs = regs;
755 ks->send_ready = send_ready;
756 kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
757 return 0;
758 }
759#endif
760 return 1;
761}
762
735static void kgdb_console_write(struct console *co, const char *s, 763static void kgdb_console_write(struct console *co, const char *s,
736 unsigned count) 764 unsigned count)
737{ 765{
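
kgdb_nmicallin() above lets an architecture's system-NMI handler pull the current CPU into the debugger as master while the other CPUs, which took the same NMI, wait on a caller-supplied atomic instead of being rounded up by IPI. Below is a rough usage sketch; the calling convention shown is an assumption rather than a copy of an in-tree caller, and the handler name, master selection and trap number are hypothetical. Only kgdb_nmicallin() and kgdb_nmicallback() are real interfaces here.

        #include <linux/kgdb.h>
        #include <linux/atomic.h>

        static atomic_t kgdb_nmi_go = ATOMIC_INIT(0);

        static void example_system_nmi(int cpu, int is_master, struct pt_regs *regs)
        {
                if (is_master) {
                        /* Enter as master; with ->send_ready set, the debug
                         * core flips the atomic instead of doing its usual
                         * IPI round-up of the other CPUs. */
                        kgdb_nmicallin(cpu, 2 /* arch-specific NMI trap nr */,
                                       regs, &kgdb_nmi_go);
                } else {
                        /* The other CPUs also took the NMI: wait for the
                         * master's signal, then fold in as ordinary slaves. */
                        while (!atomic_read(&kgdb_nmi_go))
                                cpu_relax();
                        kgdb_nmicallback(cpu, regs);
                }
        }
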
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 2235967e78b0..572aa4f5677c 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -26,6 +26,7 @@ struct kgdb_state {
26 unsigned long threadid; 26 unsigned long threadid;
27 long kgdb_usethreadid; 27 long kgdb_usethreadid;
28 struct pt_regs *linux_regs; 28 struct pt_regs *linux_regs;
29 atomic_t *send_ready;
29}; 30};
30 31
31/* Exception state values */ 32/* Exception state values */
@@ -74,11 +75,13 @@ extern int kdb_stub(struct kgdb_state *ks);
74extern int kdb_parse(const char *cmdstr); 75extern int kdb_parse(const char *cmdstr);
75extern int kdb_common_init_state(struct kgdb_state *ks); 76extern int kdb_common_init_state(struct kgdb_state *ks);
76extern int kdb_common_deinit_state(void); 77extern int kdb_common_deinit_state(void);
78#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI
77#else /* ! CONFIG_KGDB_KDB */ 79#else /* ! CONFIG_KGDB_KDB */
78static inline int kdb_stub(struct kgdb_state *ks) 80static inline int kdb_stub(struct kgdb_state *ks)
79{ 81{
80 return DBG_PASS_EVENT; 82 return DBG_PASS_EVENT;
81} 83}
84#define KGDB_KDB_REASON_SYSTEM_NMI 0
82#endif /* CONFIG_KGDB_KDB */ 85#endif /* CONFIG_KGDB_KDB */
83 86
84#endif /* _DEBUG_CORE_H_ */ 87#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 328d18ef31e4..8859ca34dcfe 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -69,7 +69,10 @@ int kdb_stub(struct kgdb_state *ks)
69 if (atomic_read(&kgdb_setting_breakpoint)) 69 if (atomic_read(&kgdb_setting_breakpoint))
70 reason = KDB_REASON_KEYBOARD; 70 reason = KDB_REASON_KEYBOARD;
71 71
72 if (in_nmi()) 72 if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP)
73 reason = KDB_REASON_SYSTEM_NMI;
74
75 else if (in_nmi())
73 reason = KDB_REASON_NMI; 76 reason = KDB_REASON_NMI;
74 77
75 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { 78 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 00eb8f7fbf41..0b097c8a1e50 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1200,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1200 instruction_pointer(regs)); 1200 instruction_pointer(regs));
1201 kdb_dumpregs(regs); 1201 kdb_dumpregs(regs);
1202 break; 1202 break;
1203 case KDB_REASON_SYSTEM_NMI:
1204 kdb_printf("due to System NonMaskable Interrupt\n");
1205 break;
1203 case KDB_REASON_NMI: 1206 case KDB_REASON_NMI:
1204 kdb_printf("due to NonMaskable Interrupt @ " 1207 kdb_printf("due to NonMaskable Interrupt @ "
1205 kdb_machreg_fmt "\n", 1208 kdb_machreg_fmt "\n",
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index c77206184b8b..97b67df8fbfe 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -116,6 +116,9 @@ int get_callchain_buffers(void)
116 116
117 err = alloc_callchain_buffers(); 117 err = alloc_callchain_buffers();
118exit: 118exit:
119 if (err)
120 atomic_dec(&nr_callchain_events);
121
119 mutex_unlock(&callchain_mutex); 122 mutex_unlock(&callchain_mutex);
120 123
121 return err; 124 return err;
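
The callchain fix above closes a small accounting leak: get_callchain_buffers() bumps nr_callchain_events before it knows whether allocation will succeed, so the error path has to drop that reference again, otherwise the count stays inflated and the buffers are never torn down. The same shape in miniature, with a hypothetical allocation helper:

        static atomic_t nr_users = ATOMIC_INIT(0);

        static int allocate_buffers(void);      /* hypothetical helper */

        static int example_get_buffers(void)
        {
                int err = 0;

                if (atomic_inc_return(&nr_users) == 1)
                        err = allocate_buffers();

                if (err)
                        atomic_dec(&nr_users);  /* roll the count back */

                return err;
        }
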
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f86599e8c123..8c875ef6e120 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -145,6 +145,7 @@ static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
145static atomic_t nr_mmap_events __read_mostly; 145static atomic_t nr_mmap_events __read_mostly;
146static atomic_t nr_comm_events __read_mostly; 146static atomic_t nr_comm_events __read_mostly;
147static atomic_t nr_task_events __read_mostly; 147static atomic_t nr_task_events __read_mostly;
148static atomic_t nr_freq_events __read_mostly;
148 149
149static LIST_HEAD(pmus); 150static LIST_HEAD(pmus);
150static DEFINE_MUTEX(pmus_lock); 151static DEFINE_MUTEX(pmus_lock);
@@ -174,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
174static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 175static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
175static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; 176static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
176 177
177static atomic_t perf_sample_allowed_ns __read_mostly = 178static int perf_sample_allowed_ns __read_mostly =
178 ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); 179 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
179 180
180void update_perf_cpu_limits(void) 181void update_perf_cpu_limits(void)
181{ 182{
@@ -183,7 +184,7 @@ void update_perf_cpu_limits(void)
183 184
184 tmp *= sysctl_perf_cpu_time_max_percent; 185 tmp *= sysctl_perf_cpu_time_max_percent;
185 do_div(tmp, 100); 186 do_div(tmp, 100);
186 atomic_set(&perf_sample_allowed_ns, tmp); 187 ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
187} 188}
188 189
189static int perf_rotate_context(struct perf_cpu_context *cpuctx); 190static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -192,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
192 void __user *buffer, size_t *lenp, 193 void __user *buffer, size_t *lenp,
193 loff_t *ppos) 194 loff_t *ppos)
194{ 195{
195 int ret = proc_dointvec(table, write, buffer, lenp, ppos); 196 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
196 197
197 if (ret || !write) 198 if (ret || !write)
198 return ret; 199 return ret;
@@ -227,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
227 * we detect that events are taking too long. 228 * we detect that events are taking too long.
228 */ 229 */
229#define NR_ACCUMULATED_SAMPLES 128 230#define NR_ACCUMULATED_SAMPLES 128
230DEFINE_PER_CPU(u64, running_sample_length); 231static DEFINE_PER_CPU(u64, running_sample_length);
231 232
232void perf_sample_event_took(u64 sample_len_ns) 233void perf_sample_event_took(u64 sample_len_ns)
233{ 234{
234 u64 avg_local_sample_len; 235 u64 avg_local_sample_len;
235 u64 local_samples_len; 236 u64 local_samples_len;
237 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
236 238
237 if (atomic_read(&perf_sample_allowed_ns) == 0) 239 if (allowed_ns == 0)
238 return; 240 return;
239 241
240 /* decay the counter by 1 average sample */ 242 /* decay the counter by 1 average sample */
@@ -250,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
250 */ 252 */
251 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; 253 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
252 254
253 if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) 255 if (avg_local_sample_len <= allowed_ns)
254 return; 256 return;
255 257
256 if (max_samples_per_tick <= 1) 258 if (max_samples_per_tick <= 1)
@@ -261,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
261 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 263 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
262 264
263 printk_ratelimited(KERN_WARNING 265 printk_ratelimited(KERN_WARNING
264 "perf samples too long (%lld > %d), lowering " 266 "perf samples too long (%lld > %lld), lowering "
265 "kernel.perf_event_max_sample_rate to %d\n", 267 "kernel.perf_event_max_sample_rate to %d\n",
266 avg_local_sample_len, 268 avg_local_sample_len, allowed_ns,
267 atomic_read(&perf_sample_allowed_ns),
268 sysctl_perf_event_sample_rate); 269 sysctl_perf_event_sample_rate);
269 270
270 update_perf_cpu_limits(); 271 update_perf_cpu_limits();
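
The hunks above stop using an atomic_t for perf_sample_allowed_ns: the value is only written from the sysctl handlers and read locklessly on the sampling path, so a plain int plus one marked read is enough (newer kernels spell the marked access READ_ONCE()/WRITE_ONCE()). The pattern, reduced to its essentials:

        static int sample_allowed_ns __read_mostly;

        static void hot_path(u64 measured_ns)
        {
                /* take one stable snapshot, then work with the local copy */
                u64 allowed = ACCESS_ONCE(sample_allowed_ns);

                if (allowed == 0)
                        return;
                if (measured_ns > allowed)
                        ;       /* warn and throttle, as perf_sample_event_took() does */
        }
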
@@ -340,8 +341,8 @@ struct perf_cgroup {
340static inline struct perf_cgroup * 341static inline struct perf_cgroup *
341perf_cgroup_from_task(struct task_struct *task) 342perf_cgroup_from_task(struct task_struct *task)
342{ 343{
343 return container_of(task_subsys_state(task, perf_subsys_id), 344 return container_of(task_css(task, perf_subsys_id),
344 struct perf_cgroup, css); 345 struct perf_cgroup, css);
345} 346}
346 347
347static inline bool 348static inline bool
@@ -591,7 +592,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
591 if (!f.file) 592 if (!f.file)
592 return -EBADF; 593 return -EBADF;
593 594
594 css = cgroup_css_from_dir(f.file, perf_subsys_id); 595 rcu_read_lock();
596
597 css = css_from_dir(f.file->f_dentry, &perf_subsys);
595 if (IS_ERR(css)) { 598 if (IS_ERR(css)) {
596 ret = PTR_ERR(css); 599 ret = PTR_ERR(css);
597 goto out; 600 goto out;
@@ -617,6 +620,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
617 ret = -EINVAL; 620 ret = -EINVAL;
618 } 621 }
619out: 622out:
623 rcu_read_unlock();
620 fdput(f); 624 fdput(f);
621 return ret; 625 return ret;
622} 626}
@@ -869,12 +873,8 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
869 873
870 WARN_ON(!irqs_disabled()); 874 WARN_ON(!irqs_disabled());
871 875
872 if (list_empty(&cpuctx->rotation_list)) { 876 if (list_empty(&cpuctx->rotation_list))
873 int was_empty = list_empty(head);
874 list_add(&cpuctx->rotation_list, head); 877 list_add(&cpuctx->rotation_list, head);
875 if (was_empty)
876 tick_nohz_full_kick();
877 }
878} 878}
879 879
880static void get_ctx(struct perf_event_context *ctx) 880static void get_ctx(struct perf_event_context *ctx)
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
899 put_ctx(ctx->parent_ctx); 899 put_ctx(ctx->parent_ctx);
900 ctx->parent_ctx = NULL; 900 ctx->parent_ctx = NULL;
901 } 901 }
902 ctx->generation++;
902} 903}
903 904
904static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 905static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1136 ctx->nr_events++; 1137 ctx->nr_events++;
1137 if (event->attr.inherit_stat) 1138 if (event->attr.inherit_stat)
1138 ctx->nr_stat++; 1139 ctx->nr_stat++;
1140
1141 ctx->generation++;
1139} 1142}
1140 1143
1141/* 1144/*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
1201 if (sample_type & PERF_SAMPLE_DATA_SRC) 1204 if (sample_type & PERF_SAMPLE_DATA_SRC)
1202 size += sizeof(data->data_src.val); 1205 size += sizeof(data->data_src.val);
1203 1206
1207 if (sample_type & PERF_SAMPLE_TRANSACTION)
1208 size += sizeof(data->txn);
1209
1204 event->header_size = size; 1210 event->header_size = size;
1205} 1211}
1206 1212
@@ -1216,6 +1222,9 @@ static void perf_event__id_header_size(struct perf_event *event)
1216 if (sample_type & PERF_SAMPLE_TIME) 1222 if (sample_type & PERF_SAMPLE_TIME)
1217 size += sizeof(data->time); 1223 size += sizeof(data->time);
1218 1224
1225 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1226 size += sizeof(data->id);
1227
1219 if (sample_type & PERF_SAMPLE_ID) 1228 if (sample_type & PERF_SAMPLE_ID)
1220 size += sizeof(data->id); 1229 size += sizeof(data->id);
1221 1230
@@ -1307,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1307 */ 1316 */
1308 if (event->state > PERF_EVENT_STATE_OFF) 1317 if (event->state > PERF_EVENT_STATE_OFF)
1309 event->state = PERF_EVENT_STATE_OFF; 1318 event->state = PERF_EVENT_STATE_OFF;
1319
1320 ctx->generation++;
1310} 1321}
1311 1322
1312static void perf_group_detach(struct perf_event *event) 1323static void perf_group_detach(struct perf_event *event)
@@ -2143,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
2143} 2154}
2144 2155
2145/* 2156/*
2146 * Test whether two contexts are equivalent, i.e. whether they 2157 * Test whether two contexts are equivalent, i.e. whether they have both been
2147 * have both been cloned from the same version of the same context 2158 * cloned from the same version of the same context.
2148 * and they both have the same number of enabled events. 2159 *
2149 * If the number of enabled events is the same, then the set 2160 * Equivalence is measured using a generation number in the context that is
2150 * of enabled events should be the same, because these are both 2161 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2151 * inherited contexts, therefore we can't access individual events 2162 * and list_del_event().
2152 * in them directly with an fd; we can only enable/disable all
2153 * events via prctl, or enable/disable all events in a family
2154 * via ioctl, which will have the same effect on both contexts.
2155 */ 2163 */
2156static int context_equiv(struct perf_event_context *ctx1, 2164static int context_equiv(struct perf_event_context *ctx1,
2157 struct perf_event_context *ctx2) 2165 struct perf_event_context *ctx2)
2158{ 2166{
2159 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx 2167 /* Pinning disables the swap optimization */
2160 && ctx1->parent_gen == ctx2->parent_gen 2168 if (ctx1->pin_count || ctx2->pin_count)
2161 && !ctx1->pin_count && !ctx2->pin_count; 2169 return 0;
2170
2171 /* If ctx1 is the parent of ctx2 */
2172 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2173 return 1;
2174
2175 /* If ctx2 is the parent of ctx1 */
2176 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2177 return 1;
2178
2179 /*
2180 * If ctx1 and ctx2 have the same parent; we flatten the parent
2181 * hierarchy, see perf_event_init_context().
2182 */
2183 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2184 ctx1->parent_gen == ctx2->parent_gen)
2185 return 1;
2186
2187 /* Unmatched */
2188 return 0;
2162} 2189}
2163 2190
2164static void __perf_event_sync_stat(struct perf_event *event, 2191static void __perf_event_sync_stat(struct perf_event *event,
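
context_equiv() now relies on a generation counter rather than on counting enabled events: every operation that changes a context's event list (see list_add_event(), list_del_event() and unclone_ctx() above) bumps ->generation, a clone records the parent's value in ->parent_gen, and two contexts are considered interchangeable only while those numbers still match. A toy version of the scheme, with hypothetical names:

        #include <stdint.h>

        struct ctx {
                struct ctx *parent;     /* NULL if not a clone            */
                uint64_t generation;    /* bumped on every modification   */
                uint64_t parent_gen;    /* parent->generation at clone    */
        };

        static int ctx_equiv(struct ctx *a, struct ctx *b)
        {
                /* a is b's parent and has not changed since the clone */
                if (a == b->parent && a->generation == b->parent_gen)
                        return 1;
                /* b is a's parent and has not changed since the clone */
                if (a->parent == b && a->parent_gen == b->generation)
                        return 1;
                /* both were cloned from the same parent generation */
                return a->parent && a->parent == b->parent &&
                       a->parent_gen == b->parent_gen;
        }
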
@@ -2241,7 +2268,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2241{ 2268{
2242 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 2269 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2243 struct perf_event_context *next_ctx; 2270 struct perf_event_context *next_ctx;
2244 struct perf_event_context *parent; 2271 struct perf_event_context *parent, *next_parent;
2245 struct perf_cpu_context *cpuctx; 2272 struct perf_cpu_context *cpuctx;
2246 int do_switch = 1; 2273 int do_switch = 1;
2247 2274
@@ -2253,10 +2280,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2253 return; 2280 return;
2254 2281
2255 rcu_read_lock(); 2282 rcu_read_lock();
2256 parent = rcu_dereference(ctx->parent_ctx);
2257 next_ctx = next->perf_event_ctxp[ctxn]; 2283 next_ctx = next->perf_event_ctxp[ctxn];
2258 if (parent && next_ctx && 2284 if (!next_ctx)
2259 rcu_dereference(next_ctx->parent_ctx) == parent) { 2285 goto unlock;
2286
2287 parent = rcu_dereference(ctx->parent_ctx);
2288 next_parent = rcu_dereference(next_ctx->parent_ctx);
2289
2290 /* If neither context have a parent context; they cannot be clones. */
2291 if (!parent && !next_parent)
2292 goto unlock;
2293
2294 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2260 /* 2295 /*
2261 * Looks like the two contexts are clones, so we might be 2296 * Looks like the two contexts are clones, so we might be
2262 * able to optimize the context switch. We lock both 2297 * able to optimize the context switch. We lock both
@@ -2284,6 +2319,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2284 raw_spin_unlock(&next_ctx->lock); 2319 raw_spin_unlock(&next_ctx->lock);
2285 raw_spin_unlock(&ctx->lock); 2320 raw_spin_unlock(&ctx->lock);
2286 } 2321 }
2322unlock:
2287 rcu_read_unlock(); 2323 rcu_read_unlock();
2288 2324
2289 if (do_switch) { 2325 if (do_switch) {
@@ -2712,7 +2748,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2712 2748
2713 hwc = &event->hw; 2749 hwc = &event->hw;
2714 2750
2715 if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { 2751 if (hwc->interrupts == MAX_INTERRUPTS) {
2716 hwc->interrupts = 0; 2752 hwc->interrupts = 0;
2717 perf_log_throttle(event, 1); 2753 perf_log_throttle(event, 1);
2718 event->pmu->start(event, 0); 2754 event->pmu->start(event, 0);
@@ -2811,10 +2847,11 @@ done:
2811#ifdef CONFIG_NO_HZ_FULL 2847#ifdef CONFIG_NO_HZ_FULL
2812bool perf_event_can_stop_tick(void) 2848bool perf_event_can_stop_tick(void)
2813{ 2849{
2814 if (list_empty(&__get_cpu_var(rotation_list))) 2850 if (atomic_read(&nr_freq_events) ||
2815 return true; 2851 __this_cpu_read(perf_throttled_count))
2816 else
2817 return false; 2852 return false;
2853 else
2854 return true;
2818} 2855}
2819#endif 2856#endif
2820 2857
@@ -3128,36 +3165,63 @@ static void free_event_rcu(struct rcu_head *head)
3128static void ring_buffer_put(struct ring_buffer *rb); 3165static void ring_buffer_put(struct ring_buffer *rb);
3129static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); 3166static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
3130 3167
3131static void free_event(struct perf_event *event) 3168static void unaccount_event_cpu(struct perf_event *event, int cpu)
3132{ 3169{
3133 irq_work_sync(&event->pending); 3170 if (event->parent)
3171 return;
3172
3173 if (has_branch_stack(event)) {
3174 if (!(event->attach_state & PERF_ATTACH_TASK))
3175 atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
3176 }
3177 if (is_cgroup_event(event))
3178 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3179}
3180
3181static void unaccount_event(struct perf_event *event)
3182{
3183 if (event->parent)
3184 return;
3134 3185
3186 if (event->attach_state & PERF_ATTACH_TASK)
3187 static_key_slow_dec_deferred(&perf_sched_events);
3188 if (event->attr.mmap || event->attr.mmap_data)
3189 atomic_dec(&nr_mmap_events);
3190 if (event->attr.comm)
3191 atomic_dec(&nr_comm_events);
3192 if (event->attr.task)
3193 atomic_dec(&nr_task_events);
3194 if (event->attr.freq)
3195 atomic_dec(&nr_freq_events);
3196 if (is_cgroup_event(event))
3197 static_key_slow_dec_deferred(&perf_sched_events);
3198 if (has_branch_stack(event))
3199 static_key_slow_dec_deferred(&perf_sched_events);
3200
3201 unaccount_event_cpu(event, event->cpu);
3202}
3203
3204static void __free_event(struct perf_event *event)
3205{
3135 if (!event->parent) { 3206 if (!event->parent) {
3136 if (event->attach_state & PERF_ATTACH_TASK)
3137 static_key_slow_dec_deferred(&perf_sched_events);
3138 if (event->attr.mmap || event->attr.mmap_data)
3139 atomic_dec(&nr_mmap_events);
3140 if (event->attr.comm)
3141 atomic_dec(&nr_comm_events);
3142 if (event->attr.task)
3143 atomic_dec(&nr_task_events);
3144 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 3207 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3145 put_callchain_buffers(); 3208 put_callchain_buffers();
3146 if (is_cgroup_event(event)) {
3147 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
3148 static_key_slow_dec_deferred(&perf_sched_events);
3149 }
3150
3151 if (has_branch_stack(event)) {
3152 static_key_slow_dec_deferred(&perf_sched_events);
3153 /* is system-wide event */
3154 if (!(event->attach_state & PERF_ATTACH_TASK)) {
3155 atomic_dec(&per_cpu(perf_branch_stack_events,
3156 event->cpu));
3157 }
3158 }
3159 } 3209 }
3160 3210
3211 if (event->destroy)
3212 event->destroy(event);
3213
3214 if (event->ctx)
3215 put_ctx(event->ctx);
3216
3217 call_rcu(&event->rcu_head, free_event_rcu);
3218}
3219static void free_event(struct perf_event *event)
3220{
3221 irq_work_sync(&event->pending);
3222
3223 unaccount_event(event);
3224
3161 if (event->rb) { 3225 if (event->rb) {
3162 struct ring_buffer *rb; 3226 struct ring_buffer *rb;
3163 3227
@@ -3180,13 +3244,8 @@ static void free_event(struct perf_event *event)
3180 if (is_cgroup_event(event)) 3244 if (is_cgroup_event(event))
3181 perf_detach_cgroup(event); 3245 perf_detach_cgroup(event);
3182 3246
3183 if (event->destroy)
3184 event->destroy(event);
3185
3186 if (event->ctx)
3187 put_ctx(event->ctx);
3188 3247
3189 call_rcu(&event->rcu_head, free_event_rcu); 3248 __free_event(event);
3190} 3249}
3191 3250
3192int perf_event_release_kernel(struct perf_event *event) 3251int perf_event_release_kernel(struct perf_event *event)
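
The restructuring above splits teardown into symmetric halves: unaccount_event()/unaccount_event_cpu() undo exactly what account_event()/account_event_cpu() (added further down, next to perf_event_alloc()) did, while __free_event() only releases memory and references. Keeping the increment and decrement sides as mirror-image helpers makes it harder for one case (cgroup events, branch-stack events, freq events, and so on) to be counted on one side and forgotten on the other. Condensed from the hunks above and below, the pairing looks like this:

        static void account_event(struct perf_event *event)
        {
                if (event->attr.comm)
                        atomic_inc(&nr_comm_events);
                if (event->attr.freq)
                        atomic_inc(&nr_freq_events);
                /* ... every counter bumped here ... */
        }

        static void unaccount_event(struct perf_event *event)
        {
                if (event->attr.comm)
                        atomic_dec(&nr_comm_events);
                if (event->attr.freq)
                        atomic_dec(&nr_freq_events);
                /* ... is dropped here, and nowhere else ... */
        }
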
@@ -3544,6 +3603,15 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3544 case PERF_EVENT_IOC_PERIOD: 3603 case PERF_EVENT_IOC_PERIOD:
3545 return perf_event_period(event, (u64 __user *)arg); 3604 return perf_event_period(event, (u64 __user *)arg);
3546 3605
3606 case PERF_EVENT_IOC_ID:
3607 {
3608 u64 id = primary_event_id(event);
3609
3610 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
3611 return -EFAULT;
3612 return 0;
3613 }
3614
3547 case PERF_EVENT_IOC_SET_OUTPUT: 3615 case PERF_EVENT_IOC_SET_OUTPUT:
3548 { 3616 {
3549 int ret; 3617 int ret;
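
PERF_EVENT_IOC_ID gives userspace the same primary event ID that later shows up in the sample stream, which is what makes PERF_SAMPLE_IDENTIFIER (further down) useful for demultiplexing. A minimal, lightly error-checked userspace sketch; it needs uapi headers from this kernel series or newer:

        #include <linux/perf_event.h>
        #include <sys/ioctl.h>
        #include <sys/syscall.h>
        #include <unistd.h>
        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                struct perf_event_attr attr = {
                        .type           = PERF_TYPE_SOFTWARE,
                        .size           = sizeof(attr),
                        .config         = PERF_COUNT_SW_CPU_CLOCK,
                        .sample_type    = PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_IP,
                        .exclude_kernel = 1,
                };
                uint64_t id;
                int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

                if (fd < 0 || ioctl(fd, PERF_EVENT_IOC_ID, &id) < 0)
                        return 1;
                /* 'id' now matches the identifier carried in this event's records */
                printf("event id: %llu\n", (unsigned long long)id);
                close(fd);
                return 0;
        }
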
@@ -3625,6 +3693,26 @@ static void calc_timer_values(struct perf_event *event,
3625 *running = ctx_time - event->tstamp_running; 3693 *running = ctx_time - event->tstamp_running;
3626} 3694}
3627 3695
3696static void perf_event_init_userpage(struct perf_event *event)
3697{
3698 struct perf_event_mmap_page *userpg;
3699 struct ring_buffer *rb;
3700
3701 rcu_read_lock();
3702 rb = rcu_dereference(event->rb);
3703 if (!rb)
3704 goto unlock;
3705
3706 userpg = rb->user_page;
3707
3708 /* Allow new userspace to detect that bit 0 is deprecated */
3709 userpg->cap_bit0_is_deprecated = 1;
3710 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
3711
3712unlock:
3713 rcu_read_unlock();
3714}
3715
3628void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 3716void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
3629{ 3717{
3630} 3718}
@@ -3641,6 +3729,10 @@ void perf_event_update_userpage(struct perf_event *event)
3641 u64 enabled, running, now; 3729 u64 enabled, running, now;
3642 3730
3643 rcu_read_lock(); 3731 rcu_read_lock();
3732 rb = rcu_dereference(event->rb);
3733 if (!rb)
3734 goto unlock;
3735
3644 /* 3736 /*
3645 * compute total_time_enabled, total_time_running 3737 * compute total_time_enabled, total_time_running
3646 * based on snapshot values taken when the event 3738 * based on snapshot values taken when the event
@@ -3651,12 +3743,8 @@ void perf_event_update_userpage(struct perf_event *event)
3651 * NMI context 3743 * NMI context
3652 */ 3744 */
3653 calc_timer_values(event, &now, &enabled, &running); 3745 calc_timer_values(event, &now, &enabled, &running);
3654 rb = rcu_dereference(event->rb);
3655 if (!rb)
3656 goto unlock;
3657 3746
3658 userpg = rb->user_page; 3747 userpg = rb->user_page;
3659
3660 /* 3748 /*
3661 * Disable preemption so as to not let the corresponding user-space 3749 * Disable preemption so as to not let the corresponding user-space
3662 * spin too long if we get preempted. 3750 * spin too long if we get preempted.
@@ -4009,6 +4097,7 @@ again:
4009 ring_buffer_attach(event, rb); 4097 ring_buffer_attach(event, rb);
4010 rcu_assign_pointer(event->rb, rb); 4098 rcu_assign_pointer(event->rb, rb);
4011 4099
4100 perf_event_init_userpage(event);
4012 perf_event_update_userpage(event); 4101 perf_event_update_userpage(event);
4013 4102
4014unlock: 4103unlock:
@@ -4251,7 +4340,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
4251 if (sample_type & PERF_SAMPLE_TIME) 4340 if (sample_type & PERF_SAMPLE_TIME)
4252 data->time = perf_clock(); 4341 data->time = perf_clock();
4253 4342
4254 if (sample_type & PERF_SAMPLE_ID) 4343 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4255 data->id = primary_event_id(event); 4344 data->id = primary_event_id(event);
4256 4345
4257 if (sample_type & PERF_SAMPLE_STREAM_ID) 4346 if (sample_type & PERF_SAMPLE_STREAM_ID)
@@ -4290,6 +4379,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4290 4379
4291 if (sample_type & PERF_SAMPLE_CPU) 4380 if (sample_type & PERF_SAMPLE_CPU)
4292 perf_output_put(handle, data->cpu_entry); 4381 perf_output_put(handle, data->cpu_entry);
4382
4383 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4384 perf_output_put(handle, data->id);
4293} 4385}
4294 4386
4295void perf_event__output_id_sample(struct perf_event *event, 4387void perf_event__output_id_sample(struct perf_event *event,
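
__perf_event__output_id_sample() writes the identifier as the last word of non-sample records, while perf_output_sample()/perf_prepare_sample() put it first in PERF_RECORD_SAMPLE; together that gives every record a fixed slot for the event ID, so a reader can attribute records without decoding the whole, event-specific sample layout. A userspace-side sketch of that lookup, assuming every event on the buffer was opened with PERF_SAMPLE_IDENTIFIER and sample_id_all:

        #include <linux/perf_event.h>
        #include <stddef.h>
        #include <stdint.h>

        static uint64_t record_id(const struct perf_event_header *hdr)
        {
                const uint64_t *body = (const uint64_t *)(hdr + 1);
                size_t words = (hdr->size - sizeof(*hdr)) / sizeof(uint64_t);

                if (hdr->type == PERF_RECORD_SAMPLE)
                        return body[0];         /* identifier leads the sample     */

                return body[words - 1];         /* ... and trails everything else  */
        }
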
@@ -4355,7 +4447,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
4355 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 4447 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4356 n = 0; 4448 n = 0;
4357 4449
4358 if (sub != event) 4450 if ((sub != event) &&
4451 (sub->state == PERF_EVENT_STATE_ACTIVE))
4359 sub->pmu->read(sub); 4452 sub->pmu->read(sub);
4360 4453
4361 values[n++] = perf_event_count(sub); 4454 values[n++] = perf_event_count(sub);
@@ -4402,6 +4495,9 @@ void perf_output_sample(struct perf_output_handle *handle,
4402 4495
4403 perf_output_put(handle, *header); 4496 perf_output_put(handle, *header);
4404 4497
4498 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4499 perf_output_put(handle, data->id);
4500
4405 if (sample_type & PERF_SAMPLE_IP) 4501 if (sample_type & PERF_SAMPLE_IP)
4406 perf_output_put(handle, data->ip); 4502 perf_output_put(handle, data->ip);
4407 4503
@@ -4462,20 +4558,6 @@ void perf_output_sample(struct perf_output_handle *handle,
4462 } 4558 }
4463 } 4559 }
4464 4560
4465 if (!event->attr.watermark) {
4466 int wakeup_events = event->attr.wakeup_events;
4467
4468 if (wakeup_events) {
4469 struct ring_buffer *rb = handle->rb;
4470 int events = local_inc_return(&rb->events);
4471
4472 if (events >= wakeup_events) {
4473 local_sub(wakeup_events, &rb->events);
4474 local_inc(&rb->wakeup);
4475 }
4476 }
4477 }
4478
4479 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 4561 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4480 if (data->br_stack) { 4562 if (data->br_stack) {
4481 size_t size; 4563 size_t size;
@@ -4511,16 +4593,34 @@ void perf_output_sample(struct perf_output_handle *handle,
4511 } 4593 }
4512 } 4594 }
4513 4595
4514 if (sample_type & PERF_SAMPLE_STACK_USER) 4596 if (sample_type & PERF_SAMPLE_STACK_USER) {
4515 perf_output_sample_ustack(handle, 4597 perf_output_sample_ustack(handle,
4516 data->stack_user_size, 4598 data->stack_user_size,
4517 data->regs_user.regs); 4599 data->regs_user.regs);
4600 }
4518 4601
4519 if (sample_type & PERF_SAMPLE_WEIGHT) 4602 if (sample_type & PERF_SAMPLE_WEIGHT)
4520 perf_output_put(handle, data->weight); 4603 perf_output_put(handle, data->weight);
4521 4604
4522 if (sample_type & PERF_SAMPLE_DATA_SRC) 4605 if (sample_type & PERF_SAMPLE_DATA_SRC)
4523 perf_output_put(handle, data->data_src.val); 4606 perf_output_put(handle, data->data_src.val);
4607
4608 if (sample_type & PERF_SAMPLE_TRANSACTION)
4609 perf_output_put(handle, data->txn);
4610
4611 if (!event->attr.watermark) {
4612 int wakeup_events = event->attr.wakeup_events;
4613
4614 if (wakeup_events) {
4615 struct ring_buffer *rb = handle->rb;
4616 int events = local_inc_return(&rb->events);
4617
4618 if (events >= wakeup_events) {
4619 local_sub(wakeup_events, &rb->events);
4620 local_inc(&rb->wakeup);
4621 }
4622 }
4623 }
4524} 4624}
4525 4625
4526void perf_prepare_sample(struct perf_event_header *header, 4626void perf_prepare_sample(struct perf_event_header *header,
@@ -4680,12 +4780,10 @@ perf_event_read_event(struct perf_event *event,
4680 perf_output_end(&handle); 4780 perf_output_end(&handle);
4681} 4781}
4682 4782
4683typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data);
4684typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); 4783typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
4685 4784
4686static void 4785static void
4687perf_event_aux_ctx(struct perf_event_context *ctx, 4786perf_event_aux_ctx(struct perf_event_context *ctx,
4688 perf_event_aux_match_cb match,
4689 perf_event_aux_output_cb output, 4787 perf_event_aux_output_cb output,
4690 void *data) 4788 void *data)
4691{ 4789{
@@ -4696,15 +4794,12 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
4696 continue; 4794 continue;
4697 if (!event_filter_match(event)) 4795 if (!event_filter_match(event))
4698 continue; 4796 continue;
4699 if (match(event, data)) 4797 output(event, data);
4700 output(event, data);
4701 } 4798 }
4702} 4799}
4703 4800
4704static void 4801static void
4705perf_event_aux(perf_event_aux_match_cb match, 4802perf_event_aux(perf_event_aux_output_cb output, void *data,
4706 perf_event_aux_output_cb output,
4707 void *data,
4708 struct perf_event_context *task_ctx) 4803 struct perf_event_context *task_ctx)
4709{ 4804{
4710 struct perf_cpu_context *cpuctx; 4805 struct perf_cpu_context *cpuctx;
@@ -4717,7 +4812,7 @@ perf_event_aux(perf_event_aux_match_cb match,
4717 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4812 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4718 if (cpuctx->unique_pmu != pmu) 4813 if (cpuctx->unique_pmu != pmu)
4719 goto next; 4814 goto next;
4720 perf_event_aux_ctx(&cpuctx->ctx, match, output, data); 4815 perf_event_aux_ctx(&cpuctx->ctx, output, data);
4721 if (task_ctx) 4816 if (task_ctx)
4722 goto next; 4817 goto next;
4723 ctxn = pmu->task_ctx_nr; 4818 ctxn = pmu->task_ctx_nr;
@@ -4725,14 +4820,14 @@ perf_event_aux(perf_event_aux_match_cb match,
4725 goto next; 4820 goto next;
4726 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 4821 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4727 if (ctx) 4822 if (ctx)
4728 perf_event_aux_ctx(ctx, match, output, data); 4823 perf_event_aux_ctx(ctx, output, data);
4729next: 4824next:
4730 put_cpu_ptr(pmu->pmu_cpu_context); 4825 put_cpu_ptr(pmu->pmu_cpu_context);
4731 } 4826 }
4732 4827
4733 if (task_ctx) { 4828 if (task_ctx) {
4734 preempt_disable(); 4829 preempt_disable();
4735 perf_event_aux_ctx(task_ctx, match, output, data); 4830 perf_event_aux_ctx(task_ctx, output, data);
4736 preempt_enable(); 4831 preempt_enable();
4737 } 4832 }
4738 rcu_read_unlock(); 4833 rcu_read_unlock();
@@ -4741,7 +4836,7 @@ next:
4741/* 4836/*
4742 * task tracking -- fork/exit 4837 * task tracking -- fork/exit
4743 * 4838 *
4744 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task 4839 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
4745 */ 4840 */
4746 4841
4747struct perf_task_event { 4842struct perf_task_event {
@@ -4759,6 +4854,13 @@ struct perf_task_event {
4759 } event_id; 4854 } event_id;
4760}; 4855};
4761 4856
4857static int perf_event_task_match(struct perf_event *event)
4858{
4859 return event->attr.comm || event->attr.mmap ||
4860 event->attr.mmap2 || event->attr.mmap_data ||
4861 event->attr.task;
4862}
4863
4762static void perf_event_task_output(struct perf_event *event, 4864static void perf_event_task_output(struct perf_event *event,
4763 void *data) 4865 void *data)
4764{ 4866{
@@ -4768,6 +4870,9 @@ static void perf_event_task_output(struct perf_event *event,
4768 struct task_struct *task = task_event->task; 4870 struct task_struct *task = task_event->task;
4769 int ret, size = task_event->event_id.header.size; 4871 int ret, size = task_event->event_id.header.size;
4770 4872
4873 if (!perf_event_task_match(event))
4874 return;
4875
4771 perf_event_header__init_id(&task_event->event_id.header, &sample, event); 4876 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4772 4877
4773 ret = perf_output_begin(&handle, event, 4878 ret = perf_output_begin(&handle, event,
@@ -4790,13 +4895,6 @@ out:
4790 task_event->event_id.header.size = size; 4895 task_event->event_id.header.size = size;
4791} 4896}
4792 4897
4793static int perf_event_task_match(struct perf_event *event,
4794 void *data __maybe_unused)
4795{
4796 return event->attr.comm || event->attr.mmap ||
4797 event->attr.mmap_data || event->attr.task;
4798}
4799
4800static void perf_event_task(struct task_struct *task, 4898static void perf_event_task(struct task_struct *task,
4801 struct perf_event_context *task_ctx, 4899 struct perf_event_context *task_ctx,
4802 int new) 4900 int new)
@@ -4825,8 +4923,7 @@ static void perf_event_task(struct task_struct *task,
4825 }, 4923 },
4826 }; 4924 };
4827 4925
4828 perf_event_aux(perf_event_task_match, 4926 perf_event_aux(perf_event_task_output,
4829 perf_event_task_output,
4830 &task_event, 4927 &task_event,
4831 task_ctx); 4928 task_ctx);
4832} 4929}
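
From here on the side-band record types (task, comm, mmap) all follow the same simplification: the per-type match() callback is folded into the start of the corresponding _output() routine, so perf_event_aux() and perf_event_aux_ctx() only carry a single function pointer. Schematically, with stand-in types:

        struct evt { int wants_comm; };

        typedef void (*aux_output_fn)(struct evt *e, void *data);

        static void comm_output(struct evt *e, void *data)
        {
                if (!e->wants_comm)     /* the old match() step, now inlined */
                        return;
                /* ... build and emit the record for this event ... */
        }

        static void for_each_event(struct evt *events, int n,
                                   aux_output_fn output, void *data)
        {
                for (int i = 0; i < n; i++)
                        output(&events[i], data);
        }
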
@@ -4853,6 +4950,11 @@ struct perf_comm_event {
4853 } event_id; 4950 } event_id;
4854}; 4951};
4855 4952
4953static int perf_event_comm_match(struct perf_event *event)
4954{
4955 return event->attr.comm;
4956}
4957
4856static void perf_event_comm_output(struct perf_event *event, 4958static void perf_event_comm_output(struct perf_event *event,
4857 void *data) 4959 void *data)
4858{ 4960{
@@ -4862,6 +4964,9 @@ static void perf_event_comm_output(struct perf_event *event,
4862 int size = comm_event->event_id.header.size; 4964 int size = comm_event->event_id.header.size;
4863 int ret; 4965 int ret;
4864 4966
4967 if (!perf_event_comm_match(event))
4968 return;
4969
4865 perf_event_header__init_id(&comm_event->event_id.header, &sample, event); 4970 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4866 ret = perf_output_begin(&handle, event, 4971 ret = perf_output_begin(&handle, event,
4867 comm_event->event_id.header.size); 4972 comm_event->event_id.header.size);
@@ -4883,12 +4988,6 @@ out:
4883 comm_event->event_id.header.size = size; 4988 comm_event->event_id.header.size = size;
4884} 4989}
4885 4990
4886static int perf_event_comm_match(struct perf_event *event,
4887 void *data __maybe_unused)
4888{
4889 return event->attr.comm;
4890}
4891
4892static void perf_event_comm_event(struct perf_comm_event *comm_event) 4991static void perf_event_comm_event(struct perf_comm_event *comm_event)
4893{ 4992{
4894 char comm[TASK_COMM_LEN]; 4993 char comm[TASK_COMM_LEN];
@@ -4903,8 +5002,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
4903 5002
4904 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 5003 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
4905 5004
4906 perf_event_aux(perf_event_comm_match, 5005 perf_event_aux(perf_event_comm_output,
4907 perf_event_comm_output,
4908 comm_event, 5006 comm_event,
4909 NULL); 5007 NULL);
4910} 5008}
@@ -4955,6 +5053,9 @@ struct perf_mmap_event {
4955 5053
4956 const char *file_name; 5054 const char *file_name;
4957 int file_size; 5055 int file_size;
5056 int maj, min;
5057 u64 ino;
5058 u64 ino_generation;
4958 5059
4959 struct { 5060 struct {
4960 struct perf_event_header header; 5061 struct perf_event_header header;
@@ -4967,6 +5068,17 @@ struct perf_mmap_event {
4967 } event_id; 5068 } event_id;
4968}; 5069};
4969 5070
5071static int perf_event_mmap_match(struct perf_event *event,
5072 void *data)
5073{
5074 struct perf_mmap_event *mmap_event = data;
5075 struct vm_area_struct *vma = mmap_event->vma;
5076 int executable = vma->vm_flags & VM_EXEC;
5077
5078 return (!executable && event->attr.mmap_data) ||
5079 (executable && (event->attr.mmap || event->attr.mmap2));
5080}
5081
4970static void perf_event_mmap_output(struct perf_event *event, 5082static void perf_event_mmap_output(struct perf_event *event,
4971 void *data) 5083 void *data)
4972{ 5084{
@@ -4976,6 +5088,17 @@ static void perf_event_mmap_output(struct perf_event *event,
4976 int size = mmap_event->event_id.header.size; 5088 int size = mmap_event->event_id.header.size;
4977 int ret; 5089 int ret;
4978 5090
5091 if (!perf_event_mmap_match(event, data))
5092 return;
5093
5094 if (event->attr.mmap2) {
5095 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5096 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5097 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5098 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5099 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5100 }
5101
4979 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 5102 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4980 ret = perf_output_begin(&handle, event, 5103 ret = perf_output_begin(&handle, event,
4981 mmap_event->event_id.header.size); 5104 mmap_event->event_id.header.size);
@@ -4986,6 +5109,14 @@ static void perf_event_mmap_output(struct perf_event *event,
4986 mmap_event->event_id.tid = perf_event_tid(event, current); 5109 mmap_event->event_id.tid = perf_event_tid(event, current);
4987 5110
4988 perf_output_put(&handle, mmap_event->event_id); 5111 perf_output_put(&handle, mmap_event->event_id);
5112
5113 if (event->attr.mmap2) {
5114 perf_output_put(&handle, mmap_event->maj);
5115 perf_output_put(&handle, mmap_event->min);
5116 perf_output_put(&handle, mmap_event->ino);
5117 perf_output_put(&handle, mmap_event->ino_generation);
5118 }
5119
4989 __output_copy(&handle, mmap_event->file_name, 5120 __output_copy(&handle, mmap_event->file_name,
4990 mmap_event->file_size); 5121 mmap_event->file_size);
4991 5122
@@ -4996,82 +5127,89 @@ out:
4996 mmap_event->event_id.header.size = size; 5127 mmap_event->event_id.header.size = size;
4997} 5128}
4998 5129
4999static int perf_event_mmap_match(struct perf_event *event,
5000 void *data)
5001{
5002 struct perf_mmap_event *mmap_event = data;
5003 struct vm_area_struct *vma = mmap_event->vma;
5004 int executable = vma->vm_flags & VM_EXEC;
5005
5006 return (!executable && event->attr.mmap_data) ||
5007 (executable && event->attr.mmap);
5008}
5009
5010static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 5130static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5011{ 5131{
5012 struct vm_area_struct *vma = mmap_event->vma; 5132 struct vm_area_struct *vma = mmap_event->vma;
5013 struct file *file = vma->vm_file; 5133 struct file *file = vma->vm_file;
5134 int maj = 0, min = 0;
5135 u64 ino = 0, gen = 0;
5014 unsigned int size; 5136 unsigned int size;
5015 char tmp[16]; 5137 char tmp[16];
5016 char *buf = NULL; 5138 char *buf = NULL;
5017 const char *name; 5139 char *name;
5018
5019 memset(tmp, 0, sizeof(tmp));
5020 5140
5021 if (file) { 5141 if (file) {
5142 struct inode *inode;
5143 dev_t dev;
5144
5145 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5146 if (!buf) {
5147 name = "//enomem";
5148 goto cpy_name;
5149 }
5022 /* 5150 /*
5023 * d_path works from the end of the rb backwards, so we 5151 * d_path() works from the end of the rb backwards, so we
5024 * need to add enough zero bytes after the string to handle 5152 * need to add enough zero bytes after the string to handle
5025 * the 64bit alignment we do later. 5153 * the 64bit alignment we do later.
5026 */ 5154 */
5027 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); 5155 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5028 if (!buf) {
5029 name = strncpy(tmp, "//enomem", sizeof(tmp));
5030 goto got_name;
5031 }
5032 name = d_path(&file->f_path, buf, PATH_MAX);
5033 if (IS_ERR(name)) { 5156 if (IS_ERR(name)) {
5034 name = strncpy(tmp, "//toolong", sizeof(tmp)); 5157 name = "//toolong";
5035 goto got_name; 5158 goto cpy_name;
5036 } 5159 }
5160 inode = file_inode(vma->vm_file);
5161 dev = inode->i_sb->s_dev;
5162 ino = inode->i_ino;
5163 gen = inode->i_generation;
5164 maj = MAJOR(dev);
5165 min = MINOR(dev);
5166 goto got_name;
5037 } else { 5167 } else {
5038 if (arch_vma_name(mmap_event->vma)) { 5168 name = (char *)arch_vma_name(vma);
5039 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 5169 if (name)
5040 sizeof(tmp) - 1); 5170 goto cpy_name;
5041 tmp[sizeof(tmp) - 1] = '\0';
5042 goto got_name;
5043 }
5044 5171
5045 if (!vma->vm_mm) { 5172 if (vma->vm_start <= vma->vm_mm->start_brk &&
5046 name = strncpy(tmp, "[vdso]", sizeof(tmp));
5047 goto got_name;
5048 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
5049 vma->vm_end >= vma->vm_mm->brk) { 5173 vma->vm_end >= vma->vm_mm->brk) {
5050 name = strncpy(tmp, "[heap]", sizeof(tmp)); 5174 name = "[heap]";
5051 goto got_name; 5175 goto cpy_name;
5052 } else if (vma->vm_start <= vma->vm_mm->start_stack && 5176 }
5177 if (vma->vm_start <= vma->vm_mm->start_stack &&
5053 vma->vm_end >= vma->vm_mm->start_stack) { 5178 vma->vm_end >= vma->vm_mm->start_stack) {
5054 name = strncpy(tmp, "[stack]", sizeof(tmp)); 5179 name = "[stack]";
5055 goto got_name; 5180 goto cpy_name;
5056 } 5181 }
5057 5182
5058 name = strncpy(tmp, "//anon", sizeof(tmp)); 5183 name = "//anon";
5059 goto got_name; 5184 goto cpy_name;
5060 } 5185 }
5061 5186
5187cpy_name:
5188 strlcpy(tmp, name, sizeof(tmp));
5189 name = tmp;
5062got_name: 5190got_name:
5063 size = ALIGN(strlen(name)+1, sizeof(u64)); 5191 /*
5192 * Since our buffer works in 8 byte units we need to align our string
5193 * size to a multiple of 8. However, we must guarantee the tail end is
5194 * zero'd out to avoid leaking random bits to userspace.
5195 */
5196 size = strlen(name)+1;
5197 while (!IS_ALIGNED(size, sizeof(u64)))
5198 name[size++] = '\0';
5064 5199
5065 mmap_event->file_name = name; 5200 mmap_event->file_name = name;
5066 mmap_event->file_size = size; 5201 mmap_event->file_size = size;
5202 mmap_event->maj = maj;
5203 mmap_event->min = min;
5204 mmap_event->ino = ino;
5205 mmap_event->ino_generation = gen;
5067 5206
5068 if (!(vma->vm_flags & VM_EXEC)) 5207 if (!(vma->vm_flags & VM_EXEC))
5069 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; 5208 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5070 5209
5071 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 5210 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
5072 5211
5073 perf_event_aux(perf_event_mmap_match, 5212 perf_event_aux(perf_event_mmap_output,
5074 perf_event_mmap_output,
5075 mmap_event, 5213 mmap_event,
5076 NULL); 5214 NULL);
5077 5215
@@ -5101,6 +5239,10 @@ void perf_event_mmap(struct vm_area_struct *vma)
5101 .len = vma->vm_end - vma->vm_start, 5239 .len = vma->vm_end - vma->vm_start,
5102 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, 5240 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
5103 }, 5241 },
5242 /* .maj (attr_mmap2 only) */
5243 /* .min (attr_mmap2 only) */
5244 /* .ino (attr_mmap2 only) */
5245 /* .ino_generation (attr_mmap2 only) */
5104 }; 5246 };
5105 5247
5106 perf_event_mmap_event(&mmap_event); 5248 perf_event_mmap_event(&mmap_event);
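
For ring-buffer readers, the mmap2 additions mean a new PERF_RECORD_MMAP2 that extends PERF_RECORD_MMAP with device and inode identity, enough to tell two different files mapped at the same path apart. The payload below is a sketch inferred from perf_event_mmap_output() above; the authoritative definition lives in include/uapi/linux/perf_event.h, and note that perf_copy_attr() still rejects attr.mmap2 in this series ("disabled for now").

        #include <stdint.h>

        struct mmap2_record {
                /* struct perf_event_header header;  header.type == PERF_RECORD_MMAP2 */
                uint32_t pid, tid;
                uint64_t addr, len, pgoff;
                uint32_t maj, min;              /* backing device               */
                uint64_t ino, ino_generation;   /* inode identity               */
                char     filename[];            /* NUL-padded to a u64 boundary */
                /* optional sample_id trailer follows the filename */
        };
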
@@ -5178,6 +5320,7 @@ static int __perf_event_overflow(struct perf_event *event,
5178 __this_cpu_inc(perf_throttled_count); 5320 __this_cpu_inc(perf_throttled_count);
5179 hwc->interrupts = MAX_INTERRUPTS; 5321 hwc->interrupts = MAX_INTERRUPTS;
5180 perf_log_throttle(event, 0); 5322 perf_log_throttle(event, 0);
5323 tick_nohz_full_kick();
5181 ret = 1; 5324 ret = 1;
5182 } 5325 }
5183 } 5326 }
@@ -6189,6 +6332,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
6189 6332
6190 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 6333 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
6191} 6334}
6335static DEVICE_ATTR_RO(type);
6192 6336
6193static ssize_t 6337static ssize_t
6194perf_event_mux_interval_ms_show(struct device *dev, 6338perf_event_mux_interval_ms_show(struct device *dev,
@@ -6233,17 +6377,19 @@ perf_event_mux_interval_ms_store(struct device *dev,
6233 6377
6234 return count; 6378 return count;
6235} 6379}
6380static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
6236 6381
6237static struct device_attribute pmu_dev_attrs[] = { 6382static struct attribute *pmu_dev_attrs[] = {
6238 __ATTR_RO(type), 6383 &dev_attr_type.attr,
6239 __ATTR_RW(perf_event_mux_interval_ms), 6384 &dev_attr_perf_event_mux_interval_ms.attr,
6240 __ATTR_NULL, 6385 NULL,
6241}; 6386};
6387ATTRIBUTE_GROUPS(pmu_dev);
6242 6388
6243static int pmu_bus_running; 6389static int pmu_bus_running;
6244static struct bus_type pmu_bus = { 6390static struct bus_type pmu_bus = {
6245 .name = "event_source", 6391 .name = "event_source",
6246 .dev_attrs = pmu_dev_attrs, 6392 .dev_groups = pmu_dev_groups,
6247}; 6393};
6248 6394
6249static void pmu_dev_release(struct device *dev) 6395static void pmu_dev_release(struct device *dev)
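
The sysfs hunk is a straight conversion to the current driver-core idiom: DEVICE_ATTR_RO()/DEVICE_ATTR_RW() generate dev_attr_<name> from the existing <name>_show/<name>_store functions, ATTRIBUTE_GROUPS() wraps the attribute array into <prefix>_groups, and the bus registers .dev_groups instead of the deprecated .dev_attrs. The same pattern on a made-up bus:

        #include <linux/device.h>

        static ssize_t mode_show(struct device *dev,
                                 struct device_attribute *attr, char *page)
        {
                return sprintf(page, "example\n");
        }
        static DEVICE_ATTR_RO(mode);            /* emits dev_attr_mode   */

        static struct attribute *foo_dev_attrs[] = {
                &dev_attr_mode.attr,
                NULL,
        };
        ATTRIBUTE_GROUPS(foo_dev);              /* emits foo_dev_groups  */

        static struct bus_type foo_bus = {
                .name       = "foo",
                .dev_groups = foo_dev_groups,
        };
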
@@ -6443,6 +6589,44 @@ unlock:
6443 return pmu; 6589 return pmu;
6444} 6590}
6445 6591
6592static void account_event_cpu(struct perf_event *event, int cpu)
6593{
6594 if (event->parent)
6595 return;
6596
6597 if (has_branch_stack(event)) {
6598 if (!(event->attach_state & PERF_ATTACH_TASK))
6599 atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
6600 }
6601 if (is_cgroup_event(event))
6602 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
6603}
6604
6605static void account_event(struct perf_event *event)
6606{
6607 if (event->parent)
6608 return;
6609
6610 if (event->attach_state & PERF_ATTACH_TASK)
6611 static_key_slow_inc(&perf_sched_events.key);
6612 if (event->attr.mmap || event->attr.mmap_data)
6613 atomic_inc(&nr_mmap_events);
6614 if (event->attr.comm)
6615 atomic_inc(&nr_comm_events);
6616 if (event->attr.task)
6617 atomic_inc(&nr_task_events);
6618 if (event->attr.freq) {
6619 if (atomic_inc_return(&nr_freq_events) == 1)
6620 tick_nohz_full_kick_all();
6621 }
6622 if (has_branch_stack(event))
6623 static_key_slow_inc(&perf_sched_events.key);
6624 if (is_cgroup_event(event))
6625 static_key_slow_inc(&perf_sched_events.key);
6626
6627 account_event_cpu(event, event->cpu);
6628}
6629
6446/* 6630/*
6447 * Allocate and initialize a event structure 6631 * Allocate and initialize a event structure
6448 */ 6632 */
@@ -6457,7 +6641,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6457 struct pmu *pmu; 6641 struct pmu *pmu;
6458 struct perf_event *event; 6642 struct perf_event *event;
6459 struct hw_perf_event *hwc; 6643 struct hw_perf_event *hwc;
6460 long err; 6644 long err = -EINVAL;
6461 6645
6462 if ((unsigned)cpu >= nr_cpu_ids) { 6646 if ((unsigned)cpu >= nr_cpu_ids) {
6463 if (!task || cpu != -1) 6647 if (!task || cpu != -1)
@@ -6540,49 +6724,35 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6540 * we currently do not support PERF_FORMAT_GROUP on inherited events 6724 * we currently do not support PERF_FORMAT_GROUP on inherited events
6541 */ 6725 */
6542 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) 6726 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
6543 goto done; 6727 goto err_ns;
6544 6728
6545 pmu = perf_init_event(event); 6729 pmu = perf_init_event(event);
6546
6547done:
6548 err = 0;
6549 if (!pmu) 6730 if (!pmu)
6550 err = -EINVAL; 6731 goto err_ns;
6551 else if (IS_ERR(pmu)) 6732 else if (IS_ERR(pmu)) {
6552 err = PTR_ERR(pmu); 6733 err = PTR_ERR(pmu);
6553 6734 goto err_ns;
6554 if (err) {
6555 if (event->ns)
6556 put_pid_ns(event->ns);
6557 kfree(event);
6558 return ERR_PTR(err);
6559 } 6735 }
6560 6736
6561 if (!event->parent) { 6737 if (!event->parent) {
6562 if (event->attach_state & PERF_ATTACH_TASK)
6563 static_key_slow_inc(&perf_sched_events.key);
6564 if (event->attr.mmap || event->attr.mmap_data)
6565 atomic_inc(&nr_mmap_events);
6566 if (event->attr.comm)
6567 atomic_inc(&nr_comm_events);
6568 if (event->attr.task)
6569 atomic_inc(&nr_task_events);
6570 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { 6738 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6571 err = get_callchain_buffers(); 6739 err = get_callchain_buffers();
6572 if (err) { 6740 if (err)
6573 free_event(event); 6741 goto err_pmu;
6574 return ERR_PTR(err);
6575 }
6576 }
6577 if (has_branch_stack(event)) {
6578 static_key_slow_inc(&perf_sched_events.key);
6579 if (!(event->attach_state & PERF_ATTACH_TASK))
6580 atomic_inc(&per_cpu(perf_branch_stack_events,
6581 event->cpu));
6582 } 6742 }
6583 } 6743 }
6584 6744
6585 return event; 6745 return event;
6746
6747err_pmu:
6748 if (event->destroy)
6749 event->destroy(event);
6750err_ns:
6751 if (event->ns)
6752 put_pid_ns(event->ns);
6753 kfree(event);
6754
6755 return ERR_PTR(err);
6586} 6756}
6587 6757
6588static int perf_copy_attr(struct perf_event_attr __user *uattr, 6758static int perf_copy_attr(struct perf_event_attr __user *uattr,
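
perf_event_alloc() now follows the usual kernel unwinding style: initialise err pessimistically, jump to the label that frees exactly what has been set up so far, and fall through the labels in reverse order of acquisition, instead of open-coding cleanup at each failure site. The idiom in isolation, with hypothetical helpers and a hypothetical struct:

        #include <linux/slab.h>
        #include <linux/err.h>

        struct thing { int dummy; };

        static long attach_pmu(struct thing *t);        /* hypothetical step 1 */
        static long setup_buffers(struct thing *t);     /* hypothetical step 2 */
        static void detach_pmu(struct thing *t);        /* undoes step 1       */

        static struct thing *thing_alloc(void)
        {
                struct thing *t;
                long err = -EINVAL;

                t = kzalloc(sizeof(*t), GFP_KERNEL);
                if (!t)
                        return ERR_PTR(-ENOMEM);

                err = attach_pmu(t);
                if (err)
                        goto err_free;

                err = setup_buffers(t);
                if (err)
                        goto err_pmu;

                return t;

        err_pmu:
                detach_pmu(t);
        err_free:
                kfree(t);
                return ERR_PTR(err);
        }
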
@@ -6640,6 +6810,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6640 if (ret) 6810 if (ret)
6641 return -EFAULT; 6811 return -EFAULT;
6642 6812
6813 /* disabled for now */
6814 if (attr->mmap2)
6815 return -EINVAL;
6816
6643 if (attr->__reserved_1) 6817 if (attr->__reserved_1)
6644 return -EINVAL; 6818 return -EINVAL;
6645 6819
@@ -6864,17 +7038,14 @@ SYSCALL_DEFINE5(perf_event_open,
6864 7038
6865 if (flags & PERF_FLAG_PID_CGROUP) { 7039 if (flags & PERF_FLAG_PID_CGROUP) {
6866 err = perf_cgroup_connect(pid, event, &attr, group_leader); 7040 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6867 if (err) 7041 if (err) {
6868 goto err_alloc; 7042 __free_event(event);
6869 /* 7043 goto err_task;
6870 * one more event: 7044 }
6871 * - that has cgroup constraint on event->cpu
6872 * - that may need work on context switch
6873 */
6874 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6875 static_key_slow_inc(&perf_sched_events.key);
6876 } 7045 }
6877 7046
7047 account_event(event);
7048
6878 /* 7049 /*
6879 * Special case software events and allow them to be part of 7050 * Special case software events and allow them to be part of
6880 * any hardware group. 7051 * any hardware group.
@@ -6998,7 +7169,6 @@ SYSCALL_DEFINE5(perf_event_open,
6998 } 7169 }
6999 7170
7000 perf_install_in_context(ctx, event, event->cpu); 7171 perf_install_in_context(ctx, event, event->cpu);
7001 ++ctx->generation;
7002 perf_unpin_context(ctx); 7172 perf_unpin_context(ctx);
7003 mutex_unlock(&ctx->mutex); 7173 mutex_unlock(&ctx->mutex);
7004 7174
@@ -7070,6 +7240,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7070 goto err; 7240 goto err;
7071 } 7241 }
7072 7242
7243 account_event(event);
7244
7073 ctx = find_get_context(event->pmu, task, cpu); 7245 ctx = find_get_context(event->pmu, task, cpu);
7074 if (IS_ERR(ctx)) { 7246 if (IS_ERR(ctx)) {
7075 err = PTR_ERR(ctx); 7247 err = PTR_ERR(ctx);
@@ -7079,7 +7251,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7079 WARN_ON_ONCE(ctx->parent_ctx); 7251 WARN_ON_ONCE(ctx->parent_ctx);
7080 mutex_lock(&ctx->mutex); 7252 mutex_lock(&ctx->mutex);
7081 perf_install_in_context(ctx, event, cpu); 7253 perf_install_in_context(ctx, event, cpu);
7082 ++ctx->generation;
7083 perf_unpin_context(ctx); 7254 perf_unpin_context(ctx);
7084 mutex_unlock(&ctx->mutex); 7255 mutex_unlock(&ctx->mutex);
7085 7256
@@ -7106,18 +7277,20 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7106 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 7277 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7107 event_entry) { 7278 event_entry) {
7108 perf_remove_from_context(event); 7279 perf_remove_from_context(event);
7280 unaccount_event_cpu(event, src_cpu);
7109 put_ctx(src_ctx); 7281 put_ctx(src_ctx);
7110 list_add(&event->event_entry, &events); 7282 list_add(&event->migrate_entry, &events);
7111 } 7283 }
7112 mutex_unlock(&src_ctx->mutex); 7284 mutex_unlock(&src_ctx->mutex);
7113 7285
7114 synchronize_rcu(); 7286 synchronize_rcu();
7115 7287
7116 mutex_lock(&dst_ctx->mutex); 7288 mutex_lock(&dst_ctx->mutex);
7117 list_for_each_entry_safe(event, tmp, &events, event_entry) { 7289 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7118 list_del(&event->event_entry); 7290 list_del(&event->migrate_entry);
7119 if (event->state >= PERF_EVENT_STATE_OFF) 7291 if (event->state >= PERF_EVENT_STATE_OFF)
7120 event->state = PERF_EVENT_STATE_INACTIVE; 7292 event->state = PERF_EVENT_STATE_INACTIVE;
7293 account_event_cpu(event, dst_cpu);
7121 perf_install_in_context(dst_ctx, event, dst_cpu); 7294 perf_install_in_context(dst_ctx, event, dst_cpu);
7122 get_ctx(dst_ctx); 7295 get_ctx(dst_ctx);
7123 } 7296 }
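
The migration hunk adds a dedicated migrate_entry node for the "park events on a temporary list while moving them between contexts" step. Reusing event_entry for that scratch list is fragile, because the same list_head is also manipulated (and RCU-traversed) by the context code during removal and re-installation; a separate node sidesteps the overlap. The general rule, reduced to a sketch: an object that must sit on two lists at once needs two list_heads.

        #include <linux/list.h>

        struct item {
                struct list_head entry;          /* long-lived list membership */
                struct list_head migrate_entry;  /* scratch list while moving  */
        };

        static void park_all(struct list_head *src, struct list_head *scratch)
        {
                struct item *it, *tmp;

                /* unlink from the primary list, queue on the scratch list */
                list_for_each_entry_safe(it, tmp, src, entry) {
                        list_del(&it->entry);
                        list_add(&it->migrate_entry, scratch);
                }
        }
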
@@ -7798,7 +7971,8 @@ unlock:
7798device_initcall(perf_event_sysfs_init); 7971device_initcall(perf_event_sysfs_init);
7799 7972
7800#ifdef CONFIG_CGROUP_PERF 7973#ifdef CONFIG_CGROUP_PERF
7801static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) 7974static struct cgroup_subsys_state *
7975perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7802{ 7976{
7803 struct perf_cgroup *jc; 7977 struct perf_cgroup *jc;
7804 7978
@@ -7815,11 +7989,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7815 return &jc->css; 7989 return &jc->css;
7816} 7990}
7817 7991
7818static void perf_cgroup_css_free(struct cgroup *cont) 7992static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
7819{ 7993{
7820 struct perf_cgroup *jc; 7994 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
7821 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7995
7822 struct perf_cgroup, css);
7823 free_percpu(jc->info); 7996 free_percpu(jc->info);
7824 kfree(jc); 7997 kfree(jc);
7825} 7998}
@@ -7831,15 +8004,17 @@ static int __perf_cgroup_move(void *info)
7831 return 0; 8004 return 0;
7832} 8005}
7833 8006
7834static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 8007static void perf_cgroup_attach(struct cgroup_subsys_state *css,
8008 struct cgroup_taskset *tset)
7835{ 8009{
7836 struct task_struct *task; 8010 struct task_struct *task;
7837 8011
7838 cgroup_taskset_for_each(task, cgrp, tset) 8012 cgroup_taskset_for_each(task, css, tset)
7839 task_function_call(task, __perf_cgroup_move, task); 8013 task_function_call(task, __perf_cgroup_move, task);
7840} 8014}
7841 8015
7842static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 8016static void perf_cgroup_exit(struct cgroup_subsys_state *css,
8017 struct cgroup_subsys_state *old_css,
7843 struct task_struct *task) 8018 struct task_struct *task)
7844{ 8019{
7845 /* 8020 /*
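
The cgroup glue at the end of the file tracks the cgroup core's interface change in this cycle: subsystem callbacks now receive a struct cgroup_subsys_state directly (and cgroup_taskset_for_each() takes a css), so the per-subsystem state is one container_of() away instead of being looked up through the cgroup. The shape of such a callback, for a made-up subsystem:

        #include <linux/cgroup.h>
        #include <linux/slab.h>

        struct my_cgroup {
                struct cgroup_subsys_state css;
                /* ... per-cgroup state ... */
        };

        static inline struct my_cgroup *css_to_my(struct cgroup_subsys_state *css)
        {
                return container_of(css, struct my_cgroup, css);
        }

        static void my_css_free(struct cgroup_subsys_state *css)
        {
                kfree(css_to_my(css));
        }
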
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca6599723be5..569b218782ad 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
82} 82}
83 83
84#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ 84#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
85static inline unsigned int \ 85static inline unsigned long \
86func_name(struct perf_output_handle *handle, \ 86func_name(struct perf_output_handle *handle, \
87 const void *buf, unsigned int len) \ 87 const void *buf, unsigned long len) \
88{ \ 88{ \
89 unsigned long size, written; \ 89 unsigned long size, written; \
90 \ 90 \
91 do { \ 91 do { \
92 size = min_t(unsigned long, handle->size, len); \ 92 size = min(handle->size, len); \
93 \
94 written = memcpy_func(handle->addr, buf, size); \ 93 written = memcpy_func(handle->addr, buf, size); \
94 written = size - written; \
95 \ 95 \
96 len -= written; \ 96 len -= written; \
97 handle->addr += written; \ 97 handle->addr += written; \
@@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \
110 return len; \ 110 return len; \
111} 111}
112 112
113static inline int memcpy_common(void *dst, const void *src, size_t n) 113static inline unsigned long
114memcpy_common(void *dst, const void *src, unsigned long n)
114{ 115{
115 memcpy(dst, src, n); 116 memcpy(dst, src, n);
116 return n; 117 return 0;
117} 118}
118 119
119DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) 120DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
120 121
121#define MEMCPY_SKIP(dst, src, n) (n) 122static inline unsigned long
123memcpy_skip(void *dst, const void *src, unsigned long n)
124{
125 return 0;
126}
122 127
123DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) 128DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
124 129
125#ifndef arch_perf_out_copy_user 130#ifndef arch_perf_out_copy_user
126#define arch_perf_out_copy_user __copy_from_user_inatomic 131#define arch_perf_out_copy_user arch_perf_out_copy_user
132
133static inline unsigned long
134arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
135{
136 unsigned long ret;
137
138 pagefault_disable();
139 ret = __copy_from_user_inatomic(dst, src, n);
140 pagefault_enable();
141
142 return ret;
143}
127#endif 144#endif
128 145
129DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) 146DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
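
All three copy helpers used by DEFINE_OUTPUT_COPY() now follow the __copy_from_user_inatomic() convention of returning the number of bytes *not* copied, which is why the macro body gains "written = size - written" and why the user-space variant can simply disable page faults around the copy. A plain-C rendering of a loop with that convention (hypothetical names, no ring-buffer wrap handling):

        #include <string.h>

        /* returns bytes left, i.e. 0 on full success */
        static unsigned long copy_plain(void *dst, const void *src, unsigned long n)
        {
                memcpy(dst, src, n);
                return 0;
        }

        static unsigned long output_copy(char *dst, unsigned long space,
                                         const char *src, unsigned long len)
        {
                while (len && space) {
                        unsigned long size = space < len ? space : len;
                        unsigned long left = copy_plain(dst, src, size);
                        unsigned long written = size - left;    /* as in the macro */

                        len   -= written;
                        src   += written;
                        dst   += written;
                        space -= written;

                        if (left)               /* partial copy, e.g. a fault */
                                break;
                }
                return len;                     /* bytes that never made it */
        }
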
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144270b5..e8b168af135b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,40 +12,10 @@
12#include <linux/perf_event.h> 12#include <linux/perf_event.h>
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/circ_buf.h>
15 16
16#include "internal.h" 17#include "internal.h"
17 18
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long sz = perf_data_size(rb);
22 unsigned long mask = sz - 1;
23
24 /*
25 * check if user-writable
26 * overwrite : over-write its own tail
27 * !overwrite: buffer possibly drops events.
28 */
29 if (rb->overwrite)
30 return true;
31
32 /*
33 * verify that payload is not bigger than buffer
34 * otherwise masking logic may fail to detect
35 * the "not enough space" condition
36 */
37 if ((head - offset) > sz)
38 return false;
39
40 offset = (offset - tail) & mask;
41 head = (head - tail) & mask;
42
43 if ((int)(head - offset) < 0)
44 return false;
45
46 return true;
47}
48
49static void perf_output_wakeup(struct perf_output_handle *handle) 19static void perf_output_wakeup(struct perf_output_handle *handle)
50{ 20{
51 atomic_set(&handle->rb->poll, POLL_IN); 21 atomic_set(&handle->rb->poll, POLL_IN);
@@ -87,15 +57,36 @@ again:
87 goto out; 57 goto out;
88 58
89 /* 59 /*
90 * Publish the known good head. Rely on the full barrier implied 60 * Since the mmap() consumer (userspace) can run on a different CPU:
91 * by atomic_dec_and_test() order the rb->head read and this 61 *
92 * write. 62 * kernel user
63 *
64 * READ ->data_tail READ ->data_head
65 * smp_mb() (A) smp_rmb() (C)
66 * WRITE $data READ $data
67 * smp_wmb() (B) smp_mb() (D)
68 * STORE ->data_head WRITE ->data_tail
69 *
70 * Where A pairs with D, and B pairs with C.
71 *
72 * I don't think A needs to be a full barrier because we won't in fact
73 * write data until we see the store from userspace. So we simply don't
74 * issue the data WRITE until we observe it. Be conservative for now.
75 *
76 * OTOH, D needs to be a full barrier since it separates the data READ
77 * from the tail WRITE.
78 *
79 * For B a WMB is sufficient since it separates two WRITEs, and for C
80 * an RMB is sufficient since it separates two READs.
81 *
82 * See perf_output_begin().
93 */ 83 */
84 smp_wmb();
94 rb->user_page->data_head = head; 85 rb->user_page->data_head = head;
95 86
96 /* 87 /*
97 * Now check if we missed an update, rely on the (compiler) 88 * Now check if we missed an update -- rely on previous implied
98 * barrier in atomic_dec_and_test() to re-read rb->head. 89 * compiler barriers to force a re-read.
99 */ 90 */
100 if (unlikely(head != local_read(&rb->head))) { 91 if (unlikely(head != local_read(&rb->head))) {
101 local_inc(&rb->nest); 92 local_inc(&rb->nest);
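
The comment block above documents the kernel-side pairing; the userspace half (barriers C and D) looks roughly like the sketch below. struct ring_page and ring_read() are illustrative stand-ins for the real perf userpage and a tools/perf-style consumer, and GCC/C11 fences approximate smp_rmb()/smp_mb(); this is a sketch of the protocol, not a drop-in reader.

#include <stdint.h>
#include <stddef.h>

struct ring_page {			/* illustrative userpage layout */
	volatile uint64_t data_head;	/* written by the kernel        */
	volatile uint64_t data_tail;	/* written by userspace         */
};

/* Consume up to buf_len payload bytes from a power-of-two data area. */
static size_t ring_read(struct ring_page *pg, const char *data,
			size_t data_size, char *buf, size_t buf_len)
{
	uint64_t head = pg->data_head;		 /* READ ->data_head        */
	__atomic_thread_fence(__ATOMIC_ACQUIRE); /* (C) rmb: head, then data */

	uint64_t tail = pg->data_tail;
	size_t avail = (size_t)(head - tail);
	size_t n = avail < buf_len ? avail : buf_len;

	for (size_t i = 0; i < n; i++)		 /* READ $data (wrapping)   */
		buf[i] = data[(tail + i) & (data_size - 1)];

	__atomic_thread_fence(__ATOMIC_SEQ_CST); /* (D) mb: data, then tail */
	pg->data_tail = tail + n;		 /* WRITE ->data_tail       */
	return n;
}
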
@@ -114,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle,
114{ 105{
115 struct ring_buffer *rb; 106 struct ring_buffer *rb;
116 unsigned long tail, offset, head; 107 unsigned long tail, offset, head;
117 int have_lost; 108 int have_lost, page_shift;
118 struct perf_sample_data sample_data;
119 struct { 109 struct {
120 struct perf_event_header header; 110 struct perf_event_header header;
121 u64 id; 111 u64 id;
@@ -130,55 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle,
130 event = event->parent; 120 event = event->parent;
131 121
132 rb = rcu_dereference(event->rb); 122 rb = rcu_dereference(event->rb);
133 if (!rb) 123 if (unlikely(!rb))
134 goto out; 124 goto out;
135 125
136 handle->rb = rb; 126 if (unlikely(!rb->nr_pages))
137 handle->event = event;
138
139 if (!rb->nr_pages)
140 goto out; 127 goto out;
141 128
129 handle->rb = rb;
130 handle->event = event;
131
142 have_lost = local_read(&rb->lost); 132 have_lost = local_read(&rb->lost);
143 if (have_lost) { 133 if (unlikely(have_lost)) {
144 lost_event.header.size = sizeof(lost_event); 134 size += sizeof(lost_event);
145 perf_event_header__init_id(&lost_event.header, &sample_data, 135 if (event->attr.sample_id_all)
146 event); 136 size += event->id_header_size;
147 size += lost_event.header.size;
148 } 137 }
149 138
150 perf_output_get_handle(handle); 139 perf_output_get_handle(handle);
151 140
152 do { 141 do {
153 /*
154 * Userspace could choose to issue a mb() before updating the
155 * tail pointer. So that all reads will be completed before the
156 * write is issued.
157 */
158 tail = ACCESS_ONCE(rb->user_page->data_tail); 142 tail = ACCESS_ONCE(rb->user_page->data_tail);
159 smp_rmb();
160 offset = head = local_read(&rb->head); 143 offset = head = local_read(&rb->head);
161 head += size; 144 if (!rb->overwrite &&
162 if (unlikely(!perf_output_space(rb, tail, offset, head))) 145 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
163 goto fail; 146 goto fail;
147 head += size;
164 } while (local_cmpxchg(&rb->head, offset, head) != offset); 148 } while (local_cmpxchg(&rb->head, offset, head) != offset);
165 149
166 if (head - local_read(&rb->wakeup) > rb->watermark) 150 /*
151 * Separate the userpage->tail read from the data stores below.
152 * Matches the MB userspace SHOULD issue after reading the data
153 * and before storing the new tail position.
154 *
155 * See perf_output_put_handle().
156 */
157 smp_mb();
158
159 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
167 local_add(rb->watermark, &rb->wakeup); 160 local_add(rb->watermark, &rb->wakeup);
168 161
169 handle->page = offset >> (PAGE_SHIFT + page_order(rb)); 162 page_shift = PAGE_SHIFT + page_order(rb);
170 handle->page &= rb->nr_pages - 1;
171 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
172 handle->addr = rb->data_pages[handle->page];
173 handle->addr += handle->size;
174 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
175 163
176 if (have_lost) { 164 handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
165 offset &= (1UL << page_shift) - 1;
166 handle->addr = rb->data_pages[handle->page] + offset;
167 handle->size = (1UL << page_shift) - offset;
168
169 if (unlikely(have_lost)) {
170 struct perf_sample_data sample_data;
171
172 lost_event.header.size = sizeof(lost_event);
177 lost_event.header.type = PERF_RECORD_LOST; 173 lost_event.header.type = PERF_RECORD_LOST;
178 lost_event.header.misc = 0; 174 lost_event.header.misc = 0;
179 lost_event.id = event->id; 175 lost_event.id = event->id;
180 lost_event.lost = local_xchg(&rb->lost, 0); 176 lost_event.lost = local_xchg(&rb->lost, 0);
181 177
178 perf_event_header__init_id(&lost_event.header,
179 &sample_data, event);
182 perf_output_put(handle, lost_event); 180 perf_output_put(handle, lost_event);
183 perf_event__output_id_sample(event, handle, &sample_data); 181 perf_event__output_id_sample(event, handle, &sample_data);
184 } 182 }
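
perf_output_begin() now rejects an oversized record with CIRC_SPACE() instead of the removed perf_output_space(). For a power-of-two buffer with free-running head/tail counters the check reduces to the sketch below; reserve() is an illustrative helper, while the CIRC_SPACE() body matches include/linux/circ_buf.h.

#define CIRC_SPACE(head, tail, size) (((tail) - ((head) + 1)) & ((size) - 1))

/*
 * Try to reserve len bytes in a size-byte (power of two) ring.
 * Returns 0 and advances *head, or -1 when the record would overwrite
 * data the consumer has not read yet (the !overwrite case above).
 */
static int reserve(unsigned long *head, unsigned long tail,
		   unsigned long size, unsigned long len)
{
	if (CIRC_SPACE(*head, tail, size) < len)
		return -1;
	*head += len;	/* head stays free-running; it is masked on use */
	return 0;
}
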
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f3569747d629..24b7d6ca871b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -35,6 +35,7 @@
35#include <linux/kdebug.h> /* notifier mechanism */ 35#include <linux/kdebug.h> /* notifier mechanism */
36#include "../../mm/internal.h" /* munlock_vma_page */ 36#include "../../mm/internal.h" /* munlock_vma_page */
37#include <linux/percpu-rwsem.h> 37#include <linux/percpu-rwsem.h>
38#include <linux/task_work.h>
38 39
39#include <linux/uprobes.h> 40#include <linux/uprobes.h>
40 41
@@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
244 * the architecture. If an arch has variable length instruction and the 245 * the architecture. If an arch has variable length instruction and the
245 * breakpoint instruction is not of the smallest length instruction 246 * breakpoint instruction is not of the smallest length instruction
246 * supported by that architecture then we need to modify is_trap_at_addr and 247 * supported by that architecture then we need to modify is_trap_at_addr and
247 * write_opcode accordingly. This would never be a problem for archs that 248 * uprobe_write_opcode accordingly. This would never be a problem for archs
248 * have fixed length instructions. 249 * that have fixed length instructions.
249 */ 250 */
250 251
251/* 252/*
252 * write_opcode - write the opcode at a given virtual address. 253 * uprobe_write_opcode - write the opcode at a given virtual address.
253 * @mm: the probed process address space. 254 * @mm: the probed process address space.
254 * @vaddr: the virtual address to store the opcode. 255 * @vaddr: the virtual address to store the opcode.
255 * @opcode: opcode to be written at @vaddr. 256 * @opcode: opcode to be written at @vaddr.
@@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
260 * For mm @mm, write the opcode at @vaddr. 261 * For mm @mm, write the opcode at @vaddr.
261 * Return 0 (success) or a negative errno. 262 * Return 0 (success) or a negative errno.
262 */ 263 */
263static int write_opcode(struct mm_struct *mm, unsigned long vaddr, 264int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
264 uprobe_opcode_t opcode) 265 uprobe_opcode_t opcode)
265{ 266{
266 struct page *old_page, *new_page; 267 struct page *old_page, *new_page;
@@ -314,7 +315,7 @@ put_old:
314 */ 315 */
315int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 316int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
316{ 317{
317 return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); 318 return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
318} 319}
319 320
320/** 321/**
@@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
329int __weak 330int __weak
330set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 331set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
331{ 332{
332 return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); 333 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
333} 334}
334 335
335static int match_uprobe(struct uprobe *l, struct uprobe *r) 336static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
503 return ret; 504 return ret;
504} 505}
505 506
506static int 507static int __copy_insn(struct address_space *mapping, struct file *filp,
507__copy_insn(struct address_space *mapping, struct file *filp, char *insn, 508 void *insn, int nbytes, loff_t offset)
508 unsigned long nbytes, loff_t offset)
509{ 509{
510 struct page *page; 510 struct page *page;
511 511
@@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
527 527
528static int copy_insn(struct uprobe *uprobe, struct file *filp) 528static int copy_insn(struct uprobe *uprobe, struct file *filp)
529{ 529{
530 struct address_space *mapping; 530 struct address_space *mapping = uprobe->inode->i_mapping;
531 unsigned long nbytes; 531 loff_t offs = uprobe->offset;
532 int bytes; 532 void *insn = uprobe->arch.insn;
533 533 int size = MAX_UINSN_BYTES;
534 nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); 534 int len, err = -EIO;
535 mapping = uprobe->inode->i_mapping;
536 535
537 /* Instruction at end of binary; copy only available bytes */ 536 /* Copy only available bytes, -EIO if nothing was read */
538 if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) 537 do {
539 bytes = uprobe->inode->i_size - uprobe->offset; 538 if (offs >= i_size_read(uprobe->inode))
540 else 539 break;
541 bytes = MAX_UINSN_BYTES;
542 540
543 /* Instruction at the page-boundary; copy bytes in second page */ 541 len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
544 if (nbytes < bytes) { 542 err = __copy_insn(mapping, filp, insn, len, offs);
545 int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
546 bytes - nbytes, uprobe->offset + nbytes);
547 if (err) 543 if (err)
548 return err; 544 break;
549 bytes = nbytes; 545
550 } 546 insn += len;
551 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); 547 offs += len;
548 size -= len;
549 } while (size);
550
551 return err;
552} 552}
553 553
554static int prepare_uprobe(struct uprobe *uprobe, struct file *file, 554static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
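
The rewritten copy_insn() replaces the two-case page-boundary handling with one loop that clamps each chunk to the end of the current page and returns -EIO only if nothing could be read at all. A userspace sketch of that loop follows; read_chunk() stands in for __copy_insn(), PAGE_SZ for PAGE_SIZE, and the backing buffer is assumed to be padded out to a page boundary, as the page cache guarantees for the real code.

#include <string.h>
#include <errno.h>

#define PAGE_SZ 4096UL

/* Stand-in for __copy_insn(); always succeeds for this sketch. */
static int read_chunk(void *dst, const char *src, int nbytes,
		      unsigned long offs)
{
	memcpy(dst, src + offs, nbytes);
	return 0;
}

static int copy_across_pages(const char *backing, unsigned long file_size,
			     void *insn, int size, unsigned long offs)
{
	int err = -EIO;

	/* Copy only available bytes; -EIO if nothing was read at all. */
	do {
		int len;

		if (offs >= file_size)
			break;

		len = size;
		if ((unsigned long)len > PAGE_SZ - (offs & (PAGE_SZ - 1)))
			len = (int)(PAGE_SZ - (offs & (PAGE_SZ - 1)));

		err = read_chunk(insn, backing, len, offs);
		if (err)
			break;

		insn = (char *)insn + len;
		offs += len;
		size -= len;
	} while (size);

	return err;
}
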
@@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
576 if (ret) 576 if (ret)
577 goto out; 577 goto out;
578 578
579 /* write_opcode() assumes we don't cross page boundary */ 579 /* uprobe_write_opcode() assumes we don't cross page boundary */
580 BUG_ON((uprobe->offset & ~PAGE_MASK) + 580 BUG_ON((uprobe->offset & ~PAGE_MASK) +
581 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); 581 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
582 582
@@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1096} 1096}
1097 1097
1098/* Slot allocation for XOL */ 1098/* Slot allocation for XOL */
1099static int xol_add_vma(struct xol_area *area) 1099static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1100{ 1100{
1101 struct mm_struct *mm = current->mm;
1102 int ret = -EALREADY; 1101 int ret = -EALREADY;
1103 1102
1104 down_write(&mm->mmap_sem); 1103 down_write(&mm->mmap_sem);
1105 if (mm->uprobes_state.xol_area) 1104 if (mm->uprobes_state.xol_area)
1106 goto fail; 1105 goto fail;
1107 1106
1108 ret = -ENOMEM; 1107 if (!area->vaddr) {
1109 /* Try to map as high as possible, this is only a hint. */ 1108 /* Try to map as high as possible, this is only a hint. */
1110 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); 1109 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
1111 if (area->vaddr & ~PAGE_MASK) { 1110 PAGE_SIZE, 0, 0);
1112 ret = area->vaddr; 1111 if (area->vaddr & ~PAGE_MASK) {
1113 goto fail; 1112 ret = area->vaddr;
1113 goto fail;
1114 }
1114 } 1115 }
1115 1116
1116 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, 1117 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
@@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area)
1120 1121
1121 smp_wmb(); /* pairs with get_xol_area() */ 1122 smp_wmb(); /* pairs with get_xol_area() */
1122 mm->uprobes_state.xol_area = area; 1123 mm->uprobes_state.xol_area = area;
1123 ret = 0;
1124 fail: 1124 fail:
1125 up_write(&mm->mmap_sem); 1125 up_write(&mm->mmap_sem);
1126 1126
1127 return ret; 1127 return ret;
1128} 1128}
1129 1129
1130/* 1130static struct xol_area *__create_xol_area(unsigned long vaddr)
1131 * get_xol_area - Allocate process's xol_area if necessary.
1132 * This area will be used for storing instructions for execution out of line.
1133 *
1134 * Returns the allocated area or NULL.
1135 */
1136static struct xol_area *get_xol_area(void)
1137{ 1131{
1138 struct mm_struct *mm = current->mm; 1132 struct mm_struct *mm = current->mm;
1139 struct xol_area *area;
1140 uprobe_opcode_t insn = UPROBE_SWBP_INSN; 1133 uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1134 struct xol_area *area;
1141 1135
1142 area = mm->uprobes_state.xol_area; 1136 area = kmalloc(sizeof(*area), GFP_KERNEL);
1143 if (area)
1144 goto ret;
1145
1146 area = kzalloc(sizeof(*area), GFP_KERNEL);
1147 if (unlikely(!area)) 1137 if (unlikely(!area))
1148 goto out; 1138 goto out;
1149 1139
@@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void)
1155 if (!area->page) 1145 if (!area->page)
1156 goto free_bitmap; 1146 goto free_bitmap;
1157 1147
1158 /* allocate first slot of task's xol_area for the return probes */ 1148 area->vaddr = vaddr;
1149 init_waitqueue_head(&area->wq);
1150 /* Reserve the 1st slot for get_trampoline_vaddr() */
1159 set_bit(0, area->bitmap); 1151 set_bit(0, area->bitmap);
1160 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1161 atomic_set(&area->slot_count, 1); 1152 atomic_set(&area->slot_count, 1);
1162 init_waitqueue_head(&area->wq); 1153 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1163 1154
1164 if (!xol_add_vma(area)) 1155 if (!xol_add_vma(mm, area))
1165 return area; 1156 return area;
1166 1157
1167 __free_page(area->page); 1158 __free_page(area->page);
@@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void)
1170 free_area: 1161 free_area:
1171 kfree(area); 1162 kfree(area);
1172 out: 1163 out:
1164 return NULL;
1165}
1166
1167/*
1168 * get_xol_area - Allocate process's xol_area if necessary.
1169 * This area will be used for storing instructions for execution out of line.
1170 *
1171 * Returns the allocated area or NULL.
1172 */
1173static struct xol_area *get_xol_area(void)
1174{
1175 struct mm_struct *mm = current->mm;
1176 struct xol_area *area;
1177
1178 if (!mm->uprobes_state.xol_area)
1179 __create_xol_area(0);
1180
1173 area = mm->uprobes_state.xol_area; 1181 area = mm->uprobes_state.xol_area;
1174 ret: 1182 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1175 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1176 return area; 1183 return area;
1177} 1184}
1178 1185
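
xol_add_vma() publishes the fully initialized area with smp_wmb(), and get_xol_area() pairs that with smp_read_barrier_depends() when it picks the pointer up. The same init-then-publish pattern is sketched below in portable C11 atomics (release store, acquire load); the duplicate-creation race that the kernel resolves under mmap_sem is deliberately ignored, and struct area is an illustrative stand-in for xol_area.

#include <stdatomic.h>
#include <stdlib.h>

struct area {				/* illustrative xol_area stand-in */
	unsigned long vaddr;
	/* ... bitmap, slot page, wait queue ... */
};

static _Atomic(struct area *) xol_area;

static struct area *create_area(unsigned long vaddr)
{
	struct area *a = malloc(sizeof(*a));

	if (!a)
		return NULL;
	a->vaddr = vaddr;				/* initialize first */
	atomic_store_explicit(&xol_area, a,
			      memory_order_release);	/* then publish     */
	return a;
}

static struct area *get_area(void)
{
	struct area *a = atomic_load_explicit(&xol_area,
					      memory_order_acquire);
	if (!a)
		a = create_area(0);
	return a;
}
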
@@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1256 return 0; 1263 return 0;
1257 1264
1258 /* Initialize the slot */ 1265 /* Initialize the slot */
1259 copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); 1266 copy_to_page(area->page, xol_vaddr,
1267 uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1260 /* 1268 /*
1261 * We probably need flush_icache_user_range() but it needs vma. 1269 * We probably need flush_icache_user_range() but it needs vma.
1262 * This should work on supported architectures too. 1270 * This should work on supported architectures too.
@@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t)
1345} 1353}
1346 1354
1347/* 1355/*
1348 * Called in context of a new clone/fork from copy_process.
1349 */
1350void uprobe_copy_process(struct task_struct *t)
1351{
1352 t->utask = NULL;
1353}
1354
1355/*
 1356 * Allocate a uprobe_task object for the task if necessary. 1356 * Allocate a uprobe_task object for the task if necessary.
1357 * Called when the thread hits a breakpoint. 1357 * Called when the thread hits a breakpoint.
1358 * 1358 *
@@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void)
1367 return current->utask; 1367 return current->utask;
1368} 1368}
1369 1369
1370static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
1371{
1372 struct uprobe_task *n_utask;
1373 struct return_instance **p, *o, *n;
1374
1375 n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1376 if (!n_utask)
1377 return -ENOMEM;
1378 t->utask = n_utask;
1379
1380 p = &n_utask->return_instances;
1381 for (o = o_utask->return_instances; o; o = o->next) {
1382 n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
1383 if (!n)
1384 return -ENOMEM;
1385
1386 *n = *o;
1387 atomic_inc(&n->uprobe->ref);
1388 n->next = NULL;
1389
1390 *p = n;
1391 p = &n->next;
1392 n_utask->depth++;
1393 }
1394
1395 return 0;
1396}
1397
1398static void uprobe_warn(struct task_struct *t, const char *msg)
1399{
1400 pr_warn("uprobe: %s:%d failed to %s\n",
1401 current->comm, current->pid, msg);
1402}
1403
1404static void dup_xol_work(struct callback_head *work)
1405{
1406 kfree(work);
1407
1408 if (current->flags & PF_EXITING)
1409 return;
1410
1411 if (!__create_xol_area(current->utask->vaddr))
1412 uprobe_warn(current, "dup xol area");
1413}
1414
1415/*
1416 * Called in context of a new clone/fork from copy_process.
1417 */
1418void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1419{
1420 struct uprobe_task *utask = current->utask;
1421 struct mm_struct *mm = current->mm;
1422 struct callback_head *work;
1423 struct xol_area *area;
1424
1425 t->utask = NULL;
1426
1427 if (!utask || !utask->return_instances)
1428 return;
1429
1430 if (mm == t->mm && !(flags & CLONE_VFORK))
1431 return;
1432
1433 if (dup_utask(t, utask))
1434 return uprobe_warn(t, "dup ret instances");
1435
1436 /* The task can fork() after dup_xol_work() fails */
1437 area = mm->uprobes_state.xol_area;
1438 if (!area)
1439 return uprobe_warn(t, "dup xol area");
1440
1441 if (mm == t->mm)
1442 return;
1443
1444 /* TODO: move it into the union in uprobe_task */
1445 work = kmalloc(sizeof(*work), GFP_KERNEL);
1446 if (!work)
1447 return uprobe_warn(t, "dup xol area");
1448
1449 t->utask->vaddr = area->vaddr;
1450 init_task_work(work, dup_xol_work);
1451 task_work_add(t, work, true);
1452}
1453
1370/* 1454/*
 1371 * Current area->vaddr notion assumes the trampoline address is always 1455 * Current area->vaddr notion assumes the trampoline address is always
 1372 * equal to area->vaddr. 1456 * equal to area->vaddr.
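
dup_utask() above copies the return_instance chain with the classic pointer-to-pointer tail so the clone's list keeps the parent's order in a single pass. A self-contained sketch of that list-duplication idiom, using an illustrative node type rather than the real return_instance:

#include <errno.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

/*
 * Duplicate 'src' into *dst preserving order. On allocation failure the
 * partial copy stays reachable through *dst so the caller can free it,
 * much as dup_utask() leaves t->utask populated for later cleanup.
 */
static int dup_list(struct node **dst, const struct node *src)
{
	struct node **p = dst;
	const struct node *o;

	*dst = NULL;
	for (o = src; o; o = o->next) {
		struct node *n = malloc(sizeof(*n));

		if (!n)
			return -ENOMEM;
		*n = *o;
		n->next = NULL;
		*p = n;		/* append at the tail without re-walking */
		p = &n->next;
	}
	return 0;
}
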
@@ -1682,12 +1766,10 @@ static bool handle_trampoline(struct pt_regs *regs)
1682 tmp = ri; 1766 tmp = ri;
1683 ri = ri->next; 1767 ri = ri->next;
1684 kfree(tmp); 1768 kfree(tmp);
1769 utask->depth--;
1685 1770
1686 if (!chained) 1771 if (!chained)
1687 break; 1772 break;
1688
1689 utask->depth--;
1690
1691 BUG_ON(!ri); 1773 BUG_ON(!ri);
1692 } 1774 }
1693 1775
@@ -1859,9 +1941,4 @@ static int __init init_uprobes(void)
1859 1941
1860 return register_die_notifier(&uprobe_exception_nb); 1942 return register_die_notifier(&uprobe_exception_nb);
1861} 1943}
1862module_init(init_uprobes); 1944__initcall(init_uprobes);
1863
1864static void __exit exit_uprobes(void)
1865{
1866}
1867module_exit(exit_uprobes);
diff --git a/kernel/extable.c b/kernel/extable.c
index 67460b93b1a1..832cb28105bb 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,7 +41,7 @@ u32 __initdata main_extable_sort_needed = 1;
41/* Sort the kernel's built-in exception table */ 41/* Sort the kernel's built-in exception table */
42void __init sort_main_extable(void) 42void __init sort_main_extable(void)
43{ 43{
44 if (main_extable_sort_needed) { 44 if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) {
45 pr_notice("Sorting __ex_table...\n"); 45 pr_notice("Sorting __ex_table...\n");
46 sort_extable(__start___ex_table, __stop___ex_table); 46 sort_extable(__start___ex_table, __stop___ex_table);
47 } 47 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 403d2bb8a968..f6d11fc67f72 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -351,7 +351,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
351 struct rb_node **rb_link, *rb_parent; 351 struct rb_node **rb_link, *rb_parent;
352 int retval; 352 int retval;
353 unsigned long charge; 353 unsigned long charge;
354 struct mempolicy *pol;
355 354
356 uprobe_start_dup_mmap(); 355 uprobe_start_dup_mmap();
357 down_write(&oldmm->mmap_sem); 356 down_write(&oldmm->mmap_sem);
@@ -400,11 +399,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
400 goto fail_nomem; 399 goto fail_nomem;
401 *tmp = *mpnt; 400 *tmp = *mpnt;
402 INIT_LIST_HEAD(&tmp->anon_vma_chain); 401 INIT_LIST_HEAD(&tmp->anon_vma_chain);
403 pol = mpol_dup(vma_policy(mpnt)); 402 retval = vma_dup_policy(mpnt, tmp);
404 retval = PTR_ERR(pol); 403 if (retval)
405 if (IS_ERR(pol))
406 goto fail_nomem_policy; 404 goto fail_nomem_policy;
407 vma_set_policy(tmp, pol);
408 tmp->vm_mm = mm; 405 tmp->vm_mm = mm;
409 if (anon_vma_fork(tmp, mpnt)) 406 if (anon_vma_fork(tmp, mpnt))
410 goto fail_nomem_anon_vma_fork; 407 goto fail_nomem_anon_vma_fork;
@@ -472,7 +469,7 @@ out:
472 uprobe_end_dup_mmap(); 469 uprobe_end_dup_mmap();
473 return retval; 470 return retval;
474fail_nomem_anon_vma_fork: 471fail_nomem_anon_vma_fork:
475 mpol_put(pol); 472 mpol_put(vma_policy(tmp));
476fail_nomem_policy: 473fail_nomem_policy:
477 kmem_cache_free(vm_area_cachep, tmp); 474 kmem_cache_free(vm_area_cachep, tmp);
478fail_nomem: 475fail_nomem:
@@ -522,7 +519,7 @@ static void mm_init_aio(struct mm_struct *mm)
522{ 519{
523#ifdef CONFIG_AIO 520#ifdef CONFIG_AIO
524 spin_lock_init(&mm->ioctx_lock); 521 spin_lock_init(&mm->ioctx_lock);
525 INIT_HLIST_HEAD(&mm->ioctx_list); 522 mm->ioctx_table = NULL;
526#endif 523#endif
527} 524}
528 525
@@ -820,9 +817,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
820#ifdef CONFIG_TRANSPARENT_HUGEPAGE 817#ifdef CONFIG_TRANSPARENT_HUGEPAGE
821 mm->pmd_huge_pte = NULL; 818 mm->pmd_huge_pte = NULL;
822#endif 819#endif
823#ifdef CONFIG_NUMA_BALANCING
824 mm->first_nid = NUMA_PTE_SCAN_INIT;
825#endif
826 if (!mm_init(mm, tsk)) 820 if (!mm_init(mm, tsk))
827 goto fail_nomem; 821 goto fail_nomem;
828 822
@@ -1173,12 +1167,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1173 return ERR_PTR(-EINVAL); 1167 return ERR_PTR(-EINVAL);
1174 1168
1175 /* 1169 /*
1176 * If the new process will be in a different pid namespace 1170 * If the new process will be in a different pid or user namespace
1177 * don't allow the creation of threads. 1171 * do not allow it to share a thread group or signal handlers or
1172 * parent with the forking task.
1178 */ 1173 */
1179 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && 1174 if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) {
1180 (task_active_pid_ns(current) != current->nsproxy->pid_ns)) 1175 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1181 return ERR_PTR(-EINVAL); 1176 (task_active_pid_ns(current) !=
1177 current->nsproxy->pid_ns_for_children))
1178 return ERR_PTR(-EINVAL);
1179 }
1182 1180
1183 retval = security_task_create(clone_flags); 1181 retval = security_task_create(clone_flags);
1184 if (retval) 1182 if (retval)
@@ -1312,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1312#endif 1310#endif
1313 1311
1314 /* Perform scheduler related setup. Assign this task to a CPU. */ 1312 /* Perform scheduler related setup. Assign this task to a CPU. */
1315 sched_fork(p); 1313 sched_fork(clone_flags, p);
1316 1314
1317 retval = perf_event_init_task(p); 1315 retval = perf_event_init_task(p);
1318 if (retval) 1316 if (retval)
@@ -1351,7 +1349,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1351 1349
1352 if (pid != &init_struct_pid) { 1350 if (pid != &init_struct_pid) {
1353 retval = -ENOMEM; 1351 retval = -ENOMEM;
1354 pid = alloc_pid(p->nsproxy->pid_ns); 1352 pid = alloc_pid(p->nsproxy->pid_ns_for_children);
1355 if (!pid) 1353 if (!pid)
1356 goto bad_fork_cleanup_io; 1354 goto bad_fork_cleanup_io;
1357 } 1355 }
@@ -1372,7 +1370,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1372 INIT_LIST_HEAD(&p->pi_state_list); 1370 INIT_LIST_HEAD(&p->pi_state_list);
1373 p->pi_state_cache = NULL; 1371 p->pi_state_cache = NULL;
1374#endif 1372#endif
1375 uprobe_copy_process(p);
1376 /* 1373 /*
1377 * sigaltstack should be cleared when sharing the same VM 1374 * sigaltstack should be cleared when sharing the same VM
1378 */ 1375 */
@@ -1489,6 +1486,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1489 perf_event_fork(p); 1486 perf_event_fork(p);
1490 1487
1491 trace_task_newtask(p, clone_flags); 1488 trace_task_newtask(p, clone_flags);
1489 uprobe_copy_process(p, clone_flags);
1492 1490
1493 return p; 1491 return p;
1494 1492
@@ -1575,15 +1573,6 @@ long do_fork(unsigned long clone_flags,
1575 long nr; 1573 long nr;
1576 1574
1577 /* 1575 /*
1578 * Do some preliminary argument and permissions checking before we
1579 * actually start allocating stuff
1580 */
1581 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
1582 if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
1583 return -EINVAL;
1584 }
1585
1586 /*
1587 * Determine whether and which event to report to ptracer. When 1576 * Determine whether and which event to report to ptracer. When
1588 * called from kernel_thread or CLONE_UNTRACED is explicitly 1577 * called from kernel_thread or CLONE_UNTRACED is explicitly
1589 * requested, no event is reported; otherwise, report if the event 1578 * requested, no event is reported; otherwise, report if the event
@@ -1679,6 +1668,12 @@ SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
1679 int __user *, parent_tidptr, 1668 int __user *, parent_tidptr,
1680 int __user *, child_tidptr, 1669 int __user *, child_tidptr,
1681 int, tls_val) 1670 int, tls_val)
1671#elif defined(CONFIG_CLONE_BACKWARDS3)
1672SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
1673 int, stack_size,
1674 int __user *, parent_tidptr,
1675 int __user *, child_tidptr,
1676 int, tls_val)
1682#else 1677#else
1683SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, 1678SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1684 int __user *, parent_tidptr, 1679 int __user *, parent_tidptr,
@@ -1818,11 +1813,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1818 if (unshare_flags & CLONE_NEWUSER) 1813 if (unshare_flags & CLONE_NEWUSER)
1819 unshare_flags |= CLONE_THREAD | CLONE_FS; 1814 unshare_flags |= CLONE_THREAD | CLONE_FS;
1820 /* 1815 /*
1821 * If unsharing a pid namespace must also unshare the thread.
1822 */
1823 if (unshare_flags & CLONE_NEWPID)
1824 unshare_flags |= CLONE_THREAD;
1825 /*
1826 * If unsharing a thread from a thread group, must also unshare vm. 1816 * If unsharing a thread from a thread group, must also unshare vm.
1827 */ 1817 */
1828 if (unshare_flags & CLONE_THREAD) 1818 if (unshare_flags & CLONE_THREAD)
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 9bd0934f6c33..7a7d2ee96d42 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -74,7 +74,7 @@ static int __init gcov_persist_setup(char *str)
74{ 74{
75 unsigned long val; 75 unsigned long val;
76 76
77 if (strict_strtoul(str, 0, &val)) { 77 if (kstrtoul(str, 0, &val)) {
78 pr_warning("invalid gcov_persist parameter '%s'\n", str); 78 pr_warning("invalid gcov_persist parameter '%s'\n", str);
79 return 0; 79 return 0;
80 } 80 }
diff --git a/kernel/groups.c b/kernel/groups.c
index 6b2588dd04ff..90cf1c38c8ea 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
233 struct group_info *group_info; 233 struct group_info *group_info;
234 int retval; 234 int retval;
235 235
236 if (!nsown_capable(CAP_SETGID)) 236 if (!ns_capable(current_user_ns(), CAP_SETGID))
237 return -EPERM; 237 return -EPERM;
238 if ((unsigned)gidsetsize > NGROUPS_MAX) 238 if ((unsigned)gidsetsize > NGROUPS_MAX)
239 return -EINVAL; 239 return -EINVAL;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 6df614912b9d..3e97fb126e6b 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -15,6 +15,7 @@
15#include <linux/lockdep.h> 15#include <linux/lockdep.h>
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18#include <linux/utsname.h>
18 19
19/* 20/*
20 * The number of tasks checked: 21 * The number of tasks checked:
@@ -99,10 +100,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
99 * Ok, the task did not get scheduled for more than 2 minutes, 100 * Ok, the task did not get scheduled for more than 2 minutes,
100 * complain: 101 * complain:
101 */ 102 */
102 printk(KERN_ERR "INFO: task %s:%d blocked for more than " 103 pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
103 "%ld seconds.\n", t->comm, t->pid, timeout); 104 t->comm, t->pid, timeout);
104 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 105 pr_err(" %s %s %.*s\n",
105 " disables this message.\n"); 106 print_tainted(), init_utsname()->release,
107 (int)strcspn(init_utsname()->version, " "),
108 init_utsname()->version);
109 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
110 " disables this message.\n");
106 sched_show_task(t); 111 sched_show_task(t);
107 debug_show_held_locks(t); 112 debug_show_held_locks(t);
108 113
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d1a758bc972a..4a1fef09f658 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,15 +1,4 @@
1# Select this to activate the generic irq options below
2config HAVE_GENERIC_HARDIRQS
3 bool
4
5if HAVE_GENERIC_HARDIRQS
6menu "IRQ subsystem" 1menu "IRQ subsystem"
7#
8# Interrupt subsystem related configuration options
9#
10config GENERIC_HARDIRQS
11 def_bool y
12
13# Options selectable by the architecture code 2# Options selectable by the architecture code
14 3
15# Make sparse irq Kconfig switch below available 4# Make sparse irq Kconfig switch below available
@@ -84,4 +73,3 @@ config SPARSE_IRQ
84 If you don't know what to do here, say N. 73 If you don't know what to do here, say N.
85 74
86endmenu 75endmenu
87endif
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 706724e9835d..cf68bb36fe58 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -465,27 +465,26 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
465} 465}
466EXPORT_SYMBOL_GPL(irq_create_strict_mappings); 466EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
467 467
468unsigned int irq_create_of_mapping(struct device_node *controller, 468unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
469 const u32 *intspec, unsigned int intsize)
470{ 469{
471 struct irq_domain *domain; 470 struct irq_domain *domain;
472 irq_hw_number_t hwirq; 471 irq_hw_number_t hwirq;
473 unsigned int type = IRQ_TYPE_NONE; 472 unsigned int type = IRQ_TYPE_NONE;
474 unsigned int virq; 473 unsigned int virq;
475 474
476 domain = controller ? irq_find_host(controller) : irq_default_domain; 475 domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
477 if (!domain) { 476 if (!domain) {
478 pr_warn("no irq domain found for %s !\n", 477 pr_warn("no irq domain found for %s !\n",
479 of_node_full_name(controller)); 478 of_node_full_name(irq_data->np));
480 return 0; 479 return 0;
481 } 480 }
482 481
483 /* If domain has no translation, then we assume interrupt line */ 482 /* If domain has no translation, then we assume interrupt line */
484 if (domain->ops->xlate == NULL) 483 if (domain->ops->xlate == NULL)
485 hwirq = intspec[0]; 484 hwirq = irq_data->args[0];
486 else { 485 else {
487 if (domain->ops->xlate(domain, controller, intspec, intsize, 486 if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
488 &hwirq, &type)) 487 irq_data->args_count, &hwirq, &type))
489 return 0; 488 return 0;
490 } 489 }
491 490
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 514bcfd855a8..3e59f951d42f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
956 goto out_mput; 956 goto out_mput;
957 } 957 }
958 958
959 sched_setscheduler(t, SCHED_FIFO, &param); 959 sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
960 960
961 /* 961 /*
962 * We keep the reference to the task struct even if 962 * We keep the reference to the task struct even if
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 60f48fa0fd0d..297a9247a3b3 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -13,6 +13,7 @@
13#include <linux/sort.h> 13#include <linux/sort.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/static_key.h> 15#include <linux/static_key.h>
16#include <linux/jump_label_ratelimit.h>
16 17
17#ifdef HAVE_JUMP_LABEL 18#ifdef HAVE_JUMP_LABEL
18 19
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 59f7b55ba745..2a74f307c5ec 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1474,11 +1474,8 @@ static int __init __parse_crashkernel(char *cmdline,
1474 if (first_colon && (!first_space || first_colon < first_space)) 1474 if (first_colon && (!first_space || first_colon < first_space))
1475 return parse_crashkernel_mem(ck_cmdline, system_ram, 1475 return parse_crashkernel_mem(ck_cmdline, system_ram,
1476 crash_size, crash_base); 1476 crash_size, crash_base);
1477 else
1478 return parse_crashkernel_simple(ck_cmdline, crash_size,
1479 crash_base);
1480 1477
1481 return 0; 1478 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1482} 1479}
1483 1480
1484/* 1481/*
diff --git a/kernel/kmod.c b/kernel/kmod.c
index fb326365b694..b086006c59e7 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -571,6 +571,10 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
571 DECLARE_COMPLETION_ONSTACK(done); 571 DECLARE_COMPLETION_ONSTACK(done);
572 int retval = 0; 572 int retval = 0;
573 573
574 if (!sub_info->path) {
575 call_usermodehelper_freeinfo(sub_info);
576 return -EINVAL;
577 }
574 helper_lock(); 578 helper_lock();
575 if (!khelper_wq || usermodehelper_disabled) { 579 if (!khelper_wq || usermodehelper_disabled) {
576 retval = -EBUSY; 580 retval = -EBUSY;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6e33498d665c..a0d367a49122 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -112,6 +112,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
112struct kprobe_insn_page { 112struct kprobe_insn_page {
113 struct list_head list; 113 struct list_head list;
114 kprobe_opcode_t *insns; /* Page of instruction slots */ 114 kprobe_opcode_t *insns; /* Page of instruction slots */
115 struct kprobe_insn_cache *cache;
115 int nused; 116 int nused;
116 int ngarbage; 117 int ngarbage;
117 char slot_used[]; 118 char slot_used[];
@@ -121,12 +122,6 @@ struct kprobe_insn_page {
121 (offsetof(struct kprobe_insn_page, slot_used) + \ 122 (offsetof(struct kprobe_insn_page, slot_used) + \
122 (sizeof(char) * (slots))) 123 (sizeof(char) * (slots)))
123 124
124struct kprobe_insn_cache {
125 struct list_head pages; /* list of kprobe_insn_page */
126 size_t insn_size; /* size of instruction slot */
127 int nr_garbage;
128};
129
130static int slots_per_page(struct kprobe_insn_cache *c) 125static int slots_per_page(struct kprobe_insn_cache *c)
131{ 126{
132 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); 127 return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
@@ -138,8 +133,20 @@ enum kprobe_slot_state {
138 SLOT_USED = 2, 133 SLOT_USED = 2,
139}; 134};
140 135
141static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ 136static void *alloc_insn_page(void)
142static struct kprobe_insn_cache kprobe_insn_slots = { 137{
138 return module_alloc(PAGE_SIZE);
139}
140
141static void free_insn_page(void *page)
142{
143 module_free(NULL, page);
144}
145
146struct kprobe_insn_cache kprobe_insn_slots = {
147 .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
148 .alloc = alloc_insn_page,
149 .free = free_insn_page,
143 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), 150 .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
144 .insn_size = MAX_INSN_SIZE, 151 .insn_size = MAX_INSN_SIZE,
145 .nr_garbage = 0, 152 .nr_garbage = 0,
@@ -150,10 +157,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
150 * __get_insn_slot() - Find a slot on an executable page for an instruction. 157 * __get_insn_slot() - Find a slot on an executable page for an instruction.
151 * We allocate an executable page if there's no room on existing ones. 158 * We allocate an executable page if there's no room on existing ones.
152 */ 159 */
153static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) 160kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
154{ 161{
155 struct kprobe_insn_page *kip; 162 struct kprobe_insn_page *kip;
163 kprobe_opcode_t *slot = NULL;
156 164
165 mutex_lock(&c->mutex);
157 retry: 166 retry:
158 list_for_each_entry(kip, &c->pages, list) { 167 list_for_each_entry(kip, &c->pages, list) {
159 if (kip->nused < slots_per_page(c)) { 168 if (kip->nused < slots_per_page(c)) {
@@ -162,7 +171,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
162 if (kip->slot_used[i] == SLOT_CLEAN) { 171 if (kip->slot_used[i] == SLOT_CLEAN) {
163 kip->slot_used[i] = SLOT_USED; 172 kip->slot_used[i] = SLOT_USED;
164 kip->nused++; 173 kip->nused++;
165 return kip->insns + (i * c->insn_size); 174 slot = kip->insns + (i * c->insn_size);
175 goto out;
166 } 176 }
167 } 177 }
168 /* kip->nused is broken. Fix it. */ 178 /* kip->nused is broken. Fix it. */
@@ -178,37 +188,29 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
178 /* All out of space. Need to allocate a new page. */ 188 /* All out of space. Need to allocate a new page. */
179 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); 189 kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
180 if (!kip) 190 if (!kip)
181 return NULL; 191 goto out;
182 192
183 /* 193 /*
184 * Use module_alloc so this page is within +/- 2GB of where the 194 * Use module_alloc so this page is within +/- 2GB of where the
185 * kernel image and loaded module images reside. This is required 195 * kernel image and loaded module images reside. This is required
186 * so x86_64 can correctly handle the %rip-relative fixups. 196 * so x86_64 can correctly handle the %rip-relative fixups.
187 */ 197 */
188 kip->insns = module_alloc(PAGE_SIZE); 198 kip->insns = c->alloc();
189 if (!kip->insns) { 199 if (!kip->insns) {
190 kfree(kip); 200 kfree(kip);
191 return NULL; 201 goto out;
192 } 202 }
193 INIT_LIST_HEAD(&kip->list); 203 INIT_LIST_HEAD(&kip->list);
194 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); 204 memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
195 kip->slot_used[0] = SLOT_USED; 205 kip->slot_used[0] = SLOT_USED;
196 kip->nused = 1; 206 kip->nused = 1;
197 kip->ngarbage = 0; 207 kip->ngarbage = 0;
208 kip->cache = c;
198 list_add(&kip->list, &c->pages); 209 list_add(&kip->list, &c->pages);
199 return kip->insns; 210 slot = kip->insns;
200} 211out:
201 212 mutex_unlock(&c->mutex);
202 213 return slot;
203kprobe_opcode_t __kprobes *get_insn_slot(void)
204{
205 kprobe_opcode_t *ret = NULL;
206
207 mutex_lock(&kprobe_insn_mutex);
208 ret = __get_insn_slot(&kprobe_insn_slots);
209 mutex_unlock(&kprobe_insn_mutex);
210
211 return ret;
212} 214}
213 215
214/* Return 1 if all garbages are collected, otherwise 0. */ 216/* Return 1 if all garbages are collected, otherwise 0. */
@@ -225,7 +227,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
225 */ 227 */
226 if (!list_is_singular(&kip->list)) { 228 if (!list_is_singular(&kip->list)) {
227 list_del(&kip->list); 229 list_del(&kip->list);
228 module_free(NULL, kip->insns); 230 kip->cache->free(kip->insns);
229 kfree(kip); 231 kfree(kip);
230 } 232 }
231 return 1; 233 return 1;
@@ -255,11 +257,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
255 return 0; 257 return 0;
256} 258}
257 259
258static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, 260void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
259 kprobe_opcode_t *slot, int dirty) 261 kprobe_opcode_t *slot, int dirty)
260{ 262{
261 struct kprobe_insn_page *kip; 263 struct kprobe_insn_page *kip;
262 264
265 mutex_lock(&c->mutex);
263 list_for_each_entry(kip, &c->pages, list) { 266 list_for_each_entry(kip, &c->pages, list) {
264 long idx = ((long)slot - (long)kip->insns) / 267 long idx = ((long)slot - (long)kip->insns) /
265 (c->insn_size * sizeof(kprobe_opcode_t)); 268 (c->insn_size * sizeof(kprobe_opcode_t));
@@ -272,45 +275,25 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
272 collect_garbage_slots(c); 275 collect_garbage_slots(c);
273 } else 276 } else
274 collect_one_slot(kip, idx); 277 collect_one_slot(kip, idx);
275 return; 278 goto out;
276 } 279 }
277 } 280 }
278 /* Could not free this slot. */ 281 /* Could not free this slot. */
279 WARN_ON(1); 282 WARN_ON(1);
283out:
284 mutex_unlock(&c->mutex);
280} 285}
281 286
282void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
283{
284 mutex_lock(&kprobe_insn_mutex);
285 __free_insn_slot(&kprobe_insn_slots, slot, dirty);
286 mutex_unlock(&kprobe_insn_mutex);
287}
288#ifdef CONFIG_OPTPROBES 287#ifdef CONFIG_OPTPROBES
289/* For optimized_kprobe buffer */ 288/* For optimized_kprobe buffer */
290static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ 289struct kprobe_insn_cache kprobe_optinsn_slots = {
291static struct kprobe_insn_cache kprobe_optinsn_slots = { 290 .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
291 .alloc = alloc_insn_page,
292 .free = free_insn_page,
292 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), 293 .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
293 /* .insn_size is initialized later */ 294 /* .insn_size is initialized later */
294 .nr_garbage = 0, 295 .nr_garbage = 0,
295}; 296};
296/* Get a slot for optimized_kprobe buffer */
297kprobe_opcode_t __kprobes *get_optinsn_slot(void)
298{
299 kprobe_opcode_t *ret = NULL;
300
301 mutex_lock(&kprobe_optinsn_mutex);
302 ret = __get_insn_slot(&kprobe_optinsn_slots);
303 mutex_unlock(&kprobe_optinsn_mutex);
304
305 return ret;
306}
307
308void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)
309{
310 mutex_lock(&kprobe_optinsn_mutex);
311 __free_insn_slot(&kprobe_optinsn_slots, slot, dirty);
312 mutex_unlock(&kprobe_optinsn_mutex);
313}
314#endif 297#endif
315#endif 298#endif
316 299
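
The kprobes change folds the two per-cache mutexes and their get/free wrappers into the cache itself: each kprobe_insn_cache now carries its own mutex plus alloc()/free() callbacks, so one slot implementation serves both the regular and the optimized-probe buffers. A userspace sketch of that shape, with pthread mutexes and malloc standing in for the kernel primitives and purely illustrative names:

#include <pthread.h>
#include <stdlib.h>

struct insn_cache {
	pthread_mutex_t mutex;		/* per-cache, replaces the globals */
	void *(*alloc)(void);		/* backing page allocator          */
	void (*free_page)(void *);	/* matching release hook           */
	size_t insn_size;
};

static void *default_alloc(void) { return malloc(4096); }
static void default_free(void *page) { free(page); }

static struct insn_cache insn_slots = {
	.mutex     = PTHREAD_MUTEX_INITIALIZER,
	.alloc     = default_alloc,
	.free_page = default_free,
	.insn_size = 16,
};

/* __get_insn_slot() analogue: lock the cache it was handed, not a global. */
static void *get_slot(struct insn_cache *c)
{
	void *slot;

	pthread_mutex_lock(&c->mutex);
	slot = c->alloc();	/* the real code reuses partially used pages */
	pthread_mutex_unlock(&c->mutex);
	return slot;
}
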
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6ada93c23a9a..9659d38e008f 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj,
113 unsigned long cnt; 113 unsigned long cnt;
114 int ret; 114 int ret;
115 115
116 if (strict_strtoul(buf, 0, &cnt)) 116 if (kstrtoul(buf, 0, &cnt))
117 return -EINVAL; 117 return -EINVAL;
118 118
119 ret = crash_shrink_memory(cnt); 119 ret = crash_shrink_memory(cnt);
diff --git a/kernel/lglock.c b/kernel/lglock.c
index 6535a667a5a7..86ae2aebf004 100644
--- a/kernel/lglock.c
+++ b/kernel/lglock.c
@@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg)
21 arch_spinlock_t *lock; 21 arch_spinlock_t *lock;
22 22
23 preempt_disable(); 23 preempt_disable();
24 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); 24 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
25 lock = this_cpu_ptr(lg->lock); 25 lock = this_cpu_ptr(lg->lock);
26 arch_spin_lock(lock); 26 arch_spin_lock(lock);
27} 27}
@@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg)
31{ 31{
32 arch_spinlock_t *lock; 32 arch_spinlock_t *lock;
33 33
34 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 34 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
35 lock = this_cpu_ptr(lg->lock); 35 lock = this_cpu_ptr(lg->lock);
36 arch_spin_unlock(lock); 36 arch_spin_unlock(lock);
37 preempt_enable(); 37 preempt_enable();
@@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu)
43 arch_spinlock_t *lock; 43 arch_spinlock_t *lock;
44 44
45 preempt_disable(); 45 preempt_disable();
46 rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); 46 lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
47 lock = per_cpu_ptr(lg->lock, cpu); 47 lock = per_cpu_ptr(lg->lock, cpu);
48 arch_spin_lock(lock); 48 arch_spin_lock(lock);
49} 49}
@@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu)
53{ 53{
54 arch_spinlock_t *lock; 54 arch_spinlock_t *lock;
55 55
56 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 56 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
57 lock = per_cpu_ptr(lg->lock, cpu); 57 lock = per_cpu_ptr(lg->lock, cpu);
58 arch_spin_unlock(lock); 58 arch_spin_unlock(lock);
59 preempt_enable(); 59 preempt_enable();
@@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg)
65 int i; 65 int i;
66 66
67 preempt_disable(); 67 preempt_disable();
68 rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); 68 lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
69 for_each_possible_cpu(i) { 69 for_each_possible_cpu(i) {
70 arch_spinlock_t *lock; 70 arch_spinlock_t *lock;
71 lock = per_cpu_ptr(lg->lock, i); 71 lock = per_cpu_ptr(lg->lock, i);
@@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg)
78{ 78{
79 int i; 79 int i;
80 80
81 rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); 81 lock_release(&lg->lock_dep_map, 1, _RET_IP_);
82 for_each_possible_cpu(i) { 82 for_each_possible_cpu(i) {
83 arch_spinlock_t *lock; 83 arch_spinlock_t *lock;
84 lock = per_cpu_ptr(lg->lock, i); 84 lock = per_cpu_ptr(lg->lock, i);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e16c45b9ee77..4e8e14c34e42 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4224 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", 4224 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4225 !rcu_lockdep_current_cpu_online() 4225 !rcu_lockdep_current_cpu_online()
4226 ? "RCU used illegally from offline CPU!\n" 4226 ? "RCU used illegally from offline CPU!\n"
4227 : rcu_is_cpu_idle() 4227 : !rcu_is_watching()
4228 ? "RCU used illegally from idle CPU!\n" 4228 ? "RCU used illegally from idle CPU!\n"
4229 : "", 4229 : "",
4230 rcu_scheduler_active, debug_locks); 4230 rcu_scheduler_active, debug_locks);
@@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4247 * So complain bitterly if someone does call rcu_read_lock(), 4247 * So complain bitterly if someone does call rcu_read_lock(),
4248 * rcu_read_lock_bh() and so on from extended quiescent states. 4248 * rcu_read_lock_bh() and so on from extended quiescent states.
4249 */ 4249 */
4250 if (rcu_is_cpu_idle()) 4250 if (!rcu_is_watching())
4251 printk("RCU used illegally from extended quiescent state!\n"); 4251 printk("RCU used illegally from extended quiescent state!\n");
4252 4252
4253 lockdep_print_held_locks(curr); 4253 lockdep_print_held_locks(curr);
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
index 2b6e69909c39..7cbd4507a7e6 100644
--- a/kernel/modsign_pubkey.c
+++ b/kernel/modsign_pubkey.c
@@ -18,14 +18,14 @@
18 18
19struct key *modsign_keyring; 19struct key *modsign_keyring;
20 20
21extern __initdata const u8 modsign_certificate_list[]; 21extern __initconst const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[]; 22extern __initconst const u8 modsign_certificate_list_end[];
23 23
24/* 24/*
25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice 25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
26 * if modsign.pub changes. 26 * if modsign.pub changes.
27 */ 27 */
28static __initdata const char annoy_ccache[] = __TIME__ "foo"; 28static __initconst const char annoy_ccache[] = __TIME__ "foo";
29 29
30/* 30/*
31 * Load the compiled-in keys 31 * Load the compiled-in keys
diff --git a/kernel/module.c b/kernel/module.c
index 206915830d29..dc582749fa13 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -136,6 +136,7 @@ static int param_set_bool_enable_only(const char *val,
136} 136}
137 137
138static const struct kernel_param_ops param_ops_bool_enable_only = { 138static const struct kernel_param_ops param_ops_bool_enable_only = {
139 .flags = KERNEL_PARAM_FL_NOARG,
139 .set = param_set_bool_enable_only, 140 .set = param_set_bool_enable_only,
140 .get = param_get_bool, 141 .get = param_get_bool,
141}; 142};
@@ -603,7 +604,7 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \
603static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ 604static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
604 struct module_kobject *mk, char *buffer) \ 605 struct module_kobject *mk, char *buffer) \
605{ \ 606{ \
606 return sprintf(buffer, "%s\n", mk->mod->field); \ 607 return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \
607} \ 608} \
608static int modinfo_##field##_exists(struct module *mod) \ 609static int modinfo_##field##_exists(struct module *mod) \
609{ \ 610{ \
@@ -1611,6 +1612,14 @@ static void module_remove_modinfo_attrs(struct module *mod)
1611 kfree(mod->modinfo_attrs); 1612 kfree(mod->modinfo_attrs);
1612} 1613}
1613 1614
1615static void mod_kobject_put(struct module *mod)
1616{
1617 DECLARE_COMPLETION_ONSTACK(c);
1618 mod->mkobj.kobj_completion = &c;
1619 kobject_put(&mod->mkobj.kobj);
1620 wait_for_completion(&c);
1621}
1622
1614static int mod_sysfs_init(struct module *mod) 1623static int mod_sysfs_init(struct module *mod)
1615{ 1624{
1616 int err; 1625 int err;
@@ -1638,7 +1647,7 @@ static int mod_sysfs_init(struct module *mod)
1638 err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, 1647 err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
1639 "%s", mod->name); 1648 "%s", mod->name);
1640 if (err) 1649 if (err)
1641 kobject_put(&mod->mkobj.kobj); 1650 mod_kobject_put(mod);
1642 1651
1643 /* delay uevent until full sysfs population */ 1652 /* delay uevent until full sysfs population */
1644out: 1653out:
@@ -1682,7 +1691,7 @@ out_unreg_param:
1682out_unreg_holders: 1691out_unreg_holders:
1683 kobject_put(mod->holders_dir); 1692 kobject_put(mod->holders_dir);
1684out_unreg: 1693out_unreg:
1685 kobject_put(&mod->mkobj.kobj); 1694 mod_kobject_put(mod);
1686out: 1695out:
1687 return err; 1696 return err;
1688} 1697}
@@ -1691,7 +1700,7 @@ static void mod_sysfs_fini(struct module *mod)
1691{ 1700{
1692 remove_notes_attrs(mod); 1701 remove_notes_attrs(mod);
1693 remove_sect_attrs(mod); 1702 remove_sect_attrs(mod);
1694 kobject_put(&mod->mkobj.kobj); 1703 mod_kobject_put(mod);
1695} 1704}
1696 1705
1697#else /* !CONFIG_SYSFS */ 1706#else /* !CONFIG_SYSFS */
@@ -2540,21 +2549,20 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
2540/* Sets info->hdr and info->len. */ 2549/* Sets info->hdr and info->len. */
2541static int copy_module_from_fd(int fd, struct load_info *info) 2550static int copy_module_from_fd(int fd, struct load_info *info)
2542{ 2551{
2543 struct file *file; 2552 struct fd f = fdget(fd);
2544 int err; 2553 int err;
2545 struct kstat stat; 2554 struct kstat stat;
2546 loff_t pos; 2555 loff_t pos;
2547 ssize_t bytes = 0; 2556 ssize_t bytes = 0;
2548 2557
2549 file = fget(fd); 2558 if (!f.file)
2550 if (!file)
2551 return -ENOEXEC; 2559 return -ENOEXEC;
2552 2560
2553 err = security_kernel_module_from_file(file); 2561 err = security_kernel_module_from_file(f.file);
2554 if (err) 2562 if (err)
2555 goto out; 2563 goto out;
2556 2564
2557 err = vfs_getattr(&file->f_path, &stat); 2565 err = vfs_getattr(&f.file->f_path, &stat);
2558 if (err) 2566 if (err)
2559 goto out; 2567 goto out;
2560 2568
@@ -2577,7 +2585,7 @@ static int copy_module_from_fd(int fd, struct load_info *info)
2577 2585
2578 pos = 0; 2586 pos = 0;
2579 while (pos < stat.size) { 2587 while (pos < stat.size) {
2580 bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, 2588 bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos,
2581 stat.size - pos); 2589 stat.size - pos);
2582 if (bytes < 0) { 2590 if (bytes < 0) {
2583 vfree(info->hdr); 2591 vfree(info->hdr);
@@ -2591,7 +2599,7 @@ static int copy_module_from_fd(int fd, struct load_info *info)
2591 info->len = pos; 2599 info->len = pos;
2592 2600
2593out: 2601out:
2594 fput(file); 2602 fdput(f);
2595 return err; 2603 return err;
2596} 2604}
2597 2605
diff --git a/kernel/mutex.c b/kernel/mutex.c
index ff05f4bd86eb..d24105b1b794 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -209,11 +209,13 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
209 */ 209 */
210static inline int mutex_can_spin_on_owner(struct mutex *lock) 210static inline int mutex_can_spin_on_owner(struct mutex *lock)
211{ 211{
212 struct task_struct *owner;
212 int retval = 1; 213 int retval = 1;
213 214
214 rcu_read_lock(); 215 rcu_read_lock();
215 if (lock->owner) 216 owner = ACCESS_ONCE(lock->owner);
216 retval = lock->owner->on_cpu; 217 if (owner)
218 retval = owner->on_cpu;
217 rcu_read_unlock(); 219 rcu_read_unlock();
218 /* 220 /*
219 * if lock->owner is not set, the mutex owner may have just acquired 221 * if lock->owner is not set, the mutex owner may have just acquired
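
mutex_can_spin_on_owner() now snapshots lock->owner once with ACCESS_ONCE() and only dereferences that snapshot, so the NULL check and the ->on_cpu read cannot observe two different pointers. Below is a sketch of the same single-load idiom; the RCU protection that keeps the task struct alive in the kernel is elided, and a C11 relaxed load stands in for ACCESS_ONCE().

#include <stdatomic.h>
#include <stdbool.h>

struct task { _Atomic int on_cpu; };

static _Atomic(struct task *) lock_owner;

static bool can_spin_on_owner(void)
{
	/* One load: either we use this snapshot or we see NULL; we never
	 * dereference a pointer re-read after the NULL check. */
	struct task *owner = atomic_load_explicit(&lock_owner,
						  memory_order_relaxed);

	if (owner)
		return atomic_load(&owner->on_cpu) != 0;
	return true;	/* no owner: it may have just released, keep spinning */
}
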
@@ -408,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
408static __always_inline int __sched 410static __always_inline int __sched
409__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 411__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
410 struct lockdep_map *nest_lock, unsigned long ip, 412 struct lockdep_map *nest_lock, unsigned long ip,
411 struct ww_acquire_ctx *ww_ctx) 413 struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
412{ 414{
413 struct task_struct *task = current; 415 struct task_struct *task = current;
414 struct mutex_waiter waiter; 416 struct mutex_waiter waiter;
@@ -448,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
448 struct task_struct *owner; 450 struct task_struct *owner;
449 struct mspin_node node; 451 struct mspin_node node;
450 452
451 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { 453 if (use_ww_ctx && ww_ctx->acquired > 0) {
452 struct ww_mutex *ww; 454 struct ww_mutex *ww;
453 455
454 ww = container_of(lock, struct ww_mutex, base); 456 ww = container_of(lock, struct ww_mutex, base);
@@ -461,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
461 * performed the optimistic spinning cannot be done. 463 * performed the optimistic spinning cannot be done.
462 */ 464 */
463 if (ACCESS_ONCE(ww->ctx)) 465 if (ACCESS_ONCE(ww->ctx))
464 break; 466 goto slowpath;
465 } 467 }
466 468
467 /* 469 /*
@@ -472,13 +474,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
472 owner = ACCESS_ONCE(lock->owner); 474 owner = ACCESS_ONCE(lock->owner);
473 if (owner && !mutex_spin_on_owner(lock, owner)) { 475 if (owner && !mutex_spin_on_owner(lock, owner)) {
474 mspin_unlock(MLOCK(lock), &node); 476 mspin_unlock(MLOCK(lock), &node);
475 break; 477 goto slowpath;
476 } 478 }
477 479
478 if ((atomic_read(&lock->count) == 1) && 480 if ((atomic_read(&lock->count) == 1) &&
479 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 481 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
480 lock_acquired(&lock->dep_map, ip); 482 lock_acquired(&lock->dep_map, ip);
481 if (!__builtin_constant_p(ww_ctx == NULL)) { 483 if (use_ww_ctx) {
482 struct ww_mutex *ww; 484 struct ww_mutex *ww;
483 ww = container_of(lock, struct ww_mutex, base); 485 ww = container_of(lock, struct ww_mutex, base);
484 486
@@ -499,7 +501,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
499 * the owner complete. 501 * the owner complete.
500 */ 502 */
501 if (!owner && (need_resched() || rt_task(task))) 503 if (!owner && (need_resched() || rt_task(task)))
502 break; 504 goto slowpath;
503 505
504 /* 506 /*
505 * The cpu_relax() call is a compiler barrier which forces 507 * The cpu_relax() call is a compiler barrier which forces
@@ -513,6 +515,10 @@ slowpath:
513#endif 515#endif
514 spin_lock_mutex(&lock->wait_lock, flags); 516 spin_lock_mutex(&lock->wait_lock, flags);
515 517
518 /* once more, can we acquire the lock? */
519 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1))
520 goto skip_wait;
521
516 debug_mutex_lock_common(lock, &waiter); 522 debug_mutex_lock_common(lock, &waiter);
517 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); 523 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
518 524
@@ -520,9 +526,6 @@ slowpath:
520 list_add_tail(&waiter.list, &lock->wait_list); 526 list_add_tail(&waiter.list, &lock->wait_list);
521 waiter.task = task; 527 waiter.task = task;
522 528
523 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
524 goto done;
525
526 lock_contended(&lock->dep_map, ip); 529 lock_contended(&lock->dep_map, ip);
527 530
528 for (;;) { 531 for (;;) {
@@ -536,7 +539,7 @@ slowpath:
536 * other waiters: 539 * other waiters:
537 */ 540 */
538 if (MUTEX_SHOW_NO_WAITER(lock) && 541 if (MUTEX_SHOW_NO_WAITER(lock) &&
539 (atomic_xchg(&lock->count, -1) == 1)) 542 (atomic_xchg(&lock->count, -1) == 1))
540 break; 543 break;
541 544
542 /* 545 /*
@@ -548,7 +551,7 @@ slowpath:
548 goto err; 551 goto err;
549 } 552 }
550 553
551 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { 554 if (use_ww_ctx && ww_ctx->acquired > 0) {
552 ret = __mutex_lock_check_stamp(lock, ww_ctx); 555 ret = __mutex_lock_check_stamp(lock, ww_ctx);
553 if (ret) 556 if (ret)
554 goto err; 557 goto err;
@@ -561,24 +564,25 @@ slowpath:
561 schedule_preempt_disabled(); 564 schedule_preempt_disabled();
562 spin_lock_mutex(&lock->wait_lock, flags); 565 spin_lock_mutex(&lock->wait_lock, flags);
563 } 566 }
567 mutex_remove_waiter(lock, &waiter, current_thread_info());
568 /* set it to 0 if there are no waiters left: */
569 if (likely(list_empty(&lock->wait_list)))
570 atomic_set(&lock->count, 0);
571 debug_mutex_free_waiter(&waiter);
564 572
565done: 573skip_wait:
574 /* got the lock - cleanup and rejoice! */
566 lock_acquired(&lock->dep_map, ip); 575 lock_acquired(&lock->dep_map, ip);
567 /* got the lock - rejoice! */
568 mutex_remove_waiter(lock, &waiter, current_thread_info());
569 mutex_set_owner(lock); 576 mutex_set_owner(lock);
570 577
571 if (!__builtin_constant_p(ww_ctx == NULL)) { 578 if (use_ww_ctx) {
572 struct ww_mutex *ww = container_of(lock, 579 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
573 struct ww_mutex,
574 base);
575 struct mutex_waiter *cur; 580 struct mutex_waiter *cur;
576 581
577 /* 582 /*
578 * This branch gets optimized out for the common case, 583 * This branch gets optimized out for the common case,
579 * and is only important for ww_mutex_lock. 584 * and is only important for ww_mutex_lock.
580 */ 585 */
581
582 ww_mutex_lock_acquired(ww, ww_ctx); 586 ww_mutex_lock_acquired(ww, ww_ctx);
583 ww->ctx = ww_ctx; 587 ww->ctx = ww_ctx;
584 588
@@ -592,15 +596,8 @@ done:
592 } 596 }
593 } 597 }
594 598
595 /* set it to 0 if there are no waiters left: */
596 if (likely(list_empty(&lock->wait_list)))
597 atomic_set(&lock->count, 0);
598
599 spin_unlock_mutex(&lock->wait_lock, flags); 599 spin_unlock_mutex(&lock->wait_lock, flags);
600
601 debug_mutex_free_waiter(&waiter);
602 preempt_enable(); 600 preempt_enable();
603
604 return 0; 601 return 0;
605 602
606err: 603err:
@@ -618,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
618{ 615{
619 might_sleep(); 616 might_sleep();
620 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 617 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
621 subclass, NULL, _RET_IP_, NULL); 618 subclass, NULL, _RET_IP_, NULL, 0);
622} 619}
623 620
624EXPORT_SYMBOL_GPL(mutex_lock_nested); 621EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -628,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
628{ 625{
629 might_sleep(); 626 might_sleep();
630 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 627 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
631 0, nest, _RET_IP_, NULL); 628 0, nest, _RET_IP_, NULL, 0);
632} 629}
633 630
634EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); 631EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
@@ -638,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
638{ 635{
639 might_sleep(); 636 might_sleep();
640 return __mutex_lock_common(lock, TASK_KILLABLE, 637 return __mutex_lock_common(lock, TASK_KILLABLE,
641 subclass, NULL, _RET_IP_, NULL); 638 subclass, NULL, _RET_IP_, NULL, 0);
642} 639}
643EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 640EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
644 641
@@ -647,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
647{ 644{
648 might_sleep(); 645 might_sleep();
649 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 646 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
650 subclass, NULL, _RET_IP_, NULL); 647 subclass, NULL, _RET_IP_, NULL, 0);
651} 648}
652 649
653EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 650EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -685,8 +682,8 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
685 682
686 might_sleep(); 683 might_sleep();
687 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 684 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
688 0, &ctx->dep_map, _RET_IP_, ctx); 685 0, &ctx->dep_map, _RET_IP_, ctx, 1);
689 if (!ret && ctx->acquired > 0) 686 if (!ret && ctx->acquired > 1)
690 return ww_mutex_deadlock_injection(lock, ctx); 687 return ww_mutex_deadlock_injection(lock, ctx);
691 688
692 return ret; 689 return ret;
@@ -700,9 +697,9 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
700 697
701 might_sleep(); 698 might_sleep();
702 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 699 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
703 0, &ctx->dep_map, _RET_IP_, ctx); 700 0, &ctx->dep_map, _RET_IP_, ctx, 1);
704 701
705 if (!ret && ctx->acquired > 0) 702 if (!ret && ctx->acquired > 1)
706 return ww_mutex_deadlock_injection(lock, ctx); 703 return ww_mutex_deadlock_injection(lock, ctx);
707 704
708 return ret; 705 return ret;
@@ -812,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count)
812 struct mutex *lock = container_of(lock_count, struct mutex, count); 809 struct mutex *lock = container_of(lock_count, struct mutex, count);
813 810
814 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, 811 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
815 NULL, _RET_IP_, NULL); 812 NULL, _RET_IP_, NULL, 0);
816} 813}
817 814
818static noinline int __sched 815static noinline int __sched
819__mutex_lock_killable_slowpath(struct mutex *lock) 816__mutex_lock_killable_slowpath(struct mutex *lock)
820{ 817{
821 return __mutex_lock_common(lock, TASK_KILLABLE, 0, 818 return __mutex_lock_common(lock, TASK_KILLABLE, 0,
822 NULL, _RET_IP_, NULL); 819 NULL, _RET_IP_, NULL, 0);
823} 820}
824 821
825static noinline int __sched 822static noinline int __sched
826__mutex_lock_interruptible_slowpath(struct mutex *lock) 823__mutex_lock_interruptible_slowpath(struct mutex *lock)
827{ 824{
828 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, 825 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
829 NULL, _RET_IP_, NULL); 826 NULL, _RET_IP_, NULL, 0);
830} 827}
831 828
832static noinline int __sched 829static noinline int __sched
833__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) 830__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
834{ 831{
835 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, 832 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
836 NULL, _RET_IP_, ctx); 833 NULL, _RET_IP_, ctx, 1);
837} 834}
838 835
839static noinline int __sched 836static noinline int __sched
@@ -841,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
841 struct ww_acquire_ctx *ctx) 838 struct ww_acquire_ctx *ctx)
842{ 839{
843 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, 840 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
844 NULL, _RET_IP_, ctx); 841 NULL, _RET_IP_, ctx, 1);
845} 842}
846 843
847#endif 844#endif
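Passing ww_ctx together with an explicit const bool use_ww_ctx replaces the earlier __builtin_constant_p(ww_ctx == NULL) test: every caller hands in a literal 0 or 1, and because __mutex_lock_common() is __always_inline the compiler folds the flag and discards the wait/wound branches in the plain-mutex paths. A generic sketch of that compile-time-flag pattern, with hypothetical names:

#include <linux/compiler.h>	/* __always_inline */

static __always_inline int do_lock(void *ww_ctx, const bool use_ww_ctx)
{
	if (use_ww_ctx && ww_ctx)	/* dead code when the caller passes 0 */
		return 1;		/* wait/wound bookkeeping would go here */
	return 0;
}

static int plain_lock(void)   { return do_lock(NULL, 0); }	/* ww branches compiled out */
static int ww_lock(void *ctx) { return do_lock(ctx, 1);  }	/* ww branches kept */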
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 364ceab15f0c..8e7811086b82 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -29,15 +29,15 @@
29static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
30 30
31struct nsproxy init_nsproxy = { 31struct nsproxy init_nsproxy = {
32 .count = ATOMIC_INIT(1), 32 .count = ATOMIC_INIT(1),
33 .uts_ns = &init_uts_ns, 33 .uts_ns = &init_uts_ns,
34#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) 34#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
35 .ipc_ns = &init_ipc_ns, 35 .ipc_ns = &init_ipc_ns,
36#endif 36#endif
37 .mnt_ns = NULL, 37 .mnt_ns = NULL,
38 .pid_ns = &init_pid_ns, 38 .pid_ns_for_children = &init_pid_ns,
39#ifdef CONFIG_NET 39#ifdef CONFIG_NET
40 .net_ns = &init_net, 40 .net_ns = &init_net,
41#endif 41#endif
42}; 42};
43 43
@@ -85,9 +85,10 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
85 goto out_ipc; 85 goto out_ipc;
86 } 86 }
87 87
88 new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns); 88 new_nsp->pid_ns_for_children =
89 if (IS_ERR(new_nsp->pid_ns)) { 89 copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns_for_children);
90 err = PTR_ERR(new_nsp->pid_ns); 90 if (IS_ERR(new_nsp->pid_ns_for_children)) {
91 err = PTR_ERR(new_nsp->pid_ns_for_children);
91 goto out_pid; 92 goto out_pid;
92 } 93 }
93 94
@@ -100,8 +101,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
100 return new_nsp; 101 return new_nsp;
101 102
102out_net: 103out_net:
103 if (new_nsp->pid_ns) 104 if (new_nsp->pid_ns_for_children)
104 put_pid_ns(new_nsp->pid_ns); 105 put_pid_ns(new_nsp->pid_ns_for_children);
105out_pid: 106out_pid:
106 if (new_nsp->ipc_ns) 107 if (new_nsp->ipc_ns)
107 put_ipc_ns(new_nsp->ipc_ns); 108 put_ipc_ns(new_nsp->ipc_ns);
@@ -125,22 +126,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
125 struct nsproxy *old_ns = tsk->nsproxy; 126 struct nsproxy *old_ns = tsk->nsproxy;
126 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); 127 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
127 struct nsproxy *new_ns; 128 struct nsproxy *new_ns;
128 int err = 0;
129 129
130 if (!old_ns) 130 if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
131 CLONE_NEWPID | CLONE_NEWNET)))) {
132 get_nsproxy(old_ns);
131 return 0; 133 return 0;
132
133 get_nsproxy(old_ns);
134
135 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
136 CLONE_NEWPID | CLONE_NEWNET)))
137 return 0;
138
139 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
140 err = -EPERM;
141 goto out;
142 } 134 }
143 135
136 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
137 return -EPERM;
138
144 /* 139 /*
145 * CLONE_NEWIPC must detach from the undolist: after switching 140 * CLONE_NEWIPC must detach from the undolist: after switching
146 * to a new ipc namespace, the semaphore arrays from the old 141 * to a new ipc namespace, the semaphore arrays from the old
@@ -148,22 +143,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
148 * means share undolist with parent, so we must forbid using 143 * means share undolist with parent, so we must forbid using
149 * it along with CLONE_NEWIPC. 144 * it along with CLONE_NEWIPC.
150 */ 145 */
151 if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { 146 if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
152 err = -EINVAL; 147 (CLONE_NEWIPC | CLONE_SYSVSEM))
153 goto out; 148 return -EINVAL;
154 }
155 149
156 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); 150 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
157 if (IS_ERR(new_ns)) { 151 if (IS_ERR(new_ns))
158 err = PTR_ERR(new_ns); 152 return PTR_ERR(new_ns);
159 goto out;
160 }
161 153
162 tsk->nsproxy = new_ns; 154 tsk->nsproxy = new_ns;
163 155 return 0;
164out:
165 put_nsproxy(old_ns);
166 return err;
167} 156}
168 157
169void free_nsproxy(struct nsproxy *ns) 158void free_nsproxy(struct nsproxy *ns)
@@ -174,8 +163,8 @@ void free_nsproxy(struct nsproxy *ns)
174 put_uts_ns(ns->uts_ns); 163 put_uts_ns(ns->uts_ns);
175 if (ns->ipc_ns) 164 if (ns->ipc_ns)
176 put_ipc_ns(ns->ipc_ns); 165 put_ipc_ns(ns->ipc_ns);
177 if (ns->pid_ns) 166 if (ns->pid_ns_for_children)
178 put_pid_ns(ns->pid_ns); 167 put_pid_ns(ns->pid_ns_for_children);
179 put_net(ns->net_ns); 168 put_net(ns->net_ns);
180 kmem_cache_free(nsproxy_cachep, ns); 169 kmem_cache_free(nsproxy_cachep, ns);
181} 170}
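Besides the pid_ns_for_children rename, the copy_namespaces() rewrite above folds the CLONE_NEWIPC/CLONE_SYSVSEM conflict test into a single mask comparison and lets the error paths return directly instead of unwinding through a shared out: label. The mask idiom, sketched with generic flag names:

#define WANT_A	(1UL << 0)
#define WANT_B	(1UL << 1)

/* true only when both bits are set; one compare instead of two nested tests */
static int flags_conflict(unsigned long flags)
{
	return (flags & (WANT_A | WANT_B)) == (WANT_A | WANT_B);
}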
diff --git a/kernel/padata.c b/kernel/padata.c
index 072f4ee4eb89..07af2c95dcfe 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -846,6 +846,8 @@ static int padata_cpu_callback(struct notifier_block *nfb,
846 switch (action) { 846 switch (action) {
847 case CPU_ONLINE: 847 case CPU_ONLINE:
848 case CPU_ONLINE_FROZEN: 848 case CPU_ONLINE_FROZEN:
849 case CPU_DOWN_FAILED:
850 case CPU_DOWN_FAILED_FROZEN:
849 if (!pinst_has_cpu(pinst, cpu)) 851 if (!pinst_has_cpu(pinst, cpu))
850 break; 852 break;
851 mutex_lock(&pinst->lock); 853 mutex_lock(&pinst->lock);
@@ -857,6 +859,8 @@ static int padata_cpu_callback(struct notifier_block *nfb,
857 859
858 case CPU_DOWN_PREPARE: 860 case CPU_DOWN_PREPARE:
859 case CPU_DOWN_PREPARE_FROZEN: 861 case CPU_DOWN_PREPARE_FROZEN:
862 case CPU_UP_CANCELED:
863 case CPU_UP_CANCELED_FROZEN:
860 if (!pinst_has_cpu(pinst, cpu)) 864 if (!pinst_has_cpu(pinst, cpu))
861 break; 865 break;
862 mutex_lock(&pinst->lock); 866 mutex_lock(&pinst->lock);
@@ -865,22 +869,6 @@ static int padata_cpu_callback(struct notifier_block *nfb,
865 if (err) 869 if (err)
866 return notifier_from_errno(err); 870 return notifier_from_errno(err);
867 break; 871 break;
868
869 case CPU_UP_CANCELED:
870 case CPU_UP_CANCELED_FROZEN:
871 if (!pinst_has_cpu(pinst, cpu))
872 break;
873 mutex_lock(&pinst->lock);
874 __padata_remove_cpu(pinst, cpu);
875 mutex_unlock(&pinst->lock);
876
877 case CPU_DOWN_FAILED:
878 case CPU_DOWN_FAILED_FROZEN:
879 if (!pinst_has_cpu(pinst, cpu))
880 break;
881 mutex_lock(&pinst->lock);
882 __padata_add_cpu(pinst, cpu);
883 mutex_unlock(&pinst->lock);
884 } 872 }
885 873
886 return NOTIFY_OK; 874 return NOTIFY_OK;
@@ -1086,18 +1074,18 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq,
1086 1074
1087 pinst->flags = 0; 1075 pinst->flags = 0;
1088 1076
1089#ifdef CONFIG_HOTPLUG_CPU
1090 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
1091 pinst->cpu_notifier.priority = 0;
1092 register_hotcpu_notifier(&pinst->cpu_notifier);
1093#endif
1094
1095 put_online_cpus(); 1077 put_online_cpus();
1096 1078
1097 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); 1079 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
1098 kobject_init(&pinst->kobj, &padata_attr_type); 1080 kobject_init(&pinst->kobj, &padata_attr_type);
1099 mutex_init(&pinst->lock); 1081 mutex_init(&pinst->lock);
1100 1082
1083#ifdef CONFIG_HOTPLUG_CPU
1084 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
1085 pinst->cpu_notifier.priority = 0;
1086 register_hotcpu_notifier(&pinst->cpu_notifier);
1087#endif
1088
1101 return pinst; 1089 return pinst;
1102 1090
1103err_free_masks: 1091err_free_masks:
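The padata notifier rework above merges CPU_DOWN_FAILED into the "add the CPU" cases and CPU_UP_CANCELED into the "remove the CPU" cases, dropping two duplicated blocks (one of which fell through without a break), and it now registers the hotplug notifier only after the instance is fully initialised. A minimal sketch of grouping hotplug actions that want identical handling (hypothetical callback, not the padata code):

#include <linux/cpu.h>
#include <linux/notifier.h>

static int my_cpu_callback(struct notifier_block *nb, unsigned long action,
			   void *hcpu)
{
	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
	case CPU_DOWN_FAILED:		/* a failed offline re-adds the CPU */
	case CPU_DOWN_FAILED_FROZEN:
		/* ...add the CPU to the instance... */
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_UP_CANCELED:		/* a cancelled online removes it again */
	case CPU_UP_CANCELED_FROZEN:
		/* ...remove the CPU from the instance... */
		break;
	}
	return NOTIFY_OK;
}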
diff --git a/kernel/panic.c b/kernel/panic.c
index 801864600514..b6c482ccc5db 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -123,10 +123,14 @@ void panic(const char *fmt, ...)
123 */ 123 */
124 smp_send_stop(); 124 smp_send_stop();
125 125
126 kmsg_dump(KMSG_DUMP_PANIC); 126 /*
127 127 * Run any panic handlers, including those that might need to
128 * add information to the kmsg dump output.
129 */
128 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 130 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
129 131
132 kmsg_dump(KMSG_DUMP_PANIC);
133
130 bust_spinlocks(0); 134 bust_spinlocks(0);
131 135
132 if (!panic_blink) 136 if (!panic_blink)
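Running the panic notifier chain before kmsg_dump() means anything a handler prints now ends up in the dumped log, as the added comment says. A minimal sketch of such a handler, with hypothetical names; panic notifiers hang off the exported panic_notifier_list and receive the panic message as the void * argument:

#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static int my_panic_note(struct notifier_block *nb, unsigned long event,
			 void *msg)
{
	pr_emerg("extra panic context, panic reason: %s\n", (const char *)msg);
	return NOTIFY_DONE;
}

static struct notifier_block my_panic_nb = {
	.notifier_call = my_panic_note,
};

/* registration, e.g. from an __init function:
 *	atomic_notifier_chain_register(&panic_notifier_list, &my_panic_nb);
 */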
diff --git a/kernel/params.c b/kernel/params.c
index 440e65d1a544..c00d5b502aa4 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -103,8 +103,8 @@ static int parse_one(char *param,
103 || params[i].level > max_level) 103 || params[i].level > max_level)
104 return 0; 104 return 0;
105 /* No one handled NULL, so do it here. */ 105 /* No one handled NULL, so do it here. */
106 if (!val && params[i].ops->set != param_set_bool 106 if (!val &&
107 && params[i].ops->set != param_set_bint) 107 !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG))
108 return -EINVAL; 108 return -EINVAL;
109 pr_debug("handling %s with %p\n", param, 109 pr_debug("handling %s with %p\n", param,
110 params[i].ops->set); 110 params[i].ops->set);
@@ -241,7 +241,8 @@ int parse_args(const char *doing,
241 } \ 241 } \
242 int param_get_##name(char *buffer, const struct kernel_param *kp) \ 242 int param_get_##name(char *buffer, const struct kernel_param *kp) \
243 { \ 243 { \
244 return sprintf(buffer, format, *((type *)kp->arg)); \ 244 return scnprintf(buffer, PAGE_SIZE, format, \
245 *((type *)kp->arg)); \
245 } \ 246 } \
246 struct kernel_param_ops param_ops_##name = { \ 247 struct kernel_param_ops param_ops_##name = { \
247 .set = param_set_##name, \ 248 .set = param_set_##name, \
@@ -252,13 +253,13 @@ int parse_args(const char *doing,
252 EXPORT_SYMBOL(param_ops_##name) 253 EXPORT_SYMBOL(param_ops_##name)
253 254
254 255
255STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); 256STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul);
256STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); 257STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol);
257STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); 258STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul);
258STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); 259STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol);
259STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); 260STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul);
260STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); 261STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol);
261STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); 262STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul);
262 263
263int param_set_charp(const char *val, const struct kernel_param *kp) 264int param_set_charp(const char *val, const struct kernel_param *kp)
264{ 265{
@@ -285,7 +286,7 @@ EXPORT_SYMBOL(param_set_charp);
285 286
286int param_get_charp(char *buffer, const struct kernel_param *kp) 287int param_get_charp(char *buffer, const struct kernel_param *kp)
287{ 288{
288 return sprintf(buffer, "%s", *((char **)kp->arg)); 289 return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg));
289} 290}
290EXPORT_SYMBOL(param_get_charp); 291EXPORT_SYMBOL(param_get_charp);
291 292
@@ -320,6 +321,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
320EXPORT_SYMBOL(param_get_bool); 321EXPORT_SYMBOL(param_get_bool);
321 322
322struct kernel_param_ops param_ops_bool = { 323struct kernel_param_ops param_ops_bool = {
324 .flags = KERNEL_PARAM_FL_NOARG,
323 .set = param_set_bool, 325 .set = param_set_bool,
324 .get = param_get_bool, 326 .get = param_get_bool,
325}; 327};
@@ -370,6 +372,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
370EXPORT_SYMBOL(param_set_bint); 372EXPORT_SYMBOL(param_set_bint);
371 373
372struct kernel_param_ops param_ops_bint = { 374struct kernel_param_ops param_ops_bint = {
375 .flags = KERNEL_PARAM_FL_NOARG,
373 .set = param_set_bint, 376 .set = param_set_bint,
374 .get = param_get_int, 377 .get = param_get_int,
375}; 378};
@@ -827,7 +830,7 @@ ssize_t __modver_version_show(struct module_attribute *mattr,
827 struct module_version_attribute *vattr = 830 struct module_version_attribute *vattr =
828 container_of(mattr, struct module_version_attribute, mattr); 831 container_of(mattr, struct module_version_attribute, mattr);
829 832
830 return sprintf(buf, "%s\n", vattr->version); 833 return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version);
831} 834}
832 835
833extern const struct module_version_attribute *__start___modver[]; 836extern const struct module_version_attribute *__start___modver[];
@@ -912,7 +915,14 @@ static const struct kset_uevent_ops module_uevent_ops = {
912struct kset *module_kset; 915struct kset *module_kset;
913int module_sysfs_initialized; 916int module_sysfs_initialized;
914 917
918static void module_kobj_release(struct kobject *kobj)
919{
920 struct module_kobject *mk = to_module_kobject(kobj);
921 complete(mk->kobj_completion);
922}
923
915struct kobj_type module_ktype = { 924struct kobj_type module_ktype = {
925 .release = module_kobj_release,
916 .sysfs_ops = &module_sysfs_ops, 926 .sysfs_ops = &module_sysfs_ops,
917}; 927};
918 928
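Two themes run through the params.c hunks: the output helpers move from sprintf() to scnprintf() so a sysfs read can never overrun the PAGE_SIZE buffer, and the "parameter may take no argument" special case is now driven by the KERNEL_PARAM_FL_NOARG ops flag instead of comparing set callbacks. A standalone sketch of the bounded-print helper (hypothetical show function, not the kernel code):

#include <linux/kernel.h>	/* scnprintf() */
#include <linux/mm.h>		/* PAGE_SIZE */

static int show_answer(char *buffer)	/* buffer is a PAGE_SIZE sysfs buffer */
{
	/* returns the number of characters written, excluding the NUL,
	 * and never writes past PAGE_SIZE */
	return scnprintf(buffer, PAGE_SIZE, "%d\n", 42);
}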
diff --git a/kernel/pid.c b/kernel/pid.c
index 66505c1dfc51..9b9a26698144 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -265,6 +265,7 @@ void free_pid(struct pid *pid)
265 struct pid_namespace *ns = upid->ns; 265 struct pid_namespace *ns = upid->ns;
266 hlist_del_rcu(&upid->pid_chain); 266 hlist_del_rcu(&upid->pid_chain);
267 switch(--ns->nr_hashed) { 267 switch(--ns->nr_hashed) {
268 case 2:
268 case 1: 269 case 1:
269 /* When all that is left in the pid namespace 270 /* When all that is left in the pid namespace
270 * is the reaper wake up the reaper. The reaper 271 * is the reaper wake up the reaper. The reaper
@@ -272,6 +273,11 @@ void free_pid(struct pid *pid)
272 */ 273 */
273 wake_up_process(ns->child_reaper); 274 wake_up_process(ns->child_reaper);
274 break; 275 break;
276 case PIDNS_HASH_ADDING:
277 /* Handle a fork failure of the first process */
278 WARN_ON(ns->child_reaper);
279 ns->nr_hashed = 0;
280 /* fall through */
275 case 0: 281 case 0:
276 schedule_work(&ns->proc_work); 282 schedule_work(&ns->proc_work);
277 break; 283 break;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6917e8edb48e..42086551a24a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -329,7 +329,7 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
329 struct pid_namespace *ancestor, *new = ns; 329 struct pid_namespace *ancestor, *new = ns;
330 330
331 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || 331 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
332 !nsown_capable(CAP_SYS_ADMIN)) 332 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
333 return -EPERM; 333 return -EPERM;
334 334
335 /* 335 /*
@@ -349,8 +349,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
349 if (ancestor != active) 349 if (ancestor != active)
350 return -EINVAL; 350 return -EINVAL;
351 351
352 put_pid_ns(nsproxy->pid_ns); 352 put_pid_ns(nsproxy->pid_ns_for_children);
353 nsproxy->pid_ns = get_pid_ns(new); 353 nsproxy->pid_ns_for_children = get_pid_ns(new);
354 return 0; 354 return 0;
355} 355}
356 356
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b26f5f1e773e..0121dab83f43 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -39,7 +39,7 @@ static int resume_delay;
39static char resume_file[256] = CONFIG_PM_STD_PARTITION; 39static char resume_file[256] = CONFIG_PM_STD_PARTITION;
40dev_t swsusp_resume_device; 40dev_t swsusp_resume_device;
41sector_t swsusp_resume_block; 41sector_t swsusp_resume_block;
42int in_suspend __nosavedata; 42__visible int in_suspend __nosavedata;
43 43
44enum { 44enum {
45 HIBERNATION_INVALID, 45 HIBERNATION_INVALID,
@@ -644,22 +644,23 @@ int hibernate(void)
644 if (error) 644 if (error)
645 goto Exit; 645 goto Exit;
646 646
647 /* Allocate memory management structures */
648 error = create_basic_memory_bitmaps();
649 if (error)
650 goto Exit;
651
652 printk(KERN_INFO "PM: Syncing filesystems ... "); 647 printk(KERN_INFO "PM: Syncing filesystems ... ");
653 sys_sync(); 648 sys_sync();
654 printk("done.\n"); 649 printk("done.\n");
655 650
656 error = freeze_processes(); 651 error = freeze_processes();
657 if (error) 652 if (error)
658 goto Free_bitmaps; 653 goto Exit;
654
655 lock_device_hotplug();
656 /* Allocate memory management structures */
657 error = create_basic_memory_bitmaps();
658 if (error)
659 goto Thaw;
659 660
660 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 661 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
661 if (error || freezer_test_done) 662 if (error || freezer_test_done)
662 goto Thaw; 663 goto Free_bitmaps;
663 664
664 if (in_suspend) { 665 if (in_suspend) {
665 unsigned int flags = 0; 666 unsigned int flags = 0;
@@ -682,14 +683,14 @@ int hibernate(void)
682 pr_debug("PM: Image restored successfully.\n"); 683 pr_debug("PM: Image restored successfully.\n");
683 } 684 }
684 685
686 Free_bitmaps:
687 free_basic_memory_bitmaps();
685 Thaw: 688 Thaw:
689 unlock_device_hotplug();
686 thaw_processes(); 690 thaw_processes();
687 691
688 /* Don't bother checking whether freezer_test_done is true */ 692 /* Don't bother checking whether freezer_test_done is true */
689 freezer_test_done = false; 693 freezer_test_done = false;
690
691 Free_bitmaps:
692 free_basic_memory_bitmaps();
693 Exit: 694 Exit:
694 pm_notifier_call_chain(PM_POST_HIBERNATION); 695 pm_notifier_call_chain(PM_POST_HIBERNATION);
695 pm_restore_console(); 696 pm_restore_console();
@@ -806,21 +807,20 @@ static int software_resume(void)
806 pm_prepare_console(); 807 pm_prepare_console();
807 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 808 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
808 if (error) 809 if (error)
809 goto close_finish; 810 goto Close_Finish;
810
811 error = create_basic_memory_bitmaps();
812 if (error)
813 goto close_finish;
814 811
815 pr_debug("PM: Preparing processes for restore.\n"); 812 pr_debug("PM: Preparing processes for restore.\n");
816 error = freeze_processes(); 813 error = freeze_processes();
817 if (error) { 814 if (error)
818 swsusp_close(FMODE_READ); 815 goto Close_Finish;
819 goto Done;
820 }
821 816
822 pr_debug("PM: Loading hibernation image.\n"); 817 pr_debug("PM: Loading hibernation image.\n");
823 818
819 lock_device_hotplug();
820 error = create_basic_memory_bitmaps();
821 if (error)
822 goto Thaw;
823
824 error = swsusp_read(&flags); 824 error = swsusp_read(&flags);
825 swsusp_close(FMODE_READ); 825 swsusp_close(FMODE_READ);
826 if (!error) 826 if (!error)
@@ -828,9 +828,10 @@ static int software_resume(void)
828 828
829 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); 829 printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n");
830 swsusp_free(); 830 swsusp_free();
831 thaw_processes();
832 Done:
833 free_basic_memory_bitmaps(); 831 free_basic_memory_bitmaps();
832 Thaw:
833 unlock_device_hotplug();
834 thaw_processes();
834 Finish: 835 Finish:
835 pm_notifier_call_chain(PM_POST_RESTORE); 836 pm_notifier_call_chain(PM_POST_RESTORE);
836 pm_restore_console(); 837 pm_restore_console();
@@ -840,12 +841,12 @@ static int software_resume(void)
840 mutex_unlock(&pm_mutex); 841 mutex_unlock(&pm_mutex);
841 pr_debug("PM: Hibernation image not present or could not be loaded.\n"); 842 pr_debug("PM: Hibernation image not present or could not be loaded.\n");
842 return error; 843 return error;
843close_finish: 844 Close_Finish:
844 swsusp_close(FMODE_READ); 845 swsusp_close(FMODE_READ);
845 goto Finish; 846 goto Finish;
846} 847}
847 848
848late_initcall(software_resume); 849late_initcall_sync(software_resume);
849 850
850 851
851static const char * const hibernation_modes[] = { 852static const char * const hibernation_modes[] = {
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 06fe28589e9c..a394297f8b2f 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -296,6 +296,17 @@ int pm_qos_request_active(struct pm_qos_request *req)
296} 296}
297EXPORT_SYMBOL_GPL(pm_qos_request_active); 297EXPORT_SYMBOL_GPL(pm_qos_request_active);
298 298
299static void __pm_qos_update_request(struct pm_qos_request *req,
300 s32 new_value)
301{
302 trace_pm_qos_update_request(req->pm_qos_class, new_value);
303
304 if (new_value != req->node.prio)
305 pm_qos_update_target(
306 pm_qos_array[req->pm_qos_class]->constraints,
307 &req->node, PM_QOS_UPDATE_REQ, new_value);
308}
309
299/** 310/**
300 * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout 311 * pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
301 * @work: work struct for the delayed work (timeout) 312 * @work: work struct for the delayed work (timeout)
@@ -308,7 +319,7 @@ static void pm_qos_work_fn(struct work_struct *work)
308 struct pm_qos_request, 319 struct pm_qos_request,
309 work); 320 work);
310 321
311 pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE); 322 __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
312} 323}
313 324
314/** 325/**
@@ -364,12 +375,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
364 } 375 }
365 376
366 cancel_delayed_work_sync(&req->work); 377 cancel_delayed_work_sync(&req->work);
367 378 __pm_qos_update_request(req, new_value);
368 trace_pm_qos_update_request(req->pm_qos_class, new_value);
369 if (new_value != req->node.prio)
370 pm_qos_update_target(
371 pm_qos_array[req->pm_qos_class]->constraints,
372 &req->node, PM_QOS_UPDATE_REQ, new_value);
373} 379}
374EXPORT_SYMBOL_GPL(pm_qos_update_request); 380EXPORT_SYMBOL_GPL(pm_qos_update_request);
375 381
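Factoring out __pm_qos_update_request() lets the timeout handler apply the default value directly instead of going through pm_qos_update_request(), which starts with cancel_delayed_work_sync() on the very work item that is currently running. The general shape of that refactor, sketched with hypothetical names:

#include <linux/workqueue.h>

struct thing {
	int value;
	struct delayed_work work;
};

#define THING_DEFAULT_VALUE	(-1)

/* inner helper: applies the change, never touches the work item */
static void __thing_update(struct thing *t, int v)
{
	if (v != t->value)
		t->value = v;
}

/* public API: cancel any pending timeout, then update */
static void thing_update(struct thing *t, int v)
{
	cancel_delayed_work_sync(&t->work);	/* must not run from thing_work_fn() */
	__thing_update(t, v);
}

/* timeout handler: calls the inner helper directly, avoiding self-cancel */
static void thing_work_fn(struct work_struct *w)
{
	struct thing *t = container_of(to_delayed_work(w), struct thing, work);

	__thing_update(t, THING_DEFAULT_VALUE);
}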
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 349587bb03e1..98c3b34a4cff 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -352,7 +352,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
352 struct mem_extent *ext, *cur, *aux; 352 struct mem_extent *ext, *cur, *aux;
353 353
354 zone_start = zone->zone_start_pfn; 354 zone_start = zone->zone_start_pfn;
355 zone_end = zone->zone_start_pfn + zone->spanned_pages; 355 zone_end = zone_end_pfn(zone);
356 356
357 list_for_each_entry(ext, list, hook) 357 list_for_each_entry(ext, list, hook)
358 if (zone_start <= ext->end) 358 if (zone_start <= ext->end)
@@ -743,7 +743,10 @@ int create_basic_memory_bitmaps(void)
743 struct memory_bitmap *bm1, *bm2; 743 struct memory_bitmap *bm1, *bm2;
744 int error = 0; 744 int error = 0;
745 745
746 BUG_ON(forbidden_pages_map || free_pages_map); 746 if (forbidden_pages_map && free_pages_map)
747 return 0;
748 else
749 BUG_ON(forbidden_pages_map || free_pages_map);
747 750
748 bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); 751 bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
749 if (!bm1) 752 if (!bm1)
@@ -884,7 +887,7 @@ static unsigned int count_highmem_pages(void)
884 continue; 887 continue;
885 888
886 mark_free_pages(zone); 889 mark_free_pages(zone);
887 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 890 max_zone_pfn = zone_end_pfn(zone);
888 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 891 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
889 if (saveable_highmem_page(zone, pfn)) 892 if (saveable_highmem_page(zone, pfn))
890 n++; 893 n++;
@@ -948,7 +951,7 @@ static unsigned int count_data_pages(void)
948 continue; 951 continue;
949 952
950 mark_free_pages(zone); 953 mark_free_pages(zone);
951 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 954 max_zone_pfn = zone_end_pfn(zone);
952 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 955 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
953 if (saveable_page(zone, pfn)) 956 if (saveable_page(zone, pfn))
954 n++; 957 n++;
@@ -1041,7 +1044,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1041 unsigned long max_zone_pfn; 1044 unsigned long max_zone_pfn;
1042 1045
1043 mark_free_pages(zone); 1046 mark_free_pages(zone);
1044 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1047 max_zone_pfn = zone_end_pfn(zone);
1045 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1048 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1046 if (page_is_saveable(zone, pfn)) 1049 if (page_is_saveable(zone, pfn))
1047 memory_bm_set_bit(orig_bm, pfn); 1050 memory_bm_set_bit(orig_bm, pfn);
@@ -1093,7 +1096,7 @@ void swsusp_free(void)
1093 unsigned long pfn, max_zone_pfn; 1096 unsigned long pfn, max_zone_pfn;
1094 1097
1095 for_each_populated_zone(zone) { 1098 for_each_populated_zone(zone) {
1096 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1099 max_zone_pfn = zone_end_pfn(zone);
1097 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1100 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1098 if (pfn_valid(pfn)) { 1101 if (pfn_valid(pfn)) {
1099 struct page *page = pfn_to_page(pfn); 1102 struct page *page = pfn_to_page(pfn);
@@ -1755,7 +1758,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1755 1758
1756 /* Clear page flags */ 1759 /* Clear page flags */
1757 for_each_populated_zone(zone) { 1760 for_each_populated_zone(zone) {
1758 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1761 max_zone_pfn = zone_end_pfn(zone);
1759 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1762 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1760 if (pfn_valid(pfn)) 1763 if (pfn_valid(pfn))
1761 swsusp_unset_page_free(pfn_to_page(pfn)); 1764 swsusp_unset_page_free(pfn_to_page(pfn));
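All of the snapshot.c hunks substitute the zone_end_pfn() helper for the open-coded zone->zone_start_pfn + zone->spanned_pages expression; the helper (from include/linux/mmzone.h) is equivalent, so the change is purely a readability cleanup. Sketch of the equivalence and the usual loop shape:

#include <linux/mmzone.h>

/* equivalent to the open-coded expression being replaced */
static inline unsigned long my_zone_end_pfn(const struct zone *zone)
{
	return zone->zone_start_pfn + zone->spanned_pages;
}

/* typical walk over a zone:
 *	for (pfn = zone->zone_start_pfn; pfn < zone_end_pfn(zone); pfn++)
 *		...
 */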
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ece04223bb1e..62ee437b5c7e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -210,6 +210,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
210 goto Platform_wake; 210 goto Platform_wake;
211 } 211 }
212 212
213 ftrace_stop();
213 error = disable_nonboot_cpus(); 214 error = disable_nonboot_cpus();
214 if (error || suspend_test(TEST_CPUS)) 215 if (error || suspend_test(TEST_CPUS))
215 goto Enable_cpus; 216 goto Enable_cpus;
@@ -232,6 +233,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
232 233
233 Enable_cpus: 234 Enable_cpus:
234 enable_nonboot_cpus(); 235 enable_nonboot_cpus();
236 ftrace_start();
235 237
236 Platform_wake: 238 Platform_wake:
237 if (need_suspend_ops(state) && suspend_ops->wake) 239 if (need_suspend_ops(state) && suspend_ops->wake)
@@ -265,7 +267,6 @@ int suspend_devices_and_enter(suspend_state_t state)
265 goto Close; 267 goto Close;
266 } 268 }
267 suspend_console(); 269 suspend_console();
268 ftrace_stop();
269 suspend_test_start(); 270 suspend_test_start();
270 error = dpm_suspend_start(PMSG_SUSPEND); 271 error = dpm_suspend_start(PMSG_SUSPEND);
271 if (error) { 272 if (error) {
@@ -285,7 +286,6 @@ int suspend_devices_and_enter(suspend_state_t state)
285 suspend_test_start(); 286 suspend_test_start();
286 dpm_resume_end(PMSG_RESUME); 287 dpm_resume_end(PMSG_RESUME);
287 suspend_test_finish("resume devices"); 288 suspend_test_finish("resume devices");
288 ftrace_start();
289 resume_console(); 289 resume_console();
290 Close: 290 Close:
291 if (need_suspend_ops(state) && suspend_ops->end) 291 if (need_suspend_ops(state) && suspend_ops->end)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 4ed81e74f86f..957f06164ad1 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -39,6 +39,7 @@ static struct snapshot_data {
39 char frozen; 39 char frozen;
40 char ready; 40 char ready;
41 char platform_support; 41 char platform_support;
42 bool free_bitmaps;
42} snapshot_state; 43} snapshot_state;
43 44
44atomic_t snapshot_device_available = ATOMIC_INIT(1); 45atomic_t snapshot_device_available = ATOMIC_INIT(1);
@@ -60,11 +61,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
60 error = -ENOSYS; 61 error = -ENOSYS;
61 goto Unlock; 62 goto Unlock;
62 } 63 }
63 if(create_basic_memory_bitmaps()) {
64 atomic_inc(&snapshot_device_available);
65 error = -ENOMEM;
66 goto Unlock;
67 }
68 nonseekable_open(inode, filp); 64 nonseekable_open(inode, filp);
69 data = &snapshot_state; 65 data = &snapshot_state;
70 filp->private_data = data; 66 filp->private_data = data;
@@ -87,13 +83,16 @@ static int snapshot_open(struct inode *inode, struct file *filp)
87 data->swap = -1; 83 data->swap = -1;
88 data->mode = O_WRONLY; 84 data->mode = O_WRONLY;
89 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 85 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
86 if (!error) {
87 error = create_basic_memory_bitmaps();
88 data->free_bitmaps = !error;
89 }
90 if (error) 90 if (error)
91 pm_notifier_call_chain(PM_POST_RESTORE); 91 pm_notifier_call_chain(PM_POST_RESTORE);
92 } 92 }
93 if (error) { 93 if (error)
94 free_basic_memory_bitmaps();
95 atomic_inc(&snapshot_device_available); 94 atomic_inc(&snapshot_device_available);
96 } 95
97 data->frozen = 0; 96 data->frozen = 0;
98 data->ready = 0; 97 data->ready = 0;
99 data->platform_support = 0; 98 data->platform_support = 0;
@@ -111,12 +110,14 @@ static int snapshot_release(struct inode *inode, struct file *filp)
111 lock_system_sleep(); 110 lock_system_sleep();
112 111
113 swsusp_free(); 112 swsusp_free();
114 free_basic_memory_bitmaps();
115 data = filp->private_data; 113 data = filp->private_data;
116 free_all_swap_pages(data->swap); 114 free_all_swap_pages(data->swap);
117 if (data->frozen) { 115 if (data->frozen) {
118 pm_restore_gfp_mask(); 116 pm_restore_gfp_mask();
117 free_basic_memory_bitmaps();
119 thaw_processes(); 118 thaw_processes();
119 } else if (data->free_bitmaps) {
120 free_basic_memory_bitmaps();
120 } 121 }
121 pm_notifier_call_chain(data->mode == O_RDONLY ? 122 pm_notifier_call_chain(data->mode == O_RDONLY ?
122 PM_POST_HIBERNATION : PM_POST_RESTORE); 123 PM_POST_HIBERNATION : PM_POST_RESTORE);
@@ -207,6 +208,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
207 if (!mutex_trylock(&pm_mutex)) 208 if (!mutex_trylock(&pm_mutex))
208 return -EBUSY; 209 return -EBUSY;
209 210
211 lock_device_hotplug();
210 data = filp->private_data; 212 data = filp->private_data;
211 213
212 switch (cmd) { 214 switch (cmd) {
@@ -220,14 +222,23 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
220 printk("done.\n"); 222 printk("done.\n");
221 223
222 error = freeze_processes(); 224 error = freeze_processes();
223 if (!error) 225 if (error)
226 break;
227
228 error = create_basic_memory_bitmaps();
229 if (error)
230 thaw_processes();
231 else
224 data->frozen = 1; 232 data->frozen = 1;
233
225 break; 234 break;
226 235
227 case SNAPSHOT_UNFREEZE: 236 case SNAPSHOT_UNFREEZE:
228 if (!data->frozen || data->ready) 237 if (!data->frozen || data->ready)
229 break; 238 break;
230 pm_restore_gfp_mask(); 239 pm_restore_gfp_mask();
240 free_basic_memory_bitmaps();
241 data->free_bitmaps = false;
231 thaw_processes(); 242 thaw_processes();
232 data->frozen = 0; 243 data->frozen = 0;
233 break; 244 break;
@@ -371,6 +382,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
371 382
372 } 383 }
373 384
385 unlock_device_hotplug();
374 mutex_unlock(&pm_mutex); 386 mutex_unlock(&pm_mutex);
375 387
376 return error; 388 return error;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5b5a7080e2a5..b4e8500afdb3 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2226,6 +2226,13 @@ void register_console(struct console *newcon)
2226 struct console *bcon = NULL; 2226 struct console *bcon = NULL;
2227 struct console_cmdline *c; 2227 struct console_cmdline *c;
2228 2228
2229 if (console_drivers)
2230 for_each_console(bcon)
2231 if (WARN(bcon == newcon,
2232 "console '%s%d' already registered\n",
2233 bcon->name, bcon->index))
2234 return;
2235
2229 /* 2236 /*
2230 * before we register a new CON_BOOT console, make sure we don't 2237 * before we register a new CON_BOOT console, make sure we don't
2231 * already have a valid console 2238 * already have a valid console
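The register_console() guard above leans on the fact that WARN(cond, ...) both prints (with a backtrace) and evaluates to cond, so the duplicate check and the early return collapse into one statement. The same idiom in isolation, with a hypothetical device check:

#include <linux/bug.h>		/* WARN() */

struct some_dev;

static void poke_device(struct some_dev *dev)
{
	if (WARN(!dev, "no device, skipping poke\n"))
		return;
	/* ...safe to touch the hardware here... */
}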
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a146ee327f6a..dd562e9aa2c8 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -236,7 +236,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
236 */ 236 */
237 int dumpable = 0; 237 int dumpable = 0;
238 /* Don't let security modules deny introspection */ 238 /* Don't let security modules deny introspection */
239 if (task == current) 239 if (same_thread_group(task, current))
240 return 0; 240 return 0;
241 rcu_read_lock(); 241 rcu_read_lock();
242 tcred = __task_cred(task); 242 tcred = __task_cred(task);
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
new file mode 100644
index 000000000000..01e9ec37a3e3
--- /dev/null
+++ b/kernel/rcu/Makefile
@@ -0,0 +1,6 @@
1obj-y += update.o srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o
3obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
6obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu.h b/kernel/rcu/rcu.h
index 7f8e7590e3e5..7859a0a3951e 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -67,12 +67,15 @@
67 67
68extern struct debug_obj_descr rcuhead_debug_descr; 68extern struct debug_obj_descr rcuhead_debug_descr;
69 69
70static inline void debug_rcu_head_queue(struct rcu_head *head) 70static inline int debug_rcu_head_queue(struct rcu_head *head)
71{ 71{
72 debug_object_activate(head, &rcuhead_debug_descr); 72 int r1;
73
74 r1 = debug_object_activate(head, &rcuhead_debug_descr);
73 debug_object_active_state(head, &rcuhead_debug_descr, 75 debug_object_active_state(head, &rcuhead_debug_descr,
74 STATE_RCU_HEAD_READY, 76 STATE_RCU_HEAD_READY,
75 STATE_RCU_HEAD_QUEUED); 77 STATE_RCU_HEAD_QUEUED);
78 return r1;
76} 79}
77 80
78static inline void debug_rcu_head_unqueue(struct rcu_head *head) 81static inline void debug_rcu_head_unqueue(struct rcu_head *head)
@@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
83 debug_object_deactivate(head, &rcuhead_debug_descr); 86 debug_object_deactivate(head, &rcuhead_debug_descr);
84} 87}
85#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 88#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
86static inline void debug_rcu_head_queue(struct rcu_head *head) 89static inline int debug_rcu_head_queue(struct rcu_head *head)
87{ 90{
91 return 0;
88} 92}
89 93
90static inline void debug_rcu_head_unqueue(struct rcu_head *head) 94static inline void debug_rcu_head_unqueue(struct rcu_head *head)
@@ -94,7 +98,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
94 98
95extern void kfree(const void *); 99extern void kfree(const void *);
96 100
97static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) 101static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
98{ 102{
99 unsigned long offset = (unsigned long)head->func; 103 unsigned long offset = (unsigned long)head->func;
100 104
@@ -118,4 +122,11 @@ int rcu_jiffies_till_stall_check(void);
118 122
119#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 123#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
120 124
125/*
126 * Strings used in tracepoints need to be exported via the
127 * tracing system such that tools like perf and trace-cmd can
128 * translate the string address pointers to actual text.
129 */
130#define TPS(x) tracepoint_string(x)
131
121#endif /* __LINUX_RCU_H */ 132#endif /* __LINUX_RCU_H */
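TPS() wraps tracepoint_string(), which returns the address of the literal and also records it so user-space tools such as perf and trace-cmd can translate the pointer back into text, as the comment above says. The usage pattern, taken from the kernel/rcu/tiny.c hunk further below:

	/* constant tracepoint strings go through the wrapper: */
	RCU_TRACE(trace_rcu_dyntick(TPS("Start"), rcu_dynticks_nesting, newval));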
diff --git a/kernel/srcu.c b/kernel/rcu/srcu.c
index 01d5ccb8bfe3..01d5ccb8bfe3 100644
--- a/kernel/srcu.c
+++ b/kernel/rcu/srcu.c
diff --git a/kernel/rcutiny.c b/kernel/rcu/tiny.c
index aa344111de3e..0c9a934cfec1 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,6 +35,7 @@
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/prefetch.h> 37#include <linux/prefetch.h>
38#include <linux/ftrace_event.h>
38 39
39#ifdef CONFIG_RCU_TRACE 40#ifdef CONFIG_RCU_TRACE
40#include <trace/events/rcu.h> 41#include <trace/events/rcu.h>
@@ -42,7 +43,7 @@
42 43
43#include "rcu.h" 44#include "rcu.h"
44 45
45/* Forward declarations for rcutiny_plugin.h. */ 46/* Forward declarations for tiny_plugin.h. */
46struct rcu_ctrlblk; 47struct rcu_ctrlblk;
47static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static void rcu_process_callbacks(struct softirq_action *unused); 49static void rcu_process_callbacks(struct softirq_action *unused);
@@ -52,22 +53,23 @@ static void __call_rcu(struct rcu_head *head,
52 53
53static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 54static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
54 55
55#include "rcutiny_plugin.h" 56#include "tiny_plugin.h"
56 57
57/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
58static void rcu_idle_enter_common(long long newval) 59static void rcu_idle_enter_common(long long newval)
59{ 60{
60 if (newval) { 61 if (newval) {
61 RCU_TRACE(trace_rcu_dyntick("--=", 62 RCU_TRACE(trace_rcu_dyntick(TPS("--="),
62 rcu_dynticks_nesting, newval)); 63 rcu_dynticks_nesting, newval));
63 rcu_dynticks_nesting = newval; 64 rcu_dynticks_nesting = newval;
64 return; 65 return;
65 } 66 }
66 RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); 67 RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
68 rcu_dynticks_nesting, newval));
67 if (!is_idle_task(current)) { 69 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id()); 70 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
69 71
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", 72 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
71 rcu_dynticks_nesting, newval)); 73 rcu_dynticks_nesting, newval));
72 ftrace_dump(DUMP_ALL); 74 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 75 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit);
120static void rcu_idle_exit_common(long long oldval) 122static void rcu_idle_exit_common(long long oldval)
121{ 123{
122 if (oldval) { 124 if (oldval) {
123 RCU_TRACE(trace_rcu_dyntick("++=", 125 RCU_TRACE(trace_rcu_dyntick(TPS("++="),
124 oldval, rcu_dynticks_nesting)); 126 oldval, rcu_dynticks_nesting));
125 return; 127 return;
126 } 128 }
127 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); 129 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
128 if (!is_idle_task(current)) { 130 if (!is_idle_task(current)) {
129 struct task_struct *idle = idle_task(smp_processor_id()); 131 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
130 132
131 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", 133 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
132 oldval, rcu_dynticks_nesting)); 134 oldval, rcu_dynticks_nesting));
133 ftrace_dump(DUMP_ALL); 135 ftrace_dump(DUMP_ALL);
134 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 136 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -174,18 +176,18 @@ void rcu_irq_enter(void)
174} 176}
175EXPORT_SYMBOL_GPL(rcu_irq_enter); 177EXPORT_SYMBOL_GPL(rcu_irq_enter);
176 178
177#ifdef CONFIG_DEBUG_LOCK_ALLOC 179#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
178 180
179/* 181/*
180 * Test whether RCU thinks that the current CPU is idle. 182 * Test whether RCU thinks that the current CPU is idle.
181 */ 183 */
182int rcu_is_cpu_idle(void) 184bool __rcu_is_watching(void)
183{ 185{
184 return !rcu_dynticks_nesting; 186 return rcu_dynticks_nesting;
185} 187}
186EXPORT_SYMBOL(rcu_is_cpu_idle); 188EXPORT_SYMBOL(__rcu_is_watching);
187 189
188#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 190#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
189 191
190/* 192/*
191 * Test whether the current CPU was interrupted from idle. Nested 193 * Test whether the current CPU was interrupted from idle. Nested
@@ -264,7 +266,7 @@ void rcu_check_callbacks(int cpu, int user)
264 */ 266 */
265static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) 267static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
266{ 268{
267 char *rn = NULL; 269 const char *rn = NULL;
268 struct rcu_head *next, *list; 270 struct rcu_head *next, *list;
269 unsigned long flags; 271 unsigned long flags;
270 RCU_TRACE(int cb_count = 0); 272 RCU_TRACE(int cb_count = 0);
@@ -273,7 +275,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
273 if (&rcp->rcucblist == rcp->donetail) { 275 if (&rcp->rcucblist == rcp->donetail) {
274 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); 276 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
275 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, 277 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
276 ACCESS_ONCE(rcp->rcucblist), 278 !!ACCESS_ONCE(rcp->rcucblist),
277 need_resched(), 279 need_resched(),
278 is_idle_task(current), 280 is_idle_task(current),
279 false)); 281 false));
@@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
304 RCU_TRACE(cb_count++); 306 RCU_TRACE(cb_count++);
305 } 307 }
306 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 308 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
307 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), 309 RCU_TRACE(trace_rcu_batch_end(rcp->name,
310 cb_count, 0, need_resched(),
308 is_idle_task(current), 311 is_idle_task(current),
309 false)); 312 false));
310} 313}
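Note that the tiny.c hunk above inverts the predicate along with the rename: rcu_is_cpu_idle() returned !rcu_dynticks_nesting, while __rcu_is_watching() returns rcu_dynticks_nesting directly, so callers now ask "is RCU watching this CPU?" rather than "is it idle?". The !!ACCESS_ONCE(rcp->rcucblist) change normalises the list pointer to 0/1 before it is handed to a boolean trace field; the idiom in isolation, with a hypothetical helper:

#include <linux/compiler.h>

static int have_entries(void **listp)
{
	return !!ACCESS_ONCE(*listp);	/* any non-NULL pointer becomes 1 */
}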
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 0cd385acccfa..280d06cae352 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -36,7 +36,7 @@ struct rcu_ctrlblk {
36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ 36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ 37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ 38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
39 RCU_TRACE(char *name); /* Name of RCU type. */ 39 RCU_TRACE(const char *name); /* Name of RCU type. */
40}; 40};
41 41
42/* Definition for rcupdate control block. */ 42/* Definition for rcupdate control block. */
diff --git a/kernel/rcutorture.c b/kernel/rcu/torture.c
index f4871e52c546..3929cd451511 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcu/torture.c
@@ -52,72 +52,84 @@
52MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
54 54
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55MODULE_ALIAS("rcutorture");
56static int nfakewriters = 4; /* # fake writer threads */ 56#ifdef MODULE_PARAM_PREFIX
57static int stat_interval = 60; /* Interval between stats, in seconds. */ 57#undef MODULE_PARAM_PREFIX
58 /* Zero means "only at end of test". */ 58#endif
59static bool verbose; /* Print more debug info. */ 59#define MODULE_PARAM_PREFIX "rcutorture."
60static bool test_no_idle_hz = true;
61 /* Test RCU support for tickless idle CPUs. */
62static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
63static int stutter = 5; /* Start/stop testing interval (in sec) */
64static int irqreader = 1; /* RCU readers from irq (timers). */
65static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
66static int fqs_holdoff; /* Hold time within burst (us). */
67static int fqs_stutter = 3; /* Wait time between bursts (s). */
68static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */
69static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
70static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
71static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
72static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */
73static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */
74static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
75static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
76static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
77static char *torture_type = "rcu"; /* What RCU implementation to torture. */
78 60
79module_param(nreaders, int, 0444); 61static int fqs_duration;
80MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
81module_param(nfakewriters, int, 0444);
82MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
83module_param(stat_interval, int, 0644);
84MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
85module_param(verbose, bool, 0444);
86MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
87module_param(test_no_idle_hz, bool, 0444);
88MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
89module_param(shuffle_interval, int, 0444);
90MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
91module_param(stutter, int, 0444);
92MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
93module_param(irqreader, int, 0444);
94MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
95module_param(fqs_duration, int, 0444); 62module_param(fqs_duration, int, 0444);
96MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); 63MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
64static int fqs_holdoff;
97module_param(fqs_holdoff, int, 0444); 65module_param(fqs_holdoff, int, 0444);
98MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 66MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
67static int fqs_stutter = 3;
99module_param(fqs_stutter, int, 0444); 68module_param(fqs_stutter, int, 0444);
100MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 69MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
70static bool gp_exp;
71module_param(gp_exp, bool, 0444);
72MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives");
73static bool gp_normal;
74module_param(gp_normal, bool, 0444);
75MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives");
76static int irqreader = 1;
77module_param(irqreader, int, 0444);
78MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
79static int n_barrier_cbs;
101module_param(n_barrier_cbs, int, 0444); 80module_param(n_barrier_cbs, int, 0444);
102MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); 81MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
103module_param(onoff_interval, int, 0444); 82static int nfakewriters = 4;
104MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 83module_param(nfakewriters, int, 0444);
84MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
85static int nreaders = -1;
86module_param(nreaders, int, 0444);
87MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
88static int object_debug;
89module_param(object_debug, int, 0444);
90MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing");
91static int onoff_holdoff;
105module_param(onoff_holdoff, int, 0444); 92module_param(onoff_holdoff, int, 0444);
106MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); 93MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
94static int onoff_interval;
95module_param(onoff_interval, int, 0444);
96MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
97static int shuffle_interval = 3;
98module_param(shuffle_interval, int, 0444);
99MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
100static int shutdown_secs;
107module_param(shutdown_secs, int, 0444); 101module_param(shutdown_secs, int, 0444);
108MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); 102MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable.");
103static int stall_cpu;
109module_param(stall_cpu, int, 0444); 104module_param(stall_cpu, int, 0444);
110MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); 105MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
106static int stall_cpu_holdoff = 10;
111module_param(stall_cpu_holdoff, int, 0444); 107module_param(stall_cpu_holdoff, int, 0444);
112MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); 108MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
109static int stat_interval = 60;
110module_param(stat_interval, int, 0644);
111MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
112static int stutter = 5;
113module_param(stutter, int, 0444);
114MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
115static int test_boost = 1;
113module_param(test_boost, int, 0444); 116module_param(test_boost, int, 0444);
114MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 117MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
115module_param(test_boost_interval, int, 0444); 118static int test_boost_duration = 4;
116MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
117module_param(test_boost_duration, int, 0444); 119module_param(test_boost_duration, int, 0444);
118MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); 120MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
121static int test_boost_interval = 7;
122module_param(test_boost_interval, int, 0444);
123MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
124static bool test_no_idle_hz = true;
125module_param(test_no_idle_hz, bool, 0444);
126MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
127static char *torture_type = "rcu";
119module_param(torture_type, charp, 0444); 128module_param(torture_type, charp, 0444);
120MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 129MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
130static bool verbose;
131module_param(verbose, bool, 0444);
132MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
121 133
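
The hunk above reorders rcutorture's module parameters so that each knob's declaration sits immediately next to its module_param() and MODULE_PARM_DESC() lines, in alphabetical order. As a minimal sketch of that three-line pattern (the parameter name below is made up, not one of rcutorture's):

#include <linux/module.h>
#include <linux/moduleparam.h>

/* Hypothetical knob, shown only to illustrate the declaration pattern. */
static int example_interval = 3;                /* default value */
module_param(example_interval, int, 0444);      /* 0444: visible in sysfs, read-only */
MODULE_PARM_DESC(example_interval, "Seconds between example events");

Keeping the three lines adjacent is what makes the new layout easier to audit than the old split between the declaration block and the module_param() block.
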
122#define TORTURE_FLAG "-torture:" 134#define TORTURE_FLAG "-torture:"
123#define PRINTK_STRING(s) \ 135#define PRINTK_STRING(s) \
@@ -267,7 +279,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
267 * Absorb kthreads into a kernel function that won't return, so that 279 * Absorb kthreads into a kernel function that won't return, so that
268 * they won't ever access module text or data again. 280 * they won't ever access module text or data again.
269 */ 281 */
270static void rcutorture_shutdown_absorb(char *title) 282static void rcutorture_shutdown_absorb(const char *title)
271{ 283{
272 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 284 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
273 pr_notice( 285 pr_notice(
@@ -337,7 +349,7 @@ rcu_random(struct rcu_random_state *rrsp)
337} 349}
338 350
339static void 351static void
340rcu_stutter_wait(char *title) 352rcu_stutter_wait(const char *title)
341{ 353{
342 while (stutter_pause_test || !rcutorture_runnable) { 354 while (stutter_pause_test || !rcutorture_runnable) {
343 if (rcutorture_runnable) 355 if (rcutorture_runnable)
@@ -360,13 +372,14 @@ struct rcu_torture_ops {
360 int (*completed)(void); 372 int (*completed)(void);
361 void (*deferred_free)(struct rcu_torture *p); 373 void (*deferred_free)(struct rcu_torture *p);
362 void (*sync)(void); 374 void (*sync)(void);
375 void (*exp_sync)(void);
363 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 376 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
364 void (*cb_barrier)(void); 377 void (*cb_barrier)(void);
365 void (*fqs)(void); 378 void (*fqs)(void);
366 int (*stats)(char *page); 379 int (*stats)(char *page);
367 int irq_capable; 380 int irq_capable;
368 int can_boost; 381 int can_boost;
369 char *name; 382 const char *name;
370}; 383};
371 384
372static struct rcu_torture_ops *cur_ops; 385static struct rcu_torture_ops *cur_ops;
@@ -443,81 +456,27 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
443 call_rcu(&p->rtort_rcu, rcu_torture_cb); 456 call_rcu(&p->rtort_rcu, rcu_torture_cb);
444} 457}
445 458
446static struct rcu_torture_ops rcu_ops = {
447 .init = NULL,
448 .readlock = rcu_torture_read_lock,
449 .read_delay = rcu_read_delay,
450 .readunlock = rcu_torture_read_unlock,
451 .completed = rcu_torture_completed,
452 .deferred_free = rcu_torture_deferred_free,
453 .sync = synchronize_rcu,
454 .call = call_rcu,
455 .cb_barrier = rcu_barrier,
456 .fqs = rcu_force_quiescent_state,
457 .stats = NULL,
458 .irq_capable = 1,
459 .can_boost = rcu_can_boost(),
460 .name = "rcu"
461};
462
463static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
464{
465 int i;
466 struct rcu_torture *rp;
467 struct rcu_torture *rp1;
468
469 cur_ops->sync();
470 list_add(&p->rtort_free, &rcu_torture_removed);
471 list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
472 i = rp->rtort_pipe_count;
473 if (i > RCU_TORTURE_PIPE_LEN)
474 i = RCU_TORTURE_PIPE_LEN;
475 atomic_inc(&rcu_torture_wcount[i]);
476 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
477 rp->rtort_mbtest = 0;
478 list_del(&rp->rtort_free);
479 rcu_torture_free(rp);
480 }
481 }
482}
483
484static void rcu_sync_torture_init(void) 459static void rcu_sync_torture_init(void)
485{ 460{
486 INIT_LIST_HEAD(&rcu_torture_removed); 461 INIT_LIST_HEAD(&rcu_torture_removed);
487} 462}
488 463
489static struct rcu_torture_ops rcu_sync_ops = { 464static struct rcu_torture_ops rcu_ops = {
490 .init = rcu_sync_torture_init, 465 .init = rcu_sync_torture_init,
491 .readlock = rcu_torture_read_lock, 466 .readlock = rcu_torture_read_lock,
492 .read_delay = rcu_read_delay, 467 .read_delay = rcu_read_delay,
493 .readunlock = rcu_torture_read_unlock, 468 .readunlock = rcu_torture_read_unlock,
494 .completed = rcu_torture_completed, 469 .completed = rcu_torture_completed,
495 .deferred_free = rcu_sync_torture_deferred_free, 470 .deferred_free = rcu_torture_deferred_free,
496 .sync = synchronize_rcu, 471 .sync = synchronize_rcu,
497 .call = NULL, 472 .exp_sync = synchronize_rcu_expedited,
498 .cb_barrier = NULL, 473 .call = call_rcu,
499 .fqs = rcu_force_quiescent_state, 474 .cb_barrier = rcu_barrier,
500 .stats = NULL,
501 .irq_capable = 1,
502 .can_boost = rcu_can_boost(),
503 .name = "rcu_sync"
504};
505
506static struct rcu_torture_ops rcu_expedited_ops = {
507 .init = rcu_sync_torture_init,
508 .readlock = rcu_torture_read_lock,
509 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
510 .readunlock = rcu_torture_read_unlock,
511 .completed = rcu_no_completed,
512 .deferred_free = rcu_sync_torture_deferred_free,
513 .sync = synchronize_rcu_expedited,
514 .call = NULL,
515 .cb_barrier = NULL,
516 .fqs = rcu_force_quiescent_state, 475 .fqs = rcu_force_quiescent_state,
517 .stats = NULL, 476 .stats = NULL,
518 .irq_capable = 1, 477 .irq_capable = 1,
519 .can_boost = rcu_can_boost(), 478 .can_boost = rcu_can_boost(),
520 .name = "rcu_expedited" 479 .name = "rcu"
521}; 480};
522 481
523/* 482/*
@@ -546,13 +505,14 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
546} 505}
547 506
548static struct rcu_torture_ops rcu_bh_ops = { 507static struct rcu_torture_ops rcu_bh_ops = {
549 .init = NULL, 508 .init = rcu_sync_torture_init,
550 .readlock = rcu_bh_torture_read_lock, 509 .readlock = rcu_bh_torture_read_lock,
551 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 510 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
552 .readunlock = rcu_bh_torture_read_unlock, 511 .readunlock = rcu_bh_torture_read_unlock,
553 .completed = rcu_bh_torture_completed, 512 .completed = rcu_bh_torture_completed,
554 .deferred_free = rcu_bh_torture_deferred_free, 513 .deferred_free = rcu_bh_torture_deferred_free,
555 .sync = synchronize_rcu_bh, 514 .sync = synchronize_rcu_bh,
515 .exp_sync = synchronize_rcu_bh_expedited,
556 .call = call_rcu_bh, 516 .call = call_rcu_bh,
557 .cb_barrier = rcu_barrier_bh, 517 .cb_barrier = rcu_barrier_bh,
558 .fqs = rcu_bh_force_quiescent_state, 518 .fqs = rcu_bh_force_quiescent_state,
@@ -561,38 +521,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
561 .name = "rcu_bh" 521 .name = "rcu_bh"
562}; 522};
563 523
564static struct rcu_torture_ops rcu_bh_sync_ops = {
565 .init = rcu_sync_torture_init,
566 .readlock = rcu_bh_torture_read_lock,
567 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
568 .readunlock = rcu_bh_torture_read_unlock,
569 .completed = rcu_bh_torture_completed,
570 .deferred_free = rcu_sync_torture_deferred_free,
571 .sync = synchronize_rcu_bh,
572 .call = NULL,
573 .cb_barrier = NULL,
574 .fqs = rcu_bh_force_quiescent_state,
575 .stats = NULL,
576 .irq_capable = 1,
577 .name = "rcu_bh_sync"
578};
579
580static struct rcu_torture_ops rcu_bh_expedited_ops = {
581 .init = rcu_sync_torture_init,
582 .readlock = rcu_bh_torture_read_lock,
583 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
584 .readunlock = rcu_bh_torture_read_unlock,
585 .completed = rcu_bh_torture_completed,
586 .deferred_free = rcu_sync_torture_deferred_free,
587 .sync = synchronize_rcu_bh_expedited,
588 .call = NULL,
589 .cb_barrier = NULL,
590 .fqs = rcu_bh_force_quiescent_state,
591 .stats = NULL,
592 .irq_capable = 1,
593 .name = "rcu_bh_expedited"
594};
595
596/* 524/*
597 * Definitions for srcu torture testing. 525 * Definitions for srcu torture testing.
598 */ 526 */
@@ -667,6 +595,11 @@ static int srcu_torture_stats(char *page)
667 return cnt; 595 return cnt;
668} 596}
669 597
598static void srcu_torture_synchronize_expedited(void)
599{
600 synchronize_srcu_expedited(&srcu_ctl);
601}
602
670static struct rcu_torture_ops srcu_ops = { 603static struct rcu_torture_ops srcu_ops = {
671 .init = rcu_sync_torture_init, 604 .init = rcu_sync_torture_init,
672 .readlock = srcu_torture_read_lock, 605 .readlock = srcu_torture_read_lock,
@@ -675,45 +608,13 @@ static struct rcu_torture_ops srcu_ops = {
675 .completed = srcu_torture_completed, 608 .completed = srcu_torture_completed,
676 .deferred_free = srcu_torture_deferred_free, 609 .deferred_free = srcu_torture_deferred_free,
677 .sync = srcu_torture_synchronize, 610 .sync = srcu_torture_synchronize,
611 .exp_sync = srcu_torture_synchronize_expedited,
678 .call = srcu_torture_call, 612 .call = srcu_torture_call,
679 .cb_barrier = srcu_torture_barrier, 613 .cb_barrier = srcu_torture_barrier,
680 .stats = srcu_torture_stats, 614 .stats = srcu_torture_stats,
681 .name = "srcu" 615 .name = "srcu"
682}; 616};
683 617
684static struct rcu_torture_ops srcu_sync_ops = {
685 .init = rcu_sync_torture_init,
686 .readlock = srcu_torture_read_lock,
687 .read_delay = srcu_read_delay,
688 .readunlock = srcu_torture_read_unlock,
689 .completed = srcu_torture_completed,
690 .deferred_free = rcu_sync_torture_deferred_free,
691 .sync = srcu_torture_synchronize,
692 .call = NULL,
693 .cb_barrier = NULL,
694 .stats = srcu_torture_stats,
695 .name = "srcu_sync"
696};
697
698static void srcu_torture_synchronize_expedited(void)
699{
700 synchronize_srcu_expedited(&srcu_ctl);
701}
702
703static struct rcu_torture_ops srcu_expedited_ops = {
704 .init = rcu_sync_torture_init,
705 .readlock = srcu_torture_read_lock,
706 .read_delay = srcu_read_delay,
707 .readunlock = srcu_torture_read_unlock,
708 .completed = srcu_torture_completed,
709 .deferred_free = rcu_sync_torture_deferred_free,
710 .sync = srcu_torture_synchronize_expedited,
711 .call = NULL,
712 .cb_barrier = NULL,
713 .stats = srcu_torture_stats,
714 .name = "srcu_expedited"
715};
716
717/* 618/*
718 * Definitions for sched torture testing. 619 * Definitions for sched torture testing.
719 */ 620 */
@@ -742,6 +643,8 @@ static struct rcu_torture_ops sched_ops = {
742 .completed = rcu_no_completed, 643 .completed = rcu_no_completed,
743 .deferred_free = rcu_sched_torture_deferred_free, 644 .deferred_free = rcu_sched_torture_deferred_free,
744 .sync = synchronize_sched, 645 .sync = synchronize_sched,
646 .exp_sync = synchronize_sched_expedited,
647 .call = call_rcu_sched,
745 .cb_barrier = rcu_barrier_sched, 648 .cb_barrier = rcu_barrier_sched,
746 .fqs = rcu_sched_force_quiescent_state, 649 .fqs = rcu_sched_force_quiescent_state,
747 .stats = NULL, 650 .stats = NULL,
@@ -749,35 +652,6 @@ static struct rcu_torture_ops sched_ops = {
749 .name = "sched" 652 .name = "sched"
750}; 653};
751 654
752static struct rcu_torture_ops sched_sync_ops = {
753 .init = rcu_sync_torture_init,
754 .readlock = sched_torture_read_lock,
755 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
756 .readunlock = sched_torture_read_unlock,
757 .completed = rcu_no_completed,
758 .deferred_free = rcu_sync_torture_deferred_free,
759 .sync = synchronize_sched,
760 .cb_barrier = NULL,
761 .fqs = rcu_sched_force_quiescent_state,
762 .stats = NULL,
763 .name = "sched_sync"
764};
765
766static struct rcu_torture_ops sched_expedited_ops = {
767 .init = rcu_sync_torture_init,
768 .readlock = sched_torture_read_lock,
769 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
770 .readunlock = sched_torture_read_unlock,
771 .completed = rcu_no_completed,
772 .deferred_free = rcu_sync_torture_deferred_free,
773 .sync = synchronize_sched_expedited,
774 .cb_barrier = NULL,
775 .fqs = rcu_sched_force_quiescent_state,
776 .stats = NULL,
777 .irq_capable = 1,
778 .name = "sched_expedited"
779};
780
781/* 655/*
782 * RCU torture priority-boost testing. Runs one real-time thread per 656 * RCU torture priority-boost testing. Runs one real-time thread per
783 * CPU for moderate bursts, repeatedly registering RCU callbacks and 657 * CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -927,9 +801,10 @@ rcu_torture_fqs(void *arg)
927static int 801static int
928rcu_torture_writer(void *arg) 802rcu_torture_writer(void *arg)
929{ 803{
804 bool exp;
930 int i; 805 int i;
931 long oldbatch = rcu_batches_completed();
932 struct rcu_torture *rp; 806 struct rcu_torture *rp;
807 struct rcu_torture *rp1;
933 struct rcu_torture *old_rp; 808 struct rcu_torture *old_rp;
934 static DEFINE_RCU_RANDOM(rand); 809 static DEFINE_RCU_RANDOM(rand);
935 810
@@ -954,10 +829,33 @@ rcu_torture_writer(void *arg)
954 i = RCU_TORTURE_PIPE_LEN; 829 i = RCU_TORTURE_PIPE_LEN;
955 atomic_inc(&rcu_torture_wcount[i]); 830 atomic_inc(&rcu_torture_wcount[i]);
956 old_rp->rtort_pipe_count++; 831 old_rp->rtort_pipe_count++;
957 cur_ops->deferred_free(old_rp); 832 if (gp_normal == gp_exp)
833 exp = !!(rcu_random(&rand) & 0x80);
834 else
835 exp = gp_exp;
836 if (!exp) {
837 cur_ops->deferred_free(old_rp);
838 } else {
839 cur_ops->exp_sync();
840 list_add(&old_rp->rtort_free,
841 &rcu_torture_removed);
842 list_for_each_entry_safe(rp, rp1,
843 &rcu_torture_removed,
844 rtort_free) {
845 i = rp->rtort_pipe_count;
846 if (i > RCU_TORTURE_PIPE_LEN)
847 i = RCU_TORTURE_PIPE_LEN;
848 atomic_inc(&rcu_torture_wcount[i]);
849 if (++rp->rtort_pipe_count >=
850 RCU_TORTURE_PIPE_LEN) {
851 rp->rtort_mbtest = 0;
852 list_del(&rp->rtort_free);
853 rcu_torture_free(rp);
854 }
855 }
856 }
958 } 857 }
959 rcutorture_record_progress(++rcu_torture_current_version); 858 rcutorture_record_progress(++rcu_torture_current_version);
960 oldbatch = cur_ops->completed();
961 rcu_stutter_wait("rcu_torture_writer"); 859 rcu_stutter_wait("rcu_torture_writer");
962 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 860 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
963 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 861 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
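
In the writer loop above, gp_normal and gp_exp act as a three-way switch: when exactly one of them is set, that grace-period style is forced; when both or neither are set, each iteration picks normal vs. expedited at random. A stripped-down sketch of just that decision (hypothetical helper, not code from the patch):

#include <linux/types.h>

/* Mirror of the expedited-vs-normal choice made per writer iteration. */
static bool choose_expedited(bool want_normal, bool want_exp, unsigned long rnd)
{
        if (want_normal == want_exp)    /* both set or both clear: randomize */
                return !!(rnd & 0x80);
        return want_exp;                /* exactly one set: honor the request */
}

The expedited path has no callback to lean on, so the writer waits synchronously via exp_sync() and then drains rcu_torture_removed itself, which is why the list walk moved from the old rcu_sync_torture_deferred_free() into rcu_torture_writer().
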
@@ -983,10 +881,18 @@ rcu_torture_fakewriter(void *arg)
983 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 881 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
984 udelay(rcu_random(&rand) & 0x3ff); 882 udelay(rcu_random(&rand) & 0x3ff);
985 if (cur_ops->cb_barrier != NULL && 883 if (cur_ops->cb_barrier != NULL &&
986 rcu_random(&rand) % (nfakewriters * 8) == 0) 884 rcu_random(&rand) % (nfakewriters * 8) == 0) {
987 cur_ops->cb_barrier(); 885 cur_ops->cb_barrier();
988 else 886 } else if (gp_normal == gp_exp) {
887 if (rcu_random(&rand) & 0x80)
888 cur_ops->sync();
889 else
890 cur_ops->exp_sync();
891 } else if (gp_normal) {
989 cur_ops->sync(); 892 cur_ops->sync();
893 } else {
894 cur_ops->exp_sync();
895 }
990 rcu_stutter_wait("rcu_torture_fakewriter"); 896 rcu_stutter_wait("rcu_torture_fakewriter");
991 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 897 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
992 898
@@ -1364,7 +1270,7 @@ rcu_torture_stutter(void *arg)
1364} 1270}
1365 1271
1366static inline void 1272static inline void
1367rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) 1273rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
1368{ 1274{
1369 pr_alert("%s" TORTURE_FLAG 1275 pr_alert("%s" TORTURE_FLAG
1370 "--- %s: nreaders=%d nfakewriters=%d " 1276 "--- %s: nreaders=%d nfakewriters=%d "
@@ -1534,7 +1440,13 @@ rcu_torture_onoff(void *arg)
1534 torture_type, cpu); 1440 torture_type, cpu);
1535 starttime = jiffies; 1441 starttime = jiffies;
1536 n_online_attempts++; 1442 n_online_attempts++;
1537 if (cpu_up(cpu) == 0) { 1443 ret = cpu_up(cpu);
1444 if (ret) {
1445 if (verbose)
1446 pr_alert("%s" TORTURE_FLAG
1447 "rcu_torture_onoff task: online %d failed: errno %d\n",
1448 torture_type, cpu, ret);
1449 } else {
1538 if (verbose) 1450 if (verbose)
1539 pr_alert("%s" TORTURE_FLAG 1451 pr_alert("%s" TORTURE_FLAG
1540 "rcu_torture_onoff task: onlined %d\n", 1452 "rcu_torture_onoff task: onlined %d\n",
@@ -1934,6 +1846,62 @@ rcu_torture_cleanup(void)
1934 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1846 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1935} 1847}
1936 1848
1849#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1850static void rcu_torture_leak_cb(struct rcu_head *rhp)
1851{
1852}
1853
1854static void rcu_torture_err_cb(struct rcu_head *rhp)
1855{
1856 /*
1857 * This -might- happen due to race conditions, but is unlikely.
1858 * The scenario that leads to this happening is that the
1859 * first of the pair of duplicate callbacks is queued,
1860 * someone else starts a grace period that includes that
1861 * callback, then the second of the pair must wait for the
1862 * next grace period. Unlikely, but can happen. If it
1863 * does happen, the debug-objects subsystem won't have splatted.
1864 */
1865 pr_alert("rcutorture: duplicated callback was invoked.\n");
1866}
1867#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1868
1869/*
1870 * Verify that double-free causes debug-objects to complain, but only
1871 * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test
1872 * cannot be carried out.
1873 */
1874static void rcu_test_debug_objects(void)
1875{
1876#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
1877 struct rcu_head rh1;
1878 struct rcu_head rh2;
1879
1880 init_rcu_head_on_stack(&rh1);
1881 init_rcu_head_on_stack(&rh2);
1882 pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n");
1883
1884 /* Try to queue the rh2 pair of callbacks for the same grace period. */
1885 preempt_disable(); /* Prevent preemption from interrupting test. */
1886 rcu_read_lock(); /* Make it impossible to finish a grace period. */
1887 call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */
1888 local_irq_disable(); /* Make it harder to start a new grace period. */
1889 call_rcu(&rh2, rcu_torture_leak_cb);
1890 call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
1891 local_irq_enable();
1892 rcu_read_unlock();
1893 preempt_enable();
1894
1895 /* Wait for them all to get done so we can safely return. */
1896 rcu_barrier();
1897 pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n");
1898 destroy_rcu_head_on_stack(&rh1);
1899 destroy_rcu_head_on_stack(&rh2);
1900#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1901 pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n");
1902#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
1903}
1904
1937static int __init 1905static int __init
1938rcu_torture_init(void) 1906rcu_torture_init(void)
1939{ 1907{
@@ -1941,11 +1909,9 @@ rcu_torture_init(void)
1941 int cpu; 1909 int cpu;
1942 int firsterr = 0; 1910 int firsterr = 0;
1943 int retval; 1911 int retval;
1944 static struct rcu_torture_ops *torture_ops[] = 1912 static struct rcu_torture_ops *torture_ops[] = {
1945 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1913 &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
1946 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1914 };
1947 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
1948 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1949 1915
1950 mutex_lock(&fullstop_mutex); 1916 mutex_lock(&fullstop_mutex);
1951 1917
@@ -2163,6 +2129,8 @@ rcu_torture_init(void)
2163 firsterr = retval; 2129 firsterr = retval;
2164 goto unwind; 2130 goto unwind;
2165 } 2131 }
2132 if (object_debug)
2133 rcu_test_debug_objects();
2166 rcutorture_record_test_transition(); 2134 rcutorture_record_test_transition();
2167 mutex_unlock(&fullstop_mutex); 2135 mutex_unlock(&fullstop_mutex);
2168 return 0; 2136 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcu/tree.c
index 068de3a93606..4c06ddfea7cd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcu/tree.c
@@ -41,6 +41,7 @@
41#include <linux/export.h> 41#include <linux/export.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/module.h>
44#include <linux/percpu.h> 45#include <linux/percpu.h>
45#include <linux/notifier.h> 46#include <linux/notifier.h>
46#include <linux/cpu.h> 47#include <linux/cpu.h>
@@ -53,18 +54,37 @@
53#include <linux/delay.h> 54#include <linux/delay.h>
54#include <linux/stop_machine.h> 55#include <linux/stop_machine.h>
55#include <linux/random.h> 56#include <linux/random.h>
57#include <linux/ftrace_event.h>
58#include <linux/suspend.h>
56 59
57#include "rcutree.h" 60#include "tree.h"
58#include <trace/events/rcu.h> 61#include <trace/events/rcu.h>
59 62
60#include "rcu.h" 63#include "rcu.h"
61 64
65MODULE_ALIAS("rcutree");
66#ifdef MODULE_PARAM_PREFIX
67#undef MODULE_PARAM_PREFIX
68#endif
69#define MODULE_PARAM_PREFIX "rcutree."
70
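
The MODULE_ALIAS()/MODULE_PARAM_PREFIX lines above keep the user-visible names stable across the file move: parameters defined in kernel/rcu/tree.c are still accepted as rcutree.<name> on the boot command line rather than picking up a new prefix from the renamed object file. A hedged sketch of the same idiom (placeholder parameter, not one of tree.c's):

#include <linux/module.h>
#include <linux/moduleparam.h>

#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcutree."          /* boot args stay rcutree.* */

static long example_limit = 10;                 /* hypothetical knob */
module_param(example_limit, long, 0444);        /* i.e. rcutree.example_limit=N */
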
62/* Data structures. */ 71/* Data structures. */
63 72
64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 73static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 74static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
66 75
67#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ 76/*
77 * In order to export the rcu_state name to the tracing tools, it
78 * needs to be added to the __tracepoint_string section.
79 * This requires defining a separate variable tp_<sname>_varname
80 * that points to the string being used, which allows the userspace
81 * tracing tools to map the string's address back to the matching
82 * string.
83 */
84#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
85static char sname##_varname[] = #sname; \
86static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \
87struct rcu_state sname##_state = { \
68 .level = { &sname##_state.node[0] }, \ 88 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 89 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 90 .fqs_state = RCU_GP_IDLE, \
@@ -75,16 +95,13 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 95 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 96 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 97 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
78 .name = #sname, \ 98 .name = sname##_varname, \
79 .abbr = sabbr, \ 99 .abbr = sabbr, \
80} 100}; \
81 101DEFINE_PER_CPU(struct rcu_data, sname##_data)
82struct rcu_state rcu_sched_state =
83 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
84DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
85 102
86struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 103RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
87DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 104RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
88 105
89static struct rcu_state *rcu_state; 106static struct rcu_state *rcu_state;
90LIST_HEAD(rcu_struct_flavors); 107LIST_HEAD(rcu_struct_flavors);
@@ -178,7 +195,7 @@ void rcu_sched_qs(int cpu)
178 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 195 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
179 196
180 if (rdp->passed_quiesce == 0) 197 if (rdp->passed_quiesce == 0)
181 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); 198 trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs"));
182 rdp->passed_quiesce = 1; 199 rdp->passed_quiesce = 1;
183} 200}
184 201
@@ -187,7 +204,7 @@ void rcu_bh_qs(int cpu)
187 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 204 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
188 205
189 if (rdp->passed_quiesce == 0) 206 if (rdp->passed_quiesce == 0)
190 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); 207 trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs"));
191 rdp->passed_quiesce = 1; 208 rdp->passed_quiesce = 1;
192} 209}
193 210
@@ -198,16 +215,20 @@ void rcu_bh_qs(int cpu)
198 */ 215 */
199void rcu_note_context_switch(int cpu) 216void rcu_note_context_switch(int cpu)
200{ 217{
201 trace_rcu_utilization("Start context switch"); 218 trace_rcu_utilization(TPS("Start context switch"));
202 rcu_sched_qs(cpu); 219 rcu_sched_qs(cpu);
203 rcu_preempt_note_context_switch(cpu); 220 rcu_preempt_note_context_switch(cpu);
204 trace_rcu_utilization("End context switch"); 221 trace_rcu_utilization(TPS("End context switch"));
205} 222}
206EXPORT_SYMBOL_GPL(rcu_note_context_switch); 223EXPORT_SYMBOL_GPL(rcu_note_context_switch);
207 224
208DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 225static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
209 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 226 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
210 .dynticks = ATOMIC_INIT(1), 227 .dynticks = ATOMIC_INIT(1),
228#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
229 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
230 .dynticks_idle = ATOMIC_INIT(1),
231#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
211}; 232};
212 233
213static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 234static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
@@ -226,7 +247,10 @@ module_param(jiffies_till_next_fqs, ulong, 0644);
226 247
227static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 248static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
228 struct rcu_data *rdp); 249 struct rcu_data *rdp);
229static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); 250static void force_qs_rnp(struct rcu_state *rsp,
251 int (*f)(struct rcu_data *rsp, bool *isidle,
252 unsigned long *maxj),
253 bool *isidle, unsigned long *maxj);
230static void force_quiescent_state(struct rcu_state *rsp); 254static void force_quiescent_state(struct rcu_state *rsp);
231static int rcu_pending(int cpu); 255static int rcu_pending(int cpu);
232 256
@@ -345,11 +369,12 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
345static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 369static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
346 bool user) 370 bool user)
347{ 371{
348 trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); 372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
349 if (!user && !is_idle_task(current)) { 373 if (!user && !is_idle_task(current)) {
350 struct task_struct *idle = idle_task(smp_processor_id()); 374 struct task_struct *idle __maybe_unused =
375 idle_task(smp_processor_id());
351 376
352 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); 377 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
353 ftrace_dump(DUMP_ORIG); 378 ftrace_dump(DUMP_ORIG);
354 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 379 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
355 current->pid, current->comm, 380 current->pid, current->comm,
@@ -383,7 +408,7 @@ static void rcu_eqs_enter(bool user)
383 long long oldval; 408 long long oldval;
384 struct rcu_dynticks *rdtp; 409 struct rcu_dynticks *rdtp;
385 410
386 rdtp = &__get_cpu_var(rcu_dynticks); 411 rdtp = this_cpu_ptr(&rcu_dynticks);
387 oldval = rdtp->dynticks_nesting; 412 oldval = rdtp->dynticks_nesting;
388 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 413 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
389 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 414 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
@@ -411,6 +436,7 @@ void rcu_idle_enter(void)
411 436
412 local_irq_save(flags); 437 local_irq_save(flags);
413 rcu_eqs_enter(false); 438 rcu_eqs_enter(false);
439 rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);
414 local_irq_restore(flags); 440 local_irq_restore(flags);
415} 441}
416EXPORT_SYMBOL_GPL(rcu_idle_enter); 442EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -428,27 +454,6 @@ void rcu_user_enter(void)
428{ 454{
429 rcu_eqs_enter(1); 455 rcu_eqs_enter(1);
430} 456}
431
432/**
433 * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace
434 * after the current irq returns.
435 *
436 * This is similar to rcu_user_enter() but in the context of a non-nesting
437 * irq. After this call, RCU enters into idle mode when the interrupt
438 * returns.
439 */
440void rcu_user_enter_after_irq(void)
441{
442 unsigned long flags;
443 struct rcu_dynticks *rdtp;
444
445 local_irq_save(flags);
446 rdtp = &__get_cpu_var(rcu_dynticks);
447 /* Ensure this irq is interrupting a non-idle RCU state. */
448 WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK));
449 rdtp->dynticks_nesting = 1;
450 local_irq_restore(flags);
451}
452#endif /* CONFIG_RCU_USER_QS */ 457#endif /* CONFIG_RCU_USER_QS */
453 458
454/** 459/**
@@ -474,14 +479,15 @@ void rcu_irq_exit(void)
474 struct rcu_dynticks *rdtp; 479 struct rcu_dynticks *rdtp;
475 480
476 local_irq_save(flags); 481 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks); 482 rdtp = this_cpu_ptr(&rcu_dynticks);
478 oldval = rdtp->dynticks_nesting; 483 oldval = rdtp->dynticks_nesting;
479 rdtp->dynticks_nesting--; 484 rdtp->dynticks_nesting--;
480 WARN_ON_ONCE(rdtp->dynticks_nesting < 0); 485 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
481 if (rdtp->dynticks_nesting) 486 if (rdtp->dynticks_nesting)
482 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); 487 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
483 else 488 else
484 rcu_eqs_enter_common(rdtp, oldval, true); 489 rcu_eqs_enter_common(rdtp, oldval, true);
490 rcu_sysidle_enter(rdtp, 1);
485 local_irq_restore(flags); 491 local_irq_restore(flags);
486} 492}
487 493
@@ -501,11 +507,12 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
501 smp_mb__after_atomic_inc(); /* See above. */ 507 smp_mb__after_atomic_inc(); /* See above. */
502 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 508 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
503 rcu_cleanup_after_idle(smp_processor_id()); 509 rcu_cleanup_after_idle(smp_processor_id());
504 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); 510 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
505 if (!user && !is_idle_task(current)) { 511 if (!user && !is_idle_task(current)) {
506 struct task_struct *idle = idle_task(smp_processor_id()); 512 struct task_struct *idle __maybe_unused =
513 idle_task(smp_processor_id());
507 514
508 trace_rcu_dyntick("Error on exit: not idle task", 515 trace_rcu_dyntick(TPS("Error on exit: not idle task"),
509 oldval, rdtp->dynticks_nesting); 516 oldval, rdtp->dynticks_nesting);
510 ftrace_dump(DUMP_ORIG); 517 ftrace_dump(DUMP_ORIG);
511 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 518 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -523,7 +530,7 @@ static void rcu_eqs_exit(bool user)
523 struct rcu_dynticks *rdtp; 530 struct rcu_dynticks *rdtp;
524 long long oldval; 531 long long oldval;
525 532
526 rdtp = &__get_cpu_var(rcu_dynticks); 533 rdtp = this_cpu_ptr(&rcu_dynticks);
527 oldval = rdtp->dynticks_nesting; 534 oldval = rdtp->dynticks_nesting;
528 WARN_ON_ONCE(oldval < 0); 535 WARN_ON_ONCE(oldval < 0);
529 if (oldval & DYNTICK_TASK_NEST_MASK) 536 if (oldval & DYNTICK_TASK_NEST_MASK)
@@ -550,6 +557,7 @@ void rcu_idle_exit(void)
550 557
551 local_irq_save(flags); 558 local_irq_save(flags);
552 rcu_eqs_exit(false); 559 rcu_eqs_exit(false);
560 rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);
553 local_irq_restore(flags); 561 local_irq_restore(flags);
554} 562}
555EXPORT_SYMBOL_GPL(rcu_idle_exit); 563EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -565,28 +573,6 @@ void rcu_user_exit(void)
565{ 573{
566 rcu_eqs_exit(1); 574 rcu_eqs_exit(1);
567} 575}
568
569/**
570 * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
571 * idle mode after the current non-nesting irq returns.
572 *
573 * This is similar to rcu_user_exit() but in the context of an irq.
574 * This is called when the irq has interrupted a userspace RCU idle mode
575 * context. When the current non-nesting interrupt returns after this call,
576 * the CPU won't restore the RCU idle mode.
577 */
578void rcu_user_exit_after_irq(void)
579{
580 unsigned long flags;
581 struct rcu_dynticks *rdtp;
582
583 local_irq_save(flags);
584 rdtp = &__get_cpu_var(rcu_dynticks);
585 /* Ensure we are interrupting an RCU idle mode. */
586 WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
587 rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
588 local_irq_restore(flags);
589}
590#endif /* CONFIG_RCU_USER_QS */ 576#endif /* CONFIG_RCU_USER_QS */
591 577
592/** 578/**
@@ -615,14 +601,15 @@ void rcu_irq_enter(void)
615 long long oldval; 601 long long oldval;
616 602
617 local_irq_save(flags); 603 local_irq_save(flags);
618 rdtp = &__get_cpu_var(rcu_dynticks); 604 rdtp = this_cpu_ptr(&rcu_dynticks);
619 oldval = rdtp->dynticks_nesting; 605 oldval = rdtp->dynticks_nesting;
620 rdtp->dynticks_nesting++; 606 rdtp->dynticks_nesting++;
621 WARN_ON_ONCE(rdtp->dynticks_nesting == 0); 607 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
622 if (oldval) 608 if (oldval)
623 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); 609 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
624 else 610 else
625 rcu_eqs_exit_common(rdtp, oldval, true); 611 rcu_eqs_exit_common(rdtp, oldval, true);
612 rcu_sysidle_exit(rdtp, 1);
626 local_irq_restore(flags); 613 local_irq_restore(flags);
627} 614}
628 615
@@ -635,7 +622,7 @@ void rcu_irq_enter(void)
635 */ 622 */
636void rcu_nmi_enter(void) 623void rcu_nmi_enter(void)
637{ 624{
638 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 625 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
639 626
640 if (rdtp->dynticks_nmi_nesting == 0 && 627 if (rdtp->dynticks_nmi_nesting == 0 &&
641 (atomic_read(&rdtp->dynticks) & 0x1)) 628 (atomic_read(&rdtp->dynticks) & 0x1))
@@ -657,7 +644,7 @@ void rcu_nmi_enter(void)
657 */ 644 */
658void rcu_nmi_exit(void) 645void rcu_nmi_exit(void)
659{ 646{
660 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 647 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
661 648
662 if (rdtp->dynticks_nmi_nesting == 0 || 649 if (rdtp->dynticks_nmi_nesting == 0 ||
663 --rdtp->dynticks_nmi_nesting != 0) 650 --rdtp->dynticks_nmi_nesting != 0)
@@ -670,21 +657,34 @@ void rcu_nmi_exit(void)
670} 657}
671 658
672/** 659/**
673 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle 660 * __rcu_is_watching - are RCU read-side critical sections safe?
661 *
662 * Return true if RCU is watching the running CPU, which means that
663 * this CPU can safely enter RCU read-side critical sections. Unlike
664 * rcu_is_watching(), the caller of __rcu_is_watching() must have at
665 * least disabled preemption.
666 */
667bool __rcu_is_watching(void)
668{
669 return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
670}
671
672/**
673 * rcu_is_watching - see if RCU thinks that the current CPU is idle
674 * 674 *
675 * If the current CPU is in its idle loop and is neither in an interrupt 675 * If the current CPU is in its idle loop and is neither in an interrupt
676 * or NMI handler, return true. 676 * or NMI handler, return true.
677 */ 677 */
678int rcu_is_cpu_idle(void) 678bool rcu_is_watching(void)
679{ 679{
680 int ret; 680 int ret;
681 681
682 preempt_disable(); 682 preempt_disable();
683 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; 683 ret = __rcu_is_watching();
684 preempt_enable(); 684 preempt_enable();
685 return ret; 685 return ret;
686} 686}
687EXPORT_SYMBOL(rcu_is_cpu_idle); 687EXPORT_SYMBOL_GPL(rcu_is_watching);
688 688
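
Put briefly, __rcu_is_watching() is the raw check for callers that already have preemption disabled, while rcu_is_watching() wraps it with preempt_disable()/preempt_enable() for general use. A hedged usage sketch (hypothetical caller, not part of the patch) of guarding an RCU read-side critical section from a context that might be running with RCU idle:

#include <linux/rcupdate.h>

/* Skip RCU-protected work when this CPU's RCU is not watching. */
static void maybe_walk_protected_list(void)
{
        if (!rcu_is_watching())
                return;         /* e.g. deep in the idle loop: readers unsafe */
        rcu_read_lock();
        /* ... dereference RCU-protected pointers here ... */
        rcu_read_unlock();
}
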
689#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 689#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
690 690
@@ -718,7 +718,7 @@ bool rcu_lockdep_current_cpu_online(void)
718 if (in_nmi()) 718 if (in_nmi())
719 return 1; 719 return 1;
720 preempt_disable(); 720 preempt_disable();
721 rdp = &__get_cpu_var(rcu_sched_data); 721 rdp = this_cpu_ptr(&rcu_sched_data);
722 rnp = rdp->mynode; 722 rnp = rdp->mynode;
723 ret = (rdp->grpmask & rnp->qsmaskinit) || 723 ret = (rdp->grpmask & rnp->qsmaskinit) ||
724 !rcu_scheduler_fully_active; 724 !rcu_scheduler_fully_active;
@@ -738,7 +738,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
738 */ 738 */
739static int rcu_is_cpu_rrupt_from_idle(void) 739static int rcu_is_cpu_rrupt_from_idle(void)
740{ 740{
741 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 741 return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
742} 742}
743 743
744/* 744/*
@@ -746,9 +746,11 @@ static int rcu_is_cpu_rrupt_from_idle(void)
746 * credit them with an implicit quiescent state. Return 1 if this CPU 746 * credit them with an implicit quiescent state. Return 1 if this CPU
747 * is in dynticks idle mode, which is an extended quiescent state. 747 * is in dynticks idle mode, which is an extended quiescent state.
748 */ 748 */
749static int dyntick_save_progress_counter(struct rcu_data *rdp) 749static int dyntick_save_progress_counter(struct rcu_data *rdp,
750 bool *isidle, unsigned long *maxj)
750{ 751{
751 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 752 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
753 rcu_sysidle_check_cpu(rdp, isidle, maxj);
752 return (rdp->dynticks_snap & 0x1) == 0; 754 return (rdp->dynticks_snap & 0x1) == 0;
753} 755}
754 756
@@ -758,7 +760,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
758 * idle state since the last call to dyntick_save_progress_counter() 760 * idle state since the last call to dyntick_save_progress_counter()
759 * for this same CPU, or by virtue of having been offline. 761 * for this same CPU, or by virtue of having been offline.
760 */ 762 */
761static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 763static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
764 bool *isidle, unsigned long *maxj)
762{ 765{
763 unsigned int curr; 766 unsigned int curr;
764 unsigned int snap; 767 unsigned int snap;
@@ -775,7 +778,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
775 * of the current RCU grace period. 778 * of the current RCU grace period.
776 */ 779 */
777 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { 780 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
778 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); 781 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
779 rdp->dynticks_fqs++; 782 rdp->dynticks_fqs++;
780 return 1; 783 return 1;
781 } 784 }
@@ -795,7 +798,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
795 return 0; /* Grace period is not old enough. */ 798 return 0; /* Grace period is not old enough. */
796 barrier(); 799 barrier();
797 if (cpu_is_offline(rdp->cpu)) { 800 if (cpu_is_offline(rdp->cpu)) {
798 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); 801 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
799 rdp->offline_fqs++; 802 rdp->offline_fqs++;
800 return 1; 803 return 1;
801 } 804 }
@@ -814,8 +817,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
814 817
815static void record_gp_stall_check_time(struct rcu_state *rsp) 818static void record_gp_stall_check_time(struct rcu_state *rsp)
816{ 819{
817 rsp->gp_start = jiffies; 820 unsigned long j = ACCESS_ONCE(jiffies);
818 rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 821
822 rsp->gp_start = j;
823 smp_wmb(); /* Record start time before stall time. */
824 rsp->jiffies_stall = j + rcu_jiffies_till_stall_check();
819} 825}
820 826
821/* 827/*
@@ -910,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
910 force_quiescent_state(rsp); /* Kick them all. */ 916 force_quiescent_state(rsp); /* Kick them all. */
911} 917}
912 918
919/*
920 * This function really isn't for public consumption, but RCU is special in
921 * that context switches can allow the state machine to make progress.
922 */
923extern void resched_cpu(int cpu);
924
913static void print_cpu_stall(struct rcu_state *rsp) 925static void print_cpu_stall(struct rcu_state *rsp)
914{ 926{
915 int cpu; 927 int cpu;
@@ -939,22 +951,60 @@ static void print_cpu_stall(struct rcu_state *rsp)
939 3 * rcu_jiffies_till_stall_check() + 3; 951 3 * rcu_jiffies_till_stall_check() + 3;
940 raw_spin_unlock_irqrestore(&rnp->lock, flags); 952 raw_spin_unlock_irqrestore(&rnp->lock, flags);
941 953
942 set_need_resched(); /* kick ourselves to get things going. */ 954 /*
955 * Attempt to revive the RCU machinery by forcing a context switch.
956 *
957 * A context switch would normally allow the RCU state machine to make
958 * progress, and it could be that we are stuck in kernel space without
959 * context switches for an entirely unreasonable amount of time.
960 */
961 resched_cpu(smp_processor_id());
943} 962}
944 963
945static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 964static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
946{ 965{
966 unsigned long completed;
967 unsigned long gpnum;
968 unsigned long gps;
947 unsigned long j; 969 unsigned long j;
948 unsigned long js; 970 unsigned long js;
949 struct rcu_node *rnp; 971 struct rcu_node *rnp;
950 972
951 if (rcu_cpu_stall_suppress) 973 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
952 return; 974 return;
953 j = ACCESS_ONCE(jiffies); 975 j = ACCESS_ONCE(jiffies);
976
977 /*
978 * Lots of memory barriers to reject false positives.
979 *
980 * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall,
981 * then rsp->gp_start, and finally rsp->completed. These values
982 * are updated in the opposite order with memory barriers (or
983 * equivalent) during grace-period initialization and cleanup.
984 * Now, a false positive can occur if we get a new value of
985 * rsp->gp_start and an old value of rsp->jiffies_stall. But given
986 * the memory barriers, the only way that this can happen is if one
987 * grace period ends and another starts between these two fetches.
988 * Detect this by comparing rsp->completed with the previous fetch
989 * from rsp->gpnum.
990 *
991 * Given this check, comparisons of jiffies, rsp->jiffies_stall,
992 * and rsp->gp_start suffice to forestall false positives.
993 */
994 gpnum = ACCESS_ONCE(rsp->gpnum);
995 smp_rmb(); /* Pick up ->gpnum first... */
954 js = ACCESS_ONCE(rsp->jiffies_stall); 996 js = ACCESS_ONCE(rsp->jiffies_stall);
997 smp_rmb(); /* ...then ->jiffies_stall before the rest... */
998 gps = ACCESS_ONCE(rsp->gp_start);
999 smp_rmb(); /* ...and finally ->gp_start before ->completed. */
1000 completed = ACCESS_ONCE(rsp->completed);
1001 if (ULONG_CMP_GE(completed, gpnum) ||
1002 ULONG_CMP_LT(j, js) ||
1003 ULONG_CMP_GE(gps, js))
1004 return; /* No stall or GP completed since entering function. */
955 rnp = rdp->mynode; 1005 rnp = rdp->mynode;
956 if (rcu_gp_in_progress(rsp) && 1006 if (rcu_gp_in_progress(rsp) &&
957 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { 1007 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
958 1008
959 /* We haven't checked in, so go dump stack. */ 1009 /* We haven't checked in, so go dump stack. */
960 print_cpu_stall(rsp); 1010 print_cpu_stall(rsp);
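
The ordering argument in the comment above is the usual publish/consume pairing: record_gp_stall_check_time() stores ->gp_start, issues smp_wmb(), then stores ->jiffies_stall, while the stall check reads the fields in the reverse order with smp_rmb()s in between, so a mismatched pair can only mean that one grace period ended and another began between the fetches. Reduced to generic variables (a sketch in kernel context, not the RCU fields themselves; ACCESS_ONCE and the smp_*mb() barriers come from the usual headers), the pattern is:

static int a, b;

static void publisher(void)
{
        ACCESS_ONCE(a) = 1;
        smp_wmb();      /* order the store to a before the store to b */
        ACCESS_ONCE(b) = 1;
}

static void consumer(void)
{
        int rb, ra;

        rb = ACCESS_ONCE(b);
        smp_rmb();      /* order the load of b before the load of a */
        ra = ACCESS_ONCE(a);
        /* If rb is seen as 1 here, ra must also be seen as 1. */
}

The same reasoning is why rcu_gp_init() above now calls record_gp_stall_check_time() and does an smp_wmb() before bumping ->gpnum.
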
@@ -1032,7 +1082,7 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1032 * rcu_nocb_wait_gp(). 1082 * rcu_nocb_wait_gp().
1033 */ 1083 */
1034static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, 1084static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1035 unsigned long c, char *s) 1085 unsigned long c, const char *s)
1036{ 1086{
1037 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, 1087 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
1038 rnp->completed, c, rnp->level, 1088 rnp->completed, c, rnp->level,
@@ -1058,9 +1108,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1058 * grace period is already marked as needed, return to the caller. 1108 * grace period is already marked as needed, return to the caller.
1059 */ 1109 */
1060 c = rcu_cbs_completed(rdp->rsp, rnp); 1110 c = rcu_cbs_completed(rdp->rsp, rnp);
1061 trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); 1111 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
1062 if (rnp->need_future_gp[c & 0x1]) { 1112 if (rnp->need_future_gp[c & 0x1]) {
1063 trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); 1113 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
1064 return c; 1114 return c;
1065 } 1115 }
1066 1116
@@ -1074,7 +1124,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1074 if (rnp->gpnum != rnp->completed || 1124 if (rnp->gpnum != rnp->completed ||
1075 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { 1125 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1076 rnp->need_future_gp[c & 0x1]++; 1126 rnp->need_future_gp[c & 0x1]++;
1077 trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); 1127 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
1078 return c; 1128 return c;
1079 } 1129 }
1080 1130
@@ -1102,7 +1152,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1102 * recorded, trace and leave. 1152 * recorded, trace and leave.
1103 */ 1153 */
1104 if (rnp_root->need_future_gp[c & 0x1]) { 1154 if (rnp_root->need_future_gp[c & 0x1]) {
1105 trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); 1155 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
1106 goto unlock_out; 1156 goto unlock_out;
1107 } 1157 }
1108 1158
@@ -1111,9 +1161,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1111 1161
1112 /* If a grace period is not already in progress, start one. */ 1162 /* If a grace period is not already in progress, start one. */
1113 if (rnp_root->gpnum != rnp_root->completed) { 1163 if (rnp_root->gpnum != rnp_root->completed) {
1114 trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); 1164 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
1115 } else { 1165 } else {
1116 trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); 1166 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
1117 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); 1167 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1118 } 1168 }
1119unlock_out: 1169unlock_out:
@@ -1137,7 +1187,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1137 rcu_nocb_gp_cleanup(rsp, rnp); 1187 rcu_nocb_gp_cleanup(rsp, rnp);
1138 rnp->need_future_gp[c & 0x1] = 0; 1188 rnp->need_future_gp[c & 0x1] = 0;
1139 needmore = rnp->need_future_gp[(c + 1) & 0x1]; 1189 needmore = rnp->need_future_gp[(c + 1) & 0x1];
1140 trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); 1190 trace_rcu_future_gp(rnp, rdp, c,
1191 needmore ? TPS("CleanupMore") : TPS("Cleanup"));
1141 return needmore; 1192 return needmore;
1142} 1193}
1143 1194
@@ -1205,9 +1256,9 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1205 1256
1206 /* Trace depending on how much we were able to accelerate. */ 1257 /* Trace depending on how much we were able to accelerate. */
1207 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1258 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1208 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); 1259 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
1209 else 1260 else
1210 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); 1261 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
1211} 1262}
1212 1263
1213/* 1264/*
@@ -1273,7 +1324,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1273 1324
1274 /* Remember that we saw this grace-period completion. */ 1325 /* Remember that we saw this grace-period completion. */
1275 rdp->completed = rnp->completed; 1326 rdp->completed = rnp->completed;
1276 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); 1327 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
1277 } 1328 }
1278 1329
1279 if (rdp->gpnum != rnp->gpnum) { 1330 if (rdp->gpnum != rnp->gpnum) {
@@ -1283,7 +1334,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1283 * go looking for one. 1334 * go looking for one.
1284 */ 1335 */
1285 rdp->gpnum = rnp->gpnum; 1336 rdp->gpnum = rnp->gpnum;
1286 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); 1337 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1287 rdp->passed_quiesce = 0; 1338 rdp->passed_quiesce = 0;
1288 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1339 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1289 zero_cpu_stall_ticks(rdp); 1340 zero_cpu_stall_ticks(rdp);
@@ -1308,26 +1359,36 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1308} 1359}
1309 1360
1310/* 1361/*
1311 * Initialize a new grace period. 1362 * Initialize a new grace period. Return 0 if no grace period required.
1312 */ 1363 */
1313static int rcu_gp_init(struct rcu_state *rsp) 1364static int rcu_gp_init(struct rcu_state *rsp)
1314{ 1365{
1315 struct rcu_data *rdp; 1366 struct rcu_data *rdp;
1316 struct rcu_node *rnp = rcu_get_root(rsp); 1367 struct rcu_node *rnp = rcu_get_root(rsp);
1317 1368
1369 rcu_bind_gp_kthread();
1318 raw_spin_lock_irq(&rnp->lock); 1370 raw_spin_lock_irq(&rnp->lock);
1371 if (rsp->gp_flags == 0) {
1372 /* Spurious wakeup, tell caller to go back to sleep. */
1373 raw_spin_unlock_irq(&rnp->lock);
1374 return 0;
1375 }
1319 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1376 rsp->gp_flags = 0; /* Clear all flags: New grace period. */
1320 1377
1321 if (rcu_gp_in_progress(rsp)) { 1378 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
1322 /* Grace period already in progress, don't start another. */ 1379 /*
1380 * Grace period already in progress, don't start another.
1381 * Not supposed to be able to happen.
1382 */
1323 raw_spin_unlock_irq(&rnp->lock); 1383 raw_spin_unlock_irq(&rnp->lock);
1324 return 0; 1384 return 0;
1325 } 1385 }
1326 1386
1327 /* Advance to a new grace period and initialize state. */ 1387 /* Advance to a new grace period and initialize state. */
1328 rsp->gpnum++;
1329 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
1330 record_gp_stall_check_time(rsp); 1388 record_gp_stall_check_time(rsp);
1389 smp_wmb(); /* Record GP times before starting GP. */
1390 rsp->gpnum++;
1391 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1331 raw_spin_unlock_irq(&rnp->lock); 1392 raw_spin_unlock_irq(&rnp->lock);
1332 1393
1333 /* Exclude any concurrent CPU-hotplug operations. */ 1394 /* Exclude any concurrent CPU-hotplug operations. */
@@ -1376,19 +1437,28 @@ static int rcu_gp_init(struct rcu_state *rsp)
1376/* 1437/*
1377 * Do one round of quiescent-state forcing. 1438 * Do one round of quiescent-state forcing.
1378 */ 1439 */
1379int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1440static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1380{ 1441{
1381 int fqs_state = fqs_state_in; 1442 int fqs_state = fqs_state_in;
1443 bool isidle = false;
1444 unsigned long maxj;
1382 struct rcu_node *rnp = rcu_get_root(rsp); 1445 struct rcu_node *rnp = rcu_get_root(rsp);
1383 1446
1384 rsp->n_force_qs++; 1447 rsp->n_force_qs++;
1385 if (fqs_state == RCU_SAVE_DYNTICK) { 1448 if (fqs_state == RCU_SAVE_DYNTICK) {
1386 /* Collect dyntick-idle snapshots. */ 1449 /* Collect dyntick-idle snapshots. */
1387 force_qs_rnp(rsp, dyntick_save_progress_counter); 1450 if (is_sysidle_rcu_state(rsp)) {
1451 isidle = 1;
1452 maxj = jiffies - ULONG_MAX / 4;
1453 }
1454 force_qs_rnp(rsp, dyntick_save_progress_counter,
1455 &isidle, &maxj);
1456 rcu_sysidle_report_gp(rsp, isidle, maxj);
1388 fqs_state = RCU_FORCE_QS; 1457 fqs_state = RCU_FORCE_QS;
1389 } else { 1458 } else {
1390 /* Handle dyntick-idle and offline CPUs. */ 1459 /* Handle dyntick-idle and offline CPUs. */
1391 force_qs_rnp(rsp, rcu_implicit_dynticks_qs); 1460 isidle = 0;
1461 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
1392 } 1462 }
1393 /* Clear flag to prevent immediate re-entry. */ 1463 /* Clear flag to prevent immediate re-entry. */
1394 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1464 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
@@ -1448,12 +1518,16 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1448 rcu_nocb_gp_set(rnp, nocb); 1518 rcu_nocb_gp_set(rnp, nocb);
1449 1519
1450 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1520 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1451 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1521 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
1452 rsp->fqs_state = RCU_GP_IDLE; 1522 rsp->fqs_state = RCU_GP_IDLE;
1453 rdp = this_cpu_ptr(rsp->rda); 1523 rdp = this_cpu_ptr(rsp->rda);
1454 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ 1524 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
1455 if (cpu_needs_another_gp(rsp, rdp)) 1525 if (cpu_needs_another_gp(rsp, rdp)) {
1456 rsp->gp_flags = 1; 1526 rsp->gp_flags = RCU_GP_FLAG_INIT;
1527 trace_rcu_grace_period(rsp->name,
1528 ACCESS_ONCE(rsp->gpnum),
1529 TPS("newreq"));
1530 }
1457 raw_spin_unlock_irq(&rnp->lock); 1531 raw_spin_unlock_irq(&rnp->lock);
1458} 1532}
1459 1533
@@ -1463,6 +1537,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1463static int __noreturn rcu_gp_kthread(void *arg) 1537static int __noreturn rcu_gp_kthread(void *arg)
1464{ 1538{
1465 int fqs_state; 1539 int fqs_state;
1540 int gf;
1466 unsigned long j; 1541 unsigned long j;
1467 int ret; 1542 int ret;
1468 struct rcu_state *rsp = arg; 1543 struct rcu_state *rsp = arg;
@@ -1472,14 +1547,19 @@ static int __noreturn rcu_gp_kthread(void *arg)
1472 1547
1473 /* Handle grace-period start. */ 1548 /* Handle grace-period start. */
1474 for (;;) { 1549 for (;;) {
1550 trace_rcu_grace_period(rsp->name,
1551 ACCESS_ONCE(rsp->gpnum),
1552 TPS("reqwait"));
1475 wait_event_interruptible(rsp->gp_wq, 1553 wait_event_interruptible(rsp->gp_wq,
1476 rsp->gp_flags & 1554 ACCESS_ONCE(rsp->gp_flags) &
1477 RCU_GP_FLAG_INIT); 1555 RCU_GP_FLAG_INIT);
1478 if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && 1556 if (rcu_gp_init(rsp))
1479 rcu_gp_init(rsp))
1480 break; 1557 break;
1481 cond_resched(); 1558 cond_resched();
1482 flush_signals(current); 1559 flush_signals(current);
1560 trace_rcu_grace_period(rsp->name,
1561 ACCESS_ONCE(rsp->gpnum),
1562 TPS("reqwaitsig"));
1483 } 1563 }
1484 1564
1485 /* Handle quiescent-state forcing. */ 1565 /* Handle quiescent-state forcing. */
@@ -1489,10 +1569,16 @@ static int __noreturn rcu_gp_kthread(void *arg)
1489 j = HZ; 1569 j = HZ;
1490 jiffies_till_first_fqs = HZ; 1570 jiffies_till_first_fqs = HZ;
1491 } 1571 }
1572 ret = 0;
1492 for (;;) { 1573 for (;;) {
1493 rsp->jiffies_force_qs = jiffies + j; 1574 if (!ret)
1575 rsp->jiffies_force_qs = jiffies + j;
1576 trace_rcu_grace_period(rsp->name,
1577 ACCESS_ONCE(rsp->gpnum),
1578 TPS("fqswait"));
1494 ret = wait_event_interruptible_timeout(rsp->gp_wq, 1579 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1495 (rsp->gp_flags & RCU_GP_FLAG_FQS) || 1580 ((gf = ACCESS_ONCE(rsp->gp_flags)) &
1581 RCU_GP_FLAG_FQS) ||
1496 (!ACCESS_ONCE(rnp->qsmask) && 1582 (!ACCESS_ONCE(rnp->qsmask) &&
1497 !rcu_preempt_blocked_readers_cgp(rnp)), 1583 !rcu_preempt_blocked_readers_cgp(rnp)),
1498 j); 1584 j);
@@ -1501,13 +1587,23 @@ static int __noreturn rcu_gp_kthread(void *arg)
1501 !rcu_preempt_blocked_readers_cgp(rnp)) 1587 !rcu_preempt_blocked_readers_cgp(rnp))
1502 break; 1588 break;
1503 /* If time for quiescent-state forcing, do it. */ 1589 /* If time for quiescent-state forcing, do it. */
1504 if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { 1590 if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
1591 (gf & RCU_GP_FLAG_FQS)) {
1592 trace_rcu_grace_period(rsp->name,
1593 ACCESS_ONCE(rsp->gpnum),
1594 TPS("fqsstart"));
1505 fqs_state = rcu_gp_fqs(rsp, fqs_state); 1595 fqs_state = rcu_gp_fqs(rsp, fqs_state);
1596 trace_rcu_grace_period(rsp->name,
1597 ACCESS_ONCE(rsp->gpnum),
1598 TPS("fqsend"));
1506 cond_resched(); 1599 cond_resched();
1507 } else { 1600 } else {
1508 /* Deal with stray signal. */ 1601 /* Deal with stray signal. */
1509 cond_resched(); 1602 cond_resched();
1510 flush_signals(current); 1603 flush_signals(current);
1604 trace_rcu_grace_period(rsp->name,
1605 ACCESS_ONCE(rsp->gpnum),
1606 TPS("fqswaitsig"));
1511 } 1607 }
1512 j = jiffies_till_next_fqs; 1608 j = jiffies_till_next_fqs;
1513 if (j > HZ) { 1609 if (j > HZ) {
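
The rewritten test above decides to force quiescent states with ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) rather than relying on ret == 0, so the comparison must stay correct when jiffies wraps. Below is a small standalone C check of why a wraparound-safe comparison is needed; the macro body follows the form used in the RCU headers (an assumption worth verifying there), and the deadline/now values are arbitrary.

	#include <limits.h>
	#include <stdio.h>

	/*
	 * Wraparound-safe "a is at or after b" for free-running unsigned
	 * counters such as jiffies (same shape as ULONG_CMP_GE() above).
	 */
	#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

	int main(void)
	{
		unsigned long deadline = ULONG_MAX - 10;	/* just before wrap */
		unsigned long now = 5;				/* just after wrap */

		/* The naive comparison claims the deadline is far in the future... */
		printf("naive now >= deadline: %d\n", now >= deadline);
		/* ...while the wraparound-safe form sees that it has passed. */
		printf("safe  now >= deadline: %d\n", (int)ULONG_CMP_GE(now, deadline));
		return 0;
	}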
@@ -1555,13 +1651,17 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1555 return; 1651 return;
1556 } 1652 }
1557 rsp->gp_flags = RCU_GP_FLAG_INIT; 1653 rsp->gp_flags = RCU_GP_FLAG_INIT;
1654 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
1655 TPS("newreq"));
1558 1656
1559 /* 1657 /*
1560 * We can't do wakeups while holding the rnp->lock, as that 1658 * We can't do wakeups while holding the rnp->lock, as that
1561 * could cause possible deadlocks with the rq->lock. Deter 1659 * could cause possible deadlocks with the rq->lock. Defer
1562 * the wakeup to interrupt context. 1660 * the wakeup to interrupt context. And don't bother waking
1661 * up the running kthread.
1563 */ 1662 */
1564 irq_work_queue(&rsp->wakeup_work); 1663 if (current != rsp->gp_kthread)
1664 irq_work_queue(&rsp->wakeup_work);
1565} 1665}
1566 1666
1567/* 1667/*
@@ -1857,7 +1957,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1857 RCU_TRACE(mask = rdp->grpmask); 1957 RCU_TRACE(mask = rdp->grpmask);
1858 trace_rcu_grace_period(rsp->name, 1958 trace_rcu_grace_period(rsp->name,
1859 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 1959 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1860 "cpuofl"); 1960 TPS("cpuofl"));
1861} 1961}
1862 1962
1863/* 1963/*
@@ -2044,7 +2144,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2044 */ 2144 */
2045void rcu_check_callbacks(int cpu, int user) 2145void rcu_check_callbacks(int cpu, int user)
2046{ 2146{
2047 trace_rcu_utilization("Start scheduler-tick"); 2147 trace_rcu_utilization(TPS("Start scheduler-tick"));
2048 increment_cpu_stall_ticks(); 2148 increment_cpu_stall_ticks();
2049 if (user || rcu_is_cpu_rrupt_from_idle()) { 2149 if (user || rcu_is_cpu_rrupt_from_idle()) {
2050 2150
@@ -2077,7 +2177,7 @@ void rcu_check_callbacks(int cpu, int user)
2077 rcu_preempt_check_callbacks(cpu); 2177 rcu_preempt_check_callbacks(cpu);
2078 if (rcu_pending(cpu)) 2178 if (rcu_pending(cpu))
2079 invoke_rcu_core(); 2179 invoke_rcu_core();
2080 trace_rcu_utilization("End scheduler-tick"); 2180 trace_rcu_utilization(TPS("End scheduler-tick"));
2081} 2181}
2082 2182
2083/* 2183/*
@@ -2087,7 +2187,10 @@ void rcu_check_callbacks(int cpu, int user)
2087 * 2187 *
2088 * The caller must have suppressed start of new grace periods. 2188 * The caller must have suppressed start of new grace periods.
2089 */ 2189 */
2090static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) 2190static void force_qs_rnp(struct rcu_state *rsp,
2191 int (*f)(struct rcu_data *rsp, bool *isidle,
2192 unsigned long *maxj),
2193 bool *isidle, unsigned long *maxj)
2091{ 2194{
2092 unsigned long bit; 2195 unsigned long bit;
2093 int cpu; 2196 int cpu;
@@ -2110,9 +2213,12 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
2110 cpu = rnp->grplo; 2213 cpu = rnp->grplo;
2111 bit = 1; 2214 bit = 1;
2112 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2215 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2113 if ((rnp->qsmask & bit) != 0 && 2216 if ((rnp->qsmask & bit) != 0) {
2114 f(per_cpu_ptr(rsp->rda, cpu))) 2217 if ((rnp->qsmaskinit & bit) != 0)
2115 mask |= bit; 2218 *isidle = 0;
2219 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
2220 mask |= bit;
2221 }
2116 } 2222 }
2117 if (mask != 0) { 2223 if (mask != 0) {
2118 2224
@@ -2208,10 +2314,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
2208 2314
2209 if (cpu_is_offline(smp_processor_id())) 2315 if (cpu_is_offline(smp_processor_id()))
2210 return; 2316 return;
2211 trace_rcu_utilization("Start RCU core"); 2317 trace_rcu_utilization(TPS("Start RCU core"));
2212 for_each_rcu_flavor(rsp) 2318 for_each_rcu_flavor(rsp)
2213 __rcu_process_callbacks(rsp); 2319 __rcu_process_callbacks(rsp);
2214 trace_rcu_utilization("End RCU core"); 2320 trace_rcu_utilization(TPS("End RCU core"));
2215} 2321}
2216 2322
2217/* 2323/*
@@ -2248,7 +2354,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2248 * If called from an extended quiescent state, invoke the RCU 2354 * If called from an extended quiescent state, invoke the RCU
2249 * core in order to force a re-evaluation of RCU's idleness. 2355 * core in order to force a re-evaluation of RCU's idleness.
2250 */ 2356 */
2251 if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) 2357 if (!rcu_is_watching() && cpu_online(smp_processor_id()))
2252 invoke_rcu_core(); 2358 invoke_rcu_core();
2253 2359
2254 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 2360 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2287,6 +2393,13 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2287} 2393}
2288 2394
2289/* 2395/*
2396 * RCU callback function to leak a callback.
2397 */
2398static void rcu_leak_callback(struct rcu_head *rhp)
2399{
2400}
2401
2402/*
2290 * Helper function for call_rcu() and friends. The cpu argument will 2403 * Helper function for call_rcu() and friends. The cpu argument will
2291 * normally be -1, indicating "currently running CPU". It may specify 2404 * normally be -1, indicating "currently running CPU". It may specify
2292 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() 2405 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
@@ -2300,7 +2413,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2300 struct rcu_data *rdp; 2413 struct rcu_data *rdp;
2301 2414
2302 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ 2415 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
2303 debug_rcu_head_queue(head); 2416 if (debug_rcu_head_queue(head)) {
2417 /* Probable double call_rcu(), so leak the callback. */
2418 ACCESS_ONCE(head->func) = rcu_leak_callback;
2419 WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n");
2420 return;
2421 }
2304 head->func = func; 2422 head->func = func;
2305 head->next = NULL; 2423 head->next = NULL;
2306 2424
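
The new rcu_leak_callback()/debug_rcu_head_queue() handling above neutralizes a probable double call_rcu() instead of corrupting the callback list. Here is a userspace sketch of the same defensive pattern, with a hypothetical cb_node type and enqueue() helper standing in for struct rcu_head and __call_rcu(); the duplicate submission keeps its slot, but its function is swapped for a no-op, so it is deliberately "leaked".

	#include <stdbool.h>
	#include <stdio.h>

	/* Hypothetical callback node, standing in for struct rcu_head. */
	struct cb_node {
		struct cb_node *next;
		void (*func)(struct cb_node *);
		bool queued;		/* debug state, like debug_rcu_head_queue() */
	};

	static void leak_callback(struct cb_node *n) { /* deliberately does nothing */ }
	static void say_hello(struct cb_node *n) { printf("callback ran\n"); }

	static struct cb_node *head;

	static void enqueue(struct cb_node *n, void (*func)(struct cb_node *))
	{
		if (n->queued) {
			/* Probable double enqueue: neutralize rather than corrupt the list. */
			n->func = leak_callback;
			fprintf(stderr, "enqueue(): leaked duplicate callback\n");
			return;
		}
		n->queued = true;
		n->func = func;
		n->next = head;
		head = n;
	}

	int main(void)
	{
		static struct cb_node n;

		enqueue(&n, say_hello);
		enqueue(&n, say_hello);	/* second call is detected; nothing prints */
		for (struct cb_node *p = head; p; p = p->next)
			p->func(p);
		return 0;
	}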
@@ -2706,10 +2824,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2706 2824
2707 for_each_rcu_flavor(rsp) { 2825 for_each_rcu_flavor(rsp) {
2708 rdp = per_cpu_ptr(rsp->rda, cpu); 2826 rdp = per_cpu_ptr(rsp->rda, cpu);
2709 if (rdp->qlen != rdp->qlen_lazy) 2827 if (!rdp->nxtlist)
2828 continue;
2829 hc = true;
2830 if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {
2710 al = false; 2831 al = false;
2711 if (rdp->nxtlist) 2832 break;
2712 hc = true; 2833 }
2713 } 2834 }
2714 if (all_lazy) 2835 if (all_lazy)
2715 *all_lazy = al; 2836 *all_lazy = al;
@@ -2720,7 +2841,7 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2720 * Helper function for _rcu_barrier() tracing. If tracing is disabled, 2841 * Helper function for _rcu_barrier() tracing. If tracing is disabled,
2721 * the compiler is expected to optimize this away. 2842 * the compiler is expected to optimize this away.
2722 */ 2843 */
2723static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, 2844static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s,
2724 int cpu, unsigned long done) 2845 int cpu, unsigned long done)
2725{ 2846{
2726 trace_rcu_barrier(rsp->name, s, cpu, 2847 trace_rcu_barrier(rsp->name, s, cpu,
@@ -2785,9 +2906,20 @@ static void _rcu_barrier(struct rcu_state *rsp)
2785 * transition. The "if" expression below therefore rounds the old 2906 * transition. The "if" expression below therefore rounds the old
2786 * value up to the next even number and adds two before comparing. 2907 * value up to the next even number and adds two before comparing.
2787 */ 2908 */
2788 snap_done = ACCESS_ONCE(rsp->n_barrier_done); 2909 snap_done = rsp->n_barrier_done;
2789 _rcu_barrier_trace(rsp, "Check", -1, snap_done); 2910 _rcu_barrier_trace(rsp, "Check", -1, snap_done);
2790 if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { 2911
2912 /*
2913 * If the value in snap is odd, we needed to wait for the current
2914 * rcu_barrier() to complete, then wait for the next one, in other
2915 * words, we need the value of snap_done to be three larger than
2916 * the value of snap. On the other hand, if the value in snap is
2917 * even, we only had to wait for the next rcu_barrier() to complete,
2918 * in other words, we need the value of snap_done to be only two
2919 * greater than the value of snap. The "(snap + 3) & ~0x1" computes
2920 * this for us (thank you, Linus!).
2921 */
2922 if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
2791 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); 2923 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
2792 smp_mb(); /* caller's subsequent code after above check. */ 2924 smp_mb(); /* caller's subsequent code after above check. */
2793 mutex_unlock(&rsp->barrier_mutex); 2925 mutex_unlock(&rsp->barrier_mutex);
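
The new comment above hinges on the arithmetic of (snap + 3) & ~0x1: an even snap needs snap_done to advance by two, an odd snap by three. A quick standalone check with arbitrary example values:

	#include <stdio.h>

	int main(void)
	{
		/* even: no barrier in flight when sampled; odd: one in flight */
		unsigned long snaps[] = { 4, 5 };

		for (int i = 0; i < 2; i++) {
			unsigned long snap = snaps[i];
			unsigned long need = (snap + 3) & ~0x1UL;

			printf("snap=%lu (%s) -> wait until n_barrier_done >= %lu (%lu ahead)\n",
			       snap, snap & 1 ? "odd" : "even", need, need - snap);
		}
		return 0;
	}

The older expression ((snap + 1) & ~0x1) + 2 yields the same two results; the replacement is simply the shorter form the comment credits to Linus.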
@@ -2930,6 +3062,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2930 rdp->blimit = blimit; 3062 rdp->blimit = blimit;
2931 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 3063 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
2932 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3064 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
3065 rcu_sysidle_init_percpu_data(rdp->dynticks);
2933 atomic_set(&rdp->dynticks->dynticks, 3066 atomic_set(&rdp->dynticks->dynticks,
2934 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 3067 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2935 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 3068 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
@@ -2952,7 +3085,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2952 rdp->completed = rnp->completed; 3085 rdp->completed = rnp->completed;
2953 rdp->passed_quiesce = 0; 3086 rdp->passed_quiesce = 0;
2954 rdp->qs_pending = 0; 3087 rdp->qs_pending = 0;
2955 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); 3088 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
2956 } 3089 }
2957 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 3090 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
2958 rnp = rnp->parent; 3091 rnp = rnp->parent;
@@ -2982,7 +3115,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
2982 struct rcu_node *rnp = rdp->mynode; 3115 struct rcu_node *rnp = rdp->mynode;
2983 struct rcu_state *rsp; 3116 struct rcu_state *rsp;
2984 3117
2985 trace_rcu_utilization("Start CPU hotplug"); 3118 trace_rcu_utilization(TPS("Start CPU hotplug"));
2986 switch (action) { 3119 switch (action) {
2987 case CPU_UP_PREPARE: 3120 case CPU_UP_PREPARE:
2988 case CPU_UP_PREPARE_FROZEN: 3121 case CPU_UP_PREPARE_FROZEN:
@@ -3011,7 +3144,26 @@ static int rcu_cpu_notify(struct notifier_block *self,
3011 default: 3144 default:
3012 break; 3145 break;
3013 } 3146 }
3014 trace_rcu_utilization("End CPU hotplug"); 3147 trace_rcu_utilization(TPS("End CPU hotplug"));
3148 return NOTIFY_OK;
3149}
3150
3151static int rcu_pm_notify(struct notifier_block *self,
3152 unsigned long action, void *hcpu)
3153{
3154 switch (action) {
3155 case PM_HIBERNATION_PREPARE:
3156 case PM_SUSPEND_PREPARE:
3157 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
3158 rcu_expedited = 1;
3159 break;
3160 case PM_POST_HIBERNATION:
3161 case PM_POST_SUSPEND:
3162 rcu_expedited = 0;
3163 break;
3164 default:
3165 break;
3166 }
3015 return NOTIFY_OK; 3167 return NOTIFY_OK;
3016} 3168}
3017 3169
@@ -3166,7 +3318,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3166 3318
3167/* 3319/*
3168 * Compute the rcu_node tree geometry from kernel parameters. This cannot 3320 * Compute the rcu_node tree geometry from kernel parameters. This cannot
3169 * replace the definitions in rcutree.h because those are needed to size 3321 * replace the definitions in tree.h because those are needed to size
3170 * the ->node array in the rcu_state structure. 3322 * the ->node array in the rcu_state structure.
3171 */ 3323 */
3172static void __init rcu_init_geometry(void) 3324static void __init rcu_init_geometry(void)
@@ -3245,8 +3397,8 @@ void __init rcu_init(void)
3245 3397
3246 rcu_bootup_announce(); 3398 rcu_bootup_announce();
3247 rcu_init_geometry(); 3399 rcu_init_geometry();
3248 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3249 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3400 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3401 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3250 __rcu_init_preempt(); 3402 __rcu_init_preempt();
3251 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 3403 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3252 3404
@@ -3256,8 +3408,9 @@ void __init rcu_init(void)
3256 * or the scheduler are operational. 3408 * or the scheduler are operational.
3257 */ 3409 */
3258 cpu_notifier(rcu_cpu_notify, 0); 3410 cpu_notifier(rcu_cpu_notify, 0);
3411 pm_notifier(rcu_pm_notify, 0);
3259 for_each_online_cpu(cpu) 3412 for_each_online_cpu(cpu)
3260 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3413 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3261} 3414}
3262 3415
3263#include "rcutree_plugin.h" 3416#include "tree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcu/tree.h
index b3832581043c..52be957c9fe2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcu/tree.h
@@ -88,6 +88,14 @@ struct rcu_dynticks {
88 /* Process level is worth LLONG_MAX/2. */ 88 /* Process level is worth LLONG_MAX/2. */
89 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */ 90 atomic_t dynticks; /* Even value for idle, else odd. */
91#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
92 long long dynticks_idle_nesting;
93 /* irq/process nesting level from idle. */
94 atomic_t dynticks_idle; /* Even value for idle, else odd. */
95 /* "Idle" excludes userspace execution. */
96 unsigned long dynticks_idle_jiffies;
97 /* End of last non-NMI non-idle period. */
98#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
91#ifdef CONFIG_RCU_FAST_NO_HZ 99#ifdef CONFIG_RCU_FAST_NO_HZ
92 bool all_lazy; /* Are all CPU's CBs lazy? */ 100 bool all_lazy; /* Are all CPU's CBs lazy? */
93 unsigned long nonlazy_posted; 101 unsigned long nonlazy_posted;
@@ -96,6 +104,8 @@ struct rcu_dynticks {
96 /* idle-period nonlazy_posted snapshot. */ 104 /* idle-period nonlazy_posted snapshot. */
97 unsigned long last_accelerate; 105 unsigned long last_accelerate;
98 /* Last jiffy CBs were accelerated. */ 106 /* Last jiffy CBs were accelerated. */
107 unsigned long last_advance_all;
108 /* Last jiffy CBs were all advanced. */
99 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 109 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
100#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 110#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
101}; 111};
@@ -445,7 +455,7 @@ struct rcu_state {
445 /* for CPU stalls. */ 455 /* for CPU stalls. */
446 unsigned long gp_max; /* Maximum GP duration in */ 456 unsigned long gp_max; /* Maximum GP duration in */
447 /* jiffies. */ 457 /* jiffies. */
448 char *name; /* Name of structure. */ 458 const char *name; /* Name of structure. */
449 char abbr; /* Abbreviated name. */ 459 char abbr; /* Abbreviated name. */
450 struct list_head flavors; /* List of RCU flavors. */ 460 struct list_head flavors; /* List of RCU flavors. */
451 struct irq_work wakeup_work; /* Postponed wakeups */ 461 struct irq_work wakeup_work; /* Postponed wakeups */
@@ -545,6 +555,15 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
545static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 555static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
546static void rcu_kick_nohz_cpu(int cpu); 556static void rcu_kick_nohz_cpu(int cpu);
547static bool init_nocb_callback_list(struct rcu_data *rdp); 557static bool init_nocb_callback_list(struct rcu_data *rdp);
558static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
559static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
560static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
561 unsigned long *maxj);
562static bool is_sysidle_rcu_state(struct rcu_state *rsp);
563static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
564 unsigned long maxj);
565static void rcu_bind_gp_kthread(void);
566static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
548 567
549#endif /* #ifndef RCU_TREE_NONCORE */ 568#endif /* #ifndef RCU_TREE_NONCORE */
550 569
diff --git a/kernel/rcutree_plugin.h b/kernel/rcu/tree_plugin.h
index 769e12e3151b..3822ac0c4b27 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -28,7 +28,7 @@
28#include <linux/gfp.h> 28#include <linux/gfp.h>
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include <linux/tick.h> 31#include "../time/tick-internal.h"
32 32
33#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
34 34
@@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void)
96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ 96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
97#ifdef CONFIG_RCU_NOCB_CPU_ALL 97#ifdef CONFIG_RCU_NOCB_CPU_ALL
98 pr_info("\tOffload RCU callbacks from all CPUs\n"); 98 pr_info("\tOffload RCU callbacks from all CPUs\n");
99 cpumask_setall(rcu_nocb_mask); 99 cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ 100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ 101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
102 if (have_rcu_nocb_mask) { 102 if (have_rcu_nocb_mask) {
103 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
104 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
105 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
106 rcu_nocb_mask);
107 }
103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 108 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
104 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); 109 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
105 if (rcu_nocb_poll) 110 if (rcu_nocb_poll)
@@ -110,9 +115,7 @@ static void __init rcu_bootup_announce_oddness(void)
110 115
111#ifdef CONFIG_TREE_PREEMPT_RCU 116#ifdef CONFIG_TREE_PREEMPT_RCU
112 117
113struct rcu_state rcu_preempt_state = 118RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
114 RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
115DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
116static struct rcu_state *rcu_state = &rcu_preempt_state; 119static struct rcu_state *rcu_state = &rcu_preempt_state;
117 120
118static int rcu_preempted_readers_exp(struct rcu_node *rnp); 121static int rcu_preempted_readers_exp(struct rcu_node *rnp);
@@ -169,7 +172,7 @@ static void rcu_preempt_qs(int cpu)
169 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 172 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
170 173
171 if (rdp->passed_quiesce == 0) 174 if (rdp->passed_quiesce == 0)
172 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); 175 trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
173 rdp->passed_quiesce = 1; 176 rdp->passed_quiesce = 1;
174 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 177 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
175} 178}
@@ -388,7 +391,7 @@ void rcu_read_unlock_special(struct task_struct *t)
388 np = rcu_next_node_entry(t, rnp); 391 np = rcu_next_node_entry(t, rnp);
389 list_del_init(&t->rcu_node_entry); 392 list_del_init(&t->rcu_node_entry);
390 t->rcu_blocked_node = NULL; 393 t->rcu_blocked_node = NULL;
391 trace_rcu_unlock_preempted_task("rcu_preempt", 394 trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
392 rnp->gpnum, t->pid); 395 rnp->gpnum, t->pid);
393 if (&t->rcu_node_entry == rnp->gp_tasks) 396 if (&t->rcu_node_entry == rnp->gp_tasks)
394 rnp->gp_tasks = np; 397 rnp->gp_tasks = np;
@@ -412,7 +415,7 @@ void rcu_read_unlock_special(struct task_struct *t)
412 */ 415 */
413 empty_exp_now = !rcu_preempted_readers_exp(rnp); 416 empty_exp_now = !rcu_preempted_readers_exp(rnp);
414 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 417 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
415 trace_rcu_quiescent_state_report("preempt_rcu", 418 trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
416 rnp->gpnum, 419 rnp->gpnum,
417 0, rnp->qsmask, 420 0, rnp->qsmask,
418 rnp->level, 421 rnp->level,
@@ -662,7 +665,7 @@ static void rcu_preempt_check_callbacks(int cpu)
662 665
663static void rcu_preempt_do_callbacks(void) 666static void rcu_preempt_do_callbacks(void)
664{ 667{
665 rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); 668 rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
666} 669}
667 670
668#endif /* #ifdef CONFIG_RCU_BOOST */ 671#endif /* #ifdef CONFIG_RCU_BOOST */
@@ -1130,7 +1133,7 @@ void exit_rcu(void)
1130 1133
1131#ifdef CONFIG_RCU_BOOST 1134#ifdef CONFIG_RCU_BOOST
1132 1135
1133#include "rtmutex_common.h" 1136#include "../rtmutex_common.h"
1134 1137
1135#ifdef CONFIG_RCU_TRACE 1138#ifdef CONFIG_RCU_TRACE
1136 1139
@@ -1250,12 +1253,12 @@ static int rcu_boost_kthread(void *arg)
1250 int spincnt = 0; 1253 int spincnt = 0;
1251 int more2boost; 1254 int more2boost;
1252 1255
1253 trace_rcu_utilization("Start boost kthread@init"); 1256 trace_rcu_utilization(TPS("Start boost kthread@init"));
1254 for (;;) { 1257 for (;;) {
1255 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1258 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1256 trace_rcu_utilization("End boost kthread@rcu_wait"); 1259 trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
1257 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1260 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1258 trace_rcu_utilization("Start boost kthread@rcu_wait"); 1261 trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
1259 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1262 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1260 more2boost = rcu_boost(rnp); 1263 more2boost = rcu_boost(rnp);
1261 if (more2boost) 1264 if (more2boost)
@@ -1264,14 +1267,14 @@ static int rcu_boost_kthread(void *arg)
1264 spincnt = 0; 1267 spincnt = 0;
1265 if (spincnt > 10) { 1268 if (spincnt > 10) {
1266 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; 1269 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
1267 trace_rcu_utilization("End boost kthread@rcu_yield"); 1270 trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
1268 schedule_timeout_interruptible(2); 1271 schedule_timeout_interruptible(2);
1269 trace_rcu_utilization("Start boost kthread@rcu_yield"); 1272 trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
1270 spincnt = 0; 1273 spincnt = 0;
1271 } 1274 }
1272 } 1275 }
1273 /* NOTREACHED */ 1276 /* NOTREACHED */
1274 trace_rcu_utilization("End boost kthread@notreached"); 1277 trace_rcu_utilization(TPS("End boost kthread@notreached"));
1275 return 0; 1278 return 0;
1276} 1279}
1277 1280
@@ -1334,7 +1337,7 @@ static void invoke_rcu_callbacks_kthread(void)
1334 */ 1337 */
1335static bool rcu_is_callbacks_kthread(void) 1338static bool rcu_is_callbacks_kthread(void)
1336{ 1339{
1337 return __get_cpu_var(rcu_cpu_kthread_task) == current; 1340 return __this_cpu_read(rcu_cpu_kthread_task) == current;
1338} 1341}
1339 1342
1340#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
@@ -1384,8 +1387,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1384 1387
1385static void rcu_kthread_do_work(void) 1388static void rcu_kthread_do_work(void)
1386{ 1389{
1387 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); 1390 rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
1388 rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1391 rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
1389 rcu_preempt_do_callbacks(); 1392 rcu_preempt_do_callbacks();
1390} 1393}
1391 1394
@@ -1404,7 +1407,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu)
1404 1407
1405static int rcu_cpu_kthread_should_run(unsigned int cpu) 1408static int rcu_cpu_kthread_should_run(unsigned int cpu)
1406{ 1409{
1407 return __get_cpu_var(rcu_cpu_has_work); 1410 return __this_cpu_read(rcu_cpu_has_work);
1408} 1411}
1409 1412
1410/* 1413/*
@@ -1414,12 +1417,12 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu)
1414 */ 1417 */
1415static void rcu_cpu_kthread(unsigned int cpu) 1418static void rcu_cpu_kthread(unsigned int cpu)
1416{ 1419{
1417 unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); 1420 unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
1418 char work, *workp = &__get_cpu_var(rcu_cpu_has_work); 1421 char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
1419 int spincnt; 1422 int spincnt;
1420 1423
1421 for (spincnt = 0; spincnt < 10; spincnt++) { 1424 for (spincnt = 0; spincnt < 10; spincnt++) {
1422 trace_rcu_utilization("Start CPU kthread@rcu_wait"); 1425 trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
1423 local_bh_disable(); 1426 local_bh_disable();
1424 *statusp = RCU_KTHREAD_RUNNING; 1427 *statusp = RCU_KTHREAD_RUNNING;
1425 this_cpu_inc(rcu_cpu_kthread_loops); 1428 this_cpu_inc(rcu_cpu_kthread_loops);
@@ -1431,15 +1434,15 @@ static void rcu_cpu_kthread(unsigned int cpu)
1431 rcu_kthread_do_work(); 1434 rcu_kthread_do_work();
1432 local_bh_enable(); 1435 local_bh_enable();
1433 if (*workp == 0) { 1436 if (*workp == 0) {
1434 trace_rcu_utilization("End CPU kthread@rcu_wait"); 1437 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
1435 *statusp = RCU_KTHREAD_WAITING; 1438 *statusp = RCU_KTHREAD_WAITING;
1436 return; 1439 return;
1437 } 1440 }
1438 } 1441 }
1439 *statusp = RCU_KTHREAD_YIELDING; 1442 *statusp = RCU_KTHREAD_YIELDING;
1440 trace_rcu_utilization("Start CPU kthread@rcu_yield"); 1443 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
1441 schedule_timeout_interruptible(2); 1444 schedule_timeout_interruptible(2);
1442 trace_rcu_utilization("End CPU kthread@rcu_yield"); 1445 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
1443 *statusp = RCU_KTHREAD_WAITING; 1446 *statusp = RCU_KTHREAD_WAITING;
1444} 1447}
1445 1448
@@ -1632,17 +1635,23 @@ module_param(rcu_idle_lazy_gp_delay, int, 0644);
1632extern int tick_nohz_enabled; 1635extern int tick_nohz_enabled;
1633 1636
1634/* 1637/*
1635 * Try to advance callbacks for all flavors of RCU on the current CPU. 1638 * Try to advance callbacks for all flavors of RCU on the current CPU, but
1636 * Afterwards, if there are any callbacks ready for immediate invocation, 1639 * only if it has been awhile since the last time we did so. Afterwards,
1637 * return true. 1640 * if there are any callbacks ready for immediate invocation, return true.
1638 */ 1641 */
1639static bool rcu_try_advance_all_cbs(void) 1642static bool rcu_try_advance_all_cbs(void)
1640{ 1643{
1641 bool cbs_ready = false; 1644 bool cbs_ready = false;
1642 struct rcu_data *rdp; 1645 struct rcu_data *rdp;
1646 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1643 struct rcu_node *rnp; 1647 struct rcu_node *rnp;
1644 struct rcu_state *rsp; 1648 struct rcu_state *rsp;
1645 1649
1650 /* Exit early if we advanced recently. */
1651 if (jiffies == rdtp->last_advance_all)
1652 return 0;
1653 rdtp->last_advance_all = jiffies;
1654
1646 for_each_rcu_flavor(rsp) { 1655 for_each_rcu_flavor(rsp) {
1647 rdp = this_cpu_ptr(rsp->rda); 1656 rdp = this_cpu_ptr(rsp->rda);
1648 rnp = rdp->mynode; 1657 rnp = rdp->mynode;
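
The last_advance_all check added above turns the callback-advancing pass into an at-most-once-per-jiffy operation. A userspace analogue of that rate-limiting style, using whole seconds from time(2) in place of jiffies (the names are made up):

	#include <stdbool.h>
	#include <stdio.h>
	#include <time.h>

	static time_t last_run;

	/* Do the expensive pass at most once per second (stand-in for once per jiffy). */
	static bool try_expensive_pass(void)
	{
		time_t now = time(NULL);

		if (now == last_run)
			return false;	/* ran recently enough, skip */
		last_run = now;
		puts("doing the expensive pass");
		return true;
	}

	int main(void)
	{
		/* Attempts after the first usually land in the same second and are skipped. */
		for (int i = 0; i < 3; i++)
			printf("attempt %d ran: %d\n", i, try_expensive_pass());
		return 0;
	}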
@@ -1741,6 +1750,8 @@ static void rcu_prepare_for_idle(int cpu)
1741 */ 1750 */
1742 if (rdtp->all_lazy && 1751 if (rdtp->all_lazy &&
1743 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { 1752 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1753 rdtp->all_lazy = false;
1754 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1744 invoke_rcu_core(); 1755 invoke_rcu_core();
1745 return; 1756 return;
1746 } 1757 }
@@ -1770,17 +1781,11 @@ static void rcu_prepare_for_idle(int cpu)
1770 */ 1781 */
1771static void rcu_cleanup_after_idle(int cpu) 1782static void rcu_cleanup_after_idle(int cpu)
1772{ 1783{
1773 struct rcu_data *rdp;
1774 struct rcu_state *rsp;
1775 1784
1776 if (rcu_is_nocb_cpu(cpu)) 1785 if (rcu_is_nocb_cpu(cpu))
1777 return; 1786 return;
1778 rcu_try_advance_all_cbs(); 1787 if (rcu_try_advance_all_cbs())
1779 for_each_rcu_flavor(rsp) { 1788 invoke_rcu_core();
1780 rdp = per_cpu_ptr(rsp->rda, cpu);
1781 if (cpu_has_callbacks_ready_to_invoke(rdp))
1782 invoke_rcu_core();
1783 }
1784} 1789}
1785 1790
1786/* 1791/*
@@ -2110,15 +2115,22 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2110 2115
2111 /* If we are not being polled and there is a kthread, awaken it ... */ 2116 /* If we are not being polled and there is a kthread, awaken it ... */
2112 t = ACCESS_ONCE(rdp->nocb_kthread); 2117 t = ACCESS_ONCE(rdp->nocb_kthread);
2113 if (rcu_nocb_poll | !t) 2118 if (rcu_nocb_poll || !t) {
2119 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2120 TPS("WakeNotPoll"));
2114 return; 2121 return;
2122 }
2115 len = atomic_long_read(&rdp->nocb_q_count); 2123 len = atomic_long_read(&rdp->nocb_q_count);
2116 if (old_rhpp == &rdp->nocb_head) { 2124 if (old_rhpp == &rdp->nocb_head) {
2117 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ 2125 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2118 rdp->qlen_last_fqs_check = 0; 2126 rdp->qlen_last_fqs_check = 0;
2127 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
2119 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2120 wake_up_process(t); /* ... or if many callbacks queued. */ 2129 wake_up_process(t); /* ... or if many callbacks queued. */
2121 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2130 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2131 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2132 } else {
2133 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
2122 } 2134 }
2123 return; 2135 return;
2124} 2136}
@@ -2142,10 +2154,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2142 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2154 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2143 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2155 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2144 (unsigned long)rhp->func, 2156 (unsigned long)rhp->func,
2145 rdp->qlen_lazy, rdp->qlen); 2157 -atomic_long_read(&rdp->nocb_q_count_lazy),
2158 -atomic_long_read(&rdp->nocb_q_count));
2146 else 2159 else
2147 trace_rcu_callback(rdp->rsp->name, rhp, 2160 trace_rcu_callback(rdp->rsp->name, rhp,
2148 rdp->qlen_lazy, rdp->qlen); 2161 -atomic_long_read(&rdp->nocb_q_count_lazy),
2162 -atomic_long_read(&rdp->nocb_q_count));
2149 return 1; 2163 return 1;
2150} 2164}
2151 2165
@@ -2202,7 +2216,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2202 * Wait for the grace period. Do so interruptibly to avoid messing 2216 * Wait for the grace period. Do so interruptibly to avoid messing
2203 * up the load average. 2217 * up the load average.
2204 */ 2218 */
2205 trace_rcu_future_gp(rnp, rdp, c, "StartWait"); 2219 trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
2206 for (;;) { 2220 for (;;) {
2207 wait_event_interruptible( 2221 wait_event_interruptible(
2208 rnp->nocb_gp_wq[c & 0x1], 2222 rnp->nocb_gp_wq[c & 0x1],
@@ -2210,9 +2224,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2210 if (likely(d)) 2224 if (likely(d))
2211 break; 2225 break;
2212 flush_signals(current); 2226 flush_signals(current);
2213 trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); 2227 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
2214 } 2228 }
2215 trace_rcu_future_gp(rnp, rdp, c, "EndWait"); 2229 trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
2216 smp_mb(); /* Ensure that CB invocation happens after GP end. */ 2230 smp_mb(); /* Ensure that CB invocation happens after GP end. */
2217} 2231}
2218 2232
@@ -2223,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2223static int rcu_nocb_kthread(void *arg) 2237static int rcu_nocb_kthread(void *arg)
2224{ 2238{
2225 int c, cl; 2239 int c, cl;
2240 bool firsttime = 1;
2226 struct rcu_head *list; 2241 struct rcu_head *list;
2227 struct rcu_head *next; 2242 struct rcu_head *next;
2228 struct rcu_head **tail; 2243 struct rcu_head **tail;
@@ -2231,14 +2246,27 @@ static int rcu_nocb_kthread(void *arg)
2231 /* Each pass through this loop invokes one batch of callbacks */ 2246 /* Each pass through this loop invokes one batch of callbacks */
2232 for (;;) { 2247 for (;;) {
2233 /* If not polling, wait for next batch of callbacks. */ 2248 /* If not polling, wait for next batch of callbacks. */
2234 if (!rcu_nocb_poll) 2249 if (!rcu_nocb_poll) {
2250 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2251 TPS("Sleep"));
2235 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); 2252 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
2253 } else if (firsttime) {
2254 firsttime = 0;
2255 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2256 TPS("Poll"));
2257 }
2236 list = ACCESS_ONCE(rdp->nocb_head); 2258 list = ACCESS_ONCE(rdp->nocb_head);
2237 if (!list) { 2259 if (!list) {
2260 if (!rcu_nocb_poll)
2261 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2262 TPS("WokeEmpty"));
2238 schedule_timeout_interruptible(1); 2263 schedule_timeout_interruptible(1);
2239 flush_signals(current); 2264 flush_signals(current);
2240 continue; 2265 continue;
2241 } 2266 }
2267 firsttime = 1;
2268 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2269 TPS("WokeNonEmpty"));
2242 2270
2243 /* 2271 /*
2244 * Extract queued callbacks, update counts, and wait 2272 * Extract queued callbacks, update counts, and wait
@@ -2259,7 +2287,11 @@ static int rcu_nocb_kthread(void *arg)
2259 next = list->next; 2287 next = list->next;
2260 /* Wait for enqueuing to complete, if needed. */ 2288 /* Wait for enqueuing to complete, if needed. */
2261 while (next == NULL && &list->next != tail) { 2289 while (next == NULL && &list->next != tail) {
2290 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2291 TPS("WaitQueue"));
2262 schedule_timeout_interruptible(1); 2292 schedule_timeout_interruptible(1);
2293 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2294 TPS("WokeQueue"));
2263 next = list->next; 2295 next = list->next;
2264 } 2296 }
2265 debug_rcu_head_unqueue(list); 2297 debug_rcu_head_unqueue(list);
@@ -2375,3 +2407,425 @@ static void rcu_kick_nohz_cpu(int cpu)
2375 smp_send_reschedule(cpu); 2407 smp_send_reschedule(cpu);
2376#endif /* #ifdef CONFIG_NO_HZ_FULL */ 2408#endif /* #ifdef CONFIG_NO_HZ_FULL */
2377} 2409}
2410
2411
2412#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2413
2414/*
2415 * Define RCU flavor that holds sysidle state. This needs to be the
2416 * most active flavor of RCU.
2417 */
2418#ifdef CONFIG_PREEMPT_RCU
2419static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
2420#else /* #ifdef CONFIG_PREEMPT_RCU */
2421static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
2422#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
2423
2424static int full_sysidle_state; /* Current system-idle state. */
2425#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
2426#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
2427#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */
2428#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */
2429#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */
2430
2431/*
2432 * Invoked to note exit from irq or task transition to idle. Note that
2433 * usermode execution does -not- count as idle here! After all, we want
2434 * to detect full-system idle states, not RCU quiescent states and grace
2435 * periods. The caller must have disabled interrupts.
2436 */
2437static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2438{
2439 unsigned long j;
2440
2441 /* Adjust nesting, check for fully idle. */
2442 if (irq) {
2443 rdtp->dynticks_idle_nesting--;
2444 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2445 if (rdtp->dynticks_idle_nesting != 0)
2446 return; /* Still not fully idle. */
2447 } else {
2448 if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
2449 DYNTICK_TASK_NEST_VALUE) {
2450 rdtp->dynticks_idle_nesting = 0;
2451 } else {
2452 rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
2453 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2454 return; /* Still not fully idle. */
2455 }
2456 }
2457
2458 /* Record start of fully idle period. */
2459 j = jiffies;
2460 ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
2461 smp_mb__before_atomic_inc();
2462 atomic_inc(&rdtp->dynticks_idle);
2463 smp_mb__after_atomic_inc();
2464 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
2465}
2466
2467/*
2468 * Unconditionally force exit from full system-idle state. This is
2469 * invoked when a normal CPU exits idle, but must be called separately
2470 * for the timekeeping CPU (tick_do_timer_cpu). The reason for this
2471 * is that the timekeeping CPU is permitted to take scheduling-clock
2472 * interrupts while the system is in system-idle state, and of course
2473 * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
2474 * interrupt from any other type of interrupt.
2475 */
2476void rcu_sysidle_force_exit(void)
2477{
2478 int oldstate = ACCESS_ONCE(full_sysidle_state);
2479 int newoldstate;
2480
2481 /*
2482 * Each pass through the following loop attempts to exit full
2483 * system-idle state. If contention proves to be a problem,
2484 * a trylock-based contention tree could be used here.
2485 */
2486 while (oldstate > RCU_SYSIDLE_SHORT) {
2487 newoldstate = cmpxchg(&full_sysidle_state,
2488 oldstate, RCU_SYSIDLE_NOT);
2489 if (oldstate == newoldstate &&
2490 oldstate == RCU_SYSIDLE_FULL_NOTED) {
2491 rcu_kick_nohz_cpu(tick_do_timer_cpu);
2492 return; /* We cleared it, done! */
2493 }
2494 oldstate = newoldstate;
2495 }
2496 smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
2497}
2498
2499/*
2500 * Invoked to note entry to irq or task transition from idle. Note that
2501 * usermode execution does -not- count as idle here! The caller must
2502 * have disabled interrupts.
2503 */
2504static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2505{
2506 /* Adjust nesting, check for already non-idle. */
2507 if (irq) {
2508 rdtp->dynticks_idle_nesting++;
2509 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2510 if (rdtp->dynticks_idle_nesting != 1)
2511 return; /* Already non-idle. */
2512 } else {
2513 /*
2514 * Allow for irq misnesting. Yes, it really is possible
2515 * to enter an irq handler then never leave it, and maybe
2516 * also vice versa. Handle both possibilities.
2517 */
2518 if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
2519 rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
2520 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2521 return; /* Already non-idle. */
2522 } else {
2523 rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
2524 }
2525 }
2526
2527 /* Record end of idle period. */
2528 smp_mb__before_atomic_inc();
2529 atomic_inc(&rdtp->dynticks_idle);
2530 smp_mb__after_atomic_inc();
2531 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
2532
2533 /*
2534 * If we are the timekeeping CPU, we are permitted to be non-idle
2535 * during a system-idle state. This must be the case, because
2536 * the timekeeping CPU has to take scheduling-clock interrupts
2537 * during the time that the system is transitioning to full
2538 * system-idle state. This means that the timekeeping CPU must
2539 * invoke rcu_sysidle_force_exit() directly if it does anything
2540 * more than take a scheduling-clock interrupt.
2541 */
2542 if (smp_processor_id() == tick_do_timer_cpu)
2543 return;
2544
2545 /* Update system-idle state: We are clearly no longer fully idle! */
2546 rcu_sysidle_force_exit();
2547}
2548
2549/*
2550 * Check to see if the current CPU is idle. Note that usermode execution
2551 * does not count as idle. The caller must have disabled interrupts.
2552 */
2553static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2554 unsigned long *maxj)
2555{
2556 int cur;
2557 unsigned long j;
2558 struct rcu_dynticks *rdtp = rdp->dynticks;
2559
2560 /*
2561 * If some other CPU has already reported non-idle, if this is
2562 * not the flavor of RCU that tracks sysidle state, or if this
 2563 * is an offline CPU or the timekeeping CPU, nothing to do.
2564 */
2565 if (!*isidle || rdp->rsp != rcu_sysidle_state ||
2566 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2567 return;
2568 if (rcu_gp_in_progress(rdp->rsp))
2569 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
2570
2571 /* Pick up current idle and NMI-nesting counter and check. */
2572 cur = atomic_read(&rdtp->dynticks_idle);
2573 if (cur & 0x1) {
2574 *isidle = false; /* We are not idle! */
2575 return;
2576 }
2577 smp_mb(); /* Read counters before timestamps. */
2578
2579 /* Pick up timestamps. */
2580 j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
2581 /* If this CPU entered idle more recently, update maxj timestamp. */
2582 if (ULONG_CMP_LT(*maxj, j))
2583 *maxj = j;
2584}
2585
2586/*
2587 * Is this the flavor of RCU that is handling full-system idle?
2588 */
2589static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2590{
2591 return rsp == rcu_sysidle_state;
2592}
2593
2594/*
2595 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2596 * timekeeping CPU.
2597 */
2598static void rcu_bind_gp_kthread(void)
2599{
2600 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2601
2602 if (cpu < 0 || cpu >= nr_cpu_ids)
2603 return;
2604 if (raw_smp_processor_id() != cpu)
2605 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2606}
2607
2608/*
2609 * Return a delay in jiffies based on the number of CPUs, rcu_node
2610 * leaf fanout, and jiffies tick rate. The idea is to allow larger
2611 * systems more time to transition to full-idle state in order to
 2612 * avoid the cache thrashing that would otherwise occur on the state variable.
2613 * Really small systems (less than a couple of tens of CPUs) should
2614 * instead use a single global atomically incremented counter, and later
2615 * versions of this will automatically reconfigure themselves accordingly.
2616 */
2617static unsigned long rcu_sysidle_delay(void)
2618{
2619 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2620 return 0;
2621 return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
2622}
2623
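
For a feel of the delay rcu_sysidle_delay() produces, here is the same formula evaluated standalone with made-up configurations; the kernel version additionally returns 0 outright for systems at or below CONFIG_NO_HZ_FULL_SYSIDLE_SMALL CPUs, which this sketch skips.

	#include <stdio.h>

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	/* Same formula as rcu_sysidle_delay(); the configurations below are made up. */
	static unsigned long sysidle_delay(unsigned long nr_cpu_ids,
					   unsigned long hz,
					   unsigned long fanout_leaf)
	{
		return DIV_ROUND_UP(nr_cpu_ids * hz, fanout_leaf * 1000);
	}

	int main(void)
	{
		printf("4096 CPUs, HZ=1000, leaf=16 -> %lu jiffies\n",
		       sysidle_delay(4096, 1000, 16));	/* 256 */
		printf(" 100 CPUs, HZ=250,  leaf=16 -> %lu jiffies\n",
		       sysidle_delay(100, 250, 16));	/* 25000/16000 rounds up to 2 */
		return 0;
	}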
2624/*
2625 * Advance the full-system-idle state. This is invoked when all of
2626 * the non-timekeeping CPUs are idle.
2627 */
2628static void rcu_sysidle(unsigned long j)
2629{
2630 /* Check the current state. */
2631 switch (ACCESS_ONCE(full_sysidle_state)) {
2632 case RCU_SYSIDLE_NOT:
2633
2634 /* First time all are idle, so note a short idle period. */
2635 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
2636 break;
2637
2638 case RCU_SYSIDLE_SHORT:
2639
2640 /*
2641 * Idle for a bit, time to advance to next state?
2642 * cmpxchg failure means race with non-idle, let them win.
2643 */
2644 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2645 (void)cmpxchg(&full_sysidle_state,
2646 RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
2647 break;
2648
2649 case RCU_SYSIDLE_LONG:
2650
2651 /*
2652 * Do an additional check pass before advancing to full.
2653 * cmpxchg failure means race with non-idle, let them win.
2654 */
2655 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
2656 (void)cmpxchg(&full_sysidle_state,
2657 RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
2658 break;
2659
2660 default:
2661 break;
2662 }
2663}
2664
2665/*
2666 * Found a non-idle non-timekeeping CPU, so kick the system-idle state
2667 * back to the beginning.
2668 */
2669static void rcu_sysidle_cancel(void)
2670{
2671 smp_mb();
2672 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
2673}
2674
2675/*
2676 * Update the sysidle state based on the results of a force-quiescent-state
2677 * scan of the CPUs' dyntick-idle state.
2678 */
2679static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2680 unsigned long maxj, bool gpkt)
2681{
2682 if (rsp != rcu_sysidle_state)
2683 return; /* Wrong flavor, ignore. */
2684 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2685 return; /* Running state machine from timekeeping CPU. */
2686 if (isidle)
2687 rcu_sysidle(maxj); /* More idle! */
2688 else
2689 rcu_sysidle_cancel(); /* Idle is over. */
2690}
2691
2692/*
2693 * Wrapper for rcu_sysidle_report() when called from the grace-period
2694 * kthread's context.
2695 */
2696static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2697 unsigned long maxj)
2698{
2699 rcu_sysidle_report(rsp, isidle, maxj, true);
2700}
2701
2702/* Callback and function for forcing an RCU grace period. */
2703struct rcu_sysidle_head {
2704 struct rcu_head rh;
2705 int inuse;
2706};
2707
2708static void rcu_sysidle_cb(struct rcu_head *rhp)
2709{
2710 struct rcu_sysidle_head *rshp;
2711
2712 /*
2713 * The following memory barrier is needed to replace the
2714 * memory barriers that would normally be in the memory
2715 * allocator.
2716 */
2717 smp_mb(); /* grace period precedes setting inuse. */
2718
2719 rshp = container_of(rhp, struct rcu_sysidle_head, rh);
2720 ACCESS_ONCE(rshp->inuse) = 0;
2721}
2722
2723/*
2724 * Check to see if the system is fully idle, other than the timekeeping CPU.
2725 * The caller must have disabled interrupts.
2726 */
2727bool rcu_sys_is_idle(void)
2728{
2729 static struct rcu_sysidle_head rsh;
2730 int rss = ACCESS_ONCE(full_sysidle_state);
2731
2732 if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
2733 return false;
2734
2735 /* Handle small-system case by doing a full scan of CPUs. */
2736 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
2737 int oldrss = rss - 1;
2738
2739 /*
2740 * One pass to advance to each state up to _FULL.
2741 * Give up if any pass fails to advance the state.
2742 */
2743 while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
2744 int cpu;
2745 bool isidle = true;
2746 unsigned long maxj = jiffies - ULONG_MAX / 4;
2747 struct rcu_data *rdp;
2748
2749 /* Scan all the CPUs looking for nonidle CPUs. */
2750 for_each_possible_cpu(cpu) {
2751 rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu);
2752 rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
2753 if (!isidle)
2754 break;
2755 }
2756 rcu_sysidle_report(rcu_sysidle_state,
2757 isidle, maxj, false);
2758 oldrss = rss;
2759 rss = ACCESS_ONCE(full_sysidle_state);
2760 }
2761 }
2762
2763 /* If this is the first observation of an idle period, record it. */
2764 if (rss == RCU_SYSIDLE_FULL) {
2765 rss = cmpxchg(&full_sysidle_state,
2766 RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
2767 return rss == RCU_SYSIDLE_FULL;
2768 }
2769
2770 smp_mb(); /* ensure rss load happens before later caller actions. */
2771
2772 /* If already fully idle, tell the caller (in case of races). */
2773 if (rss == RCU_SYSIDLE_FULL_NOTED)
2774 return true;
2775
2776 /*
2777 * If we aren't there yet, and a grace period is not in flight,
2778 * initiate a grace period. Either way, tell the caller that
2779 * we are not there yet. We use an xchg() rather than an assignment
2780 * to make up for the memory barriers that would otherwise be
2781 * provided by the memory allocator.
2782 */
2783 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
2784 !rcu_gp_in_progress(rcu_sysidle_state) &&
2785 !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
2786 call_rcu(&rsh.rh, rcu_sysidle_cb);
2787 return false;
2788}
2789
2790/*
2791 * Initialize dynticks sysidle state for CPUs coming online.
2792 */
2793static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2794{
2795 rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
2796}
2797
2798#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2799
2800static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2801{
2802}
2803
2804static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2805{
2806}
2807
2808static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2809 unsigned long *maxj)
2810{
2811}
2812
2813static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2814{
2815 return false;
2816}
2817
2818static void rcu_bind_gp_kthread(void)
2819{
2820}
2821
2822static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2823 unsigned long maxj)
2824{
2825}
2826
2827static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
2828{
2829}
2830
2831#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
diff --git a/kernel/rcutree_trace.c b/kernel/rcu/tree_trace.c
index cf6c17412932..3596797b7e46 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -44,7 +44,7 @@
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "tree.h"
48 48
49static int r_open(struct inode *inode, struct file *file, 49static int r_open(struct inode *inode, struct file *file,
50 const struct seq_operations *op) 50 const struct seq_operations *op)
diff --git a/kernel/rcupdate.c b/kernel/rcu/update.c
index cce6ba8bbace..6cb3dff89e2b 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcu/update.c
@@ -53,6 +53,12 @@
53 53
54#include "rcu.h" 54#include "rcu.h"
55 55
56MODULE_ALIAS("rcupdate");
57#ifdef MODULE_PARAM_PREFIX
58#undef MODULE_PARAM_PREFIX
59#endif
60#define MODULE_PARAM_PREFIX "rcupdate."
61
56module_param(rcu_expedited, int, 0); 62module_param(rcu_expedited, int, 0);
57 63
58#ifdef CONFIG_PREEMPT_RCU 64#ifdef CONFIG_PREEMPT_RCU
@@ -122,7 +128,7 @@ struct lockdep_map rcu_sched_lock_map =
122 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); 128 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
123EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 129EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
124 130
125int debug_lockdep_rcu_enabled(void) 131int notrace debug_lockdep_rcu_enabled(void)
126{ 132{
127 return rcu_scheduler_active && debug_locks && 133 return rcu_scheduler_active && debug_locks &&
128 current->lockdep_recursion == 0; 134 current->lockdep_recursion == 0;
@@ -148,7 +154,7 @@ int rcu_read_lock_bh_held(void)
148{ 154{
149 if (!debug_lockdep_rcu_enabled()) 155 if (!debug_lockdep_rcu_enabled())
150 return 1; 156 return 1;
151 if (rcu_is_cpu_idle()) 157 if (!rcu_is_watching())
152 return 0; 158 return 0;
153 if (!rcu_lockdep_current_cpu_online()) 159 if (!rcu_lockdep_current_cpu_online())
154 return 0; 160 return 0;
@@ -212,43 +218,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head)
212} 218}
213 219
214/* 220/*
215 * fixup_init is called when:
216 * - an active object is initialized
217 */
218static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
219{
220 struct rcu_head *head = addr;
221
222 switch (state) {
223 case ODEBUG_STATE_ACTIVE:
224 /*
225 * Ensure that queued callbacks are all executed.
226 * If we detect that we are nested in a RCU read-side critical
227 * section, we should simply fail, otherwise we would deadlock.
228 * In !PREEMPT configurations, there is no way to tell if we are
229 * in a RCU read-side critical section or not, so we never
230 * attempt any fixup and just print a warning.
231 */
232#ifndef CONFIG_PREEMPT
233 WARN_ON_ONCE(1);
234 return 0;
235#endif
236 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
237 irqs_disabled()) {
238 WARN_ON_ONCE(1);
239 return 0;
240 }
241 rcu_barrier();
242 rcu_barrier_sched();
243 rcu_barrier_bh();
244 debug_object_init(head, &rcuhead_debug_descr);
245 return 1;
246 default:
247 return 0;
248 }
249}
250
251/*
252 * fixup_activate is called when: 221 * fixup_activate is called when:
253 * - an active object is activated 222 * - an active object is activated
254 * - an unknown object is activated (might be a statically initialized object) 223 * - an unknown object is activated (might be a statically initialized object)
@@ -268,69 +237,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
268 debug_object_init(head, &rcuhead_debug_descr); 237 debug_object_init(head, &rcuhead_debug_descr);
269 debug_object_activate(head, &rcuhead_debug_descr); 238 debug_object_activate(head, &rcuhead_debug_descr);
270 return 0; 239 return 0;
271
272 case ODEBUG_STATE_ACTIVE:
273 /*
274 * Ensure that queued callbacks are all executed.
275 * If we detect that we are nested in a RCU read-side critical
276 * section, we should simply fail, otherwise we would deadlock.
277 * In !PREEMPT configurations, there is no way to tell if we are
278 * in a RCU read-side critical section or not, so we never
279 * attempt any fixup and just print a warning.
280 */
281#ifndef CONFIG_PREEMPT
282 WARN_ON_ONCE(1);
283 return 0;
284#endif
285 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
286 irqs_disabled()) {
287 WARN_ON_ONCE(1);
288 return 0;
289 }
290 rcu_barrier();
291 rcu_barrier_sched();
292 rcu_barrier_bh();
293 debug_object_activate(head, &rcuhead_debug_descr);
294 return 1;
295 default: 240 default:
296 return 0;
297 }
298}
299
300/*
301 * fixup_free is called when:
302 * - an active object is freed
303 */
304static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
305{
306 struct rcu_head *head = addr;
307
308 switch (state) {
309 case ODEBUG_STATE_ACTIVE:
310 /*
311 * Ensure that queued callbacks are all executed.
312 * If we detect that we are nested in a RCU read-side critical
313 * section, we should simply fail, otherwise we would deadlock.
314 * In !PREEMPT configurations, there is no way to tell if we are
315 * in a RCU read-side critical section or not, so we never
316 * attempt any fixup and just print a warning.
317 */
318#ifndef CONFIG_PREEMPT
319 WARN_ON_ONCE(1);
320 return 0;
321#endif
322 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
323 irqs_disabled()) {
324 WARN_ON_ONCE(1);
325 return 0;
326 }
327 rcu_barrier();
328 rcu_barrier_sched();
329 rcu_barrier_bh();
330 debug_object_free(head, &rcuhead_debug_descr);
331 return 1; 241 return 1;
332 default:
333 return 0;
334 } 242 }
335} 243}
336 244
@@ -369,15 +277,13 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
369 277
370struct debug_obj_descr rcuhead_debug_descr = { 278struct debug_obj_descr rcuhead_debug_descr = {
371 .name = "rcu_head", 279 .name = "rcu_head",
372 .fixup_init = rcuhead_fixup_init,
373 .fixup_activate = rcuhead_fixup_activate, 280 .fixup_activate = rcuhead_fixup_activate,
374 .fixup_free = rcuhead_fixup_free,
375}; 281};
376EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 282EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
377#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 283#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
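
The EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack) context above hints at the pattern these debug-objects hooks police. A hedged sketch with illustrative foo_* names (synchronize_rcu() would normally be used for a plain grace-period wait; the point here is only the init/destroy bracketing of an on-stack rcu_head):

#include <linux/completion.h>
#include <linux/rcupdate.h>

struct foo_waiter {
	struct rcu_head rh;
	struct completion done;
};

static void foo_rcu_cb(struct rcu_head *rh)
{
	struct foo_waiter *w = container_of(rh, struct foo_waiter, rh);

	complete(&w->done);
}

static void foo_wait_one_grace_period(void)
{
	struct foo_waiter w;

	init_completion(&w.done);
	init_rcu_head_on_stack(&w.rh);		/* tell debugobjects about the stack object */
	call_rcu(&w.rh, foo_rcu_cb);		/* callback runs after a grace period */
	wait_for_completion(&w.done);
	destroy_rcu_head_on_stack(&w.rh);	/* required before the stack frame is reused */
}
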
378 284
379#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) 285#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
380void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, 286void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
381 unsigned long secs, 287 unsigned long secs,
382 unsigned long c_old, unsigned long c) 288 unsigned long c_old, unsigned long c)
383{ 289{
@@ -398,7 +304,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
398#endif 304#endif
399 305
400int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 306int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
401int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 307static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
402 308
403module_param(rcu_cpu_stall_suppress, int, 0644); 309module_param(rcu_cpu_stall_suppress, int, 0644);
404module_param(rcu_cpu_stall_timeout, int, 0644); 310module_param(rcu_cpu_stall_timeout, int, 0644);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 269ed9384cc4..f813b3474646 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -32,7 +32,14 @@ EXPORT_SYMBOL(cad_pid);
32#endif 32#endif
33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; 33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
34 34
35int reboot_default; 35/*
36 * This variable is used privately to keep track of whether or not
37 * reboot_type is still set to its default value (i.e., reboot= hasn't
38 * been set on the command line). This is needed so that we can
39 * suppress DMI scanning for reboot quirks. Without it, it's
40 * impossible to override a faulty reboot quirk without recompiling.
41 */
42int reboot_default = 1;
36int reboot_cpu; 43int reboot_cpu;
37enum reboot_type reboot_type = BOOT_ACPI; 44enum reboot_type reboot_type = BOOT_ACPI;
38int reboot_force; 45int reboot_force;
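
A hedged sketch of the consumer side this comment implies (example_reboot_quirks and example_reboot_dmi_table are illustrative names, not part of this hunk; the real users live under arch/): the quirk scan only runs while reboot_default is still 1, i.e. before reboot= has overridden it.

#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/reboot.h>

static int __init example_reboot_quirks(void)
{
	/*
	 * reboot_default stays 1 until reboot= is parsed, so DMI quirks
	 * are applied only when the user has not picked a method.
	 */
	if (reboot_default)
		dmi_check_system(example_reboot_dmi_table);	/* illustrative table */
	return 0;
}
core_initcall(example_reboot_quirks);
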
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ff55247e7049..4aa8a305aede 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -17,8 +17,8 @@
17void res_counter_init(struct res_counter *counter, struct res_counter *parent) 17void res_counter_init(struct res_counter *counter, struct res_counter *parent)
18{ 18{
19 spin_lock_init(&counter->lock); 19 spin_lock_init(&counter->lock);
20 counter->limit = RESOURCE_MAX; 20 counter->limit = RES_COUNTER_MAX;
21 counter->soft_limit = RESOURCE_MAX; 21 counter->soft_limit = RES_COUNTER_MAX;
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
@@ -178,23 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
178#endif 178#endif
179 179
180int res_counter_memparse_write_strategy(const char *buf, 180int res_counter_memparse_write_strategy(const char *buf,
181 unsigned long long *res) 181 unsigned long long *resp)
182{ 182{
183 char *end; 183 char *end;
184 unsigned long long res;
184 185
185 /* return RESOURCE_MAX(unlimited) if "-1" is specified */ 186 /* return RES_COUNTER_MAX (unlimited) if "-1" is specified */
186 if (*buf == '-') { 187 if (*buf == '-') {
187 *res = simple_strtoull(buf + 1, &end, 10); 188 res = simple_strtoull(buf + 1, &end, 10);
188 if (*res != 1 || *end != '\0') 189 if (res != 1 || *end != '\0')
189 return -EINVAL; 190 return -EINVAL;
190 *res = RESOURCE_MAX; 191 *resp = RES_COUNTER_MAX;
191 return 0; 192 return 0;
192 } 193 }
193 194
194 *res = memparse(buf, &end); 195 res = memparse(buf, &end);
195 if (*end != '\0') 196 if (*end != '\0')
196 return -EINVAL; 197 return -EINVAL;
197 198
198 *res = PAGE_ALIGN(*res); 199 if (PAGE_ALIGN(res) >= res)
200 res = PAGE_ALIGN(res);
201 else
202 res = RES_COUNTER_MAX;
203
204 *resp = res;
205
199 return 0; 206 return 0;
200} 207}
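
A short usage sketch tracing the rewritten parser (the 'limit' local is illustrative; return-value checks are omitted because these literals parse cleanly):

	unsigned long long limit;

	res_counter_memparse_write_strategy("-1", &limit);
	/* limit == RES_COUNTER_MAX: "-1" still means unlimited */

	res_counter_memparse_write_strategy("4M", &limit);
	/* limit == 4 << 20: memparse() result, already a multiple of PAGE_SIZE */

	res_counter_memparse_write_strategy("18446744073709551615", &limit);
	/* PAGE_ALIGN() would wrap here, so the value is clamped to RES_COUNTER_MAX */
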
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 54adcf35f495..7b621409cf15 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-y += wait.o completion.o
15obj-$(CONFIG_SMP) += cpupri.o 16obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 17obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 18obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
new file mode 100644
index 000000000000..a63f4dc27909
--- /dev/null
+++ b/kernel/sched/completion.c
@@ -0,0 +1,299 @@
1/*
2 * Generic wait-for-completion handler.
3 *
4 * It differs from semaphores in that the default case is the opposite:
5 * wait_for_completion() blocks by default, whereas a semaphore does not. The
6 * interface also makes it easy to 'complete' multiple waiting threads,
7 * something which isn't entirely natural for semaphores.
8 *
9 * But more importantly, the primitive documents the usage. Semaphores would
10 * typically be used for exclusion, which gives rise to priority inversion.
11 * Waiting for completion is typically a synchronization point, not an exclusion point.
12 */
13
14#include <linux/sched.h>
15#include <linux/completion.h>
16
17/**
18 * complete: - signals a single thread waiting on this completion
19 * @x: holds the state of this particular completion
20 *
21 * This will wake up a single thread waiting on this completion. Threads will be
22 * awakened in the same order in which they were queued.
23 *
24 * See also complete_all(), wait_for_completion() and related routines.
25 *
26 * It may be assumed that this function implies a write memory barrier before
27 * changing the task state if and only if any tasks are woken up.
28 */
29void complete(struct completion *x)
30{
31 unsigned long flags;
32
33 spin_lock_irqsave(&x->wait.lock, flags);
34 x->done++;
35 __wake_up_locked(&x->wait, TASK_NORMAL, 1);
36 spin_unlock_irqrestore(&x->wait.lock, flags);
37}
38EXPORT_SYMBOL(complete);
39
40/**
41 * complete_all: - signals all threads waiting on this completion
42 * @x: holds the state of this particular completion
43 *
44 * This will wake up all threads waiting on this particular completion event.
45 *
46 * It may be assumed that this function implies a write memory barrier before
47 * changing the task state if and only if any tasks are woken up.
48 */
49void complete_all(struct completion *x)
50{
51 unsigned long flags;
52
53 spin_lock_irqsave(&x->wait.lock, flags);
54 x->done += UINT_MAX/2;
55 __wake_up_locked(&x->wait, TASK_NORMAL, 0);
56 spin_unlock_irqrestore(&x->wait.lock, flags);
57}
58EXPORT_SYMBOL(complete_all);
59
60static inline long __sched
61do_wait_for_common(struct completion *x,
62 long (*action)(long), long timeout, int state)
63{
64 if (!x->done) {
65 DECLARE_WAITQUEUE(wait, current);
66
67 __add_wait_queue_tail_exclusive(&x->wait, &wait);
68 do {
69 if (signal_pending_state(state, current)) {
70 timeout = -ERESTARTSYS;
71 break;
72 }
73 __set_current_state(state);
74 spin_unlock_irq(&x->wait.lock);
75 timeout = action(timeout);
76 spin_lock_irq(&x->wait.lock);
77 } while (!x->done && timeout);
78 __remove_wait_queue(&x->wait, &wait);
79 if (!x->done)
80 return timeout;
81 }
82 x->done--;
83 return timeout ?: 1;
84}
85
86static inline long __sched
87__wait_for_common(struct completion *x,
88 long (*action)(long), long timeout, int state)
89{
90 might_sleep();
91
92 spin_lock_irq(&x->wait.lock);
93 timeout = do_wait_for_common(x, action, timeout, state);
94 spin_unlock_irq(&x->wait.lock);
95 return timeout;
96}
97
98static long __sched
99wait_for_common(struct completion *x, long timeout, int state)
100{
101 return __wait_for_common(x, schedule_timeout, timeout, state);
102}
103
104static long __sched
105wait_for_common_io(struct completion *x, long timeout, int state)
106{
107 return __wait_for_common(x, io_schedule_timeout, timeout, state);
108}
109
110/**
111 * wait_for_completion: - waits for completion of a task
112 * @x: holds the state of this particular completion
113 *
114 * This waits to be signaled for completion of a specific task. It is NOT
115 * interruptible and there is no timeout.
116 *
117 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
118 * and interrupt capability. Also see complete().
119 */
120void __sched wait_for_completion(struct completion *x)
121{
122 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
123}
124EXPORT_SYMBOL(wait_for_completion);
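
The kernel-doc above spells out the semantics; a minimal sketch of the one-shot handshake (setup_done, do_setup() and worker_fn() are illustrative names):

#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);

static int worker_fn(void *unused)
{
	do_setup();			/* illustrative work */
	complete(&setup_done);		/* wakes one waiter, in queueing order */
	return 0;
}

static void wait_for_setup(void)
{
	/* Uninterruptible, no timeout: returns only once complete() has run. */
	wait_for_completion(&setup_done);
}
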
125
126/**
127 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
128 * @x: holds the state of this particular completion
129 * @timeout: timeout value in jiffies
130 *
131 * This waits for either a completion of a specific task to be signaled or for a
132 * specified timeout to expire. The timeout is in jiffies. It is not
133 * interruptible.
134 *
135 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
136 * till timeout) if completed.
137 */
138unsigned long __sched
139wait_for_completion_timeout(struct completion *x, unsigned long timeout)
140{
141 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
142}
143EXPORT_SYMBOL(wait_for_completion_timeout);
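
A sketch of the usual timeout idiom, reusing the illustrative setup_done above:

	unsigned long left;

	left = wait_for_completion_timeout(&setup_done, msecs_to_jiffies(100));
	if (!left)
		pr_warn("setup did not finish within 100ms\n");
	/* otherwise 'left' is the number of jiffies that were still remaining */
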
144
145/**
146 * wait_for_completion_io: - waits for completion of a task
147 * @x: holds the state of this particular completion
148 *
149 * This waits to be signaled for completion of a specific task. It is NOT
150 * interruptible and there is no timeout. The caller is accounted as waiting
151 * for IO.
152 */
153void __sched wait_for_completion_io(struct completion *x)
154{
155 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
156}
157EXPORT_SYMBOL(wait_for_completion_io);
158
159/**
160 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
161 * @x: holds the state of this particular completion
162 * @timeout: timeout value in jiffies
163 *
164 * This waits for either a completion of a specific task to be signaled or for a
165 * specified timeout to expire. The timeout is in jiffies. It is not
166 * interruptible. The caller is accounted as waiting for IO.
167 *
168 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
169 * till timeout) if completed.
170 */
171unsigned long __sched
172wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
173{
174 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
175}
176EXPORT_SYMBOL(wait_for_completion_io_timeout);
177
178/**
179 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
180 * @x: holds the state of this particular completion
181 *
182 * This waits for completion of a specific task to be signaled. It is
183 * interruptible.
184 *
185 * Return: -ERESTARTSYS if interrupted, 0 if completed.
186 */
187int __sched wait_for_completion_interruptible(struct completion *x)
188{
189 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
190 if (t == -ERESTARTSYS)
191 return t;
192 return 0;
193}
194EXPORT_SYMBOL(wait_for_completion_interruptible);
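
A sketch of propagating a signal back to the caller (again with the illustrative setup_done):

	int err;

	err = wait_for_completion_interruptible(&setup_done);
	if (err)		/* -ERESTARTSYS: a signal arrived before completion */
		return err;
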
195
196/**
197 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
198 * @x: holds the state of this particular completion
199 * @timeout: timeout value in jiffies
200 *
201 * This waits for either a completion of a specific task to be signaled or for a
202 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
203 *
204 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
205 * or number of jiffies left till timeout) if completed.
206 */
207long __sched
208wait_for_completion_interruptible_timeout(struct completion *x,
209 unsigned long timeout)
210{
211 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
212}
213EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
214
215/**
216 * wait_for_completion_killable: - waits for completion of a task (killable)
217 * @x: holds the state of this particular completion
218 *
219 * This waits to be signaled for completion of a specific task. It can be
220 * interrupted by a kill signal.
221 *
222 * Return: -ERESTARTSYS if interrupted, 0 if completed.
223 */
224int __sched wait_for_completion_killable(struct completion *x)
225{
226 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
227 if (t == -ERESTARTSYS)
228 return t;
229 return 0;
230}
231EXPORT_SYMBOL(wait_for_completion_killable);
232
233/**
234 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
235 * @x: holds the state of this particular completion
236 * @timeout: timeout value in jiffies
237 *
238 * This waits for either a completion of a specific task to be
239 * signaled or for a specified timeout to expire. It can be
240 * interrupted by a kill signal. The timeout is in jiffies.
241 *
242 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
243 * or number of jiffies left till timeout) if completed.
244 */
245long __sched
246wait_for_completion_killable_timeout(struct completion *x,
247 unsigned long timeout)
248{
249 return wait_for_common(x, timeout, TASK_KILLABLE);
250}
251EXPORT_SYMBOL(wait_for_completion_killable_timeout);
252
253/**
254 * try_wait_for_completion - try to decrement a completion without blocking
255 * @x: completion structure
256 *
257 * Return: 0 if a decrement cannot be done without blocking
258 * 1 if a decrement succeeded.
259 *
260 * If a completion is being used as a counting completion,
261 * attempt to decrement the counter without blocking. This
262 * enables us to avoid waiting if the resource the completion
263 * is protecting is not available.
264 */
265bool try_wait_for_completion(struct completion *x)
266{
267 unsigned long flags;
268 int ret = 1;
269
270 spin_lock_irqsave(&x->wait.lock, flags);
271 if (!x->done)
272 ret = 0;
273 else
274 x->done--;
275 spin_unlock_irqrestore(&x->wait.lock, flags);
276 return ret;
277}
278EXPORT_SYMBOL(try_wait_for_completion);
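
A sketch of the counting use the kernel-doc describes (resource_ready and consume_one() are illustrative):

	if (try_wait_for_completion(&resource_ready))
		consume_one();		/* took one 'done' count without sleeping */
	else
		return -EAGAIN;		/* nothing available right now, don't block */
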
279
280/**
281 * completion_done - Test to see if a completion has any waiters
282 * @x: completion structure
283 *
284 * Return: 0 if there are waiters (wait_for_completion() in progress)
285 * 1 if there are no waiters.
286 *
287 */
288bool completion_done(struct completion *x)
289{
290 unsigned long flags;
291 int ret = 1;
292
293 spin_lock_irqsave(&x->wait.lock, flags);
294 if (!x->done)
295 ret = 0;
296 spin_unlock_irqrestore(&x->wait.lock, flags);
297 return ret;
298}
299EXPORT_SYMBOL(completion_done);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7c32cb7bfeb..1deccd78be98 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -513,12 +513,11 @@ static inline void init_hrtick(void)
513 * might also involve a cross-CPU call to trigger the scheduler on 513 * might also involve a cross-CPU call to trigger the scheduler on
514 * the target CPU. 514 * the target CPU.
515 */ 515 */
516#ifdef CONFIG_SMP
517void resched_task(struct task_struct *p) 516void resched_task(struct task_struct *p)
518{ 517{
519 int cpu; 518 int cpu;
520 519
521 assert_raw_spin_locked(&task_rq(p)->lock); 520 lockdep_assert_held(&task_rq(p)->lock);
522 521
523 if (test_tsk_need_resched(p)) 522 if (test_tsk_need_resched(p))
524 return; 523 return;
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)
526 set_tsk_need_resched(p); 525 set_tsk_need_resched(p);
527 526
528 cpu = task_cpu(p); 527 cpu = task_cpu(p);
529 if (cpu == smp_processor_id()) 528 if (cpu == smp_processor_id()) {
529 set_preempt_need_resched();
530 return; 530 return;
531 }
531 532
532 /* NEED_RESCHED must be visible before we test polling */ 533 /* NEED_RESCHED must be visible before we test polling */
533 smp_mb(); 534 smp_mb();
@@ -546,6 +547,7 @@ void resched_cpu(int cpu)
546 raw_spin_unlock_irqrestore(&rq->lock, flags); 547 raw_spin_unlock_irqrestore(&rq->lock, flags);
547} 548}
548 549
550#ifdef CONFIG_SMP
549#ifdef CONFIG_NO_HZ_COMMON 551#ifdef CONFIG_NO_HZ_COMMON
550/* 552/*
551 * In the semi idle case, use the nearest busy cpu for migrating timers 553 * In the semi idle case, use the nearest busy cpu for migrating timers
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)
693 } 695 }
694} 696}
695 697
696#else /* !CONFIG_SMP */
697void resched_task(struct task_struct *p)
698{
699 assert_raw_spin_locked(&task_rq(p)->lock);
700 set_tsk_need_resched(p);
701}
702#endif /* CONFIG_SMP */ 698#endif /* CONFIG_SMP */
703 699
704#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 700#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
767static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 763static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
768{ 764{
769 update_rq_clock(rq); 765 update_rq_clock(rq);
770 sched_info_queued(p); 766 sched_info_queued(rq, p);
771 p->sched_class->enqueue_task(rq, p, flags); 767 p->sched_class->enqueue_task(rq, p, flags);
772} 768}
773 769
774static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 770static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
775{ 771{
776 update_rq_clock(rq); 772 update_rq_clock(rq);
777 sched_info_dequeued(p); 773 sched_info_dequeued(rq, p);
778 p->sched_class->dequeue_task(rq, p, flags); 774 p->sched_class->dequeue_task(rq, p, flags);
779} 775}
780 776
@@ -933,6 +929,8 @@ static int effective_prio(struct task_struct *p)
933/** 929/**
934 * task_curr - is this task currently executing on a CPU? 930 * task_curr - is this task currently executing on a CPU?
935 * @p: the task in question. 931 * @p: the task in question.
932 *
933 * Return: 1 if the task is currently executing. 0 otherwise.
936 */ 934 */
937inline int task_curr(const struct task_struct *p) 935inline int task_curr(const struct task_struct *p)
938{ 936{
@@ -976,13 +974,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
976 rq->skip_clock_update = 1; 974 rq->skip_clock_update = 1;
977} 975}
978 976
979static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
980
981void register_task_migration_notifier(struct notifier_block *n)
982{
983 atomic_notifier_chain_register(&task_migration_notifier, n);
984}
985
986#ifdef CONFIG_SMP 977#ifdef CONFIG_SMP
987void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 978void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
988{ 979{
@@ -992,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
992 * ttwu() will sort out the placement. 983 * ttwu() will sort out the placement.
993 */ 984 */
994 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 985 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
995 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 986 !(task_preempt_count(p) & PREEMPT_ACTIVE));
996 987
997#ifdef CONFIG_LOCKDEP 988#ifdef CONFIG_LOCKDEP
998 /* 989 /*
@@ -1013,21 +1004,114 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1013 trace_sched_migrate_task(p, new_cpu); 1004 trace_sched_migrate_task(p, new_cpu);
1014 1005
1015 if (task_cpu(p) != new_cpu) { 1006 if (task_cpu(p) != new_cpu) {
1016 struct task_migration_notifier tmn;
1017
1018 if (p->sched_class->migrate_task_rq) 1007 if (p->sched_class->migrate_task_rq)
1019 p->sched_class->migrate_task_rq(p, new_cpu); 1008 p->sched_class->migrate_task_rq(p, new_cpu);
1020 p->se.nr_migrations++; 1009 p->se.nr_migrations++;
1021 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 1010 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1011 }
1012
1013 __set_task_cpu(p, new_cpu);
1014}
1015
1016static void __migrate_swap_task(struct task_struct *p, int cpu)
1017{
1018 if (p->on_rq) {
1019 struct rq *src_rq, *dst_rq;
1022 1020
1023 tmn.task = p; 1021 src_rq = task_rq(p);
1024 tmn.from_cpu = task_cpu(p); 1022 dst_rq = cpu_rq(cpu);
1025 tmn.to_cpu = new_cpu;
1026 1023
1027 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); 1024 deactivate_task(src_rq, p, 0);
1025 set_task_cpu(p, cpu);
1026 activate_task(dst_rq, p, 0);
1027 check_preempt_curr(dst_rq, p, 0);
1028 } else {
1029 /*
1030 * Task isn't running anymore; make it appear like we migrated
1031 * it before it went to sleep. This means on wakeup we make the
1032 * previous cpu our target instead of where it really is.
1033 */
1034 p->wake_cpu = cpu;
1028 } 1035 }
1036}
1029 1037
1030 __set_task_cpu(p, new_cpu); 1038struct migration_swap_arg {
1039 struct task_struct *src_task, *dst_task;
1040 int src_cpu, dst_cpu;
1041};
1042
1043static int migrate_swap_stop(void *data)
1044{
1045 struct migration_swap_arg *arg = data;
1046 struct rq *src_rq, *dst_rq;
1047 int ret = -EAGAIN;
1048
1049 src_rq = cpu_rq(arg->src_cpu);
1050 dst_rq = cpu_rq(arg->dst_cpu);
1051
1052 double_raw_lock(&arg->src_task->pi_lock,
1053 &arg->dst_task->pi_lock);
1054 double_rq_lock(src_rq, dst_rq);
1055 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1056 goto unlock;
1057
1058 if (task_cpu(arg->src_task) != arg->src_cpu)
1059 goto unlock;
1060
1061 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1062 goto unlock;
1063
1064 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1065 goto unlock;
1066
1067 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1068 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1069
1070 ret = 0;
1071
1072unlock:
1073 double_rq_unlock(src_rq, dst_rq);
1074 raw_spin_unlock(&arg->dst_task->pi_lock);
1075 raw_spin_unlock(&arg->src_task->pi_lock);
1076
1077 return ret;
1078}
1079
1080/*
1081 * Cross migrate two tasks
1082 */
1083int migrate_swap(struct task_struct *cur, struct task_struct *p)
1084{
1085 struct migration_swap_arg arg;
1086 int ret = -EINVAL;
1087
1088 arg = (struct migration_swap_arg){
1089 .src_task = cur,
1090 .src_cpu = task_cpu(cur),
1091 .dst_task = p,
1092 .dst_cpu = task_cpu(p),
1093 };
1094
1095 if (arg.src_cpu == arg.dst_cpu)
1096 goto out;
1097
1098 /*
1099 * These three tests are all lockless; this is OK since all of them
1100 * will be re-checked with proper locks held further down the line.
1101 */
1102 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1103 goto out;
1104
1105 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1106 goto out;
1107
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out;
1110
1111 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1112
1113out:
1114 return ret;
1031} 1115}
1032 1116
1033struct migration_arg { 1117struct migration_arg {
@@ -1249,9 +1333,9 @@ out:
1249 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1333 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1250 */ 1334 */
1251static inline 1335static inline
1252int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 1336int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1253{ 1337{
1254 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 1338 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1255 1339
1256 /* 1340 /*
1257 * In order not to call set_task_cpu() on a blocking task we need 1341 * In order not to call set_task_cpu() on a blocking task we need
@@ -1343,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1343 1427
1344 if (rq->idle_stamp) { 1428 if (rq->idle_stamp) {
1345 u64 delta = rq_clock(rq) - rq->idle_stamp; 1429 u64 delta = rq_clock(rq) - rq->idle_stamp;
1346 u64 max = 2*sysctl_sched_migration_cost; 1430 u64 max = 2*rq->max_idle_balance_cost;
1347 1431
1348 if (delta > max) 1432 update_avg(&rq->avg_idle, delta);
1433
1434 if (rq->avg_idle > max)
1349 rq->avg_idle = max; 1435 rq->avg_idle = max;
1350 else 1436
1351 update_avg(&rq->avg_idle, delta);
1352 rq->idle_stamp = 0; 1437 rq->idle_stamp = 0;
1353 } 1438 }
1354#endif 1439#endif
@@ -1409,6 +1494,14 @@ static void sched_ttwu_pending(void)
1409 1494
1410void scheduler_ipi(void) 1495void scheduler_ipi(void)
1411{ 1496{
1497 /*
1498 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI.
1501 */
1502 if (tif_need_resched())
1503 set_preempt_need_resched();
1504
1412 if (llist_empty(&this_rq()->wake_list) 1505 if (llist_empty(&this_rq()->wake_list)
1413 && !tick_nohz_full_cpu(smp_processor_id()) 1506 && !tick_nohz_full_cpu(smp_processor_id())
1414 && !got_nohz_idle_kick()) 1507 && !got_nohz_idle_kick())
@@ -1482,7 +1575,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
1482 * the simpler "current->state = TASK_RUNNING" to mark yourself 1575 * the simpler "current->state = TASK_RUNNING" to mark yourself
1483 * runnable without the overhead of this. 1576 * runnable without the overhead of this.
1484 * 1577 *
1485 * Returns %true if @p was woken up, %false if it was already running 1578 * Return: %true if @p was woken up, %false if it was already running
1486 * or @state didn't match @p's state. 1579 * or @state didn't match @p's state.
1487 */ 1580 */
1488static int 1581static int
@@ -1491,7 +1584,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1491 unsigned long flags; 1584 unsigned long flags;
1492 int cpu, success = 0; 1585 int cpu, success = 0;
1493 1586
1494 smp_wmb(); 1587 /*
1588 * If we are going to wake up a thread waiting for CONDITION we
1589 * need to ensure that CONDITION=1 done by the caller can not be
1590 * reordered with p->state check below. This pairs with mb() in
1591 * set_current_state() the waiting thread does.
1592 */
1593 smp_mb__before_spinlock();
1495 raw_spin_lock_irqsave(&p->pi_lock, flags); 1594 raw_spin_lock_irqsave(&p->pi_lock, flags);
1496 if (!(p->state & state)) 1595 if (!(p->state & state))
1497 goto out; 1596 goto out;
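
The canonical pairing the new comment refers to, in sketch form ('condition' and 'waiter' are illustrative):

	/* sleeper */
	set_current_state(TASK_UNINTERRUPTIBLE);	/* implies a full memory barrier */
	if (!condition)
		schedule();
	__set_current_state(TASK_RUNNING);

	/* waker */
	condition = true;		/* must be visible before p->state is read */
	wake_up_process(waiter);	/* ends up in try_to_wake_up() above */
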
@@ -1520,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1520 if (p->sched_class->task_waking) 1619 if (p->sched_class->task_waking)
1521 p->sched_class->task_waking(p); 1620 p->sched_class->task_waking(p);
1522 1621
1523 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 1622 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1524 if (task_cpu(p) != cpu) { 1623 if (task_cpu(p) != cpu) {
1525 wake_flags |= WF_MIGRATED; 1624 wake_flags |= WF_MIGRATED;
1526 set_task_cpu(p, cpu); 1625 set_task_cpu(p, cpu);
@@ -1577,8 +1676,9 @@ out:
1577 * @p: The process to be woken up. 1676 * @p: The process to be woken up.
1578 * 1677 *
1579 * Attempt to wake up the nominated process and move it to the set of runnable 1678 * Attempt to wake up the nominated process and move it to the set of runnable
1580 * processes. Returns 1 if the process was woken up, 0 if it was already 1679 * processes.
1581 * running. 1680 *
1681 * Return: 1 if the process was woken up, 0 if it was already running.
1582 * 1682 *
1583 * It may be assumed that this function implies a write memory barrier before 1683 * It may be assumed that this function implies a write memory barrier before
1584 * changing the task state if and only if any tasks are woken up. 1684 * changing the task state if and only if any tasks are woken up.
@@ -1601,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1601 * 1701 *
1602 * __sched_fork() is basic setup used by init_idle() too: 1702 * __sched_fork() is basic setup used by init_idle() too:
1603 */ 1703 */
1604static void __sched_fork(struct task_struct *p) 1704static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1605{ 1705{
1606 p->on_rq = 0; 1706 p->on_rq = 0;
1607 1707
@@ -1625,16 +1725,24 @@ static void __sched_fork(struct task_struct *p)
1625 1725
1626#ifdef CONFIG_NUMA_BALANCING 1726#ifdef CONFIG_NUMA_BALANCING
1627 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1727 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1628 p->mm->numa_next_scan = jiffies; 1728 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1629 p->mm->numa_next_reset = jiffies;
1630 p->mm->numa_scan_seq = 0; 1729 p->mm->numa_scan_seq = 0;
1631 } 1730 }
1632 1731
1732 if (clone_flags & CLONE_VM)
1733 p->numa_preferred_nid = current->numa_preferred_nid;
1734 else
1735 p->numa_preferred_nid = -1;
1736
1633 p->node_stamp = 0ULL; 1737 p->node_stamp = 0ULL;
1634 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1738 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1635 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1636 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1739 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1637 p->numa_work.next = &p->numa_work; 1740 p->numa_work.next = &p->numa_work;
1741 p->numa_faults = NULL;
1742 p->numa_faults_buffer = NULL;
1743
1744 INIT_LIST_HEAD(&p->numa_entry);
1745 p->numa_group = NULL;
1638#endif /* CONFIG_NUMA_BALANCING */ 1746#endif /* CONFIG_NUMA_BALANCING */
1639} 1747}
1640 1748
@@ -1660,12 +1768,12 @@ void set_numabalancing_state(bool enabled)
1660/* 1768/*
1661 * fork()/clone()-time setup: 1769 * fork()/clone()-time setup:
1662 */ 1770 */
1663void sched_fork(struct task_struct *p) 1771void sched_fork(unsigned long clone_flags, struct task_struct *p)
1664{ 1772{
1665 unsigned long flags; 1773 unsigned long flags;
1666 int cpu = get_cpu(); 1774 int cpu = get_cpu();
1667 1775
1668 __sched_fork(p); 1776 __sched_fork(clone_flags, p);
1669 /* 1777 /*
1670 * We mark the process as running here. This guarantees that 1778 * We mark the process as running here. This guarantees that
1671 * nobody will actually run it, and a signal or other external 1779 * nobody will actually run it, and a signal or other external
@@ -1723,10 +1831,7 @@ void sched_fork(struct task_struct *p)
1723#if defined(CONFIG_SMP) 1831#if defined(CONFIG_SMP)
1724 p->on_cpu = 0; 1832 p->on_cpu = 0;
1725#endif 1833#endif
1726#ifdef CONFIG_PREEMPT_COUNT 1834 init_task_preempt_count(p);
1727 /* Want to start with kernel preemption disabled. */
1728 task_thread_info(p)->preempt_count = 1;
1729#endif
1730#ifdef CONFIG_SMP 1835#ifdef CONFIG_SMP
1731 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1836 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1732#endif 1837#endif
@@ -1753,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p)
1753 * - cpus_allowed can change in the fork path 1858 * - cpus_allowed can change in the fork path
1754 * - any previously selected cpu might disappear through hotplug 1859 * - any previously selected cpu might disappear through hotplug
1755 */ 1860 */
1756 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1861 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
1757#endif 1862#endif
1758 1863
1759 /* Initialize new task's runnable average */ 1864 /* Initialize new task's runnable average */
@@ -1844,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1844 struct task_struct *next) 1949 struct task_struct *next)
1845{ 1950{
1846 trace_sched_switch(prev, next); 1951 trace_sched_switch(prev, next);
1847 sched_info_switch(prev, next); 1952 sched_info_switch(rq, prev, next);
1848 perf_event_task_sched_out(prev, next); 1953 perf_event_task_sched_out(prev, next);
1849 fire_sched_out_preempt_notifiers(prev, next); 1954 fire_sched_out_preempt_notifiers(prev, next);
1850 prepare_lock_switch(rq, next); 1955 prepare_lock_switch(rq, next);
@@ -1896,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1896 if (mm) 2001 if (mm)
1897 mmdrop(mm); 2002 mmdrop(mm);
1898 if (unlikely(prev_state == TASK_DEAD)) { 2003 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev);
2005
1899 /* 2006 /*
1900 * Remove function-return probe instances associated with this 2007 * Remove function-return probe instances associated with this
1901 * task and put them back on the free list. 2008 * task and put them back on the free list.
@@ -2079,7 +2186,7 @@ void sched_exec(void)
2079 int dest_cpu; 2186 int dest_cpu;
2080 2187
2081 raw_spin_lock_irqsave(&p->pi_lock, flags); 2188 raw_spin_lock_irqsave(&p->pi_lock, flags);
2082 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 2189 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2083 if (dest_cpu == smp_processor_id()) 2190 if (dest_cpu == smp_processor_id())
2084 goto unlock; 2191 goto unlock;
2085 2192
@@ -2191,6 +2298,8 @@ void scheduler_tick(void)
2191 * This makes sure that uptime, CFS vruntime, load 2298 * This makes sure that uptime, CFS vruntime, load
2192 * balancing, etc... continue to move forward, even 2299 * balancing, etc... continue to move forward, even
2193 * with a very low granularity. 2300 * with a very low granularity.
2301 *
2302 * Return: Maximum deferment in nanoseconds.
2194 */ 2303 */
2195u64 scheduler_tick_max_deferment(void) 2304u64 scheduler_tick_max_deferment(void)
2196{ 2305{
@@ -2219,7 +2328,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
2219#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2328#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2220 defined(CONFIG_PREEMPT_TRACER)) 2329 defined(CONFIG_PREEMPT_TRACER))
2221 2330
2222void __kprobes add_preempt_count(int val) 2331void __kprobes preempt_count_add(int val)
2223{ 2332{
2224#ifdef CONFIG_DEBUG_PREEMPT 2333#ifdef CONFIG_DEBUG_PREEMPT
2225 /* 2334 /*
@@ -2228,7 +2337,7 @@ void __kprobes add_preempt_count(int val)
2228 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2337 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2229 return; 2338 return;
2230#endif 2339#endif
2231 preempt_count() += val; 2340 __preempt_count_add(val);
2232#ifdef CONFIG_DEBUG_PREEMPT 2341#ifdef CONFIG_DEBUG_PREEMPT
2233 /* 2342 /*
2234 * Spinlock count overflowing soon? 2343 * Spinlock count overflowing soon?
@@ -2239,9 +2348,9 @@ void __kprobes add_preempt_count(int val)
2239 if (preempt_count() == val) 2348 if (preempt_count() == val)
2240 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2349 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2241} 2350}
2242EXPORT_SYMBOL(add_preempt_count); 2351EXPORT_SYMBOL(preempt_count_add);
2243 2352
2244void __kprobes sub_preempt_count(int val) 2353void __kprobes preempt_count_sub(int val)
2245{ 2354{
2246#ifdef CONFIG_DEBUG_PREEMPT 2355#ifdef CONFIG_DEBUG_PREEMPT
2247 /* 2356 /*
@@ -2259,9 +2368,9 @@ void __kprobes sub_preempt_count(int val)
2259 2368
2260 if (preempt_count() == val) 2369 if (preempt_count() == val)
2261 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2370 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2262 preempt_count() -= val; 2371 __preempt_count_sub(val);
2263} 2372}
2264EXPORT_SYMBOL(sub_preempt_count); 2373EXPORT_SYMBOL(preempt_count_sub);
2265 2374
2266#endif 2375#endif
2267 2376
@@ -2394,6 +2503,12 @@ need_resched:
2394 if (sched_feat(HRTICK)) 2503 if (sched_feat(HRTICK))
2395 hrtick_clear(rq); 2504 hrtick_clear(rq);
2396 2505
2506 /*
2507 * Make sure that signal_pending_state()->signal_pending() below
2508 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
2509 * done by the caller to avoid the race with signal_wake_up().
2510 */
2511 smp_mb__before_spinlock();
2397 raw_spin_lock_irq(&rq->lock); 2512 raw_spin_lock_irq(&rq->lock);
2398 2513
2399 switch_count = &prev->nivcsw; 2514 switch_count = &prev->nivcsw;
@@ -2428,6 +2543,7 @@ need_resched:
2428 put_prev_task(rq, prev); 2543 put_prev_task(rq, prev);
2429 next = pick_next_task(rq); 2544 next = pick_next_task(rq);
2430 clear_tsk_need_resched(prev); 2545 clear_tsk_need_resched(prev);
2546 clear_preempt_need_resched();
2431 rq->skip_clock_update = 0; 2547 rq->skip_clock_update = 0;
2432 2548
2433 if (likely(prev != next)) { 2549 if (likely(prev != next)) {
@@ -2510,19 +2626,17 @@ void __sched schedule_preempt_disabled(void)
2510 */ 2626 */
2511asmlinkage void __sched notrace preempt_schedule(void) 2627asmlinkage void __sched notrace preempt_schedule(void)
2512{ 2628{
2513 struct thread_info *ti = current_thread_info();
2514
2515 /* 2629 /*
2516 * If there is a non-zero preempt_count or interrupts are disabled, 2630 * If there is a non-zero preempt_count or interrupts are disabled,
2517 * we do not want to preempt the current task. Just return.. 2631 * we do not want to preempt the current task. Just return..
2518 */ 2632 */
2519 if (likely(ti->preempt_count || irqs_disabled())) 2633 if (likely(!preemptible()))
2520 return; 2634 return;
2521 2635
2522 do { 2636 do {
2523 add_preempt_count_notrace(PREEMPT_ACTIVE); 2637 __preempt_count_add(PREEMPT_ACTIVE);
2524 __schedule(); 2638 __schedule();
2525 sub_preempt_count_notrace(PREEMPT_ACTIVE); 2639 __preempt_count_sub(PREEMPT_ACTIVE);
2526 2640
2527 /* 2641 /*
2528 * Check again in case we missed a preemption opportunity 2642 * Check again in case we missed a preemption opportunity
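
For reference (assumed from <linux/preempt.h> under CONFIG_PREEMPT_COUNT, not part of this hunk), the test that replaces the open-coded check is:

#define preemptible()	(preempt_count() == 0 && !irqs_disabled())
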
@@ -2541,20 +2655,19 @@ EXPORT_SYMBOL(preempt_schedule);
2541 */ 2655 */
2542asmlinkage void __sched preempt_schedule_irq(void) 2656asmlinkage void __sched preempt_schedule_irq(void)
2543{ 2657{
2544 struct thread_info *ti = current_thread_info();
2545 enum ctx_state prev_state; 2658 enum ctx_state prev_state;
2546 2659
2547 /* Catch callers which need to be fixed */ 2660 /* Catch callers which need to be fixed */
2548 BUG_ON(ti->preempt_count || !irqs_disabled()); 2661 BUG_ON(preempt_count() || !irqs_disabled());
2549 2662
2550 prev_state = exception_enter(); 2663 prev_state = exception_enter();
2551 2664
2552 do { 2665 do {
2553 add_preempt_count(PREEMPT_ACTIVE); 2666 __preempt_count_add(PREEMPT_ACTIVE);
2554 local_irq_enable(); 2667 local_irq_enable();
2555 __schedule(); 2668 __schedule();
2556 local_irq_disable(); 2669 local_irq_disable();
2557 sub_preempt_count(PREEMPT_ACTIVE); 2670 __preempt_count_sub(PREEMPT_ACTIVE);
2558 2671
2559 /* 2672 /*
2560 * Check again in case we missed a preemption opportunity 2673 * Check again in case we missed a preemption opportunity
@@ -2575,393 +2688,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2575} 2688}
2576EXPORT_SYMBOL(default_wake_function); 2689EXPORT_SYMBOL(default_wake_function);
2577 2690
2578/*
2579 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
2580 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
2581 * number) then we wake all the non-exclusive tasks and one exclusive task.
2582 *
2583 * There are circumstances in which we can try to wake a task which has already
2584 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
2585 * zero in this (rare) case, and we handle it by continuing to scan the queue.
2586 */
2587static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2588 int nr_exclusive, int wake_flags, void *key)
2589{
2590 wait_queue_t *curr, *next;
2591
2592 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
2593 unsigned flags = curr->flags;
2594
2595 if (curr->func(curr, mode, wake_flags, key) &&
2596 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
2597 break;
2598 }
2599}
2600
2601/**
2602 * __wake_up - wake up threads blocked on a waitqueue.
2603 * @q: the waitqueue
2604 * @mode: which threads
2605 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2606 * @key: is directly passed to the wakeup function
2607 *
2608 * It may be assumed that this function implies a write memory barrier before
2609 * changing the task state if and only if any tasks are woken up.
2610 */
2611void __wake_up(wait_queue_head_t *q, unsigned int mode,
2612 int nr_exclusive, void *key)
2613{
2614 unsigned long flags;
2615
2616 spin_lock_irqsave(&q->lock, flags);
2617 __wake_up_common(q, mode, nr_exclusive, 0, key);
2618 spin_unlock_irqrestore(&q->lock, flags);
2619}
2620EXPORT_SYMBOL(__wake_up);
2621
2622/*
2623 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2624 */
2625void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
2626{
2627 __wake_up_common(q, mode, nr, 0, NULL);
2628}
2629EXPORT_SYMBOL_GPL(__wake_up_locked);
2630
2631void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
2632{
2633 __wake_up_common(q, mode, 1, 0, key);
2634}
2635EXPORT_SYMBOL_GPL(__wake_up_locked_key);
2636
2637/**
2638 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
2639 * @q: the waitqueue
2640 * @mode: which threads
2641 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2642 * @key: opaque value to be passed to wakeup targets
2643 *
2644 * The sync wakeup differs that the waker knows that it will schedule
2645 * away soon, so while the target thread will be woken up, it will not
2646 * be migrated to another CPU - ie. the two threads are 'synchronized'
2647 * with each other. This can prevent needless bouncing between CPUs.
2648 *
2649 * On UP it can prevent extra preemption.
2650 *
2651 * It may be assumed that this function implies a write memory barrier before
2652 * changing the task state if and only if any tasks are woken up.
2653 */
2654void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
2655 int nr_exclusive, void *key)
2656{
2657 unsigned long flags;
2658 int wake_flags = WF_SYNC;
2659
2660 if (unlikely(!q))
2661 return;
2662
2663 if (unlikely(!nr_exclusive))
2664 wake_flags = 0;
2665
2666 spin_lock_irqsave(&q->lock, flags);
2667 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
2668 spin_unlock_irqrestore(&q->lock, flags);
2669}
2670EXPORT_SYMBOL_GPL(__wake_up_sync_key);
2671
2672/*
2673 * __wake_up_sync - see __wake_up_sync_key()
2674 */
2675void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2676{
2677 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
2678}
2679EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
2680
2681/**
2682 * complete: - signals a single thread waiting on this completion
2683 * @x: holds the state of this particular completion
2684 *
2685 * This will wake up a single thread waiting on this completion. Threads will be
2686 * awakened in the same order in which they were queued.
2687 *
2688 * See also complete_all(), wait_for_completion() and related routines.
2689 *
2690 * It may be assumed that this function implies a write memory barrier before
2691 * changing the task state if and only if any tasks are woken up.
2692 */
2693void complete(struct completion *x)
2694{
2695 unsigned long flags;
2696
2697 spin_lock_irqsave(&x->wait.lock, flags);
2698 x->done++;
2699 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
2700 spin_unlock_irqrestore(&x->wait.lock, flags);
2701}
2702EXPORT_SYMBOL(complete);
2703
2704/**
2705 * complete_all: - signals all threads waiting on this completion
2706 * @x: holds the state of this particular completion
2707 *
2708 * This will wake up all threads waiting on this particular completion event.
2709 *
2710 * It may be assumed that this function implies a write memory barrier before
2711 * changing the task state if and only if any tasks are woken up.
2712 */
2713void complete_all(struct completion *x)
2714{
2715 unsigned long flags;
2716
2717 spin_lock_irqsave(&x->wait.lock, flags);
2718 x->done += UINT_MAX/2;
2719 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
2720 spin_unlock_irqrestore(&x->wait.lock, flags);
2721}
2722EXPORT_SYMBOL(complete_all);
2723
2724static inline long __sched
2725do_wait_for_common(struct completion *x,
2726 long (*action)(long), long timeout, int state)
2727{
2728 if (!x->done) {
2729 DECLARE_WAITQUEUE(wait, current);
2730
2731 __add_wait_queue_tail_exclusive(&x->wait, &wait);
2732 do {
2733 if (signal_pending_state(state, current)) {
2734 timeout = -ERESTARTSYS;
2735 break;
2736 }
2737 __set_current_state(state);
2738 spin_unlock_irq(&x->wait.lock);
2739 timeout = action(timeout);
2740 spin_lock_irq(&x->wait.lock);
2741 } while (!x->done && timeout);
2742 __remove_wait_queue(&x->wait, &wait);
2743 if (!x->done)
2744 return timeout;
2745 }
2746 x->done--;
2747 return timeout ?: 1;
2748}
2749
2750static inline long __sched
2751__wait_for_common(struct completion *x,
2752 long (*action)(long), long timeout, int state)
2753{
2754 might_sleep();
2755
2756 spin_lock_irq(&x->wait.lock);
2757 timeout = do_wait_for_common(x, action, timeout, state);
2758 spin_unlock_irq(&x->wait.lock);
2759 return timeout;
2760}
2761
2762static long __sched
2763wait_for_common(struct completion *x, long timeout, int state)
2764{
2765 return __wait_for_common(x, schedule_timeout, timeout, state);
2766}
2767
2768static long __sched
2769wait_for_common_io(struct completion *x, long timeout, int state)
2770{
2771 return __wait_for_common(x, io_schedule_timeout, timeout, state);
2772}
2773
2774/**
2775 * wait_for_completion: - waits for completion of a task
2776 * @x: holds the state of this particular completion
2777 *
2778 * This waits to be signaled for completion of a specific task. It is NOT
2779 * interruptible and there is no timeout.
2780 *
2781 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
2782 * and interrupt capability. Also see complete().
2783 */
2784void __sched wait_for_completion(struct completion *x)
2785{
2786 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2787}
2788EXPORT_SYMBOL(wait_for_completion);
2789
2790/**
2791 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
2792 * @x: holds the state of this particular completion
2793 * @timeout: timeout value in jiffies
2794 *
2795 * This waits for either a completion of a specific task to be signaled or for a
2796 * specified timeout to expire. The timeout is in jiffies. It is not
2797 * interruptible.
2798 *
2799 * The return value is 0 if timed out, and positive (at least 1, or number of
2800 * jiffies left till timeout) if completed.
2801 */
2802unsigned long __sched
2803wait_for_completion_timeout(struct completion *x, unsigned long timeout)
2804{
2805 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
2806}
2807EXPORT_SYMBOL(wait_for_completion_timeout);
2808
2809/**
2810 * wait_for_completion_io: - waits for completion of a task
2811 * @x: holds the state of this particular completion
2812 *
2813 * This waits to be signaled for completion of a specific task. It is NOT
2814 * interruptible and there is no timeout. The caller is accounted as waiting
2815 * for IO.
2816 */
2817void __sched wait_for_completion_io(struct completion *x)
2818{
2819 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2820}
2821EXPORT_SYMBOL(wait_for_completion_io);
2822
2823/**
2824 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
2825 * @x: holds the state of this particular completion
2826 * @timeout: timeout value in jiffies
2827 *
2828 * This waits for either a completion of a specific task to be signaled or for a
2829 * specified timeout to expire. The timeout is in jiffies. It is not
2830 * interruptible. The caller is accounted as waiting for IO.
2831 *
2832 * The return value is 0 if timed out, and positive (at least 1, or number of
2833 * jiffies left till timeout) if completed.
2834 */
2835unsigned long __sched
2836wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
2837{
2838 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
2839}
2840EXPORT_SYMBOL(wait_for_completion_io_timeout);
2841
2842/**
2843 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
2844 * @x: holds the state of this particular completion
2845 *
2846 * This waits for completion of a specific task to be signaled. It is
2847 * interruptible.
2848 *
2849 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
2850 */
2851int __sched wait_for_completion_interruptible(struct completion *x)
2852{
2853 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
2854 if (t == -ERESTARTSYS)
2855 return t;
2856 return 0;
2857}
2858EXPORT_SYMBOL(wait_for_completion_interruptible);
2859
2860/**
2861 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
2862 * @x: holds the state of this particular completion
2863 * @timeout: timeout value in jiffies
2864 *
2865 * This waits for either a completion of a specific task to be signaled or for a
2866 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
2867 *
2868 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
2869 * positive (at least 1, or number of jiffies left till timeout) if completed.
2870 */
2871long __sched
2872wait_for_completion_interruptible_timeout(struct completion *x,
2873 unsigned long timeout)
2874{
2875 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
2876}
2877EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
2878
2879/**
2880 * wait_for_completion_killable: - waits for completion of a task (killable)
2881 * @x: holds the state of this particular completion
2882 *
2883 * This waits to be signaled for completion of a specific task. It can be
2884 * interrupted by a kill signal.
2885 *
2886 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
2887 */
2888int __sched wait_for_completion_killable(struct completion *x)
2889{
2890 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
2891 if (t == -ERESTARTSYS)
2892 return t;
2893 return 0;
2894}
2895EXPORT_SYMBOL(wait_for_completion_killable);
2896
2897/**
2898 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
2899 * @x: holds the state of this particular completion
2900 * @timeout: timeout value in jiffies
2901 *
2902 * This waits for either a completion of a specific task to be
2903 * signaled or for a specified timeout to expire. It can be
2904 * interrupted by a kill signal. The timeout is in jiffies.
2905 *
2906 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
2907 * positive (at least 1, or number of jiffies left till timeout) if completed.
2908 */
2909long __sched
2910wait_for_completion_killable_timeout(struct completion *x,
2911 unsigned long timeout)
2912{
2913 return wait_for_common(x, timeout, TASK_KILLABLE);
2914}
2915EXPORT_SYMBOL(wait_for_completion_killable_timeout);
2916
2917/**
2918 * try_wait_for_completion - try to decrement a completion without blocking
2919 * @x: completion structure
2920 *
2921 * Returns: 0 if a decrement cannot be done without blocking
2922 * 1 if a decrement succeeded.
2923 *
2924 * If a completion is being used as a counting completion,
2925 * attempt to decrement the counter without blocking. This
2926 * enables us to avoid waiting if the resource the completion
2927 * is protecting is not available.
2928 */
2929bool try_wait_for_completion(struct completion *x)
2930{
2931 unsigned long flags;
2932 int ret = 1;
2933
2934 spin_lock_irqsave(&x->wait.lock, flags);
2935 if (!x->done)
2936 ret = 0;
2937 else
2938 x->done--;
2939 spin_unlock_irqrestore(&x->wait.lock, flags);
2940 return ret;
2941}
2942EXPORT_SYMBOL(try_wait_for_completion);
2943
2944/**
2945 * completion_done - Test to see if a completion has any waiters
2946 * @x: completion structure
2947 *
2948 * Returns: 0 if there are waiters (wait_for_completion() in progress)
2949 * 1 if there are no waiters.
2950 *
2951 */
2952bool completion_done(struct completion *x)
2953{
2954 unsigned long flags;
2955 int ret = 1;
2956
2957 spin_lock_irqsave(&x->wait.lock, flags);
2958 if (!x->done)
2959 ret = 0;
2960 spin_unlock_irqrestore(&x->wait.lock, flags);
2961 return ret;
2962}
2963EXPORT_SYMBOL(completion_done);
2964
2965static long __sched 2691static long __sched
2966sleep_on_common(wait_queue_head_t *q, int state, long timeout) 2692sleep_on_common(wait_queue_head_t *q, int state, long timeout)
2967{ 2693{
@@ -3182,7 +2908,7 @@ SYSCALL_DEFINE1(nice, int, increment)
3182 * task_prio - return the priority value of a given task. 2908 * task_prio - return the priority value of a given task.
3183 * @p: the task in question. 2909 * @p: the task in question.
3184 * 2910 *
3185 * This is the priority value as seen by users in /proc. 2911 * Return: The priority value as seen by users in /proc.
3186 * RT tasks are offset by -200. Normal tasks are centered 2912 * RT tasks are offset by -200. Normal tasks are centered
3187 * around 0, value goes from -16 to +15. 2913 * around 0, value goes from -16 to +15.
3188 */ 2914 */
@@ -3194,6 +2920,8 @@ int task_prio(const struct task_struct *p)
3194/** 2920/**
3195 * task_nice - return the nice value of a given task. 2921 * task_nice - return the nice value of a given task.
3196 * @p: the task in question. 2922 * @p: the task in question.
2923 *
2924 * Return: The nice value [ -20 ... 0 ... 19 ].
3197 */ 2925 */
3198int task_nice(const struct task_struct *p) 2926int task_nice(const struct task_struct *p)
3199{ 2927{
@@ -3204,6 +2932,8 @@ EXPORT_SYMBOL(task_nice);
3204/** 2932/**
3205 * idle_cpu - is a given cpu idle currently? 2933 * idle_cpu - is a given cpu idle currently?
3206 * @cpu: the processor in question. 2934 * @cpu: the processor in question.
2935 *
2936 * Return: 1 if the CPU is currently idle. 0 otherwise.
3207 */ 2937 */
3208int idle_cpu(int cpu) 2938int idle_cpu(int cpu)
3209{ 2939{
@@ -3226,6 +2956,8 @@ int idle_cpu(int cpu)
3226/** 2956/**
3227 * idle_task - return the idle task for a given cpu. 2957 * idle_task - return the idle task for a given cpu.
3228 * @cpu: the processor in question. 2958 * @cpu: the processor in question.
2959 *
2960 * Return: The idle task for the cpu @cpu.
3229 */ 2961 */
3230struct task_struct *idle_task(int cpu) 2962struct task_struct *idle_task(int cpu)
3231{ 2963{
@@ -3235,6 +2967,8 @@ struct task_struct *idle_task(int cpu)
3235/** 2967/**
3236 * find_process_by_pid - find a process with a matching PID value. 2968 * find_process_by_pid - find a process with a matching PID value.
3237 * @pid: the pid in question. 2969 * @pid: the pid in question.
2970 *
2971 * The task of @pid, if found. %NULL otherwise.
3238 */ 2972 */
3239static struct task_struct *find_process_by_pid(pid_t pid) 2973static struct task_struct *find_process_by_pid(pid_t pid)
3240{ 2974{
@@ -3432,6 +3166,8 @@ recheck:
3432 * @policy: new policy. 3166 * @policy: new policy.
3433 * @param: structure containing the new RT priority. 3167 * @param: structure containing the new RT priority.
3434 * 3168 *
3169 * Return: 0 on success. An error code otherwise.
3170 *
3435 * NOTE that the task may be already dead. 3171 * NOTE that the task may be already dead.
3436 */ 3172 */
3437int sched_setscheduler(struct task_struct *p, int policy, 3173int sched_setscheduler(struct task_struct *p, int policy,
@@ -3451,6 +3187,8 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
3451 * current context has permission. For example, this is needed in 3187 * current context has permission. For example, this is needed in
3452 * stop_machine(): we create temporary high priority worker threads, 3188 * stop_machine(): we create temporary high priority worker threads,
3453 * but our caller might not have that capability. 3189 * but our caller might not have that capability.
3190 *
3191 * Return: 0 on success. An error code otherwise.
3454 */ 3192 */
3455int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3193int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3456 const struct sched_param *param) 3194 const struct sched_param *param)
@@ -3485,6 +3223,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3485 * @pid: the pid in question. 3223 * @pid: the pid in question.
3486 * @policy: new policy. 3224 * @policy: new policy.
3487 * @param: structure containing the new RT priority. 3225 * @param: structure containing the new RT priority.
3226 *
3227 * Return: 0 on success. An error code otherwise.
3488 */ 3228 */
3489SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3229SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3490 struct sched_param __user *, param) 3230 struct sched_param __user *, param)
@@ -3500,6 +3240,8 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3500 * sys_sched_setparam - set/change the RT priority of a thread 3240 * sys_sched_setparam - set/change the RT priority of a thread
3501 * @pid: the pid in question. 3241 * @pid: the pid in question.
3502 * @param: structure containing the new RT priority. 3242 * @param: structure containing the new RT priority.
3243 *
3244 * Return: 0 on success. An error code otherwise.
3503 */ 3245 */
3504SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3246SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3505{ 3247{
@@ -3509,6 +3251,9 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3509/** 3251/**
3510 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3252 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3511 * @pid: the pid in question. 3253 * @pid: the pid in question.
3254 *
3255 * Return: On success, the policy of the thread. Otherwise, a negative error
3256 * code.
3512 */ 3257 */
3513SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3258SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3514{ 3259{
@@ -3535,6 +3280,9 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3535 * sys_sched_getparam - get the RT priority of a thread 3280 * sys_sched_getparam - get the RT priority of a thread
3536 * @pid: the pid in question. 3281 * @pid: the pid in question.
3537 * @param: structure containing the RT priority. 3282 * @param: structure containing the RT priority.
3283 *
3284 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
3285 * code.
3538 */ 3286 */
3539SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3287SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3540{ 3288{
@@ -3576,13 +3324,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3576 struct task_struct *p; 3324 struct task_struct *p;
3577 int retval; 3325 int retval;
3578 3326
3579 get_online_cpus();
3580 rcu_read_lock(); 3327 rcu_read_lock();
3581 3328
3582 p = find_process_by_pid(pid); 3329 p = find_process_by_pid(pid);
3583 if (!p) { 3330 if (!p) {
3584 rcu_read_unlock(); 3331 rcu_read_unlock();
3585 put_online_cpus();
3586 return -ESRCH; 3332 return -ESRCH;
3587 } 3333 }
3588 3334
@@ -3639,7 +3385,6 @@ out_free_cpus_allowed:
3639 free_cpumask_var(cpus_allowed); 3385 free_cpumask_var(cpus_allowed);
3640out_put_task: 3386out_put_task:
3641 put_task_struct(p); 3387 put_task_struct(p);
3642 put_online_cpus();
3643 return retval; 3388 return retval;
3644} 3389}
3645 3390
@@ -3659,6 +3404,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3659 * @pid: pid of the process 3404 * @pid: pid of the process
3660 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3405 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3661 * @user_mask_ptr: user-space pointer to the new cpu mask 3406 * @user_mask_ptr: user-space pointer to the new cpu mask
3407 *
3408 * Return: 0 on success. An error code otherwise.
3662 */ 3409 */
3663SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 3410SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
3664 unsigned long __user *, user_mask_ptr) 3411 unsigned long __user *, user_mask_ptr)
@@ -3682,7 +3429,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3682 unsigned long flags; 3429 unsigned long flags;
3683 int retval; 3430 int retval;
3684 3431
3685 get_online_cpus();
3686 rcu_read_lock(); 3432 rcu_read_lock();
3687 3433
3688 retval = -ESRCH; 3434 retval = -ESRCH;
@@ -3695,12 +3441,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3695 goto out_unlock; 3441 goto out_unlock;
3696 3442
3697 raw_spin_lock_irqsave(&p->pi_lock, flags); 3443 raw_spin_lock_irqsave(&p->pi_lock, flags);
3698 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 3444 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
3699 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3445 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3700 3446
3701out_unlock: 3447out_unlock:
3702 rcu_read_unlock(); 3448 rcu_read_unlock();
3703 put_online_cpus();
3704 3449
3705 return retval; 3450 return retval;
3706} 3451}
@@ -3710,6 +3455,8 @@ out_unlock:
3710 * @pid: pid of the process 3455 * @pid: pid of the process
3711 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3456 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3712 * @user_mask_ptr: user-space pointer to hold the current cpu mask 3457 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3458 *
3459 * Return: 0 on success. An error code otherwise.
3713 */ 3460 */
3714SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 3461SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
3715 unsigned long __user *, user_mask_ptr) 3462 unsigned long __user *, user_mask_ptr)
@@ -3744,6 +3491,8 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
3744 * 3491 *
3745 * This function yields the current CPU to other tasks. If there are no 3492 * This function yields the current CPU to other tasks. If there are no
3746 * other threads running on this CPU then this function will return. 3493 * other threads running on this CPU then this function will return.
3494 *
3495 * Return: 0.
3747 */ 3496 */
3748SYSCALL_DEFINE0(sched_yield) 3497SYSCALL_DEFINE0(sched_yield)
3749{ 3498{
@@ -3766,16 +3515,11 @@ SYSCALL_DEFINE0(sched_yield)
3766 return 0; 3515 return 0;
3767} 3516}
3768 3517
3769static inline int should_resched(void)
3770{
3771 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
3772}
3773
3774static void __cond_resched(void) 3518static void __cond_resched(void)
3775{ 3519{
3776 add_preempt_count(PREEMPT_ACTIVE); 3520 __preempt_count_add(PREEMPT_ACTIVE);
3777 __schedule(); 3521 __schedule();
3778 sub_preempt_count(PREEMPT_ACTIVE); 3522 __preempt_count_sub(PREEMPT_ACTIVE);
3779} 3523}
3780 3524
3781int __sched _cond_resched(void) 3525int __sched _cond_resched(void)
@@ -3869,7 +3613,7 @@ EXPORT_SYMBOL(yield);
3869 * It's the caller's job to ensure that the target task struct 3613 * It's the caller's job to ensure that the target task struct
3870 * can't go away on us before we can do any checks. 3614 * can't go away on us before we can do any checks.
3871 * 3615 *
3872 * Returns: 3616 * Return:
3873 * true (>0) if we indeed boosted the target task. 3617 * true (>0) if we indeed boosted the target task.
3874 * false (0) if we failed to boost the target. 3618 * false (0) if we failed to boost the target.
3875 * -ESRCH if there's no task to yield to. 3619 * -ESRCH if there's no task to yield to.
@@ -3972,8 +3716,9 @@ long __sched io_schedule_timeout(long timeout)
3972 * sys_sched_get_priority_max - return maximum RT priority. 3716 * sys_sched_get_priority_max - return maximum RT priority.
3973 * @policy: scheduling class. 3717 * @policy: scheduling class.
3974 * 3718 *
3975 * this syscall returns the maximum rt_priority that can be used 3719 * Return: On success, this syscall returns the maximum
3976 * by a given scheduling class. 3720 * rt_priority that can be used by a given scheduling class.
3721 * On failure, a negative error code is returned.
3977 */ 3722 */
3978SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 3723SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
3979{ 3724{
@@ -3997,8 +3742,9 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
3997 * sys_sched_get_priority_min - return minimum RT priority. 3742 * sys_sched_get_priority_min - return minimum RT priority.
3998 * @policy: scheduling class. 3743 * @policy: scheduling class.
3999 * 3744 *
4000 * this syscall returns the minimum rt_priority that can be used 3745 * Return: On success, this syscall returns the minimum
4001 * by a given scheduling class. 3746 * rt_priority that can be used by a given scheduling class.
3747 * On failure, a negative error code is returned.
4002 */ 3748 */
4003SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 3749SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4004{ 3750{
@@ -4024,6 +3770,9 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4024 * 3770 *
4025 * this syscall writes the default timeslice value of a given process 3771 * this syscall writes the default timeslice value of a given process
4026 * into the user-space timespec buffer. A value of '0' means infinity. 3772 * into the user-space timespec buffer. A value of '0' means infinity.
3773 *
3774 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
3775 * an error code.
4027 */ 3776 */
4028SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 3777SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4029 struct timespec __user *, interval) 3778 struct timespec __user *, interval)
@@ -4153,7 +3902,7 @@ void init_idle(struct task_struct *idle, int cpu)
4153 3902
4154 raw_spin_lock_irqsave(&rq->lock, flags); 3903 raw_spin_lock_irqsave(&rq->lock, flags);
4155 3904
4156 __sched_fork(idle); 3905 __sched_fork(0, idle);
4157 idle->state = TASK_RUNNING; 3906 idle->state = TASK_RUNNING;
4158 idle->se.exec_start = sched_clock(); 3907 idle->se.exec_start = sched_clock();
4159 3908
@@ -4179,7 +3928,7 @@ void init_idle(struct task_struct *idle, int cpu)
4179 raw_spin_unlock_irqrestore(&rq->lock, flags); 3928 raw_spin_unlock_irqrestore(&rq->lock, flags);
4180 3929
4181 /* Set the preempt count _outside_ the spinlocks! */ 3930 /* Set the preempt count _outside_ the spinlocks! */
4182 task_thread_info(idle)->preempt_count = 0; 3931 init_idle_preempt_count(idle, cpu);
4183 3932
4184 /* 3933 /*
4185 * The idle tasks have their own, simple scheduling class: 3934 * The idle tasks have their own, simple scheduling class:
@@ -4313,6 +4062,53 @@ fail:
4313 return ret; 4062 return ret;
4314} 4063}
4315 4064
4065#ifdef CONFIG_NUMA_BALANCING
4066/* Migrate current task p to target_cpu */
4067int migrate_task_to(struct task_struct *p, int target_cpu)
4068{
4069 struct migration_arg arg = { p, target_cpu };
4070 int curr_cpu = task_cpu(p);
4071
4072 if (curr_cpu == target_cpu)
4073 return 0;
4074
4075 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4076 return -EINVAL;
4077
4078 /* TODO: This is not properly updating schedstats */
4079
4080 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4081}
4082
4083/*
4084 * Requeue a task on a given node and accurately track the number of NUMA
4085 * tasks on the runqueues
4086 */
4087void sched_setnuma(struct task_struct *p, int nid)
4088{
4089 struct rq *rq;
4090 unsigned long flags;
4091 bool on_rq, running;
4092
4093 rq = task_rq_lock(p, &flags);
4094 on_rq = p->on_rq;
4095 running = task_current(rq, p);
4096
4097 if (on_rq)
4098 dequeue_task(rq, p, 0);
4099 if (running)
4100 p->sched_class->put_prev_task(rq, p);
4101
4102 p->numa_preferred_nid = nid;
4103
4104 if (running)
4105 p->sched_class->set_curr_task(rq);
4106 if (on_rq)
4107 enqueue_task(rq, p, 0);
4108 task_rq_unlock(rq, p, &flags);
4109}
4110#endif
4111
4316/* 4112/*
4317 * migration_cpu_stop - this will be executed by a highprio stopper thread 4113 * migration_cpu_stop - this will be executed by a highprio stopper thread
4318 * and performs thread migration by bumping thread off CPU then 4114 * and performs thread migration by bumping thread off CPU then
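
The sched_setnuma() helper added above uses the scheduler's usual pattern for changing an attribute that per-runqueue bookkeeping depends on: take the task off its runqueue (and drop it as the current task) first, update the field, then requeue it so the NUMA counters stay consistent. A minimal user-space sketch of that dequeue/modify/enqueue pattern, with a hypothetical array-backed queue standing in for the real runqueue helpers:

    /* Sketch only: mirrors the dequeue -> modify -> enqueue pattern of
     * sched_setnuma(); the queue and task types here are made up. */
    #include <stdbool.h>
    #include <stdio.h>

    struct task { int preferred_nid; bool queued; };
    struct runqueue { struct task *tasks[8]; int nr; };

    static void dequeue(struct runqueue *rq, struct task *t)
    {
        for (int i = 0; i < rq->nr; i++) {
            if (rq->tasks[i] == t) {
                rq->tasks[i] = rq->tasks[--rq->nr];
                t->queued = false;
                return;
            }
        }
    }

    static void enqueue(struct runqueue *rq, struct task *t)
    {
        rq->tasks[rq->nr++] = t;
        t->queued = true;
    }

    /* Change a field the queue accounting depends on: off the queue first,
     * update, then back on, so per-queue counters stay consistent. */
    static void set_preferred_node(struct runqueue *rq, struct task *t, int nid)
    {
        bool was_queued = t->queued;

        if (was_queued)
            dequeue(rq, t);
        t->preferred_nid = nid;
        if (was_queued)
            enqueue(rq, t);
    }

    int main(void)
    {
        struct runqueue rq = { .nr = 0 };
        struct task t = { .preferred_nid = -1 };

        enqueue(&rq, &t);
        set_preferred_node(&rq, &t, 1);
        printf("preferred_nid=%d queued=%d\n", t.preferred_nid, t.queued);
        return 0;
    }
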
@@ -4914,7 +4710,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
4914 SD_BALANCE_FORK | 4710 SD_BALANCE_FORK |
4915 SD_BALANCE_EXEC | 4711 SD_BALANCE_EXEC |
4916 SD_SHARE_CPUPOWER | 4712 SD_SHARE_CPUPOWER |
4917 SD_SHARE_PKG_RESOURCES); 4713 SD_SHARE_PKG_RESOURCES |
4714 SD_PREFER_SIBLING);
4918 if (nr_node_ids == 1) 4715 if (nr_node_ids == 1)
4919 pflags &= ~SD_SERIALIZE; 4716 pflags &= ~SD_SERIALIZE;
4920 } 4717 }
@@ -5083,19 +4880,34 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5083 * two cpus are in the same cache domain, see cpus_share_cache(). 4880 * two cpus are in the same cache domain, see cpus_share_cache().
5084 */ 4881 */
5085DEFINE_PER_CPU(struct sched_domain *, sd_llc); 4882DEFINE_PER_CPU(struct sched_domain *, sd_llc);
4883DEFINE_PER_CPU(int, sd_llc_size);
5086DEFINE_PER_CPU(int, sd_llc_id); 4884DEFINE_PER_CPU(int, sd_llc_id);
4885DEFINE_PER_CPU(struct sched_domain *, sd_numa);
4886DEFINE_PER_CPU(struct sched_domain *, sd_busy);
4887DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5087 4888
5088static void update_top_cache_domain(int cpu) 4889static void update_top_cache_domain(int cpu)
5089{ 4890{
5090 struct sched_domain *sd; 4891 struct sched_domain *sd;
5091 int id = cpu; 4892 int id = cpu;
4893 int size = 1;
5092 4894
5093 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 4895 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5094 if (sd) 4896 if (sd) {
5095 id = cpumask_first(sched_domain_span(sd)); 4897 id = cpumask_first(sched_domain_span(sd));
4898 size = cpumask_weight(sched_domain_span(sd));
4899 rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
4900 }
5096 4901
5097 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 4902 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
4903 per_cpu(sd_llc_size, cpu) = size;
5098 per_cpu(sd_llc_id, cpu) = id; 4904 per_cpu(sd_llc_id, cpu) = id;
4905
4906 sd = lowest_flag_domain(cpu, SD_NUMA);
4907 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
4908
4909 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
4910 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
5099} 4911}
5100 4912
5101/* 4913/*
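
update_top_cache_domain() now caches, per CPU, not only the last-level-cache domain and the first CPU of its span (sd_llc_id) but also the span's size (sd_llc_size), plus the NUMA, busy and asym-packing domains. On a plain bitmask the two cached span values reduce to "first set bit" and "population count"; the sketch below uses GCC builtins as hypothetical stand-ins for cpumask_first() and cpumask_weight():

    #include <stdio.h>

    /* Sketch: a 64-bit mask standing in for a struct cpumask. */
    static int mask_first(unsigned long long mask)
    {
        return mask ? __builtin_ctzll(mask) : -1;   /* first set bit */
    }

    static int mask_weight(unsigned long long mask)
    {
        return __builtin_popcountll(mask);          /* number of set bits */
    }

    int main(void)
    {
        unsigned long long llc_span = 0xf0;  /* CPUs 4-7 share a cache */

        printf("sd_llc_id=%d sd_llc_size=%d\n",
               mask_first(llc_span), mask_weight(llc_span));
        return 0;
    }
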
@@ -5118,6 +4930,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5118 tmp->parent = parent->parent; 4930 tmp->parent = parent->parent;
5119 if (parent->parent) 4931 if (parent->parent)
5120 parent->parent->child = tmp; 4932 parent->parent->child = tmp;
4933 /*
4934 * Transfer SD_PREFER_SIBLING down in case of a
4935 * degenerate parent; the spans match for this
4936 * so the property transfers.
4937 */
4938 if (parent->flags & SD_PREFER_SIBLING)
4939 tmp->flags |= SD_PREFER_SIBLING;
5121 destroy_sched_domain(parent, cpu); 4940 destroy_sched_domain(parent, cpu);
5122 } else 4941 } else
5123 tmp = tmp->parent; 4942 tmp = tmp->parent;
@@ -5608,6 +5427,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5608 | 0*SD_SHARE_PKG_RESOURCES 5427 | 0*SD_SHARE_PKG_RESOURCES
5609 | 1*SD_SERIALIZE 5428 | 1*SD_SERIALIZE
5610 | 0*SD_PREFER_SIBLING 5429 | 0*SD_PREFER_SIBLING
5430 | 1*SD_NUMA
5611 | sd_local_flags(level) 5431 | sd_local_flags(level)
5612 , 5432 ,
5613 .last_balance = jiffies, 5433 .last_balance = jiffies,
@@ -6184,8 +6004,9 @@ match1:
6184 ; 6004 ;
6185 } 6005 }
6186 6006
6007 n = ndoms_cur;
6187 if (doms_new == NULL) { 6008 if (doms_new == NULL) {
6188 ndoms_cur = 0; 6009 n = 0;
6189 doms_new = &fallback_doms; 6010 doms_new = &fallback_doms;
6190 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6011 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6191 WARN_ON_ONCE(dattr_new); 6012 WARN_ON_ONCE(dattr_new);
@@ -6193,7 +6014,7 @@ match1:
6193 6014
6194 /* Build new domains */ 6015 /* Build new domains */
6195 for (i = 0; i < ndoms_new; i++) { 6016 for (i = 0; i < ndoms_new; i++) {
6196 for (j = 0; j < ndoms_cur && !new_topology; j++) { 6017 for (j = 0; j < n && !new_topology; j++) {
6197 if (cpumask_equal(doms_new[i], doms_cur[j]) 6018 if (cpumask_equal(doms_new[i], doms_cur[j])
6198 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6019 && dattrs_equal(dattr_new, i, dattr_cur, j))
6199 goto match2; 6020 goto match2;
@@ -6288,14 +6109,17 @@ void __init sched_init_smp(void)
6288 6109
6289 sched_init_numa(); 6110 sched_init_numa();
6290 6111
6291 get_online_cpus(); 6112 /*
6113 * There's no userspace yet to cause hotplug operations; hence all the
6114 * cpu masks are stable and all blatant races in the below code cannot
6115 * happen.
6116 */
6292 mutex_lock(&sched_domains_mutex); 6117 mutex_lock(&sched_domains_mutex);
6293 init_sched_domains(cpu_active_mask); 6118 init_sched_domains(cpu_active_mask);
6294 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6119 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6295 if (cpumask_empty(non_isolated_cpus)) 6120 if (cpumask_empty(non_isolated_cpus))
6296 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6121 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6297 mutex_unlock(&sched_domains_mutex); 6122 mutex_unlock(&sched_domains_mutex);
6298 put_online_cpus();
6299 6123
6300 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6124 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6301 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6125 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
@@ -6458,6 +6282,7 @@ void __init sched_init(void)
6458 rq->online = 0; 6282 rq->online = 0;
6459 rq->idle_stamp = 0; 6283 rq->idle_stamp = 0;
6460 rq->avg_idle = 2*sysctl_sched_migration_cost; 6284 rq->avg_idle = 2*sysctl_sched_migration_cost;
6285 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6461 6286
6462 INIT_LIST_HEAD(&rq->cfs_tasks); 6287 INIT_LIST_HEAD(&rq->cfs_tasks);
6463 6288
@@ -6632,6 +6457,8 @@ void normalize_rt_tasks(void)
6632 * @cpu: the processor in question. 6457 * @cpu: the processor in question.
6633 * 6458 *
6634 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6459 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6460 *
6461 * Return: The current task for @cpu.
6635 */ 6462 */
6636struct task_struct *curr_task(int cpu) 6463struct task_struct *curr_task(int cpu)
6637{ 6464{
@@ -6763,7 +6590,7 @@ void sched_move_task(struct task_struct *tsk)
6763 if (unlikely(running)) 6590 if (unlikely(running))
6764 tsk->sched_class->put_prev_task(rq, tsk); 6591 tsk->sched_class->put_prev_task(rq, tsk);
6765 6592
6766 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, 6593 tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
6767 lockdep_is_held(&tsk->sighand->siglock)), 6594 lockdep_is_held(&tsk->sighand->siglock)),
6768 struct task_group, css); 6595 struct task_group, css);
6769 tg = autogroup_task_group(tsk, tg); 6596 tg = autogroup_task_group(tsk, tg);
@@ -7085,23 +6912,22 @@ int sched_rt_handler(struct ctl_table *table, int write,
7085 6912
7086#ifdef CONFIG_CGROUP_SCHED 6913#ifdef CONFIG_CGROUP_SCHED
7087 6914
7088/* return corresponding task_group object of a cgroup */ 6915static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7089static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7090{ 6916{
7091 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 6917 return css ? container_of(css, struct task_group, css) : NULL;
7092 struct task_group, css);
7093} 6918}
7094 6919
7095static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) 6920static struct cgroup_subsys_state *
6921cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7096{ 6922{
7097 struct task_group *tg, *parent; 6923 struct task_group *parent = css_tg(parent_css);
6924 struct task_group *tg;
7098 6925
7099 if (!cgrp->parent) { 6926 if (!parent) {
7100 /* This is early initialization for the top cgroup */ 6927 /* This is early initialization for the top cgroup */
7101 return &root_task_group.css; 6928 return &root_task_group.css;
7102 } 6929 }
7103 6930
7104 parent = cgroup_tg(cgrp->parent);
7105 tg = sched_create_group(parent); 6931 tg = sched_create_group(parent);
7106 if (IS_ERR(tg)) 6932 if (IS_ERR(tg))
7107 return ERR_PTR(-ENOMEM); 6933 return ERR_PTR(-ENOMEM);
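
The cgroup conversion above replaces cgroup_tg(), which resolved the subsystem state through the cgroup, with css_tg(): a plain container_of() downcast from the embedded cgroup_subsys_state, with a NULL check so the root's missing parent css maps to a NULL task_group. A self-contained sketch of that accessor pattern, with deliberately simplified struct names:

    #include <stddef.h>
    #include <stdio.h>

    /* The subsystem state is embedded in the controller-private structure,
     * so a pointer to the former can be turned back into the latter. */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct css { int id; };                 /* stands in for cgroup_subsys_state */
    struct task_group { long shares; struct css css; };

    static struct task_group *css_tg(struct css *css)
    {
        return css ? container_of(css, struct task_group, css) : NULL;
    }

    int main(void)
    {
        struct task_group tg = { .shares = 1024, .css = { .id = 7 } };

        printf("shares via css: %ld\n", css_tg(&tg.css)->shares);
        printf("no parent css maps to: %p\n", (void *)css_tg(NULL));
        return 0;
    }
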
@@ -7109,41 +6935,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7109 return &tg->css; 6935 return &tg->css;
7110} 6936}
7111 6937
7112static int cpu_cgroup_css_online(struct cgroup *cgrp) 6938static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7113{ 6939{
7114 struct task_group *tg = cgroup_tg(cgrp); 6940 struct task_group *tg = css_tg(css);
7115 struct task_group *parent; 6941 struct task_group *parent = css_tg(css_parent(css));
7116
7117 if (!cgrp->parent)
7118 return 0;
7119 6942
7120 parent = cgroup_tg(cgrp->parent); 6943 if (parent)
7121 sched_online_group(tg, parent); 6944 sched_online_group(tg, parent);
7122 return 0; 6945 return 0;
7123} 6946}
7124 6947
7125static void cpu_cgroup_css_free(struct cgroup *cgrp) 6948static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7126{ 6949{
7127 struct task_group *tg = cgroup_tg(cgrp); 6950 struct task_group *tg = css_tg(css);
7128 6951
7129 sched_destroy_group(tg); 6952 sched_destroy_group(tg);
7130} 6953}
7131 6954
7132static void cpu_cgroup_css_offline(struct cgroup *cgrp) 6955static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7133{ 6956{
7134 struct task_group *tg = cgroup_tg(cgrp); 6957 struct task_group *tg = css_tg(css);
7135 6958
7136 sched_offline_group(tg); 6959 sched_offline_group(tg);
7137} 6960}
7138 6961
7139static int cpu_cgroup_can_attach(struct cgroup *cgrp, 6962static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7140 struct cgroup_taskset *tset) 6963 struct cgroup_taskset *tset)
7141{ 6964{
7142 struct task_struct *task; 6965 struct task_struct *task;
7143 6966
7144 cgroup_taskset_for_each(task, cgrp, tset) { 6967 cgroup_taskset_for_each(task, css, tset) {
7145#ifdef CONFIG_RT_GROUP_SCHED 6968#ifdef CONFIG_RT_GROUP_SCHED
7146 if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 6969 if (!sched_rt_can_attach(css_tg(css), task))
7147 return -EINVAL; 6970 return -EINVAL;
7148#else 6971#else
7149 /* We don't support RT-tasks being in separate groups */ 6972 /* We don't support RT-tasks being in separate groups */
@@ -7154,18 +6977,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7154 return 0; 6977 return 0;
7155} 6978}
7156 6979
7157static void cpu_cgroup_attach(struct cgroup *cgrp, 6980static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7158 struct cgroup_taskset *tset) 6981 struct cgroup_taskset *tset)
7159{ 6982{
7160 struct task_struct *task; 6983 struct task_struct *task;
7161 6984
7162 cgroup_taskset_for_each(task, cgrp, tset) 6985 cgroup_taskset_for_each(task, css, tset)
7163 sched_move_task(task); 6986 sched_move_task(task);
7164} 6987}
7165 6988
7166static void 6989static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7167cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, 6990 struct cgroup_subsys_state *old_css,
7168 struct task_struct *task) 6991 struct task_struct *task)
7169{ 6992{
7170 /* 6993 /*
7171 * cgroup_exit() is called in the copy_process() failure path. 6994 * cgroup_exit() is called in the copy_process() failure path.
@@ -7179,15 +7002,16 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7179} 7002}
7180 7003
7181#ifdef CONFIG_FAIR_GROUP_SCHED 7004#ifdef CONFIG_FAIR_GROUP_SCHED
7182static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7005static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7183 u64 shareval) 7006 struct cftype *cftype, u64 shareval)
7184{ 7007{
7185 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 7008 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7186} 7009}
7187 7010
7188static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 7011static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7012 struct cftype *cft)
7189{ 7013{
7190 struct task_group *tg = cgroup_tg(cgrp); 7014 struct task_group *tg = css_tg(css);
7191 7015
7192 return (u64) scale_load_down(tg->shares); 7016 return (u64) scale_load_down(tg->shares);
7193} 7017}
@@ -7231,7 +7055,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7231 7055
7232 runtime_enabled = quota != RUNTIME_INF; 7056 runtime_enabled = quota != RUNTIME_INF;
7233 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7057 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7234 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7058 /*
7059 * If we need to toggle cfs_bandwidth_used, off->on must occur
7060 * before making related changes, and on->off must occur afterwards
7061 */
7062 if (runtime_enabled && !runtime_was_enabled)
7063 cfs_bandwidth_usage_inc();
7235 raw_spin_lock_irq(&cfs_b->lock); 7064 raw_spin_lock_irq(&cfs_b->lock);
7236 cfs_b->period = ns_to_ktime(period); 7065 cfs_b->period = ns_to_ktime(period);
7237 cfs_b->quota = quota; 7066 cfs_b->quota = quota;
@@ -7257,6 +7086,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7257 unthrottle_cfs_rq(cfs_rq); 7086 unthrottle_cfs_rq(cfs_rq);
7258 raw_spin_unlock_irq(&rq->lock); 7087 raw_spin_unlock_irq(&rq->lock);
7259 } 7088 }
7089 if (runtime_was_enabled && !runtime_enabled)
7090 cfs_bandwidth_usage_dec();
7260out_unlock: 7091out_unlock:
7261 mutex_unlock(&cfs_constraints_mutex); 7092 mutex_unlock(&cfs_constraints_mutex);
7262 7093
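
tg_set_cfs_bandwidth() now brackets the quota update with cfs_bandwidth_usage_inc()/_dec(): the "bandwidth in use" indicator is raised before any throttling state can exist (off->on) and dropped only after it is gone again (on->off), so code that sees the indicator clear may safely skip the throttling paths. A toy, single-threaded sketch of that ordering, with all names hypothetical and a plain counter standing in for the static key:

    #include <stdbool.h>
    #include <stdio.h>

    static int bandwidth_users;          /* stands in for the static key */
    static long long quota = -1;         /* -1 ~ RUNTIME_INF */

    static bool bandwidth_used(void) { return bandwidth_users > 0; }

    static void set_quota(long long new_quota)
    {
        bool was_enabled = (quota != -1);
        bool enabled = (new_quota != -1);

        /* off->on: raise the indicator before any limited state exists */
        if (enabled && !was_enabled)
            bandwidth_users++;

        quota = new_quota;               /* ...update throttling state here... */

        /* on->off: drop the indicator only after the state is gone again */
        if (was_enabled && !enabled)
            bandwidth_users--;
    }

    int main(void)
    {
        set_quota(100000);
        printf("used=%d\n", bandwidth_used());   /* 1 */
        set_quota(-1);
        printf("used=%d\n", bandwidth_used());   /* 0 */
        return 0;
    }
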
@@ -7309,26 +7140,28 @@ long tg_get_cfs_period(struct task_group *tg)
7309 return cfs_period_us; 7140 return cfs_period_us;
7310} 7141}
7311 7142
7312static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) 7143static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7144 struct cftype *cft)
7313{ 7145{
7314 return tg_get_cfs_quota(cgroup_tg(cgrp)); 7146 return tg_get_cfs_quota(css_tg(css));
7315} 7147}
7316 7148
7317static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, 7149static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7318 s64 cfs_quota_us) 7150 struct cftype *cftype, s64 cfs_quota_us)
7319{ 7151{
7320 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); 7152 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7321} 7153}
7322 7154
7323static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) 7155static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7156 struct cftype *cft)
7324{ 7157{
7325 return tg_get_cfs_period(cgroup_tg(cgrp)); 7158 return tg_get_cfs_period(css_tg(css));
7326} 7159}
7327 7160
7328static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7161static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7329 u64 cfs_period_us) 7162 struct cftype *cftype, u64 cfs_period_us)
7330{ 7163{
7331 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); 7164 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7332} 7165}
7333 7166
7334struct cfs_schedulable_data { 7167struct cfs_schedulable_data {
@@ -7409,10 +7242,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7409 return ret; 7242 return ret;
7410} 7243}
7411 7244
7412static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, 7245static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
7413 struct cgroup_map_cb *cb) 7246 struct cgroup_map_cb *cb)
7414{ 7247{
7415 struct task_group *tg = cgroup_tg(cgrp); 7248 struct task_group *tg = css_tg(css);
7416 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7249 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7417 7250
7418 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7251 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
@@ -7425,26 +7258,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7425#endif /* CONFIG_FAIR_GROUP_SCHED */ 7258#endif /* CONFIG_FAIR_GROUP_SCHED */
7426 7259
7427#ifdef CONFIG_RT_GROUP_SCHED 7260#ifdef CONFIG_RT_GROUP_SCHED
7428static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 7261static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7429 s64 val) 7262 struct cftype *cft, s64 val)
7430{ 7263{
7431 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 7264 return sched_group_set_rt_runtime(css_tg(css), val);
7432} 7265}
7433 7266
7434static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 7267static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7268 struct cftype *cft)
7435{ 7269{
7436 return sched_group_rt_runtime(cgroup_tg(cgrp)); 7270 return sched_group_rt_runtime(css_tg(css));
7437} 7271}
7438 7272
7439static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7273static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7440 u64 rt_period_us) 7274 struct cftype *cftype, u64 rt_period_us)
7441{ 7275{
7442 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 7276 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7443} 7277}
7444 7278
7445static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 7279static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7280 struct cftype *cft)
7446{ 7281{
7447 return sched_group_rt_period(cgroup_tg(cgrp)); 7282 return sched_group_rt_period(css_tg(css));
7448} 7283}
7449#endif /* CONFIG_RT_GROUP_SCHED */ 7284#endif /* CONFIG_RT_GROUP_SCHED */
7450 7285
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dbb7e2cd95eb..f64722ff0299 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -33,30 +33,20 @@ struct cpuacct {
33 struct kernel_cpustat __percpu *cpustat; 33 struct kernel_cpustat __percpu *cpustat;
34}; 34};
35 35
36/* return cpu accounting group corresponding to this container */ 36static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{ 37{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 38 return css ? container_of(css, struct cpuacct, css) : NULL;
40 struct cpuacct, css);
41} 39}
42 40
43/* return cpu accounting group to which this task belongs */ 41/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk) 42static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{ 43{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 44 return css_ca(task_css(tsk, cpuacct_subsys_id));
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53} 45}
54 46
55static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{ 48{
57 if (!ca->css.cgroup->parent) 49 return css_ca(css_parent(&ca->css));
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60} 50}
61 51
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
@@ -66,11 +56,12 @@ static struct cpuacct root_cpuacct = {
66}; 56};
67 57
68/* create a new cpu accounting group */ 58/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) 59static struct cgroup_subsys_state *
60cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
70{ 61{
71 struct cpuacct *ca; 62 struct cpuacct *ca;
72 63
73 if (!cgrp->parent) 64 if (!parent_css)
74 return &root_cpuacct.css; 65 return &root_cpuacct.css;
75 66
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 67 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
@@ -96,9 +87,9 @@ out:
96} 87}
97 88
98/* destroy an existing cpu accounting group */ 89/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp) 90static void cpuacct_css_free(struct cgroup_subsys_state *css)
100{ 91{
101 struct cpuacct *ca = cgroup_ca(cgrp); 92 struct cpuacct *ca = css_ca(css);
102 93
103 free_percpu(ca->cpustat); 94 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage); 95 free_percpu(ca->cpuusage);
@@ -141,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
141} 132}
142 133
143/* return total cpu usage (in nanoseconds) of a group */ 134/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 135static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
145{ 136{
146 struct cpuacct *ca = cgroup_ca(cgrp); 137 struct cpuacct *ca = css_ca(css);
147 u64 totalcpuusage = 0; 138 u64 totalcpuusage = 0;
148 int i; 139 int i;
149 140
@@ -153,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
153 return totalcpuusage; 144 return totalcpuusage;
154} 145}
155 146
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 147static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
157 u64 reset) 148 u64 reset)
158{ 149{
159 struct cpuacct *ca = cgroup_ca(cgrp); 150 struct cpuacct *ca = css_ca(css);
160 int err = 0; 151 int err = 0;
161 int i; 152 int i;
162 153
@@ -172,10 +163,10 @@ out:
172 return err; 163 return err;
173} 164}
174 165
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css,
176 struct seq_file *m) 167 struct cftype *cft, struct seq_file *m)
177{ 168{
178 struct cpuacct *ca = cgroup_ca(cgroup); 169 struct cpuacct *ca = css_ca(css);
179 u64 percpu; 170 u64 percpu;
180 int i; 171 int i;
181 172
@@ -192,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = {
192 [CPUACCT_STAT_SYSTEM] = "system", 183 [CPUACCT_STAT_SYSTEM] = "system",
193}; 184};
194 185
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 186static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 struct cgroup_map_cb *cb) 187 struct cftype *cft, struct cgroup_map_cb *cb)
197{ 188{
198 struct cpuacct *ca = cgroup_ca(cgrp); 189 struct cpuacct *ca = css_ca(css);
199 int cpu; 190 int cpu;
200 s64 val = 0; 191 s64 val = 0;
201 192
@@ -281,7 +272,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
281 while (ca != &root_cpuacct) { 272 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat); 273 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val; 274 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca); 275 ca = parent_ca(ca);
285 } 276 }
286 rcu_read_unlock(); 277 rcu_read_unlock();
287} 278}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 1095e878a46f..8b836b376d91 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -62,7 +62,7 @@ static int convert_prio(int prio)
62 * any discrepancies created by racing against the uncertainty of the current 62 * any discrepancies created by racing against the uncertainty of the current
63 * priority configuration. 63 * priority configuration.
64 * 64 *
65 * Returns: (int)bool - CPUs were found 65 * Return: (int)bool - CPUs were found
66 */ 66 */
67int cpupri_find(struct cpupri *cp, struct task_struct *p, 67int cpupri_find(struct cpupri *cp, struct task_struct *p,
68 struct cpumask *lowest_mask) 68 struct cpumask *lowest_mask)
@@ -203,7 +203,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
203 * cpupri_init - initialize the cpupri structure 203 * cpupri_init - initialize the cpupri structure
204 * @cp: The cpupri context 204 * @cp: The cpupri context
205 * 205 *
206 * Returns: -ENOMEM if memory fails. 206 * Return: -ENOMEM on memory allocation failure.
207 */ 207 */
208int cpupri_init(struct cpupri *cp) 208int cpupri_init(struct cpupri *cp)
209{ 209{
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a7959e05a9d5..99947919e30b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
121 * is the only cgroup, then nothing else should be necessary. 121 * is the only cgroup, then nothing else should be necessary.
122 * 122 *
123 */ 123 */
124 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 124 __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
125 125
126 cpuacct_account_field(p, index, tmp); 126 cpuacct_account_field(p, index, tmp);
127} 127}
@@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
378#ifdef CONFIG_VIRT_CPU_ACCOUNTING 378#ifdef CONFIG_VIRT_CPU_ACCOUNTING
379 379
380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH 380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
381void vtime_task_switch(struct task_struct *prev) 381void vtime_common_task_switch(struct task_struct *prev)
382{ 382{
383 if (!vtime_accounting_enabled())
384 return;
385
386 if (is_idle_task(prev)) 383 if (is_idle_task(prev))
387 vtime_account_idle(prev); 384 vtime_account_idle(prev);
388 else 385 else
@@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev)
404 * vtime_account(). 401 * vtime_account().
405 */ 402 */
406#ifndef __ARCH_HAS_VTIME_ACCOUNT 403#ifndef __ARCH_HAS_VTIME_ACCOUNT
407void vtime_account_irq_enter(struct task_struct *tsk) 404void vtime_common_account_irq_enter(struct task_struct *tsk)
408{ 405{
409 if (!vtime_accounting_enabled())
410 return;
411
412 if (!in_interrupt()) { 406 if (!in_interrupt()) {
413 /* 407 /*
414 * If we interrupted user, context_tracking_in_user() 408 * If we interrupted user, context_tracking_in_user()
@@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
428 } 422 }
429 vtime_account_system(tsk); 423 vtime_account_system(tsk);
430} 424}
431EXPORT_SYMBOL_GPL(vtime_account_irq_enter); 425EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
432#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 426#endif /* __ARCH_HAS_VTIME_ACCOUNT */
433#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ 427#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
434 428
@@ -557,16 +551,7 @@ static void cputime_adjust(struct task_cputime *curr,
557 struct cputime *prev, 551 struct cputime *prev,
558 cputime_t *ut, cputime_t *st) 552 cputime_t *ut, cputime_t *st)
559{ 553{
560 cputime_t rtime, stime, utime, total; 554 cputime_t rtime, stime, utime;
561
562 if (vtime_accounting_enabled()) {
563 *ut = curr->utime;
564 *st = curr->stime;
565 return;
566 }
567
568 stime = curr->stime;
569 total = stime + curr->utime;
570 555
571 /* 556 /*
572 * Tick based cputime accounting depend on random scheduling 557 * Tick based cputime accounting depend on random scheduling
@@ -588,13 +573,19 @@ static void cputime_adjust(struct task_cputime *curr,
588 if (prev->stime + prev->utime >= rtime) 573 if (prev->stime + prev->utime >= rtime)
589 goto out; 574 goto out;
590 575
591 if (total) { 576 stime = curr->stime;
577 utime = curr->utime;
578
579 if (utime == 0) {
580 stime = rtime;
581 } else if (stime == 0) {
582 utime = rtime;
583 } else {
584 cputime_t total = stime + utime;
585
592 stime = scale_stime((__force u64)stime, 586 stime = scale_stime((__force u64)stime,
593 (__force u64)rtime, (__force u64)total); 587 (__force u64)rtime, (__force u64)total);
594 utime = rtime - stime; 588 utime = rtime - stime;
595 } else {
596 stime = rtime;
597 utime = 0;
598 } 589 }
599 590
600 /* 591 /*
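
The reworked cputime_adjust() above splits the precise runtime rtime between user and system time in the ratio suggested by the tick-based samples, after handling the two degenerate cases where one sample is zero. Reduced to plain 64-bit arithmetic (the real scale_stime() exists to avoid the overflow this sketch ignores), the split looks like this:

    #include <stdio.h>
    #include <stdint.h>

    /* Divide the precise runtime 'rtime' between user and system time in the
     * ratio of the tick samples; assumes the products fit in 64 bits. */
    static void adjust(uint64_t rtime, uint64_t stime, uint64_t utime,
                       uint64_t *out_stime, uint64_t *out_utime)
    {
        if (utime == 0) {
            stime = rtime;                    /* all of it was system time */
        } else if (stime == 0) {
            utime = rtime;                    /* all of it was user time */
        } else {
            uint64_t total = stime + utime;

            stime = stime * rtime / total;    /* proportional share */
            utime = rtime - stime;
        }
        *out_stime = stime;
        *out_utime = utime;
    }

    int main(void)
    {
        uint64_t s, u;

        adjust(1000, 300, 100, &s, &u);       /* 3:1 tick ratio */
        printf("stime=%llu utime=%llu\n",     /* prints 750 and 250 */
               (unsigned long long)s, (unsigned long long)u);
        return 0;
    }
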
@@ -664,23 +655,17 @@ static void __vtime_account_system(struct task_struct *tsk)
664 655
665void vtime_account_system(struct task_struct *tsk) 656void vtime_account_system(struct task_struct *tsk)
666{ 657{
667 if (!vtime_accounting_enabled())
668 return;
669
670 write_seqlock(&tsk->vtime_seqlock); 658 write_seqlock(&tsk->vtime_seqlock);
671 __vtime_account_system(tsk); 659 __vtime_account_system(tsk);
672 write_sequnlock(&tsk->vtime_seqlock); 660 write_sequnlock(&tsk->vtime_seqlock);
673} 661}
674 662
675void vtime_account_irq_exit(struct task_struct *tsk) 663void vtime_gen_account_irq_exit(struct task_struct *tsk)
676{ 664{
677 if (!vtime_accounting_enabled())
678 return;
679
680 write_seqlock(&tsk->vtime_seqlock); 665 write_seqlock(&tsk->vtime_seqlock);
666 __vtime_account_system(tsk);
681 if (context_tracking_in_user()) 667 if (context_tracking_in_user())
682 tsk->vtime_snap_whence = VTIME_USER; 668 tsk->vtime_snap_whence = VTIME_USER;
683 __vtime_account_system(tsk);
684 write_sequnlock(&tsk->vtime_seqlock); 669 write_sequnlock(&tsk->vtime_seqlock);
685} 670}
686 671
@@ -688,12 +673,8 @@ void vtime_account_user(struct task_struct *tsk)
688{ 673{
689 cputime_t delta_cpu; 674 cputime_t delta_cpu;
690 675
691 if (!vtime_accounting_enabled())
692 return;
693
694 delta_cpu = get_vtime_delta(tsk);
695
696 write_seqlock(&tsk->vtime_seqlock); 676 write_seqlock(&tsk->vtime_seqlock);
677 delta_cpu = get_vtime_delta(tsk);
697 tsk->vtime_snap_whence = VTIME_SYS; 678 tsk->vtime_snap_whence = VTIME_SYS;
698 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); 679 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
699 write_sequnlock(&tsk->vtime_seqlock); 680 write_sequnlock(&tsk->vtime_seqlock);
@@ -701,22 +682,27 @@ void vtime_account_user(struct task_struct *tsk)
701 682
702void vtime_user_enter(struct task_struct *tsk) 683void vtime_user_enter(struct task_struct *tsk)
703{ 684{
704 if (!vtime_accounting_enabled())
705 return;
706
707 write_seqlock(&tsk->vtime_seqlock); 685 write_seqlock(&tsk->vtime_seqlock);
708 tsk->vtime_snap_whence = VTIME_USER;
709 __vtime_account_system(tsk); 686 __vtime_account_system(tsk);
687 tsk->vtime_snap_whence = VTIME_USER;
710 write_sequnlock(&tsk->vtime_seqlock); 688 write_sequnlock(&tsk->vtime_seqlock);
711} 689}
712 690
713void vtime_guest_enter(struct task_struct *tsk) 691void vtime_guest_enter(struct task_struct *tsk)
714{ 692{
693 /*
694 * The flags must be updated under the lock with
695 * the vtime_snap flush and update.
 696 * That enforces the right ordering and update-sequence
 697 * synchronization against the reader (task_gtime()),
 698 * which can thus safely catch up with a tickless delta.
699 */
715 write_seqlock(&tsk->vtime_seqlock); 700 write_seqlock(&tsk->vtime_seqlock);
716 __vtime_account_system(tsk); 701 __vtime_account_system(tsk);
717 current->flags |= PF_VCPU; 702 current->flags |= PF_VCPU;
718 write_sequnlock(&tsk->vtime_seqlock); 703 write_sequnlock(&tsk->vtime_seqlock);
719} 704}
705EXPORT_SYMBOL_GPL(vtime_guest_enter);
720 706
721void vtime_guest_exit(struct task_struct *tsk) 707void vtime_guest_exit(struct task_struct *tsk)
722{ 708{
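
The comment added to vtime_guest_enter() is about publishing the PF_VCPU flag and the vtime snapshot as one unit: the lockless reader (task_gtime()) must never see a new flag paired with a stale snapshot. The kernel uses the task's vtime seqlock for this; the sketch below makes the same point with an ordinary mutex on both sides, which is not how the kernel does it but shows why the two updates belong in one critical section:

    #include <pthread.h>
    #include <stdio.h>

    /* Sketch only: a mutex stands in for the vtime seqlock. The point is
     * that the flag and the snapshot are published together, so a reader
     * never sees a new flag with a stale snapshot or vice versa. */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long long vtime_snap;
    static int vcpu_flag;

    static void enter_guest(unsigned long long now)
    {
        pthread_mutex_lock(&lock);
        vtime_snap = now;        /* flush/update the snapshot... */
        vcpu_flag = 1;           /* ...and flip the flag in the same section */
        pthread_mutex_unlock(&lock);
    }

    static void read_state(unsigned long long *snap, int *flag)
    {
        pthread_mutex_lock(&lock);
        *snap = vtime_snap;
        *flag = vcpu_flag;
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        unsigned long long snap; int flag;

        enter_guest(12345);
        read_state(&snap, &flag);
        printf("snap=%llu vcpu=%d\n", snap, flag);
        return 0;
    }
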
@@ -725,6 +711,7 @@ void vtime_guest_exit(struct task_struct *tsk)
725 current->flags &= ~PF_VCPU; 711 current->flags &= ~PF_VCPU;
726 write_sequnlock(&tsk->vtime_seqlock); 712 write_sequnlock(&tsk->vtime_seqlock);
727} 713}
714EXPORT_SYMBOL_GPL(vtime_guest_exit);
728 715
729void vtime_account_idle(struct task_struct *tsk) 716void vtime_account_idle(struct task_struct *tsk)
730{ 717{
@@ -733,11 +720,6 @@ void vtime_account_idle(struct task_struct *tsk)
733 account_idle_time(delta_cpu); 720 account_idle_time(delta_cpu);
734} 721}
735 722
736bool vtime_accounting_enabled(void)
737{
738 return context_tracking_active();
739}
740
741void arch_vtime_task_switch(struct task_struct *prev) 723void arch_vtime_task_switch(struct task_struct *prev)
742{ 724{
743 write_seqlock(&prev->vtime_seqlock); 725 write_seqlock(&prev->vtime_seqlock);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index e076bddd4c66..5c34d1817e8f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,6 +15,7 @@
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18#include <linux/mempolicy.h>
18 19
19#include "sched.h" 20#include "sched.h"
20 21
@@ -124,7 +125,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
124 SEQ_printf(m, " "); 125 SEQ_printf(m, " ");
125 126
126 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", 127 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
127 p->comm, p->pid, 128 p->comm, task_pid_nr(p),
128 SPLIT_NS(p->se.vruntime), 129 SPLIT_NS(p->se.vruntime),
129 (long long)(p->nvcsw + p->nivcsw), 130 (long long)(p->nvcsw + p->nivcsw),
130 p->prio); 131 p->prio);
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
137 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 138 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
138 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
139#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
143#endif
140#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
141 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
142#endif 146#endif
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 read_lock_irqsave(&tasklist_lock, flags); 163 read_lock_irqsave(&tasklist_lock, flags);
160 164
161 do_each_thread(g, p) { 165 do_each_thread(g, p) {
162 if (!p->on_rq || task_cpu(p) != rq_cpu) 166 if (task_cpu(p) != rq_cpu)
163 continue; 167 continue;
164 168
165 print_task(m, rq, p); 169 print_task(m, rq, p);
@@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
225 atomic_read(&cfs_rq->tg->runnable_avg)); 229 atomic_read(&cfs_rq->tg->runnable_avg));
226#endif 230#endif
227#endif 231#endif
232#ifdef CONFIG_CFS_BANDWIDTH
233 SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
234 cfs_rq->tg->cfs_bandwidth.timer_active);
235 SEQ_printf(m, " .%-30s: %d\n", "throttled",
236 cfs_rq->throttled);
237 SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
238 cfs_rq->throttle_count);
239#endif
228 240
229#ifdef CONFIG_FAIR_GROUP_SCHED 241#ifdef CONFIG_FAIR_GROUP_SCHED
230 print_cfs_group_stats(m, cpu, cfs_rq->tg); 242 print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -289,7 +301,7 @@ do { \
289 P(nr_load_updates); 301 P(nr_load_updates);
290 P(nr_uninterruptible); 302 P(nr_uninterruptible);
291 PN(next_balance); 303 PN(next_balance);
292 P(curr->pid); 304 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
293 PN(clock); 305 PN(clock);
294 P(cpu_load[0]); 306 P(cpu_load[0]);
295 P(cpu_load[1]); 307 P(cpu_load[1]);
@@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m)
345 cpu_clk = local_clock(); 357 cpu_clk = local_clock();
346 local_irq_restore(flags); 358 local_irq_restore(flags);
347 359
348 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", 360 SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
349 init_utsname()->release, 361 init_utsname()->release,
350 (int)strcspn(init_utsname()->version, " "), 362 (int)strcspn(init_utsname()->version, " "),
351 init_utsname()->version); 363 init_utsname()->version);
@@ -488,11 +500,61 @@ static int __init init_sched_debug_procfs(void)
488 500
489__initcall(init_sched_debug_procfs); 501__initcall(init_sched_debug_procfs);
490 502
503#define __P(F) \
504 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
505#define P(F) \
506 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
507#define __PN(F) \
508 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
509#define PN(F) \
510 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
511
512
513static void sched_show_numa(struct task_struct *p, struct seq_file *m)
514{
515#ifdef CONFIG_NUMA_BALANCING
516 struct mempolicy *pol;
517 int node, i;
518
519 if (p->mm)
520 P(mm->numa_scan_seq);
521
522 task_lock(p);
523 pol = p->mempolicy;
524 if (pol && !(pol->flags & MPOL_F_MORON))
525 pol = NULL;
526 mpol_get(pol);
527 task_unlock(p);
528
529 SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
530
531 for_each_online_node(node) {
532 for (i = 0; i < 2; i++) {
533 unsigned long nr_faults = -1;
534 int cpu_current, home_node;
535
536 if (p->numa_faults)
537 nr_faults = p->numa_faults[2*node + i];
538
539 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes));
541
542 home_node = (p->numa_preferred_nid == node);
543
544 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
545 i, node, cpu_current, home_node, nr_faults);
546 }
547 }
548
549 mpol_put(pol);
550#endif
551}
552
491void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 553void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
492{ 554{
493 unsigned long nr_switches; 555 unsigned long nr_switches;
494 556
495 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, 557 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
496 get_nr_threads(p)); 558 get_nr_threads(p));
497 SEQ_printf(m, 559 SEQ_printf(m,
498 "---------------------------------------------------------" 560 "---------------------------------------------------------"
@@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
591 SEQ_printf(m, "%-45s:%21Ld\n", 653 SEQ_printf(m, "%-45s:%21Ld\n",
592 "clock-delta", (long long)(t1-t0)); 654 "clock-delta", (long long)(t1-t0));
593 } 655 }
656
657 sched_show_numa(p, m);
594} 658}
595 659
596void proc_sched_set_task(struct task_struct *p) 660void proc_sched_set_task(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9565645e3202..df77c605c7a6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
681} 681}
682 682
683#ifdef CONFIG_SMP 683#ifdef CONFIG_SMP
684static unsigned long task_h_load(struct task_struct *p);
685
684static inline void __update_task_entity_contrib(struct sched_entity *se); 686static inline void __update_task_entity_contrib(struct sched_entity *se);
685 687
686/* Give new task start runnable values to heavy its load in infant time */ 688/* Give new task start runnable values to heavy its load in infant time */
@@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
818 820
819#ifdef CONFIG_NUMA_BALANCING 821#ifdef CONFIG_NUMA_BALANCING
820/* 822/*
821 * numa task sample period in ms 823 * Approximate time to scan a full NUMA task in ms. The task scan period is
 824 * calculated based on the task's virtual memory size and
825 * numa_balancing_scan_size.
822 */ 826 */
823unsigned int sysctl_numa_balancing_scan_period_min = 100; 827unsigned int sysctl_numa_balancing_scan_period_min = 1000;
824unsigned int sysctl_numa_balancing_scan_period_max = 100*50; 828unsigned int sysctl_numa_balancing_scan_period_max = 60000;
825unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
826 829
827/* Portion of address space to scan in MB */ 830/* Portion of address space to scan in MB */
828unsigned int sysctl_numa_balancing_scan_size = 256; 831unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,41 +833,810 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
830/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 833/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
831unsigned int sysctl_numa_balancing_scan_delay = 1000; 834unsigned int sysctl_numa_balancing_scan_delay = 1000;
832 835
833static void task_numa_placement(struct task_struct *p) 836/*
837 * After skipping a page migration on a shared page, skip N more numa page
838 * migrations unconditionally. This reduces the number of NUMA migrations
839 * in shared memory workloads, and has the effect of pulling tasks towards
840 * where their memory lives, over pulling the memory towards the task.
841 */
842unsigned int sysctl_numa_balancing_migrate_deferred = 16;
843
844static unsigned int task_nr_scan_windows(struct task_struct *p)
845{
846 unsigned long rss = 0;
847 unsigned long nr_scan_pages;
848
849 /*
 850 * Calculations are based on RSS, as non-present and empty pages are
 851 * skipped by the PTE scanner and NUMA hinting faults should be trapped
 852 * based on resident pages.
853 */
854 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
855 rss = get_mm_rss(p->mm);
856 if (!rss)
857 rss = nr_scan_pages;
858
859 rss = round_up(rss, nr_scan_pages);
860 return rss / nr_scan_pages;
861}
862
 863/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
864#define MAX_SCAN_WINDOW 2560
865
866static unsigned int task_scan_min(struct task_struct *p)
867{
868 unsigned int scan, floor;
869 unsigned int windows = 1;
870
871 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
872 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
873 floor = 1000 / windows;
874
875 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
876 return max_t(unsigned int, floor, scan);
877}
878
879static unsigned int task_scan_max(struct task_struct *p)
880{
881 unsigned int smin = task_scan_min(p);
882 unsigned int smax;
883
884 /* Watch for min being lower than max due to floor calculations */
885 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
886 return max(smin, smax);
887}
888
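
task_nr_scan_windows(), task_scan_min() and task_scan_max() above derive the scan period from the task's RSS: the address space is split into windows of sysctl_numa_balancing_scan_size MB and the configured period is spread across them, with a floor so no task scans more than MAX_SCAN_WINDOW MB per second. The arithmetic, re-done as a stand-alone program with the patch's default tunables and an assumed 4KB page size:

    #include <stdio.h>

    /* Defaults from the patch: 256MB windows, 1000ms min period, and a
     * MAX_SCAN_WINDOW of 2560MB/sec. Page size assumed to be 4KB. */
    #define PAGE_SHIFT          12
    #define SCAN_SIZE_MB        256u
    #define PERIOD_MIN_MS       1000u
    #define MAX_SCAN_WINDOW_MB  2560u

    static unsigned int nr_scan_windows(unsigned long rss_pages)
    {
        unsigned long nr_scan_pages = (unsigned long)SCAN_SIZE_MB << (20 - PAGE_SHIFT);

        if (!rss_pages)
            rss_pages = nr_scan_pages;
        /* round up to a whole number of windows */
        return (rss_pages + nr_scan_pages - 1) / nr_scan_pages;
    }

    static unsigned int scan_period_min(unsigned long rss_pages)
    {
        unsigned int windows = MAX_SCAN_WINDOW_MB / SCAN_SIZE_MB;   /* 10 */
        unsigned int floor_ms = 1000 / windows;                     /* 100ms */
        unsigned int scan = PERIOD_MIN_MS / nr_scan_windows(rss_pages);

        return scan > floor_ms ? scan : floor_ms;
    }

    int main(void)
    {
        /* A 1GB RSS task: four 256MB windows, so a 250ms minimum period */
        unsigned long rss = (1024ul << 20) >> PAGE_SHIFT;

        printf("windows=%u min_period=%ums\n",
               nr_scan_windows(rss), scan_period_min(rss));
        return 0;
    }
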
889/*
890 * Once a preferred node is selected the scheduler balancer will prefer moving
891 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
892 * scans. This will give the process the chance to accumulate more faults on
893 * the preferred node but still allow the scheduler to move the task again if
 894 * the node's CPUs are overloaded.
895 */
896unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
897
898static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
899{
900 rq->nr_numa_running += (p->numa_preferred_nid != -1);
901 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
902}
903
904static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
905{
906 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
907 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
908}
909
910struct numa_group {
911 atomic_t refcount;
912
913 spinlock_t lock; /* nr_tasks, tasks */
914 int nr_tasks;
915 pid_t gid;
916 struct list_head task_list;
917
918 struct rcu_head rcu;
919 unsigned long total_faults;
920 unsigned long faults[0];
921};
922
923pid_t task_numa_group_id(struct task_struct *p)
924{
925 return p->numa_group ? p->numa_group->gid : 0;
926}
927
928static inline int task_faults_idx(int nid, int priv)
929{
930 return 2 * nid + priv;
931}
932
933static inline unsigned long task_faults(struct task_struct *p, int nid)
934{
935 if (!p->numa_faults)
936 return 0;
937
938 return p->numa_faults[task_faults_idx(nid, 0)] +
939 p->numa_faults[task_faults_idx(nid, 1)];
940}
941
942static inline unsigned long group_faults(struct task_struct *p, int nid)
943{
944 if (!p->numa_group)
945 return 0;
946
947 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
948}
949
950/*
951 * These return the fraction of accesses done by a particular task, or
952 * task group, on a particular numa node. The group weight is given a
953 * larger multiplier, in order to group tasks together that are almost
954 * evenly spread out between numa nodes.
955 */
956static inline unsigned long task_weight(struct task_struct *p, int nid)
957{
958 unsigned long total_faults;
959
960 if (!p->numa_faults)
961 return 0;
962
963 total_faults = p->total_numa_faults;
964
965 if (!total_faults)
966 return 0;
967
968 return 1000 * task_faults(p, nid) / total_faults;
969}
970
971static inline unsigned long group_weight(struct task_struct *p, int nid)
972{
973 if (!p->numa_group || !p->numa_group->total_faults)
974 return 0;
975
976 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
977}
978
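
task_weight() and group_weight() normalise the per-node fault counts into per-mille shares of the task's (or the numa_group's) total faults, so nodes can be compared on a fixed 0-1000 scale regardless of how many faults have accumulated. A small worked example of that normalisation, with the private/shared split already summed per node:

    #include <stdio.h>

    /* Per-node fault counts become a share out of 1000 of the total. */
    static unsigned long node_weight(const unsigned long *faults, int nr_nodes, int nid)
    {
        unsigned long total = 0;

        for (int n = 0; n < nr_nodes; n++)
            total += faults[n];
        if (!total)
            return 0;
        return 1000 * faults[nid] / total;
    }

    int main(void)
    {
        unsigned long faults[2] = { 30, 90 };   /* faults on node 0 and node 1 */

        printf("node0=%lu node1=%lu\n",         /* prints 250 and 750 */
               node_weight(faults, 2, 0), node_weight(faults, 2, 1));
        return 0;
    }
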
979static unsigned long weighted_cpuload(const int cpu);
980static unsigned long source_load(int cpu, int type);
981static unsigned long target_load(int cpu, int type);
982static unsigned long power_of(int cpu);
983static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
984
985/* Cached statistics for all CPUs within a node */
986struct numa_stats {
987 unsigned long nr_running;
988 unsigned long load;
989
990 /* Total compute capacity of CPUs on a node */
991 unsigned long power;
992
993 /* Approximate capacity in terms of runnable tasks on a node */
994 unsigned long capacity;
995 int has_capacity;
996};
997
998/*
999 * XXX borrowed from update_sg_lb_stats
1000 */
1001static void update_numa_stats(struct numa_stats *ns, int nid)
1002{
1003 int cpu;
1004
1005 memset(ns, 0, sizeof(*ns));
1006 for_each_cpu(cpu, cpumask_of_node(nid)) {
1007 struct rq *rq = cpu_rq(cpu);
1008
1009 ns->nr_running += rq->nr_running;
1010 ns->load += weighted_cpuload(cpu);
1011 ns->power += power_of(cpu);
1012 }
1013
1014 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
1015 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
1016 ns->has_capacity = (ns->nr_running < ns->capacity);
1017}
1018
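
update_numa_stats() sums the runnable load and the compute power of every CPU on a node, then scales the load relative to the node's power and rounds the power to a whole number of "average" CPUs, so has_capacity is simply nr_running < capacity. A stand-alone version of that normalisation, assuming SCHED_POWER_SCALE is 1024 as in the scheduler of this era and feeding the per-CPU figures in directly:

    #include <stdio.h>

    #define SCHED_POWER_SCALE 1024ul   /* one "average" CPU's worth of power */

    struct numa_stats {
        unsigned long nr_running, load, power, capacity;
        int has_capacity;
    };

    static void node_stats(struct numa_stats *ns, const unsigned long *cpu_load,
                           const unsigned long *cpu_power,
                           const unsigned long *cpu_running, int nr_cpus)
    {
        ns->nr_running = ns->load = ns->power = 0;
        for (int i = 0; i < nr_cpus; i++) {
            ns->nr_running += cpu_running[i];
            ns->load += cpu_load[i];
            ns->power += cpu_power[i];
        }
        ns->load = ns->load * SCHED_POWER_SCALE / ns->power;
        /* round to the nearest whole "average" CPU, as DIV_ROUND_CLOSEST does */
        ns->capacity = (ns->power + SCHED_POWER_SCALE / 2) / SCHED_POWER_SCALE;
        ns->has_capacity = ns->nr_running < ns->capacity;
    }

    int main(void)
    {
        unsigned long load[2] = { 512, 256 }, power[2] = { 1024, 1024 };
        unsigned long running[2] = { 1, 0 };
        struct numa_stats ns;

        node_stats(&ns, load, power, running, 2);
        printf("load=%lu capacity=%lu has_capacity=%d\n",   /* 384 2 1 */
               ns.load, ns.capacity, ns.has_capacity);
        return 0;
    }
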
1019struct task_numa_env {
1020 struct task_struct *p;
1021
1022 int src_cpu, src_nid;
1023 int dst_cpu, dst_nid;
1024
1025 struct numa_stats src_stats, dst_stats;
1026
1027 int imbalance_pct, idx;
1028
1029 struct task_struct *best_task;
1030 long best_imp;
1031 int best_cpu;
1032};
1033
1034static void task_numa_assign(struct task_numa_env *env,
1035 struct task_struct *p, long imp)
1036{
1037 if (env->best_task)
1038 put_task_struct(env->best_task);
1039 if (p)
1040 get_task_struct(p);
1041
1042 env->best_task = p;
1043 env->best_imp = imp;
1044 env->best_cpu = env->dst_cpu;
1045}
1046
1047/*
1048 * This checks if the overall compute and NUMA accesses of the system would
 1049 * be improved if the source task were migrated to the target dst_cpu, taking
 1050 * into account that it might be best if the task running on the dst_cpu
 1051 * should be exchanged with the source task.
1052 */
1053static void task_numa_compare(struct task_numa_env *env,
1054 long taskimp, long groupimp)
1055{
1056 struct rq *src_rq = cpu_rq(env->src_cpu);
1057 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1058 struct task_struct *cur;
1059 long dst_load, src_load;
1060 long load;
1061 long imp = (groupimp > 0) ? groupimp : taskimp;
1062
1063 rcu_read_lock();
1064 cur = ACCESS_ONCE(dst_rq->curr);
1065 if (cur->pid == 0) /* idle */
1066 cur = NULL;
1067
1068 /*
1069 * "imp" is the fault differential for the source task between the
1070 * source and destination node. Calculate the total differential for
1071 * the source task and potential destination task. The more negative
1072 * the value is, the more remote accesses would be expected to
1073 * be incurred if the tasks were swapped.
1074 */
1075 if (cur) {
1076 /* Skip this swap candidate if cannot move to the source cpu */
1077 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1078 goto unlock;
1079
1080 /*
1081 * If dst and source tasks are in the same NUMA group, or not
1082 * in any group then look only at task weights.
1083 */
1084 if (cur->numa_group == env->p->numa_group) {
1085 imp = taskimp + task_weight(cur, env->src_nid) -
1086 task_weight(cur, env->dst_nid);
1087 /*
1088 * Add some hysteresis to prevent swapping the
1089 * tasks within a group over tiny differences.
1090 */
1091 if (cur->numa_group)
1092 imp -= imp/16;
1093 } else {
1094 /*
1095 * Compare the group weights. If a task is all by
1096 * itself (not part of a group), use the task weight
1097 * instead.
1098 */
1099 if (env->p->numa_group)
1100 imp = groupimp;
1101 else
1102 imp = taskimp;
1103
1104 if (cur->numa_group)
1105 imp += group_weight(cur, env->src_nid) -
1106 group_weight(cur, env->dst_nid);
1107 else
1108 imp += task_weight(cur, env->src_nid) -
1109 task_weight(cur, env->dst_nid);
1110 }
1111 }
1112
1113 if (imp < env->best_imp)
1114 goto unlock;
1115
1116 if (!cur) {
1117 /* Is there capacity at our destination? */
1118 if (env->src_stats.has_capacity &&
1119 !env->dst_stats.has_capacity)
1120 goto unlock;
1121
1122 goto balance;
1123 }
1124
1125 /* Balance doesn't matter much if we're running a task per cpu */
1126 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1127 goto assign;
1128
1129 /*
1130 * In the overloaded case, try and keep the load balanced.
1131 */
1132balance:
1133 dst_load = env->dst_stats.load;
1134 src_load = env->src_stats.load;
1135
1136 /* XXX missing power terms */
1137 load = task_h_load(env->p);
1138 dst_load += load;
1139 src_load -= load;
1140
1141 if (cur) {
1142 load = task_h_load(cur);
1143 dst_load -= load;
1144 src_load += load;
1145 }
1146
1147 /* make src_load the smaller */
1148 if (dst_load < src_load)
1149 swap(dst_load, src_load);
1150
1151 if (src_load * env->imbalance_pct < dst_load * 100)
1152 goto unlock;
1153
1154assign:
1155 task_numa_assign(env, cur, imp);
1156unlock:
1157 rcu_read_unlock();
1158}
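The final test above only accepts a move or swap if the larger side stays within imbalance_pct percent of the smaller one; a standalone sketch with made-up loads and the 112 value used below:

#include <stdbool.h>
#include <stdio.h>

/* Make src_load the smaller, then require the bigger side to stay
 * within imbalance_pct percent of it. Loads are invented. */
static bool within_imbalance(long dst_load, long src_load, int imbalance_pct)
{
	if (dst_load < src_load) {
		long tmp = dst_load;
		dst_load = src_load;
		src_load = tmp;
	}
	return src_load * imbalance_pct >= dst_load * 100;
}

int main(void)
{
	printf("%d\n", within_imbalance(900, 800, 112));	/* 0: too skewed */
	printf("%d\n", within_imbalance(880, 800, 112));	/* 1: acceptable */
	return 0;
}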
1159
1160static void task_numa_find_cpu(struct task_numa_env *env,
1161 long taskimp, long groupimp)
1162{
1163 int cpu;
1164
1165 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1166 /* Skip this CPU if the source task cannot migrate */
1167 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1168 continue;
1169
1170 env->dst_cpu = cpu;
1171 task_numa_compare(env, taskimp, groupimp);
1172 }
1173}
1174
1175static int task_numa_migrate(struct task_struct *p)
1176{
1177 struct task_numa_env env = {
1178 .p = p,
1179
1180 .src_cpu = task_cpu(p),
1181 .src_nid = task_node(p),
1182
1183 .imbalance_pct = 112,
1184
1185 .best_task = NULL,
1186 .best_imp = 0,
1187 .best_cpu = -1
1188 };
1189 struct sched_domain *sd;
1190 unsigned long taskweight, groupweight;
1191 int nid, ret;
1192 long taskimp, groupimp;
1193
1194 /*
1195 * Pick the lowest SD_NUMA domain, as that would have the smallest
1196 * imbalance and would be the first to start moving tasks about.
1197 *
1198 * And we want to avoid any moving of tasks about, as that would create
1199 * random movement of tasks, counter to the NUMA conditions we're trying
1200 * to satisfy here.
1201 */
1202 rcu_read_lock();
1203 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1204 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1205 rcu_read_unlock();
1206
1207 taskweight = task_weight(p, env.src_nid);
1208 groupweight = group_weight(p, env.src_nid);
1209 update_numa_stats(&env.src_stats, env.src_nid);
1210 env.dst_nid = p->numa_preferred_nid;
1211 taskimp = task_weight(p, env.dst_nid) - taskweight;
1212 groupimp = group_weight(p, env.dst_nid) - groupweight;
1213 update_numa_stats(&env.dst_stats, env.dst_nid);
1214
1215 /* If the preferred nid has capacity, try to use it. */
1216 if (env.dst_stats.has_capacity)
1217 task_numa_find_cpu(&env, taskimp, groupimp);
1218
1219 /* No space available on the preferred nid. Look elsewhere. */
1220 if (env.best_cpu == -1) {
1221 for_each_online_node(nid) {
1222 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1223 continue;
1224
1225 /* Only consider nodes where both task and groups benefit */
1226 taskimp = task_weight(p, nid) - taskweight;
1227 groupimp = group_weight(p, nid) - groupweight;
1228 if (taskimp < 0 && groupimp < 0)
1229 continue;
1230
1231 env.dst_nid = nid;
1232 update_numa_stats(&env.dst_stats, env.dst_nid);
1233 task_numa_find_cpu(&env, taskimp, groupimp);
1234 }
1235 }
1236
1237 /* No better CPU than the current one was found. */
1238 if (env.best_cpu == -1)
1239 return -EAGAIN;
1240
1241 sched_setnuma(p, env.dst_nid);
1242
1243 /*
1244 * Reset the scan period if the task is being rescheduled on an
1245 * alternative node to recheck if the task is now properly placed.
1246 */
1247 p->numa_scan_period = task_scan_min(p);
1248
1249 if (env.best_task == NULL) {
1250 int ret = migrate_task_to(p, env.best_cpu);
1251 return ret;
1252 }
1253
1254 ret = migrate_swap(p, env.best_task);
1255 put_task_struct(env.best_task);
1256 return ret;
1257}
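For reference, assuming the common sched-domain default imbalance_pct of 125, the derived NUMA value works out to 112, so NUMA-driven placement tolerates roughly half the slack of the regular balancer; a trivial arithmetic sketch:

#include <stdio.h>

int main(void)
{
	int sd_imbalance_pct = 125;	/* assumed domain default */
	int numa_pct = 100 + (sd_imbalance_pct - 100) / 2;

	printf("numa imbalance_pct = %d\n", numa_pct);	/* 112 */
	return 0;
}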
1258
1259/* Attempt to migrate a task to a CPU on the preferred node. */
1260static void numa_migrate_preferred(struct task_struct *p)
1261{
1262 /* This task has no NUMA fault statistics yet */
1263 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1264 return;
1265
1266 /* Periodically retry migrating the task to the preferred node */
1267 p->numa_migrate_retry = jiffies + HZ;
1268
1269 /* Success if task is already running on preferred CPU */
1270 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
1271 return;
1272
1273 /* Otherwise, try migrate to a CPU on the preferred node */
1274 task_numa_migrate(p);
1275}
1276
1277/*
1278 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1279 * increments. The more local the fault statistics are, the higher the scan
1280 * period will be for the next scan window. If the local/remote ratio is below
1281 * NUMA_PERIOD_THRESHOLD (where the range of the ratio is 1..NUMA_PERIOD_SLOTS)
1282 * the scan period will decrease.
1283 */
1284#define NUMA_PERIOD_SLOTS 10
1285#define NUMA_PERIOD_THRESHOLD 3
1286
1287/*
1288 * Increase the scan period (slow down scanning) if the majority of
1289 * our memory is already on our local node, or if the majority of
1290 * the page accesses are shared with other processes.
1291 * Otherwise, decrease the scan period.
1292 */
1293static void update_task_scan_period(struct task_struct *p,
1294 unsigned long shared, unsigned long private)
834{ 1295{
835 int seq; 1296 unsigned int period_slot;
1297 int ratio;
1298 int diff;
1299
1300 unsigned long remote = p->numa_faults_locality[0];
1301 unsigned long local = p->numa_faults_locality[1];
1302
1303 /*
1304 * If no NUMA hinting faults were recorded then either the task is
1305 * completely idle or all activity is in areas that are not of interest
1306 * to automatic NUMA balancing. Scan slower.
1307 */
1308 if (local + shared == 0) {
1309 p->numa_scan_period = min(p->numa_scan_period_max,
1310 p->numa_scan_period << 1);
1311
1312 p->mm->numa_next_scan = jiffies +
1313 msecs_to_jiffies(p->numa_scan_period);
836 1314
837 if (!p->mm) /* for example, ksmd faulting in a user's mm */
838 return; 1315 return;
1316 }
1317
1318 /*
1319 * Prepare to scale scan period relative to the current period.
1320 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1321 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower),
1322 * by at least one slot when the ratio equals the threshold
1323 */
1324 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1325 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1326 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1327 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1328 if (!slot)
1329 slot = 1;
1330 diff = slot * period_slot;
1331 } else {
1332 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1333
1334 /*
1335 * Scale scan rate increases based on sharing. There is an
1336 * inverse relationship between the degree of sharing and
1337 * the adjustment made to the scanning period. Broadly
1338 * speaking, the intent is that there is little point
1339 * scanning faster if shared accesses dominate, as it may
1340 * simply bounce migrations uselessly.
1341 */
1342 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1343 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1344 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1345 }
1346
1347 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1348 task_scan_min(p), task_scan_max(p));
1349 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1350}
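A standalone sketch of the slot arithmetic with invented fault counts; it mirrors the effective computation above without the kernel types:

#include <stdio.h>

#define NUMA_PERIOD_SLOTS	10
#define NUMA_PERIOD_THRESHOLD	3
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static int period_diff(int period, long local, long remote,
		       long priv, long shared)
{
	int period_slot = DIV_ROUND_UP(period, NUMA_PERIOD_SLOTS);
	int ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	int diff;

	if (ratio >= NUMA_PERIOD_THRESHOLD) {
		int slot = ratio - NUMA_PERIOD_THRESHOLD;
		diff = (slot ? slot : 1) * period_slot;
	} else {
		/* scale the decrease by the private:shared ratio */
		int pratio = DIV_ROUND_UP(priv * NUMA_PERIOD_SLOTS,
					  priv + shared);

		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
		diff = (diff * pratio) / NUMA_PERIOD_SLOTS;
	}
	return diff;
}

int main(void)
{
	printf("%d\n", period_diff(1000, 8, 2, 5, 5));	/* +500: scan slower */
	printf("%d\n", period_diff(1000, 1, 9, 5, 5));	/* -100: scan faster */
	return 0;
}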
1351
1352static void task_numa_placement(struct task_struct *p)
1353{
1354 int seq, nid, max_nid = -1, max_group_nid = -1;
1355 unsigned long max_faults = 0, max_group_faults = 0;
1356 unsigned long fault_types[2] = { 0, 0 };
1357 spinlock_t *group_lock = NULL;
1358
839 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1359 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
840 if (p->numa_scan_seq == seq) 1360 if (p->numa_scan_seq == seq)
841 return; 1361 return;
842 p->numa_scan_seq = seq; 1362 p->numa_scan_seq = seq;
1363 p->numa_scan_period_max = task_scan_max(p);
1364
1365 /* If the task is part of a group prevent parallel updates to group stats */
1366 if (p->numa_group) {
1367 group_lock = &p->numa_group->lock;
1368 spin_lock(group_lock);
1369 }
1370
1371 /* Find the node with the highest number of faults */
1372 for_each_online_node(nid) {
1373 unsigned long faults = 0, group_faults = 0;
1374 int priv, i;
1375
1376 for (priv = 0; priv < 2; priv++) {
1377 long diff;
1378
1379 i = task_faults_idx(nid, priv);
1380 diff = -p->numa_faults[i];
1381
1382 /* Decay existing window, copy faults since last scan */
1383 p->numa_faults[i] >>= 1;
1384 p->numa_faults[i] += p->numa_faults_buffer[i];
1385 fault_types[priv] += p->numa_faults_buffer[i];
1386 p->numa_faults_buffer[i] = 0;
1387
1388 faults += p->numa_faults[i];
1389 diff += p->numa_faults[i];
1390 p->total_numa_faults += diff;
1391 if (p->numa_group) {
1392 /* safe because we can only change our own group */
1393 p->numa_group->faults[i] += diff;
1394 p->numa_group->total_faults += diff;
1395 group_faults += p->numa_group->faults[i];
1396 }
1397 }
1398
1399 if (faults > max_faults) {
1400 max_faults = faults;
1401 max_nid = nid;
1402 }
1403
1404 if (group_faults > max_group_faults) {
1405 max_group_faults = group_faults;
1406 max_group_nid = nid;
1407 }
1408 }
1409
1410 update_task_scan_period(p, fault_types[0], fault_types[1]);
1411
1412 if (p->numa_group) {
1413 /*
1414 * If the preferred task and group nids are different,
1415 * iterate over the nodes again to find the best place.
1416 */
1417 if (max_nid != max_group_nid) {
1418 unsigned long weight, max_weight = 0;
1419
1420 for_each_online_node(nid) {
1421 weight = task_weight(p, nid) + group_weight(p, nid);
1422 if (weight > max_weight) {
1423 max_weight = weight;
1424 max_nid = nid;
1425 }
1426 }
1427 }
1428
1429 spin_unlock(group_lock);
1430 }
1431
1432 /* Preferred node as the node with the most faults */
1433 if (max_faults && max_nid != p->numa_preferred_nid) {
1434 /* Update the preferred nid and migrate task if possible */
1435 sched_setnuma(p, max_nid);
1436 numa_migrate_preferred(p);
1437 }
1438}
1439
1440static inline int get_numa_group(struct numa_group *grp)
1441{
1442 return atomic_inc_not_zero(&grp->refcount);
1443}
1444
1445static inline void put_numa_group(struct numa_group *grp)
1446{
1447 if (atomic_dec_and_test(&grp->refcount))
1448 kfree_rcu(grp, rcu);
1449}
1450
1451static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1452 int *priv)
1453{
1454 struct numa_group *grp, *my_grp;
1455 struct task_struct *tsk;
1456 bool join = false;
1457 int cpu = cpupid_to_cpu(cpupid);
1458 int i;
1459
1460 if (unlikely(!p->numa_group)) {
1461 unsigned int size = sizeof(struct numa_group) +
1462 2*nr_node_ids*sizeof(unsigned long);
1463
1464 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1465 if (!grp)
1466 return;
1467
1468 atomic_set(&grp->refcount, 1);
1469 spin_lock_init(&grp->lock);
1470 INIT_LIST_HEAD(&grp->task_list);
1471 grp->gid = p->pid;
1472
1473 for (i = 0; i < 2*nr_node_ids; i++)
1474 grp->faults[i] = p->numa_faults[i];
1475
1476 grp->total_faults = p->total_numa_faults;
1477
1478 list_add(&p->numa_entry, &grp->task_list);
1479 grp->nr_tasks++;
1480 rcu_assign_pointer(p->numa_group, grp);
1481 }
1482
1483 rcu_read_lock();
1484 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
843 1485
844 /* FIXME: Scheduling placement policy hints go here */ 1486 if (!cpupid_match_pid(tsk, cpupid))
1487 goto no_join;
1488
1489 grp = rcu_dereference(tsk->numa_group);
1490 if (!grp)
1491 goto no_join;
1492
1493 my_grp = p->numa_group;
1494 if (grp == my_grp)
1495 goto no_join;
1496
1497 /*
1498 * Only join the other group if it's bigger; if we're the bigger group,
1499 * the other task will join us.
1500 */
1501 if (my_grp->nr_tasks > grp->nr_tasks)
1502 goto no_join;
1503
1504 /*
1505 * Tie-break on the grp address.
1506 */
1507 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1508 goto no_join;
1509
1510 /* Always join threads in the same process. */
1511 if (tsk->mm == current->mm)
1512 join = true;
1513
1514 /* Simple filter to avoid false positives due to PID collisions */
1515 if (flags & TNF_SHARED)
1516 join = true;
1517
1518 /* Update priv based on whether false sharing was detected */
1519 *priv = !join;
1520
1521 if (join && !get_numa_group(grp))
1522 goto no_join;
1523
1524 rcu_read_unlock();
1525
1526 if (!join)
1527 return;
1528
1529 double_lock(&my_grp->lock, &grp->lock);
1530
1531 for (i = 0; i < 2*nr_node_ids; i++) {
1532 my_grp->faults[i] -= p->numa_faults[i];
1533 grp->faults[i] += p->numa_faults[i];
1534 }
1535 my_grp->total_faults -= p->total_numa_faults;
1536 grp->total_faults += p->total_numa_faults;
1537
1538 list_move(&p->numa_entry, &grp->task_list);
1539 my_grp->nr_tasks--;
1540 grp->nr_tasks++;
1541
1542 spin_unlock(&my_grp->lock);
1543 spin_unlock(&grp->lock);
1544
1545 rcu_assign_pointer(p->numa_group, grp);
1546
1547 put_numa_group(my_grp);
1548 return;
1549
1550no_join:
1551 rcu_read_unlock();
1552 return;
1553}
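struct numa_group ends in a zero-length faults array that is sized at allocation time; the same pattern shown in isolation, with a hypothetical node count and the standard C flexible array member standing in for faults[0]:

#include <stdio.h>
#include <stdlib.h>

/* Standalone sketch of the allocation pattern, not the kernel struct. */
struct fake_group {
	int nr_tasks;
	unsigned long total_faults;
	unsigned long faults[];		/* 2 * nr_node_ids entries follow */
};

int main(void)
{
	int nr_node_ids = 4;		/* assumed */
	size_t size = sizeof(struct fake_group) +
		      2 * nr_node_ids * sizeof(unsigned long);
	struct fake_group *grp = calloc(1, size);

	if (!grp)
		return 1;
	grp->faults[2 * 3 + 1] = 42;	/* node 3, private slot */
	printf("allocated %zu bytes\n", size);
	free(grp);
	return 0;
}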
1554
1555void task_numa_free(struct task_struct *p)
1556{
1557 struct numa_group *grp = p->numa_group;
1558 int i;
1559 void *numa_faults = p->numa_faults;
1560
1561 if (grp) {
1562 spin_lock(&grp->lock);
1563 for (i = 0; i < 2*nr_node_ids; i++)
1564 grp->faults[i] -= p->numa_faults[i];
1565 grp->total_faults -= p->total_numa_faults;
1566
1567 list_del(&p->numa_entry);
1568 grp->nr_tasks--;
1569 spin_unlock(&grp->lock);
1570 rcu_assign_pointer(p->numa_group, NULL);
1571 put_numa_group(grp);
1572 }
1573
1574 p->numa_faults = NULL;
1575 p->numa_faults_buffer = NULL;
1576 kfree(numa_faults);
845} 1577}
846 1578
847/* 1579/*
848 * Got a PROT_NONE fault for a page on @node. 1580 * Got a PROT_NONE fault for a page on @node.
849 */ 1581 */
850void task_numa_fault(int node, int pages, bool migrated) 1582void task_numa_fault(int last_cpupid, int node, int pages, int flags)
851{ 1583{
852 struct task_struct *p = current; 1584 struct task_struct *p = current;
1585 bool migrated = flags & TNF_MIGRATED;
1586 int priv;
853 1587
854 if (!numabalancing_enabled) 1588 if (!numabalancing_enabled)
855 return; 1589 return;
856 1590
857 /* FIXME: Allocate task-specific structure for placement policy here */ 1591 /* for example, ksmd faulting in a user's mm */
1592 if (!p->mm)
1593 return;
1594
1595 /* Do not worry about placement if exiting */
1596 if (p->state == TASK_DEAD)
1597 return;
1598
1599 /* Allocate buffer to track faults on a per-node basis */
1600 if (unlikely(!p->numa_faults)) {
1601 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
1602
1603 /* numa_faults and numa_faults_buffer share the allocation */
1604 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
1605 if (!p->numa_faults)
1606 return;
1607
1608 BUG_ON(p->numa_faults_buffer);
1609 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
1610 p->total_numa_faults = 0;
1611 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1612 }
858 1613
859 /* 1614 /*
860 * If pages are properly placed (did not migrate) then scan slower. 1615 * First accesses are treated as private, otherwise consider accesses
861 * This is reset periodically in case of phase changes 1616 * to be private if the accessing pid has not changed
862 */ 1617 */
863 if (!migrated) 1618 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
864 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, 1619 priv = 1;
865 p->numa_scan_period + jiffies_to_msecs(10)); 1620 } else {
1621 priv = cpupid_match_pid(p, last_cpupid);
1622 if (!priv && !(flags & TNF_NO_GROUP))
1623 task_numa_group(p, last_cpupid, flags, &priv);
1624 }
866 1625
867 task_numa_placement(p); 1626 task_numa_placement(p);
1627
1628 /*
1629 * Retry task-to-preferred-node migration periodically, in case it
1630 * previously failed, or the scheduler moved us.
1631 */
1632 if (time_after(jiffies, p->numa_migrate_retry))
1633 numa_migrate_preferred(p);
1634
1635 if (migrated)
1636 p->numa_pages_migrated += pages;
1637
1638 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
1639 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
868} 1640}
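Per-node fault counters are laid out as (node, private) pairs via task_faults_idx(); a toy sketch of the indexing with a hypothetical four-node array:

#include <stdio.h>

/* index = 2 * nid + priv, priv == 1 meaning the last accessor was
 * the same task. Node count and fault values are invented. */
static inline int faults_idx(int nid, int priv)
{
	return 2 * nid + priv;
}

int main(void)
{
	unsigned long faults[2 * 4] = { 0 };	/* 4 nodes assumed */

	faults[faults_idx(1, 1)] += 8;	/* private faults on node 1 */
	faults[faults_idx(1, 0)] += 2;	/* shared faults on node 1  */

	printf("node1 total = %lu\n",
	       faults[faults_idx(1, 0)] + faults[faults_idx(1, 1)]);
	return 0;
}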
869 1641
870static void reset_ptenuma_scan(struct task_struct *p) 1642static void reset_ptenuma_scan(struct task_struct *p)
@@ -884,6 +1656,7 @@ void task_numa_work(struct callback_head *work)
884 struct mm_struct *mm = p->mm; 1656 struct mm_struct *mm = p->mm;
885 struct vm_area_struct *vma; 1657 struct vm_area_struct *vma;
886 unsigned long start, end; 1658 unsigned long start, end;
1659 unsigned long nr_pte_updates = 0;
887 long pages; 1660 long pages;
888 1661
889 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 1662 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -900,35 +1673,9 @@ void task_numa_work(struct callback_head *work)
900 if (p->flags & PF_EXITING) 1673 if (p->flags & PF_EXITING)
901 return; 1674 return;
902 1675
903 /* 1676 if (!mm->numa_next_scan) {
904 * We do not care about task placement until a task runs on a node 1677 mm->numa_next_scan = now +
905 * other than the first one used by the address space. This is 1678 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
906 * largely because migrations are driven by what CPU the task
907 * is running on. If it's never scheduled on another node, it'll
908 * not migrate so why bother trapping the fault.
909 */
910 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
911 mm->first_nid = numa_node_id();
912 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
913 /* Are we running on a new node yet? */
914 if (numa_node_id() == mm->first_nid &&
915 !sched_feat_numa(NUMA_FORCE))
916 return;
917
918 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
919 }
920
921 /*
922 * Reset the scan period if enough time has gone by. Objective is that
923 * scanning will be reduced if pages are properly placed. As tasks
924 * can enter different phases this needs to be re-examined. Lacking
925 * proper tracking of reference behaviour, this blunt hammer is used.
926 */
927 migrate = mm->numa_next_reset;
928 if (time_after(now, migrate)) {
929 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
930 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
931 xchg(&mm->numa_next_reset, next_scan);
932 } 1679 }
933 1680
934 /* 1681 /*
@@ -938,20 +1685,20 @@ void task_numa_work(struct callback_head *work)
938 if (time_before(now, migrate)) 1685 if (time_before(now, migrate))
939 return; 1686 return;
940 1687
941 if (p->numa_scan_period == 0) 1688 if (p->numa_scan_period == 0) {
942 p->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1689 p->numa_scan_period_max = task_scan_max(p);
1690 p->numa_scan_period = task_scan_min(p);
1691 }
943 1692
944 next_scan = now + msecs_to_jiffies(p->numa_scan_period); 1693 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
945 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) 1694 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
946 return; 1695 return;
947 1696
948 /* 1697 /*
949 * Do not set pte_numa if the current running node is rate-limited. 1698 * Delay this task enough that another task of this mm will likely win
950 * This loses statistics on the fault but if we are unwilling to 1699 * the next time around.
951 * migrate to this node, it is less likely we can do useful work
952 */ 1700 */
953 if (migrate_ratelimited(numa_node_id())) 1701 p->node_stamp += 2 * TICK_NSEC;
954 return;
955 1702
956 start = mm->numa_scan_offset; 1703 start = mm->numa_scan_offset;
957 pages = sysctl_numa_balancing_scan_size; 1704 pages = sysctl_numa_balancing_scan_size;
@@ -967,18 +1714,32 @@ void task_numa_work(struct callback_head *work)
967 vma = mm->mmap; 1714 vma = mm->mmap;
968 } 1715 }
969 for (; vma; vma = vma->vm_next) { 1716 for (; vma; vma = vma->vm_next) {
970 if (!vma_migratable(vma)) 1717 if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
971 continue; 1718 continue;
972 1719
973 /* Skip small VMAs. They are not likely to be of relevance */ 1720 /*
974 if (vma->vm_end - vma->vm_start < HPAGE_SIZE) 1721 * Shared library pages mapped by multiple processes are not
1722 * migrated as it is expected they are cache replicated. Avoid
1723 * hinting faults in read-only file-backed mappings or the vdso
1724 * as migrating the pages will be of marginal benefit.
1725 */
1726 if (!vma->vm_mm ||
1727 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
975 continue; 1728 continue;
976 1729
977 do { 1730 do {
978 start = max(start, vma->vm_start); 1731 start = max(start, vma->vm_start);
979 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 1732 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
980 end = min(end, vma->vm_end); 1733 end = min(end, vma->vm_end);
981 pages -= change_prot_numa(vma, start, end); 1734 nr_pte_updates += change_prot_numa(vma, start, end);
1735
1736 /*
1737 * Scan sysctl_numa_balancing_scan_size but ensure that
1738 * at least one PTE is updated so that unused virtual
1739 * address space is quickly skipped.
1740 */
1741 if (nr_pte_updates)
1742 pages -= (end - start) >> PAGE_SHIFT;
982 1743
983 start = end; 1744 start = end;
984 if (pages <= 0) 1745 if (pages <= 0)
@@ -988,10 +1749,10 @@ void task_numa_work(struct callback_head *work)
988 1749
989out: 1750out:
990 /* 1751 /*
991 * It is possible to reach the end of the VMA list but the last few VMAs are 1752 * It is possible to reach the end of the VMA list but the last few
992 * not guaranteed to the vma_migratable. If they are not, we would find the 1753 * VMAs are not guaranteed to be vma_migratable. If they are not, we
993 * !migratable VMA on the next scan but not reset the scanner to the start 1754 * would find the !migratable VMA on the next scan but not reset the
994 * so check it now. 1755 * scanner to the start so check it now.
995 */ 1756 */
996 if (vma) 1757 if (vma)
997 mm->numa_scan_offset = start; 1758 mm->numa_scan_offset = start;
@@ -1025,8 +1786,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1025 1786
1026 if (now - curr->node_stamp > period) { 1787 if (now - curr->node_stamp > period) {
1027 if (!curr->node_stamp) 1788 if (!curr->node_stamp)
1028 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1789 curr->numa_scan_period = task_scan_min(curr);
1029 curr->node_stamp = now; 1790 curr->node_stamp += period;
1030 1791
1031 if (!time_before(jiffies, curr->mm->numa_next_scan)) { 1792 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1032 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ 1793 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1038,6 +1799,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1038static void task_tick_numa(struct rq *rq, struct task_struct *curr) 1799static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1039{ 1800{
1040} 1801}
1802
1803static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1804{
1805}
1806
1807static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1808{
1809}
1041#endif /* CONFIG_NUMA_BALANCING */ 1810#endif /* CONFIG_NUMA_BALANCING */
1042 1811
1043static void 1812static void
@@ -1047,8 +1816,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1047 if (!parent_entity(se)) 1816 if (!parent_entity(se))
1048 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 1817 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1049#ifdef CONFIG_SMP 1818#ifdef CONFIG_SMP
1050 if (entity_is_task(se)) 1819 if (entity_is_task(se)) {
1051 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 1820 struct rq *rq = rq_of(cfs_rq);
1821
1822 account_numa_enqueue(rq, task_of(se));
1823 list_add(&se->group_node, &rq->cfs_tasks);
1824 }
1052#endif 1825#endif
1053 cfs_rq->nr_running++; 1826 cfs_rq->nr_running++;
1054} 1827}
@@ -1059,8 +1832,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1059 update_load_sub(&cfs_rq->load, se->load.weight); 1832 update_load_sub(&cfs_rq->load, se->load.weight);
1060 if (!parent_entity(se)) 1833 if (!parent_entity(se))
1061 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 1834 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1062 if (entity_is_task(se)) 1835 if (entity_is_task(se)) {
1836 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
1063 list_del_init(&se->group_node); 1837 list_del_init(&se->group_node);
1838 }
1064 cfs_rq->nr_running--; 1839 cfs_rq->nr_running--;
1065} 1840}
1066 1841
@@ -2032,6 +2807,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
2032 */ 2807 */
2033 update_entity_load_avg(curr, 1); 2808 update_entity_load_avg(curr, 1);
2034 update_cfs_rq_blocked_load(cfs_rq, 1); 2809 update_cfs_rq_blocked_load(cfs_rq, 1);
2810 update_cfs_shares(cfs_rq);
2035 2811
2036#ifdef CONFIG_SCHED_HRTICK 2812#ifdef CONFIG_SCHED_HRTICK
2037 /* 2813 /*
@@ -2069,13 +2845,14 @@ static inline bool cfs_bandwidth_used(void)
2069 return static_key_false(&__cfs_bandwidth_used); 2845 return static_key_false(&__cfs_bandwidth_used);
2070} 2846}
2071 2847
2072void account_cfs_bandwidth_used(int enabled, int was_enabled) 2848void cfs_bandwidth_usage_inc(void)
2073{ 2849{
2074 /* only need to count groups transitioning between enabled/!enabled */ 2850 static_key_slow_inc(&__cfs_bandwidth_used);
2075 if (enabled && !was_enabled) 2851}
2076 static_key_slow_inc(&__cfs_bandwidth_used); 2852
2077 else if (!enabled && was_enabled) 2853void cfs_bandwidth_usage_dec(void)
2078 static_key_slow_dec(&__cfs_bandwidth_used); 2854{
2855 static_key_slow_dec(&__cfs_bandwidth_used);
2079} 2856}
2080#else /* HAVE_JUMP_LABEL */ 2857#else /* HAVE_JUMP_LABEL */
2081static bool cfs_bandwidth_used(void) 2858static bool cfs_bandwidth_used(void)
@@ -2083,7 +2860,8 @@ static bool cfs_bandwidth_used(void)
2083 return true; 2860 return true;
2084} 2861}
2085 2862
2086void account_cfs_bandwidth_used(int enabled, int was_enabled) {} 2863void cfs_bandwidth_usage_inc(void) {}
2864void cfs_bandwidth_usage_dec(void) {}
2087#endif /* HAVE_JUMP_LABEL */ 2865#endif /* HAVE_JUMP_LABEL */
2088 2866
2089/* 2867/*
@@ -2334,6 +3112,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2334 cfs_rq->throttled_clock = rq_clock(rq); 3112 cfs_rq->throttled_clock = rq_clock(rq);
2335 raw_spin_lock(&cfs_b->lock); 3113 raw_spin_lock(&cfs_b->lock);
2336 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3114 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3115 if (!cfs_b->timer_active)
3116 __start_cfs_bandwidth(cfs_b);
2337 raw_spin_unlock(&cfs_b->lock); 3117 raw_spin_unlock(&cfs_b->lock);
2338} 3118}
2339 3119
@@ -2447,6 +3227,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2447 if (idle) 3227 if (idle)
2448 goto out_unlock; 3228 goto out_unlock;
2449 3229
3230 /*
3231 * if we have relooped after returning idle once, we need to update our
3232 * status as actually running, so that other cpus doing
3233 * __start_cfs_bandwidth will stop trying to cancel us.
3234 */
3235 cfs_b->timer_active = 1;
3236
2450 __refill_cfs_bandwidth_runtime(cfs_b); 3237 __refill_cfs_bandwidth_runtime(cfs_b);
2451 3238
2452 if (!throttled) { 3239 if (!throttled) {
@@ -2507,7 +3294,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2507/* how long we wait to gather additional slack before distributing */ 3294/* how long we wait to gather additional slack before distributing */
2508static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; 3295static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2509 3296
2510/* are we near the end of the current quota period? */ 3297/*
3298 * Are we near the end of the current quota period?
3299 *
3300 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3301 * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3302 * migrate_hrtimers, base is never cleared, so we are fine.
3303 */
2511static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) 3304static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2512{ 3305{
2513 struct hrtimer *refresh_timer = &cfs_b->period_timer; 3306 struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2583,10 +3376,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2583 u64 expires; 3376 u64 expires;
2584 3377
2585 /* confirm we're still not at a refresh boundary */ 3378 /* confirm we're still not at a refresh boundary */
2586 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) 3379 raw_spin_lock(&cfs_b->lock);
3380 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3381 raw_spin_unlock(&cfs_b->lock);
2587 return; 3382 return;
3383 }
2588 3384
2589 raw_spin_lock(&cfs_b->lock);
2590 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3385 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2591 runtime = cfs_b->runtime; 3386 runtime = cfs_b->runtime;
2592 cfs_b->runtime = 0; 3387 cfs_b->runtime = 0;
@@ -2707,11 +3502,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2707 * (timer_active==0 becomes visible before the hrtimer call-back 3502 * (timer_active==0 becomes visible before the hrtimer call-back
2708 * terminates). In either case we ensure that it's re-programmed 3503 * terminates). In either case we ensure that it's re-programmed
2709 */ 3504 */
2710 while (unlikely(hrtimer_active(&cfs_b->period_timer))) { 3505 while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3506 hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3507 /* bounce the lock to allow do_sched_cfs_period_timer to run */
2711 raw_spin_unlock(&cfs_b->lock); 3508 raw_spin_unlock(&cfs_b->lock);
2712 /* ensure cfs_b->lock is available while we wait */ 3509 cpu_relax();
2713 hrtimer_cancel(&cfs_b->period_timer);
2714
2715 raw_spin_lock(&cfs_b->lock); 3510 raw_spin_lock(&cfs_b->lock);
2716 /* if someone else restarted the timer then we're done */ 3511 /* if someone else restarted the timer then we're done */
2717 if (cfs_b->timer_active) 3512 if (cfs_b->timer_active)
@@ -3017,6 +3812,23 @@ static unsigned long cpu_avg_load_per_task(int cpu)
3017 return 0; 3812 return 0;
3018} 3813}
3019 3814
3815static void record_wakee(struct task_struct *p)
3816{
3817 /*
3818 * Rough decay (wiping) for cost saving, don't worry
3819 * about the boundary, really active task won't care
3820 * about the loss.
3821 */
3822 if (jiffies > current->wakee_flip_decay_ts + HZ) {
3823 current->wakee_flips = 0;
3824 current->wakee_flip_decay_ts = jiffies;
3825 }
3826
3827 if (current->last_wakee != p) {
3828 current->last_wakee = p;
3829 current->wakee_flips++;
3830 }
3831}
3020 3832
3021static void task_waking_fair(struct task_struct *p) 3833static void task_waking_fair(struct task_struct *p)
3022{ 3834{
@@ -3037,6 +3849,7 @@ static void task_waking_fair(struct task_struct *p)
3037#endif 3849#endif
3038 3850
3039 se->vruntime -= min_vruntime; 3851 se->vruntime -= min_vruntime;
3852 record_wakee(p);
3040} 3853}
3041 3854
3042#ifdef CONFIG_FAIR_GROUP_SCHED 3855#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3094,7 +3907,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3094{ 3907{
3095 struct sched_entity *se = tg->se[cpu]; 3908 struct sched_entity *se = tg->se[cpu];
3096 3909
3097 if (!tg->parent) /* the trivial, non-cgroup case */ 3910 if (!tg->parent || !wl) /* the trivial, non-cgroup case */
3098 return wl; 3911 return wl;
3099 3912
3100 for_each_sched_entity(se) { 3913 for_each_sched_entity(se) {
@@ -3147,14 +3960,35 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3147} 3960}
3148#else 3961#else
3149 3962
3150static inline unsigned long effective_load(struct task_group *tg, int cpu, 3963static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3151 unsigned long wl, unsigned long wg)
3152{ 3964{
3153 return wl; 3965 return wl;
3154} 3966}
3155 3967
3156#endif 3968#endif
3157 3969
3970static int wake_wide(struct task_struct *p)
3971{
3972 int factor = this_cpu_read(sd_llc_size);
3973
3974 /*
3975 * wakee_flips is a switching frequency; a high value can mean many
3976 * wakees or rapid switching. Using the LLC size as the factor here
3977 * automatically adjusts the threshold, so a bigger node leads to more pull.
3978 */
3979 if (p->wakee_flips > factor) {
3980 /*
3981 * The wakee is somewhat hot and needs a certain amount of cpu
3982 * resource, so if the waker is far hotter, prefer to leave
3983 * it alone.
3984 */
3985 if (current->wakee_flips > (factor * p->wakee_flips))
3986 return 1;
3987 }
3988
3989 return 0;
3990}
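A standalone sketch of the wake_wide() decision with invented flip counts, where factor stands in for this_cpu_read(sd_llc_size):

#include <stdio.h>

static int wake_wide_sketch(unsigned int waker_flips,
			    unsigned int wakee_flips, unsigned int factor)
{
	/* wakee flips a lot and the waker flips far more: spread out */
	if (wakee_flips > factor && waker_flips > factor * wakee_flips)
		return 1;
	return 0;
}

int main(void)
{
	/* assumed 8-CPU LLC */
	printf("%d\n", wake_wide_sketch(200, 10, 8));	/* 1: skip affine path */
	printf("%d\n", wake_wide_sketch(20, 10, 8));	/* 0: stay affine      */
	return 0;
}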
3991
3158static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 3992static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3159{ 3993{
3160 s64 this_load, load; 3994 s64 this_load, load;
@@ -3164,6 +3998,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3164 unsigned long weight; 3998 unsigned long weight;
3165 int balanced; 3999 int balanced;
3166 4000
4001 /*
4002 * If we wake multiple tasks, be careful not to bounce
4003 * ourselves around too much.
4004 */
4005 if (wake_wide(p))
4006 return 0;
4007
3167 idx = sd->wake_idx; 4008 idx = sd->wake_idx;
3168 this_cpu = smp_processor_id(); 4009 this_cpu = smp_processor_id();
3169 prev_cpu = task_cpu(p); 4010 prev_cpu = task_cpu(p);
@@ -3372,11 +4213,10 @@ done:
3372 * preempt must be disabled. 4213 * preempt must be disabled.
3373 */ 4214 */
3374static int 4215static int
3375select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 4216select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
3376{ 4217{
3377 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 4218 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3378 int cpu = smp_processor_id(); 4219 int cpu = smp_processor_id();
3379 int prev_cpu = task_cpu(p);
3380 int new_cpu = cpu; 4220 int new_cpu = cpu;
3381 int want_affine = 0; 4221 int want_affine = 0;
3382 int sync = wake_flags & WF_SYNC; 4222 int sync = wake_flags & WF_SYNC;
@@ -3856,9 +4696,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3856 4696
3857static unsigned long __read_mostly max_load_balance_interval = HZ/10; 4697static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3858 4698
4699enum fbq_type { regular, remote, all };
4700
3859#define LBF_ALL_PINNED 0x01 4701#define LBF_ALL_PINNED 0x01
3860#define LBF_NEED_BREAK 0x02 4702#define LBF_NEED_BREAK 0x02
3861#define LBF_SOME_PINNED 0x04 4703#define LBF_DST_PINNED 0x04
4704#define LBF_SOME_PINNED 0x08
3862 4705
3863struct lb_env { 4706struct lb_env {
3864 struct sched_domain *sd; 4707 struct sched_domain *sd;
@@ -3881,6 +4724,8 @@ struct lb_env {
3881 unsigned int loop; 4724 unsigned int loop;
3882 unsigned int loop_break; 4725 unsigned int loop_break;
3883 unsigned int loop_max; 4726 unsigned int loop_max;
4727
4728 enum fbq_type fbq_type;
3884}; 4729};
3885 4730
3886/* 4731/*
@@ -3927,6 +4772,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3927 return delta < (s64)sysctl_sched_migration_cost; 4772 return delta < (s64)sysctl_sched_migration_cost;
3928} 4773}
3929 4774
4775#ifdef CONFIG_NUMA_BALANCING
4776/* Returns true if the destination node has incurred more faults */
4777static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4778{
4779 int src_nid, dst_nid;
4780
4781 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
4782 !(env->sd->flags & SD_NUMA)) {
4783 return false;
4784 }
4785
4786 src_nid = cpu_to_node(env->src_cpu);
4787 dst_nid = cpu_to_node(env->dst_cpu);
4788
4789 if (src_nid == dst_nid)
4790 return false;
4791
4792 /* Always encourage migration to the preferred node. */
4793 if (dst_nid == p->numa_preferred_nid)
4794 return true;
4795
4796 /* If both task and group weight improve, this move is a winner. */
4797 if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
4798 group_weight(p, dst_nid) > group_weight(p, src_nid))
4799 return true;
4800
4801 return false;
4802}
4803
4804
4805static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4806{
4807 int src_nid, dst_nid;
4808
4809 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4810 return false;
4811
4812 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
4813 return false;
4814
4815 src_nid = cpu_to_node(env->src_cpu);
4816 dst_nid = cpu_to_node(env->dst_cpu);
4817
4818 if (src_nid == dst_nid)
4819 return false;
4820
4821 /* Migrating away from the preferred node is always bad. */
4822 if (src_nid == p->numa_preferred_nid)
4823 return true;
4824
4825 /* If either task or group weight gets worse, don't do it. */
4826 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
4827 group_weight(p, dst_nid) < group_weight(p, src_nid))
4828 return true;
4829
4830 return false;
4831}
4832
4833#else
4834static inline bool migrate_improves_locality(struct task_struct *p,
4835 struct lb_env *env)
4836{
4837 return false;
4838}
4839
4840static inline bool migrate_degrades_locality(struct task_struct *p,
4841 struct lb_env *env)
4842{
4843 return false;
4844}
4845#endif
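The two helpers above boil down to a simple rule; an isolated sketch with made-up weights (the preferred node always wins, otherwise both task and group weight must improve):

#include <stdbool.h>
#include <stdio.h>

static bool improves_locality(int dst_is_preferred,
			      long task_src_w, long task_dst_w,
			      long group_src_w, long group_dst_w)
{
	if (dst_is_preferred)
		return true;
	return task_dst_w > task_src_w && group_dst_w > group_src_w;
}

int main(void)
{
	printf("%d\n", improves_locality(0, 300, 500, 400, 600));	/* 1 */
	printf("%d\n", improves_locality(0, 300, 500, 600, 400));	/* 0 */
	return 0;
}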
4846
3930/* 4847/*
3931 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 4848 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3932 */ 4849 */
@@ -3949,6 +4866,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3949 4866
3950 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 4867 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3951 4868
4869 env->flags |= LBF_SOME_PINNED;
4870
3952 /* 4871 /*
3953 * Remember if this task can be migrated to any other cpu in 4872 * Remember if this task can be migrated to any other cpu in
3954 * our sched_group. We may want to revisit it if we couldn't 4873 * our sched_group. We may want to revisit it if we couldn't
@@ -3957,13 +4876,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3957 * Also avoid computing new_dst_cpu if we have already computed 4876 * Also avoid computing new_dst_cpu if we have already computed
3958 * one in current iteration. 4877 * one in current iteration.
3959 */ 4878 */
3960 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 4879 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
3961 return 0; 4880 return 0;
3962 4881
3963 /* Prevent to re-select dst_cpu via env's cpus */ 4882 /* Prevent to re-select dst_cpu via env's cpus */
3964 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 4883 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
3965 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 4884 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
3966 env->flags |= LBF_SOME_PINNED; 4885 env->flags |= LBF_DST_PINNED;
3967 env->new_dst_cpu = cpu; 4886 env->new_dst_cpu = cpu;
3968 break; 4887 break;
3969 } 4888 }
@@ -3982,11 +4901,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3982 4901
3983 /* 4902 /*
3984 * Aggressive migration if: 4903 * Aggressive migration if:
3985 * 1) task is cache cold, or 4904 * 1) destination numa is preferred
3986 * 2) too many balance attempts have failed. 4905 * 2) task is cache cold, or
4906 * 3) too many balance attempts have failed.
3987 */ 4907 */
3988
3989 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 4908 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
4909 if (!tsk_cache_hot)
4910 tsk_cache_hot = migrate_degrades_locality(p, env);
4911
4912 if (migrate_improves_locality(p, env)) {
4913#ifdef CONFIG_SCHEDSTATS
4914 if (tsk_cache_hot) {
4915 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4916 schedstat_inc(p, se.statistics.nr_forced_migrations);
4917 }
4918#endif
4919 return 1;
4920 }
4921
3990 if (!tsk_cache_hot || 4922 if (!tsk_cache_hot ||
3991 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 4923 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3992 4924
@@ -4029,8 +4961,6 @@ static int move_one_task(struct lb_env *env)
4029 return 0; 4961 return 0;
4030} 4962}
4031 4963
4032static unsigned long task_h_load(struct task_struct *p);
4033
4034static const unsigned int sched_nr_migrate_break = 32; 4964static const unsigned int sched_nr_migrate_break = 32;
4035 4965
4036/* 4966/*
@@ -4171,47 +5101,48 @@ static void update_blocked_averages(int cpu)
4171} 5101}
4172 5102
4173/* 5103/*
4174 * Compute the cpu's hierarchical load factor for each task group. 5104 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
4175 * This needs to be done in a top-down fashion because the load of a child 5105 * This needs to be done in a top-down fashion because the load of a child
4176 * group is a fraction of its parents load. 5106 * group is a fraction of its parent's load.
4177 */ 5107 */
4178static int tg_load_down(struct task_group *tg, void *data) 5108static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
4179{
4180 unsigned long load;
4181 long cpu = (long)data;
4182
4183 if (!tg->parent) {
4184 load = cpu_rq(cpu)->avg.load_avg_contrib;
4185 } else {
4186 load = tg->parent->cfs_rq[cpu]->h_load;
4187 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
4188 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
4189 }
4190
4191 tg->cfs_rq[cpu]->h_load = load;
4192
4193 return 0;
4194}
4195
4196static void update_h_load(long cpu)
4197{ 5109{
4198 struct rq *rq = cpu_rq(cpu); 5110 struct rq *rq = rq_of(cfs_rq);
5111 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
4199 unsigned long now = jiffies; 5112 unsigned long now = jiffies;
5113 unsigned long load;
4200 5114
4201 if (rq->h_load_throttle == now) 5115 if (cfs_rq->last_h_load_update == now)
4202 return; 5116 return;
4203 5117
4204 rq->h_load_throttle = now; 5118 cfs_rq->h_load_next = NULL;
5119 for_each_sched_entity(se) {
5120 cfs_rq = cfs_rq_of(se);
5121 cfs_rq->h_load_next = se;
5122 if (cfs_rq->last_h_load_update == now)
5123 break;
5124 }
5125
5126 if (!se) {
5127 cfs_rq->h_load = cfs_rq->runnable_load_avg;
5128 cfs_rq->last_h_load_update = now;
5129 }
4205 5130
4206 rcu_read_lock(); 5131 while ((se = cfs_rq->h_load_next) != NULL) {
4207 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 5132 load = cfs_rq->h_load;
4208 rcu_read_unlock(); 5133 load = div64_ul(load * se->avg.load_avg_contrib,
5134 cfs_rq->runnable_load_avg + 1);
5135 cfs_rq = group_cfs_rq(se);
5136 cfs_rq->h_load = load;
5137 cfs_rq->last_h_load_update = now;
5138 }
4209} 5139}
4210 5140
4211static unsigned long task_h_load(struct task_struct *p) 5141static unsigned long task_h_load(struct task_struct *p)
4212{ 5142{
4213 struct cfs_rq *cfs_rq = task_cfs_rq(p); 5143 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4214 5144
5145 update_cfs_rq_h_load(cfs_rq);
4215 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, 5146 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
4216 cfs_rq->runnable_load_avg + 1); 5147 cfs_rq->runnable_load_avg + 1);
4217} 5148}
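A two-level worked example of the h_load propagation with invented load figures: the root cfs_rq's h_load equals its runnable load, and each level below scales by its entity's contribution over the parent's runnable load (+1 to avoid dividing by zero):

#include <stdio.h>

int main(void)
{
	unsigned long long root_runnable = 1024;
	unsigned long long root_h_load = root_runnable;

	unsigned long long grp_se_contrib = 512;	/* group entity on root */
	unsigned long long grp_h_load =
		root_h_load * grp_se_contrib / (root_runnable + 1);	/* 511 */

	unsigned long long grp_runnable = 512;
	unsigned long long task_contrib = 256;		/* task inside the group */
	unsigned long long task_h_load =
		task_contrib * grp_h_load / (grp_runnable + 1);		/* 255 */

	printf("grp_h_load=%llu task_h_load=%llu\n", grp_h_load, task_h_load);
	return 0;
}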
@@ -4220,10 +5151,6 @@ static inline void update_blocked_averages(int cpu)
4220{ 5151{
4221} 5152}
4222 5153
4223static inline void update_h_load(long cpu)
4224{
4225}
4226
4227static unsigned long task_h_load(struct task_struct *p) 5154static unsigned long task_h_load(struct task_struct *p)
4228{ 5155{
4229 return p->se.avg.load_avg_contrib; 5156 return p->se.avg.load_avg_contrib;
@@ -4232,54 +5159,66 @@ static unsigned long task_h_load(struct task_struct *p)
4232 5159
4233/********** Helpers for find_busiest_group ************************/ 5160/********** Helpers for find_busiest_group ************************/
4234/* 5161/*
4235 * sd_lb_stats - Structure to store the statistics of a sched_domain
4236 * during load balancing.
4237 */
4238struct sd_lb_stats {
4239 struct sched_group *busiest; /* Busiest group in this sd */
4240 struct sched_group *this; /* Local group in this sd */
4241 unsigned long total_load; /* Total load of all groups in sd */
4242 unsigned long total_pwr; /* Total power of all groups in sd */
4243 unsigned long avg_load; /* Average load across all groups in sd */
4244
4245 /** Statistics of this group */
4246 unsigned long this_load;
4247 unsigned long this_load_per_task;
4248 unsigned long this_nr_running;
4249 unsigned long this_has_capacity;
4250 unsigned int this_idle_cpus;
4251
4252 /* Statistics of the busiest group */
4253 unsigned int busiest_idle_cpus;
4254 unsigned long max_load;
4255 unsigned long busiest_load_per_task;
4256 unsigned long busiest_nr_running;
4257 unsigned long busiest_group_capacity;
4258 unsigned long busiest_has_capacity;
4259 unsigned int busiest_group_weight;
4260
4261 int group_imb; /* Is there imbalance in this sd */
4262};
4263
4264/*
4265 * sg_lb_stats - stats of a sched_group required for load_balancing 5162 * sg_lb_stats - stats of a sched_group required for load_balancing
4266 */ 5163 */
4267struct sg_lb_stats { 5164struct sg_lb_stats {
4268 unsigned long avg_load; /*Avg load across the CPUs of the group */ 5165 unsigned long avg_load; /*Avg load across the CPUs of the group */
4269 unsigned long group_load; /* Total load over the CPUs of the group */ 5166 unsigned long group_load; /* Total load over the CPUs of the group */
4270 unsigned long sum_nr_running; /* Nr tasks running in the group */
4271 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 5167 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4272 unsigned long group_capacity; 5168 unsigned long load_per_task;
4273 unsigned long idle_cpus; 5169 unsigned long group_power;
4274 unsigned long group_weight; 5170 unsigned int sum_nr_running; /* Nr tasks running in the group */
5171 unsigned int group_capacity;
5172 unsigned int idle_cpus;
5173 unsigned int group_weight;
4275 int group_imb; /* Is there an imbalance in the group ? */ 5174 int group_imb; /* Is there an imbalance in the group ? */
4276 int group_has_capacity; /* Is there extra capacity in the group? */ 5175 int group_has_capacity; /* Is there extra capacity in the group? */
5176#ifdef CONFIG_NUMA_BALANCING
5177 unsigned int nr_numa_running;
5178 unsigned int nr_preferred_running;
5179#endif
5180};
5181
5182/*
5183 * sd_lb_stats - Structure to store the statistics of a sched_domain
5184 * during load balancing.
5185 */
5186struct sd_lb_stats {
5187 struct sched_group *busiest; /* Busiest group in this sd */
5188 struct sched_group *local; /* Local group in this sd */
5189 unsigned long total_load; /* Total load of all groups in sd */
5190 unsigned long total_pwr; /* Total power of all groups in sd */
5191 unsigned long avg_load; /* Average load across all groups in sd */
5192
5193 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
5194 struct sg_lb_stats local_stat; /* Statistics of the local group */
4277}; 5195};
4278 5196
5197static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5198{
5199 /*
5200 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
5201 * local_stat because update_sg_lb_stats() does a full clear/assignment.
5202 * We must however clear busiest_stat::avg_load because
5203 * update_sd_pick_busiest() reads this before assignment.
5204 */
5205 *sds = (struct sd_lb_stats){
5206 .busiest = NULL,
5207 .local = NULL,
5208 .total_load = 0UL,
5209 .total_pwr = 0UL,
5210 .busiest_stat = {
5211 .avg_load = 0UL,
5212 },
5213 };
5214}
5215
4279/** 5216/**
4280 * get_sd_load_idx - Obtain the load index for a given sched domain. 5217 * get_sd_load_idx - Obtain the load index for a given sched domain.
4281 * @sd: The sched_domain whose load_idx is to be obtained. 5218 * @sd: The sched_domain whose load_idx is to be obtained.
4282 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 5219 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
5220 *
5221 * Return: The load index.
4283 */ 5222 */
4284static inline int get_sd_load_idx(struct sched_domain *sd, 5223static inline int get_sd_load_idx(struct sched_domain *sd,
4285 enum cpu_idle_type idle) 5224 enum cpu_idle_type idle)
@@ -4394,7 +5333,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4394{ 5333{
4395 struct sched_domain *child = sd->child; 5334 struct sched_domain *child = sd->child;
4396 struct sched_group *group, *sdg = sd->groups; 5335 struct sched_group *group, *sdg = sd->groups;
4397 unsigned long power; 5336 unsigned long power, power_orig;
4398 unsigned long interval; 5337 unsigned long interval;
4399 5338
4400 interval = msecs_to_jiffies(sd->balance_interval); 5339 interval = msecs_to_jiffies(sd->balance_interval);
@@ -4406,7 +5345,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4406 return; 5345 return;
4407 } 5346 }
4408 5347
4409 power = 0; 5348 power_orig = power = 0;
4410 5349
4411 if (child->flags & SD_OVERLAP) { 5350 if (child->flags & SD_OVERLAP) {
4412 /* 5351 /*
@@ -4414,8 +5353,12 @@ void update_group_power(struct sched_domain *sd, int cpu)
4414 * span the current group. 5353 * span the current group.
4415 */ 5354 */
4416 5355
4417 for_each_cpu(cpu, sched_group_cpus(sdg)) 5356 for_each_cpu(cpu, sched_group_cpus(sdg)) {
4418 power += power_of(cpu); 5357 struct sched_group *sg = cpu_rq(cpu)->sd->groups;
5358
5359 power_orig += sg->sgp->power_orig;
5360 power += sg->sgp->power;
5361 }
4419 } else { 5362 } else {
4420 /* 5363 /*
4421 * !SD_OVERLAP domains can assume that child groups 5364 * !SD_OVERLAP domains can assume that child groups
@@ -4424,12 +5367,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
4424 5367
4425 group = child->groups; 5368 group = child->groups;
4426 do { 5369 do {
5370 power_orig += group->sgp->power_orig;
4427 power += group->sgp->power; 5371 power += group->sgp->power;
4428 group = group->next; 5372 group = group->next;
4429 } while (group != child->groups); 5373 } while (group != child->groups);
4430 } 5374 }
4431 5375
4432 sdg->sgp->power_orig = sdg->sgp->power = power; 5376 sdg->sgp->power_orig = power_orig;
5377 sdg->sgp->power = power;
4433} 5378}
4434 5379
4435/* 5380/*
@@ -4457,33 +5402,84 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4457 return 0; 5402 return 0;
4458} 5403}
4459 5404
5405/*
5406 * Group imbalance indicates (and tries to solve) the problem where balancing
5407 * groups is inadequate due to tsk_cpus_allowed() constraints.
5408 *
5409 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
5410 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
5411 * Something like:
5412 *
5413 * { 0 1 2 3 } { 4 5 6 7 }
5414 * * * * *
5415 *
5416 * If we were to balance group-wise we'd place two tasks in the first group and
5417 * two tasks in the second group. Clearly this is undesired as it will overload
5418 * cpu 3 and leave one of the cpus in the second group unused.
5419 *
5420 * The current solution to this issue is detecting the skew in the first group
5421 * by noticing the lower domain failed to reach balance and had difficulty
5422 * moving tasks due to affinity constraints.
5423 *
5424 * When this is detected, this group becomes a candidate for busiest; see
5425 * update_sd_pick_busiest(). And calculate_imbalance() and
5426 * find_busiest_group() avoid some of the usual balance conditions to allow it
5427 * to create an effective group imbalance.
5428 *
5429 * This is a somewhat tricky proposition since the next run might not find the
5430 * group imbalance and decide the groups need to be balanced again. A most
5431 * subtle and fragile situation.
5432 */
5433
5434static inline int sg_imbalanced(struct sched_group *group)
5435{
5436 return group->sgp->imbalance;
5437}
5438
5439/*
5440 * Compute the group capacity.
5441 *
5442 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
5443 * first dividing out the smt factor and computing the actual number of cores
5444 * and limiting the power-unit capacity with that.
5445 */
5446static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
5447{
5448 unsigned int capacity, smt, cpus;
5449 unsigned int power, power_orig;
5450
5451 power = group->sgp->power;
5452 power_orig = group->sgp->power_orig;
5453 cpus = group->group_weight;
5454
5455 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
5456 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
5457 capacity = cpus / smt; /* cores */
5458
5459 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
5460 if (!capacity)
5461 capacity = fix_small_capacity(env->sd, group);
5462
5463 return capacity;
5464}
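A worked example of the phantom-core correction, assuming an SMT thread power of roughly 589 (about 1.15 * SCHED_POWER_SCALE / 2): eight threads on four cores would naively round to five capacity units, while dividing out the SMT factor first gives four.

#include <stdio.h>

#define SCHED_POWER_SCALE	1024U
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))

int main(void)
{
	unsigned int cpus = 8;
	unsigned int power = 8 * 589;		/* == power_orig in this sketch */

	unsigned int naive = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); /* 5 */
	unsigned int smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power); /* 2 */
	unsigned int cores = cpus / smt;				   /* 4 */

	printf("naive=%u capacity=%u\n", naive, cores < naive ? cores : naive);
	return 0;
}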
5465
4460/** 5466/**
4461 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 5467 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
4462 * @env: The load balancing environment. 5468 * @env: The load balancing environment.
4463 * @group: sched_group whose statistics are to be updated. 5469 * @group: sched_group whose statistics are to be updated.
4464 * @load_idx: Load index of sched_domain of this_cpu for load calc. 5470 * @load_idx: Load index of sched_domain of this_cpu for load calc.
4465 * @local_group: Does group contain this_cpu. 5471 * @local_group: Does group contain this_cpu.
4466 * @balance: Should we balance.
4467 * @sgs: variable to hold the statistics for this group. 5472 * @sgs: variable to hold the statistics for this group.
4468 */ 5473 */
4469static inline void update_sg_lb_stats(struct lb_env *env, 5474static inline void update_sg_lb_stats(struct lb_env *env,
4470 struct sched_group *group, int load_idx, 5475 struct sched_group *group, int load_idx,
4471 int local_group, int *balance, struct sg_lb_stats *sgs) 5476 int local_group, struct sg_lb_stats *sgs)
4472{ 5477{
4473 unsigned long nr_running, max_nr_running, min_nr_running; 5478 unsigned long nr_running;
4474 unsigned long load, max_cpu_load, min_cpu_load; 5479 unsigned long load;
4475 unsigned int balance_cpu = -1, first_idle_cpu = 0;
4476 unsigned long avg_load_per_task = 0;
4477 int i; 5480 int i;
4478 5481
4479 if (local_group) 5482 memset(sgs, 0, sizeof(*sgs));
4480 balance_cpu = group_balance_cpu(group);
4481
4482 /* Tally up the load of all CPUs in the group */
4483 max_cpu_load = 0;
4484 min_cpu_load = ~0UL;
4485 max_nr_running = 0;
4486 min_nr_running = ~0UL;
4487 5483
4488 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5484 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4489 struct rq *rq = cpu_rq(i); 5485 struct rq *rq = cpu_rq(i);
@@ -4491,76 +5487,34 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4491 nr_running = rq->nr_running; 5487 nr_running = rq->nr_running;
4492 5488
4493 /* Bias balancing toward cpus of our domain */ 5489 /* Bias balancing toward cpus of our domain */
4494 if (local_group) { 5490 if (local_group)
4495 if (idle_cpu(i) && !first_idle_cpu &&
4496 cpumask_test_cpu(i, sched_group_mask(group))) {
4497 first_idle_cpu = 1;
4498 balance_cpu = i;
4499 }
4500
4501 load = target_load(i, load_idx); 5491 load = target_load(i, load_idx);
4502 } else { 5492 else
4503 load = source_load(i, load_idx); 5493 load = source_load(i, load_idx);
4504 if (load > max_cpu_load)
4505 max_cpu_load = load;
4506 if (min_cpu_load > load)
4507 min_cpu_load = load;
4508
4509 if (nr_running > max_nr_running)
4510 max_nr_running = nr_running;
4511 if (min_nr_running > nr_running)
4512 min_nr_running = nr_running;
4513 }
4514 5494
4515 sgs->group_load += load; 5495 sgs->group_load += load;
4516 sgs->sum_nr_running += nr_running; 5496 sgs->sum_nr_running += nr_running;
5497#ifdef CONFIG_NUMA_BALANCING
5498 sgs->nr_numa_running += rq->nr_numa_running;
5499 sgs->nr_preferred_running += rq->nr_preferred_running;
5500#endif
4517 sgs->sum_weighted_load += weighted_cpuload(i); 5501 sgs->sum_weighted_load += weighted_cpuload(i);
4518 if (idle_cpu(i)) 5502 if (idle_cpu(i))
4519 sgs->idle_cpus++; 5503 sgs->idle_cpus++;
4520 } 5504 }
4521 5505
4522 /*
4523 * First idle cpu or the first cpu(busiest) in this sched group
4524 * is eligible for doing load balancing at this and above
4525 * domains. In the newly idle case, we will allow all the cpu's
4526 * to do the newly idle load balance.
4527 */
4528 if (local_group) {
4529 if (env->idle != CPU_NEWLY_IDLE) {
4530 if (balance_cpu != env->dst_cpu) {
4531 *balance = 0;
4532 return;
4533 }
4534 update_group_power(env->sd, env->dst_cpu);
4535 } else if (time_after_eq(jiffies, group->sgp->next_update))
4536 update_group_power(env->sd, env->dst_cpu);
4537 }
4538
4539 /* Adjust by relative CPU power of the group */ 5506 /* Adjust by relative CPU power of the group */
4540 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; 5507 sgs->group_power = group->sgp->power;
5508 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
4541 5509
4542 /*
4543 * Consider the group unbalanced when the imbalance is larger
4544 * than the average weight of a task.
4545 *
4546 * APZ: with cgroup the avg task weight can vary wildly and
4547 * might not be a suitable number - should we keep a
4548 * normalized nr_running number somewhere that negates
4549 * the hierarchy?
4550 */
4551 if (sgs->sum_nr_running) 5510 if (sgs->sum_nr_running)
4552 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 5511 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4553 5512
4554 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
4555 (max_nr_running - min_nr_running) > 1)
4556 sgs->group_imb = 1;
4557
4558 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
4559 SCHED_POWER_SCALE);
4560 if (!sgs->group_capacity)
4561 sgs->group_capacity = fix_small_capacity(env->sd, group);
4562 sgs->group_weight = group->group_weight; 5513 sgs->group_weight = group->group_weight;
4563 5514
5515 sgs->group_imb = sg_imbalanced(group);
5516 sgs->group_capacity = sg_capacity(env, group);
5517
4564 if (sgs->group_capacity > sgs->sum_nr_running) 5518 if (sgs->group_capacity > sgs->sum_nr_running)
4565 sgs->group_has_capacity = 1; 5519 sgs->group_has_capacity = 1;
4566} 5520}
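As a quick sanity check of the two new per-group figures derived above, the following standalone sketch (not kernel code; all numbers invented, SCHED_POWER_SCALE assumed to be 1024 and a nice-0 task weight of 1024) reproduces the arithmetic for a two-CPU group carrying three nice-0 tasks.

	#include <stdio.h>

	#define SCHED_POWER_SCALE 1024UL

	int main(void)
	{
		/* three nice-0 tasks (weight 1024 each) spread over two CPUs */
		unsigned long group_load = 3 * 1024;
		unsigned long sum_weighted_load = 3 * 1024;
		unsigned long sum_nr_running = 3;
		unsigned long group_power = 2 * 1024;	/* two full-power CPUs */

		unsigned long avg_load = group_load * SCHED_POWER_SCALE / group_power;
		unsigned long load_per_task = sum_weighted_load / sum_nr_running;

		/* avg_load 1536: the group runs at 1.5x one CPU's worth of load */
		printf("avg_load=%lu load_per_task=%lu\n", avg_load, load_per_task);
		return 0;
	}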
@@ -4574,13 +5528,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4574 * 5528 *
4575 * Determine if @sg is a busier group than the previously selected 5529 * Determine if @sg is a busier group than the previously selected
4576 * busiest group. 5530 * busiest group.
5531 *
5532 * Return: %true if @sg is a busier group than the previously selected
5533 * busiest group. %false otherwise.
4577 */ 5534 */
4578static bool update_sd_pick_busiest(struct lb_env *env, 5535static bool update_sd_pick_busiest(struct lb_env *env,
4579 struct sd_lb_stats *sds, 5536 struct sd_lb_stats *sds,
4580 struct sched_group *sg, 5537 struct sched_group *sg,
4581 struct sg_lb_stats *sgs) 5538 struct sg_lb_stats *sgs)
4582{ 5539{
4583 if (sgs->avg_load <= sds->max_load) 5540 if (sgs->avg_load <= sds->busiest_stat.avg_load)
4584 return false; 5541 return false;
4585 5542
4586 if (sgs->sum_nr_running > sgs->group_capacity) 5543 if (sgs->sum_nr_running > sgs->group_capacity)
@@ -4606,18 +5563,46 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4606 return false; 5563 return false;
4607} 5564}
4608 5565
5566#ifdef CONFIG_NUMA_BALANCING
5567static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5568{
5569 if (sgs->sum_nr_running > sgs->nr_numa_running)
5570 return regular;
5571 if (sgs->sum_nr_running > sgs->nr_preferred_running)
5572 return remote;
5573 return all;
5574}
5575
5576static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5577{
5578 if (rq->nr_running > rq->nr_numa_running)
5579 return regular;
5580 if (rq->nr_running > rq->nr_preferred_running)
5581 return remote;
5582 return all;
5583}
5584#else
5585static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5586{
5587 return all;
5588}
5589
5590static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5591{
5592 return regular;
5593}
5594#endif /* CONFIG_NUMA_BALANCING */
5595
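The two classifiers above feed env->fbq_type, which find_busiest_queue() later uses to skip runqueues that hold only well-placed NUMA tasks. A minimal user-space sketch of the same decision, assuming the enum orders regular < remote < all as the later `rt > env->fbq_type` test implies (the classify() helper name is mine, not the kernel's):

	#include <stdio.h>

	enum fbq_type { regular, remote, all };

	static enum fbq_type classify(unsigned int nr_running,
				      unsigned int nr_numa_running,
				      unsigned int nr_preferred_running)
	{
		if (nr_running > nr_numa_running)	/* some non-NUMA tasks present */
			return regular;
		if (nr_running > nr_preferred_running)	/* NUMA tasks on the wrong node */
			return remote;
		return all;				/* everything ideally placed */
	}

	int main(void)
	{
		printf("%d\n", classify(4, 3, 2));	/* 0: regular */
		printf("%d\n", classify(4, 4, 2));	/* 1: remote  */
		printf("%d\n", classify(4, 4, 4));	/* 2: all     */
		return 0;
	}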
4609/** 5596/**
4610 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 5597 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4611 * @env: The load balancing environment. 5598 * @env: The load balancing environment.
4612 * @balance: Should we balance.
4613 * @sds: variable to hold the statistics for this sched_domain. 5599 * @sds: variable to hold the statistics for this sched_domain.
4614 */ 5600 */
4615static inline void update_sd_lb_stats(struct lb_env *env, 5601static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
4616 int *balance, struct sd_lb_stats *sds)
4617{ 5602{
4618 struct sched_domain *child = env->sd->child; 5603 struct sched_domain *child = env->sd->child;
4619 struct sched_group *sg = env->sd->groups; 5604 struct sched_group *sg = env->sd->groups;
4620 struct sg_lb_stats sgs; 5605 struct sg_lb_stats tmp_sgs;
4621 int load_idx, prefer_sibling = 0; 5606 int load_idx, prefer_sibling = 0;
4622 5607
4623 if (child && child->flags & SD_PREFER_SIBLING) 5608 if (child && child->flags & SD_PREFER_SIBLING)
@@ -4626,17 +5611,23 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4626 load_idx = get_sd_load_idx(env->sd, env->idle); 5611 load_idx = get_sd_load_idx(env->sd, env->idle);
4627 5612
4628 do { 5613 do {
5614 struct sg_lb_stats *sgs = &tmp_sgs;
4629 int local_group; 5615 int local_group;
4630 5616
4631 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); 5617 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
4632 memset(&sgs, 0, sizeof(sgs)); 5618 if (local_group) {
4633 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); 5619 sds->local = sg;
5620 sgs = &sds->local_stat;
4634 5621
4635 if (local_group && !(*balance)) 5622 if (env->idle != CPU_NEWLY_IDLE ||
4636 return; 5623 time_after_eq(jiffies, sg->sgp->next_update))
5624 update_group_power(env->sd, env->dst_cpu);
5625 }
4637 5626
4638 sds->total_load += sgs.group_load; 5627 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4639 sds->total_pwr += sg->sgp->power; 5628
5629 if (local_group)
5630 goto next_group;
4640 5631
4641 /* 5632 /*
4642 * In case the child domain prefers tasks go to siblings 5633 * In case the child domain prefers tasks go to siblings
@@ -4648,30 +5639,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4648 * heaviest group when it is already under-utilized (possible 5639 * heaviest group when it is already under-utilized (possible
4649 * with a large weight task outweighs the tasks on the system). 5640 * with a large weight task outweighs the tasks on the system).
4650 */ 5641 */
4651 if (prefer_sibling && !local_group && sds->this_has_capacity) 5642 if (prefer_sibling && sds->local &&
4652 sgs.group_capacity = min(sgs.group_capacity, 1UL); 5643 sds->local_stat.group_has_capacity)
5644 sgs->group_capacity = min(sgs->group_capacity, 1U);
4653 5645
4654 if (local_group) { 5646 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
4655 sds->this_load = sgs.avg_load;
4656 sds->this = sg;
4657 sds->this_nr_running = sgs.sum_nr_running;
4658 sds->this_load_per_task = sgs.sum_weighted_load;
4659 sds->this_has_capacity = sgs.group_has_capacity;
4660 sds->this_idle_cpus = sgs.idle_cpus;
4661 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
4662 sds->max_load = sgs.avg_load;
4663 sds->busiest = sg; 5647 sds->busiest = sg;
4664 sds->busiest_nr_running = sgs.sum_nr_running; 5648 sds->busiest_stat = *sgs;
4665 sds->busiest_idle_cpus = sgs.idle_cpus;
4666 sds->busiest_group_capacity = sgs.group_capacity;
4667 sds->busiest_load_per_task = sgs.sum_weighted_load;
4668 sds->busiest_has_capacity = sgs.group_has_capacity;
4669 sds->busiest_group_weight = sgs.group_weight;
4670 sds->group_imb = sgs.group_imb;
4671 } 5649 }
4672 5650
5651next_group:
5652 /* Now, start updating sd_lb_stats */
5653 sds->total_load += sgs->group_load;
5654 sds->total_pwr += sgs->group_power;
5655
4673 sg = sg->next; 5656 sg = sg->next;
4674 } while (sg != env->sd->groups); 5657 } while (sg != env->sd->groups);
5658
5659 if (env->sd->flags & SD_NUMA)
5660 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
4675} 5661}
4676 5662
4677/** 5663/**
@@ -4691,7 +5677,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 4691 * assuming lower CPU number will be equivalent to a lower SMT thread 5677 * assuming lower CPU number will be equivalent to a lower SMT thread
4692 * number. 5678 * number.
4693 * 5679 *
4694 * Returns 1 when packing is required and a task should be moved to 5680 * Return: 1 when packing is required and a task should be moved to
4695 * this CPU. The amount of the imbalance is returned in *imbalance. 5681 * this CPU. The amount of the imbalance is returned in *imbalance.
4696 * 5682 *
4697 * @env: The load balancing environment. 5683 * @env: The load balancing environment.
@@ -4712,7 +5698,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4712 return 0; 5698 return 0;
4713 5699
4714 env->imbalance = DIV_ROUND_CLOSEST( 5700 env->imbalance = DIV_ROUND_CLOSEST(
4715 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); 5701 sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
5702 SCHED_POWER_SCALE);
4716 5703
4717 return 1; 5704 return 1;
4718} 5705}
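The rewritten imbalance formula simply scales the busiest group's per-power average load back into absolute load using that same group's power. A worked example with invented figures, assuming SCHED_POWER_SCALE is 1024: with busiest_stat.avg_load = 1536 and busiest_stat.group_power = 2048,

	imbalance = DIV_ROUND_CLOSEST(1536 * 2048, 1024) = 3072

which is exactly the unscaled load the busiest group is carrying, since avg_load was originally computed as group_load * 1024 / group_power.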
@@ -4730,24 +5717,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4730 unsigned long tmp, pwr_now = 0, pwr_move = 0; 5717 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4731 unsigned int imbn = 2; 5718 unsigned int imbn = 2;
4732 unsigned long scaled_busy_load_per_task; 5719 unsigned long scaled_busy_load_per_task;
5720 struct sg_lb_stats *local, *busiest;
4733 5721
4734 if (sds->this_nr_running) { 5722 local = &sds->local_stat;
4735 sds->this_load_per_task /= sds->this_nr_running; 5723 busiest = &sds->busiest_stat;
4736 if (sds->busiest_load_per_task > 5724
4737 sds->this_load_per_task) 5725 if (!local->sum_nr_running)
4738 imbn = 1; 5726 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
4739 } else { 5727 else if (busiest->load_per_task > local->load_per_task)
4740 sds->this_load_per_task = 5728 imbn = 1;
4741 cpu_avg_load_per_task(env->dst_cpu);
4742 }
4743 5729
4744 scaled_busy_load_per_task = sds->busiest_load_per_task 5730 scaled_busy_load_per_task =
4745 * SCHED_POWER_SCALE; 5731 (busiest->load_per_task * SCHED_POWER_SCALE) /
4746 scaled_busy_load_per_task /= sds->busiest->sgp->power; 5732 busiest->group_power;
4747 5733
4748 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 5734 if (busiest->avg_load + scaled_busy_load_per_task >=
4749 (scaled_busy_load_per_task * imbn)) { 5735 local->avg_load + (scaled_busy_load_per_task * imbn)) {
4750 env->imbalance = sds->busiest_load_per_task; 5736 env->imbalance = busiest->load_per_task;
4751 return; 5737 return;
4752 } 5738 }
4753 5739
@@ -4757,34 +5743,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4757 * moving them. 5743 * moving them.
4758 */ 5744 */
4759 5745
4760 pwr_now += sds->busiest->sgp->power * 5746 pwr_now += busiest->group_power *
4761 min(sds->busiest_load_per_task, sds->max_load); 5747 min(busiest->load_per_task, busiest->avg_load);
4762 pwr_now += sds->this->sgp->power * 5748 pwr_now += local->group_power *
4763 min(sds->this_load_per_task, sds->this_load); 5749 min(local->load_per_task, local->avg_load);
4764 pwr_now /= SCHED_POWER_SCALE; 5750 pwr_now /= SCHED_POWER_SCALE;
4765 5751
4766 /* Amount of load we'd subtract */ 5752 /* Amount of load we'd subtract */
4767 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 5753 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
4768 sds->busiest->sgp->power; 5754 busiest->group_power;
4769 if (sds->max_load > tmp) 5755 if (busiest->avg_load > tmp) {
4770 pwr_move += sds->busiest->sgp->power * 5756 pwr_move += busiest->group_power *
4771 min(sds->busiest_load_per_task, sds->max_load - tmp); 5757 min(busiest->load_per_task,
5758 busiest->avg_load - tmp);
5759 }
4772 5760
4773 /* Amount of load we'd add */ 5761 /* Amount of load we'd add */
4774 if (sds->max_load * sds->busiest->sgp->power < 5762 if (busiest->avg_load * busiest->group_power <
4775 sds->busiest_load_per_task * SCHED_POWER_SCALE) 5763 busiest->load_per_task * SCHED_POWER_SCALE) {
4776 tmp = (sds->max_load * sds->busiest->sgp->power) / 5764 tmp = (busiest->avg_load * busiest->group_power) /
4777 sds->this->sgp->power; 5765 local->group_power;
4778 else 5766 } else {
4779 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / 5767 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
4780 sds->this->sgp->power; 5768 local->group_power;
4781 pwr_move += sds->this->sgp->power * 5769 }
4782 min(sds->this_load_per_task, sds->this_load + tmp); 5770 pwr_move += local->group_power *
5771 min(local->load_per_task, local->avg_load + tmp);
4783 pwr_move /= SCHED_POWER_SCALE; 5772 pwr_move /= SCHED_POWER_SCALE;
4784 5773
4785 /* Move if we gain throughput */ 5774 /* Move if we gain throughput */
4786 if (pwr_move > pwr_now) 5775 if (pwr_move > pwr_now)
4787 env->imbalance = sds->busiest_load_per_task; 5776 env->imbalance = busiest->load_per_task;
4788} 5777}
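For a feel of when this path actually moves anything, here is a user-space transcription (not kernel code) of the pwr_now/pwr_move bookkeeping above for one invented pair of single-CPU groups; SCHED_POWER_SCALE is assumed to be 1024, and the early "move one whole task" shortcut is skipped because the load gap (256) is smaller than one busiest task (512). With these figures the two totals come out equal, so the imbalance is left at zero and no task is pulled.

	#include <stdio.h>

	#define SCALE 1024UL
	#define MIN(a, b) ((a) < (b) ? (a) : (b))

	int main(void)
	{
		/* busiest: a few small tasks; local: one nice-0 task (all invented) */
		unsigned long b_power = 1024, b_avg = 1280, b_lpt = 512;
		unsigned long l_power = 1024, l_avg = 1024, l_lpt = 1024;
		unsigned long pwr_now = 0, pwr_move = 0, tmp;

		pwr_now += b_power * MIN(b_lpt, b_avg);
		pwr_now += l_power * MIN(l_lpt, l_avg);
		pwr_now /= SCALE;

		tmp = b_lpt * SCALE / b_power;			/* load we'd subtract */
		if (b_avg > tmp)
			pwr_move += b_power * MIN(b_lpt, b_avg - tmp);

		if (b_avg * b_power < b_lpt * SCALE)		/* load we'd add */
			tmp = b_avg * b_power / l_power;
		else
			tmp = b_lpt * SCALE / l_power;
		pwr_move += l_power * MIN(l_lpt, l_avg + tmp);
		pwr_move /= SCALE;

		/* prints pwr_now=1536 pwr_move=1536: no throughput gain, no move */
		printf("pwr_now=%lu pwr_move=%lu\n", pwr_now, pwr_move);
		return 0;
	}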
4789 5778
4790/** 5779/**
@@ -4796,11 +5785,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4796static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 5785static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4797{ 5786{
4798 unsigned long max_pull, load_above_capacity = ~0UL; 5787 unsigned long max_pull, load_above_capacity = ~0UL;
5788 struct sg_lb_stats *local, *busiest;
5789
5790 local = &sds->local_stat;
5791 busiest = &sds->busiest_stat;
4799 5792
4800 sds->busiest_load_per_task /= sds->busiest_nr_running; 5793 if (busiest->group_imb) {
4801 if (sds->group_imb) { 5794 /*
4802 sds->busiest_load_per_task = 5795 * In the group_imb case we cannot rely on group-wide averages
4803 min(sds->busiest_load_per_task, sds->avg_load); 5796 * to ensure cpu-load equilibrium, look at wider averages. XXX
5797 */
5798 busiest->load_per_task =
5799 min(busiest->load_per_task, sds->avg_load);
4804 } 5800 }
4805 5801
4806 /* 5802 /*
@@ -4808,21 +5804,23 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4808 * max load less than avg load(as we skip the groups at or below 5804 * max load less than avg load(as we skip the groups at or below
4809 * its cpu_power, while calculating max_load..) 5805 * its cpu_power, while calculating max_load..)
4810 */ 5806 */
4811 if (sds->max_load < sds->avg_load) { 5807 if (busiest->avg_load <= sds->avg_load ||
5808 local->avg_load >= sds->avg_load) {
4812 env->imbalance = 0; 5809 env->imbalance = 0;
4813 return fix_small_imbalance(env, sds); 5810 return fix_small_imbalance(env, sds);
4814 } 5811 }
4815 5812
4816 if (!sds->group_imb) { 5813 if (!busiest->group_imb) {
4817 /* 5814 /*
4818 * Don't want to pull so many tasks that a group would go idle. 5815 * Don't want to pull so many tasks that a group would go idle.
5816 * Except of course for the group_imb case, since then we might
5817 * have to drop below capacity to reach cpu-load equilibrium.
4819 */ 5818 */
4820 load_above_capacity = (sds->busiest_nr_running - 5819 load_above_capacity =
4821 sds->busiest_group_capacity); 5820 (busiest->sum_nr_running - busiest->group_capacity);
4822 5821
4823 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); 5822 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4824 5823 load_above_capacity /= busiest->group_power;
4825 load_above_capacity /= sds->busiest->sgp->power;
4826 } 5824 }
4827 5825
4828 /* 5826 /*
@@ -4832,15 +5830,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4832 * we also don't want to reduce the group load below the group capacity 5830 * we also don't want to reduce the group load below the group capacity
4833 * (so that we can implement power-savings policies etc). Thus we look 5831 * (so that we can implement power-savings policies etc). Thus we look
4834 * for the minimum possible imbalance. 5832 * for the minimum possible imbalance.
4835 * Be careful of negative numbers as they'll appear as very large values
4836 * with unsigned longs.
4837 */ 5833 */
4838 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 5834 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
4839 5835
4840 /* How much load to actually move to equalise the imbalance */ 5836 /* How much load to actually move to equalise the imbalance */
4841 env->imbalance = min(max_pull * sds->busiest->sgp->power, 5837 env->imbalance = min(
4842 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 5838 max_pull * busiest->group_power,
4843 / SCHED_POWER_SCALE; 5839 (sds->avg_load - local->avg_load) * local->group_power
5840 ) / SCHED_POWER_SCALE;
4844 5841
4845 /* 5842 /*
4846 * if *imbalance is less than the average load per runnable task 5843 * if *imbalance is less than the average load per runnable task
@@ -4848,9 +5845,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4848 * a think about bumping its value to force at least one task to be 5845 * a think about bumping its value to force at least one task to be
4849 * moved 5846 * moved
4850 */ 5847 */
4851 if (env->imbalance < sds->busiest_load_per_task) 5848 if (env->imbalance < busiest->load_per_task)
4852 return fix_small_imbalance(env, sds); 5849 return fix_small_imbalance(env, sds);
4853
4854} 5850}
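A worked example of the rewritten calculation (invented numbers; both SCHED_LOAD_SCALE and SCHED_POWER_SCALE assumed to be 1024): suppose sds.avg_load = 1024, busiest->avg_load = 1536, local->avg_load = 512, both group powers are 1024, and the busiest group runs 3 tasks against a capacity of 1. Then

	load_above_capacity = (3 - 1) * 1024 * 1024 / 1024                = 2048
	max_pull            = min(1536 - 1024, 2048)                      = 512
	env->imbalance      = min(512 * 1024, (1024 - 512) * 1024) / 1024 = 512

so at most half a CPU's worth of load is pulled, moving both sides toward the domain average rather than overshooting it.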
4855 5851
4856/******* find_busiest_group() helpers end here *********************/ 5852/******* find_busiest_group() helpers end here *********************/
@@ -4866,69 +5862,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4866 * to restore balance. 5862 * to restore balance.
4867 * 5863 *
4868 * @env: The load balancing environment. 5864 * @env: The load balancing environment.
4869 * @balance: Pointer to a variable indicating if this_cpu
4870 * is the appropriate cpu to perform load balancing at this_level.
4871 * 5865 *
4872 * Returns: - the busiest group if imbalance exists. 5866 * Return: - The busiest group if imbalance exists.
4873 * - If no imbalance and user has opted for power-savings balance, 5867 * - If no imbalance and user has opted for power-savings balance,
4874 * return the least loaded group whose CPUs can be 5868 * return the least loaded group whose CPUs can be
4875 * put to idle by rebalancing its tasks onto our group. 5869 * put to idle by rebalancing its tasks onto our group.
4876 */ 5870 */
4877static struct sched_group * 5871static struct sched_group *find_busiest_group(struct lb_env *env)
4878find_busiest_group(struct lb_env *env, int *balance)
4879{ 5872{
5873 struct sg_lb_stats *local, *busiest;
4880 struct sd_lb_stats sds; 5874 struct sd_lb_stats sds;
4881 5875
4882 memset(&sds, 0, sizeof(sds)); 5876 init_sd_lb_stats(&sds);
4883 5877
4884 /* 5878 /*
 4885 * Compute the various statistics relevant for load balancing at 5879 * Compute the various statistics relevant for load balancing at
4886 * this level. 5880 * this level.
4887 */ 5881 */
4888 update_sd_lb_stats(env, balance, &sds); 5882 update_sd_lb_stats(env, &sds);
4889 5883 local = &sds.local_stat;
4890 /* 5884 busiest = &sds.busiest_stat;
4891 * this_cpu is not the appropriate cpu to perform load balancing at
4892 * this level.
4893 */
4894 if (!(*balance))
4895 goto ret;
4896 5885
4897 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && 5886 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4898 check_asym_packing(env, &sds)) 5887 check_asym_packing(env, &sds))
4899 return sds.busiest; 5888 return sds.busiest;
4900 5889
4901 /* There is no busy sibling group to pull tasks from */ 5890 /* There is no busy sibling group to pull tasks from */
4902 if (!sds.busiest || sds.busiest_nr_running == 0) 5891 if (!sds.busiest || busiest->sum_nr_running == 0)
4903 goto out_balanced; 5892 goto out_balanced;
4904 5893
4905 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; 5894 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4906 5895
4907 /* 5896 /*
4908 * If the busiest group is imbalanced the below checks don't 5897 * If the busiest group is imbalanced the below checks don't
4909 * work because they assumes all things are equal, which typically 5898 * work because they assume all things are equal, which typically
4910 * isn't true due to cpus_allowed constraints and the like. 5899 * isn't true due to cpus_allowed constraints and the like.
4911 */ 5900 */
4912 if (sds.group_imb) 5901 if (busiest->group_imb)
4913 goto force_balance; 5902 goto force_balance;
4914 5903
4915 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 5904 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4916 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 5905 if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
4917 !sds.busiest_has_capacity) 5906 !busiest->group_has_capacity)
4918 goto force_balance; 5907 goto force_balance;
4919 5908
4920 /* 5909 /*
4921 * If the local group is more busy than the selected busiest group 5910 * If the local group is more busy than the selected busiest group
4922 * don't try and pull any tasks. 5911 * don't try and pull any tasks.
4923 */ 5912 */
4924 if (sds.this_load >= sds.max_load) 5913 if (local->avg_load >= busiest->avg_load)
4925 goto out_balanced; 5914 goto out_balanced;
4926 5915
4927 /* 5916 /*
4928 * Don't pull any tasks if this group is already above the domain 5917 * Don't pull any tasks if this group is already above the domain
4929 * average load. 5918 * average load.
4930 */ 5919 */
4931 if (sds.this_load >= sds.avg_load) 5920 if (local->avg_load >= sds.avg_load)
4932 goto out_balanced; 5921 goto out_balanced;
4933 5922
4934 if (env->idle == CPU_IDLE) { 5923 if (env->idle == CPU_IDLE) {
@@ -4938,15 +5927,16 @@ find_busiest_group(struct lb_env *env, int *balance)
4938 * there is no imbalance between this and busiest group 5927 * there is no imbalance between this and busiest group
4939 * wrt to idle cpu's, it is balanced. 5928 * wrt to idle cpu's, it is balanced.
4940 */ 5929 */
4941 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 5930 if ((local->idle_cpus < busiest->idle_cpus) &&
4942 sds.busiest_nr_running <= sds.busiest_group_weight) 5931 busiest->sum_nr_running <= busiest->group_weight)
4943 goto out_balanced; 5932 goto out_balanced;
4944 } else { 5933 } else {
4945 /* 5934 /*
4946 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 5935 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4947 * imbalance_pct to be conservative. 5936 * imbalance_pct to be conservative.
4948 */ 5937 */
4949 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) 5938 if (100 * busiest->avg_load <=
5939 env->sd->imbalance_pct * local->avg_load)
4950 goto out_balanced; 5940 goto out_balanced;
4951 } 5941 }
4952 5942
@@ -4956,7 +5946,6 @@ force_balance:
4956 return sds.busiest; 5946 return sds.busiest;
4957 5947
4958out_balanced: 5948out_balanced:
4959ret:
4960 env->imbalance = 0; 5949 env->imbalance = 0;
4961 return NULL; 5950 return NULL;
4962} 5951}
@@ -4968,22 +5957,43 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4968 struct sched_group *group) 5957 struct sched_group *group)
4969{ 5958{
4970 struct rq *busiest = NULL, *rq; 5959 struct rq *busiest = NULL, *rq;
4971 unsigned long max_load = 0; 5960 unsigned long busiest_load = 0, busiest_power = 1;
4972 int i; 5961 int i;
4973 5962
4974 for_each_cpu(i, sched_group_cpus(group)) { 5963 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4975 unsigned long power = power_of(i); 5964 unsigned long power, capacity, wl;
4976 unsigned long capacity = DIV_ROUND_CLOSEST(power, 5965 enum fbq_type rt;
4977 SCHED_POWER_SCALE);
4978 unsigned long wl;
4979 5966
4980 if (!capacity) 5967 rq = cpu_rq(i);
4981 capacity = fix_small_capacity(env->sd, group); 5968 rt = fbq_classify_rq(rq);
4982 5969
4983 if (!cpumask_test_cpu(i, env->cpus)) 5970 /*
5971 * We classify groups/runqueues into three groups:
5972 * - regular: there are !numa tasks
5973 * - remote: there are numa tasks that run on the 'wrong' node
5974 * - all: there is no distinction
5975 *
5976 * In order to avoid migrating ideally placed numa tasks,
 5977 * ignore those when there are better options.
5978 *
5979 * If we ignore the actual busiest queue to migrate another
5980 * task, the next balance pass can still reduce the busiest
5981 * queue by moving tasks around inside the node.
5982 *
5983 * If we cannot move enough load due to this classification
5984 * the next pass will adjust the group classification and
5985 * allow migration of more tasks.
5986 *
5987 * Both cases only affect the total convergence complexity.
5988 */
5989 if (rt > env->fbq_type)
4984 continue; 5990 continue;
4985 5991
4986 rq = cpu_rq(i); 5992 power = power_of(i);
5993 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
5994 if (!capacity)
5995 capacity = fix_small_capacity(env->sd, group);
5996
4987 wl = weighted_cpuload(i); 5997 wl = weighted_cpuload(i);
4988 5998
4989 /* 5999 /*
@@ -4998,11 +6008,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4998 * the weighted_cpuload() scaled with the cpu power, so that 6008 * the weighted_cpuload() scaled with the cpu power, so that
4999 * the load can be moved away from the cpu that is potentially 6009 * the load can be moved away from the cpu that is potentially
5000 * running at a lower capacity. 6010 * running at a lower capacity.
6011 *
6012 * Thus we're looking for max(wl_i / power_i), crosswise
6013 * multiplication to rid ourselves of the division works out
6014 * to: wl_i * power_j > wl_j * power_i; where j is our
6015 * previous maximum.
5001 */ 6016 */
5002 wl = (wl * SCHED_POWER_SCALE) / power; 6017 if (wl * busiest_power > busiest_load * power) {
5003 6018 busiest_load = wl;
5004 if (wl > max_load) { 6019 busiest_power = power;
5005 max_load = wl;
5006 busiest = rq; 6020 busiest = rq;
5007 } 6021 }
5008 } 6022 }
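A quick check of the cross-multiplied comparison with invented figures: a candidate CPU with wl = 2048 and power = 2048 (ratio 1.0) against a current maximum with wl = 1536 and power = 1024 (ratio 1.5) gives 2048 * 1024 = 2,097,152 versus 1536 * 2048 = 3,145,728, so the candidate is correctly rejected without performing any division, matching 1.0 < 1.5.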
@@ -5039,15 +6053,50 @@ static int need_active_balance(struct lb_env *env)
5039 6053
5040static int active_load_balance_cpu_stop(void *data); 6054static int active_load_balance_cpu_stop(void *data);
5041 6055
6056static int should_we_balance(struct lb_env *env)
6057{
6058 struct sched_group *sg = env->sd->groups;
6059 struct cpumask *sg_cpus, *sg_mask;
6060 int cpu, balance_cpu = -1;
6061
6062 /*
6063 * In the newly idle case, we will allow all the cpu's
6064 * to do the newly idle load balance.
6065 */
6066 if (env->idle == CPU_NEWLY_IDLE)
6067 return 1;
6068
6069 sg_cpus = sched_group_cpus(sg);
6070 sg_mask = sched_group_mask(sg);
6071 /* Try to find first idle cpu */
6072 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
6073 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
6074 continue;
6075
6076 balance_cpu = cpu;
6077 break;
6078 }
6079
6080 if (balance_cpu == -1)
6081 balance_cpu = group_balance_cpu(sg);
6082
6083 /*
6084 * First idle cpu or the first cpu(busiest) in this sched group
6085 * is eligible for doing load balancing at this and above domains.
6086 */
6087 return balance_cpu == env->dst_cpu;
6088}
6089
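A concrete illustration of the election above (hypothetical topology): take a sched group spanning CPUs 0-3, all present in env->cpus and in the group's balance mask, with CPUs 1 and 2 idle. CPU 1 is the first idle CPU found, so should_we_balance() returns true only when env->dst_cpu == 1; every other CPU bails out of load_balance() early and reports *continue_balancing = 0 to its caller. In the CPU_NEWLY_IDLE case the election is skipped entirely and every newly idle CPU is allowed to balance.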
5042/* 6090/*
5043 * Check this_cpu to ensure it is balanced within domain. Attempt to move 6091 * Check this_cpu to ensure it is balanced within domain. Attempt to move
5044 * tasks if there is an imbalance. 6092 * tasks if there is an imbalance.
5045 */ 6093 */
5046static int load_balance(int this_cpu, struct rq *this_rq, 6094static int load_balance(int this_cpu, struct rq *this_rq,
5047 struct sched_domain *sd, enum cpu_idle_type idle, 6095 struct sched_domain *sd, enum cpu_idle_type idle,
5048 int *balance) 6096 int *continue_balancing)
5049{ 6097{
5050 int ld_moved, cur_ld_moved, active_balance = 0; 6098 int ld_moved, cur_ld_moved, active_balance = 0;
6099 struct sched_domain *sd_parent = sd->parent;
5051 struct sched_group *group; 6100 struct sched_group *group;
5052 struct rq *busiest; 6101 struct rq *busiest;
5053 unsigned long flags; 6102 unsigned long flags;
@@ -5061,6 +6110,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5061 .idle = idle, 6110 .idle = idle,
5062 .loop_break = sched_nr_migrate_break, 6111 .loop_break = sched_nr_migrate_break,
5063 .cpus = cpus, 6112 .cpus = cpus,
6113 .fbq_type = all,
5064 }; 6114 };
5065 6115
5066 /* 6116 /*
@@ -5075,11 +6125,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5075 schedstat_inc(sd, lb_count[idle]); 6125 schedstat_inc(sd, lb_count[idle]);
5076 6126
5077redo: 6127redo:
5078 group = find_busiest_group(&env, balance); 6128 if (!should_we_balance(&env)) {
5079 6129 *continue_balancing = 0;
5080 if (*balance == 0)
5081 goto out_balanced; 6130 goto out_balanced;
6131 }
5082 6132
6133 group = find_busiest_group(&env);
5083 if (!group) { 6134 if (!group) {
5084 schedstat_inc(sd, lb_nobusyg[idle]); 6135 schedstat_inc(sd, lb_nobusyg[idle]);
5085 goto out_balanced; 6136 goto out_balanced;
@@ -5108,7 +6159,6 @@ redo:
5108 env.src_rq = busiest; 6159 env.src_rq = busiest;
5109 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 6160 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
5110 6161
5111 update_h_load(env.src_cpu);
5112more_balance: 6162more_balance:
5113 local_irq_save(flags); 6163 local_irq_save(flags);
5114 double_rq_lock(env.dst_rq, busiest); 6164 double_rq_lock(env.dst_rq, busiest);
@@ -5152,17 +6202,17 @@ more_balance:
5152 * moreover subsequent load balance cycles should correct the 6202 * moreover subsequent load balance cycles should correct the
5153 * excess load moved. 6203 * excess load moved.
5154 */ 6204 */
5155 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6205 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6206
6207 /* Prevent to re-select dst_cpu via env's cpus */
6208 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5156 6209
5157 env.dst_rq = cpu_rq(env.new_dst_cpu); 6210 env.dst_rq = cpu_rq(env.new_dst_cpu);
5158 env.dst_cpu = env.new_dst_cpu; 6211 env.dst_cpu = env.new_dst_cpu;
5159 env.flags &= ~LBF_SOME_PINNED; 6212 env.flags &= ~LBF_DST_PINNED;
5160 env.loop = 0; 6213 env.loop = 0;
5161 env.loop_break = sched_nr_migrate_break; 6214 env.loop_break = sched_nr_migrate_break;
5162 6215
5163 /* Prevent to re-select dst_cpu via env's cpus */
5164 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5165
5166 /* 6216 /*
5167 * Go back to "more_balance" rather than "redo" since we 6217 * Go back to "more_balance" rather than "redo" since we
5168 * need to continue with same src_cpu. 6218 * need to continue with same src_cpu.
@@ -5170,6 +6220,18 @@ more_balance:
5170 goto more_balance; 6220 goto more_balance;
5171 } 6221 }
5172 6222
6223 /*
6224 * We failed to reach balance because of affinity.
6225 */
6226 if (sd_parent) {
6227 int *group_imbalance = &sd_parent->groups->sgp->imbalance;
6228
6229 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6230 *group_imbalance = 1;
6231 } else if (*group_imbalance)
6232 *group_imbalance = 0;
6233 }
6234
5173 /* All tasks on this runqueue were pinned by CPU affinity */ 6235 /* All tasks on this runqueue were pinned by CPU affinity */
5174 if (unlikely(env.flags & LBF_ALL_PINNED)) { 6236 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5175 cpumask_clear_cpu(cpu_of(busiest), cpus); 6237 cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5277,6 +6339,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5277 struct sched_domain *sd; 6339 struct sched_domain *sd;
5278 int pulled_task = 0; 6340 int pulled_task = 0;
5279 unsigned long next_balance = jiffies + HZ; 6341 unsigned long next_balance = jiffies + HZ;
6342 u64 curr_cost = 0;
5280 6343
5281 this_rq->idle_stamp = rq_clock(this_rq); 6344 this_rq->idle_stamp = rq_clock(this_rq);
5282 6345
@@ -5292,15 +6355,28 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5292 rcu_read_lock(); 6355 rcu_read_lock();
5293 for_each_domain(this_cpu, sd) { 6356 for_each_domain(this_cpu, sd) {
5294 unsigned long interval; 6357 unsigned long interval;
5295 int balance = 1; 6358 int continue_balancing = 1;
6359 u64 t0, domain_cost;
5296 6360
5297 if (!(sd->flags & SD_LOAD_BALANCE)) 6361 if (!(sd->flags & SD_LOAD_BALANCE))
5298 continue; 6362 continue;
5299 6363
6364 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
6365 break;
6366
5300 if (sd->flags & SD_BALANCE_NEWIDLE) { 6367 if (sd->flags & SD_BALANCE_NEWIDLE) {
6368 t0 = sched_clock_cpu(this_cpu);
6369
5301 /* If we've pulled tasks over stop searching: */ 6370 /* If we've pulled tasks over stop searching: */
5302 pulled_task = load_balance(this_cpu, this_rq, 6371 pulled_task = load_balance(this_cpu, this_rq,
5303 sd, CPU_NEWLY_IDLE, &balance); 6372 sd, CPU_NEWLY_IDLE,
6373 &continue_balancing);
6374
6375 domain_cost = sched_clock_cpu(this_cpu) - t0;
6376 if (domain_cost > sd->max_newidle_lb_cost)
6377 sd->max_newidle_lb_cost = domain_cost;
6378
6379 curr_cost += domain_cost;
5304 } 6380 }
5305 6381
5306 interval = msecs_to_jiffies(sd->balance_interval); 6382 interval = msecs_to_jiffies(sd->balance_interval);
@@ -5322,6 +6398,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5322 */ 6398 */
5323 this_rq->next_balance = next_balance; 6399 this_rq->next_balance = next_balance;
5324 } 6400 }
6401
6402 if (curr_cost > this_rq->max_idle_balance_cost)
6403 this_rq->max_idle_balance_cost = curr_cost;
5325} 6404}
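For instance (invented figures): with this_rq->avg_idle at 500,000 ns, a first domain whose max_newidle_lb_cost is 120,000 ns is still attempted; if that pass ends up costing 110,000 ns, curr_cost becomes 110,000 and a second domain with max_newidle_lb_cost of 450,000 ns is skipped, since 110,000 + 450,000 exceeds the 500,000 ns the CPU is expected to stay idle. The measured costs also ratchet each domain's max_newidle_lb_cost and, at the end, rq->max_idle_balance_cost upward.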
5326 6405
5327/* 6406/*
@@ -5455,16 +6534,16 @@ static inline void nohz_balance_exit_idle(int cpu)
5455static inline void set_cpu_sd_state_busy(void) 6534static inline void set_cpu_sd_state_busy(void)
5456{ 6535{
5457 struct sched_domain *sd; 6536 struct sched_domain *sd;
6537 int cpu = smp_processor_id();
5458 6538
5459 rcu_read_lock(); 6539 rcu_read_lock();
5460 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6540 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5461 6541
5462 if (!sd || !sd->nohz_idle) 6542 if (!sd || !sd->nohz_idle)
5463 goto unlock; 6543 goto unlock;
5464 sd->nohz_idle = 0; 6544 sd->nohz_idle = 0;
5465 6545
5466 for (; sd; sd = sd->parent) 6546 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5467 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5468unlock: 6547unlock:
5469 rcu_read_unlock(); 6548 rcu_read_unlock();
5470} 6549}
@@ -5472,16 +6551,16 @@ unlock:
5472void set_cpu_sd_state_idle(void) 6551void set_cpu_sd_state_idle(void)
5473{ 6552{
5474 struct sched_domain *sd; 6553 struct sched_domain *sd;
6554 int cpu = smp_processor_id();
5475 6555
5476 rcu_read_lock(); 6556 rcu_read_lock();
5477 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6557 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5478 6558
5479 if (!sd || sd->nohz_idle) 6559 if (!sd || sd->nohz_idle)
5480 goto unlock; 6560 goto unlock;
5481 sd->nohz_idle = 1; 6561 sd->nohz_idle = 1;
5482 6562
5483 for (; sd; sd = sd->parent) 6563 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5484 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5485unlock: 6564unlock:
5486 rcu_read_unlock(); 6565 rcu_read_unlock();
5487} 6566}
@@ -5538,22 +6617,46 @@ void update_max_interval(void)
5538 */ 6617 */
5539static void rebalance_domains(int cpu, enum cpu_idle_type idle) 6618static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5540{ 6619{
5541 int balance = 1; 6620 int continue_balancing = 1;
5542 struct rq *rq = cpu_rq(cpu); 6621 struct rq *rq = cpu_rq(cpu);
5543 unsigned long interval; 6622 unsigned long interval;
5544 struct sched_domain *sd; 6623 struct sched_domain *sd;
5545 /* Earliest time when we have to do rebalance again */ 6624 /* Earliest time when we have to do rebalance again */
5546 unsigned long next_balance = jiffies + 60*HZ; 6625 unsigned long next_balance = jiffies + 60*HZ;
5547 int update_next_balance = 0; 6626 int update_next_balance = 0;
5548 int need_serialize; 6627 int need_serialize, need_decay = 0;
6628 u64 max_cost = 0;
5549 6629
5550 update_blocked_averages(cpu); 6630 update_blocked_averages(cpu);
5551 6631
5552 rcu_read_lock(); 6632 rcu_read_lock();
5553 for_each_domain(cpu, sd) { 6633 for_each_domain(cpu, sd) {
6634 /*
6635 * Decay the newidle max times here because this is a regular
6636 * visit to all the domains. Decay ~1% per second.
6637 */
6638 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
6639 sd->max_newidle_lb_cost =
6640 (sd->max_newidle_lb_cost * 253) / 256;
6641 sd->next_decay_max_lb_cost = jiffies + HZ;
6642 need_decay = 1;
6643 }
6644 max_cost += sd->max_newidle_lb_cost;
6645
5554 if (!(sd->flags & SD_LOAD_BALANCE)) 6646 if (!(sd->flags & SD_LOAD_BALANCE))
5555 continue; 6647 continue;
5556 6648
6649 /*
6650 * Stop the load balance at this level. There is another
6651 * CPU in our sched group which is doing load balancing more
6652 * actively.
6653 */
6654 if (!continue_balancing) {
6655 if (need_decay)
6656 continue;
6657 break;
6658 }
6659
5557 interval = sd->balance_interval; 6660 interval = sd->balance_interval;
5558 if (idle != CPU_IDLE) 6661 if (idle != CPU_IDLE)
5559 interval *= sd->busy_factor; 6662 interval *= sd->busy_factor;
@@ -5570,9 +6673,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5570 } 6673 }
5571 6674
5572 if (time_after_eq(jiffies, sd->last_balance + interval)) { 6675 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5573 if (load_balance(cpu, rq, sd, idle, &balance)) { 6676 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5574 /* 6677 /*
5575 * The LBF_SOME_PINNED logic could have changed 6678 * The LBF_DST_PINNED logic could have changed
5576 * env->dst_cpu, so we can't know our idle 6679 * env->dst_cpu, so we can't know our idle
5577 * state even if we migrated tasks. Update it. 6680 * state even if we migrated tasks. Update it.
5578 */ 6681 */
@@ -5587,14 +6690,14 @@ out:
5587 next_balance = sd->last_balance + interval; 6690 next_balance = sd->last_balance + interval;
5588 update_next_balance = 1; 6691 update_next_balance = 1;
5589 } 6692 }
5590 6693 }
6694 if (need_decay) {
5591 /* 6695 /*
5592 * Stop the load balance at this level. There is another 6696 * Ensure the rq-wide value also decays but keep it at a
5593 * CPU in our sched group which is doing load balancing more 6697 * reasonable floor to avoid funnies with rq->avg_idle.
5594 * actively.
5595 */ 6698 */
5596 if (!balance) 6699 rq->max_idle_balance_cost =
5597 break; 6700 max((u64)sysctl_sched_migration_cost, max_cost);
5598 } 6701 }
5599 rcu_read_unlock(); 6702 rcu_read_unlock();
5600 6703
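The 253/256 factor works out to a decay of about 1.17% per step, applied at most once per second, which is what the "~1% per second" remark above means in practice. A tiny standalone check (not kernel code):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long cost = 1000000;	/* ns, e.g. a 1 ms newidle pass */
		int sec;

		for (sec = 0; sec < 50; sec++)
			cost = cost * 253 / 256;	/* one decay step per second */

		/* roughly 55% of the original cost remains after 50 seconds */
		printf("%llu\n", cost);
		return 0;
	}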
@@ -5664,6 +6767,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5664{ 6767{
5665 unsigned long now = jiffies; 6768 unsigned long now = jiffies;
5666 struct sched_domain *sd; 6769 struct sched_domain *sd;
6770 struct sched_group_power *sgp;
6771 int nr_busy;
5667 6772
5668 if (unlikely(idle_cpu(cpu))) 6773 if (unlikely(idle_cpu(cpu)))
5669 return 0; 6774 return 0;
@@ -5689,22 +6794,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5689 goto need_kick; 6794 goto need_kick;
5690 6795
5691 rcu_read_lock(); 6796 rcu_read_lock();
5692 for_each_domain(cpu, sd) { 6797 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5693 struct sched_group *sg = sd->groups;
5694 struct sched_group_power *sgp = sg->sgp;
5695 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5696 6798
5697 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) 6799 if (sd) {
5698 goto need_kick_unlock; 6800 sgp = sd->groups->sgp;
6801 nr_busy = atomic_read(&sgp->nr_busy_cpus);
5699 6802
5700 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight 6803 if (nr_busy > 1)
5701 && (cpumask_first_and(nohz.idle_cpus_mask,
5702 sched_domain_span(sd)) < cpu))
5703 goto need_kick_unlock; 6804 goto need_kick_unlock;
5704
5705 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5706 break;
5707 } 6805 }
6806
6807 sd = rcu_dereference(per_cpu(sd_asym, cpu));
6808
6809 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
6810 sched_domain_span(sd)) < cpu))
6811 goto need_kick_unlock;
6812
5708 rcu_read_unlock(); 6813 rcu_read_unlock();
5709 return 0; 6814 return 0;
5710 6815
@@ -5812,11 +6917,15 @@ static void task_fork_fair(struct task_struct *p)
5812 cfs_rq = task_cfs_rq(current); 6917 cfs_rq = task_cfs_rq(current);
5813 curr = cfs_rq->curr; 6918 curr = cfs_rq->curr;
5814 6919
5815 if (unlikely(task_cpu(p) != this_cpu)) { 6920 /*
5816 rcu_read_lock(); 6921 * Not only the cpu but also the task_group of the parent might have
5817 __set_task_cpu(p, this_cpu); 6922 * been changed after parent->se.parent,cfs_rq were copied to
5818 rcu_read_unlock(); 6923 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
5819 } 6924 * of child point to valid ones.
6925 */
6926 rcu_read_lock();
6927 __set_task_cpu(p, this_cpu);
6928 rcu_read_unlock();
5820 6929
5821 update_curr(cfs_rq); 6930 update_curr(cfs_rq);
5822 6931
@@ -5889,11 +6998,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5889 * and ensure we don't carry in an old decay_count if we 6998 * and ensure we don't carry in an old decay_count if we
5890 * switch back. 6999 * switch back.
5891 */ 7000 */
5892 if (p->se.avg.decay_count) { 7001 if (se->avg.decay_count) {
5893 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); 7002 __synchronize_entity_decay(se);
5894 __synchronize_entity_decay(&p->se); 7003 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
5895 subtract_blocked_load_contrib(cfs_rq,
5896 p->se.avg.load_avg_contrib);
5897 } 7004 }
5898#endif 7005#endif
5899} 7006}
@@ -6095,7 +7202,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6095 se->cfs_rq = parent->my_q; 7202 se->cfs_rq = parent->my_q;
6096 7203
6097 se->my_q = cfs_rq; 7204 se->my_q = cfs_rq;
6098 update_load_set(&se->load, 0); 7205 /* guarantee group entities always have weight */
7206 update_load_set(&se->load, NICE_0_LOAD);
6099 se->parent = parent; 7207 se->parent = parent;
6100} 7208}
6101 7209
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 99399f8e4799..5716929a2e3a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)
63/* 63/*
64 * Apply the automatic NUMA scheduling policy. Enabled automatically 64 * Apply the automatic NUMA scheduling policy. Enabled automatically
65 * at runtime if running on a NUMA machine. Can be controlled via 65 * at runtime if running on a NUMA machine. Can be controlled via
66 * numa_balancing=. Allow PTE scanning to be forced on UMA machines 66 * numa_balancing=
67 * for debugging the core machinery.
68 */ 67 */
69#ifdef CONFIG_NUMA_BALANCING 68#ifdef CONFIG_NUMA_BALANCING
70SCHED_FEAT(NUMA, false) 69SCHED_FEAT(NUMA, false)
71SCHED_FEAT(NUMA_FORCE, false) 70
71/*
72 * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
73 * higher number of hinting faults are recorded during active load
74 * balancing.
75 */
76SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
77
78/*
79 * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
80 * lower number of hinting faults have been recorded. As this has
81 * the potential to prevent a task ever migrating to a new node
82 * due to CPU overload it is disabled by default.
83 */
84SCHED_FEAT(NUMA_RESIST_LOWER, false)
72#endif 85#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d8da01008d39..516c3d9ceea1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 12select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks as never migrated */ 14 return task_cpu(p); /* IDLE tasks as never migrated */
15} 15}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 01970c8e64df..7d57275fc396 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq)
246 * if we should look at the mask. It would be a shame 246 * if we should look at the mask. It would be a shame
247 * if we looked at the mask, but the mask was not 247 * if we looked at the mask, but the mask was not
248 * updated yet. 248 * updated yet.
249 *
250 * Matched by the barrier in pull_rt_task().
249 */ 251 */
250 wmb(); 252 smp_wmb();
251 atomic_inc(&rq->rd->rto_count); 253 atomic_inc(&rq->rd->rto_count);
252} 254}
253 255
@@ -1169,13 +1171,10 @@ static void yield_task_rt(struct rq *rq)
1169static int find_lowest_rq(struct task_struct *task); 1171static int find_lowest_rq(struct task_struct *task);
1170 1172
1171static int 1173static int
1172select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 1174select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1173{ 1175{
1174 struct task_struct *curr; 1176 struct task_struct *curr;
1175 struct rq *rq; 1177 struct rq *rq;
1176 int cpu;
1177
1178 cpu = task_cpu(p);
1179 1178
1180 if (p->nr_cpus_allowed == 1) 1179 if (p->nr_cpus_allowed == 1)
1181 goto out; 1180 goto out;
@@ -1213,8 +1212,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1213 */ 1212 */
1214 if (curr && unlikely(rt_task(curr)) && 1213 if (curr && unlikely(rt_task(curr)) &&
1215 (curr->nr_cpus_allowed < 2 || 1214 (curr->nr_cpus_allowed < 2 ||
1216 curr->prio <= p->prio) && 1215 curr->prio <= p->prio)) {
1217 (p->nr_cpus_allowed > 1)) {
1218 int target = find_lowest_rq(p); 1216 int target = find_lowest_rq(p);
1219 1217
1220 if (target != -1) 1218 if (target != -1)
@@ -1630,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq)
1630 if (likely(!rt_overloaded(this_rq))) 1628 if (likely(!rt_overloaded(this_rq)))
1631 return 0; 1629 return 0;
1632 1630
1631 /*
1632 * Match the barrier from rt_set_overloaded; this guarantees that if we
1633 * see overloaded we must also see the rto_mask bit.
1634 */
1635 smp_rmb();
1636
1633 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1637 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1634 if (this_cpu == cpu) 1638 if (this_cpu == cpu)
1635 continue; 1639 continue;
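Putting the two new comments side by side, the pairing they describe looks like this (illustrative sketch condensed from the surrounding code, not a verbatim quote):

	writer: rt_set_overload()           reader: pull_rt_task()
	  cpumask_set_cpu(cpu, rto_mask)      if (!rt_overloaded(this_rq))   /* reads rto_count */
	  smp_wmb()                                   return 0;
	  atomic_inc(&rto_count)              smp_rmb();
	                                      for_each_cpu(cpu, rto_mask) ...

If the reader observes the incremented rto_count, the smp_rmb()/smp_wmb() pair guarantees it also observes the rto_mask bit that was set before the increment.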
@@ -1931,8 +1935,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1931 p->rt.time_slice = sched_rr_timeslice; 1935 p->rt.time_slice = sched_rr_timeslice;
1932 1936
1933 /* 1937 /*
1934 * Requeue to the end of queue if we (and all of our ancestors) are the 1938 * Requeue to the end of queue if we (and all of our ancestors) are not
1935 * only element on the queue 1939 * the only element on the queue
1936 */ 1940 */
1937 for_each_sched_rt_entity(rt_se) { 1941 for_each_sched_rt_entity(rt_se) {
1938 if (rt_se->run_list.prev != rt_se->run_list.next) { 1942 if (rt_se->run_list.prev != rt_se->run_list.next) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef0a7b2439dd..88c85b21d633 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8#include <linux/tick.h> 8#include <linux/tick.h>
9#include <linux/slab.h>
9 10
10#include "cpupri.h" 11#include "cpupri.h"
11#include "cpuacct.h" 12#include "cpuacct.h"
@@ -285,7 +286,6 @@ struct cfs_rq {
285 /* Required to track per-cpu representation of a task_group */ 286 /* Required to track per-cpu representation of a task_group */
286 u32 tg_runnable_contrib; 287 u32 tg_runnable_contrib;
287 unsigned long tg_load_contrib; 288 unsigned long tg_load_contrib;
288#endif /* CONFIG_FAIR_GROUP_SCHED */
289 289
290 /* 290 /*
291 * h_load = weight * f(tg) 291 * h_load = weight * f(tg)
@@ -294,6 +294,9 @@ struct cfs_rq {
294 * this group. 294 * this group.
295 */ 295 */
296 unsigned long h_load; 296 unsigned long h_load;
297 u64 last_h_load_update;
298 struct sched_entity *h_load_next;
299#endif /* CONFIG_FAIR_GROUP_SCHED */
297#endif /* CONFIG_SMP */ 300#endif /* CONFIG_SMP */
298 301
299#ifdef CONFIG_FAIR_GROUP_SCHED 302#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -406,6 +409,10 @@ struct rq {
406 * remote CPUs use both these fields when doing load calculation. 409 * remote CPUs use both these fields when doing load calculation.
407 */ 410 */
408 unsigned int nr_running; 411 unsigned int nr_running;
412#ifdef CONFIG_NUMA_BALANCING
413 unsigned int nr_numa_running;
414 unsigned int nr_preferred_running;
415#endif
409 #define CPU_LOAD_IDX_MAX 5 416 #define CPU_LOAD_IDX_MAX 5
410 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 417 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
411 unsigned long last_load_update_tick; 418 unsigned long last_load_update_tick;
@@ -429,9 +436,6 @@ struct rq {
429#ifdef CONFIG_FAIR_GROUP_SCHED 436#ifdef CONFIG_FAIR_GROUP_SCHED
430 /* list of leaf cfs_rq on this cpu: */ 437 /* list of leaf cfs_rq on this cpu: */
431 struct list_head leaf_cfs_rq_list; 438 struct list_head leaf_cfs_rq_list;
432#ifdef CONFIG_SMP
433 unsigned long h_load_throttle;
434#endif /* CONFIG_SMP */
435#endif /* CONFIG_FAIR_GROUP_SCHED */ 439#endif /* CONFIG_FAIR_GROUP_SCHED */
436 440
437#ifdef CONFIG_RT_GROUP_SCHED 441#ifdef CONFIG_RT_GROUP_SCHED
@@ -477,6 +481,9 @@ struct rq {
477 u64 age_stamp; 481 u64 age_stamp;
478 u64 idle_stamp; 482 u64 idle_stamp;
479 u64 avg_idle; 483 u64 avg_idle;
484
485 /* This is used to determine avg_idle's max value */
486 u64 max_idle_balance_cost;
480#endif 487#endif
481 488
482#ifdef CONFIG_IRQ_TIME_ACCOUNTING 489#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -553,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq)
553 return rq->clock_task; 560 return rq->clock_task;
554} 561}
555 562
563#ifdef CONFIG_NUMA_BALANCING
564extern void sched_setnuma(struct task_struct *p, int node);
565extern int migrate_task_to(struct task_struct *p, int cpu);
566extern int migrate_swap(struct task_struct *, struct task_struct *);
567#endif /* CONFIG_NUMA_BALANCING */
568
556#ifdef CONFIG_SMP 569#ifdef CONFIG_SMP
557 570
558#define rcu_dereference_check_sched_domain(p) \ 571#define rcu_dereference_check_sched_domain(p) \
@@ -594,8 +607,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
594 return hsd; 607 return hsd;
595} 608}
596 609
610static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
611{
612 struct sched_domain *sd;
613
614 for_each_domain(cpu, sd) {
615 if (sd->flags & flag)
616 break;
617 }
618
619 return sd;
620}
621
597DECLARE_PER_CPU(struct sched_domain *, sd_llc); 622DECLARE_PER_CPU(struct sched_domain *, sd_llc);
623DECLARE_PER_CPU(int, sd_llc_size);
598DECLARE_PER_CPU(int, sd_llc_id); 624DECLARE_PER_CPU(int, sd_llc_id);
625DECLARE_PER_CPU(struct sched_domain *, sd_numa);
626DECLARE_PER_CPU(struct sched_domain *, sd_busy);
627DECLARE_PER_CPU(struct sched_domain *, sd_asym);
599 628
600struct sched_group_power { 629struct sched_group_power {
601 atomic_t ref; 630 atomic_t ref;
@@ -605,6 +634,7 @@ struct sched_group_power {
605 */ 634 */
606 unsigned int power, power_orig; 635 unsigned int power, power_orig;
607 unsigned long next_update; 636 unsigned long next_update;
637 int imbalance; /* XXX unrelated to power but shared group state */
608 /* 638 /*
609 * Number of busy cpus in this group. 639 * Number of busy cpus in this group.
610 */ 640 */
@@ -665,9 +695,9 @@ extern int group_balance_cpu(struct sched_group *sg);
665/* 695/*
666 * Return the group to which this tasks belongs. 696 * Return the group to which this tasks belongs.
667 * 697 *
668 * We cannot use task_subsys_state() and friends because the cgroup 698 * We cannot use task_css() and friends because the cgroup subsystem
669 * subsystem changes that value before the cgroup_subsys::attach() method 699 * changes that value before the cgroup_subsys::attach() method is called,
670 * is called, therefore we cannot pin it and might observe the wrong value. 700 * therefore we cannot pin it and might observe the wrong value.
671 * 701 *
672 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup 702 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
673 * core changes this before calling sched_move_task(). 703 * core changes this before calling sched_move_task().
@@ -719,6 +749,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
719 */ 749 */
720 smp_wmb(); 750 smp_wmb();
721 task_thread_info(p)->cpu = cpu; 751 task_thread_info(p)->cpu = cpu;
752 p->wake_cpu = cpu;
722#endif 753#endif
723} 754}
724 755
@@ -974,7 +1005,7 @@ struct sched_class {
974 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1005 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
975 1006
976#ifdef CONFIG_SMP 1007#ifdef CONFIG_SMP
977 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1008 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
978 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1009 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
979 1010
980 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1011 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
@@ -1220,6 +1251,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1220 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1251 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1221} 1252}
1222 1253
1254static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
1255{
1256 if (l1 > l2)
1257 swap(l1, l2);
1258
1259 spin_lock(l1);
1260 spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1261}
1262
1263static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
1264{
1265 if (l1 > l2)
1266 swap(l1, l2);
1267
1268 raw_spin_lock(l1);
1269 raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1270}
1271
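Ordering the two locks by pointer address gives an arbitrary but globally consistent acquisition order, which is what makes these helpers deadlock-safe: for example, if one CPU calls double_lock(a, b) while another calls double_lock(b, a), both take the lower-addressed lock first, so neither can hold one of the pair while waiting for the other in the opposite order. The *_nested(SINGLE_DEPTH_NESTING) call on the second lock tells lockdep that this nested acquisition of a same-class lock is intentional.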
1223/* 1272/*
1224 * double_rq_lock - safely lock two runqueues 1273 * double_rq_lock - safely lock two runqueues
1225 * 1274 *
@@ -1305,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1305extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1354extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1306extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1355extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1307 1356
1308extern void account_cfs_bandwidth_used(int enabled, int was_enabled); 1357extern void cfs_bandwidth_usage_inc(void);
1358extern void cfs_bandwidth_usage_dec(void);
1309 1359
1310#ifdef CONFIG_NO_HZ_COMMON 1360#ifdef CONFIG_NO_HZ_COMMON
1311enum rq_nohz_flag_bits { 1361enum rq_nohz_flag_bits {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 5aef494fc8b4..4ab704339656 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
59 * from dequeue_task() to account for possible rq->clock skew across cpus. The 59 * from dequeue_task() to account for possible rq->clock skew across cpus. The
60 * delta taken on each cpu would annul the skew. 60 * delta taken on each cpu would annul the skew.
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
63{ 63{
64 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 64 unsigned long long now = rq_clock(rq), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
69 sched_info_reset_dequeued(t); 69 sched_info_reset_dequeued(t);
70 t->sched_info.run_delay += delta; 70 t->sched_info.run_delay += delta;
71 71
72 rq_sched_info_dequeued(task_rq(t), delta); 72 rq_sched_info_dequeued(rq, delta);
73} 73}
74 74
75/* 75/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
77 * long it was waiting to run. We also note when it began so that we 77 * long it was waiting to run. We also note when it began so that we
78 * can keep stats on how long its timeslice is. 78 * can keep stats on how long its timeslice is.
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct rq *rq, struct task_struct *t)
81{ 81{
82 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 82 unsigned long long now = rq_clock(rq), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
88 t->sched_info.last_arrival = now; 88 t->sched_info.last_arrival = now;
89 t->sched_info.pcount++; 89 t->sched_info.pcount++;
90 90
91 rq_sched_info_arrive(task_rq(t), delta); 91 rq_sched_info_arrive(rq, delta);
92} 92}
93 93
94/* 94/*
@@ -96,29 +96,30 @@ static void sched_info_arrive(struct task_struct *t)
96 * the timestamp if it is already not set. It's assumed that 96 * the timestamp if it is already not set. It's assumed that
97 * sched_info_dequeued() will clear that stamp when appropriate. 97 * sched_info_dequeued() will clear that stamp when appropriate.
98 */ 98 */
99static inline void sched_info_queued(struct task_struct *t) 99static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = rq_clock(task_rq(t)); 103 t->sched_info.last_queued = rq_clock(rq);
104} 104}
105 105
106/* 106/*
107 * Called when a process ceases being the active-running process, either 107 * Called when a process ceases being the active-running process involuntarily
108 * voluntarily or involuntarily. Now we can calculate how long we ran. 108 * due, typically, to expiring its time slice (this may also be called when
109 * switching to the idle task). Now we can calculate how long we ran.
109 * Also, if the process is still in the TASK_RUNNING state, call 110 * Also, if the process is still in the TASK_RUNNING state, call
110 * sched_info_queued() to mark that it has now again started waiting on 111 * sched_info_queued() to mark that it has now again started waiting on
111 * the runqueue. 112 * the runqueue.
112 */ 113 */
113static inline void sched_info_depart(struct task_struct *t) 114static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
114{ 115{
115 unsigned long long delta = rq_clock(task_rq(t)) - 116 unsigned long long delta = rq_clock(rq) -
116 t->sched_info.last_arrival; 117 t->sched_info.last_arrival;
117 118
118 rq_sched_info_depart(task_rq(t), delta); 119 rq_sched_info_depart(rq, delta);
119 120
120 if (t->state == TASK_RUNNING) 121 if (t->state == TASK_RUNNING)
121 sched_info_queued(t); 122 sched_info_queued(rq, t);
122} 123}
123 124
124/* 125/*
@@ -127,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
127 * the idle task.) We are only called when prev != next. 128 * the idle task.) We are only called when prev != next.
128 */ 129 */
129static inline void 130static inline void
130__sched_info_switch(struct task_struct *prev, struct task_struct *next) 131__sched_info_switch(struct rq *rq,
132 struct task_struct *prev, struct task_struct *next)
131{ 133{
132 struct rq *rq = task_rq(prev);
133
134 /* 134 /*
135 * prev now departs the cpu. It's not interesting to record 135 * prev now departs the cpu. It's not interesting to record
136 * stats about how efficient we were at scheduling the idle 136 * stats about how efficient we were at scheduling the idle
137 * process, however. 137 * process, however.
138 */ 138 */
139 if (prev != rq->idle) 139 if (prev != rq->idle)
140 sched_info_depart(prev); 140 sched_info_depart(rq, prev);
141 141
142 if (next != rq->idle) 142 if (next != rq->idle)
143 sched_info_arrive(next); 143 sched_info_arrive(rq, next);
144} 144}
145static inline void 145static inline void
146sched_info_switch(struct task_struct *prev, struct task_struct *next) 146sched_info_switch(struct rq *rq,
147 struct task_struct *prev, struct task_struct *next)
147{ 148{
148 if (unlikely(sched_info_on())) 149 if (unlikely(sched_info_on()))
149 __sched_info_switch(prev, next); 150 __sched_info_switch(rq, prev, next);
150} 151}
151#else 152#else
152#define sched_info_queued(t) do { } while (0) 153#define sched_info_queued(rq, t) do { } while (0)
153#define sched_info_reset_dequeued(t) do { } while (0) 154#define sched_info_reset_dequeued(t) do { } while (0)
154#define sched_info_dequeued(t) do { } while (0) 155#define sched_info_dequeued(rq, t) do { } while (0)
155#define sched_info_switch(t, next) do { } while (0) 156#define sched_info_depart(rq, t) do { } while (0)
157#define sched_info_arrive(rq, next) do { } while (0)
158#define sched_info_switch(rq, t, next) do { } while (0)
156#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 159#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
157 160
158/* 161/*
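
The stats.h changes above thread the runqueue through sched_info_{queued,dequeued,arrive,depart}() instead of re-deriving it with task_rq(), but the accounting itself is unchanged: stamp the moment a task becomes runnable, then on arrival charge now - last_queued to its cumulative run_delay. The following is a rough userspace restatement of that pattern, with an ordinary monotonic clock standing in for rq_clock(); struct and function names are illustrative only.

#include <stdio.h>
#include <time.h>

struct sched_info {
        unsigned long long last_queued; /* 0 means "not stamped" */
        unsigned long long run_delay;   /* total time spent runnable but waiting */
        unsigned long pcount;           /* number of times the task got a CPU */
};

static unsigned long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Stamp the moment the task becomes runnable, if not already stamped. */
static void info_queued(struct sched_info *si)
{
        if (!si->last_queued)
                si->last_queued = now_ns();
}

/* The task starts running: account how long it waited on the queue. */
static void info_arrive(struct sched_info *si)
{
        if (si->last_queued) {
                si->run_delay += now_ns() - si->last_queued;
                si->last_queued = 0;
        }
        si->pcount++;
}

int main(void)
{
        struct sched_info si = { 0 };

        info_queued(&si);
        /* ... task sits on the runqueue for a while ... */
        info_arrive(&si);
        printf("waited %llu ns over %lu runs\n", si.run_delay, si.pcount);
        return 0;
}
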
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index e08fbeeb54b9..47197de8abd9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
11 11
12#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
13static int 13static int
14select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) 14select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
15{ 15{
 16 return task_cpu(p); /* stop tasks as they never migrate */ 16 return task_cpu(p); /* stop tasks as they never migrate */
17} 17}
diff --git a/kernel/wait.c b/kernel/sched/wait.c
index dec68bd4e9d8..7d50f794e248 100644
--- a/kernel/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,109 @@ EXPORT_SYMBOL(remove_wait_queue);
53 53
54 54
55/* 55/*
56 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
57 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
58 * number) then we wake all the non-exclusive tasks and one exclusive task.
59 *
60 * There are circumstances in which we can try to wake a task which has already
61 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
62 * zero in this (rare) case, and we handle it by continuing to scan the queue.
63 */
64static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
65 int nr_exclusive, int wake_flags, void *key)
66{
67 wait_queue_t *curr, *next;
68
69 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
70 unsigned flags = curr->flags;
71
72 if (curr->func(curr, mode, wake_flags, key) &&
73 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
74 break;
75 }
76}
77
78/**
79 * __wake_up - wake up threads blocked on a waitqueue.
80 * @q: the waitqueue
81 * @mode: which threads
82 * @nr_exclusive: how many wake-one or wake-many threads to wake up
83 * @key: is directly passed to the wakeup function
84 *
85 * It may be assumed that this function implies a write memory barrier before
86 * changing the task state if and only if any tasks are woken up.
87 */
88void __wake_up(wait_queue_head_t *q, unsigned int mode,
89 int nr_exclusive, void *key)
90{
91 unsigned long flags;
92
93 spin_lock_irqsave(&q->lock, flags);
94 __wake_up_common(q, mode, nr_exclusive, 0, key);
95 spin_unlock_irqrestore(&q->lock, flags);
96}
97EXPORT_SYMBOL(__wake_up);
98
99/*
100 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
101 */
102void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
103{
104 __wake_up_common(q, mode, nr, 0, NULL);
105}
106EXPORT_SYMBOL_GPL(__wake_up_locked);
107
108void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
109{
110 __wake_up_common(q, mode, 1, 0, key);
111}
112EXPORT_SYMBOL_GPL(__wake_up_locked_key);
113
114/**
115 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
116 * @q: the waitqueue
117 * @mode: which threads
118 * @nr_exclusive: how many wake-one or wake-many threads to wake up
119 * @key: opaque value to be passed to wakeup targets
120 *
 121 * The sync wakeup differs in that the waker knows that it will schedule
122 * away soon, so while the target thread will be woken up, it will not
123 * be migrated to another CPU - ie. the two threads are 'synchronized'
124 * with each other. This can prevent needless bouncing between CPUs.
125 *
126 * On UP it can prevent extra preemption.
127 *
128 * It may be assumed that this function implies a write memory barrier before
129 * changing the task state if and only if any tasks are woken up.
130 */
131void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
132 int nr_exclusive, void *key)
133{
134 unsigned long flags;
135 int wake_flags = 1; /* XXX WF_SYNC */
136
137 if (unlikely(!q))
138 return;
139
140 if (unlikely(nr_exclusive != 1))
141 wake_flags = 0;
142
143 spin_lock_irqsave(&q->lock, flags);
144 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
145 spin_unlock_irqrestore(&q->lock, flags);
146}
147EXPORT_SYMBOL_GPL(__wake_up_sync_key);
148
149/*
150 * __wake_up_sync - see __wake_up_sync_key()
151 */
152void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
153{
154 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
155}
156EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
157
158/*
56 * Note: we use "set_current_state()" _after_ the wait-queue add, 159 * Note: we use "set_current_state()" _after_ the wait-queue add,
57 * because we need a memory barrier there on SMP, so that any 160 * because we need a memory barrier there on SMP, so that any
58 * wake-function that tests for the wait-queue being active 161 * wake-function that tests for the wait-queue being active
@@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 195}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 196EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 197
198long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
199{
200 unsigned long flags;
201
202 if (signal_pending_state(state, current))
203 return -ERESTARTSYS;
204
205 wait->private = current;
206 wait->func = autoremove_wake_function;
207
208 spin_lock_irqsave(&q->lock, flags);
209 if (list_empty(&wait->task_list)) {
210 if (wait->flags & WQ_FLAG_EXCLUSIVE)
211 __add_wait_queue_tail(q, wait);
212 else
213 __add_wait_queue(q, wait);
214 }
215 set_current_state(state);
216 spin_unlock_irqrestore(&q->lock, flags);
217
218 return 0;
219}
220EXPORT_SYMBOL(prepare_to_wait_event);
221
95/** 222/**
96 * finish_wait - clean up after waiting in a queue 223 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 224 * @q: waitqueue waited on
@@ -363,8 +490,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
363 490
364/** 491/**
365 * wake_up_atomic_t - Wake up a waiter on a atomic_t 492 * wake_up_atomic_t - Wake up a waiter on a atomic_t
366 * @word: The word being waited on, a kernel virtual address 493 * @p: The atomic_t being waited on, a kernel virtual address
367 * @bit: The bit of the word being waited on
368 * 494 *
369 * Wake up anyone waiting for the atomic_t to go to zero. 495 * Wake up anyone waiting for the atomic_t to go to zero.
370 * 496 *
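
The comment block moved in with __wake_up_common() describes the wakeup policy: non-exclusive waiters are always woken, and exclusive waiters are woken until nr_exclusive successful wakeups have occurred (nr_exclusive == 0 wakes everyone, and a waiter whose wake callback reports failure does not consume the budget). The sketch below restates just that scan over a plain singly linked list in userspace; the queue lock, the key argument and safe traversal are omitted.

#include <stdbool.h>
#include <stdio.h>

#define WQ_EXCLUSIVE 0x01

struct waiter {
        unsigned int flags;
        bool (*wake)(struct waiter *w);  /* returns false if already running */
        struct waiter *next;
};

/*
 * Every non-exclusive waiter is woken; exclusive waiters are woken until
 * nr_exclusive successful wakeups have happened (0 wakes everybody).
 */
static void wake_up_common(struct waiter *head, int nr_exclusive)
{
        for (struct waiter *w = head; w; w = w->next) {
                if (w->wake(w) && (w->flags & WQ_EXCLUSIVE) && !--nr_exclusive)
                        break;
        }
}

static bool wake_fn(struct waiter *w)
{
        printf("woke %p (flags=%u)\n", (void *)w, w->flags);
        return true;
}

int main(void)
{
        struct waiter c = { WQ_EXCLUSIVE, wake_fn, NULL };
        struct waiter b = { WQ_EXCLUSIVE, wake_fn, &c };
        struct waiter a = { 0,            wake_fn, &b };

        wake_up_common(&a, 1);  /* wakes a (non-exclusive) and b (one exclusive) */
        return 0;
}
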
diff --git a/kernel/signal.c b/kernel/signal.c
index 50e41075ac77..ded28b91fa53 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3394,7 +3394,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3394 new_ka.sa.sa_restorer = compat_ptr(restorer); 3394 new_ka.sa.sa_restorer = compat_ptr(restorer);
3395#endif 3395#endif
3396 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); 3396 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
3397 ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); 3397 ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
3398 if (ret) 3398 if (ret)
3399 return -EFAULT; 3399 return -EFAULT;
3400 sigset_from_compat(&new_ka.sa.sa_mask, &mask); 3400 sigset_from_compat(&new_ka.sa.sa_mask, &mask);
@@ -3406,7 +3406,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3406 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), 3406 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
3407 &oact->sa_handler); 3407 &oact->sa_handler);
3408 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); 3408 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
3409 ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); 3409 ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
3410#ifdef __ARCH_HAS_SA_RESTORER 3410#ifdef __ARCH_HAS_SA_RESTORER
3411 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), 3411 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3412 &oact->sa_restorer); 3412 &oact->sa_restorer);
diff --git a/kernel/smp.c b/kernel/smp.c
index fe9f773d7114..f5768b0c816a 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -48,10 +48,13 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
48 cpu_to_node(cpu))) 48 cpu_to_node(cpu)))
49 return notifier_from_errno(-ENOMEM); 49 return notifier_from_errno(-ENOMEM);
50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, 50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
51 cpu_to_node(cpu))) 51 cpu_to_node(cpu))) {
52 free_cpumask_var(cfd->cpumask);
52 return notifier_from_errno(-ENOMEM); 53 return notifier_from_errno(-ENOMEM);
54 }
53 cfd->csd = alloc_percpu(struct call_single_data); 55 cfd->csd = alloc_percpu(struct call_single_data);
54 if (!cfd->csd) { 56 if (!cfd->csd) {
57 free_cpumask_var(cfd->cpumask_ipi);
55 free_cpumask_var(cfd->cpumask); 58 free_cpumask_var(cfd->cpumask);
56 return notifier_from_errno(-ENOMEM); 59 return notifier_from_errno(-ENOMEM);
57 } 60 }
@@ -186,25 +189,13 @@ void generic_smp_call_function_single_interrupt(void)
186 189
187 while (!list_empty(&list)) { 190 while (!list_empty(&list)) {
188 struct call_single_data *csd; 191 struct call_single_data *csd;
189 unsigned int csd_flags;
190 192
191 csd = list_entry(list.next, struct call_single_data, list); 193 csd = list_entry(list.next, struct call_single_data, list);
192 list_del(&csd->list); 194 list_del(&csd->list);
193 195
194 /*
195 * 'csd' can be invalid after this call if flags == 0
196 * (when called through generic_exec_single()),
197 * so save them away before making the call:
198 */
199 csd_flags = csd->flags;
200
201 csd->func(csd->info); 196 csd->func(csd->info);
202 197
203 /* 198 csd_unlock(csd);
204 * Unlocked CSDs are valid through generic_exec_single():
205 */
206 if (csd_flags & CSD_FLAG_LOCK)
207 csd_unlock(csd);
208 } 199 }
209} 200}
210 201
@@ -278,8 +269,6 @@ EXPORT_SYMBOL(smp_call_function_single);
278 * @wait: If true, wait until function has completed. 269 * @wait: If true, wait until function has completed.
279 * 270 *
280 * Returns 0 on success, else a negative status code (if no cpus were online). 271 * Returns 0 on success, else a negative status code (if no cpus were online).
281 * Note that @wait will be implicitly turned on in case of allocation failures,
282 * since we fall back to on-stack allocation.
283 * 272 *
284 * Selection preference: 273 * Selection preference:
285 * 1) current cpu if in @mask 274 * 1) current cpu if in @mask
@@ -535,6 +524,11 @@ void __init setup_nr_cpu_ids(void)
535 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; 524 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
536} 525}
537 526
527void __weak smp_announce(void)
528{
529 printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
530}
531
538/* Called by boot processor to activate the rest. */ 532/* Called by boot processor to activate the rest. */
539void __init smp_init(void) 533void __init smp_init(void)
540{ 534{
@@ -551,7 +545,7 @@ void __init smp_init(void)
551 } 545 }
552 546
553 /* Any cleanup work */ 547 /* Any cleanup work */
554 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); 548 smp_announce();
555 smp_cpus_done(setup_max_cpus); 549 smp_cpus_done(setup_max_cpus);
556} 550}
557 551
@@ -586,8 +580,10 @@ EXPORT_SYMBOL(on_each_cpu);
586 * 580 *
587 * If @wait is true, then returns once @func has returned. 581 * If @wait is true, then returns once @func has returned.
588 * 582 *
589 * You must not call this function with disabled interrupts or 583 * You must not call this function with disabled interrupts or from a
590 * from a hardware interrupt handler or from a bottom half handler. 584 * hardware interrupt handler or from a bottom half handler. The
585 * exception is that it may be used during early boot while
586 * early_boot_irqs_disabled is set.
591 */ 587 */
592void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, 588void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
593 void *info, bool wait) 589 void *info, bool wait)
@@ -596,9 +592,10 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
596 592
597 smp_call_function_many(mask, func, info, wait); 593 smp_call_function_many(mask, func, info, wait);
598 if (cpumask_test_cpu(cpu, mask)) { 594 if (cpumask_test_cpu(cpu, mask)) {
599 local_irq_disable(); 595 unsigned long flags;
596 local_irq_save(flags);
600 func(info); 597 func(info);
601 local_irq_enable(); 598 local_irq_restore(flags);
602 } 599 }
603 put_cpu(); 600 put_cpu();
604} 601}
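
The on_each_cpu_mask() hunk swaps local_irq_disable()/local_irq_enable() for local_irq_save()/local_irq_restore(), so the function no longer unconditionally re-enables interrupts; that matters when the caller already runs with interrupts off, e.g. during early boot as the updated comment notes. The toy model below shows the difference with a plain boolean standing in for the CPU interrupt-enable flag; these helpers are stand-ins, not the kernel primitives.

#include <assert.h>
#include <stdbool.h>

static bool irqs_on = true;     /* stand-in for the CPU interrupt-enable flag */

static void irq_disable(void)     { irqs_on = false; }
static void irq_enable(void)      { irqs_on = true; }
static bool irq_save(void)        { bool was = irqs_on; irqs_on = false; return was; }
static void irq_restore(bool was) { irqs_on = was; }

static void callback(void) { /* func(info) would run here */ }

int main(void)
{
        /* The caller already has interrupts off (the early-boot case). */
        irq_disable();

        /* save/restore leaves the caller's state intact ... */
        bool flags = irq_save();
        callback();
        irq_restore(flags);
        assert(!irqs_on);

        /* ... while a blind disable/enable pair would re-enable them. */
        irq_disable();
        callback();
        irq_enable();
        assert(irqs_on);        /* the caller's "off" state has been lost */

        return 0;
}
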
diff --git a/kernel/softirq.c b/kernel/softirq.c
index be3d3514c325..b24988353458 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -29,7 +29,6 @@
29#define CREATE_TRACE_POINTS 29#define CREATE_TRACE_POINTS
30#include <trace/events/irq.h> 30#include <trace/events/irq.h>
31 31
32#include <asm/irq.h>
33/* 32/*
34 - No shared variables, all the data are CPU local. 33 - No shared variables, all the data are CPU local.
35 - If a softirq needs serialization, let it serialize itself 34 - If a softirq needs serialization, let it serialize itself
@@ -100,13 +99,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
100 99
101 raw_local_irq_save(flags); 100 raw_local_irq_save(flags);
102 /* 101 /*
103 * The preempt tracer hooks into add_preempt_count and will break 102 * The preempt tracer hooks into preempt_count_add and will break
104 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET 103 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
105 * is set and before current->softirq_enabled is cleared. 104 * is set and before current->softirq_enabled is cleared.
106 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
107 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
108 */ 107 */
109 preempt_count() += cnt; 108 __preempt_count_add(cnt);
110 /* 109 /*
111 * Were softirqs turned off above: 110 * Were softirqs turned off above:
112 */ 111 */
@@ -120,7 +119,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
120#else /* !CONFIG_TRACE_IRQFLAGS */ 119#else /* !CONFIG_TRACE_IRQFLAGS */
121static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 120static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
122{ 121{
123 add_preempt_count(cnt); 122 preempt_count_add(cnt);
124 barrier(); 123 barrier();
125} 124}
126#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -134,12 +133,11 @@ EXPORT_SYMBOL(local_bh_disable);
134 133
135static void __local_bh_enable(unsigned int cnt) 134static void __local_bh_enable(unsigned int cnt)
136{ 135{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled()); 136 WARN_ON_ONCE(!irqs_disabled());
139 137
140 if (softirq_count() == cnt) 138 if (softirq_count() == cnt)
141 trace_softirqs_on(_RET_IP_); 139 trace_softirqs_on(_RET_IP_);
142 sub_preempt_count(cnt); 140 preempt_count_sub(cnt);
143} 141}
144 142
145/* 143/*
@@ -149,6 +147,7 @@ static void __local_bh_enable(unsigned int cnt)
149 */ 147 */
150void _local_bh_enable(void) 148void _local_bh_enable(void)
151{ 149{
150 WARN_ON_ONCE(in_irq());
152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); 151 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
153} 152}
154 153
@@ -169,12 +168,17 @@ static inline void _local_bh_enable_ip(unsigned long ip)
169 * Keep preemption disabled until we are done with 168 * Keep preemption disabled until we are done with
170 * softirq processing: 169 * softirq processing:
171 */ 170 */
172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); 171 preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
173 172
174 if (unlikely(!in_interrupt() && local_softirq_pending())) 173 if (unlikely(!in_interrupt() && local_softirq_pending())) {
174 /*
175 * Run softirq if any pending. And do it in its own stack
176 * as we may be calling this deep in a task call stack already.
177 */
175 do_softirq(); 178 do_softirq();
179 }
176 180
177 dec_preempt_count(); 181 preempt_count_dec();
178#ifdef CONFIG_TRACE_IRQFLAGS 182#ifdef CONFIG_TRACE_IRQFLAGS
179 local_irq_enable(); 183 local_irq_enable();
180#endif 184#endif
@@ -256,7 +260,7 @@ restart:
256 " exited with %08x?\n", vec_nr, 260 " exited with %08x?\n", vec_nr,
257 softirq_to_name[vec_nr], h->action, 261 softirq_to_name[vec_nr], h->action,
258 prev_count, preempt_count()); 262 prev_count, preempt_count());
259 preempt_count() = prev_count; 263 preempt_count_set(prev_count);
260 } 264 }
261 265
262 rcu_bh_qs(cpu); 266 rcu_bh_qs(cpu);
@@ -280,10 +284,11 @@ restart:
280 284
281 account_irq_exit_time(current); 285 account_irq_exit_time(current);
282 __local_bh_enable(SOFTIRQ_OFFSET); 286 __local_bh_enable(SOFTIRQ_OFFSET);
287 WARN_ON_ONCE(in_interrupt());
283 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 288 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
284} 289}
285 290
286#ifndef __ARCH_HAS_DO_SOFTIRQ 291
287 292
288asmlinkage void do_softirq(void) 293asmlinkage void do_softirq(void)
289{ 294{
@@ -298,13 +303,11 @@ asmlinkage void do_softirq(void)
298 pending = local_softirq_pending(); 303 pending = local_softirq_pending();
299 304
300 if (pending) 305 if (pending)
301 __do_softirq(); 306 do_softirq_own_stack();
302 307
303 local_irq_restore(flags); 308 local_irq_restore(flags);
304} 309}
305 310
306#endif
307
308/* 311/*
309 * Enter an interrupt context. 312 * Enter an interrupt context.
310 */ 313 */
@@ -328,10 +331,25 @@ void irq_enter(void)
328 331
329static inline void invoke_softirq(void) 332static inline void invoke_softirq(void)
330{ 333{
331 if (!force_irqthreads) 334 if (!force_irqthreads) {
335#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
336 /*
337 * We can safely execute softirq on the current stack if
338 * it is the irq stack, because it should be near empty
339 * at this stage.
340 */
332 __do_softirq(); 341 __do_softirq();
333 else 342#else
343 /*
344 * Otherwise, irq_exit() is called on the task stack that can
345 * be potentially deep already. So call softirq in its own stack
346 * to prevent from any overrun.
347 */
348 do_softirq_own_stack();
349#endif
350 } else {
334 wakeup_softirqd(); 351 wakeup_softirqd();
352 }
335} 353}
336 354
337static inline void tick_irq_exit(void) 355static inline void tick_irq_exit(void)
@@ -360,7 +378,7 @@ void irq_exit(void)
360 378
361 account_irq_exit_time(current); 379 account_irq_exit_time(current);
362 trace_hardirq_exit(); 380 trace_hardirq_exit();
363 sub_preempt_count(HARDIRQ_OFFSET); 381 preempt_count_sub(HARDIRQ_OFFSET);
364 if (!in_interrupt() && local_softirq_pending()) 382 if (!in_interrupt() && local_softirq_pending())
365 invoke_softirq(); 383 invoke_softirq();
366 384
@@ -762,6 +780,10 @@ static void run_ksoftirqd(unsigned int cpu)
762{ 780{
763 local_irq_disable(); 781 local_irq_disable();
764 if (local_softirq_pending()) { 782 if (local_softirq_pending()) {
783 /*
784 * We can safely run softirq on inline stack, as we are not deep
785 * in the task stack here.
786 */
765 __do_softirq(); 787 __do_softirq();
766 rcu_note_context_switch(cpu); 788 rcu_note_context_switch(cpu);
767 local_irq_enable(); 789 local_irq_enable();
@@ -876,7 +898,6 @@ int __init __weak early_irq_init(void)
876 return 0; 898 return 0;
877} 899}
878 900
879#ifdef CONFIG_GENERIC_HARDIRQS
880int __init __weak arch_probe_nr_irqs(void) 901int __init __weak arch_probe_nr_irqs(void)
881{ 902{
882 return NR_IRQS_LEGACY; 903 return NR_IRQS_LEGACY;
@@ -886,4 +907,3 @@ int __init __weak arch_early_irq_init(void)
886{ 907{
887 return 0; 908 return 0;
888} 909}
889#endif
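
Most of the softirq.c churn above is the rename from add/sub_preempt_count() to preempt_count_add()/_sub() plus the new do_softirq_own_stack() call sites, but the logic being preserved is the nesting count: local_bh_disable() bumps a count, and only the outermost local_bh_enable() gets to run whatever softirqs were raised meanwhile. Below is a loose single-threaded model of that counting scheme, not the kernel implementation (which also handles preemption, tracing and the separate softirq stack).

#include <stdio.h>

#define SOFTIRQ_DISABLE_OFFSET  0x100   /* one level of "softirqs disabled" */

static unsigned int preempt_count;      /* stands in for the kernel's preempt count */
static unsigned int softirq_pending;    /* bitmask of raised softirqs */

static void bh_disable(void)
{
        preempt_count += SOFTIRQ_DISABLE_OFFSET;
}

static void run_pending_softirqs(void)
{
        while (softirq_pending) {
                printf("servicing softirq mask %#x\n", softirq_pending);
                softirq_pending = 0;
        }
}

static void bh_enable(void)
{
        preempt_count -= SOFTIRQ_DISABLE_OFFSET;
        /* Only the outermost enable may process what was raised meanwhile. */
        if (!preempt_count && softirq_pending)
                run_pending_softirqs();
}

int main(void)
{
        bh_disable();
        bh_disable();                   /* nested critical section */
        softirq_pending |= 1u << 3;     /* something got raised, e.g. NET_RX */
        bh_enable();                    /* still nested: nothing runs yet */
        bh_enable();                    /* outermost enable: serviced here */
        return 0;
}
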
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 5cdd8065a3ce..4b082b5cac9e 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -34,6 +34,20 @@
34#else 34#else
35#define raw_read_can_lock(l) read_can_lock(l) 35#define raw_read_can_lock(l) read_can_lock(l)
36#define raw_write_can_lock(l) write_can_lock(l) 36#define raw_write_can_lock(l) write_can_lock(l)
37
38/*
39 * Some architectures can relax in favour of the CPU owning the lock.
40 */
41#ifndef arch_read_relax
42# define arch_read_relax(l) cpu_relax()
43#endif
44#ifndef arch_write_relax
45# define arch_write_relax(l) cpu_relax()
46#endif
47#ifndef arch_spin_relax
48# define arch_spin_relax(l) cpu_relax()
49#endif
50
37/* 51/*
38 * We build the __lock_function inlines here. They are too large for 52 * We build the __lock_function inlines here. They are too large for
39 * inlining all over the place, but here is only one user per function 53 * inlining all over the place, but here is only one user per function
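
The arch_read/write/spin_relax() fallbacks added above simply give the generic lock loops a hook that defaults to cpu_relax(); an architecture can override it to yield to the lock holder. The fragment below shows where such a hook sits in a minimal test-and-set lock built on C11 atomics; relax() here is an empty stand-in, not the kernel macro.

#include <stdatomic.h>

static inline void relax(void)
{
        /* cpu_relax()/arch_spin_relax() would go here (e.g. 'pause' on x86). */
}

struct tiny_lock {
        atomic_flag locked;
};

static void tiny_lock_acquire(struct tiny_lock *l)
{
        /* Spin until the flag is clear, relaxing on every failed attempt. */
        while (atomic_flag_test_and_set_explicit(&l->locked, memory_order_acquire))
                relax();
}

static void tiny_lock_release(struct tiny_lock *l)
{
        atomic_flag_clear_explicit(&l->locked, memory_order_release);
}

int main(void)
{
        struct tiny_lock l = { ATOMIC_FLAG_INIT };

        tiny_lock_acquire(&l);
        tiny_lock_release(&l);
        return 0;
}
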
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index c09f2955ae30..84571e09c907 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,6 +20,7 @@
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21#include <linux/smpboot.h> 21#include <linux/smpboot.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23#include <linux/lglock.h>
23 24
24/* 25/*
25 * Structure to determine completion condition and record errors. May 26 * Structure to determine completion condition and record errors. May
@@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
43static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); 44static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
44static bool stop_machine_initialized = false; 45static bool stop_machine_initialized = false;
45 46
47/*
48 * Avoids a race between stop_two_cpus and global stop_cpus, where
49 * the stoppers could get queued up in reverse order, leading to
50 * system deadlock. Using an lglock means stop_two_cpus remains
51 * relatively cheap.
52 */
53DEFINE_STATIC_LGLOCK(stop_cpus_lock);
54
46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 55static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
47{ 56{
48 memset(done, 0, sizeof(*done)); 57 memset(done, 0, sizeof(*done));
@@ -115,6 +124,184 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
115 return done.executed ? done.ret : -ENOENT; 124 return done.executed ? done.ret : -ENOENT;
116} 125}
117 126
127/* This controls the threads on each CPU. */
128enum multi_stop_state {
129 /* Dummy starting state for thread. */
130 MULTI_STOP_NONE,
131 /* Awaiting everyone to be scheduled. */
132 MULTI_STOP_PREPARE,
133 /* Disable interrupts. */
134 MULTI_STOP_DISABLE_IRQ,
135 /* Run the function */
136 MULTI_STOP_RUN,
137 /* Exit */
138 MULTI_STOP_EXIT,
139};
140
141struct multi_stop_data {
142 int (*fn)(void *);
143 void *data;
144 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
145 unsigned int num_threads;
146 const struct cpumask *active_cpus;
147
148 enum multi_stop_state state;
149 atomic_t thread_ack;
150};
151
152static void set_state(struct multi_stop_data *msdata,
153 enum multi_stop_state newstate)
154{
155 /* Reset ack counter. */
156 atomic_set(&msdata->thread_ack, msdata->num_threads);
157 smp_wmb();
158 msdata->state = newstate;
159}
160
161/* Last one to ack a state moves to the next state. */
162static void ack_state(struct multi_stop_data *msdata)
163{
164 if (atomic_dec_and_test(&msdata->thread_ack))
165 set_state(msdata, msdata->state + 1);
166}
167
168/* This is the cpu_stop function which stops the CPU. */
169static int multi_cpu_stop(void *data)
170{
171 struct multi_stop_data *msdata = data;
172 enum multi_stop_state curstate = MULTI_STOP_NONE;
173 int cpu = smp_processor_id(), err = 0;
174 unsigned long flags;
175 bool is_active;
176
177 /*
178 * When called from stop_machine_from_inactive_cpu(), irq might
179 * already be disabled. Save the state and restore it on exit.
180 */
181 local_save_flags(flags);
182
183 if (!msdata->active_cpus)
184 is_active = cpu == cpumask_first(cpu_online_mask);
185 else
186 is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
187
188 /* Simple state machine */
189 do {
190 /* Chill out and ensure we re-read multi_stop_state. */
191 cpu_relax();
192 if (msdata->state != curstate) {
193 curstate = msdata->state;
194 switch (curstate) {
195 case MULTI_STOP_DISABLE_IRQ:
196 local_irq_disable();
197 hard_irq_disable();
198 break;
199 case MULTI_STOP_RUN:
200 if (is_active)
201 err = msdata->fn(msdata->data);
202 break;
203 default:
204 break;
205 }
206 ack_state(msdata);
207 }
208 } while (curstate != MULTI_STOP_EXIT);
209
210 local_irq_restore(flags);
211 return err;
212}
213
214struct irq_cpu_stop_queue_work_info {
215 int cpu1;
216 int cpu2;
217 struct cpu_stop_work *work1;
218 struct cpu_stop_work *work2;
219};
220
221/*
222 * This function is always run with irqs and preemption disabled.
223 * This guarantees that both work1 and work2 get queued, before
224 * our local migrate thread gets the chance to preempt us.
225 */
226static void irq_cpu_stop_queue_work(void *arg)
227{
228 struct irq_cpu_stop_queue_work_info *info = arg;
229 cpu_stop_queue_work(info->cpu1, info->work1);
230 cpu_stop_queue_work(info->cpu2, info->work2);
231}
232
233/**
234 * stop_two_cpus - stops two cpus
235 * @cpu1: the cpu to stop
236 * @cpu2: the other cpu to stop
237 * @fn: function to execute
238 * @arg: argument to @fn
239 *
240 * Stops both the current and specified CPU and runs @fn on one of them.
241 *
242 * returns when both are completed.
243 */
244int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
245{
246 struct cpu_stop_done done;
247 struct cpu_stop_work work1, work2;
248 struct irq_cpu_stop_queue_work_info call_args;
249 struct multi_stop_data msdata;
250
251 preempt_disable();
252 msdata = (struct multi_stop_data){
253 .fn = fn,
254 .data = arg,
255 .num_threads = 2,
256 .active_cpus = cpumask_of(cpu1),
257 };
258
259 work1 = work2 = (struct cpu_stop_work){
260 .fn = multi_cpu_stop,
261 .arg = &msdata,
262 .done = &done
263 };
264
265 call_args = (struct irq_cpu_stop_queue_work_info){
266 .cpu1 = cpu1,
267 .cpu2 = cpu2,
268 .work1 = &work1,
269 .work2 = &work2,
270 };
271
272 cpu_stop_init_done(&done, 2);
273 set_state(&msdata, MULTI_STOP_PREPARE);
274
275 /*
276 * If we observe both CPUs active we know _cpu_down() cannot yet have
277 * queued its stop_machine works and therefore ours will get executed
 278 * first. Or it's not either one of our CPUs that's getting unplugged,
279 * in which case we don't care.
280 *
281 * This relies on the stopper workqueues to be FIFO.
282 */
283 if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
284 preempt_enable();
285 return -ENOENT;
286 }
287
288 lg_local_lock(&stop_cpus_lock);
289 /*
290 * Queuing needs to be done by the lowest numbered CPU, to ensure
291 * that works are always queued in the same order on every CPU.
292 * This prevents deadlocks.
293 */
294 smp_call_function_single(min(cpu1, cpu2),
295 &irq_cpu_stop_queue_work,
296 &call_args, 0);
297 lg_local_unlock(&stop_cpus_lock);
298 preempt_enable();
299
300 wait_for_completion(&done.completion);
301
302 return done.executed ? done.ret : -ENOENT;
303}
304
118/** 305/**
119 * stop_one_cpu_nowait - stop a cpu but don't wait for completion 306 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
120 * @cpu: cpu to stop 307 * @cpu: cpu to stop
@@ -159,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
159 * preempted by a stopper which might wait for other stoppers 346 * preempted by a stopper which might wait for other stoppers
160 * to enter @fn which can lead to deadlock. 347 * to enter @fn which can lead to deadlock.
161 */ 348 */
162 preempt_disable(); 349 lg_global_lock(&stop_cpus_lock);
163 for_each_cpu(cpu, cpumask) 350 for_each_cpu(cpu, cpumask)
164 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); 351 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
165 preempt_enable(); 352 lg_global_unlock(&stop_cpus_lock);
166} 353}
167 354
168static int __stop_cpus(const struct cpumask *cpumask, 355static int __stop_cpus(const struct cpumask *cpumask,
@@ -359,98 +546,14 @@ early_initcall(cpu_stop_init);
359 546
360#ifdef CONFIG_STOP_MACHINE 547#ifdef CONFIG_STOP_MACHINE
361 548
362/* This controls the threads on each CPU. */
363enum stopmachine_state {
364 /* Dummy starting state for thread. */
365 STOPMACHINE_NONE,
366 /* Awaiting everyone to be scheduled. */
367 STOPMACHINE_PREPARE,
368 /* Disable interrupts. */
369 STOPMACHINE_DISABLE_IRQ,
370 /* Run the function */
371 STOPMACHINE_RUN,
372 /* Exit */
373 STOPMACHINE_EXIT,
374};
375
376struct stop_machine_data {
377 int (*fn)(void *);
378 void *data;
379 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
380 unsigned int num_threads;
381 const struct cpumask *active_cpus;
382
383 enum stopmachine_state state;
384 atomic_t thread_ack;
385};
386
387static void set_state(struct stop_machine_data *smdata,
388 enum stopmachine_state newstate)
389{
390 /* Reset ack counter. */
391 atomic_set(&smdata->thread_ack, smdata->num_threads);
392 smp_wmb();
393 smdata->state = newstate;
394}
395
396/* Last one to ack a state moves to the next state. */
397static void ack_state(struct stop_machine_data *smdata)
398{
399 if (atomic_dec_and_test(&smdata->thread_ack))
400 set_state(smdata, smdata->state + 1);
401}
402
403/* This is the cpu_stop function which stops the CPU. */
404static int stop_machine_cpu_stop(void *data)
405{
406 struct stop_machine_data *smdata = data;
407 enum stopmachine_state curstate = STOPMACHINE_NONE;
408 int cpu = smp_processor_id(), err = 0;
409 unsigned long flags;
410 bool is_active;
411
412 /*
413 * When called from stop_machine_from_inactive_cpu(), irq might
414 * already be disabled. Save the state and restore it on exit.
415 */
416 local_save_flags(flags);
417
418 if (!smdata->active_cpus)
419 is_active = cpu == cpumask_first(cpu_online_mask);
420 else
421 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
422
423 /* Simple state machine */
424 do {
425 /* Chill out and ensure we re-read stopmachine_state. */
426 cpu_relax();
427 if (smdata->state != curstate) {
428 curstate = smdata->state;
429 switch (curstate) {
430 case STOPMACHINE_DISABLE_IRQ:
431 local_irq_disable();
432 hard_irq_disable();
433 break;
434 case STOPMACHINE_RUN:
435 if (is_active)
436 err = smdata->fn(smdata->data);
437 break;
438 default:
439 break;
440 }
441 ack_state(smdata);
442 }
443 } while (curstate != STOPMACHINE_EXIT);
444
445 local_irq_restore(flags);
446 return err;
447}
448
449int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 549int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
450{ 550{
451 struct stop_machine_data smdata = { .fn = fn, .data = data, 551 struct multi_stop_data msdata = {
452 .num_threads = num_online_cpus(), 552 .fn = fn,
453 .active_cpus = cpus }; 553 .data = data,
554 .num_threads = num_online_cpus(),
555 .active_cpus = cpus,
556 };
454 557
455 if (!stop_machine_initialized) { 558 if (!stop_machine_initialized) {
456 /* 559 /*
@@ -461,7 +564,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
461 unsigned long flags; 564 unsigned long flags;
462 int ret; 565 int ret;
463 566
464 WARN_ON_ONCE(smdata.num_threads != 1); 567 WARN_ON_ONCE(msdata.num_threads != 1);
465 568
466 local_irq_save(flags); 569 local_irq_save(flags);
467 hard_irq_disable(); 570 hard_irq_disable();
@@ -472,8 +575,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
472 } 575 }
473 576
474 /* Set the initial state and stop all online cpus. */ 577 /* Set the initial state and stop all online cpus. */
475 set_state(&smdata, STOPMACHINE_PREPARE); 578 set_state(&msdata, MULTI_STOP_PREPARE);
476 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 579 return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
477} 580}
478 581
479int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 582int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -513,25 +616,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
513int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, 616int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
514 const struct cpumask *cpus) 617 const struct cpumask *cpus)
515{ 618{
516 struct stop_machine_data smdata = { .fn = fn, .data = data, 619 struct multi_stop_data msdata = { .fn = fn, .data = data,
517 .active_cpus = cpus }; 620 .active_cpus = cpus };
518 struct cpu_stop_done done; 621 struct cpu_stop_done done;
519 int ret; 622 int ret;
520 623
521 /* Local CPU must be inactive and CPU hotplug in progress. */ 624 /* Local CPU must be inactive and CPU hotplug in progress. */
522 BUG_ON(cpu_active(raw_smp_processor_id())); 625 BUG_ON(cpu_active(raw_smp_processor_id()));
523 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ 626 msdata.num_threads = num_active_cpus() + 1; /* +1 for local */
524 627
525 /* No proper task established and can't sleep - busy wait for lock. */ 628 /* No proper task established and can't sleep - busy wait for lock. */
526 while (!mutex_trylock(&stop_cpus_mutex)) 629 while (!mutex_trylock(&stop_cpus_mutex))
527 cpu_relax(); 630 cpu_relax();
528 631
529 /* Schedule work on other CPUs and execute directly for local CPU */ 632 /* Schedule work on other CPUs and execute directly for local CPU */
530 set_state(&smdata, STOPMACHINE_PREPARE); 633 set_state(&msdata, MULTI_STOP_PREPARE);
531 cpu_stop_init_done(&done, num_active_cpus()); 634 cpu_stop_init_done(&done, num_active_cpus());
532 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, 635 queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
533 &done); 636 &done);
534 ret = stop_machine_cpu_stop(&smdata); 637 ret = multi_cpu_stop(&msdata);
535 638
536 /* Busy wait for completion. */ 639 /* Busy wait for completion. */
537 while (!completion_done(&done.completion)) 640 while (!completion_done(&done.completion))
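
The heart of the stop_machine.c rework is that the old stop_machine-only state machine becomes the generic multi_cpu_stop(): every participating CPU spins on a shared state word, performs the action for the state it just observed, and acknowledges it; the last acknowledger advances the machine (PREPARE -> DISABLE_IRQ -> RUN -> EXIT). Below is a userspace model of that handshake with two pthreads and C11 atomics; the interrupt-disabling step and the per-CPU stopper threads are left out, and the names are only meant to echo the kernel's.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum stop_state { STOP_NONE, STOP_PREPARE, STOP_RUN, STOP_EXIT };

struct stop_data {
        unsigned int num_threads;
        atomic_int state;
        atomic_int thread_ack;
};

static void set_state(struct stop_data *d, int newstate)
{
        /* Re-arm the ack counter before publishing the new state. */
        atomic_store(&d->thread_ack, (int)d->num_threads);
        atomic_store(&d->state, newstate);
}

/* The last thread to acknowledge a state moves everybody to the next one. */
static void ack_state(struct stop_data *d)
{
        if (atomic_fetch_sub(&d->thread_ack, 1) == 1)
                set_state(d, atomic_load(&d->state) + 1);
}

static void *stopper(void *arg)
{
        struct stop_data *d = arg;
        int curstate = STOP_NONE;

        do {
                int s = atomic_load(&d->state);

                if (s != curstate) {
                        curstate = s;
                        if (curstate == STOP_RUN)
                                puts("RUN step executed");
                        ack_state(d);
                }
        } while (curstate != STOP_EXIT);

        return NULL;
}

int main(void)
{
        struct stop_data d = { .num_threads = 2 };
        pthread_t t[2];

        set_state(&d, STOP_PREPARE);
        for (int i = 0; i < 2; i++)
                pthread_create(&t[i], NULL, stopper, &d);
        for (int i = 0; i < 2; i++)
                pthread_join(t[i], NULL);
        return 0;
}
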
diff --git a/kernel/sys.c b/kernel/sys.c
index 771129b299f8..c18ecca575b4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -337,7 +337,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
337 if (rgid != (gid_t) -1) { 337 if (rgid != (gid_t) -1) {
338 if (gid_eq(old->gid, krgid) || 338 if (gid_eq(old->gid, krgid) ||
339 gid_eq(old->egid, krgid) || 339 gid_eq(old->egid, krgid) ||
340 nsown_capable(CAP_SETGID)) 340 ns_capable(old->user_ns, CAP_SETGID))
341 new->gid = krgid; 341 new->gid = krgid;
342 else 342 else
343 goto error; 343 goto error;
@@ -346,7 +346,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
346 if (gid_eq(old->gid, kegid) || 346 if (gid_eq(old->gid, kegid) ||
347 gid_eq(old->egid, kegid) || 347 gid_eq(old->egid, kegid) ||
348 gid_eq(old->sgid, kegid) || 348 gid_eq(old->sgid, kegid) ||
349 nsown_capable(CAP_SETGID)) 349 ns_capable(old->user_ns, CAP_SETGID))
350 new->egid = kegid; 350 new->egid = kegid;
351 else 351 else
352 goto error; 352 goto error;
@@ -387,7 +387,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
387 old = current_cred(); 387 old = current_cred();
388 388
389 retval = -EPERM; 389 retval = -EPERM;
390 if (nsown_capable(CAP_SETGID)) 390 if (ns_capable(old->user_ns, CAP_SETGID))
391 new->gid = new->egid = new->sgid = new->fsgid = kgid; 391 new->gid = new->egid = new->sgid = new->fsgid = kgid;
392 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) 392 else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid))
393 new->egid = new->fsgid = kgid; 393 new->egid = new->fsgid = kgid;
@@ -471,7 +471,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
471 new->uid = kruid; 471 new->uid = kruid;
472 if (!uid_eq(old->uid, kruid) && 472 if (!uid_eq(old->uid, kruid) &&
473 !uid_eq(old->euid, kruid) && 473 !uid_eq(old->euid, kruid) &&
474 !nsown_capable(CAP_SETUID)) 474 !ns_capable(old->user_ns, CAP_SETUID))
475 goto error; 475 goto error;
476 } 476 }
477 477
@@ -480,7 +480,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
480 if (!uid_eq(old->uid, keuid) && 480 if (!uid_eq(old->uid, keuid) &&
481 !uid_eq(old->euid, keuid) && 481 !uid_eq(old->euid, keuid) &&
482 !uid_eq(old->suid, keuid) && 482 !uid_eq(old->suid, keuid) &&
483 !nsown_capable(CAP_SETUID)) 483 !ns_capable(old->user_ns, CAP_SETUID))
484 goto error; 484 goto error;
485 } 485 }
486 486
@@ -534,7 +534,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
534 old = current_cred(); 534 old = current_cred();
535 535
536 retval = -EPERM; 536 retval = -EPERM;
537 if (nsown_capable(CAP_SETUID)) { 537 if (ns_capable(old->user_ns, CAP_SETUID)) {
538 new->suid = new->uid = kuid; 538 new->suid = new->uid = kuid;
539 if (!uid_eq(kuid, old->uid)) { 539 if (!uid_eq(kuid, old->uid)) {
540 retval = set_user(new); 540 retval = set_user(new);
@@ -591,7 +591,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
591 old = current_cred(); 591 old = current_cred();
592 592
593 retval = -EPERM; 593 retval = -EPERM;
594 if (!nsown_capable(CAP_SETUID)) { 594 if (!ns_capable(old->user_ns, CAP_SETUID)) {
595 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && 595 if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
596 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) 596 !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
597 goto error; 597 goto error;
@@ -673,7 +673,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
673 old = current_cred(); 673 old = current_cred();
674 674
675 retval = -EPERM; 675 retval = -EPERM;
676 if (!nsown_capable(CAP_SETGID)) { 676 if (!ns_capable(old->user_ns, CAP_SETGID)) {
677 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && 677 if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
678 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) 678 !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
679 goto error; 679 goto error;
@@ -744,7 +744,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
744 744
745 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || 745 if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) ||
746 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || 746 uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) ||
747 nsown_capable(CAP_SETUID)) { 747 ns_capable(old->user_ns, CAP_SETUID)) {
748 if (!uid_eq(kuid, old->fsuid)) { 748 if (!uid_eq(kuid, old->fsuid)) {
749 new->fsuid = kuid; 749 new->fsuid = kuid;
750 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) 750 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -783,7 +783,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
783 783
784 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || 784 if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) ||
785 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || 785 gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) ||
786 nsown_capable(CAP_SETGID)) { 786 ns_capable(old->user_ns, CAP_SETGID)) {
787 if (!gid_eq(kgid, old->fsgid)) { 787 if (!gid_eq(kgid, old->fsgid)) {
788 new->fsgid = kgid; 788 new->fsgid = kgid;
789 goto change_okay; 789 goto change_okay;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 07f6fc468e17..36547dddcdb8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
190 190
191#ifdef CONFIG_MAGIC_SYSRQ 191#ifdef CONFIG_MAGIC_SYSRQ
 192/* Note: sysrq code uses its own private copy */ 192/* Note: sysrq code uses its own private copy */
193static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 193static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
194 194
195static int sysrq_sysctl_handler(ctl_table *table, int write, 195static int sysrq_sysctl_handler(ctl_table *table, int write,
196 void __user *buffer, size_t *lenp, 196 void __user *buffer, size_t *lenp,
@@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = {
371 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
372 }, 372 },
373 { 373 {
374 .procname = "numa_balancing_scan_period_reset",
375 .data = &sysctl_numa_balancing_scan_period_reset,
376 .maxlen = sizeof(unsigned int),
377 .mode = 0644,
378 .proc_handler = proc_dointvec,
379 },
380 {
381 .procname = "numa_balancing_scan_period_max_ms", 374 .procname = "numa_balancing_scan_period_max_ms",
382 .data = &sysctl_numa_balancing_scan_period_max, 375 .data = &sysctl_numa_balancing_scan_period_max,
383 .maxlen = sizeof(unsigned int), 376 .maxlen = sizeof(unsigned int),
@@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = {
391 .mode = 0644, 384 .mode = 0644,
392 .proc_handler = proc_dointvec, 385 .proc_handler = proc_dointvec,
393 }, 386 },
387 {
388 .procname = "numa_balancing_settle_count",
389 .data = &sysctl_numa_balancing_settle_count,
390 .maxlen = sizeof(unsigned int),
391 .mode = 0644,
392 .proc_handler = proc_dointvec,
393 },
394 {
395 .procname = "numa_balancing_migrate_deferred",
396 .data = &sysctl_numa_balancing_migrate_deferred,
397 .maxlen = sizeof(unsigned int),
398 .mode = 0644,
399 .proc_handler = proc_dointvec,
400 },
394#endif /* CONFIG_NUMA_BALANCING */ 401#endif /* CONFIG_NUMA_BALANCING */
395#endif /* CONFIG_SCHED_DEBUG */ 402#endif /* CONFIG_SCHED_DEBUG */
396 { 403 {
@@ -1049,6 +1056,7 @@ static struct ctl_table kern_table[] = {
1049 .maxlen = sizeof(sysctl_perf_event_sample_rate), 1056 .maxlen = sizeof(sysctl_perf_event_sample_rate),
1050 .mode = 0644, 1057 .mode = 0644,
1051 .proc_handler = perf_proc_update_handler, 1058 .proc_handler = perf_proc_update_handler,
1059 .extra1 = &one,
1052 }, 1060 },
1053 { 1061 {
1054 .procname = "perf_cpu_time_max_percent", 1062 .procname = "perf_cpu_time_max_percent",
@@ -1225,7 +1233,7 @@ static struct ctl_table vm_table[] = {
1225 .data = &hugepages_treat_as_movable, 1233 .data = &hugepages_treat_as_movable,
1226 .maxlen = sizeof(int), 1234 .maxlen = sizeof(int),
1227 .mode = 0644, 1235 .mode = 0644,
1228 .proc_handler = hugetlb_treat_movable_handler, 1236 .proc_handler = proc_dointvec,
1229 }, 1237 },
1230 { 1238 {
1231 .procname = "nr_overcommit_hugepages", 1239 .procname = "nr_overcommit_hugepages",
@@ -1471,14 +1479,14 @@ static struct ctl_table fs_table[] = {
1471 { 1479 {
1472 .procname = "inode-nr", 1480 .procname = "inode-nr",
1473 .data = &inodes_stat, 1481 .data = &inodes_stat,
1474 .maxlen = 2*sizeof(int), 1482 .maxlen = 2*sizeof(long),
1475 .mode = 0444, 1483 .mode = 0444,
1476 .proc_handler = proc_nr_inodes, 1484 .proc_handler = proc_nr_inodes,
1477 }, 1485 },
1478 { 1486 {
1479 .procname = "inode-state", 1487 .procname = "inode-state",
1480 .data = &inodes_stat, 1488 .data = &inodes_stat,
1481 .maxlen = 7*sizeof(int), 1489 .maxlen = 7*sizeof(long),
1482 .mode = 0444, 1490 .mode = 0444,
1483 .proc_handler = proc_nr_inodes, 1491 .proc_handler = proc_nr_inodes,
1484 }, 1492 },
@@ -1508,7 +1516,7 @@ static struct ctl_table fs_table[] = {
1508 { 1516 {
1509 .procname = "dentry-state", 1517 .procname = "dentry-state",
1510 .data = &dentry_stat, 1518 .data = &dentry_stat,
1511 .maxlen = 6*sizeof(int), 1519 .maxlen = 6*sizeof(long),
1512 .mode = 0444, 1520 .mode = 0444,
1513 .proc_handler = proc_nr_dentry, 1521 .proc_handler = proc_nr_dentry,
1514 }, 1522 },
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 65bd3c92d6f3..8727032e3a6f 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -4,6 +4,23 @@
4 4
5static struct callback_head work_exited; /* all we need is ->next == NULL */ 5static struct callback_head work_exited; /* all we need is ->next == NULL */
6 6
7/**
8 * task_work_add - ask the @task to execute @work->func()
9 * @task: the task which should run the callback
10 * @work: the callback to run
11 * @notify: send the notification if true
12 *
13 * Queue @work for task_work_run() below and notify the @task if @notify.
14 * Fails if the @task is exiting/exited and thus it can't process this @work.
15 * Otherwise @work->func() will be called when the @task returns from kernel
16 * mode or exits.
17 *
18 * This is like the signal handler which runs in kernel mode, but it doesn't
19 * try to wake up the @task.
20 *
21 * RETURNS:
22 * 0 if succeeds or -ESRCH.
23 */
7int 24int
8task_work_add(struct task_struct *task, struct callback_head *work, bool notify) 25task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
9{ 26{
@@ -21,11 +38,22 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
21 return 0; 38 return 0;
22} 39}
23 40
41/**
42 * task_work_cancel - cancel a pending work added by task_work_add()
43 * @task: the task which should execute the work
44 * @func: identifies the work to remove
45 *
46 * Find the last queued pending work with ->func == @func and remove
47 * it from queue.
48 *
49 * RETURNS:
50 * The found work or NULL if not found.
51 */
24struct callback_head * 52struct callback_head *
25task_work_cancel(struct task_struct *task, task_work_func_t func) 53task_work_cancel(struct task_struct *task, task_work_func_t func)
26{ 54{
27 struct callback_head **pprev = &task->task_works; 55 struct callback_head **pprev = &task->task_works;
28 struct callback_head *work = NULL; 56 struct callback_head *work;
29 unsigned long flags; 57 unsigned long flags;
30 /* 58 /*
31 * If cmpxchg() fails we continue without updating pprev. 59 * If cmpxchg() fails we continue without updating pprev.
@@ -35,7 +63,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
35 */ 63 */
36 raw_spin_lock_irqsave(&task->pi_lock, flags); 64 raw_spin_lock_irqsave(&task->pi_lock, flags);
37 while ((work = ACCESS_ONCE(*pprev))) { 65 while ((work = ACCESS_ONCE(*pprev))) {
38 read_barrier_depends(); 66 smp_read_barrier_depends();
39 if (work->func != func) 67 if (work->func != func)
40 pprev = &work->next; 68 pprev = &work->next;
41 else if (cmpxchg(pprev, work, work->next) == work) 69 else if (cmpxchg(pprev, work, work->next) == work)
@@ -46,6 +74,14 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
46 return work; 74 return work;
47} 75}
48 76
77/**
78 * task_work_run - execute the works added by task_work_add()
79 *
80 * Flush the pending works. Should be used by the core kernel code.
81 * Called before the task returns to the user-mode or stops, or when
82 * it exits. In the latter case task_work_add() can no longer add the
83 * new work after task_work_run() returns.
84 */
49void task_work_run(void) 85void task_work_run(void)
50{ 86{
51 struct task_struct *task = current; 87 struct task_struct *task = current;
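
The new kernel-doc above spells out the contract: task_work_add() queues a callback_head onto the task, task_work_run() flushes the queue when the task heads back to user mode or exits, and task_work_cancel() can pluck a pending entry back out. The queue itself is a cmpxchg-managed singly linked list; the userspace sketch below reproduces only the add/run shape of it with C11 atomics (no exit sentinel, no pi_lock coordination, and callbacks simply run newest-first here).

#include <stdatomic.h>
#include <stdio.h>

struct callback_head {
        struct callback_head *next;
        void (*func)(struct callback_head *);
};

static _Atomic(struct callback_head *) task_works;

/* Lock-free push: the shape of the cmpxchg loop task_work_add() uses. */
static void work_add(struct callback_head *work)
{
        struct callback_head *head = atomic_load(&task_works);

        do {
                work->next = head;
        } while (!atomic_compare_exchange_weak(&task_works, &head, work));
}

/* Detach the whole list and run the callbacks (newest first in this sketch). */
static void work_run(void)
{
        struct callback_head *work = atomic_exchange(&task_works, NULL);

        while (work) {
                struct callback_head *next = work->next;

                work->func(work);
                work = next;
        }
}

static void say_hello(struct callback_head *cb)
{
        (void)cb;
        puts("task work ran");
}

int main(void)
{
        struct callback_head cb = { .func = say_hello };

        work_add(&cb);
        work_run();
        return 0;
}
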
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 70f27e89012b..3ce6e8c5f3fc 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -100,12 +100,11 @@ config NO_HZ_FULL
100 # RCU_USER_QS dependency 100 # RCU_USER_QS dependency
101 depends on HAVE_CONTEXT_TRACKING 101 depends on HAVE_CONTEXT_TRACKING
102 # VIRT_CPU_ACCOUNTING_GEN dependency 102 # VIRT_CPU_ACCOUNTING_GEN dependency
103 depends on 64BIT 103 depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
104 select NO_HZ_COMMON 104 select NO_HZ_COMMON
105 select RCU_USER_QS 105 select RCU_USER_QS
106 select RCU_NOCB_CPU 106 select RCU_NOCB_CPU
107 select VIRT_CPU_ACCOUNTING_GEN 107 select VIRT_CPU_ACCOUNTING_GEN
108 select CONTEXT_TRACKING_FORCE
109 select IRQ_WORK 108 select IRQ_WORK
110 help 109 help
111 Adaptively try to shutdown the tick whenever possible, even when 110 Adaptively try to shutdown the tick whenever possible, even when
@@ -134,6 +133,56 @@ config NO_HZ_FULL_ALL
134 Note the boot CPU will still be kept outside the range to 133 Note the boot CPU will still be kept outside the range to
135 handle the timekeeping duty. 134 handle the timekeeping duty.
136 135
136config NO_HZ_FULL_SYSIDLE
137 bool "Detect full-system idle state for full dynticks system"
138 depends on NO_HZ_FULL
139 default n
140 help
141 At least one CPU must keep the scheduling-clock tick running for
142 timekeeping purposes whenever there is a non-idle CPU, where
143 "non-idle" also includes dynticks CPUs as long as they are
144 running non-idle tasks. Because the underlying adaptive-tick
145 support cannot distinguish between all CPUs being idle and
146 all CPUs each running a single task in dynticks mode, the
147 underlying support simply ensures that there is always a CPU
148 handling the scheduling-clock tick, whether or not all CPUs
149 are idle. This Kconfig option enables scalable detection of
150 the all-CPUs-idle state, thus allowing the scheduling-clock
151 tick to be disabled when all CPUs are idle. Note that scalable
152 detection of the all-CPUs-idle state means that larger systems
153 will be slower to declare the all-CPUs-idle state.
154
155 Say Y if you would like to help debug all-CPUs-idle detection.
156
157 Say N if you are unsure.
158
159config NO_HZ_FULL_SYSIDLE_SMALL
160 int "Number of CPUs above which large-system approach is used"
161 depends on NO_HZ_FULL_SYSIDLE
162 range 1 NR_CPUS
163 default 8
164 help
165 The full-system idle detection mechanism takes a lazy approach
166 on large systems, as is required to attain decent scalability.
167 However, on smaller systems, scalability is not anywhere near as
168 large a concern as is energy efficiency. The sysidle subsystem
169 therefore uses a fast but non-scalable algorithm for small
170 systems and a lazier but scalable algorithm for large systems.
171 This Kconfig parameter defines the number of CPUs in the largest
172 system that will be considered to be "small".
173
174 The default value will be fine in most cases. Battery-powered
175 systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
176 numbers of CPUs, and (3) are suffering from battery-lifetime
177 problems due to long sysidle latencies might wish to experiment
178 with larger values for this Kconfig parameter. On the other
179 hand, they might be even better served by disabling NO_HZ_FULL
180 entirely, given that NO_HZ_FULL is intended for HPC and
181 real-time workloads that at present do not tend to be run on
182 battery-powered systems.
183
184 Take the default if you are unsure.
185
137config NO_HZ 186config NO_HZ
138 bool "Old Idle dynticks config" 187 bool "Old Idle dynticks config"
139 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS 188 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index eec50fcef9e4..88c9c65a430d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
490 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; 490 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
491 491
492 if (!alarmtimer_get_rtcdev()) 492 if (!alarmtimer_get_rtcdev())
493 return -ENOTSUPP; 493 return -EINVAL;
494 494
495 return hrtimer_get_res(baseid, tp); 495 return hrtimer_get_res(baseid, tp);
496} 496}
@@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
507 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; 507 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
508 508
509 if (!alarmtimer_get_rtcdev()) 509 if (!alarmtimer_get_rtcdev())
510 return -ENOTSUPP; 510 return -EINVAL;
511 511
512 *tp = ktime_to_timespec(base->gettime()); 512 *tp = ktime_to_timespec(base->gettime());
513 return 0; 513 return 0;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 38959c866789..086ad6043bcb 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -33,29 +33,64 @@ struct ce_unbind {
33 int res; 33 int res;
34}; 34};
35 35
36/** 36static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
37 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 37 bool ismax)
38 * @latch: value to convert
39 * @evt: pointer to clock event device descriptor
40 *
41 * Math helper, returns latch value converted to nanoseconds (bound checked)
42 */
43u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
44{ 38{
45 u64 clc = (u64) latch << evt->shift; 39 u64 clc = (u64) latch << evt->shift;
40 u64 rnd;
46 41
47 if (unlikely(!evt->mult)) { 42 if (unlikely(!evt->mult)) {
48 evt->mult = 1; 43 evt->mult = 1;
49 WARN_ON(1); 44 WARN_ON(1);
50 } 45 }
46 rnd = (u64) evt->mult - 1;
47
48 /*
49 * Upper bound sanity check. If the backwards conversion is
50 * not equal latch, we know that the above shift overflowed.
51 */
52 if ((clc >> evt->shift) != (u64)latch)
53 clc = ~0ULL;
54
55 /*
56 * Scaled math oddities:
57 *
58 * For mult <= (1 << shift) we can safely add mult - 1 to
59 * prevent integer rounding loss. So the backwards conversion
60 * from nsec to device ticks will be correct.
61 *
62 * For mult > (1 << shift), i.e. device frequency is > 1GHz we
63 * need to be careful. Adding mult - 1 will result in a value
64 * which when converted back to device ticks can be larger
65 * than latch by up to (mult - 1) >> shift. For the min_delta
66 * calculation we still want to apply this in order to stay
67 * above the minimum device ticks limit. For the upper limit
68 * we would end up with a latch value larger than the upper
69 * limit of the device, so we omit the add to stay below the
70 * device upper boundary.
71 *
72 * Also omit the add if it would overflow the u64 boundary.
73 */
74 if ((~0ULL - clc > rnd) &&
75 (!ismax || evt->mult <= (1U << evt->shift)))
76 clc += rnd;
51 77
52 do_div(clc, evt->mult); 78 do_div(clc, evt->mult);
53 if (clc < 1000)
54 clc = 1000;
55 if (clc > KTIME_MAX)
56 clc = KTIME_MAX;
57 79
58 return clc; 80 /* Deltas less than 1usec are pointless noise */
81 return clc > 1000 ? clc : 1000;
82}
83
84/**
85 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
86 * @latch: value to convert
87 * @evt: pointer to clock event device descriptor
88 *
89 * Math helper, returns latch value converted to nanoseconds (bound checked)
90 */
91u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
92{
93 return cev_delta2ns(latch, evt, false);
59} 94}
60EXPORT_SYMBOL_GPL(clockevent_delta2ns); 95EXPORT_SYMBOL_GPL(clockevent_delta2ns);
61 96
@@ -380,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq)
380 sec = 600; 415 sec = 600;
381 416
382 clockevents_calc_mult_shift(dev, freq, sec); 417 clockevents_calc_mult_shift(dev, freq, sec);
383 dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); 418 dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false);
384 dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); 419 dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true);
385} 420}
386 421
387/** 422/**
@@ -584,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev,
584 const char *buf, size_t count) 619 const char *buf, size_t count)
585{ 620{
586 char name[CS_NAME_LEN]; 621 char name[CS_NAME_LEN];
587 size_t ret = sysfs_get_uname(buf, name, count); 622 ssize_t ret = sysfs_get_uname(buf, name, count);
588 struct clock_event_device *ce; 623 struct clock_event_device *ce;
589 624
590 if (ret < 0) 625 if (ret < 0)
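
The new cev_delta2ns() helper above folds the old clockevent_delta2ns() math together with a rounding rule: mult - 1 is added before the divide so that converting the result back to device ticks never drops below the device minimum, but the add is skipped for the max bound on >1GHz devices (mult > 1 << shift) and whenever it would overflow. A minimal userspace sketch of that arithmetic, with made-up mult/shift values (roughly a 73MHz event device):

/* Minimal userspace sketch of the cev_delta2ns() rounding rules above;
 * the mult/shift values are made up (roughly a 73MHz event device). */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static uint64_t delta2ns(unsigned long latch, uint32_t mult, uint32_t shift, bool ismax)
{
    uint64_t clc = (uint64_t)latch << shift;
    uint64_t rnd = (uint64_t)mult - 1;

    /* If the shift lost bits, the latch does not convert back; saturate. */
    if ((clc >> shift) != (uint64_t)latch)
        clc = ~0ULL;

    /* Round up, except for the max bound on >1GHz devices, or on overflow. */
    if ((~0ULL - clc > rnd) && (!ismax || mult <= (1U << shift)))
        clc += rnd;

    clc /= mult;

    return clc > 1000 ? clc : 1000; /* deltas under 1usec are noise */
}

int main(void)
{
    uint32_t mult = 75, shift = 10; /* ~13.65ns per device tick */

    printf("min_delta_ns for 2 ticks:          %llu\n",
           (unsigned long long)delta2ns(2, mult, shift, false));
    printf("max_delta_ns for 0xffffffff ticks: %llu\n",
           (unsigned long long)delta2ns(0xffffffff, mult, shift, true));
    return 0;
}

With ismax=false the 1000ns clamp shows up for tiny latch values; with ismax=true the rounding add is what gets dropped whenever mult exceeds 1 << shift.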
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 50a8736757f3..ba3e502c955a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
479static inline void clocksource_resume_watchdog(void) { } 479static inline void clocksource_resume_watchdog(void) { }
480static inline int __clocksource_watchdog_kthread(void) { return 0; } 480static inline int __clocksource_watchdog_kthread(void) { return 0; }
481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } 481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
482void clocksource_mark_unstable(struct clocksource *cs) { }
482 483
483#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 484#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
484 485
@@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
537} 538}
538 539
539/** 540/**
540 * clocksource_max_deferment - Returns max time the clocksource can be deferred 541 * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
541 * @cs: Pointer to clocksource 542 * @mult: cycle to nanosecond multiplier
542 * 543 * @shift: cycle to nanosecond divisor (power of two)
544 * @maxadj: maximum adjustment value to mult (~11%)
545 * @mask: bitmask for two's complement subtraction of non 64 bit counters
543 */ 546 */
544static u64 clocksource_max_deferment(struct clocksource *cs) 547u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
545{ 548{
546 u64 max_nsecs, max_cycles; 549 u64 max_nsecs, max_cycles;
547 550
548 /* 551 /*
549 * Calculate the maximum number of cycles that we can pass to the 552 * Calculate the maximum number of cycles that we can pass to the
550 * cyc2ns function without overflowing a 64-bit signed result. The 553 * cyc2ns function without overflowing a 64-bit signed result. The
551 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) 554 * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
552 * which is equivalent to the below. 555 * which is equivalent to the below.
553 * max_cycles < (2^63)/(cs->mult + cs->maxadj) 556 * max_cycles < (2^63)/(mult + maxadj)
554 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) 557 * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
555 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) 558 * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
556 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) 559 * max_cycles < 2^(63 - log2(mult + maxadj))
557 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) 560 * max_cycles < 1 << (63 - log2(mult + maxadj))
558 * Please note that we add 1 to the result of the log2 to account for 561 * Please note that we add 1 to the result of the log2 to account for
559 * any rounding errors, ensure the above inequality is satisfied and 562 * any rounding errors, ensure the above inequality is satisfied and
560 * no overflow will occur. 563 * no overflow will occur.
561 */ 564 */
562 max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); 565 max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
563 566
564 /* 567 /*
565 * The actual maximum number of cycles we can defer the clocksource is 568 * The actual maximum number of cycles we can defer the clocksource is
566 * determined by the minimum of max_cycles and cs->mask. 569 * determined by the minimum of max_cycles and mask.
567 * Note: Here we subtract the maxadj to make sure we don't sleep for 570 * Note: Here we subtract the maxadj to make sure we don't sleep for
568 * too long if there's a large negative adjustment. 571 * too long if there's a large negative adjustment.
569 */ 572 */
570 max_cycles = min_t(u64, max_cycles, (u64) cs->mask); 573 max_cycles = min(max_cycles, mask);
571 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, 574 max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
572 cs->shift); 575
576 return max_nsecs;
577}
578
579/**
580 * clocksource_max_deferment - Returns max time the clocksource can be deferred
581 * @cs: Pointer to clocksource
582 *
583 */
584static u64 clocksource_max_deferment(struct clocksource *cs)
585{
586 u64 max_nsecs;
573 587
588 max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
589 cs->mask);
574 /* 590 /*
575 * To ensure that the clocksource does not wrap whilst we are idle, 591 * To ensure that the clocksource does not wrap whilst we are idle,
576 * limit the time the clocksource can be deferred by 12.5%. Please 592 * limit the time the clocksource can be deferred by 12.5%. Please
@@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev,
893 return count; 909 return count;
894} 910}
895 911
896size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) 912ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
897{ 913{
898 size_t ret = cnt; 914 size_t ret = cnt;
899 915
@@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
924 struct device_attribute *attr, 940 struct device_attribute *attr,
925 const char *buf, size_t count) 941 const char *buf, size_t count)
926{ 942{
927 size_t ret; 943 ssize_t ret;
928 944
929 mutex_lock(&clocksource_mutex); 945 mutex_lock(&clocksource_mutex);
930 946
@@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
952{ 968{
953 struct clocksource *cs; 969 struct clocksource *cs;
954 char name[CS_NAME_LEN]; 970 char name[CS_NAME_LEN];
955 size_t ret; 971 ssize_t ret;
956 972
957 ret = sysfs_get_uname(buf, name, count); 973 ret = sysfs_get_uname(buf, name, count);
958 if (ret < 0) 974 if (ret < 0)
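
clocks_calc_max_nsecs() above is the old clocksource_max_deferment() body split out so it works on raw mult/shift/maxadj/mask values; sched_clock reuses it in the sched_clock.c hunk further down. A standalone sketch of the same bound, with illustrative parameters for a 1MHz counter behind a 32-bit mask:

/* Userspace sketch of the clocks_calc_max_nsecs() bound; parameters are
 * illustrative (roughly a 1MHz counter with a 32-bit mask). */
#include <stdio.h>
#include <stdint.h>

static int ilog2_u32(uint32_t v)
{
    int l = -1;

    while (v) {
        v >>= 1;
        l++;
    }
    return l;
}

static uint64_t calc_max_nsecs(uint32_t mult, uint32_t shift, uint32_t maxadj, uint64_t mask)
{
    /* Largest cycle count whose cyc2ns() result stays below 2^63. */
    uint64_t max_cycles = 1ULL << (63 - (ilog2_u32(mult + maxadj) + 1));

    if (max_cycles > mask)
        max_cycles = mask;

    /* Convert with the worst-case (smallest) multiplier. */
    return (max_cycles * (mult - maxadj)) >> shift;
}

int main(void)
{
    uint32_t mult = 1000u << 20, shift = 20;  /* 1 cycle ~ 1000ns */
    uint32_t maxadj = mult / 9;               /* ~11% adjustment headroom */
    uint64_t mask = 0xffffffffULL;

    printf("max deferment: %llu ns\n",
           (unsigned long long)calc_max_nsecs(mult, shift, maxadj, mask));
    return 0;
}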
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8f5b3b98577b..af8d1d4f3d55 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work)
475 * called as close as possible to 500 ms before the new second starts. 475 * called as close as possible to 500 ms before the new second starts.
476 * This code is run on a timer. If the clock is set, that timer 476 * This code is run on a timer. If the clock is set, that timer
477 * may not expire at the correct time. Thus, we adjust... 477 * may not expire at the correct time. Thus, we adjust...
478 * We want the clock to be within a couple of ticks from the target.
478 */ 479 */
479 if (!ntp_synced()) { 480 if (!ntp_synced()) {
480 /* 481 /*
@@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work)
485 } 486 }
486 487
487 getnstimeofday(&now); 488 getnstimeofday(&now);
488 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { 489 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
489 struct timespec adjust = now; 490 struct timespec adjust = now;
490 491
491 fail = -ENODEV; 492 fail = -ENODEV;
@@ -516,13 +517,13 @@ static void sync_cmos_clock(struct work_struct *work)
516 schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); 517 schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
517} 518}
518 519
519static void notify_cmos_timer(void) 520void ntp_notify_cmos_timer(void)
520{ 521{
521 schedule_delayed_work(&sync_cmos_work, 0); 522 schedule_delayed_work(&sync_cmos_work, 0);
522} 523}
523 524
524#else 525#else
525static inline void notify_cmos_timer(void) { } 526void ntp_notify_cmos_timer(void) { }
526#endif 527#endif
527 528
528 529
@@ -687,8 +688,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
687 if (!(time_status & STA_NANO)) 688 if (!(time_status & STA_NANO))
688 txc->time.tv_usec /= NSEC_PER_USEC; 689 txc->time.tv_usec /= NSEC_PER_USEC;
689 690
690 notify_cmos_timer();
691
692 return result; 691 return result;
693} 692}
694 693
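
Two things change in ntp.c: the CMOS update window grows from half a tick to five ticks around the half-second mark, and the work is now kicked from the timekeeping side via ntp_notify_cmos_timer() instead of unconditionally at the end of __do_adjtimex(). A small sketch of the widened window test, assuming HZ=100 so one tick is 10ms:

/* Sketch of the widened sync_cmos_clock() window test; HZ=100 is assumed,
 * so one tick is 10ms and the window is now +/-50ms around the half second. */
#include <stdio.h>
#include <stdlib.h>

#define NSEC_PER_SEC    1000000000L
#define HZ              100
#define TICK_NSEC       (NSEC_PER_SEC / HZ)

static int in_cmos_update_window(long tv_nsec)
{
    /* Old rule: within half a tick of the half second; new rule: five ticks. */
    return labs(tv_nsec - (NSEC_PER_SEC / 2)) <= TICK_NSEC * 5;
}

int main(void)
{
    printf("490ms into the second: %d\n", in_cmos_update_window(490000000L)); /* 1 */
    printf("560ms into the second: %d\n", in_cmos_update_window(560000000L)); /* 0 */
    return 0;
}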
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index a326f27d7f09..68b799375981 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,25 +8,28 @@
8#include <linux/clocksource.h> 8#include <linux/clocksource.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/jiffies.h> 10#include <linux/jiffies.h>
11#include <linux/ktime.h>
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/syscore_ops.h> 15#include <linux/syscore_ops.h>
15#include <linux/timer.h> 16#include <linux/hrtimer.h>
16#include <linux/sched_clock.h> 17#include <linux/sched_clock.h>
18#include <linux/seqlock.h>
19#include <linux/bitops.h>
17 20
18struct clock_data { 21struct clock_data {
22 ktime_t wrap_kt;
19 u64 epoch_ns; 23 u64 epoch_ns;
20 u32 epoch_cyc; 24 u64 epoch_cyc;
21 u32 epoch_cyc_copy; 25 seqcount_t seq;
22 unsigned long rate; 26 unsigned long rate;
23 u32 mult; 27 u32 mult;
24 u32 shift; 28 u32 shift;
25 bool suspended; 29 bool suspended;
26}; 30};
27 31
28static void sched_clock_poll(unsigned long wrap_ticks); 32static struct hrtimer sched_clock_timer;
29static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
30static int irqtime = -1; 33static int irqtime = -1;
31 34
32core_param(irqtime, irqtime, int, 0400); 35core_param(irqtime, irqtime, int, 0400);
@@ -35,42 +38,46 @@ static struct clock_data cd = {
35 .mult = NSEC_PER_SEC / HZ, 38 .mult = NSEC_PER_SEC / HZ,
36}; 39};
37 40
38static u32 __read_mostly sched_clock_mask = 0xffffffff; 41static u64 __read_mostly sched_clock_mask;
39 42
40static u32 notrace jiffy_sched_clock_read(void) 43static u64 notrace jiffy_sched_clock_read(void)
41{ 44{
42 return (u32)(jiffies - INITIAL_JIFFIES); 45 /*
46 * We don't need to use get_jiffies_64 on 32-bit arches here
47 * because we register with BITS_PER_LONG
48 */
49 return (u64)(jiffies - INITIAL_JIFFIES);
43} 50}
44 51
45static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; 52static u32 __read_mostly (*read_sched_clock_32)(void);
53
54static u64 notrace read_sched_clock_32_wrapper(void)
55{
56 return read_sched_clock_32();
57}
58
59static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
46 60
47static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) 61static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
48{ 62{
49 return (cyc * mult) >> shift; 63 return (cyc * mult) >> shift;
50} 64}
51 65
52static unsigned long long notrace sched_clock_32(void) 66unsigned long long notrace sched_clock(void)
53{ 67{
54 u64 epoch_ns; 68 u64 epoch_ns;
55 u32 epoch_cyc; 69 u64 epoch_cyc;
56 u32 cyc; 70 u64 cyc;
71 unsigned long seq;
57 72
58 if (cd.suspended) 73 if (cd.suspended)
59 return cd.epoch_ns; 74 return cd.epoch_ns;
60 75
61 /*
62 * Load the epoch_cyc and epoch_ns atomically. We do this by
63 * ensuring that we always write epoch_cyc, epoch_ns and
64 * epoch_cyc_copy in strict order, and read them in strict order.
65 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
66 * the middle of an update, and we should repeat the load.
67 */
68 do { 76 do {
77 seq = read_seqcount_begin(&cd.seq);
69 epoch_cyc = cd.epoch_cyc; 78 epoch_cyc = cd.epoch_cyc;
70 smp_rmb();
71 epoch_ns = cd.epoch_ns; 79 epoch_ns = cd.epoch_ns;
72 smp_rmb(); 80 } while (read_seqcount_retry(&cd.seq, seq));
73 } while (epoch_cyc != cd.epoch_cyc_copy);
74 81
75 cyc = read_sched_clock(); 82 cyc = read_sched_clock();
76 cyc = (cyc - epoch_cyc) & sched_clock_mask; 83 cyc = (cyc - epoch_cyc) & sched_clock_mask;
@@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void)
83static void notrace update_sched_clock(void) 90static void notrace update_sched_clock(void)
84{ 91{
85 unsigned long flags; 92 unsigned long flags;
86 u32 cyc; 93 u64 cyc;
87 u64 ns; 94 u64 ns;
88 95
89 cyc = read_sched_clock(); 96 cyc = read_sched_clock();
90 ns = cd.epoch_ns + 97 ns = cd.epoch_ns +
91 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, 98 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
92 cd.mult, cd.shift); 99 cd.mult, cd.shift);
93 /* 100
94 * Write epoch_cyc and epoch_ns in a way that the update is
95 * detectable in cyc_to_fixed_sched_clock().
96 */
97 raw_local_irq_save(flags); 101 raw_local_irq_save(flags);
98 cd.epoch_cyc_copy = cyc; 102 write_seqcount_begin(&cd.seq);
99 smp_wmb();
100 cd.epoch_ns = ns; 103 cd.epoch_ns = ns;
101 smp_wmb();
102 cd.epoch_cyc = cyc; 104 cd.epoch_cyc = cyc;
105 write_seqcount_end(&cd.seq);
103 raw_local_irq_restore(flags); 106 raw_local_irq_restore(flags);
104} 107}
105 108
106static void sched_clock_poll(unsigned long wrap_ticks) 109static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
107{ 110{
108 mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
109 update_sched_clock(); 111 update_sched_clock();
112 hrtimer_forward_now(hrt, cd.wrap_kt);
113 return HRTIMER_RESTART;
110} 114}
111 115
112void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) 116void __init sched_clock_register(u64 (*read)(void), int bits,
117 unsigned long rate)
113{ 118{
114 unsigned long r, w; 119 unsigned long r;
115 u64 res, wrap; 120 u64 res, wrap;
116 char r_unit; 121 char r_unit;
117 122
118 if (cd.rate > rate) 123 if (cd.rate > rate)
119 return; 124 return;
120 125
121 BUG_ON(bits > 32);
122 WARN_ON(!irqs_disabled()); 126 WARN_ON(!irqs_disabled());
123 read_sched_clock = read; 127 read_sched_clock = read;
124 sched_clock_mask = (1 << bits) - 1; 128 sched_clock_mask = CLOCKSOURCE_MASK(bits);
125 cd.rate = rate; 129 cd.rate = rate;
126 130
127 /* calculate the mult/shift to convert counter ticks to ns. */ 131 /* calculate the mult/shift to convert counter ticks to ns. */
128 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); 132 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600);
129 133
130 r = rate; 134 r = rate;
131 if (r >= 4000000) { 135 if (r >= 4000000) {
@@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
138 r_unit = ' '; 142 r_unit = ' ';
139 143
140 /* calculate how many ns until we wrap */ 144 /* calculate how many ns until we wrap */
141 wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); 145 wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
142 do_div(wrap, NSEC_PER_MSEC); 146 cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
143 w = wrap;
144 147
145 /* calculate the ns resolution of this counter */ 148 /* calculate the ns resolution of this counter */
146 res = cyc_to_ns(1ULL, cd.mult, cd.shift); 149 res = cyc_to_ns(1ULL, cd.mult, cd.shift);
147 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", 150 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
148 bits, r, r_unit, res, w); 151 bits, r, r_unit, res, wrap);
149 152
150 /*
151 * Start the timer to keep sched_clock() properly updated and
152 * sets the initial epoch.
153 */
154 sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
155 update_sched_clock(); 153 update_sched_clock();
156 154
157 /* 155 /*
@@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
166 pr_debug("Registered %pF as sched_clock source\n", read); 164 pr_debug("Registered %pF as sched_clock source\n", read);
167} 165}
168 166
169unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; 167void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
170
171unsigned long long notrace sched_clock(void)
172{ 168{
173 return sched_clock_func(); 169 read_sched_clock_32 = read;
170 sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
174} 171}
175 172
176void __init sched_clock_postinit(void) 173void __init sched_clock_postinit(void)
@@ -180,14 +177,22 @@ void __init sched_clock_postinit(void)
180 * make it the final one one. 177 * make it the final one one.
181 */ 178 */
182 if (read_sched_clock == jiffy_sched_clock_read) 179 if (read_sched_clock == jiffy_sched_clock_read)
183 setup_sched_clock(jiffy_sched_clock_read, 32, HZ); 180 sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
184 181
185 sched_clock_poll(sched_clock_timer.data); 182 update_sched_clock();
183
184 /*
185 * Start the timer to keep sched_clock() properly updated and
186 * sets the initial epoch.
187 */
188 hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
189 sched_clock_timer.function = sched_clock_poll;
190 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
186} 191}
187 192
188static int sched_clock_suspend(void) 193static int sched_clock_suspend(void)
189{ 194{
190 sched_clock_poll(sched_clock_timer.data); 195 sched_clock_poll(&sched_clock_timer);
191 cd.suspended = true; 196 cd.suspended = true;
192 return 0; 197 return 0;
193} 198}
@@ -195,7 +200,6 @@ static int sched_clock_suspend(void)
195static void sched_clock_resume(void) 200static void sched_clock_resume(void)
196{ 201{
197 cd.epoch_cyc = read_sched_clock(); 202 cd.epoch_cyc = read_sched_clock();
198 cd.epoch_cyc_copy = cd.epoch_cyc;
199 cd.suspended = false; 203 cd.suspended = false;
200} 204}
201 205
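
The sched_clock.c rework drops the epoch_cyc_copy double-write trick in favour of a seqcount: the writer bumps cd.seq around the epoch_ns/epoch_cyc stores and readers retry if they observe the count change. A hand-rolled userspace analogue of that read/retry pattern, using seq_cst C11 atomics throughout for simplicity (the kernel's write_seqcount_begin/end pairs use lighter barriers):

/* Hand-rolled userspace analogue of the seqcount pattern above; everything
 * uses seq_cst C11 atomics for simplicity, unlike the kernel's seqcount. */
#include <stdio.h>
#include <stdint.h>
#include <stdatomic.h>

struct clock_data {
    _Atomic unsigned int seq;   /* even: stable, odd: update in progress */
    _Atomic uint64_t epoch_ns;
    _Atomic uint64_t epoch_cyc;
};

static void update_epoch(struct clock_data *cd, uint64_t cyc, uint64_t ns)
{
    atomic_fetch_add(&cd->seq, 1);      /* begin: seq becomes odd */
    atomic_store(&cd->epoch_cyc, cyc);
    atomic_store(&cd->epoch_ns, ns);
    atomic_fetch_add(&cd->seq, 1);      /* end: seq becomes even again */
}

static void read_epoch(struct clock_data *cd, uint64_t *cyc, uint64_t *ns)
{
    unsigned int seq;

    do {
        do {
            seq = atomic_load(&cd->seq);
        } while (seq & 1);              /* writer in progress, retry */
        *cyc = atomic_load(&cd->epoch_cyc);
        *ns = atomic_load(&cd->epoch_ns);
    } while (atomic_load(&cd->seq) != seq); /* raced with an update, retry */
}

int main(void)
{
    struct clock_data cd = { 0 };
    uint64_t cyc, ns;

    update_epoch(&cd, 12345, 67890);
    read_epoch(&cd, &cyc, &ns);
    printf("epoch_cyc=%llu epoch_ns=%llu\n",
           (unsigned long long)cyc, (unsigned long long)ns);
    return 0;
}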
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 218bcb565fed..9532690daaa9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,
70 struct clock_event_device *newdev) 70 struct clock_event_device *newdev)
71{ 71{
72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || 72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
73 (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
73 (newdev->features & CLOCK_EVT_FEAT_C3STOP)) 74 (newdev->features & CLOCK_EVT_FEAT_C3STOP))
74 return false; 75 return false;
75 76
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index bc906cad709b..18e71f7fbc2a 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev);
31 31
32extern void clockevents_shutdown(struct clock_event_device *dev); 32extern void clockevents_shutdown(struct clock_event_device *dev);
33 33
34extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); 34extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
35 35
36/* 36/*
37 * NO_HZ / high resolution timer shared code 37 * NO_HZ / high resolution timer shared code
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e77edc97e036..3612fc77f834 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -23,6 +23,7 @@
23#include <linux/irq_work.h> 23#include <linux/irq_work.h>
24#include <linux/posix-timers.h> 24#include <linux/posix-timers.h>
25#include <linux/perf_event.h> 25#include <linux/perf_event.h>
26#include <linux/context_tracking.h>
26 27
27#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
28 29
@@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
148} 149}
149 150
150#ifdef CONFIG_NO_HZ_FULL 151#ifdef CONFIG_NO_HZ_FULL
151static cpumask_var_t nohz_full_mask; 152cpumask_var_t tick_nohz_full_mask;
152bool have_nohz_full_mask; 153bool tick_nohz_full_running;
153 154
154static bool can_stop_full_tick(void) 155static bool can_stop_full_tick(void)
155{ 156{
@@ -182,7 +183,8 @@ static bool can_stop_full_tick(void)
182 * Don't allow the user to think they can get 183 * Don't allow the user to think they can get
183 * full NO_HZ with this machine. 184 * full NO_HZ with this machine.
184 */ 185 */
185 WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock"); 186 WARN_ONCE(tick_nohz_full_running,
187 "NO_HZ FULL will not work with unstable sched clock");
186 return false; 188 return false;
187 } 189 }
188#endif 190#endif
@@ -196,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
196 * Re-evaluate the need for the tick on the current CPU 198 * Re-evaluate the need for the tick on the current CPU
197 * and restart it if necessary. 199 * and restart it if necessary.
198 */ 200 */
199void tick_nohz_full_check(void) 201void __tick_nohz_full_check(void)
200{ 202{
201 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 203 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
202 204
@@ -210,7 +212,7 @@ void tick_nohz_full_check(void)
210 212
211static void nohz_full_kick_work_func(struct irq_work *work) 213static void nohz_full_kick_work_func(struct irq_work *work)
212{ 214{
213 tick_nohz_full_check(); 215 __tick_nohz_full_check();
214} 216}
215 217
216static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { 218static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -229,7 +231,7 @@ void tick_nohz_full_kick(void)
229 231
230static void nohz_full_kick_ipi(void *info) 232static void nohz_full_kick_ipi(void *info)
231{ 233{
232 tick_nohz_full_check(); 234 __tick_nohz_full_check();
233} 235}
234 236
235/* 237/*
@@ -238,12 +240,13 @@ static void nohz_full_kick_ipi(void *info)
238 */ 240 */
239void tick_nohz_full_kick_all(void) 241void tick_nohz_full_kick_all(void)
240{ 242{
241 if (!have_nohz_full_mask) 243 if (!tick_nohz_full_running)
242 return; 244 return;
243 245
244 preempt_disable(); 246 preempt_disable();
245 smp_call_function_many(nohz_full_mask, 247 smp_call_function_many(tick_nohz_full_mask,
246 nohz_full_kick_ipi, NULL, false); 248 nohz_full_kick_ipi, NULL, false);
249 tick_nohz_full_kick();
247 preempt_enable(); 250 preempt_enable();
248} 251}
249 252
@@ -252,7 +255,7 @@ void tick_nohz_full_kick_all(void)
252 * It might need the tick due to per task/process properties: 255 * It might need the tick due to per task/process properties:
253 * perf events, posix cpu timers, ... 256 * perf events, posix cpu timers, ...
254 */ 257 */
255void tick_nohz_task_switch(struct task_struct *tsk) 258void __tick_nohz_task_switch(struct task_struct *tsk)
256{ 259{
257 unsigned long flags; 260 unsigned long flags;
258 261
@@ -268,31 +271,23 @@ out:
268 local_irq_restore(flags); 271 local_irq_restore(flags);
269} 272}
270 273
271int tick_nohz_full_cpu(int cpu)
272{
273 if (!have_nohz_full_mask)
274 return 0;
275
276 return cpumask_test_cpu(cpu, nohz_full_mask);
277}
278
279/* Parse the boot-time nohz CPU list from the kernel parameters. */ 274/* Parse the boot-time nohz CPU list from the kernel parameters. */
280static int __init tick_nohz_full_setup(char *str) 275static int __init tick_nohz_full_setup(char *str)
281{ 276{
282 int cpu; 277 int cpu;
283 278
284 alloc_bootmem_cpumask_var(&nohz_full_mask); 279 alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
285 if (cpulist_parse(str, nohz_full_mask) < 0) { 280 if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
286 pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); 281 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
287 return 1; 282 return 1;
288 } 283 }
289 284
290 cpu = smp_processor_id(); 285 cpu = smp_processor_id();
291 if (cpumask_test_cpu(cpu, nohz_full_mask)) { 286 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
292 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); 287 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
293 cpumask_clear_cpu(cpu, nohz_full_mask); 288 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
294 } 289 }
295 have_nohz_full_mask = true; 290 tick_nohz_full_running = true;
296 291
297 return 1; 292 return 1;
298} 293}
@@ -310,7 +305,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
310 * If we handle the timekeeping duty for full dynticks CPUs, 305 * If we handle the timekeeping duty for full dynticks CPUs,
311 * we can't safely shutdown that CPU. 306 * we can't safely shutdown that CPU.
312 */ 307 */
313 if (have_nohz_full_mask && tick_do_timer_cpu == cpu) 308 if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
314 return NOTIFY_BAD; 309 return NOTIFY_BAD;
315 break; 310 break;
316 } 311 }
@@ -329,14 +324,14 @@ static int tick_nohz_init_all(void)
329 int err = -1; 324 int err = -1;
330 325
331#ifdef CONFIG_NO_HZ_FULL_ALL 326#ifdef CONFIG_NO_HZ_FULL_ALL
332 if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { 327 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
333 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); 328 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
334 return err; 329 return err;
335 } 330 }
336 err = 0; 331 err = 0;
337 cpumask_setall(nohz_full_mask); 332 cpumask_setall(tick_nohz_full_mask);
338 cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); 333 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
339 have_nohz_full_mask = true; 334 tick_nohz_full_running = true;
340#endif 335#endif
341 return err; 336 return err;
342} 337}
@@ -345,17 +340,18 @@ void __init tick_nohz_init(void)
345{ 340{
346 int cpu; 341 int cpu;
347 342
348 if (!have_nohz_full_mask) { 343 if (!tick_nohz_full_running) {
349 if (tick_nohz_init_all() < 0) 344 if (tick_nohz_init_all() < 0)
350 return; 345 return;
351 } 346 }
352 347
348 for_each_cpu(cpu, tick_nohz_full_mask)
349 context_tracking_cpu_set(cpu);
350
353 cpu_notifier(tick_nohz_cpu_down_callback, 0); 351 cpu_notifier(tick_nohz_cpu_down_callback, 0);
354 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); 352 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask);
355 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); 353 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
356} 354}
357#else
358#define have_nohz_full_mask (0)
359#endif 355#endif
360 356
361/* 357/*
@@ -733,7 +729,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
733 return false; 729 return false;
734 } 730 }
735 731
736 if (have_nohz_full_mask) { 732 if (tick_nohz_full_enabled()) {
737 /* 733 /*
738 * Keep the tick alive to guarantee timekeeping progression 734 * Keep the tick alive to guarantee timekeeping progression
739 * if there are full dynticks CPUs around 735 * if there are full dynticks CPUs around
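
tick_nohz_full_cpu() disappears from tick-sched.c while the mask and flag become the exported tick_nohz_full_mask and tick_nohz_full_running, so the per-CPU test can presumably live in a header as an inline next to tick_nohz_full_enabled(). A hedged sketch of what such header-side helpers could look like; the names follow this patch, but the exact header implementation is an assumption, not shown by the diff:

/* Hedged sketch of header-side inline helpers over the exported symbols;
 * the placement and exact bodies are assumptions, not shown by this diff. */
#ifdef CONFIG_NO_HZ_FULL
extern cpumask_var_t tick_nohz_full_mask;
extern bool tick_nohz_full_running;

static inline bool tick_nohz_full_enabled(void)
{
    return tick_nohz_full_running;
}

static inline bool tick_nohz_full_cpu(int cpu)
{
    if (!tick_nohz_full_enabled())
        return false;
    return cpumask_test_cpu(cpu, tick_nohz_full_mask);
}
#else
static inline bool tick_nohz_full_enabled(void) { return false; }
static inline bool tick_nohz_full_cpu(int cpu) { return false; }
#endif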
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 48b9fffabdc2..3abf53418b67 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1613,9 +1613,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1613 * ktime_get_update_offsets - hrtimer helper 1613 * ktime_get_update_offsets - hrtimer helper
1614 * @offs_real: pointer to storage for monotonic -> realtime offset 1614 * @offs_real: pointer to storage for monotonic -> realtime offset
1615 * @offs_boot: pointer to storage for monotonic -> boottime offset 1615 * @offs_boot: pointer to storage for monotonic -> boottime offset
1616 * @offs_tai: pointer to storage for monotonic -> clock tai offset
1616 * 1617 *
1617 * Returns current monotonic time and updates the offsets 1618 * Returns current monotonic time and updates the offsets
1618 * Called from hrtimer_interupt() or retrigger_next_event() 1619 * Called from hrtimer_interrupt() or retrigger_next_event()
1619 */ 1620 */
1620ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, 1621ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
1621 ktime_t *offs_tai) 1622 ktime_t *offs_tai)
@@ -1703,6 +1704,8 @@ int do_adjtimex(struct timex *txc)
1703 write_seqcount_end(&timekeeper_seq); 1704 write_seqcount_end(&timekeeper_seq);
1704 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1705 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1705 1706
1707 ntp_notify_cmos_timer();
1708
1706 return ret; 1709 return ret;
1707} 1710}
1708 1711
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 3bdf28323012..61ed862cdd37 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -265,10 +265,9 @@ static inline void timer_list_header(struct seq_file *m, u64 now)
265static int timer_list_show(struct seq_file *m, void *v) 265static int timer_list_show(struct seq_file *m, void *v)
266{ 266{
267 struct timer_list_iter *iter = v; 267 struct timer_list_iter *iter = v;
268 u64 now = ktime_to_ns(ktime_get());
269 268
270 if (iter->cpu == -1 && !iter->second_pass) 269 if (iter->cpu == -1 && !iter->second_pass)
271 timer_list_header(m, now); 270 timer_list_header(m, iter->now);
272 else if (!iter->second_pass) 271 else if (!iter->second_pass)
273 print_cpu(m, iter->cpu, iter->now); 272 print_cpu(m, iter->cpu, iter->now);
274#ifdef CONFIG_GENERIC_CLOCKEVENTS 273#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -298,33 +297,41 @@ void sysrq_timer_list_show(void)
298 return; 297 return;
299} 298}
300 299
301static void *timer_list_start(struct seq_file *file, loff_t *offset) 300static void *move_iter(struct timer_list_iter *iter, loff_t offset)
302{ 301{
303 struct timer_list_iter *iter = file->private; 302 for (; offset; offset--) {
304 303 iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
305 if (!*offset) { 304 if (iter->cpu >= nr_cpu_ids) {
306 iter->cpu = -1;
307 iter->now = ktime_to_ns(ktime_get());
308 } else if (iter->cpu >= nr_cpu_ids) {
309#ifdef CONFIG_GENERIC_CLOCKEVENTS 305#ifdef CONFIG_GENERIC_CLOCKEVENTS
310 if (!iter->second_pass) { 306 if (!iter->second_pass) {
311 iter->cpu = -1; 307 iter->cpu = -1;
312 iter->second_pass = true; 308 iter->second_pass = true;
313 } else 309 } else
314 return NULL; 310 return NULL;
315#else 311#else
316 return NULL; 312 return NULL;
317#endif 313#endif
314 }
318 } 315 }
319 return iter; 316 return iter;
320} 317}
321 318
319static void *timer_list_start(struct seq_file *file, loff_t *offset)
320{
321 struct timer_list_iter *iter = file->private;
322
323 if (!*offset)
324 iter->now = ktime_to_ns(ktime_get());
325 iter->cpu = -1;
326 iter->second_pass = false;
327 return move_iter(iter, *offset);
328}
329
322static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) 330static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
323{ 331{
324 struct timer_list_iter *iter = file->private; 332 struct timer_list_iter *iter = file->private;
325 iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
326 ++*offset; 333 ++*offset;
327 return timer_list_start(file, offset); 334 return move_iter(iter, 1);
328} 335}
329 336
330static void timer_list_stop(struct seq_file *seq, void *v) 337static void timer_list_stop(struct seq_file *seq, void *v)
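
The timer_list seq_file iterator is reworked so the timestamp is sampled once (at offset 0) and all position handling funnels through move_iter(), which walks the online CPUs and then, when CONFIG_GENERIC_CLOCKEVENTS is set, restarts for a second pass over the tick devices. A small userspace model of that offset-driven walk, with the online cpumask reduced to a fixed CPU count:

/* Userspace model of the move_iter() offset walk; the online cpumask is
 * reduced to a fixed CPU count and the second pass always exists. */
#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4

struct iter {
    int cpu;            /* -1 means "header position" */
    bool second_pass;
};

static struct iter *move_iter(struct iter *it, long offset)
{
    for (; offset; offset--) {
        it->cpu++;                  /* stand-in for cpumask_next() */
        if (it->cpu >= NR_CPUS) {
            if (!it->second_pass) {
                it->cpu = -1;       /* restart for the tick-device pass */
                it->second_pass = true;
            } else {
                return NULL;        /* iteration finished */
            }
        }
    }
    return it;
}

int main(void)
{
    struct iter it = { .cpu = -1, .second_pass = false };
    struct iter *v;

    for (v = move_iter(&it, 0); v; v = move_iter(&it, 1)) {
        if (v->cpu == -1 && !v->second_pass)
            printf("header (timestamp sampled once here)\n");
        else if (!v->second_pass)
            printf("cpu %d timers\n", v->cpu);
        else if (v->cpu == -1)
            printf("tick device header\n");
        else
            printf("cpu %d tick device\n", v->cpu);
    }
    return 0;
}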
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 0b537f27b559..1fb08f21302e 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)
298 period = ktime_to_timespec(time); 298 period = ktime_to_timespec(time);
299 ms = period.tv_nsec / 1000000; 299 ms = period.tv_nsec / 1000000;
300 300
301 seq_puts(m, "Timer Stats Version: v0.2\n"); 301 seq_puts(m, "Timer Stats Version: v0.3\n");
302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); 302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
303 if (atomic_read(&overflow_count)) 303 if (atomic_read(&overflow_count))
304 seq_printf(m, "Overflow: %d entries\n", 304 seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
305 atomic_read(&overflow_count)); 305 seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
306 306
307 for (i = 0; i < nr_entries; i++) { 307 for (i = 0; i < nr_entries; i++) {
308 entry = entries + i; 308 entry = entries + i;
309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { 309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
310 seq_printf(m, "%4luD, %5d %-16s ", 310 seq_printf(m, "%4luD, %5d %-16s ",
311 entry->count, entry->pid, entry->comm); 311 entry->count, entry->pid, entry->comm);
312 } else { 312 } else {
diff --git a/kernel/timer.c b/kernel/timer.c
index 4296d13db3d1..6582b82fa966 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), 1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1093 unsigned long data) 1093 unsigned long data)
1094{ 1094{
1095 int preempt_count = preempt_count(); 1095 int count = preempt_count();
1096 1096
1097#ifdef CONFIG_LOCKDEP 1097#ifdef CONFIG_LOCKDEP
1098 /* 1098 /*
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1119 1119
1120 lock_map_release(&lockdep_map); 1120 lock_map_release(&lockdep_map);
1121 1121
1122 if (preempt_count != preempt_count()) { 1122 if (count != preempt_count()) {
1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", 1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1124 fn, preempt_count, preempt_count()); 1124 fn, count, preempt_count());
1125 /* 1125 /*
1126 * Restore the preempt count. That gives us a decent 1126 * Restore the preempt count. That gives us a decent
1127 * chance to survive and extract information. If the 1127 * chance to survive and extract information. If the
1128 * callback kept a lock held, bad luck, but not worse 1128 * callback kept a lock held, bad luck, but not worse
1129 * than the BUG() we had. 1129 * than the BUG() we had.
1130 */ 1130 */
1131 preempt_count() = preempt_count; 1131 preempt_count_set(count);
1132 } 1132 }
1133} 1133}
1134 1134
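
call_timer_fn() previously relied on preempt_count() being assignable, which stops working once the count becomes a per-CPU variable; the local is renamed so it no longer shadows the accessor, and the restore goes through preempt_count_set(). A tiny userspace analogue of the save/verify/restore pattern around a misbehaving callback, with the preempt count simulated by a plain global:

/* Userspace analogue of the save/verify/restore pattern in call_timer_fn();
 * the preempt count is simulated with a plain global. */
#include <stdio.h>

static int fake_preempt_count;

static void leaky_callback(void)
{
    fake_preempt_count++;   /* a callback that "forgets" to re-enable preemption */
}

static void call_timer_fn(void (*fn)(void))
{
    int count = fake_preempt_count; /* snapshot; no longer shadows the accessor */

    fn();

    if (count != fake_preempt_count) {
        fprintf(stderr, "timer: %p preempt leak: %08x -> %08x\n",
                (void *)fn, count, fake_preempt_count);
        fake_preempt_count = count; /* restore, as preempt_count_set() does */
    }
}

int main(void)
{
    call_timer_fn(leaky_callback);
    printf("count after fixup: %d\n", fake_preempt_count);
    return 0;
}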
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a6d098c6df3f..03cf44ac54d3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1978,12 +1978,27 @@ int __weak ftrace_arch_code_modify_post_process(void)
1978 1978
1979void ftrace_modify_all_code(int command) 1979void ftrace_modify_all_code(int command)
1980{ 1980{
1981 int update = command & FTRACE_UPDATE_TRACE_FUNC;
1982
1983 /*
1984 * If the ftrace_caller calls a ftrace_ops func directly,
1985 * we need to make sure that it only traces functions it
1986 * expects to trace. When doing the switch of functions,
1987 * we need to update to the ftrace_ops_list_func first
1988 * before the transition between old and new calls are set,
1989 * as the ftrace_ops_list_func will check the ops hashes
1990 * to make sure the ops are having the right functions
1991 * traced.
1992 */
1993 if (update)
1994 ftrace_update_ftrace_func(ftrace_ops_list_func);
1995
1981 if (command & FTRACE_UPDATE_CALLS) 1996 if (command & FTRACE_UPDATE_CALLS)
1982 ftrace_replace_code(1); 1997 ftrace_replace_code(1);
1983 else if (command & FTRACE_DISABLE_CALLS) 1998 else if (command & FTRACE_DISABLE_CALLS)
1984 ftrace_replace_code(0); 1999 ftrace_replace_code(0);
1985 2000
1986 if (command & FTRACE_UPDATE_TRACE_FUNC) 2001 if (update && ftrace_trace_function != ftrace_ops_list_func)
1987 ftrace_update_ftrace_func(ftrace_trace_function); 2002 ftrace_update_ftrace_func(ftrace_trace_function);
1988 2003
1989 if (command & FTRACE_START_FUNC_RET) 2004 if (command & FTRACE_START_FUNC_RET)
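
The ftrace_modify_all_code() change is an ordering fix: before any call sites are rewritten, the traced function is pointed at ftrace_ops_list_func, which re-checks each ops' hash and therefore tolerates a half-converted state; the final handler is installed only afterwards, and only if it differs from the list function. A small function-pointer sketch of that "switch through a safe intermediary" pattern:

/* Userspace sketch of switching a live function pointer through a safe
 * intermediary before retargeting callers, as the new comment describes. */
#include <stdio.h>

static void old_func(void)  { puts("old handler"); }
static void new_func(void)  { puts("new handler"); }
static void list_func(void) { puts("list handler (re-checks per-ops filters)"); }

static void (*live_call)(void) = old_func;  /* what traced code actually calls */

static void modify_all_code(void (*final)(void))
{
    /* 1: route everything through the filtering list handler first. */
    live_call = list_func;

    /* 2: rewrite individual call sites here (elided); any call made during
     *    the transition lands in list_func, which tolerates the mixed state. */
    live_call();

    /* 3: only afterwards install the final handler, if it differs. */
    if (final != list_func)
        live_call = final;
}

int main(void)
{
    modify_all_code(new_func);
    live_call();
    return 0;
}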
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 496f94d57698..d9fea7dfd5d3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1509,7 +1509,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1509#endif 1509#endif
1510 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | 1510 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
1511 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 1511 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
1512 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 1512 (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
1513 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
1513} 1514}
1514EXPORT_SYMBOL_GPL(tracing_generic_entry_update); 1515EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
1515 1516
@@ -3166,11 +3167,6 @@ static const struct file_operations show_traces_fops = {
3166}; 3167};
3167 3168
3168/* 3169/*
3169 * Only trace on a CPU if the bitmask is set:
3170 */
3171static cpumask_var_t tracing_cpumask;
3172
3173/*
3174 * The tracer itself will not take this lock, but still we want 3170 * The tracer itself will not take this lock, but still we want
3175 * to provide a consistent cpumask to user-space: 3171 * to provide a consistent cpumask to user-space:
3176 */ 3172 */
@@ -3186,11 +3182,12 @@ static ssize_t
3186tracing_cpumask_read(struct file *filp, char __user *ubuf, 3182tracing_cpumask_read(struct file *filp, char __user *ubuf,
3187 size_t count, loff_t *ppos) 3183 size_t count, loff_t *ppos)
3188{ 3184{
3185 struct trace_array *tr = file_inode(filp)->i_private;
3189 int len; 3186 int len;
3190 3187
3191 mutex_lock(&tracing_cpumask_update_lock); 3188 mutex_lock(&tracing_cpumask_update_lock);
3192 3189
3193 len = cpumask_scnprintf(mask_str, count, tracing_cpumask); 3190 len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask);
3194 if (count - len < 2) { 3191 if (count - len < 2) {
3195 count = -EINVAL; 3192 count = -EINVAL;
3196 goto out_err; 3193 goto out_err;
@@ -3208,7 +3205,7 @@ static ssize_t
3208tracing_cpumask_write(struct file *filp, const char __user *ubuf, 3205tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3209 size_t count, loff_t *ppos) 3206 size_t count, loff_t *ppos)
3210{ 3207{
3211 struct trace_array *tr = filp->private_data; 3208 struct trace_array *tr = file_inode(filp)->i_private;
3212 cpumask_var_t tracing_cpumask_new; 3209 cpumask_var_t tracing_cpumask_new;
3213 int err, cpu; 3210 int err, cpu;
3214 3211
@@ -3228,12 +3225,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3228 * Increase/decrease the disabled counter if we are 3225 * Increase/decrease the disabled counter if we are
3229 * about to flip a bit in the cpumask: 3226 * about to flip a bit in the cpumask:
3230 */ 3227 */
3231 if (cpumask_test_cpu(cpu, tracing_cpumask) && 3228 if (cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
3232 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3229 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
3233 atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); 3230 atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
3234 ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); 3231 ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
3235 } 3232 }
3236 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 3233 if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) &&
3237 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3234 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
3238 atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); 3235 atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
3239 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); 3236 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
@@ -3242,7 +3239,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3242 arch_spin_unlock(&ftrace_max_lock); 3239 arch_spin_unlock(&ftrace_max_lock);
3243 local_irq_enable(); 3240 local_irq_enable();
3244 3241
3245 cpumask_copy(tracing_cpumask, tracing_cpumask_new); 3242 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
3246 3243
3247 mutex_unlock(&tracing_cpumask_update_lock); 3244 mutex_unlock(&tracing_cpumask_update_lock);
3248 free_cpumask_var(tracing_cpumask_new); 3245 free_cpumask_var(tracing_cpumask_new);
@@ -3256,9 +3253,10 @@ err_unlock:
3256} 3253}
3257 3254
3258static const struct file_operations tracing_cpumask_fops = { 3255static const struct file_operations tracing_cpumask_fops = {
3259 .open = tracing_open_generic, 3256 .open = tracing_open_generic_tr,
3260 .read = tracing_cpumask_read, 3257 .read = tracing_cpumask_read,
3261 .write = tracing_cpumask_write, 3258 .write = tracing_cpumask_write,
3259 .release = tracing_release_generic_tr,
3262 .llseek = generic_file_llseek, 3260 .llseek = generic_file_llseek,
3263}; 3261};
3264 3262
@@ -5938,6 +5936,11 @@ static int new_instance_create(const char *name)
5938 if (!tr->name) 5936 if (!tr->name)
5939 goto out_free_tr; 5937 goto out_free_tr;
5940 5938
5939 if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
5940 goto out_free_tr;
5941
5942 cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
5943
5941 raw_spin_lock_init(&tr->start_lock); 5944 raw_spin_lock_init(&tr->start_lock);
5942 5945
5943 tr->current_trace = &nop_trace; 5946 tr->current_trace = &nop_trace;
@@ -5969,6 +5972,7 @@ static int new_instance_create(const char *name)
5969 out_free_tr: 5972 out_free_tr:
5970 if (tr->trace_buffer.buffer) 5973 if (tr->trace_buffer.buffer)
5971 ring_buffer_free(tr->trace_buffer.buffer); 5974 ring_buffer_free(tr->trace_buffer.buffer);
5975 free_cpumask_var(tr->tracing_cpumask);
5972 kfree(tr->name); 5976 kfree(tr->name);
5973 kfree(tr); 5977 kfree(tr);
5974 5978
@@ -6098,6 +6102,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6098{ 6102{
6099 int cpu; 6103 int cpu;
6100 6104
6105 trace_create_file("tracing_cpumask", 0644, d_tracer,
6106 tr, &tracing_cpumask_fops);
6107
6101 trace_create_file("trace_options", 0644, d_tracer, 6108 trace_create_file("trace_options", 0644, d_tracer,
6102 tr, &tracing_iter_fops); 6109 tr, &tracing_iter_fops);
6103 6110
@@ -6147,9 +6154,6 @@ static __init int tracer_init_debugfs(void)
6147 6154
6148 init_tracer_debugfs(&global_trace, d_tracer); 6155 init_tracer_debugfs(&global_trace, d_tracer);
6149 6156
6150 trace_create_file("tracing_cpumask", 0644, d_tracer,
6151 &global_trace, &tracing_cpumask_fops);
6152
6153 trace_create_file("available_tracers", 0444, d_tracer, 6157 trace_create_file("available_tracers", 0444, d_tracer,
6154 &global_trace, &show_traces_fops); 6158 &global_trace, &show_traces_fops);
6155 6159
@@ -6371,7 +6375,7 @@ __init static int tracer_alloc_buffers(void)
6371 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 6375 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
6372 goto out; 6376 goto out;
6373 6377
6374 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 6378 if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL))
6375 goto out_free_buffer_mask; 6379 goto out_free_buffer_mask;
6376 6380
6377 /* Only allocate trace_printk buffers if a trace_printk exists */ 6381 /* Only allocate trace_printk buffers if a trace_printk exists */
@@ -6386,7 +6390,7 @@ __init static int tracer_alloc_buffers(void)
6386 ring_buf_size = 1; 6390 ring_buf_size = 1;
6387 6391
6388 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 6392 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
6389 cpumask_copy(tracing_cpumask, cpu_all_mask); 6393 cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask);
6390 6394
6391 raw_spin_lock_init(&global_trace.start_lock); 6395 raw_spin_lock_init(&global_trace.start_lock);
6392 6396
@@ -6441,7 +6445,7 @@ out_free_cpumask:
6441#ifdef CONFIG_TRACER_MAX_TRACE 6445#ifdef CONFIG_TRACER_MAX_TRACE
6442 free_percpu(global_trace.max_buffer.data); 6446 free_percpu(global_trace.max_buffer.data);
6443#endif 6447#endif
6444 free_cpumask_var(tracing_cpumask); 6448 free_cpumask_var(global_trace.tracing_cpumask);
6445out_free_buffer_mask: 6449out_free_buffer_mask:
6446 free_cpumask_var(tracing_buffer_mask); 6450 free_cpumask_var(tracing_buffer_mask);
6447out: 6451out:
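
tracing_cpumask stops being a single global and becomes a per-trace_array field: new_instance_create() allocates it, the error path frees it, and the tracing_cpumask file moves into each instance's own directory. A compact userspace sketch of that per-instance shape, with the cpumask reduced to a plain unsigned long bitmask:

/* Userspace sketch of the per-instance cpumask shape; the cpumask is reduced
 * to a plain unsigned long bitmask and instances are just heap structs. */
#include <stdio.h>
#include <stdlib.h>

struct trace_array {
    const char *name;
    unsigned long tracing_cpumask; /* bit n set => trace on CPU n */
};

static struct trace_array *new_instance_create(const char *name, unsigned int nr_cpus)
{
    struct trace_array *tr = calloc(1, sizeof(*tr));

    if (!tr)
        return NULL;
    tr->name = name;
    if (nr_cpus >= 8 * sizeof(tr->tracing_cpumask))
        tr->tracing_cpumask = ~0UL;             /* cpu_all_mask equivalent */
    else
        tr->tracing_cpumask = (1UL << nr_cpus) - 1;
    return tr;
}

static int cpu_is_traced(const struct trace_array *tr, unsigned int cpu)
{
    return !!(tr->tracing_cpumask & (1UL << cpu));
}

int main(void)
{
    struct trace_array *global = new_instance_create("global", 4);
    struct trace_array *inst = new_instance_create("instance-1", 4);

    if (!global || !inst)
        return 1;

    inst->tracing_cpumask &= ~(1UL << 0); /* instance-1 stops tracing CPU 0 */

    printf("global traces cpu0: %d, instance-1 traces cpu0: %d\n",
           cpu_is_traced(global, 0), cpu_is_traced(inst, 0));
    free(global);
    free(inst);
    return 0;
}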
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index afaae41b0a02..73d08aa25b55 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -124,6 +124,7 @@ enum trace_flag_type {
124 TRACE_FLAG_NEED_RESCHED = 0x04, 124 TRACE_FLAG_NEED_RESCHED = 0x04,
125 TRACE_FLAG_HARDIRQ = 0x08, 125 TRACE_FLAG_HARDIRQ = 0x08,
126 TRACE_FLAG_SOFTIRQ = 0x10, 126 TRACE_FLAG_SOFTIRQ = 0x10,
127 TRACE_FLAG_PREEMPT_RESCHED = 0x20,
127}; 128};
128 129
129#define TRACE_BUF_SIZE 1024 130#define TRACE_BUF_SIZE 1024
@@ -206,6 +207,7 @@ struct trace_array {
206 struct dentry *event_dir; 207 struct dentry *event_dir;
207 struct list_head systems; 208 struct list_head systems;
208 struct list_head events; 209 struct list_head events;
210 cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
209 int ref; 211 int ref;
210}; 212};
211 213
@@ -1022,6 +1024,9 @@ extern struct list_head ftrace_events;
1022extern const char *__start___trace_bprintk_fmt[]; 1024extern const char *__start___trace_bprintk_fmt[];
1023extern const char *__stop___trace_bprintk_fmt[]; 1025extern const char *__stop___trace_bprintk_fmt[];
1024 1026
1027extern const char *__start___tracepoint_str[];
1028extern const char *__stop___tracepoint_str[];
1029
1025void trace_printk_init_buffers(void); 1030void trace_printk_init_buffers(void);
1026void trace_printk_start_comm(void); 1031void trace_printk_start_comm(void);
1027int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); 1032int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
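
TRACE_FLAG_PREEMPT_RESCHED (0x20) is recorded next to TRACE_FLAG_NEED_RESCHED so the trace output can tell the two resched conditions apart. A sketch of how the two bits compose; the predicates here are plain booleans, not the kernel's tif_need_resched()/test_preempt_need_resched() helpers:

/* Sketch of composing the two resched flag bits; the predicates here are
 * plain booleans standing in for the real per-task/per-cpu tests. */
#include <stdio.h>

#define TRACE_FLAG_NEED_RESCHED     0x04
#define TRACE_FLAG_PREEMPT_RESCHED  0x20

static unsigned int resched_flags(int need_resched, int preempt_need_resched)
{
    return (need_resched ? TRACE_FLAG_NEED_RESCHED : 0) |
           (preempt_need_resched ? TRACE_FLAG_PREEMPT_RESCHED : 0);
}

int main(void)
{
    printf("flags: 0x%02x\n", resched_flags(1, 1)); /* 0x24: both bits set */
    return 0;
}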
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 80c36bcf66e8..78e27e3b52ac 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -26,7 +26,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
26{ 26{
27 /* The ftrace function trace is allowed only for root. */ 27 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) && 28 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 29 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
30 return -EPERM; 30 return -EPERM;
31 31
32 /* No tracing, just counting, so no obvious leak */ 32 /* No tracing, just counting, so no obvious leak */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 29a7ebcfb426..368a4d50cc30 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1489,12 +1489,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1489} 1489}
1490 1490
1491static int 1491static int
1492event_create_dir(struct dentry *parent, 1492event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1493 struct ftrace_event_file *file,
1494 const struct file_operations *id,
1495 const struct file_operations *enable,
1496 const struct file_operations *filter,
1497 const struct file_operations *format)
1498{ 1493{
1499 struct ftrace_event_call *call = file->event_call; 1494 struct ftrace_event_call *call = file->event_call;
1500 struct trace_array *tr = file->tr; 1495 struct trace_array *tr = file->tr;
@@ -1522,12 +1517,13 @@ event_create_dir(struct dentry *parent,
1522 1517
1523 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) 1518 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
1524 trace_create_file("enable", 0644, file->dir, file, 1519 trace_create_file("enable", 0644, file->dir, file,
1525 enable); 1520 &ftrace_enable_fops);
1526 1521
1527#ifdef CONFIG_PERF_EVENTS 1522#ifdef CONFIG_PERF_EVENTS
1528 if (call->event.type && call->class->reg) 1523 if (call->event.type && call->class->reg)
1529 trace_create_file("id", 0444, file->dir, 1524 trace_create_file("id", 0444, file->dir,
1530 (void *)(long)call->event.type, id); 1525 (void *)(long)call->event.type,
1526 &ftrace_event_id_fops);
1531#endif 1527#endif
1532 1528
1533 /* 1529 /*
@@ -1544,10 +1540,10 @@ event_create_dir(struct dentry *parent,
1544 } 1540 }
1545 } 1541 }
1546 trace_create_file("filter", 0644, file->dir, call, 1542 trace_create_file("filter", 0644, file->dir, call,
1547 filter); 1543 &ftrace_event_filter_fops);
1548 1544
1549 trace_create_file("format", 0444, file->dir, call, 1545 trace_create_file("format", 0444, file->dir, call,
1550 format); 1546 &ftrace_event_format_fops);
1551 1547
1552 return 0; 1548 return 0;
1553} 1549}
@@ -1648,12 +1644,7 @@ trace_create_new_event(struct ftrace_event_call *call,
1648 1644
1649/* Add an event to a trace directory */ 1645/* Add an event to a trace directory */
1650static int 1646static int
1651__trace_add_new_event(struct ftrace_event_call *call, 1647__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr)
1652 struct trace_array *tr,
1653 const struct file_operations *id,
1654 const struct file_operations *enable,
1655 const struct file_operations *filter,
1656 const struct file_operations *format)
1657{ 1648{
1658 struct ftrace_event_file *file; 1649 struct ftrace_event_file *file;
1659 1650
@@ -1661,7 +1652,7 @@ __trace_add_new_event(struct ftrace_event_call *call,
1661 if (!file) 1652 if (!file)
1662 return -ENOMEM; 1653 return -ENOMEM;
1663 1654
1664 return event_create_dir(tr->event_dir, file, id, enable, filter, format); 1655 return event_create_dir(tr->event_dir, file);
1665} 1656}
1666 1657
1667/* 1658/*
@@ -1683,8 +1674,7 @@ __trace_early_add_new_event(struct ftrace_event_call *call,
1683} 1674}
1684 1675
1685struct ftrace_module_file_ops; 1676struct ftrace_module_file_ops;
1686static void __add_event_to_tracers(struct ftrace_event_call *call, 1677static void __add_event_to_tracers(struct ftrace_event_call *call);
1687 struct ftrace_module_file_ops *file_ops);
1688 1678
1689/* Add an additional event_call dynamically */ 1679/* Add an additional event_call dynamically */
1690int trace_add_event_call(struct ftrace_event_call *call) 1680int trace_add_event_call(struct ftrace_event_call *call)
@@ -1695,7 +1685,7 @@ int trace_add_event_call(struct ftrace_event_call *call)
1695 1685
1696 ret = __register_event(call, NULL); 1686 ret = __register_event(call, NULL);
1697 if (ret >= 0) 1687 if (ret >= 0)
1698 __add_event_to_tracers(call, NULL); 1688 __add_event_to_tracers(call);
1699 1689
1700 mutex_unlock(&event_mutex); 1690 mutex_unlock(&event_mutex);
1701 mutex_unlock(&trace_types_lock); 1691 mutex_unlock(&trace_types_lock);
@@ -1769,100 +1759,21 @@ int trace_remove_event_call(struct ftrace_event_call *call)
1769 1759
1770#ifdef CONFIG_MODULES 1760#ifdef CONFIG_MODULES
1771 1761
1772static LIST_HEAD(ftrace_module_file_list);
1773
1774/*
1775 * Modules must own their file_operations to keep up with
1776 * reference counting.
1777 */
1778struct ftrace_module_file_ops {
1779 struct list_head list;
1780 struct module *mod;
1781 struct file_operations id;
1782 struct file_operations enable;
1783 struct file_operations format;
1784 struct file_operations filter;
1785};
1786
1787static struct ftrace_module_file_ops *
1788find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1789{
1790 /*
1791 * As event_calls are added in groups by module,
1792 * when we find one file_ops, we don't need to search for
1793 * each call in that module, as the rest should be the
1794 * same. Only search for a new one if the last one did
1795 * not match.
1796 */
1797 if (file_ops && mod == file_ops->mod)
1798 return file_ops;
1799
1800 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1801 if (file_ops->mod == mod)
1802 return file_ops;
1803 }
1804 return NULL;
1805}
1806
1807static struct ftrace_module_file_ops *
1808trace_create_file_ops(struct module *mod)
1809{
1810 struct ftrace_module_file_ops *file_ops;
1811
1812 /*
1813 * This is a bit of a PITA. To allow for correct reference
1814 * counting, modules must "own" their file_operations.
1815 * To do this, we allocate the file operations that will be
1816 * used in the event directory.
1817 */
1818
1819 file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
1820 if (!file_ops)
1821 return NULL;
1822
1823 file_ops->mod = mod;
1824
1825 file_ops->id = ftrace_event_id_fops;
1826 file_ops->id.owner = mod;
1827
1828 file_ops->enable = ftrace_enable_fops;
1829 file_ops->enable.owner = mod;
1830
1831 file_ops->filter = ftrace_event_filter_fops;
1832 file_ops->filter.owner = mod;
1833
1834 file_ops->format = ftrace_event_format_fops;
1835 file_ops->format.owner = mod;
1836
1837 list_add(&file_ops->list, &ftrace_module_file_list);
1838
1839 return file_ops;
1840}
1841
1842static void trace_module_add_events(struct module *mod) 1762static void trace_module_add_events(struct module *mod)
1843{ 1763{
1844 struct ftrace_module_file_ops *file_ops = NULL;
1845 struct ftrace_event_call **call, **start, **end; 1764 struct ftrace_event_call **call, **start, **end;
1846 1765
1847 start = mod->trace_events; 1766 start = mod->trace_events;
1848 end = mod->trace_events + mod->num_trace_events; 1767 end = mod->trace_events + mod->num_trace_events;
1849 1768
1850 if (start == end)
1851 return;
1852
1853 file_ops = trace_create_file_ops(mod);
1854 if (!file_ops)
1855 return;
1856
1857 for_each_event(call, start, end) { 1769 for_each_event(call, start, end) {
1858 __register_event(*call, mod); 1770 __register_event(*call, mod);
1859 __add_event_to_tracers(*call, file_ops); 1771 __add_event_to_tracers(*call);
1860 } 1772 }
1861} 1773}
1862 1774
1863static void trace_module_remove_events(struct module *mod) 1775static void trace_module_remove_events(struct module *mod)
1864{ 1776{
1865 struct ftrace_module_file_ops *file_ops;
1866 struct ftrace_event_call *call, *p; 1777 struct ftrace_event_call *call, *p;
1867 bool clear_trace = false; 1778 bool clear_trace = false;
1868 1779
@@ -1874,16 +1785,6 @@ static void trace_module_remove_events(struct module *mod)
1874 __trace_remove_event_call(call); 1785 __trace_remove_event_call(call);
1875 } 1786 }
1876 } 1787 }
1877
1878 /* Now free the file_operations */
1879 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1880 if (file_ops->mod == mod)
1881 break;
1882 }
1883 if (&file_ops->list != &ftrace_module_file_list) {
1884 list_del(&file_ops->list);
1885 kfree(file_ops);
1886 }
1887 up_write(&trace_event_sem); 1788 up_write(&trace_event_sem);
1888 1789
1889 /* 1790 /*
@@ -1919,67 +1820,21 @@ static int trace_module_notify(struct notifier_block *self,
1919 return 0; 1820 return 0;
1920} 1821}
1921 1822
1922static int 1823static struct notifier_block trace_module_nb = {
1923__trace_add_new_mod_event(struct ftrace_event_call *call, 1824 .notifier_call = trace_module_notify,
1924 struct trace_array *tr, 1825 .priority = 0,
1925 struct ftrace_module_file_ops *file_ops) 1826};
1926{
1927 return __trace_add_new_event(call, tr,
1928 &file_ops->id, &file_ops->enable,
1929 &file_ops->filter, &file_ops->format);
1930}
1931
1932#else
1933static inline struct ftrace_module_file_ops *
1934find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1935{
1936 return NULL;
1937}
1938static inline int trace_module_notify(struct notifier_block *self,
1939 unsigned long val, void *data)
1940{
1941 return 0;
1942}
1943static inline int
1944__trace_add_new_mod_event(struct ftrace_event_call *call,
1945 struct trace_array *tr,
1946 struct ftrace_module_file_ops *file_ops)
1947{
1948 return -ENODEV;
1949}
1950#endif /* CONFIG_MODULES */ 1827#endif /* CONFIG_MODULES */
1951 1828
1952/* Create a new event directory structure for a trace directory. */ 1829/* Create a new event directory structure for a trace directory. */
1953static void 1830static void
1954__trace_add_event_dirs(struct trace_array *tr) 1831__trace_add_event_dirs(struct trace_array *tr)
1955{ 1832{
1956 struct ftrace_module_file_ops *file_ops = NULL;
1957 struct ftrace_event_call *call; 1833 struct ftrace_event_call *call;
1958 int ret; 1834 int ret;
1959 1835
1960 list_for_each_entry(call, &ftrace_events, list) { 1836 list_for_each_entry(call, &ftrace_events, list) {
1961 if (call->mod) { 1837 ret = __trace_add_new_event(call, tr);
1962 /*
1963 * Directories for events by modules need to
1964 * keep module ref counts when opened (as we don't
1965 * want the module to disappear when reading one
1966 * of these files). The file_ops keep account of
1967 * the module ref count.
1968 */
1969 file_ops = find_ftrace_file_ops(file_ops, call->mod);
1970 if (!file_ops)
1971 continue; /* Warn? */
1972 ret = __trace_add_new_mod_event(call, tr, file_ops);
1973 if (ret < 0)
1974 pr_warning("Could not create directory for event %s\n",
1975 call->name);
1976 continue;
1977 }
1978 ret = __trace_add_new_event(call, tr,
1979 &ftrace_event_id_fops,
1980 &ftrace_enable_fops,
1981 &ftrace_event_filter_fops,
1982 &ftrace_event_format_fops);
1983 if (ret < 0) 1838 if (ret < 0)
1984 pr_warning("Could not create directory for event %s\n", 1839 pr_warning("Could not create directory for event %s\n",
1985 call->name); 1840 call->name);
@@ -2287,11 +2142,7 @@ __trace_early_add_event_dirs(struct trace_array *tr)
2287 2142
2288 2143
2289 list_for_each_entry(file, &tr->events, list) { 2144 list_for_each_entry(file, &tr->events, list) {
2290 ret = event_create_dir(tr->event_dir, file, 2145 ret = event_create_dir(tr->event_dir, file);
2291 &ftrace_event_id_fops,
2292 &ftrace_enable_fops,
2293 &ftrace_event_filter_fops,
2294 &ftrace_event_format_fops);
2295 if (ret < 0) 2146 if (ret < 0)
2296 pr_warning("Could not create directory for event %s\n", 2147 pr_warning("Could not create directory for event %s\n",
2297 file->event_call->name); 2148 file->event_call->name);
@@ -2332,29 +2183,14 @@ __trace_remove_event_dirs(struct trace_array *tr)
2332 remove_event_file_dir(file); 2183 remove_event_file_dir(file);
2333} 2184}
2334 2185
2335static void 2186static void __add_event_to_tracers(struct ftrace_event_call *call)
2336__add_event_to_tracers(struct ftrace_event_call *call,
2337 struct ftrace_module_file_ops *file_ops)
2338{ 2187{
2339 struct trace_array *tr; 2188 struct trace_array *tr;
2340 2189
2341 list_for_each_entry(tr, &ftrace_trace_arrays, list) { 2190 list_for_each_entry(tr, &ftrace_trace_arrays, list)
2342 if (file_ops) 2191 __trace_add_new_event(call, tr);
2343 __trace_add_new_mod_event(call, tr, file_ops);
2344 else
2345 __trace_add_new_event(call, tr,
2346 &ftrace_event_id_fops,
2347 &ftrace_enable_fops,
2348 &ftrace_event_filter_fops,
2349 &ftrace_event_format_fops);
2350 }
2351} 2192}
2352 2193
2353static struct notifier_block trace_module_nb = {
2354 .notifier_call = trace_module_notify,
2355 .priority = 0,
2356};
2357
2358extern struct ftrace_event_call *__start_ftrace_events[]; 2194extern struct ftrace_event_call *__start_ftrace_events[];
2359extern struct ftrace_event_call *__stop_ftrace_events[]; 2195extern struct ftrace_event_call *__stop_ftrace_events[];
2360 2196
@@ -2559,10 +2395,11 @@ static __init int event_trace_init(void)
2559 if (ret) 2395 if (ret)
2560 return ret; 2396 return ret;
2561 2397
2398#ifdef CONFIG_MODULES
2562 ret = register_module_notifier(&trace_module_nb); 2399 ret = register_module_notifier(&trace_module_nb);
2563 if (ret) 2400 if (ret)
2564 pr_warning("Failed to register trace events module notifier\n"); 2401 pr_warning("Failed to register trace events module notifier\n");
2565 2402#endif
2566 return 0; 2403 return 0;
2567} 2404}
2568early_initcall(event_trace_memsetup); 2405early_initcall(event_trace_memsetup);
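
The trace_events.c hunks above drop the per-module ftrace_module_file_ops bookkeeping: module events now go through the same __trace_add_new_event()/event_create_dir() paths as built-in events, and the module notifier registration is wrapped in #ifdef CONFIG_MODULES so non-modular kernels no longer carry it. Below is a minimal sketch of the generic module-notifier pattern that trace_module_nb relies on; the example_* names are hypothetical and only illustrate the API, not the patch itself.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static int example_module_notify(struct notifier_block *self,
                                 unsigned long val, void *data)
{
        struct module *mod = data;

        switch (val) {
        case MODULE_STATE_COMING:
                /* set up per-module state, e.g. register its trace events */
                pr_info("module %s coming\n", mod->name);
                break;
        case MODULE_STATE_GOING:
                /* tear the per-module state back down */
                pr_info("module %s going\n", mod->name);
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block example_module_nb = {
        .notifier_call = example_module_notify,
        .priority = 0,
};

static int __init example_notifier_init(void)
{
#ifdef CONFIG_MODULES
        return register_module_notifier(&example_module_nb);
#else
        return 0;
#endif
}
early_initcall(example_notifier_init);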
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 34e7cbac0c9c..ed32284fbe32 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
618 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 618 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
619 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : 619 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
620 '.'; 620 '.';
621 need_resched = 621
622 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; 622 switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
623 TRACE_FLAG_PREEMPT_RESCHED)) {
624 case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
625 need_resched = 'N';
626 break;
627 case TRACE_FLAG_NEED_RESCHED:
628 need_resched = 'n';
629 break;
630 case TRACE_FLAG_PREEMPT_RESCHED:
631 need_resched = 'p';
632 break;
633 default:
634 need_resched = '.';
635 break;
636 }
637
623 hardsoft_irq = 638 hardsoft_irq =
624 (hardirq && softirq) ? 'H' : 639 (hardirq && softirq) ? 'H' :
625 hardirq ? 'h' : 640 hardirq ? 'h' :
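
The trace_output.c change widens the latency-format need-resched column from two states to four: TRACE_FLAG_NEED_RESCHED tracks TIF_NEED_RESCHED and, as I read the new flag, TRACE_FLAG_PREEMPT_RESCHED mirrors PREEMPT_NEED_RESCHED. A compact restatement of the mapping follows; need_resched_char() is a hypothetical helper (it assumes the TRACE_FLAG_* definitions from kernel/trace/trace.h) and is not part of the patch.

static char need_resched_char(unsigned long flags)
{
        bool tif     = flags & TRACE_FLAG_NEED_RESCHED;     /* TIF_NEED_RESCHED was set */
        bool preempt = flags & TRACE_FLAG_PREEMPT_RESCHED;  /* PREEMPT_NEED_RESCHED was set */

        if (tif && preempt)
                return 'N';
        if (tif)
                return 'n';
        if (preempt)
                return 'p';
        return '.';
}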
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index a9077c1b4ad3..2900817ba65c 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -244,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos)
244{ 244{
245 const char **fmt = v; 245 const char **fmt = v;
246 int start_index; 246 int start_index;
247 int last_index;
247 248
248 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; 249 start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
249 250
250 if (*pos < start_index) 251 if (*pos < start_index)
251 return __start___trace_bprintk_fmt + *pos; 252 return __start___trace_bprintk_fmt + *pos;
252 253
254 /*
255 * The __tracepoint_str section is treated the same as the
256 * __trace_printk_fmt section. The difference is that the
257 * __trace_printk_fmt section should only be used by trace_printk()
258 * in a debugging environment, as if anything exists in that section
 259 * the trace_printk() helper buffers are allocated, which would just
260 * waste space in a production environment.
261 *
262 * The __tracepoint_str sections on the other hand are used by
263 * tracepoints which need to map pointers to their strings to
264 * the ASCII text for userspace.
265 */
266 last_index = start_index;
267 start_index = __stop___tracepoint_str - __start___tracepoint_str;
268
269 if (*pos < last_index + start_index)
270 return __start___tracepoint_str + (*pos - last_index);
271
253 return find_next_mod_format(start_index, v, fmt, pos); 272 return find_next_mod_format(start_index, v, fmt, pos);
254} 273}
255 274
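
find_next() now treats __trace_bprintk_fmt and __tracepoint_str as one virtually concatenated array: positions below the first section's length index into it directly, anything beyond is rebased into the second section, and only then does the lookup fall through to the module formats. The same arithmetic in a standalone form (section_a, section_b, and find_next_flat() are made-up names for illustration):

#include <stddef.h>

static const char *section_a[] = { "fmt A0", "fmt A1" };
static const char *section_b[] = { "str B0", "str B1", "str B2" };

static const char **find_next_flat(size_t pos)
{
        size_t a_len = sizeof(section_a) / sizeof(section_a[0]);
        size_t b_len = sizeof(section_b) / sizeof(section_b[0]);

        if (pos < a_len)                /* still inside the first section */
                return &section_a[pos];
        pos -= a_len;                   /* rebase into the second section */
        if (pos < b_len)
                return &section_b[pos];
        return NULL;                    /* past both: fall back to modules */
}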
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8fd03657bc7d..559329d9bd2f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -200,8 +200,8 @@ extern char *__bad_type_size(void);
200 #type, #name, offsetof(typeof(trace), name), \ 200 #type, #name, offsetof(typeof(trace), name), \
201 sizeof(trace.name), is_signed_type(type) 201 sizeof(trace.name), is_signed_type(type)
202 202
203static 203static int __init
204int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) 204__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
205{ 205{
206 int i; 206 int i;
207 int pos = 0; 207 int pos = 0;
@@ -228,7 +228,7 @@ int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
228 return pos; 228 return pos;
229} 229}
230 230
231static int set_syscall_print_fmt(struct ftrace_event_call *call) 231static int __init set_syscall_print_fmt(struct ftrace_event_call *call)
232{ 232{
233 char *print_fmt; 233 char *print_fmt;
234 int len; 234 int len;
@@ -253,7 +253,7 @@ static int set_syscall_print_fmt(struct ftrace_event_call *call)
253 return 0; 253 return 0;
254} 254}
255 255
256static void free_syscall_print_fmt(struct ftrace_event_call *call) 256static void __init free_syscall_print_fmt(struct ftrace_event_call *call)
257{ 257{
258 struct syscall_metadata *entry = call->data; 258 struct syscall_metadata *entry = call->data;
259 259
@@ -459,7 +459,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
459 mutex_unlock(&syscall_trace_lock); 459 mutex_unlock(&syscall_trace_lock);
460} 460}
461 461
462static int init_syscall_trace(struct ftrace_event_call *call) 462static int __init init_syscall_trace(struct ftrace_event_call *call)
463{ 463{
464 int id; 464 int id;
465 int num; 465 int num;
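
The trace_syscalls.c hunks only add __init annotations: these print-format helpers run once while the syscall events are registered at boot, so their text can live in .init.text and be discarded afterwards. A tiny sketch of the annotation pattern (example_setup() is hypothetical):

#include <linux/init.h>
#include <linux/printk.h>

/* Placed in .init.text; the memory is freed once boot-time initcalls finish. */
static int __init example_setup(void)
{
        pr_info("one-time boot setup\n");
        return 0;
}
core_initcall(example_setup);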
diff --git a/kernel/uid16.c b/kernel/uid16.c
index f6c83d7ef000..602e5bbbceff 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
176 struct group_info *group_info; 176 struct group_info *group_info;
177 int retval; 177 int retval;
178 178
179 if (!nsown_capable(CAP_SETGID)) 179 if (!ns_capable(current_user_ns(), CAP_SETGID))
180 return -EPERM; 180 return -EPERM;
181 if ((unsigned)gidsetsize > NGROUPS_MAX) 181 if ((unsigned)gidsetsize > NGROUPS_MAX)
182 return -EINVAL; 182 return -EINVAL;
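
The uid16.c change (and the similar one in utsname.c below) replaces the removed nsown_capable() helper with its open-coded form. As far as I recall, the old helper was simply a wrapper along the lines of the sketch below, so the check is unchanged in effect: "is the caller capable in its own user namespace?". nsown_capable_equiv() is an illustrative name, not the removed definition verbatim.

#include <linux/capability.h>
#include <linux/cred.h>

static inline bool nsown_capable_equiv(int cap)
{
        /* capable with respect to the caller's own user namespace */
        return ns_capable(current_user_ns(), cap);
}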
diff --git a/kernel/up.c b/kernel/up.c
index c54c75e9faf7..630d72bf7e41 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -10,12 +10,64 @@
10int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 10int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
11 int wait) 11 int wait)
12{ 12{
13 unsigned long flags;
14
13 WARN_ON(cpu != 0); 15 WARN_ON(cpu != 0);
14 16
15 local_irq_disable(); 17 local_irq_save(flags);
16 (func)(info); 18 func(info);
17 local_irq_enable(); 19 local_irq_restore(flags);
18 20
19 return 0; 21 return 0;
20} 22}
21EXPORT_SYMBOL(smp_call_function_single); 23EXPORT_SYMBOL(smp_call_function_single);
24
25int on_each_cpu(smp_call_func_t func, void *info, int wait)
26{
27 unsigned long flags;
28
29 local_irq_save(flags);
30 func(info);
31 local_irq_restore(flags);
32 return 0;
33}
34EXPORT_SYMBOL(on_each_cpu);
35
36/*
37 * Note we still need to test the mask even for UP
38 * because we actually can get an empty mask from
39 * code that on SMP might call us without the local
40 * CPU in the mask.
41 */
42void on_each_cpu_mask(const struct cpumask *mask,
43 smp_call_func_t func, void *info, bool wait)
44{
45 unsigned long flags;
46
47 if (cpumask_test_cpu(0, mask)) {
48 local_irq_save(flags);
49 func(info);
50 local_irq_restore(flags);
51 }
52}
53EXPORT_SYMBOL(on_each_cpu_mask);
54
55/*
56 * Preemption is disabled here to make sure the cond_func is called under the
 57 * same conditions in UP and SMP.
58 */
59void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
60 smp_call_func_t func, void *info, bool wait,
61 gfp_t gfp_flags)
62{
63 unsigned long flags;
64
65 preempt_disable();
66 if (cond_func(0, info)) {
67 local_irq_save(flags);
68 func(info);
69 local_irq_restore(flags);
70 }
71 preempt_enable();
72}
73EXPORT_SYMBOL(on_each_cpu_cond);
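
With these UP stubs in place, callers can use the full on_each_cpu*() family without #ifdef CONFIG_SMP: on a uniprocessor build the "cross-call" collapses to running the function locally with interrupts disabled (and, for on_each_cpu_cond(), with preemption disabled around the predicate). A hypothetical usage sketch, assuming drain_local_cache() is some per-CPU cleanup the caller wants run everywhere:

#include <linux/smp.h>

static void drain_local_cache(void *info)
{
        /* per-CPU work: runs on every online CPU with IRQs disabled */
}

static void drain_all_caches(void)
{
        /* wait == 1: do not return until every CPU has run the function */
        on_each_cpu(drain_local_cache, NULL, 1);
}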
diff --git a/kernel/user.c b/kernel/user.c
index 69b4c3d48cde..5bbb91988e69 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,8 +51,6 @@ struct user_namespace init_user_ns = {
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
54 .may_mount_sysfs = true,
55 .may_mount_proc = true,
56}; 54};
57EXPORT_SYMBOL_GPL(init_user_ns); 55EXPORT_SYMBOL_GPL(init_user_ns);
58 56
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9064b919a406..13fb1134ba58 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -101,8 +101,6 @@ int create_user_ns(struct cred *new)
101 101
102 set_cred_user_ns(new, ns); 102 set_cred_user_ns(new, ns);
103 103
104 update_mnt_policy(ns);
105
106 return 0; 104 return 0;
107} 105}
108 106
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 2fc8576efaa8..fd393124e507 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -114,7 +114,7 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
114 struct uts_namespace *ns = new; 114 struct uts_namespace *ns = new;
115 115
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || 116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !nsown_capable(CAP_SYS_ADMIN)) 117 !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
118 return -EPERM; 118 return -EPERM;
119 119
120 get_uts_ns(ns); 120 get_uts_ns(ns);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 1241d8c91d5e..4431610f049a 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = {
486 .unpark = watchdog_enable, 486 .unpark = watchdog_enable,
487}; 487};
488 488
489static int watchdog_enable_all_cpus(void) 489static void restart_watchdog_hrtimer(void *info)
490{
491 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
492 int ret;
493
494 /*
495 * No need to cancel and restart hrtimer if it is currently executing
496 * because it will reprogram itself with the new period now.
497 * We should never see it unqueued here because we are running per-cpu
498 * with interrupts disabled.
499 */
500 ret = hrtimer_try_to_cancel(hrtimer);
501 if (ret == 1)
502 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
503 HRTIMER_MODE_REL_PINNED);
504}
505
506static void update_timers(int cpu)
507{
508 struct call_single_data data = {.func = restart_watchdog_hrtimer};
509 /*
 510 * Make sure that the perf event counter will adapt to a new
511 * sampling period. Updating the sampling period directly would
512 * be much nicer but we do not have an API for that now so
513 * let's use a big hammer.
514 * Hrtimer will adopt the new period on the next tick but this
515 * might be late already so we have to restart the timer as well.
516 */
517 watchdog_nmi_disable(cpu);
518 __smp_call_function_single(cpu, &data, 1);
519 watchdog_nmi_enable(cpu);
520}
521
522static void update_timers_all_cpus(void)
523{
524 int cpu;
525
526 get_online_cpus();
527 preempt_disable();
528 for_each_online_cpu(cpu)
529 update_timers(cpu);
530 preempt_enable();
531 put_online_cpus();
532}
533
534static int watchdog_enable_all_cpus(bool sample_period_changed)
490{ 535{
491 int err = 0; 536 int err = 0;
492 537
@@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void)
496 pr_err("Failed to create watchdog threads, disabled\n"); 541 pr_err("Failed to create watchdog threads, disabled\n");
497 else 542 else
498 watchdog_running = 1; 543 watchdog_running = 1;
544 } else if (sample_period_changed) {
545 update_timers_all_cpus();
499 } 546 }
500 547
501 return err; 548 return err;
@@ -520,13 +567,15 @@ int proc_dowatchdog(struct ctl_table *table, int write,
520 void __user *buffer, size_t *lenp, loff_t *ppos) 567 void __user *buffer, size_t *lenp, loff_t *ppos)
521{ 568{
522 int err, old_thresh, old_enabled; 569 int err, old_thresh, old_enabled;
570 static DEFINE_MUTEX(watchdog_proc_mutex);
523 571
572 mutex_lock(&watchdog_proc_mutex);
524 old_thresh = ACCESS_ONCE(watchdog_thresh); 573 old_thresh = ACCESS_ONCE(watchdog_thresh);
525 old_enabled = ACCESS_ONCE(watchdog_user_enabled); 574 old_enabled = ACCESS_ONCE(watchdog_user_enabled);
526 575
527 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 576 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
528 if (err || !write) 577 if (err || !write)
529 return err; 578 goto out;
530 579
531 set_sample_period(); 580 set_sample_period();
532 /* 581 /*
@@ -535,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
535 * watchdog_*_all_cpus() function takes care of this. 584 * watchdog_*_all_cpus() function takes care of this.
536 */ 585 */
537 if (watchdog_user_enabled && watchdog_thresh) 586 if (watchdog_user_enabled && watchdog_thresh)
538 err = watchdog_enable_all_cpus(); 587 err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
539 else 588 else
540 watchdog_disable_all_cpus(); 589 watchdog_disable_all_cpus();
541 590
@@ -544,7 +593,8 @@ int proc_dowatchdog(struct ctl_table *table, int write,
544 watchdog_thresh = old_thresh; 593 watchdog_thresh = old_thresh;
545 watchdog_user_enabled = old_enabled; 594 watchdog_user_enabled = old_enabled;
546 } 595 }
547 596out:
597 mutex_unlock(&watchdog_proc_mutex);
548 return err; 598 return err;
549} 599}
550#endif /* CONFIG_SYSCTL */ 600#endif /* CONFIG_SYSCTL */
@@ -553,14 +603,6 @@ void __init lockup_detector_init(void)
553{ 603{
554 set_sample_period(); 604 set_sample_period();
555 605
556#ifdef CONFIG_NO_HZ_FULL
557 if (watchdog_user_enabled) {
558 watchdog_user_enabled = 0;
559 pr_warning("Disabled lockup detectors by default for full dynticks\n");
560 pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n");
561 }
562#endif
563
564 if (watchdog_user_enabled) 606 if (watchdog_user_enabled)
565 watchdog_enable_all_cpus(); 607 watchdog_enable_all_cpus(false);
566} 608}
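
Two things change in watchdog.c: writes that modify the sample period now restart each CPU's hrtimer (and kick the NMI perf event) via update_timers_all_cpus(), and proc_dowatchdog() serializes concurrent writers with a function-local mutex so the read-modify-write of the saved old values cannot interleave. The serialization pattern in isolation (example_sysctl_handler() and example_value are made up for illustration; the real handler uses proc_dointvec_minmax()):

#include <linux/mutex.h>
#include <linux/sysctl.h>

static int example_value;

static int example_sysctl_handler(struct ctl_table *table, int write,
                                  void __user *buffer, size_t *lenp, loff_t *ppos)
{
        static DEFINE_MUTEX(handler_mutex);
        int old, err;

        mutex_lock(&handler_mutex);
        old = example_value;

        err = proc_dointvec(table, write, buffer, lenp, ppos);
        if (err || !write)
                goto out;

        if (example_value < 0)          /* reject the update and roll back */
                example_value = old;
out:
        mutex_unlock(&handler_mutex);
        return err;
}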
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7f5d4be22034..987293d03ebc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -16,9 +16,10 @@
16 * 16 *
17 * This is the generic async execution mechanism. Work items as are 17 * This is the generic async execution mechanism. Work items as are
18 * executed in process context. The worker pool is shared and 18 * executed in process context. The worker pool is shared and
19 * automatically managed. There is one worker pool for each CPU and 19 * automatically managed. There are two worker pools for each CPU (one for
20 * one extra for works which are better served by workers which are 20 * normal work items and the other for high priority ones) and some extra
21 * not bound to any specific CPU. 21 * pools for workqueues which are not bound to any specific CPU - the
22 * number of these backing pools is dynamic.
22 * 23 *
23 * Please read Documentation/workqueue.txt for details. 24 * Please read Documentation/workqueue.txt for details.
24 */ 25 */
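
The updated header comment describes which backing pool serves which kind of workqueue: two per-CPU pools (normal and high priority) plus a dynamic set of pools for unbound workqueues. A hedged usage sketch of how a caller selects between them through the allocation flags (example_* names are hypothetical):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_hipri_wq;
static struct workqueue_struct *example_unbound_wq;

static int __init example_wq_init(void)
{
        /* served by the per-CPU high-priority worker pool */
        example_hipri_wq = alloc_workqueue("example_hipri", WQ_HIGHPRI, 0);
        /* served by one of the dynamically managed unbound pools */
        example_unbound_wq = alloc_workqueue("example_unbound", WQ_UNBOUND, 0);
        if (!example_hipri_wq || !example_unbound_wq)
                return -ENOMEM;
        return 0;
}
core_initcall(example_wq_init);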
@@ -540,6 +541,8 @@ static int worker_pool_assign_id(struct worker_pool *pool)
540 * This must be called either with pwq_lock held or sched RCU read locked. 541 * This must be called either with pwq_lock held or sched RCU read locked.
541 * If the pwq needs to be used beyond the locking in effect, the caller is 542 * If the pwq needs to be used beyond the locking in effect, the caller is
542 * responsible for guaranteeing that the pwq stays online. 543 * responsible for guaranteeing that the pwq stays online.
544 *
545 * Return: The unbound pool_workqueue for @node.
543 */ 546 */
544static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, 547static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
545 int node) 548 int node)
@@ -638,8 +641,6 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
638 * get_work_pool - return the worker_pool a given work was associated with 641 * get_work_pool - return the worker_pool a given work was associated with
639 * @work: the work item of interest 642 * @work: the work item of interest
640 * 643 *
641 * Return the worker_pool @work was last associated with. %NULL if none.
642 *
643 * Pools are created and destroyed under wq_pool_mutex, and allows read 644 * Pools are created and destroyed under wq_pool_mutex, and allows read
644 * access under sched-RCU read lock. As such, this function should be 645 * access under sched-RCU read lock. As such, this function should be
645 * called under wq_pool_mutex or with preemption disabled. 646 * called under wq_pool_mutex or with preemption disabled.
@@ -648,6 +649,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
648 * mentioned locking is in effect. If the returned pool needs to be used 649 * mentioned locking is in effect. If the returned pool needs to be used
649 * beyond the critical section, the caller is responsible for ensuring the 650 * beyond the critical section, the caller is responsible for ensuring the
650 * returned pool is and stays online. 651 * returned pool is and stays online.
652 *
653 * Return: The worker_pool @work was last associated with. %NULL if none.
651 */ 654 */
652static struct worker_pool *get_work_pool(struct work_struct *work) 655static struct worker_pool *get_work_pool(struct work_struct *work)
653{ 656{
@@ -671,7 +674,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
671 * get_work_pool_id - return the worker pool ID a given work is associated with 674 * get_work_pool_id - return the worker pool ID a given work is associated with
672 * @work: the work item of interest 675 * @work: the work item of interest
673 * 676 *
674 * Return the worker_pool ID @work was last associated with. 677 * Return: The worker_pool ID @work was last associated with.
675 * %WORK_OFFQ_POOL_NONE if none. 678 * %WORK_OFFQ_POOL_NONE if none.
676 */ 679 */
677static int get_work_pool_id(struct work_struct *work) 680static int get_work_pool_id(struct work_struct *work)
@@ -830,7 +833,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu)
830 * CONTEXT: 833 * CONTEXT:
831 * spin_lock_irq(rq->lock) 834 * spin_lock_irq(rq->lock)
832 * 835 *
833 * RETURNS: 836 * Return:
834 * Worker task on @cpu to wake up, %NULL if none. 837 * Worker task on @cpu to wake up, %NULL if none.
835 */ 838 */
836struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) 839struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
@@ -965,8 +968,8 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
965 * CONTEXT: 968 * CONTEXT:
966 * spin_lock_irq(pool->lock). 969 * spin_lock_irq(pool->lock).
967 * 970 *
968 * RETURNS: 971 * Return:
969 * Pointer to worker which is executing @work if found, NULL 972 * Pointer to worker which is executing @work if found, %NULL
970 * otherwise. 973 * otherwise.
971 */ 974 */
972static struct worker *find_worker_executing_work(struct worker_pool *pool, 975static struct worker *find_worker_executing_work(struct worker_pool *pool,
@@ -1154,14 +1157,16 @@ out_put:
1154 * @flags: place to store irq state 1157 * @flags: place to store irq state
1155 * 1158 *
1156 * Try to grab PENDING bit of @work. This function can handle @work in any 1159 * Try to grab PENDING bit of @work. This function can handle @work in any
1157 * stable state - idle, on timer or on worklist. Return values are 1160 * stable state - idle, on timer or on worklist.
1158 * 1161 *
1162 * Return:
1159 * 1 if @work was pending and we successfully stole PENDING 1163 * 1 if @work was pending and we successfully stole PENDING
1160 * 0 if @work was idle and we claimed PENDING 1164 * 0 if @work was idle and we claimed PENDING
1161 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry 1165 * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry
1162 * -ENOENT if someone else is canceling @work, this state may persist 1166 * -ENOENT if someone else is canceling @work, this state may persist
1163 * for arbitrarily long 1167 * for arbitrarily long
1164 * 1168 *
1169 * Note:
1165 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting 1170 * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting
1166 * interrupted while holding PENDING and @work off queue, irq must be 1171 * interrupted while holding PENDING and @work off queue, irq must be
1167 * disabled on entry. This, combined with delayed_work->timer being 1172 * disabled on entry. This, combined with delayed_work->timer being
@@ -1403,10 +1408,10 @@ retry:
1403 * @wq: workqueue to use 1408 * @wq: workqueue to use
1404 * @work: work to queue 1409 * @work: work to queue
1405 * 1410 *
1406 * Returns %false if @work was already on a queue, %true otherwise.
1407 *
1408 * We queue the work to a specific CPU, the caller must ensure it 1411 * We queue the work to a specific CPU, the caller must ensure it
1409 * can't go away. 1412 * can't go away.
1413 *
1414 * Return: %false if @work was already on a queue, %true otherwise.
1410 */ 1415 */
1411bool queue_work_on(int cpu, struct workqueue_struct *wq, 1416bool queue_work_on(int cpu, struct workqueue_struct *wq,
1412 struct work_struct *work) 1417 struct work_struct *work)
@@ -1476,7 +1481,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1476 * @dwork: work to queue 1481 * @dwork: work to queue
1477 * @delay: number of jiffies to wait before queueing 1482 * @delay: number of jiffies to wait before queueing
1478 * 1483 *
1479 * Returns %false if @work was already on a queue, %true otherwise. If 1484 * Return: %false if @work was already on a queue, %true otherwise. If
1480 * @delay is zero and @dwork is idle, it will be scheduled for immediate 1485 * @delay is zero and @dwork is idle, it will be scheduled for immediate
1481 * execution. 1486 * execution.
1482 */ 1487 */
@@ -1512,7 +1517,7 @@ EXPORT_SYMBOL(queue_delayed_work_on);
1512 * zero, @work is guaranteed to be scheduled immediately regardless of its 1517 * zero, @work is guaranteed to be scheduled immediately regardless of its
1513 * current state. 1518 * current state.
1514 * 1519 *
1515 * Returns %false if @dwork was idle and queued, %true if @dwork was 1520 * Return: %false if @dwork was idle and queued, %true if @dwork was
1516 * pending and its timer was modified. 1521 * pending and its timer was modified.
1517 * 1522 *
1518 * This function is safe to call from any context including IRQ handler. 1523 * This function is safe to call from any context including IRQ handler.
@@ -1627,7 +1632,7 @@ static void worker_leave_idle(struct worker *worker)
1627 * Might sleep. Called without any lock but returns with pool->lock 1632 * Might sleep. Called without any lock but returns with pool->lock
1628 * held. 1633 * held.
1629 * 1634 *
1630 * RETURNS: 1635 * Return:
1631 * %true if the associated pool is online (@worker is successfully 1636 * %true if the associated pool is online (@worker is successfully
1632 * bound), %false if offline. 1637 * bound), %false if offline.
1633 */ 1638 */
@@ -1688,7 +1693,7 @@ static struct worker *alloc_worker(void)
1688 * CONTEXT: 1693 * CONTEXT:
1689 * Might sleep. Does GFP_KERNEL allocations. 1694 * Might sleep. Does GFP_KERNEL allocations.
1690 * 1695 *
1691 * RETURNS: 1696 * Return:
1692 * Pointer to the newly created worker. 1697 * Pointer to the newly created worker.
1693 */ 1698 */
1694static struct worker *create_worker(struct worker_pool *pool) 1699static struct worker *create_worker(struct worker_pool *pool)
@@ -1788,6 +1793,8 @@ static void start_worker(struct worker *worker)
1788 * @pool: the target pool 1793 * @pool: the target pool
1789 * 1794 *
1790 * Grab the managership of @pool and create and start a new worker for it. 1795 * Grab the managership of @pool and create and start a new worker for it.
1796 *
1797 * Return: 0 on success. A negative error code otherwise.
1791 */ 1798 */
1792static int create_and_start_worker(struct worker_pool *pool) 1799static int create_and_start_worker(struct worker_pool *pool)
1793{ 1800{
@@ -1932,7 +1939,7 @@ static void pool_mayday_timeout(unsigned long __pool)
1932 * multiple times. Does GFP_KERNEL allocations. Called only from 1939 * multiple times. Does GFP_KERNEL allocations. Called only from
1933 * manager. 1940 * manager.
1934 * 1941 *
1935 * RETURNS: 1942 * Return:
1936 * %false if no action was taken and pool->lock stayed locked, %true 1943 * %false if no action was taken and pool->lock stayed locked, %true
1937 * otherwise. 1944 * otherwise.
1938 */ 1945 */
@@ -1989,7 +1996,7 @@ restart:
1989 * spin_lock_irq(pool->lock) which may be released and regrabbed 1996 * spin_lock_irq(pool->lock) which may be released and regrabbed
1990 * multiple times. Called only from manager. 1997 * multiple times. Called only from manager.
1991 * 1998 *
1992 * RETURNS: 1999 * Return:
1993 * %false if no action was taken and pool->lock stayed locked, %true 2000 * %false if no action was taken and pool->lock stayed locked, %true
1994 * otherwise. 2001 * otherwise.
1995 */ 2002 */
@@ -2032,9 +2039,12 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
2032 * spin_lock_irq(pool->lock) which may be released and regrabbed 2039 * spin_lock_irq(pool->lock) which may be released and regrabbed
2033 * multiple times. Does GFP_KERNEL allocations. 2040 * multiple times. Does GFP_KERNEL allocations.
2034 * 2041 *
2035 * RETURNS: 2042 * Return:
2036 * spin_lock_irq(pool->lock) which may be released and regrabbed 2043 * %false if the pool don't need management and the caller can safely start
2037 * multiple times. Does GFP_KERNEL allocations. 2044 * processing works, %true indicates that the function released pool->lock
2045 * and reacquired it to perform some management function and that the
2046 * conditions that the caller verified while holding the lock before
2047 * calling the function might no longer be true.
2038 */ 2048 */
2039static bool manage_workers(struct worker *worker) 2049static bool manage_workers(struct worker *worker)
2040{ 2050{
@@ -2201,6 +2211,15 @@ __acquires(&pool->lock)
2201 dump_stack(); 2211 dump_stack();
2202 } 2212 }
2203 2213
2214 /*
2215 * The following prevents a kworker from hogging CPU on !PREEMPT
2216 * kernels, where a requeueing work item waiting for something to
2217 * happen could deadlock with stop_machine as such work item could
2218 * indefinitely requeue itself while all other CPUs are trapped in
2219 * stop_machine.
2220 */
2221 cond_resched();
2222
2204 spin_lock_irq(&pool->lock); 2223 spin_lock_irq(&pool->lock);
2205 2224
2206 /* clear cpu intensive status */ 2225 /* clear cpu intensive status */
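
The cond_resched() added to process_one_work() targets work items like the hypothetical sketch below: on a !PREEMPT kernel a self-requeueing work item polling for some external event could keep its kworker on the CPU indefinitely and deadlock against stop_machine. requeue_fn() and example_condition_met() are made up purely for illustration.

#include <linux/workqueue.h>

static void requeue_fn(struct work_struct *work);
static DECLARE_WORK(requeue_work, requeue_fn);

static bool example_condition_met(void)
{
        return false;   /* stand-in for whatever the work item is polling */
}

static void requeue_fn(struct work_struct *work)
{
        /*
         * Re-queue until the condition holds.  Without the cond_resched()
         * in process_one_work(), a loop like this could hog its CPU on
         * non-preemptible kernels between iterations.
         */
        if (!example_condition_met())
                schedule_work(&requeue_work);
}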
@@ -2246,6 +2265,8 @@ static void process_scheduled_works(struct worker *worker)
2246 * work items regardless of their specific target workqueue. The only 2265 * work items regardless of their specific target workqueue. The only
2247 * exception is work items which belong to workqueues with a rescuer which 2266 * exception is work items which belong to workqueues with a rescuer which
2248 * will be explained in rescuer_thread(). 2267 * will be explained in rescuer_thread().
2268 *
2269 * Return: 0
2249 */ 2270 */
2250static int worker_thread(void *__worker) 2271static int worker_thread(void *__worker)
2251{ 2272{
@@ -2344,6 +2365,8 @@ sleep:
2344 * those works so that forward progress can be guaranteed. 2365 * those works so that forward progress can be guaranteed.
2345 * 2366 *
2346 * This should happen rarely. 2367 * This should happen rarely.
2368 *
2369 * Return: 0
2347 */ 2370 */
2348static int rescuer_thread(void *__rescuer) 2371static int rescuer_thread(void *__rescuer)
2349{ 2372{
@@ -2516,7 +2539,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
2516 * CONTEXT: 2539 * CONTEXT:
2517 * mutex_lock(wq->mutex). 2540 * mutex_lock(wq->mutex).
2518 * 2541 *
2519 * RETURNS: 2542 * Return:
2520 * %true if @flush_color >= 0 and there's something to flush. %false 2543 * %true if @flush_color >= 0 and there's something to flush. %false
2521 * otherwise. 2544 * otherwise.
2522 */ 2545 */
@@ -2837,7 +2860,7 @@ static bool __flush_work(struct work_struct *work)
2837 * Wait until @work has finished execution. @work is guaranteed to be idle 2860 * Wait until @work has finished execution. @work is guaranteed to be idle
2838 * on return if it hasn't been requeued since flush started. 2861 * on return if it hasn't been requeued since flush started.
2839 * 2862 *
2840 * RETURNS: 2863 * Return:
2841 * %true if flush_work() waited for the work to finish execution, 2864 * %true if flush_work() waited for the work to finish execution,
2842 * %false if it was already idle. 2865 * %false if it was already idle.
2843 */ 2866 */
@@ -2889,7 +2912,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
2889 * The caller must ensure that the workqueue on which @work was last 2912 * The caller must ensure that the workqueue on which @work was last
2890 * queued can't be destroyed before this function returns. 2913 * queued can't be destroyed before this function returns.
2891 * 2914 *
2892 * RETURNS: 2915 * Return:
2893 * %true if @work was pending, %false otherwise. 2916 * %true if @work was pending, %false otherwise.
2894 */ 2917 */
2895bool cancel_work_sync(struct work_struct *work) 2918bool cancel_work_sync(struct work_struct *work)
@@ -2906,7 +2929,7 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);
2906 * immediate execution. Like flush_work(), this function only 2929 * immediate execution. Like flush_work(), this function only
2907 * considers the last queueing instance of @dwork. 2930 * considers the last queueing instance of @dwork.
2908 * 2931 *
2909 * RETURNS: 2932 * Return:
2910 * %true if flush_work() waited for the work to finish execution, 2933 * %true if flush_work() waited for the work to finish execution,
2911 * %false if it was already idle. 2934 * %false if it was already idle.
2912 */ 2935 */
@@ -2924,11 +2947,15 @@ EXPORT_SYMBOL(flush_delayed_work);
2924 * cancel_delayed_work - cancel a delayed work 2947 * cancel_delayed_work - cancel a delayed work
2925 * @dwork: delayed_work to cancel 2948 * @dwork: delayed_work to cancel
2926 * 2949 *
2927 * Kill off a pending delayed_work. Returns %true if @dwork was pending 2950 * Kill off a pending delayed_work.
2928 * and canceled; %false if wasn't pending. Note that the work callback 2951 *
2929 * function may still be running on return, unless it returns %true and the 2952 * Return: %true if @dwork was pending and canceled; %false if it wasn't
2930 * work doesn't re-arm itself. Explicitly flush or use 2953 * pending.
2931 * cancel_delayed_work_sync() to wait on it. 2954 *
2955 * Note:
2956 * The work callback function may still be running on return, unless
2957 * it returns %true and the work doesn't re-arm itself. Explicitly flush or
2958 * use cancel_delayed_work_sync() to wait on it.
2932 * 2959 *
2933 * This function is safe to call from any context including IRQ handler. 2960 * This function is safe to call from any context including IRQ handler.
2934 */ 2961 */
@@ -2957,7 +2984,7 @@ EXPORT_SYMBOL(cancel_delayed_work);
2957 * 2984 *
2958 * This is cancel_work_sync() for delayed works. 2985 * This is cancel_work_sync() for delayed works.
2959 * 2986 *
2960 * RETURNS: 2987 * Return:
2961 * %true if @dwork was pending, %false otherwise. 2988 * %true if @dwork was pending, %false otherwise.
2962 */ 2989 */
2963bool cancel_delayed_work_sync(struct delayed_work *dwork) 2990bool cancel_delayed_work_sync(struct delayed_work *dwork)
@@ -2974,7 +3001,7 @@ EXPORT_SYMBOL(cancel_delayed_work_sync);
2974 * system workqueue and blocks until all CPUs have completed. 3001 * system workqueue and blocks until all CPUs have completed.
2975 * schedule_on_each_cpu() is very slow. 3002 * schedule_on_each_cpu() is very slow.
2976 * 3003 *
2977 * RETURNS: 3004 * Return:
2978 * 0 on success, -errno on failure. 3005 * 0 on success, -errno on failure.
2979 */ 3006 */
2980int schedule_on_each_cpu(work_func_t func) 3007int schedule_on_each_cpu(work_func_t func)
@@ -3042,7 +3069,7 @@ EXPORT_SYMBOL(flush_scheduled_work);
3042 * Executes the function immediately if process context is available, 3069 * Executes the function immediately if process context is available,
3043 * otherwise schedules the function for delayed execution. 3070 * otherwise schedules the function for delayed execution.
3044 * 3071 *
3045 * Returns: 0 - function was executed 3072 * Return: 0 - function was executed
3046 * 1 - function was scheduled for execution 3073 * 1 - function was scheduled for execution
3047 */ 3074 */
3048int execute_in_process_context(work_func_t fn, struct execute_work *ew) 3075int execute_in_process_context(work_func_t fn, struct execute_work *ew)
@@ -3086,25 +3113,26 @@ static struct workqueue_struct *dev_to_wq(struct device *dev)
3086 return wq_dev->wq; 3113 return wq_dev->wq;
3087} 3114}
3088 3115
3089static ssize_t wq_per_cpu_show(struct device *dev, 3116static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
3090 struct device_attribute *attr, char *buf) 3117 char *buf)
3091{ 3118{
3092 struct workqueue_struct *wq = dev_to_wq(dev); 3119 struct workqueue_struct *wq = dev_to_wq(dev);
3093 3120
3094 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); 3121 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
3095} 3122}
3123static DEVICE_ATTR_RO(per_cpu);
3096 3124
3097static ssize_t wq_max_active_show(struct device *dev, 3125static ssize_t max_active_show(struct device *dev,
3098 struct device_attribute *attr, char *buf) 3126 struct device_attribute *attr, char *buf)
3099{ 3127{
3100 struct workqueue_struct *wq = dev_to_wq(dev); 3128 struct workqueue_struct *wq = dev_to_wq(dev);
3101 3129
3102 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); 3130 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
3103} 3131}
3104 3132
3105static ssize_t wq_max_active_store(struct device *dev, 3133static ssize_t max_active_store(struct device *dev,
3106 struct device_attribute *attr, 3134 struct device_attribute *attr, const char *buf,
3107 const char *buf, size_t count) 3135 size_t count)
3108{ 3136{
3109 struct workqueue_struct *wq = dev_to_wq(dev); 3137 struct workqueue_struct *wq = dev_to_wq(dev);
3110 int val; 3138 int val;
@@ -3115,12 +3143,14 @@ static ssize_t wq_max_active_store(struct device *dev,
3115 workqueue_set_max_active(wq, val); 3143 workqueue_set_max_active(wq, val);
3116 return count; 3144 return count;
3117} 3145}
3146static DEVICE_ATTR_RW(max_active);
3118 3147
3119static struct device_attribute wq_sysfs_attrs[] = { 3148static struct attribute *wq_sysfs_attrs[] = {
3120 __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), 3149 &dev_attr_per_cpu.attr,
3121 __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), 3150 &dev_attr_max_active.attr,
3122 __ATTR_NULL, 3151 NULL,
3123}; 3152};
3153ATTRIBUTE_GROUPS(wq_sysfs);
3124 3154
3125static ssize_t wq_pool_ids_show(struct device *dev, 3155static ssize_t wq_pool_ids_show(struct device *dev,
3126 struct device_attribute *attr, char *buf) 3156 struct device_attribute *attr, char *buf)
@@ -3270,7 +3300,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = {
3270 3300
3271static struct bus_type wq_subsys = { 3301static struct bus_type wq_subsys = {
3272 .name = "workqueue", 3302 .name = "workqueue",
3273 .dev_attrs = wq_sysfs_attrs, 3303 .dev_groups = wq_sysfs_groups,
3274}; 3304};
3275 3305
3276static int __init wq_sysfs_init(void) 3306static int __init wq_sysfs_init(void)
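
The sysfs hunks convert the workqueue attributes to the DEVICE_ATTR_RO()/DEVICE_ATTR_RW() + ATTRIBUTE_GROUPS() convention and switch the bus from the removed .dev_attrs field to .dev_groups. The macros derive dev_attr_<name> from <name>_show()/<name>_store(), and ATTRIBUTE_GROUPS(foo) builds foo_groups out of foo_attrs[]. A minimal, hypothetical "example" attribute showing the same shape (not taken from the patch):

#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/sysfs.h>

static int example_state;

static ssize_t example_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        return scnprintf(buf, PAGE_SIZE, "%d\n", example_state);
}

static ssize_t example_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t count)
{
        if (kstrtoint(buf, 0, &example_state))
                return -EINVAL;
        return count;
}
static DEVICE_ATTR_RW(example);         /* pairs with example_show()/example_store() */

static struct attribute *example_attrs[] = {
        &dev_attr_example.attr,
        NULL,
};
ATTRIBUTE_GROUPS(example);              /* generates example_groups for .dev_groups */

static struct bus_type example_subsys = {
        .name           = "example",
        .dev_groups     = example_groups,
};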
@@ -3299,7 +3329,7 @@ static void wq_device_release(struct device *dev)
3299 * apply_workqueue_attrs() may race against userland updating the 3329 * apply_workqueue_attrs() may race against userland updating the
3300 * attributes. 3330 * attributes.
3301 * 3331 *
3302 * Returns 0 on success, -errno on failure. 3332 * Return: 0 on success, -errno on failure.
3303 */ 3333 */
3304int workqueue_sysfs_register(struct workqueue_struct *wq) 3334int workqueue_sysfs_register(struct workqueue_struct *wq)
3305{ 3335{
@@ -3392,7 +3422,9 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
3392 * @gfp_mask: allocation mask to use 3422 * @gfp_mask: allocation mask to use
3393 * 3423 *
3394 * Allocate a new workqueue_attrs, initialize with default settings and 3424 * Allocate a new workqueue_attrs, initialize with default settings and
3395 * return it. Returns NULL on failure. 3425 * return it.
3426 *
 3427 * Return: The allocated new workqueue_attrs on success. %NULL on failure.
3396 */ 3428 */
3397struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) 3429struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
3398{ 3430{
@@ -3451,7 +3483,8 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
3451 * @pool: worker_pool to initialize 3483 * @pool: worker_pool to initialize
3452 * 3484 *
3453 * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. 3485 * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs.
3454 * Returns 0 on success, -errno on failure. Even on failure, all fields 3486 *
3487 * Return: 0 on success, -errno on failure. Even on failure, all fields
3455 * inside @pool proper are initialized and put_unbound_pool() can be called 3488 * inside @pool proper are initialized and put_unbound_pool() can be called
3456 * on @pool safely to release it. 3489 * on @pool safely to release it.
3457 */ 3490 */
@@ -3558,9 +3591,12 @@ static void put_unbound_pool(struct worker_pool *pool)
3558 * Obtain a worker_pool which has the same attributes as @attrs, bump the 3591 * Obtain a worker_pool which has the same attributes as @attrs, bump the
3559 * reference count and return it. If there already is a matching 3592 * reference count and return it. If there already is a matching
3560 * worker_pool, it will be used; otherwise, this function attempts to 3593 * worker_pool, it will be used; otherwise, this function attempts to
3561 * create a new one. On failure, returns NULL. 3594 * create a new one.
3562 * 3595 *
3563 * Should be called with wq_pool_mutex held. 3596 * Should be called with wq_pool_mutex held.
3597 *
3598 * Return: On success, a worker_pool with the same attributes as @attrs.
3599 * On failure, %NULL.
3564 */ 3600 */
3565static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) 3601static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3566{ 3602{
@@ -3796,9 +3832,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq)
3796 * 3832 *
3797 * Calculate the cpumask a workqueue with @attrs should use on @node. If 3833 * Calculate the cpumask a workqueue with @attrs should use on @node. If
3798 * @cpu_going_down is >= 0, that cpu is considered offline during 3834 * @cpu_going_down is >= 0, that cpu is considered offline during
3799 * calculation. The result is stored in @cpumask. This function returns 3835 * calculation. The result is stored in @cpumask.
3800 * %true if the resulting @cpumask is different from @attrs->cpumask,
3801 * %false if equal.
3802 * 3836 *
3803 * If NUMA affinity is not enabled, @attrs->cpumask is always used. If 3837 * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
3804 * enabled and @node has online CPUs requested by @attrs, the returned 3838 * enabled and @node has online CPUs requested by @attrs, the returned
@@ -3807,6 +3841,9 @@ static void free_unbound_pwq(struct pool_workqueue *pwq)
3807 * 3841 *
3808 * The caller is responsible for ensuring that the cpumask of @node stays 3842 * The caller is responsible for ensuring that the cpumask of @node stays
3809 * stable. 3843 * stable.
3844 *
3845 * Return: %true if the resulting @cpumask is different from @attrs->cpumask,
3846 * %false if equal.
3810 */ 3847 */
3811static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, 3848static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
3812 int cpu_going_down, cpumask_t *cpumask) 3849 int cpu_going_down, cpumask_t *cpumask)
@@ -3860,8 +3897,9 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
3860 * items finish. Note that a work item which repeatedly requeues itself 3897 * items finish. Note that a work item which repeatedly requeues itself
3861 * back-to-back will stay on its current pwq. 3898 * back-to-back will stay on its current pwq.
3862 * 3899 *
3863 * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on 3900 * Performs GFP_KERNEL allocations.
3864 * failure. 3901 *
3902 * Return: 0 on success and -errno on failure.
3865 */ 3903 */
3866int apply_workqueue_attrs(struct workqueue_struct *wq, 3904int apply_workqueue_attrs(struct workqueue_struct *wq,
3867 const struct workqueue_attrs *attrs) 3905 const struct workqueue_attrs *attrs)
@@ -4329,6 +4367,8 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
4329 * 4367 *
4330 * Determine whether %current is a workqueue rescuer. Can be used from 4368 * Determine whether %current is a workqueue rescuer. Can be used from
4331 * work functions to determine whether it's being run off the rescuer task. 4369 * work functions to determine whether it's being run off the rescuer task.
4370 *
4371 * Return: %true if %current is a workqueue rescuer. %false otherwise.
4332 */ 4372 */
4333bool current_is_workqueue_rescuer(void) 4373bool current_is_workqueue_rescuer(void)
4334{ 4374{
@@ -4352,7 +4392,7 @@ bool current_is_workqueue_rescuer(void)
4352 * workqueue being congested on one CPU doesn't mean the workqueue is also 4392 * workqueue being congested on one CPU doesn't mean the workqueue is also
 4353 * congested on other CPUs / NUMA nodes. 4393 * congested on other CPUs / NUMA nodes.
4354 * 4394 *
4355 * RETURNS: 4395 * Return:
4356 * %true if congested, %false otherwise. 4396 * %true if congested, %false otherwise.
4357 */ 4397 */
4358bool workqueue_congested(int cpu, struct workqueue_struct *wq) 4398bool workqueue_congested(int cpu, struct workqueue_struct *wq)
@@ -4385,7 +4425,7 @@ EXPORT_SYMBOL_GPL(workqueue_congested);
4385 * synchronization around this function and the test result is 4425 * synchronization around this function and the test result is
4386 * unreliable and only useful as advisory hints or for debugging. 4426 * unreliable and only useful as advisory hints or for debugging.
4387 * 4427 *
4388 * RETURNS: 4428 * Return:
4389 * OR'd bitmask of WORK_BUSY_* bits. 4429 * OR'd bitmask of WORK_BUSY_* bits.
4390 */ 4430 */
4391unsigned int work_busy(struct work_struct *work) 4431unsigned int work_busy(struct work_struct *work)
@@ -4763,9 +4803,10 @@ static void work_for_cpu_fn(struct work_struct *work)
4763 * @fn: the function to run 4803 * @fn: the function to run
4764 * @arg: the function arg 4804 * @arg: the function arg
4765 * 4805 *
4766 * This will return the value @fn returns.
4767 * It is up to the caller to ensure that the cpu doesn't go offline. 4806 * It is up to the caller to ensure that the cpu doesn't go offline.
4768 * The caller must not hold any locks which would prevent @fn from completing. 4807 * The caller must not hold any locks which would prevent @fn from completing.
4808 *
4809 * Return: The value @fn returns.
4769 */ 4810 */
4770long work_on_cpu(int cpu, long (*fn)(void *), void *arg) 4811long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
4771{ 4812{
@@ -4837,7 +4878,7 @@ void freeze_workqueues_begin(void)
4837 * CONTEXT: 4878 * CONTEXT:
4838 * Grabs and releases wq_pool_mutex. 4879 * Grabs and releases wq_pool_mutex.
4839 * 4880 *
4840 * RETURNS: 4881 * Return:
4841 * %true if some freezable workqueues are still busy. %false if freezing 4882 * %true if some freezable workqueues are still busy. %false if freezing
4842 * is complete. 4883 * is complete.
4843 */ 4884 */