Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 9
-rw-r--r--  kernel/cgroup.c | 461
-rw-r--r--  kernel/cgroup_freezer.c | 2
-rw-r--r--  kernel/cpu.c | 33
-rw-r--r--  kernel/cpuset.c | 500
-rw-r--r--  kernel/events/core.c | 42
-rw-r--r--  kernel/fork.c | 1
-rw-r--r--  kernel/futex.c | 402
-rw-r--r--  kernel/irq/generic-chip.c | 5
-rw-r--r--  kernel/irq/irqdomain.c | 2
-rw-r--r--  kernel/irq_work.c | 110
-rw-r--r--  kernel/kexec.c | 4
-rw-r--r--  kernel/kprobes.c | 14
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/locking/lockdep.c | 2
-rw-r--r--  kernel/locking/mcs_spinlock.c | 72
-rw-r--r--  kernel/locking/mcs_spinlock.h | 13
-rw-r--r--  kernel/locking/mutex.c | 41
-rw-r--r--  kernel/locking/qrwlock.c | 9
-rw-r--r--  kernel/locking/rtmutex-debug.c | 5
-rw-r--r--  kernel/locking/rtmutex-debug.h | 7
-rw-r--r--  kernel/locking/rtmutex.c | 562
-rw-r--r--  kernel/locking/rtmutex.h | 7
-rw-r--r--  kernel/locking/rtmutex_common.h | 22
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 28
-rw-r--r--  kernel/locking/rwsem-xadd.c | 20
-rw-r--r--  kernel/locking/rwsem.c | 2
-rw-r--r--  kernel/module.c | 4
-rw-r--r--  kernel/power/hibernate.c | 6
-rw-r--r--  kernel/power/process.c | 1
-rw-r--r--  kernel/power/suspend.c | 6
-rw-r--r--  kernel/ptrace.c | 8
-rw-r--r--  kernel/rcu/rcu.h | 8
-rw-r--r--  kernel/rcu/rcutorture.c | 4
-rw-r--r--  kernel/rcu/srcu.c | 4
-rw-r--r--  kernel/rcu/tree.c | 199
-rw-r--r--  kernel/rcu/tree.h | 42
-rw-r--r--  kernel/rcu/tree_plugin.h | 304
-rw-r--r--  kernel/rcu/update.c | 25
-rw-r--r--  kernel/sched/core.c | 128
-rw-r--r--  kernel/sched/cpuacct.c | 2
-rw-r--r--  kernel/sched/deadline.c | 18
-rw-r--r--  kernel/sched/debug.c | 2
-rw-r--r--  kernel/sched/fair.c | 244
-rw-r--r--  kernel/sched/idle.c | 4
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 30
-rw-r--r--  kernel/sched/sched.h | 38
-rw-r--r--  kernel/sched/wait.c | 30
-rw-r--r--  kernel/signal.c | 4
-rw-r--r--  kernel/smp.c | 9
-rw-r--r--  kernel/time/alarmtimer.c | 22
-rw-r--r--  kernel/time/clockevents.c | 10
-rw-r--r--  kernel/time/sched_clock.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 20
-rw-r--r--  kernel/torture.c | 2
-rw-r--r--  kernel/trace/Kconfig | 5
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/ftrace.c | 449
-rw-r--r--  kernel/trace/ring_buffer.c | 30
-rw-r--r--  kernel/trace/trace.c | 116
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_clock.c | 9
-rw-r--r--  kernel/trace/trace_event_perf.c | 12
-rw-r--r--  kernel/trace/trace_events.c | 61
-rw-r--r--  kernel/trace/trace_events_filter.c | 73
-rw-r--r--  kernel/trace/trace_functions_graph.c | 43
-rw-r--r--  kernel/trace/trace_output.c | 282
-rw-r--r--  kernel/trace/trace_output.h | 4
-rw-r--r--  kernel/trace/trace_seq.c | 428
-rw-r--r--  kernel/trace/trace_uprobe.c | 3
-rw-r--r--  kernel/workqueue.c | 206
72 files changed, 3355 insertions(+), 1926 deletions(-)
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 35536d9c0964..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,9 +220,16 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
220 220
221endif 221endif
222 222
223config ARCH_SUPPORTS_ATOMIC_RMW
224 bool
225
223config MUTEX_SPIN_ON_OWNER 226config MUTEX_SPIN_ON_OWNER
224 def_bool y 227 def_bool y
225 depends on SMP && !DEBUG_MUTEXES 228 depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
229
230config RWSEM_SPIN_ON_OWNER
231 def_bool y
232 depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
226 233
227config ARCH_USE_QUEUE_RWLOCK 234config ARCH_USE_QUEUE_RWLOCK
228 bool 235 bool
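The hunk above introduces ARCH_SUPPORTS_ATOMIC_RMW and gates optimistic spinning (MUTEX_SPIN_ON_OWNER, plus the new RWSEM_SPIN_ON_OWNER) on it, so the spinning fast paths are only built where the architecture has opted in. A minimal sketch of that opt-in, assuming a hypothetical architecture symbol; this fragment is illustrative and not part of the diff:

# arch/example/Kconfig (hypothetical)
config EXAMPLE_ARCH
	def_bool y
	# Native atomic RMW operations (cmpxchg and friends) make
	# spinning on a lock owner safe and worthwhile, so advertise
	# the capability; with SMP this enables MUTEX_SPIN_ON_OWNER
	# and RWSEM_SPIN_ON_OWNER.
	select ARCH_SUPPORTS_ATOMIC_RMW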
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 70776aec2562..7dc8788cfd52 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root;
149 */ 149 */
150static bool cgrp_dfl_root_visible; 150static bool cgrp_dfl_root_visible;
151 151
152/*
153 * Set by the boot param of the same name and makes subsystems with NULL
154 * ->dfl_files to use ->legacy_files on the default hierarchy.
155 */
156static bool cgroup_legacy_files_on_dfl;
157
152/* some controllers are not supported in the default hierarchy */ 158/* some controllers are not supported in the default hierarchy */
153static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 159static unsigned int cgrp_dfl_root_inhibit_ss_mask;
154#ifdef CONFIG_CGROUP_DEBUG
155 | (1 << debug_cgrp_id)
156#endif
157 ;
158 160
159/* The list of hierarchy roots */ 161/* The list of hierarchy roots */
160 162
@@ -180,13 +182,15 @@ static u64 css_serial_nr_next = 1;
180 */ 182 */
181static int need_forkexit_callback __read_mostly; 183static int need_forkexit_callback __read_mostly;
182 184
183static struct cftype cgroup_base_files[]; 185static struct cftype cgroup_dfl_base_files[];
186static struct cftype cgroup_legacy_base_files[];
184 187
185static void cgroup_put(struct cgroup *cgrp); 188static void cgroup_put(struct cgroup *cgrp);
186static int rebind_subsystems(struct cgroup_root *dst_root, 189static int rebind_subsystems(struct cgroup_root *dst_root,
187 unsigned int ss_mask); 190 unsigned int ss_mask);
188static int cgroup_destroy_locked(struct cgroup *cgrp); 191static int cgroup_destroy_locked(struct cgroup *cgrp);
189static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); 192static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
193 bool visible);
190static void css_release(struct percpu_ref *ref); 194static void css_release(struct percpu_ref *ref);
191static void kill_css(struct cgroup_subsys_state *css); 195static void kill_css(struct cgroup_subsys_state *css);
192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 196static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
@@ -1037,6 +1041,58 @@ static void cgroup_put(struct cgroup *cgrp)
1037} 1041}
1038 1042
1039/** 1043/**
1044 * cgroup_refresh_child_subsys_mask - update child_subsys_mask
1045 * @cgrp: the target cgroup
1046 *
1047 * On the default hierarchy, a subsystem may request other subsystems to be
1048 * enabled together through its ->depends_on mask. In such cases, more
1049 * subsystems than specified in "cgroup.subtree_control" may be enabled.
1050 *
1051 * This function determines which subsystems need to be enabled given the
1052 * current @cgrp->subtree_control and records it in
1053 * @cgrp->child_subsys_mask. The resulting mask is always a superset of
1054 * @cgrp->subtree_control and follows the usual hierarchy rules.
1055 */
1056static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
1057{
1058 struct cgroup *parent = cgroup_parent(cgrp);
1059 unsigned int cur_ss_mask = cgrp->subtree_control;
1060 struct cgroup_subsys *ss;
1061 int ssid;
1062
1063 lockdep_assert_held(&cgroup_mutex);
1064
1065 if (!cgroup_on_dfl(cgrp)) {
1066 cgrp->child_subsys_mask = cur_ss_mask;
1067 return;
1068 }
1069
1070 while (true) {
1071 unsigned int new_ss_mask = cur_ss_mask;
1072
1073 for_each_subsys(ss, ssid)
1074 if (cur_ss_mask & (1 << ssid))
1075 new_ss_mask |= ss->depends_on;
1076
1077 /*
1078 * Mask out subsystems which aren't available. This can
1079 * happen only if some depended-upon subsystems were bound
1080 * to non-default hierarchies.
1081 */
1082 if (parent)
1083 new_ss_mask &= parent->child_subsys_mask;
1084 else
1085 new_ss_mask &= cgrp->root->subsys_mask;
1086
1087 if (new_ss_mask == cur_ss_mask)
1088 break;
1089 cur_ss_mask = new_ss_mask;
1090 }
1091
1092 cgrp->child_subsys_mask = cur_ss_mask;
1093}
1094
1095/**
1040 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods 1096 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1041 * @kn: the kernfs_node being serviced 1097 * @kn: the kernfs_node being serviced
1042 * 1098 *
@@ -1208,12 +1264,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
1208 up_write(&css_set_rwsem); 1264 up_write(&css_set_rwsem);
1209 1265
1210 src_root->subsys_mask &= ~(1 << ssid); 1266 src_root->subsys_mask &= ~(1 << ssid);
1211 src_root->cgrp.child_subsys_mask &= ~(1 << ssid); 1267 src_root->cgrp.subtree_control &= ~(1 << ssid);
1268 cgroup_refresh_child_subsys_mask(&src_root->cgrp);
1212 1269
1213 /* default hierarchy doesn't enable controllers by default */ 1270 /* default hierarchy doesn't enable controllers by default */
1214 dst_root->subsys_mask |= 1 << ssid; 1271 dst_root->subsys_mask |= 1 << ssid;
1215 if (dst_root != &cgrp_dfl_root) 1272 if (dst_root != &cgrp_dfl_root) {
1216 dst_root->cgrp.child_subsys_mask |= 1 << ssid; 1273 dst_root->cgrp.subtree_control |= 1 << ssid;
1274 cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
1275 }
1217 1276
1218 if (ss->bind) 1277 if (ss->bind)
1219 ss->bind(css); 1278 ss->bind(css);
@@ -1233,8 +1292,6 @@ static int cgroup_show_options(struct seq_file *seq,
1233 for_each_subsys(ss, ssid) 1292 for_each_subsys(ss, ssid)
1234 if (root->subsys_mask & (1 << ssid)) 1293 if (root->subsys_mask & (1 << ssid))
1235 seq_printf(seq, ",%s", ss->name); 1294 seq_printf(seq, ",%s", ss->name);
1236 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1237 seq_puts(seq, ",sane_behavior");
1238 if (root->flags & CGRP_ROOT_NOPREFIX) 1295 if (root->flags & CGRP_ROOT_NOPREFIX)
1239 seq_puts(seq, ",noprefix"); 1296 seq_puts(seq, ",noprefix");
1240 if (root->flags & CGRP_ROOT_XATTR) 1297 if (root->flags & CGRP_ROOT_XATTR)
@@ -1268,6 +1325,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1268 bool all_ss = false, one_ss = false; 1325 bool all_ss = false, one_ss = false;
1269 unsigned int mask = -1U; 1326 unsigned int mask = -1U;
1270 struct cgroup_subsys *ss; 1327 struct cgroup_subsys *ss;
1328 int nr_opts = 0;
1271 int i; 1329 int i;
1272 1330
1273#ifdef CONFIG_CPUSETS 1331#ifdef CONFIG_CPUSETS
@@ -1277,6 +1335,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1277 memset(opts, 0, sizeof(*opts)); 1335 memset(opts, 0, sizeof(*opts));
1278 1336
1279 while ((token = strsep(&o, ",")) != NULL) { 1337 while ((token = strsep(&o, ",")) != NULL) {
1338 nr_opts++;
1339
1280 if (!*token) 1340 if (!*token)
1281 return -EINVAL; 1341 return -EINVAL;
1282 if (!strcmp(token, "none")) { 1342 if (!strcmp(token, "none")) {
@@ -1361,37 +1421,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1361 return -ENOENT; 1421 return -ENOENT;
1362 } 1422 }
1363 1423
1364 /* Consistency checks */
1365
1366 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1424 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1367 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1425 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1368 1426 if (nr_opts != 1) {
1369 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1427 pr_err("sane_behavior: no other mount options allowed\n");
1370 opts->cpuset_clone_children || opts->release_agent ||
1371 opts->name) {
1372 pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1373 return -EINVAL; 1428 return -EINVAL;
1374 } 1429 }
1375 } else { 1430 return 0;
1376 /*
1377 * If the 'all' option was specified select all the
1378 * subsystems, otherwise if 'none', 'name=' and a subsystem
1379 * name options were not specified, let's default to 'all'
1380 */
1381 if (all_ss || (!one_ss && !opts->none && !opts->name))
1382 for_each_subsys(ss, i)
1383 if (!ss->disabled)
1384 opts->subsys_mask |= (1 << i);
1385
1386 /*
1387 * We either have to specify by name or by subsystems. (So
1388 * all empty hierarchies must have a name).
1389 */
1390 if (!opts->subsys_mask && !opts->name)
1391 return -EINVAL;
1392 } 1431 }
1393 1432
1394 /* 1433 /*
1434 * If the 'all' option was specified select all the subsystems,
1435 * otherwise if 'none', 'name=' and a subsystem name options were
1436 * not specified, let's default to 'all'
1437 */
1438 if (all_ss || (!one_ss && !opts->none && !opts->name))
1439 for_each_subsys(ss, i)
1440 if (!ss->disabled)
1441 opts->subsys_mask |= (1 << i);
1442
1443 /*
1444 * We either have to specify by name or by subsystems. (So all
1445 * empty hierarchies must have a name).
1446 */
1447 if (!opts->subsys_mask && !opts->name)
1448 return -EINVAL;
1449
1450 /*
1395 * Option noprefix was introduced just for backward compatibility 1451 * Option noprefix was introduced just for backward compatibility
1396 * with the old cpuset, so we allow noprefix only if mounting just 1452 * with the old cpuset, so we allow noprefix only if mounting just
1397 * the cpuset subsystem. 1453 * the cpuset subsystem.
@@ -1399,7 +1455,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1399 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) 1455 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1400 return -EINVAL; 1456 return -EINVAL;
1401 1457
1402
1403 /* Can't specify "none" and some subsystems */ 1458 /* Can't specify "none" and some subsystems */
1404 if (opts->subsys_mask && opts->none) 1459 if (opts->subsys_mask && opts->none)
1405 return -EINVAL; 1460 return -EINVAL;
@@ -1414,8 +1469,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1414 struct cgroup_sb_opts opts; 1469 struct cgroup_sb_opts opts;
1415 unsigned int added_mask, removed_mask; 1470 unsigned int added_mask, removed_mask;
1416 1471
1417 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1472 if (root == &cgrp_dfl_root) {
1418 pr_err("sane_behavior: remount is not allowed\n"); 1473 pr_err("remount is not allowed\n");
1419 return -EINVAL; 1474 return -EINVAL;
1420 } 1475 }
1421 1476
@@ -1434,11 +1489,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1434 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1489 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1435 1490
1436 /* Don't allow flags or name to change at remount */ 1491 /* Don't allow flags or name to change at remount */
1437 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1492 if ((opts.flags ^ root->flags) ||
1438 (opts.name && strcmp(opts.name, root->name))) { 1493 (opts.name && strcmp(opts.name, root->name))) {
1439 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", 1494 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1440 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1495 opts.flags, opts.name ?: "", root->flags, root->name);
1441 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1442 ret = -EINVAL; 1496 ret = -EINVAL;
1443 goto out_unlock; 1497 goto out_unlock;
1444 } 1498 }
@@ -1563,6 +1617,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1563{ 1617{
1564 LIST_HEAD(tmp_links); 1618 LIST_HEAD(tmp_links);
1565 struct cgroup *root_cgrp = &root->cgrp; 1619 struct cgroup *root_cgrp = &root->cgrp;
1620 struct cftype *base_files;
1566 struct css_set *cset; 1621 struct css_set *cset;
1567 int i, ret; 1622 int i, ret;
1568 1623
@@ -1600,7 +1655,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1600 } 1655 }
1601 root_cgrp->kn = root->kf_root->kn; 1656 root_cgrp->kn = root->kf_root->kn;
1602 1657
1603 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); 1658 if (root == &cgrp_dfl_root)
1659 base_files = cgroup_dfl_base_files;
1660 else
1661 base_files = cgroup_legacy_base_files;
1662
1663 ret = cgroup_addrm_files(root_cgrp, base_files, true);
1604 if (ret) 1664 if (ret)
1605 goto destroy_root; 1665 goto destroy_root;
1606 1666
@@ -1638,7 +1698,7 @@ destroy_root:
1638exit_root_id: 1698exit_root_id:
1639 cgroup_exit_root_id(root); 1699 cgroup_exit_root_id(root);
1640cancel_ref: 1700cancel_ref:
1641 percpu_ref_cancel_init(&root_cgrp->self.refcnt); 1701 percpu_ref_exit(&root_cgrp->self.refcnt);
1642out: 1702out:
1643 free_cgrp_cset_links(&tmp_links); 1703 free_cgrp_cset_links(&tmp_links);
1644 return ret; 1704 return ret;
@@ -1672,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1672 goto out_unlock; 1732 goto out_unlock;
1673 1733
1674 /* look for a matching existing root */ 1734 /* look for a matching existing root */
1675 if (!opts.subsys_mask && !opts.none && !opts.name) { 1735 if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
1676 cgrp_dfl_root_visible = true; 1736 cgrp_dfl_root_visible = true;
1677 root = &cgrp_dfl_root; 1737 root = &cgrp_dfl_root;
1678 cgroup_get(&root->cgrp); 1738 cgroup_get(&root->cgrp);
@@ -1730,15 +1790,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1730 goto out_unlock; 1790 goto out_unlock;
1731 } 1791 }
1732 1792
1733 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1793 if (root->flags ^ opts.flags)
1734 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1794 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1735 pr_err("sane_behavior: new mount options should match the existing superblock\n");
1736 ret = -EINVAL;
1737 goto out_unlock;
1738 } else {
1739 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1740 }
1741 }
1742 1795
1743 /* 1796 /*
1744 * We want to reuse @root whose lifetime is governed by its 1797 * We want to reuse @root whose lifetime is governed by its
@@ -2457,9 +2510,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2457 2510
2458static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) 2511static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2459{ 2512{
2460 struct cgroup *cgrp = seq_css(seq)->cgroup; 2513 seq_puts(seq, "0\n");
2461
2462 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2463 return 0; 2514 return 0;
2464} 2515}
2465 2516
@@ -2496,7 +2547,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v)
2496{ 2547{
2497 struct cgroup *cgrp = seq_css(seq)->cgroup; 2548 struct cgroup *cgrp = seq_css(seq)->cgroup;
2498 2549
2499 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); 2550 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
2500 return 0; 2551 return 0;
2501} 2552}
2502 2553
@@ -2505,7 +2556,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2505{ 2556{
2506 struct cgroup *cgrp = seq_css(seq)->cgroup; 2557 struct cgroup *cgrp = seq_css(seq)->cgroup;
2507 2558
2508 cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); 2559 cgroup_print_ss_mask(seq, cgrp->subtree_control);
2509 return 0; 2560 return 0;
2510} 2561}
2511 2562
@@ -2611,6 +2662,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2611 loff_t off) 2662 loff_t off)
2612{ 2663{
2613 unsigned int enable = 0, disable = 0; 2664 unsigned int enable = 0, disable = 0;
2665 unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
2614 struct cgroup *cgrp, *child; 2666 struct cgroup *cgrp, *child;
2615 struct cgroup_subsys *ss; 2667 struct cgroup_subsys *ss;
2616 char *tok; 2668 char *tok;
@@ -2650,11 +2702,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2650 2702
2651 for_each_subsys(ss, ssid) { 2703 for_each_subsys(ss, ssid) {
2652 if (enable & (1 << ssid)) { 2704 if (enable & (1 << ssid)) {
2653 if (cgrp->child_subsys_mask & (1 << ssid)) { 2705 if (cgrp->subtree_control & (1 << ssid)) {
2654 enable &= ~(1 << ssid); 2706 enable &= ~(1 << ssid);
2655 continue; 2707 continue;
2656 } 2708 }
2657 2709
2710 /* unavailable or not enabled on the parent? */
2711 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2712 (cgroup_parent(cgrp) &&
2713 !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
2714 ret = -ENOENT;
2715 goto out_unlock;
2716 }
2717
2718 /*
2719 * @ss is already enabled through dependency and
2720 * we'll just make it visible. Skip draining.
2721 */
2722 if (cgrp->child_subsys_mask & (1 << ssid))
2723 continue;
2724
2658 /* 2725 /*
2659 * Because css offlining is asynchronous, userland 2726 * Because css offlining is asynchronous, userland
2660 * might try to re-enable the same controller while 2727 * might try to re-enable the same controller while
@@ -2677,23 +2744,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2677 2744
2678 return restart_syscall(); 2745 return restart_syscall();
2679 } 2746 }
2680
2681 /* unavailable or not enabled on the parent? */
2682 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2683 (cgroup_parent(cgrp) &&
2684 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
2685 ret = -ENOENT;
2686 goto out_unlock;
2687 }
2688 } else if (disable & (1 << ssid)) { 2747 } else if (disable & (1 << ssid)) {
2689 if (!(cgrp->child_subsys_mask & (1 << ssid))) { 2748 if (!(cgrp->subtree_control & (1 << ssid))) {
2690 disable &= ~(1 << ssid); 2749 disable &= ~(1 << ssid);
2691 continue; 2750 continue;
2692 } 2751 }
2693 2752
2694 /* a child has it enabled? */ 2753 /* a child has it enabled? */
2695 cgroup_for_each_live_child(child, cgrp) { 2754 cgroup_for_each_live_child(child, cgrp) {
2696 if (child->child_subsys_mask & (1 << ssid)) { 2755 if (child->subtree_control & (1 << ssid)) {
2697 ret = -EBUSY; 2756 ret = -EBUSY;
2698 goto out_unlock; 2757 goto out_unlock;
2699 } 2758 }
@@ -2707,7 +2766,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2707 } 2766 }
2708 2767
2709 /* 2768 /*
2710 * Except for the root, child_subsys_mask must be zero for a cgroup 2769 * Except for the root, subtree_control must be zero for a cgroup
2711 * with tasks so that child cgroups don't compete against tasks. 2770 * with tasks so that child cgroups don't compete against tasks.
2712 */ 2771 */
2713 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { 2772 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
@@ -2716,36 +2775,75 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2716 } 2775 }
2717 2776
2718 /* 2777 /*
2719 * Create csses for enables and update child_subsys_mask. This 2778 * Update subsys masks and calculate what needs to be done. More
2720 * changes cgroup_e_css() results which in turn makes the 2779 * subsystems than specified may need to be enabled or disabled
2721 * subsequent cgroup_update_dfl_csses() associate all tasks in the 2780 * depending on subsystem dependencies.
2722 * subtree to the updated csses. 2781 */
2782 cgrp->subtree_control |= enable;
2783 cgrp->subtree_control &= ~disable;
2784
2785 old_ctrl = cgrp->child_subsys_mask;
2786 cgroup_refresh_child_subsys_mask(cgrp);
2787 new_ctrl = cgrp->child_subsys_mask;
2788
2789 css_enable = ~old_ctrl & new_ctrl;
2790 css_disable = old_ctrl & ~new_ctrl;
2791 enable |= css_enable;
2792 disable |= css_disable;
2793
2794 /*
2795 * Create new csses or make the existing ones visible. A css is
2796 * created invisible if it's being implicitly enabled through
2797 * dependency. An invisible css is made visible when the userland
2798 * explicitly enables it.
2723 */ 2799 */
2724 for_each_subsys(ss, ssid) { 2800 for_each_subsys(ss, ssid) {
2725 if (!(enable & (1 << ssid))) 2801 if (!(enable & (1 << ssid)))
2726 continue; 2802 continue;
2727 2803
2728 cgroup_for_each_live_child(child, cgrp) { 2804 cgroup_for_each_live_child(child, cgrp) {
2729 ret = create_css(child, ss); 2805 if (css_enable & (1 << ssid))
2806 ret = create_css(child, ss,
2807 cgrp->subtree_control & (1 << ssid));
2808 else
2809 ret = cgroup_populate_dir(child, 1 << ssid);
2730 if (ret) 2810 if (ret)
2731 goto err_undo_css; 2811 goto err_undo_css;
2732 } 2812 }
2733 } 2813 }
2734 2814
2735 cgrp->child_subsys_mask |= enable; 2815 /*
2736 cgrp->child_subsys_mask &= ~disable; 2816 * At this point, cgroup_e_css() results reflect the new csses
2737 2817 * making the following cgroup_update_dfl_csses() properly update
2818 * css associations of all tasks in the subtree.
2819 */
2738 ret = cgroup_update_dfl_csses(cgrp); 2820 ret = cgroup_update_dfl_csses(cgrp);
2739 if (ret) 2821 if (ret)
2740 goto err_undo_css; 2822 goto err_undo_css;
2741 2823
2742 /* all tasks are now migrated away from the old csses, kill them */ 2824 /*
2825 * All tasks are migrated out of disabled csses. Kill or hide
2826 * them. A css is hidden when the userland requests it to be
2827 * disabled while other subsystems are still depending on it. The
2828 * css must not actively control resources and be in the vanilla
2829 * state if it's made visible again later. Controllers which may
2830 * be depended upon should provide ->css_reset() for this purpose.
2831 */
2743 for_each_subsys(ss, ssid) { 2832 for_each_subsys(ss, ssid) {
2744 if (!(disable & (1 << ssid))) 2833 if (!(disable & (1 << ssid)))
2745 continue; 2834 continue;
2746 2835
2747 cgroup_for_each_live_child(child, cgrp) 2836 cgroup_for_each_live_child(child, cgrp) {
2748 kill_css(cgroup_css(child, ss)); 2837 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2838
2839 if (css_disable & (1 << ssid)) {
2840 kill_css(css);
2841 } else {
2842 cgroup_clear_dir(child, 1 << ssid);
2843 if (ss->css_reset)
2844 ss->css_reset(css);
2845 }
2846 }
2749 } 2847 }
2750 2848
2751 kernfs_activate(cgrp->kn); 2849 kernfs_activate(cgrp->kn);
@@ -2755,8 +2853,9 @@ out_unlock:
2755 return ret ?: nbytes; 2853 return ret ?: nbytes;
2756 2854
2757err_undo_css: 2855err_undo_css:
2758 cgrp->child_subsys_mask &= ~enable; 2856 cgrp->subtree_control &= ~enable;
2759 cgrp->child_subsys_mask |= disable; 2857 cgrp->subtree_control |= disable;
2858 cgroup_refresh_child_subsys_mask(cgrp);
2760 2859
2761 for_each_subsys(ss, ssid) { 2860 for_each_subsys(ss, ssid) {
2762 if (!(enable & (1 << ssid))) 2861 if (!(enable & (1 << ssid)))
@@ -2764,8 +2863,14 @@ err_undo_css:
2764 2863
2765 cgroup_for_each_live_child(child, cgrp) { 2864 cgroup_for_each_live_child(child, cgrp) {
2766 struct cgroup_subsys_state *css = cgroup_css(child, ss); 2865 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2767 if (css) 2866
2867 if (!css)
2868 continue;
2869
2870 if (css_enable & (1 << ssid))
2768 kill_css(css); 2871 kill_css(css);
2872 else
2873 cgroup_clear_dir(child, 1 << ssid);
2769 } 2874 }
2770 } 2875 }
2771 goto out_unlock; 2876 goto out_unlock;
@@ -2878,9 +2983,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2878 2983
2879 /* 2984 /*
2880 * This isn't a proper migration and its usefulness is very 2985 * This isn't a proper migration and its usefulness is very
2881 * limited. Disallow if sane_behavior. 2986 * limited. Disallow on the default hierarchy.
2882 */ 2987 */
2883 if (cgroup_sane_behavior(cgrp)) 2988 if (cgroup_on_dfl(cgrp))
2884 return -EPERM; 2989 return -EPERM;
2885 2990
2886 /* 2991 /*
@@ -2964,9 +3069,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2964 3069
2965 for (cft = cfts; cft->name[0] != '\0'; cft++) { 3070 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2966 /* does cft->flags tell us to skip this file on @cgrp? */ 3071 /* does cft->flags tell us to skip this file on @cgrp? */
2967 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) 3072 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2968 continue; 3073 continue;
2969 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 3074 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
2970 continue; 3075 continue;
2971 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) 3076 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
2972 continue; 3077 continue;
@@ -3024,6 +3129,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
3024 kfree(cft->kf_ops); 3129 kfree(cft->kf_ops);
3025 cft->kf_ops = NULL; 3130 cft->kf_ops = NULL;
3026 cft->ss = NULL; 3131 cft->ss = NULL;
3132
3133 /* revert flags set by cgroup core while adding @cfts */
3134 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
3027 } 3135 }
3028} 3136}
3029 3137
@@ -3109,7 +3217,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
3109 * function currently returns 0 as long as @cfts registration is successful 3217 * function currently returns 0 as long as @cfts registration is successful
3110 * even if some file creation attempts on existing cgroups fail. 3218 * even if some file creation attempts on existing cgroups fail.
3111 */ 3219 */
3112int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 3220static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3113{ 3221{
3114 int ret; 3222 int ret;
3115 3223
@@ -3135,6 +3243,40 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3135} 3243}
3136 3244
3137/** 3245/**
3246 * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
3247 * @ss: target cgroup subsystem
3248 * @cfts: zero-length name terminated array of cftypes
3249 *
3250 * Similar to cgroup_add_cftypes() but the added files are only used for
3251 * the default hierarchy.
3252 */
3253int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3254{
3255 struct cftype *cft;
3256
3257 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3258 cft->flags |= __CFTYPE_ONLY_ON_DFL;
3259 return cgroup_add_cftypes(ss, cfts);
3260}
3261
3262/**
3263 * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
3264 * @ss: target cgroup subsystem
3265 * @cfts: zero-length name terminated array of cftypes
3266 *
3267 * Similar to cgroup_add_cftypes() but the added files are only used for
3268 * the legacy hierarchies.
3269 */
3270int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3271{
3272 struct cftype *cft;
3273
3274 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3275 cft->flags |= __CFTYPE_NOT_ON_DFL;
3276 return cgroup_add_cftypes(ss, cfts);
3277}
3278
3279/**
3138 * cgroup_task_count - count the number of tasks in a cgroup. 3280 * cgroup_task_count - count the number of tasks in a cgroup.
3139 * @cgrp: the cgroup in question 3281 * @cgrp: the cgroup in question
3140 * 3282 *
@@ -3699,8 +3841,9 @@ after:
3699 * 3841 *
3700 * All this extra complexity was caused by the original implementation 3842 * All this extra complexity was caused by the original implementation
3701 * committing to an entirely unnecessary property. In the long term, we 3843 * committing to an entirely unnecessary property. In the long term, we
3702 * want to do away with it. Explicitly scramble sort order if 3844 * want to do away with it. Explicitly scramble sort order if on the
3703 * sane_behavior so that no such expectation exists in the new interface. 3845 * default hierarchy so that no such expectation exists in the new
3846 * interface.
3704 * 3847 *
3705 * Scrambling is done by swapping every two consecutive bits, which is 3848 * Scrambling is done by swapping every two consecutive bits, which is
3706 * non-identity one-to-one mapping which disturbs sort order sufficiently. 3849 * non-identity one-to-one mapping which disturbs sort order sufficiently.
@@ -3715,7 +3858,7 @@ static pid_t pid_fry(pid_t pid)
3715 3858
3716static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) 3859static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3717{ 3860{
3718 if (cgroup_sane_behavior(cgrp)) 3861 if (cgroup_on_dfl(cgrp))
3719 return pid_fry(pid); 3862 return pid_fry(pid);
3720 else 3863 else
3721 return pid; 3864 return pid;
@@ -3818,7 +3961,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3818 css_task_iter_end(&it); 3961 css_task_iter_end(&it);
3819 length = n; 3962 length = n;
3820 /* now sort & (if procs) strip out duplicates */ 3963 /* now sort & (if procs) strip out duplicates */
3821 if (cgroup_sane_behavior(cgrp)) 3964 if (cgroup_on_dfl(cgrp))
3822 sort(array, length, sizeof(pid_t), fried_cmppid, NULL); 3965 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3823 else 3966 else
3824 sort(array, length, sizeof(pid_t), cmppid, NULL); 3967 sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -4040,7 +4183,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4040 return 0; 4183 return 0;
4041} 4184}
4042 4185
4043static struct cftype cgroup_base_files[] = { 4186/* cgroup core interface files for the default hierarchy */
4187static struct cftype cgroup_dfl_base_files[] = {
4044 { 4188 {
4045 .name = "cgroup.procs", 4189 .name = "cgroup.procs",
4046 .seq_start = cgroup_pidlist_start, 4190 .seq_start = cgroup_pidlist_start,
@@ -4052,46 +4196,52 @@ static struct cftype cgroup_base_files[] = {
4052 .mode = S_IRUGO | S_IWUSR, 4196 .mode = S_IRUGO | S_IWUSR,
4053 }, 4197 },
4054 { 4198 {
4055 .name = "cgroup.clone_children",
4056 .flags = CFTYPE_INSANE,
4057 .read_u64 = cgroup_clone_children_read,
4058 .write_u64 = cgroup_clone_children_write,
4059 },
4060 {
4061 .name = "cgroup.sane_behavior",
4062 .flags = CFTYPE_ONLY_ON_ROOT,
4063 .seq_show = cgroup_sane_behavior_show,
4064 },
4065 {
4066 .name = "cgroup.controllers", 4199 .name = "cgroup.controllers",
4067 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, 4200 .flags = CFTYPE_ONLY_ON_ROOT,
4068 .seq_show = cgroup_root_controllers_show, 4201 .seq_show = cgroup_root_controllers_show,
4069 }, 4202 },
4070 { 4203 {
4071 .name = "cgroup.controllers", 4204 .name = "cgroup.controllers",
4072 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, 4205 .flags = CFTYPE_NOT_ON_ROOT,
4073 .seq_show = cgroup_controllers_show, 4206 .seq_show = cgroup_controllers_show,
4074 }, 4207 },
4075 { 4208 {
4076 .name = "cgroup.subtree_control", 4209 .name = "cgroup.subtree_control",
4077 .flags = CFTYPE_ONLY_ON_DFL,
4078 .seq_show = cgroup_subtree_control_show, 4210 .seq_show = cgroup_subtree_control_show,
4079 .write = cgroup_subtree_control_write, 4211 .write = cgroup_subtree_control_write,
4080 }, 4212 },
4081 { 4213 {
4082 .name = "cgroup.populated", 4214 .name = "cgroup.populated",
4083 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, 4215 .flags = CFTYPE_NOT_ON_ROOT,
4084 .seq_show = cgroup_populated_show, 4216 .seq_show = cgroup_populated_show,
4085 }, 4217 },
4218 { } /* terminate */
4219};
4086 4220
4087 /* 4221/* cgroup core interface files for the legacy hierarchies */
4088 * Historical crazy stuff. These don't have "cgroup." prefix and 4222static struct cftype cgroup_legacy_base_files[] = {
4089 * don't exist if sane_behavior. If you're depending on these, be 4223 {
4090 * prepared to be burned. 4224 .name = "cgroup.procs",
4091 */ 4225 .seq_start = cgroup_pidlist_start,
4226 .seq_next = cgroup_pidlist_next,
4227 .seq_stop = cgroup_pidlist_stop,
4228 .seq_show = cgroup_pidlist_show,
4229 .private = CGROUP_FILE_PROCS,
4230 .write = cgroup_procs_write,
4231 .mode = S_IRUGO | S_IWUSR,
4232 },
4233 {
4234 .name = "cgroup.clone_children",
4235 .read_u64 = cgroup_clone_children_read,
4236 .write_u64 = cgroup_clone_children_write,
4237 },
4238 {
4239 .name = "cgroup.sane_behavior",
4240 .flags = CFTYPE_ONLY_ON_ROOT,
4241 .seq_show = cgroup_sane_behavior_show,
4242 },
4092 { 4243 {
4093 .name = "tasks", 4244 .name = "tasks",
4094 .flags = CFTYPE_INSANE, /* use "procs" instead */
4095 .seq_start = cgroup_pidlist_start, 4245 .seq_start = cgroup_pidlist_start,
4096 .seq_next = cgroup_pidlist_next, 4246 .seq_next = cgroup_pidlist_next,
4097 .seq_stop = cgroup_pidlist_stop, 4247 .seq_stop = cgroup_pidlist_stop,
@@ -4102,13 +4252,12 @@ static struct cftype cgroup_base_files[] = {
4102 }, 4252 },
4103 { 4253 {
4104 .name = "notify_on_release", 4254 .name = "notify_on_release",
4105 .flags = CFTYPE_INSANE,
4106 .read_u64 = cgroup_read_notify_on_release, 4255 .read_u64 = cgroup_read_notify_on_release,
4107 .write_u64 = cgroup_write_notify_on_release, 4256 .write_u64 = cgroup_write_notify_on_release,
4108 }, 4257 },
4109 { 4258 {
4110 .name = "release_agent", 4259 .name = "release_agent",
4111 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 4260 .flags = CFTYPE_ONLY_ON_ROOT,
4112 .seq_show = cgroup_release_agent_show, 4261 .seq_show = cgroup_release_agent_show,
4113 .write = cgroup_release_agent_write, 4262 .write = cgroup_release_agent_write,
4114 .max_write_len = PATH_MAX - 1, 4263 .max_write_len = PATH_MAX - 1,
@@ -4175,6 +4324,8 @@ static void css_free_work_fn(struct work_struct *work)
4175 container_of(work, struct cgroup_subsys_state, destroy_work); 4324 container_of(work, struct cgroup_subsys_state, destroy_work);
4176 struct cgroup *cgrp = css->cgroup; 4325 struct cgroup *cgrp = css->cgroup;
4177 4326
4327 percpu_ref_exit(&css->refcnt);
4328
4178 if (css->ss) { 4329 if (css->ss) {
4179 /* css free path */ 4330 /* css free path */
4180 if (css->parent) 4331 if (css->parent)
@@ -4314,12 +4465,14 @@ static void offline_css(struct cgroup_subsys_state *css)
4314 * create_css - create a cgroup_subsys_state 4465 * create_css - create a cgroup_subsys_state
4315 * @cgrp: the cgroup new css will be associated with 4466 * @cgrp: the cgroup new css will be associated with
4316 * @ss: the subsys of new css 4467 * @ss: the subsys of new css
4468 * @visible: whether to create control knobs for the new css or not
4317 * 4469 *
4318 * Create a new css associated with @cgrp - @ss pair. On success, the new 4470 * Create a new css associated with @cgrp - @ss pair. On success, the new
4319 * css is online and installed in @cgrp with all interface files created. 4471 * css is online and installed in @cgrp with all interface files created if
4320 * Returns 0 on success, -errno on failure. 4472 * @visible. Returns 0 on success, -errno on failure.
4321 */ 4473 */
4322static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 4474static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4475 bool visible)
4323{ 4476{
4324 struct cgroup *parent = cgroup_parent(cgrp); 4477 struct cgroup *parent = cgroup_parent(cgrp);
4325 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); 4478 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
@@ -4343,9 +4496,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
4343 goto err_free_percpu_ref; 4496 goto err_free_percpu_ref;
4344 css->id = err; 4497 css->id = err;
4345 4498
4346 err = cgroup_populate_dir(cgrp, 1 << ss->id); 4499 if (visible) {
4347 if (err) 4500 err = cgroup_populate_dir(cgrp, 1 << ss->id);
4348 goto err_free_id; 4501 if (err)
4502 goto err_free_id;
4503 }
4349 4504
4350 /* @css is ready to be brought online now, make it visible */ 4505 /* @css is ready to be brought online now, make it visible */
4351 list_add_tail_rcu(&css->sibling, &parent_css->children); 4506 list_add_tail_rcu(&css->sibling, &parent_css->children);
@@ -4372,7 +4527,7 @@ err_list_del:
4372err_free_id: 4527err_free_id:
4373 cgroup_idr_remove(&ss->css_idr, css->id); 4528 cgroup_idr_remove(&ss->css_idr, css->id);
4374err_free_percpu_ref: 4529err_free_percpu_ref:
4375 percpu_ref_cancel_init(&css->refcnt); 4530 percpu_ref_exit(&css->refcnt);
4376err_free_css: 4531err_free_css:
4377 call_rcu(&css->rcu_head, css_free_rcu_fn); 4532 call_rcu(&css->rcu_head, css_free_rcu_fn);
4378 return err; 4533 return err;
@@ -4385,6 +4540,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4385 struct cgroup_root *root; 4540 struct cgroup_root *root;
4386 struct cgroup_subsys *ss; 4541 struct cgroup_subsys *ss;
4387 struct kernfs_node *kn; 4542 struct kernfs_node *kn;
4543 struct cftype *base_files;
4388 int ssid, ret; 4544 int ssid, ret;
4389 4545
4390 parent = cgroup_kn_lock_live(parent_kn); 4546 parent = cgroup_kn_lock_live(parent_kn);
@@ -4455,14 +4611,20 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4455 if (ret) 4611 if (ret)
4456 goto out_destroy; 4612 goto out_destroy;
4457 4613
4458 ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4614 if (cgroup_on_dfl(cgrp))
4615 base_files = cgroup_dfl_base_files;
4616 else
4617 base_files = cgroup_legacy_base_files;
4618
4619 ret = cgroup_addrm_files(cgrp, base_files, true);
4459 if (ret) 4620 if (ret)
4460 goto out_destroy; 4621 goto out_destroy;
4461 4622
4462 /* let's create and online css's */ 4623 /* let's create and online css's */
4463 for_each_subsys(ss, ssid) { 4624 for_each_subsys(ss, ssid) {
4464 if (parent->child_subsys_mask & (1 << ssid)) { 4625 if (parent->child_subsys_mask & (1 << ssid)) {
4465 ret = create_css(cgrp, ss); 4626 ret = create_css(cgrp, ss,
4627 parent->subtree_control & (1 << ssid));
4466 if (ret) 4628 if (ret)
4467 goto out_destroy; 4629 goto out_destroy;
4468 } 4630 }
@@ -4470,10 +4632,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4470 4632
4471 /* 4633 /*
4472 * On the default hierarchy, a child doesn't automatically inherit 4634 * On the default hierarchy, a child doesn't automatically inherit
4473 * child_subsys_mask from the parent. Each is configured manually. 4635 * subtree_control from the parent. Each is configured manually.
4474 */ 4636 */
4475 if (!cgroup_on_dfl(cgrp)) 4637 if (!cgroup_on_dfl(cgrp)) {
4476 cgrp->child_subsys_mask = parent->child_subsys_mask; 4638 cgrp->subtree_control = parent->subtree_control;
4639 cgroup_refresh_child_subsys_mask(cgrp);
4640 }
4477 4641
4478 kernfs_activate(kn); 4642 kernfs_activate(kn);
4479 4643
@@ -4483,7 +4647,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4483out_free_id: 4647out_free_id:
4484 cgroup_idr_remove(&root->cgroup_idr, cgrp->id); 4648 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
4485out_cancel_ref: 4649out_cancel_ref:
4486 percpu_ref_cancel_init(&cgrp->self.refcnt); 4650 percpu_ref_exit(&cgrp->self.refcnt);
4487out_free_cgrp: 4651out_free_cgrp:
4488 kfree(cgrp); 4652 kfree(cgrp);
4489out_unlock: 4653out_unlock:
@@ -4736,8 +4900,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4736 */ 4900 */
4737int __init cgroup_init_early(void) 4901int __init cgroup_init_early(void)
4738{ 4902{
4739 static struct cgroup_sb_opts __initdata opts = 4903 static struct cgroup_sb_opts __initdata opts;
4740 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4741 struct cgroup_subsys *ss; 4904 struct cgroup_subsys *ss;
4742 int i; 4905 int i;
4743 4906
@@ -4775,7 +4938,8 @@ int __init cgroup_init(void)
4775 unsigned long key; 4938 unsigned long key;
4776 int ssid, err; 4939 int ssid, err;
4777 4940
4778 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4941 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
4942 BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
4779 4943
4780 mutex_lock(&cgroup_mutex); 4944 mutex_lock(&cgroup_mutex);
4781 4945
@@ -4807,9 +4971,22 @@ int __init cgroup_init(void)
4807 * disabled flag and cftype registration needs kmalloc, 4971 * disabled flag and cftype registration needs kmalloc,
4808 * both of which aren't available during early_init. 4972 * both of which aren't available during early_init.
4809 */ 4973 */
4810 if (!ss->disabled) { 4974 if (ss->disabled)
4811 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 4975 continue;
4812 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4976
4977 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4978
4979 if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
4980 ss->dfl_cftypes = ss->legacy_cftypes;
4981
4982 if (!ss->dfl_cftypes)
4983 cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
4984
4985 if (ss->dfl_cftypes == ss->legacy_cftypes) {
4986 WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
4987 } else {
4988 WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
4989 WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
4813 } 4990 }
4814 } 4991 }
4815 4992
@@ -5205,6 +5382,14 @@ static int __init cgroup_disable(char *str)
5205} 5382}
5206__setup("cgroup_disable=", cgroup_disable); 5383__setup("cgroup_disable=", cgroup_disable);
5207 5384
5385static int __init cgroup_set_legacy_files_on_dfl(char *str)
5386{
5387 printk("cgroup: using legacy files on the default hierarchy\n");
5388 cgroup_legacy_files_on_dfl = true;
5389 return 0;
5390}
5391__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
5392
5208/** 5393/**
5209 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry 5394 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
5210 * @dentry: directory dentry of interest 5395 * @dentry: directory dentry of interest
@@ -5399,6 +5584,6 @@ static struct cftype debug_files[] = {
5399struct cgroup_subsys debug_cgrp_subsys = { 5584struct cgroup_subsys debug_cgrp_subsys = {
5400 .css_alloc = debug_css_alloc, 5585 .css_alloc = debug_css_alloc,
5401 .css_free = debug_css_free, 5586 .css_free = debug_css_free,
5402 .base_cftypes = debug_files, 5587 .legacy_cftypes = debug_files,
5403}; 5588};
5404#endif /* CONFIG_CGROUP_DEBUG */ 5589#endif /* CONFIG_CGROUP_DEBUG */
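With the cftype split above, a controller points ->dfl_cftypes and ->legacy_cftypes at separate arrays and cgroup_init() registers them through cgroup_add_dfl_cftypes() and cgroup_add_legacy_cftypes(); the cgroup_freezer hunk below shows the simpler legacy-only case. A minimal sketch of a controller using both, assuming a hypothetical "example" subsystem whose files and helpers are made up for illustration:

/* Illustrative only; not part of this diff. */
#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

struct example_css {
	struct cgroup_subsys_state css;
};

static struct cgroup_subsys_state *
example_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct example_css *ex = kzalloc(sizeof(*ex), GFP_KERNEL);

	return ex ? &ex->css : ERR_PTR(-ENOMEM);
}

static void example_css_free(struct cgroup_subsys_state *css)
{
	kfree(container_of(css, struct example_css, css));
}

static int example_stat_show(struct seq_file *sf, void *v)
{
	seq_puts(sf, "0\n");
	return 0;
}

/* used only on the default hierarchy (cgroup_add_dfl_cftypes()) */
static struct cftype example_dfl_files[] = {
	{ .name = "example.stat", .seq_show = example_stat_show },
	{ }	/* terminate */
};

/* used only on legacy hierarchies (cgroup_add_legacy_cftypes()) */
static struct cftype example_legacy_files[] = {
	{ .name = "stat", .seq_show = example_stat_show },
	{ }	/* terminate */
};

struct cgroup_subsys example_cgrp_subsys = {
	.css_alloc	= example_css_alloc,
	.css_free	= example_css_free,
	.dfl_cftypes	= example_dfl_files,
	.legacy_cftypes	= example_legacy_files,
};

A real controller would also need a SUBSYS() entry in include/linux/cgroup_subsys.h; the sketch only shows where the two cftype arrays plug in.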
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index a79e40f9d700..92b98cc0ee76 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -480,5 +480,5 @@ struct cgroup_subsys freezer_cgrp_subsys = {
480 .css_free = freezer_css_free, 480 .css_free = freezer_css_free,
481 .attach = freezer_attach, 481 .attach = freezer_attach,
482 .fork = freezer_fork, 482 .fork = freezer_fork,
483 .base_cftypes = files, 483 .legacy_cftypes = files,
484}; 484};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a343bde710b1..81e2a388a0f6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu)
274 rcu_read_unlock(); 274 rcu_read_unlock();
275} 275}
276 276
277static inline void check_for_tasks(int cpu) 277static inline void check_for_tasks(int dead_cpu)
278{ 278{
279 struct task_struct *p; 279 struct task_struct *g, *p;
280 cputime_t utime, stime;
281 280
282 write_lock_irq(&tasklist_lock); 281 read_lock_irq(&tasklist_lock);
283 for_each_process(p) { 282 do_each_thread(g, p) {
284 task_cputime(p, &utime, &stime); 283 if (!p->on_rq)
285 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 284 continue;
286 (utime || stime)) 285 /*
287 pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", 286 * We do the check with unlocked task_rq(p)->lock.
288 p->comm, task_pid_nr(p), cpu, 287 * Order the reads so we do not warn about a task
289 p->state, p->flags); 288 * that was running on this cpu in the past and
290 } 289 * has just been woken on another cpu.
291 write_unlock_irq(&tasklist_lock); 290 */
291 rmb();
292 if (task_cpu(p) != dead_cpu)
293 continue;
294
295 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
296 p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
297 } while_each_thread(g, p);
298 read_unlock_irq(&tasklist_lock);
292} 299}
293 300
294struct take_cpu_down_param { 301struct take_cpu_down_param {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 116a4164720a..22874d7cf2c0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -76,8 +76,34 @@ struct cpuset {
76 struct cgroup_subsys_state css; 76 struct cgroup_subsys_state css;
77 77
78 unsigned long flags; /* "unsigned long" so bitops work */ 78 unsigned long flags; /* "unsigned long" so bitops work */
79 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 79
80 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 80 /*
81 * On default hierarchy:
82 *
83 * The user-configured masks can only be changed by writing to
84 * cpuset.cpus and cpuset.mems, and won't be limited by the
85 * parent masks.
86 *
 87 * The effective masks are the real masks that apply to the tasks
88 * in the cpuset. They may be changed if the configured masks are
89 * changed or hotplug happens.
90 *
91 * effective_mask == configured_mask & parent's effective_mask,
92 * and if it ends up empty, it will inherit the parent's mask.
93 *
94 *
 95 * On legacy hierarchy:
96 *
 97 * The user-configured masks are always the same as the effective masks.
98 */
99
 100 /* user-configured CPUs and Memory Nodes allowed to tasks */
101 cpumask_var_t cpus_allowed;
102 nodemask_t mems_allowed;
103
 104 /* effective CPUs and Memory Nodes allowed to tasks */
105 cpumask_var_t effective_cpus;
106 nodemask_t effective_mems;
81 107
82 /* 108 /*
83 * This is old Memory Nodes tasks took on. 109 * This is old Memory Nodes tasks took on.
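The new comment block pins down the default-hierarchy rule: effective = user-configured & parent's effective, falling back to the parent's effective mask when the intersection is empty. A minimal illustration of that rule only; the helper name is made up, and update_cpumasks_hier() later in this diff is the real implementation applied to every descendant:

#include <linux/cpumask.h>

static void example_effective_cpus(struct cpumask *effective,
				   const struct cpumask *configured,
				   const struct cpumask *parent_effective)
{
	/* effective = configured & parent's effective ... */
	cpumask_and(effective, configured, parent_effective);

	/* ... and if that ends up empty, inherit the parent's mask */
	if (cpumask_empty(effective))
		cpumask_copy(effective, parent_effective);
}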
@@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = {
307 */ 333 */
308static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) 334static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
309{ 335{
310 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 336 while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
311 cs = parent_cs(cs); 337 cs = parent_cs(cs);
312 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); 338 cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
313} 339}
314 340
315/* 341/*
@@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
325 */ 351 */
326static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) 352static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
327{ 353{
328 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) 354 while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
329 cs = parent_cs(cs); 355 cs = parent_cs(cs);
330 nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); 356 nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
331} 357}
332 358
333/* 359/*
@@ -376,13 +402,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
376 if (!trial) 402 if (!trial)
377 return NULL; 403 return NULL;
378 404
379 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { 405 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
380 kfree(trial); 406 goto free_cs;
381 return NULL; 407 if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
382 } 408 goto free_cpus;
383 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
384 409
410 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
411 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
385 return trial; 412 return trial;
413
414free_cpus:
415 free_cpumask_var(trial->cpus_allowed);
416free_cs:
417 kfree(trial);
418 return NULL;
386} 419}
387 420
388/** 421/**
@@ -391,6 +424,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
391 */ 424 */
392static void free_trial_cpuset(struct cpuset *trial) 425static void free_trial_cpuset(struct cpuset *trial)
393{ 426{
427 free_cpumask_var(trial->effective_cpus);
394 free_cpumask_var(trial->cpus_allowed); 428 free_cpumask_var(trial->cpus_allowed);
395 kfree(trial); 429 kfree(trial);
396} 430}
@@ -436,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
436 470
437 par = parent_cs(cur); 471 par = parent_cs(cur);
438 472
 439 /* We must be a subset of our parent cpuset */ 473 /* On legacy hierarchy, we must be a subset of our parent cpuset. */
440 ret = -EACCES; 474 ret = -EACCES;
441 if (!is_cpuset_subset(trial, par)) 475 if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
442 goto out; 476 goto out;
443 477
444 /* 478 /*
@@ -480,11 +514,11 @@ out:
480#ifdef CONFIG_SMP 514#ifdef CONFIG_SMP
481/* 515/*
482 * Helper routine for generate_sched_domains(). 516 * Helper routine for generate_sched_domains().
483 * Do cpusets a, b have overlapping cpus_allowed masks? 517 * Do cpusets a, b have overlapping effective cpus_allowed masks?
484 */ 518 */
485static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 519static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
486{ 520{
487 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); 521 return cpumask_intersects(a->effective_cpus, b->effective_cpus);
488} 522}
489 523
490static void 524static void
@@ -601,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
601 *dattr = SD_ATTR_INIT; 635 *dattr = SD_ATTR_INIT;
602 update_domain_attr_tree(dattr, &top_cpuset); 636 update_domain_attr_tree(dattr, &top_cpuset);
603 } 637 }
604 cpumask_copy(doms[0], top_cpuset.cpus_allowed); 638 cpumask_copy(doms[0], top_cpuset.effective_cpus);
605 639
606 goto done; 640 goto done;
607 } 641 }
@@ -705,7 +739,7 @@ restart:
705 struct cpuset *b = csa[j]; 739 struct cpuset *b = csa[j];
706 740
707 if (apn == b->pn) { 741 if (apn == b->pn) {
708 cpumask_or(dp, dp, b->cpus_allowed); 742 cpumask_or(dp, dp, b->effective_cpus);
709 if (dattr) 743 if (dattr)
710 update_domain_attr_tree(dattr + nslot, b); 744 update_domain_attr_tree(dattr + nslot, b);
711 745
@@ -757,7 +791,7 @@ static void rebuild_sched_domains_locked(void)
757 * passing doms with offlined cpu to partition_sched_domains(). 791 * passing doms with offlined cpu to partition_sched_domains().
758 * Anyways, hotplug work item will rebuild sched domains. 792 * Anyways, hotplug work item will rebuild sched domains.
759 */ 793 */
760 if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) 794 if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
761 goto out; 795 goto out;
762 796
763 /* Generate domain masks and attrs */ 797 /* Generate domain masks and attrs */
@@ -781,45 +815,6 @@ void rebuild_sched_domains(void)
781 mutex_unlock(&cpuset_mutex); 815 mutex_unlock(&cpuset_mutex);
782} 816}
783 817
784/*
785 * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
786 * @cs: the cpuset in interest
787 *
788 * A cpuset's effective cpumask is the cpumask of the nearest ancestor
789 * with non-empty cpus. We use effective cpumask whenever:
790 * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
791 * if the cpuset they reside in has no cpus)
792 * - we want to retrieve task_cs(tsk)'s cpus_allowed.
793 *
794 * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
795 * exception. See comments there.
796 */
797static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
798{
799 while (cpumask_empty(cs->cpus_allowed))
800 cs = parent_cs(cs);
801 return cs;
802}
803
804/*
805 * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
806 * @cs: the cpuset in interest
807 *
808 * A cpuset's effective nodemask is the nodemask of the nearest ancestor
809 * with non-empty memss. We use effective nodemask whenever:
810 * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
811 * if the cpuset they reside in has no mems)
812 * - we want to retrieve task_cs(tsk)'s mems_allowed.
813 *
814 * Called with cpuset_mutex held.
815 */
816static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
817{
818 while (nodes_empty(cs->mems_allowed))
819 cs = parent_cs(cs);
820 return cs;
821}
822
823/** 818/**
824 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 819 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
825 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 820 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -830,53 +825,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
830 */ 825 */
831static void update_tasks_cpumask(struct cpuset *cs) 826static void update_tasks_cpumask(struct cpuset *cs)
832{ 827{
833 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
834 struct css_task_iter it; 828 struct css_task_iter it;
835 struct task_struct *task; 829 struct task_struct *task;
836 830
837 css_task_iter_start(&cs->css, &it); 831 css_task_iter_start(&cs->css, &it);
838 while ((task = css_task_iter_next(&it))) 832 while ((task = css_task_iter_next(&it)))
839 set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); 833 set_cpus_allowed_ptr(task, cs->effective_cpus);
840 css_task_iter_end(&it); 834 css_task_iter_end(&it);
841} 835}
842 836
843/* 837/*
844 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 838 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
845 * @root_cs: the root cpuset of the hierarchy 839 * @cs: the cpuset to consider
846 * @update_root: update root cpuset or not? 840 * @new_cpus: temp variable for calculating new effective_cpus
841 *
 842 * When the configured cpumask is changed, the effective cpumasks of this cpuset
843 * and all its descendants need to be updated.
847 * 844 *
 848 * This will update cpumasks of tasks in @root_cs and all other empty cpusets 845 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
849 * which take on cpumask of @root_cs.
850 * 846 *
851 * Called with cpuset_mutex held 847 * Called with cpuset_mutex held
852 */ 848 */
853static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) 849static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
854{ 850{
855 struct cpuset *cp; 851 struct cpuset *cp;
856 struct cgroup_subsys_state *pos_css; 852 struct cgroup_subsys_state *pos_css;
853 bool need_rebuild_sched_domains = false;
857 854
858 rcu_read_lock(); 855 rcu_read_lock();
859 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 856 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
860 if (cp == root_cs) { 857 struct cpuset *parent = parent_cs(cp);
861 if (!update_root) 858
862 continue; 859 cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
863 } else { 860
864 /* skip the whole subtree if @cp have some CPU */ 861 /*
865 if (!cpumask_empty(cp->cpus_allowed)) { 862 * If it becomes empty, inherit the effective mask of the
866 pos_css = css_rightmost_descendant(pos_css); 863 * parent, which is guaranteed to have some CPUs.
867 continue; 864 */
868 } 865 if (cpumask_empty(new_cpus))
866 cpumask_copy(new_cpus, parent->effective_cpus);
867
868 /* Skip the whole subtree if the cpumask remains the same. */
869 if (cpumask_equal(new_cpus, cp->effective_cpus)) {
870 pos_css = css_rightmost_descendant(pos_css);
871 continue;
869 } 872 }
873
870 if (!css_tryget_online(&cp->css)) 874 if (!css_tryget_online(&cp->css))
871 continue; 875 continue;
872 rcu_read_unlock(); 876 rcu_read_unlock();
873 877
878 mutex_lock(&callback_mutex);
879 cpumask_copy(cp->effective_cpus, new_cpus);
880 mutex_unlock(&callback_mutex);
881
882 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
883 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
884
874 update_tasks_cpumask(cp); 885 update_tasks_cpumask(cp);
875 886
887 /*
888 * If the effective cpumask of any non-empty cpuset is changed,
889 * we need to rebuild sched domains.
890 */
891 if (!cpumask_empty(cp->cpus_allowed) &&
892 is_sched_load_balance(cp))
893 need_rebuild_sched_domains = true;
894
876 rcu_read_lock(); 895 rcu_read_lock();
877 css_put(&cp->css); 896 css_put(&cp->css);
878 } 897 }
879 rcu_read_unlock(); 898 rcu_read_unlock();
899
900 if (need_rebuild_sched_domains)
901 rebuild_sched_domains_locked();
880} 902}
881 903
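The hunk above applies one rule at every level of the subtree: a child's effective_cpus is its configured mask intersected with the parent's effective mask, and an empty intersection falls back to the parent's mask so tasks always have CPUs. A minimal userspace sketch of that rule, modelling a cpumask as a single 64-bit word; the helper name and types are illustrative, not kernel API:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t cpumask_model_t;	/* one bit per CPU, illustration only */

/*
 * Per-level rule from update_cpumasks_hier(): effective = configured
 * intersected with the parent's effective mask; if that is empty,
 * inherit the parent's effective mask instead.
 */
static cpumask_model_t child_effective(cpumask_model_t configured,
				       cpumask_model_t parent_effective)
{
	cpumask_model_t eff = configured & parent_effective;

	return eff ? eff : parent_effective;
}

int main(void)
{
	cpumask_model_t parent_eff = 0x0f;	/* parent runs on CPUs 0-3 */
	cpumask_model_t child_cfg = 0x30;	/* child configured for CPUs 4-5 */

	/* Intersection is empty, so the child borrows CPUs 0-3. */
	printf("child effective = %#llx\n",
	       (unsigned long long)child_effective(child_cfg, parent_eff));
	return 0;
}

update_nodemasks_hier() further down applies the same rule to nodemasks.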
882/** 904/**
@@ -889,7 +911,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
889 const char *buf) 911 const char *buf)
890{ 912{
891 int retval; 913 int retval;
892 int is_load_balanced;
893 914
894 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ 915 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
895 if (cs == &top_cpuset) 916 if (cs == &top_cpuset)
@@ -908,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
908 if (retval < 0) 929 if (retval < 0)
909 return retval; 930 return retval;
910 931
911 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) 932 if (!cpumask_subset(trialcs->cpus_allowed,
933 top_cpuset.cpus_allowed))
912 return -EINVAL; 934 return -EINVAL;
913 } 935 }
914 936
@@ -920,16 +942,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 if (retval < 0) 942 if (retval < 0)
921 return retval; 943 return retval;
922 944
923 is_load_balanced = is_sched_load_balance(trialcs);
924
925 mutex_lock(&callback_mutex); 945 mutex_lock(&callback_mutex);
926 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 946 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
927 mutex_unlock(&callback_mutex); 947 mutex_unlock(&callback_mutex);
928 948
929 update_tasks_cpumask_hier(cs, true); 949 /* use trialcs->cpus_allowed as a temp variable */
930 950 update_cpumasks_hier(cs, trialcs->cpus_allowed);
931 if (is_load_balanced)
932 rebuild_sched_domains_locked();
933 return 0; 951 return 0;
934} 952}
935 953
@@ -951,15 +969,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
951 const nodemask_t *to) 969 const nodemask_t *to)
952{ 970{
953 struct task_struct *tsk = current; 971 struct task_struct *tsk = current;
954 struct cpuset *mems_cs;
955 972
956 tsk->mems_allowed = *to; 973 tsk->mems_allowed = *to;
957 974
958 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 975 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
959 976
960 rcu_read_lock(); 977 rcu_read_lock();
961 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 978 guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
962 guarantee_online_mems(mems_cs, &tsk->mems_allowed);
963 rcu_read_unlock(); 979 rcu_read_unlock();
964} 980}
965 981
@@ -1028,13 +1044,12 @@ static void *cpuset_being_rebound;
1028static void update_tasks_nodemask(struct cpuset *cs) 1044static void update_tasks_nodemask(struct cpuset *cs)
1029{ 1045{
1030 static nodemask_t newmems; /* protected by cpuset_mutex */ 1046 static nodemask_t newmems; /* protected by cpuset_mutex */
1031 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1032 struct css_task_iter it; 1047 struct css_task_iter it;
1033 struct task_struct *task; 1048 struct task_struct *task;
1034 1049
1035 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1050 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1036 1051
1037 guarantee_online_mems(mems_cs, &newmems); 1052 guarantee_online_mems(cs, &newmems);
1038 1053
1039 /* 1054 /*
1040 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1055 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1077,36 +1092,52 @@ static void update_tasks_nodemask(struct cpuset *cs)
1077} 1092}
1078 1093
1079/* 1094/*
1080 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1095 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
1081 * @cs: the root cpuset of the hierarchy 1096 * @cs: the cpuset to consider
1082 * @update_root: update the root cpuset or not? 1097 * @new_mems: a temp variable for calculating new effective_mems
1083 * 1098 *
1084 * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1099 * When configured nodemask is changed, the effective nodemasks of this cpuset
1085 * which take on nodemask of @root_cs. 1100 * and all its descendants need to be updated.
1101 *
1102	 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
1086 * 1103 *
1087 * Called with cpuset_mutex held 1104 * Called with cpuset_mutex held
1088 */ 1105 */
1089static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) 1106static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1090{ 1107{
1091 struct cpuset *cp; 1108 struct cpuset *cp;
1092 struct cgroup_subsys_state *pos_css; 1109 struct cgroup_subsys_state *pos_css;
1093 1110
1094 rcu_read_lock(); 1111 rcu_read_lock();
1095 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 1112 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1096 if (cp == root_cs) { 1113 struct cpuset *parent = parent_cs(cp);
1097 if (!update_root) 1114
1098 continue; 1115 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1099 } else { 1116
1100 /* skip the whole subtree if @cp have some CPU */ 1117 /*
1101 if (!nodes_empty(cp->mems_allowed)) { 1118 * If it becomes empty, inherit the effective mask of the
1102 pos_css = css_rightmost_descendant(pos_css); 1119 * parent, which is guaranteed to have some MEMs.
1103 continue; 1120 */
1104 } 1121 if (nodes_empty(*new_mems))
1122 *new_mems = parent->effective_mems;
1123
1124 /* Skip the whole subtree if the nodemask remains the same. */
1125 if (nodes_equal(*new_mems, cp->effective_mems)) {
1126 pos_css = css_rightmost_descendant(pos_css);
1127 continue;
1105 } 1128 }
1129
1106 if (!css_tryget_online(&cp->css)) 1130 if (!css_tryget_online(&cp->css))
1107 continue; 1131 continue;
1108 rcu_read_unlock(); 1132 rcu_read_unlock();
1109 1133
1134 mutex_lock(&callback_mutex);
1135 cp->effective_mems = *new_mems;
1136 mutex_unlock(&callback_mutex);
1137
1138 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
1139 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1140
1110 update_tasks_nodemask(cp); 1141 update_tasks_nodemask(cp);
1111 1142
1112 rcu_read_lock(); 1143 rcu_read_lock();
@@ -1156,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1156 goto done; 1187 goto done;
1157 1188
1158 if (!nodes_subset(trialcs->mems_allowed, 1189 if (!nodes_subset(trialcs->mems_allowed,
1159 node_states[N_MEMORY])) { 1190 top_cpuset.mems_allowed)) {
1160 retval = -EINVAL; 1191 retval = -EINVAL;
1161 goto done; 1192 goto done;
1162 } 1193 }
1163 } 1194 }
@@ -1174,7 +1205,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1174 cs->mems_allowed = trialcs->mems_allowed; 1205 cs->mems_allowed = trialcs->mems_allowed;
1175 mutex_unlock(&callback_mutex); 1206 mutex_unlock(&callback_mutex);
1176 1207
1177 update_tasks_nodemask_hier(cs, true); 1208 /* use trialcs->mems_allowed as a temp variable */
1209 update_nodemasks_hier(cs, &cs->mems_allowed);
1178done: 1210done:
1179 return retval; 1211 return retval;
1180} 1212}
@@ -1389,12 +1421,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1389 1421
1390 mutex_lock(&cpuset_mutex); 1422 mutex_lock(&cpuset_mutex);
1391 1423
1392 /* 1424 /* allow moving tasks into an empty cpuset if on default hierarchy */
1393 * We allow to move tasks into an empty cpuset if sane_behavior
1394 * flag is set.
1395 */
1396 ret = -ENOSPC; 1425 ret = -ENOSPC;
1397 if (!cgroup_sane_behavior(css->cgroup) && 1426 if (!cgroup_on_dfl(css->cgroup) &&
1398 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1427 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1399 goto out_unlock; 1428 goto out_unlock;
1400 1429
@@ -1452,8 +1481,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1452 struct task_struct *leader = cgroup_taskset_first(tset); 1481 struct task_struct *leader = cgroup_taskset_first(tset);
1453 struct cpuset *cs = css_cs(css); 1482 struct cpuset *cs = css_cs(css);
1454 struct cpuset *oldcs = cpuset_attach_old_cs; 1483 struct cpuset *oldcs = cpuset_attach_old_cs;
1455 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1456 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1457 1484
1458 mutex_lock(&cpuset_mutex); 1485 mutex_lock(&cpuset_mutex);
1459 1486
@@ -1461,9 +1488,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1461 if (cs == &top_cpuset) 1488 if (cs == &top_cpuset)
1462 cpumask_copy(cpus_attach, cpu_possible_mask); 1489 cpumask_copy(cpus_attach, cpu_possible_mask);
1463 else 1490 else
1464 guarantee_online_cpus(cpus_cs, cpus_attach); 1491 guarantee_online_cpus(cs, cpus_attach);
1465 1492
1466 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1493 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1467 1494
1468 cgroup_taskset_for_each(task, tset) { 1495 cgroup_taskset_for_each(task, tset) {
1469 /* 1496 /*
@@ -1480,11 +1507,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1480 * Change mm, possibly for multiple threads in a threadgroup. This is 1507 * Change mm, possibly for multiple threads in a threadgroup. This is
1481 * expensive and may sleep. 1508 * expensive and may sleep.
1482 */ 1509 */
1483 cpuset_attach_nodemask_to = cs->mems_allowed; 1510 cpuset_attach_nodemask_to = cs->effective_mems;
1484 mm = get_task_mm(leader); 1511 mm = get_task_mm(leader);
1485 if (mm) { 1512 if (mm) {
1486 struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
1487
1488 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1513 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1489 1514
1490 /* 1515 /*
@@ -1495,7 +1520,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1495 * mm from. 1520 * mm from.
1496 */ 1521 */
1497 if (is_memory_migrate(cs)) { 1522 if (is_memory_migrate(cs)) {
1498 cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, 1523 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
1499 &cpuset_attach_nodemask_to); 1524 &cpuset_attach_nodemask_to);
1500 } 1525 }
1501 mmput(mm); 1526 mmput(mm);
@@ -1516,6 +1541,8 @@ typedef enum {
1516 FILE_MEMORY_MIGRATE, 1541 FILE_MEMORY_MIGRATE,
1517 FILE_CPULIST, 1542 FILE_CPULIST,
1518 FILE_MEMLIST, 1543 FILE_MEMLIST,
1544 FILE_EFFECTIVE_CPULIST,
1545 FILE_EFFECTIVE_MEMLIST,
1519 FILE_CPU_EXCLUSIVE, 1546 FILE_CPU_EXCLUSIVE,
1520 FILE_MEM_EXCLUSIVE, 1547 FILE_MEM_EXCLUSIVE,
1521 FILE_MEM_HARDWALL, 1548 FILE_MEM_HARDWALL,
@@ -1694,6 +1721,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1694 case FILE_MEMLIST: 1721 case FILE_MEMLIST:
1695 s += nodelist_scnprintf(s, count, cs->mems_allowed); 1722 s += nodelist_scnprintf(s, count, cs->mems_allowed);
1696 break; 1723 break;
1724 case FILE_EFFECTIVE_CPULIST:
1725 s += cpulist_scnprintf(s, count, cs->effective_cpus);
1726 break;
1727 case FILE_EFFECTIVE_MEMLIST:
1728 s += nodelist_scnprintf(s, count, cs->effective_mems);
1729 break;
1697 default: 1730 default:
1698 ret = -EINVAL; 1731 ret = -EINVAL;
1699 goto out_unlock; 1732 goto out_unlock;
@@ -1779,6 +1812,18 @@ static struct cftype files[] = {
1779 }, 1812 },
1780 1813
1781 { 1814 {
1815 .name = "effective_cpus",
1816 .seq_show = cpuset_common_seq_show,
1817 .private = FILE_EFFECTIVE_CPULIST,
1818 },
1819
1820 {
1821 .name = "effective_mems",
1822 .seq_show = cpuset_common_seq_show,
1823 .private = FILE_EFFECTIVE_MEMLIST,
1824 },
1825
1826 {
1782 .name = "cpu_exclusive", 1827 .name = "cpu_exclusive",
1783 .read_u64 = cpuset_read_u64, 1828 .read_u64 = cpuset_read_u64,
1784 .write_u64 = cpuset_write_u64, 1829 .write_u64 = cpuset_write_u64,
@@ -1869,18 +1914,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1869 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1914 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1870 if (!cs) 1915 if (!cs)
1871 return ERR_PTR(-ENOMEM); 1916 return ERR_PTR(-ENOMEM);
1872 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { 1917 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
1873 kfree(cs); 1918 goto free_cs;
1874 return ERR_PTR(-ENOMEM); 1919 if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
1875 } 1920 goto free_cpus;
1876 1921
1877 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1922 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1878 cpumask_clear(cs->cpus_allowed); 1923 cpumask_clear(cs->cpus_allowed);
1879 nodes_clear(cs->mems_allowed); 1924 nodes_clear(cs->mems_allowed);
1925 cpumask_clear(cs->effective_cpus);
1926 nodes_clear(cs->effective_mems);
1880 fmeter_init(&cs->fmeter); 1927 fmeter_init(&cs->fmeter);
1881 cs->relax_domain_level = -1; 1928 cs->relax_domain_level = -1;
1882 1929
1883 return &cs->css; 1930 return &cs->css;
1931
1932free_cpus:
1933 free_cpumask_var(cs->cpus_allowed);
1934free_cs:
1935 kfree(cs);
1936 return ERR_PTR(-ENOMEM);
1884} 1937}
1885 1938
1886static int cpuset_css_online(struct cgroup_subsys_state *css) 1939static int cpuset_css_online(struct cgroup_subsys_state *css)
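The reworked cpuset_css_alloc() above replaces the inline kfree/return with the usual goto-based unwind for its two allocations. A generic userspace sketch of the same error-path shape; the structure and names below are invented for illustration:

#include <stdlib.h>

struct two_bufs {
	void *a;
	void *b;
};

/*
 * Same unwind shape as the new cpuset_css_alloc() error path: each
 * failing allocation jumps to a label that frees only what was
 * already allocated, in reverse order.
 */
static struct two_bufs *two_bufs_alloc(void)
{
	struct two_bufs *t = calloc(1, sizeof(*t));

	if (!t)
		return NULL;
	t->a = malloc(64);
	if (!t->a)
		goto free_t;
	t->b = malloc(64);
	if (!t->b)
		goto free_a;
	return t;

free_a:
	free(t->a);
free_t:
	free(t);
	return NULL;
}

int main(void)
{
	struct two_bufs *t = two_bufs_alloc();

	if (t) {
		free(t->b);
		free(t->a);
		free(t);
	}
	return 0;
}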
@@ -1903,6 +1956,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1903 1956
1904 cpuset_inc(); 1957 cpuset_inc();
1905 1958
1959 mutex_lock(&callback_mutex);
1960 if (cgroup_on_dfl(cs->css.cgroup)) {
1961 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1962 cs->effective_mems = parent->effective_mems;
1963 }
1964 mutex_unlock(&callback_mutex);
1965
1906 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1966 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1907 goto out_unlock; 1967 goto out_unlock;
1908 1968
@@ -1962,20 +2022,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
1962{ 2022{
1963 struct cpuset *cs = css_cs(css); 2023 struct cpuset *cs = css_cs(css);
1964 2024
2025 free_cpumask_var(cs->effective_cpus);
1965 free_cpumask_var(cs->cpus_allowed); 2026 free_cpumask_var(cs->cpus_allowed);
1966 kfree(cs); 2027 kfree(cs);
1967} 2028}
1968 2029
2030static void cpuset_bind(struct cgroup_subsys_state *root_css)
2031{
2032 mutex_lock(&cpuset_mutex);
2033 mutex_lock(&callback_mutex);
2034
2035 if (cgroup_on_dfl(root_css->cgroup)) {
2036 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2037 top_cpuset.mems_allowed = node_possible_map;
2038 } else {
2039 cpumask_copy(top_cpuset.cpus_allowed,
2040 top_cpuset.effective_cpus);
2041 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2042 }
2043
2044 mutex_unlock(&callback_mutex);
2045 mutex_unlock(&cpuset_mutex);
2046}
2047
1969struct cgroup_subsys cpuset_cgrp_subsys = { 2048struct cgroup_subsys cpuset_cgrp_subsys = {
1970 .css_alloc = cpuset_css_alloc, 2049 .css_alloc = cpuset_css_alloc,
1971 .css_online = cpuset_css_online, 2050 .css_online = cpuset_css_online,
1972 .css_offline = cpuset_css_offline, 2051 .css_offline = cpuset_css_offline,
1973 .css_free = cpuset_css_free, 2052 .css_free = cpuset_css_free,
1974 .can_attach = cpuset_can_attach, 2053 .can_attach = cpuset_can_attach,
1975 .cancel_attach = cpuset_cancel_attach, 2054 .cancel_attach = cpuset_cancel_attach,
1976 .attach = cpuset_attach, 2055 .attach = cpuset_attach,
1977 .base_cftypes = files, 2056 .bind = cpuset_bind,
1978 .early_init = 1, 2057 .legacy_cftypes = files,
2058 .early_init = 1,
1979}; 2059};
1980 2060
1981/** 2061/**
@@ -1990,9 +2070,13 @@ int __init cpuset_init(void)
1990 2070
1991 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) 2071 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1992 BUG(); 2072 BUG();
2073 if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
2074 BUG();
1993 2075
1994 cpumask_setall(top_cpuset.cpus_allowed); 2076 cpumask_setall(top_cpuset.cpus_allowed);
1995 nodes_setall(top_cpuset.mems_allowed); 2077 nodes_setall(top_cpuset.mems_allowed);
2078 cpumask_setall(top_cpuset.effective_cpus);
2079 nodes_setall(top_cpuset.effective_mems);
1996 2080
1997 fmeter_init(&top_cpuset.fmeter); 2081 fmeter_init(&top_cpuset.fmeter);
1998 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 2082 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@ -2035,6 +2119,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2035 } 2119 }
2036} 2120}
2037 2121
2122static void
2123hotplug_update_tasks_legacy(struct cpuset *cs,
2124 struct cpumask *new_cpus, nodemask_t *new_mems,
2125 bool cpus_updated, bool mems_updated)
2126{
2127 bool is_empty;
2128
2129 mutex_lock(&callback_mutex);
2130 cpumask_copy(cs->cpus_allowed, new_cpus);
2131 cpumask_copy(cs->effective_cpus, new_cpus);
2132 cs->mems_allowed = *new_mems;
2133 cs->effective_mems = *new_mems;
2134 mutex_unlock(&callback_mutex);
2135
2136 /*
2137 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
2138	 * as the tasks will be migrated to an ancestor.
2139 */
2140 if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
2141 update_tasks_cpumask(cs);
2142 if (mems_updated && !nodes_empty(cs->mems_allowed))
2143 update_tasks_nodemask(cs);
2144
2145 is_empty = cpumask_empty(cs->cpus_allowed) ||
2146 nodes_empty(cs->mems_allowed);
2147
2148 mutex_unlock(&cpuset_mutex);
2149
2150 /*
2151	 * Move tasks to the nearest ancestor with execution resources.
2152	 * This is a full cgroup operation which will also call back into
2153 * cpuset. Should be done outside any lock.
2154 */
2155 if (is_empty)
2156 remove_tasks_in_empty_cpuset(cs);
2157
2158 mutex_lock(&cpuset_mutex);
2159}
2160
2161static void
2162hotplug_update_tasks(struct cpuset *cs,
2163 struct cpumask *new_cpus, nodemask_t *new_mems,
2164 bool cpus_updated, bool mems_updated)
2165{
2166 if (cpumask_empty(new_cpus))
2167 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
2168 if (nodes_empty(*new_mems))
2169 *new_mems = parent_cs(cs)->effective_mems;
2170
2171 mutex_lock(&callback_mutex);
2172 cpumask_copy(cs->effective_cpus, new_cpus);
2173 cs->effective_mems = *new_mems;
2174 mutex_unlock(&callback_mutex);
2175
2176 if (cpus_updated)
2177 update_tasks_cpumask(cs);
2178 if (mems_updated)
2179 update_tasks_nodemask(cs);
2180}
2181
2038/** 2182/**
2039 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug 2183 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
2040 * @cs: cpuset in interest 2184 * @cs: cpuset in interest
@@ -2045,11 +2189,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2045 */ 2189 */
2046static void cpuset_hotplug_update_tasks(struct cpuset *cs) 2190static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2047{ 2191{
2048 static cpumask_t off_cpus; 2192 static cpumask_t new_cpus;
2049 static nodemask_t off_mems; 2193 static nodemask_t new_mems;
2050 bool is_empty; 2194 bool cpus_updated;
2051 bool sane = cgroup_sane_behavior(cs->css.cgroup); 2195 bool mems_updated;
2052
2053retry: 2196retry:
2054 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); 2197 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2055 2198
@@ -2064,51 +2207,20 @@ retry:
2064 goto retry; 2207 goto retry;
2065 } 2208 }
2066 2209
2067 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); 2210 cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
2068 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); 2211 nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
2069
2070 mutex_lock(&callback_mutex);
2071 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2072 mutex_unlock(&callback_mutex);
2073
2074 /*
2075 * If sane_behavior flag is set, we need to update tasks' cpumask
2076 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
2077 * call update_tasks_cpumask() if the cpuset becomes empty, as
2078 * the tasks in it will be migrated to an ancestor.
2079 */
2080 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2081 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2082 update_tasks_cpumask(cs);
2083 2212
2084 mutex_lock(&callback_mutex); 2213 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2085 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2214 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2086 mutex_unlock(&callback_mutex);
2087
2088 /*
2089 * If sane_behavior flag is set, we need to update tasks' nodemask
2090 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
2091 * call update_tasks_nodemask() if the cpuset becomes empty, as
2092 * the tasks in it will be migratd to an ancestor.
2093 */
2094 if ((sane && nodes_empty(cs->mems_allowed)) ||
2095 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2096 update_tasks_nodemask(cs);
2097 2215
2098 is_empty = cpumask_empty(cs->cpus_allowed) || 2216 if (cgroup_on_dfl(cs->css.cgroup))
2099 nodes_empty(cs->mems_allowed); 2217 hotplug_update_tasks(cs, &new_cpus, &new_mems,
2218 cpus_updated, mems_updated);
2219 else
2220 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
2221 cpus_updated, mems_updated);
2100 2222
2101 mutex_unlock(&cpuset_mutex); 2223 mutex_unlock(&cpuset_mutex);
2102
2103 /*
2104 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
2105 *
2106 * Otherwise move tasks to the nearest ancestor with execution
2107 * resources. This is full cgroup operation which will
2108 * also call back into cpuset. Should be done outside any lock.
2109 */
2110 if (!sane && is_empty)
2111 remove_tasks_in_empty_cpuset(cs);
2112} 2224}
2113 2225
2114/** 2226/**
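The reworked cpuset_hotplug_update_tasks() above computes the candidate masks against the parent's effective masks and then dispatches to hotplug_update_tasks() or hotplug_update_tasks_legacy(). A compact model of that split, with masks again reduced to 64-bit words and every name below assumed purely for illustration:

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t mask_model_t;

/*
 * Model of the hotplug decision: the candidate mask is the configured
 * mask intersected with the parent's effective mask. On the default
 * hierarchy an empty result borrows the parent's mask and the tasks
 * stay put; on the legacy hierarchy the empty result is kept and the
 * caller later evacuates the tasks to an ancestor.
 */
mask_model_t hotplug_effective(mask_model_t configured,
			       mask_model_t parent_effective,
			       bool on_default_hierarchy,
			       bool *needs_evacuation)
{
	mask_model_t m = configured & parent_effective;

	*needs_evacuation = false;
	if (!m) {
		if (on_default_hierarchy)
			m = parent_effective;
		else
			*needs_evacuation = true;
	}
	return m;
}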
@@ -2132,6 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2132 static cpumask_t new_cpus; 2244 static cpumask_t new_cpus;
2133 static nodemask_t new_mems; 2245 static nodemask_t new_mems;
2134 bool cpus_updated, mems_updated; 2246 bool cpus_updated, mems_updated;
2247 bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
2135 2248
2136 mutex_lock(&cpuset_mutex); 2249 mutex_lock(&cpuset_mutex);
2137 2250
@@ -2139,13 +2252,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2139 cpumask_copy(&new_cpus, cpu_active_mask); 2252 cpumask_copy(&new_cpus, cpu_active_mask);
2140 new_mems = node_states[N_MEMORY]; 2253 new_mems = node_states[N_MEMORY];
2141 2254
2142 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); 2255 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
2143 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); 2256 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
2144 2257
2145 /* synchronize cpus_allowed to cpu_active_mask */ 2258 /* synchronize cpus_allowed to cpu_active_mask */
2146 if (cpus_updated) { 2259 if (cpus_updated) {
2147 mutex_lock(&callback_mutex); 2260 mutex_lock(&callback_mutex);
2148 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); 2261 if (!on_dfl)
2262 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2263 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
2149 mutex_unlock(&callback_mutex); 2264 mutex_unlock(&callback_mutex);
2150 /* we don't mess with cpumasks of tasks in top_cpuset */ 2265 /* we don't mess with cpumasks of tasks in top_cpuset */
2151 } 2266 }
@@ -2153,7 +2268,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2153 /* synchronize mems_allowed to N_MEMORY */ 2268 /* synchronize mems_allowed to N_MEMORY */
2154 if (mems_updated) { 2269 if (mems_updated) {
2155 mutex_lock(&callback_mutex); 2270 mutex_lock(&callback_mutex);
2156 top_cpuset.mems_allowed = new_mems; 2271 if (!on_dfl)
2272 top_cpuset.mems_allowed = new_mems;
2273 top_cpuset.effective_mems = new_mems;
2157 mutex_unlock(&callback_mutex); 2274 mutex_unlock(&callback_mutex);
2158 update_tasks_nodemask(&top_cpuset); 2275 update_tasks_nodemask(&top_cpuset);
2159 } 2276 }
@@ -2228,6 +2345,9 @@ void __init cpuset_init_smp(void)
2228 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2345 top_cpuset.mems_allowed = node_states[N_MEMORY];
2229 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; 2346 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2230 2347
2348 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
2349 top_cpuset.effective_mems = node_states[N_MEMORY];
2350
2231 register_hotmemory_notifier(&cpuset_track_online_nodes_nb); 2351 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2232} 2352}
2233 2353
@@ -2244,23 +2364,17 @@ void __init cpuset_init_smp(void)
2244 2364
2245void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2365void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2246{ 2366{
2247 struct cpuset *cpus_cs;
2248
2249 mutex_lock(&callback_mutex); 2367 mutex_lock(&callback_mutex);
2250 rcu_read_lock(); 2368 rcu_read_lock();
2251 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2369 guarantee_online_cpus(task_cs(tsk), pmask);
2252 guarantee_online_cpus(cpus_cs, pmask);
2253 rcu_read_unlock(); 2370 rcu_read_unlock();
2254 mutex_unlock(&callback_mutex); 2371 mutex_unlock(&callback_mutex);
2255} 2372}
2256 2373
2257void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2374void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2258{ 2375{
2259 struct cpuset *cpus_cs;
2260
2261 rcu_read_lock(); 2376 rcu_read_lock();
2262 cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); 2377 do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
2263 do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
2264 rcu_read_unlock(); 2378 rcu_read_unlock();
2265 2379
2266 /* 2380 /*
@@ -2299,13 +2413,11 @@ void cpuset_init_current_mems_allowed(void)
2299 2413
2300nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2414nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2301{ 2415{
2302 struct cpuset *mems_cs;
2303 nodemask_t mask; 2416 nodemask_t mask;
2304 2417
2305 mutex_lock(&callback_mutex); 2418 mutex_lock(&callback_mutex);
2306 rcu_read_lock(); 2419 rcu_read_lock();
2307 mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 2420 guarantee_online_mems(task_cs(tsk), &mask);
2308 guarantee_online_mems(mems_cs, &mask);
2309 rcu_read_unlock(); 2421 rcu_read_unlock();
2310 mutex_unlock(&callback_mutex); 2422 mutex_unlock(&callback_mutex);
2311 2423
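The new read-only effective_cpus and effective_mems files expose the masks the kernel actually applies to tasks. A small userspace reader as an example; the mount point and group path below are assumptions, and on a legacy-hierarchy mount the files normally carry the "cpuset." prefix:

#include <stdio.h>

int main(void)
{
	/* Hypothetical mount point and group; adjust to the local setup. */
	const char *path =
		"/sys/fs/cgroup/cpuset/mygroup/cpuset.effective_cpus";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("effective_cpus: %s", line);
	fclose(f);
	return 0;
}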
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a33d9a2bcbd7..1cf24b3e42ec 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2320,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2320 next_parent = rcu_dereference(next_ctx->parent_ctx); 2320 next_parent = rcu_dereference(next_ctx->parent_ctx);
2321 2321
2322 /* If neither context have a parent context; they cannot be clones. */ 2322 /* If neither context have a parent context; they cannot be clones. */
2323 if (!parent && !next_parent) 2323 if (!parent || !next_parent)
2324 goto unlock; 2324 goto unlock;
2325 2325
2326 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 2326 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -5266,6 +5266,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5266 5266
5267 goto got_name; 5267 goto got_name;
5268 } else { 5268 } else {
5269 if (vma->vm_ops && vma->vm_ops->name) {
5270 name = (char *) vma->vm_ops->name(vma);
5271 if (name)
5272 goto cpy_name;
5273 }
5274
5269 name = (char *)arch_vma_name(vma); 5275 name = (char *)arch_vma_name(vma);
5270 if (name) 5276 if (name)
5271 goto cpy_name; 5277 goto cpy_name;
@@ -7458,7 +7464,19 @@ __perf_event_exit_task(struct perf_event *child_event,
7458 struct perf_event_context *child_ctx, 7464 struct perf_event_context *child_ctx,
7459 struct task_struct *child) 7465 struct task_struct *child)
7460{ 7466{
7461 perf_remove_from_context(child_event, true); 7467 /*
7468 * Do not destroy the 'original' grouping; because of the context
7469 * switch optimization the original events could've ended up in a
7470 * random child task.
7471 *
7472 * If we were to destroy the original group, all group related
7473 * operations would cease to function properly after this random
7474 * child dies.
7475 *
7476 * Do destroy all inherited groups, we don't care about those
7477 * and being thorough is better.
7478 */
7479 perf_remove_from_context(child_event, !!child_event->parent);
7462 7480
7463 /* 7481 /*
7464 * It can happen that the parent exits first, and has events 7482 * It can happen that the parent exits first, and has events
@@ -7474,7 +7492,7 @@ __perf_event_exit_task(struct perf_event *child_event,
7474static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 7492static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7475{ 7493{
7476 struct perf_event *child_event, *next; 7494 struct perf_event *child_event, *next;
7477 struct perf_event_context *child_ctx; 7495 struct perf_event_context *child_ctx, *parent_ctx;
7478 unsigned long flags; 7496 unsigned long flags;
7479 7497
7480 if (likely(!child->perf_event_ctxp[ctxn])) { 7498 if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7499,6 +7517,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7499 raw_spin_lock(&child_ctx->lock); 7517 raw_spin_lock(&child_ctx->lock);
7500 task_ctx_sched_out(child_ctx); 7518 task_ctx_sched_out(child_ctx);
7501 child->perf_event_ctxp[ctxn] = NULL; 7519 child->perf_event_ctxp[ctxn] = NULL;
7520
7521 /*
7522 * In order to avoid freeing: child_ctx->parent_ctx->task
7523 * under perf_event_context::lock, grab another reference.
7524 */
7525 parent_ctx = child_ctx->parent_ctx;
7526 if (parent_ctx)
7527 get_ctx(parent_ctx);
7528
7502 /* 7529 /*
7503 * If this context is a clone; unclone it so it can't get 7530 * If this context is a clone; unclone it so it can't get
7504 * swapped to another process while we're removing all 7531 * swapped to another process while we're removing all
@@ -7509,6 +7536,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7509 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 7536 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7510 7537
7511 /* 7538 /*
7539 * Now that we no longer hold perf_event_context::lock, drop
7540 * our extra child_ctx->parent_ctx reference.
7541 */
7542 if (parent_ctx)
7543 put_ctx(parent_ctx);
7544
7545 /*
7512 * Report the task dead after unscheduling the events so that we 7546 * Report the task dead after unscheduling the events so that we
7513 * won't get any samples after PERF_RECORD_EXIT. We can however still 7547 * won't get any samples after PERF_RECORD_EXIT. We can however still
7514 * get a few PERF_RECORD_READ events. 7548 * get a few PERF_RECORD_READ events.
@@ -7776,7 +7810,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
7776/* 7810/*
7777 * Initialize the perf_event context in task_struct 7811 * Initialize the perf_event context in task_struct
7778 */ 7812 */
7779int perf_event_init_context(struct task_struct *child, int ctxn) 7813static int perf_event_init_context(struct task_struct *child, int ctxn)
7780{ 7814{
7781 struct perf_event_context *child_ctx, *parent_ctx; 7815 struct perf_event_context *child_ctx, *parent_ctx;
7782 struct perf_event_context *cloned_ctx; 7816 struct perf_event_context *cloned_ctx;
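The perf_event_exit_task_context() change above pins child_ctx->parent_ctx with an extra reference so it cannot be freed while perf_event_context::lock is still held, and drops that reference only after the lock is released. A stripped-down userspace model of the pattern; the refcount and names are invented for illustration and there is no real locking here:

#include <stdlib.h>

struct ctx_model {
	int refcount;
	struct ctx_model *parent;
};

void get_ref(struct ctx_model *c)
{
	c->refcount++;
}

void put_ref(struct ctx_model *c)
{
	if (--c->refcount == 0)
		free(c);
}

void exit_context(struct ctx_model *child)
{
	struct ctx_model *parent = child->parent;

	if (parent)
		get_ref(parent);	/* keep it alive while "locked" */

	/* ... work done under, and then after dropping, the lock ... */

	if (parent)
		put_ref(parent);	/* safe to let it be freed now */
}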
diff --git a/kernel/fork.c b/kernel/fork.c
index 627b7f80afb0..5f1bf3bebb4f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1095,7 +1095,6 @@ static void rt_mutex_init_task(struct task_struct *p)
1095 p->pi_waiters = RB_ROOT; 1095 p->pi_waiters = RB_ROOT;
1096 p->pi_waiters_leftmost = NULL; 1096 p->pi_waiters_leftmost = NULL;
1097 p->pi_blocked_on = NULL; 1097 p->pi_blocked_on = NULL;
1098 p->pi_top_task = NULL;
1099#endif 1098#endif
1100} 1099}
1101 1100
diff --git a/kernel/futex.c b/kernel/futex.c
index b632b5f3f094..d3a9d946d0b7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -792,94 +792,91 @@ void exit_pi_state_list(struct task_struct *curr)
792 * [10] There is no transient state which leaves owner and user space 792 * [10] There is no transient state which leaves owner and user space
793 * TID out of sync. 793 * TID out of sync.
794 */ 794 */
795static int 795
796lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 796/*
797 union futex_key *key, struct futex_pi_state **ps) 797 * Validate that the existing waiter has a pi_state and sanity check
798 * the pi_state against the user space value. If correct, attach to
799 * it.
800 */
801static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
802 struct futex_pi_state **ps)
798{ 803{
799 struct futex_pi_state *pi_state = NULL;
800 struct futex_q *this, *next;
801 struct task_struct *p;
802 pid_t pid = uval & FUTEX_TID_MASK; 804 pid_t pid = uval & FUTEX_TID_MASK;
803 805
804 plist_for_each_entry_safe(this, next, &hb->chain, list) { 806 /*
805 if (match_futex(&this->key, key)) { 807 * Userspace might have messed up non-PI and PI futexes [3]
806 /* 808 */
807 * Sanity check the waiter before increasing 809 if (unlikely(!pi_state))
808 * the refcount and attaching to it. 810 return -EINVAL;
809 */
810 pi_state = this->pi_state;
811 /*
812 * Userspace might have messed up non-PI and
813 * PI futexes [3]
814 */
815 if (unlikely(!pi_state))
816 return -EINVAL;
817 811
818 WARN_ON(!atomic_read(&pi_state->refcount)); 812 WARN_ON(!atomic_read(&pi_state->refcount));
819 813
814 /*
815 * Handle the owner died case:
816 */
817 if (uval & FUTEX_OWNER_DIED) {
818 /*
819 * exit_pi_state_list sets owner to NULL and wakes the
820 * topmost waiter. The task which acquires the
821 * pi_state->rt_mutex will fixup owner.
822 */
823 if (!pi_state->owner) {
820 /* 824 /*
821 * Handle the owner died case: 825 * No pi state owner, but the user space TID
826 * is not 0. Inconsistent state. [5]
822 */ 827 */
823 if (uval & FUTEX_OWNER_DIED) { 828 if (pid)
824 /* 829 return -EINVAL;
825 * exit_pi_state_list sets owner to NULL and
826 * wakes the topmost waiter. The task which
827 * acquires the pi_state->rt_mutex will fixup
828 * owner.
829 */
830 if (!pi_state->owner) {
831 /*
832 * No pi state owner, but the user
833 * space TID is not 0. Inconsistent
834 * state. [5]
835 */
836 if (pid)
837 return -EINVAL;
838 /*
839 * Take a ref on the state and
840 * return. [4]
841 */
842 goto out_state;
843 }
844
845 /*
846 * If TID is 0, then either the dying owner
847 * has not yet executed exit_pi_state_list()
848 * or some waiter acquired the rtmutex in the
849 * pi state, but did not yet fixup the TID in
850 * user space.
851 *
852 * Take a ref on the state and return. [6]
853 */
854 if (!pid)
855 goto out_state;
856 } else {
857 /*
858 * If the owner died bit is not set,
859 * then the pi_state must have an
860 * owner. [7]
861 */
862 if (!pi_state->owner)
863 return -EINVAL;
864 }
865
866 /* 830 /*
867 * Bail out if user space manipulated the 831 * Take a ref on the state and return success. [4]
868 * futex value. If pi state exists then the
869 * owner TID must be the same as the user
870 * space TID. [9/10]
871 */ 832 */
872 if (pid != task_pid_vnr(pi_state->owner)) 833 goto out_state;
873 return -EINVAL;
874
875 out_state:
876 atomic_inc(&pi_state->refcount);
877 *ps = pi_state;
878 return 0;
879 } 834 }
835
836 /*
837 * If TID is 0, then either the dying owner has not
838 * yet executed exit_pi_state_list() or some waiter
839 * acquired the rtmutex in the pi state, but did not
840 * yet fixup the TID in user space.
841 *
842 * Take a ref on the state and return success. [6]
843 */
844 if (!pid)
845 goto out_state;
846 } else {
847 /*
848 * If the owner died bit is not set, then the pi_state
849 * must have an owner. [7]
850 */
851 if (!pi_state->owner)
852 return -EINVAL;
880 } 853 }
881 854
882 /* 855 /*
856 * Bail out if user space manipulated the futex value. If pi
857 * state exists then the owner TID must be the same as the
858 * user space TID. [9/10]
859 */
860 if (pid != task_pid_vnr(pi_state->owner))
861 return -EINVAL;
862out_state:
863 atomic_inc(&pi_state->refcount);
864 *ps = pi_state;
865 return 0;
866}
867
868/*
869 * Lookup the task for the TID provided from user space and attach to
870 * it after doing proper sanity checks.
871 */
872static int attach_to_pi_owner(u32 uval, union futex_key *key,
873 struct futex_pi_state **ps)
874{
875 pid_t pid = uval & FUTEX_TID_MASK;
876 struct futex_pi_state *pi_state;
877 struct task_struct *p;
878
879 /*
883 * We are the first waiter - try to look up the real owner and attach 880 * We are the first waiter - try to look up the real owner and attach
884 * the new pi_state to it, but bail out when TID = 0 [1] 881 * the new pi_state to it, but bail out when TID = 0 [1]
885 */ 882 */
@@ -920,7 +917,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
920 pi_state = alloc_pi_state(); 917 pi_state = alloc_pi_state();
921 918
922 /* 919 /*
923 * Initialize the pi_mutex in locked state and make 'p' 920 * Initialize the pi_mutex in locked state and make @p
924 * the owner of it: 921 * the owner of it:
925 */ 922 */
926 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 923 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
@@ -940,6 +937,36 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
940 return 0; 937 return 0;
941} 938}
942 939
940static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
941 union futex_key *key, struct futex_pi_state **ps)
942{
943 struct futex_q *match = futex_top_waiter(hb, key);
944
945 /*
946 * If there is a waiter on that futex, validate it and
947 * attach to the pi_state when the validation succeeds.
948 */
949 if (match)
950 return attach_to_pi_state(uval, match->pi_state, ps);
951
952 /*
953 * We are the first waiter - try to look up the owner based on
954 * @uval and attach to it.
955 */
956 return attach_to_pi_owner(uval, key, ps);
957}
958
959static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
960{
961 u32 uninitialized_var(curval);
962
963 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
964 return -EFAULT;
965
966	 /* If the user space value changed, let the caller retry */
967 return curval != uval ? -EAGAIN : 0;
968}
969
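The new lock_pi_update_atomic() helper folds the cmpxchg-and-compare pattern into one place: -EFAULT when the user page cannot be accessed, -EAGAIN when the value changed underneath us, 0 on success. A userspace approximation using C11 atomics; the fault case has no userspace equivalent and is omitted:

#include <stdatomic.h>
#include <stdint.h>
#include <errno.h>

int update_atomic_model(_Atomic uint32_t *word, uint32_t uval, uint32_t newval)
{
	uint32_t expected = uval;

	/* 0 on success, -EAGAIN if somebody changed the word meanwhile. */
	return atomic_compare_exchange_strong(word, &expected, newval)
		? 0 : -EAGAIN;
}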
943/** 970/**
944 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex 971 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
945 * @uaddr: the pi futex user address 972 * @uaddr: the pi futex user address
@@ -963,113 +990,69 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
963 struct futex_pi_state **ps, 990 struct futex_pi_state **ps,
964 struct task_struct *task, int set_waiters) 991 struct task_struct *task, int set_waiters)
965{ 992{
966 int lock_taken, ret, force_take = 0; 993 u32 uval, newval, vpid = task_pid_vnr(task);
967 u32 uval, newval, curval, vpid = task_pid_vnr(task); 994 struct futex_q *match;
968 995 int ret;
969retry:
970 ret = lock_taken = 0;
971 996
972 /* 997 /*
973 * To avoid races, we attempt to take the lock here again 998 * Read the user space value first so we can validate a few
974 * (by doing a 0 -> TID atomic cmpxchg), while holding all 999 * things before proceeding further.
975 * the locks. It will most likely not succeed.
976 */ 1000 */
977 newval = vpid; 1001 if (get_futex_value_locked(&uval, uaddr))
978 if (set_waiters)
979 newval |= FUTEX_WAITERS;
980
981 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
982 return -EFAULT; 1002 return -EFAULT;
983 1003
984 /* 1004 /*
985 * Detect deadlocks. 1005 * Detect deadlocks.
986 */ 1006 */
987 if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) 1007 if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
988 return -EDEADLK; 1008 return -EDEADLK;
989 1009
990 /* 1010 /*
991 * Surprise - we got the lock, but we do not trust user space at all. 1011 * Lookup existing state first. If it exists, try to attach to
992 */ 1012 * its pi_state.
993 if (unlikely(!curval)) {
994 /*
995 * We verify whether there is kernel state for this
996 * futex. If not, we can safely assume, that the 0 ->
997 * TID transition is correct. If state exists, we do
998 * not bother to fixup the user space state as it was
999 * corrupted already.
1000 */
1001 return futex_top_waiter(hb, key) ? -EINVAL : 1;
1002 }
1003
1004 uval = curval;
1005
1006 /*
1007 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
1008 * to wake at the next unlock.
1009 */ 1013 */
1010 newval = curval | FUTEX_WAITERS; 1014 match = futex_top_waiter(hb, key);
1015 if (match)
1016 return attach_to_pi_state(uval, match->pi_state, ps);
1011 1017
1012 /* 1018 /*
1013 * Should we force take the futex? See below. 1019 * No waiter and user TID is 0. We are here because the
1020 * waiters or the owner died bit is set or called from
1021 * requeue_cmp_pi or for whatever reason something took the
1022 * syscall.
1014 */ 1023 */
1015 if (unlikely(force_take)) { 1024 if (!(uval & FUTEX_TID_MASK)) {
1016 /* 1025 /*
1017 * Keep the OWNER_DIED and the WAITERS bit and set the 1026 * We take over the futex. No other waiters and the user space
1018 * new TID value. 1027 * TID is 0. We preserve the owner died bit.
1019 */ 1028 */
1020 newval = (curval & ~FUTEX_TID_MASK) | vpid; 1029 newval = uval & FUTEX_OWNER_DIED;
1021 force_take = 0; 1030 newval |= vpid;
1022 lock_taken = 1;
1023 }
1024 1031
1025 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 1032 /* The futex requeue_pi code can enforce the waiters bit */
1026 return -EFAULT; 1033 if (set_waiters)
1027 if (unlikely(curval != uval)) 1034 newval |= FUTEX_WAITERS;
1028 goto retry; 1035
1036 ret = lock_pi_update_atomic(uaddr, uval, newval);
1037 /* If the take over worked, return 1 */
1038 return ret < 0 ? ret : 1;
1039 }
1029 1040
1030 /* 1041 /*
1031	 * We took the lock due to forced take over. 1042	 * First waiter. Set the waiters bit before attaching ourselves to
1043	 * the owner. If the owner tries to unlock, it will be forced into
1044 * the kernel and blocked on hb->lock.
1032 */ 1045 */
1033 if (unlikely(lock_taken)) 1046 newval = uval | FUTEX_WAITERS;
1034 return 1; 1047 ret = lock_pi_update_atomic(uaddr, uval, newval);
1035 1048 if (ret)
1049 return ret;
1036 /* 1050 /*
1037 * We dont have the lock. Look up the PI state (or create it if 1051 * If the update of the user space value succeeded, we try to
1038 * we are the first waiter): 1052 * attach to the owner. If that fails, no harm done, we only
1053 * set the FUTEX_WAITERS bit in the user space variable.
1039 */ 1054 */
1040 ret = lookup_pi_state(uval, hb, key, ps); 1055 return attach_to_pi_owner(uval, key, ps);
1041
1042 if (unlikely(ret)) {
1043 switch (ret) {
1044 case -ESRCH:
1045 /*
1046 * We failed to find an owner for this
1047 * futex. So we have no pi_state to block
1048 * on. This can happen in two cases:
1049 *
1050 * 1) The owner died
1051 * 2) A stale FUTEX_WAITERS bit
1052 *
1053 * Re-read the futex value.
1054 */
1055 if (get_futex_value_locked(&curval, uaddr))
1056 return -EFAULT;
1057
1058 /*
1059 * If the owner died or we have a stale
1060 * WAITERS bit the owner TID in the user space
1061 * futex is 0.
1062 */
1063 if (!(curval & FUTEX_TID_MASK)) {
1064 force_take = 1;
1065 goto retry;
1066 }
1067 default:
1068 break;
1069 }
1070 }
1071
1072 return ret;
1073} 1056}
1074 1057
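In the rewritten futex_lock_pi_atomic() above, the take-over path (user space TID is 0) builds the new futex word from our TID, keeps only the OWNER_DIED bit of the old value, and may force the WAITERS bit. A standalone sketch of that computation; the MODEL_* constants simply mirror the uapi futex bits for illustration:

#include <stdint.h>
#include <stdbool.h>

#define MODEL_FUTEX_WAITERS	0x80000000u
#define MODEL_FUTEX_OWNER_DIED	0x40000000u

uint32_t takeover_value(uint32_t uval, uint32_t vpid, bool set_waiters)
{
	/* Preserve only the owner-died bit, claim the lock with our TID. */
	uint32_t newval = (uval & MODEL_FUTEX_OWNER_DIED) | vpid;

	if (set_waiters)
		newval |= MODEL_FUTEX_WAITERS;	/* requeue_pi can enforce this */
	return newval;
}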
1075/** 1058/**
@@ -1186,22 +1169,6 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1186 return 0; 1169 return 0;
1187} 1170}
1188 1171
1189static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
1190{
1191 u32 uninitialized_var(oldval);
1192
1193 /*
1194 * There is no waiter, so we unlock the futex. The owner died
1195 * bit has not to be preserved here. We are the owner:
1196 */
1197 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
1198 return -EFAULT;
1199 if (oldval != uval)
1200 return -EAGAIN;
1201
1202 return 0;
1203}
1204
1205/* 1172/*
1206 * Express the locking dependencies for lockdep: 1173 * Express the locking dependencies for lockdep:
1207 */ 1174 */
@@ -1659,7 +1626,12 @@ retry_private:
1659 goto retry; 1626 goto retry;
1660 goto out; 1627 goto out;
1661 case -EAGAIN: 1628 case -EAGAIN:
1662 /* The owner was exiting, try again. */ 1629 /*
1630 * Two reasons for this:
1631 * - Owner is exiting and we just wait for the
1632 * exit to complete.
1633 * - The user space value changed.
1634 */
1663 double_unlock_hb(hb1, hb2); 1635 double_unlock_hb(hb1, hb2);
1664 hb_waiters_dec(hb2); 1636 hb_waiters_dec(hb2);
1665 put_futex_key(&key2); 1637 put_futex_key(&key2);
@@ -1718,7 +1690,7 @@ retry_private:
1718 this->pi_state = pi_state; 1690 this->pi_state = pi_state;
1719 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 1691 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1720 this->rt_waiter, 1692 this->rt_waiter,
1721 this->task, 1); 1693 this->task);
1722 if (ret == 1) { 1694 if (ret == 1) {
1723 /* We got the lock. */ 1695 /* We got the lock. */
1724 requeue_pi_wake_futex(this, &key2, hb2); 1696 requeue_pi_wake_futex(this, &key2, hb2);
@@ -2316,8 +2288,10 @@ retry_private:
2316 goto uaddr_faulted; 2288 goto uaddr_faulted;
2317 case -EAGAIN: 2289 case -EAGAIN:
2318 /* 2290 /*
2319 * Task is exiting and we just wait for the 2291 * Two reasons for this:
2320 * exit to complete. 2292 * - Task is exiting and we just wait for the
2293 * exit to complete.
2294 * - The user space value changed.
2321 */ 2295 */
2322 queue_unlock(hb); 2296 queue_unlock(hb);
2323 put_futex_key(&q.key); 2297 put_futex_key(&q.key);
@@ -2337,9 +2311,9 @@ retry_private:
2337 /* 2311 /*
2338 * Block on the PI mutex: 2312 * Block on the PI mutex:
2339 */ 2313 */
2340 if (!trylock) 2314 if (!trylock) {
2341 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); 2315 ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
2342 else { 2316 } else {
2343 ret = rt_mutex_trylock(&q.pi_state->pi_mutex); 2317 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2344 /* Fixup the trylock return value: */ 2318 /* Fixup the trylock return value: */
2345 ret = ret ? 0 : -EWOULDBLOCK; 2319 ret = ret ? 0 : -EWOULDBLOCK;
@@ -2401,10 +2375,10 @@ uaddr_faulted:
2401 */ 2375 */
2402static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) 2376static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2403{ 2377{
2404 struct futex_hash_bucket *hb; 2378 u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
2405 struct futex_q *this, *next;
2406 union futex_key key = FUTEX_KEY_INIT; 2379 union futex_key key = FUTEX_KEY_INIT;
2407 u32 uval, vpid = task_pid_vnr(current); 2380 struct futex_hash_bucket *hb;
2381 struct futex_q *match;
2408 int ret; 2382 int ret;
2409 2383
2410retry: 2384retry:
@@ -2417,57 +2391,47 @@ retry:
2417 return -EPERM; 2391 return -EPERM;
2418 2392
2419 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); 2393 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2420 if (unlikely(ret != 0)) 2394 if (ret)
2421 goto out; 2395 return ret;
2422 2396
2423 hb = hash_futex(&key); 2397 hb = hash_futex(&key);
2424 spin_lock(&hb->lock); 2398 spin_lock(&hb->lock);
2425 2399
2426 /* 2400 /*
2427 * To avoid races, try to do the TID -> 0 atomic transition 2401 * Check waiters first. We do not trust user space values at
2428 * again. If it succeeds then we can return without waking 2402 * all and we at least want to know if user space fiddled
2429 * anyone else up. We only try this if neither the waiters nor 2403 * with the futex value instead of blindly unlocking.
2430 * the owner died bit are set.
2431 */
2432 if (!(uval & ~FUTEX_TID_MASK) &&
2433 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2434 goto pi_faulted;
2435 /*
2436 * Rare case: we managed to release the lock atomically,
2437 * no need to wake anyone else up:
2438 */
2439 if (unlikely(uval == vpid))
2440 goto out_unlock;
2441
2442 /*
2443 * Ok, other tasks may need to be woken up - check waiters
2444 * and do the wakeup if necessary:
2445 */ 2404 */
2446 plist_for_each_entry_safe(this, next, &hb->chain, list) { 2405 match = futex_top_waiter(hb, &key);
2447 if (!match_futex (&this->key, &key)) 2406 if (match) {
2448 continue; 2407 ret = wake_futex_pi(uaddr, uval, match);
2449 ret = wake_futex_pi(uaddr, uval, this);
2450 /* 2408 /*
2451 * The atomic access to the futex value 2409 * The atomic access to the futex value generated a
2452 * generated a pagefault, so retry the 2410 * pagefault, so retry the user-access and the wakeup:
2453 * user-access and the wakeup:
2454 */ 2411 */
2455 if (ret == -EFAULT) 2412 if (ret == -EFAULT)
2456 goto pi_faulted; 2413 goto pi_faulted;
2457 goto out_unlock; 2414 goto out_unlock;
2458 } 2415 }
2416
2459 /* 2417 /*
2460 * No waiters - kernel unlocks the futex: 2418 * We have no kernel internal state, i.e. no waiters in the
2419 * kernel. Waiters which are about to queue themselves are stuck
2420	 * on hb->lock. So we can safely ignore them. We preserve
2421	 * neither the WAITERS bit nor the OWNER_DIED one. We are the
2422 * owner.
2461 */ 2423 */
2462 ret = unlock_futex_pi(uaddr, uval); 2424 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
2463 if (ret == -EFAULT)
2464 goto pi_faulted; 2425 goto pi_faulted;
2465 2426
2427 /*
2428 * If uval has changed, let user space handle it.
2429 */
2430 ret = (curval == uval) ? 0 : -EAGAIN;
2431
2466out_unlock: 2432out_unlock:
2467 spin_unlock(&hb->lock); 2433 spin_unlock(&hb->lock);
2468 put_futex_key(&key); 2434 put_futex_key(&key);
2469
2470out:
2471 return ret; 2435 return ret;
2472 2436
2473pi_faulted: 2437pi_faulted:
@@ -2669,7 +2633,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2669 */ 2633 */
2670 WARN_ON(!q.pi_state); 2634 WARN_ON(!q.pi_state);
2671 pi_mutex = &q.pi_state->pi_mutex; 2635 pi_mutex = &q.pi_state->pi_mutex;
2672 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); 2636 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
2673 debug_rt_mutex_free_waiter(&rt_waiter); 2637 debug_rt_mutex_free_waiter(&rt_waiter);
2674 2638
2675 spin_lock(q.lock_ptr); 2639 spin_lock(q.lock_ptr);
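futex_unlock_pi() now consults the top waiter first and, only when the kernel has no waiters at all, releases the futex word with a single compare-and-exchange, reporting a changed value back to user space as -EAGAIN. The userspace model below captures just that last step, with names assumed for illustration:

#include <stdatomic.h>
#include <stdint.h>
#include <errno.h>

int unlock_without_waiters(_Atomic uint32_t *word, uint32_t uval)
{
	uint32_t expected = uval;

	/* Release only if the word still holds the value we read. */
	if (!atomic_compare_exchange_strong(word, &expected, 0))
		return -EAGAIN;	/* user space changed it; let it retry */
	return 0;
}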
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 452d6f2ba21d..cf80e7b0ddab 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -341,8 +341,8 @@ static struct lock_class_key irq_nested_lock_class;
341/* 341/*
342 * irq_map_generic_chip - Map a generic chip for an irq domain 342 * irq_map_generic_chip - Map a generic chip for an irq domain
343 */ 343 */
344static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, 344int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
345 irq_hw_number_t hw_irq) 345 irq_hw_number_t hw_irq)
346{ 346{
347 struct irq_data *data = irq_get_irq_data(virq); 347 struct irq_data *data = irq_get_irq_data(virq);
348 struct irq_domain_chip_generic *dgc = d->gc; 348 struct irq_domain_chip_generic *dgc = d->gc;
@@ -394,6 +394,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
394 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set); 394 irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
395 return 0; 395 return 0;
396} 396}
397EXPORT_SYMBOL_GPL(irq_map_generic_chip);
397 398
398struct irq_domain_ops irq_generic_chip_ops = { 399struct irq_domain_ops irq_generic_chip_ops = {
399 .map = irq_map_generic_chip, 400 .map = irq_map_generic_chip,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index eb5e10e32e05..6534ff6ce02e 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -231,7 +231,7 @@ void irq_set_default_host(struct irq_domain *domain)
231} 231}
232EXPORT_SYMBOL_GPL(irq_set_default_host); 232EXPORT_SYMBOL_GPL(irq_set_default_host);
233 233
234static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) 234void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
235{ 235{
236 struct irq_data *irq_data = irq_get_irq_data(irq); 236 struct irq_data *irq_data = irq_get_irq_data(irq);
237 irq_hw_number_t hwirq; 237 irq_hw_number_t hwirq;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index a82170e2fa78..e6bcbe756663 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -16,11 +16,12 @@
16#include <linux/tick.h> 16#include <linux/tick.h>
17#include <linux/cpu.h> 17#include <linux/cpu.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h>
19#include <asm/processor.h> 20#include <asm/processor.h>
20 21
21 22
22static DEFINE_PER_CPU(struct llist_head, irq_work_list); 23static DEFINE_PER_CPU(struct llist_head, raised_list);
23static DEFINE_PER_CPU(int, irq_work_raised); 24static DEFINE_PER_CPU(struct llist_head, lazy_list);
24 25
25/* 26/*
26 * Claim the entry so that no one else will poke at it. 27 * Claim the entry so that no one else will poke at it.
@@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void)
55 */ 56 */
56} 57}
57 58
59#ifdef CONFIG_SMP
58/* 60/*
59 * Enqueue the irq_work @entry unless it's already pending 61 * Enqueue the irq_work @work on @cpu unless it's already pending
60 * somewhere. 62 * somewhere.
61 * 63 *
62 * Can be re-enqueued while the callback is still in progress. 64 * Can be re-enqueued while the callback is still in progress.
63 */ 65 */
66bool irq_work_queue_on(struct irq_work *work, int cpu)
67{
68 /* All work should have been flushed before going offline */
69 WARN_ON_ONCE(cpu_is_offline(cpu));
70
71 /* Arch remote IPI send/receive backend aren't NMI safe */
72 WARN_ON_ONCE(in_nmi());
73
74 /* Only queue if not already pending */
75 if (!irq_work_claim(work))
76 return false;
77
78 if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
79 arch_send_call_function_single_ipi(cpu);
80
81 return true;
82}
83EXPORT_SYMBOL_GPL(irq_work_queue_on);
84#endif
85
86/* Enqueue the irq work @work on the current CPU */
64bool irq_work_queue(struct irq_work *work) 87bool irq_work_queue(struct irq_work *work)
65{ 88{
66 /* Only queue if not already pending */ 89 /* Only queue if not already pending */
@@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work)
70 /* Queue the entry and raise the IPI if needed. */ 93 /* Queue the entry and raise the IPI if needed. */
71 preempt_disable(); 94 preempt_disable();
72 95
73	 llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); 96	 /* If the work is "lazy", handle it from the next tick, if any */
74 97 if (work->flags & IRQ_WORK_LAZY) {
75 /* 98 if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) &&
76 * If the work is not "lazy" or the tick is stopped, raise the irq 99 tick_nohz_tick_stopped())
77 * work interrupt (if supported by the arch), otherwise, just wait 100 arch_irq_work_raise();
78 * for the next tick. 101 } else {
79 */ 102 if (llist_add(&work->llnode, &__get_cpu_var(raised_list)))
80 if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
81 if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
82 arch_irq_work_raise(); 103 arch_irq_work_raise();
83 } 104 }
84 105
@@ -90,10 +111,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
90 111
91bool irq_work_needs_cpu(void) 112bool irq_work_needs_cpu(void)
92{ 113{
93 struct llist_head *this_list; 114 struct llist_head *raised, *lazy;
94 115
95 this_list = &__get_cpu_var(irq_work_list); 116 raised = &__get_cpu_var(raised_list);
96 if (llist_empty(this_list)) 117 lazy = &__get_cpu_var(lazy_list);
118 if (llist_empty(raised) && llist_empty(lazy))
97 return false; 119 return false;
98 120
99 /* All work should have been flushed before going offline */ 121 /* All work should have been flushed before going offline */
@@ -102,28 +124,18 @@ bool irq_work_needs_cpu(void)
102 return true; 124 return true;
103} 125}
104 126
105static void __irq_work_run(void) 127static void irq_work_run_list(struct llist_head *list)
106{ 128{
107 unsigned long flags; 129 unsigned long flags;
108 struct irq_work *work; 130 struct irq_work *work;
109 struct llist_head *this_list;
110 struct llist_node *llnode; 131 struct llist_node *llnode;
111 132
133 BUG_ON(!irqs_disabled());
112 134
113 /* 135 if (llist_empty(list))
114 * Reset the "raised" state right before we check the list because
115 * an NMI may enqueue after we find the list empty from the runner.
116 */
117 __this_cpu_write(irq_work_raised, 0);
118 barrier();
119
120 this_list = &__get_cpu_var(irq_work_list);
121 if (llist_empty(this_list))
122 return; 136 return;
123 137
124 BUG_ON(!irqs_disabled()); 138 llnode = llist_del_all(list);
125
126 llnode = llist_del_all(this_list);
127 while (llnode != NULL) { 139 while (llnode != NULL) {
128 work = llist_entry(llnode, struct irq_work, llnode); 140 work = llist_entry(llnode, struct irq_work, llnode);
129 141
@@ -149,13 +161,13 @@ static void __irq_work_run(void)
149} 161}
150 162
151/* 163/*
152 * Run the irq_work entries on this cpu. Requires to be ran from hardirq 164 * hotplug calls this through:
153 * context with local IRQs disabled. 165 * hotplug_cfd() -> flush_smp_call_function_queue()
154 */ 166 */
155void irq_work_run(void) 167void irq_work_run(void)
156{ 168{
157 BUG_ON(!in_irq()); 169 irq_work_run_list(&__get_cpu_var(raised_list));
158 __irq_work_run(); 170 irq_work_run_list(&__get_cpu_var(lazy_list));
159} 171}
160EXPORT_SYMBOL_GPL(irq_work_run); 172EXPORT_SYMBOL_GPL(irq_work_run);
161 173
@@ -171,35 +183,3 @@ void irq_work_sync(struct irq_work *work)
171 cpu_relax(); 183 cpu_relax();
172} 184}
173EXPORT_SYMBOL_GPL(irq_work_sync); 185EXPORT_SYMBOL_GPL(irq_work_sync);
174
175#ifdef CONFIG_HOTPLUG_CPU
176static int irq_work_cpu_notify(struct notifier_block *self,
177 unsigned long action, void *hcpu)
178{
179 long cpu = (long)hcpu;
180
181 switch (action) {
182 case CPU_DYING:
183 /* Called from stop_machine */
184 if (WARN_ON_ONCE(cpu != smp_processor_id()))
185 break;
186 __irq_work_run();
187 break;
188 default:
189 break;
190 }
191 return NOTIFY_OK;
192}
193
194static struct notifier_block cpu_notify;
195
196static __init int irq_work_init_cpu_notifier(void)
197{
198 cpu_notify.notifier_call = irq_work_cpu_notify;
199 cpu_notify.priority = 0;
200 register_cpu_notifier(&cpu_notify);
201 return 0;
202}
203device_initcall(irq_work_init_cpu_notifier);
204
205#endif /* CONFIG_HOTPLUG_CPU */
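
The hunks above replace the single per-CPU irq_work_list plus the irq_work_raised flag with two llists: raised_list is flushed from the IPI (and, via flush_smp_call_function_queue(), on CPU hotplug), while lazy_list is flushed from the next timer tick. A minimal sketch of the resulting enqueue decision, written as it would read inside kernel/irq_work.c where raised_list, lazy_list and arch_irq_work_raise() are visible; this is a simplification (it uses this_cpu_ptr() instead of the older __get_cpu_var()) and not the in-tree code:

static void sketch_irq_work_enqueue(struct irq_work *work)
{
	/* Claiming the work and preempt_disable() are elided; see the hunk. */
	if (work->flags & IRQ_WORK_LAZY) {
		/* Lazy work waits for the next tick unless the tick is stopped. */
		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
		    tick_nohz_tick_stopped())
			arch_irq_work_raise();
	} else {
		/* llist_add() returns true only when the list was empty, so
		 * the IPI is raised once per batch; no irq_work_raised flag. */
		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
			arch_irq_work_raise();
	}
}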
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 369f41a94124..4b8f0c925884 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
33#include <linux/swap.h> 33#include <linux/swap.h>
34#include <linux/syscore_ops.h> 34#include <linux/syscore_ops.h>
35#include <linux/compiler.h> 35#include <linux/compiler.h>
36#include <linux/hugetlb.h>
36 37
37#include <asm/page.h> 38#include <asm/page.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
@@ -1619,6 +1620,9 @@ static int __init crash_save_vmcoreinfo_init(void)
1619#endif 1620#endif
1620 VMCOREINFO_NUMBER(PG_head_mask); 1621 VMCOREINFO_NUMBER(PG_head_mask);
1621 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); 1622 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1623#ifdef CONFIG_HUGETLBFS
1624 VMCOREINFO_SYMBOL(free_huge_page);
1625#endif
1622 1626
1623 arch_crash_save_vmcoreinfo(); 1627 arch_crash_save_vmcoreinfo();
1624 update_vmcoreinfo_note(); 1628 update_vmcoreinfo_note();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3214289df5a7..734e9a7d280b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2037,19 +2037,23 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
2037{ 2037{
2038 unsigned long *iter; 2038 unsigned long *iter;
2039 struct kprobe_blacklist_entry *ent; 2039 struct kprobe_blacklist_entry *ent;
2040 unsigned long offset = 0, size = 0; 2040 unsigned long entry, offset = 0, size = 0;
2041 2041
2042 for (iter = start; iter < end; iter++) { 2042 for (iter = start; iter < end; iter++) {
2043 if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) { 2043 entry = arch_deref_entry_point((void *)*iter);
2044 pr_err("Failed to find blacklist %p\n", (void *)*iter); 2044
2045 if (!kernel_text_address(entry) ||
2046 !kallsyms_lookup_size_offset(entry, &size, &offset)) {
2047 pr_err("Failed to find blacklist at %p\n",
2048 (void *)entry);
2045 continue; 2049 continue;
2046 } 2050 }
2047 2051
2048 ent = kmalloc(sizeof(*ent), GFP_KERNEL); 2052 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
2049 if (!ent) 2053 if (!ent)
2050 return -ENOMEM; 2054 return -ENOMEM;
2051 ent->start_addr = *iter; 2055 ent->start_addr = entry;
2052 ent->end_addr = *iter + size; 2056 ent->end_addr = entry + size;
2053 INIT_LIST_HEAD(&ent->list); 2057 INIT_LIST_HEAD(&ent->list);
2054 list_add_tail(&ent->list, &kprobe_blacklist); 2058 list_add_tail(&ent->list, &kprobe_blacklist);
2055 } 2059 }
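
On architectures that use function descriptors, the raw blacklist symbol value does not point at kernel text, which is why the hunk adds arch_deref_entry_point() and a kernel_text_address() check before the kallsyms lookup. A hedged sketch of the per-entry resolution, keeping only the address handling from the hunk (allocation and list insertion are left out; the helper name is invented for illustration):

#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

/* Sketch: turn one raw blacklist address into a [start, end) text range. */
static int sketch_resolve_blacklist_entry(unsigned long raw,
					  unsigned long *start,
					  unsigned long *end)
{
	unsigned long entry, size = 0, offset = 0;

	/* Dereference a function descriptor where the arch uses them. */
	entry = arch_deref_entry_point((void *)raw);

	if (!kernel_text_address(entry) ||
	    !kallsyms_lookup_size_offset(entry, &size, &offset))
		return -EINVAL;

	*start = entry;
	*end   = entry + size;
	return 0;
}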
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c2390f41307b..ef483220e855 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker,
591 591
592 list_add_tail(&work->node, pos); 592 list_add_tail(&work->node, pos);
593 work->worker = worker; 593 work->worker = worker;
594 if (likely(worker->task)) 594 if (!worker->current_work && likely(worker->task))
595 wake_up_process(worker->task); 595 wake_up_process(worker->task);
596} 596}
597 597
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index d24e4339b46d..88d0d4420ad2 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -384,7 +384,9 @@ static void print_lockdep_off(const char *bug_msg)
384{ 384{
385 printk(KERN_DEBUG "%s\n", bug_msg); 385 printk(KERN_DEBUG "%s\n", bug_msg);
386 printk(KERN_DEBUG "turning off the locking correctness validator.\n"); 386 printk(KERN_DEBUG "turning off the locking correctness validator.\n");
387#ifdef CONFIG_LOCK_STAT
387 printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); 388 printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
389#endif
388} 390}
389 391
390static int save_trace(struct stack_trace *trace) 392static int save_trace(struct stack_trace *trace)
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e00669..9887a905a762 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -1,6 +1,4 @@
1
2#include <linux/percpu.h> 1#include <linux/percpu.h>
3#include <linux/mutex.h>
4#include <linux/sched.h> 2#include <linux/sched.h>
5#include "mcs_spinlock.h" 3#include "mcs_spinlock.h"
6 4
@@ -14,21 +12,47 @@
14 * called from interrupt context and we have preemption disabled while 12 * called from interrupt context and we have preemption disabled while
15 * spinning. 13 * spinning.
16 */ 14 */
17static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); 15static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
16
17/*
18 * We use the value 0 to represent "no CPU", thus the encoded value
19 * will be the CPU number incremented by 1.
20 */
21static inline int encode_cpu(int cpu_nr)
22{
23 return cpu_nr + 1;
24}
25
26static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
27{
28 int cpu_nr = encoded_cpu_val - 1;
29
30 return per_cpu_ptr(&osq_node, cpu_nr);
31}
18 32
19/* 33/*
20 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. 34 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
21 * Can return NULL in case we were the last queued and we updated @lock instead. 35 * Can return NULL in case we were the last queued and we updated @lock instead.
22 */ 36 */
23static inline struct optimistic_spin_queue * 37static inline struct optimistic_spin_node *
24osq_wait_next(struct optimistic_spin_queue **lock, 38osq_wait_next(struct optimistic_spin_queue *lock,
25 struct optimistic_spin_queue *node, 39 struct optimistic_spin_node *node,
26 struct optimistic_spin_queue *prev) 40 struct optimistic_spin_node *prev)
27{ 41{
28 struct optimistic_spin_queue *next = NULL; 42 struct optimistic_spin_node *next = NULL;
43 int curr = encode_cpu(smp_processor_id());
44 int old;
45
46 /*
47 * If there is a prev node in queue, then the 'old' value will be
48 * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
49 * we're currently last in queue, then the queue will then become empty.
50 */
51 old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
29 52
30 for (;;) { 53 for (;;) {
31 if (*lock == node && cmpxchg(lock, node, prev) == node) { 54 if (atomic_read(&lock->tail) == curr &&
55 atomic_cmpxchg(&lock->tail, curr, old) == curr) {
32 /* 56 /*
33 * We were the last queued, we moved @lock back. @prev 57 * We were the last queued, we moved @lock back. @prev
34 * will now observe @lock and will complete its 58 * will now observe @lock and will complete its
@@ -53,24 +77,29 @@ osq_wait_next(struct optimistic_spin_queue **lock,
53 break; 77 break;
54 } 78 }
55 79
56 arch_mutex_cpu_relax(); 80 cpu_relax_lowlatency();
57 } 81 }
58 82
59 return next; 83 return next;
60} 84}
61 85
62bool osq_lock(struct optimistic_spin_queue **lock) 86bool osq_lock(struct optimistic_spin_queue *lock)
63{ 87{
64 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); 88 struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
65 struct optimistic_spin_queue *prev, *next; 89 struct optimistic_spin_node *prev, *next;
90 int curr = encode_cpu(smp_processor_id());
91 int old;
66 92
67 node->locked = 0; 93 node->locked = 0;
68 node->next = NULL; 94 node->next = NULL;
95 node->cpu = curr;
69 96
70 node->prev = prev = xchg(lock, node); 97 old = atomic_xchg(&lock->tail, curr);
71 if (likely(prev == NULL)) 98 if (old == OSQ_UNLOCKED_VAL)
72 return true; 99 return true;
73 100
101 prev = decode_cpu(old);
102 node->prev = prev;
74 ACCESS_ONCE(prev->next) = node; 103 ACCESS_ONCE(prev->next) = node;
75 104
76 /* 105 /*
@@ -89,7 +118,7 @@ bool osq_lock(struct optimistic_spin_queue **lock)
89 if (need_resched()) 118 if (need_resched())
90 goto unqueue; 119 goto unqueue;
91 120
92 arch_mutex_cpu_relax(); 121 cpu_relax_lowlatency();
93 } 122 }
94 return true; 123 return true;
95 124
@@ -115,7 +144,7 @@ unqueue:
115 if (smp_load_acquire(&node->locked)) 144 if (smp_load_acquire(&node->locked))
116 return true; 145 return true;
117 146
118 arch_mutex_cpu_relax(); 147 cpu_relax_lowlatency();
119 148
120 /* 149 /*
121 * Or we race against a concurrent unqueue()'s step-B, in which 150 * Or we race against a concurrent unqueue()'s step-B, in which
@@ -149,20 +178,21 @@ unqueue:
149 return false; 178 return false;
150} 179}
151 180
152void osq_unlock(struct optimistic_spin_queue **lock) 181void osq_unlock(struct optimistic_spin_queue *lock)
153{ 182{
154 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); 183 struct optimistic_spin_node *node, *next;
155 struct optimistic_spin_queue *next; 184 int curr = encode_cpu(smp_processor_id());
156 185
157 /* 186 /*
158 * Fast path for the uncontended case. 187 * Fast path for the uncontended case.
159 */ 188 */
160 if (likely(cmpxchg(lock, node, NULL) == node)) 189 if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
161 return; 190 return;
162 191
163 /* 192 /*
164 * Second most likely case. 193 * Second most likely case.
165 */ 194 */
195 node = this_cpu_ptr(&osq_node);
166 next = xchg(&node->next, NULL); 196 next = xchg(&node->next, NULL);
167 if (next) { 197 if (next) {
168 ACCESS_ONCE(next->locked) = 1; 198 ACCESS_ONCE(next->locked) = 1;
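
The rewrite above shrinks the OSQ lock word from a node pointer to an atomic CPU index: 0 (OSQ_UNLOCKED_VAL) means "no waiter", any other value is the waiter's CPU number plus one, decoded back into the per-CPU optimistic_spin_node when needed. A small sketch of that convention; osq_is_locked() is added here purely for illustration and is not part of the hunk:

#include <linux/atomic.h>

#define SKETCH_OSQ_UNLOCKED_VAL	(0)

/* Matches the helpers in the hunk: CPU numbers start at 0, so the
 * stored value is cpu + 1 and 0 is reserved for "empty queue". */
static inline int sketch_encode_cpu(int cpu_nr)
{
	return cpu_nr + 1;
}

static inline int sketch_decode_cpu(int encoded_cpu_val)
{
	return encoded_cpu_val - 1;
}

/* Illustrative helper: the queue is contended whenever the tail holds
 * a non-zero encoded CPU value. */
static inline bool sketch_osq_is_locked(atomic_t *tail)
{
	return atomic_read(tail) != SKETCH_OSQ_UNLOCKED_VAL;
}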
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4aca6b..23e89c5930e9 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -27,7 +27,7 @@ struct mcs_spinlock {
27#define arch_mcs_spin_lock_contended(l) \ 27#define arch_mcs_spin_lock_contended(l) \
28do { \ 28do { \
29 while (!(smp_load_acquire(l))) \ 29 while (!(smp_load_acquire(l))) \
30 arch_mutex_cpu_relax(); \ 30 cpu_relax_lowlatency(); \
31} while (0) 31} while (0)
32#endif 32#endif
33 33
@@ -104,7 +104,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
104 return; 104 return;
105 /* Wait until the next pointer is set */ 105 /* Wait until the next pointer is set */
106 while (!(next = ACCESS_ONCE(node->next))) 106 while (!(next = ACCESS_ONCE(node->next)))
107 arch_mutex_cpu_relax(); 107 cpu_relax_lowlatency();
108 } 108 }
109 109
110 /* Pass lock to next waiter. */ 110 /* Pass lock to next waiter. */
@@ -118,12 +118,13 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
118 * mutex_lock()/rwsem_down_{read,write}() etc. 118 * mutex_lock()/rwsem_down_{read,write}() etc.
119 */ 119 */
120 120
121struct optimistic_spin_queue { 121struct optimistic_spin_node {
122 struct optimistic_spin_queue *next, *prev; 122 struct optimistic_spin_node *next, *prev;
123 int locked; /* 1 if lock acquired */ 123 int locked; /* 1 if lock acquired */
124 int cpu; /* encoded CPU # value */
124}; 125};
125 126
126extern bool osq_lock(struct optimistic_spin_queue **lock); 127extern bool osq_lock(struct optimistic_spin_queue *lock);
127extern void osq_unlock(struct optimistic_spin_queue **lock); 128extern void osq_unlock(struct optimistic_spin_queue *lock);
128 129
129#endif /* __LINUX_MCS_SPINLOCK_H */ 130#endif /* __LINUX_MCS_SPINLOCK_H */
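
Only the per-CPU optimistic_spin_node is visible in this header hunk; the optimistic_spin_queue type it now operates on, and the osq_lock_init() used in mutex.c below, are defined elsewhere in the series. A sketch of what they would look like, under that assumption, using the encoded-CPU tail shown in the mcs_spinlock.c hunk:

#include <linux/atomic.h>

/* Assumed shape of the queue type: a single atomic word replaces the
 * old struct optimistic_spin_queue *lock pointer embedded in the mutex. */
struct sketch_optimistic_spin_queue {
	atomic_t tail;	/* 0 == no waiter, else encoded CPU number + 1 */
};

static inline void sketch_osq_lock_init(struct sketch_optimistic_spin_queue *lock)
{
	atomic_set(&lock->tail, 0 /* OSQ_UNLOCKED_VAL */);
}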
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33c6760..ae712b25e492 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -46,12 +46,6 @@
46# include <asm/mutex.h> 46# include <asm/mutex.h>
47#endif 47#endif
48 48
49/*
50 * A negative mutex count indicates that waiters are sleeping waiting for the
51 * mutex.
52 */
53#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
54
55void 49void
56__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) 50__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
57{ 51{
@@ -60,7 +54,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
60 INIT_LIST_HEAD(&lock->wait_list); 54 INIT_LIST_HEAD(&lock->wait_list);
61 mutex_clear_owner(lock); 55 mutex_clear_owner(lock);
62#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 56#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
63 lock->osq = NULL; 57 osq_lock_init(&lock->osq);
64#endif 58#endif
65 59
66 debug_mutex_init(lock, name, key); 60 debug_mutex_init(lock, name, key);
@@ -152,7 +146,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
152 if (need_resched()) 146 if (need_resched())
153 break; 147 break;
154 148
155 arch_mutex_cpu_relax(); 149 cpu_relax_lowlatency();
156 } 150 }
157 rcu_read_unlock(); 151 rcu_read_unlock();
158 152
@@ -388,12 +382,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
388 /* 382 /*
389 * Optimistic spinning. 383 * Optimistic spinning.
390 * 384 *
391 * We try to spin for acquisition when we find that there are no 385 * We try to spin for acquisition when we find that the lock owner
392 * pending waiters and the lock owner is currently running on a 386 * is currently running on a (different) CPU and while we don't
393 * (different) CPU. 387 * need to reschedule. The rationale is that if the lock owner is
394 * 388 * running, it is likely to release the lock soon.
395 * The rationale is that if the lock owner is running, it is likely to
396 * release the lock soon.
397 * 389 *
398 * Since this needs the lock owner, and this mutex implementation 390 * Since this needs the lock owner, and this mutex implementation
399 * doesn't track the owner atomically in the lock field, we need to 391 * doesn't track the owner atomically in the lock field, we need to
@@ -440,7 +432,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
440 if (owner && !mutex_spin_on_owner(lock, owner)) 432 if (owner && !mutex_spin_on_owner(lock, owner))
441 break; 433 break;
442 434
443 if ((atomic_read(&lock->count) == 1) && 435 /* Try to acquire the mutex if it is unlocked. */
436 if (!mutex_is_locked(lock) &&
444 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 437 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
445 lock_acquired(&lock->dep_map, ip); 438 lock_acquired(&lock->dep_map, ip);
446 if (use_ww_ctx) { 439 if (use_ww_ctx) {
@@ -471,7 +464,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
471 * memory barriers as we'll eventually observe the right 464 * memory barriers as we'll eventually observe the right
472 * values at the cost of a few extra spins. 465 * values at the cost of a few extra spins.
473 */ 466 */
474 arch_mutex_cpu_relax(); 467 cpu_relax_lowlatency();
475 } 468 }
476 osq_unlock(&lock->osq); 469 osq_unlock(&lock->osq);
477slowpath: 470slowpath:
@@ -485,8 +478,11 @@ slowpath:
485#endif 478#endif
486 spin_lock_mutex(&lock->wait_lock, flags); 479 spin_lock_mutex(&lock->wait_lock, flags);
487 480
488 /* once more, can we acquire the lock? */ 481 /*
489 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) 482 * Once more, try to acquire the lock. Only try-lock the mutex if
483 * it is unlocked to reduce unnecessary xchg() operations.
484 */
485 if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))
490 goto skip_wait; 486 goto skip_wait;
491 487
492 debug_mutex_lock_common(lock, &waiter); 488 debug_mutex_lock_common(lock, &waiter);
@@ -506,9 +502,10 @@ slowpath:
506 * it's unlocked. Later on, if we sleep, this is the 502 * it's unlocked. Later on, if we sleep, this is the
507 * operation that gives us the lock. We xchg it to -1, so 503 * operation that gives us the lock. We xchg it to -1, so
508 * that when we release the lock, we properly wake up the 504 * that when we release the lock, we properly wake up the
509 * other waiters: 505 * other waiters. We only attempt the xchg if the count is
506 * non-negative in order to avoid unnecessary xchg operations:
510 */ 507 */
511 if (MUTEX_SHOW_NO_WAITER(lock) && 508 if (atomic_read(&lock->count) >= 0 &&
512 (atomic_xchg(&lock->count, -1) == 1)) 509 (atomic_xchg(&lock->count, -1) == 1))
513 break; 510 break;
514 511
@@ -823,6 +820,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
823 unsigned long flags; 820 unsigned long flags;
824 int prev; 821 int prev;
825 822
823 /* No need to trylock if the mutex is locked. */
824 if (mutex_is_locked(lock))
825 return 0;
826
826 spin_lock_mutex(&lock->wait_lock, flags); 827 spin_lock_mutex(&lock->wait_lock, flags);
827 828
828 prev = atomic_xchg(&lock->count, -1); 829 prev = atomic_xchg(&lock->count, -1);
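
The MUTEX_SHOW_NO_WAITER() macro is gone; its call sites now read the counter directly (1 means unlocked, 0 locked with no waiters, negative means waiters are queued) so the expensive xchg/cmpxchg is only attempted when it can actually succeed. A condensed sketch of that check-before-atomic pattern, assuming the standard mutex counter semantics; the helper name is illustrative only:

#include <linux/atomic.h>
#include <linux/mutex.h>

/* Sketch: opportunistic acquisition as used in the spinning and
 * slowpath hunks above. Reading the counter first keeps the cache
 * line shared while the lock is visibly held. */
static inline bool sketch_mutex_try_fast(struct mutex *lock)
{
	if (mutex_is_locked(lock))	/* count <= 0: don't dirty the line */
		return false;

	/* A 1 -> 0 transition means we took an uncontended mutex. */
	return atomic_cmpxchg(&lock->count, 1, 0) == 1;
}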
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fb5b8ac411a5..f956ede7f90d 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -20,7 +20,6 @@
20#include <linux/cpumask.h> 20#include <linux/cpumask.h>
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/mutex.h>
24#include <asm/qrwlock.h> 23#include <asm/qrwlock.h>
25 24
26/** 25/**
@@ -35,7 +34,7 @@ static __always_inline void
35rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) 34rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
36{ 35{
37 while ((cnts & _QW_WMASK) == _QW_LOCKED) { 36 while ((cnts & _QW_WMASK) == _QW_LOCKED) {
38 arch_mutex_cpu_relax(); 37 cpu_relax_lowlatency();
39 cnts = smp_load_acquire((u32 *)&lock->cnts); 38 cnts = smp_load_acquire((u32 *)&lock->cnts);
40 } 39 }
41} 40}
@@ -75,7 +74,7 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
75 * to make sure that the write lock isn't taken. 74 * to make sure that the write lock isn't taken.
76 */ 75 */
77 while (atomic_read(&lock->cnts) & _QW_WMASK) 76 while (atomic_read(&lock->cnts) & _QW_WMASK)
78 arch_mutex_cpu_relax(); 77 cpu_relax_lowlatency();
79 78
80 cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; 79 cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
81 rspin_until_writer_unlock(lock, cnts); 80 rspin_until_writer_unlock(lock, cnts);
@@ -114,7 +113,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
114 cnts | _QW_WAITING) == cnts)) 113 cnts | _QW_WAITING) == cnts))
115 break; 114 break;
116 115
117 arch_mutex_cpu_relax(); 116 cpu_relax_lowlatency();
118 } 117 }
119 118
120 /* When no more readers, set the locked flag */ 119 /* When no more readers, set the locked flag */
@@ -125,7 +124,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
125 _QW_LOCKED) == _QW_WAITING)) 124 _QW_LOCKED) == _QW_WAITING))
126 break; 125 break;
127 126
128 arch_mutex_cpu_relax(); 127 cpu_relax_lowlatency();
129 } 128 }
130unlock: 129unlock:
131 arch_spin_unlock(&lock->lock); 130 arch_spin_unlock(&lock->lock);
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 49b2ed3dced8..62b6cee8ea7f 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -66,12 +66,13 @@ void rt_mutex_debug_task_free(struct task_struct *task)
66 * the deadlock. We print when we return. act_waiter can be NULL in 66 * the deadlock. We print when we return. act_waiter can be NULL in
67 * case of a remove waiter operation. 67 * case of a remove waiter operation.
68 */ 68 */
69void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, 69void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
70 struct rt_mutex_waiter *act_waiter,
70 struct rt_mutex *lock) 71 struct rt_mutex *lock)
71{ 72{
72 struct task_struct *task; 73 struct task_struct *task;
73 74
74 if (!debug_locks || detect || !act_waiter) 75 if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
75 return; 76 return;
76 77
77 task = rt_mutex_owner(act_waiter->lock); 78 task = rt_mutex_owner(act_waiter->lock);
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index ab29b6a22669..d0519c3432b6 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -20,14 +20,15 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
20extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, 20extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
21 struct task_struct *powner); 21 struct task_struct *powner);
22extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); 22extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
23extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, 23extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
24 struct rt_mutex_waiter *waiter,
24 struct rt_mutex *lock); 25 struct rt_mutex *lock);
25extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); 26extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
26# define debug_rt_mutex_reset_waiter(w) \ 27# define debug_rt_mutex_reset_waiter(w) \
27 do { (w)->deadlock_lock = NULL; } while (0) 28 do { (w)->deadlock_lock = NULL; } while (0)
28 29
29static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, 30static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
30 int detect) 31 enum rtmutex_chainwalk walk)
31{ 32{
32 return (waiter != NULL); 33 return (waiter != NULL);
33} 34}
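
Both debug helpers now take an enum rtmutex_chainwalk instead of the old int detect flag. The enum itself is not shown in these hunks (it presumably lives in rtmutex_common.h, which this series also touches); the sketch below reconstructs it from the two values used throughout the rtmutex.c diff that follows:

/* Assumed definition, reconstructed from its uses below. */
enum rtmutex_chainwalk {
	RT_MUTEX_MIN_CHAINWALK,		/* stop walking once boosting is done */
	RT_MUTEX_FULL_CHAINWALK,	/* walk the whole chain for deadlock detection */
};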
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index fc605941b9b8..a0ea2a141b3b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -308,6 +308,32 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
308} 308}
309 309
310/* 310/*
311 * Deadlock detection is conditional:
312 *
313 * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
314 * if the detect argument is == RT_MUTEX_FULL_CHAINWALK.
315 *
316 * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always
317 * conducted independent of the detect argument.
318 *
319 * If the waiter argument is NULL this indicates the deboost path and
320 * deadlock detection is disabled independent of the detect argument
321 * and the config settings.
322 */
323static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
324 enum rtmutex_chainwalk chwalk)
325{
326 /*
327 * This is just a wrapper function for the following call,
328 * because debug_rt_mutex_detect_deadlock() smells like a magic
329 * debug feature and I wanted to keep the cond function in the
330 * main source file along with the comments instead of having
331 * two of the same in the headers.
332 */
333 return debug_rt_mutex_detect_deadlock(waiter, chwalk);
334}
335
336/*
311 * Max number of times we'll walk the boosting chain: 337 * Max number of times we'll walk the boosting chain:
312 */ 338 */
313int max_lock_depth = 1024; 339int max_lock_depth = 1024;
@@ -337,21 +363,65 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
337 * @top_task: the current top waiter 363 * @top_task: the current top waiter
338 * 364 *
339 * Returns 0 or -EDEADLK. 365 * Returns 0 or -EDEADLK.
366 *
367 * Chain walk basics and protection scope
368 *
369 * [R] refcount on task
370 * [P] task->pi_lock held
371 * [L] rtmutex->wait_lock held
372 *
373 * Step Description Protected by
374 * function arguments:
375 * @task [R]
376 * @orig_lock if != NULL @top_task is blocked on it
377 * @next_lock Unprotected. Cannot be
378 * dereferenced. Only used for
379 * comparison.
380 * @orig_waiter if != NULL @top_task is blocked on it
381 * @top_task current, or in case of proxy
382 * locking protected by calling
383 * code
384 * again:
385 * loop_sanity_check();
386 * retry:
387 * [1] lock(task->pi_lock); [R] acquire [P]
388 * [2] waiter = task->pi_blocked_on; [P]
389 * [3] check_exit_conditions_1(); [P]
390 * [4] lock = waiter->lock; [P]
391 * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
392 * unlock(task->pi_lock); release [P]
393 * goto retry;
394 * }
395 * [6] check_exit_conditions_2(); [P] + [L]
396 * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
397 * [8] unlock(task->pi_lock); release [P]
398 * put_task_struct(task); release [R]
399 * [9] check_exit_conditions_3(); [L]
400 * [10] task = owner(lock); [L]
401 * get_task_struct(task); [L] acquire [R]
402 * lock(task->pi_lock); [L] acquire [P]
403 * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
404 * [12] check_exit_conditions_4(); [P] + [L]
405 * [13] unlock(task->pi_lock); release [P]
406 * unlock(lock->wait_lock); release [L]
407 * goto again;
340 */ 408 */
341static int rt_mutex_adjust_prio_chain(struct task_struct *task, 409static int rt_mutex_adjust_prio_chain(struct task_struct *task,
342 int deadlock_detect, 410 enum rtmutex_chainwalk chwalk,
343 struct rt_mutex *orig_lock, 411 struct rt_mutex *orig_lock,
344 struct rt_mutex *next_lock, 412 struct rt_mutex *next_lock,
345 struct rt_mutex_waiter *orig_waiter, 413 struct rt_mutex_waiter *orig_waiter,
346 struct task_struct *top_task) 414 struct task_struct *top_task)
347{ 415{
348 struct rt_mutex *lock;
349 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; 416 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
350 int detect_deadlock, ret = 0, depth = 0; 417 struct rt_mutex_waiter *prerequeue_top_waiter;
418 int ret = 0, depth = 0;
419 struct rt_mutex *lock;
420 bool detect_deadlock;
351 unsigned long flags; 421 unsigned long flags;
422 bool requeue = true;
352 423
353 detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, 424 detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
354 deadlock_detect);
355 425
356 /* 426 /*
357 * The (de)boosting is a step by step approach with a lot of 427 * The (de)boosting is a step by step approach with a lot of
@@ -360,6 +430,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
360 * carefully whether things change under us. 430 * carefully whether things change under us.
361 */ 431 */
362 again: 432 again:
433 /*
434 * We limit the lock chain length for each invocation.
435 */
363 if (++depth > max_lock_depth) { 436 if (++depth > max_lock_depth) {
364 static int prev_max; 437 static int prev_max;
365 438
@@ -377,13 +450,28 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
377 450
378 return -EDEADLK; 451 return -EDEADLK;
379 } 452 }
453
454 /*
455 * We are fully preemptible here and only hold the refcount on
456 * @task. So everything can have changed under us since the
457 * caller or our own code below (goto retry/again) dropped all
458 * locks.
459 */
380 retry: 460 retry:
381 /* 461 /*
382 * Task can not go away as we did a get_task() before ! 462 * [1] Task cannot go away as we did a get_task() before !
383 */ 463 */
384 raw_spin_lock_irqsave(&task->pi_lock, flags); 464 raw_spin_lock_irqsave(&task->pi_lock, flags);
385 465
466 /*
467 * [2] Get the waiter on which @task is blocked on.
468 */
386 waiter = task->pi_blocked_on; 469 waiter = task->pi_blocked_on;
470
471 /*
472 * [3] check_exit_conditions_1() protected by task->pi_lock.
473 */
474
387 /* 475 /*
388 * Check whether the end of the boosting chain has been 476 * Check whether the end of the boosting chain has been
389 * reached or the state of the chain has changed while we 477 * reached or the state of the chain has changed while we
@@ -421,20 +509,41 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
421 goto out_unlock_pi; 509 goto out_unlock_pi;
422 /* 510 /*
423 * If deadlock detection is off, we stop here if we 511 * If deadlock detection is off, we stop here if we
424 * are not the top pi waiter of the task. 512 * are not the top pi waiter of the task. If deadlock
513 * detection is enabled we continue, but stop the
514 * requeueing in the chain walk.
425 */ 515 */
426 if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) 516 if (top_waiter != task_top_pi_waiter(task)) {
427 goto out_unlock_pi; 517 if (!detect_deadlock)
518 goto out_unlock_pi;
519 else
520 requeue = false;
521 }
428 } 522 }
429 523
430 /* 524 /*
431 * When deadlock detection is off then we check, if further 525 * If the waiter priority is the same as the task priority
432 * priority adjustment is necessary. 526 * then there is no further priority adjustment necessary. If
527 * deadlock detection is off, we stop the chain walk. If its
528 * enabled we continue, but stop the requeueing in the chain
529 * walk.
433 */ 530 */
434 if (!detect_deadlock && waiter->prio == task->prio) 531 if (waiter->prio == task->prio) {
435 goto out_unlock_pi; 532 if (!detect_deadlock)
533 goto out_unlock_pi;
534 else
535 requeue = false;
536 }
436 537
538 /*
539 * [4] Get the next lock
540 */
437 lock = waiter->lock; 541 lock = waiter->lock;
542 /*
543 * [5] We need to trylock here as we are holding task->pi_lock,
544 * which is the reverse lock order versus the other rtmutex
545 * operations.
546 */
438 if (!raw_spin_trylock(&lock->wait_lock)) { 547 if (!raw_spin_trylock(&lock->wait_lock)) {
439 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 548 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
440 cpu_relax(); 549 cpu_relax();
@@ -442,79 +551,180 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
442 } 551 }
443 552
444 /* 553 /*
554 * [6] check_exit_conditions_2() protected by task->pi_lock and
555 * lock->wait_lock.
556 *
445 * Deadlock detection. If the lock is the same as the original 557 * Deadlock detection. If the lock is the same as the original
446 * lock which caused us to walk the lock chain or if the 558 * lock which caused us to walk the lock chain or if the
447 * current lock is owned by the task which initiated the chain 559 * current lock is owned by the task which initiated the chain
448 * walk, we detected a deadlock. 560 * walk, we detected a deadlock.
449 */ 561 */
450 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 562 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
451 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 563 debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
452 raw_spin_unlock(&lock->wait_lock); 564 raw_spin_unlock(&lock->wait_lock);
453 ret = -EDEADLK; 565 ret = -EDEADLK;
454 goto out_unlock_pi; 566 goto out_unlock_pi;
455 } 567 }
456 568
457 top_waiter = rt_mutex_top_waiter(lock); 569 /*
570 * If we just follow the lock chain for deadlock detection, no
571 * need to do all the requeue operations. To avoid a truckload
572 * of conditionals around the various places below, just do the
573 * minimum chain walk checks.
574 */
575 if (!requeue) {
576 /*
577 * No requeue[7] here. Just release @task [8]
578 */
579 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
580 put_task_struct(task);
581
582 /*
583 * [9] check_exit_conditions_3 protected by lock->wait_lock.
584 * If there is no owner of the lock, end of chain.
585 */
586 if (!rt_mutex_owner(lock)) {
587 raw_spin_unlock(&lock->wait_lock);
588 return 0;
589 }
590
591 /* [10] Grab the next task, i.e. owner of @lock */
592 task = rt_mutex_owner(lock);
593 get_task_struct(task);
594 raw_spin_lock_irqsave(&task->pi_lock, flags);
595
596 /*
597 * No requeue [11] here. We just do deadlock detection.
598 *
599 * [12] Store whether owner is blocked
600 * itself. Decision is made after dropping the locks
601 */
602 next_lock = task_blocked_on_lock(task);
603 /*
604 * Get the top waiter for the next iteration
605 */
606 top_waiter = rt_mutex_top_waiter(lock);
607
608 /* [13] Drop locks */
609 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
610 raw_spin_unlock(&lock->wait_lock);
611
612 /* If owner is not blocked, end of chain. */
613 if (!next_lock)
614 goto out_put_task;
615 goto again;
616 }
458 617
459 /* Requeue the waiter */ 618 /*
619 * Store the current top waiter before doing the requeue
620 * operation on @lock. We need it for the boost/deboost
621 * decision below.
622 */
623 prerequeue_top_waiter = rt_mutex_top_waiter(lock);
624
625 /* [7] Requeue the waiter in the lock waiter list. */
460 rt_mutex_dequeue(lock, waiter); 626 rt_mutex_dequeue(lock, waiter);
461 waiter->prio = task->prio; 627 waiter->prio = task->prio;
462 rt_mutex_enqueue(lock, waiter); 628 rt_mutex_enqueue(lock, waiter);
463 629
464 /* Release the task */ 630 /* [8] Release the task */
465 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 631 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
632 put_task_struct(task);
633
634 /*
635 * [9] check_exit_conditions_3 protected by lock->wait_lock.
636 *
637 * We must abort the chain walk if there is no lock owner even
638 * in the dead lock detection case, as we have nothing to
639 * follow here. This is the end of the chain we are walking.
640 */
466 if (!rt_mutex_owner(lock)) { 641 if (!rt_mutex_owner(lock)) {
467 /* 642 /*
468 * If the requeue above changed the top waiter, then we need 643 * If the requeue [7] above changed the top waiter,
469 * to wake the new top waiter up to try to get the lock. 644 * then we need to wake the new top waiter up to try
645 * to get the lock.
470 */ 646 */
471 647 if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
472 if (top_waiter != rt_mutex_top_waiter(lock))
473 wake_up_process(rt_mutex_top_waiter(lock)->task); 648 wake_up_process(rt_mutex_top_waiter(lock)->task);
474 raw_spin_unlock(&lock->wait_lock); 649 raw_spin_unlock(&lock->wait_lock);
475 goto out_put_task; 650 return 0;
476 } 651 }
477 put_task_struct(task);
478 652
479 /* Grab the next task */ 653 /* [10] Grab the next task, i.e. the owner of @lock */
480 task = rt_mutex_owner(lock); 654 task = rt_mutex_owner(lock);
481 get_task_struct(task); 655 get_task_struct(task);
482 raw_spin_lock_irqsave(&task->pi_lock, flags); 656 raw_spin_lock_irqsave(&task->pi_lock, flags);
483 657
658 /* [11] requeue the pi waiters if necessary */
484 if (waiter == rt_mutex_top_waiter(lock)) { 659 if (waiter == rt_mutex_top_waiter(lock)) {
485 /* Boost the owner */ 660 /*
486 rt_mutex_dequeue_pi(task, top_waiter); 661 * The waiter became the new top (highest priority)
662 * waiter on the lock. Replace the previous top waiter
663 * in the owner tasks pi waiters list with this waiter
664 * and adjust the priority of the owner.
665 */
666 rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
487 rt_mutex_enqueue_pi(task, waiter); 667 rt_mutex_enqueue_pi(task, waiter);
488 __rt_mutex_adjust_prio(task); 668 __rt_mutex_adjust_prio(task);
489 669
490 } else if (top_waiter == waiter) { 670 } else if (prerequeue_top_waiter == waiter) {
491 /* Deboost the owner */ 671 /*
672 * The waiter was the top waiter on the lock, but is
 673 * no longer the top priority waiter. Replace waiter in
674 * the owner tasks pi waiters list with the new top
675 * (highest priority) waiter and adjust the priority
676 * of the owner.
677 * The new top waiter is stored in @waiter so that
678 * @waiter == @top_waiter evaluates to true below and
679 * we continue to deboost the rest of the chain.
680 */
492 rt_mutex_dequeue_pi(task, waiter); 681 rt_mutex_dequeue_pi(task, waiter);
493 waiter = rt_mutex_top_waiter(lock); 682 waiter = rt_mutex_top_waiter(lock);
494 rt_mutex_enqueue_pi(task, waiter); 683 rt_mutex_enqueue_pi(task, waiter);
495 __rt_mutex_adjust_prio(task); 684 __rt_mutex_adjust_prio(task);
685 } else {
686 /*
687 * Nothing changed. No need to do any priority
688 * adjustment.
689 */
496 } 690 }
497 691
498 /* 692 /*
693 * [12] check_exit_conditions_4() protected by task->pi_lock
694 * and lock->wait_lock. The actual decisions are made after we
695 * dropped the locks.
696 *
499 * Check whether the task which owns the current lock is pi 697 * Check whether the task which owns the current lock is pi
500 * blocked itself. If yes we store a pointer to the lock for 698 * blocked itself. If yes we store a pointer to the lock for
501 * the lock chain change detection above. After we dropped 699 * the lock chain change detection above. After we dropped
502 * task->pi_lock next_lock cannot be dereferenced anymore. 700 * task->pi_lock next_lock cannot be dereferenced anymore.
503 */ 701 */
504 next_lock = task_blocked_on_lock(task); 702 next_lock = task_blocked_on_lock(task);
703 /*
704 * Store the top waiter of @lock for the end of chain walk
705 * decision below.
706 */
707 top_waiter = rt_mutex_top_waiter(lock);
505 708
709 /* [13] Drop the locks */
506 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 710 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
507
508 top_waiter = rt_mutex_top_waiter(lock);
509 raw_spin_unlock(&lock->wait_lock); 711 raw_spin_unlock(&lock->wait_lock);
510 712
511 /* 713 /*
714 * Make the actual exit decisions [12], based on the stored
715 * values.
716 *
512 * We reached the end of the lock chain. Stop right here. No 717 * We reached the end of the lock chain. Stop right here. No
513 * point to go back just to figure that out. 718 * point to go back just to figure that out.
514 */ 719 */
515 if (!next_lock) 720 if (!next_lock)
516 goto out_put_task; 721 goto out_put_task;
517 722
723 /*
724 * If the current waiter is not the top waiter on the lock,
725 * then we can stop the chain walk here if we are not in full
726 * deadlock detection mode.
727 */
518 if (!detect_deadlock && waiter != top_waiter) 728 if (!detect_deadlock && waiter != top_waiter)
519 goto out_put_task; 729 goto out_put_task;
520 730
@@ -533,76 +743,119 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
533 * 743 *
534 * Must be called with lock->wait_lock held. 744 * Must be called with lock->wait_lock held.
535 * 745 *
536 * @lock: the lock to be acquired. 746 * @lock: The lock to be acquired.
537 * @task: the task which wants to acquire the lock 747 * @task: The task which wants to acquire the lock
538 * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) 748 * @waiter: The waiter that is queued to the lock's wait list if the
749 * callsite called task_blocked_on_lock(), otherwise NULL
539 */ 750 */
540static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, 751static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
541 struct rt_mutex_waiter *waiter) 752 struct rt_mutex_waiter *waiter)
542{ 753{
754 unsigned long flags;
755
543 /* 756 /*
544 * We have to be careful here if the atomic speedups are 757 * Before testing whether we can acquire @lock, we set the
545 * enabled, such that, when 758 * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
546 * - no other waiter is on the lock 759 * other tasks which try to modify @lock into the slow path
547 * - the lock has been released since we did the cmpxchg 760 * and they serialize on @lock->wait_lock.
548 * the lock can be released or taken while we are doing the 761 *
549 * checks and marking the lock with RT_MUTEX_HAS_WAITERS. 762 * The RT_MUTEX_HAS_WAITERS bit can have a transitional state
763 * as explained at the top of this file if and only if:
550 * 764 *
551 * The atomic acquire/release aware variant of 765 * - There is a lock owner. The caller must fixup the
552 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting 766 * transient state if it does a trylock or leaves the lock
553 * the WAITERS bit, the atomic release / acquire can not 767 * function due to a signal or timeout.
554 * happen anymore and lock->wait_lock protects us from the
555 * non-atomic case.
556 * 768 *
557 * Note, that this might set lock->owner = 769 * - @task acquires the lock and there are no other
558 * RT_MUTEX_HAS_WAITERS in the case the lock is not contended 770 * waiters. This is undone in rt_mutex_set_owner(@task) at
559 * any more. This is fixed up when we take the ownership. 771 * the end of this function.
560 * This is the transitional state explained at the top of this file.
561 */ 772 */
562 mark_rt_mutex_waiters(lock); 773 mark_rt_mutex_waiters(lock);
563 774
775 /*
776 * If @lock has an owner, give up.
777 */
564 if (rt_mutex_owner(lock)) 778 if (rt_mutex_owner(lock))
565 return 0; 779 return 0;
566 780
567 /* 781 /*
568 * It will get the lock because of one of these conditions: 782 * If @waiter != NULL, @task has already enqueued the waiter
569 * 1) there is no waiter 783 * into @lock waiter list. If @waiter == NULL then this is a
570 * 2) higher priority than waiters 784 * trylock attempt.
571 * 3) it is top waiter
572 */ 785 */
573 if (rt_mutex_has_waiters(lock)) { 786 if (waiter) {
574 if (task->prio >= rt_mutex_top_waiter(lock)->prio) { 787 /*
575 if (!waiter || waiter != rt_mutex_top_waiter(lock)) 788 * If waiter is not the highest priority waiter of
576 return 0; 789 * @lock, give up.
577 } 790 */
578 } 791 if (waiter != rt_mutex_top_waiter(lock))
579 792 return 0;
580 if (waiter || rt_mutex_has_waiters(lock)) {
581 unsigned long flags;
582 struct rt_mutex_waiter *top;
583
584 raw_spin_lock_irqsave(&task->pi_lock, flags);
585 793
586 /* remove the queued waiter. */ 794 /*
587 if (waiter) { 795 * We can acquire the lock. Remove the waiter from the
588 rt_mutex_dequeue(lock, waiter); 796 * lock waiters list.
589 task->pi_blocked_on = NULL; 797 */
590 } 798 rt_mutex_dequeue(lock, waiter);
591 799
800 } else {
592 /* 801 /*
593 * We have to enqueue the top waiter(if it exists) into 802 * If the lock has waiters already we check whether @task is
594 * task->pi_waiters list. 803 * eligible to take over the lock.
804 *
805 * If there are no other waiters, @task can acquire
806 * the lock. @task->pi_blocked_on is NULL, so it does
807 * not need to be dequeued.
595 */ 808 */
596 if (rt_mutex_has_waiters(lock)) { 809 if (rt_mutex_has_waiters(lock)) {
597 top = rt_mutex_top_waiter(lock); 810 /*
598 rt_mutex_enqueue_pi(task, top); 811 * If @task->prio is greater than or equal to
812 * the top waiter priority (kernel view),
813 * @task lost.
814 */
815 if (task->prio >= rt_mutex_top_waiter(lock)->prio)
816 return 0;
817
818 /*
819 * The current top waiter stays enqueued. We
820 * don't have to change anything in the lock
821 * waiters order.
822 */
823 } else {
824 /*
825 * No waiters. Take the lock without the
 826 * pi_lock dance. @task->pi_blocked_on is NULL
827 * and we have no waiters to enqueue in @task
828 * pi waiters list.
829 */
830 goto takeit;
599 } 831 }
600 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
601 } 832 }
602 833
834 /*
835 * Clear @task->pi_blocked_on. Requires protection by
836 * @task->pi_lock. Redundant operation for the @waiter == NULL
837 * case, but conditionals are more expensive than a redundant
838 * store.
839 */
840 raw_spin_lock_irqsave(&task->pi_lock, flags);
841 task->pi_blocked_on = NULL;
842 /*
843 * Finish the lock acquisition. @task is the new owner. If
844 * other waiters exist we have to insert the highest priority
845 * waiter into @task->pi_waiters list.
846 */
847 if (rt_mutex_has_waiters(lock))
848 rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
849 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
850
851takeit:
603 /* We got the lock. */ 852 /* We got the lock. */
604 debug_rt_mutex_lock(lock); 853 debug_rt_mutex_lock(lock);
605 854
855 /*
856 * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
857 * are still waiters or clears it.
858 */
606 rt_mutex_set_owner(lock, task); 859 rt_mutex_set_owner(lock, task);
607 860
608 rt_mutex_deadlock_account_lock(lock, task); 861 rt_mutex_deadlock_account_lock(lock, task);
@@ -620,7 +873,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
620static int task_blocks_on_rt_mutex(struct rt_mutex *lock, 873static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
621 struct rt_mutex_waiter *waiter, 874 struct rt_mutex_waiter *waiter,
622 struct task_struct *task, 875 struct task_struct *task,
623 int detect_deadlock) 876 enum rtmutex_chainwalk chwalk)
624{ 877{
625 struct task_struct *owner = rt_mutex_owner(lock); 878 struct task_struct *owner = rt_mutex_owner(lock);
626 struct rt_mutex_waiter *top_waiter = waiter; 879 struct rt_mutex_waiter *top_waiter = waiter;
@@ -666,7 +919,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
666 __rt_mutex_adjust_prio(owner); 919 __rt_mutex_adjust_prio(owner);
667 if (owner->pi_blocked_on) 920 if (owner->pi_blocked_on)
668 chain_walk = 1; 921 chain_walk = 1;
669 } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { 922 } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
670 chain_walk = 1; 923 chain_walk = 1;
671 } 924 }
672 925
@@ -691,7 +944,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
691 944
692 raw_spin_unlock(&lock->wait_lock); 945 raw_spin_unlock(&lock->wait_lock);
693 946
694 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, 947 res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
695 next_lock, waiter, task); 948 next_lock, waiter, task);
696 949
697 raw_spin_lock(&lock->wait_lock); 950 raw_spin_lock(&lock->wait_lock);
@@ -753,9 +1006,9 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
753static void remove_waiter(struct rt_mutex *lock, 1006static void remove_waiter(struct rt_mutex *lock,
754 struct rt_mutex_waiter *waiter) 1007 struct rt_mutex_waiter *waiter)
755{ 1008{
756 int first = (waiter == rt_mutex_top_waiter(lock)); 1009 bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
757 struct task_struct *owner = rt_mutex_owner(lock); 1010 struct task_struct *owner = rt_mutex_owner(lock);
758 struct rt_mutex *next_lock = NULL; 1011 struct rt_mutex *next_lock;
759 unsigned long flags; 1012 unsigned long flags;
760 1013
761 raw_spin_lock_irqsave(&current->pi_lock, flags); 1014 raw_spin_lock_irqsave(&current->pi_lock, flags);
@@ -763,29 +1016,31 @@ static void remove_waiter(struct rt_mutex *lock,
763 current->pi_blocked_on = NULL; 1016 current->pi_blocked_on = NULL;
764 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 1017 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
765 1018
766 if (!owner) 1019 /*
1020 * Only update priority if the waiter was the highest priority
1021 * waiter of the lock and there is an owner to update.
1022 */
1023 if (!owner || !is_top_waiter)
767 return; 1024 return;
768 1025
769 if (first) { 1026 raw_spin_lock_irqsave(&owner->pi_lock, flags);
770
771 raw_spin_lock_irqsave(&owner->pi_lock, flags);
772 1027
773 rt_mutex_dequeue_pi(owner, waiter); 1028 rt_mutex_dequeue_pi(owner, waiter);
774 1029
775 if (rt_mutex_has_waiters(lock)) { 1030 if (rt_mutex_has_waiters(lock))
776 struct rt_mutex_waiter *next; 1031 rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
777 1032
778 next = rt_mutex_top_waiter(lock); 1033 __rt_mutex_adjust_prio(owner);
779 rt_mutex_enqueue_pi(owner, next);
780 }
781 __rt_mutex_adjust_prio(owner);
782 1034
783 /* Store the lock on which owner is blocked or NULL */ 1035 /* Store the lock on which owner is blocked or NULL */
784 next_lock = task_blocked_on_lock(owner); 1036 next_lock = task_blocked_on_lock(owner);
785 1037
786 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 1038 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
787 }
788 1039
1040 /*
1041 * Don't walk the chain, if the owner task is not blocked
1042 * itself.
1043 */
789 if (!next_lock) 1044 if (!next_lock)
790 return; 1045 return;
791 1046
@@ -794,7 +1049,8 @@ static void remove_waiter(struct rt_mutex *lock,
794 1049
795 raw_spin_unlock(&lock->wait_lock); 1050 raw_spin_unlock(&lock->wait_lock);
796 1051
797 rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); 1052 rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
1053 next_lock, NULL, current);
798 1054
799 raw_spin_lock(&lock->wait_lock); 1055 raw_spin_lock(&lock->wait_lock);
800} 1056}
@@ -824,7 +1080,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
824 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 1080 /* gets dropped in rt_mutex_adjust_prio_chain()! */
825 get_task_struct(task); 1081 get_task_struct(task);
826 1082
827 rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); 1083 rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
1084 next_lock, NULL, task);
828} 1085}
829 1086
830/** 1087/**
@@ -902,7 +1159,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
902static int __sched 1159static int __sched
903rt_mutex_slowlock(struct rt_mutex *lock, int state, 1160rt_mutex_slowlock(struct rt_mutex *lock, int state,
904 struct hrtimer_sleeper *timeout, 1161 struct hrtimer_sleeper *timeout,
905 int detect_deadlock) 1162 enum rtmutex_chainwalk chwalk)
906{ 1163{
907 struct rt_mutex_waiter waiter; 1164 struct rt_mutex_waiter waiter;
908 int ret = 0; 1165 int ret = 0;
@@ -928,7 +1185,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
928 timeout->task = NULL; 1185 timeout->task = NULL;
929 } 1186 }
930 1187
931 ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); 1188 ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
932 1189
933 if (likely(!ret)) 1190 if (likely(!ret))
934 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); 1191 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
@@ -937,7 +1194,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
937 1194
938 if (unlikely(ret)) { 1195 if (unlikely(ret)) {
939 remove_waiter(lock, &waiter); 1196 remove_waiter(lock, &waiter);
940 rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); 1197 rt_mutex_handle_deadlock(ret, chwalk, &waiter);
941 } 1198 }
942 1199
943 /* 1200 /*
@@ -960,22 +1217,31 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
960/* 1217/*
961 * Slow path try-lock function: 1218 * Slow path try-lock function:
962 */ 1219 */
963static inline int 1220static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
964rt_mutex_slowtrylock(struct rt_mutex *lock)
965{ 1221{
966 int ret = 0; 1222 int ret;
1223
1224 /*
1225 * If the lock already has an owner we fail to get the lock.
1226 * This can be done without taking the @lock->wait_lock as
1227 * it is only being read, and this is a trylock anyway.
1228 */
1229 if (rt_mutex_owner(lock))
1230 return 0;
967 1231
1232 /*
1233 * The mutex has currently no owner. Lock the wait lock and
1234 * try to acquire the lock.
1235 */
968 raw_spin_lock(&lock->wait_lock); 1236 raw_spin_lock(&lock->wait_lock);
969 1237
970 if (likely(rt_mutex_owner(lock) != current)) { 1238 ret = try_to_take_rt_mutex(lock, current, NULL);
971 1239
972 ret = try_to_take_rt_mutex(lock, current, NULL); 1240 /*
973 /* 1241 * try_to_take_rt_mutex() sets the lock waiters bit
974 * try_to_take_rt_mutex() sets the lock waiters 1242 * unconditionally. Clean this up.
975 * bit unconditionally. Clean this up. 1243 */
976 */ 1244 fixup_rt_mutex_waiters(lock);
977 fixup_rt_mutex_waiters(lock);
978 }
979 1245
980 raw_spin_unlock(&lock->wait_lock); 1246 raw_spin_unlock(&lock->wait_lock);
981 1247
@@ -1053,30 +1319,31 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
1053 */ 1319 */
1054static inline int 1320static inline int
1055rt_mutex_fastlock(struct rt_mutex *lock, int state, 1321rt_mutex_fastlock(struct rt_mutex *lock, int state,
1056 int detect_deadlock,
1057 int (*slowfn)(struct rt_mutex *lock, int state, 1322 int (*slowfn)(struct rt_mutex *lock, int state,
1058 struct hrtimer_sleeper *timeout, 1323 struct hrtimer_sleeper *timeout,
1059 int detect_deadlock)) 1324 enum rtmutex_chainwalk chwalk))
1060{ 1325{
1061 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { 1326 if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
1062 rt_mutex_deadlock_account_lock(lock, current); 1327 rt_mutex_deadlock_account_lock(lock, current);
1063 return 0; 1328 return 0;
1064 } else 1329 } else
1065 return slowfn(lock, state, NULL, detect_deadlock); 1330 return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
1066} 1331}
1067 1332
1068static inline int 1333static inline int
1069rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, 1334rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
1070 struct hrtimer_sleeper *timeout, int detect_deadlock, 1335 struct hrtimer_sleeper *timeout,
1336 enum rtmutex_chainwalk chwalk,
1071 int (*slowfn)(struct rt_mutex *lock, int state, 1337 int (*slowfn)(struct rt_mutex *lock, int state,
1072 struct hrtimer_sleeper *timeout, 1338 struct hrtimer_sleeper *timeout,
1073 int detect_deadlock)) 1339 enum rtmutex_chainwalk chwalk))
1074{ 1340{
1075 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { 1341 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
1342 likely(rt_mutex_cmpxchg(lock, NULL, current))) {
1076 rt_mutex_deadlock_account_lock(lock, current); 1343 rt_mutex_deadlock_account_lock(lock, current);
1077 return 0; 1344 return 0;
1078 } else 1345 } else
1079 return slowfn(lock, state, timeout, detect_deadlock); 1346 return slowfn(lock, state, timeout, chwalk);
1080} 1347}
1081 1348
1082static inline int 1349static inline int
@@ -1109,54 +1376,61 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
1109{ 1376{
1110 might_sleep(); 1377 might_sleep();
1111 1378
1112 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); 1379 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
1113} 1380}
1114EXPORT_SYMBOL_GPL(rt_mutex_lock); 1381EXPORT_SYMBOL_GPL(rt_mutex_lock);
1115 1382
1116/** 1383/**
1117 * rt_mutex_lock_interruptible - lock a rt_mutex interruptible 1384 * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
1118 * 1385 *
1119 * @lock: the rt_mutex to be locked 1386 * @lock: the rt_mutex to be locked
1120 * @detect_deadlock: deadlock detection on/off
1121 * 1387 *
1122 * Returns: 1388 * Returns:
1123 * 0 on success 1389 * 0 on success
1124 * -EINTR when interrupted by a signal 1390 * -EINTR when interrupted by a signal
1125 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
1126 */ 1391 */
1127int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, 1392int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
1128 int detect_deadlock)
1129{ 1393{
1130 might_sleep(); 1394 might_sleep();
1131 1395
1132 return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, 1396 return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
1133 detect_deadlock, rt_mutex_slowlock);
1134} 1397}
1135EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); 1398EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
1136 1399
1400/*
1401 * Futex variant with full deadlock detection.
1402 */
1403int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
1404 struct hrtimer_sleeper *timeout)
1405{
1406 might_sleep();
1407
1408 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
1409 RT_MUTEX_FULL_CHAINWALK,
1410 rt_mutex_slowlock);
1411}
1412
1137/** 1413/**
1138 * rt_mutex_timed_lock - lock a rt_mutex interruptible 1414 * rt_mutex_timed_lock - lock a rt_mutex interruptible
1139 * the timeout structure is provided 1415 * the timeout structure is provided
1140 * by the caller 1416 * by the caller
1141 * 1417 *
1142 * @lock: the rt_mutex to be locked 1418 * @lock: the rt_mutex to be locked
1143 * @timeout: timeout structure or NULL (no timeout) 1419 * @timeout: timeout structure or NULL (no timeout)
1144 * @detect_deadlock: deadlock detection on/off
1145 * 1420 *
1146 * Returns: 1421 * Returns:
1147 * 0 on success 1422 * 0 on success
1148 * -EINTR when interrupted by a signal 1423 * -EINTR when interrupted by a signal
1149 * -ETIMEDOUT when the timeout expired 1424 * -ETIMEDOUT when the timeout expired
1150 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
1151 */ 1425 */
1152int 1426int
1153rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, 1427rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
1154 int detect_deadlock)
1155{ 1428{
1156 might_sleep(); 1429 might_sleep();
1157 1430
1158 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, 1431 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
1159 detect_deadlock, rt_mutex_slowlock); 1432 RT_MUTEX_MIN_CHAINWALK,
1433 rt_mutex_slowlock);
1160} 1434}
1161EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); 1435EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
1162 1436
@@ -1262,7 +1536,6 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1262 * @lock: the rt_mutex to take 1536 * @lock: the rt_mutex to take
1263 * @waiter: the pre-initialized rt_mutex_waiter 1537 * @waiter: the pre-initialized rt_mutex_waiter
1264 * @task: the task to prepare 1538 * @task: the task to prepare
1265 * @detect_deadlock: perform deadlock detection (1) or not (0)
1266 * 1539 *
1267 * Returns: 1540 * Returns:
1268 * 0 - task blocked on lock 1541 * 0 - task blocked on lock
@@ -1273,7 +1546,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1273 */ 1546 */
1274int rt_mutex_start_proxy_lock(struct rt_mutex *lock, 1547int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1275 struct rt_mutex_waiter *waiter, 1548 struct rt_mutex_waiter *waiter,
1276 struct task_struct *task, int detect_deadlock) 1549 struct task_struct *task)
1277{ 1550{
1278 int ret; 1551 int ret;
1279 1552
@@ -1285,7 +1558,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1285 } 1558 }
1286 1559
1287 /* We enforce deadlock detection for futexes */ 1560 /* We enforce deadlock detection for futexes */
1288 ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); 1561 ret = task_blocks_on_rt_mutex(lock, waiter, task,
1562 RT_MUTEX_FULL_CHAINWALK);
1289 1563
1290 if (ret && !rt_mutex_owner(lock)) { 1564 if (ret && !rt_mutex_owner(lock)) {
1291 /* 1565 /*
@@ -1331,22 +1605,20 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
1331 * rt_mutex_finish_proxy_lock() - Complete lock acquisition 1605 * rt_mutex_finish_proxy_lock() - Complete lock acquisition
1332 * @lock: the rt_mutex we were woken on 1606 * @lock: the rt_mutex we were woken on
1333 * @to: the timeout, null if none. hrtimer should already have 1607 * @to: the timeout, null if none. hrtimer should already have
1334 * been started. 1608 * been started.
1335 * @waiter: the pre-initialized rt_mutex_waiter 1609 * @waiter: the pre-initialized rt_mutex_waiter
1336 * @detect_deadlock: perform deadlock detection (1) or not (0)
1337 * 1610 *
1338 * Complete the lock acquisition started our behalf by another thread. 1611 * Complete the lock acquisition started our behalf by another thread.
1339 * 1612 *
1340 * Returns: 1613 * Returns:
1341 * 0 - success 1614 * 0 - success
1342 * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK 1615 * <0 - error, one of -EINTR, -ETIMEDOUT
1343 * 1616 *
1344 * Special API call for PI-futex requeue support 1617 * Special API call for PI-futex requeue support
1345 */ 1618 */
1346int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, 1619int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1347 struct hrtimer_sleeper *to, 1620 struct hrtimer_sleeper *to,
1348 struct rt_mutex_waiter *waiter, 1621 struct rt_mutex_waiter *waiter)
1349 int detect_deadlock)
1350{ 1622{
1351 int ret; 1623 int ret;
1352 1624
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index f6a1f3c133b1..c4060584c407 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -22,10 +22,15 @@
22#define debug_rt_mutex_init(m, n) do { } while (0) 22#define debug_rt_mutex_init(m, n) do { } while (0)
23#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) 23#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
24#define debug_rt_mutex_print_deadlock(w) do { } while (0) 24#define debug_rt_mutex_print_deadlock(w) do { } while (0)
25#define debug_rt_mutex_detect_deadlock(w,d) (d)
26#define debug_rt_mutex_reset_waiter(w) do { } while (0) 25#define debug_rt_mutex_reset_waiter(w) do { } while (0)
27 26
28static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) 27static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
29{ 28{
30 WARN(1, "rtmutex deadlock detected\n"); 29 WARN(1, "rtmutex deadlock detected\n");
31} 30}
31
32static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
33 enum rtmutex_chainwalk walk)
34{
35 return walk == RT_MUTEX_FULL_CHAINWALK;
36}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7431a9c86f35..855212501407 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -102,6 +102,21 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
102} 102}
103 103
104/* 104/*
105 * Constants for rt mutex functions which have a selectable deadlock
106 * detection.
107 *
108 * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are
109 * no further PI adjustments to be made.
110 *
111 * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full
112 * walk of the lock chain.
113 */
114enum rtmutex_chainwalk {
115 RT_MUTEX_MIN_CHAINWALK,
116 RT_MUTEX_FULL_CHAINWALK,
117};
118
119/*
105 * PI-futex support (proxy locking functions, etc.): 120 * PI-futex support (proxy locking functions, etc.):
106 */ 121 */
107extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); 122extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -111,12 +126,11 @@ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
111 struct task_struct *proxy_owner); 126 struct task_struct *proxy_owner);
112extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, 127extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
113 struct rt_mutex_waiter *waiter, 128 struct rt_mutex_waiter *waiter,
114 struct task_struct *task, 129 struct task_struct *task);
115 int detect_deadlock);
116extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, 130extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
117 struct hrtimer_sleeper *to, 131 struct hrtimer_sleeper *to,
118 struct rt_mutex_waiter *waiter, 132 struct rt_mutex_waiter *waiter);
119 int detect_deadlock); 133extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
120 134
121#ifdef CONFIG_DEBUG_RT_MUTEXES 135#ifdef CONFIG_DEBUG_RT_MUTEXES
122# include "rtmutex-debug.h" 136# include "rtmutex-debug.h"
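Taken together, the rtmutex.h and rtmutex_common.h hunks replace the old integer detect_deadlock flag with the two chain-walk constants: callers request either the minimal walk or the full deadlock-detecting walk, and the debug configuration is meant to promote every walk to a full one. A hedged sketch of that selection, with a hypothetical TOY_DEBUG_RT_MUTEXES macro standing in for CONFIG_DEBUG_RT_MUTEXES:

#include <stdbool.h>
#include <stdio.h>

enum toy_chainwalk { TOY_MIN_CHAINWALK, TOY_FULL_CHAINWALK };

/*
 * Toy version of debug_rt_mutex_detect_deadlock(): with TOY_DEBUG_RT_MUTEXES
 * defined, every chain walk does deadlock detection; otherwise only callers
 * that explicitly ask for the full walk (e.g. the futex paths) get it.
 */
static bool toy_detect_deadlock(enum toy_chainwalk walk)
{
#ifdef TOY_DEBUG_RT_MUTEXES
    (void)walk;
    return true;
#else
    return walk == TOY_FULL_CHAINWALK;
#endif
}

int main(void)
{
    printf("min  -> detect=%d\n", toy_detect_deadlock(TOY_MIN_CHAINWALK));
    printf("full -> detect=%d\n", toy_detect_deadlock(TOY_FULL_CHAINWALK));
    return 0;
}

Building the sketch with -DTOY_DEBUG_RT_MUTEXES makes both calls report detection, which is the behavior the debug build is intended to have.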
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem)
26 unsigned long flags; 26 unsigned long flags;
27 27
28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { 28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
29 ret = (sem->activity != 0); 29 ret = (sem->count != 0);
30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
31 } 31 }
32 return ret; 32 return ret;
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
46 debug_check_no_locks_freed((void *)sem, sizeof(*sem)); 46 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
47 lockdep_init_map(&sem->dep_map, name, key, 0); 47 lockdep_init_map(&sem->dep_map, name, key, 0);
48#endif 48#endif
49 sem->activity = 0; 49 sem->count = 0;
50 raw_spin_lock_init(&sem->wait_lock); 50 raw_spin_lock_init(&sem->wait_lock);
51 INIT_LIST_HEAD(&sem->wait_list); 51 INIT_LIST_HEAD(&sem->wait_list);
52} 52}
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
95 waiter = list_entry(next, struct rwsem_waiter, list); 95 waiter = list_entry(next, struct rwsem_waiter, list);
96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE); 96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
97 97
98 sem->activity += woken; 98 sem->count += woken;
99 99
100 out: 100 out:
101 return sem; 101 return sem;
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem)
126 126
127 raw_spin_lock_irqsave(&sem->wait_lock, flags); 127 raw_spin_lock_irqsave(&sem->wait_lock, flags);
128 128
129 if (sem->activity >= 0 && list_empty(&sem->wait_list)) { 129 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
130 /* granted */ 130 /* granted */
131 sem->activity++; 131 sem->count++;
132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
133 goto out; 133 goto out;
134 } 134 }
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem)
170 170
171 raw_spin_lock_irqsave(&sem->wait_lock, flags); 171 raw_spin_lock_irqsave(&sem->wait_lock, flags);
172 172
173 if (sem->activity >= 0 && list_empty(&sem->wait_list)) { 173 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
174 /* granted */ 174 /* granted */
175 sem->activity++; 175 sem->count++;
176 ret = 1; 176 ret = 1;
177 } 177 }
178 178
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
206 * itself into sleep and waiting for system woke it or someone 206 * itself into sleep and waiting for system woke it or someone
207 * else in the head of the wait list up. 207 * else in the head of the wait list up.
208 */ 208 */
209 if (sem->activity == 0) 209 if (sem->count == 0)
210 break; 210 break;
211 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 211 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
214 raw_spin_lock_irqsave(&sem->wait_lock, flags); 214 raw_spin_lock_irqsave(&sem->wait_lock, flags);
215 } 215 }
216 /* got the lock */ 216 /* got the lock */
217 sem->activity = -1; 217 sem->count = -1;
218 list_del(&waiter.list); 218 list_del(&waiter.list);
219 219
220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem)
235 235
236 raw_spin_lock_irqsave(&sem->wait_lock, flags); 236 raw_spin_lock_irqsave(&sem->wait_lock, flags);
237 237
238 if (sem->activity == 0) { 238 if (sem->count == 0) {
239 /* got the lock */ 239 /* got the lock */
240 sem->activity = -1; 240 sem->count = -1;
241 ret = 1; 241 ret = 1;
242 } 242 }
243 243
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem)
255 255
256 raw_spin_lock_irqsave(&sem->wait_lock, flags); 256 raw_spin_lock_irqsave(&sem->wait_lock, flags);
257 257
258 if (--sem->activity == 0 && !list_empty(&sem->wait_list)) 258 if (--sem->count == 0 && !list_empty(&sem->wait_list))
259 sem = __rwsem_wake_one_writer(sem); 259 sem = __rwsem_wake_one_writer(sem);
260 260
261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem)
270 270
271 raw_spin_lock_irqsave(&sem->wait_lock, flags); 271 raw_spin_lock_irqsave(&sem->wait_lock, flags);
272 272
273 sem->activity = 0; 273 sem->count = 0;
274 if (!list_empty(&sem->wait_list)) 274 if (!list_empty(&sem->wait_list))
275 sem = __rwsem_do_wake(sem, 1); 275 sem = __rwsem_do_wake(sem, 1);
276 276
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem)
287 287
288 raw_spin_lock_irqsave(&sem->wait_lock, flags); 288 raw_spin_lock_irqsave(&sem->wait_lock, flags);
289 289
290 sem->activity = 1; 290 sem->count = 1;
291 if (!list_empty(&sem->wait_list)) 291 if (!list_empty(&sem->wait_list))
292 sem = __rwsem_do_wake(sem, 0); 292 sem = __rwsem_do_wake(sem, 0);
293 293
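After the rename, the spinlock-based rwsem keeps its whole state in ->count: 0 means unlocked, a positive value counts active readers, and -1 marks a writer. A small userspace model of just the trylock paths (a pthread mutex replaces the wait_lock, there is no wait list, and all toy_* names are invented):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_rwsem {
    pthread_mutex_t wait_lock;
    int count;      /* 0 = unlocked, >0 = readers, -1 = writer */
};

static bool toy_down_read_trylock(struct toy_rwsem *sem)
{
    bool ok = false;

    pthread_mutex_lock(&sem->wait_lock);
    if (sem->count >= 0) {          /* no writer: grant another reader */
        sem->count++;
        ok = true;
    }
    pthread_mutex_unlock(&sem->wait_lock);
    return ok;
}

static bool toy_down_write_trylock(struct toy_rwsem *sem)
{
    bool ok = false;

    pthread_mutex_lock(&sem->wait_lock);
    if (sem->count == 0) {          /* completely idle: take it exclusively */
        sem->count = -1;
        ok = true;
    }
    pthread_mutex_unlock(&sem->wait_lock);
    return ok;
}

int main(void)
{
    struct toy_rwsem sem = { PTHREAD_MUTEX_INITIALIZER, 0 };

    printf("read : %d\n", toy_down_read_trylock(&sem));    /* 1 */
    printf("read : %d\n", toy_down_read_trylock(&sem));    /* 1 */
    printf("write: %d\n", toy_down_write_trylock(&sem));   /* 0: readers hold it */
    return 0;
}

The real implementation additionally queues waiters and wakes the first writer when the last reader drops ->count back to zero; the sketch keeps only the counter convention.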
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index dacc32142fcc..d6203faf2eb1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -82,9 +82,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
82 sem->count = RWSEM_UNLOCKED_VALUE; 82 sem->count = RWSEM_UNLOCKED_VALUE;
83 raw_spin_lock_init(&sem->wait_lock); 83 raw_spin_lock_init(&sem->wait_lock);
84 INIT_LIST_HEAD(&sem->wait_list); 84 INIT_LIST_HEAD(&sem->wait_list);
85#ifdef CONFIG_SMP 85#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
86 sem->owner = NULL; 86 sem->owner = NULL;
87 sem->osq = NULL; 87 osq_lock_init(&sem->osq);
88#endif 88#endif
89} 89}
90 90
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
262 return false; 262 return false;
263} 263}
264 264
265#ifdef CONFIG_SMP 265#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
266/* 266/*
267 * Try to acquire write lock before the writer has been put on wait queue. 267 * Try to acquire write lock before the writer has been put on wait queue.
268 */ 268 */
@@ -285,10 +285,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
285static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) 285static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
286{ 286{
287 struct task_struct *owner; 287 struct task_struct *owner;
288 bool on_cpu = true; 288 bool on_cpu = false;
289 289
290 if (need_resched()) 290 if (need_resched())
291 return 0; 291 return false;
292 292
293 rcu_read_lock(); 293 rcu_read_lock();
294 owner = ACCESS_ONCE(sem->owner); 294 owner = ACCESS_ONCE(sem->owner);
@@ -297,9 +297,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
297 rcu_read_unlock(); 297 rcu_read_unlock();
298 298
299 /* 299 /*
300 * If sem->owner is not set, the rwsem owner may have 300 * If sem->owner is not set, yet we have just recently entered the
301 * just acquired it and not set the owner yet or the rwsem 301 * slowpath, then there is a possibility reader(s) may have the lock.
302 * has been released. 302 * To be safe, avoid spinning in these situations.
303 */ 303 */
304 return on_cpu; 304 return on_cpu;
305} 305}
@@ -329,7 +329,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
329 if (need_resched()) 329 if (need_resched())
330 break; 330 break;
331 331
332 arch_mutex_cpu_relax(); 332 cpu_relax_lowlatency();
333 } 333 }
334 rcu_read_unlock(); 334 rcu_read_unlock();
335 335
@@ -381,7 +381,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
381 * memory barriers as we'll eventually observe the right 381 * memory barriers as we'll eventually observe the right
382 * values at the cost of a few extra spins. 382 * values at the cost of a few extra spins.
383 */ 383 */
384 arch_mutex_cpu_relax(); 384 cpu_relax_lowlatency();
385 } 385 }
386 osq_unlock(&sem->osq); 386 osq_unlock(&sem->osq);
387done: 387done:
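The rwsem_can_spin_on_owner() change flips the default: if ->owner is NULL after the writer has entered the slowpath, readers may hold the lock, so optimistic spinning is skipped. A plain-C sketch of that decision (the struct and names are illustrative, not the kernel's):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_task { bool on_cpu; };

/*
 * Shape of the decision after the change: default to "do not spin"; only
 * spin when a writer owner is visible and currently running on a CPU.
 */
static bool toy_can_spin(const struct toy_task *owner, bool need_resched_now)
{
    bool on_cpu = false;

    if (need_resched_now)
        return false;
    if (owner)
        on_cpu = owner->on_cpu;
    return on_cpu;
}

int main(void)
{
    struct toy_task writer = { .on_cpu = true };

    printf("%d\n", toy_can_spin(&writer, false));   /* 1: spin on a running writer */
    printf("%d\n", toy_can_spin(NULL, false));      /* 0: maybe readers, don't spin */
    printf("%d\n", toy_can_spin(&writer, true));    /* 0: we should reschedule */
    return 0;
}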
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 42f806de49d4..e2d3bc7f03b4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,7 +12,7 @@
12 12
13#include <linux/atomic.h> 13#include <linux/atomic.h>
14 14
15#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) 15#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
16static inline void rwsem_set_owner(struct rw_semaphore *sem) 16static inline void rwsem_set_owner(struct rw_semaphore *sem)
17{ 17{
18 sem->owner = current; 18 sem->owner = current;
diff --git a/kernel/module.c b/kernel/module.c
index 81e727cf6df9..ae79ce615cb9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -60,7 +60,6 @@
60#include <linux/jump_label.h> 60#include <linux/jump_label.h>
61#include <linux/pfn.h> 61#include <linux/pfn.h>
62#include <linux/bsearch.h> 62#include <linux/bsearch.h>
63#include <linux/fips.h>
64#include <uapi/linux/module.h> 63#include <uapi/linux/module.h>
65#include "module-internal.h" 64#include "module-internal.h"
66 65
@@ -2448,9 +2447,6 @@ static int module_sig_check(struct load_info *info)
2448 } 2447 }
2449 2448
2450 /* Not having a signature is only an error if we're strict. */ 2449 /* Not having a signature is only an error if we're strict. */
2451 if (err < 0 && fips_enabled)
2452 panic("Module verification failed with error %d in FIPS mode\n",
2453 err);
2454 if (err == -ENOKEY && !sig_enforce) 2450 if (err == -ENOKEY && !sig_enforce)
2455 err = 0; 2451 err = 0;
2456 2452
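With the FIPS panic gone, the remaining policy is simply that a missing key is tolerated unless signatures are enforced. A small sketch of that error mapping; sig_enforce here is just a local stand-in for the module.sig_enforce setting, and Linux errno values are assumed:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Mirrors the shape of the check that survives the hunk above. */
static int toy_sig_check_result(int err, bool sig_enforce)
{
    if (err == -ENOKEY && !sig_enforce)
        return 0;       /* unsigned module tolerated */
    return err;
}

int main(void)
{
    printf("%d\n", toy_sig_check_result(-ENOKEY, false));   /* 0 */
    printf("%d\n", toy_sig_check_result(-ENOKEY, true));    /* -ENOKEY */
    printf("%d\n", toy_sig_check_result(-EBADMSG, false));  /* still an error */
    return 0;
}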
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index fcc2611d3f14..a9dfa79b6bab 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -371,7 +371,6 @@ int hibernation_snapshot(int platform_mode)
371 } 371 }
372 372
373 suspend_console(); 373 suspend_console();
374 ftrace_stop();
375 pm_restrict_gfp_mask(); 374 pm_restrict_gfp_mask();
376 375
377 error = dpm_suspend(PMSG_FREEZE); 376 error = dpm_suspend(PMSG_FREEZE);
@@ -397,7 +396,6 @@ int hibernation_snapshot(int platform_mode)
397 if (error || !in_suspend) 396 if (error || !in_suspend)
398 pm_restore_gfp_mask(); 397 pm_restore_gfp_mask();
399 398
400 ftrace_start();
401 resume_console(); 399 resume_console();
402 dpm_complete(msg); 400 dpm_complete(msg);
403 401
@@ -500,7 +498,6 @@ int hibernation_restore(int platform_mode)
500 498
501 pm_prepare_console(); 499 pm_prepare_console();
502 suspend_console(); 500 suspend_console();
503 ftrace_stop();
504 pm_restrict_gfp_mask(); 501 pm_restrict_gfp_mask();
505 error = dpm_suspend_start(PMSG_QUIESCE); 502 error = dpm_suspend_start(PMSG_QUIESCE);
506 if (!error) { 503 if (!error) {
@@ -508,7 +505,6 @@ int hibernation_restore(int platform_mode)
508 dpm_resume_end(PMSG_RECOVER); 505 dpm_resume_end(PMSG_RECOVER);
509 } 506 }
510 pm_restore_gfp_mask(); 507 pm_restore_gfp_mask();
511 ftrace_start();
512 resume_console(); 508 resume_console();
513 pm_restore_console(); 509 pm_restore_console();
514 return error; 510 return error;
@@ -535,7 +531,6 @@ int hibernation_platform_enter(void)
535 531
536 entering_platform_hibernation = true; 532 entering_platform_hibernation = true;
537 suspend_console(); 533 suspend_console();
538 ftrace_stop();
539 error = dpm_suspend_start(PMSG_HIBERNATE); 534 error = dpm_suspend_start(PMSG_HIBERNATE);
540 if (error) { 535 if (error) {
541 if (hibernation_ops->recover) 536 if (hibernation_ops->recover)
@@ -579,7 +574,6 @@ int hibernation_platform_enter(void)
579 Resume_devices: 574 Resume_devices:
580 entering_platform_hibernation = false; 575 entering_platform_hibernation = false;
581 dpm_resume_end(PMSG_RESTORE); 576 dpm_resume_end(PMSG_RESTORE);
582 ftrace_start();
583 resume_console(); 577 resume_console();
584 578
585 Close: 579 Close:
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0ca8d83e2369..4ee194eb524b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -186,6 +186,7 @@ void thaw_processes(void)
186 186
187 printk("Restarting tasks ... "); 187 printk("Restarting tasks ... ");
188 188
189 __usermodehelper_set_disable_depth(UMH_FREEZING);
189 thaw_workqueues(); 190 thaw_workqueues();
190 191
191 read_lock(&tasklist_lock); 192 read_lock(&tasklist_lock);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4dd8822f732a..4b736b4dfa96 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -248,7 +248,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
248 goto Platform_wake; 248 goto Platform_wake;
249 } 249 }
250 250
251 ftrace_stop();
252 error = disable_nonboot_cpus(); 251 error = disable_nonboot_cpus();
253 if (error || suspend_test(TEST_CPUS)) 252 if (error || suspend_test(TEST_CPUS))
254 goto Enable_cpus; 253 goto Enable_cpus;
@@ -275,7 +274,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
275 274
276 Enable_cpus: 275 Enable_cpus:
277 enable_nonboot_cpus(); 276 enable_nonboot_cpus();
278 ftrace_start();
279 277
280 Platform_wake: 278 Platform_wake:
281 if (need_suspend_ops(state) && suspend_ops->wake) 279 if (need_suspend_ops(state) && suspend_ops->wake)
@@ -306,7 +304,7 @@ int suspend_devices_and_enter(suspend_state_t state)
306 error = suspend_ops->begin(state); 304 error = suspend_ops->begin(state);
307 if (error) 305 if (error)
308 goto Close; 306 goto Close;
309 } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) { 307 } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) {
310 error = freeze_ops->begin(); 308 error = freeze_ops->begin();
311 if (error) 309 if (error)
312 goto Close; 310 goto Close;
@@ -335,7 +333,7 @@ int suspend_devices_and_enter(suspend_state_t state)
335 Close: 333 Close:
336 if (need_suspend_ops(state) && suspend_ops->end) 334 if (need_suspend_ops(state) && suspend_ops->end)
337 suspend_ops->end(); 335 suspend_ops->end();
338 else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) 336 else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
339 freeze_ops->end(); 337 freeze_ops->end();
340 338
341 return error; 339 return error;
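The suspend.c hunks add a NULL check on freeze_ops itself, since no freeze operations may be registered at all. A minimal sketch of the same defensive pattern for an optional ops structure (all names are hypothetical):

#include <stdio.h>

/* Optional platform hooks; the whole struct or individual ops may be absent. */
struct toy_freeze_ops {
    int  (*begin)(void);
    void (*end)(void);
};

static struct toy_freeze_ops *toy_freeze_ops;   /* may stay NULL */

static int toy_freeze_begin(void)
{
    /* Mirrors the added "freeze_ops && freeze_ops->begin" guard. */
    if (toy_freeze_ops && toy_freeze_ops->begin)
        return toy_freeze_ops->begin();
    return 0;
}

static int real_begin(void) { return 42; }

int main(void)
{
    struct toy_freeze_ops ops = { .begin = real_begin, .end = NULL };

    printf("no ops : %d\n", toy_freeze_begin());    /* 0: guard short-circuits */
    toy_freeze_ops = &ops;
    printf("with op: %d\n", toy_freeze_begin());    /* 42 */
    return 0;
}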
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index adf98622cb32..54e75226c2c4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,12 +28,6 @@
28#include <linux/compat.h> 28#include <linux/compat.h>
29 29
30 30
31static int ptrace_trapping_sleep_fn(void *flags)
32{
33 schedule();
34 return 0;
35}
36
37/* 31/*
38 * ptrace a task: make the debugger its new parent and 32 * ptrace a task: make the debugger its new parent and
39 * move it to the ptrace list. 33 * move it to the ptrace list.
@@ -371,7 +365,7 @@ unlock_creds:
371out: 365out:
372 if (!retval) { 366 if (!retval) {
373 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, 367 wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
374 ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); 368 TASK_UNINTERRUPTIBLE);
375 proc_ptrace_connector(task, PTRACE_ATTACH); 369 proc_ptrace_connector(task, PTRACE_ATTACH);
376 } 370 }
377 371
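The ptrace hunk reflects the simplified wait_on_bit() API: callers now pass only the task state instead of supplying their own sleep callback. A userspace analogue of waiting for a flag to clear, using a mutex/condvar pair in place of the kernel's bit wait-queues (all names are invented; build with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_flag {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    bool set;
};

static void toy_wait_on_flag(struct toy_flag *f)
{
    pthread_mutex_lock(&f->lock);
    while (f->set)                      /* wait until the "trapping" flag clears */
        pthread_cond_wait(&f->cond, &f->lock);
    pthread_mutex_unlock(&f->lock);
}

static void toy_clear_flag(struct toy_flag *f)
{
    pthread_mutex_lock(&f->lock);
    f->set = false;
    pthread_cond_broadcast(&f->cond);
    pthread_mutex_unlock(&f->lock);
}

static void *clearer(void *arg)
{
    toy_clear_flag(arg);
    return NULL;
}

int main(void)
{
    struct toy_flag f = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, true };
    pthread_t t;

    pthread_create(&t, NULL, clearer, &f);
    toy_wait_on_flag(&f);               /* like waiting for JOBCTL_TRAPPING_BIT to clear */
    pthread_join(t, NULL);
    printf("flag cleared\n");
    return 0;
}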
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index bfda2726ca45..ff1a6de62f17 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -99,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
99 99
100void kfree(const void *); 100void kfree(const void *);
101 101
102/*
103 * Reclaim the specified callback, either by invoking it (non-lazy case)
104 * or freeing it directly (lazy case). Return true if lazy, false otherwise.
105 */
102static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) 106static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
103{ 107{
104 unsigned long offset = (unsigned long)head->func; 108 unsigned long offset = (unsigned long)head->func;
@@ -108,12 +112,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
108 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 112 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
109 kfree((void *)head - offset); 113 kfree((void *)head - offset);
110 rcu_lock_release(&rcu_callback_map); 114 rcu_lock_release(&rcu_callback_map);
111 return 1; 115 return true;
112 } else { 116 } else {
113 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 117 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
114 head->func(head); 118 head->func(head);
115 rcu_lock_release(&rcu_callback_map); 119 rcu_lock_release(&rcu_callback_map);
116 return 0; 120 return false;
117 } 121 }
118} 122}
119 123
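The comment added to __rcu_reclaim() documents the lazy/non-lazy split: when ->func holds a small value it is really the offset of the rcu_head inside the enclosing object (the kfree_rcu() case), otherwise it is a real callback to invoke. A userspace model of that encoding; the 4096 threshold mirrors the kernel's small-offset check, and everything prefixed toy_ is invented:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_rcu_head {
    void (*func)(struct toy_rcu_head *);
};

struct toy_obj {
    int payload;
    struct toy_rcu_head rh;             /* offsetof(struct toy_obj, rh) is small */
};

/* Below this value, ->func is treated as an offset (the kfree case). */
#define TOY_OFFSET_LIMIT 4096UL

static bool toy_reclaim(struct toy_rcu_head *head)
{
    uintptr_t offset = (uintptr_t)head->func;

    if (offset < TOY_OFFSET_LIMIT) {
        free((char *)head - offset);    /* lazy: free the enclosing object */
        return true;
    }
    head->func(head);                   /* non-lazy: invoke the callback */
    return false;
}

static void toy_cb(struct toy_rcu_head *head)
{
    printf("callback invoked for %p\n", (void *)head);
}

int main(void)
{
    struct toy_obj *o = malloc(sizeof(*o));
    static struct toy_rcu_head plain;

    if (!o)
        return 1;
    /* Lazy case: stash the embedded offset instead of a real function. */
    o->rh.func = (void (*)(struct toy_rcu_head *))(uintptr_t)offsetof(struct toy_obj, rh);
    printf("lazy=%d\n", toy_reclaim(&o->rh));

    plain.func = toy_cb;                /* non-lazy case */
    printf("lazy=%d\n", toy_reclaim(&plain));
    return 0;
}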
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 7fa34f86e5ba..948a7693748e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -18,7 +18,7 @@
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * Josh Triplett <josh@freedesktop.org> 21 * Josh Triplett <josh@joshtriplett.org>
22 * 22 *
23 * See also: Documentation/RCU/torture.txt 23 * See also: Documentation/RCU/torture.txt
24 */ 24 */
@@ -51,7 +51,7 @@
51#include <linux/torture.h> 51#include <linux/torture.h>
52 52
53MODULE_LICENSE("GPL"); 53MODULE_LICENSE("GPL");
54MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 54MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
55 55
56 56
57torture_param(int, fqs_duration, 0, 57torture_param(int, fqs_duration, 0,
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index c639556f3fa0..e037f3eb2f7b 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -298,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
298 298
299 idx = ACCESS_ONCE(sp->completed) & 0x1; 299 idx = ACCESS_ONCE(sp->completed) & 0x1;
300 preempt_disable(); 300 preempt_disable();
301 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; 301 __this_cpu_inc(sp->per_cpu_ref->c[idx]);
302 smp_mb(); /* B */ /* Avoid leaking the critical section. */ 302 smp_mb(); /* B */ /* Avoid leaking the critical section. */
303 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; 303 __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
304 preempt_enable(); 304 preempt_enable();
305 return idx; 305 return idx;
306} 306}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f1ba77363fbb..1b70cb6fbe3c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu)
206 rdp->passed_quiesce = 1; 206 rdp->passed_quiesce = 1;
207} 207}
208 208
209static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
210
211static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
212 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
213 .dynticks = ATOMIC_INIT(1),
214#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
215 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
216 .dynticks_idle = ATOMIC_INIT(1),
217#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
218};
219
220/*
221 * Let the RCU core know that this CPU has gone through the scheduler,
222 * which is a quiescent state. This is called when the need for a
223 * quiescent state is urgent, so we burn an atomic operation and full
224 * memory barriers to let the RCU core know about it, regardless of what
225 * this CPU might (or might not) do in the near future.
226 *
227 * We inform the RCU core by emulating a zero-duration dyntick-idle
228 * period, which we in turn do by incrementing the ->dynticks counter
229 * by two.
230 */
231static void rcu_momentary_dyntick_idle(void)
232{
233 unsigned long flags;
234 struct rcu_data *rdp;
235 struct rcu_dynticks *rdtp;
236 int resched_mask;
237 struct rcu_state *rsp;
238
239 local_irq_save(flags);
240
241 /*
242 * Yes, we can lose flag-setting operations. This is OK, because
243 * the flag will be set again after some delay.
244 */
245 resched_mask = raw_cpu_read(rcu_sched_qs_mask);
246 raw_cpu_write(rcu_sched_qs_mask, 0);
247
248 /* Find the flavor that needs a quiescent state. */
249 for_each_rcu_flavor(rsp) {
250 rdp = raw_cpu_ptr(rsp->rda);
251 if (!(resched_mask & rsp->flavor_mask))
252 continue;
253 smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
254 if (ACCESS_ONCE(rdp->mynode->completed) !=
255 ACCESS_ONCE(rdp->cond_resched_completed))
256 continue;
257
258 /*
259 * Pretend to be momentarily idle for the quiescent state.
260 * This allows the grace-period kthread to record the
261 * quiescent state, with no need for this CPU to do anything
262 * further.
263 */
264 rdtp = this_cpu_ptr(&rcu_dynticks);
265 smp_mb__before_atomic(); /* Earlier stuff before QS. */
266 atomic_add(2, &rdtp->dynticks); /* QS. */
267 smp_mb__after_atomic(); /* Later stuff after QS. */
268 break;
269 }
270 local_irq_restore(flags);
271}
272
209/* 273/*
210 * Note a context switch. This is a quiescent state for RCU-sched, 274 * Note a context switch. This is a quiescent state for RCU-sched,
211 * and requires special handling for preemptible RCU. 275 * and requires special handling for preemptible RCU.
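rcu_momentary_dyntick_idle() works because the grace-period machinery snapshots each CPU's ->dynticks counter and treats an even value, or a counter that has advanced by at least two, as evidence of a quiescent state; adding two keeps the counter odd ("not idle") while still moving it on. A toy model of that check, ignoring the kernel's surrounding memory barriers and per-CPU plumbing:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_uint dynticks = 1;        /* odd: CPU considered non-idle */

static unsigned int snapshot(void)
{
    return atomic_load(&dynticks);
}

static bool passed_quiescent_state(unsigned int snap)
{
    unsigned int curr = atomic_load(&dynticks);

    /* Even = idle right now; advanced by >= 2 = passed through idle meanwhile. */
    return (curr & 1) == 0 || (curr - snap) >= 2;
}

static void momentary_dyntick_idle(void)
{
    atomic_fetch_add(&dynticks, 2);     /* emulate a zero-duration idle period */
}

int main(void)
{
    unsigned int snap = snapshot();

    printf("before: qs=%d\n", passed_quiescent_state(snap));    /* 0 */
    momentary_dyntick_idle();
    printf("after : qs=%d\n", passed_quiescent_state(snap));    /* 1 */
    return 0;
}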
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu)
216 trace_rcu_utilization(TPS("Start context switch")); 280 trace_rcu_utilization(TPS("Start context switch"));
217 rcu_sched_qs(cpu); 281 rcu_sched_qs(cpu);
218 rcu_preempt_note_context_switch(cpu); 282 rcu_preempt_note_context_switch(cpu);
283 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
284 rcu_momentary_dyntick_idle();
219 trace_rcu_utilization(TPS("End context switch")); 285 trace_rcu_utilization(TPS("End context switch"));
220} 286}
221EXPORT_SYMBOL_GPL(rcu_note_context_switch); 287EXPORT_SYMBOL_GPL(rcu_note_context_switch);
222 288
223static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
224 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
225 .dynticks = ATOMIC_INIT(1),
226#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
227 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
228 .dynticks_idle = ATOMIC_INIT(1),
229#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
230};
231
232static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 289static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
233static long qhimark = 10000; /* If this many pending, ignore blimit. */ 290static long qhimark = 10000; /* If this many pending, ignore blimit. */
234static long qlowmark = 100; /* Once only this many pending, use blimit. */ 291static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
243module_param(jiffies_till_first_fqs, ulong, 0644); 300module_param(jiffies_till_first_fqs, ulong, 0644);
244module_param(jiffies_till_next_fqs, ulong, 0644); 301module_param(jiffies_till_next_fqs, ulong, 0644);
245 302
303/*
304 * How long the grace period must be before we start recruiting
305 * quiescent-state help from rcu_note_context_switch().
306 */
307static ulong jiffies_till_sched_qs = HZ / 20;
308module_param(jiffies_till_sched_qs, ulong, 0644);
309
246static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 310static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
247 struct rcu_data *rdp); 311 struct rcu_data *rdp);
248static void force_qs_rnp(struct rcu_state *rsp, 312static void force_qs_rnp(struct rcu_state *rsp,
@@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
853 bool *isidle, unsigned long *maxj) 917 bool *isidle, unsigned long *maxj)
854{ 918{
855 unsigned int curr; 919 unsigned int curr;
920 int *rcrmp;
856 unsigned int snap; 921 unsigned int snap;
857 922
858 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); 923 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
@@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
893 } 958 }
894 959
895 /* 960 /*
896 * There is a possibility that a CPU in adaptive-ticks state 961 * A CPU running for an extended time within the kernel can
897 * might run in the kernel with the scheduling-clock tick disabled 962 * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
898 * for an extended time period. Invoke rcu_kick_nohz_cpu() to 963 * even context-switching back and forth between a pair of
899 * force the CPU to restart the scheduling-clock tick in this 964 * in-kernel CPU-bound tasks cannot advance grace periods.
900 * CPU is in this state. 965 * So if the grace period is old enough, make the CPU pay attention.
901 */ 966 * Note that the unsynchronized assignments to the per-CPU
902 rcu_kick_nohz_cpu(rdp->cpu); 967 * rcu_sched_qs_mask variable are safe. Yes, setting of
903 968 * bits can be lost, but they will be set again on the next
904 /* 969 * force-quiescent-state pass. So lost bit sets do not result
905 * Alternatively, the CPU might be running in the kernel 970 * in incorrect behavior, merely in a grace period lasting
906 * for an extended period of time without a quiescent state. 971 * a few jiffies longer than it might otherwise. Because
907 * Attempt to force the CPU through the scheduler to gain the 972 * there are at most four threads involved, and because the
908 * needed quiescent state, but only if the grace period has gone 973 * updates are only once every few jiffies, the probability of
909 * on for an uncommonly long time. If there are many stuck CPUs, 974 * lossage (and thus of slight grace-period extension) is
910 * we will beat on the first one until it gets unstuck, then move 975 * quite low.
911 * to the next. Only do this for the primary flavor of RCU. 976 *
977 * Note that if the jiffies_till_sched_qs boot/sysfs parameter
978 * is set too high, we override with half of the RCU CPU stall
979 * warning delay.
912 */ 980 */
913 if (rdp->rsp == rcu_state_p && 981 rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
982 if (ULONG_CMP_GE(jiffies,
983 rdp->rsp->gp_start + jiffies_till_sched_qs) ||
914 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { 984 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
915 rdp->rsp->jiffies_resched += 5; 985 if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
916 resched_cpu(rdp->cpu); 986 ACCESS_ONCE(rdp->cond_resched_completed) =
987 ACCESS_ONCE(rdp->mynode->completed);
988 smp_mb(); /* ->cond_resched_completed before *rcrmp. */
989 ACCESS_ONCE(*rcrmp) =
990 ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
991 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
992 rdp->rsp->jiffies_resched += 5; /* Enable beating. */
993 } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
994 /* Time to beat on that CPU again! */
995 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
996 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
997 }
917 } 998 }
918 999
919 return 0; 1000 return 0;
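The comment block above describes a two-stage policy: once a grace period is older than jiffies_till_sched_qs, set the per-CPU help bit so that the next context switch reports a quiescent state, and after that keep nudging the CPU into the scheduler every few jiffies. A much-simplified, single-CPU sketch of that decision; all names are local to the sketch, and the 30-tick seed for next_resched merely stands in for the stall-derived jiffies_resched value:

#include <stdbool.h>
#include <stdio.h>

#define TOY_HZ              100UL
#define TOY_JIFFIES_TILL_QS (TOY_HZ / 20)

struct toy_cpu_state {
    bool help_requested;            /* stands in for the rcu_sched_qs_mask bit */
    unsigned long next_resched;     /* stands in for rsp->jiffies_resched */
};

/* Returns true when the CPU should be forced through the scheduler now. */
static bool toy_check_cpu(struct toy_cpu_state *cs,
                          unsigned long now, unsigned long gp_start)
{
    if (now < gp_start + TOY_JIFFIES_TILL_QS && now < cs->next_resched)
        return false;               /* grace period still young: leave CPU alone */

    if (!cs->help_requested) {
        cs->help_requested = true;  /* ask the next context switch for a QS */
        cs->next_resched = now + 5;
        return true;
    }
    if (now >= cs->next_resched) {
        cs->next_resched = now + 5; /* time to beat on that CPU again */
        return true;
    }
    return false;
}

int main(void)
{
    unsigned long gp_start = 1000;
    struct toy_cpu_state cs = { false, gp_start + 30 };

    printf("%d\n", toy_check_cpu(&cs, 1002, gp_start));    /* 0: too early */
    printf("%d\n", toy_check_cpu(&cs, 1006, gp_start));    /* 1: request help */
    printf("%d\n", toy_check_cpu(&cs, 1008, gp_start));    /* 0: within backoff */
    printf("%d\n", toy_check_cpu(&cs, 1012, gp_start));    /* 1: beat again */
    return 0;
}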
@@ -932,10 +1013,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
932} 1013}
933 1014
934/* 1015/*
935 * Dump stacks of all tasks running on stalled CPUs. This is a fallback 1016 * Dump stacks of all tasks running on stalled CPUs.
936 * for architectures that do not implement trigger_all_cpu_backtrace().
937 * The NMI-triggered stack traces are more accurate because they are
938 * printed by the target CPU.
939 */ 1017 */
940static void rcu_dump_cpu_stacks(struct rcu_state *rsp) 1018static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
941{ 1019{
@@ -1013,7 +1091,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
1013 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1091 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1014 if (ndetected == 0) 1092 if (ndetected == 0)
1015 pr_err("INFO: Stall ended before state dump start\n"); 1093 pr_err("INFO: Stall ended before state dump start\n");
1016 else if (!trigger_all_cpu_backtrace()) 1094 else
1017 rcu_dump_cpu_stacks(rsp); 1095 rcu_dump_cpu_stacks(rsp);
1018 1096
1019 /* Complain about tasks blocking the grace period. */ 1097 /* Complain about tasks blocking the grace period. */
@@ -1044,8 +1122,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
1044 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", 1122 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
1045 jiffies - rsp->gp_start, 1123 jiffies - rsp->gp_start,
1046 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1124 (long)rsp->gpnum, (long)rsp->completed, totqlen);
1047 if (!trigger_all_cpu_backtrace()) 1125 rcu_dump_cpu_stacks(rsp);
1048 dump_stack();
1049 1126
1050 raw_spin_lock_irqsave(&rnp->lock, flags); 1127 raw_spin_lock_irqsave(&rnp->lock, flags);
1051 if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) 1128 if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
@@ -1224,10 +1301,16 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1224 * believe that a grace period is in progress, then we must wait 1301 * believe that a grace period is in progress, then we must wait
1225 * for the one following, which is in "c". Because our request 1302 * for the one following, which is in "c". Because our request
1226 * will be noticed at the end of the current grace period, we don't 1303 * will be noticed at the end of the current grace period, we don't
1227 * need to explicitly start one. 1304 * need to explicitly start one. We only do the lockless check
1305 * of rnp_root's fields if the current rcu_node structure thinks
1306 * there is no grace period in flight, and because we hold rnp->lock,
1307 * the only possible change is when rnp_root's two fields are
1308 * equal, in which case rnp_root->gpnum might be concurrently
1309 * incremented. But that is OK, as it will just result in our
1310 * doing some extra useless work.
1228 */ 1311 */
1229 if (rnp->gpnum != rnp->completed || 1312 if (rnp->gpnum != rnp->completed ||
1230 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { 1313 ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
1231 rnp->need_future_gp[c & 0x1]++; 1314 rnp->need_future_gp[c & 0x1]++;
1232 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); 1315 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
1233 goto out; 1316 goto out;
@@ -1564,11 +1647,6 @@ static int rcu_gp_init(struct rcu_state *rsp)
1564 rnp->level, rnp->grplo, 1647 rnp->level, rnp->grplo,
1565 rnp->grphi, rnp->qsmask); 1648 rnp->grphi, rnp->qsmask);
1566 raw_spin_unlock_irq(&rnp->lock); 1649 raw_spin_unlock_irq(&rnp->lock);
1567#ifdef CONFIG_PROVE_RCU_DELAY
1568 if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
1569 system_state == SYSTEM_RUNNING)
1570 udelay(200);
1571#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1572 cond_resched(); 1650 cond_resched();
1573 } 1651 }
1574 1652
@@ -2266,7 +2344,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
2266 } 2344 }
2267 smp_mb(); /* List handling before counting for rcu_barrier(). */ 2345 smp_mb(); /* List handling before counting for rcu_barrier(). */
2268 rdp->qlen_lazy -= count_lazy; 2346 rdp->qlen_lazy -= count_lazy;
2269 ACCESS_ONCE(rdp->qlen) -= count; 2347 ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
2270 rdp->n_cbs_invoked += count; 2348 rdp->n_cbs_invoked += count;
2271 2349
2272 /* Reinstate batch limit if we have worked down the excess. */ 2350 /* Reinstate batch limit if we have worked down the excess. */
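Several hunks in this file rewrite ACCESS_ONCE(x)++ and ACCESS_ONCE(x) -= n as a plain read plus a single marked store, so only the store goes through the volatile cast. A userspace copy of the idiom: TOY_ACCESS_ONCE mirrors the kernel macro and relies on the GCC-style __typeof__ extension, and the counter here is only written from one context, which is what makes the unmarked read acceptable.

#include <stdio.h>

#define TOY_ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned long qlen;

static void enqueue_one(void)
{
    TOY_ACCESS_ONCE(qlen) = qlen + 1;   /* single marked store, plain read */
}

static void dequeue_many(unsigned long count)
{
    TOY_ACCESS_ONCE(qlen) = qlen - count;
}

int main(void)
{
    enqueue_one();
    enqueue_one();
    enqueue_one();
    dequeue_many(2);
    printf("qlen=%lu\n", qlen);         /* 1 */
    return 0;
}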
@@ -2404,14 +2482,14 @@ static void force_quiescent_state(struct rcu_state *rsp)
2404 struct rcu_node *rnp_old = NULL; 2482 struct rcu_node *rnp_old = NULL;
2405 2483
2406 /* Funnel through hierarchy to reduce memory contention. */ 2484 /* Funnel through hierarchy to reduce memory contention. */
2407 rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; 2485 rnp = __this_cpu_read(rsp->rda->mynode);
2408 for (; rnp != NULL; rnp = rnp->parent) { 2486 for (; rnp != NULL; rnp = rnp->parent) {
2409 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || 2487 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
2410 !raw_spin_trylock(&rnp->fqslock); 2488 !raw_spin_trylock(&rnp->fqslock);
2411 if (rnp_old != NULL) 2489 if (rnp_old != NULL)
2412 raw_spin_unlock(&rnp_old->fqslock); 2490 raw_spin_unlock(&rnp_old->fqslock);
2413 if (ret) { 2491 if (ret) {
2414 ACCESS_ONCE(rsp->n_force_qs_lh)++; 2492 rsp->n_force_qs_lh++;
2415 return; 2493 return;
2416 } 2494 }
2417 rnp_old = rnp; 2495 rnp_old = rnp;
@@ -2423,7 +2501,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2423 smp_mb__after_unlock_lock(); 2501 smp_mb__after_unlock_lock();
2424 raw_spin_unlock(&rnp_old->fqslock); 2502 raw_spin_unlock(&rnp_old->fqslock);
2425 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2503 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
2426 ACCESS_ONCE(rsp->n_force_qs_lh)++; 2504 rsp->n_force_qs_lh++;
2427 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2505 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2428 return; /* Someone beat us to it. */ 2506 return; /* Someone beat us to it. */
2429 } 2507 }
@@ -2581,7 +2659,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2581 unsigned long flags; 2659 unsigned long flags;
2582 struct rcu_data *rdp; 2660 struct rcu_data *rdp;
2583 2661
2584 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ 2662 WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
2585 if (debug_rcu_head_queue(head)) { 2663 if (debug_rcu_head_queue(head)) {
2586 /* Probable double call_rcu(), so leak the callback. */ 2664 /* Probable double call_rcu(), so leak the callback. */
2587 ACCESS_ONCE(head->func) = rcu_leak_callback; 2665 ACCESS_ONCE(head->func) = rcu_leak_callback;
@@ -2612,7 +2690,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2612 local_irq_restore(flags); 2690 local_irq_restore(flags);
2613 return; 2691 return;
2614 } 2692 }
2615 ACCESS_ONCE(rdp->qlen)++; 2693 ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
2616 if (lazy) 2694 if (lazy)
2617 rdp->qlen_lazy++; 2695 rdp->qlen_lazy++;
2618 else 2696 else
@@ -3176,7 +3254,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3176 * ACCESS_ONCE() to prevent the compiler from speculating 3254 * ACCESS_ONCE() to prevent the compiler from speculating
3177 * the increment to precede the early-exit check. 3255 * the increment to precede the early-exit check.
3178 */ 3256 */
3179 ACCESS_ONCE(rsp->n_barrier_done)++; 3257 ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
3180 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); 3258 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
3181 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); 3259 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
3182 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ 3260 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3226,7 +3304,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3226 3304
3227 /* Increment ->n_barrier_done to prevent duplicate work. */ 3305 /* Increment ->n_barrier_done to prevent duplicate work. */
3228 smp_mb(); /* Keep increment after above mechanism. */ 3306 smp_mb(); /* Keep increment after above mechanism. */
3229 ACCESS_ONCE(rsp->n_barrier_done)++; 3307 ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
3230 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); 3308 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
3231 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); 3309 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
3232 smp_mb(); /* Keep increment before caller's subsequent code. */ 3310 smp_mb(); /* Keep increment before caller's subsequent code. */
@@ -3483,14 +3561,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
3483static void __init rcu_init_one(struct rcu_state *rsp, 3561static void __init rcu_init_one(struct rcu_state *rsp,
3484 struct rcu_data __percpu *rda) 3562 struct rcu_data __percpu *rda)
3485{ 3563{
3486 static char *buf[] = { "rcu_node_0", 3564 static const char * const buf[] = {
3487 "rcu_node_1", 3565 "rcu_node_0",
3488 "rcu_node_2", 3566 "rcu_node_1",
3489 "rcu_node_3" }; /* Match MAX_RCU_LVLS */ 3567 "rcu_node_2",
3490 static char *fqs[] = { "rcu_node_fqs_0", 3568 "rcu_node_3" }; /* Match MAX_RCU_LVLS */
3491 "rcu_node_fqs_1", 3569 static const char * const fqs[] = {
3492 "rcu_node_fqs_2", 3570 "rcu_node_fqs_0",
3493 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ 3571 "rcu_node_fqs_1",
3572 "rcu_node_fqs_2",
3573 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
3574 static u8 fl_mask = 0x1;
3494 int cpustride = 1; 3575 int cpustride = 1;
3495 int i; 3576 int i;
3496 int j; 3577 int j;
@@ -3509,6 +3590,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3509 for (i = 1; i < rcu_num_lvls; i++) 3590 for (i = 1; i < rcu_num_lvls; i++)
3510 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 3591 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
3511 rcu_init_levelspread(rsp); 3592 rcu_init_levelspread(rsp);
3593 rsp->flavor_mask = fl_mask;
3594 fl_mask <<= 1;
3512 3595
3513 /* Initialize the elements themselves, starting from the leaves. */ 3596 /* Initialize the elements themselves, starting from the leaves. */
3514 3597
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf2c1e669691..71e64c718f75 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -172,6 +172,14 @@ struct rcu_node {
172 /* queued on this rcu_node structure that */ 172 /* queued on this rcu_node structure that */
173 /* are blocking the current grace period, */ 173 /* are blocking the current grace period, */
174 /* there can be no such task. */ 174 /* there can be no such task. */
175 struct completion boost_completion;
176 /* Used to ensure that the rt_mutex used */
177 /* to carry out the boosting is fully */
178 /* released with no future boostee accesses */
179 /* before that rt_mutex is re-initialized. */
180 struct rt_mutex boost_mtx;
181 /* Used only for the priority-boosting */
182 /* side effect, not as a lock. */
175 unsigned long boost_time; 183 unsigned long boost_time;
176 /* When to start boosting (jiffies). */ 184 /* When to start boosting (jiffies). */
177 struct task_struct *boost_kthread_task; 185 struct task_struct *boost_kthread_task;
@@ -307,6 +315,9 @@ struct rcu_data {
307 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 315 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
308 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 316 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
309 unsigned long offline_fqs; /* Kicked due to being offline. */ 317 unsigned long offline_fqs; /* Kicked due to being offline. */
318 unsigned long cond_resched_completed;
319 /* Grace period that needs help */
320 /* from cond_resched(). */
310 321
311 /* 5) __rcu_pending() statistics. */ 322 /* 5) __rcu_pending() statistics. */
312 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 323 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -331,11 +342,29 @@ struct rcu_data {
331 struct rcu_head **nocb_tail; 342 struct rcu_head **nocb_tail;
332 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ 343 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
333 atomic_long_t nocb_q_count_lazy; /* (approximate). */ 344 atomic_long_t nocb_q_count_lazy; /* (approximate). */
345 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
346 struct rcu_head **nocb_follower_tail;
347 atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
348 atomic_long_t nocb_follower_count_lazy; /* (approximate). */
334 int nocb_p_count; /* # CBs being invoked by kthread */ 349 int nocb_p_count; /* # CBs being invoked by kthread */
335 int nocb_p_count_lazy; /* (approximate). */ 350 int nocb_p_count_lazy; /* (approximate). */
336 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 351 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
337 struct task_struct *nocb_kthread; 352 struct task_struct *nocb_kthread;
338 bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 353 bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
354
355 /* The following fields are used by the leader, hence own cacheline. */
356 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
357 /* CBs waiting for GP. */
358 struct rcu_head **nocb_gp_tail;
359 long nocb_gp_count;
360 long nocb_gp_count_lazy;
361 bool nocb_leader_wake; /* Is the nocb leader thread awake? */
362 struct rcu_data *nocb_next_follower;
363 /* Next follower in wakeup chain. */
364
365 /* The following fields are used by the follower, hence new cachline. */
 365 /* The following fields are used by the follower, hence new cacheline. */
366 struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp;
367 /* Leader CPU takes GP-end wakeups. */
339#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 368#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
340 369
341 /* 8) RCU CPU stall data. */ 370 /* 8) RCU CPU stall data. */
@@ -392,6 +421,7 @@ struct rcu_state {
392 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ 421 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
393 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 422 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
394 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 423 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
424 u8 flavor_mask; /* bit in flavor mask. */
395 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 425 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
396 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 426 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
397 void (*func)(struct rcu_head *head)); 427 void (*func)(struct rcu_head *head));
@@ -563,7 +593,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
563static void do_nocb_deferred_wakeup(struct rcu_data *rdp); 593static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
564static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 594static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
565static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 595static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
566static void rcu_kick_nohz_cpu(int cpu); 596static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
567static bool init_nocb_callback_list(struct rcu_data *rdp); 597static bool init_nocb_callback_list(struct rcu_data *rdp);
568static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); 598static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
569static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); 599static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
@@ -583,8 +613,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
583/* Sum up queue lengths for tracing. */ 613/* Sum up queue lengths for tracing. */
584static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) 614static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
585{ 615{
586 *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; 616 *ql = atomic_long_read(&rdp->nocb_q_count) +
587 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; 617 rdp->nocb_p_count +
618 atomic_long_read(&rdp->nocb_follower_count) +
619 rdp->nocb_p_count + rdp->nocb_gp_count;
620 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
621 rdp->nocb_p_count_lazy +
622 atomic_long_read(&rdp->nocb_follower_count_lazy) +
623 rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
588} 624}
589#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 625#else /* #ifdef CONFIG_RCU_NOCB_CPU */
590static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) 626static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index cbc2c45265e2..00dc411e9676 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -33,6 +33,7 @@
33#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
34 34
35#ifdef CONFIG_RCU_BOOST 35#ifdef CONFIG_RCU_BOOST
36#include "../locking/rtmutex_common.h"
36#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO 37#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
37#else 38#else
38#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO 39#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
@@ -336,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t)
336 unsigned long flags; 337 unsigned long flags;
337 struct list_head *np; 338 struct list_head *np;
338#ifdef CONFIG_RCU_BOOST 339#ifdef CONFIG_RCU_BOOST
339 struct rt_mutex *rbmp = NULL; 340 bool drop_boost_mutex = false;
340#endif /* #ifdef CONFIG_RCU_BOOST */ 341#endif /* #ifdef CONFIG_RCU_BOOST */
341 struct rcu_node *rnp; 342 struct rcu_node *rnp;
342 int special; 343 int special;
@@ -398,11 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t)
398#ifdef CONFIG_RCU_BOOST 399#ifdef CONFIG_RCU_BOOST
399 if (&t->rcu_node_entry == rnp->boost_tasks) 400 if (&t->rcu_node_entry == rnp->boost_tasks)
400 rnp->boost_tasks = np; 401 rnp->boost_tasks = np;
401 /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ 402 /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
402 if (t->rcu_boost_mutex) { 403 drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
403 rbmp = t->rcu_boost_mutex;
404 t->rcu_boost_mutex = NULL;
405 }
406#endif /* #ifdef CONFIG_RCU_BOOST */ 404#endif /* #ifdef CONFIG_RCU_BOOST */
407 405
408 /* 406 /*
@@ -427,8 +425,10 @@ void rcu_read_unlock_special(struct task_struct *t)
427 425
428#ifdef CONFIG_RCU_BOOST 426#ifdef CONFIG_RCU_BOOST
429 /* Unboost if we were boosted. */ 427 /* Unboost if we were boosted. */
430 if (rbmp) 428 if (drop_boost_mutex) {
431 rt_mutex_unlock(rbmp); 429 rt_mutex_unlock(&rnp->boost_mtx);
430 complete(&rnp->boost_completion);
431 }
432#endif /* #ifdef CONFIG_RCU_BOOST */ 432#endif /* #ifdef CONFIG_RCU_BOOST */
433 433
434 /* 434 /*
@@ -988,6 +988,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
988 988
989/* Because preemptible RCU does not exist, no quieting of tasks. */ 989/* Because preemptible RCU does not exist, no quieting of tasks. */
990static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 990static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
991 __releases(rnp->lock)
991{ 992{
992 raw_spin_unlock_irqrestore(&rnp->lock, flags); 993 raw_spin_unlock_irqrestore(&rnp->lock, flags);
993} 994}
@@ -1149,7 +1150,6 @@ static void rcu_wake_cond(struct task_struct *t, int status)
1149static int rcu_boost(struct rcu_node *rnp) 1150static int rcu_boost(struct rcu_node *rnp)
1150{ 1151{
1151 unsigned long flags; 1152 unsigned long flags;
1152 struct rt_mutex mtx;
1153 struct task_struct *t; 1153 struct task_struct *t;
1154 struct list_head *tb; 1154 struct list_head *tb;
1155 1155
@@ -1200,11 +1200,15 @@ static int rcu_boost(struct rcu_node *rnp)
1200 * section. 1200 * section.
1201 */ 1201 */
1202 t = container_of(tb, struct task_struct, rcu_node_entry); 1202 t = container_of(tb, struct task_struct, rcu_node_entry);
1203 rt_mutex_init_proxy_locked(&mtx, t); 1203 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
1204 t->rcu_boost_mutex = &mtx; 1204 init_completion(&rnp->boost_completion);
1205 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1205 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1206 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1206 /* Lock only for side effect: boosts task t's priority. */
1207 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1207 rt_mutex_lock(&rnp->boost_mtx);
1208 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
1209
1210 /* Wait for boostee to be done w/boost_mtx before reinitializing. */
1211 wait_for_completion(&rnp->boost_completion);
1208 1212
1209 return ACCESS_ONCE(rnp->exp_tasks) != NULL || 1213 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1210 ACCESS_ONCE(rnp->boost_tasks) != NULL; 1214 ACCESS_ONCE(rnp->boost_tasks) != NULL;
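The rewritten rcu_boost() keeps the rt_mutex embedded in the rcu_node structure, so it must not reinitialize it for the next boost cycle until the boosted reader has completely finished with it; that is what the new boost_completion provides. A pthread sketch of the same "hand off an embedded object, wait for a completion before reusing it" pattern, with a condvar-based completion standing in for struct completion and every name invented for illustration:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_completion {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    bool done;
};

static struct toy_completion boost_completion = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false
};
static int boost_mtx;                   /* placeholder for the shared rt_mutex */

static void toy_complete(struct toy_completion *c)
{
    pthread_mutex_lock(&c->lock);
    c->done = true;
    pthread_cond_signal(&c->cond);
    pthread_mutex_unlock(&c->lock);
}

static void toy_wait_for_completion(struct toy_completion *c)
{
    pthread_mutex_lock(&c->lock);
    while (!c->done)
        pthread_cond_wait(&c->cond, &c->lock);
    pthread_mutex_unlock(&c->lock);
}

static void *boostee(void *arg)
{
    boost_mtx = 0;                      /* last touch of the shared object ("unlock") */
    toy_complete(&boost_completion);
    return arg;
}

int main(void)
{
    pthread_t t;

    boost_mtx = 1;                      /* "proxy locked" on behalf of the boostee */
    pthread_create(&t, NULL, boostee, NULL);
    toy_wait_for_completion(&boost_completion);
    boost_mtx = -1;                     /* now safe to reinitialize for the next boost */
    pthread_join(t, NULL);
    printf("boost_mtx=%d\n", boost_mtx);
    return 0;
}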
@@ -1256,6 +1260,7 @@ static int rcu_boost_kthread(void *arg)
1256 * about it going away. 1260 * about it going away.
1257 */ 1261 */
1258static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1262static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1263 __releases(rnp->lock)
1259{ 1264{
1260 struct task_struct *t; 1265 struct task_struct *t;
1261 1266
@@ -1491,6 +1496,7 @@ static void rcu_prepare_kthreads(int cpu)
1491#else /* #ifdef CONFIG_RCU_BOOST */ 1496#else /* #ifdef CONFIG_RCU_BOOST */
1492 1497
1493static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1498static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1499 __releases(rnp->lock)
1494{ 1500{
1495 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1501 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1496} 1502}
@@ -2060,6 +2066,22 @@ bool rcu_is_nocb_cpu(int cpu)
2060#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 2066#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
2061 2067
2062/* 2068/*
2069 * Kick the leader kthread for this NOCB group.
2070 */
2071static void wake_nocb_leader(struct rcu_data *rdp, bool force)
2072{
2073 struct rcu_data *rdp_leader = rdp->nocb_leader;
2074
2075 if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
2076 return;
2077 if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) {
2078 /* Prior xchg orders against prior callback enqueue. */
2079 ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true;
2080 wake_up(&rdp_leader->nocb_wq);
2081 }
2082}
2083
2084/*
2063 * Enqueue the specified string of rcu_head structures onto the specified 2085 * Enqueue the specified string of rcu_head structures onto the specified
2064 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the 2086 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2065 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy 2087 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
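wake_nocb_leader() above only wakes the group leader when its nocb_leader_wake flag is not already set (or when the caller forces it), which keeps a burst of callback enqueues from turning into a storm of wakeups. A userspace sketch of that handshake, with a mutex/condvar doing the ordering work that ACCESS_ONCE and the kernel wait queue provide; names are invented, build with -pthread:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool leader_wake;
static int wakeups;

static void toy_wake_leader(bool force)
{
    pthread_mutex_lock(&lock);
    if (!leader_wake || force) {
        leader_wake = true;
        wakeups++;
        pthread_cond_signal(&cond);
    }
    pthread_mutex_unlock(&lock);
}

static void *leader(void *arg)
{
    pthread_mutex_lock(&lock);
    while (!leader_wake)
        pthread_cond_wait(&cond, &lock);
    leader_wake = false;                /* consume the wakeup, go process callbacks */
    pthread_mutex_unlock(&lock);
    return arg;
}

int main(void)
{
    pthread_t t;

    toy_wake_leader(false);             /* first enqueue: records a wakeup */
    toy_wake_leader(false);             /* leader already marked awake: suppressed */
    pthread_create(&t, NULL, leader, NULL);
    pthread_join(t, NULL);
    printf("wakeups=%d\n", wakeups);    /* 1 */
    return 0;
}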
@@ -2093,7 +2115,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2093 len = atomic_long_read(&rdp->nocb_q_count); 2115 len = atomic_long_read(&rdp->nocb_q_count);
2094 if (old_rhpp == &rdp->nocb_head) { 2116 if (old_rhpp == &rdp->nocb_head) {
2095 if (!irqs_disabled_flags(flags)) { 2117 if (!irqs_disabled_flags(flags)) {
2096 wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */ 2118 /* ... if queue was empty ... */
2119 wake_nocb_leader(rdp, false);
2097 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2120 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2098 TPS("WakeEmpty")); 2121 TPS("WakeEmpty"));
2099 } else { 2122 } else {
@@ -2103,7 +2126,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2103 } 2126 }
2104 rdp->qlen_last_fqs_check = 0; 2127 rdp->qlen_last_fqs_check = 0;
2105 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2106 wake_up_process(t); /* ... or if many callbacks queued. */ 2129 /* ... or if many callbacks queued. */
2130 wake_nocb_leader(rdp, true);
2107 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2131 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2108 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); 2132 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2109 } else { 2133 } else {
@@ -2213,13 +2237,150 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2213} 2237}
2214 2238
2215/* 2239/*
2240 * Leaders come here to wait for additional callbacks to show up.
2241 * This function does not return until callbacks appear.
2242 */
2243static void nocb_leader_wait(struct rcu_data *my_rdp)
2244{
2245 bool firsttime = true;
2246 bool gotcbs;
2247 struct rcu_data *rdp;
2248 struct rcu_head **tail;
2249
2250wait_again:
2251
2252 /* Wait for callbacks to appear. */
2253 if (!rcu_nocb_poll) {
2254 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
2255 wait_event_interruptible(my_rdp->nocb_wq,
2256 ACCESS_ONCE(my_rdp->nocb_leader_wake));
2257 /* Memory barrier handled by smp_mb() calls below and repoll. */
2258 } else if (firsttime) {
2259 firsttime = false; /* Don't drown trace log with "Poll"! */
2260 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
2261 }
2262
2263 /*
2264 * Each pass through the following loop checks a follower for CBs.
2265 * We are our own first follower. Any CBs found are moved to
2266 * nocb_gp_head, where they await a grace period.
2267 */
2268 gotcbs = false;
2269 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2270 rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
2271 if (!rdp->nocb_gp_head)
2272 continue; /* No CBs here, try next follower. */
2273
2274 /* Move callbacks to wait-for-GP list, which is empty. */
2275 ACCESS_ONCE(rdp->nocb_head) = NULL;
2276 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2277 rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
2278 rdp->nocb_gp_count_lazy =
2279 atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2280 gotcbs = true;
2281 }
2282
2283 /*
2284 * If there were no callbacks, sleep a bit, rescan after a
2285 * memory barrier, and go retry.
2286 */
2287 if (unlikely(!gotcbs)) {
2288 if (!rcu_nocb_poll)
2289 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
2290 "WokeEmpty");
2291 flush_signals(current);
2292 schedule_timeout_interruptible(1);
2293
2294 /* Rescan in case we were a victim of memory ordering. */
2295 my_rdp->nocb_leader_wake = false;
2296 smp_mb(); /* Ensure _wake false before scan. */
2297 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
2298 if (ACCESS_ONCE(rdp->nocb_head)) {
2299 /* Found CB, so short-circuit next wait. */
2300 my_rdp->nocb_leader_wake = true;
2301 break;
2302 }
2303 goto wait_again;
2304 }
2305
2306 /* Wait for one grace period. */
2307 rcu_nocb_wait_gp(my_rdp);
2308
2309 /*
2310 * We left ->nocb_leader_wake set to reduce cache thrashing.
2311 * We clear it now, but recheck for new callbacks while
2312 * traversing our follower list.
2313 */
2314 my_rdp->nocb_leader_wake = false;
2315 smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */
2316
2317 /* Each pass through the following loop wakes a follower, if needed. */
2318 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2319 if (ACCESS_ONCE(rdp->nocb_head))
2320 my_rdp->nocb_leader_wake = true; /* No need to wait. */
2321 if (!rdp->nocb_gp_head)
2322 continue; /* No CBs, so no need to wake follower. */
2323
2324 /* Append callbacks to follower's "done" list. */
2325 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
2326 *tail = rdp->nocb_gp_head;
2327 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
2328 atomic_long_add(rdp->nocb_gp_count_lazy,
2329 &rdp->nocb_follower_count_lazy);
2330 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2331 /*
2332 * List was empty, wake up the follower.
2333 * Memory barriers supplied by atomic_long_add().
2334 */
2335 wake_up(&rdp->nocb_wq);
2336 }
2337 }
2338
2339 /* If we (the leader) don't have CBs, go wait some more. */
2340 if (!my_rdp->nocb_follower_head)
2341 goto wait_again;
2342}
2343
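
nocb_leader_wait() detaches each follower's pending callbacks in O(1): it NULLs ->nocb_head and xchg()s ->nocb_tail back to point at the now-empty head, keeping the old tail so the captured sublist can later be spliced onto the follower's done list. The head/indirect-tail queue shape behind that trick, shown single-threaded for clarity (the kernel relies on xchg() and ACCESS_ONCE() for ordering, which this sketch omits):

    #include <stdio.h>
    #include <stddef.h>

    struct cb {
        struct cb *next;
        int id;
    };

    struct cbq {
        struct cb  *head;
        struct cb **tail;    /* points at head, or at the last element's ->next */
    };

    static void cbq_init(struct cbq *q)
    {
        q->head = NULL;
        q->tail = &q->head;
    }

    static void cbq_enqueue(struct cbq *q, struct cb *c)
    {
        c->next  = NULL;
        *q->tail = c;          /* link after the current last element (or head) */
        q->tail  = &c->next;   /* tail now points at the new element's next     */
    }

    /* Detach the whole list in O(1), leaving the queue empty. */
    static struct cb *cbq_detach_all(struct cbq *q)
    {
        struct cb *list = q->head;

        q->head = NULL;
        q->tail = &q->head;    /* kernel: xchg(&rdp->nocb_tail, &rdp->nocb_head) */
        return list;
    }

    int main(void)
    {
        struct cbq q;
        struct cb a = { .id = 1 }, b = { .id = 2 };
        struct cb *c;

        cbq_init(&q);
        cbq_enqueue(&q, &a);
        cbq_enqueue(&q, &b);
        for (c = cbq_detach_all(&q); c; c = c->next)
            printf("cb %d\n", c->id);
        return 0;
    }
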
2344/*
2345 * Followers come here to wait for additional callbacks to show up.
2346 * This function does not return until callbacks appear.
2347 */
2348static void nocb_follower_wait(struct rcu_data *rdp)
2349{
2350 bool firsttime = true;
2351
2352 for (;;) {
2353 if (!rcu_nocb_poll) {
2354 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2355 "FollowerSleep");
2356 wait_event_interruptible(rdp->nocb_wq,
2357 ACCESS_ONCE(rdp->nocb_follower_head));
2358 } else if (firsttime) {
2359 /* Don't drown trace log with "Poll"! */
2360 firsttime = false;
2361 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
2362 }
2363 if (smp_load_acquire(&rdp->nocb_follower_head)) {
2364 /* ^^^ Ensure CB invocation follows _head test. */
2365 return;
2366 }
2367 if (!rcu_nocb_poll)
2368 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2369 "WokeEmpty");
2370 flush_signals(current);
2371 schedule_timeout_interruptible(1);
2372 }
2373}
2374
2375/*
2216 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes 2376 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2217 * callbacks queued by the corresponding no-CBs CPU. 2377 * callbacks queued by the corresponding no-CBs CPU, however, there is
2378 * an optional leader-follower relationship so that the grace-period
2379 * kthreads don't have to do quite so many wakeups.
2218 */ 2380 */
2219static int rcu_nocb_kthread(void *arg) 2381static int rcu_nocb_kthread(void *arg)
2220{ 2382{
2221 int c, cl; 2383 int c, cl;
2222 bool firsttime = 1;
2223 struct rcu_head *list; 2384 struct rcu_head *list;
2224 struct rcu_head *next; 2385 struct rcu_head *next;
2225 struct rcu_head **tail; 2386 struct rcu_head **tail;
@@ -2227,41 +2388,22 @@ static int rcu_nocb_kthread(void *arg)
2227 2388
2228 /* Each pass through this loop invokes one batch of callbacks */ 2389 /* Each pass through this loop invokes one batch of callbacks */
2229 for (;;) { 2390 for (;;) {
2230 /* If not polling, wait for next batch of callbacks. */ 2391 /* Wait for callbacks. */
2231 if (!rcu_nocb_poll) { 2392 if (rdp->nocb_leader == rdp)
2232 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2393 nocb_leader_wait(rdp);
2233 TPS("Sleep")); 2394 else
2234 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); 2395 nocb_follower_wait(rdp);
2235 /* Memory barrier provide by xchg() below. */ 2396
2236 } else if (firsttime) { 2397 /* Pull the ready-to-invoke callbacks onto local list. */
2237 firsttime = 0; 2398 list = ACCESS_ONCE(rdp->nocb_follower_head);
2238 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2399 BUG_ON(!list);
2239 TPS("Poll")); 2400 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
2240 } 2401 ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
2241 list = ACCESS_ONCE(rdp->nocb_head); 2402 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
2242 if (!list) { 2403 c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
2243 if (!rcu_nocb_poll) 2404 cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
2244 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2405 rdp->nocb_p_count += c;
2245 TPS("WokeEmpty")); 2406 rdp->nocb_p_count_lazy += cl;
2246 schedule_timeout_interruptible(1);
2247 flush_signals(current);
2248 continue;
2249 }
2250 firsttime = 1;
2251 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2252 TPS("WokeNonEmpty"));
2253
2254 /*
2255 * Extract queued callbacks, update counts, and wait
2256 * for a grace period to elapse.
2257 */
2258 ACCESS_ONCE(rdp->nocb_head) = NULL;
2259 tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2260 c = atomic_long_xchg(&rdp->nocb_q_count, 0);
2261 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2262 ACCESS_ONCE(rdp->nocb_p_count) += c;
2263 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2264 rcu_nocb_wait_gp(rdp);
2265 2407
2266 /* Each pass through the following loop invokes a callback. */ 2408 /* Each pass through the following loop invokes a callback. */
2267 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2409 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2305,7 +2447,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2305 if (!rcu_nocb_need_deferred_wakeup(rdp)) 2447 if (!rcu_nocb_need_deferred_wakeup(rdp))
2306 return; 2448 return;
2307 ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; 2449 ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
2308 wake_up(&rdp->nocb_wq); 2450 wake_nocb_leader(rdp, false);
2309 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); 2451 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
2310} 2452}
2311 2453
@@ -2314,19 +2456,57 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2314{ 2456{
2315 rdp->nocb_tail = &rdp->nocb_head; 2457 rdp->nocb_tail = &rdp->nocb_head;
2316 init_waitqueue_head(&rdp->nocb_wq); 2458 init_waitqueue_head(&rdp->nocb_wq);
2459 rdp->nocb_follower_tail = &rdp->nocb_follower_head;
2317} 2460}
2318 2461
2319/* Create a kthread for each RCU flavor for each no-CBs CPU. */ 2462/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
2463static int rcu_nocb_leader_stride = -1;
2464module_param(rcu_nocb_leader_stride, int, 0444);
2465
2466/*
2467 * Create a kthread for each RCU flavor for each no-CBs CPU.
2468 * Also initialize leader-follower relationships.
2469 */
2320static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2470static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2321{ 2471{
2322 int cpu; 2472 int cpu;
2473 int ls = rcu_nocb_leader_stride;
2474 int nl = 0; /* Next leader. */
2323 struct rcu_data *rdp; 2475 struct rcu_data *rdp;
2476 struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
2477 struct rcu_data *rdp_prev = NULL;
2324 struct task_struct *t; 2478 struct task_struct *t;
2325 2479
2326 if (rcu_nocb_mask == NULL) 2480 if (rcu_nocb_mask == NULL)
2327 return; 2481 return;
2482#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL)
2483 if (tick_nohz_full_running)
2484 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
2485#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */
2486 if (ls == -1) {
2487 ls = int_sqrt(nr_cpu_ids);
2488 rcu_nocb_leader_stride = ls;
2489 }
2490
2491 /*
2492 * Each pass through this loop sets up one rcu_data structure and
2493 * spawns one rcu_nocb_kthread().
2494 */
2328 for_each_cpu(cpu, rcu_nocb_mask) { 2495 for_each_cpu(cpu, rcu_nocb_mask) {
2329 rdp = per_cpu_ptr(rsp->rda, cpu); 2496 rdp = per_cpu_ptr(rsp->rda, cpu);
2497 if (rdp->cpu >= nl) {
2498 /* New leader, set up for followers & next leader. */
2499 nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
2500 rdp->nocb_leader = rdp;
2501 rdp_leader = rdp;
2502 } else {
2503 /* Another follower, link to previous leader. */
2504 rdp->nocb_leader = rdp_leader;
2505 rdp_prev->nocb_next_follower = rdp;
2506 }
2507 rdp_prev = rdp;
2508
2509 /* Spawn the kthread for this CPU. */
2330 t = kthread_run(rcu_nocb_kthread, rdp, 2510 t = kthread_run(rcu_nocb_kthread, rdp,
2331 "rcuo%c/%d", rsp->abbr, cpu); 2511 "rcuo%c/%d", rsp->abbr, cpu);
2332 BUG_ON(IS_ERR(t)); 2512 BUG_ON(IS_ERR(t));
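
The leader/follower wiring above carves the no-CBs CPUs into blocks of rcu_nocb_leader_stride consecutive CPU IDs (defaulting to roughly sqrt(nr_cpu_ids)); the first CPU at or past each block boundary becomes the leader, and later CPUs in the block chain onto it as followers. A quick user-space check of that boundary arithmetic, with a made-up CPU count:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    int main(void)
    {
        int nr_cpu_ids = 16;    /* made-up CPU count                   */
        int ls = 4;             /* int_sqrt(nr_cpu_ids) for 16 CPUs    */
        int nl = 0;             /* next leader boundary                */
        int cpu;

        for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
            if (cpu >= nl) {
                nl = DIV_ROUND_UP(cpu + 1, ls) * ls;
                printf("cpu %2d: leader (group runs up to cpu %d)\n", cpu, nl - 1);
            } else {
                printf("cpu %2d: follower\n", cpu);
            }
        }
        return 0;
    }

With 16 CPUs and a stride of 4, CPUs 0, 4, 8 and 12 come out as leaders, each followed by the next three CPU IDs.
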
@@ -2404,7 +2584,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2404 * if an adaptive-ticks CPU is failing to respond to the current grace 2584 * if an adaptive-ticks CPU is failing to respond to the current grace
2405 * period and has not been idle from an RCU perspective, kick it. 2585 * period and has not been idle from an RCU perspective, kick it.
2406 */ 2586 */
2407static void rcu_kick_nohz_cpu(int cpu) 2587static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
2408{ 2588{
2409#ifdef CONFIG_NO_HZ_FULL 2589#ifdef CONFIG_NO_HZ_FULL
2410 if (tick_nohz_full_cpu(cpu)) 2590 if (tick_nohz_full_cpu(cpu))
@@ -2843,12 +3023,16 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2843 */ 3023 */
2844static void rcu_bind_gp_kthread(void) 3024static void rcu_bind_gp_kthread(void)
2845{ 3025{
2846#ifdef CONFIG_NO_HZ_FULL 3026 int __maybe_unused cpu;
2847 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2848 3027
2849 if (cpu < 0 || cpu >= nr_cpu_ids) 3028 if (!tick_nohz_full_enabled())
2850 return; 3029 return;
2851 if (raw_smp_processor_id() != cpu) 3030#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
3031 cpu = tick_do_timer_cpu;
3032 if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu)
2852 set_cpus_allowed_ptr(current, cpumask_of(cpu)); 3033 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2853#endif /* #ifdef CONFIG_NO_HZ_FULL */ 3034#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3035 if (!is_housekeeping_cpu(raw_smp_processor_id()))
3036 housekeeping_affine(current);
3037#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
2854} 3038}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a2aeb4df0f60..4056d7992a6c 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -90,9 +90,6 @@ void __rcu_read_unlock(void)
90 } else { 90 } else {
91 barrier(); /* critical section before exit code. */ 91 barrier(); /* critical section before exit code. */
92 t->rcu_read_lock_nesting = INT_MIN; 92 t->rcu_read_lock_nesting = INT_MIN;
93#ifdef CONFIG_PROVE_RCU_DELAY
94 udelay(10); /* Make preemption more probable. */
95#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
96 barrier(); /* assign before ->rcu_read_unlock_special load */ 93 barrier(); /* assign before ->rcu_read_unlock_special load */
97 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 94 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
98 rcu_read_unlock_special(t); 95 rcu_read_unlock_special(t);
@@ -200,12 +197,12 @@ void wait_rcu_gp(call_rcu_func_t crf)
200EXPORT_SYMBOL_GPL(wait_rcu_gp); 197EXPORT_SYMBOL_GPL(wait_rcu_gp);
201 198
202#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 199#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
203static inline void debug_init_rcu_head(struct rcu_head *head) 200void init_rcu_head(struct rcu_head *head)
204{ 201{
205 debug_object_init(head, &rcuhead_debug_descr); 202 debug_object_init(head, &rcuhead_debug_descr);
206} 203}
207 204
208static inline void debug_rcu_head_free(struct rcu_head *head) 205void destroy_rcu_head(struct rcu_head *head)
209{ 206{
210 debug_object_free(head, &rcuhead_debug_descr); 207 debug_object_free(head, &rcuhead_debug_descr);
211} 208}
@@ -350,21 +347,3 @@ static int __init check_cpu_stall_init(void)
350early_initcall(check_cpu_stall_init); 347early_initcall(check_cpu_stall_init);
351 348
352#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 349#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
353
354/*
355 * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
356 */
357
358DEFINE_PER_CPU(int, rcu_cond_resched_count);
359
360/*
361 * Report a set of RCU quiescent states, for use by cond_resched()
362 * and friends. Out of line due to being called infrequently.
363 */
364void rcu_resched(void)
365{
366 preempt_disable();
367 __this_cpu_write(rcu_cond_resched_count, 0);
368 rcu_note_context_switch(smp_processor_id());
369 preempt_enable();
370}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bdf01b494fe..1211575a2208 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq)
139 return; 139 return;
140 140
141 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 141 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
142 if (delta < 0)
143 return;
142 rq->clock += delta; 144 rq->clock += delta;
143 update_rq_clock_task(rq, delta); 145 update_rq_clock_task(rq, delta);
144} 146}
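
The new delta < 0 test in update_rq_clock() simply discards readings where sched_clock_cpu() appears to have gone backwards rather than winding the runqueue clock back. The same clamp in isolation (timestamps invented):

    #include <stdio.h>
    #include <stdint.h>

    /* Advance *clock by (now - *clock), ignoring backwards jumps. */
    static void clock_advance(uint64_t *clock, uint64_t now)
    {
        int64_t delta = (int64_t)(now - *clock);

        if (delta < 0)           /* clock source went backwards: skip the update */
            return;
        *clock += (uint64_t)delta;
    }

    int main(void)
    {
        uint64_t clock = 1000;

        clock_advance(&clock, 1500);   /* forward reading is applied    */
        clock_advance(&clock, 1400);   /* backwards reading is ignored  */
        printf("clock = %llu\n", (unsigned long long)clock);   /* 1500  */
        return 0;
    }
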
@@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
243 char buf[64]; 245 char buf[64];
244 char *cmp; 246 char *cmp;
245 int i; 247 int i;
248 struct inode *inode;
246 249
247 if (cnt > 63) 250 if (cnt > 63)
248 cnt = 63; 251 cnt = 63;
@@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
253 buf[cnt] = 0; 256 buf[cnt] = 0;
254 cmp = strstrip(buf); 257 cmp = strstrip(buf);
255 258
259 /* Ensure the static_key remains in a consistent state */
260 inode = file_inode(filp);
261 mutex_lock(&inode->i_mutex);
256 i = sched_feat_set(cmp); 262 i = sched_feat_set(cmp);
263 mutex_unlock(&inode->i_mutex);
257 if (i == __SCHED_FEAT_NR) 264 if (i == __SCHED_FEAT_NR)
258 return -EINVAL; 265 return -EINVAL;
259 266
@@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p)
587#endif 594#endif
588 595
589/* 596/*
590 * resched_task - mark a task 'to be rescheduled now'. 597 * resched_curr - mark rq's current task 'to be rescheduled now'.
591 * 598 *
592 * On UP this means the setting of the need_resched flag, on SMP it 599 * On UP this means the setting of the need_resched flag, on SMP it
593 * might also involve a cross-CPU call to trigger the scheduler on 600 * might also involve a cross-CPU call to trigger the scheduler on
594 * the target CPU. 601 * the target CPU.
595 */ 602 */
596void resched_task(struct task_struct *p) 603void resched_curr(struct rq *rq)
597{ 604{
605 struct task_struct *curr = rq->curr;
598 int cpu; 606 int cpu;
599 607
600 lockdep_assert_held(&task_rq(p)->lock); 608 lockdep_assert_held(&rq->lock);
601 609
602 if (test_tsk_need_resched(p)) 610 if (test_tsk_need_resched(curr))
603 return; 611 return;
604 612
605 cpu = task_cpu(p); 613 cpu = cpu_of(rq);
606 614
607 if (cpu == smp_processor_id()) { 615 if (cpu == smp_processor_id()) {
608 set_tsk_need_resched(p); 616 set_tsk_need_resched(curr);
609 set_preempt_need_resched(); 617 set_preempt_need_resched();
610 return; 618 return;
611 } 619 }
612 620
613 if (set_nr_and_not_polling(p)) 621 if (set_nr_and_not_polling(curr))
614 smp_send_reschedule(cpu); 622 smp_send_reschedule(cpu);
615 else 623 else
616 trace_sched_wake_idle_without_ipi(cpu); 624 trace_sched_wake_idle_without_ipi(cpu);
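
resched_curr() operates on the runqueue the caller already holds: it fetches rq->curr itself, bails out if the need-resched flag is already set, and otherwise marks the task (kicking a remote, non-polling CPU with an IPI). A stripped-down mock of that shape, using toy structs rather than the kernel's:

    #include <stdio.h>
    #include <stdbool.h>

    struct task { bool need_resched; };
    struct rq   { struct task *curr; int cpu; };

    /* Sketch of the resched_curr() shape: act on rq->curr, not a caller-supplied task. */
    static void resched_curr(struct rq *rq)
    {
        struct task *curr = rq->curr;

        if (curr->need_resched)        /* already marked: nothing to do              */
            return;
        curr->need_resched = true;     /* on SMP, a remote rq->cpu would get an IPI  */
    }

    int main(void)
    {
        struct task t  = { .need_resched = false };
        struct rq   rq = { .curr = &t, .cpu = 0 };

        resched_curr(&rq);
        printf("need_resched = %d\n", t.need_resched);
        return 0;
    }
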
@@ -623,7 +631,7 @@ void resched_cpu(int cpu)
623 631
624 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 632 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
625 return; 633 return;
626 resched_task(cpu_curr(cpu)); 634 resched_curr(rq);
627 raw_spin_unlock_irqrestore(&rq->lock, flags); 635 raw_spin_unlock_irqrestore(&rq->lock, flags);
628} 636}
629 637
@@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu)
684 692
685static bool wake_up_full_nohz_cpu(int cpu) 693static bool wake_up_full_nohz_cpu(int cpu)
686{ 694{
695 /*
696 * We just need the target to call irq_exit() and re-evaluate
697 * the next tick. The nohz full kick at least implies that.
698 * If needed we can still optimize that later with an
699 * empty IRQ.
700 */
687 if (tick_nohz_full_cpu(cpu)) { 701 if (tick_nohz_full_cpu(cpu)) {
688 if (cpu != smp_processor_id() || 702 if (cpu != smp_processor_id() ||
689 tick_nohz_tick_stopped()) 703 tick_nohz_tick_stopped())
690 smp_send_reschedule(cpu); 704 tick_nohz_full_kick_cpu(cpu);
691 return true; 705 return true;
692 } 706 }
693 707
@@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void)
730#ifdef CONFIG_NO_HZ_FULL 744#ifdef CONFIG_NO_HZ_FULL
731bool sched_can_stop_tick(void) 745bool sched_can_stop_tick(void)
732{ 746{
733 struct rq *rq; 747 /*
734 748 * More than one running task need preemption.
735 rq = this_rq(); 749 * nr_running update is assumed to be visible
736 750 * after IPI is sent from wakers.
737 /* Make sure rq->nr_running update is visible after the IPI */ 751 */
738 smp_rmb(); 752 if (this_rq()->nr_running > 1)
739 753 return false;
740 /* More than one running task need preemption */
741 if (rq->nr_running > 1)
742 return false;
743 754
744 return true; 755 return true;
745} 756}
746#endif /* CONFIG_NO_HZ_FULL */ 757#endif /* CONFIG_NO_HZ_FULL */
747 758
@@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1022 if (class == rq->curr->sched_class) 1033 if (class == rq->curr->sched_class)
1023 break; 1034 break;
1024 if (class == p->sched_class) { 1035 if (class == p->sched_class) {
1025 resched_task(rq->curr); 1036 resched_curr(rq);
1026 break; 1037 break;
1027 } 1038 }
1028 } 1039 }
@@ -1568,9 +1579,7 @@ void scheduler_ipi(void)
1568 */ 1579 */
1569 preempt_fold_need_resched(); 1580 preempt_fold_need_resched();
1570 1581
1571 if (llist_empty(&this_rq()->wake_list) 1582 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1572 && !tick_nohz_full_cpu(smp_processor_id())
1573 && !got_nohz_idle_kick())
1574 return; 1583 return;
1575 1584
1576 /* 1585 /*
@@ -1587,7 +1596,6 @@ void scheduler_ipi(void)
1587 * somewhat pessimize the simple resched case. 1596 * somewhat pessimize the simple resched case.
1588 */ 1597 */
1589 irq_enter(); 1598 irq_enter();
1590 tick_nohz_full_check();
1591 sched_ttwu_pending(); 1599 sched_ttwu_pending();
1592 1600
1593 /* 1601 /*
@@ -2431,7 +2439,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2431{ 2439{
2432 u64 ns = 0; 2440 u64 ns = 0;
2433 2441
2434 if (task_current(rq, p)) { 2442 /*
2443 * Must be ->curr _and_ ->on_rq. If dequeued, we would
2444 * project cycles that may never be accounted to this
2445 * thread, breaking clock_gettime().
2446 */
2447 if (task_current(rq, p) && p->on_rq) {
2435 update_rq_clock(rq); 2448 update_rq_clock(rq);
2436 ns = rq_clock_task(rq) - p->se.exec_start; 2449 ns = rq_clock_task(rq) - p->se.exec_start;
2437 if ((s64)ns < 0) 2450 if ((s64)ns < 0)
@@ -2474,8 +2487,10 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2474 * If we race with it leaving cpu, we'll take a lock. So we're correct. 2487 * If we race with it leaving cpu, we'll take a lock. So we're correct.
2475 * If we race with it entering cpu, unaccounted time is 0. This is 2488 * If we race with it entering cpu, unaccounted time is 0. This is
2476 * indistinguishable from the read occurring a few cycles earlier. 2489 * indistinguishable from the read occurring a few cycles earlier.
2490 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
2491 * been accounted, so we're correct here as well.
2477 */ 2492 */
2478 if (!p->on_cpu) 2493 if (!p->on_cpu || !p->on_rq)
2479 return p->se.sum_exec_runtime; 2494 return p->se.sum_exec_runtime;
2480#endif 2495#endif
2481 2496
@@ -2971,7 +2986,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2971 } 2986 }
2972 2987
2973 trace_sched_pi_setprio(p, prio); 2988 trace_sched_pi_setprio(p, prio);
2974 p->pi_top_task = rt_mutex_get_top_task(p);
2975 oldprio = p->prio; 2989 oldprio = p->prio;
2976 prev_class = p->sched_class; 2990 prev_class = p->sched_class;
2977 on_rq = p->on_rq; 2991 on_rq = p->on_rq;
@@ -2991,8 +3005,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2991 * running task 3005 * running task
2992 */ 3006 */
2993 if (dl_prio(prio)) { 3007 if (dl_prio(prio)) {
2994 if (!dl_prio(p->normal_prio) || (p->pi_top_task && 3008 struct task_struct *pi_task = rt_mutex_get_top_task(p);
2995 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { 3009 if (!dl_prio(p->normal_prio) ||
3010 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
2996 p->dl.dl_boosted = 1; 3011 p->dl.dl_boosted = 1;
2997 p->dl.dl_throttled = 0; 3012 p->dl.dl_throttled = 0;
2998 enqueue_flag = ENQUEUE_REPLENISH; 3013 enqueue_flag = ENQUEUE_REPLENISH;
@@ -3064,7 +3079,7 @@ void set_user_nice(struct task_struct *p, long nice)
3064 * lowered its priority, then reschedule its CPU: 3079 * lowered its priority, then reschedule its CPU:
3065 */ 3080 */
3066 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3081 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3067 resched_task(rq->curr); 3082 resched_curr(rq);
3068 } 3083 }
3069out_unlock: 3084out_unlock:
3070 task_rq_unlock(rq, p, &flags); 3085 task_rq_unlock(rq, p, &flags);
@@ -3203,12 +3218,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3203 dl_se->dl_yielded = 0; 3218 dl_se->dl_yielded = 0;
3204} 3219}
3205 3220
3221/*
3222 * sched_setparam() passes in -1 for its policy, to let the functions
3223 * it calls know not to change it.
3224 */
3225#define SETPARAM_POLICY -1
3226
3206static void __setscheduler_params(struct task_struct *p, 3227static void __setscheduler_params(struct task_struct *p,
3207 const struct sched_attr *attr) 3228 const struct sched_attr *attr)
3208{ 3229{
3209 int policy = attr->sched_policy; 3230 int policy = attr->sched_policy;
3210 3231
3211 if (policy == -1) /* setparam */ 3232 if (policy == SETPARAM_POLICY)
3212 policy = p->policy; 3233 policy = p->policy;
3213 3234
3214 p->policy = policy; 3235 p->policy = policy;
@@ -3557,10 +3578,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
3557 .sched_nice = PRIO_TO_NICE(p->static_prio), 3578 .sched_nice = PRIO_TO_NICE(p->static_prio),
3558 }; 3579 };
3559 3580
3560 /* 3581 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
3561 * Fixup the legacy SCHED_RESET_ON_FORK hack 3582 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
3562 */
3563 if (policy & SCHED_RESET_ON_FORK) {
3564 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3583 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3565 policy &= ~SCHED_RESET_ON_FORK; 3584 policy &= ~SCHED_RESET_ON_FORK;
3566 attr.sched_policy = policy; 3585 attr.sched_policy = policy;
@@ -3730,7 +3749,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3730 */ 3749 */
3731SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3750SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3732{ 3751{
3733 return do_sched_setscheduler(pid, -1, param); 3752 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
3734} 3753}
3735 3754
3736/** 3755/**
@@ -4147,7 +4166,6 @@ static void __cond_resched(void)
4147 4166
4148int __sched _cond_resched(void) 4167int __sched _cond_resched(void)
4149{ 4168{
4150 rcu_cond_resched();
4151 if (should_resched()) { 4169 if (should_resched()) {
4152 __cond_resched(); 4170 __cond_resched();
4153 return 1; 4171 return 1;
@@ -4166,18 +4184,15 @@ EXPORT_SYMBOL(_cond_resched);
4166 */ 4184 */
4167int __cond_resched_lock(spinlock_t *lock) 4185int __cond_resched_lock(spinlock_t *lock)
4168{ 4186{
4169 bool need_rcu_resched = rcu_should_resched();
4170 int resched = should_resched(); 4187 int resched = should_resched();
4171 int ret = 0; 4188 int ret = 0;
4172 4189
4173 lockdep_assert_held(lock); 4190 lockdep_assert_held(lock);
4174 4191
4175 if (spin_needbreak(lock) || resched || need_rcu_resched) { 4192 if (spin_needbreak(lock) || resched) {
4176 spin_unlock(lock); 4193 spin_unlock(lock);
4177 if (resched) 4194 if (resched)
4178 __cond_resched(); 4195 __cond_resched();
4179 else if (unlikely(need_rcu_resched))
4180 rcu_resched();
4181 else 4196 else
4182 cpu_relax(); 4197 cpu_relax();
4183 ret = 1; 4198 ret = 1;
@@ -4191,7 +4206,6 @@ int __sched __cond_resched_softirq(void)
4191{ 4206{
4192 BUG_ON(!in_softirq()); 4207 BUG_ON(!in_softirq());
4193 4208
4194 rcu_cond_resched(); /* BH disabled OK, just recording QSes. */
4195 if (should_resched()) { 4209 if (should_resched()) {
4196 local_bh_enable(); 4210 local_bh_enable();
4197 __cond_resched(); 4211 __cond_resched();
@@ -4290,7 +4304,7 @@ again:
4290 * fairness. 4304 * fairness.
4291 */ 4305 */
4292 if (preempt && rq != p_rq) 4306 if (preempt && rq != p_rq)
4293 resched_task(p_rq->curr); 4307 resched_curr(p_rq);
4294 } 4308 }
4295 4309
4296out_unlock: 4310out_unlock:
@@ -6470,6 +6484,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6470 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6484 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6471 child->parent = sd; 6485 child->parent = sd;
6472 sd->child = child; 6486 sd->child = child;
6487
6488 if (!cpumask_subset(sched_domain_span(child),
6489 sched_domain_span(sd))) {
6490 pr_err("BUG: arch topology borken\n");
6491#ifdef CONFIG_SCHED_DEBUG
6492 pr_err(" the %s domain not a subset of the %s domain\n",
6493 child->name, sd->name);
6494#endif
6495 /* Fixup, ensure @sd has at least @child cpus. */
6496 cpumask_or(sched_domain_span(sd),
6497 sched_domain_span(sd),
6498 sched_domain_span(child));
6499 }
6500
6473 } 6501 }
6474 set_domain_attribute(sd, attr); 6502 set_domain_attribute(sd, attr);
6475 6503
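
The added sanity check verifies that every child domain's CPU span is contained in its parent's and, when the architecture topology is broken, widens the parent with cpumask_or() so the domain tree stays self-consistent. With small masks the subset test and fixup reduce to plain bit operations (example masks invented):

    #include <stdio.h>

    int main(void)
    {
        unsigned long child  = 0x0fUL;   /* CPUs 0-3                 */
        unsigned long parent = 0x0cUL;   /* CPUs 2-3 only: broken    */

        if ((child & parent) != child) {              /* !cpumask_subset()  */
            printf("BUG: child span not a subset of parent span\n");
            parent |= child;                          /* cpumask_or() fixup */
        }
        printf("parent span now 0x%lx\n", parent);    /* 0xf */
        return 0;
    }
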
@@ -7097,7 +7125,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7097 __setscheduler(rq, p, &attr); 7125 __setscheduler(rq, p, &attr);
7098 if (on_rq) { 7126 if (on_rq) {
7099 enqueue_task(rq, p, 0); 7127 enqueue_task(rq, p, 0);
7100 resched_task(rq->curr); 7128 resched_curr(rq);
7101 } 7129 }
7102 7130
7103 check_class_changed(rq, p, prev_class, old_prio); 7131 check_class_changed(rq, p, prev_class, old_prio);
@@ -7808,6 +7836,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7808 if (period > max_cfs_quota_period) 7836 if (period > max_cfs_quota_period)
7809 return -EINVAL; 7837 return -EINVAL;
7810 7838
7839 /*
7840 * Prevent race between setting of cfs_rq->runtime_enabled and
7841 * unthrottle_offline_cfs_rqs().
7842 */
7843 get_online_cpus();
7811 mutex_lock(&cfs_constraints_mutex); 7844 mutex_lock(&cfs_constraints_mutex);
7812 ret = __cfs_schedulable(tg, period, quota); 7845 ret = __cfs_schedulable(tg, period, quota);
7813 if (ret) 7846 if (ret)
@@ -7833,7 +7866,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7833 } 7866 }
7834 raw_spin_unlock_irq(&cfs_b->lock); 7867 raw_spin_unlock_irq(&cfs_b->lock);
7835 7868
7836 for_each_possible_cpu(i) { 7869 for_each_online_cpu(i) {
7837 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7870 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7838 struct rq *rq = cfs_rq->rq; 7871 struct rq *rq = cfs_rq->rq;
7839 7872
@@ -7849,6 +7882,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7849 cfs_bandwidth_usage_dec(); 7882 cfs_bandwidth_usage_dec();
7850out_unlock: 7883out_unlock:
7851 mutex_unlock(&cfs_constraints_mutex); 7884 mutex_unlock(&cfs_constraints_mutex);
7885 put_online_cpus();
7852 7886
7853 return ret; 7887 return ret;
7854} 7888}
@@ -8088,7 +8122,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
8088 .can_attach = cpu_cgroup_can_attach, 8122 .can_attach = cpu_cgroup_can_attach,
8089 .attach = cpu_cgroup_attach, 8123 .attach = cpu_cgroup_attach,
8090 .exit = cpu_cgroup_exit, 8124 .exit = cpu_cgroup_exit,
8091 .base_cftypes = cpu_files, 8125 .legacy_cftypes = cpu_files,
8092 .early_init = 1, 8126 .early_init = 1,
8093}; 8127};
8094 8128
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9cf350c94ec4..dd7cbb55bbf2 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
278struct cgroup_subsys cpuacct_cgrp_subsys = { 278struct cgroup_subsys cpuacct_cgrp_subsys = {
279 .css_alloc = cpuacct_css_alloc, 279 .css_alloc = cpuacct_css_alloc,
280 .css_free = cpuacct_css_free, 280 .css_free = cpuacct_css_free,
281 .base_cftypes = files, 281 .legacy_cftypes = files,
282 .early_init = 1, 282 .early_init = 1,
283}; 283};
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fc4f98b1258f..255ce138b652 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -306,7 +306,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
306 * the overrunning entity can't interfere with other entities in the system and 306 * the overrunning entity can't interfere with other entities in the system and
307 * can't make them miss their deadlines. Reasons why this kind of overrun 307 * can't make them miss their deadlines. Reasons why this kind of overrun
308 * could happen are, typically, an entity voluntarily trying to overcome its 308 * could happen are, typically, an entity voluntarily trying to overcome its
309 * runtime, or it just underestimated it during sched_setscheduler_ex(). 309 * runtime, or it just underestimated it during sched_setattr().
310 */ 310 */
311static void replenish_dl_entity(struct sched_dl_entity *dl_se, 311static void replenish_dl_entity(struct sched_dl_entity *dl_se,
312 struct sched_dl_entity *pi_se) 312 struct sched_dl_entity *pi_se)
@@ -535,7 +535,7 @@ again:
535 if (task_has_dl_policy(rq->curr)) 535 if (task_has_dl_policy(rq->curr))
536 check_preempt_curr_dl(rq, p, 0); 536 check_preempt_curr_dl(rq, p, 0);
537 else 537 else
538 resched_task(rq->curr); 538 resched_curr(rq);
539#ifdef CONFIG_SMP 539#ifdef CONFIG_SMP
540 /* 540 /*
541 * Queueing this task back might have overloaded rq, 541 * Queueing this task back might have overloaded rq,
@@ -634,7 +634,7 @@ static void update_curr_dl(struct rq *rq)
634 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); 634 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
635 635
636 if (!is_leftmost(curr, &rq->dl)) 636 if (!is_leftmost(curr, &rq->dl))
637 resched_task(curr); 637 resched_curr(rq);
638 } 638 }
639 639
640 /* 640 /*
@@ -964,7 +964,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
964 cpudl_find(&rq->rd->cpudl, p, NULL) != -1) 964 cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
965 return; 965 return;
966 966
967 resched_task(rq->curr); 967 resched_curr(rq);
968} 968}
969 969
970static int pull_dl_task(struct rq *this_rq); 970static int pull_dl_task(struct rq *this_rq);
@@ -979,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
979 int flags) 979 int flags)
980{ 980{
981 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { 981 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
982 resched_task(rq->curr); 982 resched_curr(rq);
983 return; 983 return;
984 } 984 }
985 985
@@ -1333,7 +1333,7 @@ retry:
1333 if (dl_task(rq->curr) && 1333 if (dl_task(rq->curr) &&
1334 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && 1334 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
1335 rq->curr->nr_cpus_allowed > 1) { 1335 rq->curr->nr_cpus_allowed > 1) {
1336 resched_task(rq->curr); 1336 resched_curr(rq);
1337 return 0; 1337 return 0;
1338 } 1338 }
1339 1339
@@ -1373,7 +1373,7 @@ retry:
1373 set_task_cpu(next_task, later_rq->cpu); 1373 set_task_cpu(next_task, later_rq->cpu);
1374 activate_task(later_rq, next_task, 0); 1374 activate_task(later_rq, next_task, 0);
1375 1375
1376 resched_task(later_rq->curr); 1376 resched_curr(later_rq);
1377 1377
1378 double_unlock_balance(rq, later_rq); 1378 double_unlock_balance(rq, later_rq);
1379 1379
@@ -1632,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1632 */ 1632 */
1633 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) && 1633 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
1634 rq->curr == p) 1634 rq->curr == p)
1635 resched_task(p); 1635 resched_curr(rq);
1636#else 1636#else
1637 /* 1637 /*
1638 * Again, we don't know if p has a earlier 1638 * Again, we don't know if p has a earlier
1639 * or later deadline, so let's blindly set a 1639 * or later deadline, so let's blindly set a
1640 * (maybe not needed) rescheduling point. 1640 * (maybe not needed) rescheduling point.
1641 */ 1641 */
1642 resched_task(p); 1642 resched_curr(rq);
1643#endif /* CONFIG_SMP */ 1643#endif /* CONFIG_SMP */
1644 } else 1644 } else
1645 switched_to_dl(rq, p); 1645 switched_to_dl(rq, p);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 695f9773bb60..627b3c34b821 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
608 608
609 avg_atom = p->se.sum_exec_runtime; 609 avg_atom = p->se.sum_exec_runtime;
610 if (nr_switches) 610 if (nr_switches)
611 do_div(avg_atom, nr_switches); 611 avg_atom = div64_ul(avg_atom, nr_switches);
612 else 612 else
613 avg_atom = -1LL; 613 avg_atom = -1LL;
614 614
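
do_div() divides a 64-bit value in place by a 32-bit divisor, so a 64-bit nr_switches whose low 32 bits happen to be zero would be truncated to a zero divisor; div64_ul() does a full 64-by-64 division and returns the quotient, which is what the average wants here. A user-space illustration of the truncation hazard (plain C division stands in for the kernel helpers):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t avg_atom    = 123456789ULL;
        uint64_t nr_switches = 1ULL << 32;     /* non-zero, but low 32 bits are 0 */

        uint32_t truncated = (uint32_t)nr_switches;
        printf("32-bit divisor would be %u\n", truncated);   /* 0: divide-by-zero */

        /* Full 64-by-64 division, as div64_ul() provides. */
        printf("avg_atom / nr_switches = %llu\n",
               (unsigned long long)(avg_atom / nr_switches));
        return 0;
    }
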
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fea7d3335e1f..bfa3c86d0d68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1062 if (!cpus) 1062 if (!cpus)
1063 return; 1063 return;
1064 1064
1065 ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
1066 ns->task_capacity = 1065 ns->task_capacity =
1067 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); 1066 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
1068 ns->has_free_capacity = (ns->nr_running < ns->task_capacity); 1067 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env,
1096 env->best_cpu = env->dst_cpu; 1095 env->best_cpu = env->dst_cpu;
1097} 1096}
1098 1097
1099static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, 1098static bool load_too_imbalanced(long src_load, long dst_load,
1100 long src_load, long dst_load,
1101 struct task_numa_env *env) 1099 struct task_numa_env *env)
1102{ 1100{
1103 long imb, old_imb; 1101 long imb, old_imb;
1102 long orig_src_load, orig_dst_load;
1103 long src_capacity, dst_capacity;
1104
1105 /*
1106 * The load is corrected for the CPU capacity available on each node.
1107 *
1108 * src_load dst_load
1109 * ------------ vs ---------
1110 * src_capacity dst_capacity
1111 */
1112 src_capacity = env->src_stats.compute_capacity;
1113 dst_capacity = env->dst_stats.compute_capacity;
1104 1114
1105 /* We care about the slope of the imbalance, not the direction. */ 1115 /* We care about the slope of the imbalance, not the direction. */
1106 if (dst_load < src_load) 1116 if (dst_load < src_load)
1107 swap(dst_load, src_load); 1117 swap(dst_load, src_load);
1108 1118
1109 /* Is the difference below the threshold? */ 1119 /* Is the difference below the threshold? */
1110 imb = dst_load * 100 - src_load * env->imbalance_pct; 1120 imb = dst_load * src_capacity * 100 -
1121 src_load * dst_capacity * env->imbalance_pct;
1111 if (imb <= 0) 1122 if (imb <= 0)
1112 return false; 1123 return false;
1113 1124
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
1115 * The imbalance is above the allowed threshold. 1126 * The imbalance is above the allowed threshold.
1116 * Compare it with the old imbalance. 1127 * Compare it with the old imbalance.
1117 */ 1128 */
1129 orig_src_load = env->src_stats.load;
1130 orig_dst_load = env->dst_stats.load;
1131
1118 if (orig_dst_load < orig_src_load) 1132 if (orig_dst_load < orig_src_load)
1119 swap(orig_dst_load, orig_src_load); 1133 swap(orig_dst_load, orig_src_load);
1120 1134
1121 old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; 1135 old_imb = orig_dst_load * src_capacity * 100 -
1136 orig_src_load * dst_capacity * env->imbalance_pct;
1122 1137
1123 /* Would this change make things worse? */ 1138 /* Would this change make things worse? */
1124 return (imb > old_imb); 1139 return (imb > old_imb);
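
The rewritten test compares the capacity-normalized loads, src_load/src_capacity against dst_load/dst_capacity, without dividing: both sides are cross-multiplied and the imbalance_pct margin folded in. Plugging in some invented numbers shows the mechanics:

    #include <stdio.h>

    int main(void)
    {
        long src_load = 400, dst_load = 600;
        long src_capacity = 1024, dst_capacity = 2048;   /* dst node is beefier */
        long imbalance_pct = 125;                        /* 25% slack           */
        long imb;

        if (dst_load < src_load) {          /* care about the slope, not direction */
            long t = dst_load; dst_load = src_load; src_load = t;
        }

        /* dst_load/dst_capacity vs src_load/src_capacity, cross-multiplied. */
        imb = dst_load * src_capacity * 100 -
              src_load * dst_capacity * imbalance_pct;

        printf("imb = %ld -> %s\n", imb,
               imb <= 0 ? "within threshold" : "too imbalanced");
        return 0;
    }

Here the destination node has twice the capacity, so 600 units of load there is relatively lighter than 400 on the source, and the comparison comes out negative (within threshold).
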
@@ -1136,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env,
1136 struct rq *src_rq = cpu_rq(env->src_cpu); 1151 struct rq *src_rq = cpu_rq(env->src_cpu);
1137 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1152 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1138 struct task_struct *cur; 1153 struct task_struct *cur;
1139 long orig_src_load, src_load; 1154 long src_load, dst_load;
1140 long orig_dst_load, dst_load;
1141 long load; 1155 long load;
1142 long imp = (groupimp > 0) ? groupimp : taskimp; 1156 long imp = env->p->numa_group ? groupimp : taskimp;
1157 long moveimp = imp;
1143 1158
1144 rcu_read_lock(); 1159 rcu_read_lock();
1145 cur = ACCESS_ONCE(dst_rq->curr); 1160 cur = ACCESS_ONCE(dst_rq->curr);
@@ -1177,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,
1177 * itself (not part of a group), use the task weight 1192 * itself (not part of a group), use the task weight
1178 * instead. 1193 * instead.
1179 */ 1194 */
1180 if (env->p->numa_group)
1181 imp = groupimp;
1182 else
1183 imp = taskimp;
1184
1185 if (cur->numa_group) 1195 if (cur->numa_group)
1186 imp += group_weight(cur, env->src_nid) - 1196 imp += group_weight(cur, env->src_nid) -
1187 group_weight(cur, env->dst_nid); 1197 group_weight(cur, env->dst_nid);
@@ -1191,7 +1201,7 @@ static void task_numa_compare(struct task_numa_env *env,
1191 } 1201 }
1192 } 1202 }
1193 1203
1194 if (imp < env->best_imp) 1204 if (imp <= env->best_imp && moveimp <= env->best_imp)
1195 goto unlock; 1205 goto unlock;
1196 1206
1197 if (!cur) { 1207 if (!cur) {
@@ -1204,20 +1214,34 @@ static void task_numa_compare(struct task_numa_env *env,
1204 } 1214 }
1205 1215
1206 /* Balance doesn't matter much if we're running a task per cpu */ 1216 /* Balance doesn't matter much if we're running a task per cpu */
1207 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) 1217 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1218 dst_rq->nr_running == 1)
1208 goto assign; 1219 goto assign;
1209 1220
1210 /* 1221 /*
1211 * In the overloaded case, try and keep the load balanced. 1222 * In the overloaded case, try and keep the load balanced.
1212 */ 1223 */
1213balance: 1224balance:
1214 orig_dst_load = env->dst_stats.load;
1215 orig_src_load = env->src_stats.load;
1216
1217 /* XXX missing capacity terms */
1218 load = task_h_load(env->p); 1225 load = task_h_load(env->p);
1219 dst_load = orig_dst_load + load; 1226 dst_load = env->dst_stats.load + load;
1220 src_load = orig_src_load - load; 1227 src_load = env->src_stats.load - load;
1228
1229 if (moveimp > imp && moveimp > env->best_imp) {
1230 /*
1231 * If the improvement from just moving env->p direction is
1232 * better than swapping tasks around, check if a move is
1233 * possible. Store a slightly smaller score than moveimp,
1234 * so an actually idle CPU will win.
1235 */
1236 if (!load_too_imbalanced(src_load, dst_load, env)) {
1237 imp = moveimp - 1;
1238 cur = NULL;
1239 goto assign;
1240 }
1241 }
1242
1243 if (imp <= env->best_imp)
1244 goto unlock;
1221 1245
1222 if (cur) { 1246 if (cur) {
1223 load = task_h_load(cur); 1247 load = task_h_load(cur);
@@ -1225,8 +1249,7 @@ balance:
1225 src_load += load; 1249 src_load += load;
1226 } 1250 }
1227 1251
1228 if (load_too_imbalanced(orig_src_load, orig_dst_load, 1252 if (load_too_imbalanced(src_load, dst_load, env))
1229 src_load, dst_load, env))
1230 goto unlock; 1253 goto unlock;
1231 1254
1232assign: 1255assign:
@@ -1302,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p)
1302 groupimp = group_weight(p, env.dst_nid) - groupweight; 1325 groupimp = group_weight(p, env.dst_nid) - groupweight;
1303 update_numa_stats(&env.dst_stats, env.dst_nid); 1326 update_numa_stats(&env.dst_stats, env.dst_nid);
1304 1327
1305 /* If the preferred nid has free capacity, try to use it. */ 1328 /* Try to find a spot on the preferred nid. */
1306 if (env.dst_stats.has_free_capacity) 1329 task_numa_find_cpu(&env, taskimp, groupimp);
1307 task_numa_find_cpu(&env, taskimp, groupimp);
1308 1330
1309 /* No space available on the preferred nid. Look elsewhere. */ 1331 /* No space available on the preferred nid. Look elsewhere. */
1310 if (env.best_cpu == -1) { 1332 if (env.best_cpu == -1) {
@@ -1324,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p)
1324 } 1346 }
1325 } 1347 }
1326 1348
1327 /* No better CPU than the current one was found. */
1328 if (env.best_cpu == -1)
1329 return -EAGAIN;
1330
1331 /* 1349 /*
1332 * If the task is part of a workload that spans multiple NUMA nodes, 1350 * If the task is part of a workload that spans multiple NUMA nodes,
1333 * and is migrating into one of the workload's active nodes, remember 1351 * and is migrating into one of the workload's active nodes, remember
@@ -1336,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p)
1336 * A task that migrated to a second choice node will be better off 1354 * A task that migrated to a second choice node will be better off
1337 * trying for a better one later. Do not set the preferred node here. 1355 * trying for a better one later. Do not set the preferred node here.
1338 */ 1356 */
1339 if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) 1357 if (p->numa_group) {
1340 sched_setnuma(p, env.dst_nid); 1358 if (env.best_cpu == -1)
1359 nid = env.src_nid;
1360 else
1361 nid = env.dst_nid;
1362
1363 if (node_isset(nid, p->numa_group->active_nodes))
1364 sched_setnuma(p, env.dst_nid);
1365 }
1366
1367 /* No better CPU than the current one was found. */
1368 if (env.best_cpu == -1)
1369 return -EAGAIN;
1341 1370
1342 /* 1371 /*
1343 * Reset the scan period if the task is being rescheduled on an 1372 * Reset the scan period if the task is being rescheduled on an
@@ -1415,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
1415/* 1444/*
1416 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 1445 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1417 * increments. The more local the fault statistics are, the higher the scan 1446 * increments. The more local the fault statistics are, the higher the scan
1418 * period will be for the next scan window. If local/remote ratio is below 1447 * period will be for the next scan window. If local/(local+remote) ratio is
1419 * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the 1448 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1420 * scan period will decrease 1449 * the scan period will decrease. Aim for 70% local accesses.
1421 */ 1450 */
1422#define NUMA_PERIOD_SLOTS 10 1451#define NUMA_PERIOD_SLOTS 10
1423#define NUMA_PERIOD_THRESHOLD 3 1452#define NUMA_PERIOD_THRESHOLD 7
1424 1453
1425/* 1454/*
1426 * Increase the scan period (slow down scanning) if the majority of 1455 * Increase the scan period (slow down scanning) if the majority of
@@ -1595,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p)
1595 1624
1596 if (p->numa_group) { 1625 if (p->numa_group) {
1597 update_numa_active_node_mask(p->numa_group); 1626 update_numa_active_node_mask(p->numa_group);
1598 /*
1599 * If the preferred task and group nids are different,
1600 * iterate over the nodes again to find the best place.
1601 */
1602 if (max_nid != max_group_nid) {
1603 unsigned long weight, max_weight = 0;
1604
1605 for_each_online_node(nid) {
1606 weight = task_weight(p, nid) + group_weight(p, nid);
1607 if (weight > max_weight) {
1608 max_weight = weight;
1609 max_nid = nid;
1610 }
1611 }
1612 }
1613
1614 spin_unlock_irq(group_lock); 1627 spin_unlock_irq(group_lock);
1628 max_nid = max_group_nid;
1615 } 1629 }
1616 1630
1617 /* Preferred node as the node with the most faults */ 1631 if (max_faults) {
1618 if (max_faults && max_nid != p->numa_preferred_nid) { 1632 /* Set the new preferred node */
1619 /* Update the preferred nid and migrate task if possible */ 1633 if (max_nid != p->numa_preferred_nid)
1620 sched_setnuma(p, max_nid); 1634 sched_setnuma(p, max_nid);
1621 numa_migrate_preferred(p); 1635
1636 if (task_node(p) != p->numa_preferred_nid)
1637 numa_migrate_preferred(p);
1622 } 1638 }
1623} 1639}
1624 1640
@@ -2899,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2899 ideal_runtime = sched_slice(cfs_rq, curr); 2915 ideal_runtime = sched_slice(cfs_rq, curr);
2900 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 2916 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2901 if (delta_exec > ideal_runtime) { 2917 if (delta_exec > ideal_runtime) {
2902 resched_task(rq_of(cfs_rq)->curr); 2918 resched_curr(rq_of(cfs_rq));
2903 /* 2919 /*
2904 * The current task ran long enough, ensure it doesn't get 2920 * The current task ran long enough, ensure it doesn't get
2905 * re-elected due to buddy favours. 2921 * re-elected due to buddy favours.
@@ -2923,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2923 return; 2939 return;
2924 2940
2925 if (delta > ideal_runtime) 2941 if (delta > ideal_runtime)
2926 resched_task(rq_of(cfs_rq)->curr); 2942 resched_curr(rq_of(cfs_rq));
2927} 2943}
2928 2944
2929static void 2945static void
@@ -3063,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3063 * validating it and just reschedule. 3079 * validating it and just reschedule.
3064 */ 3080 */
3065 if (queued) { 3081 if (queued) {
3066 resched_task(rq_of(cfs_rq)->curr); 3082 resched_curr(rq_of(cfs_rq));
3067 return; 3083 return;
3068 } 3084 }
3069 /* 3085 /*
@@ -3254,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3254 * hierarchy can be throttled 3270 * hierarchy can be throttled
3255 */ 3271 */
3256 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) 3272 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3257 resched_task(rq_of(cfs_rq)->curr); 3273 resched_curr(rq_of(cfs_rq));
3258} 3274}
3259 3275
3260static __always_inline 3276static __always_inline
@@ -3360,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3360 cfs_rq->throttled = 1; 3376 cfs_rq->throttled = 1;
3361 cfs_rq->throttled_clock = rq_clock(rq); 3377 cfs_rq->throttled_clock = rq_clock(rq);
3362 raw_spin_lock(&cfs_b->lock); 3378 raw_spin_lock(&cfs_b->lock);
3363 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3379 /*
3380 * Add to the _head_ of the list, so that an already-started
3381 * distribute_cfs_runtime will not see us
3382 */
3383 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3364 if (!cfs_b->timer_active) 3384 if (!cfs_b->timer_active)
3365 __start_cfs_bandwidth(cfs_b, false); 3385 __start_cfs_bandwidth(cfs_b, false);
3366 raw_spin_unlock(&cfs_b->lock); 3386 raw_spin_unlock(&cfs_b->lock);
@@ -3410,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3410 3430
3411 /* determine whether we need to wake up potentially idle cpu */ 3431 /* determine whether we need to wake up potentially idle cpu */
3412 if (rq->curr == rq->idle && rq->cfs.nr_running) 3432 if (rq->curr == rq->idle && rq->cfs.nr_running)
3413 resched_task(rq->curr); 3433 resched_curr(rq);
3414} 3434}
3415 3435
3416static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, 3436static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3417 u64 remaining, u64 expires) 3437 u64 remaining, u64 expires)
3418{ 3438{
3419 struct cfs_rq *cfs_rq; 3439 struct cfs_rq *cfs_rq;
3420 u64 runtime = remaining; 3440 u64 runtime;
3441 u64 starting_runtime = remaining;
3421 3442
3422 rcu_read_lock(); 3443 rcu_read_lock();
3423 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, 3444 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -3448,7 +3469,7 @@ next:
3448 } 3469 }
3449 rcu_read_unlock(); 3470 rcu_read_unlock();
3450 3471
3451 return remaining; 3472 return starting_runtime - remaining;
3452} 3473}
3453 3474
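
distribute_cfs_runtime() now reports how much runtime it actually handed out, and the period and slack timers in the hunks that follow charge that amount against the pool with cfs_b->runtime -= min(runtime, cfs_b->runtime), so the limited over-distribution described in the comment there can never underflow the unsigned counter. The saturating subtraction in isolation (values invented):

    #include <stdio.h>
    #include <stdint.h>

    #define min(a, b)  ((a) < (b) ? (a) : (b))

    int main(void)
    {
        uint64_t pool        = 300;   /* cfs_b->runtime            */
        uint64_t distributed = 500;   /* more than the pool holds  */

        pool -= min(distributed, pool);   /* saturates at 0 instead of wrapping */
        printf("remaining runtime = %llu\n", (unsigned long long)pool);
        return 0;
    }
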
3454/* 3475/*
@@ -3494,22 +3515,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3494 /* account preceding periods in which throttling occurred */ 3515 /* account preceding periods in which throttling occurred */
3495 cfs_b->nr_throttled += overrun; 3516 cfs_b->nr_throttled += overrun;
3496 3517
3497 /*
3498 * There are throttled entities so we must first use the new bandwidth
3499 * to unthrottle them before making it generally available. This
3500 * ensures that all existing debts will be paid before a new cfs_rq is
3501 * allowed to run.
3502 */
3503 runtime = cfs_b->runtime;
3504 runtime_expires = cfs_b->runtime_expires; 3518 runtime_expires = cfs_b->runtime_expires;
3505 cfs_b->runtime = 0;
3506 3519
3507 /* 3520 /*
3508 * This check is repeated as we are holding onto the new bandwidth 3521 * This check is repeated as we are holding onto the new bandwidth while
3509 * while we unthrottle. This can potentially race with an unthrottled 3522 * we unthrottle. This can potentially race with an unthrottled group
3510 * group trying to acquire new bandwidth from the global pool. 3523 * trying to acquire new bandwidth from the global pool. This can result
3524 * in us over-using our runtime if it is all used during this loop, but
3525 * only by limited amounts in that extreme case.
3511 */ 3526 */
3512 while (throttled && runtime > 0) { 3527 while (throttled && cfs_b->runtime > 0) {
3528 runtime = cfs_b->runtime;
3513 raw_spin_unlock(&cfs_b->lock); 3529 raw_spin_unlock(&cfs_b->lock);
3514 /* we can't nest cfs_b->lock while distributing bandwidth */ 3530 /* we can't nest cfs_b->lock while distributing bandwidth */
3515 runtime = distribute_cfs_runtime(cfs_b, runtime, 3531 runtime = distribute_cfs_runtime(cfs_b, runtime,
@@ -3517,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3517 raw_spin_lock(&cfs_b->lock); 3533 raw_spin_lock(&cfs_b->lock);
3518 3534
3519 throttled = !list_empty(&cfs_b->throttled_cfs_rq); 3535 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3536
3537 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3520 } 3538 }
3521 3539
3522 /* return (any) remaining runtime */
3523 cfs_b->runtime = runtime;
3524 /* 3540 /*
3525 * While we are ensured activity in the period following an 3541 * While we are ensured activity in the period following an
3526 * unthrottle, this also covers the case in which the new bandwidth is 3542 * unthrottle, this also covers the case in which the new bandwidth is
@@ -3631,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3631 return; 3647 return;
3632 } 3648 }
3633 3649
3634 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3650 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
3635 runtime = cfs_b->runtime; 3651 runtime = cfs_b->runtime;
3636 cfs_b->runtime = 0; 3652
3637 }
3638 expires = cfs_b->runtime_expires; 3653 expires = cfs_b->runtime_expires;
3639 raw_spin_unlock(&cfs_b->lock); 3654 raw_spin_unlock(&cfs_b->lock);
3640 3655
@@ -3645,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3645 3660
3646 raw_spin_lock(&cfs_b->lock); 3661 raw_spin_lock(&cfs_b->lock);
3647 if (expires == cfs_b->runtime_expires) 3662 if (expires == cfs_b->runtime_expires)
3648 cfs_b->runtime = runtime; 3663 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3649 raw_spin_unlock(&cfs_b->lock); 3664 raw_spin_unlock(&cfs_b->lock);
3650} 3665}
3651 3666
@@ -3775,6 +3790,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3775 hrtimer_cancel(&cfs_b->slack_timer); 3790 hrtimer_cancel(&cfs_b->slack_timer);
3776} 3791}
3777 3792
3793static void __maybe_unused update_runtime_enabled(struct rq *rq)
3794{
3795 struct cfs_rq *cfs_rq;
3796
3797 for_each_leaf_cfs_rq(rq, cfs_rq) {
3798 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
3799
3800 raw_spin_lock(&cfs_b->lock);
3801 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
3802 raw_spin_unlock(&cfs_b->lock);
3803 }
3804}
3805
3778static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) 3806static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3779{ 3807{
3780 struct cfs_rq *cfs_rq; 3808 struct cfs_rq *cfs_rq;
@@ -3788,6 +3816,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3788 * there's some valid quota amount 3816 * there's some valid quota amount
3789 */ 3817 */
3790 cfs_rq->runtime_remaining = 1; 3818 cfs_rq->runtime_remaining = 1;
3819 /*
3820 * Offline rq is schedulable till cpu is completely disabled
3821 * in take_cpu_down(), so we prevent new cfs throttling here.
3822 */
3823 cfs_rq->runtime_enabled = 0;
3824
3791 if (cfs_rq_throttled(cfs_rq)) 3825 if (cfs_rq_throttled(cfs_rq))
3792 unthrottle_cfs_rq(cfs_rq); 3826 unthrottle_cfs_rq(cfs_rq);
3793 } 3827 }
@@ -3831,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3831 return NULL; 3865 return NULL;
3832} 3866}
3833static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} 3867static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3868static inline void update_runtime_enabled(struct rq *rq) {}
3834static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} 3869static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
3835 3870
3836#endif /* CONFIG_CFS_BANDWIDTH */ 3871#endif /* CONFIG_CFS_BANDWIDTH */
@@ -3854,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3854 3889
3855 if (delta < 0) { 3890 if (delta < 0) {
3856 if (rq->curr == p) 3891 if (rq->curr == p)
3857 resched_task(p); 3892 resched_curr(rq);
3858 return; 3893 return;
3859 } 3894 }
3860 3895
@@ -4723,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
4723 return; 4758 return;
4724 4759
4725preempt: 4760preempt:
4726 resched_task(curr); 4761 resched_curr(rq);
4727 /* 4762 /*
4728 * Only set the backward buddy when the current task is still 4763 * Only set the backward buddy when the current task is still
4729 * on the rq. This can happen when a wakeup gets interleaved 4764 * on the rq. This can happen when a wakeup gets interleaved
@@ -5094,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
5094/* 5129/*
5095 * Is this task likely cache-hot: 5130 * Is this task likely cache-hot:
5096 */ 5131 */
5097static int 5132static int task_hot(struct task_struct *p, struct lb_env *env)
5098task_hot(struct task_struct *p, u64 now)
5099{ 5133{
5100 s64 delta; 5134 s64 delta;
5101 5135
@@ -5108,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now)
5108 /* 5142 /*
5109 * Buddy candidates are cache hot: 5143 * Buddy candidates are cache hot:
5110 */ 5144 */
5111 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 5145 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
5112 (&p->se == cfs_rq_of(&p->se)->next || 5146 (&p->se == cfs_rq_of(&p->se)->next ||
5113 &p->se == cfs_rq_of(&p->se)->last)) 5147 &p->se == cfs_rq_of(&p->se)->last))
5114 return 1; 5148 return 1;
@@ -5118,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now)
5118 if (sysctl_sched_migration_cost == 0) 5152 if (sysctl_sched_migration_cost == 0)
5119 return 0; 5153 return 0;
5120 5154
5121 delta = now - p->se.exec_start; 5155 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
5122 5156
5123 return delta < (s64)sysctl_sched_migration_cost; 5157 return delta < (s64)sysctl_sched_migration_cost;
5124} 5158}
@@ -5272,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5272 * 2) task is cache cold, or 5306 * 2) task is cache cold, or
5273 * 3) too many balance attempts have failed. 5307 * 3) too many balance attempts have failed.
5274 */ 5308 */
5275 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); 5309 tsk_cache_hot = task_hot(p, env);
5276 if (!tsk_cache_hot) 5310 if (!tsk_cache_hot)
5277 tsk_cache_hot = migrate_degrades_locality(p, env); 5311 tsk_cache_hot = migrate_degrades_locality(p, env);
5278 5312
@@ -5864,10 +5898,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
5864 * @load_idx: Load index of sched_domain of this_cpu for load calc. 5898 * @load_idx: Load index of sched_domain of this_cpu for load calc.
5865 * @local_group: Does group contain this_cpu. 5899 * @local_group: Does group contain this_cpu.
5866 * @sgs: variable to hold the statistics for this group. 5900 * @sgs: variable to hold the statistics for this group.
5901 * @overload: Indicate more than one runnable task for any CPU.
5867 */ 5902 */
5868static inline void update_sg_lb_stats(struct lb_env *env, 5903static inline void update_sg_lb_stats(struct lb_env *env,
5869 struct sched_group *group, int load_idx, 5904 struct sched_group *group, int load_idx,
5870 int local_group, struct sg_lb_stats *sgs) 5905 int local_group, struct sg_lb_stats *sgs,
5906 bool *overload)
5871{ 5907{
5872 unsigned long load; 5908 unsigned long load;
5873 int i; 5909 int i;
@@ -5885,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5885 5921
5886 sgs->group_load += load; 5922 sgs->group_load += load;
5887 sgs->sum_nr_running += rq->nr_running; 5923 sgs->sum_nr_running += rq->nr_running;
5924
5925 if (rq->nr_running > 1)
5926 *overload = true;
5927
5888#ifdef CONFIG_NUMA_BALANCING 5928#ifdef CONFIG_NUMA_BALANCING
5889 sgs->nr_numa_running += rq->nr_numa_running; 5929 sgs->nr_numa_running += rq->nr_numa_running;
5890 sgs->nr_preferred_running += rq->nr_preferred_running; 5930 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5995,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5995 struct sched_group *sg = env->sd->groups; 6035 struct sched_group *sg = env->sd->groups;
5996 struct sg_lb_stats tmp_sgs; 6036 struct sg_lb_stats tmp_sgs;
5997 int load_idx, prefer_sibling = 0; 6037 int load_idx, prefer_sibling = 0;
6038 bool overload = false;
5998 6039
5999 if (child && child->flags & SD_PREFER_SIBLING) 6040 if (child && child->flags & SD_PREFER_SIBLING)
6000 prefer_sibling = 1; 6041 prefer_sibling = 1;
@@ -6015,7 +6056,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6015 update_group_capacity(env->sd, env->dst_cpu); 6056 update_group_capacity(env->sd, env->dst_cpu);
6016 } 6057 }
6017 6058
6018 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 6059 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6060 &overload);
6019 6061
6020 if (local_group) 6062 if (local_group)
6021 goto next_group; 6063 goto next_group;
@@ -6049,6 +6091,13 @@ next_group:
6049 6091
6050 if (env->sd->flags & SD_NUMA) 6092 if (env->sd->flags & SD_NUMA)
6051 env->fbq_type = fbq_classify_group(&sds->busiest_stat); 6093 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6094
6095 if (!env->sd->parent) {
6096 /* update overload indicator if we are at root domain */
6097 if (env->dst_rq->rd->overload != overload)
6098 env->dst_rq->rd->overload = overload;
6099 }
6100
6052} 6101}
6053 6102
6054/** 6103/**
@@ -6767,7 +6816,8 @@ static int idle_balance(struct rq *this_rq)
6767 */ 6816 */
6768 this_rq->idle_stamp = rq_clock(this_rq); 6817 this_rq->idle_stamp = rq_clock(this_rq);
6769 6818
6770 if (this_rq->avg_idle < sysctl_sched_migration_cost) { 6819 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
6820 !this_rq->rd->overload) {
6771 rcu_read_lock(); 6821 rcu_read_lock();
6772 sd = rcu_dereference_check_sched_domain(this_rq->sd); 6822 sd = rcu_dereference_check_sched_domain(this_rq->sd);
6773 if (sd) 6823 if (sd)
@@ -7325,6 +7375,8 @@ void trigger_load_balance(struct rq *rq)
7325static void rq_online_fair(struct rq *rq) 7375static void rq_online_fair(struct rq *rq)
7326{ 7376{
7327 update_sysctl(); 7377 update_sysctl();
7378
7379 update_runtime_enabled(rq);
7328} 7380}
7329 7381
7330static void rq_offline_fair(struct rq *rq) 7382static void rq_offline_fair(struct rq *rq)
@@ -7398,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p)
7398 * 'current' within the tree based on its new key value. 7450 * 'current' within the tree based on its new key value.
7399 */ 7451 */
7400 swap(curr->vruntime, se->vruntime); 7452 swap(curr->vruntime, se->vruntime);
7401 resched_task(rq->curr); 7453 resched_curr(rq);
7402 } 7454 }
7403 7455
7404 se->vruntime -= cfs_rq->min_vruntime; 7456 se->vruntime -= cfs_rq->min_vruntime;
@@ -7423,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7423 */ 7475 */
7424 if (rq->curr == p) { 7476 if (rq->curr == p) {
7425 if (p->prio > oldprio) 7477 if (p->prio > oldprio)
7426 resched_task(rq->curr); 7478 resched_curr(rq);
7427 } else 7479 } else
7428 check_preempt_curr(rq, p, 0); 7480 check_preempt_curr(rq, p, 0);
7429} 7481}
@@ -7486,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
7486 * if we can still preempt the current task. 7538 * if we can still preempt the current task.
7487 */ 7539 */
7488 if (rq->curr == p) 7540 if (rq->curr == p)
7489 resched_task(rq->curr); 7541 resched_curr(rq);
7490 else 7542 else
7491 check_preempt_curr(rq, p, 0); 7543 check_preempt_curr(rq, p, 0);
7492} 7544}
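
The fair.c hunks above thread a new overload flag from update_sg_lb_stats() through update_sd_lb_stats() up to the root domain, so that idle_balance() can bail out before walking the sched domains when no CPU in the domain has more than one runnable task. A minimal standalone model of that fast path (plain C compiled outside the kernel; the structs are trimmed to the fields used here and the numbers are made up for illustration):

    /* Standalone model of the rd->overload fast path; not kernel code. */
    #include <stdbool.h>
    #include <stdio.h>

    struct root_domain { bool overload; };

    struct rq {
        unsigned int nr_running;
        unsigned long long avg_idle;    /* ns this CPU expects to stay idle */
        struct root_domain *rd;
    };

    static const unsigned long long sysctl_sched_migration_cost = 500000ULL;

    /* Mirrors the widened early-exit test in idle_balance(). */
    static bool skip_newidle_balance(const struct rq *this_rq)
    {
        return this_rq->avg_idle < sysctl_sched_migration_cost ||
               !this_rq->rd->overload;
    }

    int main(void)
    {
        struct root_domain rd = { .overload = false };
        struct rq cpu0 = { .nr_running = 0, .avg_idle = 2000000ULL, .rd = &rd };

        printf("skip=%d\n", skip_newidle_balance(&cpu0));  /* 1: nobody overloaded */

        /* update_sg_lb_stats() saw some CPU with rq->nr_running > 1. */
        rd.overload = true;
        printf("skip=%d\n", skip_newidle_balance(&cpu0));  /* 0: worth pulling work */
        return 0;
    }

Note that both writers in the diff (add_nr_running() and update_sd_lb_stats()) only store to rd->overload when the value actually changes, which keeps an idle-heavy system from repeatedly dirtying the shared root-domain cache line.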
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index cf009fb0bc25..9f1608f99819 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -79,7 +79,7 @@ static void cpuidle_idle_call(void)
79 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 79 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
80 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 80 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
81 int next_state, entered_state; 81 int next_state, entered_state;
82 bool broadcast; 82 unsigned int broadcast;
83 83
84 /* 84 /*
85 * Check if the idle task must be rescheduled. If it is the 85 * Check if the idle task must be rescheduled. If it is the
@@ -135,7 +135,7 @@ use_default:
135 goto exit_idle; 135 goto exit_idle;
136 } 136 }
137 137
138 broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); 138 broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;
139 139
140 /* 140 /*
141 * Tell the time framework to switch to a broadcast timer 141 * Tell the time framework to switch to a broadcast timer
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 879f2b75266a..67ad4e7f506a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
20 */ 20 */
21static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) 21static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
22{ 22{
23 resched_task(rq->idle); 23 resched_curr(rq);
24} 24}
25 25
26static struct task_struct * 26static struct task_struct *
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a49083192c64..5f6edca4fafd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -463,9 +463,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
463static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 463static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
464{ 464{
465 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 465 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
466 struct rq *rq = rq_of_rt_rq(rt_rq);
466 struct sched_rt_entity *rt_se; 467 struct sched_rt_entity *rt_se;
467 468
468 int cpu = cpu_of(rq_of_rt_rq(rt_rq)); 469 int cpu = cpu_of(rq);
469 470
470 rt_se = rt_rq->tg->rt_se[cpu]; 471 rt_se = rt_rq->tg->rt_se[cpu];
471 472
@@ -476,7 +477,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
476 enqueue_rt_entity(rt_se, false); 477 enqueue_rt_entity(rt_se, false);
477 478
478 if (rt_rq->highest_prio.curr < curr->prio) 479 if (rt_rq->highest_prio.curr < curr->prio)
479 resched_task(curr); 480 resched_curr(rq);
480 } 481 }
481} 482}
482 483
@@ -566,7 +567,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
566 return; 567 return;
567 568
568 enqueue_top_rt_rq(rt_rq); 569 enqueue_top_rt_rq(rt_rq);
569 resched_task(rq->curr); 570 resched_curr(rq);
570} 571}
571 572
572static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 573static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -740,6 +741,9 @@ balanced:
740 rt_rq->rt_throttled = 0; 741 rt_rq->rt_throttled = 0;
741 raw_spin_unlock(&rt_rq->rt_runtime_lock); 742 raw_spin_unlock(&rt_rq->rt_runtime_lock);
742 raw_spin_unlock(&rt_b->rt_runtime_lock); 743 raw_spin_unlock(&rt_b->rt_runtime_lock);
744
745 /* Make rt_rq available for pick_next_task() */
746 sched_rt_rq_enqueue(rt_rq);
743 } 747 }
744} 748}
745 749
@@ -948,7 +952,7 @@ static void update_curr_rt(struct rq *rq)
948 raw_spin_lock(&rt_rq->rt_runtime_lock); 952 raw_spin_lock(&rt_rq->rt_runtime_lock);
949 rt_rq->rt_time += delta_exec; 953 rt_rq->rt_time += delta_exec;
950 if (sched_rt_runtime_exceeded(rt_rq)) 954 if (sched_rt_runtime_exceeded(rt_rq))
951 resched_task(curr); 955 resched_curr(rq);
952 raw_spin_unlock(&rt_rq->rt_runtime_lock); 956 raw_spin_unlock(&rt_rq->rt_runtime_lock);
953 } 957 }
954 } 958 }
@@ -1363,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1363 * to try and push current away: 1367 * to try and push current away:
1364 */ 1368 */
1365 requeue_task_rt(rq, p, 1); 1369 requeue_task_rt(rq, p, 1);
1366 resched_task(rq->curr); 1370 resched_curr(rq);
1367} 1371}
1368 1372
1369#endif /* CONFIG_SMP */ 1373#endif /* CONFIG_SMP */
@@ -1374,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1374static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) 1378static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1375{ 1379{
1376 if (p->prio < rq->curr->prio) { 1380 if (p->prio < rq->curr->prio) {
1377 resched_task(rq->curr); 1381 resched_curr(rq);
1378 return; 1382 return;
1379 } 1383 }
1380 1384
@@ -1690,7 +1694,7 @@ retry:
1690 * just reschedule current. 1694 * just reschedule current.
1691 */ 1695 */
1692 if (unlikely(next_task->prio < rq->curr->prio)) { 1696 if (unlikely(next_task->prio < rq->curr->prio)) {
1693 resched_task(rq->curr); 1697 resched_curr(rq);
1694 return 0; 1698 return 0;
1695 } 1699 }
1696 1700
@@ -1737,7 +1741,7 @@ retry:
1737 activate_task(lowest_rq, next_task, 0); 1741 activate_task(lowest_rq, next_task, 0);
1738 ret = 1; 1742 ret = 1;
1739 1743
1740 resched_task(lowest_rq->curr); 1744 resched_curr(lowest_rq);
1741 1745
1742 double_unlock_balance(rq, lowest_rq); 1746 double_unlock_balance(rq, lowest_rq);
1743 1747
@@ -1936,7 +1940,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1936 return; 1940 return;
1937 1941
1938 if (pull_rt_task(rq)) 1942 if (pull_rt_task(rq))
1939 resched_task(rq->curr); 1943 resched_curr(rq);
1940} 1944}
1941 1945
1942void __init init_sched_rt_class(void) 1946void __init init_sched_rt_class(void)
@@ -1974,7 +1978,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1974 check_resched = 0; 1978 check_resched = 0;
1975#endif /* CONFIG_SMP */ 1979#endif /* CONFIG_SMP */
1976 if (check_resched && p->prio < rq->curr->prio) 1980 if (check_resched && p->prio < rq->curr->prio)
1977 resched_task(rq->curr); 1981 resched_curr(rq);
1978 } 1982 }
1979} 1983}
1980 1984
@@ -2003,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2003 * Only reschedule if p is still on the same runqueue. 2007 * Only reschedule if p is still on the same runqueue.
2004 */ 2008 */
2005 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) 2009 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
2006 resched_task(p); 2010 resched_curr(rq);
2007#else 2011#else
2008 /* For UP simply resched on drop of prio */ 2012 /* For UP simply resched on drop of prio */
2009 if (oldprio < p->prio) 2013 if (oldprio < p->prio)
2010 resched_task(p); 2014 resched_curr(rq);
2011#endif /* CONFIG_SMP */ 2015#endif /* CONFIG_SMP */
2012 } else { 2016 } else {
2013 /* 2017 /*
@@ -2016,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2016 * then reschedule. 2020 * then reschedule.
2017 */ 2021 */
2018 if (p->prio < rq->curr->prio) 2022 if (p->prio < rq->curr->prio)
2019 resched_task(rq->curr); 2023 resched_curr(rq);
2020 } 2024 }
2021} 2025}
2022 2026
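
Every call site in rt.c (and in fair.c above) that used to pass a task to resched_task() now passes the runqueue to resched_curr(); the task being kicked is by definition rq->curr, so the rq alone identifies the target. A toy model of the interface change, with the TIF_NEED_RESCHED and IPI machinery reduced to a single flag:

    /* Toy model of the resched_task() -> resched_curr() interface change. */
    #include <stdbool.h>
    #include <stdio.h>

    struct task_struct { const char *comm; bool need_resched; };
    struct rq { struct task_struct *curr; };

    /* Old shape: the caller had to name the task, almost always rq->curr. */
    static void resched_task(struct task_struct *p)
    {
        p->need_resched = true;
    }

    /* New shape: the runqueue alone identifies the task to kick. */
    static void resched_curr(struct rq *rq)
    {
        rq->curr->need_resched = true;
    }

    int main(void)
    {
        struct task_struct t = { .comm = "rt_task", .need_resched = false };
        struct rq rq = { .curr = &t };

        resched_task(rq.curr);              /* pre-patch call shape */
        t.need_resched = false;
        resched_curr(&rq);                  /* post-patch call shape */
        printf("%s need_resched=%d\n", t.comm, t.need_resched);
        return 0;
    }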
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 31cc02ebc54e..579712f4e9d5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -477,6 +477,9 @@ struct root_domain {
477 cpumask_var_t span; 477 cpumask_var_t span;
478 cpumask_var_t online; 478 cpumask_var_t online;
479 479
480 /* Indicate more than one runnable task for any CPU */
481 bool overload;
482
480 /* 483 /*
481 * The bit corresponding to a CPU gets set here if such CPU has more 484 * The bit corresponding to a CPU gets set here if such CPU has more
482 * than one runnable -deadline task (as it is below for RT tasks). 485 * than one runnable -deadline task (as it is below for RT tasks).
@@ -884,20 +887,10 @@ enum {
884#undef SCHED_FEAT 887#undef SCHED_FEAT
885 888
886#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) 889#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
887static __always_inline bool static_branch__true(struct static_key *key)
888{
889 return static_key_true(key); /* Not out of line branch. */
890}
891
892static __always_inline bool static_branch__false(struct static_key *key)
893{
894 return static_key_false(key); /* Out of line branch. */
895}
896
897#define SCHED_FEAT(name, enabled) \ 890#define SCHED_FEAT(name, enabled) \
898static __always_inline bool static_branch_##name(struct static_key *key) \ 891static __always_inline bool static_branch_##name(struct static_key *key) \
899{ \ 892{ \
900 return static_branch__##enabled(key); \ 893 return static_key_##enabled(key); \
901} 894}
902 895
903#include "features.h" 896#include "features.h"
@@ -1196,7 +1189,7 @@ extern void init_sched_rt_class(void);
1196extern void init_sched_fair_class(void); 1189extern void init_sched_fair_class(void);
1197extern void init_sched_dl_class(void); 1190extern void init_sched_dl_class(void);
1198 1191
1199extern void resched_task(struct task_struct *p); 1192extern void resched_curr(struct rq *rq);
1200extern void resched_cpu(int cpu); 1193extern void resched_cpu(int cpu);
1201 1194
1202extern struct rt_bandwidth def_rt_bandwidth; 1195extern struct rt_bandwidth def_rt_bandwidth;
@@ -1218,15 +1211,26 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
1218 1211
1219 rq->nr_running = prev_nr + count; 1212 rq->nr_running = prev_nr + count;
1220 1213
1221#ifdef CONFIG_NO_HZ_FULL
1222 if (prev_nr < 2 && rq->nr_running >= 2) { 1214 if (prev_nr < 2 && rq->nr_running >= 2) {
1215#ifdef CONFIG_SMP
1216 if (!rq->rd->overload)
1217 rq->rd->overload = true;
1218#endif
1219
1220#ifdef CONFIG_NO_HZ_FULL
1223 if (tick_nohz_full_cpu(rq->cpu)) { 1221 if (tick_nohz_full_cpu(rq->cpu)) {
1224 /* Order rq->nr_running write against the IPI */ 1222 /*
1225 smp_wmb(); 1223 * Tick is needed if more than one task runs on a CPU.
1226 smp_send_reschedule(rq->cpu); 1224 * Send the target an IPI to kick it out of nohz mode.
1225 *
1226 * We assume that IPI implies full memory barrier and the
1227 * new value of rq->nr_running is visible on reception
1228 * from the target.
1229 */
1230 tick_nohz_full_kick_cpu(rq->cpu);
1227 } 1231 }
1228 }
1229#endif 1232#endif
1233 }
1230} 1234}
1231 1235
1232static inline void sub_nr_running(struct rq *rq, unsigned count) 1236static inline void sub_nr_running(struct rq *rq, unsigned count)
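
The sched.h hunk above also drops the static_branch__true()/static_branch__false() indirection: the enabled argument of SCHED_FEAT() is now pasted directly onto static_key_, so each generated wrapper calls static_key_true() or static_key_false() itself. A user-space sketch of the resulting expansion; the jump-label helpers are replaced by trivial stubs here and HRTICK is used only as an example feature name:

    /* User-space sketch of the simplified SCHED_FEAT() expansion; the
     * jump-label helpers are replaced by stubs, not the kernel's versions. */
    #include <stdbool.h>
    #include <stdio.h>

    struct static_key { bool enabled; };

    static inline bool static_key_true(struct static_key *key)  { return key->enabled; }
    static inline bool static_key_false(struct static_key *key) { return key->enabled; }

    /* Post-patch shape: 'enabled' (the literal token true or false) is pasted
     * straight onto static_key_, with no static_branch__true/false layer. */
    #define SCHED_FEAT(name, enabled)                                  \
    static inline bool static_branch_##name(struct static_key *key)   \
    {                                                                  \
        return static_key_##enabled(key);                              \
    }

    SCHED_FEAT(HRTICK, false)   /* generates static_branch_HRTICK() */

    int main(void)
    {
        struct static_key key = { .enabled = true };
        printf("HRTICK branch taken: %d\n", static_branch_HRTICK(&key));
        return 0;
    }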
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 0ffa20ae657b..15cab1a4f84e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -319,14 +319,14 @@ EXPORT_SYMBOL(wake_bit_function);
319 */ 319 */
320int __sched 320int __sched
321__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, 321__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
322 int (*action)(void *), unsigned mode) 322 wait_bit_action_f *action, unsigned mode)
323{ 323{
324 int ret = 0; 324 int ret = 0;
325 325
326 do { 326 do {
327 prepare_to_wait(wq, &q->wait, mode); 327 prepare_to_wait(wq, &q->wait, mode);
328 if (test_bit(q->key.bit_nr, q->key.flags)) 328 if (test_bit(q->key.bit_nr, q->key.flags))
329 ret = (*action)(q->key.flags); 329 ret = (*action)(&q->key);
330 } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); 330 } while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
331 finish_wait(wq, &q->wait); 331 finish_wait(wq, &q->wait);
332 return ret; 332 return ret;
@@ -334,7 +334,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
334EXPORT_SYMBOL(__wait_on_bit); 334EXPORT_SYMBOL(__wait_on_bit);
335 335
336int __sched out_of_line_wait_on_bit(void *word, int bit, 336int __sched out_of_line_wait_on_bit(void *word, int bit,
337 int (*action)(void *), unsigned mode) 337 wait_bit_action_f *action, unsigned mode)
338{ 338{
339 wait_queue_head_t *wq = bit_waitqueue(word, bit); 339 wait_queue_head_t *wq = bit_waitqueue(word, bit);
340 DEFINE_WAIT_BIT(wait, word, bit); 340 DEFINE_WAIT_BIT(wait, word, bit);
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit);
345 345
346int __sched 346int __sched
347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
348 int (*action)(void *), unsigned mode) 348 wait_bit_action_f *action, unsigned mode)
349{ 349{
350 do { 350 do {
351 int ret; 351 int ret;
@@ -353,7 +353,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
353 prepare_to_wait_exclusive(wq, &q->wait, mode); 353 prepare_to_wait_exclusive(wq, &q->wait, mode);
354 if (!test_bit(q->key.bit_nr, q->key.flags)) 354 if (!test_bit(q->key.bit_nr, q->key.flags))
355 continue; 355 continue;
356 ret = action(q->key.flags); 356 ret = action(&q->key);
357 if (!ret) 357 if (!ret)
358 continue; 358 continue;
359 abort_exclusive_wait(wq, &q->wait, mode, &q->key); 359 abort_exclusive_wait(wq, &q->wait, mode, &q->key);
@@ -365,7 +365,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
365EXPORT_SYMBOL(__wait_on_bit_lock); 365EXPORT_SYMBOL(__wait_on_bit_lock);
366 366
367int __sched out_of_line_wait_on_bit_lock(void *word, int bit, 367int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
368 int (*action)(void *), unsigned mode) 368 wait_bit_action_f *action, unsigned mode)
369{ 369{
370 wait_queue_head_t *wq = bit_waitqueue(word, bit); 370 wait_queue_head_t *wq = bit_waitqueue(word, bit);
371 DEFINE_WAIT_BIT(wait, word, bit); 371 DEFINE_WAIT_BIT(wait, word, bit);
@@ -502,3 +502,21 @@ void wake_up_atomic_t(atomic_t *p)
502 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); 502 __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
503} 503}
504EXPORT_SYMBOL(wake_up_atomic_t); 504EXPORT_SYMBOL(wake_up_atomic_t);
505
506__sched int bit_wait(struct wait_bit_key *word)
507{
508 if (signal_pending_state(current->state, current))
509 return 1;
510 schedule();
511 return 0;
512}
513EXPORT_SYMBOL(bit_wait);
514
515__sched int bit_wait_io(struct wait_bit_key *word)
516{
517 if (signal_pending_state(current->state, current))
518 return 1;
519 io_schedule();
520 return 0;
521}
522EXPORT_SYMBOL(bit_wait_io);
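
The wait.c changes switch the bit-wait action callbacks from int (*)(void *) to wait_bit_action_f, which receives the whole struct wait_bit_key, and export bit_wait()/bit_wait_io() as the stock actions. A sketch of what a caller-supplied action could look like under the new signature, assuming a kernel build context; my_wait_with_timeout(), wait_for_my_flag() and MY_FLAG_BUSY are invented for illustration:

    #include <linux/wait.h>
    #include <linux/sched.h>
    #include <linux/jiffies.h>

    #define MY_FLAG_BUSY 0          /* hypothetical bit number in a flags word */

    /* Custom action: like bit_wait(), but bounded to one second per pass. */
    static int my_wait_with_timeout(struct wait_bit_key *key)
    {
            if (signal_pending_state(current->state, current))
                    return 1;       /* non-zero aborts the wait */
            schedule_timeout(HZ);   /* task state was set by prepare_to_wait() */
            return 0;               /* re-test the bit and maybe loop */
    }

    static int wait_for_my_flag(unsigned long *flags)
    {
            return out_of_line_wait_on_bit(flags, MY_FLAG_BUSY,
                                           my_wait_with_timeout,
                                           TASK_INTERRUPTIBLE);
    }

Because prepare_to_wait() has already put the task in the requested sleep state before the action runs, schedule_timeout() here actually sleeps instead of returning immediately.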
diff --git a/kernel/signal.c b/kernel/signal.c
index a4077e90f19f..40b76e351e64 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1263,6 +1263,10 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
1263 struct sighand_struct *sighand; 1263 struct sighand_struct *sighand;
1264 1264
1265 for (;;) { 1265 for (;;) {
1266 /*
1267 * Disable interrupts early to avoid deadlocks.
1268 * See rcu_read_unlock() comment header for details.
1269 */
1266 local_irq_save(*flags); 1270 local_irq_save(*flags);
1267 rcu_read_lock(); 1271 rcu_read_lock();
1268 sighand = rcu_dereference(tsk->sighand); 1272 sighand = rcu_dereference(tsk->sighand);
diff --git a/kernel/smp.c b/kernel/smp.c
index 80c33f8de14f..487653b5844f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -3,6 +3,7 @@
3 * 3 *
4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008 4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
5 */ 5 */
6#include <linux/irq_work.h>
6#include <linux/rcupdate.h> 7#include <linux/rcupdate.h>
7#include <linux/rculist.h> 8#include <linux/rculist.h>
8#include <linux/kernel.h> 9#include <linux/kernel.h>
@@ -251,6 +252,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
251 csd->func(csd->info); 252 csd->func(csd->info);
252 csd_unlock(csd); 253 csd_unlock(csd);
253 } 254 }
255
256 /*
257 * Handle irq works queued remotely by irq_work_queue_on().
258 * Smp functions above are typically synchronous so they
259 * better run first since some other CPUs may be busy waiting
260 * for them.
261 */
262 irq_work_run();
254} 263}
255 264
256/* 265/*
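
flush_smp_call_function_queue() now also drains irq_work items that were queued from another CPU with irq_work_queue_on(), after the synchronous smp-call entries have run. A short sketch of queueing such a remote irq_work, assuming a kernel build context; my_irq_work_fn(), my_setup() and my_kick() are illustrative names, not existing kernel symbols:

    #include <linux/irq_work.h>
    #include <linux/smp.h>
    #include <linux/printk.h>

    static void my_irq_work_fn(struct irq_work *work)
    {
            /* Runs in IRQ context on the CPU the work was queued on. */
            pr_info("irq_work ran on CPU %d\n", smp_processor_id());
    }

    static struct irq_work my_work;

    static void my_setup(void)
    {
            init_irq_work(&my_work, my_irq_work_fn);
    }

    static void my_kick(int cpu)
    {
            /* Returns false if my_work is still pending from an earlier queue. */
            if (!irq_work_queue_on(&my_work, cpu))
                    pr_info("irq_work already pending\n");
    }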
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 88c9c65a430d..4aec4a457431 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void)
71 71
72 return ret; 72 return ret;
73} 73}
74 74EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
75 75
76static int alarmtimer_rtc_add_device(struct device *dev, 76static int alarmtimer_rtc_add_device(struct device *dev,
77 struct class_interface *class_intf) 77 struct class_interface *class_intf)
@@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
585 struct itimerspec *new_setting, 585 struct itimerspec *new_setting,
586 struct itimerspec *old_setting) 586 struct itimerspec *old_setting)
587{ 587{
588 ktime_t exp;
589
588 if (!rtcdev) 590 if (!rtcdev)
589 return -ENOTSUPP; 591 return -ENOTSUPP;
590 592
593 if (flags & ~TIMER_ABSTIME)
594 return -EINVAL;
595
591 if (old_setting) 596 if (old_setting)
592 alarm_timer_get(timr, old_setting); 597 alarm_timer_get(timr, old_setting);
593 598
@@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
597 602
598 /* start the timer */ 603 /* start the timer */
599 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); 604 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
600 alarm_start(&timr->it.alarm.alarmtimer, 605 exp = timespec_to_ktime(new_setting->it_value);
601 timespec_to_ktime(new_setting->it_value)); 606 /* Convert (if necessary) to absolute time */
607 if (flags != TIMER_ABSTIME) {
608 ktime_t now;
609
610 now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
611 exp = ktime_add(now, exp);
612 }
613
614 alarm_start(&timr->it.alarm.alarmtimer, exp);
602 return 0; 615 return 0;
603} 616}
604 617
@@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
730 if (!alarmtimer_get_rtcdev()) 743 if (!alarmtimer_get_rtcdev())
731 return -ENOTSUPP; 744 return -ENOTSUPP;
732 745
746 if (flags & ~TIMER_ABSTIME)
747 return -EINVAL;
748
733 if (!capable(CAP_WAKE_ALARM)) 749 if (!capable(CAP_WAKE_ALARM))
734 return -EPERM; 750 return -EPERM;
735 751
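
alarm_timer_set() now rejects flag bits other than TIMER_ABSTIME and, for relative timers, converts it_value into an absolute expiry by adding the current time of the alarm base before calling alarm_start(). A standalone model of that conversion, with ktime_t reduced to plain signed nanoseconds and a fixed pretend "now":

    /* Standalone model of the relative->absolute conversion added to
     * alarm_timer_set(); ktime_t is modelled as signed nanoseconds. */
    #include <stdint.h>
    #include <stdio.h>

    #define TIMER_ABSTIME 0x01

    static int64_t base_gettime_ns(void)
    {
        return 1000000000LL;        /* pretend the alarm base reads t = 1 s */
    }

    static int64_t alarm_expiry_ns(int flags, int64_t it_value_ns)
    {
        int64_t exp = it_value_ns;

        if (flags & ~TIMER_ABSTIME)
            return -1;              /* stands in for -EINVAL */
        if (flags != TIMER_ABSTIME)
            exp += base_gettime_ns();   /* relative value: anchor it to "now" */
        return exp;
    }

    int main(void)
    {
        printf("relative 250ms -> expires at %lld ns\n",
               (long long)alarm_expiry_ns(0, 250000000LL));
        printf("absolute 250ms -> expires at %lld ns\n",
               (long long)alarm_expiry_ns(TIMER_ABSTIME, 250000000LL));
        return 0;
    }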
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index ad362c260ef4..9c94c19f1305 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -146,7 +146,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
146{ 146{
147 /* Nothing to do if we already reached the limit */ 147 /* Nothing to do if we already reached the limit */
148 if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { 148 if (dev->min_delta_ns >= MIN_DELTA_LIMIT) {
149 printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); 149 printk_deferred(KERN_WARNING
150 "CE: Reprogramming failure. Giving up\n");
150 dev->next_event.tv64 = KTIME_MAX; 151 dev->next_event.tv64 = KTIME_MAX;
151 return -ETIME; 152 return -ETIME;
152 } 153 }
@@ -159,9 +160,10 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev)
159 if (dev->min_delta_ns > MIN_DELTA_LIMIT) 160 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
160 dev->min_delta_ns = MIN_DELTA_LIMIT; 161 dev->min_delta_ns = MIN_DELTA_LIMIT;
161 162
162 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", 163 printk_deferred(KERN_WARNING
163 dev->name ? dev->name : "?", 164 "CE: %s increased min_delta_ns to %llu nsec\n",
164 (unsigned long long) dev->min_delta_ns); 165 dev->name ? dev->name : "?",
166 (unsigned long long) dev->min_delta_ns);
165 return 0; 167 return 0;
166} 168}
167 169
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 445106d2c729..01d2d15aa662 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -191,7 +191,8 @@ void __init sched_clock_postinit(void)
191 191
192static int sched_clock_suspend(void) 192static int sched_clock_suspend(void)
193{ 193{
194 sched_clock_poll(&sched_clock_timer); 194 update_sched_clock();
195 hrtimer_cancel(&sched_clock_timer);
195 cd.suspended = true; 196 cd.suspended = true;
196 return 0; 197 return 0;
197} 198}
@@ -199,6 +200,7 @@ static int sched_clock_suspend(void)
199static void sched_clock_resume(void) 200static void sched_clock_resume(void)
200{ 201{
201 cd.epoch_cyc = read_sched_clock(); 202 cd.epoch_cyc = read_sched_clock();
203 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
202 cd.suspended = false; 204 cd.suspended = false;
203} 205}
204 206
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6558b7ac112d..99aa6ee3908f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -154,6 +154,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
154 154
155#ifdef CONFIG_NO_HZ_FULL 155#ifdef CONFIG_NO_HZ_FULL
156cpumask_var_t tick_nohz_full_mask; 156cpumask_var_t tick_nohz_full_mask;
157cpumask_var_t housekeeping_mask;
157bool tick_nohz_full_running; 158bool tick_nohz_full_running;
158 159
159static bool can_stop_full_tick(void) 160static bool can_stop_full_tick(void)
@@ -224,13 +225,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
224}; 225};
225 226
226/* 227/*
227 * Kick the current CPU if it's full dynticks in order to force it to 228 * Kick the CPU if it's full dynticks in order to force it to
228 * re-evaluate its dependency on the tick and restart it if necessary. 229 * re-evaluate its dependency on the tick and restart it if necessary.
229 */ 230 */
230void tick_nohz_full_kick(void) 231void tick_nohz_full_kick_cpu(int cpu)
231{ 232{
232 if (tick_nohz_full_cpu(smp_processor_id())) 233 if (!tick_nohz_full_cpu(cpu))
233 irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); 234 return;
235
236 irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
234} 237}
235 238
236static void nohz_full_kick_ipi(void *info) 239static void nohz_full_kick_ipi(void *info)
@@ -281,6 +284,7 @@ static int __init tick_nohz_full_setup(char *str)
281 int cpu; 284 int cpu;
282 285
283 alloc_bootmem_cpumask_var(&tick_nohz_full_mask); 286 alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
287 alloc_bootmem_cpumask_var(&housekeeping_mask);
284 if (cpulist_parse(str, tick_nohz_full_mask) < 0) { 288 if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
285 pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); 289 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
286 return 1; 290 return 1;
@@ -291,6 +295,8 @@ static int __init tick_nohz_full_setup(char *str)
291 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); 295 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
292 cpumask_clear_cpu(cpu, tick_nohz_full_mask); 296 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
293 } 297 }
298 cpumask_andnot(housekeeping_mask,
299 cpu_possible_mask, tick_nohz_full_mask);
294 tick_nohz_full_running = true; 300 tick_nohz_full_running = true;
295 301
296 return 1; 302 return 1;
@@ -332,9 +338,15 @@ static int tick_nohz_init_all(void)
332 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); 338 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
333 return err; 339 return err;
334 } 340 }
341 if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
342 pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n");
343 return err;
344 }
335 err = 0; 345 err = 0;
336 cpumask_setall(tick_nohz_full_mask); 346 cpumask_setall(tick_nohz_full_mask);
337 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); 347 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
348 cpumask_clear(housekeeping_mask);
349 cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
338 tick_nohz_full_running = true; 350 tick_nohz_full_running = true;
339#endif 351#endif
340 return err; 352 return err;
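
tick-sched.c now maintains housekeeping_mask as the complement of tick_nohz_full_mask within cpu_possible_mask (and, in the nohz_full=all init path, just the boot CPU). A standalone model of that bookkeeping, with cpumasks reduced to 8-bit bitmaps, one bit per CPU:

    /* Standalone model: housekeeping CPUs = possible CPUs not in nohz_full. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int cpu_possible_mask   = 0xff;   /* CPUs 0-7 exist */
        unsigned int tick_nohz_full_mask = 0xf0;   /* booted with nohz_full=4-7 */

        /* cpumask_andnot(housekeeping_mask, cpu_possible_mask, tick_nohz_full_mask) */
        unsigned int housekeeping_mask = cpu_possible_mask & ~tick_nohz_full_mask;

        printf("housekeeping CPUs: 0x%02x\n", housekeeping_mask);  /* 0x0f: CPUs 0-3 */
        return 0;
    }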
diff --git a/kernel/torture.c b/kernel/torture.c
index 40bb511cca48..d600af21f022 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -708,7 +708,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
708 int ret = 0; 708 int ret = 0;
709 709
710 VERBOSE_TOROUT_STRING(m); 710 VERBOSE_TOROUT_STRING(m);
711 *tp = kthread_run(fn, arg, s); 711 *tp = kthread_run(fn, arg, "%s", s);
712 if (IS_ERR(*tp)) { 712 if (IS_ERR(*tp)) {
713 ret = PTR_ERR(*tp); 713 ret = PTR_ERR(*tp);
714 VERBOSE_TOROUT_ERRSTRING(f); 714 VERBOSE_TOROUT_ERRSTRING(f);
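
The torture.c change passes the thread name through a fixed "%s" conversion instead of using it as kthread_run()'s printf-style name format, so a name containing '%' can no longer be interpreted as a format string. The same bug shape, shown standalone with snprintf(), which shares printf semantics:

    /* Standalone illustration of the format-string fix. */
    #include <stdio.h>

    int main(void)
    {
        const char *name = "rcu_torture_%d";    /* caller-supplied thread name */
        char buf[32];

        /* Pre-patch shape (unsafe, left commented out): the name is the format,
         * so "%d" would consume a nonexistent argument.
         * snprintf(buf, sizeof(buf), name);
         */

        /* Post-patch shape: the name is plain data behind a fixed "%s" format. */
        snprintf(buf, sizeof(buf), "%s", name);
        printf("%s\n", buf);
        return 0;
    }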
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d4409356f40d..a5da09c899dd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -29,11 +29,6 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
29 help 29 help
30 See Documentation/trace/ftrace-design.txt 30 See Documentation/trace/ftrace-design.txt
31 31
32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
33 bool
34 help
35 See Documentation/trace/ftrace-design.txt
36
37config HAVE_DYNAMIC_FTRACE 32config HAVE_DYNAMIC_FTRACE
38 bool 33 bool
39 help 34 help
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2611613f14f1..67d6369ddf83 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
28 28
29obj-$(CONFIG_TRACING) += trace.o 29obj-$(CONFIG_TRACING) += trace.o
30obj-$(CONFIG_TRACING) += trace_output.o 30obj-$(CONFIG_TRACING) += trace_output.o
31obj-$(CONFIG_TRACING) += trace_seq.o
31obj-$(CONFIG_TRACING) += trace_stat.o 32obj-$(CONFIG_TRACING) += trace_stat.o
32obj-$(CONFIG_TRACING) += trace_printk.o 33obj-$(CONFIG_TRACING) += trace_printk.o
33obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 34obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5b372e3ed675..1654b12c891a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -80,9 +80,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly = {
80int ftrace_enabled __read_mostly; 80int ftrace_enabled __read_mostly;
81static int last_ftrace_enabled; 81static int last_ftrace_enabled;
82 82
83/* Quick disabling of function tracer. */
84int function_trace_stop __read_mostly;
85
86/* Current function tracing op */ 83/* Current function tracing op */
87struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; 84struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
88/* What to set function_trace_op to */ 85/* What to set function_trace_op to */
@@ -265,12 +262,12 @@ static void update_ftrace_function(void)
265 func = ftrace_ops_list_func; 262 func = ftrace_ops_list_func;
266 } 263 }
267 264
265 update_function_graph_func();
266
268 /* If there's no change, then do nothing more here */ 267 /* If there's no change, then do nothing more here */
269 if (ftrace_trace_function == func) 268 if (ftrace_trace_function == func)
270 return; 269 return;
271 270
272 update_function_graph_func();
273
274 /* 271 /*
275 * If we are using the list function, it doesn't care 272 * If we are using the list function, it doesn't care
276 * about the function_trace_ops. 273 * about the function_trace_ops.
@@ -1042,6 +1039,8 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1042 1039
1043#ifdef CONFIG_DYNAMIC_FTRACE 1040#ifdef CONFIG_DYNAMIC_FTRACE
1044 1041
1042static struct ftrace_ops *removed_ops;
1043
1045#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1044#ifndef CONFIG_FTRACE_MCOUNT_RECORD
1046# error Dynamic ftrace depends on MCOUNT_RECORD 1045# error Dynamic ftrace depends on MCOUNT_RECORD
1047#endif 1046#endif
@@ -1304,25 +1303,15 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1304 struct ftrace_hash *new_hash; 1303 struct ftrace_hash *new_hash;
1305 int size = src->count; 1304 int size = src->count;
1306 int bits = 0; 1305 int bits = 0;
1307 int ret;
1308 int i; 1306 int i;
1309 1307
1310 /* 1308 /*
1311 * Remove the current set, update the hash and add
1312 * them back.
1313 */
1314 ftrace_hash_rec_disable(ops, enable);
1315
1316 /*
1317 * If the new source is empty, just free dst and assign it 1309 * If the new source is empty, just free dst and assign it
1318 * the empty_hash. 1310 * the empty_hash.
1319 */ 1311 */
1320 if (!src->count) { 1312 if (!src->count) {
1321 free_ftrace_hash_rcu(*dst); 1313 new_hash = EMPTY_HASH;
1322 rcu_assign_pointer(*dst, EMPTY_HASH); 1314 goto update;
1323 /* still need to update the function records */
1324 ret = 0;
1325 goto out;
1326 } 1315 }
1327 1316
1328 /* 1317 /*
@@ -1335,10 +1324,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1335 if (bits > FTRACE_HASH_MAX_BITS) 1324 if (bits > FTRACE_HASH_MAX_BITS)
1336 bits = FTRACE_HASH_MAX_BITS; 1325 bits = FTRACE_HASH_MAX_BITS;
1337 1326
1338 ret = -ENOMEM;
1339 new_hash = alloc_ftrace_hash(bits); 1327 new_hash = alloc_ftrace_hash(bits);
1340 if (!new_hash) 1328 if (!new_hash)
1341 goto out; 1329 return -ENOMEM;
1342 1330
1343 size = 1 << src->size_bits; 1331 size = 1 << src->size_bits;
1344 for (i = 0; i < size; i++) { 1332 for (i = 0; i < size; i++) {
@@ -1349,20 +1337,20 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1349 } 1337 }
1350 } 1338 }
1351 1339
1340update:
1341 /*
1342 * Remove the current set, update the hash and add
1343 * them back.
1344 */
1345 ftrace_hash_rec_disable(ops, enable);
1346
1352 old_hash = *dst; 1347 old_hash = *dst;
1353 rcu_assign_pointer(*dst, new_hash); 1348 rcu_assign_pointer(*dst, new_hash);
1354 free_ftrace_hash_rcu(old_hash); 1349 free_ftrace_hash_rcu(old_hash);
1355 1350
1356 ret = 0;
1357 out:
1358 /*
1359 * Enable regardless of ret:
1360 * On success, we enable the new hash.
1361 * On failure, we re-enable the original hash.
1362 */
1363 ftrace_hash_rec_enable(ops, enable); 1351 ftrace_hash_rec_enable(ops, enable);
1364 1352
1365 return ret; 1353 return 0;
1366} 1354}
1367 1355
1368/* 1356/*
@@ -1492,6 +1480,53 @@ int ftrace_text_reserved(const void *start, const void *end)
1492 return (int)!!ret; 1480 return (int)!!ret;
1493} 1481}
1494 1482
1483/* Test if ops registered to this rec needs regs */
1484static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
1485{
1486 struct ftrace_ops *ops;
1487 bool keep_regs = false;
1488
1489 for (ops = ftrace_ops_list;
1490 ops != &ftrace_list_end; ops = ops->next) {
1491 /* pass rec in as regs to have non-NULL val */
1492 if (ftrace_ops_test(ops, rec->ip, rec)) {
1493 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
1494 keep_regs = true;
1495 break;
1496 }
1497 }
1498 }
1499
1500 return keep_regs;
1501}
1502
1503static void ftrace_remove_tramp(struct ftrace_ops *ops,
1504 struct dyn_ftrace *rec)
1505{
1506 struct ftrace_func_entry *entry;
1507
1508 entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip);
1509 if (!entry)
1510 return;
1511
1512 /*
1513 * The tramp_hash entry will be removed at time
1514 * of update.
1515 */
1516 ops->nr_trampolines--;
1517 rec->flags &= ~FTRACE_FL_TRAMP;
1518}
1519
1520static void ftrace_clear_tramps(struct dyn_ftrace *rec)
1521{
1522 struct ftrace_ops *op;
1523
1524 do_for_each_ftrace_op(op, ftrace_ops_list) {
1525 if (op->nr_trampolines)
1526 ftrace_remove_tramp(op, rec);
1527 } while_for_each_ftrace_op(op);
1528}
1529
1495static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1530static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1496 int filter_hash, 1531 int filter_hash,
1497 bool inc) 1532 bool inc)
@@ -1572,8 +1607,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1572 1607
1573 if (inc) { 1608 if (inc) {
1574 rec->flags++; 1609 rec->flags++;
1575 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) 1610 if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX))
1576 return; 1611 return;
1612
1613 /*
1614 * If there's only a single callback registered to a
1615 * function, and the ops has a trampoline registered
1616 * for it, then we can call it directly.
1617 */
1618 if (ftrace_rec_count(rec) == 1 && ops->trampoline) {
1619 rec->flags |= FTRACE_FL_TRAMP;
1620 ops->nr_trampolines++;
1621 } else {
1622 /*
1623 * If we are adding another function callback
1624 * to this function, and the previous had a
1625 * trampoline used, then we need to go back to
1626 * the default trampoline.
1627 */
1628 rec->flags &= ~FTRACE_FL_TRAMP;
1629
1630 /* remove trampolines from any ops for this rec */
1631 ftrace_clear_tramps(rec);
1632 }
1633
1577 /* 1634 /*
1578 * If any ops wants regs saved for this function 1635 * If any ops wants regs saved for this function
1579 * then all ops will get saved regs. 1636 * then all ops will get saved regs.
@@ -1581,9 +1638,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1581 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) 1638 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
1582 rec->flags |= FTRACE_FL_REGS; 1639 rec->flags |= FTRACE_FL_REGS;
1583 } else { 1640 } else {
1584 if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) 1641 if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0))
1585 return; 1642 return;
1586 rec->flags--; 1643 rec->flags--;
1644
1645 if (ops->trampoline && !ftrace_rec_count(rec))
1646 ftrace_remove_tramp(ops, rec);
1647
1648 /*
1649 * If the rec had REGS enabled and the ops that is
1650 * being removed had REGS set, then see if there is
1651 * still any ops for this record that wants regs.
1652 * If not, we can stop recording them.
1653 */
1654 if (ftrace_rec_count(rec) > 0 &&
1655 rec->flags & FTRACE_FL_REGS &&
1656 ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
1657 if (!test_rec_ops_needs_regs(rec))
1658 rec->flags &= ~FTRACE_FL_REGS;
1659 }
1660
1661 /*
1662 * flags will be cleared in ftrace_check_record()
1663 * if rec count is zero.
1664 */
1587 } 1665 }
1588 count++; 1666 count++;
1589 /* Shortcut, if we handled all records, we are done. */ 1667 /* Shortcut, if we handled all records, we are done. */
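
The hunks above stop open-coding rec->flags & ~FTRACE_FL_MASK and use ftrace_rec_count() instead: the low bits of dyn_ftrace.flags count how many ftrace_ops reference the record, while the high bits carry state such as ENABLED, REGS and the new TRAMP/TRAMP_EN pair. A standalone model of that packing; the bit positions are illustrative, not the kernel's exact layout:

    /* Standalone model of the dyn_ftrace.flags split behind ftrace_rec_count():
     * low bits = number of attached ftrace_ops, high bits = state flags.
     * Bit positions here are illustrative only. */
    #include <stdio.h>

    #define FL_ENABLED  (1UL << 31)
    #define FL_REGS     (1UL << 30)
    #define FL_TRAMP    (1UL << 29)
    #define FL_MASK     (FL_ENABLED | FL_REGS | FL_TRAMP)

    struct rec_model { unsigned long flags; };

    static unsigned long rec_count(const struct rec_model *rec)
    {
        return rec->flags & ~FL_MASK;       /* same shape as ftrace_rec_count() */
    }

    int main(void)
    {
        struct rec_model rec = { .flags = 0 };

        rec.flags++;                        /* first ops attaches... */
        rec.flags |= FL_TRAMP;              /* ...and may use its own trampoline */

        rec.flags++;                        /* second ops attaches */
        rec.flags &= ~FL_TRAMP;             /* fall back to the shared list func */

        printf("count=%lu tramp=%d\n", rec_count(&rec), !!(rec.flags & FL_TRAMP));
        return 0;
    }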
@@ -1668,17 +1746,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1668 * If we are disabling calls, then disable all records that 1746 * If we are disabling calls, then disable all records that
1669 * are enabled. 1747 * are enabled.
1670 */ 1748 */
1671 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1749 if (enable && ftrace_rec_count(rec))
1672 flag = FTRACE_FL_ENABLED; 1750 flag = FTRACE_FL_ENABLED;
1673 1751
1674 /* 1752 /*
1675 * If enabling and the REGS flag does not match the REGS_EN, then 1753 * If enabling and the REGS flag does not match the REGS_EN, or
1676 * do not ignore this record. Set flags to fail the compare against 1754 * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore
1677 * ENABLED. 1755 * this record. Set flags to fail the compare against ENABLED.
1678 */ 1756 */
1679 if (flag && 1757 if (flag) {
1680 (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) 1758 if (!(rec->flags & FTRACE_FL_REGS) !=
1681 flag |= FTRACE_FL_REGS; 1759 !(rec->flags & FTRACE_FL_REGS_EN))
1760 flag |= FTRACE_FL_REGS;
1761
1762 if (!(rec->flags & FTRACE_FL_TRAMP) !=
1763 !(rec->flags & FTRACE_FL_TRAMP_EN))
1764 flag |= FTRACE_FL_TRAMP;
1765 }
1682 1766
1683 /* If the state of this record hasn't changed, then do nothing */ 1767 /* If the state of this record hasn't changed, then do nothing */
1684 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1768 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1696,6 +1780,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1696 else 1780 else
1697 rec->flags &= ~FTRACE_FL_REGS_EN; 1781 rec->flags &= ~FTRACE_FL_REGS_EN;
1698 } 1782 }
1783 if (flag & FTRACE_FL_TRAMP) {
1784 if (rec->flags & FTRACE_FL_TRAMP)
1785 rec->flags |= FTRACE_FL_TRAMP_EN;
1786 else
1787 rec->flags &= ~FTRACE_FL_TRAMP_EN;
1788 }
1699 } 1789 }
1700 1790
1701 /* 1791 /*
@@ -1704,7 +1794,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1704 * Otherwise, 1794 * Otherwise,
1705 * return UPDATE_MODIFY_CALL to tell the caller to convert 1795 * return UPDATE_MODIFY_CALL to tell the caller to convert
1706 * from the save regs, to a non-save regs function or 1796 * from the save regs, to a non-save regs function or
1707 * vice versa. 1797 * vice versa, or from a trampoline call.
1708 */ 1798 */
1709 if (flag & FTRACE_FL_ENABLED) 1799 if (flag & FTRACE_FL_ENABLED)
1710 return FTRACE_UPDATE_MAKE_CALL; 1800 return FTRACE_UPDATE_MAKE_CALL;
@@ -1714,7 +1804,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1714 1804
1715 if (update) { 1805 if (update) {
1716 /* If there's no more users, clear all flags */ 1806 /* If there's no more users, clear all flags */
1717 if (!(rec->flags & ~FTRACE_FL_MASK)) 1807 if (!ftrace_rec_count(rec))
1718 rec->flags = 0; 1808 rec->flags = 0;
1719 else 1809 else
1720 /* Just disable the record (keep REGS state) */ 1810 /* Just disable the record (keep REGS state) */
@@ -1751,6 +1841,43 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1751 return ftrace_check_record(rec, enable, 0); 1841 return ftrace_check_record(rec, enable, 0);
1752} 1842}
1753 1843
1844static struct ftrace_ops *
1845ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
1846{
1847 struct ftrace_ops *op;
1848
1849 /* Removed ops need to be tested first */
1850 if (removed_ops && removed_ops->tramp_hash) {
1851 if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip))
1852 return removed_ops;
1853 }
1854
1855 do_for_each_ftrace_op(op, ftrace_ops_list) {
1856 if (!op->tramp_hash)
1857 continue;
1858
1859 if (ftrace_lookup_ip(op->tramp_hash, rec->ip))
1860 return op;
1861
1862 } while_for_each_ftrace_op(op);
1863
1864 return NULL;
1865}
1866
1867static struct ftrace_ops *
1868ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
1869{
1870 struct ftrace_ops *op;
1871
1872 do_for_each_ftrace_op(op, ftrace_ops_list) {
1873 /* pass rec in as regs to have non-NULL val */
1874 if (ftrace_ops_test(op, rec->ip, rec))
1875 return op;
1876 } while_for_each_ftrace_op(op);
1877
1878 return NULL;
1879}
1880
1754/** 1881/**
1755 * ftrace_get_addr_new - Get the call address to set to 1882 * ftrace_get_addr_new - Get the call address to set to
1756 * @rec: The ftrace record descriptor 1883 * @rec: The ftrace record descriptor
@@ -1763,6 +1890,20 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1763 */ 1890 */
1764unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) 1891unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
1765{ 1892{
1893 struct ftrace_ops *ops;
1894
1895 /* Trampolines take precedence over regs */
1896 if (rec->flags & FTRACE_FL_TRAMP) {
1897 ops = ftrace_find_tramp_ops_new(rec);
1898 if (FTRACE_WARN_ON(!ops || !ops->trampoline)) {
1899 pr_warning("Bad trampoline accounting at: %p (%pS)\n",
1900 (void *)rec->ip, (void *)rec->ip);
1901 /* Ftrace is shutting down, return anything */
1902 return (unsigned long)FTRACE_ADDR;
1903 }
1904 return ops->trampoline;
1905 }
1906
1766 if (rec->flags & FTRACE_FL_REGS) 1907 if (rec->flags & FTRACE_FL_REGS)
1767 return (unsigned long)FTRACE_REGS_ADDR; 1908 return (unsigned long)FTRACE_REGS_ADDR;
1768 else 1909 else
@@ -1781,6 +1922,20 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
1781 */ 1922 */
1782unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) 1923unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
1783{ 1924{
1925 struct ftrace_ops *ops;
1926
1927 /* Trampolines take precedence over regs */
1928 if (rec->flags & FTRACE_FL_TRAMP_EN) {
1929 ops = ftrace_find_tramp_ops_curr(rec);
1930 if (FTRACE_WARN_ON(!ops)) {
1931 pr_warning("Bad trampoline accounting at: %p (%pS)\n",
1932 (void *)rec->ip, (void *)rec->ip);
1933 /* Ftrace is shutting down, return anything */
1934 return (unsigned long)FTRACE_ADDR;
1935 }
1936 return ops->trampoline;
1937 }
1938
1784 if (rec->flags & FTRACE_FL_REGS_EN) 1939 if (rec->flags & FTRACE_FL_REGS_EN)
1785 return (unsigned long)FTRACE_REGS_ADDR; 1940 return (unsigned long)FTRACE_REGS_ADDR;
1786 else 1941 else
@@ -2023,6 +2178,89 @@ void __weak arch_ftrace_update_code(int command)
2023 ftrace_run_stop_machine(command); 2178 ftrace_run_stop_machine(command);
2024} 2179}
2025 2180
2181static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops)
2182{
2183 struct ftrace_page *pg;
2184 struct dyn_ftrace *rec;
2185 int size, bits;
2186 int ret;
2187
2188 size = ops->nr_trampolines;
2189 bits = 0;
2190 /*
2191 * Make the hash size about 1/2 the # found
2192 */
2193 for (size /= 2; size; size >>= 1)
2194 bits++;
2195
2196 ops->tramp_hash = alloc_ftrace_hash(bits);
2197 /*
2198 * TODO: a failed allocation is going to screw up
2199 * the accounting of what needs to be modified
2200 * and not. For now, we kill ftrace if we fail
2201 * to allocate here. But there are ways around this,
2202 * but that will take a little more work.
2203 */
2204 if (!ops->tramp_hash)
2205 return -ENOMEM;
2206
2207 do_for_each_ftrace_rec(pg, rec) {
2208 if (ftrace_rec_count(rec) == 1 &&
2209 ftrace_ops_test(ops, rec->ip, rec)) {
2210
2211 /*
2212 * If another ops adds to a rec, the rec will
2213 * lose its trampoline and never get it back
2214 * until all ops are off of it.
2215 */
2216 if (!(rec->flags & FTRACE_FL_TRAMP))
2217 continue;
2218
2219 /* This record had better have a trampoline */
2220 if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN)))
2221 return -1;
2222
2223 ret = add_hash_entry(ops->tramp_hash, rec->ip);
2224 if (ret < 0)
2225 return ret;
2226 }
2227 } while_for_each_ftrace_rec();
2228
2229 /* The number of recs in the hash must match nr_trampolines */
2230 FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines);
2231
2232 return 0;
2233}
2234
2235static int ftrace_save_tramp_hashes(void)
2236{
2237 struct ftrace_ops *op;
2238 int ret;
2239
2240 /*
2241 * Now that any trampoline is being used, we need to save the
2242 * hashes for the ops that have them. This allows the mapping
2243 * back from the record to the ops that has the trampoline to
2244 * know what code is being replaced. Modifying code must always
2245 * verify what it is changing.
2246 */
2247 do_for_each_ftrace_op(op, ftrace_ops_list) {
2248
2249 /* The tramp_hash is recreated each time. */
2250 free_ftrace_hash(op->tramp_hash);
2251 op->tramp_hash = NULL;
2252
2253 if (op->nr_trampolines) {
2254 ret = ftrace_save_ops_tramp_hash(op);
2255 if (ret)
2256 return ret;
2257 }
2258
2259 } while_for_each_ftrace_op(op);
2260
2261 return 0;
2262}
2263
2026static void ftrace_run_update_code(int command) 2264static void ftrace_run_update_code(int command)
2027{ 2265{
2028 int ret; 2266 int ret;
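
ftrace_save_ops_tramp_hash() above sizes the per-ops trampoline hash at roughly half the number of trampolines: it halves nr_trampolines once and then counts how many right shifts it takes to reach zero. A standalone check of that loop, with illustrative counts:

    /* Standalone check of the hash-sizing loop in ftrace_save_ops_tramp_hash(). */
    #include <stdio.h>

    static int tramp_hash_bits(int nr_trampolines)
    {
        int size = nr_trampolines;
        int bits = 0;

        for (size /= 2; size; size >>= 1)
            bits++;
        return bits;
    }

    int main(void)
    {
        /* 100 -> 50, 25, 12, 6, 3, 1: six shifts, so 6 bits and 64 buckets. */
        printf("100 trampolines -> %d bits\n", tramp_hash_bits(100));
        printf("  1 trampoline  -> %d bits\n", tramp_hash_bits(1));   /* 0 bits */
        return 0;
    }

So 100 trampolines get a 2^6 = 64 bucket hash, about half the entry count, matching the "about 1/2 the # found" comment in the hunk.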
@@ -2031,11 +2269,6 @@ static void ftrace_run_update_code(int command)
2031 FTRACE_WARN_ON(ret); 2269 FTRACE_WARN_ON(ret);
2032 if (ret) 2270 if (ret)
2033 return; 2271 return;
2034 /*
2035 * Do not call function tracer while we update the code.
2036 * We are in stop machine.
2037 */
2038 function_trace_stop++;
2039 2272
2040 /* 2273 /*
2041 * By default we use stop_machine() to modify the code. 2274 * By default we use stop_machine() to modify the code.
@@ -2045,15 +2278,15 @@ static void ftrace_run_update_code(int command)
2045 */ 2278 */
2046 arch_ftrace_update_code(command); 2279 arch_ftrace_update_code(command);
2047 2280
2048 function_trace_stop--;
2049
2050 ret = ftrace_arch_code_modify_post_process(); 2281 ret = ftrace_arch_code_modify_post_process();
2051 FTRACE_WARN_ON(ret); 2282 FTRACE_WARN_ON(ret);
2283
2284 ret = ftrace_save_tramp_hashes();
2285 FTRACE_WARN_ON(ret);
2052} 2286}
2053 2287
2054static ftrace_func_t saved_ftrace_func; 2288static ftrace_func_t saved_ftrace_func;
2055static int ftrace_start_up; 2289static int ftrace_start_up;
2056static int global_start_up;
2057 2290
2058static void control_ops_free(struct ftrace_ops *ops) 2291static void control_ops_free(struct ftrace_ops *ops)
2059{ 2292{
@@ -2117,8 +2350,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2117 2350
2118 ftrace_hash_rec_disable(ops, 1); 2351 ftrace_hash_rec_disable(ops, 1);
2119 2352
2120 if (!global_start_up) 2353 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2121 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2122 2354
2123 command |= FTRACE_UPDATE_CALLS; 2355 command |= FTRACE_UPDATE_CALLS;
2124 2356
@@ -2139,8 +2371,16 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2139 return 0; 2371 return 0;
2140 } 2372 }
2141 2373
2374 /*
2375 * If the ops uses a trampoline, then it needs to be
2376 * tested first on update.
2377 */
2378 removed_ops = ops;
2379
2142 ftrace_run_update_code(command); 2380 ftrace_run_update_code(command);
2143 2381
2382 removed_ops = NULL;
2383
2144 /* 2384 /*
2145 * Dynamic ops may be freed, we must make sure that all 2385 * Dynamic ops may be freed, we must make sure that all
2146 * callers are done before leaving this function. 2386 * callers are done before leaving this function.
@@ -2398,7 +2638,8 @@ ftrace_allocate_pages(unsigned long num_to_init)
2398 return start_pg; 2638 return start_pg;
2399 2639
2400 free_pages: 2640 free_pages:
2401 while (start_pg) { 2641 pg = start_pg;
2642 while (pg) {
2402 order = get_count_order(pg->size / ENTRIES_PER_PAGE); 2643 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
2403 free_pages((unsigned long)pg->records, order); 2644 free_pages((unsigned long)pg->records, order);
2404 start_pg = pg->next; 2645 start_pg = pg->next;
@@ -2595,8 +2836,10 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2595 * off, we can short cut and just print out that all 2836 * off, we can short cut and just print out that all
2596 * functions are enabled. 2837 * functions are enabled.
2597 */ 2838 */
2598 if (iter->flags & FTRACE_ITER_FILTER && 2839 if ((iter->flags & FTRACE_ITER_FILTER &&
2599 ftrace_hash_empty(ops->filter_hash)) { 2840 ftrace_hash_empty(ops->filter_hash)) ||
2841 (iter->flags & FTRACE_ITER_NOTRACE &&
2842 ftrace_hash_empty(ops->notrace_hash))) {
2600 if (*pos > 0) 2843 if (*pos > 0)
2601 return t_hash_start(m, pos); 2844 return t_hash_start(m, pos);
2602 iter->flags |= FTRACE_ITER_PRINTALL; 2845 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2641,7 +2884,10 @@ static int t_show(struct seq_file *m, void *v)
2641 return t_hash_show(m, iter); 2884 return t_hash_show(m, iter);
2642 2885
2643 if (iter->flags & FTRACE_ITER_PRINTALL) { 2886 if (iter->flags & FTRACE_ITER_PRINTALL) {
2644 seq_printf(m, "#### all functions enabled ####\n"); 2887 if (iter->flags & FTRACE_ITER_NOTRACE)
2888 seq_printf(m, "#### no functions disabled ####\n");
2889 else
2890 seq_printf(m, "#### all functions enabled ####\n");
2645 return 0; 2891 return 0;
2646 } 2892 }
2647 2893
@@ -2651,10 +2897,22 @@ static int t_show(struct seq_file *m, void *v)
2651 return 0; 2897 return 0;
2652 2898
2653 seq_printf(m, "%ps", (void *)rec->ip); 2899 seq_printf(m, "%ps", (void *)rec->ip);
2654 if (iter->flags & FTRACE_ITER_ENABLED) 2900 if (iter->flags & FTRACE_ITER_ENABLED) {
2655 seq_printf(m, " (%ld)%s", 2901 seq_printf(m, " (%ld)%s",
2656 rec->flags & ~FTRACE_FL_MASK, 2902 ftrace_rec_count(rec),
2657 rec->flags & FTRACE_FL_REGS ? " R" : ""); 2903 rec->flags & FTRACE_FL_REGS ? " R" : " ");
2904 if (rec->flags & FTRACE_FL_TRAMP_EN) {
2905 struct ftrace_ops *ops;
2906
2907 ops = ftrace_find_tramp_ops_curr(rec);
2908 if (ops && ops->trampoline)
2909 seq_printf(m, "\ttramp: %pS",
2910 (void *)ops->trampoline);
2911 else
2912 seq_printf(m, "\ttramp: ERROR!");
2913 }
2914 }
2915
2658 seq_printf(m, "\n"); 2916 seq_printf(m, "\n");
2659 2917
2660 return 0; 2918 return 0;
@@ -2702,13 +2960,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
2702 return iter ? 0 : -ENOMEM; 2960 return iter ? 0 : -ENOMEM;
2703} 2961}
2704 2962
2705static void ftrace_filter_reset(struct ftrace_hash *hash)
2706{
2707 mutex_lock(&ftrace_lock);
2708 ftrace_hash_clear(hash);
2709 mutex_unlock(&ftrace_lock);
2710}
2711
2712/** 2963/**
2713 * ftrace_regex_open - initialize function tracer filter files 2964 * ftrace_regex_open - initialize function tracer filter files
2714 * @ops: The ftrace_ops that hold the hash filters 2965 * @ops: The ftrace_ops that hold the hash filters
@@ -2758,7 +3009,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2758 hash = ops->filter_hash; 3009 hash = ops->filter_hash;
2759 3010
2760 if (file->f_mode & FMODE_WRITE) { 3011 if (file->f_mode & FMODE_WRITE) {
2761 iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); 3012 const int size_bits = FTRACE_HASH_DEFAULT_BITS;
3013
3014 if (file->f_flags & O_TRUNC)
3015 iter->hash = alloc_ftrace_hash(size_bits);
3016 else
3017 iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
3018
2762 if (!iter->hash) { 3019 if (!iter->hash) {
2763 trace_parser_put(&iter->parser); 3020 trace_parser_put(&iter->parser);
2764 kfree(iter); 3021 kfree(iter);
@@ -2767,10 +3024,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2767 } 3024 }
2768 } 3025 }
2769 3026
2770 if ((file->f_mode & FMODE_WRITE) &&
2771 (file->f_flags & O_TRUNC))
2772 ftrace_filter_reset(iter->hash);
2773
2774 if (file->f_mode & FMODE_READ) { 3027 if (file->f_mode & FMODE_READ) {
2775 iter->pg = ftrace_pages_start; 3028 iter->pg = ftrace_pages_start;
2776 3029
@@ -3471,14 +3724,16 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3471 else 3724 else
3472 orig_hash = &ops->notrace_hash; 3725 orig_hash = &ops->notrace_hash;
3473 3726
3474 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); 3727 if (reset)
3728 hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
3729 else
3730 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3731
3475 if (!hash) { 3732 if (!hash) {
3476 ret = -ENOMEM; 3733 ret = -ENOMEM;
3477 goto out_regex_unlock; 3734 goto out_regex_unlock;
3478 } 3735 }
3479 3736
3480 if (reset)
3481 ftrace_filter_reset(hash);
3482 if (buf && !ftrace_match_records(hash, buf, len)) { 3737 if (buf && !ftrace_match_records(hash, buf, len)) {
3483 ret = -EINVAL; 3738 ret = -EINVAL;
3484 goto out_regex_unlock; 3739 goto out_regex_unlock;
@@ -3630,6 +3885,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
3630 3885
3631#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3886#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3632static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; 3887static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
3888static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3633static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); 3889static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
3634 3890
3635static int __init set_graph_function(char *str) 3891static int __init set_graph_function(char *str)
@@ -3639,16 +3895,29 @@ static int __init set_graph_function(char *str)
3639} 3895}
3640__setup("ftrace_graph_filter=", set_graph_function); 3896__setup("ftrace_graph_filter=", set_graph_function);
3641 3897
3642static void __init set_ftrace_early_graph(char *buf) 3898static int __init set_graph_notrace_function(char *str)
3899{
3900 strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
3901 return 1;
3902}
3903__setup("ftrace_graph_notrace=", set_graph_notrace_function);
3904
3905static void __init set_ftrace_early_graph(char *buf, int enable)
3643{ 3906{
3644 int ret; 3907 int ret;
3645 char *func; 3908 char *func;
3909 unsigned long *table = ftrace_graph_funcs;
3910 int *count = &ftrace_graph_count;
3911
3912 if (!enable) {
3913 table = ftrace_graph_notrace_funcs;
3914 count = &ftrace_graph_notrace_count;
3915 }
3646 3916
3647 while (buf) { 3917 while (buf) {
3648 func = strsep(&buf, ","); 3918 func = strsep(&buf, ",");
3649 /* we allow only one expression at a time */ 3919 /* we allow only one expression at a time */
3650 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, 3920 ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func);
3651 FTRACE_GRAPH_MAX_FUNCS, func);
3652 if (ret) 3921 if (ret)
3653 printk(KERN_DEBUG "ftrace: function %s not " 3922 printk(KERN_DEBUG "ftrace: function %s not "
3654 "traceable\n", func); 3923 "traceable\n", func);
@@ -3677,7 +3946,9 @@ static void __init set_ftrace_early_filters(void)
3677 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0); 3946 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
3678#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3947#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3679 if (ftrace_graph_buf[0]) 3948 if (ftrace_graph_buf[0])
3680 set_ftrace_early_graph(ftrace_graph_buf); 3949 set_ftrace_early_graph(ftrace_graph_buf, 1);
3950 if (ftrace_graph_notrace_buf[0])
3951 set_ftrace_early_graph(ftrace_graph_notrace_buf, 0);
3681#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3952#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3682} 3953}
3683 3954
@@ -3819,7 +4090,12 @@ static int g_show(struct seq_file *m, void *v)
3819 return 0; 4090 return 0;
3820 4091
3821 if (ptr == (unsigned long *)1) { 4092 if (ptr == (unsigned long *)1) {
3822 seq_printf(m, "#### all functions enabled ####\n"); 4093 struct ftrace_graph_data *fgd = m->private;
4094
4095 if (fgd->table == ftrace_graph_funcs)
4096 seq_printf(m, "#### all functions enabled ####\n");
4097 else
4098 seq_printf(m, "#### no functions disabled ####\n");
3823 return 0; 4099 return 0;
3824 } 4100 }
3825 4101
@@ -4447,9 +4723,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4447 struct ftrace_ops *op; 4723 struct ftrace_ops *op;
4448 int bit; 4724 int bit;
4449 4725
4450 if (function_trace_stop)
4451 return;
4452
4453 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); 4726 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
4454 if (bit < 0) 4727 if (bit < 0)
4455 return; 4728 return;
@@ -4461,9 +4734,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4461 preempt_disable_notrace(); 4734 preempt_disable_notrace();
4462 do_for_each_ftrace_op(op, ftrace_ops_list) { 4735 do_for_each_ftrace_op(op, ftrace_ops_list) {
4463 if (ftrace_ops_test(op, ip, regs)) { 4736 if (ftrace_ops_test(op, ip, regs)) {
4464 if (WARN_ON(!op->func)) { 4737 if (FTRACE_WARN_ON(!op->func)) {
4465 function_trace_stop = 1; 4738 pr_warn("op=%p %pS\n", op, op);
4466 printk("op=%p %pS\n", op, op);
4467 goto out; 4739 goto out;
4468 } 4740 }
4469 op->func(ip, parent_ip, op, regs); 4741 op->func(ip, parent_ip, op, regs);
@@ -5084,6 +5356,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5084 /* Function graph doesn't use the .func field of global_ops */ 5356 /* Function graph doesn't use the .func field of global_ops */
5085 global_ops.flags |= FTRACE_OPS_FL_STUB; 5357 global_ops.flags |= FTRACE_OPS_FL_STUB;
5086 5358
5359#ifdef CONFIG_DYNAMIC_FTRACE
5360 /* Optimize function graph calling (if implemented by arch) */
5361 if (FTRACE_GRAPH_TRAMP_ADDR != 0)
5362 global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR;
5363#endif
5364
5087 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); 5365 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
5088 5366
5089out: 5367out:
@@ -5104,6 +5382,10 @@ void unregister_ftrace_graph(void)
5104 __ftrace_graph_entry = ftrace_graph_entry_stub; 5382 __ftrace_graph_entry = ftrace_graph_entry_stub;
5105 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); 5383 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
5106 global_ops.flags &= ~FTRACE_OPS_FL_STUB; 5384 global_ops.flags &= ~FTRACE_OPS_FL_STUB;
5385#ifdef CONFIG_DYNAMIC_FTRACE
5386 if (FTRACE_GRAPH_TRAMP_ADDR != 0)
5387 global_ops.trampoline = 0;
5388#endif
5107 unregister_pm_notifier(&ftrace_suspend_notifier); 5389 unregister_pm_notifier(&ftrace_suspend_notifier);
5108 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5390 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5109 5391
@@ -5183,9 +5465,4 @@ void ftrace_graph_exit_task(struct task_struct *t)
5183 5465
5184 kfree(ret_stack); 5466 kfree(ret_stack);
5185} 5467}
5186
5187void ftrace_graph_stop(void)
5188{
5189 ftrace_stop();
5190}
5191#endif 5468#endif
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7c56c3d06943..925f629658d6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -616,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
616 struct ring_buffer_per_cpu *cpu_buffer; 616 struct ring_buffer_per_cpu *cpu_buffer;
617 struct rb_irq_work *work; 617 struct rb_irq_work *work;
618 618
619 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
620 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
621 return POLLIN | POLLRDNORM;
622
623 if (cpu == RING_BUFFER_ALL_CPUS) 619 if (cpu == RING_BUFFER_ALL_CPUS)
624 work = &buffer->irq_work; 620 work = &buffer->irq_work;
625 else { 621 else {
@@ -1693,22 +1689,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1693 if (!cpu_buffer->nr_pages_to_update) 1689 if (!cpu_buffer->nr_pages_to_update)
1694 continue; 1690 continue;
1695 1691
1696 /* The update must run on the CPU that is being updated. */ 1692 /* Can't run something on an offline CPU. */
1697 preempt_disable(); 1693 if (!cpu_online(cpu)) {
1698 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
1699 rb_update_pages(cpu_buffer); 1694 rb_update_pages(cpu_buffer);
1700 cpu_buffer->nr_pages_to_update = 0; 1695 cpu_buffer->nr_pages_to_update = 0;
1701 } else { 1696 } else {
1702 /*
1703 * Can not disable preemption for schedule_work_on()
1704 * on PREEMPT_RT.
1705 */
1706 preempt_enable();
1707 schedule_work_on(cpu, 1697 schedule_work_on(cpu,
1708 &cpu_buffer->update_pages_work); 1698 &cpu_buffer->update_pages_work);
1709 preempt_disable();
1710 } 1699 }
1711 preempt_enable();
1712 } 1700 }
1713 1701
1714 /* wait for all the updates to complete */ 1702 /* wait for all the updates to complete */
@@ -1746,22 +1734,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1746 1734
1747 get_online_cpus(); 1735 get_online_cpus();
1748 1736
1749 preempt_disable(); 1737 /* Can't run something on an offline CPU. */
1750 /* The update must run on the CPU that is being updated. */ 1738 if (!cpu_online(cpu_id))
1751 if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
1752 rb_update_pages(cpu_buffer); 1739 rb_update_pages(cpu_buffer);
1753 else { 1740 else {
1754 /*
1755 * Can not disable preemption for schedule_work_on()
1756 * on PREEMPT_RT.
1757 */
1758 preempt_enable();
1759 schedule_work_on(cpu_id, 1741 schedule_work_on(cpu_id,
1760 &cpu_buffer->update_pages_work); 1742 &cpu_buffer->update_pages_work);
1761 wait_for_completion(&cpu_buffer->update_done); 1743 wait_for_completion(&cpu_buffer->update_done);
1762 preempt_disable();
1763 } 1744 }
1764 preempt_enable();
1765 1745
1766 cpu_buffer->nr_pages_to_update = 0; 1746 cpu_buffer->nr_pages_to_update = 0;
1767 put_online_cpus(); 1747 put_online_cpus();
@@ -3779,7 +3759,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3779 if (rb_per_cpu_empty(cpu_buffer)) 3759 if (rb_per_cpu_empty(cpu_buffer))
3780 return NULL; 3760 return NULL;
3781 3761
3782 if (iter->head >= local_read(&iter->head_page->page->commit)) { 3762 if (iter->head >= rb_page_size(iter->head_page)) {
3783 rb_inc_iter(iter); 3763 rb_inc_iter(iter);
3784 goto again; 3764 goto again;
3785 } 3765 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 84e2b45c0934..8a528392b1f4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -466,6 +466,12 @@ int __trace_puts(unsigned long ip, const char *str, int size)
466 struct print_entry *entry; 466 struct print_entry *entry;
467 unsigned long irq_flags; 467 unsigned long irq_flags;
468 int alloc; 468 int alloc;
469 int pc;
470
471 if (!(trace_flags & TRACE_ITER_PRINTK))
472 return 0;
473
474 pc = preempt_count();
469 475
470 if (unlikely(tracing_selftest_running || tracing_disabled)) 476 if (unlikely(tracing_selftest_running || tracing_disabled))
471 return 0; 477 return 0;
@@ -475,7 +481,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
475 local_save_flags(irq_flags); 481 local_save_flags(irq_flags);
476 buffer = global_trace.trace_buffer.buffer; 482 buffer = global_trace.trace_buffer.buffer;
477 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, 483 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
478 irq_flags, preempt_count()); 484 irq_flags, pc);
479 if (!event) 485 if (!event)
480 return 0; 486 return 0;
481 487
@@ -492,6 +498,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
492 entry->buf[size] = '\0'; 498 entry->buf[size] = '\0';
493 499
494 __buffer_unlock_commit(buffer, event); 500 __buffer_unlock_commit(buffer, event);
501 ftrace_trace_stack(buffer, irq_flags, 4, pc);
495 502
496 return size; 503 return size;
497} 504}
@@ -509,6 +516,12 @@ int __trace_bputs(unsigned long ip, const char *str)
509 struct bputs_entry *entry; 516 struct bputs_entry *entry;
510 unsigned long irq_flags; 517 unsigned long irq_flags;
511 int size = sizeof(struct bputs_entry); 518 int size = sizeof(struct bputs_entry);
519 int pc;
520
521 if (!(trace_flags & TRACE_ITER_PRINTK))
522 return 0;
523
524 pc = preempt_count();
512 525
513 if (unlikely(tracing_selftest_running || tracing_disabled)) 526 if (unlikely(tracing_selftest_running || tracing_disabled))
514 return 0; 527 return 0;
@@ -516,7 +529,7 @@ int __trace_bputs(unsigned long ip, const char *str)
516 local_save_flags(irq_flags); 529 local_save_flags(irq_flags);
517 buffer = global_trace.trace_buffer.buffer; 530 buffer = global_trace.trace_buffer.buffer;
518 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, 531 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
519 irq_flags, preempt_count()); 532 irq_flags, pc);
520 if (!event) 533 if (!event)
521 return 0; 534 return 0;
522 535
@@ -525,6 +538,7 @@ int __trace_bputs(unsigned long ip, const char *str)
525 entry->str = str; 538 entry->str = str;
526 539
527 __buffer_unlock_commit(buffer, event); 540 __buffer_unlock_commit(buffer, event);
541 ftrace_trace_stack(buffer, irq_flags, 4, pc);
528 542
529 return 1; 543 return 1;
530} 544}
@@ -809,7 +823,7 @@ static struct {
809 { trace_clock_local, "local", 1 }, 823 { trace_clock_local, "local", 1 },
810 { trace_clock_global, "global", 1 }, 824 { trace_clock_global, "global", 1 },
811 { trace_clock_counter, "counter", 0 }, 825 { trace_clock_counter, "counter", 0 },
812 { trace_clock_jiffies, "uptime", 1 }, 826 { trace_clock_jiffies, "uptime", 0 },
813 { trace_clock, "perf", 1 }, 827 { trace_clock, "perf", 1 },
814 { ktime_get_mono_fast_ns, "mono", 1 }, 828 { ktime_get_mono_fast_ns, "mono", 1 },
815 ARCH_TRACE_CLOCKS 829 ARCH_TRACE_CLOCKS
@@ -924,30 +938,6 @@ out:
924 return ret; 938 return ret;
925} 939}
926 940
927ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
928{
929 int len;
930 int ret;
931
932 if (!cnt)
933 return 0;
934
935 if (s->len <= s->readpos)
936 return -EBUSY;
937
938 len = s->len - s->readpos;
939 if (cnt > len)
940 cnt = len;
941 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
942 if (ret == cnt)
943 return -EFAULT;
944
945 cnt -= ret;
946
947 s->readpos += cnt;
948 return cnt;
949}
950
951static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) 941static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
952{ 942{
953 int len; 943 int len;
@@ -3686,6 +3676,7 @@ static const char readme_msg[] =
3686#endif 3676#endif
3687#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3677#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3688 " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" 3678 " set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
3679 " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n"
3689 " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" 3680 " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
3690#endif 3681#endif
3691#ifdef CONFIG_TRACER_SNAPSHOT 3682#ifdef CONFIG_TRACER_SNAPSHOT
@@ -4225,10 +4216,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
4225} 4216}
4226 4217
4227static ssize_t 4218static ssize_t
4228tracing_max_lat_read(struct file *filp, char __user *ubuf, 4219tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
4229 size_t cnt, loff_t *ppos) 4220 size_t cnt, loff_t *ppos)
4230{ 4221{
4231 unsigned long *ptr = filp->private_data;
4232 char buf[64]; 4222 char buf[64];
4233 int r; 4223 int r;
4234 4224
@@ -4240,10 +4230,9 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf,
4240} 4230}
4241 4231
4242static ssize_t 4232static ssize_t
4243tracing_max_lat_write(struct file *filp, const char __user *ubuf, 4233tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
4244 size_t cnt, loff_t *ppos) 4234 size_t cnt, loff_t *ppos)
4245{ 4235{
4246 unsigned long *ptr = filp->private_data;
4247 unsigned long val; 4236 unsigned long val;
4248 int ret; 4237 int ret;
4249 4238
@@ -4256,6 +4245,52 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
4256 return cnt; 4245 return cnt;
4257} 4246}
4258 4247
4248static ssize_t
4249tracing_thresh_read(struct file *filp, char __user *ubuf,
4250 size_t cnt, loff_t *ppos)
4251{
4252 return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos);
4253}
4254
4255static ssize_t
4256tracing_thresh_write(struct file *filp, const char __user *ubuf,
4257 size_t cnt, loff_t *ppos)
4258{
4259 struct trace_array *tr = filp->private_data;
4260 int ret;
4261
4262 mutex_lock(&trace_types_lock);
4263 ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);
4264 if (ret < 0)
4265 goto out;
4266
4267 if (tr->current_trace->update_thresh) {
4268 ret = tr->current_trace->update_thresh(tr);
4269 if (ret < 0)
4270 goto out;
4271 }
4272
4273 ret = cnt;
4274out:
4275 mutex_unlock(&trace_types_lock);
4276
4277 return ret;
4278}
4279
4280static ssize_t
4281tracing_max_lat_read(struct file *filp, char __user *ubuf,
4282 size_t cnt, loff_t *ppos)
4283{
4284 return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos);
4285}
4286
4287static ssize_t
4288tracing_max_lat_write(struct file *filp, const char __user *ubuf,
4289 size_t cnt, loff_t *ppos)
4290{
4291 return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
4292}
4293
4259static int tracing_open_pipe(struct inode *inode, struct file *filp) 4294static int tracing_open_pipe(struct inode *inode, struct file *filp)
4260{ 4295{
4261 struct trace_array *tr = inode->i_private; 4296 struct trace_array *tr = inode->i_private;
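The hunk above pulls the old tracing_max_lat_read()/write() bodies into tracing_nsecs_read()/tracing_nsecs_write() helpers that operate on a caller-supplied pointer, then rebuilds tracing_max_lat_* and the new tracing_thresh_* as thin wrappers; the thresh writer additionally calls the tracer's update_thresh() hook under trace_types_lock. A small userspace sketch of that "one helper, many wrappers" shape, with invented names and no locking:

#include <stdio.h>
#include <stdlib.h>

static unsigned long tracing_thresh;
static unsigned long max_latency;

/* Shared helper: parse a decimal string into the value behind *ptr. */
static int nsecs_write(unsigned long *ptr, const char *buf)
{
        char *end;
        unsigned long val = strtoul(buf, &end, 10);

        if (end == buf)
                return -1;
        *ptr = val;
        return 0;
}

/* Thin wrappers bind the helper to one variable each, like the   */
/* tracing_thresh_write()/tracing_max_lat_write() pair above.     */
static int thresh_write(const char *buf)  { return nsecs_write(&tracing_thresh, buf); }
static int max_lat_write(const char *buf) { return nsecs_write(&max_latency, buf); }

int main(void)
{
        thresh_write("100000");
        max_lat_write("250000");
        printf("thresh=%lu max_lat=%lu\n", tracing_thresh, max_latency);
        return 0;
}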
@@ -5157,6 +5192,13 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp)
5157#endif /* CONFIG_TRACER_SNAPSHOT */ 5192#endif /* CONFIG_TRACER_SNAPSHOT */
5158 5193
5159 5194
5195static const struct file_operations tracing_thresh_fops = {
5196 .open = tracing_open_generic,
5197 .read = tracing_thresh_read,
5198 .write = tracing_thresh_write,
5199 .llseek = generic_file_llseek,
5200};
5201
5160static const struct file_operations tracing_max_lat_fops = { 5202static const struct file_operations tracing_max_lat_fops = {
5161 .open = tracing_open_generic, 5203 .open = tracing_open_generic,
5162 .read = tracing_max_lat_read, 5204 .read = tracing_max_lat_read,
@@ -6094,10 +6136,8 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
6094 if (!topts) 6136 if (!topts)
6095 return; 6137 return;
6096 6138
6097 for (cnt = 0; topts[cnt].opt; cnt++) { 6139 for (cnt = 0; topts[cnt].opt; cnt++)
6098 if (topts[cnt].entry) 6140 debugfs_remove(topts[cnt].entry);
6099 debugfs_remove(topts[cnt].entry);
6100 }
6101 6141
6102 kfree(topts); 6142 kfree(topts);
6103} 6143}
@@ -6520,7 +6560,7 @@ static __init int tracer_init_debugfs(void)
6520 init_tracer_debugfs(&global_trace, d_tracer); 6560 init_tracer_debugfs(&global_trace, d_tracer);
6521 6561
6522 trace_create_file("tracing_thresh", 0644, d_tracer, 6562 trace_create_file("tracing_thresh", 0644, d_tracer,
6523 &tracing_thresh, &tracing_max_lat_fops); 6563 &global_trace, &tracing_thresh_fops);
6524 6564
6525 trace_create_file("README", 0444, d_tracer, 6565 trace_create_file("README", 0444, d_tracer,
6526 NULL, &tracing_readme_fops); 6566 NULL, &tracing_readme_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9258f5a815db..385391fb1d3b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -339,6 +339,7 @@ struct tracer_flags {
339 * @reset: called when one switches to another tracer 339 * @reset: called when one switches to another tracer
340 * @start: called when tracing is unpaused (echo 1 > tracing_enabled) 340 * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
341 * @stop: called when tracing is paused (echo 0 > tracing_enabled) 341 * @stop: called when tracing is paused (echo 0 > tracing_enabled)
342 * @update_thresh: called when tracing_thresh is updated
342 * @open: called when the trace file is opened 343 * @open: called when the trace file is opened
343 * @pipe_open: called when the trace_pipe file is opened 344 * @pipe_open: called when the trace_pipe file is opened
344 * @close: called when the trace file is released 345 * @close: called when the trace file is released
@@ -357,6 +358,7 @@ struct tracer {
357 void (*reset)(struct trace_array *tr); 358 void (*reset)(struct trace_array *tr);
358 void (*start)(struct trace_array *tr); 359 void (*start)(struct trace_array *tr);
359 void (*stop)(struct trace_array *tr); 360 void (*stop)(struct trace_array *tr);
361 int (*update_thresh)(struct trace_array *tr);
360 void (*open)(struct trace_iterator *iter); 362 void (*open)(struct trace_iterator *iter);
361 void (*pipe_open)(struct trace_iterator *iter); 363 void (*pipe_open)(struct trace_iterator *iter);
362 void (*close)(struct trace_iterator *iter); 364 void (*close)(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 26dc348332b7..57b67b1f24d1 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -59,13 +59,14 @@ u64 notrace trace_clock(void)
59 59
60/* 60/*
61 * trace_jiffy_clock(): Simply use jiffies as a clock counter. 61 * trace_jiffy_clock(): Simply use jiffies as a clock counter.
62 * Note that this use of jiffies_64 is not completely safe on
63 * 32-bit systems. But the window is tiny, and the effect if
64 * we are affected is that we will have an obviously bogus
65 * timestamp on a trace event - i.e. not life threatening.
62 */ 66 */
63u64 notrace trace_clock_jiffies(void) 67u64 notrace trace_clock_jiffies(void)
64{ 68{
65 u64 jiffy = jiffies - INITIAL_JIFFIES; 69 return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
66
67 /* Return nsecs */
68 return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
69} 70}
70 71
71/* 72/*
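With the rewrite above, trace_clock_jiffies() reports clock_t ticks via jiffies_64_to_clock_t() instead of synthesizing nanoseconds from jiffies, which is why the "uptime" entry in the trace.c clock table hunk above has its in_ns flag flipped to 0. A quick sketch of the unit change; the HZ and USER_HZ values are assumed for illustration only, since both are configuration dependent:

#include <stdio.h>

/* Assumed values for illustration; real kernels vary. */
#define HZ      1000ULL
#define USER_HZ  100ULL

int main(void)
{
        unsigned long long jiffies_delta = 2500;        /* 2.5 s at HZ=1000 */

        /* Old behaviour: scale jiffies up to nanoseconds. */
        unsigned long long ns = jiffies_delta * (1000000000ULL / HZ);

        /* New behaviour: USER_HZ ticks, like jiffies_64_to_clock_t(). */
        unsigned long long ticks = jiffies_delta * USER_HZ / HZ;

        printf("old: %llu ns, new: %llu clock_t ticks\n", ns, ticks);
        return 0;
}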
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 5d12bb407b44..4b9c114ee9de 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -30,6 +30,18 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
30 return ret; 30 return ret;
31 } 31 }
32 32
33 /*
34 * We checked and allowed creation of the parent,
35 * so allow children without checking.
36 */
37 if (p_event->parent)
38 return 0;
39
40 /*
41 * It's ok to check current process (owner) permissions in here,
42 * because code below is called only via perf_event_open syscall.
43 */
44
33 /* The ftrace function trace is allowed only for root. */ 45 /* The ftrace function trace is allowed only for root. */
34 if (ftrace_event_is_function(tp_event)) { 46 if (ftrace_event_is_function(tp_event)) {
35 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) 47 if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f99e0b3bca8c..ef06ce7e9cf8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,6 +8,8 @@
8 * 8 *
9 */ 9 */
10 10
11#define pr_fmt(fmt) fmt
12
11#include <linux/workqueue.h> 13#include <linux/workqueue.h>
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
13#include <linux/kthread.h> 15#include <linux/kthread.h>
@@ -470,6 +472,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
470 472
471 list_del(&file->list); 473 list_del(&file->list);
472 remove_subsystem(file->system); 474 remove_subsystem(file->system);
475 free_event_filter(file->filter);
473 kmem_cache_free(file_cachep, file); 476 kmem_cache_free(file_cachep, file);
474} 477}
475 478
@@ -1490,7 +1493,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1490 1493
1491 dir->entry = debugfs_create_dir(name, parent); 1494 dir->entry = debugfs_create_dir(name, parent);
1492 if (!dir->entry) { 1495 if (!dir->entry) {
1493 pr_warning("Failed to create system directory %s\n", name); 1496 pr_warn("Failed to create system directory %s\n", name);
1494 __put_system(system); 1497 __put_system(system);
1495 goto out_free; 1498 goto out_free;
1496 } 1499 }
@@ -1506,7 +1509,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1506 if (!entry) { 1509 if (!entry) {
1507 kfree(system->filter); 1510 kfree(system->filter);
1508 system->filter = NULL; 1511 system->filter = NULL;
1509 pr_warning("Could not create debugfs '%s/filter' entry\n", name); 1512 pr_warn("Could not create debugfs '%s/filter' entry\n", name);
1510 } 1513 }
1511 1514
1512 trace_create_file("enable", 0644, dir->entry, dir, 1515 trace_create_file("enable", 0644, dir->entry, dir,
@@ -1521,8 +1524,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
1521 out_fail: 1524 out_fail:
1522 /* Only print this message if failed on memory allocation */ 1525 /* Only print this message if failed on memory allocation */
1523 if (!dir || !system) 1526 if (!dir || !system)
1524 pr_warning("No memory to create event subsystem %s\n", 1527 pr_warn("No memory to create event subsystem %s\n", name);
1525 name);
1526 return NULL; 1528 return NULL;
1527} 1529}
1528 1530
@@ -1550,8 +1552,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1550 name = ftrace_event_name(call); 1552 name = ftrace_event_name(call);
1551 file->dir = debugfs_create_dir(name, d_events); 1553 file->dir = debugfs_create_dir(name, d_events);
1552 if (!file->dir) { 1554 if (!file->dir) {
1553 pr_warning("Could not create debugfs '%s' directory\n", 1555 pr_warn("Could not create debugfs '%s' directory\n", name);
1554 name);
1555 return -1; 1556 return -1;
1556 } 1557 }
1557 1558
@@ -1574,8 +1575,8 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
1574 if (list_empty(head)) { 1575 if (list_empty(head)) {
1575 ret = call->class->define_fields(call); 1576 ret = call->class->define_fields(call);
1576 if (ret < 0) { 1577 if (ret < 0) {
1577 pr_warning("Could not initialize trace point" 1578 pr_warn("Could not initialize trace point events/%s\n",
1578 " events/%s\n", name); 1579 name);
1579 return -1; 1580 return -1;
1580 } 1581 }
1581 } 1582 }
@@ -1620,7 +1621,6 @@ static void event_remove(struct ftrace_event_call *call)
1620 if (file->event_call != call) 1621 if (file->event_call != call)
1621 continue; 1622 continue;
1622 ftrace_event_enable_disable(file, 0); 1623 ftrace_event_enable_disable(file, 0);
1623 destroy_preds(file);
1624 /* 1624 /*
1625 * The do_for_each_event_file() is 1625 * The do_for_each_event_file() is
1626 * a double loop. After finding the call for this 1626 * a double loop. After finding the call for this
@@ -1648,8 +1648,7 @@ static int event_init(struct ftrace_event_call *call)
1648 if (call->class->raw_init) { 1648 if (call->class->raw_init) {
1649 ret = call->class->raw_init(call); 1649 ret = call->class->raw_init(call);
1650 if (ret < 0 && ret != -ENOSYS) 1650 if (ret < 0 && ret != -ENOSYS)
1651 pr_warn("Could not initialize trace events/%s\n", 1651 pr_warn("Could not initialize trace events/%s\n", name);
1652 name);
1653 } 1652 }
1654 1653
1655 return ret; 1654 return ret;
@@ -1748,7 +1747,8 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
1748{ 1747{
1749 event_remove(call); 1748 event_remove(call);
1750 trace_destroy_fields(call); 1749 trace_destroy_fields(call);
1751 destroy_call_preds(call); 1750 free_event_filter(call->filter);
1751 call->filter = NULL;
1752} 1752}
1753 1753
1754static int probe_remove_event_call(struct ftrace_event_call *call) 1754static int probe_remove_event_call(struct ftrace_event_call *call)
@@ -1894,8 +1894,8 @@ __trace_add_event_dirs(struct trace_array *tr)
1894 list_for_each_entry(call, &ftrace_events, list) { 1894 list_for_each_entry(call, &ftrace_events, list) {
1895 ret = __trace_add_new_event(call, tr); 1895 ret = __trace_add_new_event(call, tr);
1896 if (ret < 0) 1896 if (ret < 0)
1897 pr_warning("Could not create directory for event %s\n", 1897 pr_warn("Could not create directory for event %s\n",
1898 ftrace_event_name(call)); 1898 ftrace_event_name(call));
1899 } 1899 }
1900} 1900}
1901 1901
@@ -2207,8 +2207,8 @@ __trace_early_add_event_dirs(struct trace_array *tr)
2207 list_for_each_entry(file, &tr->events, list) { 2207 list_for_each_entry(file, &tr->events, list) {
2208 ret = event_create_dir(tr->event_dir, file); 2208 ret = event_create_dir(tr->event_dir, file);
2209 if (ret < 0) 2209 if (ret < 0)
2210 pr_warning("Could not create directory for event %s\n", 2210 pr_warn("Could not create directory for event %s\n",
2211 ftrace_event_name(file->event_call)); 2211 ftrace_event_name(file->event_call));
2212 } 2212 }
2213} 2213}
2214 2214
@@ -2231,8 +2231,8 @@ __trace_early_add_events(struct trace_array *tr)
2231 2231
2232 ret = __trace_early_add_new_event(call, tr); 2232 ret = __trace_early_add_new_event(call, tr);
2233 if (ret < 0) 2233 if (ret < 0)
2234 pr_warning("Could not create early event %s\n", 2234 pr_warn("Could not create early event %s\n",
2235 ftrace_event_name(call)); 2235 ftrace_event_name(call));
2236 } 2236 }
2237} 2237}
2238 2238
@@ -2279,13 +2279,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
2279 entry = debugfs_create_file("set_event", 0644, parent, 2279 entry = debugfs_create_file("set_event", 0644, parent,
2280 tr, &ftrace_set_event_fops); 2280 tr, &ftrace_set_event_fops);
2281 if (!entry) { 2281 if (!entry) {
2282 pr_warning("Could not create debugfs 'set_event' entry\n"); 2282 pr_warn("Could not create debugfs 'set_event' entry\n");
2283 return -ENOMEM; 2283 return -ENOMEM;
2284 } 2284 }
2285 2285
2286 d_events = debugfs_create_dir("events", parent); 2286 d_events = debugfs_create_dir("events", parent);
2287 if (!d_events) { 2287 if (!d_events) {
2288 pr_warning("Could not create debugfs 'events' directory\n"); 2288 pr_warn("Could not create debugfs 'events' directory\n");
2289 return -ENOMEM; 2289 return -ENOMEM;
2290 } 2290 }
2291 2291
@@ -2461,11 +2461,10 @@ static __init int event_trace_init(void)
2461 entry = debugfs_create_file("available_events", 0444, d_tracer, 2461 entry = debugfs_create_file("available_events", 0444, d_tracer,
2462 tr, &ftrace_avail_fops); 2462 tr, &ftrace_avail_fops);
2463 if (!entry) 2463 if (!entry)
2464 pr_warning("Could not create debugfs " 2464 pr_warn("Could not create debugfs 'available_events' entry\n");
2465 "'available_events' entry\n");
2466 2465
2467 if (trace_define_common_fields()) 2466 if (trace_define_common_fields())
2468 pr_warning("tracing: Failed to allocate common fields"); 2467 pr_warn("tracing: Failed to allocate common fields");
2469 2468
2470 ret = early_event_add_tracer(d_tracer, tr); 2469 ret = early_event_add_tracer(d_tracer, tr);
2471 if (ret) 2470 if (ret)
@@ -2474,7 +2473,7 @@ static __init int event_trace_init(void)
2474#ifdef CONFIG_MODULES 2473#ifdef CONFIG_MODULES
2475 ret = register_module_notifier(&trace_module_nb); 2474 ret = register_module_notifier(&trace_module_nb);
2476 if (ret) 2475 if (ret)
2477 pr_warning("Failed to register trace events module notifier\n"); 2476 pr_warn("Failed to register trace events module notifier\n");
2478#endif 2477#endif
2479 return 0; 2478 return 0;
2480} 2479}
@@ -2578,7 +2577,7 @@ static __init void event_trace_self_tests(void)
2578 * it and the self test should not be on. 2577 * it and the self test should not be on.
2579 */ 2578 */
2580 if (file->flags & FTRACE_EVENT_FL_ENABLED) { 2579 if (file->flags & FTRACE_EVENT_FL_ENABLED) {
2581 pr_warning("Enabled event during self test!\n"); 2580 pr_warn("Enabled event during self test!\n");
2582 WARN_ON_ONCE(1); 2581 WARN_ON_ONCE(1);
2583 continue; 2582 continue;
2584 } 2583 }
@@ -2606,8 +2605,8 @@ static __init void event_trace_self_tests(void)
2606 2605
2607 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); 2606 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
2608 if (WARN_ON_ONCE(ret)) { 2607 if (WARN_ON_ONCE(ret)) {
2609 pr_warning("error enabling system %s\n", 2608 pr_warn("error enabling system %s\n",
2610 system->name); 2609 system->name);
2611 continue; 2610 continue;
2612 } 2611 }
2613 2612
@@ -2615,8 +2614,8 @@ static __init void event_trace_self_tests(void)
2615 2614
2616 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); 2615 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
2617 if (WARN_ON_ONCE(ret)) { 2616 if (WARN_ON_ONCE(ret)) {
2618 pr_warning("error disabling system %s\n", 2617 pr_warn("error disabling system %s\n",
2619 system->name); 2618 system->name);
2620 continue; 2619 continue;
2621 } 2620 }
2622 2621
@@ -2630,7 +2629,7 @@ static __init void event_trace_self_tests(void)
2630 2629
2631 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); 2630 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
2632 if (WARN_ON_ONCE(ret)) { 2631 if (WARN_ON_ONCE(ret)) {
2633 pr_warning("error enabling all events\n"); 2632 pr_warn("error enabling all events\n");
2634 return; 2633 return;
2635 } 2634 }
2636 2635
@@ -2639,7 +2638,7 @@ static __init void event_trace_self_tests(void)
2639 /* reset sysname */ 2638 /* reset sysname */
2640 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); 2639 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
2641 if (WARN_ON_ONCE(ret)) { 2640 if (WARN_ON_ONCE(ret)) {
2642 pr_warning("error disabling all events\n"); 2641 pr_warn("error disabling all events\n");
2643 return; 2642 return;
2644 } 2643 }
2645 2644
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8a8631926a07..7a8c1528e141 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -774,17 +774,12 @@ static void __free_preds(struct event_filter *filter)
774 filter->n_preds = 0; 774 filter->n_preds = 0;
775} 775}
776 776
777static void call_filter_disable(struct ftrace_event_call *call)
778{
779 call->flags &= ~TRACE_EVENT_FL_FILTERED;
780}
781
782static void filter_disable(struct ftrace_event_file *file) 777static void filter_disable(struct ftrace_event_file *file)
783{ 778{
784 struct ftrace_event_call *call = file->event_call; 779 struct ftrace_event_call *call = file->event_call;
785 780
786 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) 781 if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
787 call_filter_disable(call); 782 call->flags &= ~TRACE_EVENT_FL_FILTERED;
788 else 783 else
789 file->flags &= ~FTRACE_EVENT_FL_FILTERED; 784 file->flags &= ~FTRACE_EVENT_FL_FILTERED;
790} 785}
@@ -804,32 +799,6 @@ void free_event_filter(struct event_filter *filter)
804 __free_filter(filter); 799 __free_filter(filter);
805} 800}
806 801
807void destroy_call_preds(struct ftrace_event_call *call)
808{
809 __free_filter(call->filter);
810 call->filter = NULL;
811}
812
813static void destroy_file_preds(struct ftrace_event_file *file)
814{
815 __free_filter(file->filter);
816 file->filter = NULL;
817}
818
819/*
820 * Called when destroying the ftrace_event_file.
821 * The file is being freed, so we do not need to worry about
822 * the file being currently used. This is for module code removing
823 * the tracepoints from within it.
824 */
825void destroy_preds(struct ftrace_event_file *file)
826{
827 if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
828 destroy_call_preds(file->event_call);
829 else
830 destroy_file_preds(file);
831}
832
833static struct event_filter *__alloc_filter(void) 802static struct event_filter *__alloc_filter(void)
834{ 803{
835 struct event_filter *filter; 804 struct event_filter *filter;
@@ -873,17 +842,14 @@ static inline void __remove_filter(struct ftrace_event_file *file)
873 remove_filter_string(file->filter); 842 remove_filter_string(file->filter);
874} 843}
875 844
876static void filter_free_subsystem_preds(struct event_subsystem *system, 845static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
877 struct trace_array *tr) 846 struct trace_array *tr)
878{ 847{
879 struct ftrace_event_file *file; 848 struct ftrace_event_file *file;
880 struct ftrace_event_call *call;
881 849
882 list_for_each_entry(file, &tr->events, list) { 850 list_for_each_entry(file, &tr->events, list) {
883 call = file->event_call; 851 if (file->system != dir)
884 if (strcmp(call->class->system, system->name) != 0)
885 continue; 852 continue;
886
887 __remove_filter(file); 853 __remove_filter(file);
888 } 854 }
889} 855}
@@ -901,15 +867,13 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file)
901 } 867 }
902} 868}
903 869
904static void filter_free_subsystem_filters(struct event_subsystem *system, 870static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir,
905 struct trace_array *tr) 871 struct trace_array *tr)
906{ 872{
907 struct ftrace_event_file *file; 873 struct ftrace_event_file *file;
908 struct ftrace_event_call *call;
909 874
910 list_for_each_entry(file, &tr->events, list) { 875 list_for_each_entry(file, &tr->events, list) {
911 call = file->event_call; 876 if (file->system != dir)
912 if (strcmp(call->class->system, system->name) != 0)
913 continue; 877 continue;
914 __free_subsystem_filter(file); 878 __free_subsystem_filter(file);
915 } 879 }
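The filter hunks above stop matching event files by system name (a strcmp() against system->name) and instead compare the file's subsystem-dir pointer with the directory being operated on. The sketch below uses invented structures to show why the pointer test is the stricter match when two directories could carry the same system name; whether that situation occurs in practice is not something these hunks state:

#include <stdio.h>
#include <string.h>

struct subsystem_dir { const char *name; };

struct event_file {
        const char *system_name;
        const struct subsystem_dir *system;   /* directory this file hangs under */
};

int main(void)
{
        struct subsystem_dir a = { "sched" }, b = { "sched" };
        struct event_file files[] = {
                { "sched", &a },
                { "sched", &b },        /* same name, different directory */
        };

        for (unsigned int i = 0; i < 2; i++) {
                int by_name = strcmp(files[i].system_name, a.name) == 0;
                int by_dir  = files[i].system == &a;
                printf("file %u: by_name=%d by_dir=%d\n", i, by_name, by_dir);
        }
        return 0;
}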
@@ -1582,7 +1546,6 @@ static int fold_pred_tree(struct event_filter *filter,
1582static int replace_preds(struct ftrace_event_call *call, 1546static int replace_preds(struct ftrace_event_call *call,
1583 struct event_filter *filter, 1547 struct event_filter *filter,
1584 struct filter_parse_state *ps, 1548 struct filter_parse_state *ps,
1585 char *filter_string,
1586 bool dry_run) 1549 bool dry_run)
1587{ 1550{
1588 char *operand1 = NULL, *operand2 = NULL; 1551 char *operand1 = NULL, *operand2 = NULL;
@@ -1755,13 +1718,12 @@ struct filter_list {
1755 struct event_filter *filter; 1718 struct event_filter *filter;
1756}; 1719};
1757 1720
1758static int replace_system_preds(struct event_subsystem *system, 1721static int replace_system_preds(struct ftrace_subsystem_dir *dir,
1759 struct trace_array *tr, 1722 struct trace_array *tr,
1760 struct filter_parse_state *ps, 1723 struct filter_parse_state *ps,
1761 char *filter_string) 1724 char *filter_string)
1762{ 1725{
1763 struct ftrace_event_file *file; 1726 struct ftrace_event_file *file;
1764 struct ftrace_event_call *call;
1765 struct filter_list *filter_item; 1727 struct filter_list *filter_item;
1766 struct filter_list *tmp; 1728 struct filter_list *tmp;
1767 LIST_HEAD(filter_list); 1729 LIST_HEAD(filter_list);
@@ -1769,15 +1731,14 @@ static int replace_system_preds(struct event_subsystem *system,
1769 int err; 1731 int err;
1770 1732
1771 list_for_each_entry(file, &tr->events, list) { 1733 list_for_each_entry(file, &tr->events, list) {
1772 call = file->event_call; 1734 if (file->system != dir)
1773 if (strcmp(call->class->system, system->name) != 0)
1774 continue; 1735 continue;
1775 1736
1776 /* 1737 /*
1777 * Try to see if the filter can be applied 1738 * Try to see if the filter can be applied
1778 * (filter arg is ignored on dry_run) 1739 * (filter arg is ignored on dry_run)
1779 */ 1740 */
1780 err = replace_preds(call, NULL, ps, filter_string, true); 1741 err = replace_preds(file->event_call, NULL, ps, true);
1781 if (err) 1742 if (err)
1782 event_set_no_set_filter_flag(file); 1743 event_set_no_set_filter_flag(file);
1783 else 1744 else
@@ -1787,9 +1748,7 @@ static int replace_system_preds(struct event_subsystem *system,
1787 list_for_each_entry(file, &tr->events, list) { 1748 list_for_each_entry(file, &tr->events, list) {
1788 struct event_filter *filter; 1749 struct event_filter *filter;
1789 1750
1790 call = file->event_call; 1751 if (file->system != dir)
1791
1792 if (strcmp(call->class->system, system->name) != 0)
1793 continue; 1752 continue;
1794 1753
1795 if (event_no_set_filter_flag(file)) 1754 if (event_no_set_filter_flag(file))
@@ -1811,7 +1770,7 @@ static int replace_system_preds(struct event_subsystem *system,
1811 if (err) 1770 if (err)
1812 goto fail_mem; 1771 goto fail_mem;
1813 1772
1814 err = replace_preds(call, filter, ps, filter_string, false); 1773 err = replace_preds(file->event_call, filter, ps, false);
1815 if (err) { 1774 if (err) {
1816 filter_disable(file); 1775 filter_disable(file);
1817 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1776 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
@@ -1933,7 +1892,7 @@ static int create_filter(struct ftrace_event_call *call,
1933 1892
1934 err = create_filter_start(filter_str, set_str, &ps, &filter); 1893 err = create_filter_start(filter_str, set_str, &ps, &filter);
1935 if (!err) { 1894 if (!err) {
1936 err = replace_preds(call, filter, ps, filter_str, false); 1895 err = replace_preds(call, filter, ps, false);
1937 if (err && set_str) 1896 if (err && set_str)
1938 append_filter_err(ps, filter); 1897 append_filter_err(ps, filter);
1939 } 1898 }
@@ -1959,7 +1918,7 @@ int create_event_filter(struct ftrace_event_call *call,
1959 * Identical to create_filter() except that it creates a subsystem filter 1918 * Identical to create_filter() except that it creates a subsystem filter
1960 * and always remembers @filter_str. 1919 * and always remembers @filter_str.
1961 */ 1920 */
1962static int create_system_filter(struct event_subsystem *system, 1921static int create_system_filter(struct ftrace_subsystem_dir *dir,
1963 struct trace_array *tr, 1922 struct trace_array *tr,
1964 char *filter_str, struct event_filter **filterp) 1923 char *filter_str, struct event_filter **filterp)
1965{ 1924{
@@ -1969,7 +1928,7 @@ static int create_system_filter(struct event_subsystem *system,
1969 1928
1970 err = create_filter_start(filter_str, true, &ps, &filter); 1929 err = create_filter_start(filter_str, true, &ps, &filter);
1971 if (!err) { 1930 if (!err) {
1972 err = replace_system_preds(system, tr, ps, filter_str); 1931 err = replace_system_preds(dir, tr, ps, filter_str);
1973 if (!err) { 1932 if (!err) {
1974 /* System filters just show a default message */ 1933 /* System filters just show a default message */
1975 kfree(filter->filter_string); 1934 kfree(filter->filter_string);
@@ -2053,18 +2012,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
2053 } 2012 }
2054 2013
2055 if (!strcmp(strstrip(filter_string), "0")) { 2014 if (!strcmp(strstrip(filter_string), "0")) {
2056 filter_free_subsystem_preds(system, tr); 2015 filter_free_subsystem_preds(dir, tr);
2057 remove_filter_string(system->filter); 2016 remove_filter_string(system->filter);
2058 filter = system->filter; 2017 filter = system->filter;
2059 system->filter = NULL; 2018 system->filter = NULL;
2060 /* Ensure all filters are no longer used */ 2019 /* Ensure all filters are no longer used */
2061 synchronize_sched(); 2020 synchronize_sched();
2062 filter_free_subsystem_filters(system, tr); 2021 filter_free_subsystem_filters(dir, tr);
2063 __free_filter(filter); 2022 __free_filter(filter);
2064 goto out_unlock; 2023 goto out_unlock;
2065 } 2024 }
2066 2025
2067 err = create_system_filter(system, tr, filter_string, &filter); 2026 err = create_system_filter(dir, tr, filter_string, &filter);
2068 if (filter) { 2027 if (filter) {
2069 /* 2028 /*
2070 * No event actually uses the system filter 2029 * No event actually uses the system filter
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4de3e57f723c..f0a0c982cde3 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,6 +15,33 @@
15#include "trace.h" 15#include "trace.h"
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18static bool kill_ftrace_graph;
19
20/**
21 * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
22 *
23 * ftrace_graph_stop() is called when a severe error is detected in
24 * the function graph tracing. This function is called by the critical
25 * paths of function graph to keep those paths from doing any more harm.
26 */
27bool ftrace_graph_is_dead(void)
28{
29 return kill_ftrace_graph;
30}
31
32/**
33 * ftrace_graph_stop - set to permanently disable function graph tracing
34 *
35 * In case of an error in function graph tracing, this is called
36 * to try to keep function graph tracing from causing any more harm.
37 * Usually this is pretty severe and this is called to try to at least
38 * get a warning out to the user.
39 */
40void ftrace_graph_stop(void)
41{
42 kill_ftrace_graph = true;
43}
44
18/* When set, irq functions will be ignored */ 45/* When set, irq functions will be ignored */
19static int ftrace_graph_skip_irqs; 46static int ftrace_graph_skip_irqs;
20 47
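ftrace_graph_is_dead() and ftrace_graph_stop() above form a one-way kill switch: a cheap flag test at the top of the hot path (see the ftrace_push_return_trace() hunk below) and a setter that fatal-error paths call once; this replaces the ftrace_graph_stop() that the ftrace.c hunk earlier in this diff removed. The same shape as a standalone sketch, with invented names for the hot path and the error condition:

#include <stdbool.h>
#include <stdio.h>

static bool tracer_dead;

static bool tracer_is_dead(void) { return tracer_dead; }

/* Called once from an error path and never cleared, like ftrace_graph_stop(). */
static void tracer_stop(void)
{
        tracer_dead = true;
        fprintf(stderr, "tracer: fatal error, disabling\n");
}

static int hot_path(int value)
{
        if (tracer_is_dead())           /* mirrors the -EBUSY bail-out below */
                return -1;
        if (value < 0) {                /* stand-in for "severe error detected" */
                tracer_stop();
                return -1;
        }
        printf("traced %d\n", value);
        return 0;
}

int main(void)
{
        hot_path(1);
        hot_path(-1);   /* trips the kill switch */
        hot_path(2);    /* now refused */
        return 0;
}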
@@ -92,6 +119,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
92 unsigned long long calltime; 119 unsigned long long calltime;
93 int index; 120 int index;
94 121
122 if (unlikely(ftrace_graph_is_dead()))
123 return -EBUSY;
124
95 if (!current->ret_stack) 125 if (!current->ret_stack)
96 return -EBUSY; 126 return -EBUSY;
97 127
@@ -323,7 +353,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
323 return ret; 353 return ret;
324} 354}
325 355
326int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) 356static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
327{ 357{
328 if (tracing_thresh) 358 if (tracing_thresh)
329 return 1; 359 return 1;
@@ -412,7 +442,7 @@ void set_graph_array(struct trace_array *tr)
412 smp_mb(); 442 smp_mb();
413} 443}
414 444
415void trace_graph_thresh_return(struct ftrace_graph_ret *trace) 445static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
416{ 446{
417 if (tracing_thresh && 447 if (tracing_thresh &&
418 (trace->rettime - trace->calltime < tracing_thresh)) 448 (trace->rettime - trace->calltime < tracing_thresh))
@@ -445,6 +475,12 @@ static void graph_trace_reset(struct trace_array *tr)
445 unregister_ftrace_graph(); 475 unregister_ftrace_graph();
446} 476}
447 477
478static int graph_trace_update_thresh(struct trace_array *tr)
479{
480 graph_trace_reset(tr);
481 return graph_trace_init(tr);
482}
483
448static int max_bytes_for_cpu; 484static int max_bytes_for_cpu;
449 485
450static enum print_line_t 486static enum print_line_t
@@ -1399,7 +1435,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1399 seq_printf(s, " | | | |\n"); 1435 seq_printf(s, " | | | |\n");
1400} 1436}
1401 1437
1402void print_graph_headers(struct seq_file *s) 1438static void print_graph_headers(struct seq_file *s)
1403{ 1439{
1404 print_graph_headers_flags(s, tracer_flags.val); 1440 print_graph_headers_flags(s, tracer_flags.val);
1405} 1441}
@@ -1495,6 +1531,7 @@ static struct trace_event graph_trace_ret_event = {
1495 1531
1496static struct tracer graph_trace __tracer_data = { 1532static struct tracer graph_trace __tracer_data = {
1497 .name = "function_graph", 1533 .name = "function_graph",
1534 .update_thresh = graph_trace_update_thresh,
1498 .open = graph_trace_open, 1535 .open = graph_trace_open,
1499 .pipe_open = graph_trace_open, 1536 .pipe_open = graph_trace_open,
1500 .close = graph_trace_close, 1537 .close = graph_trace_close,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index f3dad80c20b2..c6977d5a9b12 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -20,23 +20,6 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
20 20
21static int next_event_type = __TRACE_LAST_TYPE + 1; 21static int next_event_type = __TRACE_LAST_TYPE + 1;
22 22
23int trace_print_seq(struct seq_file *m, struct trace_seq *s)
24{
25 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
26 int ret;
27
28 ret = seq_write(m, s->buffer, len);
29
30 /*
31 * Only reset this buffer if we successfully wrote to the
32 * seq_file buffer.
33 */
34 if (!ret)
35 trace_seq_init(s);
36
37 return ret;
38}
39
40enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) 23enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
41{ 24{
42 struct trace_seq *s = &iter->seq; 25 struct trace_seq *s = &iter->seq;
@@ -85,257 +68,6 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
85 return TRACE_TYPE_HANDLED; 68 return TRACE_TYPE_HANDLED;
86} 69}
87 70
88/**
89 * trace_seq_printf - sequence printing of trace information
90 * @s: trace sequence descriptor
91 * @fmt: printf format string
92 *
93 * It returns 0 if the trace oversizes the buffer's free
94 * space, 1 otherwise.
95 *
96 * The tracer may use either sequence operations or its own
97 * copy to user routines. To simplify formating of a trace
98 * trace_seq_printf is used to store strings into a special
99 * buffer (@s). Then the output may be either used by
100 * the sequencer or pulled into another buffer.
101 */
102int
103trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
104{
105 int len = (PAGE_SIZE - 1) - s->len;
106 va_list ap;
107 int ret;
108
109 if (s->full || !len)
110 return 0;
111
112 va_start(ap, fmt);
113 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
114 va_end(ap);
115
116 /* If we can't write it all, don't bother writing anything */
117 if (ret >= len) {
118 s->full = 1;
119 return 0;
120 }
121
122 s->len += ret;
123
124 return 1;
125}
126EXPORT_SYMBOL_GPL(trace_seq_printf);
127
128/**
129 * trace_seq_bitmask - put a list of longs as a bitmask print output
130 * @s: trace sequence descriptor
131 * @maskp: points to an array of unsigned longs that represent a bitmask
132 * @nmaskbits: The number of bits that are valid in @maskp
133 *
134 * It returns 0 if the trace oversizes the buffer's free
135 * space, 1 otherwise.
136 *
137 * Writes a ASCII representation of a bitmask string into @s.
138 */
139int
140trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
141 int nmaskbits)
142{
143 int len = (PAGE_SIZE - 1) - s->len;
144 int ret;
145
146 if (s->full || !len)
147 return 0;
148
149 ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
150 s->len += ret;
151
152 return 1;
153}
154EXPORT_SYMBOL_GPL(trace_seq_bitmask);
155
156/**
157 * trace_seq_vprintf - sequence printing of trace information
158 * @s: trace sequence descriptor
159 * @fmt: printf format string
160 *
161 * The tracer may use either sequence operations or its own
162 * copy to user routines. To simplify formating of a trace
163 * trace_seq_printf is used to store strings into a special
164 * buffer (@s). Then the output may be either used by
165 * the sequencer or pulled into another buffer.
166 */
167int
168trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
169{
170 int len = (PAGE_SIZE - 1) - s->len;
171 int ret;
172
173 if (s->full || !len)
174 return 0;
175
176 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
177
178 /* If we can't write it all, don't bother writing anything */
179 if (ret >= len) {
180 s->full = 1;
181 return 0;
182 }
183
184 s->len += ret;
185
186 return len;
187}
188EXPORT_SYMBOL_GPL(trace_seq_vprintf);
189
190int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
191{
192 int len = (PAGE_SIZE - 1) - s->len;
193 int ret;
194
195 if (s->full || !len)
196 return 0;
197
198 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
199
200 /* If we can't write it all, don't bother writing anything */
201 if (ret >= len) {
202 s->full = 1;
203 return 0;
204 }
205
206 s->len += ret;
207
208 return len;
209}
210
211/**
212 * trace_seq_puts - trace sequence printing of simple string
213 * @s: trace sequence descriptor
214 * @str: simple string to record
215 *
216 * The tracer may use either the sequence operations or its own
217 * copy to user routines. This function records a simple string
218 * into a special buffer (@s) for later retrieval by a sequencer
219 * or other mechanism.
220 */
221int trace_seq_puts(struct trace_seq *s, const char *str)
222{
223 int len = strlen(str);
224
225 if (s->full)
226 return 0;
227
228 if (len > ((PAGE_SIZE - 1) - s->len)) {
229 s->full = 1;
230 return 0;
231 }
232
233 memcpy(s->buffer + s->len, str, len);
234 s->len += len;
235
236 return len;
237}
238
239int trace_seq_putc(struct trace_seq *s, unsigned char c)
240{
241 if (s->full)
242 return 0;
243
244 if (s->len >= (PAGE_SIZE - 1)) {
245 s->full = 1;
246 return 0;
247 }
248
249 s->buffer[s->len++] = c;
250
251 return 1;
252}
253EXPORT_SYMBOL(trace_seq_putc);
254
255int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
256{
257 if (s->full)
258 return 0;
259
260 if (len > ((PAGE_SIZE - 1) - s->len)) {
261 s->full = 1;
262 return 0;
263 }
264
265 memcpy(s->buffer + s->len, mem, len);
266 s->len += len;
267
268 return len;
269}
270
271int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
272{
273 unsigned char hex[HEX_CHARS];
274 const unsigned char *data = mem;
275 int i, j;
276
277 if (s->full)
278 return 0;
279
280#ifdef __BIG_ENDIAN
281 for (i = 0, j = 0; i < len; i++) {
282#else
283 for (i = len-1, j = 0; i >= 0; i--) {
284#endif
285 hex[j++] = hex_asc_hi(data[i]);
286 hex[j++] = hex_asc_lo(data[i]);
287 }
288 hex[j++] = ' ';
289
290 return trace_seq_putmem(s, hex, j);
291}
292
293void *trace_seq_reserve(struct trace_seq *s, size_t len)
294{
295 void *ret;
296
297 if (s->full)
298 return NULL;
299
300 if (len > ((PAGE_SIZE - 1) - s->len)) {
301 s->full = 1;
302 return NULL;
303 }
304
305 ret = s->buffer + s->len;
306 s->len += len;
307
308 return ret;
309}
310
311int trace_seq_path(struct trace_seq *s, const struct path *path)
312{
313 unsigned char *p;
314
315 if (s->full)
316 return 0;
317
318 if (s->len >= (PAGE_SIZE - 1)) {
319 s->full = 1;
320 return 0;
321 }
322
323 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
324 if (!IS_ERR(p)) {
325 p = mangle_path(s->buffer + s->len, p, "\n");
326 if (p) {
327 s->len = p - s->buffer;
328 return 1;
329 }
330 } else {
331 s->buffer[s->len++] = '?';
332 return 1;
333 }
334
335 s->full = 1;
336 return 0;
337}
338
339const char * 71const char *
340ftrace_print_flags_seq(struct trace_seq *p, const char *delim, 72ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
341 unsigned long flags, 73 unsigned long flags,
@@ -343,7 +75,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
343{ 75{
344 unsigned long mask; 76 unsigned long mask;
345 const char *str; 77 const char *str;
346 const char *ret = p->buffer + p->len; 78 const char *ret = trace_seq_buffer_ptr(p);
347 int i, first = 1; 79 int i, first = 1;
348 80
349 for (i = 0; flag_array[i].name && flags; i++) { 81 for (i = 0; flag_array[i].name && flags; i++) {
@@ -379,7 +111,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
379 const struct trace_print_flags *symbol_array) 111 const struct trace_print_flags *symbol_array)
380{ 112{
381 int i; 113 int i;
382 const char *ret = p->buffer + p->len; 114 const char *ret = trace_seq_buffer_ptr(p);
383 115
384 for (i = 0; symbol_array[i].name; i++) { 116 for (i = 0; symbol_array[i].name; i++) {
385 117
@@ -390,7 +122,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
390 break; 122 break;
391 } 123 }
392 124
393 if (ret == (const char *)(p->buffer + p->len)) 125 if (ret == (const char *)(trace_seq_buffer_ptr(p)))
394 trace_seq_printf(p, "0x%lx", val); 126 trace_seq_printf(p, "0x%lx", val);
395 127
396 trace_seq_putc(p, 0); 128 trace_seq_putc(p, 0);
@@ -405,7 +137,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
405 const struct trace_print_flags_u64 *symbol_array) 137 const struct trace_print_flags_u64 *symbol_array)
406{ 138{
407 int i; 139 int i;
408 const char *ret = p->buffer + p->len; 140 const char *ret = trace_seq_buffer_ptr(p);
409 141
410 for (i = 0; symbol_array[i].name; i++) { 142 for (i = 0; symbol_array[i].name; i++) {
411 143
@@ -416,7 +148,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
416 break; 148 break;
417 } 149 }
418 150
419 if (ret == (const char *)(p->buffer + p->len)) 151 if (ret == (const char *)(trace_seq_buffer_ptr(p)))
420 trace_seq_printf(p, "0x%llx", val); 152 trace_seq_printf(p, "0x%llx", val);
421 153
422 trace_seq_putc(p, 0); 154 trace_seq_putc(p, 0);
@@ -430,7 +162,7 @@ const char *
430ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, 162ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
431 unsigned int bitmask_size) 163 unsigned int bitmask_size)
432{ 164{
433 const char *ret = p->buffer + p->len; 165 const char *ret = trace_seq_buffer_ptr(p);
434 166
435 trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8); 167 trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
436 trace_seq_putc(p, 0); 168 trace_seq_putc(p, 0);
@@ -443,7 +175,7 @@ const char *
443ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 175ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
444{ 176{
445 int i; 177 int i;
446 const char *ret = p->buffer + p->len; 178 const char *ret = trace_seq_buffer_ptr(p);
447 179
448 for (i = 0; i < buf_len; i++) 180 for (i = 0; i < buf_len; i++)
449 trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); 181 trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
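Each hunk above swaps the open-coded p->buffer + p->len for trace_seq_buffer_ptr(p). The helper itself is introduced outside this file (presumably alongside the trace_seq declarations); consistent with the expression it replaces, it would look something like the sketch below, which uses a minimal stand-in structure and is not the kernel definition:

#include <stdio.h>

/* Minimal stand-in for struct trace_seq, just enough for the helper. */
struct trace_seq {
        unsigned char buffer[4096];
        unsigned int len;
};

/* Plausible shape of trace_seq_buffer_ptr(), inferred from the        */
/* open-coded "p->buffer + p->len" it replaces; not the kernel header. */
static inline unsigned char *trace_seq_buffer_ptr(struct trace_seq *s)
{
        return s->buffer + s->len;
}

int main(void)
{
        struct trace_seq s = { .buffer = "abc", .len = 3 };

        /* The callers above remember this pointer before appending more, */
        /* so they can tell later whether anything was actually written.  */
        printf("next write goes at offset %ld\n",
               (long)(trace_seq_buffer_ptr(&s) - s.buffer));
        return 0;
}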
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 127a9d8c8357..80b25b585a70 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,9 +35,6 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
35extern int __unregister_ftrace_event(struct trace_event *event); 35extern int __unregister_ftrace_event(struct trace_event *event);
36extern struct rw_semaphore trace_event_sem; 36extern struct rw_semaphore trace_event_sem;
37 37
38#define MAX_MEMHEX_BYTES 8
39#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
40
41#define SEQ_PUT_FIELD_RET(s, x) \ 38#define SEQ_PUT_FIELD_RET(s, x) \
42do { \ 39do { \
43 if (!trace_seq_putmem(s, &(x), sizeof(x))) \ 40 if (!trace_seq_putmem(s, &(x), sizeof(x))) \
@@ -46,7 +43,6 @@ do { \
46 43
47#define SEQ_PUT_HEX_FIELD_RET(s, x) \ 44#define SEQ_PUT_HEX_FIELD_RET(s, x) \
48do { \ 45do { \
49 BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
50 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ 46 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
51 return TRACE_TYPE_PARTIAL_LINE; \ 47 return TRACE_TYPE_PARTIAL_LINE; \
52} while (0) 48} while (0)
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
new file mode 100644
index 000000000000..1f24ed99dca2
--- /dev/null
+++ b/kernel/trace/trace_seq.c
@@ -0,0 +1,428 @@
1/*
2 * trace_seq.c
3 *
4 * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
5 *
6 * The trace_seq is a handy tool that allows you to pass a descriptor around
7 * to a buffer that other functions can write to. It is similar to the
8 * seq_file functionality but has some differences.
9 *
10 * To use it, the trace_seq must be initialized with trace_seq_init().
11 * This will set up the counters within the descriptor. You can call
12 * trace_seq_init() more than once to reset the trace_seq to start
13 * from scratch.
14 *
15 * The buffer size is currently PAGE_SIZE, although it may become dynamic
16 * in the future.
17 *
 18 * A write to the buffer will either succeed or fail. That is, unlike
 19 * sprintf(), there will not be a partial write (well, it may write into
 20 * the buffer but it won't update the pointers). This allows users to
21 * try to write something into the trace_seq buffer and if it fails
22 * they can flush it and try again.
23 *
24 */
25#include <linux/uaccess.h>
26#include <linux/seq_file.h>
27#include <linux/trace_seq.h>
28
29/* How much buffer is left on the trace_seq? */
30#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len)
31
32/* How much buffer is written? */
33#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1))
34
35/**
36 * trace_print_seq - move the contents of trace_seq into a seq_file
37 * @m: the seq_file descriptor that is the destination
38 * @s: the trace_seq descriptor that is the source.
39 *
 40 * Returns 0 on success and non-zero on error. If the write to the
 41 * seq_file succeeds, the trace_seq is reset; otherwise it is left
 42 * untouched so the caller can try again.
43 */
44int trace_print_seq(struct seq_file *m, struct trace_seq *s)
45{
46 unsigned int len = TRACE_SEQ_BUF_USED(s);
47 int ret;
48
49 ret = seq_write(m, s->buffer, len);
50
51 /*
52 * Only reset this buffer if we successfully wrote to the
53 * seq_file buffer. This lets the caller try again or
54 * do something else with the contents.
55 */
56 if (!ret)
57 trace_seq_init(s);
58
59 return ret;
60}
61
62/**
63 * trace_seq_printf - sequence printing of trace information
64 * @s: trace sequence descriptor
65 * @fmt: printf format string
66 *
67 * The tracer may use either sequence operations or its own
 68 * copy to user routines. To simplify formatting of a trace,
69 * trace_seq_printf() is used to store strings into a special
70 * buffer (@s). Then the output may be either used by
71 * the sequencer or pulled into another buffer.
72 *
 73 * Returns 1 if we successfully wrote all the contents to
 74 * the buffer.
 75 * Returns 0 if the length to write is bigger than the
76 * reserved buffer space. In this case, nothing gets written.
77 */
78int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
79{
80 unsigned int len = TRACE_SEQ_BUF_LEFT(s);
81 va_list ap;
82 int ret;
83
84 if (s->full || !len)
85 return 0;
86
87 va_start(ap, fmt);
88 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
89 va_end(ap);
90
91 /* If we can't write it all, don't bother writing anything */
92 if (ret >= len) {
93 s->full = 1;
94 return 0;
95 }
96
97 s->len += ret;
98
99 return 1;
100}
101EXPORT_SYMBOL_GPL(trace_seq_printf);
102
103/**
104 * trace_seq_bitmask - write a bitmask array in its ASCII representation
105 * @s: trace sequence descriptor
106 * @maskp: points to an array of unsigned longs that represent a bitmask
107 * @nmaskbits: The number of bits that are valid in @maskp
108 *
 109 * Writes an ASCII representation of a bitmask string into @s.
110 *
 111 * Returns 1 if we successfully wrote all the contents to
 112 * the buffer.
 113 * Returns 0 if the length to write is bigger than the
114 * reserved buffer space. In this case, nothing gets written.
115 */
116int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
117 int nmaskbits)
118{
119 unsigned int len = TRACE_SEQ_BUF_LEFT(s);
120 int ret;
121
122 if (s->full || !len)
123 return 0;
124
 125	ret = bitmap_scnprintf(s->buffer + s->len, len, maskp, nmaskbits);
126 s->len += ret;
127
128 return 1;
129}
130EXPORT_SYMBOL_GPL(trace_seq_bitmask);
131
132/**
133 * trace_seq_vprintf - sequence printing of trace information
134 * @s: trace sequence descriptor
135 * @fmt: printf format string
136 *
137 * The tracer may use either sequence operations or its own
 138 * copy to user routines. To simplify formatting of a trace,
139 * trace_seq_printf is used to store strings into a special
140 * buffer (@s). Then the output may be either used by
141 * the sequencer or pulled into another buffer.
142 *
143 * Returns how much it wrote to the buffer.
144 */
145int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
146{
147 unsigned int len = TRACE_SEQ_BUF_LEFT(s);
148 int ret;
149
150 if (s->full || !len)
151 return 0;
152
153 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
154
155 /* If we can't write it all, don't bother writing anything */
156 if (ret >= len) {
157 s->full = 1;
158 return 0;
159 }
160
161 s->len += ret;
162
163 return len;
164}
165EXPORT_SYMBOL_GPL(trace_seq_vprintf);
166
167/**
168 * trace_seq_bprintf - Write the printf string from binary arguments
169 * @s: trace sequence descriptor
170 * @fmt: The format string for the @binary arguments
171 * @binary: The binary arguments for @fmt.
172 *
173 * When recording in a fast path, a printf may be recorded with just
174 * saving the format and the arguments as they were passed to the
175 * function, instead of wasting cycles converting the arguments into
176 * ASCII characters. Instead, the arguments are saved in a 32 bit
177 * word array that is defined by the format string constraints.
178 *
179 * This function will take the format and the binary array and finish
180 * the conversion into the ASCII string within the buffer.
181 *
182 * Returns how much it wrote to the buffer.
183 */
184int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
185{
186 unsigned int len = TRACE_SEQ_BUF_LEFT(s);
187 int ret;
188
189 if (s->full || !len)
190 return 0;
191
192 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
193
194 /* If we can't write it all, don't bother writing anything */
195 if (ret >= len) {
196 s->full = 1;
197 return 0;
198 }
199
200 s->len += ret;
201
202 return len;
203}
204EXPORT_SYMBOL_GPL(trace_seq_bprintf);
205
206/**
207 * trace_seq_puts - trace sequence printing of simple string
208 * @s: trace sequence descriptor
209 * @str: simple string to record
210 *
211 * The tracer may use either the sequence operations or its own
212 * copy to user routines. This function records a simple string
213 * into a special buffer (@s) for later retrieval by a sequencer
214 * or other mechanism.
215 *
216 * Returns how much it wrote to the buffer.
217 */
218int trace_seq_puts(struct trace_seq *s, const char *str)
219{
220 unsigned int len = strlen(str);
221
222 if (s->full)
223 return 0;
224
225 if (len > TRACE_SEQ_BUF_LEFT(s)) {
226 s->full = 1;
227 return 0;
228 }
229
230 memcpy(s->buffer + s->len, str, len);
231 s->len += len;
232
233 return len;
234}
235EXPORT_SYMBOL_GPL(trace_seq_puts);
236
237/**
238 * trace_seq_putc - trace sequence printing of simple character
239 * @s: trace sequence descriptor
240 * @c: simple character to record
241 *
242 * The tracer may use either the sequence operations or its own
 243 * copy to user routines. This function records a simple character
244 * into a special buffer (@s) for later retrieval by a sequencer
245 * or other mechanism.
246 *
247 * Returns how much it wrote to the buffer.
248 */
249int trace_seq_putc(struct trace_seq *s, unsigned char c)
250{
251 if (s->full)
252 return 0;
253
254 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
255 s->full = 1;
256 return 0;
257 }
258
259 s->buffer[s->len++] = c;
260
261 return 1;
262}
263EXPORT_SYMBOL_GPL(trace_seq_putc);
264
265/**
266 * trace_seq_putmem - write raw data into the trace_seq buffer
267 * @s: trace sequence descriptor
268 * @mem: The raw memory to copy into the buffer
269 * @len: The length of the raw memory to copy (in bytes)
270 *
271 * There may be cases where raw memory needs to be written into the
272 * buffer and a strcpy() would not work. Using this function allows
273 * for such cases.
274 *
275 * Returns how much it wrote to the buffer.
276 */
277int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
278{
279 if (s->full)
280 return 0;
281
282 if (len > TRACE_SEQ_BUF_LEFT(s)) {
283 s->full = 1;
284 return 0;
285 }
286
287 memcpy(s->buffer + s->len, mem, len);
288 s->len += len;
289
290 return len;
291}
292EXPORT_SYMBOL_GPL(trace_seq_putmem);
293
294#define MAX_MEMHEX_BYTES 8U
295#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
296
297/**
298 * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
299 * @s: trace sequence descriptor
 300 * @mem: The raw memory whose hex ASCII representation is to be written
301 * @len: The length of the raw memory to copy (in bytes)
302 *
 303 * This is similar to trace_seq_putmem() except that instead of just
 304 * copying the raw memory into the buffer, it writes the hex ASCII
 305 * representation of that memory.
306 *
307 * Returns how much it wrote to the buffer.
308 */
309int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
310 unsigned int len)
311{
312 unsigned char hex[HEX_CHARS];
313 const unsigned char *data = mem;
314 unsigned int start_len;
315 int i, j;
316 int cnt = 0;
317
318 if (s->full)
319 return 0;
320
321 while (len) {
 322		start_len = min(len, MAX_MEMHEX_BYTES);
323#ifdef __BIG_ENDIAN
324 for (i = 0, j = 0; i < start_len; i++) {
325#else
326 for (i = start_len-1, j = 0; i >= 0; i--) {
327#endif
328 hex[j++] = hex_asc_hi(data[i]);
329 hex[j++] = hex_asc_lo(data[i]);
330 }
331 if (WARN_ON_ONCE(j == 0 || j/2 > len))
332 break;
333
334 /* j increments twice per loop */
335 len -= j / 2;
336 hex[j++] = ' ';
337
338 cnt += trace_seq_putmem(s, hex, j);
339 }
340 return cnt;
341}
342EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
343
344/**
345 * trace_seq_path - copy a path into the sequence buffer
346 * @s: trace sequence descriptor
347 * @path: path to write into the sequence buffer.
348 *
349 * Write a path name into the sequence buffer.
350 *
 351 * Returns 1 if we successfully wrote all the contents to
 352 * the buffer.
 353 * Returns 0 if the length to write is bigger than the
354 * reserved buffer space. In this case, nothing gets written.
355 */
356int trace_seq_path(struct trace_seq *s, const struct path *path)
357{
358 unsigned char *p;
359
360 if (s->full)
361 return 0;
362
363 if (TRACE_SEQ_BUF_LEFT(s) < 1) {
364 s->full = 1;
365 return 0;
366 }
367
368 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
369 if (!IS_ERR(p)) {
370 p = mangle_path(s->buffer + s->len, p, "\n");
371 if (p) {
372 s->len = p - s->buffer;
373 return 1;
374 }
375 } else {
376 s->buffer[s->len++] = '?';
377 return 1;
378 }
379
380 s->full = 1;
381 return 0;
382}
383EXPORT_SYMBOL_GPL(trace_seq_path);
384
385/**
 386 * trace_seq_to_user - copy the sequence buffer to user space
387 * @s: trace sequence descriptor
388 * @ubuf: The userspace memory location to copy to
389 * @cnt: The amount to copy
390 *
391 * Copies the sequence buffer into the userspace memory pointed to
392 * by @ubuf. It starts from the last read position (@s->readpos)
 393 * and writes up to @cnt characters or until it reaches the end of
 394 * the content in the buffer (@s->len), whichever comes first.
395 *
 396 * On success, it returns a positive number: the number of bytes
397 * it copied.
398 *
 399 * On failure, it returns -EBUSY if all of the content in the
 400 * sequence has already been read, which includes the case where there
 401 * is nothing in the sequence (@s->len == @s->readpos).
402 *
403 * Returns -EFAULT if the copy to userspace fails.
404 */
405int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
406{
407 int len;
408 int ret;
409
410 if (!cnt)
411 return 0;
412
413 if (s->len <= s->readpos)
414 return -EBUSY;
415
416 len = s->len - s->readpos;
417 if (cnt > len)
418 cnt = len;
419 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
420 if (ret == cnt)
421 return -EFAULT;
422
423 cnt -= ret;
424
425 s->readpos += cnt;
426 return cnt;
427}
428EXPORT_SYMBOL_GPL(trace_seq_to_user);
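
The file above only defines the write/flush primitives; a minimal usage sketch (not part of the patch) shows how they fit together. It assumes the trace_seq_init() helper from include/linux/trace_seq.h and an ordinary seq_file show handler; the function name and format strings are hypothetical:

	#include <linux/seq_file.h>
	#include <linux/trace_seq.h>

	/* Sketch: build up a line in a trace_seq, then flush it into a seq_file. */
	static int example_show(struct seq_file *m, void *v)
	{
		struct trace_seq s;

		trace_seq_init(&s);			/* reset len/readpos and clear the full flag */
		trace_seq_printf(&s, "%s: %d ", "example", 42);
		trace_seq_puts(&s, "(writes are all-or-nothing)");
		trace_seq_putc(&s, '\n');

		if (s.full)				/* the buffer overflowed; nothing partial was written */
			trace_seq_init(&s);		/* drop the contents and start over */

		return trace_print_seq(m, &s);		/* copies into @m and resets @s on success */
	}
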
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 3c9b97e6b1f4..33ff6a24b802 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -265,7 +265,6 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
265 if (is_ret) 265 if (is_ret)
266 tu->consumer.ret_handler = uretprobe_dispatcher; 266 tu->consumer.ret_handler = uretprobe_dispatcher;
267 init_trace_uprobe_filter(&tu->filter); 267 init_trace_uprobe_filter(&tu->filter);
268 tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
269 return tu; 268 return tu;
270 269
271error: 270error:
@@ -1292,7 +1291,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
1292 kfree(call->print_fmt); 1291 kfree(call->print_fmt);
1293 return -ENODEV; 1292 return -ENODEV;
1294 } 1293 }
1295 call->flags = 0; 1294
1296 call->class->reg = trace_uprobe_register; 1295 call->class->reg = trace_uprobe_register;
1297 call->data = tu; 1296 call->data = tu;
1298 ret = trace_add_event_call(call); 1297 ret = trace_add_event_call(call);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 35974ac69600..5dbe22aa3efd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -265,7 +265,6 @@ struct workqueue_struct {
265 265
266static struct kmem_cache *pwq_cache; 266static struct kmem_cache *pwq_cache;
267 267
268static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
269static cpumask_var_t *wq_numa_possible_cpumask; 268static cpumask_var_t *wq_numa_possible_cpumask;
270 /* possible CPUs of each node */ 269 /* possible CPUs of each node */
271 270
@@ -758,13 +757,6 @@ static bool too_many_workers(struct worker_pool *pool)
758 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 757 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
759 int nr_busy = pool->nr_workers - nr_idle; 758 int nr_busy = pool->nr_workers - nr_idle;
760 759
761 /*
762 * nr_idle and idle_list may disagree if idle rebinding is in
763 * progress. Never return %true if idle_list is empty.
764 */
765 if (list_empty(&pool->idle_list))
766 return false;
767
768 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 760 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
769} 761}
770 762
@@ -850,7 +842,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
850 pool = worker->pool; 842 pool = worker->pool;
851 843
852 /* this can only happen on the local cpu */ 844 /* this can only happen on the local cpu */
853 if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) 845 if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
854 return NULL; 846 return NULL;
855 847
856 /* 848 /*
@@ -874,35 +866,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
874 * worker_set_flags - set worker flags and adjust nr_running accordingly 866 * worker_set_flags - set worker flags and adjust nr_running accordingly
875 * @worker: self 867 * @worker: self
876 * @flags: flags to set 868 * @flags: flags to set
877 * @wakeup: wakeup an idle worker if necessary
878 * 869 *
879 * Set @flags in @worker->flags and adjust nr_running accordingly. If 870 * Set @flags in @worker->flags and adjust nr_running accordingly.
880 * nr_running becomes zero and @wakeup is %true, an idle worker is
881 * woken up.
882 * 871 *
883 * CONTEXT: 872 * CONTEXT:
884 * spin_lock_irq(pool->lock) 873 * spin_lock_irq(pool->lock)
885 */ 874 */
886static inline void worker_set_flags(struct worker *worker, unsigned int flags, 875static inline void worker_set_flags(struct worker *worker, unsigned int flags)
887 bool wakeup)
888{ 876{
889 struct worker_pool *pool = worker->pool; 877 struct worker_pool *pool = worker->pool;
890 878
891 WARN_ON_ONCE(worker->task != current); 879 WARN_ON_ONCE(worker->task != current);
892 880
893 /* 881 /* If transitioning into NOT_RUNNING, adjust nr_running. */
894 * If transitioning into NOT_RUNNING, adjust nr_running and
895 * wake up an idle worker as necessary if requested by
896 * @wakeup.
897 */
898 if ((flags & WORKER_NOT_RUNNING) && 882 if ((flags & WORKER_NOT_RUNNING) &&
899 !(worker->flags & WORKER_NOT_RUNNING)) { 883 !(worker->flags & WORKER_NOT_RUNNING)) {
900 if (wakeup) { 884 atomic_dec(&pool->nr_running);
901 if (atomic_dec_and_test(&pool->nr_running) &&
902 !list_empty(&pool->worklist))
903 wake_up_worker(pool);
904 } else
905 atomic_dec(&pool->nr_running);
906 } 885 }
907 886
908 worker->flags |= flags; 887 worker->flags |= flags;
@@ -1232,7 +1211,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1232 pwq_activate_delayed_work(work); 1211 pwq_activate_delayed_work(work);
1233 1212
1234 list_del_init(&work->entry); 1213 list_del_init(&work->entry);
1235 pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work)); 1214 pwq_dec_nr_in_flight(pwq, get_work_color(work));
1236 1215
1237 /* work->data points to pwq iff queued, point to pool */ 1216 /* work->data points to pwq iff queued, point to pool */
1238 set_work_pool_and_keep_pending(work, pool->id); 1217 set_work_pool_and_keep_pending(work, pool->id);
@@ -1560,7 +1539,7 @@ static void worker_enter_idle(struct worker *worker)
1560 (worker->hentry.next || worker->hentry.pprev))) 1539 (worker->hentry.next || worker->hentry.pprev)))
1561 return; 1540 return;
1562 1541
1563 /* can't use worker_set_flags(), also called from start_worker() */ 1542 /* can't use worker_set_flags(), also called from create_worker() */
1564 worker->flags |= WORKER_IDLE; 1543 worker->flags |= WORKER_IDLE;
1565 pool->nr_idle++; 1544 pool->nr_idle++;
1566 worker->last_active = jiffies; 1545 worker->last_active = jiffies;
@@ -1602,11 +1581,11 @@ static void worker_leave_idle(struct worker *worker)
1602 list_del_init(&worker->entry); 1581 list_del_init(&worker->entry);
1603} 1582}
1604 1583
1605static struct worker *alloc_worker(void) 1584static struct worker *alloc_worker(int node)
1606{ 1585{
1607 struct worker *worker; 1586 struct worker *worker;
1608 1587
1609 worker = kzalloc(sizeof(*worker), GFP_KERNEL); 1588 worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
1610 if (worker) { 1589 if (worker) {
1611 INIT_LIST_HEAD(&worker->entry); 1590 INIT_LIST_HEAD(&worker->entry);
1612 INIT_LIST_HEAD(&worker->scheduled); 1591 INIT_LIST_HEAD(&worker->scheduled);
@@ -1670,6 +1649,9 @@ static void worker_detach_from_pool(struct worker *worker,
1670 detach_completion = pool->detach_completion; 1649 detach_completion = pool->detach_completion;
1671 mutex_unlock(&pool->attach_mutex); 1650 mutex_unlock(&pool->attach_mutex);
1672 1651
1652 /* clear leftover flags without pool->lock after it is detached */
1653 worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
1654
1673 if (detach_completion) 1655 if (detach_completion)
1674 complete(detach_completion); 1656 complete(detach_completion);
1675} 1657}
@@ -1678,8 +1660,7 @@ static void worker_detach_from_pool(struct worker *worker,
1678 * create_worker - create a new workqueue worker 1660 * create_worker - create a new workqueue worker
1679 * @pool: pool the new worker will belong to 1661 * @pool: pool the new worker will belong to
1680 * 1662 *
1681 * Create a new worker which is attached to @pool. The new worker must be 1663 * Create and start a new worker which is attached to @pool.
1682 * started by start_worker().
1683 * 1664 *
1684 * CONTEXT: 1665 * CONTEXT:
1685 * Might sleep. Does GFP_KERNEL allocations. 1666 * Might sleep. Does GFP_KERNEL allocations.
@@ -1698,7 +1679,7 @@ static struct worker *create_worker(struct worker_pool *pool)
1698 if (id < 0) 1679 if (id < 0)
1699 goto fail; 1680 goto fail;
1700 1681
1701 worker = alloc_worker(); 1682 worker = alloc_worker(pool->node);
1702 if (!worker) 1683 if (!worker)
1703 goto fail; 1684 goto fail;
1704 1685
@@ -1724,6 +1705,13 @@ static struct worker *create_worker(struct worker_pool *pool)
1724 /* successful, attach the worker to the pool */ 1705 /* successful, attach the worker to the pool */
1725 worker_attach_to_pool(worker, pool); 1706 worker_attach_to_pool(worker, pool);
1726 1707
1708 /* start the newly created worker */
1709 spin_lock_irq(&pool->lock);
1710 worker->pool->nr_workers++;
1711 worker_enter_idle(worker);
1712 wake_up_process(worker->task);
1713 spin_unlock_irq(&pool->lock);
1714
1727 return worker; 1715 return worker;
1728 1716
1729fail: 1717fail:
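
Taken with the removal of start_worker() and create_and_start_worker() in the next hunk, worker bring-up collapses into create_worker() itself. A condensed sketch of the resulting flow, paraphrasing the hunks above rather than quoting the kernel source:

	/* Sketch: create_worker() now allocates, attaches and starts the worker in one call. */
	worker = alloc_worker(pool->node);	/* NUMA-aware allocation */
	if (!worker)
		goto fail;
	/* ... assign id, create the task, set nice level ... */
	worker_attach_to_pool(worker, pool);

	spin_lock_irq(&pool->lock);		/* start immediately; no separate start_worker() step */
	worker->pool->nr_workers++;
	worker_enter_idle(worker);
	wake_up_process(worker->task);
	spin_unlock_irq(&pool->lock);

	return worker;				/* callers now just test for NULL, e.g. BUG_ON(!create_worker(pool)) */
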
@@ -1734,44 +1722,6 @@ fail:
1734} 1722}
1735 1723
1736/** 1724/**
1737 * start_worker - start a newly created worker
1738 * @worker: worker to start
1739 *
1740 * Make the pool aware of @worker and start it.
1741 *
1742 * CONTEXT:
1743 * spin_lock_irq(pool->lock).
1744 */
1745static void start_worker(struct worker *worker)
1746{
1747 worker->pool->nr_workers++;
1748 worker_enter_idle(worker);
1749 wake_up_process(worker->task);
1750}
1751
1752/**
1753 * create_and_start_worker - create and start a worker for a pool
1754 * @pool: the target pool
1755 *
1756 * Grab the managership of @pool and create and start a new worker for it.
1757 *
1758 * Return: 0 on success. A negative error code otherwise.
1759 */
1760static int create_and_start_worker(struct worker_pool *pool)
1761{
1762 struct worker *worker;
1763
1764 worker = create_worker(pool);
1765 if (worker) {
1766 spin_lock_irq(&pool->lock);
1767 start_worker(worker);
1768 spin_unlock_irq(&pool->lock);
1769 }
1770
1771 return worker ? 0 : -ENOMEM;
1772}
1773
1774/**
1775 * destroy_worker - destroy a workqueue worker 1725 * destroy_worker - destroy a workqueue worker
1776 * @worker: worker to be destroyed 1726 * @worker: worker to be destroyed
1777 * 1727 *
@@ -1909,23 +1859,10 @@ restart:
1909 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1859 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1910 1860
1911 while (true) { 1861 while (true) {
1912 struct worker *worker; 1862 if (create_worker(pool) || !need_to_create_worker(pool))
1913
1914 worker = create_worker(pool);
1915 if (worker) {
1916 del_timer_sync(&pool->mayday_timer);
1917 spin_lock_irq(&pool->lock);
1918 start_worker(worker);
1919 if (WARN_ON_ONCE(need_to_create_worker(pool)))
1920 goto restart;
1921 return true;
1922 }
1923
1924 if (!need_to_create_worker(pool))
1925 break; 1863 break;
1926 1864
1927 __set_current_state(TASK_INTERRUPTIBLE); 1865 schedule_timeout_interruptible(CREATE_COOLDOWN);
1928 schedule_timeout(CREATE_COOLDOWN);
1929 1866
1930 if (!need_to_create_worker(pool)) 1867 if (!need_to_create_worker(pool))
1931 break; 1868 break;
@@ -1933,6 +1870,11 @@ restart:
1933 1870
1934 del_timer_sync(&pool->mayday_timer); 1871 del_timer_sync(&pool->mayday_timer);
1935 spin_lock_irq(&pool->lock); 1872 spin_lock_irq(&pool->lock);
1873 /*
1874 * This is necessary even after a new worker was just successfully
1875 * created as @pool->lock was dropped and the new worker might have
1876 * already become busy.
1877 */
1936 if (need_to_create_worker(pool)) 1878 if (need_to_create_worker(pool))
1937 goto restart; 1879 goto restart;
1938 return true; 1880 return true;
@@ -2020,13 +1962,8 @@ __acquires(&pool->lock)
2020 1962
2021 lockdep_copy_map(&lockdep_map, &work->lockdep_map); 1963 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
2022#endif 1964#endif
2023 /* 1965 /* ensure we're on the correct CPU */
2024 * Ensure we're on the correct CPU. DISASSOCIATED test is 1966 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
2025 * necessary to avoid spurious warnings from rescuers servicing the
2026 * unbound or a disassociated pool.
2027 */
2028 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2029 !(pool->flags & POOL_DISASSOCIATED) &&
2030 raw_smp_processor_id() != pool->cpu); 1967 raw_smp_processor_id() != pool->cpu);
2031 1968
2032 /* 1969 /*
@@ -2052,17 +1989,22 @@ __acquires(&pool->lock)
2052 list_del_init(&work->entry); 1989 list_del_init(&work->entry);
2053 1990
2054 /* 1991 /*
2055 * CPU intensive works don't participate in concurrency 1992 * CPU intensive works don't participate in concurrency management.
2056 * management. They're the scheduler's responsibility. 1993 * They're the scheduler's responsibility. This takes @worker out
1994 * of concurrency management and the next code block will chain
1995 * execution of the pending work items.
2057 */ 1996 */
2058 if (unlikely(cpu_intensive)) 1997 if (unlikely(cpu_intensive))
2059 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 1998 worker_set_flags(worker, WORKER_CPU_INTENSIVE);
2060 1999
2061 /* 2000 /*
2062 * Unbound pool isn't concurrency managed and work items should be 2001 * Wake up another worker if necessary. The condition is always
2063 * executed ASAP. Wake up another worker if necessary. 2002 * false for normal per-cpu workers since nr_running would always
2003 * be >= 1 at this point. This is used to chain execution of the
2004 * pending work items for WORKER_NOT_RUNNING workers such as the
2005 * UNBOUND and CPU_INTENSIVE ones.
2064 */ 2006 */
2065 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) 2007 if (need_more_worker(pool))
2066 wake_up_worker(pool); 2008 wake_up_worker(pool);
2067 2009
2068 /* 2010 /*
@@ -2218,7 +2160,7 @@ recheck:
2218 } 2160 }
2219 } while (keep_working(pool)); 2161 } while (keep_working(pool));
2220 2162
2221 worker_set_flags(worker, WORKER_PREP, false); 2163 worker_set_flags(worker, WORKER_PREP);
2222sleep: 2164sleep:
2223 /* 2165 /*
2224 * pool->lock is held and there's no work to process and no need to 2166 * pool->lock is held and there's no work to process and no need to
@@ -2311,29 +2253,27 @@ repeat:
2311 move_linked_works(work, scheduled, &n); 2253 move_linked_works(work, scheduled, &n);
2312 2254
2313 process_scheduled_works(rescuer); 2255 process_scheduled_works(rescuer);
2314 spin_unlock_irq(&pool->lock);
2315
2316 worker_detach_from_pool(rescuer, pool);
2317
2318 spin_lock_irq(&pool->lock);
2319 2256
2320 /* 2257 /*
2321 * Put the reference grabbed by send_mayday(). @pool won't 2258 * Put the reference grabbed by send_mayday(). @pool won't
2322 * go away while we're holding its lock. 2259 * go away while we're still attached to it.
2323 */ 2260 */
2324 put_pwq(pwq); 2261 put_pwq(pwq);
2325 2262
2326 /* 2263 /*
2327 * Leave this pool. If keep_working() is %true, notify a 2264 * Leave this pool. If need_more_worker() is %true, notify a
2328 * regular worker; otherwise, we end up with 0 concurrency 2265 * regular worker; otherwise, we end up with 0 concurrency
2329 * and stalling the execution. 2266 * and stalling the execution.
2330 */ 2267 */
2331 if (keep_working(pool)) 2268 if (need_more_worker(pool))
2332 wake_up_worker(pool); 2269 wake_up_worker(pool);
2333 2270
2334 rescuer->pool = NULL; 2271 rescuer->pool = NULL;
2335 spin_unlock(&pool->lock); 2272 spin_unlock_irq(&pool->lock);
2336 spin_lock(&wq_mayday_lock); 2273
2274 worker_detach_from_pool(rescuer, pool);
2275
2276 spin_lock_irq(&wq_mayday_lock);
2337 } 2277 }
2338 2278
2339 spin_unlock_irq(&wq_mayday_lock); 2279 spin_unlock_irq(&wq_mayday_lock);
@@ -3458,7 +3398,7 @@ static void put_unbound_pool(struct worker_pool *pool)
3458 return; 3398 return;
3459 3399
3460 /* sanity checks */ 3400 /* sanity checks */
3461 if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || 3401 if (WARN_ON(!(pool->cpu < 0)) ||
3462 WARN_ON(!list_empty(&pool->worklist))) 3402 WARN_ON(!list_empty(&pool->worklist)))
3463 return; 3403 return;
3464 3404
@@ -3524,7 +3464,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3524 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { 3464 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
3525 if (wqattrs_equal(pool->attrs, attrs)) { 3465 if (wqattrs_equal(pool->attrs, attrs)) {
3526 pool->refcnt++; 3466 pool->refcnt++;
3527 goto out_unlock; 3467 return pool;
3528 } 3468 }
3529 } 3469 }
3530 3470
@@ -3557,12 +3497,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3557 goto fail; 3497 goto fail;
3558 3498
3559 /* create and start the initial worker */ 3499 /* create and start the initial worker */
3560 if (create_and_start_worker(pool) < 0) 3500 if (!create_worker(pool))
3561 goto fail; 3501 goto fail;
3562 3502
3563 /* install */ 3503 /* install */
3564 hash_add(unbound_pool_hash, &pool->hash_node, hash); 3504 hash_add(unbound_pool_hash, &pool->hash_node, hash);
3565out_unlock: 3505
3566 return pool; 3506 return pool;
3567fail: 3507fail:
3568 if (pool) 3508 if (pool)
@@ -3591,11 +3531,6 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
3591 if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) 3531 if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
3592 return; 3532 return;
3593 3533
3594 /*
3595 * Unlink @pwq. Synchronization against wq->mutex isn't strictly
3596 * necessary on release but do it anyway. It's easier to verify
3597 * and consistent with the linking path.
3598 */
3599 mutex_lock(&wq->mutex); 3534 mutex_lock(&wq->mutex);
3600 list_del_rcu(&pwq->pwqs_node); 3535 list_del_rcu(&pwq->pwqs_node);
3601 is_last = list_empty(&wq->pwqs); 3536 is_last = list_empty(&wq->pwqs);
@@ -3692,10 +3627,7 @@ static void link_pwq(struct pool_workqueue *pwq)
3692 if (!list_empty(&pwq->pwqs_node)) 3627 if (!list_empty(&pwq->pwqs_node))
3693 return; 3628 return;
3694 3629
3695 /* 3630 /* set the matching work_color */
3696 * Set the matching work_color. This is synchronized with
3697 * wq->mutex to avoid confusing flush_workqueue().
3698 */
3699 pwq->work_color = wq->work_color; 3631 pwq->work_color = wq->work_color;
3700 3632
3701 /* sync max_active to the current setting */ 3633 /* sync max_active to the current setting */
@@ -3832,7 +3764,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
3832 if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) 3764 if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
3833 return -EINVAL; 3765 return -EINVAL;
3834 3766
3835 pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); 3767 pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
3836 new_attrs = alloc_workqueue_attrs(GFP_KERNEL); 3768 new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3837 tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); 3769 tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3838 if (!pwq_tbl || !new_attrs || !tmp_attrs) 3770 if (!pwq_tbl || !new_attrs || !tmp_attrs)
@@ -4080,7 +4012,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4080 4012
4081 /* allocate wq and format name */ 4013 /* allocate wq and format name */
4082 if (flags & WQ_UNBOUND) 4014 if (flags & WQ_UNBOUND)
4083 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); 4015 tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
4084 4016
4085 wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); 4017 wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
4086 if (!wq) 4018 if (!wq)
@@ -4122,7 +4054,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
4122 if (flags & WQ_MEM_RECLAIM) { 4054 if (flags & WQ_MEM_RECLAIM) {
4123 struct worker *rescuer; 4055 struct worker *rescuer;
4124 4056
4125 rescuer = alloc_worker(); 4057 rescuer = alloc_worker(NUMA_NO_NODE);
4126 if (!rescuer) 4058 if (!rescuer)
4127 goto err_destroy; 4059 goto err_destroy;
4128 4060
@@ -4470,8 +4402,6 @@ static void wq_unbind_fn(struct work_struct *work)
4470 struct worker *worker; 4402 struct worker *worker;
4471 4403
4472 for_each_cpu_worker_pool(pool, cpu) { 4404 for_each_cpu_worker_pool(pool, cpu) {
4473 WARN_ON_ONCE(cpu != smp_processor_id());
4474
4475 mutex_lock(&pool->attach_mutex); 4405 mutex_lock(&pool->attach_mutex);
4476 spin_lock_irq(&pool->lock); 4406 spin_lock_irq(&pool->lock);
4477 4407
@@ -4543,6 +4473,7 @@ static void rebind_workers(struct worker_pool *pool)
4543 pool->attrs->cpumask) < 0); 4473 pool->attrs->cpumask) < 0);
4544 4474
4545 spin_lock_irq(&pool->lock); 4475 spin_lock_irq(&pool->lock);
4476 pool->flags &= ~POOL_DISASSOCIATED;
4546 4477
4547 for_each_pool_worker(worker, pool) { 4478 for_each_pool_worker(worker, pool) {
4548 unsigned int worker_flags = worker->flags; 4479 unsigned int worker_flags = worker->flags;
@@ -4632,7 +4563,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4632 for_each_cpu_worker_pool(pool, cpu) { 4563 for_each_cpu_worker_pool(pool, cpu) {
4633 if (pool->nr_workers) 4564 if (pool->nr_workers)
4634 continue; 4565 continue;
4635 if (create_and_start_worker(pool) < 0) 4566 if (!create_worker(pool))
4636 return NOTIFY_BAD; 4567 return NOTIFY_BAD;
4637 } 4568 }
4638 break; 4569 break;
@@ -4644,15 +4575,10 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4644 for_each_pool(pool, pi) { 4575 for_each_pool(pool, pi) {
4645 mutex_lock(&pool->attach_mutex); 4576 mutex_lock(&pool->attach_mutex);
4646 4577
4647 if (pool->cpu == cpu) { 4578 if (pool->cpu == cpu)
4648 spin_lock_irq(&pool->lock);
4649 pool->flags &= ~POOL_DISASSOCIATED;
4650 spin_unlock_irq(&pool->lock);
4651
4652 rebind_workers(pool); 4579 rebind_workers(pool);
4653 } else if (pool->cpu < 0) { 4580 else if (pool->cpu < 0)
4654 restore_unbound_workers_cpumask(pool, cpu); 4581 restore_unbound_workers_cpumask(pool, cpu);
4655 }
4656 4582
4657 mutex_unlock(&pool->attach_mutex); 4583 mutex_unlock(&pool->attach_mutex);
4658 } 4584 }
@@ -4856,10 +4782,6 @@ static void __init wq_numa_init(void)
4856 cpumask_var_t *tbl; 4782 cpumask_var_t *tbl;
4857 int node, cpu; 4783 int node, cpu;
4858 4784
4859 /* determine NUMA pwq table len - highest node id + 1 */
4860 for_each_node(node)
4861 wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
4862
4863 if (num_possible_nodes() <= 1) 4785 if (num_possible_nodes() <= 1)
4864 return; 4786 return;
4865 4787
@@ -4876,7 +4798,7 @@ static void __init wq_numa_init(void)
4876 * available. Build one from cpu_to_node() which should have been 4798 * available. Build one from cpu_to_node() which should have been
4877 * fully initialized by now. 4799 * fully initialized by now.
4878 */ 4800 */
4879 tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); 4801 tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
4880 BUG_ON(!tbl); 4802 BUG_ON(!tbl);
4881 4803
4882 for_each_node(node) 4804 for_each_node(node)
@@ -4936,7 +4858,7 @@ static int __init init_workqueues(void)
4936 4858
4937 for_each_cpu_worker_pool(pool, cpu) { 4859 for_each_cpu_worker_pool(pool, cpu) {
4938 pool->flags &= ~POOL_DISASSOCIATED; 4860 pool->flags &= ~POOL_DISASSOCIATED;
4939 BUG_ON(create_and_start_worker(pool) < 0); 4861 BUG_ON(!create_worker(pool));
4940 } 4862 }
4941 } 4863 }
4942 4864