author     Dmitry Torokhov <dmitry.torokhov@gmail.com>    2014-08-07 02:36:12 -0400
committer  Dmitry Torokhov <dmitry.torokhov@gmail.com>    2014-08-07 02:36:12 -0400
commit     5e2aa2ed08e2e280121dc7cf5609c87d464f12ef (patch)
tree       ca7d7b1480285e3b617fecc5b41f0ce150a82c32 /kernel
parent     f62d14a8072b9756db36ba394e2b267470a40240 (diff)
parent     fc8104bc5a3f6f49d79f45f2706f79f77a9fb2ae (diff)
Merge branch 'next' into for-linus
Prepare first round of input updates for 3.17.
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 16
-rw-r--r--  kernel/acct.c | 6
-rw-r--r--  kernel/audit.c | 66
-rw-r--r--  kernel/auditsc.c | 27
-rw-r--r--  kernel/backtracetest.c | 18
-rw-r--r--  kernel/capability.c | 26
-rw-r--r--  kernel/cgroup.c | 1873
-rw-r--r--  kernel/cgroup_freezer.c | 138
-rw-r--r--  kernel/compat.c | 8
-rw-r--r--  kernel/context_tracking.c | 3
-rw-r--r--  kernel/cpu.c | 42
-rw-r--r--  kernel/cpuset.c | 80
-rw-r--r--  kernel/debug/debug_core.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/events/core.c | 368
-rw-r--r--  kernel/events/uprobes.c | 114
-rw-r--r--  kernel/exec_domain.c | 14
-rw-r--r--  kernel/exit.c | 61
-rw-r--r--  kernel/fork.c | 22
-rw-r--r--  kernel/futex.c | 243
-rw-r--r--  kernel/gcov/base.c | 6
-rw-r--r--  kernel/gcov/gcc_4_7.c | 5
-rw-r--r--  kernel/hrtimer.c | 9
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/irq/Kconfig | 9
-rw-r--r--  kernel/irq/chip.c | 5
-rw-r--r--  kernel/irq/internals.h | 8
-rw-r--r--  kernel/irq/irqdesc.c | 95
-rw-r--r--  kernel/irq/irqdomain.c | 6
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/spurious.c | 106
-rw-r--r--  kernel/kexec.c | 78
-rw-r--r--  kernel/kmod.c | 7
-rw-r--r--  kernel/kprobes.c | 392
-rw-r--r--  kernel/ksysfs.c | 5
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/latencytop.c | 5
-rw-r--r--  kernel/locking/Makefile | 1
-rw-r--r--  kernel/locking/lockdep_internals.h | 6
-rw-r--r--  kernel/locking/locktorture.c | 12
-rw-r--r--  kernel/locking/mcs_spinlock.c | 64
-rw-r--r--  kernel/locking/mcs_spinlock.h | 9
-rw-r--r--  kernel/locking/mutex.c | 2
-rw-r--r--  kernel/locking/qrwlock.c | 133
-rw-r--r--  kernel/locking/rtmutex-debug.h | 5
-rw-r--r--  kernel/locking/rtmutex.c | 273
-rw-r--r--  kernel/locking/rtmutex.h | 5
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 28
-rw-r--r--  kernel/locking/rwsem-xadd.c | 274
-rw-r--r--  kernel/locking/rwsem.c | 31
-rw-r--r--  kernel/module.c | 44
-rw-r--r--  kernel/notifier.c | 22
-rw-r--r--  kernel/panic.c | 23
-rw-r--r--  kernel/params.c | 25
-rw-r--r--  kernel/power/Kconfig | 3
-rw-r--r--  kernel/power/hibernate.c | 67
-rw-r--r--  kernel/power/main.c | 39
-rw-r--r--  kernel/power/power.h | 9
-rw-r--r--  kernel/power/process.c | 4
-rw-r--r--  kernel/power/suspend.c | 125
-rw-r--r--  kernel/power/suspend_test.c | 24
-rw-r--r--  kernel/power/swap.c | 2
-rw-r--r--  kernel/power/user.c | 3
-rw-r--r--  kernel/printk/printk.c | 332
-rw-r--r--  kernel/profile.c | 20
-rw-r--r--  kernel/rcu/rcutorture.c | 217
-rw-r--r--  kernel/rcu/tiny_plugin.h | 8
-rw-r--r--  kernel/rcu/tree.c | 469
-rw-r--r--  kernel/rcu/tree.h | 17
-rw-r--r--  kernel/rcu/tree_plugin.h | 146
-rw-r--r--  kernel/rcu/update.c | 16
-rw-r--r--  kernel/reboot.c | 21
-rw-r--r--  kernel/res_counter.c | 7
-rw-r--r--  kernel/resource.c | 7
-rw-r--r--  kernel/sched/core.c | 596
-rw-r--r--  kernel/sched/cpuacct.c | 2
-rw-r--r--  kernel/sched/cpudeadline.c | 37
-rw-r--r--  kernel/sched/cpudeadline.h | 6
-rw-r--r--  kernel/sched/cpupri.c | 16
-rw-r--r--  kernel/sched/cpupri.h | 2
-rw-r--r--  kernel/sched/cputime.c | 32
-rw-r--r--  kernel/sched/deadline.c | 30
-rw-r--r--  kernel/sched/debug.c | 2
-rw-r--r--  kernel/sched/fair.c | 647
-rw-r--r--  kernel/sched/features.h | 8
-rw-r--r--  kernel/sched/idle.c | 170
-rw-r--r--  kernel/sched/rt.c | 130
-rw-r--r--  kernel/sched/sched.h | 52
-rw-r--r--  kernel/sched/stop_task.c | 4
-rw-r--r--  kernel/sched/wait.c | 2
-rw-r--r--  kernel/seccomp.c | 114
-rw-r--r--  kernel/signal.c | 95
-rw-r--r--  kernel/smp.c | 69
-rw-r--r--  kernel/softirq.c | 4
-rw-r--r--  kernel/stop_machine.c | 1
-rw-r--r--  kernel/sys.c | 6
-rw-r--r--  kernel/sys_ni.c | 2
-rw-r--r--  kernel/sysctl.c | 125
-rw-r--r--  kernel/time/alarmtimer.c | 20
-rw-r--r--  kernel/time/ntp.c | 32
-rw-r--r--  kernel/time/sched_clock.c | 13
-rw-r--r--  kernel/time/timekeeping.c | 7
-rw-r--r--  kernel/torture.c | 40
-rw-r--r--  kernel/trace/Kconfig | 30
-rw-r--r--  kernel/trace/Makefile | 3
-rw-r--r--  kernel/trace/ftrace.c | 271
-rw-r--r--  kernel/trace/ring_buffer.c | 9
-rw-r--r--  kernel/trace/trace.c | 478
-rw-r--r--  kernel/trace/trace.h | 46
-rw-r--r--  kernel/trace/trace_benchmark.c | 198
-rw-r--r--  kernel/trace/trace_benchmark.h | 41
-rw-r--r--  kernel/trace/trace_event_perf.c | 5
-rw-r--r--  kernel/trace/trace_events.c | 14
-rw-r--r--  kernel/trace/trace_functions.c | 56
-rw-r--r--  kernel/trace/trace_functions_graph.c | 19
-rw-r--r--  kernel/trace/trace_irqsoff.c | 71
-rw-r--r--  kernel/trace/trace_kprobe.c | 74
-rw-r--r--  kernel/trace/trace_nop.c | 1
-rw-r--r--  kernel/trace/trace_output.c | 41
-rw-r--r--  kernel/trace/trace_probe.c | 65
-rw-r--r--  kernel/trace/trace_probe.h | 15
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 70
-rw-r--r--  kernel/trace/trace_selftest.c | 69
-rw-r--r--  kernel/trace/trace_stack.c | 42
-rw-r--r--  kernel/trace/trace_uprobe.c | 112
-rw-r--r--  kernel/tracepoint.c | 28
-rw-r--r--  kernel/user.c | 1
-rw-r--r--  kernel/user_namespace.c | 33
-rw-r--r--  kernel/utsname_sysctl.c | 10
-rw-r--r--  kernel/watchdog.c | 41
-rw-r--r--  kernel/workqueue.c | 493
-rw-r--r--  kernel/workqueue_internal.h | 2
134 files changed, 7066 insertions, 3912 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index d2b32ac27a39..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -220,6 +220,20 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
220 220
221endif 221endif
222 222
223config ARCH_SUPPORTS_ATOMIC_RMW
224 bool
225
223config MUTEX_SPIN_ON_OWNER 226config MUTEX_SPIN_ON_OWNER
224 def_bool y 227 def_bool y
225 depends on SMP && !DEBUG_MUTEXES 228 depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
229
230config RWSEM_SPIN_ON_OWNER
231 def_bool y
232 depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
233
234config ARCH_USE_QUEUE_RWLOCK
235 bool
236
237config QUEUE_RWLOCK
238 def_bool y if ARCH_USE_QUEUE_RWLOCK
239 depends on SMP
diff --git a/kernel/acct.c b/kernel/acct.c
index 8d6e145138bb..808a86ff229d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -55,7 +55,7 @@
55#include <linux/times.h> 55#include <linux/times.h>
56#include <linux/syscalls.h> 56#include <linux/syscalls.h>
57#include <linux/mount.h> 57#include <linux/mount.h>
58#include <asm/uaccess.h> 58#include <linux/uaccess.h>
59#include <asm/div64.h> 59#include <asm/div64.h>
60#include <linux/blkdev.h> /* sector_div */ 60#include <linux/blkdev.h> /* sector_div */
61#include <linux/pid_namespace.h> 61#include <linux/pid_namespace.h>
@@ -134,7 +134,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
134 spin_lock(&acct_lock); 134 spin_lock(&acct_lock);
135 if (file != acct->file) { 135 if (file != acct->file) {
136 if (act) 136 if (act)
137 res = act>0; 137 res = act > 0;
138 goto out; 138 goto out;
139 } 139 }
140 140
@@ -262,7 +262,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
262 if (name) { 262 if (name) {
263 struct filename *tmp = getname(name); 263 struct filename *tmp = getname(name);
264 if (IS_ERR(tmp)) 264 if (IS_ERR(tmp))
265 return (PTR_ERR(tmp)); 265 return PTR_ERR(tmp);
266 error = acct_on(tmp); 266 error = acct_on(tmp);
267 putname(tmp); 267 putname(tmp);
268 } else { 268 } else {
diff --git a/kernel/audit.c b/kernel/audit.c
index 47845c57eb19..3ef2e0e797e8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -44,7 +44,7 @@
44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
45 45
46#include <linux/init.h> 46#include <linux/init.h>
47#include <asm/types.h> 47#include <linux/types.h>
48#include <linux/atomic.h> 48#include <linux/atomic.h>
49#include <linux/mm.h> 49#include <linux/mm.h>
50#include <linux/export.h> 50#include <linux/export.h>
@@ -424,6 +424,38 @@ static void kauditd_send_skb(struct sk_buff *skb)
424} 424}
425 425
426/* 426/*
427 * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
428 *
429 * This function doesn't consume an skb as might be expected since it has to
430 * copy it anyways.
431 */
432static void kauditd_send_multicast_skb(struct sk_buff *skb)
433{
434 struct sk_buff *copy;
435 struct audit_net *aunet = net_generic(&init_net, audit_net_id);
436 struct sock *sock = aunet->nlsk;
437
438 if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
439 return;
440
441 /*
442 * The seemingly wasteful skb_copy() rather than bumping the refcount
443 * using skb_get() is necessary because non-standard mods are made to
444 * the skb by the original kaudit unicast socket send routine. The
445 * existing auditd daemon assumes this breakage. Fixing this would
446 * require co-ordinating a change in the established protocol between
447 * the kaudit kernel subsystem and the auditd userspace code. There is
448 * no reason for new multicast clients to continue with this
449 * non-compliance.
450 */
451 copy = skb_copy(skb, GFP_KERNEL);
452 if (!copy)
453 return;
454
455 nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
456}
457
458/*
427 * flush_hold_queue - empty the hold queue if auditd appears 459 * flush_hold_queue - empty the hold queue if auditd appears
428 * 460 *
429 * If auditd just started, drain the queue of messages already 461 * If auditd just started, drain the queue of messages already
@@ -1076,10 +1108,22 @@ static void audit_receive(struct sk_buff *skb)
1076 mutex_unlock(&audit_cmd_mutex); 1108 mutex_unlock(&audit_cmd_mutex);
1077} 1109}
1078 1110
1111/* Run custom bind function on netlink socket group connect or bind requests. */
1112static int audit_bind(int group)
1113{
1114 if (!capable(CAP_AUDIT_READ))
1115 return -EPERM;
1116
1117 return 0;
1118}
1119
1079static int __net_init audit_net_init(struct net *net) 1120static int __net_init audit_net_init(struct net *net)
1080{ 1121{
1081 struct netlink_kernel_cfg cfg = { 1122 struct netlink_kernel_cfg cfg = {
1082 .input = audit_receive, 1123 .input = audit_receive,
1124 .bind = audit_bind,
1125 .flags = NL_CFG_F_NONROOT_RECV,
1126 .groups = AUDIT_NLGRP_MAX,
1083 }; 1127 };
1084 1128
1085 struct audit_net *aunet = net_generic(net, audit_net_id); 1129 struct audit_net *aunet = net_generic(net, audit_net_id);
@@ -1901,10 +1945,10 @@ out:
1901 * audit_log_end - end one audit record 1945 * audit_log_end - end one audit record
1902 * @ab: the audit_buffer 1946 * @ab: the audit_buffer
1903 * 1947 *
1904 * The netlink_* functions cannot be called inside an irq context, so 1948 * netlink_unicast() cannot be called inside an irq context because it blocks
1905 * the audit buffer is placed on a queue and a tasklet is scheduled to 1949 * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
1906 * remove them from the queue outside the irq context. May be called in 1950 * on a queue and a tasklet is scheduled to remove them from the queue outside
1907 * any context. 1951 * the irq context. May be called in any context.
1908 */ 1952 */
1909void audit_log_end(struct audit_buffer *ab) 1953void audit_log_end(struct audit_buffer *ab)
1910{ 1954{
@@ -1914,6 +1958,18 @@ void audit_log_end(struct audit_buffer *ab)
1914 audit_log_lost("rate limit exceeded"); 1958 audit_log_lost("rate limit exceeded");
1915 } else { 1959 } else {
1916 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1960 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1961
1962 kauditd_send_multicast_skb(ab->skb);
1963
1964 /*
1965 * The original kaudit unicast socket sends up messages with
1966 * nlmsg_len set to the payload length rather than the entire
1967 * message length. This breaks the standard set by netlink.
1968 * The existing auditd daemon assumes this breakage. Fixing
1969 * this would require co-ordinating a change in the established
1970 * protocol between the kaudit kernel subsystem and the auditd
1971 * userspace code.
1972 */
1917 nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; 1973 nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
1918 1974
1919 if (audit_pid) { 1975 if (audit_pid) {
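
The audit.c hunks above add an AUDIT_NLGRP_READLOG netlink multicast group, with audit_bind() requiring CAP_AUDIT_READ, so that read-only listeners can follow the audit stream without registering as the auditd daemon. A minimal userspace sketch of such a listener, assuming AUDIT_NLGRP_READLOG has the value 1 and that the process holds CAP_AUDIT_READ (the program itself is illustrative, not part of the patch):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>

int main(void)
{
	int grp = 1;	/* AUDIT_NLGRP_READLOG, assumed value from the uapi enum */
	char buf[8192];
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK };
	int sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

	if (sk < 0 || bind(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		return 1;
	/* group membership goes through audit_bind(), so CAP_AUDIT_READ is needed */
	if (setsockopt(sk, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &grp, sizeof(grp)) < 0)
		return 1;
	for (;;) {
		ssize_t len = recv(sk, buf, sizeof(buf), 0);

		if (len <= 0)
			break;
		/* each datagram is one audit record, a copy of the unicast skb */
		printf("audit record, %zd bytes\n", len);
	}
	return 0;
}
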
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f251a5e8d17a..21eae3c05ec0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -728,6 +728,22 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
728 return AUDIT_BUILD_CONTEXT; 728 return AUDIT_BUILD_CONTEXT;
729} 729}
730 730
731static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
732{
733 int word, bit;
734
735 if (val > 0xffffffff)
736 return false;
737
738 word = AUDIT_WORD(val);
739 if (word >= AUDIT_BITMASK_SIZE)
740 return false;
741
742 bit = AUDIT_BIT(val);
743
744 return rule->mask[word] & bit;
745}
746
731/* At syscall entry and exit time, this filter is called if the 747/* At syscall entry and exit time, this filter is called if the
732 * audit_state is not low enough that auditing cannot take place, but is 748 * audit_state is not low enough that auditing cannot take place, but is
733 * also not high enough that we already know we have to write an audit 749 * also not high enough that we already know we have to write an audit
@@ -745,11 +761,8 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
745 761
746 rcu_read_lock(); 762 rcu_read_lock();
747 if (!list_empty(list)) { 763 if (!list_empty(list)) {
748 int word = AUDIT_WORD(ctx->major);
749 int bit = AUDIT_BIT(ctx->major);
750
751 list_for_each_entry_rcu(e, list, list) { 764 list_for_each_entry_rcu(e, list, list) {
752 if ((e->rule.mask[word] & bit) == bit && 765 if (audit_in_mask(&e->rule, ctx->major) &&
753 audit_filter_rules(tsk, &e->rule, ctx, NULL, 766 audit_filter_rules(tsk, &e->rule, ctx, NULL,
754 &state, false)) { 767 &state, false)) {
755 rcu_read_unlock(); 768 rcu_read_unlock();
@@ -769,20 +782,16 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
769static int audit_filter_inode_name(struct task_struct *tsk, 782static int audit_filter_inode_name(struct task_struct *tsk,
770 struct audit_names *n, 783 struct audit_names *n,
771 struct audit_context *ctx) { 784 struct audit_context *ctx) {
772 int word, bit;
773 int h = audit_hash_ino((u32)n->ino); 785 int h = audit_hash_ino((u32)n->ino);
774 struct list_head *list = &audit_inode_hash[h]; 786 struct list_head *list = &audit_inode_hash[h];
775 struct audit_entry *e; 787 struct audit_entry *e;
776 enum audit_state state; 788 enum audit_state state;
777 789
778 word = AUDIT_WORD(ctx->major);
779 bit = AUDIT_BIT(ctx->major);
780
781 if (list_empty(list)) 790 if (list_empty(list))
782 return 0; 791 return 0;
783 792
784 list_for_each_entry_rcu(e, list, list) { 793 list_for_each_entry_rcu(e, list, list) {
785 if ((e->rule.mask[word] & bit) == bit && 794 if (audit_in_mask(&e->rule, ctx->major) &&
786 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { 795 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
787 ctx->current_state = state; 796 ctx->current_state = state;
788 return 1; 797 return 1;
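
The new audit_in_mask() helper factors out the syscall-number-to-bitmask lookup that both filter paths above used to open-code, and adds the range checks. A worked example of the mapping, assuming the usual uapi macro definitions (AUDIT_WORD(nr) == nr / 32, AUDIT_BIT(nr) == 1 << (nr % 32)):

#include <assert.h>

#define AUDIT_WORD(nr) ((unsigned int)((nr) / 32))	/* assumed uapi definition */
#define AUDIT_BIT(nr)  (1U << ((nr) % 32))		/* assumed uapi definition */

int main(void)
{
	/* __NR_execve is 59 on x86_64: word 1, bit 27 of rule->mask[] */
	assert(AUDIT_WORD(59) == 1);
	assert(AUDIT_BIT(59) == (1U << 27));
	/* values above 0xffffffff or past AUDIT_BITMASK_SIZE words are
	 * rejected by audit_in_mask() before any array access */
	return 0;
}
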
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a5e026bc45c4..1323360d90e3 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -19,8 +19,8 @@
19 19
20static void backtrace_test_normal(void) 20static void backtrace_test_normal(void)
21{ 21{
22 printk("Testing a backtrace from process context.\n"); 22 pr_info("Testing a backtrace from process context.\n");
23 printk("The following trace is a kernel self test and not a bug!\n"); 23 pr_info("The following trace is a kernel self test and not a bug!\n");
24 24
25 dump_stack(); 25 dump_stack();
26} 26}
@@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
37 37
38static void backtrace_test_irq(void) 38static void backtrace_test_irq(void)
39{ 39{
40 printk("Testing a backtrace from irq context.\n"); 40 pr_info("Testing a backtrace from irq context.\n");
41 printk("The following trace is a kernel self test and not a bug!\n"); 41 pr_info("The following trace is a kernel self test and not a bug!\n");
42 42
43 init_completion(&backtrace_work); 43 init_completion(&backtrace_work);
44 tasklet_schedule(&backtrace_tasklet); 44 tasklet_schedule(&backtrace_tasklet);
@@ -51,8 +51,8 @@ static void backtrace_test_saved(void)
51 struct stack_trace trace; 51 struct stack_trace trace;
52 unsigned long entries[8]; 52 unsigned long entries[8];
53 53
54 printk("Testing a saved backtrace.\n"); 54 pr_info("Testing a saved backtrace.\n");
55 printk("The following trace is a kernel self test and not a bug!\n"); 55 pr_info("The following trace is a kernel self test and not a bug!\n");
56 56
57 trace.nr_entries = 0; 57 trace.nr_entries = 0;
58 trace.max_entries = ARRAY_SIZE(entries); 58 trace.max_entries = ARRAY_SIZE(entries);
@@ -65,19 +65,19 @@ static void backtrace_test_saved(void)
65#else 65#else
66static void backtrace_test_saved(void) 66static void backtrace_test_saved(void)
67{ 67{
68 printk("Saved backtrace test skipped.\n"); 68 pr_info("Saved backtrace test skipped.\n");
69} 69}
70#endif 70#endif
71 71
72static int backtrace_regression_test(void) 72static int backtrace_regression_test(void)
73{ 73{
74 printk("====[ backtrace testing ]===========\n"); 74 pr_info("====[ backtrace testing ]===========\n");
75 75
76 backtrace_test_normal(); 76 backtrace_test_normal();
77 backtrace_test_irq(); 77 backtrace_test_irq();
78 backtrace_test_saved(); 78 backtrace_test_saved();
79 79
80 printk("====[ end of backtrace testing ]====\n"); 80 pr_info("====[ end of backtrace testing ]====\n");
81 return 0; 81 return 0;
82} 82}
83 83
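
The printk() calls above are switched to pr_info(), which logs at KERN_INFO and, when a file defines pr_fmt(), automatically prefixes every message. A minimal, hypothetical module showing that pattern (module and function names are illustrative, not taken from backtracetest.c):

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt	/* must come before the includes */

#include <linux/kernel.h>
#include <linux/module.h>

static int __init pr_info_example_init(void)
{
	pr_info("hello\n");	/* logged as "<modname>: hello" at KERN_INFO */
	return 0;
}
module_init(pr_info_example_init);
MODULE_LICENSE("GPL");
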
diff --git a/kernel/capability.c b/kernel/capability.c
index a8d63df0c322..a5cf13c018ce 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -24,7 +24,6 @@
24 */ 24 */
25 25
26const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; 26const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
27
28EXPORT_SYMBOL(__cap_empty_set); 27EXPORT_SYMBOL(__cap_empty_set);
29 28
30int file_caps_enabled = 1; 29int file_caps_enabled = 1;
@@ -189,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
189 * 188 *
190 * An alternative would be to return an error here 189 * An alternative would be to return an error here
191 * (-ERANGE), but that causes legacy applications to 190 * (-ERANGE), but that causes legacy applications to
192 * unexpectidly fail; the capget/modify/capset aborts 191 * unexpectedly fail; the capget/modify/capset aborts
193 * before modification is attempted and the application 192 * before modification is attempted and the application
194 * fails. 193 * fails.
195 */ 194 */
@@ -395,7 +394,8 @@ EXPORT_SYMBOL(ns_capable);
395 * This does not set PF_SUPERPRIV because the caller may not 394 * This does not set PF_SUPERPRIV because the caller may not
396 * actually be privileged. 395 * actually be privileged.
397 */ 396 */
398bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) 397bool file_ns_capable(const struct file *file, struct user_namespace *ns,
398 int cap)
399{ 399{
400 if (WARN_ON_ONCE(!cap_valid(cap))) 400 if (WARN_ON_ONCE(!cap_valid(cap)))
401 return false; 401 return false;
@@ -424,23 +424,19 @@ bool capable(int cap)
424EXPORT_SYMBOL(capable); 424EXPORT_SYMBOL(capable);
425 425
426/** 426/**
427 * inode_capable - Check superior capability over inode 427 * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
428 * @inode: The inode in question 428 * @inode: The inode in question
429 * @cap: The capability in question 429 * @cap: The capability in question
430 * 430 *
431 * Return true if the current task has the given superior capability 431 * Return true if the current task has the given capability targeted at
432 * targeted at it's own user namespace and that the given inode is owned 432 * its own user namespace and that the given inode's uid and gid are
433 * by the current user namespace or a child namespace. 433 * mapped into the current user namespace.
434 *
435 * Currently we check to see if an inode is owned by the current
436 * user namespace by seeing if the inode's owner maps into the
437 * current user namespace.
438 *
439 */ 434 */
440bool inode_capable(const struct inode *inode, int cap) 435bool capable_wrt_inode_uidgid(const struct inode *inode, int cap)
441{ 436{
442 struct user_namespace *ns = current_user_ns(); 437 struct user_namespace *ns = current_user_ns();
443 438
444 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); 439 return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid) &&
440 kgid_has_mapping(ns, inode->i_gid);
445} 441}
446EXPORT_SYMBOL(inode_capable); 442EXPORT_SYMBOL(capable_wrt_inode_uidgid);
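
Renaming inode_capable() to capable_wrt_inode_uidgid() also tightens the check: privilege over an inode now requires that both its uid and gid map into the caller's user namespace. A hypothetical caller sketch in the spirit of inode_owner_or_capable(), using the new helper (the function name here is illustrative):

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/fs.h>

static bool may_act_on_inode(const struct inode *inode)
{
	if (uid_eq(current_fsuid(), inode->i_uid))
		return true;
	/* privilege only counts if i_uid AND i_gid map into the caller's
	 * user namespace, per capable_wrt_inode_uidgid() above */
	return capable_wrt_inode_uidgid(inode, CAP_FOWNER);
}
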
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa705b6c..70776aec2562 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -26,6 +26,8 @@
26 * distribution for more details. 26 * distribution for more details.
27 */ 27 */
28 28
29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
29#include <linux/cgroup.h> 31#include <linux/cgroup.h>
30#include <linux/cred.h> 32#include <linux/cred.h>
31#include <linux/ctype.h> 33#include <linux/ctype.h>
@@ -33,6 +35,7 @@
33#include <linux/init_task.h> 35#include <linux/init_task.h>
34#include <linux/kernel.h> 36#include <linux/kernel.h>
35#include <linux/list.h> 37#include <linux/list.h>
38#include <linux/magic.h>
36#include <linux/mm.h> 39#include <linux/mm.h>
37#include <linux/mutex.h> 40#include <linux/mutex.h>
38#include <linux/mount.h> 41#include <linux/mount.h>
@@ -69,15 +72,6 @@
69 MAX_CFTYPE_NAME + 2) 72 MAX_CFTYPE_NAME + 2)
70 73
71/* 74/*
72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
73 * creation/removal and hierarchy changing operations including cgroup
74 * creation, removal, css association and controller rebinding. This outer
75 * lock is needed mainly to resolve the circular dependency between kernfs
76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
77 */
78static DEFINE_MUTEX(cgroup_tree_mutex);
79
80/*
81 * cgroup_mutex is the master lock. Any modification to cgroup or its 75 * cgroup_mutex is the master lock. Any modification to cgroup or its
82 * hierarchy must be performed while holding it. 76 * hierarchy must be performed while holding it.
83 * 77 *
@@ -98,16 +92,21 @@ static DECLARE_RWSEM(css_set_rwsem);
98#endif 92#endif
99 93
100/* 94/*
95 * Protects cgroup_idr and css_idr so that IDs can be released without
96 * grabbing cgroup_mutex.
97 */
98static DEFINE_SPINLOCK(cgroup_idr_lock);
99
100/*
101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires 101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
103 */ 103 */
104static DEFINE_SPINLOCK(release_agent_path_lock); 104static DEFINE_SPINLOCK(release_agent_path_lock);
105 105
106#define cgroup_assert_mutexes_or_rcu_locked() \ 106#define cgroup_assert_mutex_or_rcu_locked() \
107 rcu_lockdep_assert(rcu_read_lock_held() || \ 107 rcu_lockdep_assert(rcu_read_lock_held() || \
108 lockdep_is_held(&cgroup_tree_mutex) || \
109 lockdep_is_held(&cgroup_mutex), \ 108 lockdep_is_held(&cgroup_mutex), \
110 "cgroup_[tree_]mutex or RCU read lock required"); 109 "cgroup_mutex or RCU read lock required");
111 110
112/* 111/*
113 * cgroup destruction makes heavy use of work items and there can be a lot 112 * cgroup destruction makes heavy use of work items and there can be a lot
@@ -150,6 +149,13 @@ struct cgroup_root cgrp_dfl_root;
150 */ 149 */
151static bool cgrp_dfl_root_visible; 150static bool cgrp_dfl_root_visible;
152 151
152/* some controllers are not supported in the default hierarchy */
153static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
154#ifdef CONFIG_CGROUP_DEBUG
155 | (1 << debug_cgrp_id)
156#endif
157 ;
158
153/* The list of hierarchy roots */ 159/* The list of hierarchy roots */
154 160
155static LIST_HEAD(cgroup_roots); 161static LIST_HEAD(cgroup_roots);
@@ -159,14 +165,13 @@ static int cgroup_root_count;
159static DEFINE_IDR(cgroup_hierarchy_idr); 165static DEFINE_IDR(cgroup_hierarchy_idr);
160 166
161/* 167/*
162 * Assign a monotonically increasing serial number to cgroups. It 168 * Assign a monotonically increasing serial number to csses. It guarantees
163 * guarantees cgroups with bigger numbers are newer than those with smaller 169 * cgroups with bigger numbers are newer than those with smaller numbers.
164 * numbers. Also, as cgroups are always appended to the parent's 170 * Also, as csses are always appended to the parent's ->children list, it
165 * ->children list, it guarantees that sibling cgroups are always sorted in 171 * guarantees that sibling csses are always sorted in the ascending serial
166 * the ascending serial number order on the list. Protected by 172 * number order on the list. Protected by cgroup_mutex.
167 * cgroup_mutex.
168 */ 173 */
169static u64 cgroup_serial_nr_next = 1; 174static u64 css_serial_nr_next = 1;
170 175
171/* This flag indicates whether tasks in the fork and exit paths should 176/* This flag indicates whether tasks in the fork and exit paths should
172 * check for fork/exit handlers to call. This avoids us having to do 177 * check for fork/exit handlers to call. This avoids us having to do
@@ -179,17 +184,59 @@ static struct cftype cgroup_base_files[];
179 184
180static void cgroup_put(struct cgroup *cgrp); 185static void cgroup_put(struct cgroup *cgrp);
181static int rebind_subsystems(struct cgroup_root *dst_root, 186static int rebind_subsystems(struct cgroup_root *dst_root,
182 unsigned long ss_mask); 187 unsigned int ss_mask);
183static void cgroup_destroy_css_killed(struct cgroup *cgrp);
184static int cgroup_destroy_locked(struct cgroup *cgrp); 188static int cgroup_destroy_locked(struct cgroup *cgrp);
189static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
190static void css_release(struct percpu_ref *ref);
191static void kill_css(struct cgroup_subsys_state *css);
185static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 192static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
186 bool is_add); 193 bool is_add);
187static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 194static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
188 195
196/* IDR wrappers which synchronize using cgroup_idr_lock */
197static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
198 gfp_t gfp_mask)
199{
200 int ret;
201
202 idr_preload(gfp_mask);
203 spin_lock_bh(&cgroup_idr_lock);
204 ret = idr_alloc(idr, ptr, start, end, gfp_mask);
205 spin_unlock_bh(&cgroup_idr_lock);
206 idr_preload_end();
207 return ret;
208}
209
210static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
211{
212 void *ret;
213
214 spin_lock_bh(&cgroup_idr_lock);
215 ret = idr_replace(idr, ptr, id);
216 spin_unlock_bh(&cgroup_idr_lock);
217 return ret;
218}
219
220static void cgroup_idr_remove(struct idr *idr, int id)
221{
222 spin_lock_bh(&cgroup_idr_lock);
223 idr_remove(idr, id);
224 spin_unlock_bh(&cgroup_idr_lock);
225}
226
227static struct cgroup *cgroup_parent(struct cgroup *cgrp)
228{
229 struct cgroup_subsys_state *parent_css = cgrp->self.parent;
230
231 if (parent_css)
232 return container_of(parent_css, struct cgroup, self);
233 return NULL;
234}
235
189/** 236/**
190 * cgroup_css - obtain a cgroup's css for the specified subsystem 237 * cgroup_css - obtain a cgroup's css for the specified subsystem
191 * @cgrp: the cgroup of interest 238 * @cgrp: the cgroup of interest
192 * @ss: the subsystem of interest (%NULL returns the dummy_css) 239 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
193 * 240 *
194 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This 241 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
195 * function must be called either under cgroup_mutex or rcu_read_lock() and 242 * function must be called either under cgroup_mutex or rcu_read_lock() and
@@ -202,23 +249,49 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
202{ 249{
203 if (ss) 250 if (ss)
204 return rcu_dereference_check(cgrp->subsys[ss->id], 251 return rcu_dereference_check(cgrp->subsys[ss->id],
205 lockdep_is_held(&cgroup_tree_mutex) ||
206 lockdep_is_held(&cgroup_mutex)); 252 lockdep_is_held(&cgroup_mutex));
207 else 253 else
208 return &cgrp->dummy_css; 254 return &cgrp->self;
255}
256
257/**
258 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
259 * @cgrp: the cgroup of interest
260 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
261 *
262 * Similar to cgroup_css() but returns the effctive css, which is defined
263 * as the matching css of the nearest ancestor including self which has @ss
264 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
265 * function is guaranteed to return non-NULL css.
266 */
267static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
268 struct cgroup_subsys *ss)
269{
270 lockdep_assert_held(&cgroup_mutex);
271
272 if (!ss)
273 return &cgrp->self;
274
275 if (!(cgrp->root->subsys_mask & (1 << ss->id)))
276 return NULL;
277
278 while (cgroup_parent(cgrp) &&
279 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
280 cgrp = cgroup_parent(cgrp);
281
282 return cgroup_css(cgrp, ss);
209} 283}
210 284
211/* convenient tests for these bits */ 285/* convenient tests for these bits */
212static inline bool cgroup_is_dead(const struct cgroup *cgrp) 286static inline bool cgroup_is_dead(const struct cgroup *cgrp)
213{ 287{
214 return test_bit(CGRP_DEAD, &cgrp->flags); 288 return !(cgrp->self.flags & CSS_ONLINE);
215} 289}
216 290
217struct cgroup_subsys_state *seq_css(struct seq_file *seq) 291struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
218{ 292{
219 struct kernfs_open_file *of = seq->private;
220 struct cgroup *cgrp = of->kn->parent->priv; 293 struct cgroup *cgrp = of->kn->parent->priv;
221 struct cftype *cft = seq_cft(seq); 294 struct cftype *cft = of_cft(of);
222 295
223 /* 296 /*
224 * This is open and unprotected implementation of cgroup_css(). 297 * This is open and unprotected implementation of cgroup_css().
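
To make the cgroup_e_css() walk above concrete, a small hypothetical hierarchy (not from the patch):

/*
 * root -- A -- B, where root enables "memory" for its children (so A has
 * a memory css) but A does not enable it for its own children (so B has
 * none of its own):
 *
 *   cgroup_e_css(A, memory_ss)              -> A's own memory css
 *   cgroup_e_css(B, memory_ss)              -> A's memory css, the nearest
 *                                              ancestor with the controller
 *   cgroup_e_css(B, ss not on this root)    -> NULL
 */
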
@@ -231,9 +304,9 @@ struct cgroup_subsys_state *seq_css(struct seq_file *seq)
231 if (cft->ss) 304 if (cft->ss)
232 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); 305 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
233 else 306 else
234 return &cgrp->dummy_css; 307 return &cgrp->self;
235} 308}
236EXPORT_SYMBOL_GPL(seq_css); 309EXPORT_SYMBOL_GPL(of_css);
237 310
238/** 311/**
239 * cgroup_is_descendant - test ancestry 312 * cgroup_is_descendant - test ancestry
@@ -249,7 +322,7 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
249 while (cgrp) { 322 while (cgrp) {
250 if (cgrp == ancestor) 323 if (cgrp == ancestor)
251 return true; 324 return true;
252 cgrp = cgrp->parent; 325 cgrp = cgroup_parent(cgrp);
253 } 326 }
254 return false; 327 return false;
255} 328}
@@ -273,17 +346,30 @@ static int notify_on_release(const struct cgroup *cgrp)
273 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end 346 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
274 * @cgrp: the target cgroup to iterate css's of 347 * @cgrp: the target cgroup to iterate css's of
275 * 348 *
276 * Should be called under cgroup_mutex. 349 * Should be called under cgroup_[tree_]mutex.
277 */ 350 */
278#define for_each_css(css, ssid, cgrp) \ 351#define for_each_css(css, ssid, cgrp) \
279 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 352 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
280 if (!((css) = rcu_dereference_check( \ 353 if (!((css) = rcu_dereference_check( \
281 (cgrp)->subsys[(ssid)], \ 354 (cgrp)->subsys[(ssid)], \
282 lockdep_is_held(&cgroup_tree_mutex) || \
283 lockdep_is_held(&cgroup_mutex)))) { } \ 355 lockdep_is_held(&cgroup_mutex)))) { } \
284 else 356 else
285 357
286/** 358/**
359 * for_each_e_css - iterate all effective css's of a cgroup
360 * @css: the iteration cursor
361 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
362 * @cgrp: the target cgroup to iterate css's of
363 *
364 * Should be called under cgroup_[tree_]mutex.
365 */
366#define for_each_e_css(css, ssid, cgrp) \
367 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
368 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
369 ; \
370 else
371
372/**
287 * for_each_subsys - iterate all enabled cgroup subsystems 373 * for_each_subsys - iterate all enabled cgroup subsystems
288 * @ss: the iteration cursor 374 * @ss: the iteration cursor
289 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 375 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -296,22 +382,13 @@ static int notify_on_release(const struct cgroup *cgrp)
296#define for_each_root(root) \ 382#define for_each_root(root) \
297 list_for_each_entry((root), &cgroup_roots, root_list) 383 list_for_each_entry((root), &cgroup_roots, root_list)
298 384
299/** 385/* iterate over child cgrps, lock should be held throughout iteration */
300 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 386#define cgroup_for_each_live_child(child, cgrp) \
301 * @cgrp: the cgroup to be checked for liveness 387 list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
302 * 388 if (({ lockdep_assert_held(&cgroup_mutex); \
303 * On success, returns true; the mutex should be later unlocked. On 389 cgroup_is_dead(child); })) \
304 * failure returns false with no lock held. 390 ; \
305 */ 391 else
306static bool cgroup_lock_live_group(struct cgroup *cgrp)
307{
308 mutex_lock(&cgroup_mutex);
309 if (cgroup_is_dead(cgrp)) {
310 mutex_unlock(&cgroup_mutex);
311 return false;
312 }
313 return true;
314}
315 392
316/* the list of cgroups eligible for automatic release. Protected by 393/* the list of cgroups eligible for automatic release. Protected by
317 * release_list_lock */ 394 * release_list_lock */
@@ -348,7 +425,7 @@ struct cgrp_cset_link {
348 * reference-counted, to improve performance when child cgroups 425 * reference-counted, to improve performance when child cgroups
349 * haven't been created. 426 * haven't been created.
350 */ 427 */
351static struct css_set init_css_set = { 428struct css_set init_css_set = {
352 .refcount = ATOMIC_INIT(1), 429 .refcount = ATOMIC_INIT(1),
353 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), 430 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
354 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 431 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
@@ -359,6 +436,43 @@ static struct css_set init_css_set = {
359 436
360static int css_set_count = 1; /* 1 for init_css_set */ 437static int css_set_count = 1; /* 1 for init_css_set */
361 438
439/**
440 * cgroup_update_populated - updated populated count of a cgroup
441 * @cgrp: the target cgroup
442 * @populated: inc or dec populated count
443 *
444 * @cgrp is either getting the first task (css_set) or losing the last.
445 * Update @cgrp->populated_cnt accordingly. The count is propagated
446 * towards root so that a given cgroup's populated_cnt is zero iff the
447 * cgroup and all its descendants are empty.
448 *
449 * @cgrp's interface file "cgroup.populated" is zero if
450 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
451 * changes from or to zero, userland is notified that the content of the
452 * interface file has changed. This can be used to detect when @cgrp and
453 * its descendants become populated or empty.
454 */
455static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
456{
457 lockdep_assert_held(&css_set_rwsem);
458
459 do {
460 bool trigger;
461
462 if (populated)
463 trigger = !cgrp->populated_cnt++;
464 else
465 trigger = !--cgrp->populated_cnt;
466
467 if (!trigger)
468 break;
469
470 if (cgrp->populated_kn)
471 kernfs_notify(cgrp->populated_kn);
472 cgrp = cgroup_parent(cgrp);
473 } while (cgrp);
474}
475
362/* 476/*
363 * hash table for cgroup groups. This improves the performance to find 477 * hash table for cgroup groups. This improves the performance to find
364 * an existing css_set. This hash doesn't (currently) take into 478 * an existing css_set. This hash doesn't (currently) take into
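
The populated_cnt propagation above backs the "cgroup.populated" interface file: it reads 0 or 1, and kernfs_notify() wakes up anyone waiting on it when the value flips. A hypothetical userspace waiter, assuming the usual kernfs poll semantics (POLLPRI/POLLERR after a notify) and an illustrative mount path:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4];
	struct pollfd pfd = { .events = POLLPRI };

	pfd.fd = open("/sys/fs/cgroup/mygrp/cgroup.populated", O_RDONLY);
	if (pfd.fd < 0)
		return 1;

	for (;;) {
		if (pread(pfd.fd, buf, 1, 0) == 1 && buf[0] == '0') {
			puts("cgroup subtree is empty");	/* populated_cnt hit zero */
			break;
		}
		poll(&pfd, 1, -1);	/* woken by kernfs_notify() above */
	}
	close(pfd.fd);
	return 0;
}
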
@@ -383,6 +497,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
383static void put_css_set_locked(struct css_set *cset, bool taskexit) 497static void put_css_set_locked(struct css_set *cset, bool taskexit)
384{ 498{
385 struct cgrp_cset_link *link, *tmp_link; 499 struct cgrp_cset_link *link, *tmp_link;
500 struct cgroup_subsys *ss;
501 int ssid;
386 502
387 lockdep_assert_held(&css_set_rwsem); 503 lockdep_assert_held(&css_set_rwsem);
388 504
@@ -390,6 +506,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
390 return; 506 return;
391 507
392 /* This css_set is dead. unlink it and release cgroup refcounts */ 508 /* This css_set is dead. unlink it and release cgroup refcounts */
509 for_each_subsys(ss, ssid)
510 list_del(&cset->e_cset_node[ssid]);
393 hash_del(&cset->hlist); 511 hash_del(&cset->hlist);
394 css_set_count--; 512 css_set_count--;
395 513
@@ -400,10 +518,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
400 list_del(&link->cgrp_link); 518 list_del(&link->cgrp_link);
401 519
402 /* @cgrp can't go away while we're holding css_set_rwsem */ 520 /* @cgrp can't go away while we're holding css_set_rwsem */
403 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 521 if (list_empty(&cgrp->cset_links)) {
404 if (taskexit) 522 cgroup_update_populated(cgrp, false);
405 set_bit(CGRP_RELEASABLE, &cgrp->flags); 523 if (notify_on_release(cgrp)) {
406 check_for_release(cgrp); 524 if (taskexit)
525 set_bit(CGRP_RELEASABLE, &cgrp->flags);
526 check_for_release(cgrp);
527 }
407 } 528 }
408 529
409 kfree(link); 530 kfree(link);
@@ -452,20 +573,20 @@ static bool compare_css_sets(struct css_set *cset,
452{ 573{
453 struct list_head *l1, *l2; 574 struct list_head *l1, *l2;
454 575
455 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { 576 /*
456 /* Not all subsystems matched */ 577 * On the default hierarchy, there can be csets which are
578 * associated with the same set of cgroups but different csses.
579 * Let's first ensure that csses match.
580 */
581 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
457 return false; 582 return false;
458 }
459 583
460 /* 584 /*
461 * Compare cgroup pointers in order to distinguish between 585 * Compare cgroup pointers in order to distinguish between
462 * different cgroups in heirarchies with no subsystems. We 586 * different cgroups in hierarchies. As different cgroups may
463 * could get by with just this check alone (and skip the 587 * share the same effective css, this comparison is always
464 * memcmp above) but on most setups the memcmp check will 588 * necessary.
465 * avoid the need for this more expensive check on almost all
466 * candidates.
467 */ 589 */
468
469 l1 = &cset->cgrp_links; 590 l1 = &cset->cgrp_links;
470 l2 = &old_cset->cgrp_links; 591 l2 = &old_cset->cgrp_links;
471 while (1) { 592 while (1) {
@@ -529,14 +650,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
529 * won't change, so no need for locking. 650 * won't change, so no need for locking.
530 */ 651 */
531 for_each_subsys(ss, i) { 652 for_each_subsys(ss, i) {
532 if (root->cgrp.subsys_mask & (1UL << i)) { 653 if (root->subsys_mask & (1UL << i)) {
533 /* Subsystem is in this hierarchy. So we want 654 /*
534 * the subsystem state from the new 655 * @ss is in this hierarchy, so we want the
535 * cgroup */ 656 * effective css from @cgrp.
536 template[i] = cgroup_css(cgrp, ss); 657 */
658 template[i] = cgroup_e_css(cgrp, ss);
537 } else { 659 } else {
538 /* Subsystem is not in this hierarchy, so we 660 /*
539 * don't want to change the subsystem state */ 661 * @ss is not in this hierarchy, so we don't want
662 * to change the css.
663 */
540 template[i] = old_cset->subsys[i]; 664 template[i] = old_cset->subsys[i];
541 } 665 }
542 } 666 }
@@ -602,10 +726,18 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
602 struct cgrp_cset_link *link; 726 struct cgrp_cset_link *link;
603 727
604 BUG_ON(list_empty(tmp_links)); 728 BUG_ON(list_empty(tmp_links));
729
730 if (cgroup_on_dfl(cgrp))
731 cset->dfl_cgrp = cgrp;
732
605 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); 733 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
606 link->cset = cset; 734 link->cset = cset;
607 link->cgrp = cgrp; 735 link->cgrp = cgrp;
736
737 if (list_empty(&cgrp->cset_links))
738 cgroup_update_populated(cgrp, true);
608 list_move(&link->cset_link, &cgrp->cset_links); 739 list_move(&link->cset_link, &cgrp->cset_links);
740
609 /* 741 /*
610 * Always add links to the tail of the list so that the list 742 * Always add links to the tail of the list so that the list
611 * is sorted by order of hierarchy creation 743 * is sorted by order of hierarchy creation
@@ -628,7 +760,9 @@ static struct css_set *find_css_set(struct css_set *old_cset,
628 struct css_set *cset; 760 struct css_set *cset;
629 struct list_head tmp_links; 761 struct list_head tmp_links;
630 struct cgrp_cset_link *link; 762 struct cgrp_cset_link *link;
763 struct cgroup_subsys *ss;
631 unsigned long key; 764 unsigned long key;
765 int ssid;
632 766
633 lockdep_assert_held(&cgroup_mutex); 767 lockdep_assert_held(&cgroup_mutex);
634 768
@@ -679,10 +813,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
679 813
680 css_set_count++; 814 css_set_count++;
681 815
682 /* Add this cgroup group to the hash table */ 816 /* Add @cset to the hash table */
683 key = css_set_hash(cset->subsys); 817 key = css_set_hash(cset->subsys);
684 hash_add(css_set_table, &cset->hlist, key); 818 hash_add(css_set_table, &cset->hlist, key);
685 819
820 for_each_subsys(ss, ssid)
821 list_add_tail(&cset->e_cset_node[ssid],
822 &cset->subsys[ssid]->cgroup->e_csets[ssid]);
823
686 up_write(&css_set_rwsem); 824 up_write(&css_set_rwsem);
687 825
688 return cset; 826 return cset;
@@ -735,14 +873,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
735 struct cgroup *cgrp = &root->cgrp; 873 struct cgroup *cgrp = &root->cgrp;
736 struct cgrp_cset_link *link, *tmp_link; 874 struct cgrp_cset_link *link, *tmp_link;
737 875
738 mutex_lock(&cgroup_tree_mutex);
739 mutex_lock(&cgroup_mutex); 876 mutex_lock(&cgroup_mutex);
740 877
741 BUG_ON(atomic_read(&root->nr_cgrps)); 878 BUG_ON(atomic_read(&root->nr_cgrps));
742 BUG_ON(!list_empty(&cgrp->children)); 879 BUG_ON(!list_empty(&cgrp->self.children));
743 880
744 /* Rebind all subsystems back to the default hierarchy */ 881 /* Rebind all subsystems back to the default hierarchy */
745 rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); 882 rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
746 883
747 /* 884 /*
748 * Release all the links from cset_links to this hierarchy's 885 * Release all the links from cset_links to this hierarchy's
@@ -765,7 +902,6 @@ static void cgroup_destroy_root(struct cgroup_root *root)
765 cgroup_exit_root_id(root); 902 cgroup_exit_root_id(root);
766 903
767 mutex_unlock(&cgroup_mutex); 904 mutex_unlock(&cgroup_mutex);
768 mutex_unlock(&cgroup_tree_mutex);
769 905
770 kernfs_destroy_root(root->kf_root); 906 kernfs_destroy_root(root->kf_root);
771 cgroup_free_root(root); 907 cgroup_free_root(root);
@@ -848,7 +984,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
848 * update of a tasks cgroup pointer by cgroup_attach_task() 984 * update of a tasks cgroup pointer by cgroup_attach_task()
849 */ 985 */
850 986
851static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 987static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
852static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 988static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
853static const struct file_operations proc_cgroupstats_operations; 989static const struct file_operations proc_cgroupstats_operations;
854 990
@@ -883,79 +1019,95 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
883 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 1019 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
884 mode |= S_IRUGO; 1020 mode |= S_IRUGO;
885 1021
886 if (cft->write_u64 || cft->write_s64 || cft->write_string || 1022 if (cft->write_u64 || cft->write_s64 || cft->write)
887 cft->trigger)
888 mode |= S_IWUSR; 1023 mode |= S_IWUSR;
889 1024
890 return mode; 1025 return mode;
891} 1026}
892 1027
893static void cgroup_free_fn(struct work_struct *work) 1028static void cgroup_get(struct cgroup *cgrp)
894{ 1029{
895 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 1030 WARN_ON_ONCE(cgroup_is_dead(cgrp));
896 1031 css_get(&cgrp->self);
897 atomic_dec(&cgrp->root->nr_cgrps);
898 cgroup_pidlist_destroy_all(cgrp);
899
900 if (cgrp->parent) {
901 /*
902 * We get a ref to the parent, and put the ref when this
903 * cgroup is being freed, so it's guaranteed that the
904 * parent won't be destroyed before its children.
905 */
906 cgroup_put(cgrp->parent);
907 kernfs_put(cgrp->kn);
908 kfree(cgrp);
909 } else {
910 /*
911 * This is root cgroup's refcnt reaching zero, which
912 * indicates that the root should be released.
913 */
914 cgroup_destroy_root(cgrp->root);
915 }
916} 1032}
917 1033
918static void cgroup_free_rcu(struct rcu_head *head) 1034static void cgroup_put(struct cgroup *cgrp)
919{ 1035{
920 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 1036 css_put(&cgrp->self);
921
922 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
923 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
924} 1037}
925 1038
926static void cgroup_get(struct cgroup *cgrp) 1039/**
1040 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1041 * @kn: the kernfs_node being serviced
1042 *
1043 * This helper undoes cgroup_kn_lock_live() and should be invoked before
1044 * the method finishes if locking succeeded. Note that once this function
1045 * returns the cgroup returned by cgroup_kn_lock_live() may become
1046 * inaccessible any time. If the caller intends to continue to access the
1047 * cgroup, it should pin it before invoking this function.
1048 */
1049static void cgroup_kn_unlock(struct kernfs_node *kn)
927{ 1050{
928 WARN_ON_ONCE(cgroup_is_dead(cgrp)); 1051 struct cgroup *cgrp;
929 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); 1052
930 atomic_inc(&cgrp->refcnt); 1053 if (kernfs_type(kn) == KERNFS_DIR)
1054 cgrp = kn->priv;
1055 else
1056 cgrp = kn->parent->priv;
1057
1058 mutex_unlock(&cgroup_mutex);
1059
1060 kernfs_unbreak_active_protection(kn);
1061 cgroup_put(cgrp);
931} 1062}
932 1063
933static void cgroup_put(struct cgroup *cgrp) 1064/**
1065 * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1066 * @kn: the kernfs_node being serviced
1067 *
1068 * This helper is to be used by a cgroup kernfs method currently servicing
1069 * @kn. It breaks the active protection, performs cgroup locking and
1070 * verifies that the associated cgroup is alive. Returns the cgroup if
1071 * alive; otherwise, %NULL. A successful return should be undone by a
1072 * matching cgroup_kn_unlock() invocation.
1073 *
1074 * Any cgroup kernfs method implementation which requires locking the
1075 * associated cgroup should use this helper. It avoids nesting cgroup
1076 * locking under kernfs active protection and allows all kernfs operations
1077 * including self-removal.
1078 */
1079static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
934{ 1080{
935 if (!atomic_dec_and_test(&cgrp->refcnt)) 1081 struct cgroup *cgrp;
936 return; 1082
937 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) 1083 if (kernfs_type(kn) == KERNFS_DIR)
938 return; 1084 cgrp = kn->priv;
1085 else
1086 cgrp = kn->parent->priv;
939 1087
940 /* 1088 /*
941 * XXX: cgrp->id is only used to look up css's. As cgroup and 1089 * We're gonna grab cgroup_mutex which nests outside kernfs
942 * css's lifetimes will be decoupled, it should be made 1090 * active_ref. cgroup liveliness check alone provides enough
943 * per-subsystem and moved to css->id so that lookups are 1091 * protection against removal. Ensure @cgrp stays accessible and
944 * successful until the target css is released. 1092 * break the active_ref protection.
945 */ 1093 */
1094 cgroup_get(cgrp);
1095 kernfs_break_active_protection(kn);
1096
946 mutex_lock(&cgroup_mutex); 1097 mutex_lock(&cgroup_mutex);
947 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
948 mutex_unlock(&cgroup_mutex);
949 cgrp->id = -1;
950 1098
951 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 1099 if (!cgroup_is_dead(cgrp))
1100 return cgrp;
1101
1102 cgroup_kn_unlock(kn);
1103 return NULL;
952} 1104}
953 1105
954static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 1106static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
955{ 1107{
956 char name[CGROUP_FILE_NAME_MAX]; 1108 char name[CGROUP_FILE_NAME_MAX];
957 1109
958 lockdep_assert_held(&cgroup_tree_mutex); 1110 lockdep_assert_held(&cgroup_mutex);
959 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); 1111 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
960} 1112}
961 1113
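
The cgroup_kn_lock_live()/cgroup_kn_unlock() pair above encodes the locking pattern cgroup kernfs methods are expected to follow (the older cgroup_lock_live_group() helper is dropped earlier in this diff). A sketch of a hypothetical handler using the pair, not one of the real cgroup interface files:

static ssize_t example_write(struct kernfs_open_file *of, char *buf,
			     size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;

	cgrp = cgroup_kn_lock_live(of->kn);	/* pins cgrp, breaks active ref,
						 * takes cgroup_mutex */
	if (!cgrp)
		return -ENODEV;			/* cgroup already dead */

	/* ... operate on @cgrp, which is guaranteed alive here ... */

	cgroup_kn_unlock(of->kn);		/* drops cgroup_mutex and the
						 * temporary reference */
	return nbytes;
}
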
@@ -964,7 +1116,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
964 * @cgrp: target cgroup 1116 * @cgrp: target cgroup
965 * @subsys_mask: mask of the subsystem ids whose files should be removed 1117 * @subsys_mask: mask of the subsystem ids whose files should be removed
966 */ 1118 */
967static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) 1119static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
968{ 1120{
969 struct cgroup_subsys *ss; 1121 struct cgroup_subsys *ss;
970 int i; 1122 int i;
@@ -972,40 +1124,40 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
972 for_each_subsys(ss, i) { 1124 for_each_subsys(ss, i) {
973 struct cftype *cfts; 1125 struct cftype *cfts;
974 1126
975 if (!test_bit(i, &subsys_mask)) 1127 if (!(subsys_mask & (1 << i)))
976 continue; 1128 continue;
977 list_for_each_entry(cfts, &ss->cfts, node) 1129 list_for_each_entry(cfts, &ss->cfts, node)
978 cgroup_addrm_files(cgrp, cfts, false); 1130 cgroup_addrm_files(cgrp, cfts, false);
979 } 1131 }
980} 1132}
981 1133
982static int rebind_subsystems(struct cgroup_root *dst_root, 1134static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
983 unsigned long ss_mask)
984{ 1135{
985 struct cgroup_subsys *ss; 1136 struct cgroup_subsys *ss;
986 int ssid, ret; 1137 unsigned int tmp_ss_mask;
1138 int ssid, i, ret;
987 1139
988 lockdep_assert_held(&cgroup_tree_mutex);
989 lockdep_assert_held(&cgroup_mutex); 1140 lockdep_assert_held(&cgroup_mutex);
990 1141
991 for_each_subsys(ss, ssid) { 1142 for_each_subsys(ss, ssid) {
992 if (!(ss_mask & (1 << ssid))) 1143 if (!(ss_mask & (1 << ssid)))
993 continue; 1144 continue;
994 1145
995 /* if @ss is on the dummy_root, we can always move it */ 1146 /* if @ss has non-root csses attached to it, can't move */
996 if (ss->root == &cgrp_dfl_root) 1147 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
997 continue;
998
999 /* if @ss has non-root cgroups attached to it, can't move */
1000 if (!list_empty(&ss->root->cgrp.children))
1001 return -EBUSY; 1148 return -EBUSY;
1002 1149
1003 /* can't move between two non-dummy roots either */ 1150 /* can't move between two non-dummy roots either */
1004 if (dst_root != &cgrp_dfl_root) 1151 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
1005 return -EBUSY; 1152 return -EBUSY;
1006 } 1153 }
1007 1154
1008 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); 1155 /* skip creating root files on dfl_root for inhibited subsystems */
1156 tmp_ss_mask = ss_mask;
1157 if (dst_root == &cgrp_dfl_root)
1158 tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
1159
1160 ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
1009 if (ret) { 1161 if (ret) {
1010 if (dst_root != &cgrp_dfl_root) 1162 if (dst_root != &cgrp_dfl_root)
1011 return ret; 1163 return ret;
@@ -1017,9 +1169,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1017 * Just warn about it and continue. 1169 * Just warn about it and continue.
1018 */ 1170 */
1019 if (cgrp_dfl_root_visible) { 1171 if (cgrp_dfl_root_visible) {
1020 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", 1172 pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
1021 ret, ss_mask); 1173 ret, ss_mask);
1022 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); 1174 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1023 } 1175 }
1024 } 1176 }
1025 1177
@@ -1027,15 +1179,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1027 * Nothing can fail from this point on. Remove files for the 1179 * Nothing can fail from this point on. Remove files for the
1028 * removed subsystems and rebind each subsystem. 1180 * removed subsystems and rebind each subsystem.
1029 */ 1181 */
1030 mutex_unlock(&cgroup_mutex);
1031 for_each_subsys(ss, ssid) 1182 for_each_subsys(ss, ssid)
1032 if (ss_mask & (1 << ssid)) 1183 if (ss_mask & (1 << ssid))
1033 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); 1184 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1034 mutex_lock(&cgroup_mutex);
1035 1185
1036 for_each_subsys(ss, ssid) { 1186 for_each_subsys(ss, ssid) {
1037 struct cgroup_root *src_root; 1187 struct cgroup_root *src_root;
1038 struct cgroup_subsys_state *css; 1188 struct cgroup_subsys_state *css;
1189 struct css_set *cset;
1039 1190
1040 if (!(ss_mask & (1 << ssid))) 1191 if (!(ss_mask & (1 << ssid)))
1041 continue; 1192 continue;
@@ -1050,8 +1201,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1050 ss->root = dst_root; 1201 ss->root = dst_root;
1051 css->cgroup = &dst_root->cgrp; 1202 css->cgroup = &dst_root->cgrp;
1052 1203
1053 src_root->cgrp.subsys_mask &= ~(1 << ssid); 1204 down_write(&css_set_rwsem);
1054 dst_root->cgrp.subsys_mask |= 1 << ssid; 1205 hash_for_each(css_set_table, i, cset, hlist)
1206 list_move_tail(&cset->e_cset_node[ss->id],
1207 &dst_root->cgrp.e_csets[ss->id]);
1208 up_write(&css_set_rwsem);
1209
1210 src_root->subsys_mask &= ~(1 << ssid);
1211 src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
1212
1213 /* default hierarchy doesn't enable controllers by default */
1214 dst_root->subsys_mask |= 1 << ssid;
1215 if (dst_root != &cgrp_dfl_root)
1216 dst_root->cgrp.child_subsys_mask |= 1 << ssid;
1055 1217
1056 if (ss->bind) 1218 if (ss->bind)
1057 ss->bind(css); 1219 ss->bind(css);
@@ -1069,7 +1231,7 @@ static int cgroup_show_options(struct seq_file *seq,
1069 int ssid; 1231 int ssid;
1070 1232
1071 for_each_subsys(ss, ssid) 1233 for_each_subsys(ss, ssid)
1072 if (root->cgrp.subsys_mask & (1 << ssid)) 1234 if (root->subsys_mask & (1 << ssid))
1073 seq_printf(seq, ",%s", ss->name); 1235 seq_printf(seq, ",%s", ss->name);
1074 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1236 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1075 seq_puts(seq, ",sane_behavior"); 1237 seq_puts(seq, ",sane_behavior");
@@ -1091,8 +1253,8 @@ static int cgroup_show_options(struct seq_file *seq,
1091} 1253}
1092 1254
1093struct cgroup_sb_opts { 1255struct cgroup_sb_opts {
1094 unsigned long subsys_mask; 1256 unsigned int subsys_mask;
1095 unsigned long flags; 1257 unsigned int flags;
1096 char *release_agent; 1258 char *release_agent;
1097 bool cpuset_clone_children; 1259 bool cpuset_clone_children;
1098 char *name; 1260 char *name;
@@ -1100,24 +1262,16 @@ struct cgroup_sb_opts {
1100 bool none; 1262 bool none;
1101}; 1263};
1102 1264
1103/*
1104 * Convert a hierarchy specifier into a bitmask of subsystems and
1105 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1106 * array. This function takes refcounts on subsystems to be used, unless it
1107 * returns error, in which case no refcounts are taken.
1108 */
1109static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1265static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1110{ 1266{
1111 char *token, *o = data; 1267 char *token, *o = data;
1112 bool all_ss = false, one_ss = false; 1268 bool all_ss = false, one_ss = false;
1113 unsigned long mask = (unsigned long)-1; 1269 unsigned int mask = -1U;
1114 struct cgroup_subsys *ss; 1270 struct cgroup_subsys *ss;
1115 int i; 1271 int i;
1116 1272
1117 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1118
1119#ifdef CONFIG_CPUSETS 1273#ifdef CONFIG_CPUSETS
1120 mask = ~(1UL << cpuset_cgrp_id); 1274 mask = ~(1U << cpuset_cgrp_id);
1121#endif 1275#endif
1122 1276
1123 memset(opts, 0, sizeof(*opts)); 1277 memset(opts, 0, sizeof(*opts));
@@ -1198,7 +1352,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1198 /* Mutually exclusive option 'all' + subsystem name */ 1352 /* Mutually exclusive option 'all' + subsystem name */
1199 if (all_ss) 1353 if (all_ss)
1200 return -EINVAL; 1354 return -EINVAL;
1201 set_bit(i, &opts->subsys_mask); 1355 opts->subsys_mask |= (1 << i);
1202 one_ss = true; 1356 one_ss = true;
1203 1357
1204 break; 1358 break;
@@ -1210,12 +1364,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1210 /* Consistency checks */ 1364 /* Consistency checks */
1211 1365
1212 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1366 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1213 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1367 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1214 1368
1215 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1369 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1216 opts->cpuset_clone_children || opts->release_agent || 1370 opts->cpuset_clone_children || opts->release_agent ||
1217 opts->name) { 1371 opts->name) {
1218 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); 1372 pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1219 return -EINVAL; 1373 return -EINVAL;
1220 } 1374 }
1221 } else { 1375 } else {
@@ -1227,7 +1381,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1227 if (all_ss || (!one_ss && !opts->none && !opts->name)) 1381 if (all_ss || (!one_ss && !opts->none && !opts->name))
1228 for_each_subsys(ss, i) 1382 for_each_subsys(ss, i)
1229 if (!ss->disabled) 1383 if (!ss->disabled)
1230 set_bit(i, &opts->subsys_mask); 1384 opts->subsys_mask |= (1 << i);
1231 1385
1232 /* 1386 /*
1233 * We either have to specify by name or by subsystems. (So 1387 * We either have to specify by name or by subsystems. (So
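The hunks above move the option parser from set_bit() on an unsigned long to plain bit operations on an unsigned int subsys_mask. As a rough stand-alone illustration of that name-to-bitmask pattern (not the kernel code: the controller table, option string and error handling below are made up, and none of the kernel's consistency checks are performed):

#include <stdio.h>
#include <string.h>

/* illustrative controller table; the kernel derives this from cgroup_subsys[] */
static const char *const subsys_name[] = { "cpuset", "cpu", "memory", "io" };
#define NSUBSYS (sizeof(subsys_name) / sizeof(subsys_name[0]))

/* map a comma-separated list of controller names to a bitmask */
static int parse_subsys_mask(char *data, unsigned int *maskp)
{
	unsigned int mask = 0;
	char *tok;

	for (tok = strtok(data, ","); tok; tok = strtok(NULL, ",")) {
		unsigned int i;

		for (i = 0; i < NSUBSYS; i++) {
			if (!strcmp(tok, subsys_name[i])) {
				mask |= 1U << i;	/* same idea as opts->subsys_mask |= (1 << i) */
				break;
			}
		}
		if (i == NSUBSYS)
			return -1;		/* unknown controller name */
	}
	*maskp = mask;
	return 0;
}

int main(void)
{
	char opts[] = "cpu,memory";
	unsigned int mask;

	if (!parse_subsys_mask(opts, &mask))
		printf("subsys_mask = 0x%x\n", mask);	/* prints 0x6 */
	return 0;
}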
@@ -1258,14 +1412,13 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1258 int ret = 0; 1412 int ret = 0;
1259 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1413 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1260 struct cgroup_sb_opts opts; 1414 struct cgroup_sb_opts opts;
1261 unsigned long added_mask, removed_mask; 1415 unsigned int added_mask, removed_mask;
1262 1416
1263 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1417 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1264 pr_err("cgroup: sane_behavior: remount is not allowed\n"); 1418 pr_err("sane_behavior: remount is not allowed\n");
1265 return -EINVAL; 1419 return -EINVAL;
1266 } 1420 }
1267 1421
1268 mutex_lock(&cgroup_tree_mutex);
1269 mutex_lock(&cgroup_mutex); 1422 mutex_lock(&cgroup_mutex);
1270 1423
1271 /* See what subsystems are wanted */ 1424 /* See what subsystems are wanted */
@@ -1273,17 +1426,17 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1273 if (ret) 1426 if (ret)
1274 goto out_unlock; 1427 goto out_unlock;
1275 1428
1276 if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) 1429 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1277 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1430 pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
1278 task_tgid_nr(current), current->comm); 1431 task_tgid_nr(current), current->comm);
1279 1432
1280 added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; 1433 added_mask = opts.subsys_mask & ~root->subsys_mask;
1281 removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; 1434 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1282 1435
1283 /* Don't allow flags or name to change at remount */ 1436 /* Don't allow flags or name to change at remount */
1284 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1437 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1285 (opts.name && strcmp(opts.name, root->name))) { 1438 (opts.name && strcmp(opts.name, root->name))) {
1286 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", 1439 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
1287 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1440 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1288 root->flags & CGRP_ROOT_OPTION_MASK, root->name); 1441 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1289 ret = -EINVAL; 1442 ret = -EINVAL;
@@ -1291,7 +1444,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1291 } 1444 }
1292 1445
1293 /* remounting is not allowed for populated hierarchies */ 1446 /* remounting is not allowed for populated hierarchies */
1294 if (!list_empty(&root->cgrp.children)) { 1447 if (!list_empty(&root->cgrp.self.children)) {
1295 ret = -EBUSY; 1448 ret = -EBUSY;
1296 goto out_unlock; 1449 goto out_unlock;
1297 } 1450 }
@@ -1311,7 +1464,6 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1311 kfree(opts.release_agent); 1464 kfree(opts.release_agent);
1312 kfree(opts.name); 1465 kfree(opts.name);
1313 mutex_unlock(&cgroup_mutex); 1466 mutex_unlock(&cgroup_mutex);
1314 mutex_unlock(&cgroup_tree_mutex);
1315 return ret; 1467 return ret;
1316} 1468}
1317 1469
@@ -1369,14 +1521,22 @@ out_unlock:
1369 1521
1370static void init_cgroup_housekeeping(struct cgroup *cgrp) 1522static void init_cgroup_housekeeping(struct cgroup *cgrp)
1371{ 1523{
1372 atomic_set(&cgrp->refcnt, 1); 1524 struct cgroup_subsys *ss;
1373 INIT_LIST_HEAD(&cgrp->sibling); 1525 int ssid;
1374 INIT_LIST_HEAD(&cgrp->children); 1526
1527 INIT_LIST_HEAD(&cgrp->self.sibling);
1528 INIT_LIST_HEAD(&cgrp->self.children);
1375 INIT_LIST_HEAD(&cgrp->cset_links); 1529 INIT_LIST_HEAD(&cgrp->cset_links);
1376 INIT_LIST_HEAD(&cgrp->release_list); 1530 INIT_LIST_HEAD(&cgrp->release_list);
1377 INIT_LIST_HEAD(&cgrp->pidlists); 1531 INIT_LIST_HEAD(&cgrp->pidlists);
1378 mutex_init(&cgrp->pidlist_mutex); 1532 mutex_init(&cgrp->pidlist_mutex);
1379 cgrp->dummy_css.cgroup = cgrp; 1533 cgrp->self.cgroup = cgrp;
1534 cgrp->self.flags |= CSS_ONLINE;
1535
1536 for_each_subsys(ss, ssid)
1537 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1538
1539 init_waitqueue_head(&cgrp->offline_waitq);
1380} 1540}
1381 1541
1382static void init_cgroup_root(struct cgroup_root *root, 1542static void init_cgroup_root(struct cgroup_root *root,
@@ -1399,21 +1559,24 @@ static void init_cgroup_root(struct cgroup_root *root,
1399 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1559 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1400} 1560}
1401 1561
1402static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) 1562static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1403{ 1563{
1404 LIST_HEAD(tmp_links); 1564 LIST_HEAD(tmp_links);
1405 struct cgroup *root_cgrp = &root->cgrp; 1565 struct cgroup *root_cgrp = &root->cgrp;
1406 struct css_set *cset; 1566 struct css_set *cset;
1407 int i, ret; 1567 int i, ret;
1408 1568
1409 lockdep_assert_held(&cgroup_tree_mutex);
1410 lockdep_assert_held(&cgroup_mutex); 1569 lockdep_assert_held(&cgroup_mutex);
1411 1570
1412 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); 1571 ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
1413 if (ret < 0) 1572 if (ret < 0)
1414 goto out; 1573 goto out;
1415 root_cgrp->id = ret; 1574 root_cgrp->id = ret;
1416 1575
1576 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
1577 if (ret)
1578 goto out;
1579
1417 /* 1580 /*
1418 * We're accessing css_set_count without locking css_set_rwsem here, 1581 * We're accessing css_set_count without locking css_set_rwsem here,
1419 * but that's OK - it can only be increased by someone holding 1582 * but that's OK - it can only be increased by someone holding
@@ -1422,11 +1585,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1422 */ 1585 */
1423 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); 1586 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1424 if (ret) 1587 if (ret)
1425 goto out; 1588 goto cancel_ref;
1426 1589
1427 ret = cgroup_init_root_id(root); 1590 ret = cgroup_init_root_id(root);
1428 if (ret) 1591 if (ret)
1429 goto out; 1592 goto cancel_ref;
1430 1593
1431 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, 1594 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1432 KERNFS_ROOT_CREATE_DEACTIVATED, 1595 KERNFS_ROOT_CREATE_DEACTIVATED,
@@ -1462,7 +1625,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1462 link_css_set(&tmp_links, cset, root_cgrp); 1625 link_css_set(&tmp_links, cset, root_cgrp);
1463 up_write(&css_set_rwsem); 1626 up_write(&css_set_rwsem);
1464 1627
1465 BUG_ON(!list_empty(&root_cgrp->children)); 1628 BUG_ON(!list_empty(&root_cgrp->self.children));
1466 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 1629 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1467 1630
1468 kernfs_activate(root_cgrp->kn); 1631 kernfs_activate(root_cgrp->kn);
@@ -1474,6 +1637,8 @@ destroy_root:
1474 root->kf_root = NULL; 1637 root->kf_root = NULL;
1475exit_root_id: 1638exit_root_id:
1476 cgroup_exit_root_id(root); 1639 cgroup_exit_root_id(root);
1640cancel_ref:
1641 percpu_ref_cancel_init(&root_cgrp->self.refcnt);
1477out: 1642out:
1478 free_cgrp_cset_links(&tmp_links); 1643 free_cgrp_cset_links(&tmp_links);
1479 return ret; 1644 return ret;
@@ -1483,10 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1483 int flags, const char *unused_dev_name, 1648 int flags, const char *unused_dev_name,
1484 void *data) 1649 void *data)
1485{ 1650{
1651 struct super_block *pinned_sb = NULL;
1652 struct cgroup_subsys *ss;
1486 struct cgroup_root *root; 1653 struct cgroup_root *root;
1487 struct cgroup_sb_opts opts; 1654 struct cgroup_sb_opts opts;
1488 struct dentry *dentry; 1655 struct dentry *dentry;
1489 int ret; 1656 int ret;
1657 int i;
1490 bool new_sb; 1658 bool new_sb;
1491 1659
1492 /* 1660 /*
@@ -1495,8 +1663,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1495 */ 1663 */
1496 if (!use_task_css_set_links) 1664 if (!use_task_css_set_links)
1497 cgroup_enable_task_cg_lists(); 1665 cgroup_enable_task_cg_lists();
1498retry: 1666
1499 mutex_lock(&cgroup_tree_mutex);
1500 mutex_lock(&cgroup_mutex); 1667 mutex_lock(&cgroup_mutex);
1501 1668
1502 /* First find the desired set of subsystems */ 1669 /* First find the desired set of subsystems */
@@ -1513,6 +1680,27 @@ retry:
1513 goto out_unlock; 1680 goto out_unlock;
1514 } 1681 }
1515 1682
1683 /*
1684 * Destruction of cgroup root is asynchronous, so subsystems may
1685 * still be dying after the previous unmount. Let's drain the
1686 * dying subsystems. We just need to ensure that the ones
1687 * unmounted previously finish dying and don't care about new ones
 1688 * starting. Testing ref liveness is good enough.
1689 */
1690 for_each_subsys(ss, i) {
1691 if (!(opts.subsys_mask & (1 << i)) ||
1692 ss->root == &cgrp_dfl_root)
1693 continue;
1694
1695 if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
1696 mutex_unlock(&cgroup_mutex);
1697 msleep(10);
1698 ret = restart_syscall();
1699 goto out_free;
1700 }
1701 cgroup_put(&ss->root->cgrp);
1702 }
1703
1516 for_each_root(root) { 1704 for_each_root(root) {
1517 bool name_match = false; 1705 bool name_match = false;
1518 1706
@@ -1535,7 +1723,7 @@ retry:
1535 * subsystems) then they must match. 1723 * subsystems) then they must match.
1536 */ 1724 */
1537 if ((opts.subsys_mask || opts.none) && 1725 if ((opts.subsys_mask || opts.none) &&
1538 (opts.subsys_mask != root->cgrp.subsys_mask)) { 1726 (opts.subsys_mask != root->subsys_mask)) {
1539 if (!name_match) 1727 if (!name_match)
1540 continue; 1728 continue;
1541 ret = -EBUSY; 1729 ret = -EBUSY;
@@ -1544,28 +1732,35 @@ retry:
1544 1732
1545 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1733 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1546 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1734 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1547 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1735 pr_err("sane_behavior: new mount options should match the existing superblock\n");
1548 ret = -EINVAL; 1736 ret = -EINVAL;
1549 goto out_unlock; 1737 goto out_unlock;
1550 } else { 1738 } else {
1551 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1739 pr_warn("new mount options do not match the existing superblock, will be ignored\n");
1552 } 1740 }
1553 } 1741 }
1554 1742
1555 /* 1743 /*
1556 * A root's lifetime is governed by its root cgroup. Zero 1744 * We want to reuse @root whose lifetime is governed by its
1557 * ref indicate that the root is being destroyed. Wait for 1745 * ->cgrp. Let's check whether @root is alive and keep it
1558 * destruction to complete so that the subsystems are free. 1746 * that way. As cgroup_kill_sb() can happen anytime, we
1559 * We can use wait_queue for the wait but this path is 1747 * want to block it by pinning the sb so that @root doesn't
1560 * super cold. Let's just sleep for a bit and retry. 1748 * get killed before mount is complete.
1749 *
1750 * With the sb pinned, tryget_live can reliably indicate
1751 * whether @root can be reused. If it's being killed,
1752 * drain it. We can use wait_queue for the wait but this
1753 * path is super cold. Let's just sleep a bit and retry.
1561 */ 1754 */
1562 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { 1755 pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
1756 if (IS_ERR(pinned_sb) ||
1757 !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
1563 mutex_unlock(&cgroup_mutex); 1758 mutex_unlock(&cgroup_mutex);
1564 mutex_unlock(&cgroup_tree_mutex); 1759 if (!IS_ERR_OR_NULL(pinned_sb))
1565 kfree(opts.release_agent); 1760 deactivate_super(pinned_sb);
1566 kfree(opts.name);
1567 msleep(10); 1761 msleep(10);
1568 goto retry; 1762 ret = restart_syscall();
1763 goto out_free;
1569 } 1764 }
1570 1765
1571 ret = 0; 1766 ret = 0;
@@ -1596,17 +1791,27 @@ retry:
1596 1791
1597out_unlock: 1792out_unlock:
1598 mutex_unlock(&cgroup_mutex); 1793 mutex_unlock(&cgroup_mutex);
1599 mutex_unlock(&cgroup_tree_mutex); 1794out_free:
1600
1601 kfree(opts.release_agent); 1795 kfree(opts.release_agent);
1602 kfree(opts.name); 1796 kfree(opts.name);
1603 1797
1604 if (ret) 1798 if (ret)
1605 return ERR_PTR(ret); 1799 return ERR_PTR(ret);
1606 1800
1607 dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); 1801 dentry = kernfs_mount(fs_type, flags, root->kf_root,
1802 CGROUP_SUPER_MAGIC, &new_sb);
1608 if (IS_ERR(dentry) || !new_sb) 1803 if (IS_ERR(dentry) || !new_sb)
1609 cgroup_put(&root->cgrp); 1804 cgroup_put(&root->cgrp);
1805
1806 /*
1807 * If @pinned_sb, we're reusing an existing root and holding an
1808 * extra ref on its sb. Mount is complete. Put the extra ref.
1809 */
1810 if (pinned_sb) {
1811 WARN_ON(new_sb);
1812 deactivate_super(pinned_sb);
1813 }
1814
1610 return dentry; 1815 return dentry;
1611} 1816}
1612 1817
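The reuse path above keeps an existing root only if it can still take a live reference on it; a root whose refcount has already hit zero is being torn down, so the code drops the locks, sleeps briefly and restarts the mount. A minimal userspace sketch of that tryget-or-back-off idea with a plain C11 atomic counter standing in for percpu_ref_tryget_live() (struct and function names are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct root {
	atomic_int refcnt;	/* 0 means the root is being destroyed */
};

/* take a reference only if the root is still live (refcnt > 0) */
static bool root_tryget_live(struct root *r)
{
	int old = atomic_load(&r->refcnt);

	while (old > 0) {
		if (atomic_compare_exchange_weak(&r->refcnt, &old, old + 1))
			return true;	/* got a ref on a live root */
	}
	return false;			/* dying: caller should back off and retry */
}

static void root_put(struct root *r)
{
	atomic_fetch_sub(&r->refcnt, 1);
}

int main(void)
{
	struct root live = { .refcnt = 1 }, dying = { .refcnt = 0 };

	printf("live root:  %s\n", root_tryget_live(&live) ? "reused" : "retry later");
	printf("dying root: %s\n", root_tryget_live(&dying) ? "reused" : "retry later");
	root_put(&live);
	return 0;
}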
@@ -1615,7 +1820,19 @@ static void cgroup_kill_sb(struct super_block *sb)
1615 struct kernfs_root *kf_root = kernfs_root_from_sb(sb); 1820 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1616 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1821 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1617 1822
1618 cgroup_put(&root->cgrp); 1823 /*
1824 * If @root doesn't have any mounts or children, start killing it.
1825 * This prevents new mounts by disabling percpu_ref_tryget_live().
1826 * cgroup_mount() may wait for @root's release.
1827 *
1828 * And don't kill the default root.
1829 */
1830 if (css_has_online_children(&root->cgrp.self) ||
1831 root == &cgrp_dfl_root)
1832 cgroup_put(&root->cgrp);
1833 else
1834 percpu_ref_kill(&root->cgrp.self.refcnt);
1835
1619 kernfs_kill_sb(sb); 1836 kernfs_kill_sb(sb);
1620} 1837}
1621 1838
@@ -1737,7 +1954,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1737 1954
1738/** 1955/**
1739 * cgroup_task_migrate - move a task from one cgroup to another. 1956 * cgroup_task_migrate - move a task from one cgroup to another.
1740 * @old_cgrp; the cgroup @tsk is being migrated from 1957 * @old_cgrp: the cgroup @tsk is being migrated from
1741 * @tsk: the task being migrated 1958 * @tsk: the task being migrated
1742 * @new_cset: the new css_set @tsk is being attached to 1959 * @new_cset: the new css_set @tsk is being attached to
1743 * 1960 *
@@ -1829,10 +2046,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
1829 2046
1830 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); 2047 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1831 2048
1832 /* nothing to do if this cset already belongs to the cgroup */
1833 if (src_cgrp == dst_cgrp)
1834 return;
1835
1836 if (!list_empty(&src_cset->mg_preload_node)) 2049 if (!list_empty(&src_cset->mg_preload_node))
1837 return; 2050 return;
1838 2051
@@ -1847,13 +2060,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
1847 2060
1848/** 2061/**
1849 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration 2062 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1850 * @dst_cgrp: the destination cgroup 2063 * @dst_cgrp: the destination cgroup (may be %NULL)
1851 * @preloaded_csets: list of preloaded source css_sets 2064 * @preloaded_csets: list of preloaded source css_sets
1852 * 2065 *
1853 * Tasks are about to be moved to @dst_cgrp and all the source css_sets 2066 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
 1854 * have been preloaded to @preloaded_csets. This function looks up and 2067 * pins all destination css_sets, links each to its source, and appends them
1855 * pins all destination css_sets, links each to its source, and put them on 2068 * pins all destination css_sets, links each to its source, and append them
1856 * @preloaded_csets. 2069 * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
2070 * source css_set is assumed to be its cgroup on the default hierarchy.
1857 * 2071 *
1858 * This function must be called after cgroup_migrate_add_src() has been 2072 * This function must be called after cgroup_migrate_add_src() has been
1859 * called on each migration source css_set. After migration is performed 2073 * called on each migration source css_set. After migration is performed
@@ -1864,19 +2078,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1864 struct list_head *preloaded_csets) 2078 struct list_head *preloaded_csets)
1865{ 2079{
1866 LIST_HEAD(csets); 2080 LIST_HEAD(csets);
1867 struct css_set *src_cset; 2081 struct css_set *src_cset, *tmp_cset;
1868 2082
1869 lockdep_assert_held(&cgroup_mutex); 2083 lockdep_assert_held(&cgroup_mutex);
1870 2084
2085 /*
2086 * Except for the root, child_subsys_mask must be zero for a cgroup
2087 * with tasks so that child cgroups don't compete against tasks.
2088 */
2089 if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
2090 dst_cgrp->child_subsys_mask)
2091 return -EBUSY;
2092
1871 /* look up the dst cset for each src cset and link it to src */ 2093 /* look up the dst cset for each src cset and link it to src */
1872 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { 2094 list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
1873 struct css_set *dst_cset; 2095 struct css_set *dst_cset;
1874 2096
1875 dst_cset = find_css_set(src_cset, dst_cgrp); 2097 dst_cset = find_css_set(src_cset,
2098 dst_cgrp ?: src_cset->dfl_cgrp);
1876 if (!dst_cset) 2099 if (!dst_cset)
1877 goto err; 2100 goto err;
1878 2101
1879 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); 2102 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2103
2104 /*
2105 * If src cset equals dst, it's noop. Drop the src.
2106 * cgroup_migrate() will skip the cset too. Note that we
2107 * can't handle src == dst as some nodes are used by both.
2108 */
2109 if (src_cset == dst_cset) {
2110 src_cset->mg_src_cgrp = NULL;
2111 list_del_init(&src_cset->mg_preload_node);
2112 put_css_set(src_cset, false);
2113 put_css_set(dst_cset, false);
2114 continue;
2115 }
2116
1880 src_cset->mg_dst_cset = dst_cset; 2117 src_cset->mg_dst_cset = dst_cset;
1881 2118
1882 if (list_empty(&dst_cset->mg_preload_node)) 2119 if (list_empty(&dst_cset->mg_preload_node))
@@ -1885,7 +2122,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1885 put_css_set(dst_cset, false); 2122 put_css_set(dst_cset, false);
1886 } 2123 }
1887 2124
1888 list_splice(&csets, preloaded_csets); 2125 list_splice_tail(&csets, preloaded_csets);
1889 return 0; 2126 return 0;
1890err: 2127err:
1891 cgroup_migrate_finish(&csets); 2128 cgroup_migrate_finish(&csets);
@@ -1966,7 +2203,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1966 return 0; 2203 return 0;
1967 2204
1968 /* check that we can legitimately attach to the cgroup */ 2205 /* check that we can legitimately attach to the cgroup */
1969 for_each_css(css, i, cgrp) { 2206 for_each_e_css(css, i, cgrp) {
1970 if (css->ss->can_attach) { 2207 if (css->ss->can_attach) {
1971 ret = css->ss->can_attach(css, &tset); 2208 ret = css->ss->can_attach(css, &tset);
1972 if (ret) { 2209 if (ret) {
@@ -1996,7 +2233,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1996 */ 2233 */
1997 tset.csets = &tset.dst_csets; 2234 tset.csets = &tset.dst_csets;
1998 2235
1999 for_each_css(css, i, cgrp) 2236 for_each_e_css(css, i, cgrp)
2000 if (css->ss->attach) 2237 if (css->ss->attach)
2001 css->ss->attach(css, &tset); 2238 css->ss->attach(css, &tset);
2002 2239
@@ -2004,7 +2241,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
2004 goto out_release_tset; 2241 goto out_release_tset;
2005 2242
2006out_cancel_attach: 2243out_cancel_attach:
2007 for_each_css(css, i, cgrp) { 2244 for_each_e_css(css, i, cgrp) {
2008 if (css == failed_css) 2245 if (css == failed_css)
2009 break; 2246 break;
2010 if (css->ss->cancel_attach) 2247 if (css->ss->cancel_attach)
@@ -2063,13 +2300,20 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2063 * function to attach either it or all tasks in its threadgroup. Will lock 2300 * function to attach either it or all tasks in its threadgroup. Will lock
2064 * cgroup_mutex and threadgroup. 2301 * cgroup_mutex and threadgroup.
2065 */ 2302 */
2066static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2303static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
2304 size_t nbytes, loff_t off, bool threadgroup)
2067{ 2305{
2068 struct task_struct *tsk; 2306 struct task_struct *tsk;
2069 const struct cred *cred = current_cred(), *tcred; 2307 const struct cred *cred = current_cred(), *tcred;
2308 struct cgroup *cgrp;
2309 pid_t pid;
2070 int ret; 2310 int ret;
2071 2311
2072 if (!cgroup_lock_live_group(cgrp)) 2312 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
2313 return -EINVAL;
2314
2315 cgrp = cgroup_kn_lock_live(of->kn);
2316 if (!cgrp)
2073 return -ENODEV; 2317 return -ENODEV;
2074 2318
2075retry_find_task: 2319retry_find_task:
@@ -2135,8 +2379,8 @@ retry_find_task:
2135 2379
2136 put_task_struct(tsk); 2380 put_task_struct(tsk);
2137out_unlock_cgroup: 2381out_unlock_cgroup:
2138 mutex_unlock(&cgroup_mutex); 2382 cgroup_kn_unlock(of->kn);
2139 return ret; 2383 return ret ?: nbytes;
2140} 2384}
2141 2385
2142/** 2386/**
@@ -2170,43 +2414,44 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2170} 2414}
2171EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2415EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2172 2416
2173static int cgroup_tasks_write(struct cgroup_subsys_state *css, 2417static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
2174 struct cftype *cft, u64 pid) 2418 char *buf, size_t nbytes, loff_t off)
2175{ 2419{
2176 return attach_task_by_pid(css->cgroup, pid, false); 2420 return __cgroup_procs_write(of, buf, nbytes, off, false);
2177} 2421}
2178 2422
2179static int cgroup_procs_write(struct cgroup_subsys_state *css, 2423static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
2180 struct cftype *cft, u64 tgid) 2424 char *buf, size_t nbytes, loff_t off)
2181{ 2425{
2182 return attach_task_by_pid(css->cgroup, tgid, true); 2426 return __cgroup_procs_write(of, buf, nbytes, off, true);
2183} 2427}
2184 2428
2185static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2429static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
2186 struct cftype *cft, char *buffer) 2430 char *buf, size_t nbytes, loff_t off)
2187{ 2431{
2188 struct cgroup_root *root = css->cgroup->root; 2432 struct cgroup *cgrp;
2433
2434 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
2189 2435
2190 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); 2436 cgrp = cgroup_kn_lock_live(of->kn);
2191 if (!cgroup_lock_live_group(css->cgroup)) 2437 if (!cgrp)
2192 return -ENODEV; 2438 return -ENODEV;
2193 spin_lock(&release_agent_path_lock); 2439 spin_lock(&release_agent_path_lock);
2194 strlcpy(root->release_agent_path, buffer, 2440 strlcpy(cgrp->root->release_agent_path, strstrip(buf),
2195 sizeof(root->release_agent_path)); 2441 sizeof(cgrp->root->release_agent_path));
2196 spin_unlock(&release_agent_path_lock); 2442 spin_unlock(&release_agent_path_lock);
2197 mutex_unlock(&cgroup_mutex); 2443 cgroup_kn_unlock(of->kn);
2198 return 0; 2444 return nbytes;
2199} 2445}
2200 2446
2201static int cgroup_release_agent_show(struct seq_file *seq, void *v) 2447static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2202{ 2448{
2203 struct cgroup *cgrp = seq_css(seq)->cgroup; 2449 struct cgroup *cgrp = seq_css(seq)->cgroup;
2204 2450
2205 if (!cgroup_lock_live_group(cgrp)) 2451 spin_lock(&release_agent_path_lock);
2206 return -ENODEV;
2207 seq_puts(seq, cgrp->root->release_agent_path); 2452 seq_puts(seq, cgrp->root->release_agent_path);
2453 spin_unlock(&release_agent_path_lock);
2208 seq_putc(seq, '\n'); 2454 seq_putc(seq, '\n');
2209 mutex_unlock(&cgroup_mutex);
2210 return 0; 2455 return 0;
2211} 2456}
2212 2457
@@ -2218,6 +2463,320 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2218 return 0; 2463 return 0;
2219} 2464}
2220 2465
2466static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
2467{
2468 struct cgroup_subsys *ss;
2469 bool printed = false;
2470 int ssid;
2471
2472 for_each_subsys(ss, ssid) {
2473 if (ss_mask & (1 << ssid)) {
2474 if (printed)
2475 seq_putc(seq, ' ');
2476 seq_printf(seq, "%s", ss->name);
2477 printed = true;
2478 }
2479 }
2480 if (printed)
2481 seq_putc(seq, '\n');
2482}
2483
2484/* show controllers which are currently attached to the default hierarchy */
2485static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
2486{
2487 struct cgroup *cgrp = seq_css(seq)->cgroup;
2488
2489 cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
2490 ~cgrp_dfl_root_inhibit_ss_mask);
2491 return 0;
2492}
2493
2494/* show controllers which are enabled from the parent */
2495static int cgroup_controllers_show(struct seq_file *seq, void *v)
2496{
2497 struct cgroup *cgrp = seq_css(seq)->cgroup;
2498
2499 cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
2500 return 0;
2501}
2502
2503/* show controllers which are enabled for a given cgroup's children */
2504static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2505{
2506 struct cgroup *cgrp = seq_css(seq)->cgroup;
2507
2508 cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
2509 return 0;
2510}
2511
2512/**
2513 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
2514 * @cgrp: root of the subtree to update csses for
2515 *
2516 * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
2517 * css associations need to be updated accordingly. This function looks up
2518 * all css_sets which are attached to the subtree, creates the matching
2519 * updated css_sets and migrates the tasks to the new ones.
2520 */
2521static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2522{
2523 LIST_HEAD(preloaded_csets);
2524 struct cgroup_subsys_state *css;
2525 struct css_set *src_cset;
2526 int ret;
2527
2528 lockdep_assert_held(&cgroup_mutex);
2529
2530 /* look up all csses currently attached to @cgrp's subtree */
2531 down_read(&css_set_rwsem);
2532 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2533 struct cgrp_cset_link *link;
2534
2535 /* self is not affected by child_subsys_mask change */
2536 if (css->cgroup == cgrp)
2537 continue;
2538
2539 list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
2540 cgroup_migrate_add_src(link->cset, cgrp,
2541 &preloaded_csets);
2542 }
2543 up_read(&css_set_rwsem);
2544
2545 /* NULL dst indicates self on default hierarchy */
2546 ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2547 if (ret)
2548 goto out_finish;
2549
2550 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2551 struct task_struct *last_task = NULL, *task;
2552
2553 /* src_csets precede dst_csets, break on the first dst_cset */
2554 if (!src_cset->mg_src_cgrp)
2555 break;
2556
2557 /*
2558 * All tasks in src_cset need to be migrated to the
2559 * matching dst_cset. Empty it process by process. We
2560 * walk tasks but migrate processes. The leader might even
2561 * belong to a different cset but such src_cset would also
2562 * be among the target src_csets because the default
2563 * hierarchy enforces per-process membership.
2564 */
2565 while (true) {
2566 down_read(&css_set_rwsem);
2567 task = list_first_entry_or_null(&src_cset->tasks,
2568 struct task_struct, cg_list);
2569 if (task) {
2570 task = task->group_leader;
2571 WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2572 get_task_struct(task);
2573 }
2574 up_read(&css_set_rwsem);
2575
2576 if (!task)
2577 break;
2578
2579 /* guard against possible infinite loop */
2580 if (WARN(last_task == task,
2581 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2582 goto out_finish;
2583 last_task = task;
2584
2585 threadgroup_lock(task);
2586 /* raced against de_thread() from another thread? */
2587 if (!thread_group_leader(task)) {
2588 threadgroup_unlock(task);
2589 put_task_struct(task);
2590 continue;
2591 }
2592
2593 ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2594
2595 threadgroup_unlock(task);
2596 put_task_struct(task);
2597
2598 if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2599 goto out_finish;
2600 }
2601 }
2602
2603out_finish:
2604 cgroup_migrate_finish(&preloaded_csets);
2605 return ret;
2606}
2607
2608/* change the enabled child controllers for a cgroup in the default hierarchy */
2609static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2610 char *buf, size_t nbytes,
2611 loff_t off)
2612{
2613 unsigned int enable = 0, disable = 0;
2614 struct cgroup *cgrp, *child;
2615 struct cgroup_subsys *ss;
2616 char *tok;
2617 int ssid, ret;
2618
2619 /*
2620 * Parse input - space separated list of subsystem names prefixed
2621 * with either + or -.
2622 */
2623 buf = strstrip(buf);
2624 while ((tok = strsep(&buf, " "))) {
2625 if (tok[0] == '\0')
2626 continue;
2627 for_each_subsys(ss, ssid) {
2628 if (ss->disabled || strcmp(tok + 1, ss->name) ||
2629 ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
2630 continue;
2631
2632 if (*tok == '+') {
2633 enable |= 1 << ssid;
2634 disable &= ~(1 << ssid);
2635 } else if (*tok == '-') {
2636 disable |= 1 << ssid;
2637 enable &= ~(1 << ssid);
2638 } else {
2639 return -EINVAL;
2640 }
2641 break;
2642 }
2643 if (ssid == CGROUP_SUBSYS_COUNT)
2644 return -EINVAL;
2645 }
2646
2647 cgrp = cgroup_kn_lock_live(of->kn);
2648 if (!cgrp)
2649 return -ENODEV;
2650
2651 for_each_subsys(ss, ssid) {
2652 if (enable & (1 << ssid)) {
2653 if (cgrp->child_subsys_mask & (1 << ssid)) {
2654 enable &= ~(1 << ssid);
2655 continue;
2656 }
2657
2658 /*
2659 * Because css offlining is asynchronous, userland
2660 * might try to re-enable the same controller while
2661 * the previous instance is still around. In such
2662 * cases, wait till it's gone using offline_waitq.
2663 */
2664 cgroup_for_each_live_child(child, cgrp) {
2665 DEFINE_WAIT(wait);
2666
2667 if (!cgroup_css(child, ss))
2668 continue;
2669
2670 cgroup_get(child);
2671 prepare_to_wait(&child->offline_waitq, &wait,
2672 TASK_UNINTERRUPTIBLE);
2673 cgroup_kn_unlock(of->kn);
2674 schedule();
2675 finish_wait(&child->offline_waitq, &wait);
2676 cgroup_put(child);
2677
2678 return restart_syscall();
2679 }
2680
2681 /* unavailable or not enabled on the parent? */
2682 if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
2683 (cgroup_parent(cgrp) &&
2684 !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
2685 ret = -ENOENT;
2686 goto out_unlock;
2687 }
2688 } else if (disable & (1 << ssid)) {
2689 if (!(cgrp->child_subsys_mask & (1 << ssid))) {
2690 disable &= ~(1 << ssid);
2691 continue;
2692 }
2693
2694 /* a child has it enabled? */
2695 cgroup_for_each_live_child(child, cgrp) {
2696 if (child->child_subsys_mask & (1 << ssid)) {
2697 ret = -EBUSY;
2698 goto out_unlock;
2699 }
2700 }
2701 }
2702 }
2703
2704 if (!enable && !disable) {
2705 ret = 0;
2706 goto out_unlock;
2707 }
2708
2709 /*
2710 * Except for the root, child_subsys_mask must be zero for a cgroup
2711 * with tasks so that child cgroups don't compete against tasks.
2712 */
2713 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
2714 ret = -EBUSY;
2715 goto out_unlock;
2716 }
2717
2718 /*
2719 * Create csses for enables and update child_subsys_mask. This
2720 * changes cgroup_e_css() results which in turn makes the
2721 * subsequent cgroup_update_dfl_csses() associate all tasks in the
2722 * subtree to the updated csses.
2723 */
2724 for_each_subsys(ss, ssid) {
2725 if (!(enable & (1 << ssid)))
2726 continue;
2727
2728 cgroup_for_each_live_child(child, cgrp) {
2729 ret = create_css(child, ss);
2730 if (ret)
2731 goto err_undo_css;
2732 }
2733 }
2734
2735 cgrp->child_subsys_mask |= enable;
2736 cgrp->child_subsys_mask &= ~disable;
2737
2738 ret = cgroup_update_dfl_csses(cgrp);
2739 if (ret)
2740 goto err_undo_css;
2741
2742 /* all tasks are now migrated away from the old csses, kill them */
2743 for_each_subsys(ss, ssid) {
2744 if (!(disable & (1 << ssid)))
2745 continue;
2746
2747 cgroup_for_each_live_child(child, cgrp)
2748 kill_css(cgroup_css(child, ss));
2749 }
2750
2751 kernfs_activate(cgrp->kn);
2752 ret = 0;
2753out_unlock:
2754 cgroup_kn_unlock(of->kn);
2755 return ret ?: nbytes;
2756
2757err_undo_css:
2758 cgrp->child_subsys_mask &= ~enable;
2759 cgrp->child_subsys_mask |= disable;
2760
2761 for_each_subsys(ss, ssid) {
2762 if (!(enable & (1 << ssid)))
2763 continue;
2764
2765 cgroup_for_each_live_child(child, cgrp) {
2766 struct cgroup_subsys_state *css = cgroup_css(child, ss);
2767 if (css)
2768 kill_css(css);
2769 }
2770 }
2771 goto out_unlock;
2772}
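The writer above first tokenizes a space-separated list of "+name"/"-name" entries into enable/disable masks before touching any cgroup state. A self-contained userspace version of just that parsing step, over a made-up controller table and with none of the kernel's locking or css handling:

#include <stdio.h>
#include <string.h>

static const char *const ctrl_name[] = { "cpu", "memory", "io", "pids" };
#define NCTRL (sizeof(ctrl_name) / sizeof(ctrl_name[0]))

static int parse_subtree_control(char *buf, unsigned int *enable,
				 unsigned int *disable)
{
	char *tok;

	*enable = *disable = 0;
	for (tok = strtok(buf, " "); tok; tok = strtok(NULL, " ")) {
		unsigned int i;

		for (i = 0; i < NCTRL; i++) {
			if (strcmp(tok + 1, ctrl_name[i]))
				continue;
			if (tok[0] == '+') {		/* enable wins over a prior '-' */
				*enable |= 1U << i;
				*disable &= ~(1U << i);
			} else if (tok[0] == '-') {	/* disable wins over a prior '+' */
				*disable |= 1U << i;
				*enable &= ~(1U << i);
			} else {
				return -1;
			}
			break;
		}
		if (i == NCTRL)
			return -1;			/* unknown or malformed token */
	}
	return 0;
}

int main(void)
{
	char buf[] = "+cpu -memory +io";
	unsigned int enable, disable;

	if (!parse_subtree_control(buf, &enable, &disable))
		printf("enable=0x%x disable=0x%x\n", enable, disable);	/* 0x5 0x2 */
	return 0;
}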
2773
2774static int cgroup_populated_show(struct seq_file *seq, void *v)
2775{
2776 seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
2777 return 0;
2778}
2779
2221static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 2780static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2222 size_t nbytes, loff_t off) 2781 size_t nbytes, loff_t off)
2223{ 2782{
@@ -2226,6 +2785,9 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2226 struct cgroup_subsys_state *css; 2785 struct cgroup_subsys_state *css;
2227 int ret; 2786 int ret;
2228 2787
2788 if (cft->write)
2789 return cft->write(of, buf, nbytes, off);
2790
2229 /* 2791 /*
2230 * kernfs guarantees that a file isn't deleted with operations in 2792 * kernfs guarantees that a file isn't deleted with operations in
2231 * flight, which means that the matching css is and stays alive and 2793 * flight, which means that the matching css is and stays alive and
@@ -2236,9 +2798,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2236 css = cgroup_css(cgrp, cft->ss); 2798 css = cgroup_css(cgrp, cft->ss);
2237 rcu_read_unlock(); 2799 rcu_read_unlock();
2238 2800
2239 if (cft->write_string) { 2801 if (cft->write_u64) {
2240 ret = cft->write_string(css, cft, strstrip(buf));
2241 } else if (cft->write_u64) {
2242 unsigned long long v; 2802 unsigned long long v;
2243 ret = kstrtoull(buf, 0, &v); 2803 ret = kstrtoull(buf, 0, &v);
2244 if (!ret) 2804 if (!ret)
@@ -2248,8 +2808,6 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2248 ret = kstrtoll(buf, 0, &v); 2808 ret = kstrtoll(buf, 0, &v);
2249 if (!ret) 2809 if (!ret)
2250 ret = cft->write_s64(css, cft, v); 2810 ret = cft->write_s64(css, cft, v);
2251 } else if (cft->trigger) {
2252 ret = cft->trigger(css, (unsigned int)cft->private);
2253 } else { 2811 } else {
2254 ret = -EINVAL; 2812 ret = -EINVAL;
2255 } 2813 }
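cgroup_file_write() above now gives a cftype's ->write() method first claim on the buffer and only falls back to the numeric ->write_u64()/->write_s64() helpers; the old ->write_string() and ->trigger() paths are gone. A toy userspace version of that dispatch, with made-up handler and struct names:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct cftype {
	int (*write)(const char *buf);			/* raw buffer handler */
	int (*write_u64)(unsigned long long v);		/* parsed u64 handler */
	int (*write_s64)(long long v);			/* parsed s64 handler */
};

static int file_write(const struct cftype *cft, const char *buf)
{
	char *end;

	if (cft->write)
		return cft->write(buf);		/* ->write() takes precedence */

	errno = 0;
	if (cft->write_u64) {
		unsigned long long v = strtoull(buf, &end, 0);

		if (errno || end == buf)
			return -EINVAL;
		return cft->write_u64(v);
	}
	if (cft->write_s64) {
		long long v = strtoll(buf, &end, 0);

		if (errno || end == buf)
			return -EINVAL;
		return cft->write_s64(v);
	}
	return -EINVAL;
}

static int show_u64(unsigned long long v)
{
	printf("u64 handler got %llu\n", v);
	return 0;
}

int main(void)
{
	const struct cftype cft = { .write_u64 = show_u64 };

	return file_write(&cft, "42");
}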
@@ -2326,20 +2884,18 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2326 return -EPERM; 2884 return -EPERM;
2327 2885
2328 /* 2886 /*
2329 * We're gonna grab cgroup_tree_mutex which nests outside kernfs 2887 * We're gonna grab cgroup_mutex which nests outside kernfs
2330 * active_ref. kernfs_rename() doesn't require active_ref 2888 * active_ref. kernfs_rename() doesn't require active_ref
2331 * protection. Break them before grabbing cgroup_tree_mutex. 2889 * protection. Break them before grabbing cgroup_mutex.
2332 */ 2890 */
2333 kernfs_break_active_protection(new_parent); 2891 kernfs_break_active_protection(new_parent);
2334 kernfs_break_active_protection(kn); 2892 kernfs_break_active_protection(kn);
2335 2893
2336 mutex_lock(&cgroup_tree_mutex);
2337 mutex_lock(&cgroup_mutex); 2894 mutex_lock(&cgroup_mutex);
2338 2895
2339 ret = kernfs_rename(kn, new_parent, new_name_str); 2896 ret = kernfs_rename(kn, new_parent, new_name_str);
2340 2897
2341 mutex_unlock(&cgroup_mutex); 2898 mutex_unlock(&cgroup_mutex);
2342 mutex_unlock(&cgroup_tree_mutex);
2343 2899
2344 kernfs_unbreak_active_protection(kn); 2900 kernfs_unbreak_active_protection(kn);
2345 kernfs_unbreak_active_protection(new_parent); 2901 kernfs_unbreak_active_protection(new_parent);
@@ -2377,9 +2933,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2377 return PTR_ERR(kn); 2933 return PTR_ERR(kn);
2378 2934
2379 ret = cgroup_kn_set_ugid(kn); 2935 ret = cgroup_kn_set_ugid(kn);
2380 if (ret) 2936 if (ret) {
2381 kernfs_remove(kn); 2937 kernfs_remove(kn);
2382 return ret; 2938 return ret;
2939 }
2940
2941 if (cft->seq_show == cgroup_populated_show)
2942 cgrp->populated_kn = kn;
2943 return 0;
2383} 2944}
2384 2945
2385/** 2946/**
@@ -2399,7 +2960,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2399 struct cftype *cft; 2960 struct cftype *cft;
2400 int ret; 2961 int ret;
2401 2962
2402 lockdep_assert_held(&cgroup_tree_mutex); 2963 lockdep_assert_held(&cgroup_mutex);
2403 2964
2404 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2965 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2405 /* does cft->flags tell us to skip this file on @cgrp? */ 2966 /* does cft->flags tell us to skip this file on @cgrp? */
@@ -2407,16 +2968,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2407 continue; 2968 continue;
2408 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2969 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2409 continue; 2970 continue;
2410 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2971 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
2411 continue; 2972 continue;
2412 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2973 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
2413 continue; 2974 continue;
2414 2975
2415 if (is_add) { 2976 if (is_add) {
2416 ret = cgroup_add_file(cgrp, cft); 2977 ret = cgroup_add_file(cgrp, cft);
2417 if (ret) { 2978 if (ret) {
2418 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2979 pr_warn("%s: failed to add %s, err=%d\n",
2419 cft->name, ret); 2980 __func__, cft->name, ret);
2420 return ret; 2981 return ret;
2421 } 2982 }
2422 } else { 2983 } else {
@@ -2434,11 +2995,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2434 struct cgroup_subsys_state *css; 2995 struct cgroup_subsys_state *css;
2435 int ret = 0; 2996 int ret = 0;
2436 2997
2437 lockdep_assert_held(&cgroup_tree_mutex); 2998 lockdep_assert_held(&cgroup_mutex);
2438
2439 /* don't bother if @ss isn't attached */
2440 if (ss->root == &cgrp_dfl_root)
2441 return 0;
2442 2999
2443 /* add/rm files for all cgroups created before */ 3000 /* add/rm files for all cgroups created before */
2444 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 3001 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
@@ -2506,7 +3063,7 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2506 3063
2507static int cgroup_rm_cftypes_locked(struct cftype *cfts) 3064static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2508{ 3065{
2509 lockdep_assert_held(&cgroup_tree_mutex); 3066 lockdep_assert_held(&cgroup_mutex);
2510 3067
2511 if (!cfts || !cfts[0].ss) 3068 if (!cfts || !cfts[0].ss)
2512 return -ENOENT; 3069 return -ENOENT;
@@ -2532,9 +3089,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)
2532{ 3089{
2533 int ret; 3090 int ret;
2534 3091
2535 mutex_lock(&cgroup_tree_mutex); 3092 mutex_lock(&cgroup_mutex);
2536 ret = cgroup_rm_cftypes_locked(cfts); 3093 ret = cgroup_rm_cftypes_locked(cfts);
2537 mutex_unlock(&cgroup_tree_mutex); 3094 mutex_unlock(&cgroup_mutex);
2538 return ret; 3095 return ret;
2539} 3096}
2540 3097
@@ -2556,6 +3113,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2556{ 3113{
2557 int ret; 3114 int ret;
2558 3115
3116 if (ss->disabled)
3117 return 0;
3118
2559 if (!cfts || cfts[0].name[0] == '\0') 3119 if (!cfts || cfts[0].name[0] == '\0')
2560 return 0; 3120 return 0;
2561 3121
@@ -2563,14 +3123,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2563 if (ret) 3123 if (ret)
2564 return ret; 3124 return ret;
2565 3125
2566 mutex_lock(&cgroup_tree_mutex); 3126 mutex_lock(&cgroup_mutex);
2567 3127
2568 list_add_tail(&cfts->node, &ss->cfts); 3128 list_add_tail(&cfts->node, &ss->cfts);
2569 ret = cgroup_apply_cftypes(cfts, true); 3129 ret = cgroup_apply_cftypes(cfts, true);
2570 if (ret) 3130 if (ret)
2571 cgroup_rm_cftypes_locked(cfts); 3131 cgroup_rm_cftypes_locked(cfts);
2572 3132
2573 mutex_unlock(&cgroup_tree_mutex); 3133 mutex_unlock(&cgroup_mutex);
2574 return ret; 3134 return ret;
2575} 3135}
2576 3136
@@ -2594,57 +3154,65 @@ static int cgroup_task_count(const struct cgroup *cgrp)
2594 3154
2595/** 3155/**
2596 * css_next_child - find the next child of a given css 3156 * css_next_child - find the next child of a given css
2597 * @pos_css: the current position (%NULL to initiate traversal) 3157 * @pos: the current position (%NULL to initiate traversal)
2598 * @parent_css: css whose children to walk 3158 * @parent: css whose children to walk
2599 * 3159 *
2600 * This function returns the next child of @parent_css and should be called 3160 * This function returns the next child of @parent and should be called
2601 * under either cgroup_mutex or RCU read lock. The only requirement is 3161 * under either cgroup_mutex or RCU read lock. The only requirement is
2602 * that @parent_css and @pos_css are accessible. The next sibling is 3162 * that @parent and @pos are accessible. The next sibling is guaranteed to
2603 * guaranteed to be returned regardless of their states. 3163 * be returned regardless of their states.
3164 *
3165 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 3166 * css which finished ->css_online() is guaranteed to be visible in
3167 * future iterations and will stay visible until the last reference is put.
3168 * A css which hasn't finished ->css_online() or already finished
3169 * ->css_offline() may show up during traversal. It's each subsystem's
3170 * responsibility to synchronize against on/offlining.
2604 */ 3171 */
2605struct cgroup_subsys_state * 3172struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
2606css_next_child(struct cgroup_subsys_state *pos_css, 3173 struct cgroup_subsys_state *parent)
2607 struct cgroup_subsys_state *parent_css)
2608{ 3174{
2609 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; 3175 struct cgroup_subsys_state *next;
2610 struct cgroup *cgrp = parent_css->cgroup;
2611 struct cgroup *next;
2612 3176
2613 cgroup_assert_mutexes_or_rcu_locked(); 3177 cgroup_assert_mutex_or_rcu_locked();
2614 3178
2615 /* 3179 /*
2616 * @pos could already have been removed. Once a cgroup is removed, 3180 * @pos could already have been unlinked from the sibling list.
2617 * its ->sibling.next is no longer updated when its next sibling 3181 * Once a cgroup is removed, its ->sibling.next is no longer
2618 * changes. As CGRP_DEAD assertion is serialized and happens 3182 * updated when its next sibling changes. CSS_RELEASED is set when
2619 * before the cgroup is taken off the ->sibling list, if we see it 3183 * @pos is taken off list, at which time its next pointer is valid,
2620 * unasserted, it's guaranteed that the next sibling hasn't 3184 * and, as releases are serialized, the one pointed to by the next
2621 * finished its grace period even if it's already removed, and thus 3185 * pointer is guaranteed to not have started release yet. This
2622 * safe to dereference from this RCU critical section. If 3186 * implies that if we observe !CSS_RELEASED on @pos in this RCU
2623 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 3187 * critical section, the one pointed to by its next pointer is
2624 * to be visible as %true here. 3188 * guaranteed to not have finished its RCU grace period even if we
3189 * have dropped rcu_read_lock() inbetween iterations.
2625 * 3190 *
2626 * If @pos is dead, its next pointer can't be dereferenced; 3191 * If @pos has CSS_RELEASED set, its next pointer can't be
2627 * however, as each cgroup is given a monotonically increasing 3192 * dereferenced; however, as each css is given a monotonically
2628 * unique serial number and always appended to the sibling list, 3193 * increasing unique serial number and always appended to the
2629 * the next one can be found by walking the parent's children until 3194 * sibling list, the next one can be found by walking the parent's
2630 * we see a cgroup with higher serial number than @pos's. While 3195 * children until the first css with higher serial number than
2631 * this path can be slower, it's taken only when either the current 3196 * @pos's. While this path can be slower, it happens iff iteration
2632 * cgroup is removed or iteration and removal race. 3197 * races against release and the race window is very small.
2633 */ 3198 */
2634 if (!pos) { 3199 if (!pos) {
2635 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); 3200 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
2636 } else if (likely(!cgroup_is_dead(pos))) { 3201 } else if (likely(!(pos->flags & CSS_RELEASED))) {
2637 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3202 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
2638 } else { 3203 } else {
2639 list_for_each_entry_rcu(next, &cgrp->children, sibling) 3204 list_for_each_entry_rcu(next, &parent->children, sibling)
2640 if (next->serial_nr > pos->serial_nr) 3205 if (next->serial_nr > pos->serial_nr)
2641 break; 3206 break;
2642 } 3207 }
2643 3208
2644 if (&next->sibling == &cgrp->children) 3209 /*
2645 return NULL; 3210 * @next, if not pointing to the head, can be dereferenced and is
2646 3211 * the next sibling.
2647 return cgroup_css(next, parent_css->ss); 3212 */
3213 if (&next->sibling != &parent->children)
3214 return next;
3215 return NULL;
2648} 3216}
2649 3217
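css_next_child() above can resume iteration even when @pos has already been taken off the sibling list: serial numbers only grow and new csses are appended, so the next live sibling is simply the first one with a higher serial. A small userspace model of that resumption rule (a plain singly linked list, no RCU, no cgroups):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;	/* next sibling, NULL at end of list */
	unsigned long serial;	/* monotonically increasing at insertion */
	bool released;		/* set when taken off the list */
};

/* return the sibling after @pos, or the first child when @pos is NULL */
static struct node *next_child(struct node *head, struct node *pos)
{
	struct node *n;

	if (!pos)
		return head;
	if (!pos->released)
		return pos->next;
	/* @pos was removed: find the first sibling with a higher serial */
	for (n = head; n; n = n->next)
		if (n->serial > pos->serial)
			return n;
	return NULL;
}

int main(void)
{
	struct node c = { NULL, 3, false };
	struct node b = { &c, 2, false };
	struct node a = { &b, 1, false };
	struct node *pos = &b;

	/* pretend @b is removed while the caller still holds a pointer to it */
	a.next = &c;
	b.released = true;

	pos = next_child(&a, pos);
	printf("resumed at serial %lu\n", pos ? pos->serial : 0);	/* 3 */
	return 0;
}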
2650/** 3218/**
@@ -2660,6 +3228,13 @@ css_next_child(struct cgroup_subsys_state *pos_css,
2660 * doesn't require the whole traversal to be contained in a single critical 3228 * doesn't require the whole traversal to be contained in a single critical
2661 * section. This function will return the correct next descendant as long 3229 * section. This function will return the correct next descendant as long
2662 * as both @pos and @root are accessible and @pos is a descendant of @root. 3230 * as both @pos and @root are accessible and @pos is a descendant of @root.
3231 *
3232 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 3233 * css which finished ->css_online() is guaranteed to be visible in
3234 * future iterations and will stay visible until the last reference is put.
3235 * A css which hasn't finished ->css_online() or already finished
3236 * ->css_offline() may show up during traversal. It's each subsystem's
3237 * responsibility to synchronize against on/offlining.
2663 */ 3238 */
2664struct cgroup_subsys_state * 3239struct cgroup_subsys_state *
2665css_next_descendant_pre(struct cgroup_subsys_state *pos, 3240css_next_descendant_pre(struct cgroup_subsys_state *pos,
@@ -2667,7 +3242,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2667{ 3242{
2668 struct cgroup_subsys_state *next; 3243 struct cgroup_subsys_state *next;
2669 3244
2670 cgroup_assert_mutexes_or_rcu_locked(); 3245 cgroup_assert_mutex_or_rcu_locked();
2671 3246
2672 /* if first iteration, visit @root */ 3247 /* if first iteration, visit @root */
2673 if (!pos) 3248 if (!pos)
@@ -2680,10 +3255,10 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
2680 3255
2681 /* no child, visit my or the closest ancestor's next sibling */ 3256 /* no child, visit my or the closest ancestor's next sibling */
2682 while (pos != root) { 3257 while (pos != root) {
2683 next = css_next_child(pos, css_parent(pos)); 3258 next = css_next_child(pos, pos->parent);
2684 if (next) 3259 if (next)
2685 return next; 3260 return next;
2686 pos = css_parent(pos); 3261 pos = pos->parent;
2687 } 3262 }
2688 3263
2689 return NULL; 3264 return NULL;
@@ -2707,7 +3282,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos)
2707{ 3282{
2708 struct cgroup_subsys_state *last, *tmp; 3283 struct cgroup_subsys_state *last, *tmp;
2709 3284
2710 cgroup_assert_mutexes_or_rcu_locked(); 3285 cgroup_assert_mutex_or_rcu_locked();
2711 3286
2712 do { 3287 do {
2713 last = pos; 3288 last = pos;
@@ -2747,6 +3322,13 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos)
2747 * section. This function will return the correct next descendant as long 3322 * section. This function will return the correct next descendant as long
2748 * as both @pos and @cgroup are accessible and @pos is a descendant of 3323 * as both @pos and @cgroup are accessible and @pos is a descendant of
2749 * @cgroup. 3324 * @cgroup.
3325 *
3326 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 3327 * css which finished ->css_online() is guaranteed to be visible in
3328 * future iterations and will stay visible until the last reference is put.
3329 * A css which hasn't finished ->css_online() or already finished
3330 * ->css_offline() may show up during traversal. It's each subsystem's
3331 * responsibility to synchronize against on/offlining.
2750 */ 3332 */
2751struct cgroup_subsys_state * 3333struct cgroup_subsys_state *
2752css_next_descendant_post(struct cgroup_subsys_state *pos, 3334css_next_descendant_post(struct cgroup_subsys_state *pos,
@@ -2754,7 +3336,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2754{ 3336{
2755 struct cgroup_subsys_state *next; 3337 struct cgroup_subsys_state *next;
2756 3338
2757 cgroup_assert_mutexes_or_rcu_locked(); 3339 cgroup_assert_mutex_or_rcu_locked();
2758 3340
2759 /* if first iteration, visit leftmost descendant which may be @root */ 3341 /* if first iteration, visit leftmost descendant which may be @root */
2760 if (!pos) 3342 if (!pos)
@@ -2765,12 +3347,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2765 return NULL; 3347 return NULL;
2766 3348
2767 /* if there's an unvisited sibling, visit its leftmost descendant */ 3349 /* if there's an unvisited sibling, visit its leftmost descendant */
2768 next = css_next_child(pos, css_parent(pos)); 3350 next = css_next_child(pos, pos->parent);
2769 if (next) 3351 if (next)
2770 return css_leftmost_descendant(next); 3352 return css_leftmost_descendant(next);
2771 3353
2772 /* no sibling left, visit parent */ 3354 /* no sibling left, visit parent */
2773 return css_parent(pos); 3355 return pos->parent;
3356}
3357
3358/**
3359 * css_has_online_children - does a css have online children
3360 * @css: the target css
3361 *
3362 * Returns %true if @css has any online children; otherwise, %false. This
3363 * function can be called from any context but the caller is responsible
3364 * for synchronizing against on/offlining as necessary.
3365 */
3366bool css_has_online_children(struct cgroup_subsys_state *css)
3367{
3368 struct cgroup_subsys_state *child;
3369 bool ret = false;
3370
3371 rcu_read_lock();
3372 css_for_each_child(child, css) {
3373 if (child->flags & CSS_ONLINE) {
3374 ret = true;
3375 break;
3376 }
3377 }
3378 rcu_read_unlock();
3379 return ret;
2774} 3380}
2775 3381
2776/** 3382/**
@@ -2781,27 +3387,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
2781 */ 3387 */
2782static void css_advance_task_iter(struct css_task_iter *it) 3388static void css_advance_task_iter(struct css_task_iter *it)
2783{ 3389{
2784 struct list_head *l = it->cset_link; 3390 struct list_head *l = it->cset_pos;
2785 struct cgrp_cset_link *link; 3391 struct cgrp_cset_link *link;
2786 struct css_set *cset; 3392 struct css_set *cset;
2787 3393
2788 /* Advance to the next non-empty css_set */ 3394 /* Advance to the next non-empty css_set */
2789 do { 3395 do {
2790 l = l->next; 3396 l = l->next;
2791 if (l == &it->origin_css->cgroup->cset_links) { 3397 if (l == it->cset_head) {
2792 it->cset_link = NULL; 3398 it->cset_pos = NULL;
2793 return; 3399 return;
2794 } 3400 }
2795 link = list_entry(l, struct cgrp_cset_link, cset_link); 3401
2796 cset = link->cset; 3402 if (it->ss) {
3403 cset = container_of(l, struct css_set,
3404 e_cset_node[it->ss->id]);
3405 } else {
3406 link = list_entry(l, struct cgrp_cset_link, cset_link);
3407 cset = link->cset;
3408 }
2797 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); 3409 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2798 3410
2799 it->cset_link = l; 3411 it->cset_pos = l;
2800 3412
2801 if (!list_empty(&cset->tasks)) 3413 if (!list_empty(&cset->tasks))
2802 it->task = cset->tasks.next; 3414 it->task_pos = cset->tasks.next;
2803 else 3415 else
2804 it->task = cset->mg_tasks.next; 3416 it->task_pos = cset->mg_tasks.next;
3417
3418 it->tasks_head = &cset->tasks;
3419 it->mg_tasks_head = &cset->mg_tasks;
2805} 3420}
2806 3421
2807/** 3422/**
@@ -2827,8 +3442,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
2827 3442
2828 down_read(&css_set_rwsem); 3443 down_read(&css_set_rwsem);
2829 3444
2830 it->origin_css = css; 3445 it->ss = css->ss;
2831 it->cset_link = &css->cgroup->cset_links; 3446
3447 if (it->ss)
3448 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
3449 else
3450 it->cset_pos = &css->cgroup->cset_links;
3451
3452 it->cset_head = it->cset_pos;
2832 3453
2833 css_advance_task_iter(it); 3454 css_advance_task_iter(it);
2834} 3455}
@@ -2844,12 +3465,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
2844struct task_struct *css_task_iter_next(struct css_task_iter *it) 3465struct task_struct *css_task_iter_next(struct css_task_iter *it)
2845{ 3466{
2846 struct task_struct *res; 3467 struct task_struct *res;
2847 struct list_head *l = it->task; 3468 struct list_head *l = it->task_pos;
2848 struct cgrp_cset_link *link = list_entry(it->cset_link,
2849 struct cgrp_cset_link, cset_link);
2850 3469
2851 /* If the iterator cg is NULL, we have no tasks */ 3470 /* If the iterator cg is NULL, we have no tasks */
2852 if (!it->cset_link) 3471 if (!it->cset_pos)
2853 return NULL; 3472 return NULL;
2854 res = list_entry(l, struct task_struct, cg_list); 3473 res = list_entry(l, struct task_struct, cg_list);
2855 3474
@@ -2860,13 +3479,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
2860 */ 3479 */
2861 l = l->next; 3480 l = l->next;
2862 3481
2863 if (l == &link->cset->tasks) 3482 if (l == it->tasks_head)
2864 l = link->cset->mg_tasks.next; 3483 l = it->mg_tasks_head->next;
2865 3484
2866 if (l == &link->cset->mg_tasks) 3485 if (l == it->mg_tasks_head)
2867 css_advance_task_iter(it); 3486 css_advance_task_iter(it);
2868 else 3487 else
2869 it->task = l; 3488 it->task_pos = l;
2870 3489
2871 return res; 3490 return res;
2872} 3491}
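
The iterator rework above replaces the cgrp_cset_link bookkeeping with cset_pos/task_pos plus cached tasks_head/mg_tasks_head, so css_task_iter_next() can fall through from ->tasks to ->mg_tasks and then ask css_advance_task_iter() for the next non-empty cset. A minimal userspace model of that two-level walk — names are invented and flat arrays stand in for the kernel's list_heads:

    /* Illustrative only: csets -> tasks list, then mg_tasks list. */
    #include <stdio.h>
    #include <stddef.h>

    struct cset {
            const int *tasks;     int nr_tasks;
            const int *mg_tasks;  int nr_mg_tasks;
    };

    struct task_iter {
            const struct cset *csets;
            int nr_csets;
            int cset_idx;   /* which cset we are in              */
            int in_mg;      /* 0: walking ->tasks, 1: ->mg_tasks */
            int pos;        /* index inside the current list     */
    };

    static void iter_start(struct task_iter *it, const struct cset *csets, int n)
    {
            it->csets = csets;
            it->nr_csets = n;
            it->cset_idx = -1;
            it->in_mg = 0;
            it->pos = 0;
    }

    /* Return the next task id, or -1 when exhausted: finish ->tasks,
     * fall through to ->mg_tasks, then advance to the next cset. */
    static int iter_next(struct task_iter *it)
    {
            for (;;) {
                    if (it->cset_idx >= 0) {
                            const struct cset *c = &it->csets[it->cset_idx];

                            if (!it->in_mg && it->pos < c->nr_tasks)
                                    return c->tasks[it->pos++];
                            if (!it->in_mg) {
                                    it->in_mg = 1;
                                    it->pos = 0;
                            }
                            if (it->pos < c->nr_mg_tasks)
                                    return c->mg_tasks[it->pos++];
                    }
                    /* like css_advance_task_iter(): next cset or done */
                    if (++it->cset_idx >= it->nr_csets)
                            return -1;
                    it->in_mg = 0;
                    it->pos = 0;
            }
    }

    int main(void)
    {
            static const int t0[] = { 1, 2 }, m1[] = { 7 };
            struct cset csets[] = {
                    { t0, 2, NULL, 0 },
                    { NULL, 0, m1, 1 },
            };
            struct task_iter it;
            int id;

            iter_start(&it, csets, 2);
            while ((id = iter_next(&it)) != -1)
                    printf("task %d\n", id);
            return 0;
    }
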
@@ -2919,7 +3538,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
2919 * ->can_attach() fails. 3538 * ->can_attach() fails.
2920 */ 3539 */
2921 do { 3540 do {
2922 css_task_iter_start(&from->dummy_css, &it); 3541 css_task_iter_start(&from->self, &it);
2923 task = css_task_iter_next(&it); 3542 task = css_task_iter_next(&it);
2924 if (task) 3543 if (task)
2925 get_task_struct(task); 3544 get_task_struct(task);
@@ -3184,7 +3803,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3184 if (!array) 3803 if (!array)
3185 return -ENOMEM; 3804 return -ENOMEM;
3186 /* now, populate the array */ 3805 /* now, populate the array */
3187 css_task_iter_start(&cgrp->dummy_css, &it); 3806 css_task_iter_start(&cgrp->self, &it);
3188 while ((tsk = css_task_iter_next(&it))) { 3807 while ((tsk = css_task_iter_next(&it))) {
3189 if (unlikely(n == length)) 3808 if (unlikely(n == length))
3190 break; 3809 break;
@@ -3246,7 +3865,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3246 3865
3247 /* 3866 /*
3248 * We aren't being called from kernfs and there's no guarantee on 3867 * We aren't being called from kernfs and there's no guarantee on
3249 * @kn->priv's validity. For this and css_tryget_from_dir(), 3868 * @kn->priv's validity. For this and css_tryget_online_from_dir(),
3250 * @kn->priv is RCU safe. Let's do the RCU dancing. 3869 * @kn->priv is RCU safe. Let's do the RCU dancing.
3251 */ 3870 */
3252 rcu_read_lock(); 3871 rcu_read_lock();
@@ -3258,7 +3877,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3258 } 3877 }
3259 rcu_read_unlock(); 3878 rcu_read_unlock();
3260 3879
3261 css_task_iter_start(&cgrp->dummy_css, &it); 3880 css_task_iter_start(&cgrp->self, &it);
3262 while ((tsk = css_task_iter_next(&it))) { 3881 while ((tsk = css_task_iter_next(&it))) {
3263 switch (tsk->state) { 3882 switch (tsk->state) {
3264 case TASK_RUNNING: 3883 case TASK_RUNNING:
@@ -3388,17 +4007,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v)
3388 return seq_printf(s, "%d\n", *(int *)v); 4007 return seq_printf(s, "%d\n", *(int *)v);
3389} 4008}
3390 4009
3391/*
3392 * seq_operations functions for iterating on pidlists through seq_file -
3393 * independent of whether it's tasks or procs
3394 */
3395static const struct seq_operations cgroup_pidlist_seq_operations = {
3396 .start = cgroup_pidlist_start,
3397 .stop = cgroup_pidlist_stop,
3398 .next = cgroup_pidlist_next,
3399 .show = cgroup_pidlist_show,
3400};
3401
3402static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 4010static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3403 struct cftype *cft) 4011 struct cftype *cft)
3404{ 4012{
@@ -3440,7 +4048,7 @@ static struct cftype cgroup_base_files[] = {
3440 .seq_stop = cgroup_pidlist_stop, 4048 .seq_stop = cgroup_pidlist_stop,
3441 .seq_show = cgroup_pidlist_show, 4049 .seq_show = cgroup_pidlist_show,
3442 .private = CGROUP_FILE_PROCS, 4050 .private = CGROUP_FILE_PROCS,
3443 .write_u64 = cgroup_procs_write, 4051 .write = cgroup_procs_write,
3444 .mode = S_IRUGO | S_IWUSR, 4052 .mode = S_IRUGO | S_IWUSR,
3445 }, 4053 },
3446 { 4054 {
@@ -3454,6 +4062,27 @@ static struct cftype cgroup_base_files[] = {
3454 .flags = CFTYPE_ONLY_ON_ROOT, 4062 .flags = CFTYPE_ONLY_ON_ROOT,
3455 .seq_show = cgroup_sane_behavior_show, 4063 .seq_show = cgroup_sane_behavior_show,
3456 }, 4064 },
4065 {
4066 .name = "cgroup.controllers",
4067 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
4068 .seq_show = cgroup_root_controllers_show,
4069 },
4070 {
4071 .name = "cgroup.controllers",
4072 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4073 .seq_show = cgroup_controllers_show,
4074 },
4075 {
4076 .name = "cgroup.subtree_control",
4077 .flags = CFTYPE_ONLY_ON_DFL,
4078 .seq_show = cgroup_subtree_control_show,
4079 .write = cgroup_subtree_control_write,
4080 },
4081 {
4082 .name = "cgroup.populated",
4083 .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
4084 .seq_show = cgroup_populated_show,
4085 },
3457 4086
3458 /* 4087 /*
3459 * Historical crazy stuff. These don't have "cgroup." prefix and 4088 * Historical crazy stuff. These don't have "cgroup." prefix and
@@ -3468,7 +4097,7 @@ static struct cftype cgroup_base_files[] = {
3468 .seq_stop = cgroup_pidlist_stop, 4097 .seq_stop = cgroup_pidlist_stop,
3469 .seq_show = cgroup_pidlist_show, 4098 .seq_show = cgroup_pidlist_show,
3470 .private = CGROUP_FILE_TASKS, 4099 .private = CGROUP_FILE_TASKS,
3471 .write_u64 = cgroup_tasks_write, 4100 .write = cgroup_tasks_write,
3472 .mode = S_IRUGO | S_IWUSR, 4101 .mode = S_IRUGO | S_IWUSR,
3473 }, 4102 },
3474 { 4103 {
@@ -3481,7 +4110,7 @@ static struct cftype cgroup_base_files[] = {
3481 .name = "release_agent", 4110 .name = "release_agent",
3482 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 4111 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3483 .seq_show = cgroup_release_agent_show, 4112 .seq_show = cgroup_release_agent_show,
3484 .write_string = cgroup_release_agent_write, 4113 .write = cgroup_release_agent_write,
3485 .max_write_len = PATH_MAX - 1, 4114 .max_write_len = PATH_MAX - 1,
3486 }, 4115 },
3487 { } /* terminate */ 4116 { } /* terminate */
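
The new cgroup.controllers, cgroup.subtree_control and cgroup.populated files are only exposed on the default (unified) hierarchy. A hedged userspace sketch of poking two of them; it assumes a unified-hierarchy mount at /sys/fs/cgroup and sufficient privileges, and the paths may need adjusting on a given system:

    /* Illustrative only; the paths are assumptions, not guarantees. */
    #include <stdio.h>

    int main(void)
    {
            char buf[256];
            FILE *f;

            /* which controllers the root offers to its children */
            f = fopen("/sys/fs/cgroup/cgroup.controllers", "r");
            if (f) {
                    if (fgets(buf, sizeof(buf), f))
                            printf("controllers: %s", buf);
                    fclose(f);
            }

            /* enable the memory controller for children, if available */
            f = fopen("/sys/fs/cgroup/cgroup.subtree_control", "w");
            if (f) {
                    if (fputs("+memory", f) == EOF)
                            perror("subtree_control");
                    fclose(f);
            }
            return 0;
    }
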
@@ -3494,7 +4123,7 @@ static struct cftype cgroup_base_files[] = {
3494 * 4123 *
3495 * On failure, no file is added. 4124 * On failure, no file is added.
3496 */ 4125 */
3497static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) 4126static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
3498{ 4127{
3499 struct cgroup_subsys *ss; 4128 struct cgroup_subsys *ss;
3500 int i, ret = 0; 4129 int i, ret = 0;
@@ -3503,7 +4132,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3503 for_each_subsys(ss, i) { 4132 for_each_subsys(ss, i) {
3504 struct cftype *cfts; 4133 struct cftype *cfts;
3505 4134
3506 if (!test_bit(i, &subsys_mask)) 4135 if (!(subsys_mask & (1 << i)))
3507 continue; 4136 continue;
3508 4137
3509 list_for_each_entry(cfts, &ss->cfts, node) { 4138 list_for_each_entry(cfts, &ss->cfts, node) {
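
With subsys_mask now a plain unsigned int, the test_bit() call becomes ordinary mask arithmetic. A tiny standalone example of the same check, using a made-up subsystem id:

    #include <stdio.h>

    int main(void)
    {
            unsigned int subsys_mask = 0;
            int id = 2;                          /* made-up subsystem id */

            subsys_mask |= 1U << id;             /* enable it            */

            for (int i = 0; i < 8; i++)
                    if (subsys_mask & (1U << i)) /* the open-coded test  */
                            printf("subsys %d enabled\n", i);
            return 0;
    }
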
@@ -3525,9 +4154,9 @@ err:
3525 * Implemented in kill_css(). 4154 * Implemented in kill_css().
3526 * 4155 *
3527 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs 4156 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
3528 * and thus css_tryget() is guaranteed to fail, the css can be offlined 4157 * and thus css_tryget_online() is guaranteed to fail, the css can be
3529 * by invoking offline_css(). After offlining, the base ref is put. 4158 * offlined by invoking offline_css(). After offlining, the base ref is
3530 * Implemented in css_killed_work_fn(). 4159 * put. Implemented in css_killed_work_fn().
3531 * 4160 *
3532 * 3. When the percpu_ref reaches zero, the only possible remaining 4161 * 3. When the percpu_ref reaches zero, the only possible remaining
3533 * accessors are inside RCU read sections. css_release() schedules the 4162 * accessors are inside RCU read sections. css_release() schedules the
@@ -3546,11 +4175,37 @@ static void css_free_work_fn(struct work_struct *work)
3546 container_of(work, struct cgroup_subsys_state, destroy_work); 4175 container_of(work, struct cgroup_subsys_state, destroy_work);
3547 struct cgroup *cgrp = css->cgroup; 4176 struct cgroup *cgrp = css->cgroup;
3548 4177
3549 if (css->parent) 4178 if (css->ss) {
3550 css_put(css->parent); 4179 /* css free path */
4180 if (css->parent)
4181 css_put(css->parent);
3551 4182
3552 css->ss->css_free(css); 4183 css->ss->css_free(css);
3553 cgroup_put(cgrp); 4184 cgroup_put(cgrp);
4185 } else {
4186 /* cgroup free path */
4187 atomic_dec(&cgrp->root->nr_cgrps);
4188 cgroup_pidlist_destroy_all(cgrp);
4189
4190 if (cgroup_parent(cgrp)) {
4191 /*
4192 * We get a ref to the parent, and put the ref when
4193 * this cgroup is being freed, so it's guaranteed
4194 * that the parent won't be destroyed before its
4195 * children.
4196 */
4197 cgroup_put(cgroup_parent(cgrp));
4198 kernfs_put(cgrp->kn);
4199 kfree(cgrp);
4200 } else {
4201 /*
4202 * This is root cgroup's refcnt reaching zero,
4203 * which indicates that the root should be
4204 * released.
4205 */
4206 cgroup_destroy_root(cgrp->root);
4207 }
4208 }
3554} 4209}
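
The comment block above describes the three-step lifecycle: kill the ref so css_tryget_online() fails, offline once the kill is confirmed, free when the count reaches zero. As a rough single-counter analogy only — the kernel uses percpu_ref plus RCU grace periods, none of which appears here — a userspace model with C11 atomics and a DEAD bit:

    /* Illustrative only; every name below is invented for the sketch. */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define DEAD_BIAS (1u << 31)

    struct obj {
            atomic_uint ref;        /* low bits: count, top bit: killed */
    };

    static bool obj_tryget(struct obj *o)       /* like css_tryget_online() */
    {
            unsigned int v = atomic_load(&o->ref);

            while (!(v & DEAD_BIAS))
                    if (atomic_compare_exchange_weak(&o->ref, &v, v + 1))
                            return true;
            return false;                       /* already killed */
    }

    static void obj_free(struct obj *o)         /* step 3: last ref gone */
    {
            (void)o;
            printf("freed\n");
    }

    static void obj_put(struct obj *o)
    {
            unsigned int v = atomic_fetch_sub(&o->ref, 1) - 1;

            if ((v & ~DEAD_BIAS) == 0)
                    obj_free(o);
    }

    static void obj_kill(struct obj *o)         /* steps 1+2: mark dead, offline */
    {
            atomic_fetch_or(&o->ref, DEAD_BIAS);
            printf("offlined\n");               /* would be offline_css() */
            obj_put(o);                         /* drop the base reference */
    }

    int main(void)
    {
            struct obj o;

            atomic_init(&o.ref, 1);             /* base reference */
            if (obj_tryget(&o))
                    obj_put(&o);                /* a transient user */
            obj_kill(&o);                       /* tryget now fails */
            printf("tryget after kill: %d\n", obj_tryget(&o));
            return 0;
    }
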
3555 4210
3556static void css_free_rcu_fn(struct rcu_head *rcu_head) 4211static void css_free_rcu_fn(struct rcu_head *rcu_head)
@@ -3562,26 +4217,59 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)
3562 queue_work(cgroup_destroy_wq, &css->destroy_work); 4217 queue_work(cgroup_destroy_wq, &css->destroy_work);
3563} 4218}
3564 4219
4220static void css_release_work_fn(struct work_struct *work)
4221{
4222 struct cgroup_subsys_state *css =
4223 container_of(work, struct cgroup_subsys_state, destroy_work);
4224 struct cgroup_subsys *ss = css->ss;
4225 struct cgroup *cgrp = css->cgroup;
4226
4227 mutex_lock(&cgroup_mutex);
4228
4229 css->flags |= CSS_RELEASED;
4230 list_del_rcu(&css->sibling);
4231
4232 if (ss) {
4233 /* css release path */
4234 cgroup_idr_remove(&ss->css_idr, css->id);
4235 } else {
4236 /* cgroup release path */
4237 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4238 cgrp->id = -1;
4239 }
4240
4241 mutex_unlock(&cgroup_mutex);
4242
4243 call_rcu(&css->rcu_head, css_free_rcu_fn);
4244}
4245
3565static void css_release(struct percpu_ref *ref) 4246static void css_release(struct percpu_ref *ref)
3566{ 4247{
3567 struct cgroup_subsys_state *css = 4248 struct cgroup_subsys_state *css =
3568 container_of(ref, struct cgroup_subsys_state, refcnt); 4249 container_of(ref, struct cgroup_subsys_state, refcnt);
3569 4250
3570 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); 4251 INIT_WORK(&css->destroy_work, css_release_work_fn);
3571 call_rcu(&css->rcu_head, css_free_rcu_fn); 4252 queue_work(cgroup_destroy_wq, &css->destroy_work);
3572} 4253}
3573 4254
3574static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, 4255static void init_and_link_css(struct cgroup_subsys_state *css,
3575 struct cgroup *cgrp) 4256 struct cgroup_subsys *ss, struct cgroup *cgrp)
3576{ 4257{
4258 lockdep_assert_held(&cgroup_mutex);
4259
4260 cgroup_get(cgrp);
4261
4262 memset(css, 0, sizeof(*css));
3577 css->cgroup = cgrp; 4263 css->cgroup = cgrp;
3578 css->ss = ss; 4264 css->ss = ss;
3579 css->flags = 0; 4265 INIT_LIST_HEAD(&css->sibling);
4266 INIT_LIST_HEAD(&css->children);
4267 css->serial_nr = css_serial_nr_next++;
3580 4268
3581 if (cgrp->parent) 4269 if (cgroup_parent(cgrp)) {
3582 css->parent = cgroup_css(cgrp->parent, ss); 4270 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
3583 else 4271 css_get(css->parent);
3584 css->flags |= CSS_ROOT; 4272 }
3585 4273
3586 BUG_ON(cgroup_css(cgrp, ss)); 4274 BUG_ON(cgroup_css(cgrp, ss));
3587} 4275}
@@ -3592,14 +4280,12 @@ static int online_css(struct cgroup_subsys_state *css)
3592 struct cgroup_subsys *ss = css->ss; 4280 struct cgroup_subsys *ss = css->ss;
3593 int ret = 0; 4281 int ret = 0;
3594 4282
3595 lockdep_assert_held(&cgroup_tree_mutex);
3596 lockdep_assert_held(&cgroup_mutex); 4283 lockdep_assert_held(&cgroup_mutex);
3597 4284
3598 if (ss->css_online) 4285 if (ss->css_online)
3599 ret = ss->css_online(css); 4286 ret = ss->css_online(css);
3600 if (!ret) { 4287 if (!ret) {
3601 css->flags |= CSS_ONLINE; 4288 css->flags |= CSS_ONLINE;
3602 css->cgroup->nr_css++;
3603 rcu_assign_pointer(css->cgroup->subsys[ss->id], css); 4289 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
3604 } 4290 }
3605 return ret; 4291 return ret;
@@ -3610,7 +4296,6 @@ static void offline_css(struct cgroup_subsys_state *css)
3610{ 4296{
3611 struct cgroup_subsys *ss = css->ss; 4297 struct cgroup_subsys *ss = css->ss;
3612 4298
3613 lockdep_assert_held(&cgroup_tree_mutex);
3614 lockdep_assert_held(&cgroup_mutex); 4299 lockdep_assert_held(&cgroup_mutex);
3615 4300
3616 if (!(css->flags & CSS_ONLINE)) 4301 if (!(css->flags & CSS_ONLINE))
@@ -3620,8 +4305,9 @@ static void offline_css(struct cgroup_subsys_state *css)
3620 ss->css_offline(css); 4305 ss->css_offline(css);
3621 4306
3622 css->flags &= ~CSS_ONLINE; 4307 css->flags &= ~CSS_ONLINE;
3623 css->cgroup->nr_css--; 4308 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
3624 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); 4309
4310 wake_up_all(&css->cgroup->offline_waitq);
3625} 4311}
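
offline_css() now ends with wake_up_all(&css->cgroup->offline_waitq), letting anyone waiting for the css to go offline proceed. A pthread analogy of that wait/wake pair — a condition variable instead of a waitqueue, compile with -pthread:

    /* Illustrative only; not the kernel waitqueue API. */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t offline_waitq = PTHREAD_COND_INITIALIZER;
    static bool online = true;

    static void *waiter(void *arg)
    {
            pthread_mutex_lock(&lock);
            while (online)                          /* wait_event() equivalent */
                    pthread_cond_wait(&offline_waitq, &lock);
            pthread_mutex_unlock(&lock);
            printf("waiter %ld: css went offline\n", (long)arg);
            return NULL;
    }

    int main(void)
    {
            pthread_t t[2];

            for (long i = 0; i < 2; i++)
                    pthread_create(&t[i], NULL, waiter, (void *)i);

            sleep(1);                               /* let the waiters block */

            pthread_mutex_lock(&lock);
            online = false;                         /* offline_css() clears CSS_ONLINE */
            pthread_cond_broadcast(&offline_waitq); /* wake_up_all() */
            pthread_mutex_unlock(&lock);

            for (int i = 0; i < 2; i++)
                    pthread_join(t[i], NULL);
            return 0;
    }
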
3626 4312
3627/** 4313/**
@@ -3635,111 +4321,102 @@ static void offline_css(struct cgroup_subsys_state *css)
3635 */ 4321 */
3636static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 4322static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
3637{ 4323{
3638 struct cgroup *parent = cgrp->parent; 4324 struct cgroup *parent = cgroup_parent(cgrp);
4325 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
3639 struct cgroup_subsys_state *css; 4326 struct cgroup_subsys_state *css;
3640 int err; 4327 int err;
3641 4328
3642 lockdep_assert_held(&cgroup_mutex); 4329 lockdep_assert_held(&cgroup_mutex);
3643 4330
3644 css = ss->css_alloc(cgroup_css(parent, ss)); 4331 css = ss->css_alloc(parent_css);
3645 if (IS_ERR(css)) 4332 if (IS_ERR(css))
3646 return PTR_ERR(css); 4333 return PTR_ERR(css);
3647 4334
4335 init_and_link_css(css, ss, cgrp);
4336
3648 err = percpu_ref_init(&css->refcnt, css_release); 4337 err = percpu_ref_init(&css->refcnt, css_release);
3649 if (err) 4338 if (err)
3650 goto err_free_css; 4339 goto err_free_css;
3651 4340
3652 init_css(css, ss, cgrp); 4341 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
4342 if (err < 0)
4343 goto err_free_percpu_ref;
4344 css->id = err;
3653 4345
3654 err = cgroup_populate_dir(cgrp, 1 << ss->id); 4346 err = cgroup_populate_dir(cgrp, 1 << ss->id);
3655 if (err) 4347 if (err)
3656 goto err_free_percpu_ref; 4348 goto err_free_id;
4349
4350 /* @css is ready to be brought online now, make it visible */
4351 list_add_tail_rcu(&css->sibling, &parent_css->children);
4352 cgroup_idr_replace(&ss->css_idr, css, css->id);
3657 4353
3658 err = online_css(css); 4354 err = online_css(css);
3659 if (err) 4355 if (err)
3660 goto err_clear_dir; 4356 goto err_list_del;
3661
3662 cgroup_get(cgrp);
3663 css_get(css->parent);
3664
3665 cgrp->subsys_mask |= 1 << ss->id;
3666 4357
3667 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4358 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
3668 parent->parent) { 4359 cgroup_parent(parent)) {
3669 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 4360 pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
3670 current->comm, current->pid, ss->name); 4361 current->comm, current->pid, ss->name);
3671 if (!strcmp(ss->name, "memory")) 4362 if (!strcmp(ss->name, "memory"))
3672 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); 4363 pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
3673 ss->warned_broken_hierarchy = true; 4364 ss->warned_broken_hierarchy = true;
3674 } 4365 }
3675 4366
3676 return 0; 4367 return 0;
3677 4368
3678err_clear_dir: 4369err_list_del:
4370 list_del_rcu(&css->sibling);
3679 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 4371 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
4372err_free_id:
4373 cgroup_idr_remove(&ss->css_idr, css->id);
3680err_free_percpu_ref: 4374err_free_percpu_ref:
3681 percpu_ref_cancel_init(&css->refcnt); 4375 percpu_ref_cancel_init(&css->refcnt);
3682err_free_css: 4376err_free_css:
3683 ss->css_free(css); 4377 call_rcu(&css->rcu_head, css_free_rcu_fn);
3684 return err; 4378 return err;
3685} 4379}
3686 4380
3687/** 4381static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3688 * cgroup_create - create a cgroup 4382 umode_t mode)
3689 * @parent: cgroup that will be parent of the new cgroup
3690 * @name: name of the new cgroup
3691 * @mode: mode to set on new cgroup
3692 */
3693static long cgroup_create(struct cgroup *parent, const char *name,
3694 umode_t mode)
3695{ 4383{
3696 struct cgroup *cgrp; 4384 struct cgroup *parent, *cgrp;
3697 struct cgroup_root *root = parent->root; 4385 struct cgroup_root *root;
3698 int ssid, err;
3699 struct cgroup_subsys *ss; 4386 struct cgroup_subsys *ss;
3700 struct kernfs_node *kn; 4387 struct kernfs_node *kn;
4388 int ssid, ret;
3701 4389
3702 /* 4390 parent = cgroup_kn_lock_live(parent_kn);
3703 * XXX: The default hierarchy isn't fully implemented yet. Block 4391 if (!parent)
3704 * !root cgroup creation on it for now. 4392 return -ENODEV;
3705 */ 4393 root = parent->root;
3706 if (root == &cgrp_dfl_root)
3707 return -EINVAL;
3708 4394
3709 /* allocate the cgroup and its ID, 0 is reserved for the root */ 4395 /* allocate the cgroup and its ID, 0 is reserved for the root */
3710 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4396 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
3711 if (!cgrp) 4397 if (!cgrp) {
3712 return -ENOMEM; 4398 ret = -ENOMEM;
3713 4399 goto out_unlock;
3714 mutex_lock(&cgroup_tree_mutex);
3715
3716 /*
3717 * Only live parents can have children. Note that the liveliness
3718 * check isn't strictly necessary because cgroup_mkdir() and
3719 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
3720 * anyway so that locking is contained inside cgroup proper and we
3721 * don't get nasty surprises if we ever grow another caller.
3722 */
3723 if (!cgroup_lock_live_group(parent)) {
3724 err = -ENODEV;
3725 goto err_unlock_tree;
3726 } 4400 }
3727 4401
4402 ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
4403 if (ret)
4404 goto out_free_cgrp;
4405
3728 /* 4406 /*
3729 * Temporarily set the pointer to NULL, so idr_find() won't return 4407 * Temporarily set the pointer to NULL, so idr_find() won't return
3730 * a half-baked cgroup. 4408 * a half-baked cgroup.
3731 */ 4409 */
3732 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); 4410 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
3733 if (cgrp->id < 0) { 4411 if (cgrp->id < 0) {
3734 err = -ENOMEM; 4412 ret = -ENOMEM;
3735 goto err_unlock; 4413 goto out_cancel_ref;
3736 } 4414 }
3737 4415
3738 init_cgroup_housekeeping(cgrp); 4416 init_cgroup_housekeeping(cgrp);
3739 4417
3740 cgrp->parent = parent; 4418 cgrp->self.parent = &parent->self;
3741 cgrp->dummy_css.parent = &parent->dummy_css; 4419 cgrp->root = root;
3742 cgrp->root = parent->root;
3743 4420
3744 if (notify_on_release(parent)) 4421 if (notify_on_release(parent))
3745 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4422 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -3750,8 +4427,8 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3750 /* create the directory */ 4427 /* create the directory */
3751 kn = kernfs_create_dir(parent->kn, name, mode, cgrp); 4428 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3752 if (IS_ERR(kn)) { 4429 if (IS_ERR(kn)) {
3753 err = PTR_ERR(kn); 4430 ret = PTR_ERR(kn);
3754 goto err_free_id; 4431 goto out_free_id;
3755 } 4432 }
3756 cgrp->kn = kn; 4433 cgrp->kn = kn;
3757 4434
@@ -3761,10 +4438,10 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3761 */ 4438 */
3762 kernfs_get(kn); 4439 kernfs_get(kn);
3763 4440
3764 cgrp->serial_nr = cgroup_serial_nr_next++; 4441 cgrp->self.serial_nr = css_serial_nr_next++;
3765 4442
3766 /* allocation complete, commit to creation */ 4443 /* allocation complete, commit to creation */
3767 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4444 list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
3768 atomic_inc(&root->nr_cgrps); 4445 atomic_inc(&root->nr_cgrps);
3769 cgroup_get(parent); 4446 cgroup_get(parent);
3770 4447
@@ -3772,107 +4449,66 @@ static long cgroup_create(struct cgroup *parent, const char *name,
3772 * @cgrp is now fully operational. If something fails after this 4449 * @cgrp is now fully operational. If something fails after this
3773 * point, it'll be released via the normal destruction path. 4450 * point, it'll be released via the normal destruction path.
3774 */ 4451 */
3775 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 4452 cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
3776 4453
3777 err = cgroup_kn_set_ugid(kn); 4454 ret = cgroup_kn_set_ugid(kn);
3778 if (err) 4455 if (ret)
3779 goto err_destroy; 4456 goto out_destroy;
3780 4457
3781 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4458 ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
3782 if (err) 4459 if (ret)
3783 goto err_destroy; 4460 goto out_destroy;
3784 4461
3785 /* let's create and online css's */ 4462 /* let's create and online css's */
3786 for_each_subsys(ss, ssid) { 4463 for_each_subsys(ss, ssid) {
3787 if (root->cgrp.subsys_mask & (1 << ssid)) { 4464 if (parent->child_subsys_mask & (1 << ssid)) {
3788 err = create_css(cgrp, ss); 4465 ret = create_css(cgrp, ss);
3789 if (err) 4466 if (ret)
3790 goto err_destroy; 4467 goto out_destroy;
3791 } 4468 }
3792 } 4469 }
3793 4470
3794 kernfs_activate(kn); 4471 /*
4472 * On the default hierarchy, a child doesn't automatically inherit
4473 * child_subsys_mask from the parent. Each is configured manually.
4474 */
4475 if (!cgroup_on_dfl(cgrp))
4476 cgrp->child_subsys_mask = parent->child_subsys_mask;
3795 4477
3796 mutex_unlock(&cgroup_mutex); 4478 kernfs_activate(kn);
3797 mutex_unlock(&cgroup_tree_mutex);
3798 4479
3799 return 0; 4480 ret = 0;
4481 goto out_unlock;
3800 4482
3801err_free_id: 4483out_free_id:
3802 idr_remove(&root->cgroup_idr, cgrp->id); 4484 cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
3803err_unlock: 4485out_cancel_ref:
3804 mutex_unlock(&cgroup_mutex); 4486 percpu_ref_cancel_init(&cgrp->self.refcnt);
3805err_unlock_tree: 4487out_free_cgrp:
3806 mutex_unlock(&cgroup_tree_mutex);
3807 kfree(cgrp); 4488 kfree(cgrp);
3808 return err; 4489out_unlock:
4490 cgroup_kn_unlock(parent_kn);
4491 return ret;
3809 4492
3810err_destroy: 4493out_destroy:
3811 cgroup_destroy_locked(cgrp); 4494 cgroup_destroy_locked(cgrp);
3812 mutex_unlock(&cgroup_mutex); 4495 goto out_unlock;
3813 mutex_unlock(&cgroup_tree_mutex);
3814 return err;
3815}
3816
3817static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3818 umode_t mode)
3819{
3820 struct cgroup *parent = parent_kn->priv;
3821 int ret;
3822
3823 /*
3824 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3825 * kernfs active_ref and cgroup_create() already synchronizes
3826 * properly against removal through cgroup_lock_live_group().
3827 * Break it before calling cgroup_create().
3828 */
3829 cgroup_get(parent);
3830 kernfs_break_active_protection(parent_kn);
3831
3832 ret = cgroup_create(parent, name, mode);
3833
3834 kernfs_unbreak_active_protection(parent_kn);
3835 cgroup_put(parent);
3836 return ret;
3837} 4496}
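
The rewritten cgroup_mkdir() uses the usual goto-unwind shape: allocate in order and, on failure, jump to the label that releases only what already succeeded, with a single out_unlock exit. The same shape in a self-contained userspace function, with invented resources:

    /* Illustrative only; the resources here are just malloc()s. */
    #include <stdio.h>
    #include <stdlib.h>

    struct thing { char *name; char *buf; };

    static struct thing *make_thing(void)
    {
            struct thing *t;

            t = malloc(sizeof(*t));
            if (!t)
                    goto out;

            t->name = malloc(32);
            if (!t->name)
                    goto out_free_thing;

            t->buf = malloc(256);
            if (!t->buf)
                    goto out_free_name;

            return t;                       /* success: keep everything */

    out_free_name:
            free(t->name);
    out_free_thing:
            free(t);
    out:
            return NULL;
    }

    int main(void)
    {
            struct thing *t = make_thing();

            printf("make_thing() %s\n", t ? "succeeded" : "failed");
            if (t) {
                    free(t->buf);
                    free(t->name);
                    free(t);
            }
            return 0;
    }
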
3838 4497
3839/* 4498/*
3840 * This is called when the refcnt of a css is confirmed to be killed. 4499 * This is called when the refcnt of a css is confirmed to be killed.
3841 * css_tryget() is now guaranteed to fail. 4500 * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
4501 * initiate destruction and put the css ref from kill_css().
3842 */ 4502 */
3843static void css_killed_work_fn(struct work_struct *work) 4503static void css_killed_work_fn(struct work_struct *work)
3844{ 4504{
3845 struct cgroup_subsys_state *css = 4505 struct cgroup_subsys_state *css =
3846 container_of(work, struct cgroup_subsys_state, destroy_work); 4506 container_of(work, struct cgroup_subsys_state, destroy_work);
3847 struct cgroup *cgrp = css->cgroup;
3848 4507
3849 mutex_lock(&cgroup_tree_mutex);
3850 mutex_lock(&cgroup_mutex); 4508 mutex_lock(&cgroup_mutex);
3851
3852 /*
3853 * css_tryget() is guaranteed to fail now. Tell subsystems to
3854 * initate destruction.
3855 */
3856 offline_css(css); 4509 offline_css(css);
3857
3858 /*
3859 * If @cgrp is marked dead, it's waiting for refs of all css's to
3860 * be disabled before proceeding to the second phase of cgroup
3861 * destruction. If we are the last one, kick it off.
3862 */
3863 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
3864 cgroup_destroy_css_killed(cgrp);
3865
3866 mutex_unlock(&cgroup_mutex); 4510 mutex_unlock(&cgroup_mutex);
3867 mutex_unlock(&cgroup_tree_mutex);
3868 4511
3869 /*
3870 * Put the css refs from kill_css(). Each css holds an extra
3871 * reference to the cgroup's dentry and cgroup removal proceeds
3872 * regardless of css refs. On the last put of each css, whenever
3873 * that may be, the extra dentry ref is put so that dentry
3874 * destruction happens only after all css's are released.
3875 */
3876 css_put(css); 4512 css_put(css);
3877} 4513}
3878 4514
@@ -3886,9 +4522,18 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
3886 queue_work(cgroup_destroy_wq, &css->destroy_work); 4522 queue_work(cgroup_destroy_wq, &css->destroy_work);
3887} 4523}
3888 4524
3889static void __kill_css(struct cgroup_subsys_state *css) 4525/**
4526 * kill_css - destroy a css
4527 * @css: css to destroy
4528 *
4529 * This function initiates destruction of @css by removing cgroup interface
4530 * files and putting its base reference. ->css_offline() will be invoked
4531 * asynchronously once css_tryget_online() is guaranteed to fail and when
4532 * the reference count reaches zero, @css will be released.
4533 */
4534static void kill_css(struct cgroup_subsys_state *css)
3890{ 4535{
3891 lockdep_assert_held(&cgroup_tree_mutex); 4536 lockdep_assert_held(&cgroup_mutex);
3892 4537
3893 /* 4538 /*
3894 * This must happen before css is disassociated with its cgroup. 4539 * This must happen before css is disassociated with its cgroup.
@@ -3905,7 +4550,7 @@ static void __kill_css(struct cgroup_subsys_state *css)
3905 /* 4550 /*
3906 * cgroup core guarantees that, by the time ->css_offline() is 4551 * cgroup core guarantees that, by the time ->css_offline() is
3907 * invoked, no new css reference will be given out via 4552 * invoked, no new css reference will be given out via
3908 * css_tryget(). We can't simply call percpu_ref_kill() and 4553 * css_tryget_online(). We can't simply call percpu_ref_kill() and
3909 * proceed to offlining css's because percpu_ref_kill() doesn't 4554 * proceed to offlining css's because percpu_ref_kill() doesn't
3910 * guarantee that the ref is seen as killed on all CPUs on return. 4555 * guarantee that the ref is seen as killed on all CPUs on return.
3911 * 4556 *
@@ -3916,36 +4561,14 @@ static void __kill_css(struct cgroup_subsys_state *css)
3916} 4561}
3917 4562
3918/** 4563/**
3919 * kill_css - destroy a css
3920 * @css: css to destroy
3921 *
3922 * This function initiates destruction of @css by removing cgroup interface
3923 * files and putting its base reference. ->css_offline() will be invoked
3924 * asynchronously once css_tryget() is guaranteed to fail and when the
3925 * reference count reaches zero, @css will be released.
3926 */
3927static void kill_css(struct cgroup_subsys_state *css)
3928{
3929 struct cgroup *cgrp = css->cgroup;
3930
3931 lockdep_assert_held(&cgroup_tree_mutex);
3932
3933 /* if already killed, noop */
3934 if (cgrp->subsys_mask & (1 << css->ss->id)) {
3935 cgrp->subsys_mask &= ~(1 << css->ss->id);
3936 __kill_css(css);
3937 }
3938}
3939
3940/**
3941 * cgroup_destroy_locked - the first stage of cgroup destruction 4564 * cgroup_destroy_locked - the first stage of cgroup destruction
3942 * @cgrp: cgroup to be destroyed 4565 * @cgrp: cgroup to be destroyed
3943 * 4566 *
3944 * css's make use of percpu refcnts whose killing latency shouldn't be 4567 * css's make use of percpu refcnts whose killing latency shouldn't be
3945 * exposed to userland and are RCU protected. Also, cgroup core needs to 4568 * exposed to userland and are RCU protected. Also, cgroup core needs to
3946 * guarantee that css_tryget() won't succeed by the time ->css_offline() is 4569 * guarantee that css_tryget_online() won't succeed by the time
3947 * invoked. To satisfy all the requirements, destruction is implemented in 4570 * ->css_offline() is invoked. To satisfy all the requirements,
3948 * the following two steps. 4571 * destruction is implemented in the following two steps.
3949 * 4572 *
3950 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all 4573 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
3951 * userland visible parts and start killing the percpu refcnts of 4574 * userland visible parts and start killing the percpu refcnts of
@@ -3964,12 +4587,10 @@ static void kill_css(struct cgroup_subsys_state *css)
3964static int cgroup_destroy_locked(struct cgroup *cgrp) 4587static int cgroup_destroy_locked(struct cgroup *cgrp)
3965 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4588 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
3966{ 4589{
3967 struct cgroup *child;
3968 struct cgroup_subsys_state *css; 4590 struct cgroup_subsys_state *css;
3969 bool empty; 4591 bool empty;
3970 int ssid; 4592 int ssid;
3971 4593
3972 lockdep_assert_held(&cgroup_tree_mutex);
3973 lockdep_assert_held(&cgroup_mutex); 4594 lockdep_assert_held(&cgroup_mutex);
3974 4595
3975 /* 4596 /*
@@ -3983,127 +4604,68 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
3983 return -EBUSY; 4604 return -EBUSY;
3984 4605
3985 /* 4606 /*
3986 * Make sure there's no live children. We can't test ->children 4607 * Make sure there's no live children. We can't test emptiness of
3987 * emptiness as dead children linger on it while being destroyed; 4608 * ->self.children as dead children linger on it while being
3988 * otherwise, "rmdir parent/child parent" may fail with -EBUSY. 4609 * drained; otherwise, "rmdir parent/child parent" may fail.
3989 */ 4610 */
3990 empty = true; 4611 if (css_has_online_children(&cgrp->self))
3991 rcu_read_lock();
3992 list_for_each_entry_rcu(child, &cgrp->children, sibling) {
3993 empty = cgroup_is_dead(child);
3994 if (!empty)
3995 break;
3996 }
3997 rcu_read_unlock();
3998 if (!empty)
3999 return -EBUSY; 4612 return -EBUSY;
4000 4613
4001 /* 4614 /*
4002 * Mark @cgrp dead. This prevents further task migration and child 4615 * Mark @cgrp dead. This prevents further task migration and child
4003 * creation by disabling cgroup_lock_live_group(). Note that 4616 * creation by disabling cgroup_lock_live_group().
4004 * CGRP_DEAD assertion is depended upon by css_next_child() to
4005 * resume iteration after dropping RCU read lock. See
4006 * css_next_child() for details.
4007 */ 4617 */
4008 set_bit(CGRP_DEAD, &cgrp->flags); 4618 cgrp->self.flags &= ~CSS_ONLINE;
4009 4619
4010 /* 4620 /* initiate massacre of all css's */
4011 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4012 * will be invoked to perform the rest of destruction once the
4013 * percpu refs of all css's are confirmed to be killed. This
4014 * involves removing the subsystem's files, drop cgroup_mutex.
4015 */
4016 mutex_unlock(&cgroup_mutex);
4017 for_each_css(css, ssid, cgrp) 4621 for_each_css(css, ssid, cgrp)
4018 kill_css(css); 4622 kill_css(css);
4019 mutex_lock(&cgroup_mutex);
4020 4623
4021 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 4624 /* CSS_ONLINE is clear, remove from ->release_list for the last time */
4022 raw_spin_lock(&release_list_lock); 4625 raw_spin_lock(&release_list_lock);
4023 if (!list_empty(&cgrp->release_list)) 4626 if (!list_empty(&cgrp->release_list))
4024 list_del_init(&cgrp->release_list); 4627 list_del_init(&cgrp->release_list);
4025 raw_spin_unlock(&release_list_lock); 4628 raw_spin_unlock(&release_list_lock);
4026 4629
4027 /* 4630 /*
4028 * If @cgrp has css's attached, the second stage of cgroup 4631 * Remove @cgrp directory along with the base files. @cgrp has an
4029 * destruction is kicked off from css_killed_work_fn() after the 4632 * extra ref on its kn.
4030 * refs of all attached css's are killed. If @cgrp doesn't have
4031 * any css, we kick it off here.
4032 */ 4633 */
4033 if (!cgrp->nr_css) 4634 kernfs_remove(cgrp->kn);
4034 cgroup_destroy_css_killed(cgrp);
4035 4635
4036 /* remove @cgrp directory along with the base files */ 4636 set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
4037 mutex_unlock(&cgroup_mutex); 4637 check_for_release(cgroup_parent(cgrp));
4038
4039 /*
4040 * There are two control paths which try to determine cgroup from
4041 * dentry without going through kernfs - cgroupstats_build() and
4042 * css_tryget_from_dir(). Those are supported by RCU protecting
4043 * clearing of cgrp->kn->priv backpointer, which should happen
4044 * after all files under it have been removed.
4045 */
4046 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4047 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4048 4638
4049 mutex_lock(&cgroup_mutex); 4639 /* put the base reference */
4640 percpu_ref_kill(&cgrp->self.refcnt);
4050 4641
4051 return 0; 4642 return 0;
4052}; 4643};
4053 4644
4054/**
4055 * cgroup_destroy_css_killed - the second step of cgroup destruction
4056 * @work: cgroup->destroy_free_work
4057 *
4058 * This function is invoked from a work item for a cgroup which is being
4059 * destroyed after all css's are offlined and performs the rest of
4060 * destruction. This is the second step of destruction described in the
4061 * comment above cgroup_destroy_locked().
4062 */
4063static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4064{
4065 struct cgroup *parent = cgrp->parent;
4066
4067 lockdep_assert_held(&cgroup_tree_mutex);
4068 lockdep_assert_held(&cgroup_mutex);
4069
4070 /* delete this cgroup from parent->children */
4071 list_del_rcu(&cgrp->sibling);
4072
4073 cgroup_put(cgrp);
4074
4075 set_bit(CGRP_RELEASABLE, &parent->flags);
4076 check_for_release(parent);
4077}
4078
4079static int cgroup_rmdir(struct kernfs_node *kn) 4645static int cgroup_rmdir(struct kernfs_node *kn)
4080{ 4646{
4081 struct cgroup *cgrp = kn->priv; 4647 struct cgroup *cgrp;
4082 int ret = 0; 4648 int ret = 0;
4083 4649
4084 /* 4650 cgrp = cgroup_kn_lock_live(kn);
4085 * This is self-destruction but @kn can't be removed while this 4651 if (!cgrp)
4086 * callback is in progress. Let's break active protection. Once 4652 return 0;
4087 * the protection is broken, @cgrp can be destroyed at any point. 4653 cgroup_get(cgrp); /* for @kn->priv clearing */
4088 * Pin it so that it stays accessible.
4089 */
4090 cgroup_get(cgrp);
4091 kernfs_break_active_protection(kn);
4092 4654
4093 mutex_lock(&cgroup_tree_mutex); 4655 ret = cgroup_destroy_locked(cgrp);
4094 mutex_lock(&cgroup_mutex); 4656
4657 cgroup_kn_unlock(kn);
4095 4658
4096 /* 4659 /*
4097 * @cgrp might already have been destroyed while we're trying to 4660 * There are two control paths which try to determine cgroup from
4098 * grab the mutexes. 4661 * dentry without going through kernfs - cgroupstats_build() and
4662 * css_tryget_online_from_dir(). Those are supported by RCU
4663 * protecting clearing of cgrp->kn->priv backpointer, which should
4664 * happen after all files under it have been removed.
4099 */ 4665 */
4100 if (!cgroup_is_dead(cgrp)) 4666 if (!ret)
4101 ret = cgroup_destroy_locked(cgrp); 4667 RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
4102
4103 mutex_unlock(&cgroup_mutex);
4104 mutex_unlock(&cgroup_tree_mutex);
4105 4668
4106 kernfs_unbreak_active_protection(kn);
4107 cgroup_put(cgrp); 4669 cgroup_put(cgrp);
4108 return ret; 4670 return ret;
4109} 4671}
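
cgroup_rmdir() now clears kn->priv only after a successful destroy, so the dentry-based lookups mentioned in the comment see either a live cgroup or NULL. A minimal model of that cleared-backpointer idea; a C11 atomic pointer and a NULL check stand in for rcu_assign_pointer()/rcu_dereference():

    /* Illustrative only; no RCU, just an atomic pointer. */
    #include <stdatomic.h>
    #include <stdio.h>

    struct dir_node {
            _Atomic(void *) priv;           /* back-pointer to the owner */
    };

    struct owner {
            const char *name;
    };

    static void lookup(struct dir_node *kn)
    {
            struct owner *o = atomic_load(&kn->priv);

            if (o)
                    printf("found %s\n", o->name);
            else
                    printf("already removed\n");
    }

    int main(void)
    {
            struct owner o = { "child" };
            struct dir_node kn;

            atomic_init(&kn.priv, &o);
            lookup(&kn);                    /* before rmdir */

            atomic_store(&kn.priv, NULL);   /* rmdir clears the backpointer */
            lookup(&kn);                    /* late lookups now fail cleanly */
            return 0;
    }
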
@@ -4116,15 +4678,15 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4116 .rename = cgroup_rename, 4678 .rename = cgroup_rename,
4117}; 4679};
4118 4680
4119static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4681static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4120{ 4682{
4121 struct cgroup_subsys_state *css; 4683 struct cgroup_subsys_state *css;
4122 4684
4123 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4685 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4124 4686
4125 mutex_lock(&cgroup_tree_mutex);
4126 mutex_lock(&cgroup_mutex); 4687 mutex_lock(&cgroup_mutex);
4127 4688
4689 idr_init(&ss->css_idr);
4128 INIT_LIST_HEAD(&ss->cfts); 4690 INIT_LIST_HEAD(&ss->cfts);
4129 4691
4130 /* Create the root cgroup state for this subsystem */ 4692 /* Create the root cgroup state for this subsystem */
@@ -4132,7 +4694,21 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4132 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); 4694 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4133 /* We don't handle early failures gracefully */ 4695 /* We don't handle early failures gracefully */
4134 BUG_ON(IS_ERR(css)); 4696 BUG_ON(IS_ERR(css));
4135 init_css(css, ss, &cgrp_dfl_root.cgrp); 4697 init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
4698
4699 /*
4700 * Root csses are never destroyed and we can't initialize
4701 * percpu_ref during early init. Disable refcnting.
4702 */
4703 css->flags |= CSS_NO_REF;
4704
4705 if (early) {
4706 /* allocation can't be done safely during early init */
4707 css->id = 1;
4708 } else {
4709 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
4710 BUG_ON(css->id < 0);
4711 }
4136 4712
4137 /* Update the init_css_set to contain a subsys 4713 /* Update the init_css_set to contain a subsys
4138 * pointer to this state - since the subsystem is 4714 * pointer to this state - since the subsystem is
@@ -4149,10 +4725,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4149 4725
4150 BUG_ON(online_css(css)); 4726 BUG_ON(online_css(css));
4151 4727
4152 cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
4153
4154 mutex_unlock(&cgroup_mutex); 4728 mutex_unlock(&cgroup_mutex);
4155 mutex_unlock(&cgroup_tree_mutex);
4156} 4729}
4157 4730
4158/** 4731/**
@@ -4169,6 +4742,8 @@ int __init cgroup_init_early(void)
4169 int i; 4742 int i;
4170 4743
4171 init_cgroup_root(&cgrp_dfl_root, &opts); 4744 init_cgroup_root(&cgrp_dfl_root, &opts);
4745 cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
4746
4172 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4747 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4173 4748
4174 for_each_subsys(ss, i) { 4749 for_each_subsys(ss, i) {
@@ -4183,7 +4758,7 @@ int __init cgroup_init_early(void)
4183 ss->name = cgroup_subsys_name[i]; 4758 ss->name = cgroup_subsys_name[i];
4184 4759
4185 if (ss->early_init) 4760 if (ss->early_init)
4186 cgroup_init_subsys(ss); 4761 cgroup_init_subsys(ss, true);
4187 } 4762 }
4188 return 0; 4763 return 0;
4189} 4764}
@@ -4202,7 +4777,6 @@ int __init cgroup_init(void)
4202 4777
4203 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4778 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4204 4779
4205 mutex_lock(&cgroup_tree_mutex);
4206 mutex_lock(&cgroup_mutex); 4780 mutex_lock(&cgroup_mutex);
4207 4781
4208 /* Add init_css_set to the hash table */ 4782 /* Add init_css_set to the hash table */
@@ -4212,18 +4786,31 @@ int __init cgroup_init(void)
4212 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); 4786 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4213 4787
4214 mutex_unlock(&cgroup_mutex); 4788 mutex_unlock(&cgroup_mutex);
4215 mutex_unlock(&cgroup_tree_mutex);
4216 4789
4217 for_each_subsys(ss, ssid) { 4790 for_each_subsys(ss, ssid) {
4218 if (!ss->early_init) 4791 if (ss->early_init) {
4219 cgroup_init_subsys(ss); 4792 struct cgroup_subsys_state *css =
4793 init_css_set.subsys[ss->id];
4794
4795 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
4796 GFP_KERNEL);
4797 BUG_ON(css->id < 0);
4798 } else {
4799 cgroup_init_subsys(ss, false);
4800 }
4801
4802 list_add_tail(&init_css_set.e_cset_node[ssid],
4803 &cgrp_dfl_root.cgrp.e_csets[ssid]);
4220 4804
4221 /* 4805 /*
4222 * cftype registration needs kmalloc and can't be done 4806 * Setting dfl_root subsys_mask needs to consider the
4223 * during early_init. Register base cftypes separately. 4807 * disabled flag and cftype registration needs kmalloc,
4808 * both of which aren't available during early_init.
4224 */ 4809 */
4225 if (ss->base_cftypes) 4810 if (!ss->disabled) {
4811 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4226 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4812 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4813 }
4227 } 4814 }
4228 4815
4229 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4816 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4306,7 +4893,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
4306 4893
4307 seq_printf(m, "%d:", root->hierarchy_id); 4894 seq_printf(m, "%d:", root->hierarchy_id);
4308 for_each_subsys(ss, ssid) 4895 for_each_subsys(ss, ssid)
4309 if (root->cgrp.subsys_mask & (1 << ssid)) 4896 if (root->subsys_mask & (1 << ssid))
4310 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4897 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4311 if (strlen(root->name)) 4898 if (strlen(root->name))
4312 seq_printf(m, "%sname=%s", count ? "," : "", 4899 seq_printf(m, "%sname=%s", count ? "," : "",
@@ -4501,8 +5088,8 @@ void cgroup_exit(struct task_struct *tsk)
4501 5088
4502static void check_for_release(struct cgroup *cgrp) 5089static void check_for_release(struct cgroup *cgrp)
4503{ 5090{
4504 if (cgroup_is_releasable(cgrp) && 5091 if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
4505 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { 5092 !css_has_online_children(&cgrp->self)) {
4506 /* 5093 /*
4507 * Control Group is currently removable. If it's not 5094
4508 * already queued for a userspace notification, queue 5095 * already queued for a userspace notification, queue
@@ -4619,7 +5206,7 @@ static int __init cgroup_disable(char *str)
4619__setup("cgroup_disable=", cgroup_disable); 5206__setup("cgroup_disable=", cgroup_disable);
4620 5207
4621/** 5208/**
4622 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir 5209 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
4623 * @dentry: directory dentry of interest 5210 * @dentry: directory dentry of interest
4624 * @ss: subsystem of interest 5211 * @ss: subsystem of interest
4625 * 5212 *
@@ -4627,8 +5214,8 @@ __setup("cgroup_disable=", cgroup_disable);
4627 * to get the corresponding css and return it. If such css doesn't exist 5214 * to get the corresponding css and return it. If such css doesn't exist
4628 * or can't be pinned, an ERR_PTR value is returned. 5215 * or can't be pinned, an ERR_PTR value is returned.
4629 */ 5216 */
4630struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, 5217struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
4631 struct cgroup_subsys *ss) 5218 struct cgroup_subsys *ss)
4632{ 5219{
4633 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 5220 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4634 struct cgroup_subsys_state *css = NULL; 5221 struct cgroup_subsys_state *css = NULL;
@@ -4644,13 +5231,13 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4644 /* 5231 /*
4645 * This path doesn't originate from kernfs and @kn could already 5232 * This path doesn't originate from kernfs and @kn could already
4646 * have been or be removed at any point. @kn->priv is RCU 5233 * have been or be removed at any point. @kn->priv is RCU
4647 * protected for this access. See destroy_locked() for details. 5234 * protected for this access. See cgroup_rmdir() for details.
4648 */ 5235 */
4649 cgrp = rcu_dereference(kn->priv); 5236 cgrp = rcu_dereference(kn->priv);
4650 if (cgrp) 5237 if (cgrp)
4651 css = cgroup_css(cgrp, ss); 5238 css = cgroup_css(cgrp, ss);
4652 5239
4653 if (!css || !css_tryget(css)) 5240 if (!css || !css_tryget_online(css))
4654 css = ERR_PTR(-ENOENT); 5241 css = ERR_PTR(-ENOENT);
4655 5242
4656 rcu_read_unlock(); 5243 rcu_read_unlock();
@@ -4667,14 +5254,8 @@ struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4667 */ 5254 */
4668struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) 5255struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
4669{ 5256{
4670 struct cgroup *cgrp; 5257 WARN_ON_ONCE(!rcu_read_lock_held());
4671 5258 return idr_find(&ss->css_idr, id);
4672 cgroup_assert_mutexes_or_rcu_locked();
4673
4674 cgrp = idr_find(&ss->root->cgroup_idr, id);
4675 if (cgrp)
4676 return cgroup_css(cgrp, ss);
4677 return NULL;
4678} 5259}
4679 5260
4680#ifdef CONFIG_CGROUP_DEBUG 5261#ifdef CONFIG_CGROUP_DEBUG
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2bc4a2256444..a79e40f9d700 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -21,6 +21,7 @@
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24#include <linux/mutex.h>
24 25
25/* 26/*
26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is 27 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -42,9 +43,10 @@ enum freezer_state_flags {
42struct freezer { 43struct freezer {
43 struct cgroup_subsys_state css; 44 struct cgroup_subsys_state css;
44 unsigned int state; 45 unsigned int state;
45 spinlock_t lock;
46}; 46};
47 47
48static DEFINE_MUTEX(freezer_mutex);
49
48static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) 50static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
49{ 51{
50 return css ? container_of(css, struct freezer, css) : NULL; 52 return css ? container_of(css, struct freezer, css) : NULL;
@@ -57,7 +59,7 @@ static inline struct freezer *task_freezer(struct task_struct *task)
57 59
58static struct freezer *parent_freezer(struct freezer *freezer) 60static struct freezer *parent_freezer(struct freezer *freezer)
59{ 61{
60 return css_freezer(css_parent(&freezer->css)); 62 return css_freezer(freezer->css.parent);
61} 63}
62 64
63bool cgroup_freezing(struct task_struct *task) 65bool cgroup_freezing(struct task_struct *task)
@@ -71,10 +73,6 @@ bool cgroup_freezing(struct task_struct *task)
71 return ret; 73 return ret;
72} 74}
73 75
74/*
75 * cgroups_write_string() limits the size of freezer state strings to
76 * CGROUP_LOCAL_BUFFER_SIZE
77 */
78static const char *freezer_state_strs(unsigned int state) 76static const char *freezer_state_strs(unsigned int state)
79{ 77{
80 if (state & CGROUP_FROZEN) 78 if (state & CGROUP_FROZEN)
@@ -93,7 +91,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
93 if (!freezer) 91 if (!freezer)
94 return ERR_PTR(-ENOMEM); 92 return ERR_PTR(-ENOMEM);
95 93
96 spin_lock_init(&freezer->lock);
97 return &freezer->css; 94 return &freezer->css;
98} 95}
99 96
@@ -110,14 +107,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
110 struct freezer *freezer = css_freezer(css); 107 struct freezer *freezer = css_freezer(css);
111 struct freezer *parent = parent_freezer(freezer); 108 struct freezer *parent = parent_freezer(freezer);
112 109
113 /* 110 mutex_lock(&freezer_mutex);
114 * The following double locking and freezing state inheritance
115 * guarantee that @cgroup can never escape ancestors' freezing
116 * states. See css_for_each_descendant_pre() for details.
117 */
118 if (parent)
119 spin_lock_irq(&parent->lock);
120 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
121 111
122 freezer->state |= CGROUP_FREEZER_ONLINE; 112 freezer->state |= CGROUP_FREEZER_ONLINE;
123 113
@@ -126,10 +116,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
126 atomic_inc(&system_freezing_cnt); 116 atomic_inc(&system_freezing_cnt);
127 } 117 }
128 118
129 spin_unlock(&freezer->lock); 119 mutex_unlock(&freezer_mutex);
130 if (parent)
131 spin_unlock_irq(&parent->lock);
132
133 return 0; 120 return 0;
134} 121}
135 122
@@ -144,14 +131,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
144{ 131{
145 struct freezer *freezer = css_freezer(css); 132 struct freezer *freezer = css_freezer(css);
146 133
147 spin_lock_irq(&freezer->lock); 134 mutex_lock(&freezer_mutex);
148 135
149 if (freezer->state & CGROUP_FREEZING) 136 if (freezer->state & CGROUP_FREEZING)
150 atomic_dec(&system_freezing_cnt); 137 atomic_dec(&system_freezing_cnt);
151 138
152 freezer->state = 0; 139 freezer->state = 0;
153 140
154 spin_unlock_irq(&freezer->lock); 141 mutex_unlock(&freezer_mutex);
155} 142}
156 143
157static void freezer_css_free(struct cgroup_subsys_state *css) 144static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -175,7 +162,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
175 struct task_struct *task; 162 struct task_struct *task;
176 bool clear_frozen = false; 163 bool clear_frozen = false;
177 164
178 spin_lock_irq(&freezer->lock); 165 mutex_lock(&freezer_mutex);
179 166
180 /* 167 /*
181 * Make the new tasks conform to the current state of @new_css. 168 * Make the new tasks conform to the current state of @new_css.
@@ -197,21 +184,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
197 } 184 }
198 } 185 }
199 186
200 spin_unlock_irq(&freezer->lock); 187 /* propagate FROZEN clearing upwards */
201
202 /*
203 * Propagate FROZEN clearing upwards. We may race with
204 * update_if_frozen(), but as long as both work bottom-up, either
205 * update_if_frozen() sees child's FROZEN cleared or we clear the
206 * parent's FROZEN later. No parent w/ !FROZEN children can be
207 * left FROZEN.
208 */
209 while (clear_frozen && (freezer = parent_freezer(freezer))) { 188 while (clear_frozen && (freezer = parent_freezer(freezer))) {
210 spin_lock_irq(&freezer->lock);
211 freezer->state &= ~CGROUP_FROZEN; 189 freezer->state &= ~CGROUP_FROZEN;
212 clear_frozen = freezer->state & CGROUP_FREEZING; 190 clear_frozen = freezer->state & CGROUP_FREEZING;
213 spin_unlock_irq(&freezer->lock);
214 } 191 }
192
193 mutex_unlock(&freezer_mutex);
215} 194}
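
With the per-freezer spinlocks gone, freezer_attach() can propagate the FROZEN clearing up the ancestors in one pass under freezer_mutex. A pthread sketch of that single-lock upward walk — the FREEZING/FROZEN bits below are simplified stand-ins, compile with -pthread:

    /* Illustrative only; not the kernel freezer state machine. */
    #include <pthread.h>
    #include <stdio.h>

    #define FREEZING 0x1
    #define FROZEN   0x2

    struct freezer {
            unsigned int state;
            struct freezer *parent;
    };

    static pthread_mutex_t freezer_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* A thawed task arrived in @f: clear FROZEN here and keep walking
     * up while the ancestor was itself still freezing. */
    static void clear_frozen_upwards(struct freezer *f)
    {
            pthread_mutex_lock(&freezer_mutex);
            for (; f; f = f->parent) {
                    f->state &= ~FROZEN;
                    if (!(f->state & FREEZING))
                            break;
            }
            pthread_mutex_unlock(&freezer_mutex);
    }

    int main(void)
    {
            struct freezer root = { FREEZING | FROZEN, NULL };
            struct freezer child = { FREEZING | FROZEN, &root };

            clear_frozen_upwards(&child);
            printf("child: %#x root: %#x\n", child.state, root.state);
            return 0;
    }
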
216 195
217/** 196/**
@@ -228,9 +207,6 @@ static void freezer_fork(struct task_struct *task)
228{ 207{
229 struct freezer *freezer; 208 struct freezer *freezer;
230 209
231 rcu_read_lock();
232 freezer = task_freezer(task);
233
234 /* 210 /*
235 * The root cgroup is non-freezable, so we can skip locking the 211 * The root cgroup is non-freezable, so we can skip locking the
236 * freezer. This is safe regardless of race with task migration. 212 * freezer. This is safe regardless of race with task migration.
@@ -238,24 +214,18 @@ static void freezer_fork(struct task_struct *task)
238 * to do. If we lost and root is the new cgroup, noop is still the 214 * to do. If we lost and root is the new cgroup, noop is still the
239 * right thing to do. 215 * right thing to do.
240 */ 216 */
241 if (!parent_freezer(freezer)) 217 if (task_css_is_root(task, freezer_cgrp_id))
242 goto out; 218 return;
243 219
244 /* 220 mutex_lock(&freezer_mutex);
245 * Grab @freezer->lock and freeze @task after verifying @task still 221 rcu_read_lock();
246 * belongs to @freezer and it's freezing. The former is for the 222
247 * case where we have raced against task migration and lost and 223 freezer = task_freezer(task);
248 * @task is already in a different cgroup which may not be frozen. 224 if (freezer->state & CGROUP_FREEZING)
249 * This isn't strictly necessary as freeze_task() is allowed to be
250 * called spuriously but let's do it anyway for, if nothing else,
251 * documentation.
252 */
253 spin_lock_irq(&freezer->lock);
254 if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
255 freeze_task(task); 225 freeze_task(task);
256 spin_unlock_irq(&freezer->lock); 226
257out:
258 rcu_read_unlock(); 227 rcu_read_unlock();
228 mutex_unlock(&freezer_mutex);
259} 229}
260 230
261/** 231/**
@@ -281,22 +251,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
281 struct css_task_iter it; 251 struct css_task_iter it;
282 struct task_struct *task; 252 struct task_struct *task;
283 253
284 WARN_ON_ONCE(!rcu_read_lock_held()); 254 lockdep_assert_held(&freezer_mutex);
285
286 spin_lock_irq(&freezer->lock);
287 255
288 if (!(freezer->state & CGROUP_FREEZING) || 256 if (!(freezer->state & CGROUP_FREEZING) ||
289 (freezer->state & CGROUP_FROZEN)) 257 (freezer->state & CGROUP_FROZEN))
290 goto out_unlock; 258 return;
291 259
292 /* are all (live) children frozen? */ 260 /* are all (live) children frozen? */
261 rcu_read_lock();
293 css_for_each_child(pos, css) { 262 css_for_each_child(pos, css) {
294 struct freezer *child = css_freezer(pos); 263 struct freezer *child = css_freezer(pos);
295 264
296 if ((child->state & CGROUP_FREEZER_ONLINE) && 265 if ((child->state & CGROUP_FREEZER_ONLINE) &&
297 !(child->state & CGROUP_FROZEN)) 266 !(child->state & CGROUP_FROZEN)) {
298 goto out_unlock; 267 rcu_read_unlock();
268 return;
269 }
299 } 270 }
271 rcu_read_unlock();
300 272
301 /* are all tasks frozen? */ 273 /* are all tasks frozen? */
302 css_task_iter_start(css, &it); 274 css_task_iter_start(css, &it);
@@ -317,21 +289,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
317 freezer->state |= CGROUP_FROZEN; 289 freezer->state |= CGROUP_FROZEN;
318out_iter_end: 290out_iter_end:
319 css_task_iter_end(&it); 291 css_task_iter_end(&it);
320out_unlock:
321 spin_unlock_irq(&freezer->lock);
322} 292}
323 293
324static int freezer_read(struct seq_file *m, void *v) 294static int freezer_read(struct seq_file *m, void *v)
325{ 295{
326 struct cgroup_subsys_state *css = seq_css(m), *pos; 296 struct cgroup_subsys_state *css = seq_css(m), *pos;
327 297
298 mutex_lock(&freezer_mutex);
328 rcu_read_lock(); 299 rcu_read_lock();
329 300
330 /* update states bottom-up */ 301 /* update states bottom-up */
331 css_for_each_descendant_post(pos, css) 302 css_for_each_descendant_post(pos, css) {
303 if (!css_tryget_online(pos))
304 continue;
305 rcu_read_unlock();
306
332 update_if_frozen(pos); 307 update_if_frozen(pos);
333 308
309 rcu_read_lock();
310 css_put(pos);
311 }
312
334 rcu_read_unlock(); 313 rcu_read_unlock();
314 mutex_unlock(&freezer_mutex);
335 315
336 seq_puts(m, freezer_state_strs(css_freezer(css)->state)); 316 seq_puts(m, freezer_state_strs(css_freezer(css)->state));
337 seq_putc(m, '\n'); 317 seq_putc(m, '\n');
@@ -373,7 +353,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
373 unsigned int state) 353 unsigned int state)
374{ 354{
375 /* also synchronizes against task migration, see freezer_attach() */ 355 /* also synchronizes against task migration, see freezer_attach() */
376 lockdep_assert_held(&freezer->lock); 356 lockdep_assert_held(&freezer_mutex);
377 357
378 if (!(freezer->state & CGROUP_FREEZER_ONLINE)) 358 if (!(freezer->state & CGROUP_FREEZER_ONLINE))
379 return; 359 return;
@@ -414,47 +394,47 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
414 * descendant will try to inherit its parent's FREEZING state as 394 * descendant will try to inherit its parent's FREEZING state as
415 * CGROUP_FREEZING_PARENT. 395 * CGROUP_FREEZING_PARENT.
416 */ 396 */
397 mutex_lock(&freezer_mutex);
417 rcu_read_lock(); 398 rcu_read_lock();
418 css_for_each_descendant_pre(pos, &freezer->css) { 399 css_for_each_descendant_pre(pos, &freezer->css) {
419 struct freezer *pos_f = css_freezer(pos); 400 struct freezer *pos_f = css_freezer(pos);
420 struct freezer *parent = parent_freezer(pos_f); 401 struct freezer *parent = parent_freezer(pos_f);
421 402
422 spin_lock_irq(&pos_f->lock); 403 if (!css_tryget_online(pos))
404 continue;
405 rcu_read_unlock();
423 406
424 if (pos_f == freezer) { 407 if (pos_f == freezer)
425 freezer_apply_state(pos_f, freeze, 408 freezer_apply_state(pos_f, freeze,
426 CGROUP_FREEZING_SELF); 409 CGROUP_FREEZING_SELF);
427 } else { 410 else
428 /*
429 * Our update to @parent->state is already visible
430 * which is all we need. No need to lock @parent.
431 * For more info on synchronization, see
432 * freezer_post_create().
433 */
434 freezer_apply_state(pos_f, 411 freezer_apply_state(pos_f,
435 parent->state & CGROUP_FREEZING, 412 parent->state & CGROUP_FREEZING,
436 CGROUP_FREEZING_PARENT); 413 CGROUP_FREEZING_PARENT);
437 }
438 414
439 spin_unlock_irq(&pos_f->lock); 415 rcu_read_lock();
416 css_put(pos);
440 } 417 }
441 rcu_read_unlock(); 418 rcu_read_unlock();
419 mutex_unlock(&freezer_mutex);
442} 420}
443 421
444static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, 422static ssize_t freezer_write(struct kernfs_open_file *of,
445 char *buffer) 423 char *buf, size_t nbytes, loff_t off)
446{ 424{
447 bool freeze; 425 bool freeze;
448 426
449 if (strcmp(buffer, freezer_state_strs(0)) == 0) 427 buf = strstrip(buf);
428
429 if (strcmp(buf, freezer_state_strs(0)) == 0)
450 freeze = false; 430 freeze = false;
451 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) 431 else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
452 freeze = true; 432 freeze = true;
453 else 433 else
454 return -EINVAL; 434 return -EINVAL;
455 435
456 freezer_change_state(css_freezer(css), freeze); 436 freezer_change_state(css_freezer(of_css(of)), freeze);
457 return 0; 437 return nbytes;
458} 438}
459 439
460static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, 440static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
@@ -478,7 +458,7 @@ static struct cftype files[] = {
478 .name = "state", 458 .name = "state",
479 .flags = CFTYPE_NOT_ON_ROOT, 459 .flags = CFTYPE_NOT_ON_ROOT,
480 .seq_show = freezer_read, 460 .seq_show = freezer_read,
481 .write_string = freezer_write, 461 .write = freezer_write,
482 }, 462 },
483 { 463 {
484 .name = "self_freezing", 464 .name = "self_freezing",
diff --git a/kernel/compat.c b/kernel/compat.c
index e40b0430b562..633394f442f8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -157,7 +157,7 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
157int compat_get_timeval(struct timeval *tv, const void __user *utv) 157int compat_get_timeval(struct timeval *tv, const void __user *utv)
158{ 158{
159 if (COMPAT_USE_64BIT_TIME) 159 if (COMPAT_USE_64BIT_TIME)
160 return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; 160 return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
161 else 161 else
162 return __compat_get_timeval(tv, utv); 162 return __compat_get_timeval(tv, utv);
163} 163}
@@ -166,7 +166,7 @@ EXPORT_SYMBOL_GPL(compat_get_timeval);
166int compat_put_timeval(const struct timeval *tv, void __user *utv) 166int compat_put_timeval(const struct timeval *tv, void __user *utv)
167{ 167{
168 if (COMPAT_USE_64BIT_TIME) 168 if (COMPAT_USE_64BIT_TIME)
169 return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; 169 return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
170 else 170 else
171 return __compat_put_timeval(tv, utv); 171 return __compat_put_timeval(tv, utv);
172} 172}
@@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(compat_put_timeval);
175int compat_get_timespec(struct timespec *ts, const void __user *uts) 175int compat_get_timespec(struct timespec *ts, const void __user *uts)
176{ 176{
177 if (COMPAT_USE_64BIT_TIME) 177 if (COMPAT_USE_64BIT_TIME)
178 return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; 178 return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
179 else 179 else
180 return __compat_get_timespec(ts, uts); 180 return __compat_get_timespec(ts, uts);
181} 181}
@@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(compat_get_timespec);
184int compat_put_timespec(const struct timespec *ts, void __user *uts) 184int compat_put_timespec(const struct timespec *ts, void __user *uts)
185{ 185{
186 if (COMPAT_USE_64BIT_TIME) 186 if (COMPAT_USE_64BIT_TIME)
187 return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; 187 return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
188 else 188 else
189 return __compat_put_timespec(ts, uts); 189 return __compat_put_timespec(ts, uts);
190} 190}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 019d45008448..5664985c46a0 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -19,6 +19,7 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/hardirq.h> 20#include <linux/hardirq.h>
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/kprobes.h>
22 23
23#define CREATE_TRACE_POINTS 24#define CREATE_TRACE_POINTS
24#include <trace/events/context_tracking.h> 25#include <trace/events/context_tracking.h>
@@ -104,6 +105,7 @@ void context_tracking_user_enter(void)
104 } 105 }
105 local_irq_restore(flags); 106 local_irq_restore(flags);
106} 107}
108NOKPROBE_SYMBOL(context_tracking_user_enter);
107 109
108#ifdef CONFIG_PREEMPT 110#ifdef CONFIG_PREEMPT
109/** 111/**
@@ -181,6 +183,7 @@ void context_tracking_user_exit(void)
181 } 183 }
182 local_irq_restore(flags); 184 local_irq_restore(flags);
183} 185}
186NOKPROBE_SYMBOL(context_tracking_user_exit);
184 187
185/** 188/**
186 * __context_tracking_task_switch - context switch the syscall callbacks 189 * __context_tracking_task_switch - context switch the syscall callbacks
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a9e710eef0e2..a343bde710b1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@
20#include <linux/gfp.h> 20#include <linux/gfp.h>
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <linux/lockdep.h> 22#include <linux/lockdep.h>
23#include <trace/events/power.h>
23 24
24#include "smpboot.h" 25#include "smpboot.h"
25 26
@@ -283,8 +284,7 @@ static inline void check_for_tasks(int cpu)
283 task_cputime(p, &utime, &stime); 284 task_cputime(p, &utime, &stime);
284 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 285 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
285 (utime || stime)) 286 (utime || stime))
286 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 287 pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
287 "(state = %ld, flags = %x)\n",
288 p->comm, task_pid_nr(p), cpu, 288 p->comm, task_pid_nr(p), cpu,
289 p->state, p->flags); 289 p->state, p->flags);
290 } 290 }
@@ -336,8 +336,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
336 if (err) { 336 if (err) {
337 nr_calls--; 337 nr_calls--;
338 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); 338 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
339 printk("%s: attempt to take down CPU %u failed\n", 339 pr_warn("%s: attempt to take down CPU %u failed\n",
340 __func__, cpu); 340 __func__, cpu);
341 goto out_release; 341 goto out_release;
342 } 342 }
343 343
@@ -444,8 +444,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
444 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); 444 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
445 if (ret) { 445 if (ret) {
446 nr_calls--; 446 nr_calls--;
447 printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", 447 pr_warn("%s: attempt to bring up CPU %u failed\n",
448 __func__, cpu); 448 __func__, cpu);
449 goto out_notify; 449 goto out_notify;
450 } 450 }
451 451
@@ -475,11 +475,10 @@ int cpu_up(unsigned int cpu)
475 int err = 0; 475 int err = 0;
476 476
477 if (!cpu_possible(cpu)) { 477 if (!cpu_possible(cpu)) {
478 printk(KERN_ERR "can't online cpu %d because it is not " 478 pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
479 "configured as may-hotadd at boot time\n", cpu); 479 cpu);
480#if defined(CONFIG_IA64) 480#if defined(CONFIG_IA64)
481 printk(KERN_ERR "please check additional_cpus= boot " 481 pr_err("please check additional_cpus= boot parameter\n");
482 "parameter\n");
483#endif 482#endif
484 return -EINVAL; 483 return -EINVAL;
485 } 484 }
@@ -518,16 +517,17 @@ int disable_nonboot_cpus(void)
518 */ 517 */
519 cpumask_clear(frozen_cpus); 518 cpumask_clear(frozen_cpus);
520 519
521 printk("Disabling non-boot CPUs ...\n"); 520 pr_info("Disabling non-boot CPUs ...\n");
522 for_each_online_cpu(cpu) { 521 for_each_online_cpu(cpu) {
523 if (cpu == first_cpu) 522 if (cpu == first_cpu)
524 continue; 523 continue;
524 trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
525 error = _cpu_down(cpu, 1); 525 error = _cpu_down(cpu, 1);
526 trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
526 if (!error) 527 if (!error)
527 cpumask_set_cpu(cpu, frozen_cpus); 528 cpumask_set_cpu(cpu, frozen_cpus);
528 else { 529 else {
529 printk(KERN_ERR "Error taking CPU%d down: %d\n", 530 pr_err("Error taking CPU%d down: %d\n", cpu, error);
530 cpu, error);
531 break; 531 break;
532 } 532 }
533 } 533 }
@@ -537,7 +537,7 @@ int disable_nonboot_cpus(void)
537 /* Make sure the CPUs won't be enabled by someone else */ 537 /* Make sure the CPUs won't be enabled by someone else */
538 cpu_hotplug_disabled = 1; 538 cpu_hotplug_disabled = 1;
539 } else { 539 } else {
540 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 540 pr_err("Non-boot CPUs are not disabled\n");
541 } 541 }
542 cpu_maps_update_done(); 542 cpu_maps_update_done();
543 return error; 543 return error;
@@ -561,17 +561,19 @@ void __ref enable_nonboot_cpus(void)
561 if (cpumask_empty(frozen_cpus)) 561 if (cpumask_empty(frozen_cpus))
562 goto out; 562 goto out;
563 563
564 printk(KERN_INFO "Enabling non-boot CPUs ...\n"); 564 pr_info("Enabling non-boot CPUs ...\n");
565 565
566 arch_enable_nonboot_cpus_begin(); 566 arch_enable_nonboot_cpus_begin();
567 567
568 for_each_cpu(cpu, frozen_cpus) { 568 for_each_cpu(cpu, frozen_cpus) {
569 trace_suspend_resume(TPS("CPU_ON"), cpu, true);
569 error = _cpu_up(cpu, 1); 570 error = _cpu_up(cpu, 1);
571 trace_suspend_resume(TPS("CPU_ON"), cpu, false);
570 if (!error) { 572 if (!error) {
571 printk(KERN_INFO "CPU%d is up\n", cpu); 573 pr_info("CPU%d is up\n", cpu);
572 continue; 574 continue;
573 } 575 }
574 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 576 pr_warn("Error taking CPU%d up: %d\n", cpu, error);
575 } 577 }
576 578
577 arch_enable_nonboot_cpus_end(); 579 arch_enable_nonboot_cpus_end();
@@ -726,10 +728,12 @@ void set_cpu_present(unsigned int cpu, bool present)
726 728
727void set_cpu_online(unsigned int cpu, bool online) 729void set_cpu_online(unsigned int cpu, bool online)
728{ 730{
729 if (online) 731 if (online) {
730 cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); 732 cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
731 else 733 cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
734 } else {
732 cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); 735 cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
736 }
733} 737}
734 738
735void set_cpu_active(unsigned int cpu, bool active) 739void set_cpu_active(unsigned int cpu, bool active)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3d54c418bd06..116a4164720a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,12 +61,7 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62#include <linux/wait.h> 62#include <linux/wait.h>
63 63
64/* 64struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
65 * Tracks how many cpusets are currently defined in system.
66 * When there is only one cpuset (the root cpuset) we can
67 * short circuit some hooks.
68 */
69int number_of_cpusets __read_mostly;
70 65
71/* See "Frequency meter" comments, below. */ 66/* See "Frequency meter" comments, below. */
72 67
@@ -124,7 +119,7 @@ static inline struct cpuset *task_cs(struct task_struct *task)
124 119
125static inline struct cpuset *parent_cs(struct cpuset *cs) 120static inline struct cpuset *parent_cs(struct cpuset *cs)
126{ 121{
127 return css_cs(css_parent(&cs->css)); 122 return css_cs(cs->css.parent);
128} 123}
129 124
130#ifdef CONFIG_NUMA 125#ifdef CONFIG_NUMA
@@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
611 goto done; 606 goto done;
612 } 607 }
613 608
614 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 609 csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
615 if (!csa) 610 if (!csa)
616 goto done; 611 goto done;
617 csn = 0; 612 csn = 0;
@@ -696,11 +691,8 @@ restart:
696 if (nslot == ndoms) { 691 if (nslot == ndoms) {
697 static int warnings = 10; 692 static int warnings = 10;
698 if (warnings) { 693 if (warnings) {
699 printk(KERN_WARNING 694 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
700 "rebuild_sched_domains confused:" 695 nslot, ndoms, csn, i, apn);
701 " nslot %d, ndoms %d, csn %d, i %d,"
702 " apn %d\n",
703 nslot, ndoms, csn, i, apn);
704 warnings--; 696 warnings--;
705 } 697 }
706 continue; 698 continue;
@@ -875,7 +867,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
875 continue; 867 continue;
876 } 868 }
877 } 869 }
878 if (!css_tryget(&cp->css)) 870 if (!css_tryget_online(&cp->css))
879 continue; 871 continue;
880 rcu_read_unlock(); 872 rcu_read_unlock();
881 873
@@ -890,6 +882,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
890/** 882/**
891 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 883 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
892 * @cs: the cpuset to consider 884 * @cs: the cpuset to consider
885 * @trialcs: trial cpuset
893 * @buf: buffer of cpu numbers written to this cpuset 886 * @buf: buffer of cpu numbers written to this cpuset
894 */ 887 */
895static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 888static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
@@ -1110,7 +1103,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
1110 continue; 1103 continue;
1111 } 1104 }
1112 } 1105 }
1113 if (!css_tryget(&cp->css)) 1106 if (!css_tryget_online(&cp->css))
1114 continue; 1107 continue;
1115 rcu_read_unlock(); 1108 rcu_read_unlock();
1116 1109
@@ -1188,7 +1181,13 @@ done:
1188 1181
1189int current_cpuset_is_being_rebound(void) 1182int current_cpuset_is_being_rebound(void)
1190{ 1183{
1191 return task_cs(current) == cpuset_being_rebound; 1184 int ret;
1185
1186 rcu_read_lock();
1187 ret = task_cs(current) == cpuset_being_rebound;
1188 rcu_read_unlock();
1189
1190 return ret;
1192} 1191}
1193 1192
1194static int update_relax_domain_level(struct cpuset *cs, s64 val) 1193static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1605,13 +1604,15 @@ out_unlock:
1605/* 1604/*
1606 * Common handling for a write to a "cpus" or "mems" file. 1605 * Common handling for a write to a "cpus" or "mems" file.
1607 */ 1606 */
1608static int cpuset_write_resmask(struct cgroup_subsys_state *css, 1607static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
1609 struct cftype *cft, char *buf) 1608 char *buf, size_t nbytes, loff_t off)
1610{ 1609{
1611 struct cpuset *cs = css_cs(css); 1610 struct cpuset *cs = css_cs(of_css(of));
1612 struct cpuset *trialcs; 1611 struct cpuset *trialcs;
1613 int retval = -ENODEV; 1612 int retval = -ENODEV;
1614 1613
1614 buf = strstrip(buf);
1615
1615 /* 1616 /*
1616 * CPU or memory hotunplug may leave @cs w/o any execution 1617 * CPU or memory hotunplug may leave @cs w/o any execution
1617 * resources, in which case the hotplug code asynchronously updates 1618 * resources, in which case the hotplug code asynchronously updates
@@ -1622,7 +1623,17 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1622 * resources, wait for the previously scheduled operations before 1623 * resources, wait for the previously scheduled operations before
1623 * proceeding, so that we don't end up keep removing tasks added 1624 * proceeding, so that we don't end up keep removing tasks added
1624 * after execution capability is restored. 1625 * after execution capability is restored.
1626 *
1627 * cpuset_hotplug_work calls back into cgroup core via
1628 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
1629 * operation like this one can lead to a deadlock through kernfs
1630 * active_ref protection. Let's break the protection. Losing the
1631 * protection is okay as we check whether @cs is online after
1632 * grabbing cpuset_mutex anyway. This only happens on the legacy
1633 * hierarchies.
1625 */ 1634 */
1635 css_get(&cs->css);
1636 kernfs_break_active_protection(of->kn);
1626 flush_work(&cpuset_hotplug_work); 1637 flush_work(&cpuset_hotplug_work);
1627 1638
1628 mutex_lock(&cpuset_mutex); 1639 mutex_lock(&cpuset_mutex);
@@ -1635,7 +1646,7 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1635 goto out_unlock; 1646 goto out_unlock;
1636 } 1647 }
1637 1648
1638 switch (cft->private) { 1649 switch (of_cft(of)->private) {
1639 case FILE_CPULIST: 1650 case FILE_CPULIST:
1640 retval = update_cpumask(cs, trialcs, buf); 1651 retval = update_cpumask(cs, trialcs, buf);
1641 break; 1652 break;
@@ -1650,7 +1661,9 @@ static int cpuset_write_resmask(struct cgroup_subsys_state *css,
1650 free_trial_cpuset(trialcs); 1661 free_trial_cpuset(trialcs);
1651out_unlock: 1662out_unlock:
1652 mutex_unlock(&cpuset_mutex); 1663 mutex_unlock(&cpuset_mutex);
1653 return retval; 1664 kernfs_unbreak_active_protection(of->kn);
1665 css_put(&cs->css);
1666 return retval ?: nbytes;
1654} 1667}
1655 1668
1656/* 1669/*
@@ -1752,7 +1765,7 @@ static struct cftype files[] = {
1752 { 1765 {
1753 .name = "cpus", 1766 .name = "cpus",
1754 .seq_show = cpuset_common_seq_show, 1767 .seq_show = cpuset_common_seq_show,
1755 .write_string = cpuset_write_resmask, 1768 .write = cpuset_write_resmask,
1756 .max_write_len = (100U + 6 * NR_CPUS), 1769 .max_write_len = (100U + 6 * NR_CPUS),
1757 .private = FILE_CPULIST, 1770 .private = FILE_CPULIST,
1758 }, 1771 },
@@ -1760,7 +1773,7 @@ static struct cftype files[] = {
1760 { 1773 {
1761 .name = "mems", 1774 .name = "mems",
1762 .seq_show = cpuset_common_seq_show, 1775 .seq_show = cpuset_common_seq_show,
1763 .write_string = cpuset_write_resmask, 1776 .write = cpuset_write_resmask,
1764 .max_write_len = (100U + 6 * MAX_NUMNODES), 1777 .max_write_len = (100U + 6 * MAX_NUMNODES),
1765 .private = FILE_MEMLIST, 1778 .private = FILE_MEMLIST,
1766 }, 1779 },
@@ -1888,7 +1901,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1888 if (is_spread_slab(parent)) 1901 if (is_spread_slab(parent))
1889 set_bit(CS_SPREAD_SLAB, &cs->flags); 1902 set_bit(CS_SPREAD_SLAB, &cs->flags);
1890 1903
1891 number_of_cpusets++; 1904 cpuset_inc();
1892 1905
1893 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1906 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1894 goto out_unlock; 1907 goto out_unlock;
@@ -1939,7 +1952,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
1939 if (is_sched_load_balance(cs)) 1952 if (is_sched_load_balance(cs))
1940 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1953 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1941 1954
1942 number_of_cpusets--; 1955 cpuset_dec();
1943 clear_bit(CS_ONLINE, &cs->flags); 1956 clear_bit(CS_ONLINE, &cs->flags);
1944 1957
1945 mutex_unlock(&cpuset_mutex); 1958 mutex_unlock(&cpuset_mutex);
@@ -1992,7 +2005,6 @@ int __init cpuset_init(void)
1992 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) 2005 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1993 BUG(); 2006 BUG();
1994 2007
1995 number_of_cpusets = 1;
1996 return 0; 2008 return 0;
1997} 2009}
1998 2010
@@ -2017,7 +2029,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2017 parent = parent_cs(parent); 2029 parent = parent_cs(parent);
2018 2030
2019 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { 2031 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); 2032 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2021 pr_cont_cgroup_name(cs->css.cgroup); 2033 pr_cont_cgroup_name(cs->css.cgroup);
2022 pr_cont("\n"); 2034 pr_cont("\n");
2023 } 2035 }
@@ -2155,7 +2167,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2155 2167
2156 rcu_read_lock(); 2168 rcu_read_lock();
2157 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { 2169 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
2158 if (cs == &top_cpuset || !css_tryget(&cs->css)) 2170 if (cs == &top_cpuset || !css_tryget_online(&cs->css))
2159 continue; 2171 continue;
2160 rcu_read_unlock(); 2172 rcu_read_unlock();
2161 2173
@@ -2536,7 +2548,7 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2536 2548
2537/** 2549/**
2538 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2550 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2539 * @task: pointer to task_struct of some task. 2551 * @tsk: pointer to task_struct of some task.
2540 * 2552 *
2541 * Description: Prints @task's name, cpuset name, and cached copy of its 2553 * Description: Prints @task's name, cpuset name, and cached copy of its
2542 * mems_allowed to the kernel log. 2554 * mems_allowed to the kernel log.
@@ -2554,7 +2566,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2554 cgrp = task_cs(tsk)->css.cgroup; 2566 cgrp = task_cs(tsk)->css.cgroup;
2555 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2567 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2556 tsk->mems_allowed); 2568 tsk->mems_allowed);
2557 printk(KERN_INFO "%s cpuset=", tsk->comm); 2569 pr_info("%s cpuset=", tsk->comm);
2558 pr_cont_cgroup_name(cgrp); 2570 pr_cont_cgroup_name(cgrp);
2559 pr_cont(" mems_allowed=%s\n", cpuset_nodelist); 2571 pr_cont(" mems_allowed=%s\n", cpuset_nodelist);
2560 2572
@@ -2646,10 +2658,10 @@ out:
2646/* Display task mems_allowed in /proc/<pid>/status file. */ 2658/* Display task mems_allowed in /proc/<pid>/status file. */
2647void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2659void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2648{ 2660{
2649 seq_printf(m, "Mems_allowed:\t"); 2661 seq_puts(m, "Mems_allowed:\t");
2650 seq_nodemask(m, &task->mems_allowed); 2662 seq_nodemask(m, &task->mems_allowed);
2651 seq_printf(m, "\n"); 2663 seq_puts(m, "\n");
2652 seq_printf(m, "Mems_allowed_list:\t"); 2664 seq_puts(m, "Mems_allowed_list:\t");
2653 seq_nodemask_list(m, &task->mems_allowed); 2665 seq_nodemask_list(m, &task->mems_allowed);
2654 seq_printf(m, "\n"); 2666 seq_puts(m, "\n");
2655} 2667}
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 2956c8da1605..1adf62b39b96 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -534,7 +534,7 @@ return_normal:
534 kgdb_info[cpu].exception_state &= 534 kgdb_info[cpu].exception_state &=
535 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); 535 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
536 kgdb_info[cpu].enter_kgdb--; 536 kgdb_info[cpu].enter_kgdb--;
537 smp_mb__before_atomic_dec(); 537 smp_mb__before_atomic();
538 atomic_dec(&slaves_in_kgdb); 538 atomic_dec(&slaves_in_kgdb);
539 dbg_touch_watchdogs(); 539 dbg_touch_watchdogs();
540 local_irq_restore(flags); 540 local_irq_restore(flags);
@@ -662,7 +662,7 @@ kgdb_restore:
662 kgdb_info[cpu].exception_state &= 662 kgdb_info[cpu].exception_state &=
663 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); 663 ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
664 kgdb_info[cpu].enter_kgdb--; 664 kgdb_info[cpu].enter_kgdb--;
665 smp_mb__before_atomic_dec(); 665 smp_mb__before_atomic();
666 atomic_dec(&masters_in_kgdb); 666 atomic_dec(&masters_in_kgdb);
667 /* Free kgdb_active */ 667 /* Free kgdb_active */
668 atomic_set(&kgdb_active, -1); 668 atomic_set(&kgdb_active, -1);
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index b03e0e814e43..fe15fff5df53 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -21,7 +21,7 @@
21static void kdb_show_stack(struct task_struct *p, void *addr) 21static void kdb_show_stack(struct task_struct *p, void *addr)
22{ 22{
23 int old_lvl = console_loglevel; 23 int old_lvl = console_loglevel;
24 console_loglevel = 15; 24 console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
25 kdb_trap_printk++; 25 kdb_trap_printk++;
26 kdb_set_current_task(p); 26 kdb_set_current_task(p);
27 if (addr) { 27 if (addr) {
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 14ff4849262c..7c70812caea5 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -710,7 +710,7 @@ kdb_printit:
710 } 710 }
711 if (logging) { 711 if (logging) {
712 saved_loglevel = console_loglevel; 712 saved_loglevel = console_loglevel;
713 console_loglevel = 0; 713 console_loglevel = CONSOLE_LOGLEVEL_SILENT;
714 printk(KERN_INFO "%s", kdb_buffer); 714 printk(KERN_INFO "%s", kdb_buffer);
715 } 715 }
716 716
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 0b097c8a1e50..2f7c760305ca 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv)
1091static void kdb_dumpregs(struct pt_regs *regs) 1091static void kdb_dumpregs(struct pt_regs *regs)
1092{ 1092{
1093 int old_lvl = console_loglevel; 1093 int old_lvl = console_loglevel;
1094 console_loglevel = 15; 1094 console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
1095 kdb_trap_printk++; 1095 kdb_trap_printk++;
1096 show_regs(regs); 1096 show_regs(regs);
1097 kdb_trap_printk--; 1097 kdb_trap_printk--;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f83a71a3e46d..b0c95f0f06fd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -39,6 +39,8 @@
39#include <linux/hw_breakpoint.h> 39#include <linux/hw_breakpoint.h>
40#include <linux/mm_types.h> 40#include <linux/mm_types.h>
41#include <linux/cgroup.h> 41#include <linux/cgroup.h>
42#include <linux/module.h>
43#include <linux/mman.h>
42 44
43#include "internal.h" 45#include "internal.h"
44 46
@@ -607,7 +609,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
607 if (!f.file) 609 if (!f.file)
608 return -EBADF; 610 return -EBADF;
609 611
610 css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); 612 css = css_tryget_online_from_dir(f.file->f_dentry,
613 &perf_event_cgrp_subsys);
611 if (IS_ERR(css)) { 614 if (IS_ERR(css)) {
612 ret = PTR_ERR(css); 615 ret = PTR_ERR(css);
613 goto out; 616 goto out;
@@ -1443,6 +1446,11 @@ group_sched_out(struct perf_event *group_event,
1443 cpuctx->exclusive = 0; 1446 cpuctx->exclusive = 0;
1444} 1447}
1445 1448
1449struct remove_event {
1450 struct perf_event *event;
1451 bool detach_group;
1452};
1453
1446/* 1454/*
1447 * Cross CPU call to remove a performance event 1455 * Cross CPU call to remove a performance event
1448 * 1456 *
@@ -1451,12 +1459,15 @@ group_sched_out(struct perf_event *group_event,
1451 */ 1459 */
1452static int __perf_remove_from_context(void *info) 1460static int __perf_remove_from_context(void *info)
1453{ 1461{
1454 struct perf_event *event = info; 1462 struct remove_event *re = info;
1463 struct perf_event *event = re->event;
1455 struct perf_event_context *ctx = event->ctx; 1464 struct perf_event_context *ctx = event->ctx;
1456 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1465 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1457 1466
1458 raw_spin_lock(&ctx->lock); 1467 raw_spin_lock(&ctx->lock);
1459 event_sched_out(event, cpuctx, ctx); 1468 event_sched_out(event, cpuctx, ctx);
1469 if (re->detach_group)
1470 perf_group_detach(event);
1460 list_del_event(event, ctx); 1471 list_del_event(event, ctx);
1461 if (!ctx->nr_events && cpuctx->task_ctx == ctx) { 1472 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1462 ctx->is_active = 0; 1473 ctx->is_active = 0;
@@ -1481,10 +1492,14 @@ static int __perf_remove_from_context(void *info)
1481 * When called from perf_event_exit_task, it's OK because the 1492 * When called from perf_event_exit_task, it's OK because the
1482 * context has been detached from its task. 1493 * context has been detached from its task.
1483 */ 1494 */
1484static void perf_remove_from_context(struct perf_event *event) 1495static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1485{ 1496{
1486 struct perf_event_context *ctx = event->ctx; 1497 struct perf_event_context *ctx = event->ctx;
1487 struct task_struct *task = ctx->task; 1498 struct task_struct *task = ctx->task;
1499 struct remove_event re = {
1500 .event = event,
1501 .detach_group = detach_group,
1502 };
1488 1503
1489 lockdep_assert_held(&ctx->mutex); 1504 lockdep_assert_held(&ctx->mutex);
1490 1505
@@ -1493,12 +1508,12 @@ static void perf_remove_from_context(struct perf_event *event)
1493 * Per cpu events are removed via an smp call and 1508 * Per cpu events are removed via an smp call and
1494 * the removal is always successful. 1509 * the removal is always successful.
1495 */ 1510 */
1496 cpu_function_call(event->cpu, __perf_remove_from_context, event); 1511 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1497 return; 1512 return;
1498 } 1513 }
1499 1514
1500retry: 1515retry:
1501 if (!task_function_call(task, __perf_remove_from_context, event)) 1516 if (!task_function_call(task, __perf_remove_from_context, &re))
1502 return; 1517 return;
1503 1518
1504 raw_spin_lock_irq(&ctx->lock); 1519 raw_spin_lock_irq(&ctx->lock);
@@ -1515,6 +1530,8 @@ retry:
1515 * Since the task isn't running, its safe to remove the event, us 1530 * Since the task isn't running, its safe to remove the event, us
1516 * holding the ctx->lock ensures the task won't get scheduled in. 1531 * holding the ctx->lock ensures the task won't get scheduled in.
1517 */ 1532 */
1533 if (detach_group)
1534 perf_group_detach(event);
1518 list_del_event(event, ctx); 1535 list_del_event(event, ctx);
1519 raw_spin_unlock_irq(&ctx->lock); 1536 raw_spin_unlock_irq(&ctx->lock);
1520} 1537}
@@ -1663,6 +1680,8 @@ event_sched_in(struct perf_event *event,
1663 u64 tstamp = perf_event_time(event); 1680 u64 tstamp = perf_event_time(event);
1664 int ret = 0; 1681 int ret = 0;
1665 1682
1683 lockdep_assert_held(&ctx->lock);
1684
1666 if (event->state <= PERF_EVENT_STATE_OFF) 1685 if (event->state <= PERF_EVENT_STATE_OFF)
1667 return 0; 1686 return 0;
1668 1687
@@ -2301,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2301 next_parent = rcu_dereference(next_ctx->parent_ctx); 2320 next_parent = rcu_dereference(next_ctx->parent_ctx);
2302 2321
2303 /* If neither context have a parent context; they cannot be clones. */ 2322 /* If neither context have a parent context; they cannot be clones. */
2304 if (!parent && !next_parent) 2323 if (!parent || !next_parent)
2305 goto unlock; 2324 goto unlock;
2306 2325
2307 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 2326 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -2956,6 +2975,22 @@ out:
2956 local_irq_restore(flags); 2975 local_irq_restore(flags);
2957} 2976}
2958 2977
2978void perf_event_exec(void)
2979{
2980 struct perf_event_context *ctx;
2981 int ctxn;
2982
2983 rcu_read_lock();
2984 for_each_task_context_nr(ctxn) {
2985 ctx = current->perf_event_ctxp[ctxn];
2986 if (!ctx)
2987 continue;
2988
2989 perf_event_enable_on_exec(ctx);
2990 }
2991 rcu_read_unlock();
2992}
2993
2959/* 2994/*
2960 * Cross CPU call to read the hardware event 2995 * Cross CPU call to read the hardware event
2961 */ 2996 */
@@ -3178,7 +3213,8 @@ static void free_event_rcu(struct rcu_head *head)
3178} 3213}
3179 3214
3180static void ring_buffer_put(struct ring_buffer *rb); 3215static void ring_buffer_put(struct ring_buffer *rb);
3181static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); 3216static void ring_buffer_attach(struct perf_event *event,
3217 struct ring_buffer *rb);
3182 3218
3183static void unaccount_event_cpu(struct perf_event *event, int cpu) 3219static void unaccount_event_cpu(struct perf_event *event, int cpu)
3184{ 3220{
@@ -3229,17 +3265,19 @@ static void __free_event(struct perf_event *event)
3229 if (event->ctx) 3265 if (event->ctx)
3230 put_ctx(event->ctx); 3266 put_ctx(event->ctx);
3231 3267
3268 if (event->pmu)
3269 module_put(event->pmu->module);
3270
3232 call_rcu(&event->rcu_head, free_event_rcu); 3271 call_rcu(&event->rcu_head, free_event_rcu);
3233} 3272}
3234static void free_event(struct perf_event *event) 3273
3274static void _free_event(struct perf_event *event)
3235{ 3275{
3236 irq_work_sync(&event->pending); 3276 irq_work_sync(&event->pending);
3237 3277
3238 unaccount_event(event); 3278 unaccount_event(event);
3239 3279
3240 if (event->rb) { 3280 if (event->rb) {
3241 struct ring_buffer *rb;
3242
3243 /* 3281 /*
3244 * Can happen when we close an event with re-directed output. 3282 * Can happen when we close an event with re-directed output.
3245 * 3283 *
@@ -3247,57 +3285,38 @@ static void free_event(struct perf_event *event)
3247 * over us; possibly making our ring_buffer_put() the last. 3285 * over us; possibly making our ring_buffer_put() the last.
3248 */ 3286 */
3249 mutex_lock(&event->mmap_mutex); 3287 mutex_lock(&event->mmap_mutex);
3250 rb = event->rb; 3288 ring_buffer_attach(event, NULL);
3251 if (rb) {
3252 rcu_assign_pointer(event->rb, NULL);
3253 ring_buffer_detach(event, rb);
3254 ring_buffer_put(rb); /* could be last */
3255 }
3256 mutex_unlock(&event->mmap_mutex); 3289 mutex_unlock(&event->mmap_mutex);
3257 } 3290 }
3258 3291
3259 if (is_cgroup_event(event)) 3292 if (is_cgroup_event(event))
3260 perf_detach_cgroup(event); 3293 perf_detach_cgroup(event);
3261 3294
3262
3263 __free_event(event); 3295 __free_event(event);
3264} 3296}
3265 3297
3266int perf_event_release_kernel(struct perf_event *event) 3298/*
3299 * Used to free events which have a known refcount of 1, such as in error paths
3300 * where the event isn't exposed yet and inherited events.
3301 */
3302static void free_event(struct perf_event *event)
3267{ 3303{
3268 struct perf_event_context *ctx = event->ctx; 3304 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3269 3305 "unexpected event refcount: %ld; ptr=%p\n",
3270 WARN_ON_ONCE(ctx->parent_ctx); 3306 atomic_long_read(&event->refcount), event)) {
3271 /* 3307 /* leak to avoid use-after-free */
3272 * There are two ways this annotation is useful: 3308 return;
3273 * 3309 }
3274 * 1) there is a lock recursion from perf_event_exit_task
3275 * see the comment there.
3276 *
3277 * 2) there is a lock-inversion with mmap_sem through
3278 * perf_event_read_group(), which takes faults while
3279 * holding ctx->mutex, however this is called after
3280 * the last filedesc died, so there is no possibility
3281 * to trigger the AB-BA case.
3282 */
3283 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
3284 raw_spin_lock_irq(&ctx->lock);
3285 perf_group_detach(event);
3286 raw_spin_unlock_irq(&ctx->lock);
3287 perf_remove_from_context(event);
3288 mutex_unlock(&ctx->mutex);
3289
3290 free_event(event);
3291 3310
3292 return 0; 3311 _free_event(event);
3293} 3312}
3294EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3295 3313
3296/* 3314/*
3297 * Called when the last reference to the file is gone. 3315 * Called when the last reference to the file is gone.
3298 */ 3316 */
3299static void put_event(struct perf_event *event) 3317static void put_event(struct perf_event *event)
3300{ 3318{
3319 struct perf_event_context *ctx = event->ctx;
3301 struct task_struct *owner; 3320 struct task_struct *owner;
3302 3321
3303 if (!atomic_long_dec_and_test(&event->refcount)) 3322 if (!atomic_long_dec_and_test(&event->refcount))
@@ -3336,8 +3355,32 @@ static void put_event(struct perf_event *event)
3336 put_task_struct(owner); 3355 put_task_struct(owner);
3337 } 3356 }
3338 3357
3339 perf_event_release_kernel(event); 3358 WARN_ON_ONCE(ctx->parent_ctx);
3359 /*
3360 * There are two ways this annotation is useful:
3361 *
3362 * 1) there is a lock recursion from perf_event_exit_task
3363 * see the comment there.
3364 *
3365 * 2) there is a lock-inversion with mmap_sem through
3366 * perf_event_read_group(), which takes faults while
3367 * holding ctx->mutex, however this is called after
3368 * the last filedesc died, so there is no possibility
3369 * to trigger the AB-BA case.
3370 */
3371 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
3372 perf_remove_from_context(event, true);
3373 mutex_unlock(&ctx->mutex);
3374
3375 _free_event(event);
3376}
3377
3378int perf_event_release_kernel(struct perf_event *event)
3379{
3380 put_event(event);
3381 return 0;
3340} 3382}
3383EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3341 3384
3342static int perf_release(struct inode *inode, struct file *file) 3385static int perf_release(struct inode *inode, struct file *file)
3343{ 3386{
@@ -3839,28 +3882,47 @@ unlock:
3839static void ring_buffer_attach(struct perf_event *event, 3882static void ring_buffer_attach(struct perf_event *event,
3840 struct ring_buffer *rb) 3883 struct ring_buffer *rb)
3841{ 3884{
3885 struct ring_buffer *old_rb = NULL;
3842 unsigned long flags; 3886 unsigned long flags;
3843 3887
3844 if (!list_empty(&event->rb_entry)) 3888 if (event->rb) {
3845 return; 3889 /*
3890 * Should be impossible, we set this when removing
3891 * event->rb_entry and wait/clear when adding event->rb_entry.
3892 */
3893 WARN_ON_ONCE(event->rcu_pending);
3846 3894
3847 spin_lock_irqsave(&rb->event_lock, flags); 3895 old_rb = event->rb;
3848 if (list_empty(&event->rb_entry)) 3896 event->rcu_batches = get_state_synchronize_rcu();
3849 list_add(&event->rb_entry, &rb->event_list); 3897 event->rcu_pending = 1;
3850 spin_unlock_irqrestore(&rb->event_lock, flags);
3851}
3852 3898
3853static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) 3899 spin_lock_irqsave(&old_rb->event_lock, flags);
3854{ 3900 list_del_rcu(&event->rb_entry);
3855 unsigned long flags; 3901 spin_unlock_irqrestore(&old_rb->event_lock, flags);
3902 }
3856 3903
3857 if (list_empty(&event->rb_entry)) 3904 if (event->rcu_pending && rb) {
3858 return; 3905 cond_synchronize_rcu(event->rcu_batches);
3906 event->rcu_pending = 0;
3907 }
3859 3908
3860 spin_lock_irqsave(&rb->event_lock, flags); 3909 if (rb) {
3861 list_del_init(&event->rb_entry); 3910 spin_lock_irqsave(&rb->event_lock, flags);
3862 wake_up_all(&event->waitq); 3911 list_add_rcu(&event->rb_entry, &rb->event_list);
3863 spin_unlock_irqrestore(&rb->event_lock, flags); 3912 spin_unlock_irqrestore(&rb->event_lock, flags);
3913 }
3914
3915 rcu_assign_pointer(event->rb, rb);
3916
3917 if (old_rb) {
3918 ring_buffer_put(old_rb);
3919 /*
3920 * Since we detached before setting the new rb, so that we
3921 * could attach the new rb, we could have missed a wakeup.
3922 * Provide it now.
3923 */
3924 wake_up_all(&event->waitq);
3925 }
3864} 3926}
3865 3927
3866static void ring_buffer_wakeup(struct perf_event *event) 3928static void ring_buffer_wakeup(struct perf_event *event)
@@ -3929,7 +3991,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3929{ 3991{
3930 struct perf_event *event = vma->vm_file->private_data; 3992 struct perf_event *event = vma->vm_file->private_data;
3931 3993
3932 struct ring_buffer *rb = event->rb; 3994 struct ring_buffer *rb = ring_buffer_get(event);
3933 struct user_struct *mmap_user = rb->mmap_user; 3995 struct user_struct *mmap_user = rb->mmap_user;
3934 int mmap_locked = rb->mmap_locked; 3996 int mmap_locked = rb->mmap_locked;
3935 unsigned long size = perf_data_size(rb); 3997 unsigned long size = perf_data_size(rb);
@@ -3937,18 +3999,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3937 atomic_dec(&rb->mmap_count); 3999 atomic_dec(&rb->mmap_count);
3938 4000
3939 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) 4001 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
3940 return; 4002 goto out_put;
3941 4003
3942 /* Detach current event from the buffer. */ 4004 ring_buffer_attach(event, NULL);
3943 rcu_assign_pointer(event->rb, NULL);
3944 ring_buffer_detach(event, rb);
3945 mutex_unlock(&event->mmap_mutex); 4005 mutex_unlock(&event->mmap_mutex);
3946 4006
3947 /* If there's still other mmap()s of this buffer, we're done. */ 4007 /* If there's still other mmap()s of this buffer, we're done. */
3948 if (atomic_read(&rb->mmap_count)) { 4008 if (atomic_read(&rb->mmap_count))
3949 ring_buffer_put(rb); /* can't be last */ 4009 goto out_put;
3950 return;
3951 }
3952 4010
3953 /* 4011 /*
3954 * No other mmap()s, detach from all other events that might redirect 4012 * No other mmap()s, detach from all other events that might redirect
@@ -3978,11 +4036,9 @@ again:
3978 * still restart the iteration to make sure we're not now 4036 * still restart the iteration to make sure we're not now
3979 * iterating the wrong list. 4037 * iterating the wrong list.
3980 */ 4038 */
3981 if (event->rb == rb) { 4039 if (event->rb == rb)
3982 rcu_assign_pointer(event->rb, NULL); 4040 ring_buffer_attach(event, NULL);
3983 ring_buffer_detach(event, rb); 4041
3984 ring_buffer_put(rb); /* can't be last, we still have one */
3985 }
3986 mutex_unlock(&event->mmap_mutex); 4042 mutex_unlock(&event->mmap_mutex);
3987 put_event(event); 4043 put_event(event);
3988 4044
@@ -4007,6 +4063,7 @@ again:
4007 vma->vm_mm->pinned_vm -= mmap_locked; 4063 vma->vm_mm->pinned_vm -= mmap_locked;
4008 free_uid(mmap_user); 4064 free_uid(mmap_user);
4009 4065
4066out_put:
4010 ring_buffer_put(rb); /* could be last */ 4067 ring_buffer_put(rb); /* could be last */
4011} 4068}
4012 4069
@@ -4124,7 +4181,6 @@ again:
4124 vma->vm_mm->pinned_vm += extra; 4181 vma->vm_mm->pinned_vm += extra;
4125 4182
4126 ring_buffer_attach(event, rb); 4183 ring_buffer_attach(event, rb);
4127 rcu_assign_pointer(event->rb, rb);
4128 4184
4129 perf_event_init_userpage(event); 4185 perf_event_init_userpage(event);
4130 perf_event_update_userpage(event); 4186 perf_event_update_userpage(event);
@@ -5036,21 +5092,9 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
5036 NULL); 5092 NULL);
5037} 5093}
5038 5094
5039void perf_event_comm(struct task_struct *task) 5095void perf_event_comm(struct task_struct *task, bool exec)
5040{ 5096{
5041 struct perf_comm_event comm_event; 5097 struct perf_comm_event comm_event;
5042 struct perf_event_context *ctx;
5043 int ctxn;
5044
5045 rcu_read_lock();
5046 for_each_task_context_nr(ctxn) {
5047 ctx = task->perf_event_ctxp[ctxn];
5048 if (!ctx)
5049 continue;
5050
5051 perf_event_enable_on_exec(ctx);
5052 }
5053 rcu_read_unlock();
5054 5098
5055 if (!atomic_read(&nr_comm_events)) 5099 if (!atomic_read(&nr_comm_events))
5056 return; 5100 return;
@@ -5062,7 +5106,7 @@ void perf_event_comm(struct task_struct *task)
5062 .event_id = { 5106 .event_id = {
5063 .header = { 5107 .header = {
5064 .type = PERF_RECORD_COMM, 5108 .type = PERF_RECORD_COMM,
5065 .misc = 0, 5109 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
5066 /* .size */ 5110 /* .size */
5067 }, 5111 },
5068 /* .pid */ 5112 /* .pid */
@@ -5085,6 +5129,7 @@ struct perf_mmap_event {
5085 int maj, min; 5129 int maj, min;
5086 u64 ino; 5130 u64 ino;
5087 u64 ino_generation; 5131 u64 ino_generation;
5132 u32 prot, flags;
5088 5133
5089 struct { 5134 struct {
5090 struct perf_event_header header; 5135 struct perf_event_header header;
@@ -5126,6 +5171,8 @@ static void perf_event_mmap_output(struct perf_event *event,
5126 mmap_event->event_id.header.size += sizeof(mmap_event->min); 5171 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5127 mmap_event->event_id.header.size += sizeof(mmap_event->ino); 5172 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5128 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); 5173 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5174 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5175 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
5129 } 5176 }
5130 5177
5131 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); 5178 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
@@ -5144,6 +5191,8 @@ static void perf_event_mmap_output(struct perf_event *event,
5144 perf_output_put(&handle, mmap_event->min); 5191 perf_output_put(&handle, mmap_event->min);
5145 perf_output_put(&handle, mmap_event->ino); 5192 perf_output_put(&handle, mmap_event->ino);
5146 perf_output_put(&handle, mmap_event->ino_generation); 5193 perf_output_put(&handle, mmap_event->ino_generation);
5194 perf_output_put(&handle, mmap_event->prot);
5195 perf_output_put(&handle, mmap_event->flags);
5147 } 5196 }
5148 5197
5149 __output_copy(&handle, mmap_event->file_name, 5198 __output_copy(&handle, mmap_event->file_name,
@@ -5162,6 +5211,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5162 struct file *file = vma->vm_file; 5211 struct file *file = vma->vm_file;
5163 int maj = 0, min = 0; 5212 int maj = 0, min = 0;
5164 u64 ino = 0, gen = 0; 5213 u64 ino = 0, gen = 0;
5214 u32 prot = 0, flags = 0;
5165 unsigned int size; 5215 unsigned int size;
5166 char tmp[16]; 5216 char tmp[16];
5167 char *buf = NULL; 5217 char *buf = NULL;
@@ -5192,6 +5242,28 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5192 gen = inode->i_generation; 5242 gen = inode->i_generation;
5193 maj = MAJOR(dev); 5243 maj = MAJOR(dev);
5194 min = MINOR(dev); 5244 min = MINOR(dev);
5245
5246 if (vma->vm_flags & VM_READ)
5247 prot |= PROT_READ;
5248 if (vma->vm_flags & VM_WRITE)
5249 prot |= PROT_WRITE;
5250 if (vma->vm_flags & VM_EXEC)
5251 prot |= PROT_EXEC;
5252
5253 if (vma->vm_flags & VM_MAYSHARE)
5254 flags = MAP_SHARED;
5255 else
5256 flags = MAP_PRIVATE;
5257
5258 if (vma->vm_flags & VM_DENYWRITE)
5259 flags |= MAP_DENYWRITE;
5260 if (vma->vm_flags & VM_MAYEXEC)
5261 flags |= MAP_EXECUTABLE;
5262 if (vma->vm_flags & VM_LOCKED)
5263 flags |= MAP_LOCKED;
5264 if (vma->vm_flags & VM_HUGETLB)
5265 flags |= MAP_HUGETLB;
5266
5195 goto got_name; 5267 goto got_name;
5196 } else { 5268 } else {
5197 name = (char *)arch_vma_name(vma); 5269 name = (char *)arch_vma_name(vma);
@@ -5232,6 +5304,8 @@ got_name:
5232 mmap_event->min = min; 5304 mmap_event->min = min;
5233 mmap_event->ino = ino; 5305 mmap_event->ino = ino;
5234 mmap_event->ino_generation = gen; 5306 mmap_event->ino_generation = gen;
5307 mmap_event->prot = prot;
5308 mmap_event->flags = flags;
5235 5309
5236 if (!(vma->vm_flags & VM_EXEC)) 5310 if (!(vma->vm_flags & VM_EXEC))
5237 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; 5311 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
@@ -5272,6 +5346,8 @@ void perf_event_mmap(struct vm_area_struct *vma)
5272 /* .min (attr_mmap2 only) */ 5346 /* .min (attr_mmap2 only) */
5273 /* .ino (attr_mmap2 only) */ 5347 /* .ino (attr_mmap2 only) */
5274 /* .ino_generation (attr_mmap2 only) */ 5348 /* .ino_generation (attr_mmap2 only) */
5349 /* .prot (attr_mmap2 only) */
5350 /* .flags (attr_mmap2 only) */
5275 }; 5351 };
5276 5352
5277 perf_event_mmap_event(&mmap_event); 5353 perf_event_mmap_event(&mmap_event);
@@ -5408,6 +5484,9 @@ struct swevent_htable {
5408 5484
5409 /* Recursion avoidance in each contexts */ 5485 /* Recursion avoidance in each contexts */
5410 int recursion[PERF_NR_CONTEXTS]; 5486 int recursion[PERF_NR_CONTEXTS];
5487
5488 /* Keeps track of cpu being initialized/exited */
5489 bool online;
5411}; 5490};
5412 5491
5413static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); 5492static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -5654,8 +5733,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
5654 hwc->state = !(flags & PERF_EF_START); 5733 hwc->state = !(flags & PERF_EF_START);
5655 5734
5656 head = find_swevent_head(swhash, event); 5735 head = find_swevent_head(swhash, event);
5657 if (WARN_ON_ONCE(!head)) 5736 if (!head) {
5737 /*
5738 * We can race with cpu hotplug code. Do not
5739 * WARN if the cpu just got unplugged.
5740 */
5741 WARN_ON_ONCE(swhash->online);
5658 return -EINVAL; 5742 return -EINVAL;
5743 }
5659 5744
5660 hlist_add_head_rcu(&event->hlist_entry, head); 5745 hlist_add_head_rcu(&event->hlist_entry, head);
5661 5746
@@ -6551,6 +6636,7 @@ free_pdc:
6551 free_percpu(pmu->pmu_disable_count); 6636 free_percpu(pmu->pmu_disable_count);
6552 goto unlock; 6637 goto unlock;
6553} 6638}
6639EXPORT_SYMBOL_GPL(perf_pmu_register);
6554 6640
6555void perf_pmu_unregister(struct pmu *pmu) 6641void perf_pmu_unregister(struct pmu *pmu)
6556{ 6642{
@@ -6572,6 +6658,7 @@ void perf_pmu_unregister(struct pmu *pmu)
6572 put_device(pmu->dev); 6658 put_device(pmu->dev);
6573 free_pmu_context(pmu); 6659 free_pmu_context(pmu);
6574} 6660}
6661EXPORT_SYMBOL_GPL(perf_pmu_unregister);
6575 6662
6576struct pmu *perf_init_event(struct perf_event *event) 6663struct pmu *perf_init_event(struct perf_event *event)
6577{ 6664{
@@ -6585,6 +6672,10 @@ struct pmu *perf_init_event(struct perf_event *event)
6585 pmu = idr_find(&pmu_idr, event->attr.type); 6672 pmu = idr_find(&pmu_idr, event->attr.type);
6586 rcu_read_unlock(); 6673 rcu_read_unlock();
6587 if (pmu) { 6674 if (pmu) {
6675 if (!try_module_get(pmu->module)) {
6676 pmu = ERR_PTR(-ENODEV);
6677 goto unlock;
6678 }
6588 event->pmu = pmu; 6679 event->pmu = pmu;
6589 ret = pmu->event_init(event); 6680 ret = pmu->event_init(event);
6590 if (ret) 6681 if (ret)
@@ -6593,6 +6684,10 @@ struct pmu *perf_init_event(struct perf_event *event)
6593 } 6684 }
6594 6685
6595 list_for_each_entry_rcu(pmu, &pmus, entry) { 6686 list_for_each_entry_rcu(pmu, &pmus, entry) {
6687 if (!try_module_get(pmu->module)) {
6688 pmu = ERR_PTR(-ENODEV);
6689 goto unlock;
6690 }
6596 event->pmu = pmu; 6691 event->pmu = pmu;
6597 ret = pmu->event_init(event); 6692 ret = pmu->event_init(event);
6598 if (!ret) 6693 if (!ret)
@@ -6771,6 +6866,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6771err_pmu: 6866err_pmu:
6772 if (event->destroy) 6867 if (event->destroy)
6773 event->destroy(event); 6868 event->destroy(event);
6869 module_put(pmu->module);
6774err_ns: 6870err_ns:
6775 if (event->ns) 6871 if (event->ns)
6776 put_pid_ns(event->ns); 6872 put_pid_ns(event->ns);
@@ -6834,10 +6930,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6834 if (ret) 6930 if (ret)
6835 return -EFAULT; 6931 return -EFAULT;
6836 6932
6837 /* disabled for now */
6838 if (attr->mmap2)
6839 return -EINVAL;
6840
6841 if (attr->__reserved_1) 6933 if (attr->__reserved_1)
6842 return -EINVAL; 6934 return -EINVAL;
6843 6935
@@ -6914,7 +7006,7 @@ err_size:
6914static int 7006static int
6915perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 7007perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6916{ 7008{
6917 struct ring_buffer *rb = NULL, *old_rb = NULL; 7009 struct ring_buffer *rb = NULL;
6918 int ret = -EINVAL; 7010 int ret = -EINVAL;
6919 7011
6920 if (!output_event) 7012 if (!output_event)
@@ -6942,8 +7034,6 @@ set:
6942 if (atomic_read(&event->mmap_count)) 7034 if (atomic_read(&event->mmap_count))
6943 goto unlock; 7035 goto unlock;
6944 7036
6945 old_rb = event->rb;
6946
6947 if (output_event) { 7037 if (output_event) {
6948 /* get the rb we want to redirect to */ 7038 /* get the rb we want to redirect to */
6949 rb = ring_buffer_get(output_event); 7039 rb = ring_buffer_get(output_event);
@@ -6951,23 +7041,7 @@ set:
6951 goto unlock; 7041 goto unlock;
6952 } 7042 }
6953 7043
6954 if (old_rb) 7044 ring_buffer_attach(event, rb);
6955 ring_buffer_detach(event, old_rb);
6956
6957 if (rb)
6958 ring_buffer_attach(event, rb);
6959
6960 rcu_assign_pointer(event->rb, rb);
6961
6962 if (old_rb) {
6963 ring_buffer_put(old_rb);
6964 /*
6965 * Since we detached before setting the new rb, so that we
6966 * could attach the new rb, we could have missed a wakeup.
6967 * Provide it now.
6968 */
6969 wake_up_all(&event->waitq);
6970 }
6971 7045
6972 ret = 0; 7046 ret = 0;
6973unlock: 7047unlock:
@@ -7018,6 +7092,9 @@ SYSCALL_DEFINE5(perf_event_open,
7018 if (attr.freq) { 7092 if (attr.freq) {
7019 if (attr.sample_freq > sysctl_perf_event_sample_rate) 7093 if (attr.sample_freq > sysctl_perf_event_sample_rate)
7020 return -EINVAL; 7094 return -EINVAL;
7095 } else {
7096 if (attr.sample_period & (1ULL << 63))
7097 return -EINVAL;
7021 } 7098 }
7022 7099
7023 /* 7100 /*
@@ -7055,20 +7132,33 @@ SYSCALL_DEFINE5(perf_event_open,
7055 } 7132 }
7056 } 7133 }
7057 7134
7135 if (task && group_leader &&
7136 group_leader->attr.inherit != attr.inherit) {
7137 err = -EINVAL;
7138 goto err_task;
7139 }
7140
7058 get_online_cpus(); 7141 get_online_cpus();
7059 7142
7060 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 7143 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
7061 NULL, NULL); 7144 NULL, NULL);
7062 if (IS_ERR(event)) { 7145 if (IS_ERR(event)) {
7063 err = PTR_ERR(event); 7146 err = PTR_ERR(event);
7064 goto err_task; 7147 goto err_cpus;
7065 } 7148 }
7066 7149
7067 if (flags & PERF_FLAG_PID_CGROUP) { 7150 if (flags & PERF_FLAG_PID_CGROUP) {
7068 err = perf_cgroup_connect(pid, event, &attr, group_leader); 7151 err = perf_cgroup_connect(pid, event, &attr, group_leader);
7069 if (err) { 7152 if (err) {
7070 __free_event(event); 7153 __free_event(event);
7071 goto err_task; 7154 goto err_cpus;
7155 }
7156 }
7157
7158 if (is_sampling_event(event)) {
7159 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
7160 err = -ENOTSUPP;
7161 goto err_alloc;
7072 } 7162 }
7073 } 7163 }
7074 7164
@@ -7165,7 +7255,7 @@ SYSCALL_DEFINE5(perf_event_open,
7165 struct perf_event_context *gctx = group_leader->ctx; 7255 struct perf_event_context *gctx = group_leader->ctx;
7166 7256
7167 mutex_lock(&gctx->mutex); 7257 mutex_lock(&gctx->mutex);
7168 perf_remove_from_context(group_leader); 7258 perf_remove_from_context(group_leader, false);
7169 7259
7170 /* 7260 /*
7171 * Removing from the context ends up with disabled 7261 * Removing from the context ends up with disabled
@@ -7175,7 +7265,7 @@ SYSCALL_DEFINE5(perf_event_open,
7175 perf_event__state_init(group_leader); 7265 perf_event__state_init(group_leader);
7176 list_for_each_entry(sibling, &group_leader->sibling_list, 7266 list_for_each_entry(sibling, &group_leader->sibling_list,
7177 group_entry) { 7267 group_entry) {
7178 perf_remove_from_context(sibling); 7268 perf_remove_from_context(sibling, false);
7179 perf_event__state_init(sibling); 7269 perf_event__state_init(sibling);
7180 put_ctx(gctx); 7270 put_ctx(gctx);
7181 } 7271 }
@@ -7230,8 +7320,9 @@ err_context:
7230 put_ctx(ctx); 7320 put_ctx(ctx);
7231err_alloc: 7321err_alloc:
7232 free_event(event); 7322 free_event(event);
7233err_task: 7323err_cpus:
7234 put_online_cpus(); 7324 put_online_cpus();
7325err_task:
7235 if (task) 7326 if (task)
7236 put_task_struct(task); 7327 put_task_struct(task);
7237err_group_fd: 7328err_group_fd:
@@ -7305,7 +7396,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7305 mutex_lock(&src_ctx->mutex); 7396 mutex_lock(&src_ctx->mutex);
7306 list_for_each_entry_safe(event, tmp, &src_ctx->event_list, 7397 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7307 event_entry) { 7398 event_entry) {
7308 perf_remove_from_context(event); 7399 perf_remove_from_context(event, false);
7309 unaccount_event_cpu(event, src_cpu); 7400 unaccount_event_cpu(event, src_cpu);
7310 put_ctx(src_ctx); 7401 put_ctx(src_ctx);
7311 list_add(&event->migrate_entry, &events); 7402 list_add(&event->migrate_entry, &events);
@@ -7367,13 +7458,7 @@ __perf_event_exit_task(struct perf_event *child_event,
7367 struct perf_event_context *child_ctx, 7458 struct perf_event_context *child_ctx,
7368 struct task_struct *child) 7459 struct task_struct *child)
7369{ 7460{
7370 if (child_event->parent) { 7461 perf_remove_from_context(child_event, true);
7371 raw_spin_lock_irq(&child_ctx->lock);
7372 perf_group_detach(child_event);
7373 raw_spin_unlock_irq(&child_ctx->lock);
7374 }
7375
7376 perf_remove_from_context(child_event);
7377 7462
7378 /* 7463 /*
7379 * It can happen that the parent exits first, and has events 7464 * It can happen that the parent exits first, and has events
@@ -7388,7 +7473,7 @@ __perf_event_exit_task(struct perf_event *child_event,
7388 7473
7389static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 7474static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7390{ 7475{
7391 struct perf_event *child_event, *tmp; 7476 struct perf_event *child_event, *next;
7392 struct perf_event_context *child_ctx; 7477 struct perf_event_context *child_ctx;
7393 unsigned long flags; 7478 unsigned long flags;
7394 7479
@@ -7442,24 +7527,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7442 */ 7527 */
7443 mutex_lock(&child_ctx->mutex); 7528 mutex_lock(&child_ctx->mutex);
7444 7529
7445again: 7530 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
7446 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
7447 group_entry)
7448 __perf_event_exit_task(child_event, child_ctx, child);
7449
7450 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
7451 group_entry)
7452 __perf_event_exit_task(child_event, child_ctx, child); 7531 __perf_event_exit_task(child_event, child_ctx, child);
7453 7532
7454 /*
7455 * If the last event was a group event, it will have appended all
7456 * its siblings to the list, but we obtained 'tmp' before that which
7457 * will still point to the list head terminating the iteration.
7458 */
7459 if (!list_empty(&child_ctx->pinned_groups) ||
7460 !list_empty(&child_ctx->flexible_groups))
7461 goto again;
7462
7463 mutex_unlock(&child_ctx->mutex); 7533 mutex_unlock(&child_ctx->mutex);
7464 7534
7465 put_ctx(child_ctx); 7535 put_ctx(child_ctx);
@@ -7724,6 +7794,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
7724 * swapped under us. 7794 * swapped under us.
7725 */ 7795 */
7726 parent_ctx = perf_pin_task_context(parent, ctxn); 7796 parent_ctx = perf_pin_task_context(parent, ctxn);
7797 if (!parent_ctx)
7798 return 0;
7727 7799
7728 /* 7800 /*
7729 * No need to check if parent_ctx != NULL here; since we saw 7801 * No need to check if parent_ctx != NULL here; since we saw
@@ -7835,6 +7907,7 @@ static void perf_event_init_cpu(int cpu)
7835 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7907 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7836 7908
7837 mutex_lock(&swhash->hlist_mutex); 7909 mutex_lock(&swhash->hlist_mutex);
7910 swhash->online = true;
7838 if (swhash->hlist_refcount > 0) { 7911 if (swhash->hlist_refcount > 0) {
7839 struct swevent_hlist *hlist; 7912 struct swevent_hlist *hlist;
7840 7913
@@ -7857,14 +7930,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
7857 7930
7858static void __perf_event_exit_context(void *__info) 7931static void __perf_event_exit_context(void *__info)
7859{ 7932{
7933 struct remove_event re = { .detach_group = false };
7860 struct perf_event_context *ctx = __info; 7934 struct perf_event_context *ctx = __info;
7861 struct perf_event *event;
7862 7935
7863 perf_pmu_rotate_stop(ctx->pmu); 7936 perf_pmu_rotate_stop(ctx->pmu);
7864 7937
7865 rcu_read_lock(); 7938 rcu_read_lock();
7866 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) 7939 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
7867 __perf_remove_from_context(event); 7940 __perf_remove_from_context(&re);
7868 rcu_read_unlock(); 7941 rcu_read_unlock();
7869} 7942}
7870 7943
@@ -7892,6 +7965,7 @@ static void perf_event_exit_cpu(int cpu)
7892 perf_event_exit_cpu_context(cpu); 7965 perf_event_exit_cpu_context(cpu);
7893 7966
7894 mutex_lock(&swhash->hlist_mutex); 7967 mutex_lock(&swhash->hlist_mutex);
7968 swhash->online = false;
7895 swevent_hlist_release(swhash); 7969 swevent_hlist_release(swhash);
7896 mutex_unlock(&swhash->hlist_mutex); 7970 mutex_unlock(&swhash->hlist_mutex);
7897} 7971}
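
The core.c hunks above replace the bare event pointer handed to the cross-CPU callback with a struct remove_event carrying a detach_group flag, so __perf_remove_from_context() can also detach the group when invoked from the exit path. Below is a minimal userspace C sketch of the same pattern, bundling the target and a flag behind the single void * argument such callbacks receive; the names are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

struct event { const char *name; };

struct remove_args {
	struct event *event;
	bool detach_group;		/* mirrors remove_event.detach_group */
};

/* Stand-in for a cross-CPU callback that only receives one pointer. */
static void remove_cb(void *info)
{
	struct remove_args *a = info;

	printf("removing %s%s\n", a->event->name,
	       a->detach_group ? " (detaching group)" : "");
}

int main(void)
{
	struct event ev = { "cycles" };
	struct remove_args a = { .event = &ev, .detach_group = true };

	remove_cb(&a);			/* the kernel runs this via an IPI */
	return 0;
}
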
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 04709b66369d..6f3254e8c137 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -36,6 +36,7 @@
36#include "../../mm/internal.h" /* munlock_vma_page */ 36#include "../../mm/internal.h" /* munlock_vma_page */
37#include <linux/percpu-rwsem.h> 37#include <linux/percpu-rwsem.h>
38#include <linux/task_work.h> 38#include <linux/task_work.h>
39#include <linux/shmem_fs.h>
39 40
40#include <linux/uprobes.h> 41#include <linux/uprobes.h>
41 42
@@ -60,8 +61,6 @@ static struct percpu_rw_semaphore dup_mmap_sem;
60 61
61/* Have a copy of original instruction */ 62/* Have a copy of original instruction */
62#define UPROBE_COPY_INSN 0 63#define UPROBE_COPY_INSN 0
63/* Can skip singlestep */
64#define UPROBE_SKIP_SSTEP 1
65 64
66struct uprobe { 65struct uprobe {
67 struct rb_node rb_node; /* node in the rb tree */ 66 struct rb_node rb_node; /* node in the rb tree */
@@ -129,7 +128,7 @@ struct xol_area {
129 */ 128 */
130static bool valid_vma(struct vm_area_struct *vma, bool is_register) 129static bool valid_vma(struct vm_area_struct *vma, bool is_register)
131{ 130{
132 vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; 131 vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
133 132
134 if (is_register) 133 if (is_register)
135 flags |= VM_WRITE; 134 flags |= VM_WRITE;
@@ -281,18 +280,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
281 * supported by that architecture then we need to modify is_trap_at_addr and 280 * supported by that architecture then we need to modify is_trap_at_addr and
282 * uprobe_write_opcode accordingly. This would never be a problem for archs 281 * uprobe_write_opcode accordingly. This would never be a problem for archs
283 * that have fixed length instructions. 282 * that have fixed length instructions.
284 */ 283 *
285
286/*
287 * uprobe_write_opcode - write the opcode at a given virtual address. 284 * uprobe_write_opcode - write the opcode at a given virtual address.
288 * @mm: the probed process address space. 285 * @mm: the probed process address space.
289 * @vaddr: the virtual address to store the opcode. 286 * @vaddr: the virtual address to store the opcode.
290 * @opcode: opcode to be written at @vaddr. 287 * @opcode: opcode to be written at @vaddr.
291 * 288 *
292 * Called with mm->mmap_sem held (for read and with a reference to 289 * Called with mm->mmap_sem held for write.
293 * mm).
294 *
295 * For mm @mm, write the opcode at @vaddr.
296 * Return 0 (success) or a negative errno. 290 * Return 0 (success) or a negative errno.
297 */ 291 */
298int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, 292int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
@@ -312,21 +306,25 @@ retry:
312 if (ret <= 0) 306 if (ret <= 0)
313 goto put_old; 307 goto put_old;
314 308
309 ret = anon_vma_prepare(vma);
310 if (ret)
311 goto put_old;
312
315 ret = -ENOMEM; 313 ret = -ENOMEM;
316 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 314 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
317 if (!new_page) 315 if (!new_page)
318 goto put_old; 316 goto put_old;
319 317
320 __SetPageUptodate(new_page); 318 if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
319 goto put_new;
321 320
321 __SetPageUptodate(new_page);
322 copy_highpage(new_page, old_page); 322 copy_highpage(new_page, old_page);
323 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); 323 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
324 324
325 ret = anon_vma_prepare(vma);
326 if (ret)
327 goto put_new;
328
329 ret = __replace_page(vma, vaddr, old_page, new_page); 325 ret = __replace_page(vma, vaddr, old_page, new_page);
326 if (ret)
327 mem_cgroup_uncharge_page(new_page);
330 328
331put_new: 329put_new:
332 page_cache_release(new_page); 330 page_cache_release(new_page);
@@ -491,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
491 uprobe->offset = offset; 489 uprobe->offset = offset;
492 init_rwsem(&uprobe->register_rwsem); 490 init_rwsem(&uprobe->register_rwsem);
493 init_rwsem(&uprobe->consumer_rwsem); 491 init_rwsem(&uprobe->consumer_rwsem);
494 /* For now assume that the instruction need not be single-stepped */
495 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
496 492
497 /* add to uprobes_tree, sorted on inode:offset */ 493 /* add to uprobes_tree, sorted on inode:offset */
498 cur_uprobe = insert_uprobe(uprobe); 494 cur_uprobe = insert_uprobe(uprobe);
499
500 /* a uprobe exists for this inode:offset combination */ 495 /* a uprobe exists for this inode:offset combination */
501 if (cur_uprobe) { 496 if (cur_uprobe) {
502 kfree(uprobe); 497 kfree(uprobe);
@@ -542,14 +537,15 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
542 void *insn, int nbytes, loff_t offset) 537 void *insn, int nbytes, loff_t offset)
543{ 538{
544 struct page *page; 539 struct page *page;
545
546 if (!mapping->a_ops->readpage)
547 return -EIO;
548 /* 540 /*
549 * Ensure that the page that has the original instruction is 541 * Ensure that the page that has the original instruction is populated
550 * populated and in page-cache. 542 * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
543 * see uprobe_register().
551 */ 544 */
552 page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); 545 if (mapping->a_ops->readpage)
546 page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
547 else
548 page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
553 if (IS_ERR(page)) 549 if (IS_ERR(page))
554 return PTR_ERR(page); 550 return PTR_ERR(page);
555 551
@@ -850,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
850{ 846{
851 int err; 847 int err;
852 848
853 if (!consumer_del(uprobe, uc)) /* WARN? */ 849 if (WARN_ON(!consumer_del(uprobe, uc)))
854 return; 850 return;
855 851
856 err = register_for_each_vma(uprobe, NULL); 852 err = register_for_each_vma(uprobe, NULL);
@@ -885,6 +881,9 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
885 if (!uc->handler && !uc->ret_handler) 881 if (!uc->handler && !uc->ret_handler)
886 return -EINVAL; 882 return -EINVAL;
887 883
884 /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
885 if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
886 return -EIO;
888 /* Racy, just to catch the obvious mistakes */ 887 /* Racy, just to catch the obvious mistakes */
889 if (offset > i_size_read(inode)) 888 if (offset > i_size_read(inode))
890 return -EINVAL; 889 return -EINVAL;
@@ -928,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset,
928 int ret = -ENOENT; 927 int ret = -ENOENT;
929 928
930 uprobe = find_uprobe(inode, offset); 929 uprobe = find_uprobe(inode, offset);
931 if (!uprobe) 930 if (WARN_ON(!uprobe))
932 return ret; 931 return ret;
933 932
934 down_write(&uprobe->register_rwsem); 933 down_write(&uprobe->register_rwsem);
@@ -953,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
953 struct uprobe *uprobe; 952 struct uprobe *uprobe;
954 953
955 uprobe = find_uprobe(inode, offset); 954 uprobe = find_uprobe(inode, offset);
956 if (!uprobe) 955 if (WARN_ON(!uprobe))
957 return; 956 return;
958 957
959 down_write(&uprobe->register_rwsem); 958 down_write(&uprobe->register_rwsem);
@@ -1296,14 +1295,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1296 if (unlikely(!xol_vaddr)) 1295 if (unlikely(!xol_vaddr))
1297 return 0; 1296 return 0;
1298 1297
1299 /* Initialize the slot */ 1298 arch_uprobe_copy_ixol(area->page, xol_vaddr,
1300 copy_to_page(area->page, xol_vaddr, 1299 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1301 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1302 /*
1303 * We probably need flush_icache_user_range() but it needs vma.
1304 * This should work on supported architectures too.
1305 */
1306 flush_dcache_page(area->page);
1307 1300
1308 return xol_vaddr; 1301 return xol_vaddr;
1309} 1302}
@@ -1346,6 +1339,21 @@ static void xol_free_insn_slot(struct task_struct *tsk)
1346 } 1339 }
1347} 1340}
1348 1341
1342void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
1343 void *src, unsigned long len)
1344{
1345 /* Initialize the slot */
1346 copy_to_page(page, vaddr, src, len);
1347
1348 /*
1349 * We probably need flush_icache_user_range() but it needs vma.
1350 * This should work on most architectures by default. If an
1351 * architecture needs to do something different, it can define
1352 * its own version of the function.
1353 */
1354 flush_dcache_page(page);
1355}
1356
1349/** 1357/**
1350 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs 1358 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
1351 * @regs: Reflects the saved state of the task after it has hit a breakpoint 1359 * @regs: Reflects the saved state of the task after it has hit a breakpoint
@@ -1357,6 +1365,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1357 return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE; 1365 return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
1358} 1366}
1359 1367
1368unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
1369{
1370 struct uprobe_task *utask = current->utask;
1371
1372 if (unlikely(utask && utask->active_uprobe))
1373 return utask->vaddr;
1374
1375 return instruction_pointer(regs);
1376}
1377
1360/* 1378/*
1361 * Called with no locks held. 1379 * Called with no locks held.
1362 * Called in context of a exiting or a exec-ing thread. 1380 * Called in context of a exiting or a exec-ing thread.
@@ -1628,20 +1646,6 @@ bool uprobe_deny_signal(void)
1628 return true; 1646 return true;
1629} 1647}
1630 1648
1631/*
1632 * Avoid singlestepping the original instruction if the original instruction
1633 * is a NOP or can be emulated.
1634 */
1635static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1636{
1637 if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) {
1638 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1639 return true;
1640 clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
1641 }
1642 return false;
1643}
1644
1645static void mmf_recalc_uprobes(struct mm_struct *mm) 1649static void mmf_recalc_uprobes(struct mm_struct *mm)
1646{ 1650{
1647 struct vm_area_struct *vma; 1651 struct vm_area_struct *vma;
@@ -1868,13 +1872,13 @@ static void handle_swbp(struct pt_regs *regs)
1868 1872
1869 handler_chain(uprobe, regs); 1873 handler_chain(uprobe, regs);
1870 1874
1871 if (can_skip_sstep(uprobe, regs)) 1875 if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
1872 goto out; 1876 goto out;
1873 1877
1874 if (!pre_ssout(uprobe, regs, bp_vaddr)) 1878 if (!pre_ssout(uprobe, regs, bp_vaddr))
1875 return; 1879 return;
1876 1880
1877 /* can_skip_sstep() succeeded, or restart if can't singlestep */ 1881 /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
1878out: 1882out:
1879 put_uprobe(uprobe); 1883 put_uprobe(uprobe);
1880} 1884}
@@ -1886,10 +1890,11 @@ out:
1886static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) 1890static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1887{ 1891{
1888 struct uprobe *uprobe; 1892 struct uprobe *uprobe;
1893 int err = 0;
1889 1894
1890 uprobe = utask->active_uprobe; 1895 uprobe = utask->active_uprobe;
1891 if (utask->state == UTASK_SSTEP_ACK) 1896 if (utask->state == UTASK_SSTEP_ACK)
1892 arch_uprobe_post_xol(&uprobe->arch, regs); 1897 err = arch_uprobe_post_xol(&uprobe->arch, regs);
1893 else if (utask->state == UTASK_SSTEP_TRAPPED) 1898 else if (utask->state == UTASK_SSTEP_TRAPPED)
1894 arch_uprobe_abort_xol(&uprobe->arch, regs); 1899 arch_uprobe_abort_xol(&uprobe->arch, regs);
1895 else 1900 else
@@ -1903,6 +1908,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1903 spin_lock_irq(&current->sighand->siglock); 1908 spin_lock_irq(&current->sighand->siglock);
1904 recalc_sigpending(); /* see uprobe_deny_signal() */ 1909 recalc_sigpending(); /* see uprobe_deny_signal() */
1905 spin_unlock_irq(&current->sighand->siglock); 1910 spin_unlock_irq(&current->sighand->siglock);
1911
1912 if (unlikely(err)) {
1913 uprobe_warn(current, "execute the probed insn, sending SIGILL.");
1914 force_sig_info(SIGILL, SEND_SIG_FORCED, current);
1915 }
1906} 1916}
1907 1917
1908/* 1918/*
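
The xol_get_insn_slot() change above moves slot initialization into arch_uprobe_copy_ixol(), defined __weak so an architecture with different cache-maintenance needs can provide its own copy routine. A small userspace sketch of the weak-symbol override pattern follows (a GCC/Clang extension); copy_ixol() is an illustrative stand-in, not the kernel function.

#include <stdio.h>
#include <string.h>

/* Generic default, standing in for copy_to_page() + flush_dcache_page(). */
__attribute__((weak)) void copy_ixol(char *dst, const char *src, size_t len)
{
	memcpy(dst, src, len);
	puts("generic copy_ixol: plain copy + dcache flush");
}

/*
 * An architecture needing extra icache maintenance would supply a strong
 * copy_ixol() definition in its own object file, silently replacing the
 * weak default above at link time.
 */

int main(void)
{
	char slot[16];

	copy_ixol(slot, "\xcc", 1);	/* e.g. copy the breakpoint insn */
	return 0;
}
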
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0dbeae374225..83d4382f5699 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -37,7 +37,7 @@ static unsigned long ident_map[32] = {
37struct exec_domain default_exec_domain = { 37struct exec_domain default_exec_domain = {
38 .name = "Linux", /* name */ 38 .name = "Linux", /* name */
39 .handler = default_handler, /* lcall7 causes a seg fault. */ 39 .handler = default_handler, /* lcall7 causes a seg fault. */
40 .pers_low = 0, /* PER_LINUX personality. */ 40 .pers_low = 0, /* PER_LINUX personality. */
41 .pers_high = 0, /* PER_LINUX personality. */ 41 .pers_high = 0, /* PER_LINUX personality. */
42 .signal_map = ident_map, /* Identity map signals. */ 42 .signal_map = ident_map, /* Identity map signals. */
43 .signal_invmap = ident_map, /* - both ways. */ 43 .signal_invmap = ident_map, /* - both ways. */
@@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality)
83 ep = &default_exec_domain; 83 ep = &default_exec_domain;
84out: 84out:
85 read_unlock(&exec_domains_lock); 85 read_unlock(&exec_domains_lock);
86 return (ep); 86 return ep;
87} 87}
88 88
89int 89int
@@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep)
110 110
111out: 111out:
112 write_unlock(&exec_domains_lock); 112 write_unlock(&exec_domains_lock);
113 return (err); 113 return err;
114} 114}
115EXPORT_SYMBOL(register_exec_domain);
115 116
116int 117int
117unregister_exec_domain(struct exec_domain *ep) 118unregister_exec_domain(struct exec_domain *ep)
@@ -133,6 +134,7 @@ unregister:
133 write_unlock(&exec_domains_lock); 134 write_unlock(&exec_domains_lock);
134 return 0; 135 return 0;
135} 136}
137EXPORT_SYMBOL(unregister_exec_domain);
136 138
137int __set_personality(unsigned int personality) 139int __set_personality(unsigned int personality)
138{ 140{
@@ -144,6 +146,7 @@ int __set_personality(unsigned int personality)
144 146
145 return 0; 147 return 0;
146} 148}
149EXPORT_SYMBOL(__set_personality);
147 150
148#ifdef CONFIG_PROC_FS 151#ifdef CONFIG_PROC_FS
149static int execdomains_proc_show(struct seq_file *m, void *v) 152static int execdomains_proc_show(struct seq_file *m, void *v)
@@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality)
188 191
189 return old; 192 return old;
190} 193}
191
192
193EXPORT_SYMBOL(register_exec_domain);
194EXPORT_SYMBOL(unregister_exec_domain);
195EXPORT_SYMBOL(__set_personality);
diff --git a/kernel/exit.c b/kernel/exit.c
index 6ed6a1d552b5..e5c4668f1799 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -313,46 +313,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
313 } 313 }
314} 314}
315 315
316/* 316#ifdef CONFIG_MEMCG
317 * Let kernel threads use this to say that they allow a certain signal.
318 * Must not be used if kthread was cloned with CLONE_SIGHAND.
319 */
320int allow_signal(int sig)
321{
322 if (!valid_signal(sig) || sig < 1)
323 return -EINVAL;
324
325 spin_lock_irq(&current->sighand->siglock);
326 /* This is only needed for daemonize()'ed kthreads */
327 sigdelset(&current->blocked, sig);
328 /*
329 * Kernel threads handle their own signals. Let the signal code
330 * know it'll be handled, so that they don't get converted to
331 * SIGKILL or just silently dropped.
332 */
333 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
334 recalc_sigpending();
335 spin_unlock_irq(&current->sighand->siglock);
336 return 0;
337}
338
339EXPORT_SYMBOL(allow_signal);
340
341int disallow_signal(int sig)
342{
343 if (!valid_signal(sig) || sig < 1)
344 return -EINVAL;
345
346 spin_lock_irq(&current->sighand->siglock);
347 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
348 recalc_sigpending();
349 spin_unlock_irq(&current->sighand->siglock);
350 return 0;
351}
352
353EXPORT_SYMBOL(disallow_signal);
354
355#ifdef CONFIG_MM_OWNER
356/* 317/*
357 * A task is exiting. If it owned this mm, find a new owner for the mm. 318 * A task is exiting. If it owned this mm, find a new owner for the mm.
358 */ 319 */
@@ -395,14 +356,18 @@ retry:
395 } 356 }
396 357
397 /* 358 /*
398 * Search through everything else. We should not get 359 * Search through everything else, we should not get here often.
399 * here often
400 */ 360 */
401 do_each_thread(g, c) { 361 for_each_process(g) {
402 if (c->mm == mm) 362 if (g->flags & PF_KTHREAD)
403 goto assign_new_owner; 363 continue;
404 } while_each_thread(g, c); 364 for_each_thread(g, c) {
405 365 if (c->mm == mm)
366 goto assign_new_owner;
367 if (c->mm)
368 break;
369 }
370 }
406 read_unlock(&tasklist_lock); 371 read_unlock(&tasklist_lock);
407 /* 372 /*
408 * We found no owner yet mm_users > 1: this implies that we are 373 * We found no owner yet mm_users > 1: this implies that we are
@@ -434,7 +399,7 @@ assign_new_owner:
434 task_unlock(c); 399 task_unlock(c);
435 put_task_struct(c); 400 put_task_struct(c);
436} 401}
437#endif /* CONFIG_MM_OWNER */ 402#endif /* CONFIG_MEMCG */
438 403
439/* 404/*
440 * Turn us into a lazy TLB process if we 405 * Turn us into a lazy TLB process if we
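
mm_update_next_owner() now walks for_each_process()/for_each_thread() directly, skipping PF_KTHREAD tasks and abandoning a process's thread list as soon as one of its threads is seen with some other mm, since the threads of a process share their mm. A userspace model of that two-level search; the types and fields are simplified stand-ins, not the kernel's.

#include <stddef.h>
#include <stdio.h>

struct thread { const void *mm; };
struct proc   { int kthread; struct thread threads[2]; size_t nthreads; };

static const struct thread *find_owner(struct proc *procs, size_t n,
				       const void *mm)
{
	for (size_t p = 0; p < n; p++) {
		if (procs[p].kthread)		/* kernel threads own no mm */
			continue;
		for (size_t t = 0; t < procs[p].nthreads; t++) {
			const struct thread *c = &procs[p].threads[t];

			if (c->mm == mm)
				return c;	/* candidate for new owner  */
			if (c->mm)
				break;		/* threads share their mm   */
		}
	}
	return NULL;
}

int main(void)
{
	int target;
	struct proc procs[] = {
		{ .kthread = 1 },
		{ .threads = { { &target } }, .nthreads = 1 },
	};

	printf("owner %sfound\n", find_owner(procs, 2, &target) ? "" : "not ");
	return 0;
}
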
diff --git a/kernel/fork.c b/kernel/fork.c
index 54a8d26f612f..6a13c46cd87d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti)
150static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 150static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
151 int node) 151 int node)
152{ 152{
153 struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, 153 struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
154 THREAD_SIZE_ORDER); 154 THREAD_SIZE_ORDER);
155 155
156 return page ? page_address(page) : NULL; 156 return page ? page_address(page) : NULL;
157} 157}
158 158
159static inline void free_thread_info(struct thread_info *ti) 159static inline void free_thread_info(struct thread_info *ti)
160{ 160{
161 free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); 161 free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
162} 162}
163# else 163# else
164static struct kmem_cache *thread_info_cache; 164static struct kmem_cache *thread_info_cache;
@@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p)
1099#endif 1099#endif
1100} 1100}
1101 1101
1102#ifdef CONFIG_MM_OWNER 1102#ifdef CONFIG_MEMCG
1103void mm_init_owner(struct mm_struct *mm, struct task_struct *p) 1103void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1104{ 1104{
1105 mm->owner = p; 1105 mm->owner = p;
1106} 1106}
1107#endif /* CONFIG_MM_OWNER */ 1107#endif /* CONFIG_MEMCG */
1108 1108
1109/* 1109/*
1110 * Initialize POSIX timer handling for a single task. 1110 * Initialize POSIX timer handling for a single task.
@@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1487 1487
1488 total_forks++; 1488 total_forks++;
1489 spin_unlock(&current->sighand->siglock); 1489 spin_unlock(&current->sighand->siglock);
1490 syscall_tracepoint_update(p);
1490 write_unlock_irq(&tasklist_lock); 1491 write_unlock_irq(&tasklist_lock);
1492
1491 proc_fork_connector(p); 1493 proc_fork_connector(p);
1492 cgroup_post_fork(p); 1494 cgroup_post_fork(p);
1493 if (clone_flags & CLONE_THREAD) 1495 if (clone_flags & CLONE_THREAD)
@@ -1606,10 +1608,12 @@ long do_fork(unsigned long clone_flags,
1606 */ 1608 */
1607 if (!IS_ERR(p)) { 1609 if (!IS_ERR(p)) {
1608 struct completion vfork; 1610 struct completion vfork;
1611 struct pid *pid;
1609 1612
1610 trace_sched_process_fork(current, p); 1613 trace_sched_process_fork(current, p);
1611 1614
1612 nr = task_pid_vnr(p); 1615 pid = get_task_pid(p, PIDTYPE_PID);
1616 nr = pid_vnr(pid);
1613 1617
1614 if (clone_flags & CLONE_PARENT_SETTID) 1618 if (clone_flags & CLONE_PARENT_SETTID)
1615 put_user(nr, parent_tidptr); 1619 put_user(nr, parent_tidptr);
@@ -1624,12 +1628,14 @@ long do_fork(unsigned long clone_flags,
1624 1628
1625 /* forking complete and child started to run, tell ptracer */ 1629 /* forking complete and child started to run, tell ptracer */
1626 if (unlikely(trace)) 1630 if (unlikely(trace))
1627 ptrace_event(trace, nr); 1631 ptrace_event_pid(trace, pid);
1628 1632
1629 if (clone_flags & CLONE_VFORK) { 1633 if (clone_flags & CLONE_VFORK) {
1630 if (!wait_for_vfork_done(p, &vfork)) 1634 if (!wait_for_vfork_done(p, &vfork))
1631 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); 1635 ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
1632 } 1636 }
1637
1638 put_pid(pid);
1633 } else { 1639 } else {
1634 nr = PTR_ERR(p); 1640 nr = PTR_ERR(p);
1635 } 1641 }
diff --git a/kernel/futex.c b/kernel/futex.c
index 5f589279e462..b632b5f3f094 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -267,7 +267,7 @@ static inline void futex_get_mm(union futex_key *key)
267 * get_futex_key() implies a full barrier. This is relied upon 267 * get_futex_key() implies a full barrier. This is relied upon
268 * as full barrier (B), see the ordering comment above. 268 * as full barrier (B), see the ordering comment above.
269 */ 269 */
270 smp_mb__after_atomic_inc(); 270 smp_mb__after_atomic();
271} 271}
272 272
273/* 273/*
@@ -280,7 +280,7 @@ static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
280 /* 280 /*
281 * Full barrier (A), see the ordering comment above. 281 * Full barrier (A), see the ordering comment above.
282 */ 282 */
283 smp_mb__after_atomic_inc(); 283 smp_mb__after_atomic();
284#endif 284#endif
285} 285}
286 286
@@ -743,6 +743,55 @@ void exit_pi_state_list(struct task_struct *curr)
743 raw_spin_unlock_irq(&curr->pi_lock); 743 raw_spin_unlock_irq(&curr->pi_lock);
744} 744}
745 745
746/*
747 * We need to check the following states:
748 *
749 * Waiter | pi_state | pi->owner | uTID | uODIED | ?
750 *
751 * [1] NULL | --- | --- | 0 | 0/1 | Valid
752 * [2] NULL | --- | --- | >0 | 0/1 | Valid
753 *
754 * [3] Found | NULL | -- | Any | 0/1 | Invalid
755 *
756 * [4] Found | Found | NULL | 0 | 1 | Valid
757 * [5] Found | Found | NULL | >0 | 1 | Invalid
758 *
759 * [6] Found | Found | task | 0 | 1 | Valid
760 *
761 * [7] Found | Found | NULL | Any | 0 | Invalid
762 *
763 * [8] Found | Found | task | ==taskTID | 0/1 | Valid
764 * [9] Found | Found | task | 0 | 0 | Invalid
765 * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
766 *
767 * [1] Indicates that the kernel can acquire the futex atomically. We
768 * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
769 *
770 * [2] Valid, if TID does not belong to a kernel thread. If no matching
771 * thread is found then it indicates that the owner TID has died.
772 *
773 * [3] Invalid. The waiter is queued on a non PI futex
774 *
775 * [4] Valid state after exit_robust_list(), which sets the user space
776 * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
777 *
778 * [5] The user space value got manipulated between exit_robust_list()
779 * and exit_pi_state_list()
780 *
781 * [6] Valid state after exit_pi_state_list() which sets the new owner in
782 * the pi_state but cannot access the user space value.
783 *
784 * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
785 *
786 * [8] Owner and user space value match
787 *
788 * [9] There is no transient state which sets the user space TID to 0
789 * except exit_robust_list(), but this is indicated by the
790 * FUTEX_OWNER_DIED bit. See [4]
791 *
792 * [10] There is no transient state which leaves owner and user space
793 * TID out of sync.
794 */
746static int 795static int
747lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, 796lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
748 union futex_key *key, struct futex_pi_state **ps) 797 union futex_key *key, struct futex_pi_state **ps)
@@ -755,12 +804,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
755 plist_for_each_entry_safe(this, next, &hb->chain, list) { 804 plist_for_each_entry_safe(this, next, &hb->chain, list) {
756 if (match_futex(&this->key, key)) { 805 if (match_futex(&this->key, key)) {
757 /* 806 /*
758 * Another waiter already exists - bump up 807 * Sanity check the waiter before increasing
759 * the refcount and return its pi_state: 808 * the refcount and attaching to it.
760 */ 809 */
761 pi_state = this->pi_state; 810 pi_state = this->pi_state;
762 /* 811 /*
763 * Userspace might have messed up non-PI and PI futexes 812 * Userspace might have messed up non-PI and
813 * PI futexes [3]
764 */ 814 */
765 if (unlikely(!pi_state)) 815 if (unlikely(!pi_state))
766 return -EINVAL; 816 return -EINVAL;
@@ -768,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
768 WARN_ON(!atomic_read(&pi_state->refcount)); 818 WARN_ON(!atomic_read(&pi_state->refcount));
769 819
770 /* 820 /*
771 * When pi_state->owner is NULL then the owner died 821 * Handle the owner died case:
772 * and another waiter is on the fly. pi_state->owner
773 * is fixed up by the task which acquires
774 * pi_state->rt_mutex.
775 *
776 * We do not check for pid == 0 which can happen when
777 * the owner died and robust_list_exit() cleared the
778 * TID.
779 */ 822 */
780 if (pid && pi_state->owner) { 823 if (uval & FUTEX_OWNER_DIED) {
824 /*
825 * exit_pi_state_list sets owner to NULL and
826 * wakes the topmost waiter. The task which
827 * acquires the pi_state->rt_mutex will fixup
828 * owner.
829 */
830 if (!pi_state->owner) {
831 /*
832 * No pi state owner, but the user
833 * space TID is not 0. Inconsistent
834 * state. [5]
835 */
836 if (pid)
837 return -EINVAL;
838 /*
839 * Take a ref on the state and
840 * return. [4]
841 */
842 goto out_state;
843 }
844
781 /* 845 /*
782 * Bail out if user space manipulated the 846 * If TID is 0, then either the dying owner
783 * futex value. 847 * has not yet executed exit_pi_state_list()
848 * or some waiter acquired the rtmutex in the
849 * pi state, but did not yet fixup the TID in
850 * user space.
851 *
852 * Take a ref on the state and return. [6]
784 */ 853 */
785 if (pid != task_pid_vnr(pi_state->owner)) 854 if (!pid)
855 goto out_state;
856 } else {
857 /*
858 * If the owner died bit is not set,
859 * then the pi_state must have an
860 * owner. [7]
861 */
862 if (!pi_state->owner)
786 return -EINVAL; 863 return -EINVAL;
787 } 864 }
788 865
866 /*
867 * Bail out if user space manipulated the
868 * futex value. If pi state exists then the
869 * owner TID must be the same as the user
870 * space TID. [9/10]
871 */
872 if (pid != task_pid_vnr(pi_state->owner))
873 return -EINVAL;
874
875 out_state:
789 atomic_inc(&pi_state->refcount); 876 atomic_inc(&pi_state->refcount);
790 *ps = pi_state; 877 *ps = pi_state;
791
792 return 0; 878 return 0;
793 } 879 }
794 } 880 }
795 881
796 /* 882 /*
797 * We are the first waiter - try to look up the real owner and attach 883 * We are the first waiter - try to look up the real owner and attach
798 * the new pi_state to it, but bail out when TID = 0 884 * the new pi_state to it, but bail out when TID = 0 [1]
799 */ 885 */
800 if (!pid) 886 if (!pid)
801 return -ESRCH; 887 return -ESRCH;
@@ -803,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
803 if (!p) 889 if (!p)
804 return -ESRCH; 890 return -ESRCH;
805 891
892 if (!p->mm) {
893 put_task_struct(p);
894 return -EPERM;
895 }
896
806 /* 897 /*
807 * We need to look at the task state flags to figure out, 898 * We need to look at the task state flags to figure out,
808 * whether the task is exiting. To protect against the do_exit 899 * whether the task is exiting. To protect against the do_exit
@@ -823,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
823 return ret; 914 return ret;
824 } 915 }
825 916
917 /*
918 * No existing pi state. First waiter. [2]
919 */
826 pi_state = alloc_pi_state(); 920 pi_state = alloc_pi_state();
827 921
828 /* 922 /*
@@ -894,10 +988,18 @@ retry:
894 return -EDEADLK; 988 return -EDEADLK;
895 989
896 /* 990 /*
897 * Surprise - we got the lock. Just return to userspace: 991 * Surprise - we got the lock, but we do not trust user space at all.
898 */ 992 */
899 if (unlikely(!curval)) 993 if (unlikely(!curval)) {
900 return 1; 994 /*
995 * We verify whether there is kernel state for this
996 * futex. If not, we can safely assume, that the 0 ->
997 * TID transition is correct. If state exists, we do
998 * not bother to fixup the user space state as it was
999 * corrupted already.
1000 */
1001 return futex_top_waiter(hb, key) ? -EINVAL : 1;
1002 }
901 1003
902 uval = curval; 1004 uval = curval;
903 1005
@@ -1028,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1028 struct task_struct *new_owner; 1130 struct task_struct *new_owner;
1029 struct futex_pi_state *pi_state = this->pi_state; 1131 struct futex_pi_state *pi_state = this->pi_state;
1030 u32 uninitialized_var(curval), newval; 1132 u32 uninitialized_var(curval), newval;
1133 int ret = 0;
1031 1134
1032 if (!pi_state) 1135 if (!pi_state)
1033 return -EINVAL; 1136 return -EINVAL;
@@ -1051,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
1051 new_owner = this->task; 1154 new_owner = this->task;
1052 1155
1053 /* 1156 /*
1054 * We pass it to the next owner. (The WAITERS bit is always 1157 * We pass it to the next owner. The WAITERS bit is always
1055 * kept enabled while there is PI state around. We must also 1158 * kept enabled while there is PI state around. We cleanup the
1056 * preserve the owner died bit.) 1159 * owner died bit, because we are the owner.
1057 */ 1160 */
1058 if (!(uval & FUTEX_OWNER_DIED)) { 1161 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1059 int ret = 0;
1060 1162
1061 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 1163 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1062 1164 ret = -EFAULT;
1063 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 1165 else if (curval != uval)
1064 ret = -EFAULT; 1166 ret = -EINVAL;
1065 else if (curval != uval) 1167 if (ret) {
1066 ret = -EINVAL; 1168 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1067 if (ret) { 1169 return ret;
1068 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1069 return ret;
1070 }
1071 } 1170 }
1072 1171
1073 raw_spin_lock_irq(&pi_state->owner->pi_lock); 1172 raw_spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1347,7 +1446,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1347 * 1446 *
1348 * Return: 1447 * Return:
1349 * 0 - failed to acquire the lock atomically; 1448 * 0 - failed to acquire the lock atomically;
1350 * 1 - acquired the lock; 1449 * >0 - acquired the lock, return value is vpid of the top_waiter
1351 * <0 - error 1450 * <0 - error
1352 */ 1451 */
1353static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1452static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1358,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1358{ 1457{
1359 struct futex_q *top_waiter = NULL; 1458 struct futex_q *top_waiter = NULL;
1360 u32 curval; 1459 u32 curval;
1361 int ret; 1460 int ret, vpid;
1362 1461
1363 if (get_futex_value_locked(&curval, pifutex)) 1462 if (get_futex_value_locked(&curval, pifutex))
1364 return -EFAULT; 1463 return -EFAULT;
@@ -1386,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1386 * the contended case or if set_waiters is 1. The pi_state is returned 1485 * the contended case or if set_waiters is 1. The pi_state is returned
1387 * in ps in contended cases. 1486 * in ps in contended cases.
1388 */ 1487 */
1488 vpid = task_pid_vnr(top_waiter->task);
1389 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1489 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1390 set_waiters); 1490 set_waiters);
1391 if (ret == 1) 1491 if (ret == 1) {
1392 requeue_pi_wake_futex(top_waiter, key2, hb2); 1492 requeue_pi_wake_futex(top_waiter, key2, hb2);
1393 1493 return vpid;
1494 }
1394 return ret; 1495 return ret;
1395} 1496}
1396 1497
@@ -1421,10 +1522,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1421 struct futex_pi_state *pi_state = NULL; 1522 struct futex_pi_state *pi_state = NULL;
1422 struct futex_hash_bucket *hb1, *hb2; 1523 struct futex_hash_bucket *hb1, *hb2;
1423 struct futex_q *this, *next; 1524 struct futex_q *this, *next;
1424 u32 curval2;
1425 1525
1426 if (requeue_pi) { 1526 if (requeue_pi) {
1427 /* 1527 /*
1528 * Requeue PI only works on two distinct uaddrs. This
1529 * check is only valid for private futexes. See below.
1530 */
1531 if (uaddr1 == uaddr2)
1532 return -EINVAL;
1533
1534 /*
1428 * requeue_pi requires a pi_state, try to allocate it now 1535 * requeue_pi requires a pi_state, try to allocate it now
1429 * without any locks in case it fails. 1536 * without any locks in case it fails.
1430 */ 1537 */
@@ -1462,6 +1569,15 @@ retry:
1462 if (unlikely(ret != 0)) 1569 if (unlikely(ret != 0))
1463 goto out_put_key1; 1570 goto out_put_key1;
1464 1571
1572 /*
1573 * The check above which compares uaddrs is not sufficient for
1574 * shared futexes. We need to compare the keys:
1575 */
1576 if (requeue_pi && match_futex(&key1, &key2)) {
1577 ret = -EINVAL;
1578 goto out_put_keys;
1579 }
1580
1465 hb1 = hash_futex(&key1); 1581 hb1 = hash_futex(&key1);
1466 hb2 = hash_futex(&key2); 1582 hb2 = hash_futex(&key2);
1467 1583
@@ -1509,16 +1625,25 @@ retry_private:
1509 * At this point the top_waiter has either taken uaddr2 or is 1625 * At this point the top_waiter has either taken uaddr2 or is
1510 * waiting on it. If the former, then the pi_state will not 1626 * waiting on it. If the former, then the pi_state will not
1511 * exist yet, look it up one more time to ensure we have a 1627 * exist yet, look it up one more time to ensure we have a
1512 * reference to it. 1628 * reference to it. If the lock was taken, ret contains the
1629 * vpid of the top waiter task.
1513 */ 1630 */
1514 if (ret == 1) { 1631 if (ret > 0) {
1515 WARN_ON(pi_state); 1632 WARN_ON(pi_state);
1516 drop_count++; 1633 drop_count++;
1517 task_count++; 1634 task_count++;
1518 ret = get_futex_value_locked(&curval2, uaddr2); 1635 /*
1519 if (!ret) 1636 * If we acquired the lock, then the user
1520 ret = lookup_pi_state(curval2, hb2, &key2, 1637 * space value of uaddr2 should be vpid. It
1521 &pi_state); 1638 * cannot be changed by the top waiter as it
1639 * is blocked on hb2 lock if it tries to do
1640 * so. If something fiddled with it behind our
1641 * back the pi state lookup might unearth
1642 * it. So we rather use the known value than
1643 * rereading and handing potential crap to
1644 * lookup_pi_state.
1645 */
1646 ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
1522 } 1647 }
1523 1648
1524 switch (ret) { 1649 switch (ret) {
@@ -2301,9 +2426,10 @@ retry:
2301 /* 2426 /*
2302 * To avoid races, try to do the TID -> 0 atomic transition 2427 * To avoid races, try to do the TID -> 0 atomic transition
2303 * again. If it succeeds then we can return without waking 2428 * again. If it succeeds then we can return without waking
2304 * anyone else up: 2429 * anyone else up. We only try this if neither the waiters nor
2430 * the owner died bit are set.
2305 */ 2431 */
2306 if (!(uval & FUTEX_OWNER_DIED) && 2432 if (!(uval & ~FUTEX_TID_MASK) &&
2307 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) 2433 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2308 goto pi_faulted; 2434 goto pi_faulted;
2309 /* 2435 /*
@@ -2333,11 +2459,9 @@ retry:
2333 /* 2459 /*
2334 * No waiters - kernel unlocks the futex: 2460 * No waiters - kernel unlocks the futex:
2335 */ 2461 */
2336 if (!(uval & FUTEX_OWNER_DIED)) { 2462 ret = unlock_futex_pi(uaddr, uval);
2337 ret = unlock_futex_pi(uaddr, uval); 2463 if (ret == -EFAULT)
2338 if (ret == -EFAULT) 2464 goto pi_faulted;
2339 goto pi_faulted;
2340 }
2341 2465
2342out_unlock: 2466out_unlock:
2343 spin_unlock(&hb->lock); 2467 spin_unlock(&hb->lock);
@@ -2499,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2499 if (ret) 2623 if (ret)
2500 goto out_key2; 2624 goto out_key2;
2501 2625
2626 /*
2627 * The check above which compares uaddrs is not sufficient for
2628 * shared futexes. We need to compare the keys:
2629 */
2630 if (match_futex(&q.key, &key2)) {
2631 ret = -EINVAL;
2632 goto out_put_keys;
2633 }
2634
2502 /* Queue the futex_q, drop the hb lock, wait for wakeup. */ 2635 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2503 futex_wait_queue_me(hb, &q, to); 2636 futex_wait_queue_me(hb, &q, to);
2504 2637
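
The reworked lookup_pi_state() above encodes the state table added in the same hunk: case [3] rejects a waiter without pi_state, the FUTEX_OWNER_DIED branch distinguishes [4]/[5]/[6], a missing owner without the died bit is [7], and the final TID comparison separates [8] from [9]/[10]. A compact userspace model of those checks; the struct and fields are simplified stand-ins, not the kernel's.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct pi_model {
	bool has_pi_state;	/* the found waiter carries a pi_state	*/
	bool has_owner;		/* pi_state->owner != NULL		*/
	unsigned int owner_tid;	/* task_pid_vnr(pi_state->owner)	*/
};

/* Returns 0 when attaching to the existing pi_state is allowed. */
static int check_attach(const struct pi_model *s, unsigned int utid,
			bool owner_died)
{
	if (!s->has_pi_state)
		return -EINVAL;				/* [3]		*/

	if (owner_died) {
		if (!s->has_owner)
			return utid ? -EINVAL : 0;	/* [5] / [4]	*/
		if (!utid)
			return 0;			/* [6]		*/
	} else if (!s->has_owner) {
		return -EINVAL;				/* [7]		*/
	}
	/* [8] when the TIDs match, [9]/[10] otherwise. */
	return utid == s->owner_tid ? 0 : -EINVAL;
}

int main(void)
{
	struct pi_model s = { true, true, 1234 };

	printf("%d %d\n", check_attach(&s, 1234, false),	/*   0	*/
			  check_attach(&s, 0, false));		/* -22	*/
	return 0;
}
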
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index f45b75b713c0..b358a802fd18 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -85,6 +85,12 @@ void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
85} 85}
86EXPORT_SYMBOL(__gcov_merge_ior); 86EXPORT_SYMBOL(__gcov_merge_ior);
87 87
88void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
89{
90 /* Unused. */
91}
92EXPORT_SYMBOL(__gcov_merge_time_profile);
93
88/** 94/**
89 * gcov_enable_events - enable event reporting through gcov_event() 95 * gcov_enable_events - enable event reporting through gcov_event()
90 * 96 *
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 2c6e4631c814..826ba9fb5e32 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,12 @@
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include "gcov.h" 19#include "gcov.h"
20 20
21#if __GNUC__ == 4 && __GNUC_MINOR__ >= 9
22#define GCOV_COUNTERS 9
23#else
21#define GCOV_COUNTERS 8 24#define GCOV_COUNTERS 8
25#endif
26
22#define GCOV_TAG_FUNCTION_LENGTH 3 27#define GCOV_TAG_FUNCTION_LENGTH 3
23 28
24static struct gcov_info *gcov_info_head; 29static struct gcov_info *gcov_info_head;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6b715c0af1b1..3ab28993f6e0 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -990,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
990 /* Remove an active timer from the queue: */ 990 /* Remove an active timer from the queue: */
991 ret = remove_hrtimer(timer, base); 991 ret = remove_hrtimer(timer, base);
992 992
993 /* Switch the timer base, if necessary: */
994 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
995
996 if (mode & HRTIMER_MODE_REL) { 993 if (mode & HRTIMER_MODE_REL) {
997 tim = ktime_add_safe(tim, new_base->get_time()); 994 tim = ktime_add_safe(tim, base->get_time());
998 /* 995 /*
999 * CONFIG_TIME_LOW_RES is a temporary way for architectures 996 * CONFIG_TIME_LOW_RES is a temporary way for architectures
1000 * to signal that they simply return xtime in 997 * to signal that they simply return xtime in
@@ -1009,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1009 1006
1010 hrtimer_set_expires_range_ns(timer, tim, delta_ns); 1007 hrtimer_set_expires_range_ns(timer, tim, delta_ns);
1011 1008
1009 /* Switch the timer base, if necessary: */
1010 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
1011
1012 timer_stats_hrtimer_set_start_info(timer); 1012 timer_stats_hrtimer_set_start_info(timer);
1013 1013
1014 leftmost = enqueue_hrtimer(timer, new_base); 1014 leftmost = enqueue_hrtimer(timer, new_base);
@@ -1039,6 +1039,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1039 1039
1040 return ret; 1040 return ret;
1041} 1041}
1042EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
1042 1043
1043/** 1044/**
1044 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU 1045 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 06bb1417b063..06db12434d72 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic =
52 52
53static int __init hung_task_panic_setup(char *str) 53static int __init hung_task_panic_setup(char *str)
54{ 54{
55 sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); 55 int rc = kstrtouint(str, 0, &sysctl_hung_task_panic);
56 56
57 if (rc)
58 return rc;
57 return 1; 59 return 1;
58} 60}
59__setup("hung_task_panic=", hung_task_panic_setup); 61__setup("hung_task_panic=", hung_task_panic_setup);
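
The hung_task_panic= handler above switches from simple_strtoul() to kstrtouint(), which rejects overflow and trailing garbage instead of silently returning a partial value. A userspace sketch of that stricter parsing; parse_uint() is an illustrative stand-in, not a kernel helper.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_uint(const char *s, unsigned int *out)
{
	unsigned long val;
	char *end;

	errno = 0;
	val = strtoul(s, &end, 0);
	if (errno || end == s || *end != '\0' || val > UINT_MAX)
		return -EINVAL;		/* reject garbage and overflow */
	*out = (unsigned int)val;
	return 0;
}

int main(void)
{
	unsigned int v;

	printf("%d\n", parse_uint("1", &v));	/*  0, v == 1       */
	printf("%d\n", parse_uint("1x", &v));	/* -EINVAL, i.e. -22 */
	return 0;
}
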
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 07cbdfea9ae2..d269cecdfbf0 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -5,6 +5,10 @@ menu "IRQ subsystem"
5config MAY_HAVE_SPARSE_IRQ 5config MAY_HAVE_SPARSE_IRQ
6 bool 6 bool
7 7
8# Legacy support, required for itanic
9config GENERIC_IRQ_LEGACY
10 bool
11
8# Enable the generic irq autoprobe mechanism 12# Enable the generic irq autoprobe mechanism
9config GENERIC_IRQ_PROBE 13config GENERIC_IRQ_PROBE
10 bool 14 bool
@@ -17,6 +21,11 @@ config GENERIC_IRQ_SHOW
17config GENERIC_IRQ_SHOW_LEVEL 21config GENERIC_IRQ_SHOW_LEVEL
18 bool 22 bool
19 23
24# Facility to allocate a hardware interrupt. This is legacy support
25# and should not be used in new code. Use irq domains instead.
26config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
27 bool
28
20# Support for delayed migration from interrupt context 29# Support for delayed migration from interrupt context
21config GENERIC_PENDING_IRQ 30config GENERIC_PENDING_IRQ
22 bool 31 bool
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6397df2d6945..a2b28a2fd7b1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -40,10 +40,9 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
40 irq_put_desc_unlock(desc, flags); 40 irq_put_desc_unlock(desc, flags);
41 /* 41 /*
42 * For !CONFIG_SPARSE_IRQ make the irq show up in 42 * For !CONFIG_SPARSE_IRQ make the irq show up in
43 * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is 43 * allocated_irqs.
44 * already marked, and this call is harmless.
45 */ 44 */
46 irq_reserve_irq(irq); 45 irq_mark_irq(irq);
47 return 0; 46 return 0;
48} 47}
49EXPORT_SYMBOL(irq_set_chip); 48EXPORT_SYMBOL(irq_set_chip);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ddf1ffeb79f1..099ea2e0eb88 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -33,7 +33,7 @@ enum {
33}; 33};
34 34
35/* 35/*
36 * Bit masks for desc->state 36 * Bit masks for desc->core_internal_state__do_not_mess_with_it
37 * 37 *
38 * IRQS_AUTODETECT - autodetection in progress 38 * IRQS_AUTODETECT - autodetection in progress
39 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt 39 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
@@ -76,6 +76,12 @@ extern void mask_irq(struct irq_desc *desc);
76extern void unmask_irq(struct irq_desc *desc); 76extern void unmask_irq(struct irq_desc *desc);
77extern void unmask_threaded_irq(struct irq_desc *desc); 77extern void unmask_threaded_irq(struct irq_desc *desc);
78 78
79#ifdef CONFIG_SPARSE_IRQ
80static inline void irq_mark_irq(unsigned int irq) { }
81#else
82extern void irq_mark_irq(unsigned int irq);
83#endif
84
79extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 85extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
80 86
81irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); 87irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index bb07f2928f4b..1487a123db5c 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -278,7 +278,12 @@ EXPORT_SYMBOL(irq_to_desc);
278 278
279static void free_desc(unsigned int irq) 279static void free_desc(unsigned int irq)
280{ 280{
281 dynamic_irq_cleanup(irq); 281 struct irq_desc *desc = irq_to_desc(irq);
282 unsigned long flags;
283
284 raw_spin_lock_irqsave(&desc->lock, flags);
285 desc_set_defaults(irq, desc, desc_node(desc), NULL);
286 raw_spin_unlock_irqrestore(&desc->lock, flags);
282} 287}
283 288
284static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, 289static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
@@ -299,6 +304,20 @@ static int irq_expand_nr_irqs(unsigned int nr)
299 return -ENOMEM; 304 return -ENOMEM;
300} 305}
301 306
307void irq_mark_irq(unsigned int irq)
308{
309 mutex_lock(&sparse_irq_lock);
310 bitmap_set(allocated_irqs, irq, 1);
311 mutex_unlock(&sparse_irq_lock);
312}
313
314#ifdef CONFIG_GENERIC_IRQ_LEGACY
315void irq_init_desc(unsigned int irq)
316{
317 free_desc(irq);
318}
319#endif
320
302#endif /* !CONFIG_SPARSE_IRQ */ 321#endif /* !CONFIG_SPARSE_IRQ */
303 322
304/** 323/**
@@ -396,30 +415,56 @@ err:
396} 415}
397EXPORT_SYMBOL_GPL(__irq_alloc_descs); 416EXPORT_SYMBOL_GPL(__irq_alloc_descs);
398 417
418#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
399/** 419/**
400 * irq_reserve_irqs - mark irqs allocated 420 * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware
401 * @from: mark from irq number 421 * @cnt: number of interrupts to allocate
402 * @cnt: number of irqs to mark 422 * @node: node on which to allocate
403 * 423 *
404 * Returns 0 on success or an appropriate error code 424 * Returns an interrupt number > 0 or 0, if the allocation fails.
405 */ 425 */
406int irq_reserve_irqs(unsigned int from, unsigned int cnt) 426unsigned int irq_alloc_hwirqs(int cnt, int node)
407{ 427{
408 unsigned int start; 428 int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL);
409 int ret = 0;
410 429
411 if (!cnt || (from + cnt) > nr_irqs) 430 if (irq < 0)
412 return -EINVAL; 431 return 0;
413 432
414 mutex_lock(&sparse_irq_lock); 433 for (i = irq; cnt > 0; i++, cnt--) {
415 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); 434 if (arch_setup_hwirq(i, node))
416 if (start == from) 435 goto err;
417 bitmap_set(allocated_irqs, start, cnt); 436 irq_clear_status_flags(i, _IRQ_NOREQUEST);
418 else 437 }
419 ret = -EEXIST; 438 return irq;
420 mutex_unlock(&sparse_irq_lock); 439
421 return ret; 440err:
441 for (i--; i >= irq; i--) {
442 irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
443 arch_teardown_hwirq(i);
444 }
445 irq_free_descs(irq, cnt);
446 return 0;
447}
448EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
449
450/**
451 * irq_free_hwirqs - Free irq descriptor and cleanup the hardware
452 * @from: Free from irq number
453 * @cnt: number of interrupts to free
454 *
455 */
456void irq_free_hwirqs(unsigned int from, int cnt)
457{
458 int i, j;
459
460 for (i = from, j = cnt; j > 0; i++, j--) {
461 irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
462 arch_teardown_hwirq(i);
463 }
464 irq_free_descs(from, cnt);
422} 465}
466EXPORT_SYMBOL_GPL(irq_free_hwirqs);
467#endif
423 468
424/** 469/**
425 * irq_get_next_irq - get next allocated irq number 470 * irq_get_next_irq - get next allocated irq number
@@ -482,20 +527,6 @@ int irq_set_percpu_devid(unsigned int irq)
482 return 0; 527 return 0;
483} 528}
484 529
485/**
486 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
487 * @irq: irq number to initialize
488 */
489void dynamic_irq_cleanup(unsigned int irq)
490{
491 struct irq_desc *desc = irq_to_desc(irq);
492 unsigned long flags;
493
494 raw_spin_lock_irqsave(&desc->lock, flags);
495 desc_set_defaults(irq, desc, desc_node(desc), NULL);
496 raw_spin_unlock_irqrestore(&desc->lock, flags);
497}
498
499void kstat_incr_irq_this_cpu(unsigned int irq) 530void kstat_incr_irq_this_cpu(unsigned int irq)
500{ 531{
501 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); 532 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
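
irq_alloc_hwirqs() above sets up cnt consecutive descriptors and, when arch_setup_hwirq() fails partway through, tears down the ones already configured before freeing the whole range. A minimal userspace model of that allocate-then-unwind loop; setup() and teardown() are illustrative stand-ins.

#include <stdio.h>

static int setup(int i)
{
	printf("setup %d\n", i);
	return i == 3 ? -1 : 0;		/* pretend the third one fails */
}

static void teardown(int i)
{
	printf("teardown %d\n", i);
}

/* Returns the first index on success, -1 after rolling back on failure. */
static int alloc_range(int first, int cnt)
{
	int i;

	for (i = first; cnt > 0; i++, cnt--) {
		if (setup(i))
			goto err;
	}
	return first;
err:
	for (i--; i >= first; i--)	/* unwind what was already set up */
		teardown(i);
	return -1;
}

int main(void)
{
	printf("%d\n", alloc_range(1, 4));	/* fails at 3, unwinds 2 and 1 */
	return 0;
}
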
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index f14033700c25..eb5e10e32e05 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -27,14 +27,14 @@ static struct irq_domain *irq_default_domain;
27 * __irq_domain_add() - Allocate a new irq_domain data structure 27 * __irq_domain_add() - Allocate a new irq_domain data structure
28 * @of_node: optional device-tree node of the interrupt controller 28 * @of_node: optional device-tree node of the interrupt controller
29 * @size: Size of linear map; 0 for radix mapping only 29 * @size: Size of linear map; 0 for radix mapping only
30 * @hwirq_max: Maximum number of interrupts supported by controller
30 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no 31 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
31 * direct mapping 32 * direct mapping
32 * @ops: map/unmap domain callbacks 33 * @ops: map/unmap domain callbacks
33 * @host_data: Controller private data pointer 34 * @host_data: Controller private data pointer
34 * 35 *
35 * Allocates and initialize and irq_domain structure. Caller is expected to 36 * Allocates and initializes an irq_domain structure.
36 * register allocated irq_domain with irq_domain_register(). Returns pointer 37 * Returns pointer to IRQ domain, or NULL on failure.
37 * to IRQ domain, or NULL on failure.
38 */ 38 */
39struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, 39struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
40 irq_hw_number_t hwirq_max, int direct_max, 40 irq_hw_number_t hwirq_max, int direct_max,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d34131ca372b..3dc6a61bf06a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -886,8 +886,8 @@ static int irq_thread(void *data)
886 irq_thread_check_affinity(desc, action); 886 irq_thread_check_affinity(desc, action);
887 887
888 action_ret = handler_fn(desc, action); 888 action_ret = handler_fn(desc, action);
889 if (!noirqdebug) 889 if (action_ret == IRQ_HANDLED)
890 note_interrupt(action->irq, desc, action_ret); 890 atomic_inc(&desc->threads_handled);
891 891
892 wake_threads_waitq(desc); 892 wake_threads_waitq(desc);
893 } 893 }
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index a1d8cc63b56e..e2514b0e439e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -270,6 +270,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
270 return action && (action->flags & IRQF_IRQPOLL); 270 return action && (action->flags & IRQF_IRQPOLL);
271} 271}
272 272
273#define SPURIOUS_DEFERRED 0x80000000
274
273void note_interrupt(unsigned int irq, struct irq_desc *desc, 275void note_interrupt(unsigned int irq, struct irq_desc *desc,
274 irqreturn_t action_ret) 276 irqreturn_t action_ret)
275{ 277{
@@ -277,15 +279,111 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
277 irq_settings_is_polled(desc)) 279 irq_settings_is_polled(desc))
278 return; 280 return;
279 281
280 /* we get here again via the threaded handler */
281 if (action_ret == IRQ_WAKE_THREAD)
282 return;
283
284 if (bad_action_ret(action_ret)) { 282 if (bad_action_ret(action_ret)) {
285 report_bad_irq(irq, desc, action_ret); 283 report_bad_irq(irq, desc, action_ret);
286 return; 284 return;
287 } 285 }
288 286
287 /*
288 * We cannot call note_interrupt from the threaded handler
289 * because we need to look at the compound of all handlers
290 * (primary and threaded). Aside of that in the threaded
291 * shared case we have no serialization against an incoming
292 * hardware interrupt while we are dealing with a threaded
293 * result.
294 *
295 * So in case a thread is woken, we just note the fact and
296 * defer the analysis to the next hardware interrupt.
297 *
298 * The threaded handlers store whether they successfully
299 * handled an interrupt and we check whether that number
300 * changed versus the last invocation.
301 *
302 * We could handle all interrupts with the delayed-by-one
303 * mechanism, but for the non-forced threaded case we'd just
304 * add pointless overhead to the straight hardirq interrupts
305 * for the sake of a few lines less code.
306 */
307 if (action_ret & IRQ_WAKE_THREAD) {
308 /*
309 * There is a thread woken. Check whether one of the
310 * shared primary handlers returned IRQ_HANDLED. If
311 * not we defer the spurious detection to the next
312 * interrupt.
313 */
314 if (action_ret == IRQ_WAKE_THREAD) {
315 int handled;
316 /*
317 * We use bit 31 of thread_handled_last to
318 * denote the deferred spurious detection
319 * active. No locking necessary as
320 * thread_handled_last is only accessed here
321 * and we have the guarantee that hard
322 * interrupts are not reentrant.
323 */
324 if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) {
325 desc->threads_handled_last |= SPURIOUS_DEFERRED;
326 return;
327 }
328 /*
329 * Check whether one of the threaded handlers
330 * returned IRQ_HANDLED since the last
331 * interrupt happened.
332 *
333 * For simplicity we just set bit 31, as it is
334 * set in threads_handled_last as well. So we
335 * avoid extra masking. And we really do not
336 * care about the high bits of the handled
337 * count. We just care about the count being
338 * different than the one we saw before.
339 */
340 handled = atomic_read(&desc->threads_handled);
341 handled |= SPURIOUS_DEFERRED;
342 if (handled != desc->threads_handled_last) {
343 action_ret = IRQ_HANDLED;
344 /*
345 * Note: We keep the SPURIOUS_DEFERRED
346 * bit set. We are handling the
347 * previous invocation right now.
348 * Keep it for the current one, so the
349 * next hardware interrupt will
350 * account for it.
351 */
352 desc->threads_handled_last = handled;
353 } else {
354 /*
355 * None of the threaded handlers felt
356 * responsible for the last interrupt
357 *
358 * We keep the SPURIOUS_DEFERRED bit
359 * set in threads_handled_last as we
360 * need to account for the current
361 * interrupt as well.
362 */
363 action_ret = IRQ_NONE;
364 }
365 } else {
366 /*
367 * One of the primary handlers returned
368 * IRQ_HANDLED. So we don't care about the
369 * threaded handlers on the same line. Clear
370 * the deferred detection bit.
371 *
372 * In theory we could/should check whether the
373 * deferred bit is set and take the result of
374 * the previous run into account here as
375 * well. But it's really not worth the
376 * trouble. If every other interrupt is
377 * handled we never trigger the spurious
378 * detector. And if this is just the one out
379 * of 100k unhandled ones which is handled
380 * then we merely delay the spurious detection
381 * by one hard interrupt. Not a real problem.
382 */
383 desc->threads_handled_last &= ~SPURIOUS_DEFERRED;
384 }
385 }
386
289 if (unlikely(action_ret == IRQ_NONE)) { 387 if (unlikely(action_ret == IRQ_NONE)) {
290 /* 388 /*
291 * If we are seeing only the odd spurious IRQ caused by 389 * If we are seeing only the odd spurious IRQ caused by
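
The hunk above is a small state machine: the threaded handlers bump desc->threads_handled whenever they handle an interrupt, and note_interrupt() compares that counter against a snapshot whose bit 31 records that a deferral is already pending. A stand-alone C sketch of just that bookkeeping follows; the struct and function names are invented for illustration, and only the SPURIOUS_DEFERRED flag and the snapshot-and-compare logic mirror the patch.

#include <stdio.h>

#define SPURIOUS_DEFERRED 0x80000000U

enum verdict { DEFER, HANDLED, UNHANDLED };

struct fake_desc {
	unsigned int threads_handled;      /* bumped by threaded handlers */
	unsigned int threads_handled_last; /* snapshot, bit 31 = deferral armed */
};

/* Called on each hard interrupt whose primary handler only woke a thread. */
static enum verdict note_wake_thread(struct fake_desc *desc)
{
	unsigned int handled;

	if (!(desc->threads_handled_last & SPURIOUS_DEFERRED)) {
		/* First wake-up seen: arm the deferral, judge next time. */
		desc->threads_handled_last |= SPURIOUS_DEFERRED;
		return DEFER;
	}

	/* Bit 31 is set on both sides, so no extra masking is needed. */
	handled = desc->threads_handled | SPURIOUS_DEFERRED;
	if (handled != desc->threads_handled_last) {
		desc->threads_handled_last = handled;
		return HANDLED;   /* maps to action_ret = IRQ_HANDLED */
	}
	return UNHANDLED;         /* maps to action_ret = IRQ_NONE */
}

int main(void)
{
	struct fake_desc d = { 0, 0 };

	printf("%d\n", note_wake_thread(&d)); /* 0: deferred */
	d.threads_handled++;                  /* a thread handled the IRQ */
	printf("%d\n", note_wake_thread(&d)); /* 1: counter moved */
	printf("%d\n", note_wake_thread(&d)); /* 2: counter unchanged */
	return 0;
}

The three enum values correspond to the three outcomes the patch folds into action_ret: defer on the first wake-up, IRQ_HANDLED when the counter moved, IRQ_NONE when it did not.
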
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c8380ad203bc..369f41a94124 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -125,8 +125,8 @@ static struct page *kimage_alloc_page(struct kimage *image,
125 unsigned long dest); 125 unsigned long dest);
126 126
127static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 127static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
128 unsigned long nr_segments, 128 unsigned long nr_segments,
129 struct kexec_segment __user *segments) 129 struct kexec_segment __user *segments)
130{ 130{
131 size_t segment_bytes; 131 size_t segment_bytes;
132 struct kimage *image; 132 struct kimage *image;
@@ -257,13 +257,13 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
257 image->control_code_page = kimage_alloc_control_pages(image, 257 image->control_code_page = kimage_alloc_control_pages(image,
258 get_order(KEXEC_CONTROL_PAGE_SIZE)); 258 get_order(KEXEC_CONTROL_PAGE_SIZE));
259 if (!image->control_code_page) { 259 if (!image->control_code_page) {
260 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 260 pr_err("Could not allocate control_code_buffer\n");
261 goto out_free; 261 goto out_free;
262 } 262 }
263 263
264 image->swap_page = kimage_alloc_control_pages(image, 0); 264 image->swap_page = kimage_alloc_control_pages(image, 0);
265 if (!image->swap_page) { 265 if (!image->swap_page) {
266 printk(KERN_ERR "Could not allocate swap buffer\n"); 266 pr_err("Could not allocate swap buffer\n");
267 goto out_free; 267 goto out_free;
268 } 268 }
269 269
@@ -332,7 +332,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
332 image->control_code_page = kimage_alloc_control_pages(image, 332 image->control_code_page = kimage_alloc_control_pages(image,
333 get_order(KEXEC_CONTROL_PAGE_SIZE)); 333 get_order(KEXEC_CONTROL_PAGE_SIZE));
334 if (!image->control_code_page) { 334 if (!image->control_code_page) {
335 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 335 pr_err("Could not allocate control_code_buffer\n");
336 goto out_free; 336 goto out_free;
337 } 337 }
338 338
@@ -621,8 +621,8 @@ static void kimage_terminate(struct kimage *image)
621 621
622#define for_each_kimage_entry(image, ptr, entry) \ 622#define for_each_kimage_entry(image, ptr, entry) \
623 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ 623 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
624 ptr = (entry & IND_INDIRECTION)? \ 624 ptr = (entry & IND_INDIRECTION) ? \
625 phys_to_virt((entry & PAGE_MASK)): ptr +1) 625 phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
626 626
627static void kimage_free_entry(kimage_entry_t entry) 627static void kimage_free_entry(kimage_entry_t entry)
628{ 628{
@@ -650,8 +650,7 @@ static void kimage_free(struct kimage *image)
650 * done with it. 650 * done with it.
651 */ 651 */
652 ind = entry; 652 ind = entry;
653 } 653 } else if (entry & IND_SOURCE)
654 else if (entry & IND_SOURCE)
655 kimage_free_entry(entry); 654 kimage_free_entry(entry);
656 } 655 }
657 /* Free the final indirection page */ 656 /* Free the final indirection page */
@@ -774,8 +773,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
774 addr = old_addr; 773 addr = old_addr;
775 page = old_page; 774 page = old_page;
776 break; 775 break;
777 } 776 } else {
778 else {
779 /* Place the page on the destination list I 777 /* Place the page on the destination list I
780 * will use it later. 778 * will use it later.
781 */ 779 */
@@ -1059,7 +1057,7 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1059 return -EINVAL; 1057 return -EINVAL;
1060 1058
1061 ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); 1059 ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
1062 for (i=0; i < nr_segments; i++) { 1060 for (i = 0; i < nr_segments; i++) {
1063 result = copy_from_user(&in, &segments[i], sizeof(in)); 1061 result = copy_from_user(&in, &segments[i], sizeof(in));
1064 if (result) 1062 if (result)
1065 return -EFAULT; 1063 return -EFAULT;
@@ -1214,14 +1212,14 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
1214 * squirrelled away. ELF notes happen to provide 1212 * squirrelled away. ELF notes happen to provide
1215 * all of that, so there is no need to invent something new. 1213 * all of that, so there is no need to invent something new.
1216 */ 1214 */
1217 buf = (u32*)per_cpu_ptr(crash_notes, cpu); 1215 buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
1218 if (!buf) 1216 if (!buf)
1219 return; 1217 return;
1220 memset(&prstatus, 0, sizeof(prstatus)); 1218 memset(&prstatus, 0, sizeof(prstatus));
1221 prstatus.pr_pid = current->pid; 1219 prstatus.pr_pid = current->pid;
1222 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); 1220 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1223 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, 1221 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1224 &prstatus, sizeof(prstatus)); 1222 &prstatus, sizeof(prstatus));
1225 final_note(buf); 1223 final_note(buf);
1226} 1224}
1227 1225
@@ -1230,8 +1228,7 @@ static int __init crash_notes_memory_init(void)
1230 /* Allocate memory for saving cpu registers. */ 1228 /* Allocate memory for saving cpu registers. */
1231 crash_notes = alloc_percpu(note_buf_t); 1229 crash_notes = alloc_percpu(note_buf_t);
1232 if (!crash_notes) { 1230 if (!crash_notes) {
1233 printk("Kexec: Memory allocation for saving cpu register" 1231 pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
1234 " states failed\n");
1235 return -ENOMEM; 1232 return -ENOMEM;
1236 } 1233 }
1237 return 0; 1234 return 0;
@@ -1253,10 +1250,10 @@ subsys_initcall(crash_notes_memory_init);
1253 * 1250 *
1254 * The function returns 0 on success and -EINVAL on failure. 1251 * The function returns 0 on success and -EINVAL on failure.
1255 */ 1252 */
1256static int __init parse_crashkernel_mem(char *cmdline, 1253static int __init parse_crashkernel_mem(char *cmdline,
1257 unsigned long long system_ram, 1254 unsigned long long system_ram,
1258 unsigned long long *crash_size, 1255 unsigned long long *crash_size,
1259 unsigned long long *crash_base) 1256 unsigned long long *crash_base)
1260{ 1257{
1261 char *cur = cmdline, *tmp; 1258 char *cur = cmdline, *tmp;
1262 1259
@@ -1267,12 +1264,12 @@ static int __init parse_crashkernel_mem(char *cmdline,
1267 /* get the start of the range */ 1264 /* get the start of the range */
1268 start = memparse(cur, &tmp); 1265 start = memparse(cur, &tmp);
1269 if (cur == tmp) { 1266 if (cur == tmp) {
1270 pr_warning("crashkernel: Memory value expected\n"); 1267 pr_warn("crashkernel: Memory value expected\n");
1271 return -EINVAL; 1268 return -EINVAL;
1272 } 1269 }
1273 cur = tmp; 1270 cur = tmp;
1274 if (*cur != '-') { 1271 if (*cur != '-') {
1275 pr_warning("crashkernel: '-' expected\n"); 1272 pr_warn("crashkernel: '-' expected\n");
1276 return -EINVAL; 1273 return -EINVAL;
1277 } 1274 }
1278 cur++; 1275 cur++;
@@ -1281,31 +1278,30 @@ static int __init parse_crashkernel_mem(char *cmdline,
1281 if (*cur != ':') { 1278 if (*cur != ':') {
1282 end = memparse(cur, &tmp); 1279 end = memparse(cur, &tmp);
1283 if (cur == tmp) { 1280 if (cur == tmp) {
1284 pr_warning("crashkernel: Memory " 1281 pr_warn("crashkernel: Memory value expected\n");
1285 "value expected\n");
1286 return -EINVAL; 1282 return -EINVAL;
1287 } 1283 }
1288 cur = tmp; 1284 cur = tmp;
1289 if (end <= start) { 1285 if (end <= start) {
1290 pr_warning("crashkernel: end <= start\n"); 1286 pr_warn("crashkernel: end <= start\n");
1291 return -EINVAL; 1287 return -EINVAL;
1292 } 1288 }
1293 } 1289 }
1294 1290
1295 if (*cur != ':') { 1291 if (*cur != ':') {
1296 pr_warning("crashkernel: ':' expected\n"); 1292 pr_warn("crashkernel: ':' expected\n");
1297 return -EINVAL; 1293 return -EINVAL;
1298 } 1294 }
1299 cur++; 1295 cur++;
1300 1296
1301 size = memparse(cur, &tmp); 1297 size = memparse(cur, &tmp);
1302 if (cur == tmp) { 1298 if (cur == tmp) {
1303 pr_warning("Memory value expected\n"); 1299 pr_warn("Memory value expected\n");
1304 return -EINVAL; 1300 return -EINVAL;
1305 } 1301 }
1306 cur = tmp; 1302 cur = tmp;
1307 if (size >= system_ram) { 1303 if (size >= system_ram) {
1308 pr_warning("crashkernel: invalid size\n"); 1304 pr_warn("crashkernel: invalid size\n");
1309 return -EINVAL; 1305 return -EINVAL;
1310 } 1306 }
1311 1307
@@ -1323,8 +1319,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1323 cur++; 1319 cur++;
1324 *crash_base = memparse(cur, &tmp); 1320 *crash_base = memparse(cur, &tmp);
1325 if (cur == tmp) { 1321 if (cur == tmp) {
1326 pr_warning("Memory value expected " 1322 pr_warn("Memory value expected after '@'\n");
1327 "after '@'\n");
1328 return -EINVAL; 1323 return -EINVAL;
1329 } 1324 }
1330 } 1325 }
@@ -1336,26 +1331,26 @@ static int __init parse_crashkernel_mem(char *cmdline,
1336/* 1331/*
1337 * That function parses "simple" (old) crashkernel command lines like 1332 * That function parses "simple" (old) crashkernel command lines like
1338 * 1333 *
1339 * crashkernel=size[@offset] 1334 * crashkernel=size[@offset]
1340 * 1335 *
1341 * It returns 0 on success and -EINVAL on failure. 1336 * It returns 0 on success and -EINVAL on failure.
1342 */ 1337 */
1343static int __init parse_crashkernel_simple(char *cmdline, 1338static int __init parse_crashkernel_simple(char *cmdline,
1344 unsigned long long *crash_size, 1339 unsigned long long *crash_size,
1345 unsigned long long *crash_base) 1340 unsigned long long *crash_base)
1346{ 1341{
1347 char *cur = cmdline; 1342 char *cur = cmdline;
1348 1343
1349 *crash_size = memparse(cmdline, &cur); 1344 *crash_size = memparse(cmdline, &cur);
1350 if (cmdline == cur) { 1345 if (cmdline == cur) {
1351 pr_warning("crashkernel: memory value expected\n"); 1346 pr_warn("crashkernel: memory value expected\n");
1352 return -EINVAL; 1347 return -EINVAL;
1353 } 1348 }
1354 1349
1355 if (*cur == '@') 1350 if (*cur == '@')
1356 *crash_base = memparse(cur+1, &cur); 1351 *crash_base = memparse(cur+1, &cur);
1357 else if (*cur != ' ' && *cur != '\0') { 1352 else if (*cur != ' ' && *cur != '\0') {
1358 pr_warning("crashkernel: unrecognized char\n"); 1353 pr_warn("crashkernel: unrecognized char\n");
1359 return -EINVAL; 1354 return -EINVAL;
1360 } 1355 }
1361 1356
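
For reference, the "simple" form accepted by the function above looks like crashkernel=128M@16M on the kernel command line. The sketch below approximates it in user space; parse_mem() is a stand-in for the kernel's memparse() and the separate warning messages are collapsed into a single error return, so treat it as an illustration of the syntax rather than the kernel parser.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for memparse(): number with optional K/M/G suffix. */
static unsigned long long parse_mem(const char *s, char **end)
{
	unsigned long long val = strtoull(s, end, 0);

	switch (**end) {
	case 'G': case 'g': val <<= 10; /* fall through */
	case 'M': case 'm': val <<= 10; /* fall through */
	case 'K': case 'k': val <<= 10; (*end)++; break;
	}
	return val;
}

/* Parses crashkernel=size[@offset], e.g. "128M@16M". */
static int parse_simple(const char *cmdline,
			unsigned long long *size, unsigned long long *base)
{
	char *cur;

	*size = parse_mem(cmdline, &cur);
	if (cur == cmdline)
		return -1;              /* memory value expected */
	if (*cur == '@')
		*base = parse_mem(cur + 1, &cur);
	else if (*cur != ' ' && *cur != '\0')
		return -1;              /* unrecognized character */
	return 0;
}

int main(void)
{
	unsigned long long size = 0, base = 0;

	if (!parse_simple("128M@16M", &size, &base))
		printf("size=%llu base=%llu\n", size, base);
	return 0;
}
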
@@ -1622,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1622#ifdef CONFIG_MEMORY_FAILURE 1617#ifdef CONFIG_MEMORY_FAILURE
1623 VMCOREINFO_NUMBER(PG_hwpoison); 1618 VMCOREINFO_NUMBER(PG_hwpoison);
1624#endif 1619#endif
1620 VMCOREINFO_NUMBER(PG_head_mask);
1625 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); 1621 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1626 1622
1627 arch_crash_save_vmcoreinfo(); 1623 arch_crash_save_vmcoreinfo();
@@ -1683,7 +1679,15 @@ int kernel_kexec(void)
1683 kexec_in_progress = true; 1679 kexec_in_progress = true;
1684 kernel_restart_prepare(NULL); 1680 kernel_restart_prepare(NULL);
1685 migrate_to_reboot_cpu(); 1681 migrate_to_reboot_cpu();
1686 printk(KERN_EMERG "Starting new kernel\n"); 1682
1683 /*
1684 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
1685 * no further code needs to use CPU hotplug (which is true in
1686 * the reboot case). However, the kexec path depends on using
1687 * CPU hotplug again; so re-enable it here.
1688 */
1689 cpu_hotplug_enable();
1690 pr_emerg("Starting new kernel\n");
1687 machine_shutdown(); 1691 machine_shutdown();
1688 } 1692 }
1689 1693
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6b375af4958d..8637e041a247 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -285,10 +285,7 @@ static int wait_for_helper(void *data)
285 pid_t pid; 285 pid_t pid;
286 286
287 /* If SIGCLD is ignored sys_wait4 won't populate the status. */ 287 /* If SIGCLD is ignored sys_wait4 won't populate the status. */
288 spin_lock_irq(&current->sighand->siglock); 288 kernel_sigaction(SIGCHLD, SIG_DFL);
289 current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
290 spin_unlock_irq(&current->sighand->siglock);
291
292 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 289 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
293 if (pid < 0) { 290 if (pid < 0) {
294 sub_info->retval = pid; 291 sub_info->retval = pid;
@@ -498,7 +495,7 @@ int __usermodehelper_disable(enum umh_disable_depth depth)
498static void helper_lock(void) 495static void helper_lock(void)
499{ 496{
500 atomic_inc(&running_helpers); 497 atomic_inc(&running_helpers);
501 smp_mb__after_atomic_inc(); 498 smp_mb__after_atomic();
502} 499}
503 500
504static void helper_unlock(void) 501static void helper_unlock(void)
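
The kmod.c hunk replaces the open-coded sighand manipulation with kernel_sigaction(SIGCHLD, SIG_DFL). The reason is the one the comment states: if SIGCHLD is ignored, the child is reaped automatically and wait4() has no status to return. The same rule applies in user space, which the minimal POSIX program below demonstrates; it is an analogy, not kernel code.

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	struct sigaction sa;
	pid_t pid;
	int status;

	/* With SIGCHLD set to SIG_IGN the child would be auto-reaped and
	 * waitpid() could not report its status; resetting to SIG_DFL is
	 * the user-space twin of kernel_sigaction(SIGCHLD, SIG_DFL). */
	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = SIG_DFL;
	sigaction(SIGCHLD, &sa, NULL);

	pid = fork();
	if (pid == 0)
		_exit(42);
	if (waitpid(pid, &status, 0) == pid && WIFEXITED(status))
		printf("child exited with %d\n", WEXITSTATUS(status));
	return 0;
}
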
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ceeadfcabb76..3214289df5a7 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -86,21 +86,8 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
86 return &(kretprobe_table_locks[hash].lock); 86 return &(kretprobe_table_locks[hash].lock);
87} 87}
88 88
89/* 89/* Blacklist -- list of struct kprobe_blacklist_entry */
90 * Normally, functions that we'd want to prohibit kprobes in, are marked 90static LIST_HEAD(kprobe_blacklist);
91 * __kprobes. But, there are cases where such functions already belong to
92 * a different section (__sched for preempt_schedule)
93 *
94 * For such cases, we now have a blacklist
95 */
96static struct kprobe_blackpoint kprobe_blacklist[] = {
97 {"preempt_schedule",},
98 {"native_get_debugreg",},
99 {"irq_entries_start",},
100 {"common_interrupt",},
101 {"mcount",}, /* mcount can be called from everywhere */
102 {NULL} /* Terminator */
103};
104 91
105#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 92#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
106/* 93/*
@@ -151,13 +138,13 @@ struct kprobe_insn_cache kprobe_insn_slots = {
151 .insn_size = MAX_INSN_SIZE, 138 .insn_size = MAX_INSN_SIZE,
152 .nr_garbage = 0, 139 .nr_garbage = 0,
153}; 140};
154static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); 141static int collect_garbage_slots(struct kprobe_insn_cache *c);
155 142
156/** 143/**
157 * __get_insn_slot() - Find a slot on an executable page for an instruction. 144 * __get_insn_slot() - Find a slot on an executable page for an instruction.
158 * We allocate an executable page if there's no room on existing ones. 145 * We allocate an executable page if there's no room on existing ones.
159 */ 146 */
160kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) 147kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
161{ 148{
162 struct kprobe_insn_page *kip; 149 struct kprobe_insn_page *kip;
163 kprobe_opcode_t *slot = NULL; 150 kprobe_opcode_t *slot = NULL;
@@ -214,7 +201,7 @@ out:
214} 201}
215 202
216/* Return 1 if all garbage slots are collected, otherwise 0. */ 203
217static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) 204static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
218{ 205{
219 kip->slot_used[idx] = SLOT_CLEAN; 206 kip->slot_used[idx] = SLOT_CLEAN;
220 kip->nused--; 207 kip->nused--;
@@ -235,7 +222,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
235 return 0; 222 return 0;
236} 223}
237 224
238static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) 225static int collect_garbage_slots(struct kprobe_insn_cache *c)
239{ 226{
240 struct kprobe_insn_page *kip, *next; 227 struct kprobe_insn_page *kip, *next;
241 228
@@ -257,8 +244,8 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
257 return 0; 244 return 0;
258} 245}
259 246
260void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, 247void __free_insn_slot(struct kprobe_insn_cache *c,
261 kprobe_opcode_t *slot, int dirty) 248 kprobe_opcode_t *slot, int dirty)
262{ 249{
263 struct kprobe_insn_page *kip; 250 struct kprobe_insn_page *kip;
264 251
@@ -314,7 +301,7 @@ static inline void reset_kprobe_instance(void)
314 * OR 301 * OR
315 * - with preemption disabled - from arch/xxx/kernel/kprobes.c 302 * - with preemption disabled - from arch/xxx/kernel/kprobes.c
316 */ 303 */
317struct kprobe __kprobes *get_kprobe(void *addr) 304struct kprobe *get_kprobe(void *addr)
318{ 305{
319 struct hlist_head *head; 306 struct hlist_head *head;
320 struct kprobe *p; 307 struct kprobe *p;
@@ -327,8 +314,9 @@ struct kprobe __kprobes *get_kprobe(void *addr)
327 314
328 return NULL; 315 return NULL;
329} 316}
317NOKPROBE_SYMBOL(get_kprobe);
330 318
331static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); 319static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
332 320
333/* Return true if the kprobe is an aggregator */ 321/* Return true if the kprobe is an aggregator */
334static inline int kprobe_aggrprobe(struct kprobe *p) 322static inline int kprobe_aggrprobe(struct kprobe *p)
@@ -360,7 +348,7 @@ static bool kprobes_allow_optimization;
360 * Call all pre_handler on the list, but ignores its return value. 348 * Call all pre_handler on the list, but ignores its return value.
361 * This must be called from arch-dep optimized caller. 349 * This must be called from arch-dep optimized caller.
362 */ 350 */
363void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) 351void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
364{ 352{
365 struct kprobe *kp; 353 struct kprobe *kp;
366 354
@@ -372,9 +360,10 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
372 reset_kprobe_instance(); 360 reset_kprobe_instance();
373 } 361 }
374} 362}
363NOKPROBE_SYMBOL(opt_pre_handler);
375 364
376/* Free optimized instructions and optimized_kprobe */ 365/* Free optimized instructions and optimized_kprobe */
377static __kprobes void free_aggr_kprobe(struct kprobe *p) 366static void free_aggr_kprobe(struct kprobe *p)
378{ 367{
379 struct optimized_kprobe *op; 368 struct optimized_kprobe *op;
380 369
@@ -412,7 +401,7 @@ static inline int kprobe_disarmed(struct kprobe *p)
412} 401}
413 402
414/* Return true(!0) if the probe is queued on (un)optimizing lists */ 403/* Return true(!0) if the probe is queued on (un)optimizing lists */
415static int __kprobes kprobe_queued(struct kprobe *p) 404static int kprobe_queued(struct kprobe *p)
416{ 405{
417 struct optimized_kprobe *op; 406 struct optimized_kprobe *op;
418 407
@@ -428,7 +417,7 @@ static int __kprobes kprobe_queued(struct kprobe *p)
428 * Return an optimized kprobe whose optimizing code replaces 417 * Return an optimized kprobe whose optimizing code replaces
429 * instructions including addr (exclude breakpoint). 418 * instructions including addr (exclude breakpoint).
430 */ 419 */
431static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) 420static struct kprobe *get_optimized_kprobe(unsigned long addr)
432{ 421{
433 int i; 422 int i;
434 struct kprobe *p = NULL; 423 struct kprobe *p = NULL;
@@ -460,7 +449,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
460 * Optimize (replace a breakpoint with a jump) kprobes listed on 449 * Optimize (replace a breakpoint with a jump) kprobes listed on
461 * optimizing_list. 450 * optimizing_list.
462 */ 451 */
463static __kprobes void do_optimize_kprobes(void) 452static void do_optimize_kprobes(void)
464{ 453{
465 /* Optimization is never done when disarmed */ 454
466 if (kprobes_all_disarmed || !kprobes_allow_optimization || 455 if (kprobes_all_disarmed || !kprobes_allow_optimization ||
@@ -488,7 +477,7 @@ static __kprobes void do_optimize_kprobes(void)
488 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint 477 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
489 * if need) kprobes listed on unoptimizing_list. 478 * if need) kprobes listed on unoptimizing_list.
490 */ 479 */
491static __kprobes void do_unoptimize_kprobes(void) 480static void do_unoptimize_kprobes(void)
492{ 481{
493 struct optimized_kprobe *op, *tmp; 482 struct optimized_kprobe *op, *tmp;
494 483
@@ -520,7 +509,7 @@ static __kprobes void do_unoptimize_kprobes(void)
520} 509}
521 510
522/* Reclaim all kprobes on the free_list */ 511/* Reclaim all kprobes on the free_list */
523static __kprobes void do_free_cleaned_kprobes(void) 512static void do_free_cleaned_kprobes(void)
524{ 513{
525 struct optimized_kprobe *op, *tmp; 514 struct optimized_kprobe *op, *tmp;
526 515
@@ -532,13 +521,13 @@ static __kprobes void do_free_cleaned_kprobes(void)
532} 521}
533 522
534/* Start optimizer after OPTIMIZE_DELAY passed */ 523/* Start optimizer after OPTIMIZE_DELAY passed */
535static __kprobes void kick_kprobe_optimizer(void) 524static void kick_kprobe_optimizer(void)
536{ 525{
537 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); 526 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
538} 527}
539 528
540/* Kprobe jump optimizer */ 529/* Kprobe jump optimizer */
541static __kprobes void kprobe_optimizer(struct work_struct *work) 530static void kprobe_optimizer(struct work_struct *work)
542{ 531{
543 mutex_lock(&kprobe_mutex); 532 mutex_lock(&kprobe_mutex);
544 /* Lock modules while optimizing kprobes */ 533 /* Lock modules while optimizing kprobes */
@@ -574,7 +563,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
574} 563}
575 564
576/* Wait for completing optimization and unoptimization */ 565/* Wait for completing optimization and unoptimization */
577static __kprobes void wait_for_kprobe_optimizer(void) 566static void wait_for_kprobe_optimizer(void)
578{ 567{
579 mutex_lock(&kprobe_mutex); 568 mutex_lock(&kprobe_mutex);
580 569
@@ -593,7 +582,7 @@ static __kprobes void wait_for_kprobe_optimizer(void)
593} 582}
594 583
595/* Optimize kprobe if p is ready to be optimized */ 584/* Optimize kprobe if p is ready to be optimized */
596static __kprobes void optimize_kprobe(struct kprobe *p) 585static void optimize_kprobe(struct kprobe *p)
597{ 586{
598 struct optimized_kprobe *op; 587 struct optimized_kprobe *op;
599 588
@@ -627,7 +616,7 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
627} 616}
628 617
629/* Short cut to direct unoptimizing */ 618/* Short cut to direct unoptimizing */
630static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) 619static void force_unoptimize_kprobe(struct optimized_kprobe *op)
631{ 620{
632 get_online_cpus(); 621 get_online_cpus();
633 arch_unoptimize_kprobe(op); 622 arch_unoptimize_kprobe(op);
@@ -637,7 +626,7 @@ static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
637} 626}
638 627
639/* Unoptimize a kprobe if p is optimized */ 628/* Unoptimize a kprobe if p is optimized */
640static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) 629static void unoptimize_kprobe(struct kprobe *p, bool force)
641{ 630{
642 struct optimized_kprobe *op; 631 struct optimized_kprobe *op;
643 632
@@ -697,7 +686,7 @@ static void reuse_unused_kprobe(struct kprobe *ap)
697} 686}
698 687
699/* Remove optimized instructions */ 688/* Remove optimized instructions */
700static void __kprobes kill_optimized_kprobe(struct kprobe *p) 689static void kill_optimized_kprobe(struct kprobe *p)
701{ 690{
702 struct optimized_kprobe *op; 691 struct optimized_kprobe *op;
703 692
@@ -723,7 +712,7 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)
723} 712}
724 713
725/* Try to prepare optimized instructions */ 714/* Try to prepare optimized instructions */
726static __kprobes void prepare_optimized_kprobe(struct kprobe *p) 715static void prepare_optimized_kprobe(struct kprobe *p)
727{ 716{
728 struct optimized_kprobe *op; 717 struct optimized_kprobe *op;
729 718
@@ -732,7 +721,7 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
732} 721}
733 722
734/* Allocate new optimized_kprobe and try to prepare optimized instructions */ 723/* Allocate new optimized_kprobe and try to prepare optimized instructions */
735static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 724static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
736{ 725{
737 struct optimized_kprobe *op; 726 struct optimized_kprobe *op;
738 727
@@ -747,13 +736,13 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
747 return &op->kp; 736 return &op->kp;
748} 737}
749 738
750static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); 739static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
751 740
752/* 741/*
753 * Prepare an optimized_kprobe and optimize it 742 * Prepare an optimized_kprobe and optimize it
754 * NOTE: p must be a normal registered kprobe 743 * NOTE: p must be a normal registered kprobe
755 */ 744 */
756static __kprobes void try_to_optimize_kprobe(struct kprobe *p) 745static void try_to_optimize_kprobe(struct kprobe *p)
757{ 746{
758 struct kprobe *ap; 747 struct kprobe *ap;
759 struct optimized_kprobe *op; 748 struct optimized_kprobe *op;
@@ -787,7 +776,7 @@ out:
787} 776}
788 777
789#ifdef CONFIG_SYSCTL 778#ifdef CONFIG_SYSCTL
790static void __kprobes optimize_all_kprobes(void) 779static void optimize_all_kprobes(void)
791{ 780{
792 struct hlist_head *head; 781 struct hlist_head *head;
793 struct kprobe *p; 782 struct kprobe *p;
@@ -810,7 +799,7 @@ out:
810 mutex_unlock(&kprobe_mutex); 799 mutex_unlock(&kprobe_mutex);
811} 800}
812 801
813static void __kprobes unoptimize_all_kprobes(void) 802static void unoptimize_all_kprobes(void)
814{ 803{
815 struct hlist_head *head; 804 struct hlist_head *head;
816 struct kprobe *p; 805 struct kprobe *p;
@@ -861,7 +850,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
861#endif /* CONFIG_SYSCTL */ 850#endif /* CONFIG_SYSCTL */
862 851
863/* Put a breakpoint for a probe. Must be called with text_mutex locked */ 852/* Put a breakpoint for a probe. Must be called with text_mutex locked */
864static void __kprobes __arm_kprobe(struct kprobe *p) 853static void __arm_kprobe(struct kprobe *p)
865{ 854{
866 struct kprobe *_p; 855 struct kprobe *_p;
867 856
@@ -876,7 +865,7 @@ static void __kprobes __arm_kprobe(struct kprobe *p)
876} 865}
877 866
878/* Remove the breakpoint of a probe. Must be called with text_mutex locked */ 867/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
879static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) 868static void __disarm_kprobe(struct kprobe *p, bool reopt)
880{ 869{
881 struct kprobe *_p; 870 struct kprobe *_p;
882 871
@@ -911,13 +900,13 @@ static void reuse_unused_kprobe(struct kprobe *ap)
911 BUG_ON(kprobe_unused(ap)); 900 BUG_ON(kprobe_unused(ap));
912} 901}
913 902
914static __kprobes void free_aggr_kprobe(struct kprobe *p) 903static void free_aggr_kprobe(struct kprobe *p)
915{ 904{
916 arch_remove_kprobe(p); 905 arch_remove_kprobe(p);
917 kfree(p); 906 kfree(p);
918} 907}
919 908
920static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) 909static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
921{ 910{
922 return kzalloc(sizeof(struct kprobe), GFP_KERNEL); 911 return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
923} 912}
@@ -931,7 +920,7 @@ static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
931static int kprobe_ftrace_enabled; 920static int kprobe_ftrace_enabled;
932 921
933/* Must ensure p->addr is really on ftrace */ 922/* Must ensure p->addr is really on ftrace */
934static int __kprobes prepare_kprobe(struct kprobe *p) 923static int prepare_kprobe(struct kprobe *p)
935{ 924{
936 if (!kprobe_ftrace(p)) 925 if (!kprobe_ftrace(p))
937 return arch_prepare_kprobe(p); 926 return arch_prepare_kprobe(p);
@@ -940,7 +929,7 @@ static int __kprobes prepare_kprobe(struct kprobe *p)
940} 929}
941 930
942/* Caller must lock kprobe_mutex */ 931/* Caller must lock kprobe_mutex */
943static void __kprobes arm_kprobe_ftrace(struct kprobe *p) 932static void arm_kprobe_ftrace(struct kprobe *p)
944{ 933{
945 int ret; 934 int ret;
946 935
@@ -955,7 +944,7 @@ static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
955} 944}
956 945
957/* Caller must lock kprobe_mutex */ 946/* Caller must lock kprobe_mutex */
958static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) 947static void disarm_kprobe_ftrace(struct kprobe *p)
959{ 948{
960 int ret; 949 int ret;
961 950
@@ -975,7 +964,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
975#endif 964#endif
976 965
977/* Arm a kprobe with text_mutex */ 966/* Arm a kprobe with text_mutex */
978static void __kprobes arm_kprobe(struct kprobe *kp) 967static void arm_kprobe(struct kprobe *kp)
979{ 968{
980 if (unlikely(kprobe_ftrace(kp))) { 969 if (unlikely(kprobe_ftrace(kp))) {
981 arm_kprobe_ftrace(kp); 970 arm_kprobe_ftrace(kp);
@@ -992,7 +981,7 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
992} 981}
993 982
994/* Disarm a kprobe with text_mutex */ 983/* Disarm a kprobe with text_mutex */
995static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) 984static void disarm_kprobe(struct kprobe *kp, bool reopt)
996{ 985{
997 if (unlikely(kprobe_ftrace(kp))) { 986 if (unlikely(kprobe_ftrace(kp))) {
998 disarm_kprobe_ftrace(kp); 987 disarm_kprobe_ftrace(kp);
@@ -1008,7 +997,7 @@ static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt)
1008 * Aggregate handlers for multiple kprobes support - these handlers 997 * Aggregate handlers for multiple kprobes support - these handlers
1009 * take care of invoking the individual kprobe handlers on p->list 998 * take care of invoking the individual kprobe handlers on p->list
1010 */ 999 */
1011static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 1000static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
1012{ 1001{
1013 struct kprobe *kp; 1002 struct kprobe *kp;
1014 1003
@@ -1022,9 +1011,10 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
1022 } 1011 }
1023 return 0; 1012 return 0;
1024} 1013}
1014NOKPROBE_SYMBOL(aggr_pre_handler);
1025 1015
1026static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 1016static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
1027 unsigned long flags) 1017 unsigned long flags)
1028{ 1018{
1029 struct kprobe *kp; 1019 struct kprobe *kp;
1030 1020
@@ -1036,9 +1026,10 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
1036 } 1026 }
1037 } 1027 }
1038} 1028}
1029NOKPROBE_SYMBOL(aggr_post_handler);
1039 1030
1040static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 1031static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
1041 int trapnr) 1032 int trapnr)
1042{ 1033{
1043 struct kprobe *cur = __this_cpu_read(kprobe_instance); 1034 struct kprobe *cur = __this_cpu_read(kprobe_instance);
1044 1035
@@ -1052,8 +1043,9 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
1052 } 1043 }
1053 return 0; 1044 return 0;
1054} 1045}
1046NOKPROBE_SYMBOL(aggr_fault_handler);
1055 1047
1056static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 1048static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
1057{ 1049{
1058 struct kprobe *cur = __this_cpu_read(kprobe_instance); 1050 struct kprobe *cur = __this_cpu_read(kprobe_instance);
1059 int ret = 0; 1051 int ret = 0;
@@ -1065,9 +1057,10 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
1065 reset_kprobe_instance(); 1057 reset_kprobe_instance();
1066 return ret; 1058 return ret;
1067} 1059}
1060NOKPROBE_SYMBOL(aggr_break_handler);
1068 1061
1069/* Walks the list and increments nmissed count for multiprobe case */ 1062/* Walks the list and increments nmissed count for multiprobe case */
1070void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) 1063void kprobes_inc_nmissed_count(struct kprobe *p)
1071{ 1064{
1072 struct kprobe *kp; 1065 struct kprobe *kp;
1073 if (!kprobe_aggrprobe(p)) { 1066 if (!kprobe_aggrprobe(p)) {
@@ -1078,9 +1071,10 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
1078 } 1071 }
1079 return; 1072 return;
1080} 1073}
1074NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
1081 1075
1082void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, 1076void recycle_rp_inst(struct kretprobe_instance *ri,
1083 struct hlist_head *head) 1077 struct hlist_head *head)
1084{ 1078{
1085 struct kretprobe *rp = ri->rp; 1079 struct kretprobe *rp = ri->rp;
1086 1080
@@ -1095,8 +1089,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
1095 /* Unregistering */ 1089 /* Unregistering */
1096 hlist_add_head(&ri->hlist, head); 1090 hlist_add_head(&ri->hlist, head);
1097} 1091}
1092NOKPROBE_SYMBOL(recycle_rp_inst);
1098 1093
1099void __kprobes kretprobe_hash_lock(struct task_struct *tsk, 1094void kretprobe_hash_lock(struct task_struct *tsk,
1100 struct hlist_head **head, unsigned long *flags) 1095 struct hlist_head **head, unsigned long *flags)
1101__acquires(hlist_lock) 1096__acquires(hlist_lock)
1102{ 1097{
@@ -1107,17 +1102,19 @@ __acquires(hlist_lock)
1107 hlist_lock = kretprobe_table_lock_ptr(hash); 1102 hlist_lock = kretprobe_table_lock_ptr(hash);
1108 raw_spin_lock_irqsave(hlist_lock, *flags); 1103 raw_spin_lock_irqsave(hlist_lock, *flags);
1109} 1104}
1105NOKPROBE_SYMBOL(kretprobe_hash_lock);
1110 1106
1111static void __kprobes kretprobe_table_lock(unsigned long hash, 1107static void kretprobe_table_lock(unsigned long hash,
1112 unsigned long *flags) 1108 unsigned long *flags)
1113__acquires(hlist_lock) 1109__acquires(hlist_lock)
1114{ 1110{
1115 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1111 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1116 raw_spin_lock_irqsave(hlist_lock, *flags); 1112 raw_spin_lock_irqsave(hlist_lock, *flags);
1117} 1113}
1114NOKPROBE_SYMBOL(kretprobe_table_lock);
1118 1115
1119void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 1116void kretprobe_hash_unlock(struct task_struct *tsk,
1120 unsigned long *flags) 1117 unsigned long *flags)
1121__releases(hlist_lock) 1118__releases(hlist_lock)
1122{ 1119{
1123 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1120 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
@@ -1126,14 +1123,16 @@ __releases(hlist_lock)
1126 hlist_lock = kretprobe_table_lock_ptr(hash); 1123 hlist_lock = kretprobe_table_lock_ptr(hash);
1127 raw_spin_unlock_irqrestore(hlist_lock, *flags); 1124 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1128} 1125}
1126NOKPROBE_SYMBOL(kretprobe_hash_unlock);
1129 1127
1130static void __kprobes kretprobe_table_unlock(unsigned long hash, 1128static void kretprobe_table_unlock(unsigned long hash,
1131 unsigned long *flags) 1129 unsigned long *flags)
1132__releases(hlist_lock) 1130__releases(hlist_lock)
1133{ 1131{
1134 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1132 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1135 raw_spin_unlock_irqrestore(hlist_lock, *flags); 1133 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1136} 1134}
1135NOKPROBE_SYMBOL(kretprobe_table_unlock);
1137 1136
1138/* 1137/*
1139 * This function is called from finish_task_switch when task tk becomes dead, 1138 * This function is called from finish_task_switch when task tk becomes dead,
@@ -1141,7 +1140,7 @@ __releases(hlist_lock)
1141 * with this task. These left over instances represent probed functions 1140 * with this task. These left over instances represent probed functions
1142 * that have been called but will never return. 1141 * that have been called but will never return.
1143 */ 1142 */
1144void __kprobes kprobe_flush_task(struct task_struct *tk) 1143void kprobe_flush_task(struct task_struct *tk)
1145{ 1144{
1146 struct kretprobe_instance *ri; 1145 struct kretprobe_instance *ri;
1147 struct hlist_head *head, empty_rp; 1146 struct hlist_head *head, empty_rp;
@@ -1166,6 +1165,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1166 kfree(ri); 1165 kfree(ri);
1167 } 1166 }
1168} 1167}
1168NOKPROBE_SYMBOL(kprobe_flush_task);
1169 1169
1170static inline void free_rp_inst(struct kretprobe *rp) 1170static inline void free_rp_inst(struct kretprobe *rp)
1171{ 1171{
@@ -1178,7 +1178,7 @@ static inline void free_rp_inst(struct kretprobe *rp)
1178 } 1178 }
1179} 1179}
1180 1180
1181static void __kprobes cleanup_rp_inst(struct kretprobe *rp) 1181static void cleanup_rp_inst(struct kretprobe *rp)
1182{ 1182{
1183 unsigned long flags, hash; 1183 unsigned long flags, hash;
1184 struct kretprobe_instance *ri; 1184 struct kretprobe_instance *ri;
@@ -1197,12 +1197,13 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
1197 } 1197 }
1198 free_rp_inst(rp); 1198 free_rp_inst(rp);
1199} 1199}
1200NOKPROBE_SYMBOL(cleanup_rp_inst);
1200 1201
1201/* 1202/*
1202* Add the new probe to ap->list. Fail if this is the 1203* Add the new probe to ap->list. Fail if this is the
1203* second jprobe at the address - two jprobes can't coexist 1204* second jprobe at the address - two jprobes can't coexist
1204*/ 1205*/
1205static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) 1206static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
1206{ 1207{
1207 BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); 1208 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
1208 1209
@@ -1226,7 +1227,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
1226 * Fill in the required fields of the "manager kprobe". Replace the 1227 * Fill in the required fields of the "manager kprobe". Replace the
1227 * earlier kprobe in the hlist with the manager kprobe 1228 * earlier kprobe in the hlist with the manager kprobe
1228 */ 1229 */
1229static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 1230static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
1230{ 1231{
1231 /* Copy p's insn slot to ap */ 1232 /* Copy p's insn slot to ap */
1232 copy_kprobe(p, ap); 1233 copy_kprobe(p, ap);
@@ -1252,8 +1253,7 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
1252 * This is the second or subsequent kprobe at the address - handle 1253 * This is the second or subsequent kprobe at the address - handle
1253 * the intricacies 1254 * the intricacies
1254 */ 1255 */
1255static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, 1256static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
1256 struct kprobe *p)
1257{ 1257{
1258 int ret = 0; 1258 int ret = 0;
1259 struct kprobe *ap = orig_p; 1259 struct kprobe *ap = orig_p;
@@ -1324,25 +1324,29 @@ out:
1324 return ret; 1324 return ret;
1325} 1325}
1326 1326
1327static int __kprobes in_kprobes_functions(unsigned long addr) 1327bool __weak arch_within_kprobe_blacklist(unsigned long addr)
1328{ 1328{
1329 struct kprobe_blackpoint *kb; 1329 /* The __kprobes marked functions and entry code must not be probed */
1330 return addr >= (unsigned long)__kprobes_text_start &&
1331 addr < (unsigned long)__kprobes_text_end;
1332}
1330 1333
1331 if (addr >= (unsigned long)__kprobes_text_start && 1334static bool within_kprobe_blacklist(unsigned long addr)
1332 addr < (unsigned long)__kprobes_text_end) 1335{
1333 return -EINVAL; 1336 struct kprobe_blacklist_entry *ent;
1337
1338 if (arch_within_kprobe_blacklist(addr))
1339 return true;
1334 /* 1340 /*
1335 * If there exists a kprobe_blacklist, verify and 1341 * If there exists a kprobe_blacklist, verify and
1336 * fail any probe registration in the prohibited area 1342 * fail any probe registration in the prohibited area
1337 */ 1343 */
1338 for (kb = kprobe_blacklist; kb->name != NULL; kb++) { 1344 list_for_each_entry(ent, &kprobe_blacklist, list) {
1339 if (kb->start_addr) { 1345 if (addr >= ent->start_addr && addr < ent->end_addr)
1340 if (addr >= kb->start_addr && 1346 return true;
1341 addr < (kb->start_addr + kb->range))
1342 return -EINVAL;
1343 }
1344 } 1347 }
1345 return 0; 1348
1349 return false;
1346} 1350}
1347 1351
1348/* 1352/*
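
within_kprobe_blacklist() now checks address ranges instead of comparing symbol names. A reduced stand-alone version of the lookup is shown below; the addresses and the fixed array are invented for the example, while the kernel keeps the entries on a list that is filled at boot.

#include <stdbool.h>
#include <stdio.h>

struct blacklist_entry {
	unsigned long start_addr;
	unsigned long end_addr;   /* exclusive */
};

/* Each entry covers one function that must not be probed. */
static const struct blacklist_entry blacklist[] = {
	{ 0x1000, 0x1080 },
	{ 0x2040, 0x20c0 },
};

static bool within_blacklist(unsigned long addr)
{
	for (size_t i = 0; i < sizeof(blacklist) / sizeof(blacklist[0]); i++)
		if (addr >= blacklist[i].start_addr &&
		    addr <  blacklist[i].end_addr)
			return true;
	return false;
}

int main(void)
{
	printf("%d %d\n", within_blacklist(0x1010), within_blacklist(0x3000));
	return 0;
}
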
@@ -1351,7 +1355,7 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
1351 * This returns encoded errors if it fails to look up symbol or invalid 1355 * This returns encoded errors if it fails to look up symbol or invalid
1352 * combination of parameters. 1356 * combination of parameters.
1353 */ 1357 */
1354static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) 1358static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
1355{ 1359{
1356 kprobe_opcode_t *addr = p->addr; 1360 kprobe_opcode_t *addr = p->addr;
1357 1361
@@ -1374,7 +1378,7 @@ invalid:
1374} 1378}
1375 1379
1376/* Check passed kprobe is valid and return kprobe in kprobe_table. */ 1380/* Check passed kprobe is valid and return kprobe in kprobe_table. */
1377static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) 1381static struct kprobe *__get_valid_kprobe(struct kprobe *p)
1378{ 1382{
1379 struct kprobe *ap, *list_p; 1383 struct kprobe *ap, *list_p;
1380 1384
@@ -1406,8 +1410,8 @@ static inline int check_kprobe_rereg(struct kprobe *p)
1406 return ret; 1410 return ret;
1407} 1411}
1408 1412
1409static __kprobes int check_kprobe_address_safe(struct kprobe *p, 1413static int check_kprobe_address_safe(struct kprobe *p,
1410 struct module **probed_mod) 1414 struct module **probed_mod)
1411{ 1415{
1412 int ret = 0; 1416 int ret = 0;
1413 unsigned long ftrace_addr; 1417 unsigned long ftrace_addr;
@@ -1433,7 +1437,7 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
1433 1437
1434 /* Ensure it is not in reserved area nor out of text */ 1438 /* Ensure it is not in reserved area nor out of text */
1435 if (!kernel_text_address((unsigned long) p->addr) || 1439 if (!kernel_text_address((unsigned long) p->addr) ||
1436 in_kprobes_functions((unsigned long) p->addr) || 1440 within_kprobe_blacklist((unsigned long) p->addr) ||
1437 jump_label_text_reserved(p->addr, p->addr)) { 1441 jump_label_text_reserved(p->addr, p->addr)) {
1438 ret = -EINVAL; 1442 ret = -EINVAL;
1439 goto out; 1443 goto out;
@@ -1469,7 +1473,7 @@ out:
1469 return ret; 1473 return ret;
1470} 1474}
1471 1475
1472int __kprobes register_kprobe(struct kprobe *p) 1476int register_kprobe(struct kprobe *p)
1473{ 1477{
1474 int ret; 1478 int ret;
1475 struct kprobe *old_p; 1479 struct kprobe *old_p;
@@ -1531,7 +1535,7 @@ out:
1531EXPORT_SYMBOL_GPL(register_kprobe); 1535EXPORT_SYMBOL_GPL(register_kprobe);
1532 1536
1533/* Check if all probes on the aggrprobe are disabled */ 1537/* Check if all probes on the aggrprobe are disabled */
1534static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) 1538static int aggr_kprobe_disabled(struct kprobe *ap)
1535{ 1539{
1536 struct kprobe *kp; 1540 struct kprobe *kp;
1537 1541
@@ -1547,7 +1551,7 @@ static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
1547} 1551}
1548 1552
1549/* Disable one kprobe: Make sure called under kprobe_mutex is locked */ 1553/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
1550static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) 1554static struct kprobe *__disable_kprobe(struct kprobe *p)
1551{ 1555{
1552 struct kprobe *orig_p; 1556 struct kprobe *orig_p;
1553 1557
@@ -1574,7 +1578,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
1574/* 1578/*
1575 * Unregister a kprobe without a scheduler synchronization. 1579 * Unregister a kprobe without a scheduler synchronization.
1576 */ 1580 */
1577static int __kprobes __unregister_kprobe_top(struct kprobe *p) 1581static int __unregister_kprobe_top(struct kprobe *p)
1578{ 1582{
1579 struct kprobe *ap, *list_p; 1583 struct kprobe *ap, *list_p;
1580 1584
@@ -1631,7 +1635,7 @@ disarmed:
1631 return 0; 1635 return 0;
1632} 1636}
1633 1637
1634static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 1638static void __unregister_kprobe_bottom(struct kprobe *p)
1635{ 1639{
1636 struct kprobe *ap; 1640 struct kprobe *ap;
1637 1641
@@ -1647,7 +1651,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
1647 /* Otherwise, do nothing. */ 1651 /* Otherwise, do nothing. */
1648} 1652}
1649 1653
1650int __kprobes register_kprobes(struct kprobe **kps, int num) 1654int register_kprobes(struct kprobe **kps, int num)
1651{ 1655{
1652 int i, ret = 0; 1656 int i, ret = 0;
1653 1657
@@ -1665,13 +1669,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
1665} 1669}
1666EXPORT_SYMBOL_GPL(register_kprobes); 1670EXPORT_SYMBOL_GPL(register_kprobes);
1667 1671
1668void __kprobes unregister_kprobe(struct kprobe *p) 1672void unregister_kprobe(struct kprobe *p)
1669{ 1673{
1670 unregister_kprobes(&p, 1); 1674 unregister_kprobes(&p, 1);
1671} 1675}
1672EXPORT_SYMBOL_GPL(unregister_kprobe); 1676EXPORT_SYMBOL_GPL(unregister_kprobe);
1673 1677
1674void __kprobes unregister_kprobes(struct kprobe **kps, int num) 1678void unregister_kprobes(struct kprobe **kps, int num)
1675{ 1679{
1676 int i; 1680 int i;
1677 1681
@@ -1700,7 +1704,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
1700 return (unsigned long)entry; 1704 return (unsigned long)entry;
1701} 1705}
1702 1706
1703int __kprobes register_jprobes(struct jprobe **jps, int num) 1707int register_jprobes(struct jprobe **jps, int num)
1704{ 1708{
1705 struct jprobe *jp; 1709 struct jprobe *jp;
1706 int ret = 0, i; 1710 int ret = 0, i;
@@ -1731,19 +1735,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
1731} 1735}
1732EXPORT_SYMBOL_GPL(register_jprobes); 1736EXPORT_SYMBOL_GPL(register_jprobes);
1733 1737
1734int __kprobes register_jprobe(struct jprobe *jp) 1738int register_jprobe(struct jprobe *jp)
1735{ 1739{
1736 return register_jprobes(&jp, 1); 1740 return register_jprobes(&jp, 1);
1737} 1741}
1738EXPORT_SYMBOL_GPL(register_jprobe); 1742EXPORT_SYMBOL_GPL(register_jprobe);
1739 1743
1740void __kprobes unregister_jprobe(struct jprobe *jp) 1744void unregister_jprobe(struct jprobe *jp)
1741{ 1745{
1742 unregister_jprobes(&jp, 1); 1746 unregister_jprobes(&jp, 1);
1743} 1747}
1744EXPORT_SYMBOL_GPL(unregister_jprobe); 1748EXPORT_SYMBOL_GPL(unregister_jprobe);
1745 1749
1746void __kprobes unregister_jprobes(struct jprobe **jps, int num) 1750void unregister_jprobes(struct jprobe **jps, int num)
1747{ 1751{
1748 int i; 1752 int i;
1749 1753
@@ -1768,8 +1772,7 @@ EXPORT_SYMBOL_GPL(unregister_jprobes);
1768 * This kprobe pre_handler is registered with every kretprobe. When probe 1772 * This kprobe pre_handler is registered with every kretprobe. When probe
1769 * hits it will set up the return probe. 1773 * hits it will set up the return probe.
1770 */ 1774 */
1771static int __kprobes pre_handler_kretprobe(struct kprobe *p, 1775static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
1772 struct pt_regs *regs)
1773{ 1776{
1774 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 1777 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
1775 unsigned long hash, flags = 0; 1778 unsigned long hash, flags = 0;
@@ -1807,8 +1810,9 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1807 } 1810 }
1808 return 0; 1811 return 0;
1809} 1812}
1813NOKPROBE_SYMBOL(pre_handler_kretprobe);
1810 1814
1811int __kprobes register_kretprobe(struct kretprobe *rp) 1815int register_kretprobe(struct kretprobe *rp)
1812{ 1816{
1813 int ret = 0; 1817 int ret = 0;
1814 struct kretprobe_instance *inst; 1818 struct kretprobe_instance *inst;
@@ -1861,7 +1865,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1861} 1865}
1862EXPORT_SYMBOL_GPL(register_kretprobe); 1866EXPORT_SYMBOL_GPL(register_kretprobe);
1863 1867
1864int __kprobes register_kretprobes(struct kretprobe **rps, int num) 1868int register_kretprobes(struct kretprobe **rps, int num)
1865{ 1869{
1866 int ret = 0, i; 1870 int ret = 0, i;
1867 1871
@@ -1879,13 +1883,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
1879} 1883}
1880EXPORT_SYMBOL_GPL(register_kretprobes); 1884EXPORT_SYMBOL_GPL(register_kretprobes);
1881 1885
1882void __kprobes unregister_kretprobe(struct kretprobe *rp) 1886void unregister_kretprobe(struct kretprobe *rp)
1883{ 1887{
1884 unregister_kretprobes(&rp, 1); 1888 unregister_kretprobes(&rp, 1);
1885} 1889}
1886EXPORT_SYMBOL_GPL(unregister_kretprobe); 1890EXPORT_SYMBOL_GPL(unregister_kretprobe);
1887 1891
1888void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1892void unregister_kretprobes(struct kretprobe **rps, int num)
1889{ 1893{
1890 int i; 1894 int i;
1891 1895
@@ -1908,38 +1912,38 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1908EXPORT_SYMBOL_GPL(unregister_kretprobes); 1912EXPORT_SYMBOL_GPL(unregister_kretprobes);
1909 1913
1910#else /* CONFIG_KRETPROBES */ 1914#else /* CONFIG_KRETPROBES */
1911int __kprobes register_kretprobe(struct kretprobe *rp) 1915int register_kretprobe(struct kretprobe *rp)
1912{ 1916{
1913 return -ENOSYS; 1917 return -ENOSYS;
1914} 1918}
1915EXPORT_SYMBOL_GPL(register_kretprobe); 1919EXPORT_SYMBOL_GPL(register_kretprobe);
1916 1920
1917int __kprobes register_kretprobes(struct kretprobe **rps, int num) 1921int register_kretprobes(struct kretprobe **rps, int num)
1918{ 1922{
1919 return -ENOSYS; 1923 return -ENOSYS;
1920} 1924}
1921EXPORT_SYMBOL_GPL(register_kretprobes); 1925EXPORT_SYMBOL_GPL(register_kretprobes);
1922 1926
1923void __kprobes unregister_kretprobe(struct kretprobe *rp) 1927void unregister_kretprobe(struct kretprobe *rp)
1924{ 1928{
1925} 1929}
1926EXPORT_SYMBOL_GPL(unregister_kretprobe); 1930EXPORT_SYMBOL_GPL(unregister_kretprobe);
1927 1931
1928void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1932void unregister_kretprobes(struct kretprobe **rps, int num)
1929{ 1933{
1930} 1934}
1931EXPORT_SYMBOL_GPL(unregister_kretprobes); 1935EXPORT_SYMBOL_GPL(unregister_kretprobes);
1932 1936
1933static int __kprobes pre_handler_kretprobe(struct kprobe *p, 1937static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
1934 struct pt_regs *regs)
1935{ 1938{
1936 return 0; 1939 return 0;
1937} 1940}
1941NOKPROBE_SYMBOL(pre_handler_kretprobe);
1938 1942
1939#endif /* CONFIG_KRETPROBES */ 1943#endif /* CONFIG_KRETPROBES */
1940 1944
1941/* Set the kprobe gone and remove its instruction buffer. */ 1945/* Set the kprobe gone and remove its instruction buffer. */
1942static void __kprobes kill_kprobe(struct kprobe *p) 1946static void kill_kprobe(struct kprobe *p)
1943{ 1947{
1944 struct kprobe *kp; 1948 struct kprobe *kp;
1945 1949
@@ -1963,7 +1967,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1963} 1967}
1964 1968
1965/* Disable one kprobe */ 1969/* Disable one kprobe */
1966int __kprobes disable_kprobe(struct kprobe *kp) 1970int disable_kprobe(struct kprobe *kp)
1967{ 1971{
1968 int ret = 0; 1972 int ret = 0;
1969 1973
@@ -1979,7 +1983,7 @@ int __kprobes disable_kprobe(struct kprobe *kp)
1979EXPORT_SYMBOL_GPL(disable_kprobe); 1983EXPORT_SYMBOL_GPL(disable_kprobe);
1980 1984
1981/* Enable one kprobe */ 1985/* Enable one kprobe */
1982int __kprobes enable_kprobe(struct kprobe *kp) 1986int enable_kprobe(struct kprobe *kp)
1983{ 1987{
1984 int ret = 0; 1988 int ret = 0;
1985 struct kprobe *p; 1989 struct kprobe *p;
@@ -2012,16 +2016,49 @@ out:
2012} 2016}
2013EXPORT_SYMBOL_GPL(enable_kprobe); 2017EXPORT_SYMBOL_GPL(enable_kprobe);
2014 2018
2015void __kprobes dump_kprobe(struct kprobe *kp) 2019void dump_kprobe(struct kprobe *kp)
2016{ 2020{
2017 printk(KERN_WARNING "Dumping kprobe:\n"); 2021 printk(KERN_WARNING "Dumping kprobe:\n");
2018 printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n", 2022 printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
2019 kp->symbol_name, kp->addr, kp->offset); 2023 kp->symbol_name, kp->addr, kp->offset);
2020} 2024}
2025NOKPROBE_SYMBOL(dump_kprobe);
2026
2027/*
2028 * Lookup and populate the kprobe_blacklist.
2029 *
2030 * Unlike the kretprobe blacklist, we'll need to determine
2031 * the range of addresses that belong to the said functions,
2032 * since a kprobe need not necessarily be at the beginning
2033 * of a function.
2034 */
2035static int __init populate_kprobe_blacklist(unsigned long *start,
2036 unsigned long *end)
2037{
2038 unsigned long *iter;
2039 struct kprobe_blacklist_entry *ent;
2040 unsigned long offset = 0, size = 0;
2041
2042 for (iter = start; iter < end; iter++) {
2043 if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) {
2044 pr_err("Failed to find blacklist %p\n", (void *)*iter);
2045 continue;
2046 }
2047
2048 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
2049 if (!ent)
2050 return -ENOMEM;
2051 ent->start_addr = *iter;
2052 ent->end_addr = *iter + size;
2053 INIT_LIST_HEAD(&ent->list);
2054 list_add_tail(&ent->list, &kprobe_blacklist);
2055 }
2056 return 0;
2057}
2021 2058
2022/* Module notifier call back, checking kprobes on the module */ 2059/* Module notifier call back, checking kprobes on the module */
2023static int __kprobes kprobes_module_callback(struct notifier_block *nb, 2060static int kprobes_module_callback(struct notifier_block *nb,
2024 unsigned long val, void *data) 2061 unsigned long val, void *data)
2025{ 2062{
2026 struct module *mod = data; 2063 struct module *mod = data;
2027 struct hlist_head *head; 2064 struct hlist_head *head;
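
populate_kprobe_blacklist() turns each raw symbol address recorded in the _kprobe_blacklist section into a [start, start + size) range via kallsyms_lookup_size_offset(), because a kprobe can land anywhere inside a function, not just at its entry. The sketch below models that step with a fake symbol table standing in for kallsyms; all names, addresses and sizes are invented.

#include <stdio.h>

struct sym { unsigned long addr, size; };
struct range { unsigned long start, end; };

/* Fake symbol table: function address -> function size. */
static const struct sym symtab[] = {
	{ 0x1000, 0x80 },
	{ 0x2040, 0x80 },
};

static int lookup_size(unsigned long addr, unsigned long *size)
{
	for (size_t i = 0; i < sizeof(symtab) / sizeof(symtab[0]); i++)
		if (symtab[i].addr == addr) {
			*size = symtab[i].size;
			return 1;
		}
	return 0;
}

/* Expand section contents (raw addresses) into address ranges. */
static size_t populate(const unsigned long *start, const unsigned long *stop,
		       struct range *out)
{
	size_t n = 0;

	for (const unsigned long *iter = start; iter < stop; iter++) {
		unsigned long size;

		if (!lookup_size(*iter, &size))
			continue;       /* the kernel logs and skips too */
		out[n].start = *iter;
		out[n].end = *iter + size;
		n++;
	}
	return n;
}

int main(void)
{
	unsigned long addrs[] = { 0x1000, 0x2040, 0x9999 };
	struct range ranges[3];
	size_t n = populate(addrs, addrs + 3, ranges);

	for (size_t i = 0; i < n; i++)
		printf("%#lx-%#lx\n", ranges[i].start, ranges[i].end);
	return 0;
}
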
@@ -2062,14 +2099,13 @@ static struct notifier_block kprobe_module_nb = {
2062 .priority = 0 2099 .priority = 0
2063}; 2100};
2064 2101
2102/* Markers of _kprobe_blacklist section */
2103extern unsigned long __start_kprobe_blacklist[];
2104extern unsigned long __stop_kprobe_blacklist[];
2105
2065static int __init init_kprobes(void) 2106static int __init init_kprobes(void)
2066{ 2107{
2067 int i, err = 0; 2108 int i, err = 0;
2068 unsigned long offset = 0, size = 0;
2069 char *modname, namebuf[KSYM_NAME_LEN];
2070 const char *symbol_name;
2071 void *addr;
2072 struct kprobe_blackpoint *kb;
2073 2109
2074 /* FIXME allocate the probe table, currently defined statically */ 2110 /* FIXME allocate the probe table, currently defined statically */
2075 /* initialize all list heads */ 2111 /* initialize all list heads */
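
The __start_kprobe_blacklist/__stop_kprobe_blacklist markers are not defined anywhere in C: NOKPROBE_SYMBOL() places each address in the _kprobe_blacklist section, and because the section name is a valid C identifier the linker brackets it with start/stop symbols. The user-space sketch below reproduces the trick with an invented macro and section name; only the linker convention itself is the real mechanism (GNU ld and compatible linkers).

#include <stdio.h>

typedef void (*blacklist_fn)(void);

/* Record the address of an off-limits function in a dedicated
 * ELF section, mimicking what NOKPROBE_SYMBOL() does. */
#define MY_NOKPROBE(fn)						\
	static const blacklist_fn __bl_##fn			\
	__attribute__((section("my_blacklist"), used)) = fn

static void do_not_probe_me(void) { }
static void also_off_limits(void) { }

MY_NOKPROBE(do_not_probe_me);
MY_NOKPROBE(also_off_limits);

/* Synthesized by the linker for sections with C-identifier names. */
extern const blacklist_fn __start_my_blacklist[];
extern const blacklist_fn __stop_my_blacklist[];

int main(void)
{
	for (const blacklist_fn *p = __start_my_blacklist;
	     p < __stop_my_blacklist; p++)
		printf("blacklisted: %#lx\n", (unsigned long)*p);
	return 0;
}
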
@@ -2079,26 +2115,11 @@ static int __init init_kprobes(void)
2079 raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); 2115 raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
2080 } 2116 }
2081 2117
2082 /* 2118 err = populate_kprobe_blacklist(__start_kprobe_blacklist,
2083 * Lookup and populate the kprobe_blacklist. 2119 __stop_kprobe_blacklist);
2084 * 2120 if (err) {
2085 * Unlike the kretprobe blacklist, we'll need to determine 2121 pr_err("kprobes: failed to populate blacklist: %d\n", err);
2086 * the range of addresses that belong to the said functions, 2122 pr_err("Please take care of using kprobes.\n");
2087 * since a kprobe need not necessarily be at the beginning
2088 * of a function.
2089 */
2090 for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
2091 kprobe_lookup_name(kb->name, addr);
2092 if (!addr)
2093 continue;
2094
2095 kb->start_addr = (unsigned long)addr;
2096 symbol_name = kallsyms_lookup(kb->start_addr,
2097 &size, &offset, &modname, namebuf);
2098 if (!symbol_name)
2099 kb->range = 0;
2100 else
2101 kb->range = size;
2102 } 2123 }
2103 2124
2104 if (kretprobe_blacklist_size) { 2125 if (kretprobe_blacklist_size) {
@@ -2138,7 +2159,7 @@ static int __init init_kprobes(void)
2138} 2159}
2139 2160
2140#ifdef CONFIG_DEBUG_FS 2161#ifdef CONFIG_DEBUG_FS
2141static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, 2162static void report_probe(struct seq_file *pi, struct kprobe *p,
2142 const char *sym, int offset, char *modname, struct kprobe *pp) 2163 const char *sym, int offset, char *modname, struct kprobe *pp)
2143{ 2164{
2144 char *kprobe_type; 2165 char *kprobe_type;
@@ -2167,12 +2188,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
2167 (kprobe_ftrace(pp) ? "[FTRACE]" : "")); 2188 (kprobe_ftrace(pp) ? "[FTRACE]" : ""));
2168} 2189}
2169 2190
2170static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 2191static void *kprobe_seq_start(struct seq_file *f, loff_t *pos)
2171{ 2192{
2172 return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL; 2193 return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
2173} 2194}
2174 2195
2175static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos) 2196static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
2176{ 2197{
2177 (*pos)++; 2198 (*pos)++;
2178 if (*pos >= KPROBE_TABLE_SIZE) 2199 if (*pos >= KPROBE_TABLE_SIZE)
@@ -2180,12 +2201,12 @@ static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
2180 return pos; 2201 return pos;
2181} 2202}
2182 2203
2183static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v) 2204static void kprobe_seq_stop(struct seq_file *f, void *v)
2184{ 2205{
2185 /* Nothing to do */ 2206 /* Nothing to do */
2186} 2207}
2187 2208
2188static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) 2209static int show_kprobe_addr(struct seq_file *pi, void *v)
2189{ 2210{
2190 struct hlist_head *head; 2211 struct hlist_head *head;
2191 struct kprobe *p, *kp; 2212 struct kprobe *p, *kp;
@@ -2216,7 +2237,7 @@ static const struct seq_operations kprobes_seq_ops = {
2216 .show = show_kprobe_addr 2237 .show = show_kprobe_addr
2217}; 2238};
2218 2239
2219static int __kprobes kprobes_open(struct inode *inode, struct file *filp) 2240static int kprobes_open(struct inode *inode, struct file *filp)
2220{ 2241{
2221 return seq_open(filp, &kprobes_seq_ops); 2242 return seq_open(filp, &kprobes_seq_ops);
2222} 2243}
@@ -2228,7 +2249,47 @@ static const struct file_operations debugfs_kprobes_operations = {
2228 .release = seq_release, 2249 .release = seq_release,
2229}; 2250};
2230 2251
2231static void __kprobes arm_all_kprobes(void) 2252/* kprobes/blacklist -- shows which functions can not be probed */
2253static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
2254{
2255 return seq_list_start(&kprobe_blacklist, *pos);
2256}
2257
2258static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos)
2259{
2260 return seq_list_next(v, &kprobe_blacklist, pos);
2261}
2262
2263static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
2264{
2265 struct kprobe_blacklist_entry *ent =
2266 list_entry(v, struct kprobe_blacklist_entry, list);
2267
2268 seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr,
2269 (void *)ent->end_addr, (void *)ent->start_addr);
2270 return 0;
2271}
2272
2273static const struct seq_operations kprobe_blacklist_seq_ops = {
2274 .start = kprobe_blacklist_seq_start,
2275 .next = kprobe_blacklist_seq_next,
2276 .stop = kprobe_seq_stop, /* Reuse void function */
2277 .show = kprobe_blacklist_seq_show,
2278};
2279
2280static int kprobe_blacklist_open(struct inode *inode, struct file *filp)
2281{
2282 return seq_open(filp, &kprobe_blacklist_seq_ops);
2283}
2284
2285static const struct file_operations debugfs_kprobe_blacklist_ops = {
2286 .open = kprobe_blacklist_open,
2287 .read = seq_read,
2288 .llseek = seq_lseek,
2289 .release = seq_release,
2290};
2291
2292static void arm_all_kprobes(void)
2232{ 2293{
2233 struct hlist_head *head; 2294 struct hlist_head *head;
2234 struct kprobe *p; 2295 struct kprobe *p;
@@ -2256,7 +2317,7 @@ already_enabled:
2256 return; 2317 return;
2257} 2318}
2258 2319
2259static void __kprobes disarm_all_kprobes(void) 2320static void disarm_all_kprobes(void)
2260{ 2321{
2261 struct hlist_head *head; 2322 struct hlist_head *head;
2262 struct kprobe *p; 2323 struct kprobe *p;
@@ -2340,7 +2401,7 @@ static const struct file_operations fops_kp = {
2340 .llseek = default_llseek, 2401 .llseek = default_llseek,
2341}; 2402};
2342 2403
2343static int __kprobes debugfs_kprobe_init(void) 2404static int __init debugfs_kprobe_init(void)
2344{ 2405{
2345 struct dentry *dir, *file; 2406 struct dentry *dir, *file;
2346 unsigned int value = 1; 2407 unsigned int value = 1;
@@ -2351,19 +2412,24 @@ static int __kprobes debugfs_kprobe_init(void)
2351 2412
2352 file = debugfs_create_file("list", 0444, dir, NULL, 2413 file = debugfs_create_file("list", 0444, dir, NULL,
2353 &debugfs_kprobes_operations); 2414 &debugfs_kprobes_operations);
2354 if (!file) { 2415 if (!file)
2355 debugfs_remove(dir); 2416 goto error;
2356 return -ENOMEM;
2357 }
2358 2417
2359 file = debugfs_create_file("enabled", 0600, dir, 2418 file = debugfs_create_file("enabled", 0600, dir,
2360 &value, &fops_kp); 2419 &value, &fops_kp);
2361 if (!file) { 2420 if (!file)
2362 debugfs_remove(dir); 2421 goto error;
2363 return -ENOMEM; 2422
2364 } 2423 file = debugfs_create_file("blacklist", 0444, dir, NULL,
2424 &debugfs_kprobe_blacklist_ops);
2425 if (!file)
2426 goto error;
2365 2427
2366 return 0; 2428 return 0;
2429
2430error:
2431 debugfs_remove(dir);
2432 return -ENOMEM;
2367} 2433}
2368 2434
2369late_initcall(debugfs_kprobe_init); 2435late_initcall(debugfs_kprobe_init);
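The kprobes hunks above replace the old name-based blacklist lookup with populate_kprobe_blacklist() walking the linker-provided _kprobe_blacklist section, and expose the result through a new debugfs "blacklist" file whose lines are produced by kprobe_blacklist_seq_show(). A minimal userspace sketch for reading that file, assuming debugfs is mounted at the conventional /sys/kernel/debug and that the parent directory created earlier in this file is named "kprobes":

/* Not part of the patch: dump the new blacklist file from userspace. */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/kernel/debug/kprobes/blacklist", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Each line is "0x<start>-0x<end>\t<symbol>", matching the
	 * seq_printf() in kprobe_blacklist_seq_show() above. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}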
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 2495a9b14ac8..6683ccef9fff 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -37,6 +37,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
37} 37}
38KERNEL_ATTR_RO(uevent_seqnum); 38KERNEL_ATTR_RO(uevent_seqnum);
39 39
40#ifdef CONFIG_UEVENT_HELPER
40/* uevent helper program, used during early boot */ 41/* uevent helper program, used during early boot */
41static ssize_t uevent_helper_show(struct kobject *kobj, 42static ssize_t uevent_helper_show(struct kobject *kobj,
42 struct kobj_attribute *attr, char *buf) 43 struct kobj_attribute *attr, char *buf)
@@ -56,7 +57,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
56 return count; 57 return count;
57} 58}
58KERNEL_ATTR_RW(uevent_helper); 59KERNEL_ATTR_RW(uevent_helper);
59 60#endif
60 61
61#ifdef CONFIG_PROFILING 62#ifdef CONFIG_PROFILING
62static ssize_t profiling_show(struct kobject *kobj, 63static ssize_t profiling_show(struct kobject *kobj,
@@ -189,7 +190,9 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
189static struct attribute * kernel_attrs[] = { 190static struct attribute * kernel_attrs[] = {
190 &fscaps_attr.attr, 191 &fscaps_attr.attr,
191 &uevent_seqnum_attr.attr, 192 &uevent_seqnum_attr.attr,
193#ifdef CONFIG_UEVENT_HELPER
192 &uevent_helper_attr.attr, 194 &uevent_helper_attr.attr,
195#endif
193#ifdef CONFIG_PROFILING 196#ifdef CONFIG_PROFILING
194 &profiling_attr.attr, 197 &profiling_attr.attr,
195#endif 198#endif
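The ksysfs hunks compile the uevent_helper attribute only when CONFIG_UEVENT_HELPER is enabled, so the sysfs file disappears entirely instead of remaining as a dead knob. A sketch of the same guarding pattern for a made-up attribute (CONFIG_FOO, foo_show and example_attrs are illustrative names, not from this patch):

#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>

#ifdef CONFIG_FOO
/* Both the attribute and its array slot are guarded, as done above
 * for uevent_helper, so no stale node shows up when FOO is off. */
static ssize_t foo_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%d\n", 42);
}
static struct kobj_attribute foo_attr = __ATTR_RO(foo);
#endif

static struct attribute *example_attrs[] = {
#ifdef CONFIG_FOO
	&foo_attr.attr,
#endif
	NULL,
};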
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9a130ec06f7a..c2390f41307b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create)
262 * kthread_stop() has been called). The return value should be zero 262 * kthread_stop() has been called). The return value should be zero
263 * or a negative error number; it will be passed to kthread_stop(). 263 * or a negative error number; it will be passed to kthread_stop().
264 * 264 *
265 * Returns a task_struct or ERR_PTR(-ENOMEM). 265 * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
266 */ 266 */
267struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), 267struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
268 void *data, int node, 268 void *data, int node,
@@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
298 * that thread. 298 * that thread.
299 */ 299 */
300 if (xchg(&create->done, NULL)) 300 if (xchg(&create->done, NULL))
301 return ERR_PTR(-ENOMEM); 301 return ERR_PTR(-EINTR);
302 /* 302 /*
303 * kthreadd (or new kernel thread) will call complete() 303 * kthreadd (or new kernel thread) will call complete()
304 * shortly. 304 * shortly.
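The kthread fix makes kthread_create_on_node() return ERR_PTR(-EINTR), rather than a misleading -ENOMEM, when the requesting task is killed before kthreadd services the request, and documents that in the kerneldoc. A caller-side sketch of the error handling this implies (my_worker_fn and start_my_worker are hypothetical, not from the patch):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/numa.h>
#include <linux/sched.h>

static int my_worker_fn(void *data)	/* hypothetical worker body */
{
	return 0;
}

static int start_my_worker(void *data)
{
	struct task_struct *tsk;

	tsk = kthread_create_on_node(my_worker_fn, data, NUMA_NO_NODE,
				     "my_worker");
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);	/* now -EINTR as well as -ENOMEM */

	wake_up_process(tsk);
	return 0;
}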
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index a462b317f9a0..a02812743a7e 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void)
88} 88}
89 89
90static void __sched 90static void __sched
91account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) 91account_global_scheduler_latency(struct task_struct *tsk,
92 struct latency_record *lat)
92{ 93{
93 int firstnonnull = MAXLR + 1; 94 int firstnonnull = MAXLR + 1;
94 int i; 95 int i;
@@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v)
255 break; 256 break;
256 seq_printf(m, " %ps", (void *)bt); 257 seq_printf(m, " %ps", (void *)bt);
257 } 258 }
258 seq_printf(m, "\n"); 259 seq_puts(m, "\n");
259 } 260 }
260 } 261 }
261 return 0; 262 return 0;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index b8bdcd4785b7..8541bfdfd232 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -24,4 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
24obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o 24obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
25obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o 25obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
26obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o 26obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
27obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
27obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o 28obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 4f560cfedc8f..51c4b24b6328 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -54,9 +54,9 @@ enum {
54 * table (if it's not there yet), and we check it for lock order 54 * table (if it's not there yet), and we check it for lock order
55 * conflicts and deadlocks. 55 * conflicts and deadlocks.
56 */ 56 */
57#define MAX_LOCKDEP_ENTRIES 16384UL 57#define MAX_LOCKDEP_ENTRIES 32768UL
58 58
59#define MAX_LOCKDEP_CHAINS_BITS 15 59#define MAX_LOCKDEP_CHAINS_BITS 16
60#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 60#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
61 61
62#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) 62#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
@@ -65,7 +65,7 @@ enum {
65 * Stack-trace: tightly packed array of stack backtrace 65 * Stack-trace: tightly packed array of stack backtrace
66 * addresses. Protected by the hash_lock. 66 * addresses. Protected by the hash_lock.
67 */ 67 */
68#define MAX_STACK_TRACE_ENTRIES 262144UL 68#define MAX_STACK_TRACE_ENTRIES 524288UL
69 69
70extern struct list_head all_lock_classes; 70extern struct list_head all_lock_classes;
71extern struct lock_chain lock_chains[]; 71extern struct lock_chain lock_chains[];
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f26b1a18e34e..0955b885d0dc 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -82,14 +82,14 @@ struct lock_writer_stress_stats {
82}; 82};
83static struct lock_writer_stress_stats *lwsa; 83static struct lock_writer_stress_stats *lwsa;
84 84
85#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) 85#if defined(MODULE)
86#define LOCKTORTURE_RUNNABLE_INIT 1 86#define LOCKTORTURE_RUNNABLE_INIT 1
87#else 87#else
88#define LOCKTORTURE_RUNNABLE_INIT 0 88#define LOCKTORTURE_RUNNABLE_INIT 0
89#endif 89#endif
90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; 90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
91module_param(locktorture_runnable, int, 0444); 91module_param(locktorture_runnable, int, 0444);
92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); 92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init");
93 93
94/* Forward reference. */ 94/* Forward reference. */
95static void lock_torture_cleanup(void); 95static void lock_torture_cleanup(void);
@@ -216,10 +216,11 @@ static int lock_torture_writer(void *arg)
216 static DEFINE_TORTURE_RANDOM(rand); 216 static DEFINE_TORTURE_RANDOM(rand);
217 217
218 VERBOSE_TOROUT_STRING("lock_torture_writer task started"); 218 VERBOSE_TOROUT_STRING("lock_torture_writer task started");
219 set_user_nice(current, 19); 219 set_user_nice(current, MAX_NICE);
220 220
221 do { 221 do {
222 schedule_timeout_uninterruptible(1); 222 if ((torture_random(&rand) & 0xfffff) == 0)
223 schedule_timeout_uninterruptible(1);
223 cur_ops->writelock(); 224 cur_ops->writelock();
224 if (WARN_ON_ONCE(lock_is_write_held)) 225 if (WARN_ON_ONCE(lock_is_write_held))
225 lwsp->n_write_lock_fail++; 226 lwsp->n_write_lock_fail++;
@@ -354,7 +355,8 @@ static int __init lock_torture_init(void)
354 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, 355 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
355 }; 356 };
356 357
357 torture_init_begin(torture_type, verbose, &locktorture_runnable); 358 if (!torture_init_begin(torture_type, verbose, &locktorture_runnable))
359 return -EBUSY;
358 360
359 /* Process args and tell the world that the torturer is on the job. */ 361 /* Process args and tell the world that the torturer is on the job. */
360 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 362 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e00669..be9ee1559fca 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,21 +14,47 @@
14 * called from interrupt context and we have preemption disabled while 14 * called from interrupt context and we have preemption disabled while
15 * spinning. 15 * spinning.
16 */ 16 */
17static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); 17static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
18
19/*
20 * We use the value 0 to represent "no CPU", thus the encoded value
21 * will be the CPU number incremented by 1.
22 */
23static inline int encode_cpu(int cpu_nr)
24{
25 return cpu_nr + 1;
26}
27
28static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
29{
30 int cpu_nr = encoded_cpu_val - 1;
31
32 return per_cpu_ptr(&osq_node, cpu_nr);
33}
18 34
19/* 35/*
20 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. 36 * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
21 * Can return NULL in case we were the last queued and we updated @lock instead. 37 * Can return NULL in case we were the last queued and we updated @lock instead.
22 */ 38 */
23static inline struct optimistic_spin_queue * 39static inline struct optimistic_spin_node *
24osq_wait_next(struct optimistic_spin_queue **lock, 40osq_wait_next(struct optimistic_spin_queue *lock,
25 struct optimistic_spin_queue *node, 41 struct optimistic_spin_node *node,
26 struct optimistic_spin_queue *prev) 42 struct optimistic_spin_node *prev)
27{ 43{
28 struct optimistic_spin_queue *next = NULL; 44 struct optimistic_spin_node *next = NULL;
45 int curr = encode_cpu(smp_processor_id());
46 int old;
47
48 /*
49 * If there is a prev node in queue, then the 'old' value will be
50 * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
51 * we're currently last in queue, then the queue will then become empty.
52 */
53 old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
29 54
30 for (;;) { 55 for (;;) {
31 if (*lock == node && cmpxchg(lock, node, prev) == node) { 56 if (atomic_read(&lock->tail) == curr &&
57 atomic_cmpxchg(&lock->tail, curr, old) == curr) {
32 /* 58 /*
33 * We were the last queued, we moved @lock back. @prev 59 * We were the last queued, we moved @lock back. @prev
34 * will now observe @lock and will complete its 60 * will now observe @lock and will complete its
@@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_queue **lock,
59 return next; 85 return next;
60} 86}
61 87
62bool osq_lock(struct optimistic_spin_queue **lock) 88bool osq_lock(struct optimistic_spin_queue *lock)
63{ 89{
64 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); 90 struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
65 struct optimistic_spin_queue *prev, *next; 91 struct optimistic_spin_node *prev, *next;
92 int curr = encode_cpu(smp_processor_id());
93 int old;
66 94
67 node->locked = 0; 95 node->locked = 0;
68 node->next = NULL; 96 node->next = NULL;
97 node->cpu = curr;
69 98
70 node->prev = prev = xchg(lock, node); 99 old = atomic_xchg(&lock->tail, curr);
71 if (likely(prev == NULL)) 100 if (old == OSQ_UNLOCKED_VAL)
72 return true; 101 return true;
73 102
103 prev = decode_cpu(old);
104 node->prev = prev;
74 ACCESS_ONCE(prev->next) = node; 105 ACCESS_ONCE(prev->next) = node;
75 106
76 /* 107 /*
@@ -149,20 +180,21 @@ unqueue:
149 return false; 180 return false;
150} 181}
151 182
152void osq_unlock(struct optimistic_spin_queue **lock) 183void osq_unlock(struct optimistic_spin_queue *lock)
153{ 184{
154 struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); 185 struct optimistic_spin_node *node, *next;
155 struct optimistic_spin_queue *next; 186 int curr = encode_cpu(smp_processor_id());
156 187
157 /* 188 /*
158 * Fast path for the uncontended case. 189 * Fast path for the uncontended case.
159 */ 190 */
160 if (likely(cmpxchg(lock, node, NULL) == node)) 191 if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
161 return; 192 return;
162 193
163 /* 194 /*
164 * Second most likely case. 195 * Second most likely case.
165 */ 196 */
197 node = this_cpu_ptr(&osq_node);
166 next = xchg(&node->next, NULL); 198 next = xchg(&node->next, NULL);
167 if (next) { 199 if (next) {
168 ACCESS_ONCE(next->locked) = 1; 200 ACCESS_ONCE(next->locked) = 1;
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4aca6b..74356dc0ce29 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,13 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
118 * mutex_lock()/rwsem_down_{read,write}() etc. 118 * mutex_lock()/rwsem_down_{read,write}() etc.
119 */ 119 */
120 120
121struct optimistic_spin_queue { 121struct optimistic_spin_node {
122 struct optimistic_spin_queue *next, *prev; 122 struct optimistic_spin_node *next, *prev;
123 int locked; /* 1 if lock acquired */ 123 int locked; /* 1 if lock acquired */
124 int cpu; /* encoded CPU # value */
124}; 125};
125 126
126extern bool osq_lock(struct optimistic_spin_queue **lock); 127extern bool osq_lock(struct optimistic_spin_queue *lock);
127extern void osq_unlock(struct optimistic_spin_queue **lock); 128extern void osq_unlock(struct optimistic_spin_queue *lock);
128 129
129#endif /* __LINUX_MCS_SPINLOCK_H */ 130#endif /* __LINUX_MCS_SPINLOCK_H */
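The optimistic-spin queue rework above stops storing a node pointer in the lock word and instead keeps an encoded CPU number in an atomic tail: 0 means no CPU is queued, otherwise the value is cpu + 1, decoded back to the per-CPU osq_node on demand. A stand-alone sketch of just that encoding convention (plain C model, with OSQ_UNLOCKED_VAL assumed to be 0, consistent with the comment above that 0 represents "no CPU"):

#include <assert.h>

#define OSQ_UNLOCKED_VAL	0

static int encode_cpu(int cpu_nr)
{
	return cpu_nr + 1;	/* CPU 0 must not collide with "unlocked" */
}

static int decode_cpu(int encoded)
{
	return encoded - 1;
}

int main(void)
{
	int tail = OSQ_UNLOCKED_VAL;		/* empty queue             */

	tail = encode_cpu(0);			/* CPU 0 takes the tail    */
	assert(tail != OSQ_UNLOCKED_VAL);	/* distinguishable from empty */
	assert(decode_cpu(tail) == 0);		/* round-trips back to CPU 0  */
	return 0;
}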
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33c6760..acca2c1a3c5e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
60 INIT_LIST_HEAD(&lock->wait_list); 60 INIT_LIST_HEAD(&lock->wait_list);
61 mutex_clear_owner(lock); 61 mutex_clear_owner(lock);
62#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 62#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
63 lock->osq = NULL; 63 osq_lock_init(&lock->osq);
64#endif 64#endif
65 65
66 debug_mutex_init(lock, name, key); 66 debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
new file mode 100644
index 000000000000..fb5b8ac411a5
--- /dev/null
+++ b/kernel/locking/qrwlock.c
@@ -0,0 +1,133 @@
1/*
2 * Queue read/write lock
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
15 *
16 * Authors: Waiman Long <waiman.long@hp.com>
17 */
18#include <linux/smp.h>
19#include <linux/bug.h>
20#include <linux/cpumask.h>
21#include <linux/percpu.h>
22#include <linux/hardirq.h>
23#include <linux/mutex.h>
24#include <asm/qrwlock.h>
25
26/**
27 * rspin_until_writer_unlock - inc reader count & spin until writer is gone
28 * @lock : Pointer to queue rwlock structure
29 * @writer: Current queue rwlock writer status byte
30 *
31 * In interrupt context or at the head of the queue, the reader will just
32 * increment the reader count & wait until the writer releases the lock.
33 */
34static __always_inline void
35rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
36{
37 while ((cnts & _QW_WMASK) == _QW_LOCKED) {
38 arch_mutex_cpu_relax();
39 cnts = smp_load_acquire((u32 *)&lock->cnts);
40 }
41}
42
43/**
44 * queue_read_lock_slowpath - acquire read lock of a queue rwlock
45 * @lock: Pointer to queue rwlock structure
46 */
47void queue_read_lock_slowpath(struct qrwlock *lock)
48{
49 u32 cnts;
50
51 /*
52 * Readers come here when they cannot get the lock without waiting
53 */
54 if (unlikely(in_interrupt())) {
55 /*
56 * Readers in interrupt context will spin until the lock is
57 * available without waiting in the queue.
58 */
59 cnts = smp_load_acquire((u32 *)&lock->cnts);
60 rspin_until_writer_unlock(lock, cnts);
61 return;
62 }
63 atomic_sub(_QR_BIAS, &lock->cnts);
64
65 /*
66 * Put the reader into the wait queue
67 */
68 arch_spin_lock(&lock->lock);
69
70 /*
71 * At the head of the wait queue now, wait until the writer state
72 * goes to 0 and then try to increment the reader count and get
73 * the lock. It is possible that an incoming writer may steal the
74 * lock in the interim, so it is necessary to check the writer byte
75 * to make sure that the write lock isn't taken.
76 */
77 while (atomic_read(&lock->cnts) & _QW_WMASK)
78 arch_mutex_cpu_relax();
79
80 cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
81 rspin_until_writer_unlock(lock, cnts);
82
83 /*
84 * Signal the next one in queue to become queue head
85 */
86 arch_spin_unlock(&lock->lock);
87}
88EXPORT_SYMBOL(queue_read_lock_slowpath);
89
90/**
91 * queue_write_lock_slowpath - acquire write lock of a queue rwlock
92 * @lock : Pointer to queue rwlock structure
93 */
94void queue_write_lock_slowpath(struct qrwlock *lock)
95{
96 u32 cnts;
97
98 /* Put the writer into the wait queue */
99 arch_spin_lock(&lock->lock);
100
101 /* Try to acquire the lock directly if no reader is present */
102 if (!atomic_read(&lock->cnts) &&
103 (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
104 goto unlock;
105
106 /*
107 * Set the waiting flag to notify readers that a writer is pending,
108 * or wait for a previous writer to go away.
109 */
110 for (;;) {
111 cnts = atomic_read(&lock->cnts);
112 if (!(cnts & _QW_WMASK) &&
113 (atomic_cmpxchg(&lock->cnts, cnts,
114 cnts | _QW_WAITING) == cnts))
115 break;
116
117 arch_mutex_cpu_relax();
118 }
119
120 /* When no more readers, set the locked flag */
121 for (;;) {
122 cnts = atomic_read(&lock->cnts);
123 if ((cnts == _QW_WAITING) &&
124 (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
125 _QW_LOCKED) == _QW_WAITING))
126 break;
127
128 arch_mutex_cpu_relax();
129 }
130unlock:
131 arch_spin_unlock(&lock->lock);
132}
133EXPORT_SYMBOL(queue_write_lock_slowpath);
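The new queue rwlock packs its state into one word: the low byte holds the writer state (_QW_WAITING while a writer is queued, _QW_LOCKED once it owns the lock) and the upper bits count readers in units of _QR_BIAS, while lock->lock serializes waiters into a queue. The constants live in the companion asm-generic header rather than in this file; the sketch below uses the customary values, so treat them as assumptions for illustration only:

#include <stdio.h>

#define _QW_WAITING	0x01u		/* writer queued, not yet owner */
#define _QW_LOCKED	0xffu		/* writer owns the lock         */
#define _QW_WMASK	0xffu		/* writer byte mask             */
#define _QR_SHIFT	8
#define _QR_BIAS	(1u << _QR_SHIFT)

int main(void)
{
	unsigned int cnts = 0;			/* unlocked                   */

	cnts += 2 * _QR_BIAS;			/* two readers hold the lock  */
	printf("readers=%u writer=0x%02x\n",
	       cnts >> _QR_SHIFT, cnts & _QW_WMASK);

	cnts |= _QW_WAITING;			/* a writer announces itself  */
	/* The reader-side test used by rspin_until_writer_unlock(): */
	printf("writer fully locked? %s\n",
	       (cnts & _QW_WMASK) == _QW_LOCKED ? "yes" : "no");
	return 0;
}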
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 14193d596d78..ab29b6a22669 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
31{ 31{
32 return (waiter != NULL); 32 return (waiter != NULL);
33} 33}
34
35static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
36{
37 debug_rt_mutex_print_deadlock(w);
38}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index aa4dff04b594..fc605941b9b8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
83 owner = *p; 83 owner = *p;
84 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); 84 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
85} 85}
86
87/*
88 * Safe fastpath aware unlock:
89 * 1) Clear the waiters bit
90 * 2) Drop lock->wait_lock
91 * 3) Try to unlock the lock with cmpxchg
92 */
93static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
94 __releases(lock->wait_lock)
95{
96 struct task_struct *owner = rt_mutex_owner(lock);
97
98 clear_rt_mutex_waiters(lock);
99 raw_spin_unlock(&lock->wait_lock);
100 /*
101 * If a new waiter comes in between the unlock and the cmpxchg
102 * we have two situations:
103 *
104 * unlock(wait_lock);
105 * lock(wait_lock);
106 * cmpxchg(p, owner, 0) == owner
107 * mark_rt_mutex_waiters(lock);
108 * acquire(lock);
109 * or:
110 *
111 * unlock(wait_lock);
112 * lock(wait_lock);
113 * mark_rt_mutex_waiters(lock);
114 *
115 * cmpxchg(p, owner, 0) != owner
116 * enqueue_waiter();
117 * unlock(wait_lock);
118 * lock(wait_lock);
119 * wake waiter();
120 * unlock(wait_lock);
121 * lock(wait_lock);
122 * acquire(lock);
123 */
124 return rt_mutex_cmpxchg(lock, owner, NULL);
125}
126
86#else 127#else
87# define rt_mutex_cmpxchg(l,c,n) (0) 128# define rt_mutex_cmpxchg(l,c,n) (0)
88static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) 129static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
@@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
90 lock->owner = (struct task_struct *) 131 lock->owner = (struct task_struct *)
91 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); 132 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
92} 133}
134
135/*
136 * Simple slow path only version: lock->owner is protected by lock->wait_lock.
137 */
138static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
139 __releases(lock->wait_lock)
140{
141 lock->owner = NULL;
142 raw_spin_unlock(&lock->wait_lock);
143 return true;
144}
93#endif 145#endif
94 146
95static inline int 147static inline int
@@ -260,27 +312,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
260 */ 312 */
261int max_lock_depth = 1024; 313int max_lock_depth = 1024;
262 314
315static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
316{
317 return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
318}
319
263/* 320/*
264 * Adjust the priority chain. Also used for deadlock detection. 321 * Adjust the priority chain. Also used for deadlock detection.
265 * Decreases task's usage by one - may thus free the task. 322 * Decreases task's usage by one - may thus free the task.
266 * 323 *
267 * @task: the task owning the mutex (owner) for which a chain walk is probably 324 * @task: the task owning the mutex (owner) for which a chain walk is
268 * needed 325 * probably needed
269 * @deadlock_detect: do we have to carry out deadlock detection? 326 * @deadlock_detect: do we have to carry out deadlock detection?
270 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck 327 * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
271 * things for a task that has just got its priority adjusted, and 328 * things for a task that has just got its priority adjusted, and
272 * is waiting on a mutex) 329 * is waiting on a mutex)
330 * @next_lock: the mutex on which the owner of @orig_lock was blocked before
331 * we dropped its pi_lock. Is never dereferenced, only used for
332 * comparison to detect lock chain changes.
273 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated 333 * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
274 * its priority to the mutex owner (can be NULL in the case 334 * its priority to the mutex owner (can be NULL in the case
275 * depicted above or if the top waiter is gone away and we are 335 * depicted above or if the top waiter is gone away and we are
276 * actually deboosting the owner) 336 * actually deboosting the owner)
277 * @top_task: the current top waiter 337 * @top_task: the current top waiter
278 * 338 *
279 * Returns 0 or -EDEADLK. 339 * Returns 0 or -EDEADLK.
280 */ 340 */
281static int rt_mutex_adjust_prio_chain(struct task_struct *task, 341static int rt_mutex_adjust_prio_chain(struct task_struct *task,
282 int deadlock_detect, 342 int deadlock_detect,
283 struct rt_mutex *orig_lock, 343 struct rt_mutex *orig_lock,
344 struct rt_mutex *next_lock,
284 struct rt_mutex_waiter *orig_waiter, 345 struct rt_mutex_waiter *orig_waiter,
285 struct task_struct *top_task) 346 struct task_struct *top_task)
286{ 347{
@@ -314,7 +375,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
314 } 375 }
315 put_task_struct(task); 376 put_task_struct(task);
316 377
317 return deadlock_detect ? -EDEADLK : 0; 378 return -EDEADLK;
318 } 379 }
319 retry: 380 retry:
320 /* 381 /*
@@ -339,13 +400,32 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
339 goto out_unlock_pi; 400 goto out_unlock_pi;
340 401
341 /* 402 /*
403 * We dropped all locks after taking a refcount on @task, so
404 * the task might have moved on in the lock chain or even left
405 * the chain completely and blocks now on an unrelated lock or
406 * on @orig_lock.
407 *
408 * We stored the lock on which @task was blocked in @next_lock,
409 * so we can detect the chain change.
410 */
411 if (next_lock != waiter->lock)
412 goto out_unlock_pi;
413
414 /*
342 * Drop out, when the task has no waiters. Note, 415 * Drop out, when the task has no waiters. Note,
343 * top_waiter can be NULL, when we are in the deboosting 416 * top_waiter can be NULL, when we are in the deboosting
344 * mode! 417 * mode!
345 */ 418 */
346 if (top_waiter && (!task_has_pi_waiters(task) || 419 if (top_waiter) {
347 top_waiter != task_top_pi_waiter(task))) 420 if (!task_has_pi_waiters(task))
348 goto out_unlock_pi; 421 goto out_unlock_pi;
422 /*
423 * If deadlock detection is off, we stop here if we
424 * are not the top pi waiter of the task.
425 */
426 if (!detect_deadlock && top_waiter != task_top_pi_waiter(task))
427 goto out_unlock_pi;
428 }
349 429
350 /* 430 /*
351 * When deadlock detection is off then we check, if further 431 * When deadlock detection is off then we check, if further
@@ -361,11 +441,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
361 goto retry; 441 goto retry;
362 } 442 }
363 443
364 /* Deadlock detection */ 444 /*
445 * Deadlock detection. If the lock is the same as the original
446 * lock which caused us to walk the lock chain or if the
447 * current lock is owned by the task which initiated the chain
448 * walk, we detected a deadlock.
449 */
365 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 450 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
366 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 451 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
367 raw_spin_unlock(&lock->wait_lock); 452 raw_spin_unlock(&lock->wait_lock);
368 ret = deadlock_detect ? -EDEADLK : 0; 453 ret = -EDEADLK;
369 goto out_unlock_pi; 454 goto out_unlock_pi;
370 } 455 }
371 456
@@ -410,11 +495,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
410 __rt_mutex_adjust_prio(task); 495 __rt_mutex_adjust_prio(task);
411 } 496 }
412 497
498 /*
499 * Check whether the task which owns the current lock is pi
500 * blocked itself. If yes we store a pointer to the lock for
501 * the lock chain change detection above. After we dropped
502 * task->pi_lock next_lock cannot be dereferenced anymore.
503 */
504 next_lock = task_blocked_on_lock(task);
505
413 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 506 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
414 507
415 top_waiter = rt_mutex_top_waiter(lock); 508 top_waiter = rt_mutex_top_waiter(lock);
416 raw_spin_unlock(&lock->wait_lock); 509 raw_spin_unlock(&lock->wait_lock);
417 510
511 /*
512 * We reached the end of the lock chain. Stop right here. No
513 * point to go back just to figure that out.
514 */
515 if (!next_lock)
516 goto out_put_task;
517
418 if (!detect_deadlock && waiter != top_waiter) 518 if (!detect_deadlock && waiter != top_waiter)
419 goto out_put_task; 519 goto out_put_task;
420 520
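The chain-walk rework threads a @next_lock snapshot through rt_mutex_adjust_prio_chain(): it records which lock the owner was blocked on before the pi_lock was dropped and is later only compared, never dereferenced, so a stale pointer is harmless and a mismatch simply aborts the walk. A simplified sketch of that comparison-token pattern (the types here are hypothetical stand-ins, not the kernel's):

struct lock;
struct waiter { struct lock *lock; };
struct task { struct waiter *pi_blocked_on; };

static struct lock *task_blocked_on(struct task *p)
{
	return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
}

static int chain_still_intact(struct task *p, struct lock *next_lock)
{
	/* Never dereference next_lock: it may already have been freed.
	 * A pointer comparison is enough to detect a chain change. */
	return task_blocked_on(p) == next_lock;
}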
@@ -524,8 +624,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
524{ 624{
525 struct task_struct *owner = rt_mutex_owner(lock); 625 struct task_struct *owner = rt_mutex_owner(lock);
526 struct rt_mutex_waiter *top_waiter = waiter; 626 struct rt_mutex_waiter *top_waiter = waiter;
527 unsigned long flags; 627 struct rt_mutex *next_lock;
528 int chain_walk = 0, res; 628 int chain_walk = 0, res;
629 unsigned long flags;
630
631 /*
632 * Early deadlock detection. We really don't want the task to
633 * enqueue on itself just to untangle the mess later. It's not
634 * only an optimization. We drop the locks, so another waiter
635 * can come in before the chain walk detects the deadlock. So
636 * the other will detect the deadlock and return -EDEADLOCK,
637 * which is wrong, as the other waiter is not in a deadlock
638 * situation.
639 */
640 if (owner == task)
641 return -EDEADLK;
529 642
530 raw_spin_lock_irqsave(&task->pi_lock, flags); 643 raw_spin_lock_irqsave(&task->pi_lock, flags);
531 __rt_mutex_adjust_prio(task); 644 __rt_mutex_adjust_prio(task);
@@ -545,20 +658,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
545 if (!owner) 658 if (!owner)
546 return 0; 659 return 0;
547 660
661 raw_spin_lock_irqsave(&owner->pi_lock, flags);
548 if (waiter == rt_mutex_top_waiter(lock)) { 662 if (waiter == rt_mutex_top_waiter(lock)) {
549 raw_spin_lock_irqsave(&owner->pi_lock, flags);
550 rt_mutex_dequeue_pi(owner, top_waiter); 663 rt_mutex_dequeue_pi(owner, top_waiter);
551 rt_mutex_enqueue_pi(owner, waiter); 664 rt_mutex_enqueue_pi(owner, waiter);
552 665
553 __rt_mutex_adjust_prio(owner); 666 __rt_mutex_adjust_prio(owner);
554 if (owner->pi_blocked_on) 667 if (owner->pi_blocked_on)
555 chain_walk = 1; 668 chain_walk = 1;
556 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 669 } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
557 }
558 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
559 chain_walk = 1; 670 chain_walk = 1;
671 }
672
673 /* Store the lock on which owner is blocked or NULL */
674 next_lock = task_blocked_on_lock(owner);
560 675
561 if (!chain_walk) 676 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
677 /*
678 * Even if full deadlock detection is on, if the owner is not
679 * blocked itself, we can avoid finding this out in the chain
680 * walk.
681 */
682 if (!chain_walk || !next_lock)
562 return 0; 683 return 0;
563 684
564 /* 685 /*
@@ -570,8 +691,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
570 691
571 raw_spin_unlock(&lock->wait_lock); 692 raw_spin_unlock(&lock->wait_lock);
572 693
573 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 694 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
574 task); 695 next_lock, waiter, task);
575 696
576 raw_spin_lock(&lock->wait_lock); 697 raw_spin_lock(&lock->wait_lock);
577 698
@@ -581,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
581/* 702/*
582 * Wake up the next waiter on the lock. 703 * Wake up the next waiter on the lock.
583 * 704 *
584 * Remove the top waiter from the current tasks waiter list and wake it up. 705 * Remove the top waiter from the current tasks pi waiter list and
706 * wake it up.
585 * 707 *
586 * Called with lock->wait_lock held. 708 * Called with lock->wait_lock held.
587 */ 709 */
@@ -602,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
602 */ 724 */
603 rt_mutex_dequeue_pi(current, waiter); 725 rt_mutex_dequeue_pi(current, waiter);
604 726
605 rt_mutex_set_owner(lock, NULL); 727 /*
728 * As we are waking up the top waiter, and the waiter stays
729 * queued on the lock until it gets the lock, this lock
730 * obviously has waiters. Just set the bit here and this has
731 * the added benefit of forcing all new tasks into the
732 * slow path making sure no task of lower priority than
733 * the top waiter can steal this lock.
734 */
735 lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
606 736
607 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 737 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
608 738
739 /*
740 * It's safe to dereference waiter as it cannot go away as
741 * long as we hold lock->wait_lock. The waiter task needs to
742 * acquire it in order to dequeue the waiter.
743 */
609 wake_up_process(waiter->task); 744 wake_up_process(waiter->task);
610} 745}
611 746
@@ -620,8 +755,8 @@ static void remove_waiter(struct rt_mutex *lock,
620{ 755{
621 int first = (waiter == rt_mutex_top_waiter(lock)); 756 int first = (waiter == rt_mutex_top_waiter(lock));
622 struct task_struct *owner = rt_mutex_owner(lock); 757 struct task_struct *owner = rt_mutex_owner(lock);
758 struct rt_mutex *next_lock = NULL;
623 unsigned long flags; 759 unsigned long flags;
624 int chain_walk = 0;
625 760
626 raw_spin_lock_irqsave(&current->pi_lock, flags); 761 raw_spin_lock_irqsave(&current->pi_lock, flags);
627 rt_mutex_dequeue(lock, waiter); 762 rt_mutex_dequeue(lock, waiter);
@@ -645,13 +780,13 @@ static void remove_waiter(struct rt_mutex *lock,
645 } 780 }
646 __rt_mutex_adjust_prio(owner); 781 __rt_mutex_adjust_prio(owner);
647 782
648 if (owner->pi_blocked_on) 783 /* Store the lock on which owner is blocked or NULL */
649 chain_walk = 1; 784 next_lock = task_blocked_on_lock(owner);
650 785
651 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 786 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
652 } 787 }
653 788
654 if (!chain_walk) 789 if (!next_lock)
655 return; 790 return;
656 791
657 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 792 /* gets dropped in rt_mutex_adjust_prio_chain()! */
@@ -659,7 +794,7 @@ static void remove_waiter(struct rt_mutex *lock,
659 794
660 raw_spin_unlock(&lock->wait_lock); 795 raw_spin_unlock(&lock->wait_lock);
661 796
662 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); 797 rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current);
663 798
664 raw_spin_lock(&lock->wait_lock); 799 raw_spin_lock(&lock->wait_lock);
665} 800}
@@ -672,6 +807,7 @@ static void remove_waiter(struct rt_mutex *lock,
672void rt_mutex_adjust_pi(struct task_struct *task) 807void rt_mutex_adjust_pi(struct task_struct *task)
673{ 808{
674 struct rt_mutex_waiter *waiter; 809 struct rt_mutex_waiter *waiter;
810 struct rt_mutex *next_lock;
675 unsigned long flags; 811 unsigned long flags;
676 812
677 raw_spin_lock_irqsave(&task->pi_lock, flags); 813 raw_spin_lock_irqsave(&task->pi_lock, flags);
@@ -682,12 +818,13 @@ void rt_mutex_adjust_pi(struct task_struct *task)
682 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 818 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
683 return; 819 return;
684 } 820 }
685 821 next_lock = waiter->lock;
686 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 822 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
687 823
688 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 824 /* gets dropped in rt_mutex_adjust_prio_chain()! */
689 get_task_struct(task); 825 get_task_struct(task);
690 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); 826
827 rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
691} 828}
692 829
693/** 830/**
@@ -739,6 +876,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
739 return ret; 876 return ret;
740} 877}
741 878
879static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
880 struct rt_mutex_waiter *w)
881{
882 /*
883 * If the result is not -EDEADLOCK or the caller requested
884 * deadlock detection, nothing to do here.
885 */
886 if (res != -EDEADLOCK || detect_deadlock)
887 return;
888
889 /*
 890 * Yell loudly and stop the task right here.
891 */
892 rt_mutex_print_deadlock(w);
893 while (1) {
894 set_current_state(TASK_INTERRUPTIBLE);
895 schedule();
896 }
897}
898
742/* 899/*
743 * Slow path lock function: 900 * Slow path lock function:
744 */ 901 */
@@ -778,8 +935,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
778 935
779 set_current_state(TASK_RUNNING); 936 set_current_state(TASK_RUNNING);
780 937
781 if (unlikely(ret)) 938 if (unlikely(ret)) {
782 remove_waiter(lock, &waiter); 939 remove_waiter(lock, &waiter);
940 rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter);
941 }
783 942
784 /* 943 /*
785 * try_to_take_rt_mutex() sets the waiter bit 944 * try_to_take_rt_mutex() sets the waiter bit
@@ -835,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
835 994
836 rt_mutex_deadlock_account_unlock(current); 995 rt_mutex_deadlock_account_unlock(current);
837 996
838 if (!rt_mutex_has_waiters(lock)) { 997 /*
839 lock->owner = NULL; 998 * We must be careful here if the fast path is enabled. If we
840 raw_spin_unlock(&lock->wait_lock); 999 * have no waiters queued we cannot set owner to NULL here
841 return; 1000 * because of:
1001 *
1002 * foo->lock->owner = NULL;
1003 * rtmutex_lock(foo->lock); <- fast path
1004 * free = atomic_dec_and_test(foo->refcnt);
1005 * rtmutex_unlock(foo->lock); <- fast path
1006 * if (free)
1007 * kfree(foo);
1008 * raw_spin_unlock(foo->lock->wait_lock);
1009 *
1010 * So for the fastpath enabled kernel:
1011 *
1012 * Nothing can set the waiters bit as long as we hold
1013 * lock->wait_lock. So we do the following sequence:
1014 *
1015 * owner = rt_mutex_owner(lock);
1016 * clear_rt_mutex_waiters(lock);
1017 * raw_spin_unlock(&lock->wait_lock);
1018 * if (cmpxchg(&lock->owner, owner, 0) == owner)
1019 * return;
1020 * goto retry;
1021 *
1022 * The fastpath disabled variant is simple as all access to
1023 * lock->owner is serialized by lock->wait_lock:
1024 *
1025 * lock->owner = NULL;
1026 * raw_spin_unlock(&lock->wait_lock);
1027 */
1028 while (!rt_mutex_has_waiters(lock)) {
1029 /* Drops lock->wait_lock ! */
1030 if (unlock_rt_mutex_safe(lock) == true)
1031 return;
1032 /* Relock the rtmutex and try again */
1033 raw_spin_lock(&lock->wait_lock);
842 } 1034 }
843 1035
1036 /*
1037 * The wakeup next waiter path does not suffer from the above
1038 * race. See the comments there.
1039 */
844 wakeup_next_waiter(lock); 1040 wakeup_next_waiter(lock);
845 1041
846 raw_spin_unlock(&lock->wait_lock); 1042 raw_spin_unlock(&lock->wait_lock);
@@ -1088,7 +1284,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1088 return 1; 1284 return 1;
1089 } 1285 }
1090 1286
1091 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 1287 /* We enforce deadlock detection for futexes */
1288 ret = task_blocks_on_rt_mutex(lock, waiter, task, 1);
1092 1289
1093 if (ret && !rt_mutex_owner(lock)) { 1290 if (ret && !rt_mutex_owner(lock)) {
1094 /* 1291 /*
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index a1a1dd06421d..f6a1f3c133b1 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -24,3 +24,8 @@
24#define debug_rt_mutex_print_deadlock(w) do { } while (0) 24#define debug_rt_mutex_print_deadlock(w) do { } while (0)
25#define debug_rt_mutex_detect_deadlock(w,d) (d) 25#define debug_rt_mutex_detect_deadlock(w,d) (d)
26#define debug_rt_mutex_reset_waiter(w) do { } while (0) 26#define debug_rt_mutex_reset_waiter(w) do { } while (0)
27
28static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
29{
30 WARN(1, "rtmutex deadlock detected\n");
31}
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem)
26 unsigned long flags; 26 unsigned long flags;
27 27
28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { 28 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
29 ret = (sem->activity != 0); 29 ret = (sem->count != 0);
30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 30 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
31 } 31 }
32 return ret; 32 return ret;
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
46 debug_check_no_locks_freed((void *)sem, sizeof(*sem)); 46 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
47 lockdep_init_map(&sem->dep_map, name, key, 0); 47 lockdep_init_map(&sem->dep_map, name, key, 0);
48#endif 48#endif
49 sem->activity = 0; 49 sem->count = 0;
50 raw_spin_lock_init(&sem->wait_lock); 50 raw_spin_lock_init(&sem->wait_lock);
51 INIT_LIST_HEAD(&sem->wait_list); 51 INIT_LIST_HEAD(&sem->wait_list);
52} 52}
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
95 waiter = list_entry(next, struct rwsem_waiter, list); 95 waiter = list_entry(next, struct rwsem_waiter, list);
96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE); 96 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
97 97
98 sem->activity += woken; 98 sem->count += woken;
99 99
100 out: 100 out:
101 return sem; 101 return sem;
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem)
126 126
127 raw_spin_lock_irqsave(&sem->wait_lock, flags); 127 raw_spin_lock_irqsave(&sem->wait_lock, flags);
128 128
129 if (sem->activity >= 0 && list_empty(&sem->wait_list)) { 129 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
130 /* granted */ 130 /* granted */
131 sem->activity++; 131 sem->count++;
132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 132 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
133 goto out; 133 goto out;
134 } 134 }
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem)
170 170
171 raw_spin_lock_irqsave(&sem->wait_lock, flags); 171 raw_spin_lock_irqsave(&sem->wait_lock, flags);
172 172
173 if (sem->activity >= 0 && list_empty(&sem->wait_list)) { 173 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
174 /* granted */ 174 /* granted */
175 sem->activity++; 175 sem->count++;
176 ret = 1; 176 ret = 1;
177 } 177 }
178 178
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
206 * itself into sleep and waiting for system woke it or someone 206 * itself into sleep and waiting for system woke it or someone
207 * else in the head of the wait list up. 207 * else in the head of the wait list up.
208 */ 208 */
209 if (sem->activity == 0) 209 if (sem->count == 0)
210 break; 210 break;
211 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 211 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 212 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
214 raw_spin_lock_irqsave(&sem->wait_lock, flags); 214 raw_spin_lock_irqsave(&sem->wait_lock, flags);
215 } 215 }
216 /* got the lock */ 216 /* got the lock */
217 sem->activity = -1; 217 sem->count = -1;
218 list_del(&waiter.list); 218 list_del(&waiter.list);
219 219
220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 220 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem)
235 235
236 raw_spin_lock_irqsave(&sem->wait_lock, flags); 236 raw_spin_lock_irqsave(&sem->wait_lock, flags);
237 237
238 if (sem->activity == 0) { 238 if (sem->count == 0) {
239 /* got the lock */ 239 /* got the lock */
240 sem->activity = -1; 240 sem->count = -1;
241 ret = 1; 241 ret = 1;
242 } 242 }
243 243
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem)
255 255
256 raw_spin_lock_irqsave(&sem->wait_lock, flags); 256 raw_spin_lock_irqsave(&sem->wait_lock, flags);
257 257
258 if (--sem->activity == 0 && !list_empty(&sem->wait_list)) 258 if (--sem->count == 0 && !list_empty(&sem->wait_list))
259 sem = __rwsem_wake_one_writer(sem); 259 sem = __rwsem_wake_one_writer(sem);
260 260
261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags); 261 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem)
270 270
271 raw_spin_lock_irqsave(&sem->wait_lock, flags); 271 raw_spin_lock_irqsave(&sem->wait_lock, flags);
272 272
273 sem->activity = 0; 273 sem->count = 0;
274 if (!list_empty(&sem->wait_list)) 274 if (!list_empty(&sem->wait_list))
275 sem = __rwsem_do_wake(sem, 1); 275 sem = __rwsem_do_wake(sem, 1);
276 276
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem)
287 287
288 raw_spin_lock_irqsave(&sem->wait_lock, flags); 288 raw_spin_lock_irqsave(&sem->wait_lock, flags);
289 289
290 sem->activity = 1; 290 sem->count = 1;
291 if (!list_empty(&sem->wait_list)) 291 if (!list_empty(&sem->wait_list))
292 sem = __rwsem_do_wake(sem, 0); 292 sem = __rwsem_do_wake(sem, 0);
293 293
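The rwsem-spinlock hunks are a mechanical rename of ->activity to ->count so both rwsem flavours share the same field name; the encoding is unchanged: 0 is unlocked, a positive value counts active readers, and -1 marks an active writer, with __downgrade_write() dropping a writer to a single reader. A toy model of those states (plain C, not kernel code):

#include <stdio.h>

int main(void)
{
	int count = 0;			/* unlocked                          */

	count++;			/* __down_read() granted             */
	count++;			/* a second reader                   */
	printf("%d readers active\n", count);

	count = 0;			/* both readers called __up_read()   */
	count = -1;			/* __down_write() granted            */
	printf("writer holds lock: %s\n", count == -1 ? "yes" : "no");

	count = 1;			/* __downgrade_write(): writer becomes
					 * the sole reader                   */
	return 0;
}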
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 1d66e08e897d..a2391ac135c8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -5,11 +5,66 @@
5 * 5 *
6 * Writer lock-stealing by Alex Shi <alex.shi@intel.com> 6 * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
7 * and Michel Lespinasse <walken@google.com> 7 * and Michel Lespinasse <walken@google.com>
8 *
9 * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
10 * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
8 */ 11 */
9#include <linux/rwsem.h> 12#include <linux/rwsem.h>
10#include <linux/sched.h> 13#include <linux/sched.h>
11#include <linux/init.h> 14#include <linux/init.h>
12#include <linux/export.h> 15#include <linux/export.h>
16#include <linux/sched/rt.h>
17
18#include "mcs_spinlock.h"
19
20/*
21 * Guide to the rw_semaphore's count field for common values.
22 * (32-bit case illustrated, similar for 64-bit)
23 *
24 * 0x0000000X (1) X readers active or attempting lock, no writer waiting
25 * X = #active_readers + #readers attempting to lock
26 * (X*ACTIVE_BIAS)
27 *
28 * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
29 * attempting to read lock or write lock.
30 *
31 * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
32 * X = #active readers + # readers attempting lock
33 * (X*ACTIVE_BIAS + WAITING_BIAS)
34 * (2) 1 writer attempting lock, no waiters for lock
35 * X-1 = #active readers + #readers attempting lock
36 * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
37 * (3) 1 writer active, no waiters for lock
38 * X-1 = #active readers + #readers attempting lock
39 * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
40 *
41 * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
42 * (WAITING_BIAS + ACTIVE_BIAS)
43 * (2) 1 writer active or attempting lock, no waiters for lock
44 * (ACTIVE_WRITE_BIAS)
45 *
46 * 0xffff0000 (1) There are writers or readers queued but none active
47 * or in the process of attempting lock.
48 * (WAITING_BIAS)
49 * Note: writer can attempt to steal lock for this count by adding
50 * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
51 *
52 * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
53 * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
54 *
55 * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
56 * the count becomes more than 0 for successful lock acquisition,
57 * i.e. the case where there are only readers or nobody has lock.
58 * (1st and 2nd case above).
59 *
60 * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
61 * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
62 * acquisition (i.e. nobody else has lock or attempts lock). If
63 * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
64 * are only waiters but none active (5th case above), and attempt to
65 * steal the lock.
66 *
67 */
13 68
14/* 69/*
15 * Initialize an rwsem: 70 * Initialize an rwsem:
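The new block comment documents the xadd rwsem count encodings. Below is a worked numeric check of those rows, using the 32-bit bias values the rwsem headers of this era are generally built with; they live outside this diff, so treat them as assumptions here:

#include <stdio.h>
#include <stdint.h>

#define RWSEM_ACTIVE_BIAS	0x00000001
#define RWSEM_ACTIVE_MASK	0x0000ffff
#define RWSEM_WAITING_BIAS	(-RWSEM_ACTIVE_MASK - 1)	/* 0xffff0000 */
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
	int32_t c;

	c = 3 * RWSEM_ACTIVE_BIAS;			/* 3 readers, no waiters  */
	printf("0x%08x readers only\n", (unsigned int)c);

	c = 3 * RWSEM_ACTIVE_BIAS + RWSEM_WAITING_BIAS;	/* 3 readers plus waiters */
	printf("0x%08x readers with waiters\n", (unsigned int)c);

	c = RWSEM_ACTIVE_WRITE_BIAS;			/* 1 writer, no waiters   */
	printf("0x%08x writer, no waiters\n", (unsigned int)c);

	c = RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; /* writer plus waiters  */
	printf("0x%08x writer with waiters\n", (unsigned int)c);
	return 0;
}

Compiled and run, it prints 0x00000003, 0xffff0003, 0xffff0001 and 0xfffe0001, matching the rows of the comment above.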
@@ -27,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
27 sem->count = RWSEM_UNLOCKED_VALUE; 82 sem->count = RWSEM_UNLOCKED_VALUE;
28 raw_spin_lock_init(&sem->wait_lock); 83 raw_spin_lock_init(&sem->wait_lock);
29 INIT_LIST_HEAD(&sem->wait_list); 84 INIT_LIST_HEAD(&sem->wait_list);
85#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
86 sem->owner = NULL;
87 osq_lock_init(&sem->osq);
88#endif
30} 89}
31 90
32EXPORT_SYMBOL(__init_rwsem); 91EXPORT_SYMBOL(__init_rwsem);
@@ -141,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
141} 200}
142 201
143/* 202/*
144 * wait for the read lock to be granted 203 * Wait for the read lock to be granted
145 */ 204 */
146__visible 205__visible
147struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) 206struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
@@ -188,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
188 return sem; 247 return sem;
189} 248}
190 249
250static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
251{
252 if (!(count & RWSEM_ACTIVE_MASK)) {
253 /* try acquiring the write lock */
254 if (sem->count == RWSEM_WAITING_BIAS &&
255 cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
256 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
257 if (!list_is_singular(&sem->wait_list))
258 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
259 return true;
260 }
261 }
262 return false;
263}
264
265#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
191/* 266/*
192 * wait until we successfully acquire the write lock 267 * Try to acquire write lock before the writer has been put on wait queue.
268 */
269static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
270{
271 long old, count = ACCESS_ONCE(sem->count);
272
273 while (true) {
274 if (!(count == 0 || count == RWSEM_WAITING_BIAS))
275 return false;
276
277 old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
278 if (old == count)
279 return true;
280
281 count = old;
282 }
283}
284
285static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
286{
287 struct task_struct *owner;
288 bool on_cpu = false;
289
290 if (need_resched())
291 return false;
292
293 rcu_read_lock();
294 owner = ACCESS_ONCE(sem->owner);
295 if (owner)
296 on_cpu = owner->on_cpu;
297 rcu_read_unlock();
298
299 /*
300 * If sem->owner is not set, yet we have just recently entered the
301 * slowpath, then there is a possibility reader(s) may have the lock.
302 * To be safe, avoid spinning in these situations.
303 */
304 return on_cpu;
305}
306
307static inline bool owner_running(struct rw_semaphore *sem,
308 struct task_struct *owner)
309{
310 if (sem->owner != owner)
311 return false;
312
313 /*
314 * Ensure we emit the owner->on_cpu, dereference _after_ checking
315 * sem->owner still matches owner, if that fails, owner might
316 * point to free()d memory, if it still matches, the rcu_read_lock()
317 * ensures the memory stays valid.
318 */
319 barrier();
320
321 return owner->on_cpu;
322}
323
324static noinline
325bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
326{
327 rcu_read_lock();
328 while (owner_running(sem, owner)) {
329 if (need_resched())
330 break;
331
332 arch_mutex_cpu_relax();
333 }
334 rcu_read_unlock();
335
336 /*
337 * We break out the loop above on need_resched() or when the
338 * owner changed, which is a sign for heavy contention. Return
339 * success only when sem->owner is NULL.
340 */
341 return sem->owner == NULL;
342}
343
344static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
345{
346 struct task_struct *owner;
347 bool taken = false;
348
349 preempt_disable();
350
351 /* sem->wait_lock should not be held when doing optimistic spinning */
352 if (!rwsem_can_spin_on_owner(sem))
353 goto done;
354
355 if (!osq_lock(&sem->osq))
356 goto done;
357
358 while (true) {
359 owner = ACCESS_ONCE(sem->owner);
360 if (owner && !rwsem_spin_on_owner(sem, owner))
361 break;
362
363 /* wait_lock will be acquired if write_lock is obtained */
364 if (rwsem_try_write_lock_unqueued(sem)) {
365 taken = true;
366 break;
367 }
368
369 /*
370 * When there's no owner, we might have preempted between the
 371 * owner acquiring the lock and setting the owner field. If we
 372 * are an RT task we will live-lock, because we won't let the
 373 * owner complete.
374 */
375 if (!owner && (need_resched() || rt_task(current)))
376 break;
377
378 /*
379 * The cpu_relax() call is a compiler barrier which forces
380 * everything in this loop to be re-loaded. We don't need
381 * memory barriers as we'll eventually observe the right
382 * values at the cost of a few extra spins.
383 */
384 arch_mutex_cpu_relax();
385 }
386 osq_unlock(&sem->osq);
387done:
388 preempt_enable();
389 return taken;
390}
391
392#else
393static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
394{
395 return false;
396}
397#endif
398
399/*
400 * Wait until we successfully acquire the write lock
193 */ 401 */
194__visible 402__visible
195struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) 403struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
196{ 404{
197 long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; 405 long count;
406 bool waiting = true; /* any queued threads before us */
198 struct rwsem_waiter waiter; 407 struct rwsem_waiter waiter;
199 struct task_struct *tsk = current;
200 408
201 /* set up my own style of waitqueue */ 409 /* undo write bias from down_write operation, stop active locking */
202 waiter.task = tsk; 410 count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
411
412 /* do optimistic spinning and steal lock if possible */
413 if (rwsem_optimistic_spin(sem))
414 return sem;
415
416 /*
417 * Optimistic spinning failed, proceed to the slowpath
418 * and block until we can acquire the sem.
419 */
420 waiter.task = current;
203 waiter.type = RWSEM_WAITING_FOR_WRITE; 421 waiter.type = RWSEM_WAITING_FOR_WRITE;
204 422
205 raw_spin_lock_irq(&sem->wait_lock); 423 raw_spin_lock_irq(&sem->wait_lock);
424
425 /* account for this before adding a new element to the list */
206 if (list_empty(&sem->wait_list)) 426 if (list_empty(&sem->wait_list))
207 adjustment += RWSEM_WAITING_BIAS; 427 waiting = false;
428
208 list_add_tail(&waiter.list, &sem->wait_list); 429 list_add_tail(&waiter.list, &sem->wait_list);
209 430
210 /* we're now waiting on the lock, but no longer actively locking */ 431 /* we're now waiting on the lock, but no longer actively locking */
211 count = rwsem_atomic_update(adjustment, sem); 432 if (waiting) {
433 count = ACCESS_ONCE(sem->count);
434
435 /*
436 * If there were already threads queued before us and there are
437 * no active writers, the lock must be read owned; so we try to
438 * wake any read locks that were queued ahead of us.
439 */
440 if (count > RWSEM_WAITING_BIAS)
441 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
212 442
213 /* If there were already threads queued before us and there are no 443 } else
214 * active writers, the lock must be read owned; so we try to wake 444 count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
215 * any read locks that were queued ahead of us. */
216 if (count > RWSEM_WAITING_BIAS &&
217 adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
218 sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
219 445
220 /* wait until we successfully acquire the lock */ 446 /* wait until we successfully acquire the lock */
221 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 447 set_current_state(TASK_UNINTERRUPTIBLE);
222 while (true) { 448 while (true) {
223 if (!(count & RWSEM_ACTIVE_MASK)) { 449 if (rwsem_try_write_lock(count, sem))
224 /* Try acquiring the write lock. */ 450 break;
225 count = RWSEM_ACTIVE_WRITE_BIAS;
226 if (!list_is_singular(&sem->wait_list))
227 count += RWSEM_WAITING_BIAS;
228
229 if (sem->count == RWSEM_WAITING_BIAS &&
230 cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
231 RWSEM_WAITING_BIAS)
232 break;
233 }
234
235 raw_spin_unlock_irq(&sem->wait_lock); 451 raw_spin_unlock_irq(&sem->wait_lock);
236 452
237 /* Block until there are no active lockers. */ 453 /* Block until there are no active lockers. */
238 do { 454 do {
239 schedule(); 455 schedule();
240 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 456 set_current_state(TASK_UNINTERRUPTIBLE);
241 } while ((count = sem->count) & RWSEM_ACTIVE_MASK); 457 } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
242 458
243 raw_spin_lock_irq(&sem->wait_lock); 459 raw_spin_lock_irq(&sem->wait_lock);
244 } 460 }
461 __set_current_state(TASK_RUNNING);
245 462
246 list_del(&waiter.list); 463 list_del(&waiter.list);
247 raw_spin_unlock_irq(&sem->wait_lock); 464 raw_spin_unlock_irq(&sem->wait_lock);
248 tsk->state = TASK_RUNNING;
249 465
250 return sem; 466 return sem;
251} 467}
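Editor's note: the optimistic-spinning machinery above lives entirely in the writer slowpath, so callers are unaffected. A minimal usage sketch follows; the semaphore and function names are illustrative assumptions, not part of the patch.

	#include <linux/rwsem.h>

	static DECLARE_RWSEM(example_sem);	/* hypothetical lock */

	static void example_writer(void)
	{
		/* may spin on the current owner before sleeping in the slowpath */
		down_write(&example_sem);
		/* ... modify data protected by example_sem ... */
		up_write(&example_sem);		/* clears sem->owner, wakes waiters */
	}
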
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index cfff1435bdfb..e2d3bc7f03b4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,6 +12,27 @@
12 12
13#include <linux/atomic.h> 13#include <linux/atomic.h>
14 14
15#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
16static inline void rwsem_set_owner(struct rw_semaphore *sem)
17{
18 sem->owner = current;
19}
20
21static inline void rwsem_clear_owner(struct rw_semaphore *sem)
22{
23 sem->owner = NULL;
24}
25
26#else
27static inline void rwsem_set_owner(struct rw_semaphore *sem)
28{
29}
30
31static inline void rwsem_clear_owner(struct rw_semaphore *sem)
32{
33}
34#endif
35
15/* 36/*
16 * lock for reading 37 * lock for reading
17 */ 38 */
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
48 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 69 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
49 70
50 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 71 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
72 rwsem_set_owner(sem);
51} 73}
52 74
53EXPORT_SYMBOL(down_write); 75EXPORT_SYMBOL(down_write);
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
59{ 81{
60 int ret = __down_write_trylock(sem); 82 int ret = __down_write_trylock(sem);
61 83
62 if (ret == 1) 84 if (ret == 1) {
63 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); 85 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
86 rwsem_set_owner(sem);
87 }
88
64 return ret; 89 return ret;
65} 90}
66 91
@@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem)
85{ 110{
86 rwsem_release(&sem->dep_map, 1, _RET_IP_); 111 rwsem_release(&sem->dep_map, 1, _RET_IP_);
87 112
113 rwsem_clear_owner(sem);
88 __up_write(sem); 114 __up_write(sem);
89} 115}
90 116
@@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem)
99 * lockdep: a downgraded write will live on as a write 125 * lockdep: a downgraded write will live on as a write
100 * dependency. 126 * dependency.
101 */ 127 */
128 rwsem_clear_owner(sem);
102 __downgrade_write(sem); 129 __downgrade_write(sem);
103} 130}
104 131
@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
122 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); 149 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
123 150
124 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 151 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
152 rwsem_set_owner(sem);
125} 153}
126 154
127EXPORT_SYMBOL(_down_write_nest_lock); 155EXPORT_SYMBOL(_down_write_nest_lock);
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
141 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 169 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
142 170
143 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 171 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
172 rwsem_set_owner(sem);
144} 173}
145 174
146EXPORT_SYMBOL(down_write_nested); 175EXPORT_SYMBOL(down_write_nested);
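For context, here is a hedged sketch of how the new owner tracking interacts with downgrading; the identifiers are assumptions for illustration. down_write() records current as the owner, and downgrade_write() clears it because the lock becomes reader-owned, so spinners must stop targeting a writer that no longer exists.

	#include <linux/rwsem.h>

	static DECLARE_RWSEM(example_sem);	/* hypothetical lock */

	static void example_publish_then_read(void)
	{
		down_write(&example_sem);	/* rwsem_set_owner(): owner = current */
		/* ... publish new data while excluding readers ... */
		downgrade_write(&example_sem);	/* rwsem_clear_owner(): reader-owned now */
		/* ... keep reading alongside other readers ... */
		up_read(&example_sem);
	}
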
diff --git a/kernel/module.c b/kernel/module.c
index 079c4615607d..81e727cf6df9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3020,21 +3020,6 @@ static int do_init_module(struct module *mod)
3020 */ 3020 */
3021 current->flags &= ~PF_USED_ASYNC; 3021 current->flags &= ~PF_USED_ASYNC;
3022 3022
3023 blocking_notifier_call_chain(&module_notify_list,
3024 MODULE_STATE_COMING, mod);
3025
3026 /* Set RO and NX regions for core */
3027 set_section_ro_nx(mod->module_core,
3028 mod->core_text_size,
3029 mod->core_ro_size,
3030 mod->core_size);
3031
3032 /* Set RO and NX regions for init */
3033 set_section_ro_nx(mod->module_init,
3034 mod->init_text_size,
3035 mod->init_ro_size,
3036 mod->init_size);
3037
3038 do_mod_ctors(mod); 3023 do_mod_ctors(mod);
3039 /* Start the module */ 3024 /* Start the module */
3040 if (mod->init != NULL) 3025 if (mod->init != NULL)
@@ -3165,9 +3150,26 @@ static int complete_formation(struct module *mod, struct load_info *info)
3165 /* This relies on module_mutex for list integrity. */ 3150 /* This relies on module_mutex for list integrity. */
3166 module_bug_finalize(info->hdr, info->sechdrs, mod); 3151 module_bug_finalize(info->hdr, info->sechdrs, mod);
3167 3152
3153 /* Set RO and NX regions for core */
3154 set_section_ro_nx(mod->module_core,
3155 mod->core_text_size,
3156 mod->core_ro_size,
3157 mod->core_size);
3158
3159 /* Set RO and NX regions for init */
3160 set_section_ro_nx(mod->module_init,
3161 mod->init_text_size,
3162 mod->init_ro_size,
3163 mod->init_size);
3164
3168 /* Mark state as coming so strong_try_module_get() ignores us, 3165 /* Mark state as coming so strong_try_module_get() ignores us,
3169 * but kallsyms etc. can see us. */ 3166 * but kallsyms etc. can see us. */
3170 mod->state = MODULE_STATE_COMING; 3167 mod->state = MODULE_STATE_COMING;
3168 mutex_unlock(&module_mutex);
3169
3170 blocking_notifier_call_chain(&module_notify_list,
3171 MODULE_STATE_COMING, mod);
3172 return 0;
3171 3173
3172out: 3174out:
3173 mutex_unlock(&module_mutex); 3175 mutex_unlock(&module_mutex);
@@ -3190,6 +3192,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3190{ 3192{
3191 struct module *mod; 3193 struct module *mod;
3192 long err; 3194 long err;
3195 char *after_dashes;
3193 3196
3194 err = module_sig_check(info); 3197 err = module_sig_check(info);
3195 if (err) 3198 if (err)
@@ -3277,10 +3280,15 @@ static int load_module(struct load_info *info, const char __user *uargs,
3277 goto ddebug_cleanup; 3280 goto ddebug_cleanup;
3278 3281
3279 /* Module is ready to execute: parsing args may do that. */ 3282 /* Module is ready to execute: parsing args may do that. */
3280 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 3283 after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
3281 -32768, 32767, unknown_module_param_cb); 3284 -32768, 32767, unknown_module_param_cb);
3282 if (err < 0) 3285 if (IS_ERR(after_dashes)) {
3286 err = PTR_ERR(after_dashes);
3283 goto bug_cleanup; 3287 goto bug_cleanup;
3288 } else if (after_dashes) {
3289 pr_warn("%s: parameters '%s' after `--' ignored\n",
3290 mod->name, after_dashes);
3291 }
3284 3292
3285 /* Link in to sysfs. */ 3293 /* Link in to sysfs. */
3286 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); 3294 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
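As a hedged illustration of the reordering above (callback and variable names are hypothetical): a MODULE_STATE_COMING notifier now runs after the RO/NX protections have been applied and outside module_mutex, so a callback such as the one below already sees the module sections in their final protection state.

	#include <linux/module.h>
	#include <linux/notifier.h>

	static int example_coming_cb(struct notifier_block *nb,
				     unsigned long action, void *data)
	{
		struct module *mod = data;

		if (action == MODULE_STATE_COMING)
			pr_info("module %s is coming\n", mod->name);
		return NOTIFY_OK;
	}

	static struct notifier_block example_coming_nb = {
		.notifier_call = example_coming_cb,
	};

	/* registered elsewhere with register_module_notifier(&example_coming_nb) */
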
diff --git a/kernel/notifier.c b/kernel/notifier.c
index db4c8b08a50c..4803da6eab62 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -71,9 +71,9 @@ static int notifier_chain_unregister(struct notifier_block **nl,
71 * @returns: notifier_call_chain returns the value returned by the 71 * @returns: notifier_call_chain returns the value returned by the
72 * last notifier function called. 72 * last notifier function called.
73 */ 73 */
74static int __kprobes notifier_call_chain(struct notifier_block **nl, 74static int notifier_call_chain(struct notifier_block **nl,
75 unsigned long val, void *v, 75 unsigned long val, void *v,
76 int nr_to_call, int *nr_calls) 76 int nr_to_call, int *nr_calls)
77{ 77{
78 int ret = NOTIFY_DONE; 78 int ret = NOTIFY_DONE;
79 struct notifier_block *nb, *next_nb; 79 struct notifier_block *nb, *next_nb;
@@ -102,6 +102,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
102 } 102 }
103 return ret; 103 return ret;
104} 104}
105NOKPROBE_SYMBOL(notifier_call_chain);
105 106
106/* 107/*
107 * Atomic notifier chain routines. Registration and unregistration 108 * Atomic notifier chain routines. Registration and unregistration
@@ -172,9 +173,9 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
172 * Otherwise the return value is the return value 173 * Otherwise the return value is the return value
173 * of the last notifier function called. 174 * of the last notifier function called.
174 */ 175 */
175int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, 176int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
176 unsigned long val, void *v, 177 unsigned long val, void *v,
177 int nr_to_call, int *nr_calls) 178 int nr_to_call, int *nr_calls)
178{ 179{
179 int ret; 180 int ret;
180 181
@@ -184,13 +185,15 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
184 return ret; 185 return ret;
185} 186}
186EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); 187EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
188NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
187 189
188int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, 190int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
189 unsigned long val, void *v) 191 unsigned long val, void *v)
190{ 192{
191 return __atomic_notifier_call_chain(nh, val, v, -1, NULL); 193 return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
192} 194}
193EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); 195EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
196NOKPROBE_SYMBOL(atomic_notifier_call_chain);
194 197
195/* 198/*
196 * Blocking notifier chain routines. All access to the chain is 199 * Blocking notifier chain routines. All access to the chain is
@@ -527,7 +530,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
527 530
528static ATOMIC_NOTIFIER_HEAD(die_chain); 531static ATOMIC_NOTIFIER_HEAD(die_chain);
529 532
530int notrace __kprobes notify_die(enum die_val val, const char *str, 533int notrace notify_die(enum die_val val, const char *str,
531 struct pt_regs *regs, long err, int trap, int sig) 534 struct pt_regs *regs, long err, int trap, int sig)
532{ 535{
533 struct die_args args = { 536 struct die_args args = {
@@ -540,6 +543,7 @@ int notrace __kprobes notify_die(enum die_val val, const char *str,
540 }; 543 };
541 return atomic_notifier_call_chain(&die_chain, val, &args); 544 return atomic_notifier_call_chain(&die_chain, val, &args);
542} 545}
546NOKPROBE_SYMBOL(notify_die);
543 547
544int register_die_notifier(struct notifier_block *nb) 548int register_die_notifier(struct notifier_block *nb)
545{ 549{
diff --git a/kernel/panic.c b/kernel/panic.c
index d02fa9fef46a..62e16cef9cc2 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -32,6 +32,7 @@ static unsigned long tainted_mask;
32static int pause_on_oops; 32static int pause_on_oops;
33static int pause_on_oops_flag; 33static int pause_on_oops_flag;
34static DEFINE_SPINLOCK(pause_on_oops_lock); 34static DEFINE_SPINLOCK(pause_on_oops_lock);
35static bool crash_kexec_post_notifiers;
35 36
36int panic_timeout = CONFIG_PANIC_TIMEOUT; 37int panic_timeout = CONFIG_PANIC_TIMEOUT;
37EXPORT_SYMBOL_GPL(panic_timeout); 38EXPORT_SYMBOL_GPL(panic_timeout);
@@ -112,9 +113,11 @@ void panic(const char *fmt, ...)
112 /* 113 /*
113 * If we have crashed and we have a crash kernel loaded let it handle 114 * If we have crashed and we have a crash kernel loaded let it handle
114 * everything else. 115 * everything else.
115 * Do we want to call this before we try to display a message? 116 * If we want to run this after calling panic_notifiers, pass
117 * the "crash_kexec_post_notifiers" option to the kernel.
116 */ 118 */
117 crash_kexec(NULL); 119 if (!crash_kexec_post_notifiers)
120 crash_kexec(NULL);
118 121
119 /* 122 /*
120 * Note smp_send_stop is the usual smp shutdown function, which 123 * Note smp_send_stop is the usual smp shutdown function, which
@@ -131,6 +134,15 @@ void panic(const char *fmt, ...)
131 134
132 kmsg_dump(KMSG_DUMP_PANIC); 135 kmsg_dump(KMSG_DUMP_PANIC);
133 136
137 /*
138 * If you doubt that kdump always works correctly in every situation,
139 * "crash_kexec_post_notifiers" offers you a chance to run the panic
140 * notifiers and dump kmsg before kdump.
141 * Note: since some panic notifiers can make the crashed kernel even
142 * more unstable, this can also increase the risk of kdump failing.
143 */
144 crash_kexec(NULL);
145
134 bust_spinlocks(0); 146 bust_spinlocks(0);
135 147
136 if (!panic_blink) 148 if (!panic_blink)
@@ -472,6 +484,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
472core_param(panic, panic_timeout, int, 0644); 484core_param(panic, panic_timeout, int, 0644);
473core_param(pause_on_oops, pause_on_oops, int, 0644); 485core_param(pause_on_oops, pause_on_oops, int, 0644);
474 486
487static int __init setup_crash_kexec_post_notifiers(char *s)
488{
489 crash_kexec_post_notifiers = true;
490 return 0;
491}
492early_param("crash_kexec_post_notifiers", setup_crash_kexec_post_notifiers);
493
475static int __init oops_setup(char *s) 494static int __init oops_setup(char *s)
476{ 495{
477 if (!s) 496 if (!s)
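A self-contained userspace model of the resulting panic() ordering, with stub functions standing in for the real calls; this is a simplification of the hunks above, not additional behavior.

	#include <stdbool.h>
	#include <stdio.h>

	static bool crash_kexec_post_notifiers;	/* set by the boot parameter */

	static void crash_kexec_stub(void)    { puts("kdump (if a crash kernel is loaded)"); }
	static void run_panic_notifiers(void) { puts("panic notifier chain"); }
	static void kmsg_dump_stub(void)      { puts("kmsg_dump(KMSG_DUMP_PANIC)"); }

	int main(void)
	{
		if (!crash_kexec_post_notifiers)
			crash_kexec_stub();	/* default: kdump runs first */

		run_panic_notifiers();
		kmsg_dump_stub();

		/* with the option set, kdump only runs here, after the notifiers */
		if (crash_kexec_post_notifiers)
			crash_kexec_stub();

		return 0;
	}
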
diff --git a/kernel/params.c b/kernel/params.c
index b00142e7f3ba..1e52ca233fd9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -177,13 +177,13 @@ static char *next_arg(char *args, char **param, char **val)
177} 177}
178 178
179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 179/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
180int parse_args(const char *doing, 180char *parse_args(const char *doing,
181 char *args, 181 char *args,
182 const struct kernel_param *params, 182 const struct kernel_param *params,
183 unsigned num, 183 unsigned num,
184 s16 min_level, 184 s16 min_level,
185 s16 max_level, 185 s16 max_level,
186 int (*unknown)(char *param, char *val, const char *doing)) 186 int (*unknown)(char *param, char *val, const char *doing))
187{ 187{
188 char *param, *val; 188 char *param, *val;
189 189
@@ -198,6 +198,9 @@ int parse_args(const char *doing,
198 int irq_was_disabled; 198 int irq_was_disabled;
199 199
200 args = next_arg(args, &param, &val); 200 args = next_arg(args, &param, &val);
201 /* Stop at -- */
202 if (!val && strcmp(param, "--") == 0)
203 return args;
201 irq_was_disabled = irqs_disabled(); 204 irq_was_disabled = irqs_disabled();
202 ret = parse_one(param, val, doing, params, num, 205 ret = parse_one(param, val, doing, params, num,
203 min_level, max_level, unknown); 206 min_level, max_level, unknown);
@@ -208,22 +211,22 @@ int parse_args(const char *doing,
208 switch (ret) { 211 switch (ret) {
209 case -ENOENT: 212 case -ENOENT:
210 pr_err("%s: Unknown parameter `%s'\n", doing, param); 213 pr_err("%s: Unknown parameter `%s'\n", doing, param);
211 return ret; 214 return ERR_PTR(ret);
212 case -ENOSPC: 215 case -ENOSPC:
213 pr_err("%s: `%s' too large for parameter `%s'\n", 216 pr_err("%s: `%s' too large for parameter `%s'\n",
214 doing, val ?: "", param); 217 doing, val ?: "", param);
215 return ret; 218 return ERR_PTR(ret);
216 case 0: 219 case 0:
217 break; 220 break;
218 default: 221 default:
219 pr_err("%s: `%s' invalid for parameter `%s'\n", 222 pr_err("%s: `%s' invalid for parameter `%s'\n",
220 doing, val ?: "", param); 223 doing, val ?: "", param);
221 return ret; 224 return ERR_PTR(ret);
222 } 225 }
223 } 226 }
224 227
225 /* All parsed OK. */ 228 /* All parsed OK. */
226 return 0; 229 return NULL;
227} 230}
228 231
229/* Lazy bastard, eh? */ 232/* Lazy bastard, eh? */
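A hedged caller-side sketch of the new return contract (the surrounding variables are assumptions): NULL means every parameter was parsed, an ERR_PTR() encodes a parse failure, and any other pointer is the remainder of the command line after a bare "--".

	char *after_dashes;

	after_dashes = parse_args("example", cmdline, params, num_params,
				  -32768, 32767, NULL);
	if (IS_ERR(after_dashes))
		return PTR_ERR(after_dashes);
	if (after_dashes)
		pr_warn("example: parameters '%s' after `--' ignored\n",
			after_dashes);
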
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 2fac9cc79b3d..9a83d780facd 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -257,8 +257,7 @@ config ARCH_HAS_OPP
257 bool 257 bool
258 258
259config PM_OPP 259config PM_OPP
260 bool "Operating Performance Point (OPP) Layer library" 260 bool
261 depends on ARCH_HAS_OPP
262 ---help--- 261 ---help---
263 SOCs have a standard set of tuples consisting of frequency and 262 SOCs have a standard set of tuples consisting of frequency and
264 voltage pairs that the device will support per voltage domain. This 263 voltage pairs that the device will support per voltage domain. This
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index f4f2073711d3..fcc2611d3f14 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,14 +28,16 @@
28#include <linux/syscore_ops.h> 28#include <linux/syscore_ops.h>
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/genhd.h> 30#include <linux/genhd.h>
31#include <trace/events/power.h>
31 32
32#include "power.h" 33#include "power.h"
33 34
34 35
35static int nocompress; 36static int nocompress;
36static int noresume; 37static int noresume;
38static int nohibernate;
37static int resume_wait; 39static int resume_wait;
38static int resume_delay; 40static unsigned int resume_delay;
39static char resume_file[256] = CONFIG_PM_STD_PARTITION; 41static char resume_file[256] = CONFIG_PM_STD_PARTITION;
40dev_t swsusp_resume_device; 42dev_t swsusp_resume_device;
41sector_t swsusp_resume_block; 43sector_t swsusp_resume_block;
@@ -61,6 +63,11 @@ bool freezer_test_done;
61 63
62static const struct platform_hibernation_ops *hibernation_ops; 64static const struct platform_hibernation_ops *hibernation_ops;
63 65
66bool hibernation_available(void)
67{
68 return (nohibernate == 0);
69}
70
64/** 71/**
65 * hibernation_set_ops - Set the global hibernate operations. 72 * hibernation_set_ops - Set the global hibernate operations.
66 * @ops: Hibernation operations to use in subsequent hibernation transitions. 73 * @ops: Hibernation operations to use in subsequent hibernation transitions.
@@ -228,19 +235,23 @@ static void platform_recover(int platform_mode)
228void swsusp_show_speed(struct timeval *start, struct timeval *stop, 235void swsusp_show_speed(struct timeval *start, struct timeval *stop,
229 unsigned nr_pages, char *msg) 236 unsigned nr_pages, char *msg)
230{ 237{
231 s64 elapsed_centisecs64; 238 u64 elapsed_centisecs64;
232 int centisecs; 239 unsigned int centisecs;
233 int k; 240 unsigned int k;
234 int kps; 241 unsigned int kps;
235 242
236 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); 243 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
244 /*
245 * If "(s64)elapsed_centisecs64 < 0", an absurdly long elapsed time is
246 * printed, which is obvious enough to show that something went wrong.
247 */
237 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); 248 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
238 centisecs = elapsed_centisecs64; 249 centisecs = elapsed_centisecs64;
239 if (centisecs == 0) 250 if (centisecs == 0)
240 centisecs = 1; /* avoid div-by-zero */ 251 centisecs = 1; /* avoid div-by-zero */
241 k = nr_pages * (PAGE_SIZE / 1024); 252 k = nr_pages * (PAGE_SIZE / 1024);
242 kps = (k * 100) / centisecs; 253 kps = (k * 100) / centisecs;
243 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", 254 printk(KERN_INFO "PM: %s %u kbytes in %u.%02u seconds (%u.%02u MB/s)\n",
244 msg, k, 255 msg, k,
245 centisecs / 100, centisecs % 100, 256 centisecs / 100, centisecs % 100,
246 kps / 1000, (kps % 1000) / 10); 257 kps / 1000, (kps % 1000) / 10);
@@ -288,7 +299,9 @@ static int create_image(int platform_mode)
288 299
289 in_suspend = 1; 300 in_suspend = 1;
290 save_processor_state(); 301 save_processor_state();
302 trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
291 error = swsusp_arch_suspend(); 303 error = swsusp_arch_suspend();
304 trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
292 if (error) 305 if (error)
293 printk(KERN_ERR "PM: Error %d creating hibernation image\n", 306 printk(KERN_ERR "PM: Error %d creating hibernation image\n",
294 error); 307 error);
@@ -595,7 +608,8 @@ static void power_down(void)
595 case HIBERNATION_PLATFORM: 608 case HIBERNATION_PLATFORM:
596 hibernation_platform_enter(); 609 hibernation_platform_enter();
597 case HIBERNATION_SHUTDOWN: 610 case HIBERNATION_SHUTDOWN:
598 kernel_power_off(); 611 if (pm_power_off)
612 kernel_power_off();
599 break; 613 break;
600#ifdef CONFIG_SUSPEND 614#ifdef CONFIG_SUSPEND
601 case HIBERNATION_SUSPEND: 615 case HIBERNATION_SUSPEND:
@@ -623,7 +637,8 @@ static void power_down(void)
623 * corruption after resume. 637 * corruption after resume.
624 */ 638 */
625 printk(KERN_CRIT "PM: Please power down manually\n"); 639 printk(KERN_CRIT "PM: Please power down manually\n");
626 while(1); 640 while (1)
641 cpu_relax();
627} 642}
628 643
629/** 644/**
@@ -633,6 +648,11 @@ int hibernate(void)
633{ 648{
634 int error; 649 int error;
635 650
651 if (!hibernation_available()) {
652 pr_debug("PM: Hibernation not available.\n");
653 return -EPERM;
654 }
655
636 lock_system_sleep(); 656 lock_system_sleep();
637 /* The snapshot device should not be opened while we're running */ 657 /* The snapshot device should not be opened while we're running */
638 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 658 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -725,7 +745,7 @@ static int software_resume(void)
725 /* 745 /*
726 * If the user said "noresume".. bail out early. 746 * If the user said "noresume".. bail out early.
727 */ 747 */
728 if (noresume) 748 if (noresume || !hibernation_available())
729 return 0; 749 return 0;
730 750
731 /* 751 /*
@@ -891,6 +911,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
891 int i; 911 int i;
892 char *start = buf; 912 char *start = buf;
893 913
914 if (!hibernation_available())
915 return sprintf(buf, "[disabled]\n");
916
894 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { 917 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
895 if (!hibernation_modes[i]) 918 if (!hibernation_modes[i])
896 continue; 919 continue;
@@ -925,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
925 char *p; 948 char *p;
926 int mode = HIBERNATION_INVALID; 949 int mode = HIBERNATION_INVALID;
927 950
951 if (!hibernation_available())
952 return -EPERM;
953
928 p = memchr(buf, '\n', n); 954 p = memchr(buf, '\n', n);
929 len = p ? p - buf : n; 955 len = p ? p - buf : n;
930 956
@@ -1092,6 +1118,10 @@ static int __init hibernate_setup(char *str)
1092 noresume = 1; 1118 noresume = 1;
1093 else if (!strncmp(str, "nocompress", 10)) 1119 else if (!strncmp(str, "nocompress", 10))
1094 nocompress = 1; 1120 nocompress = 1;
1121 else if (!strncmp(str, "no", 2)) {
1122 noresume = 1;
1123 nohibernate = 1;
1124 }
1095 return 1; 1125 return 1;
1096} 1126}
1097 1127
@@ -1109,13 +1139,30 @@ static int __init resumewait_setup(char *str)
1109 1139
1110static int __init resumedelay_setup(char *str) 1140static int __init resumedelay_setup(char *str)
1111{ 1141{
1112 resume_delay = simple_strtoul(str, NULL, 0); 1142 int rc = kstrtouint(str, 0, &resume_delay);
1143
1144 if (rc)
1145 return rc;
1146 return 1;
1147}
1148
1149static int __init nohibernate_setup(char *str)
1150{
1151 noresume = 1;
1152 nohibernate = 1;
1113 return 1; 1153 return 1;
1114} 1154}
1115 1155
1156static int __init kaslr_nohibernate_setup(char *str)
1157{
1158 return nohibernate_setup(str);
1159}
1160
1116__setup("noresume", noresume_setup); 1161__setup("noresume", noresume_setup);
1117__setup("resume_offset=", resume_offset_setup); 1162__setup("resume_offset=", resume_offset_setup);
1118__setup("resume=", resume_setup); 1163__setup("resume=", resume_setup);
1119__setup("hibernate=", hibernate_setup); 1164__setup("hibernate=", hibernate_setup);
1120__setup("resumewait", resumewait_setup); 1165__setup("resumewait", resumewait_setup);
1121__setup("resumedelay=", resumedelay_setup); 1166__setup("resumedelay=", resumedelay_setup);
1167__setup("nohibernate", nohibernate_setup);
1168__setup("kaslr", kaslr_nohibernate_setup);
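A minimal sketch following the same pattern as the reworked resumedelay= handler; the parameter name and variable are hypothetical. kstrtouint() rejects malformed values instead of silently accepting whatever simple_strtoul() could salvage.

	#include <linux/init.h>
	#include <linux/kernel.h>

	static unsigned int example_delay;

	static int __init example_delay_setup(char *str)
	{
		int rc = kstrtouint(str, 0, &example_delay);

		if (rc)
			return rc;
		return 1;
	}
	__setup("example_delay=", example_delay_setup);
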
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6271bc4073ef..8e90f330f139 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -279,34 +279,32 @@ static inline void pm_print_times_init(void) {}
279struct kobject *power_kobj; 279struct kobject *power_kobj;
280 280
281/** 281/**
282 * state - control system power state. 282 * state - control system sleep states.
283 * 283 *
284 * show() returns what states are supported, which is hard-coded to 284 * show() returns available sleep state labels, which may be "mem", "standby",
285 * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), 285 * "freeze" and "disk" (hibernation). See Documentation/power/states.txt for a
286 * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). 286 * description of what they mean.
287 * 287 *
288 * store() accepts one of those strings, translates it into the 288 * store() accepts one of those strings, translates it into the proper
289 * proper enumerated value, and initiates a suspend transition. 289 * enumerated value, and initiates a suspend transition.
290 */ 290 */
291static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 291static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
292 char *buf) 292 char *buf)
293{ 293{
294 char *s = buf; 294 char *s = buf;
295#ifdef CONFIG_SUSPEND 295#ifdef CONFIG_SUSPEND
296 int i; 296 suspend_state_t i;
297
298 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
299 if (pm_states[i].state)
300 s += sprintf(s,"%s ", pm_states[i].label);
297 301
298 for (i = 0; i < PM_SUSPEND_MAX; i++) {
299 if (pm_states[i] && valid_state(i))
300 s += sprintf(s,"%s ", pm_states[i]);
301 }
302#endif 302#endif
303#ifdef CONFIG_HIBERNATION 303 if (hibernation_available())
304 s += sprintf(s, "%s\n", "disk"); 304 s += sprintf(s, "disk ");
305#else
306 if (s != buf) 305 if (s != buf)
307 /* convert the last space to a newline */ 306 /* convert the last space to a newline */
308 *(s-1) = '\n'; 307 *(s-1) = '\n';
309#endif
310 return (s - buf); 308 return (s - buf);
311} 309}
312 310
@@ -314,7 +312,7 @@ static suspend_state_t decode_state(const char *buf, size_t n)
314{ 312{
315#ifdef CONFIG_SUSPEND 313#ifdef CONFIG_SUSPEND
316 suspend_state_t state = PM_SUSPEND_MIN; 314 suspend_state_t state = PM_SUSPEND_MIN;
317 const char * const *s; 315 struct pm_sleep_state *s;
318#endif 316#endif
319 char *p; 317 char *p;
320 int len; 318 int len;
@@ -328,8 +326,9 @@ static suspend_state_t decode_state(const char *buf, size_t n)
328 326
329#ifdef CONFIG_SUSPEND 327#ifdef CONFIG_SUSPEND
330 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) 328 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
331 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 329 if (s->state && len == strlen(s->label)
332 return state; 330 && !strncmp(buf, s->label, len))
331 return s->state;
333#endif 332#endif
334 333
335 return PM_SUSPEND_ON; 334 return PM_SUSPEND_ON;
@@ -447,8 +446,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
447 446
448#ifdef CONFIG_SUSPEND 447#ifdef CONFIG_SUSPEND
449 if (state < PM_SUSPEND_MAX) 448 if (state < PM_SUSPEND_MAX)
450 return sprintf(buf, "%s\n", valid_state(state) ? 449 return sprintf(buf, "%s\n", pm_states[state].state ?
451 pm_states[state] : "error"); 450 pm_states[state].label : "error");
452#endif 451#endif
453#ifdef CONFIG_HIBERNATION 452#ifdef CONFIG_HIBERNATION
454 return sprintf(buf, "disk\n"); 453 return sprintf(buf, "disk\n");
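A small userspace sketch (an assumption, not part of the patch) of what the reworked state_show() now exposes: only labels whose states are actually valid, plus "disk" when hibernation is available.

	#include <stdio.h>

	int main(void)
	{
		char buf[64];
		FILE *f = fopen("/sys/power/state", "r");

		if (f && fgets(buf, sizeof(buf), f))
			printf("supported sleep states: %s", buf);  /* e.g. "freeze mem disk" */
		if (f)
			fclose(f);
		return 0;
	}
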
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 15f37ea08719..c60f13b5270a 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -178,17 +178,20 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
178 unsigned int, char *); 178 unsigned int, char *);
179 179
180#ifdef CONFIG_SUSPEND 180#ifdef CONFIG_SUSPEND
181struct pm_sleep_state {
182 const char *label;
183 suspend_state_t state;
184};
185
181/* kernel/power/suspend.c */ 186/* kernel/power/suspend.c */
182extern const char *const pm_states[]; 187extern struct pm_sleep_state pm_states[];
183 188
184extern bool valid_state(suspend_state_t state);
185extern int suspend_devices_and_enter(suspend_state_t state); 189extern int suspend_devices_and_enter(suspend_state_t state);
186#else /* !CONFIG_SUSPEND */ 190#else /* !CONFIG_SUSPEND */
187static inline int suspend_devices_and_enter(suspend_state_t state) 191static inline int suspend_devices_and_enter(suspend_state_t state)
188{ 192{
189 return -ENOSYS; 193 return -ENOSYS;
190} 194}
191static inline bool valid_state(suspend_state_t state) { return false; }
192#endif /* !CONFIG_SUSPEND */ 195#endif /* !CONFIG_SUSPEND */
193 196
194#ifdef CONFIG_PM_TEST_SUSPEND 197#ifdef CONFIG_PM_TEST_SUSPEND
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 06ec8869dbf1..4ee194eb524b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -17,6 +17,7 @@
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/workqueue.h> 18#include <linux/workqueue.h>
19#include <linux/kmod.h> 19#include <linux/kmod.h>
20#include <trace/events/power.h>
20 21
21/* 22/*
22 * Timeout for stopping processes 23 * Timeout for stopping processes
@@ -175,6 +176,7 @@ void thaw_processes(void)
175 struct task_struct *g, *p; 176 struct task_struct *g, *p;
176 struct task_struct *curr = current; 177 struct task_struct *curr = current;
177 178
179 trace_suspend_resume(TPS("thaw_processes"), 0, true);
178 if (pm_freezing) 180 if (pm_freezing)
179 atomic_dec(&system_freezing_cnt); 181 atomic_dec(&system_freezing_cnt);
180 pm_freezing = false; 182 pm_freezing = false;
@@ -184,6 +186,7 @@ void thaw_processes(void)
184 186
185 printk("Restarting tasks ... "); 187 printk("Restarting tasks ... ");
186 188
189 __usermodehelper_set_disable_depth(UMH_FREEZING);
187 thaw_workqueues(); 190 thaw_workqueues();
188 191
189 read_lock(&tasklist_lock); 192 read_lock(&tasklist_lock);
@@ -201,6 +204,7 @@ void thaw_processes(void)
201 204
202 schedule(); 205 schedule();
203 printk("done.\n"); 206 printk("done.\n");
207 trace_suspend_resume(TPS("thaw_processes"), 0, false);
204} 208}
205 209
206void thaw_kernel_threads(void) 210void thaw_kernel_threads(void)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 8233cd4047d7..ed35a4790afe 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,13 +31,14 @@
31 31
32#include "power.h" 32#include "power.h"
33 33
34const char *const pm_states[PM_SUSPEND_MAX] = { 34struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
35 [PM_SUSPEND_FREEZE] = "freeze", 35 [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE },
36 [PM_SUSPEND_STANDBY] = "standby", 36 [PM_SUSPEND_STANDBY] = { .label = "standby", },
37 [PM_SUSPEND_MEM] = "mem", 37 [PM_SUSPEND_MEM] = { .label = "mem", },
38}; 38};
39 39
40static const struct platform_suspend_ops *suspend_ops; 40static const struct platform_suspend_ops *suspend_ops;
41static const struct platform_freeze_ops *freeze_ops;
41 42
42static bool need_suspend_ops(suspend_state_t state) 43static bool need_suspend_ops(suspend_state_t state)
43{ 44{
@@ -47,6 +48,13 @@ static bool need_suspend_ops(suspend_state_t state)
47static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); 48static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
48static bool suspend_freeze_wake; 49static bool suspend_freeze_wake;
49 50
51void freeze_set_ops(const struct platform_freeze_ops *ops)
52{
53 lock_system_sleep();
54 freeze_ops = ops;
55 unlock_system_sleep();
56}
57
50static void freeze_begin(void) 58static void freeze_begin(void)
51{ 59{
52 suspend_freeze_wake = false; 60 suspend_freeze_wake = false;
@@ -54,9 +62,11 @@ static void freeze_begin(void)
54 62
55static void freeze_enter(void) 63static void freeze_enter(void)
56{ 64{
65 cpuidle_use_deepest_state(true);
57 cpuidle_resume(); 66 cpuidle_resume();
58 wait_event(suspend_freeze_wait_head, suspend_freeze_wake); 67 wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
59 cpuidle_pause(); 68 cpuidle_pause();
69 cpuidle_use_deepest_state(false);
60} 70}
61 71
62void freeze_wake(void) 72void freeze_wake(void)
@@ -66,42 +76,62 @@ void freeze_wake(void)
66} 76}
67EXPORT_SYMBOL_GPL(freeze_wake); 77EXPORT_SYMBOL_GPL(freeze_wake);
68 78
79static bool valid_state(suspend_state_t state)
80{
81 /*
82 * The PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low-level
83 * support and must be reported as valid by the low-level
84 * implementation; no valid() callback implies that none are valid.
85 */
86 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
87}
88
89/*
90 * If this is set, the "mem" label always corresponds to the deepest sleep state
91 * available, the "standby" label corresponds to the second deepest sleep state
92 * available (if any), and the "freeze" label corresponds to the remaining
93 * available sleep state (if there is one).
94 */
95static bool relative_states;
96
97static int __init sleep_states_setup(char *str)
98{
99 relative_states = !strncmp(str, "1", 1);
100 if (relative_states) {
101 pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
102 pm_states[PM_SUSPEND_FREEZE].state = 0;
103 }
104 return 1;
105}
106
107__setup("relative_sleep_states=", sleep_states_setup);
108
69/** 109/**
70 * suspend_set_ops - Set the global suspend method table. 110 * suspend_set_ops - Set the global suspend method table.
71 * @ops: Suspend operations to use. 111 * @ops: Suspend operations to use.
72 */ 112 */
73void suspend_set_ops(const struct platform_suspend_ops *ops) 113void suspend_set_ops(const struct platform_suspend_ops *ops)
74{ 114{
115 suspend_state_t i;
116 int j = PM_SUSPEND_MAX - 1;
117
75 lock_system_sleep(); 118 lock_system_sleep();
119
76 suspend_ops = ops; 120 suspend_ops = ops;
121 for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
122 if (valid_state(i))
123 pm_states[j--].state = i;
124 else if (!relative_states)
125 pm_states[j--].state = 0;
126
127 pm_states[j--].state = PM_SUSPEND_FREEZE;
128 while (j >= PM_SUSPEND_MIN)
129 pm_states[j--].state = 0;
130
77 unlock_system_sleep(); 131 unlock_system_sleep();
78} 132}
79EXPORT_SYMBOL_GPL(suspend_set_ops); 133EXPORT_SYMBOL_GPL(suspend_set_ops);
80 134
81bool valid_state(suspend_state_t state)
82{
83 if (state == PM_SUSPEND_FREEZE) {
84#ifdef CONFIG_PM_DEBUG
85 if (pm_test_level != TEST_NONE &&
86 pm_test_level != TEST_FREEZER &&
87 pm_test_level != TEST_DEVICES &&
88 pm_test_level != TEST_PLATFORM) {
89 printk(KERN_WARNING "Unsupported pm_test mode for "
90 "freeze state, please choose "
91 "none/freezer/devices/platform.\n");
92 return false;
93 }
94#endif
95 return true;
96 }
97 /*
98 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
99 * support and need to be valid to the lowlevel
100 * implementation, no valid callback implies that none are valid.
101 */
102 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
103}
104
105/** 135/**
106 * suspend_valid_only_mem - Generic memory-only valid callback. 136 * suspend_valid_only_mem - Generic memory-only valid callback.
107 * 137 *
@@ -147,7 +177,9 @@ static int suspend_prepare(suspend_state_t state)
147 if (error) 177 if (error)
148 goto Finish; 178 goto Finish;
149 179
180 trace_suspend_resume(TPS("freeze_processes"), 0, true);
150 error = suspend_freeze_processes(); 181 error = suspend_freeze_processes();
182 trace_suspend_resume(TPS("freeze_processes"), 0, false);
151 if (!error) 183 if (!error)
152 return 0; 184 return 0;
153 185
@@ -210,7 +242,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
210 * all the devices are suspended. 242 * all the devices are suspended.
211 */ 243 */
212 if (state == PM_SUSPEND_FREEZE) { 244 if (state == PM_SUSPEND_FREEZE) {
245 trace_suspend_resume(TPS("machine_suspend"), state, true);
213 freeze_enter(); 246 freeze_enter();
247 trace_suspend_resume(TPS("machine_suspend"), state, false);
214 goto Platform_wake; 248 goto Platform_wake;
215 } 249 }
216 250
@@ -226,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
226 if (!error) { 260 if (!error) {
227 *wakeup = pm_wakeup_pending(); 261 *wakeup = pm_wakeup_pending();
228 if (!(suspend_test(TEST_CORE) || *wakeup)) { 262 if (!(suspend_test(TEST_CORE) || *wakeup)) {
263 trace_suspend_resume(TPS("machine_suspend"),
264 state, true);
229 error = suspend_ops->enter(state); 265 error = suspend_ops->enter(state);
266 trace_suspend_resume(TPS("machine_suspend"),
267 state, false);
230 events_check_enabled = false; 268 events_check_enabled = false;
231 } 269 }
232 syscore_resume(); 270 syscore_resume();
@@ -264,11 +302,14 @@ int suspend_devices_and_enter(suspend_state_t state)
264 if (need_suspend_ops(state) && !suspend_ops) 302 if (need_suspend_ops(state) && !suspend_ops)
265 return -ENOSYS; 303 return -ENOSYS;
266 304
267 trace_machine_suspend(state);
268 if (need_suspend_ops(state) && suspend_ops->begin) { 305 if (need_suspend_ops(state) && suspend_ops->begin) {
269 error = suspend_ops->begin(state); 306 error = suspend_ops->begin(state);
270 if (error) 307 if (error)
271 goto Close; 308 goto Close;
309 } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) {
310 error = freeze_ops->begin();
311 if (error)
312 goto Close;
272 } 313 }
273 suspend_console(); 314 suspend_console();
274 suspend_test_start(); 315 suspend_test_start();
@@ -294,7 +335,9 @@ int suspend_devices_and_enter(suspend_state_t state)
294 Close: 335 Close:
295 if (need_suspend_ops(state) && suspend_ops->end) 336 if (need_suspend_ops(state) && suspend_ops->end)
296 suspend_ops->end(); 337 suspend_ops->end();
297 trace_machine_suspend(PWR_EVENT_EXIT); 338 else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
339 freeze_ops->end();
340
298 return error; 341 return error;
299 342
300 Recover_platform: 343 Recover_platform:
@@ -328,20 +371,31 @@ static int enter_state(suspend_state_t state)
328{ 371{
329 int error; 372 int error;
330 373
331 if (!valid_state(state)) 374 trace_suspend_resume(TPS("suspend_enter"), state, true);
332 return -ENODEV; 375 if (state == PM_SUSPEND_FREEZE) {
333 376#ifdef CONFIG_PM_DEBUG
377 if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
378 pr_warning("PM: Unsupported test mode for freeze state, "
379 "please choose none/freezer/devices/platform.\n");
380 return -EAGAIN;
381 }
382#endif
383 } else if (!valid_state(state)) {
384 return -EINVAL;
385 }
334 if (!mutex_trylock(&pm_mutex)) 386 if (!mutex_trylock(&pm_mutex))
335 return -EBUSY; 387 return -EBUSY;
336 388
337 if (state == PM_SUSPEND_FREEZE) 389 if (state == PM_SUSPEND_FREEZE)
338 freeze_begin(); 390 freeze_begin();
339 391
392 trace_suspend_resume(TPS("sync_filesystems"), 0, true);
340 printk(KERN_INFO "PM: Syncing filesystems ... "); 393 printk(KERN_INFO "PM: Syncing filesystems ... ");
341 sys_sync(); 394 sys_sync();
342 printk("done.\n"); 395 printk("done.\n");
396 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
343 397
344 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 398 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
345 error = suspend_prepare(state); 399 error = suspend_prepare(state);
346 if (error) 400 if (error)
347 goto Unlock; 401 goto Unlock;
@@ -349,7 +403,8 @@ static int enter_state(suspend_state_t state)
349 if (suspend_test(TEST_FREEZER)) 403 if (suspend_test(TEST_FREEZER))
350 goto Finish; 404 goto Finish;
351 405
352 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 406 trace_suspend_resume(TPS("suspend_enter"), state, false);
407 pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
353 pm_restrict_gfp_mask(); 408 pm_restrict_gfp_mask();
354 error = suspend_devices_and_enter(state); 409 error = suspend_devices_and_enter(state);
355 pm_restore_gfp_mask(); 410 pm_restore_gfp_mask();
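A hedged sketch of how a platform might use the new freeze_set_ops() hook introduced above; every identifier beginning with example_ is an assumption. The begin()/end() callbacks bracket suspend-to-idle ("freeze") the same way suspend_ops->begin()/end() bracket the other sleep states.

	#include <linux/suspend.h>

	static int example_freeze_begin(void)
	{
		/* e.g. prepare firmware or wakeup hardware for suspend-to-idle */
		return 0;
	}

	static void example_freeze_end(void)
	{
		/* undo whatever example_freeze_begin() set up */
	}

	static const struct platform_freeze_ops example_freeze_ops = {
		.begin	= example_freeze_begin,
		.end	= example_freeze_end,
	};

	/* from platform init code: freeze_set_ops(&example_freeze_ops); */
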
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 9b2a1d58558d..269b097e78ea 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
92 } 92 }
93 93
94 if (state == PM_SUSPEND_MEM) { 94 if (state == PM_SUSPEND_MEM) {
95 printk(info_test, pm_states[state]); 95 printk(info_test, pm_states[state].label);
96 status = pm_suspend(state); 96 status = pm_suspend(state);
97 if (status == -ENODEV) 97 if (status == -ENODEV)
98 state = PM_SUSPEND_STANDBY; 98 state = PM_SUSPEND_STANDBY;
99 } 99 }
100 if (state == PM_SUSPEND_STANDBY) { 100 if (state == PM_SUSPEND_STANDBY) {
101 printk(info_test, pm_states[state]); 101 printk(info_test, pm_states[state].label);
102 status = pm_suspend(state); 102 status = pm_suspend(state);
103 } 103 }
104 if (status < 0) 104 if (status < 0)
@@ -136,18 +136,16 @@ static char warn_bad_state[] __initdata =
136 136
137static int __init setup_test_suspend(char *value) 137static int __init setup_test_suspend(char *value)
138{ 138{
139 unsigned i; 139 suspend_state_t i;
140 140
141 /* "=mem" ==> "mem" */ 141 /* "=mem" ==> "mem" */
142 value++; 142 value++;
143 for (i = 0; i < PM_SUSPEND_MAX; i++) { 143 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
144 if (!pm_states[i]) 144 if (!strcmp(pm_states[i].label, value)) {
145 continue; 145 test_state = pm_states[i].state;
146 if (strcmp(pm_states[i], value) != 0) 146 return 0;
147 continue; 147 }
148 test_state = (__force suspend_state_t) i; 148
149 return 0;
150 }
151 printk(warn_bad_state, value); 149 printk(warn_bad_state, value);
152 return 0; 150 return 0;
153} 151}
@@ -164,8 +162,8 @@ static int __init test_suspend(void)
164 /* PM is initialized by now; is that state testable? */ 162 /* PM is initialized by now; is that state testable? */
165 if (test_state == PM_SUSPEND_ON) 163 if (test_state == PM_SUSPEND_ON)
166 goto done; 164 goto done;
167 if (!valid_state(test_state)) { 165 if (!pm_states[test_state].state) {
168 printk(warn_bad_state, pm_states[test_state]); 166 printk(warn_bad_state, pm_states[test_state].label);
169 goto done; 167 goto done;
170 } 168 }
171 169
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8c9a4819f798..aaa3261dea5d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -567,7 +567,7 @@ static int lzo_compress_threadfn(void *data)
567 567
568/** 568/**
569 * save_image_lzo - Save the suspend image data compressed with LZO. 569 * save_image_lzo - Save the suspend image data compressed with LZO.
570 * @handle: Swap mam handle to use for saving the image. 570 * @handle: Swap map handle to use for saving the image.
571 * @snapshot: Image to read data from. 571 * @snapshot: Image to read data from.
572 * @nr_to_write: Number of pages to save. 572 * @nr_to_write: Number of pages to save.
573 */ 573 */
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 98d357584cd6..526e8911460a 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
49 struct snapshot_data *data; 49 struct snapshot_data *data;
50 int error; 50 int error;
51 51
52 if (!hibernation_available())
53 return -EPERM;
54
52 lock_system_sleep(); 55 lock_system_sleep();
53 56
54 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 57 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 7228258b85ec..13e839dbca07 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -54,20 +54,16 @@
54#include "console_cmdline.h" 54#include "console_cmdline.h"
55#include "braille.h" 55#include "braille.h"
56 56
57/* printk's without a loglevel use this.. */
58#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
59
60/* We show everything that is MORE important than this.. */
61#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
62#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
63
64int console_printk[4] = { 57int console_printk[4] = {
65 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ 58 CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
66 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 59 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
67 MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ 60 CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
68 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 61 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
69}; 62};
70 63
64/* Deferred messages from sched code are marked by this special level */
65#define SCHED_MESSAGE_LOGLEVEL -2
66
71/* 67/*
72 * Low level drivers may need that to know if they can schedule in 68 * Low level drivers may need that to know if they can schedule in
73 * their unblank() callback or not. So let's export it. 69 * their unblank() callback or not. So let's export it.
@@ -91,6 +87,29 @@ static struct lockdep_map console_lock_dep_map = {
91#endif 87#endif
92 88
93/* 89/*
90 * Helper macros to handle lockdep when locking/unlocking console_sem. We use
91 * macros instead of functions so that _RET_IP_ contains useful information.
92 */
93#define down_console_sem() do { \
94 down(&console_sem);\
95 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\
96} while (0)
97
98static int __down_trylock_console_sem(unsigned long ip)
99{
100 if (down_trylock(&console_sem))
101 return 1;
102 mutex_acquire(&console_lock_dep_map, 0, 1, ip);
103 return 0;
104}
105#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)
106
107#define up_console_sem() do { \
108 mutex_release(&console_lock_dep_map, 1, _RET_IP_);\
109 up(&console_sem);\
110} while (0)
111
112/*
94 * This is used for debugging the mess that is the VT code by 113 * This is used for debugging the mess that is the VT code by
95 * keeping track if we have the console semaphore held. It's 114 * keeping track if we have the console semaphore held. It's
96 * definitely not the perfect debug tool (we don't know if _WE_ 115 * definitely not the perfect debug tool (we don't know if _WE_
@@ -206,8 +225,9 @@ struct printk_log {
206}; 225};
207 226
208/* 227/*
209 * The logbuf_lock protects kmsg buffer, indices, counters. It is also 228 * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
210 * used in interesting ways to provide interlocking in console_unlock(); 229 * within the scheduler's rq lock. It must be released before calling
230 * console_unlock() or anything else that might wake up a process.
211 */ 231 */
212static DEFINE_RAW_SPINLOCK(logbuf_lock); 232static DEFINE_RAW_SPINLOCK(logbuf_lock);
213 233
@@ -250,9 +270,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
250static char *log_buf = __log_buf; 270static char *log_buf = __log_buf;
251static u32 log_buf_len = __LOG_BUF_LEN; 271static u32 log_buf_len = __LOG_BUF_LEN;
252 272
253/* cpu currently holding logbuf_lock */
254static volatile unsigned int logbuf_cpu = UINT_MAX;
255
256/* human readable text of the record */ 273/* human readable text of the record */
257static char *log_text(const struct printk_log *msg) 274static char *log_text(const struct printk_log *msg)
258{ 275{
@@ -297,34 +314,106 @@ static u32 log_next(u32 idx)
297 return idx + msg->len; 314 return idx + msg->len;
298} 315}
299 316
300/* insert record into the buffer, discard old ones, update heads */ 317/*
301static void log_store(int facility, int level, 318 * Check whether there is enough free space for the given message.
302 enum log_flags flags, u64 ts_nsec, 319 *
303 const char *dict, u16 dict_len, 320 * The same values of first_idx and next_idx mean that the buffer
304 const char *text, u16 text_len) 321 * is either empty or full.
322 *
323 * If the buffer is empty, we must respect the position of the indexes.
324 * They cannot be reset to the beginning of the buffer.
325 */
326static int logbuf_has_space(u32 msg_size, bool empty)
305{ 327{
306 struct printk_log *msg; 328 u32 free;
307 u32 size, pad_len;
308 329
309 /* number of '\0' padding bytes to next message */ 330 if (log_next_idx > log_first_idx || empty)
310 size = sizeof(struct printk_log) + text_len + dict_len; 331 free = max(log_buf_len - log_next_idx, log_first_idx);
311 pad_len = (-size) & (LOG_ALIGN - 1); 332 else
312 size += pad_len; 333 free = log_first_idx - log_next_idx;
334
335 /*
336 * We also need space for an empty header that signals wrapping
337 * of the buffer.
338 */
339 return free >= msg_size + sizeof(struct printk_log);
340}
313 341
342static int log_make_free_space(u32 msg_size)
343{
314 while (log_first_seq < log_next_seq) { 344 while (log_first_seq < log_next_seq) {
315 u32 free; 345 if (logbuf_has_space(msg_size, false))
346 return 0;
347 /* drop old messages until we have enough continuous space */
348 log_first_idx = log_next(log_first_idx);
349 log_first_seq++;
350 }
316 351
317 if (log_next_idx > log_first_idx) 352 /* sequence numbers are equal, so the log buffer is empty */
318 free = max(log_buf_len - log_next_idx, log_first_idx); 353 if (logbuf_has_space(msg_size, true))
319 else 354 return 0;
320 free = log_first_idx - log_next_idx;
321 355
322 if (free >= size + sizeof(struct printk_log)) 356 return -ENOMEM;
323 break; 357}
324 358
325 /* drop old messages until we have enough contiuous space */ 359/* compute the message size including the padding bytes */
326 log_first_idx = log_next(log_first_idx); 360static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
327 log_first_seq++; 361{
362 u32 size;
363
364 size = sizeof(struct printk_log) + text_len + dict_len;
365 *pad_len = (-size) & (LOG_ALIGN - 1);
366 size += *pad_len;
367
368 return size;
369}
370
371/*
372 * Define how much of the log buffer we could take at maximum. The value
373 * must be greater than two. Note that only half of the buffer is available
374 * when the index points to the middle.
375 */
376#define MAX_LOG_TAKE_PART 4
377static const char trunc_msg[] = "<truncated>";
378
379static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
380 u16 *dict_len, u32 *pad_len)
381{
382 /*
383 * The message should not take the whole buffer. Otherwise, it might
384 * get removed too soon.
385 */
386 u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
387 if (*text_len > max_text_len)
388 *text_len = max_text_len;
389 /* enable the warning message */
390 *trunc_msg_len = strlen(trunc_msg);
391 /* disable the "dict" completely */
392 *dict_len = 0;
393 /* compute the size again, this time also counting the warning message */
394 return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
395}
396
397/* insert record into the buffer, discard old ones, update heads */
398static int log_store(int facility, int level,
399 enum log_flags flags, u64 ts_nsec,
400 const char *dict, u16 dict_len,
401 const char *text, u16 text_len)
402{
403 struct printk_log *msg;
404 u32 size, pad_len;
405 u16 trunc_msg_len = 0;
406
407 /* number of '\0' padding bytes to next message */
408 size = msg_used_size(text_len, dict_len, &pad_len);
409
410 if (log_make_free_space(size)) {
411 /* truncate the message if it is too long for empty buffer */
412 size = truncate_msg(&text_len, &trunc_msg_len,
413 &dict_len, &pad_len);
414 /* survive when the log buffer is too small for trunc_msg */
415 if (log_make_free_space(size))
416 return 0;
328 } 417 }
329 418
330 if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { 419 if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
@@ -341,6 +430,10 @@ static void log_store(int facility, int level,
341 msg = (struct printk_log *)(log_buf + log_next_idx); 430 msg = (struct printk_log *)(log_buf + log_next_idx);
342 memcpy(log_text(msg), text, text_len); 431 memcpy(log_text(msg), text, text_len);
343 msg->text_len = text_len; 432 msg->text_len = text_len;
433 if (trunc_msg_len) {
434 memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
435 msg->text_len += trunc_msg_len;
436 }
344 memcpy(log_dict(msg), dict, dict_len); 437 memcpy(log_dict(msg), dict, dict_len);
345 msg->dict_len = dict_len; 438 msg->dict_len = dict_len;
346 msg->facility = facility; 439 msg->facility = facility;
@@ -356,6 +449,8 @@ static void log_store(int facility, int level,
356 /* insert message */ 449 /* insert message */
357 log_next_idx += msg->len; 450 log_next_idx += msg->len;
358 log_next_seq++; 451 log_next_seq++;
452
453 return msg->text_len;
359} 454}
360 455
361#ifdef CONFIG_SECURITY_DMESG_RESTRICT 456#ifdef CONFIG_SECURITY_DMESG_RESTRICT
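A self-contained userspace model (a simplification, with assumed constants) of the free-space rule used by logbuf_has_space() above: the usable contiguous region is either the gap after next_idx or the gap before first_idx, and one extra empty header must fit so a wrap marker can be written.

	#include <stdbool.h>
	#include <stdio.h>

	#define LOG_BUF_LEN	4096u	/* stand-in for the real log_buf_len */
	#define HDR_SIZE	16u	/* stand-in for sizeof(struct printk_log) */

	static bool model_has_space(unsigned int first_idx, unsigned int next_idx,
				    bool empty, unsigned int msg_size)
	{
		unsigned int free;

		if (next_idx > first_idx || empty)
			free = (LOG_BUF_LEN - next_idx > first_idx) ?
					LOG_BUF_LEN - next_idx : first_idx;
		else
			free = first_idx - next_idx;

		/* reserve room for the empty header that marks a wrap */
		return free >= msg_size + HDR_SIZE;
	}

	int main(void)
	{
		printf("%d\n", model_has_space(100, 1000, false, 200));	/* 1: fits */
		printf("%d\n", model_has_space(300, 200, false, 200));	/* 0: only 100 bytes free */
		return 0;
	}
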
@@ -1303,7 +1398,10 @@ static void zap_locks(void)
1303 sema_init(&console_sem, 1); 1398 sema_init(&console_sem, 1);
1304} 1399}
1305 1400
1306/* Check if we have any console registered that can be called early in boot. */ 1401/*
1402 * Check if we have any console that is capable of printing while the CPU is
1403 * booting or shutting down. Requires console_sem.
1404 */
1307static int have_callable_console(void) 1405static int have_callable_console(void)
1308{ 1406{
1309 struct console *con; 1407 struct console *con;
@@ -1333,36 +1431,22 @@ static inline int can_use_console(unsigned int cpu)
1333 * messages from a 'printk'. Return true (and with the 1431 * messages from a 'printk'. Return true (and with the
1334 * console_lock held, and 'console_locked' set) if it 1432 * console_lock held, and 'console_locked' set) if it
1335 * is successful, false otherwise. 1433 * is successful, false otherwise.
1336 *
1337 * This gets called with the 'logbuf_lock' spinlock held and
1338 * interrupts disabled. It should return with 'lockbuf_lock'
1339 * released but interrupts still disabled.
1340 */ 1434 */
1341static int console_trylock_for_printk(unsigned int cpu) 1435static int console_trylock_for_printk(unsigned int cpu)
1342 __releases(&logbuf_lock)
1343{ 1436{
1344 int retval = 0, wake = 0; 1437 if (!console_trylock())
1345 1438 return 0;
1346 if (console_trylock()) { 1439 /*
1347 retval = 1; 1440 * If we can't use the console, we need to release the console
1348 1441 * semaphore by hand to avoid flushing the buffer. We need to hold the
1349 /* 1442 * console semaphore in order to do this test safely.
1350 * If we can't use the console, we need to release 1443 */
1351 * the console semaphore by hand to avoid flushing 1444 if (!can_use_console(cpu)) {
1352 * the buffer. We need to hold the console semaphore 1445 console_locked = 0;
1353 * in order to do this test safely. 1446 up_console_sem();
1354 */ 1447 return 0;
1355 if (!can_use_console(cpu)) {
1356 console_locked = 0;
1357 wake = 1;
1358 retval = 0;
1359 }
1360 } 1448 }
1361 logbuf_cpu = UINT_MAX; 1449 return 1;
1362 raw_spin_unlock(&logbuf_lock);
1363 if (wake)
1364 up(&console_sem);
1365 return retval;
1366} 1450}
1367 1451
1368int printk_delay_msec __read_mostly; 1452int printk_delay_msec __read_mostly;
@@ -1490,11 +1574,19 @@ asmlinkage int vprintk_emit(int facility, int level,
1490 static int recursion_bug; 1574 static int recursion_bug;
1491 static char textbuf[LOG_LINE_MAX]; 1575 static char textbuf[LOG_LINE_MAX];
1492 char *text = textbuf; 1576 char *text = textbuf;
1493 size_t text_len; 1577 size_t text_len = 0;
1494 enum log_flags lflags = 0; 1578 enum log_flags lflags = 0;
1495 unsigned long flags; 1579 unsigned long flags;
1496 int this_cpu; 1580 int this_cpu;
1497 int printed_len = 0; 1581 int printed_len = 0;
1582 bool in_sched = false;
1583 /* cpu currently holding logbuf_lock in this function */
1584 static volatile unsigned int logbuf_cpu = UINT_MAX;
1585
1586 if (level == SCHED_MESSAGE_LOGLEVEL) {
1587 level = -1;
1588 in_sched = true;
1589 }
1498 1590
1499 boot_delay_msec(level); 1591 boot_delay_msec(level);
1500 printk_delay(); 1592 printk_delay();
@@ -1530,17 +1622,22 @@ asmlinkage int vprintk_emit(int facility, int level,
1530 "BUG: recent printk recursion!"; 1622 "BUG: recent printk recursion!";
1531 1623
1532 recursion_bug = 0; 1624 recursion_bug = 0;
1533 printed_len += strlen(recursion_msg); 1625 text_len = strlen(recursion_msg);
1534 /* emit KERN_CRIT message */ 1626 /* emit KERN_CRIT message */
1535 log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, 1627 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1536 NULL, 0, recursion_msg, printed_len); 1628 NULL, 0, recursion_msg, text_len);
1537 } 1629 }
1538 1630
1539 /* 1631 /*
1540 * The printf needs to come first; we need the syslog 1632 * The printf needs to come first; we need the syslog
1541 * prefix which might be passed-in as a parameter. 1633 * prefix which might be passed-in as a parameter.
1542 */ 1634 */
1543 text_len = vscnprintf(text, sizeof(textbuf), fmt, args); 1635 if (in_sched)
1636 text_len = scnprintf(text, sizeof(textbuf),
1637 KERN_WARNING "[sched_delayed] ");
1638
1639 text_len += vscnprintf(text + text_len,
1640 sizeof(textbuf) - text_len, fmt, args);
1544 1641
1545 /* mark and strip a trailing newline */ 1642 /* mark and strip a trailing newline */
1546 if (text_len && text[text_len-1] == '\n') { 1643 if (text_len && text[text_len-1] == '\n') {
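Annotation: for scheduler-deferred messages the line is now assembled in two steps — a fixed KERN_WARNING "[sched_delayed] " prefix is written with scnprintf() first, and the caller's format is appended behind it with vscnprintf(). A self-contained userspace sketch of that two-step formatting (using snprintf()/vsnprintf(); the kernel's scnprintf() differs only in capping the return value at what was actually stored):

    #include <stdarg.h>
    #include <stdio.h>

    /* Illustrative helper: fixed prefix first, then the caller's format. */
    static int emit_prefixed(char *buf, size_t size, const char *fmt, ...)
    {
            va_list args;
            int len;

            len = snprintf(buf, size, "[sched_delayed] ");
            va_start(args, fmt);
            len += vsnprintf(buf + len, size - len, fmt, args);
            va_end(args);
            return len;
    }

    int main(void)
    {
            char buf[128];
            emit_prefixed(buf, sizeof(buf), "runnable tasks: %d\n", 3);
            fputs(buf, stdout);
            return 0;
    }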
@@ -1586,9 +1683,12 @@ asmlinkage int vprintk_emit(int facility, int level,
1586 cont_flush(LOG_NEWLINE); 1683 cont_flush(LOG_NEWLINE);
1587 1684
1588 /* buffer line if possible, otherwise store it right away */ 1685 /* buffer line if possible, otherwise store it right away */
1589 if (!cont_add(facility, level, text, text_len)) 1686 if (cont_add(facility, level, text, text_len))
1590 log_store(facility, level, lflags | LOG_CONT, 0, 1687 printed_len += text_len;
1591 dict, dictlen, text, text_len); 1688 else
1689 printed_len += log_store(facility, level,
1690 lflags | LOG_CONT, 0,
1691 dict, dictlen, text, text_len);
1592 } else { 1692 } else {
1593 bool stored = false; 1693 bool stored = false;
1594 1694
@@ -1607,27 +1707,30 @@ asmlinkage int vprintk_emit(int facility, int level,
1607 cont_flush(LOG_NEWLINE); 1707 cont_flush(LOG_NEWLINE);
1608 } 1708 }
1609 1709
1610 if (!stored) 1710 if (stored)
1611 log_store(facility, level, lflags, 0, 1711 printed_len += text_len;
1612 dict, dictlen, text, text_len); 1712 else
1713 printed_len += log_store(facility, level, lflags, 0,
1714 dict, dictlen, text, text_len);
1613 } 1715 }
1614 printed_len += text_len;
1615 1716
1616 /* 1717 logbuf_cpu = UINT_MAX;
1617 * Try to acquire and then immediately release the console semaphore. 1718 raw_spin_unlock(&logbuf_lock);
1618 * The release will print out buffers and wake up /dev/kmsg and syslog() 1719
1619 * users. 1720 /* If called from the scheduler, we can not call up(). */
1620 * 1721 if (!in_sched) {
1621 * The console_trylock_for_printk() function will release 'logbuf_lock' 1722 /*
1622 * regardless of whether it actually gets the console semaphore or not. 1723 * Try to acquire and then immediately release the console
1623 */ 1724 * semaphore. The release will print out buffers and wake up
1624 if (console_trylock_for_printk(this_cpu)) 1725 * /dev/kmsg and syslog() users.
1625 console_unlock(); 1726 */
1727 if (console_trylock_for_printk(this_cpu))
1728 console_unlock();
1729 }
1626 1730
1627 lockdep_on(); 1731 lockdep_on();
1628out_restore_irqs: 1732out_restore_irqs:
1629 local_irq_restore(flags); 1733 local_irq_restore(flags);
1630
1631 return printed_len; 1734 return printed_len;
1632} 1735}
1633EXPORT_SYMBOL(vprintk_emit); 1736EXPORT_SYMBOL(vprintk_emit);
@@ -1882,16 +1985,14 @@ void suspend_console(void)
1882 printk("Suspending console(s) (use no_console_suspend to debug)\n"); 1985 printk("Suspending console(s) (use no_console_suspend to debug)\n");
1883 console_lock(); 1986 console_lock();
1884 console_suspended = 1; 1987 console_suspended = 1;
1885 up(&console_sem); 1988 up_console_sem();
1886 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
1887} 1989}
1888 1990
1889void resume_console(void) 1991void resume_console(void)
1890{ 1992{
1891 if (!console_suspend_enabled) 1993 if (!console_suspend_enabled)
1892 return; 1994 return;
1893 down(&console_sem); 1995 down_console_sem();
1894 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1895 console_suspended = 0; 1996 console_suspended = 0;
1896 console_unlock(); 1997 console_unlock();
1897} 1998}
@@ -1933,12 +2034,11 @@ void console_lock(void)
1933{ 2034{
1934 might_sleep(); 2035 might_sleep();
1935 2036
1936 down(&console_sem); 2037 down_console_sem();
1937 if (console_suspended) 2038 if (console_suspended)
1938 return; 2039 return;
1939 console_locked = 1; 2040 console_locked = 1;
1940 console_may_schedule = 1; 2041 console_may_schedule = 1;
1941 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1942} 2042}
1943EXPORT_SYMBOL(console_lock); 2043EXPORT_SYMBOL(console_lock);
1944 2044
@@ -1952,15 +2052,14 @@ EXPORT_SYMBOL(console_lock);
1952 */ 2052 */
1953int console_trylock(void) 2053int console_trylock(void)
1954{ 2054{
1955 if (down_trylock(&console_sem)) 2055 if (down_trylock_console_sem())
1956 return 0; 2056 return 0;
1957 if (console_suspended) { 2057 if (console_suspended) {
1958 up(&console_sem); 2058 up_console_sem();
1959 return 0; 2059 return 0;
1960 } 2060 }
1961 console_locked = 1; 2061 console_locked = 1;
1962 console_may_schedule = 0; 2062 console_may_schedule = 0;
1963 mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
1964 return 1; 2063 return 1;
1965} 2064}
1966EXPORT_SYMBOL(console_trylock); 2065EXPORT_SYMBOL(console_trylock);
@@ -2022,7 +2121,7 @@ void console_unlock(void)
2022 bool retry; 2121 bool retry;
2023 2122
2024 if (console_suspended) { 2123 if (console_suspended) {
2025 up(&console_sem); 2124 up_console_sem();
2026 return; 2125 return;
2027 } 2126 }
2028 2127
@@ -2043,10 +2142,15 @@ again:
2043 } 2142 }
2044 2143
2045 if (console_seq < log_first_seq) { 2144 if (console_seq < log_first_seq) {
2145 len = sprintf(text, "** %u printk messages dropped ** ",
2146 (unsigned)(log_first_seq - console_seq));
2147
2046 /* messages are gone, move to first one */ 2148 /* messages are gone, move to first one */
2047 console_seq = log_first_seq; 2149 console_seq = log_first_seq;
2048 console_idx = log_first_idx; 2150 console_idx = log_first_idx;
2049 console_prev = 0; 2151 console_prev = 0;
2152 } else {
2153 len = 0;
2050 } 2154 }
2051skip: 2155skip:
2052 if (console_seq == log_next_seq) 2156 if (console_seq == log_next_seq)
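Annotation: console_unlock() now detects overwritten records purely from sequence numbers — if console_seq has fallen behind log_first_seq, the difference is exactly the number of records lost, and a one-line notice is formatted into text ahead of the next message. A small standalone sketch of that bookkeeping (types and values are illustrative):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long console_seq = 100;   /* next record this reader wants   */
            unsigned long long log_first_seq = 117; /* oldest record still in the ring */

            if (console_seq < log_first_seq) {
                    printf("** %llu printk messages dropped ** ",
                           log_first_seq - console_seq);
                    console_seq = log_first_seq;    /* resync to the oldest survivor */
            }
            printf("next record to print is %llu\n", console_seq);
            return 0;
    }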
@@ -2071,8 +2175,8 @@ skip:
2071 } 2175 }
2072 2176
2073 level = msg->level; 2177 level = msg->level;
2074 len = msg_print_text(msg, console_prev, false, 2178 len += msg_print_text(msg, console_prev, false,
2075 text, sizeof(text)); 2179 text + len, sizeof(text) - len);
2076 console_idx = log_next(console_idx); 2180 console_idx = log_next(console_idx);
2077 console_seq++; 2181 console_seq++;
2078 console_prev = msg->flags; 2182 console_prev = msg->flags;
@@ -2084,7 +2188,6 @@ skip:
2084 local_irq_restore(flags); 2188 local_irq_restore(flags);
2085 } 2189 }
2086 console_locked = 0; 2190 console_locked = 0;
2087 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
2088 2191
2089 /* Release the exclusive_console once it is used */ 2192 /* Release the exclusive_console once it is used */
2090 if (unlikely(exclusive_console)) 2193 if (unlikely(exclusive_console))
@@ -2092,7 +2195,7 @@ skip:
2092 2195
2093 raw_spin_unlock(&logbuf_lock); 2196 raw_spin_unlock(&logbuf_lock);
2094 2197
2095 up(&console_sem); 2198 up_console_sem();
2096 2199
2097 /* 2200 /*
2098 * Someone could have filled up the buffer again, so re-check if there's 2201 * Someone could have filled up the buffer again, so re-check if there's
@@ -2137,7 +2240,7 @@ void console_unblank(void)
2137 * oops_in_progress is set to 1.. 2240 * oops_in_progress is set to 1..
2138 */ 2241 */
2139 if (oops_in_progress) { 2242 if (oops_in_progress) {
2140 if (down_trylock(&console_sem) != 0) 2243 if (down_trylock_console_sem() != 0)
2141 return; 2244 return;
2142 } else 2245 } else
2143 console_lock(); 2246 console_lock();
@@ -2413,6 +2516,7 @@ int unregister_console(struct console *console)
2413 if (console_drivers != NULL && console->flags & CON_CONSDEV) 2516 if (console_drivers != NULL && console->flags & CON_CONSDEV)
2414 console_drivers->flags |= CON_CONSDEV; 2517 console_drivers->flags |= CON_CONSDEV;
2415 2518
2519 console->flags &= ~CON_ENABLED;
2416 console_unlock(); 2520 console_unlock();
2417 console_sysfs_notify(); 2521 console_sysfs_notify();
2418 return res; 2522 return res;
@@ -2437,21 +2541,19 @@ late_initcall(printk_late_init);
2437/* 2541/*
2438 * Delayed printk version, for scheduler-internal messages: 2542 * Delayed printk version, for scheduler-internal messages:
2439 */ 2543 */
2440#define PRINTK_BUF_SIZE 512
2441
2442#define PRINTK_PENDING_WAKEUP 0x01 2544#define PRINTK_PENDING_WAKEUP 0x01
2443#define PRINTK_PENDING_SCHED 0x02 2545#define PRINTK_PENDING_OUTPUT 0x02
2444 2546
2445static DEFINE_PER_CPU(int, printk_pending); 2547static DEFINE_PER_CPU(int, printk_pending);
2446static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
2447 2548
2448static void wake_up_klogd_work_func(struct irq_work *irq_work) 2549static void wake_up_klogd_work_func(struct irq_work *irq_work)
2449{ 2550{
2450 int pending = __this_cpu_xchg(printk_pending, 0); 2551 int pending = __this_cpu_xchg(printk_pending, 0);
2451 2552
2452 if (pending & PRINTK_PENDING_SCHED) { 2553 if (pending & PRINTK_PENDING_OUTPUT) {
2453 char *buf = __get_cpu_var(printk_sched_buf); 2554 /* If trylock fails, someone else is doing the printing */
2454 pr_warn("[sched_delayed] %s", buf); 2555 if (console_trylock())
2556 console_unlock();
2455 } 2557 }
2456 2558
2457 if (pending & PRINTK_PENDING_WAKEUP) 2559 if (pending & PRINTK_PENDING_WAKEUP)
@@ -2473,23 +2575,19 @@ void wake_up_klogd(void)
2473 preempt_enable(); 2575 preempt_enable();
2474} 2576}
2475 2577
2476int printk_sched(const char *fmt, ...) 2578int printk_deferred(const char *fmt, ...)
2477{ 2579{
2478 unsigned long flags;
2479 va_list args; 2580 va_list args;
2480 char *buf;
2481 int r; 2581 int r;
2482 2582
2483 local_irq_save(flags); 2583 preempt_disable();
2484 buf = __get_cpu_var(printk_sched_buf);
2485
2486 va_start(args, fmt); 2584 va_start(args, fmt);
2487 r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); 2585 r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args);
2488 va_end(args); 2586 va_end(args);
2489 2587
2490 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); 2588 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
2491 irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); 2589 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2492 local_irq_restore(flags); 2590 preempt_enable();
2493 2591
2494 return r; 2592 return r;
2495} 2593}
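Annotation: taken together, the printk.c hunks turn the old printk_sched() (a private 512-byte per-CPU buffer printed later) into printk_deferred(): the message is formatted and stored into the main log buffer right away via vprintk_emit() at SCHED_MESSAGE_LOGLEVEL, and only the console flush is deferred through PRINTK_PENDING_OUTPUT plus an irq_work whose handler does console_trylock()/console_unlock() once it is safe. A condensed userspace sketch of that "store now, flush later" pattern (names and the stdio stand-ins are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    static bool output_pending;             /* PRINTK_PENDING_OUTPUT analogue */

    static void printk_deferred_sketch(const char *msg)
    {
            fprintf(stderr, "stored to log buffer: %s\n", msg);
            output_pending = true;          /* flush later, not from this context */
            /* the kernel queues an irq_work here rather than calling the worker */
    }

    static void flush_worker(void)          /* wake_up_klogd_work_func() analogue */
    {
            if (output_pending) {
                    output_pending = false;
                    puts("flushing the console now that locking allows it");
            }
    }

    int main(void)
    {
            printk_deferred_sketch("message from a context that cannot take console_sem");
            flush_worker();
            return 0;
    }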
diff --git a/kernel/profile.c b/kernel/profile.c
index cb980f0c731b..54bf5ba26420 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -52,9 +52,9 @@ static DEFINE_MUTEX(profile_flip_mutex);
52 52
53int profile_setup(char *str) 53int profile_setup(char *str)
54{ 54{
55 static char schedstr[] = "schedule"; 55 static const char schedstr[] = "schedule";
56 static char sleepstr[] = "sleep"; 56 static const char sleepstr[] = "sleep";
57 static char kvmstr[] = "kvm"; 57 static const char kvmstr[] = "kvm";
58 int par; 58 int par;
59 59
60 if (!strncmp(str, sleepstr, strlen(sleepstr))) { 60 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
@@ -64,12 +64,10 @@ int profile_setup(char *str)
64 str += strlen(sleepstr) + 1; 64 str += strlen(sleepstr) + 1;
65 if (get_option(&str, &par)) 65 if (get_option(&str, &par))
66 prof_shift = par; 66 prof_shift = par;
67 printk(KERN_INFO 67 pr_info("kernel sleep profiling enabled (shift: %ld)\n",
68 "kernel sleep profiling enabled (shift: %ld)\n",
69 prof_shift); 68 prof_shift);
70#else 69#else
71 printk(KERN_WARNING 70 pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
72 "kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
73#endif /* CONFIG_SCHEDSTATS */ 71#endif /* CONFIG_SCHEDSTATS */
74 } else if (!strncmp(str, schedstr, strlen(schedstr))) { 72 } else if (!strncmp(str, schedstr, strlen(schedstr))) {
75 prof_on = SCHED_PROFILING; 73 prof_on = SCHED_PROFILING;
@@ -77,8 +75,7 @@ int profile_setup(char *str)
77 str += strlen(schedstr) + 1; 75 str += strlen(schedstr) + 1;
78 if (get_option(&str, &par)) 76 if (get_option(&str, &par))
79 prof_shift = par; 77 prof_shift = par;
80 printk(KERN_INFO 78 pr_info("kernel schedule profiling enabled (shift: %ld)\n",
81 "kernel schedule profiling enabled (shift: %ld)\n",
82 prof_shift); 79 prof_shift);
83 } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { 80 } else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
84 prof_on = KVM_PROFILING; 81 prof_on = KVM_PROFILING;
@@ -86,13 +83,12 @@ int profile_setup(char *str)
86 str += strlen(kvmstr) + 1; 83 str += strlen(kvmstr) + 1;
87 if (get_option(&str, &par)) 84 if (get_option(&str, &par))
88 prof_shift = par; 85 prof_shift = par;
89 printk(KERN_INFO 86 pr_info("kernel KVM profiling enabled (shift: %ld)\n",
90 "kernel KVM profiling enabled (shift: %ld)\n",
91 prof_shift); 87 prof_shift);
92 } else if (get_option(&str, &par)) { 88 } else if (get_option(&str, &par)) {
93 prof_shift = par; 89 prof_shift = par;
94 prof_on = CPU_PROFILING; 90 prof_on = CPU_PROFILING;
95 printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", 91 pr_info("kernel profiling enabled (shift: %ld)\n",
96 prof_shift); 92 prof_shift);
97 } 93 }
98 return 1; 94 return 1;
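Annotation: the profile.c hunks are cosmetic — each raw printk(KERN_INFO ...)/printk(KERN_WARNING ...) becomes the matching pr_info()/pr_warn() helper and the option-name strings gain const. In the kernel these helpers are thin macros, roughly of the form below (the exact definitions live in include/linux/printk.h):

    #define pr_info(fmt, ...)  printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
    #define pr_warn(fmt, ...)  printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)

so the change keeps the same log levels while picking up any pr_fmt() prefix the file might define.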
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index bd30bc61bc05..7fa34f86e5ba 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -58,9 +58,11 @@ torture_param(int, fqs_duration, 0,
58 "Duration of fqs bursts (us), 0 to disable"); 58 "Duration of fqs bursts (us), 0 to disable");
59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); 59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
60torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); 60torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
61torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
61torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); 62torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
62torture_param(bool, gp_normal, false, 63torture_param(bool, gp_normal, false,
63 "Use normal (non-expedited) GP wait primitives"); 64 "Use normal (non-expedited) GP wait primitives");
65torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
64torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); 66torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
65torture_param(int, n_barrier_cbs, 0, 67torture_param(int, n_barrier_cbs, 0,
66 "# of callbacks/kthreads for barrier testing"); 68 "# of callbacks/kthreads for barrier testing");
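Annotation: gp_cond and gp_sync are new rcutorture module parameters, so specific grace-period wait primitives can now be requested directly — for example by loading the module with something like "modprobe rcutorture gp_cond=1 gp_sync=1", or booting a built-in test with "rcutorture.gp_cond=1 rcutorture.gp_sync=1" (assuming the usual module_param naming that torture_param() provides; the rcutorture test scripts may set these differently).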
@@ -138,6 +140,18 @@ static long n_barrier_attempts;
138static long n_barrier_successes; 140static long n_barrier_successes;
139static struct list_head rcu_torture_removed; 141static struct list_head rcu_torture_removed;
140 142
143static int rcu_torture_writer_state;
144#define RTWS_FIXED_DELAY 0
145#define RTWS_DELAY 1
146#define RTWS_REPLACE 2
147#define RTWS_DEF_FREE 3
148#define RTWS_EXP_SYNC 4
149#define RTWS_COND_GET 5
150#define RTWS_COND_SYNC 6
151#define RTWS_SYNC 7
152#define RTWS_STUTTER 8
153#define RTWS_STOPPING 9
154
141#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 155#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
142#define RCUTORTURE_RUNNABLE_INIT 1 156#define RCUTORTURE_RUNNABLE_INIT 1
143#else 157#else
@@ -214,6 +228,7 @@ rcu_torture_free(struct rcu_torture *p)
214 */ 228 */
215 229
216struct rcu_torture_ops { 230struct rcu_torture_ops {
231 int ttype;
217 void (*init)(void); 232 void (*init)(void);
218 int (*readlock)(void); 233 int (*readlock)(void);
219 void (*read_delay)(struct torture_random_state *rrsp); 234 void (*read_delay)(struct torture_random_state *rrsp);
@@ -222,6 +237,8 @@ struct rcu_torture_ops {
222 void (*deferred_free)(struct rcu_torture *p); 237 void (*deferred_free)(struct rcu_torture *p);
223 void (*sync)(void); 238 void (*sync)(void);
224 void (*exp_sync)(void); 239 void (*exp_sync)(void);
240 unsigned long (*get_state)(void);
241 void (*cond_sync)(unsigned long oldstate);
225 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 242 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
226 void (*cb_barrier)(void); 243 void (*cb_barrier)(void);
227 void (*fqs)(void); 244 void (*fqs)(void);
@@ -273,10 +290,48 @@ static int rcu_torture_completed(void)
273 return rcu_batches_completed(); 290 return rcu_batches_completed();
274} 291}
275 292
293/*
294 * Update callback in the pipe. This should be invoked after a grace period.
295 */
296static bool
297rcu_torture_pipe_update_one(struct rcu_torture *rp)
298{
299 int i;
300
301 i = rp->rtort_pipe_count;
302 if (i > RCU_TORTURE_PIPE_LEN)
303 i = RCU_TORTURE_PIPE_LEN;
304 atomic_inc(&rcu_torture_wcount[i]);
305 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
306 rp->rtort_mbtest = 0;
307 return true;
308 }
309 return false;
310}
311
312/*
313 * Update all callbacks in the pipe. Suitable for synchronous grace-period
314 * primitives.
315 */
316static void
317rcu_torture_pipe_update(struct rcu_torture *old_rp)
318{
319 struct rcu_torture *rp;
320 struct rcu_torture *rp1;
321
322 if (old_rp)
323 list_add(&old_rp->rtort_free, &rcu_torture_removed);
324 list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) {
325 if (rcu_torture_pipe_update_one(rp)) {
326 list_del(&rp->rtort_free);
327 rcu_torture_free(rp);
328 }
329 }
330}
331
276static void 332static void
277rcu_torture_cb(struct rcu_head *p) 333rcu_torture_cb(struct rcu_head *p)
278{ 334{
279 int i;
280 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); 335 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
281 336
282 if (torture_must_stop_irq()) { 337 if (torture_must_stop_irq()) {
@@ -284,16 +339,10 @@ rcu_torture_cb(struct rcu_head *p)
284 /* The next initialization will pick up the pieces. */ 339 /* The next initialization will pick up the pieces. */
285 return; 340 return;
286 } 341 }
287 i = rp->rtort_pipe_count; 342 if (rcu_torture_pipe_update_one(rp))
288 if (i > RCU_TORTURE_PIPE_LEN)
289 i = RCU_TORTURE_PIPE_LEN;
290 atomic_inc(&rcu_torture_wcount[i]);
291 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
292 rp->rtort_mbtest = 0;
293 rcu_torture_free(rp); 343 rcu_torture_free(rp);
294 } else { 344 else
295 cur_ops->deferred_free(rp); 345 cur_ops->deferred_free(rp);
296 }
297} 346}
298 347
299static int rcu_no_completed(void) 348static int rcu_no_completed(void)
@@ -312,6 +361,7 @@ static void rcu_sync_torture_init(void)
312} 361}
313 362
314static struct rcu_torture_ops rcu_ops = { 363static struct rcu_torture_ops rcu_ops = {
364 .ttype = RCU_FLAVOR,
315 .init = rcu_sync_torture_init, 365 .init = rcu_sync_torture_init,
316 .readlock = rcu_torture_read_lock, 366 .readlock = rcu_torture_read_lock,
317 .read_delay = rcu_read_delay, 367 .read_delay = rcu_read_delay,
@@ -320,6 +370,8 @@ static struct rcu_torture_ops rcu_ops = {
320 .deferred_free = rcu_torture_deferred_free, 370 .deferred_free = rcu_torture_deferred_free,
321 .sync = synchronize_rcu, 371 .sync = synchronize_rcu,
322 .exp_sync = synchronize_rcu_expedited, 372 .exp_sync = synchronize_rcu_expedited,
373 .get_state = get_state_synchronize_rcu,
374 .cond_sync = cond_synchronize_rcu,
323 .call = call_rcu, 375 .call = call_rcu,
324 .cb_barrier = rcu_barrier, 376 .cb_barrier = rcu_barrier,
325 .fqs = rcu_force_quiescent_state, 377 .fqs = rcu_force_quiescent_state,
@@ -355,6 +407,7 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
355} 407}
356 408
357static struct rcu_torture_ops rcu_bh_ops = { 409static struct rcu_torture_ops rcu_bh_ops = {
410 .ttype = RCU_BH_FLAVOR,
358 .init = rcu_sync_torture_init, 411 .init = rcu_sync_torture_init,
359 .readlock = rcu_bh_torture_read_lock, 412 .readlock = rcu_bh_torture_read_lock,
360 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 413 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -397,6 +450,7 @@ call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
397} 450}
398 451
399static struct rcu_torture_ops rcu_busted_ops = { 452static struct rcu_torture_ops rcu_busted_ops = {
453 .ttype = INVALID_RCU_FLAVOR,
400 .init = rcu_sync_torture_init, 454 .init = rcu_sync_torture_init,
401 .readlock = rcu_torture_read_lock, 455 .readlock = rcu_torture_read_lock,
402 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 456 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -479,9 +533,11 @@ static void srcu_torture_stats(char *page)
479 page += sprintf(page, "%s%s per-CPU(idx=%d):", 533 page += sprintf(page, "%s%s per-CPU(idx=%d):",
480 torture_type, TORTURE_FLAG, idx); 534 torture_type, TORTURE_FLAG, idx);
481 for_each_possible_cpu(cpu) { 535 for_each_possible_cpu(cpu) {
482 page += sprintf(page, " %d(%lu,%lu)", cpu, 536 long c0, c1;
483 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], 537
484 per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); 538 c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
539 c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
540 page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1);
485 } 541 }
486 sprintf(page, "\n"); 542 sprintf(page, "\n");
487} 543}
@@ -492,6 +548,7 @@ static void srcu_torture_synchronize_expedited(void)
492} 548}
493 549
494static struct rcu_torture_ops srcu_ops = { 550static struct rcu_torture_ops srcu_ops = {
551 .ttype = SRCU_FLAVOR,
495 .init = rcu_sync_torture_init, 552 .init = rcu_sync_torture_init,
496 .readlock = srcu_torture_read_lock, 553 .readlock = srcu_torture_read_lock,
497 .read_delay = srcu_read_delay, 554 .read_delay = srcu_read_delay,
@@ -527,6 +584,7 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
527} 584}
528 585
529static struct rcu_torture_ops sched_ops = { 586static struct rcu_torture_ops sched_ops = {
587 .ttype = RCU_SCHED_FLAVOR,
530 .init = rcu_sync_torture_init, 588 .init = rcu_sync_torture_init,
531 .readlock = sched_torture_read_lock, 589 .readlock = sched_torture_read_lock,
532 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 590 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
@@ -688,23 +746,59 @@ rcu_torture_fqs(void *arg)
688static int 746static int
689rcu_torture_writer(void *arg) 747rcu_torture_writer(void *arg)
690{ 748{
691 bool exp; 749 unsigned long gp_snap;
750 bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
751 bool gp_sync1 = gp_sync;
692 int i; 752 int i;
693 struct rcu_torture *rp; 753 struct rcu_torture *rp;
694 struct rcu_torture *rp1;
695 struct rcu_torture *old_rp; 754 struct rcu_torture *old_rp;
696 static DEFINE_TORTURE_RANDOM(rand); 755 static DEFINE_TORTURE_RANDOM(rand);
756 int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
757 RTWS_COND_GET, RTWS_SYNC };
758 int nsynctypes = 0;
697 759
698 VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); 760 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
699 set_user_nice(current, MAX_NICE); 761
762 /* Initialize synctype[] array. If none set, take default. */
763 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync)
764 gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
765 if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
766 synctype[nsynctypes++] = RTWS_COND_GET;
767 else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync))
768 pr_alert("rcu_torture_writer: gp_cond without primitives.\n");
769 if (gp_exp1 && cur_ops->exp_sync)
770 synctype[nsynctypes++] = RTWS_EXP_SYNC;
771 else if (gp_exp && !cur_ops->exp_sync)
772 pr_alert("rcu_torture_writer: gp_exp without primitives.\n");
773 if (gp_normal1 && cur_ops->deferred_free)
774 synctype[nsynctypes++] = RTWS_DEF_FREE;
775 else if (gp_normal && !cur_ops->deferred_free)
776 pr_alert("rcu_torture_writer: gp_normal without primitives.\n");
777 if (gp_sync1 && cur_ops->sync)
778 synctype[nsynctypes++] = RTWS_SYNC;
779 else if (gp_sync && !cur_ops->sync)
780 pr_alert("rcu_torture_writer: gp_sync without primitives.\n");
781 if (WARN_ONCE(nsynctypes == 0,
782 "rcu_torture_writer: No update-side primitives.\n")) {
783 /*
784 * No updates primitives, so don't try updating.
785 * The resulting test won't be testing much, hence the
786 * above WARN_ONCE().
787 */
788 rcu_torture_writer_state = RTWS_STOPPING;
789 torture_kthread_stopping("rcu_torture_writer");
790 }
700 791
701 do { 792 do {
793 rcu_torture_writer_state = RTWS_FIXED_DELAY;
702 schedule_timeout_uninterruptible(1); 794 schedule_timeout_uninterruptible(1);
703 rp = rcu_torture_alloc(); 795 rp = rcu_torture_alloc();
704 if (rp == NULL) 796 if (rp == NULL)
705 continue; 797 continue;
706 rp->rtort_pipe_count = 0; 798 rp->rtort_pipe_count = 0;
799 rcu_torture_writer_state = RTWS_DELAY;
707 udelay(torture_random(&rand) & 0x3ff); 800 udelay(torture_random(&rand) & 0x3ff);
801 rcu_torture_writer_state = RTWS_REPLACE;
708 old_rp = rcu_dereference_check(rcu_torture_current, 802 old_rp = rcu_dereference_check(rcu_torture_current,
709 current == writer_task); 803 current == writer_task);
710 rp->rtort_mbtest = 1; 804 rp->rtort_mbtest = 1;
@@ -716,35 +810,42 @@ rcu_torture_writer(void *arg)
716 i = RCU_TORTURE_PIPE_LEN; 810 i = RCU_TORTURE_PIPE_LEN;
717 atomic_inc(&rcu_torture_wcount[i]); 811 atomic_inc(&rcu_torture_wcount[i]);
718 old_rp->rtort_pipe_count++; 812 old_rp->rtort_pipe_count++;
719 if (gp_normal == gp_exp) 813 switch (synctype[torture_random(&rand) % nsynctypes]) {
720 exp = !!(torture_random(&rand) & 0x80); 814 case RTWS_DEF_FREE:
721 else 815 rcu_torture_writer_state = RTWS_DEF_FREE;
722 exp = gp_exp;
723 if (!exp) {
724 cur_ops->deferred_free(old_rp); 816 cur_ops->deferred_free(old_rp);
725 } else { 817 break;
818 case RTWS_EXP_SYNC:
819 rcu_torture_writer_state = RTWS_EXP_SYNC;
726 cur_ops->exp_sync(); 820 cur_ops->exp_sync();
727 list_add(&old_rp->rtort_free, 821 rcu_torture_pipe_update(old_rp);
728 &rcu_torture_removed); 822 break;
729 list_for_each_entry_safe(rp, rp1, 823 case RTWS_COND_GET:
730 &rcu_torture_removed, 824 rcu_torture_writer_state = RTWS_COND_GET;
731 rtort_free) { 825 gp_snap = cur_ops->get_state();
732 i = rp->rtort_pipe_count; 826 i = torture_random(&rand) % 16;
733 if (i > RCU_TORTURE_PIPE_LEN) 827 if (i != 0)
734 i = RCU_TORTURE_PIPE_LEN; 828 schedule_timeout_interruptible(i);
735 atomic_inc(&rcu_torture_wcount[i]); 829 udelay(torture_random(&rand) % 1000);
736 if (++rp->rtort_pipe_count >= 830 rcu_torture_writer_state = RTWS_COND_SYNC;
737 RCU_TORTURE_PIPE_LEN) { 831 cur_ops->cond_sync(gp_snap);
738 rp->rtort_mbtest = 0; 832 rcu_torture_pipe_update(old_rp);
739 list_del(&rp->rtort_free); 833 break;
740 rcu_torture_free(rp); 834 case RTWS_SYNC:
741 } 835 rcu_torture_writer_state = RTWS_SYNC;
742 } 836 cur_ops->sync();
837 rcu_torture_pipe_update(old_rp);
838 break;
839 default:
840 WARN_ON_ONCE(1);
841 break;
743 } 842 }
744 } 843 }
745 rcutorture_record_progress(++rcu_torture_current_version); 844 rcutorture_record_progress(++rcu_torture_current_version);
845 rcu_torture_writer_state = RTWS_STUTTER;
746 stutter_wait("rcu_torture_writer"); 846 stutter_wait("rcu_torture_writer");
747 } while (!torture_must_stop()); 847 } while (!torture_must_stop());
848 rcu_torture_writer_state = RTWS_STOPPING;
748 torture_kthread_stopping("rcu_torture_writer"); 849 torture_kthread_stopping("rcu_torture_writer");
749 return 0; 850 return 0;
750} 851}
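Annotation: the RTWS_COND_GET/RTWS_COND_SYNC arm above exercises the new conditional grace-period API — take a cheap snapshot, do unrelated work for a while, then wait only if a full grace period has not already elapsed since the snapshot. A minimal updater-side usage sketch, assuming the flavor provides get_state_synchronize_rcu()/cond_synchronize_rcu() as rcu_ops does; gp, gp_lock, newp and do_other_update_work() are illustrative names:

    struct foo *oldp;
    unsigned long snap;

    oldp = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
    rcu_assign_pointer(gp, newp);          /* unpublish the old version first         */
    snap = get_state_synchronize_rcu();    /* record the current grace-period state   */
    do_other_update_work();                /* overlap useful work with the GP         */
    cond_synchronize_rcu(snap);            /* blocks only if no GP elapsed since snap */
    kfree(oldp);                           /* all readers that could see oldp are done */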
@@ -784,7 +885,7 @@ rcu_torture_fakewriter(void *arg)
784 return 0; 885 return 0;
785} 886}
786 887
787void rcutorture_trace_dump(void) 888static void rcutorture_trace_dump(void)
788{ 889{
789 static atomic_t beenhere = ATOMIC_INIT(0); 890 static atomic_t beenhere = ATOMIC_INIT(0);
790 891
@@ -918,11 +1019,13 @@ rcu_torture_reader(void *arg)
918 __this_cpu_inc(rcu_torture_batch[completed]); 1019 __this_cpu_inc(rcu_torture_batch[completed]);
919 preempt_enable(); 1020 preempt_enable();
920 cur_ops->readunlock(idx); 1021 cur_ops->readunlock(idx);
921 schedule(); 1022 cond_resched();
922 stutter_wait("rcu_torture_reader"); 1023 stutter_wait("rcu_torture_reader");
923 } while (!torture_must_stop()); 1024 } while (!torture_must_stop());
924 if (irqreader && cur_ops->irq_capable) 1025 if (irqreader && cur_ops->irq_capable) {
925 del_timer_sync(&t); 1026 del_timer_sync(&t);
1027 destroy_timer_on_stack(&t);
1028 }
926 torture_kthread_stopping("rcu_torture_reader"); 1029 torture_kthread_stopping("rcu_torture_reader");
927 return 0; 1030 return 0;
928} 1031}
@@ -937,6 +1040,7 @@ rcu_torture_printk(char *page)
937 int i; 1040 int i;
938 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1041 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
939 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; 1042 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
1043 static unsigned long rtcv_snap = ULONG_MAX;
940 1044
941 for_each_possible_cpu(cpu) { 1045 for_each_possible_cpu(cpu) {
942 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1046 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
@@ -997,6 +1101,22 @@ rcu_torture_printk(char *page)
997 page += sprintf(page, "\n"); 1101 page += sprintf(page, "\n");
998 if (cur_ops->stats) 1102 if (cur_ops->stats)
999 cur_ops->stats(page); 1103 cur_ops->stats(page);
1104 if (rtcv_snap == rcu_torture_current_version &&
1105 rcu_torture_current != NULL) {
1106 int __maybe_unused flags;
1107 unsigned long __maybe_unused gpnum;
1108 unsigned long __maybe_unused completed;
1109
1110 rcutorture_get_gp_data(cur_ops->ttype,
1111 &flags, &gpnum, &completed);
1112 page += sprintf(page,
1113 "??? Writer stall state %d g%lu c%lu f%#x\n",
1114 rcu_torture_writer_state,
1115 gpnum, completed, flags);
1116 show_rcu_gp_kthreads();
1117 rcutorture_trace_dump();
1118 }
1119 rtcv_snap = rcu_torture_current_version;
1000} 1120}
1001 1121
1002/* 1122/*
@@ -1146,7 +1266,7 @@ static int __init rcu_torture_stall_init(void)
1146} 1266}
1147 1267
1148/* Callback function for RCU barrier testing. */ 1268/* Callback function for RCU barrier testing. */
1149void rcu_torture_barrier_cbf(struct rcu_head *rcu) 1269static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1150{ 1270{
1151 atomic_inc(&barrier_cbs_invoked); 1271 atomic_inc(&barrier_cbs_invoked);
1152} 1272}
@@ -1416,7 +1536,8 @@ rcu_torture_init(void)
1416 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, 1536 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
1417 }; 1537 };
1418 1538
1419 torture_init_begin(torture_type, verbose, &rcutorture_runnable); 1539 if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable))
1540 return -EBUSY;
1420 1541
1421 /* Process args and tell the world that the torturer is on the job. */ 1542 /* Process args and tell the world that the torturer is on the job. */
1422 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1543 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
@@ -1441,10 +1562,13 @@ rcu_torture_init(void)
1441 if (cur_ops->init) 1562 if (cur_ops->init)
1442 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1563 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1443 1564
1444 if (nreaders >= 0) 1565 if (nreaders >= 0) {
1445 nrealreaders = nreaders; 1566 nrealreaders = nreaders;
1446 else 1567 } else {
1447 nrealreaders = 2 * num_online_cpus(); 1568 nrealreaders = num_online_cpus() - 1;
1569 if (nrealreaders <= 0)
1570 nrealreaders = 1;
1571 }
1448 rcu_torture_print_module_parms(cur_ops, "Start of test"); 1572 rcu_torture_print_module_parms(cur_ops, "Start of test");
1449 1573
1450 /* Set up the freelist. */ 1574 /* Set up the freelist. */
@@ -1533,7 +1657,8 @@ rcu_torture_init(void)
1533 fqs_duration = 0; 1657 fqs_duration = 0;
1534 if (fqs_duration) { 1658 if (fqs_duration) {
1535 /* Create the fqs thread */ 1659 /* Create the fqs thread */
1536 torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); 1660 firsterr = torture_create_kthread(rcu_torture_fqs, NULL,
1661 fqs_task);
1537 if (firsterr) 1662 if (firsterr)
1538 goto unwind; 1663 goto unwind;
1539 } 1664 }
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 431528520562..858c56569127 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -144,7 +144,7 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
144 return; 144 return;
145 rcp->ticks_this_gp++; 145 rcp->ticks_this_gp++;
146 j = jiffies; 146 j = jiffies;
147 js = rcp->jiffies_stall; 147 js = ACCESS_ONCE(rcp->jiffies_stall);
148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) { 148 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", 149 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, 150 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
@@ -152,17 +152,17 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
152 dump_stack(); 152 dump_stack();
153 } 153 }
154 if (*rcp->curtail && ULONG_CMP_GE(j, js)) 154 if (*rcp->curtail && ULONG_CMP_GE(j, js))
155 rcp->jiffies_stall = jiffies + 155 ACCESS_ONCE(rcp->jiffies_stall) = jiffies +
156 3 * rcu_jiffies_till_stall_check() + 3; 156 3 * rcu_jiffies_till_stall_check() + 3;
157 else if (ULONG_CMP_GE(j, js)) 157 else if (ULONG_CMP_GE(j, js))
158 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 158 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
159} 159}
160 160
161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) 161static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
162{ 162{
163 rcp->ticks_this_gp = 0; 163 rcp->ticks_this_gp = 0;
164 rcp->gp_start = jiffies; 164 rcp->gp_start = jiffies;
165 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 165 ACCESS_ONCE(rcp->jiffies_stall) = jiffies + rcu_jiffies_till_stall_check();
166} 166}
167 167
168static void check_cpu_stalls(void) 168static void check_cpu_stalls(void)
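Annotation: the tiny_plugin.h hunks wrap every load and store of ->jiffies_stall in ACCESS_ONCE() so the compiler cannot tear, refetch, or cache a value that stall detection reads while other paths update it. In kernels of this vintage ACCESS_ONCE() is essentially a volatile cast, roughly:

    #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

which is why it can appear on either side of an assignment: ACCESS_ONCE(rcp->jiffies_stall) = ... forces a single plain store, and js = ACCESS_ONCE(rcp->jiffies_stall) forces a single plain load.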
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0c47e300210a..625d0b0cd75a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -101,7 +101,7 @@ DEFINE_PER_CPU(struct rcu_data, sname##_data)
101RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); 101RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
102RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 102RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
103 103
104static struct rcu_state *rcu_state; 104static struct rcu_state *rcu_state_p;
105LIST_HEAD(rcu_struct_flavors); 105LIST_HEAD(rcu_struct_flavors);
106 106
107/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ 107/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu)
206 rdp->passed_quiesce = 1; 206 rdp->passed_quiesce = 1;
207} 207}
208 208
209static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
210
211static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
212 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
213 .dynticks = ATOMIC_INIT(1),
214#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
215 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
216 .dynticks_idle = ATOMIC_INIT(1),
217#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
218};
219
220/*
221 * Let the RCU core know that this CPU has gone through the scheduler,
222 * which is a quiescent state. This is called when the need for a
223 * quiescent state is urgent, so we burn an atomic operation and full
224 * memory barriers to let the RCU core know about it, regardless of what
225 * this CPU might (or might not) do in the near future.
226 *
227 * We inform the RCU core by emulating a zero-duration dyntick-idle
228 * period, which we in turn do by incrementing the ->dynticks counter
229 * by two.
230 */
231static void rcu_momentary_dyntick_idle(void)
232{
233 unsigned long flags;
234 struct rcu_data *rdp;
235 struct rcu_dynticks *rdtp;
236 int resched_mask;
237 struct rcu_state *rsp;
238
239 local_irq_save(flags);
240
241 /*
242 * Yes, we can lose flag-setting operations. This is OK, because
243 * the flag will be set again after some delay.
244 */
245 resched_mask = raw_cpu_read(rcu_sched_qs_mask);
246 raw_cpu_write(rcu_sched_qs_mask, 0);
247
248 /* Find the flavor that needs a quiescent state. */
249 for_each_rcu_flavor(rsp) {
250 rdp = raw_cpu_ptr(rsp->rda);
251 if (!(resched_mask & rsp->flavor_mask))
252 continue;
253 smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
254 if (ACCESS_ONCE(rdp->mynode->completed) !=
255 ACCESS_ONCE(rdp->cond_resched_completed))
256 continue;
257
258 /*
259 * Pretend to be momentarily idle for the quiescent state.
260 * This allows the grace-period kthread to record the
261 * quiescent state, with no need for this CPU to do anything
262 * further.
263 */
264 rdtp = this_cpu_ptr(&rcu_dynticks);
265 smp_mb__before_atomic(); /* Earlier stuff before QS. */
266 atomic_add(2, &rdtp->dynticks); /* QS. */
267 smp_mb__after_atomic(); /* Later stuff after QS. */
268 break;
269 }
270 local_irq_restore(flags);
271}
272
209/* 273/*
210 * Note a context switch. This is a quiescent state for RCU-sched, 274 * Note a context switch. This is a quiescent state for RCU-sched,
211 * and requires special handling for preemptible RCU. 275 * and requires special handling for preemptible RCU.
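Annotation: the grace-period machinery decides whether a CPU passed through a quiescent state by snapshotting its ->dynticks counter and later checking whether the counter is even (idle) or has advanced; bumping it by two therefore reports "I was momentarily idle" without the value ever looking idle while the CPU keeps running. A standalone sketch of that snapshot-and-compare protocol, simplified from dyntick_save_progress_counter()/rcu_implicit_dynticks_qs():

    #include <stdio.h>

    int main(void)
    {
            unsigned int dynticks = 1;      /* odd: CPU is currently non-idle        */
            unsigned int snap = dynticks;   /* grace-period kthread takes a snapshot */

            dynticks += 2;                  /* rcu_momentary_dyntick_idle(): changes
                                               the value but keeps it odd            */

            if ((snap & 0x1) == 0 || dynticks - snap >= 2)
                    puts("CPU reported a quiescent state since the snapshot");
            return 0;
    }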
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu)
216 trace_rcu_utilization(TPS("Start context switch")); 280 trace_rcu_utilization(TPS("Start context switch"));
217 rcu_sched_qs(cpu); 281 rcu_sched_qs(cpu);
218 rcu_preempt_note_context_switch(cpu); 282 rcu_preempt_note_context_switch(cpu);
283 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
284 rcu_momentary_dyntick_idle();
219 trace_rcu_utilization(TPS("End context switch")); 285 trace_rcu_utilization(TPS("End context switch"));
220} 286}
221EXPORT_SYMBOL_GPL(rcu_note_context_switch); 287EXPORT_SYMBOL_GPL(rcu_note_context_switch);
222 288
223static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
224 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
225 .dynticks = ATOMIC_INIT(1),
226#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
227 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
228 .dynticks_idle = ATOMIC_INIT(1),
229#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
230};
231
232static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 289static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
233static long qhimark = 10000; /* If this many pending, ignore blimit. */ 290static long qhimark = 10000; /* If this many pending, ignore blimit. */
234static long qlowmark = 100; /* Once only this many pending, use blimit. */ 291static long qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -243,7 +300,14 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
243module_param(jiffies_till_first_fqs, ulong, 0644); 300module_param(jiffies_till_first_fqs, ulong, 0644);
244module_param(jiffies_till_next_fqs, ulong, 0644); 301module_param(jiffies_till_next_fqs, ulong, 0644);
245 302
246static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 303/*
304 * How long the grace period must be before we start recruiting
305 * quiescent-state help from rcu_note_context_switch().
306 */
307static ulong jiffies_till_sched_qs = HZ / 20;
308module_param(jiffies_till_sched_qs, ulong, 0644);
309
310static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
247 struct rcu_data *rdp); 311 struct rcu_data *rdp);
248static void force_qs_rnp(struct rcu_state *rsp, 312static void force_qs_rnp(struct rcu_state *rsp,
249 int (*f)(struct rcu_data *rsp, bool *isidle, 313 int (*f)(struct rcu_data *rsp, bool *isidle,
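Annotation: jiffies_till_sched_qs is exported as a writable module parameter, so the point at which a grace period starts recruiting help from rcu_note_context_switch() can be tuned; the default of HZ/20 jiffies corresponds to roughly 50 ms. On the boot command line this would presumably be something like rcutree.jiffies_till_sched_qs=50 (assuming the rcutree. prefix used by the other tree.c parameters), and the 0644 permissions make it adjustable at run time through the matching file under /sys/module/.../parameters/.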
@@ -271,6 +335,15 @@ long rcu_batches_completed_bh(void)
271EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 335EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
272 336
273/* 337/*
338 * Force a quiescent state.
339 */
340void rcu_force_quiescent_state(void)
341{
342 force_quiescent_state(rcu_state_p);
343}
344EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
345
346/*
274 * Force a quiescent state for RCU BH. 347 * Force a quiescent state for RCU BH.
275 */ 348 */
276void rcu_bh_force_quiescent_state(void) 349void rcu_bh_force_quiescent_state(void)
@@ -280,6 +353,21 @@ void rcu_bh_force_quiescent_state(void)
280EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 353EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
281 354
282/* 355/*
356 * Show the state of the grace-period kthreads.
357 */
358void show_rcu_gp_kthreads(void)
359{
360 struct rcu_state *rsp;
361
362 for_each_rcu_flavor(rsp) {
363 pr_info("%s: wait state: %d ->state: %#lx\n",
364 rsp->name, rsp->gp_state, rsp->gp_kthread->state);
365 /* sched_show_task(rsp->gp_kthread); */
366 }
367}
368EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
369
370/*
283 * Record the number of times rcutorture tests have been initiated and 371 * Record the number of times rcutorture tests have been initiated and
284 * terminated. This information allows the debugfs tracing stats to be 372 * terminated. This information allows the debugfs tracing stats to be
285 * correlated to the rcutorture messages, even when the rcutorture module 373 * correlated to the rcutorture messages, even when the rcutorture module
@@ -294,6 +382,39 @@ void rcutorture_record_test_transition(void)
294EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); 382EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
295 383
296/* 384/*
385 * Send along grace-period-related data for rcutorture diagnostics.
386 */
387void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
388 unsigned long *gpnum, unsigned long *completed)
389{
390 struct rcu_state *rsp = NULL;
391
392 switch (test_type) {
393 case RCU_FLAVOR:
394 rsp = rcu_state_p;
395 break;
396 case RCU_BH_FLAVOR:
397 rsp = &rcu_bh_state;
398 break;
399 case RCU_SCHED_FLAVOR:
400 rsp = &rcu_sched_state;
401 break;
402 default:
403 break;
404 }
405 if (rsp != NULL) {
406 *flags = ACCESS_ONCE(rsp->gp_flags);
407 *gpnum = ACCESS_ONCE(rsp->gpnum);
408 *completed = ACCESS_ONCE(rsp->completed);
409 return;
410 }
411 *flags = 0;
412 *gpnum = 0;
413 *completed = 0;
414}
415EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
416
417/*
297 * Record the number of writer passes through the current rcutorture test. 418 * Record the number of writer passes through the current rcutorture test.
298 * This is also used to correlate debugfs tracing stats with the rcutorture 419 * This is also used to correlate debugfs tracing stats with the rcutorture
299 * messages. 420 * messages.
@@ -324,6 +445,28 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
324} 445}
325 446
326/* 447/*
448 * Return the root node of the specified rcu_state structure.
449 */
450static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
451{
452 return &rsp->node[0];
453}
454
455/*
456 * Is there any need for future grace periods?
457 * Interrupts must be disabled. If the caller does not hold the root
458 * rnp_node structure's ->lock, the results are advisory only.
459 */
460static int rcu_future_needs_gp(struct rcu_state *rsp)
461{
462 struct rcu_node *rnp = rcu_get_root(rsp);
463 int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;
464 int *fp = &rnp->need_future_gp[idx];
465
466 return ACCESS_ONCE(*fp);
467}
468
469/*
327 * Does the current CPU require a not-yet-started grace period? 470 * Does the current CPU require a not-yet-started grace period?
328 * The caller must have disabled interrupts to prevent races with 471 * The caller must have disabled interrupts to prevent races with
329 * normal callback registry. 472 * normal callback registry.
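Annotation: rcu_future_needs_gp() works because each rcu_node carries a two-slot need_future_gp[] array indexed by grace-period number parity — the two slots cover the next two possible grace periods, and (completed + 1) & 0x1 picks the slot for the grace period immediately after the last one that finished. A worked example of the indexing (the numbers are illustrative):

    /* If rnp->completed == 6, the next grace period is number 7, so its
     * requests live in need_future_gp[(6 + 1) & 0x1] == need_future_gp[1];
     * once GP 7 completes, (7 + 1) & 0x1 == 0 serves GP 8, and the two
     * slots keep alternating.
     */
    int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1;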
@@ -335,7 +478,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
335 478
336 if (rcu_gp_in_progress(rsp)) 479 if (rcu_gp_in_progress(rsp))
337 return 0; /* No, a grace period is already in progress. */ 480 return 0; /* No, a grace period is already in progress. */
338 if (rcu_nocb_needs_gp(rsp)) 481 if (rcu_future_needs_gp(rsp))
339 return 1; /* Yes, a no-CBs CPU needs one. */ 482 return 1; /* Yes, a no-CBs CPU needs one. */
340 if (!rdp->nxttail[RCU_NEXT_TAIL]) 483 if (!rdp->nxttail[RCU_NEXT_TAIL])
341 return 0; /* No, this is a no-CBs (or offline) CPU. */ 484 return 0; /* No, this is a no-CBs (or offline) CPU. */
@@ -350,14 +493,6 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
350} 493}
351 494
352/* 495/*
353 * Return the root node of the specified rcu_state structure.
354 */
355static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
356{
357 return &rsp->node[0];
358}
359
360/*
361 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state 496 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
362 * 497 *
363 * If the new value of the ->dynticks_nesting counter now is zero, 498 * If the new value of the ->dynticks_nesting counter now is zero,
@@ -387,9 +522,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
387 } 522 }
388 rcu_prepare_for_idle(smp_processor_id()); 523 rcu_prepare_for_idle(smp_processor_id());
389 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 524 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
390 smp_mb__before_atomic_inc(); /* See above. */ 525 smp_mb__before_atomic(); /* See above. */
391 atomic_inc(&rdtp->dynticks); 526 atomic_inc(&rdtp->dynticks);
392 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ 527 smp_mb__after_atomic(); /* Force ordering with next sojourn. */
393 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 528 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
394 529
395 /* 530 /*
@@ -507,10 +642,10 @@ void rcu_irq_exit(void)
507static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, 642static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
508 int user) 643 int user)
509{ 644{
510 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 645 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
511 atomic_inc(&rdtp->dynticks); 646 atomic_inc(&rdtp->dynticks);
512 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 647 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
513 smp_mb__after_atomic_inc(); /* See above. */ 648 smp_mb__after_atomic(); /* See above. */
514 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 649 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
515 rcu_cleanup_after_idle(smp_processor_id()); 650 rcu_cleanup_after_idle(smp_processor_id());
516 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); 651 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
@@ -635,10 +770,10 @@ void rcu_nmi_enter(void)
635 (atomic_read(&rdtp->dynticks) & 0x1)) 770 (atomic_read(&rdtp->dynticks) & 0x1))
636 return; 771 return;
637 rdtp->dynticks_nmi_nesting++; 772 rdtp->dynticks_nmi_nesting++;
638 smp_mb__before_atomic_inc(); /* Force delay from prior write. */ 773 smp_mb__before_atomic(); /* Force delay from prior write. */
639 atomic_inc(&rdtp->dynticks); 774 atomic_inc(&rdtp->dynticks);
640 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 775 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
641 smp_mb__after_atomic_inc(); /* See above. */ 776 smp_mb__after_atomic(); /* See above. */
642 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 777 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
643} 778}
644 779
@@ -657,9 +792,9 @@ void rcu_nmi_exit(void)
657 --rdtp->dynticks_nmi_nesting != 0) 792 --rdtp->dynticks_nmi_nesting != 0)
658 return; 793 return;
659 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 794 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
660 smp_mb__before_atomic_inc(); /* See above. */ 795 smp_mb__before_atomic(); /* See above. */
661 atomic_inc(&rdtp->dynticks); 796 atomic_inc(&rdtp->dynticks);
662 smp_mb__after_atomic_inc(); /* Force delay to next write. */ 797 smp_mb__after_atomic(); /* Force delay to next write. */
663 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 798 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
664} 799}
665 800
@@ -758,7 +893,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
758{ 893{
759 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 894 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
760 rcu_sysidle_check_cpu(rdp, isidle, maxj); 895 rcu_sysidle_check_cpu(rdp, isidle, maxj);
761 return (rdp->dynticks_snap & 0x1) == 0; 896 if ((rdp->dynticks_snap & 0x1) == 0) {
897 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
898 return 1;
899 } else {
900 return 0;
901 }
762} 902}
763 903
764/* 904/*
@@ -777,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
777 bool *isidle, unsigned long *maxj) 917 bool *isidle, unsigned long *maxj)
778{ 918{
779 unsigned int curr; 919 unsigned int curr;
920 int *rcrmp;
780 unsigned int snap; 921 unsigned int snap;
781 922
782 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); 923 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
@@ -817,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
817 } 958 }
818 959
819 /* 960 /*
820 * There is a possibility that a CPU in adaptive-ticks state 961 * A CPU running for an extended time within the kernel can
821 * might run in the kernel with the scheduling-clock tick disabled 962 * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
822 * for an extended time period. Invoke rcu_kick_nohz_cpu() to 963 * even context-switching back and forth between a pair of
823 * force the CPU to restart the scheduling-clock tick in this 964 * in-kernel CPU-bound tasks cannot advance grace periods.
824 * CPU is in this state. 965 * So if the grace period is old enough, make the CPU pay attention.
825 */ 966 * Note that the unsynchronized assignments to the per-CPU
826 rcu_kick_nohz_cpu(rdp->cpu); 967 * rcu_sched_qs_mask variable are safe. Yes, setting of
827 968 * bits can be lost, but they will be set again on the next
828 /* 969 * force-quiescent-state pass. So lost bit sets do not result
829 * Alternatively, the CPU might be running in the kernel 970 * in incorrect behavior, merely in a grace period lasting
830 * for an extended period of time without a quiescent state. 971 * a few jiffies longer than it might otherwise. Because
831 * Attempt to force the CPU through the scheduler to gain the 972 * there are at most four threads involved, and because the
832 * needed quiescent state, but only if the grace period has gone 973 * updates are only once every few jiffies, the probability of
833 * on for an uncommonly long time. If there are many stuck CPUs, 974 * lossage (and thus of slight grace-period extension) is
834 * we will beat on the first one until it gets unstuck, then move 975 * quite low.
835 * to the next. Only do this for the primary flavor of RCU. 976 *
977 * Note that if the jiffies_till_sched_qs boot/sysfs parameter
978 * is set too high, we override with half of the RCU CPU stall
979 * warning delay.
836 */ 980 */
837 if (rdp->rsp == rcu_state && 981 rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
982 if (ULONG_CMP_GE(jiffies,
983 rdp->rsp->gp_start + jiffies_till_sched_qs) ||
838 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { 984 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
839 rdp->rsp->jiffies_resched += 5; 985 if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
840 resched_cpu(rdp->cpu); 986 ACCESS_ONCE(rdp->cond_resched_completed) =
987 ACCESS_ONCE(rdp->mynode->completed);
988 smp_mb(); /* ->cond_resched_completed before *rcrmp. */
989 ACCESS_ONCE(*rcrmp) =
990 ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
991 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
992 rdp->rsp->jiffies_resched += 5; /* Enable beating. */
993 } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
994 /* Time to beat on that CPU again! */
995 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
996 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
997 }
841 } 998 }
842 999
843 return 0; 1000 return 0;
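Annotation: the long comment above describes a deliberately lossy two-sided handshake; condensed to its essentials it looks roughly like the sketch below (nudge_cpu() and context_switch_hook() are made-up names for the code paths in this hunk and in rcu_note_context_switch()):

    /* force-quiescent-state side, run when a CPU has been silent too long */
    static void nudge_cpu(struct rcu_data *rdp, int *rcrmp)
    {
            ACCESS_ONCE(rdp->cond_resched_completed) =
                    ACCESS_ONCE(rdp->mynode->completed);
            smp_mb();                               /* snapshot before the flag        */
            ACCESS_ONCE(*rcrmp) = ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
                                                    /* racy on purpose: lost sets are
                                                       simply retried on the next pass */
            resched_cpu(rdp->cpu);                  /* push the CPU into the scheduler */
    }

    /* context-switch side, on the nudged CPU */
    static void context_switch_hook(void)
    {
            if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
                    rcu_momentary_dyntick_idle();   /* report a QS via ->dynticks += 2 */
    }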
@@ -851,7 +1008,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
851 rsp->gp_start = j; 1008 rsp->gp_start = j;
852 smp_wmb(); /* Record start time before stall time. */ 1009 smp_wmb(); /* Record start time before stall time. */
853 j1 = rcu_jiffies_till_stall_check(); 1010 j1 = rcu_jiffies_till_stall_check();
854 rsp->jiffies_stall = j + j1; 1011 ACCESS_ONCE(rsp->jiffies_stall) = j + j1;
855 rsp->jiffies_resched = j + j1 / 2; 1012 rsp->jiffies_resched = j + j1 / 2;
856} 1013}
857 1014
@@ -890,12 +1047,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
890 /* Only let one CPU complain about others per time interval. */ 1047 /* Only let one CPU complain about others per time interval. */
891 1048
892 raw_spin_lock_irqsave(&rnp->lock, flags); 1049 raw_spin_lock_irqsave(&rnp->lock, flags);
893 delta = jiffies - rsp->jiffies_stall; 1050 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
894 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 1051 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
895 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1052 raw_spin_unlock_irqrestore(&rnp->lock, flags);
896 return; 1053 return;
897 } 1054 }
898 rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; 1055 ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
899 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1056 raw_spin_unlock_irqrestore(&rnp->lock, flags);
900 1057
901 /* 1058 /*
@@ -932,9 +1089,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
932 print_cpu_stall_info_end(); 1089 print_cpu_stall_info_end();
933 for_each_possible_cpu(cpu) 1090 for_each_possible_cpu(cpu)
934 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1091 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
935 pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", 1092 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
936 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1093 smp_processor_id(), (long)(jiffies - rsp->gp_start),
937 rsp->gpnum, rsp->completed, totqlen); 1094 (long)rsp->gpnum, (long)rsp->completed, totqlen);
938 if (ndetected == 0) 1095 if (ndetected == 0)
939 pr_err("INFO: Stall ended before state dump start\n"); 1096 pr_err("INFO: Stall ended before state dump start\n");
940 else if (!trigger_all_cpu_backtrace()) 1097 else if (!trigger_all_cpu_backtrace())
@@ -947,12 +1104,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
947 force_quiescent_state(rsp); /* Kick them all. */ 1104 force_quiescent_state(rsp); /* Kick them all. */
948} 1105}
949 1106
950/*
951 * This function really isn't for public consumption, but RCU is special in
952 * that context switches can allow the state machine to make progress.
953 */
954extern void resched_cpu(int cpu);
955
956static void print_cpu_stall(struct rcu_state *rsp) 1107static void print_cpu_stall(struct rcu_state *rsp)
957{ 1108{
958 int cpu; 1109 int cpu;
@@ -971,14 +1122,15 @@ static void print_cpu_stall(struct rcu_state *rsp)
971 print_cpu_stall_info_end(); 1122 print_cpu_stall_info_end();
972 for_each_possible_cpu(cpu) 1123 for_each_possible_cpu(cpu)
973 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1124 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
974 pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", 1125 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
975 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); 1126 jiffies - rsp->gp_start,
1127 (long)rsp->gpnum, (long)rsp->completed, totqlen);
976 if (!trigger_all_cpu_backtrace()) 1128 if (!trigger_all_cpu_backtrace())
977 dump_stack(); 1129 dump_stack();
978 1130
979 raw_spin_lock_irqsave(&rnp->lock, flags); 1131 raw_spin_lock_irqsave(&rnp->lock, flags);
980 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 1132 if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
981 rsp->jiffies_stall = jiffies + 1133 ACCESS_ONCE(rsp->jiffies_stall) = jiffies +
982 3 * rcu_jiffies_till_stall_check() + 3; 1134 3 * rcu_jiffies_till_stall_check() + 3;
983 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1135 raw_spin_unlock_irqrestore(&rnp->lock, flags);
984 1136
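
The stall-detection changes above and below share two idioms: every lockless read or write of rsp->jiffies_stall is now wrapped in ACCESS_ONCE() so the compiler cannot tear or re-fetch it, and jiffies comparisons go through ULONG_CMP_GE(), which stays correct when the counter wraps. A minimal userspace sketch of the wrap-safe comparison, illustrative only; the macro merely mirrors the kernel's idea:

/* Illustrative: wrap-safe "a >= b" for free-running unsigned counters,
 * in the spirit of the kernel's ULONG_CMP_GE(). Interpreting the
 * difference as signed keeps the test correct across counter wrap. */
#include <assert.h>
#include <limits.h>

#define CMP_GE(a, b)    ((long)((a) - (b)) >= 0)

int main(void)
{
        unsigned long near_wrap = ULONG_MAX - 5;
        unsigned long wrapped   = near_wrap + 10;       /* wraps around to 4 */

        assert(CMP_GE(wrapped, near_wrap));     /* later in time, despite smaller value */
        assert(!CMP_GE(near_wrap, wrapped));    /* earlier in time */
        return 0;
}
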
@@ -1062,7 +1214,7 @@ void rcu_cpu_stall_reset(void)
1062 struct rcu_state *rsp; 1214 struct rcu_state *rsp;
1063 1215
1064 for_each_rcu_flavor(rsp) 1216 for_each_rcu_flavor(rsp)
1065 rsp->jiffies_stall = jiffies + ULONG_MAX / 2; 1217 ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
1066} 1218}
1067 1219
1068/* 1220/*
@@ -1123,15 +1275,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1123/* 1275/*
1124 * Start some future grace period, as needed to handle newly arrived 1276 * Start some future grace period, as needed to handle newly arrived
1125 * callbacks. The required future grace periods are recorded in each 1277 * callbacks. The required future grace periods are recorded in each
1126 * rcu_node structure's ->need_future_gp field. 1278 * rcu_node structure's ->need_future_gp field. Returns true if there
1279 * is reason to awaken the grace-period kthread.
1127 * 1280 *
1128 * The caller must hold the specified rcu_node structure's ->lock. 1281 * The caller must hold the specified rcu_node structure's ->lock.
1129 */ 1282 */
1130static unsigned long __maybe_unused 1283static bool __maybe_unused
1131rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) 1284rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1285 unsigned long *c_out)
1132{ 1286{
1133 unsigned long c; 1287 unsigned long c;
1134 int i; 1288 int i;
1289 bool ret = false;
1135 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); 1290 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1136 1291
1137 /* 1292 /*
@@ -1142,7 +1297,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1142 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); 1297 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
1143 if (rnp->need_future_gp[c & 0x1]) { 1298 if (rnp->need_future_gp[c & 0x1]) {
1144 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); 1299 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
1145 return c; 1300 goto out;
1146 } 1301 }
1147 1302
1148 /* 1303 /*
@@ -1156,7 +1311,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1156 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { 1311 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1157 rnp->need_future_gp[c & 0x1]++; 1312 rnp->need_future_gp[c & 0x1]++;
1158 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); 1313 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
1159 return c; 1314 goto out;
1160 } 1315 }
1161 1316
1162 /* 1317 /*
@@ -1197,12 +1352,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1197 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); 1352 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
1198 } else { 1353 } else {
1199 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); 1354 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
1200 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); 1355 ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1201 } 1356 }
1202unlock_out: 1357unlock_out:
1203 if (rnp != rnp_root) 1358 if (rnp != rnp_root)
1204 raw_spin_unlock(&rnp_root->lock); 1359 raw_spin_unlock(&rnp_root->lock);
1205 return c; 1360out:
1361 if (c_out != NULL)
1362 *c_out = c;
1363 return ret;
1206} 1364}
1207 1365
1208/* 1366/*
@@ -1226,25 +1384,43 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1226} 1384}
1227 1385
1228/* 1386/*
1387 * Awaken the grace-period kthread for the specified flavor of RCU.
1388 * Don't do a self-awaken, and don't bother awakening when there is
1389 * nothing for the grace-period kthread to do (as in several CPUs
1390 * raced to awaken, and we lost), and finally don't try to awaken
1391 * a kthread that has not yet been created.
1392 */
1393static void rcu_gp_kthread_wake(struct rcu_state *rsp)
1394{
1395 if (current == rsp->gp_kthread ||
1396 !ACCESS_ONCE(rsp->gp_flags) ||
1397 !rsp->gp_kthread)
1398 return;
1399 wake_up(&rsp->gp_wq);
1400}
1401
1402/*
1229 * If there is room, assign a ->completed number to any callbacks on 1403 * If there is room, assign a ->completed number to any callbacks on
1230 * this CPU that have not already been assigned. Also accelerate any 1404 * this CPU that have not already been assigned. Also accelerate any
1231 * callbacks that were previously assigned a ->completed number that has 1405 * callbacks that were previously assigned a ->completed number that has
1232 * since proven to be too conservative, which can happen if callbacks get 1406 * since proven to be too conservative, which can happen if callbacks get
1233 * assigned a ->completed number while RCU is idle, but with reference to 1407 * assigned a ->completed number while RCU is idle, but with reference to
1234 * a non-root rcu_node structure. This function is idempotent, so it does 1408 * a non-root rcu_node structure. This function is idempotent, so it does
1235 * not hurt to call it repeatedly. 1409 * not hurt to call it repeatedly. Returns an flag saying that we should
1410 * awaken the RCU grace-period kthread.
1236 * 1411 *
1237 * The caller must hold rnp->lock with interrupts disabled. 1412 * The caller must hold rnp->lock with interrupts disabled.
1238 */ 1413 */
1239static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1414static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1240 struct rcu_data *rdp) 1415 struct rcu_data *rdp)
1241{ 1416{
1242 unsigned long c; 1417 unsigned long c;
1243 int i; 1418 int i;
1419 bool ret;
1244 1420
1245 /* If the CPU has no callbacks, nothing to do. */ 1421 /* If the CPU has no callbacks, nothing to do. */
1246 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1422 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1247 return; 1423 return false;
1248 1424
1249 /* 1425 /*
1250 * Starting from the sublist containing the callbacks most 1426 * Starting from the sublist containing the callbacks most
@@ -1273,7 +1449,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1273 * be grouped into. 1449 * be grouped into.
1274 */ 1450 */
1275 if (++i >= RCU_NEXT_TAIL) 1451 if (++i >= RCU_NEXT_TAIL)
1276 return; 1452 return false;
1277 1453
1278 /* 1454 /*
1279 * Assign all subsequent callbacks' ->completed number to the next 1455 * Assign all subsequent callbacks' ->completed number to the next
@@ -1285,13 +1461,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1285 rdp->nxtcompleted[i] = c; 1461 rdp->nxtcompleted[i] = c;
1286 } 1462 }
1287 /* Record any needed additional grace periods. */ 1463 /* Record any needed additional grace periods. */
1288 rcu_start_future_gp(rnp, rdp); 1464 ret = rcu_start_future_gp(rnp, rdp, NULL);
1289 1465
1290 /* Trace depending on how much we were able to accelerate. */ 1466 /* Trace depending on how much we were able to accelerate. */
1291 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1467 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1292 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); 1468 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
1293 else 1469 else
1294 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); 1470 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
1471 return ret;
1295} 1472}
1296 1473
1297/* 1474/*
@@ -1300,17 +1477,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1300 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL 1477 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
1301 * sublist. This function is idempotent, so it does not hurt to 1478 * sublist. This function is idempotent, so it does not hurt to
1302 * invoke it repeatedly. As long as it is not invoked -too- often... 1479 * invoke it repeatedly. As long as it is not invoked -too- often...
1480 * Returns true if the RCU grace-period kthread needs to be awakened.
1303 * 1481 *
1304 * The caller must hold rnp->lock with interrupts disabled. 1482 * The caller must hold rnp->lock with interrupts disabled.
1305 */ 1483 */
1306static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1484static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1307 struct rcu_data *rdp) 1485 struct rcu_data *rdp)
1308{ 1486{
1309 int i, j; 1487 int i, j;
1310 1488
1311 /* If the CPU has no callbacks, nothing to do. */ 1489 /* If the CPU has no callbacks, nothing to do. */
1312 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1490 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1313 return; 1491 return false;
1314 1492
1315 /* 1493 /*
1316 * Find all callbacks whose ->completed numbers indicate that they 1494 * Find all callbacks whose ->completed numbers indicate that they
@@ -1334,26 +1512,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1334 } 1512 }
1335 1513
1336 /* Classify any remaining callbacks. */ 1514 /* Classify any remaining callbacks. */
1337 rcu_accelerate_cbs(rsp, rnp, rdp); 1515 return rcu_accelerate_cbs(rsp, rnp, rdp);
1338} 1516}
1339 1517
1340/* 1518/*
1341 * Update CPU-local rcu_data state to record the beginnings and ends of 1519 * Update CPU-local rcu_data state to record the beginnings and ends of
1342 * grace periods. The caller must hold the ->lock of the leaf rcu_node 1520 * grace periods. The caller must hold the ->lock of the leaf rcu_node
1343 * structure corresponding to the current CPU, and must have irqs disabled. 1521 * structure corresponding to the current CPU, and must have irqs disabled.
1522 * Returns true if the grace-period kthread needs to be awakened.
1344 */ 1523 */
1345static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 1524static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1525 struct rcu_data *rdp)
1346{ 1526{
1527 bool ret;
1528
1347 /* Handle the ends of any preceding grace periods first. */ 1529 /* Handle the ends of any preceding grace periods first. */
1348 if (rdp->completed == rnp->completed) { 1530 if (rdp->completed == rnp->completed) {
1349 1531
1350 /* No grace period end, so just accelerate recent callbacks. */ 1532 /* No grace period end, so just accelerate recent callbacks. */
1351 rcu_accelerate_cbs(rsp, rnp, rdp); 1533 ret = rcu_accelerate_cbs(rsp, rnp, rdp);
1352 1534
1353 } else { 1535 } else {
1354 1536
1355 /* Advance callbacks. */ 1537 /* Advance callbacks. */
1356 rcu_advance_cbs(rsp, rnp, rdp); 1538 ret = rcu_advance_cbs(rsp, rnp, rdp);
1357 1539
1358 /* Remember that we saw this grace-period completion. */ 1540 /* Remember that we saw this grace-period completion. */
1359 rdp->completed = rnp->completed; 1541 rdp->completed = rnp->completed;
@@ -1372,11 +1554,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
1372 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1554 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
1373 zero_cpu_stall_ticks(rdp); 1555 zero_cpu_stall_ticks(rdp);
1374 } 1556 }
1557 return ret;
1375} 1558}
1376 1559
1377static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) 1560static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1378{ 1561{
1379 unsigned long flags; 1562 unsigned long flags;
1563 bool needwake;
1380 struct rcu_node *rnp; 1564 struct rcu_node *rnp;
1381 1565
1382 local_irq_save(flags); 1566 local_irq_save(flags);
@@ -1388,8 +1572,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1388 return; 1572 return;
1389 } 1573 }
1390 smp_mb__after_unlock_lock(); 1574 smp_mb__after_unlock_lock();
1391 __note_gp_changes(rsp, rnp, rdp); 1575 needwake = __note_gp_changes(rsp, rnp, rdp);
1392 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1576 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1577 if (needwake)
1578 rcu_gp_kthread_wake(rsp);
1393} 1579}
1394 1580
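
The pattern recurring throughout this file: helpers called with rnp->lock held no longer wake the grace-period kthread themselves. They return a needwake flag, and the caller invokes rcu_gp_kthread_wake() only after dropping the lock, so that wake_up() is never issued under a lock the scheduler might also need. A small pthread sketch of the same shape, with invented names rather than kernel API:

/* Illustrative pthread version of "decide under the lock, wake after
 * dropping it": the predicate is updated while holding the mutex, but
 * pthread_cond_signal() is issued only after pthread_mutex_unlock(). */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
static bool work_pending;

/* Caller must hold 'lock'; returns true if the worker needs a wakeup. */
static bool queue_work_locked(void)
{
        bool needwake = !work_pending;  /* no wakeup if work was already queued */

        work_pending = true;
        return needwake;
}

static void *worker(void *arg)
{
        pthread_mutex_lock(&lock);
        while (!work_pending)
                pthread_cond_wait(&wq, &lock);
        work_pending = false;
        pthread_mutex_unlock(&lock);
        printf("worker: handled work\n");
        return arg;
}

int main(void)
{
        pthread_t tid;
        bool needwake;

        pthread_create(&tid, NULL, worker, NULL);

        pthread_mutex_lock(&lock);
        needwake = queue_work_locked();
        pthread_mutex_unlock(&lock);            /* drop the lock first ... */
        if (needwake)
                pthread_cond_signal(&wq);       /* ... then do the wakeup */

        pthread_join(tid, NULL);
        return 0;
}
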
1395/* 1581/*
@@ -1403,12 +1589,12 @@ static int rcu_gp_init(struct rcu_state *rsp)
1403 rcu_bind_gp_kthread(); 1589 rcu_bind_gp_kthread();
1404 raw_spin_lock_irq(&rnp->lock); 1590 raw_spin_lock_irq(&rnp->lock);
1405 smp_mb__after_unlock_lock(); 1591 smp_mb__after_unlock_lock();
1406 if (rsp->gp_flags == 0) { 1592 if (!ACCESS_ONCE(rsp->gp_flags)) {
1407 /* Spurious wakeup, tell caller to go back to sleep. */ 1593 /* Spurious wakeup, tell caller to go back to sleep. */
1408 raw_spin_unlock_irq(&rnp->lock); 1594 raw_spin_unlock_irq(&rnp->lock);
1409 return 0; 1595 return 0;
1410 } 1596 }
1411 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1597 ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */
1412 1598
1413 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { 1599 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
1414 /* 1600 /*
@@ -1453,7 +1639,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1453 WARN_ON_ONCE(rnp->completed != rsp->completed); 1639 WARN_ON_ONCE(rnp->completed != rsp->completed);
1454 ACCESS_ONCE(rnp->completed) = rsp->completed; 1640 ACCESS_ONCE(rnp->completed) = rsp->completed;
1455 if (rnp == rdp->mynode) 1641 if (rnp == rdp->mynode)
1456 __note_gp_changes(rsp, rnp, rdp); 1642 (void)__note_gp_changes(rsp, rnp, rdp);
1457 rcu_preempt_boost_start_gp(rnp); 1643 rcu_preempt_boost_start_gp(rnp);
1458 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1644 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
1459 rnp->level, rnp->grplo, 1645 rnp->level, rnp->grplo,
@@ -1501,7 +1687,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1501 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1687 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1502 raw_spin_lock_irq(&rnp->lock); 1688 raw_spin_lock_irq(&rnp->lock);
1503 smp_mb__after_unlock_lock(); 1689 smp_mb__after_unlock_lock();
1504 rsp->gp_flags &= ~RCU_GP_FLAG_FQS; 1690 ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS;
1505 raw_spin_unlock_irq(&rnp->lock); 1691 raw_spin_unlock_irq(&rnp->lock);
1506 } 1692 }
1507 return fqs_state; 1693 return fqs_state;
@@ -1513,6 +1699,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1513static void rcu_gp_cleanup(struct rcu_state *rsp) 1699static void rcu_gp_cleanup(struct rcu_state *rsp)
1514{ 1700{
1515 unsigned long gp_duration; 1701 unsigned long gp_duration;
1702 bool needgp = false;
1516 int nocb = 0; 1703 int nocb = 0;
1517 struct rcu_data *rdp; 1704 struct rcu_data *rdp;
1518 struct rcu_node *rnp = rcu_get_root(rsp); 1705 struct rcu_node *rnp = rcu_get_root(rsp);
@@ -1548,7 +1735,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1548 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1735 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1549 rdp = this_cpu_ptr(rsp->rda); 1736 rdp = this_cpu_ptr(rsp->rda);
1550 if (rnp == rdp->mynode) 1737 if (rnp == rdp->mynode)
1551 __note_gp_changes(rsp, rnp, rdp); 1738 needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
1552 /* smp_mb() provided by prior unlock-lock pair. */ 1739 /* smp_mb() provided by prior unlock-lock pair. */
1553 nocb += rcu_future_gp_cleanup(rsp, rnp); 1740 nocb += rcu_future_gp_cleanup(rsp, rnp);
1554 raw_spin_unlock_irq(&rnp->lock); 1741 raw_spin_unlock_irq(&rnp->lock);
@@ -1564,9 +1751,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1564 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); 1751 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
1565 rsp->fqs_state = RCU_GP_IDLE; 1752 rsp->fqs_state = RCU_GP_IDLE;
1566 rdp = this_cpu_ptr(rsp->rda); 1753 rdp = this_cpu_ptr(rsp->rda);
1567 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ 1754 /* Advance CBs to reduce false positives below. */
1568 if (cpu_needs_another_gp(rsp, rdp)) { 1755 needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
1569 rsp->gp_flags = RCU_GP_FLAG_INIT; 1756 if (needgp || cpu_needs_another_gp(rsp, rdp)) {
1757 ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
1570 trace_rcu_grace_period(rsp->name, 1758 trace_rcu_grace_period(rsp->name,
1571 ACCESS_ONCE(rsp->gpnum), 1759 ACCESS_ONCE(rsp->gpnum),
1572 TPS("newreq")); 1760 TPS("newreq"));
@@ -1593,6 +1781,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1593 trace_rcu_grace_period(rsp->name, 1781 trace_rcu_grace_period(rsp->name,
1594 ACCESS_ONCE(rsp->gpnum), 1782 ACCESS_ONCE(rsp->gpnum),
1595 TPS("reqwait")); 1783 TPS("reqwait"));
1784 rsp->gp_state = RCU_GP_WAIT_GPS;
1596 wait_event_interruptible(rsp->gp_wq, 1785 wait_event_interruptible(rsp->gp_wq,
1597 ACCESS_ONCE(rsp->gp_flags) & 1786 ACCESS_ONCE(rsp->gp_flags) &
1598 RCU_GP_FLAG_INIT); 1787 RCU_GP_FLAG_INIT);
@@ -1620,6 +1809,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1620 trace_rcu_grace_period(rsp->name, 1809 trace_rcu_grace_period(rsp->name,
1621 ACCESS_ONCE(rsp->gpnum), 1810 ACCESS_ONCE(rsp->gpnum),
1622 TPS("fqswait")); 1811 TPS("fqswait"));
1812 rsp->gp_state = RCU_GP_WAIT_FQS;
1623 ret = wait_event_interruptible_timeout(rsp->gp_wq, 1813 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1624 ((gf = ACCESS_ONCE(rsp->gp_flags)) & 1814 ((gf = ACCESS_ONCE(rsp->gp_flags)) &
1625 RCU_GP_FLAG_FQS) || 1815 RCU_GP_FLAG_FQS) ||
@@ -1665,14 +1855,6 @@ static int __noreturn rcu_gp_kthread(void *arg)
1665 } 1855 }
1666} 1856}
1667 1857
1668static void rsp_wakeup(struct irq_work *work)
1669{
1670 struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
1671
1672 /* Wake up rcu_gp_kthread() to start the grace period. */
1673 wake_up(&rsp->gp_wq);
1674}
1675
1676/* 1858/*
1677 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1859 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1678 * in preparation for detecting the next grace period. The caller must hold 1860 * in preparation for detecting the next grace period. The caller must hold
@@ -1681,8 +1863,10 @@ static void rsp_wakeup(struct irq_work *work)
1681 * Note that it is legal for a dying CPU (which is marked as offline) to 1863 * Note that it is legal for a dying CPU (which is marked as offline) to
1682 * invoke this function. This can happen when the dying CPU reports its 1864 * invoke this function. This can happen when the dying CPU reports its
1683 * quiescent state. 1865 * quiescent state.
1866 *
1867 * Returns true if the grace-period kthread must be awakened.
1684 */ 1868 */
1685static void 1869static bool
1686rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 1870rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1687 struct rcu_data *rdp) 1871 struct rcu_data *rdp)
1688{ 1872{
@@ -1693,20 +1877,18 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1693 * or a grace period is already in progress. 1877 * or a grace period is already in progress.
1694 * Either way, don't start a new grace period. 1878 * Either way, don't start a new grace period.
1695 */ 1879 */
1696 return; 1880 return false;
1697 } 1881 }
1698 rsp->gp_flags = RCU_GP_FLAG_INIT; 1882 ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
1699 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), 1883 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
1700 TPS("newreq")); 1884 TPS("newreq"));
1701 1885
1702 /* 1886 /*
1703 * We can't do wakeups while holding the rnp->lock, as that 1887 * We can't do wakeups while holding the rnp->lock, as that
1704 * could cause possible deadlocks with the rq->lock. Defer 1888 * could cause possible deadlocks with the rq->lock. Defer
1705 * the wakeup to interrupt context. And don't bother waking 1889 * the wakeup to our caller.
1706 * up the running kthread.
1707 */ 1890 */
1708 if (current != rsp->gp_kthread) 1891 return true;
1709 irq_work_queue(&rsp->wakeup_work);
1710} 1892}
1711 1893
1712/* 1894/*
@@ -1715,12 +1897,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1715 * is invoked indirectly from rcu_advance_cbs(), which would result in 1897 * is invoked indirectly from rcu_advance_cbs(), which would result in
1716 * endless recursion -- or would do so if it wasn't for the self-deadlock 1898 * endless recursion -- or would do so if it wasn't for the self-deadlock
1717 * that is encountered beforehand. 1899 * that is encountered beforehand.
1900 *
1901 * Returns true if the grace-period kthread needs to be awakened.
1718 */ 1902 */
1719static void 1903static bool rcu_start_gp(struct rcu_state *rsp)
1720rcu_start_gp(struct rcu_state *rsp)
1721{ 1904{
1722 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1905 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1723 struct rcu_node *rnp = rcu_get_root(rsp); 1906 struct rcu_node *rnp = rcu_get_root(rsp);
1907 bool ret = false;
1724 1908
1725 /* 1909 /*
1726 * If there is no grace period in progress right now, any 1910 * If there is no grace period in progress right now, any
@@ -1730,8 +1914,9 @@ rcu_start_gp(struct rcu_state *rsp)
1730 * resulting in pointless grace periods. So, advance callbacks 1914 * resulting in pointless grace periods. So, advance callbacks
1731 * then start the grace period! 1915 * then start the grace period!
1732 */ 1916 */
1733 rcu_advance_cbs(rsp, rnp, rdp); 1917 ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
1734 rcu_start_gp_advanced(rsp, rnp, rdp); 1918 ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
1919 return ret;
1735} 1920}
1736 1921
1737/* 1922/*
@@ -1820,6 +2005,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1820{ 2005{
1821 unsigned long flags; 2006 unsigned long flags;
1822 unsigned long mask; 2007 unsigned long mask;
2008 bool needwake;
1823 struct rcu_node *rnp; 2009 struct rcu_node *rnp;
1824 2010
1825 rnp = rdp->mynode; 2011 rnp = rdp->mynode;
@@ -1848,9 +2034,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1848 * This GP can't end until cpu checks in, so all of our 2034 * This GP can't end until cpu checks in, so all of our
1849 * callbacks can be processed during the next GP. 2035 * callbacks can be processed during the next GP.
1850 */ 2036 */
1851 rcu_accelerate_cbs(rsp, rnp, rdp); 2037 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
1852 2038
1853 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 2039 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
2040 if (needwake)
2041 rcu_gp_kthread_wake(rsp);
1854 } 2042 }
1855} 2043}
1856 2044
@@ -1951,7 +2139,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1951static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) 2139static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
1952{ 2140{
1953 int i; 2141 int i;
1954 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 2142 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
1955 2143
1956 /* No-CBs CPUs are handled specially. */ 2144 /* No-CBs CPUs are handled specially. */
1957 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) 2145 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
@@ -2320,7 +2508,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
2320 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2508 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2321 return; /* Someone beat us to it. */ 2509 return; /* Someone beat us to it. */
2322 } 2510 }
2323 rsp->gp_flags |= RCU_GP_FLAG_FQS; 2511 ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS;
2324 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2512 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2325 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 2513 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
2326} 2514}
@@ -2334,7 +2522,8 @@ static void
2334__rcu_process_callbacks(struct rcu_state *rsp) 2522__rcu_process_callbacks(struct rcu_state *rsp)
2335{ 2523{
2336 unsigned long flags; 2524 unsigned long flags;
2337 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 2525 bool needwake;
2526 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
2338 2527
2339 WARN_ON_ONCE(rdp->beenonline == 0); 2528 WARN_ON_ONCE(rdp->beenonline == 0);
2340 2529
@@ -2345,8 +2534,10 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2345 local_irq_save(flags); 2534 local_irq_save(flags);
2346 if (cpu_needs_another_gp(rsp, rdp)) { 2535 if (cpu_needs_another_gp(rsp, rdp)) {
2347 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ 2536 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2348 rcu_start_gp(rsp); 2537 needwake = rcu_start_gp(rsp);
2349 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 2538 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2539 if (needwake)
2540 rcu_gp_kthread_wake(rsp);
2350 } else { 2541 } else {
2351 local_irq_restore(flags); 2542 local_irq_restore(flags);
2352 } 2543 }
@@ -2404,6 +2595,8 @@ static void invoke_rcu_core(void)
2404static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, 2595static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2405 struct rcu_head *head, unsigned long flags) 2596 struct rcu_head *head, unsigned long flags)
2406{ 2597{
2598 bool needwake;
2599
2407 /* 2600 /*
2408 * If called from an extended quiescent state, invoke the RCU 2601 * If called from an extended quiescent state, invoke the RCU
2409 * core in order to force a re-evaluation of RCU's idleness. 2602 * core in order to force a re-evaluation of RCU's idleness.
@@ -2433,8 +2626,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2433 2626
2434 raw_spin_lock(&rnp_root->lock); 2627 raw_spin_lock(&rnp_root->lock);
2435 smp_mb__after_unlock_lock(); 2628 smp_mb__after_unlock_lock();
2436 rcu_start_gp(rsp); 2629 needwake = rcu_start_gp(rsp);
2437 raw_spin_unlock(&rnp_root->lock); 2630 raw_spin_unlock(&rnp_root->lock);
2631 if (needwake)
2632 rcu_gp_kthread_wake(rsp);
2438 } else { 2633 } else {
2439 /* Give the grace period a kick. */ 2634 /* Give the grace period a kick. */
2440 rdp->blimit = LONG_MAX; 2635 rdp->blimit = LONG_MAX;
@@ -2537,6 +2732,20 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2537EXPORT_SYMBOL_GPL(call_rcu_bh); 2732EXPORT_SYMBOL_GPL(call_rcu_bh);
2538 2733
2539/* 2734/*
2735 * Queue an RCU callback for lazy invocation after a grace period.
2736 * This will likely be later named something like "call_rcu_lazy()",
2737 * but this change will require some way of tagging the lazy RCU
2738 * callbacks in the list of pending callbacks. Until then, this
2739 * function may only be called from __kfree_rcu().
2740 */
2741void kfree_call_rcu(struct rcu_head *head,
2742 void (*func)(struct rcu_head *rcu))
2743{
2744 __call_rcu(head, func, rcu_state_p, -1, 1);
2745}
2746EXPORT_SYMBOL_GPL(kfree_call_rcu);
2747
2748/*
2540 * Because a context switch is a grace period for RCU-sched and RCU-bh, 2749 * Because a context switch is a grace period for RCU-sched and RCU-bh,
2541 * any blocking grace-period wait automatically implies a grace period 2750 * any blocking grace-period wait automatically implies a grace period
2542 * if there is only one CPU online at any point in time during execution 2751
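
kfree_call_rcu() moves here from kernel/rcu/tree_plugin.h (both flavor-specific copies are deleted further down) and now queues on rcu_state_p, so one definition serves preemptible and non-preemptible builds alike. The callback convention it depends on, an rcu_head embedded in the object and recovered with container_of(), is sketched below in plain userspace C; the "grace period" is faked with an immediate call purely to keep the example self-contained, so this is not the kernel API:

/* Illustrative only: a local stand-in for rcu_head, not the kernel type. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct rcu_head { void (*func)(struct rcu_head *); };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct item {
        int key;
        struct rcu_head rcu;            /* embedded callback handle */
};

static void item_free_cb(struct rcu_head *head)
{
        struct item *it = container_of(head, struct item, rcu);

        printf("freeing item %d after fake grace period\n", it->key);
        free(it);
}

/* Stand-in for call_rcu(): a real implementation would defer func until
 * all pre-existing readers had finished. Immediate invocation is used
 * here only to keep the sketch runnable. */
static void fake_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *))
{
        head->func = func;
        head->func(head);
}

int main(void)
{
        struct item *it = malloc(sizeof(*it));

        if (!it)
                return 1;
        it->key = 42;
        fake_call_rcu(&it->rcu, item_free_cb);
        return 0;
}
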
@@ -2659,7 +2868,7 @@ unsigned long get_state_synchronize_rcu(void)
2659 * time-consuming work between get_state_synchronize_rcu() 2868 * time-consuming work between get_state_synchronize_rcu()
2660 * and cond_synchronize_rcu(). 2869 * and cond_synchronize_rcu().
2661 */ 2870 */
2662 return smp_load_acquire(&rcu_state->gpnum); 2871 return smp_load_acquire(&rcu_state_p->gpnum);
2663} 2872}
2664EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); 2873EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
2665 2874
@@ -2685,7 +2894,7 @@ void cond_synchronize_rcu(unsigned long oldstate)
2685 * Ensure that this load happens before any RCU-destructive 2894 * Ensure that this load happens before any RCU-destructive
2686 * actions the caller might carry out after we return. 2895 * actions the caller might carry out after we return.
2687 */ 2896 */
2688 newstate = smp_load_acquire(&rcu_state->completed); 2897 newstate = smp_load_acquire(&rcu_state_p->completed);
2689 if (ULONG_CMP_GE(oldstate, newstate)) 2898 if (ULONG_CMP_GE(oldstate, newstate))
2690 synchronize_rcu(); 2899 synchronize_rcu();
2691} 2900}
@@ -2790,7 +2999,7 @@ void synchronize_sched_expedited(void)
2790 s = atomic_long_read(&rsp->expedited_done); 2999 s = atomic_long_read(&rsp->expedited_done);
2791 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { 3000 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2792 /* ensure test happens before caller kfree */ 3001 /* ensure test happens before caller kfree */
2793 smp_mb__before_atomic_inc(); /* ^^^ */ 3002 smp_mb__before_atomic(); /* ^^^ */
2794 atomic_long_inc(&rsp->expedited_workdone1); 3003 atomic_long_inc(&rsp->expedited_workdone1);
2795 return; 3004 return;
2796 } 3005 }
@@ -2808,7 +3017,7 @@ void synchronize_sched_expedited(void)
2808 s = atomic_long_read(&rsp->expedited_done); 3017 s = atomic_long_read(&rsp->expedited_done);
2809 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { 3018 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2810 /* ensure test happens before caller kfree */ 3019 /* ensure test happens before caller kfree */
2811 smp_mb__before_atomic_inc(); /* ^^^ */ 3020 smp_mb__before_atomic(); /* ^^^ */
2812 atomic_long_inc(&rsp->expedited_workdone2); 3021 atomic_long_inc(&rsp->expedited_workdone2);
2813 return; 3022 return;
2814 } 3023 }
@@ -2837,7 +3046,7 @@ void synchronize_sched_expedited(void)
2837 s = atomic_long_read(&rsp->expedited_done); 3046 s = atomic_long_read(&rsp->expedited_done);
2838 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { 3047 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
2839 /* ensure test happens before caller kfree */ 3048 /* ensure test happens before caller kfree */
2840 smp_mb__before_atomic_inc(); /* ^^^ */ 3049 smp_mb__before_atomic(); /* ^^^ */
2841 atomic_long_inc(&rsp->expedited_done_lost); 3050 atomic_long_inc(&rsp->expedited_done_lost);
2842 break; 3051 break;
2843 } 3052 }
@@ -2988,7 +3197,7 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
2988static void rcu_barrier_func(void *type) 3197static void rcu_barrier_func(void *type)
2989{ 3198{
2990 struct rcu_state *rsp = type; 3199 struct rcu_state *rsp = type;
2991 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 3200 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
2992 3201
2993 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); 3202 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
2994 atomic_inc(&rsp->barrier_cpu_count); 3203 atomic_inc(&rsp->barrier_cpu_count);
@@ -3160,7 +3369,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3160 * that this CPU cannot possibly have any RCU callbacks in flight yet. 3369 * that this CPU cannot possibly have any RCU callbacks in flight yet.
3161 */ 3370 */
3162static void 3371static void
3163rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) 3372rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3164{ 3373{
3165 unsigned long flags; 3374 unsigned long flags;
3166 unsigned long mask; 3375 unsigned long mask;
@@ -3173,7 +3382,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
3173 /* Set up local state, ensuring consistent view of global state. */ 3382 /* Set up local state, ensuring consistent view of global state. */
3174 raw_spin_lock_irqsave(&rnp->lock, flags); 3383 raw_spin_lock_irqsave(&rnp->lock, flags);
3175 rdp->beenonline = 1; /* We have now been online. */ 3384 rdp->beenonline = 1; /* We have now been online. */
3176 rdp->preemptible = preemptible;
3177 rdp->qlen_last_fqs_check = 0; 3385 rdp->qlen_last_fqs_check = 0;
3178 rdp->n_force_qs_snap = rsp->n_force_qs; 3386 rdp->n_force_qs_snap = rsp->n_force_qs;
3179 rdp->blimit = blimit; 3387 rdp->blimit = blimit;
@@ -3217,8 +3425,7 @@ static void rcu_prepare_cpu(int cpu)
3217 struct rcu_state *rsp; 3425 struct rcu_state *rsp;
3218 3426
3219 for_each_rcu_flavor(rsp) 3427 for_each_rcu_flavor(rsp)
3220 rcu_init_percpu_data(cpu, rsp, 3428 rcu_init_percpu_data(cpu, rsp);
3221 strcmp(rsp->name, "rcu_preempt") == 0);
3222} 3429}
3223 3430
3224/* 3431/*
@@ -3228,7 +3435,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
3228 unsigned long action, void *hcpu) 3435 unsigned long action, void *hcpu)
3229{ 3436{
3230 long cpu = (long)hcpu; 3437 long cpu = (long)hcpu;
3231 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 3438 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
3232 struct rcu_node *rnp = rdp->mynode; 3439 struct rcu_node *rnp = rdp->mynode;
3233 struct rcu_state *rsp; 3440 struct rcu_state *rsp;
3234 3441
@@ -3365,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3365 "rcu_node_fqs_1", 3572 "rcu_node_fqs_1",
3366 "rcu_node_fqs_2", 3573 "rcu_node_fqs_2",
3367 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ 3574 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
3575 static u8 fl_mask = 0x1;
3368 int cpustride = 1; 3576 int cpustride = 1;
3369 int i; 3577 int i;
3370 int j; 3578 int j;
@@ -3383,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3383 for (i = 1; i < rcu_num_lvls; i++) 3591 for (i = 1; i < rcu_num_lvls; i++)
3384 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 3592 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
3385 rcu_init_levelspread(rsp); 3593 rcu_init_levelspread(rsp);
3594 rsp->flavor_mask = fl_mask;
3595 fl_mask <<= 1;
3386 3596
3387 /* Initialize the elements themselves, starting from the leaves. */ 3597 /* Initialize the elements themselves, starting from the leaves. */
3388 3598
@@ -3402,8 +3612,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3402 rnp->qsmaskinit = 0; 3612 rnp->qsmaskinit = 0;
3403 rnp->grplo = j * cpustride; 3613 rnp->grplo = j * cpustride;
3404 rnp->grphi = (j + 1) * cpustride - 1; 3614 rnp->grphi = (j + 1) * cpustride - 1;
3405 if (rnp->grphi >= NR_CPUS) 3615 if (rnp->grphi >= nr_cpu_ids)
3406 rnp->grphi = NR_CPUS - 1; 3616 rnp->grphi = nr_cpu_ids - 1;
3407 if (i == 0) { 3617 if (i == 0) {
3408 rnp->grpnum = 0; 3618 rnp->grpnum = 0;
3409 rnp->grpmask = 0; 3619 rnp->grpmask = 0;
@@ -3422,7 +3632,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3422 3632
3423 rsp->rda = rda; 3633 rsp->rda = rda;
3424 init_waitqueue_head(&rsp->gp_wq); 3634 init_waitqueue_head(&rsp->gp_wq);
3425 init_irq_work(&rsp->wakeup_work, rsp_wakeup);
3426 rnp = rsp->level[rcu_num_lvls - 1]; 3635 rnp = rsp->level[rcu_num_lvls - 1];
3427 for_each_possible_cpu(i) { 3636 for_each_possible_cpu(i) {
3428 while (i > rnp->grphi) 3637 while (i > rnp->grphi)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 75dc3c39a02a..0f69a79c5b7d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -252,7 +252,6 @@ struct rcu_data {
252 bool passed_quiesce; /* User-mode/idle loop etc. */ 252 bool passed_quiesce; /* User-mode/idle loop etc. */
253 bool qs_pending; /* Core waits for quiesc state. */ 253 bool qs_pending; /* Core waits for quiesc state. */
254 bool beenonline; /* CPU online at least once. */ 254 bool beenonline; /* CPU online at least once. */
255 bool preemptible; /* Preemptible RCU? */
256 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 255 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
257 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 256 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
258#ifdef CONFIG_RCU_CPU_STALL_INFO 257#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -308,6 +307,9 @@ struct rcu_data {
308 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 307 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
309 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 308 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
310 unsigned long offline_fqs; /* Kicked due to being offline. */ 309 unsigned long offline_fqs; /* Kicked due to being offline. */
310 unsigned long cond_resched_completed;
311 /* Grace period that needs help */
312 /* from cond_resched(). */
311 313
312 /* 5) __rcu_pending() statistics. */ 314 /* 5) __rcu_pending() statistics. */
313 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 315 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -393,6 +395,7 @@ struct rcu_state {
393 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ 395 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
394 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 396 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
395 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 397 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
398 u8 flavor_mask; /* bit in flavor mask. */
396 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 399 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
397 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 400 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
398 void (*func)(struct rcu_head *head)); 401 void (*func)(struct rcu_head *head));
@@ -406,7 +409,8 @@ struct rcu_state {
406 unsigned long completed; /* # of last completed gp. */ 409 unsigned long completed; /* # of last completed gp. */
407 struct task_struct *gp_kthread; /* Task for grace periods. */ 410 struct task_struct *gp_kthread; /* Task for grace periods. */
408 wait_queue_head_t gp_wq; /* Where GP task waits. */ 411 wait_queue_head_t gp_wq; /* Where GP task waits. */
409 int gp_flags; /* Commands for GP task. */ 412 short gp_flags; /* Commands for GP task. */
413 short gp_state; /* GP kthread sleep state. */
410 414
411 /* End of fields guarded by root rcu_node's lock. */ 415 /* End of fields guarded by root rcu_node's lock. */
412 416
@@ -462,13 +466,17 @@ struct rcu_state {
462 const char *name; /* Name of structure. */ 466 const char *name; /* Name of structure. */
463 char abbr; /* Abbreviated name. */ 467 char abbr; /* Abbreviated name. */
464 struct list_head flavors; /* List of RCU flavors. */ 468 struct list_head flavors; /* List of RCU flavors. */
465 struct irq_work wakeup_work; /* Postponed wakeups */
466}; 469};
467 470
468/* Values for rcu_state structure's gp_flags field. */ 471/* Values for rcu_state structure's gp_flags field. */
469#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ 472#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
470#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ 473#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
471 474
475/* Values for rcu_state structure's gp_state field. */
476#define RCU_GP_WAIT_INIT 0 /* Initial state. */
477#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
478#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */
479
472extern struct list_head rcu_struct_flavors; 480extern struct list_head rcu_struct_flavors;
473 481
474/* Sequence through rcu_state structures for each RCU flavor. */ 482/* Sequence through rcu_state structures for each RCU flavor. */
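
The new ->gp_state field, set to RCU_GP_WAIT_GPS or RCU_GP_WAIT_FQS in rcu_gp_kthread() just before each sleep, records which wait the grace-period kthread is blocked in, so a stalled kthread can be diagnosed. A loose userspace analogue of publishing a thread's sleep state, with invented names:

/* Illustrative: a worker publishes its sleep state in an atomic int so a
 * watchdog (or debugger) can tell which wait it is stuck in. Only the
 * idea mirrors ->gp_state; the names here are made up. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

enum worker_state { W_INIT, W_WAIT_REQUEST };

static _Atomic int worker_state = W_INIT;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
static int request;

static void *worker(void *arg)
{
        pthread_mutex_lock(&lock);
        worker_state = W_WAIT_REQUEST;          /* record state before sleeping */
        while (!request)
                pthread_cond_wait(&wq, &lock);
        pthread_mutex_unlock(&lock);
        return arg;
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
        sleep(1);
        printf("watchdog: worker_state=%d\n", (int)worker_state);  /* 1 == W_WAIT_REQUEST */

        pthread_mutex_lock(&lock);
        request = 1;
        pthread_mutex_unlock(&lock);
        pthread_cond_signal(&wq);
        pthread_join(tid, NULL);
        return 0;
}
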
@@ -547,7 +555,6 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
547static void print_cpu_stall_info_end(void); 555static void print_cpu_stall_info_end(void);
548static void zero_cpu_stall_ticks(struct rcu_data *rdp); 556static void zero_cpu_stall_ticks(struct rcu_data *rdp);
549static void increment_cpu_stall_ticks(void); 557static void increment_cpu_stall_ticks(void);
550static int rcu_nocb_needs_gp(struct rcu_state *rsp);
551static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); 558static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
552static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 559static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
553static void rcu_init_one_nocb(struct rcu_node *rnp); 560static void rcu_init_one_nocb(struct rcu_node *rnp);
@@ -560,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
560static void do_nocb_deferred_wakeup(struct rcu_data *rdp); 567static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
561static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 568static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
562static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 569static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
563static void rcu_kick_nohz_cpu(int cpu); 570static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
564static bool init_nocb_callback_list(struct rcu_data *rdp); 571static bool init_nocb_callback_list(struct rcu_data *rdp);
565static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); 572static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
566static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); 573static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 962d1d589929..02ac0fb186b8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -116,7 +116,7 @@ static void __init rcu_bootup_announce_oddness(void)
116#ifdef CONFIG_TREE_PREEMPT_RCU 116#ifdef CONFIG_TREE_PREEMPT_RCU
117 117
118RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); 118RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
119static struct rcu_state *rcu_state = &rcu_preempt_state; 119static struct rcu_state *rcu_state_p = &rcu_preempt_state;
120 120
121static int rcu_preempted_readers_exp(struct rcu_node *rnp); 121static int rcu_preempted_readers_exp(struct rcu_node *rnp);
122 122
@@ -149,15 +149,6 @@ long rcu_batches_completed(void)
149EXPORT_SYMBOL_GPL(rcu_batches_completed); 149EXPORT_SYMBOL_GPL(rcu_batches_completed);
150 150
151/* 151/*
152 * Force a quiescent state for preemptible RCU.
153 */
154void rcu_force_quiescent_state(void)
155{
156 force_quiescent_state(&rcu_preempt_state);
157}
158EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
159
160/*
161 * Record a preemptible-RCU quiescent state for the specified CPU. Note 152 * Record a preemptible-RCU quiescent state for the specified CPU. Note
162 * that this just means that the task currently running on the CPU is 153 * that this just means that the task currently running on the CPU is
163 * not in a quiescent state. There might be any number of tasks blocked 154 * not in a quiescent state. There might be any number of tasks blocked
@@ -688,20 +679,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
688} 679}
689EXPORT_SYMBOL_GPL(call_rcu); 680EXPORT_SYMBOL_GPL(call_rcu);
690 681
691/*
692 * Queue an RCU callback for lazy invocation after a grace period.
693 * This will likely be later named something like "call_rcu_lazy()",
694 * but this change will require some way of tagging the lazy RCU
695 * callbacks in the list of pending callbacks. Until then, this
696 * function may only be called from __kfree_rcu().
697 */
698void kfree_call_rcu(struct rcu_head *head,
699 void (*func)(struct rcu_head *rcu))
700{
701 __call_rcu(head, func, &rcu_preempt_state, -1, 1);
702}
703EXPORT_SYMBOL_GPL(kfree_call_rcu);
704
705/** 682/**
706 * synchronize_rcu - wait until a grace period has elapsed. 683 * synchronize_rcu - wait until a grace period has elapsed.
707 * 684 *
@@ -970,7 +947,7 @@ void exit_rcu(void)
970 947
971#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 948#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
972 949
973static struct rcu_state *rcu_state = &rcu_sched_state; 950static struct rcu_state *rcu_state_p = &rcu_sched_state;
974 951
975/* 952/*
976 * Tell them what RCU they are running. 953 * Tell them what RCU they are running.
@@ -991,16 +968,6 @@ long rcu_batches_completed(void)
991EXPORT_SYMBOL_GPL(rcu_batches_completed); 968EXPORT_SYMBOL_GPL(rcu_batches_completed);
992 969
993/* 970/*
994 * Force a quiescent state for RCU, which, because there is no preemptible
995 * RCU, becomes the same as rcu-sched.
996 */
997void rcu_force_quiescent_state(void)
998{
999 rcu_sched_force_quiescent_state();
1000}
1001EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
1002
1003/*
1004 * Because preemptible RCU does not exist, we never have to check for 971 * Because preemptible RCU does not exist, we never have to check for
1005 * CPUs being in quiescent states. 972 * CPUs being in quiescent states.
1006 */ 973 */
@@ -1080,22 +1047,6 @@ static void rcu_preempt_check_callbacks(int cpu)
1080} 1047}
1081 1048
1082/* 1049/*
1083 * Queue an RCU callback for lazy invocation after a grace period.
1084 * This will likely be later named something like "call_rcu_lazy()",
1085 * but this change will require some way of tagging the lazy RCU
1086 * callbacks in the list of pending callbacks. Until then, this
1087 * function may only be called from __kfree_rcu().
1088 *
1089 * Because there is no preemptible RCU, we use RCU-sched instead.
1090 */
1091void kfree_call_rcu(struct rcu_head *head,
1092 void (*func)(struct rcu_head *rcu))
1093{
1094 __call_rcu(head, func, &rcu_sched_state, -1, 1);
1095}
1096EXPORT_SYMBOL_GPL(kfree_call_rcu);
1097
1098/*
1099 * Wait for an rcu-preempt grace period, but make it happen quickly. 1050 * Wait for an rcu-preempt grace period, but make it happen quickly.
1100 * But because preemptible RCU does not exist, map to rcu-sched. 1051 * But because preemptible RCU does not exist, map to rcu-sched.
1101 */ 1052 */
@@ -1517,11 +1468,11 @@ static int __init rcu_spawn_kthreads(void)
1517 for_each_possible_cpu(cpu) 1468 for_each_possible_cpu(cpu)
1518 per_cpu(rcu_cpu_has_work, cpu) = 0; 1469 per_cpu(rcu_cpu_has_work, cpu) = 0;
1519 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1470 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
1520 rnp = rcu_get_root(rcu_state); 1471 rnp = rcu_get_root(rcu_state_p);
1521 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1472 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1522 if (NUM_RCU_NODES > 1) { 1473 if (NUM_RCU_NODES > 1) {
1523 rcu_for_each_leaf_node(rcu_state, rnp) 1474 rcu_for_each_leaf_node(rcu_state_p, rnp)
1524 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1475 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1525 } 1476 }
1526 return 0; 1477 return 0;
1527} 1478}
@@ -1529,12 +1480,12 @@ early_initcall(rcu_spawn_kthreads);
1529 1480
1530static void rcu_prepare_kthreads(int cpu) 1481static void rcu_prepare_kthreads(int cpu)
1531{ 1482{
1532 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1483 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
1533 struct rcu_node *rnp = rdp->mynode; 1484 struct rcu_node *rnp = rdp->mynode;
1534 1485
1535 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ 1486 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1536 if (rcu_scheduler_fully_active) 1487 if (rcu_scheduler_fully_active)
1537 (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); 1488 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1538} 1489}
1539 1490
1540#else /* #ifdef CONFIG_RCU_BOOST */ 1491#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1744,6 +1695,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
1744static void rcu_prepare_for_idle(int cpu) 1695static void rcu_prepare_for_idle(int cpu)
1745{ 1696{
1746#ifndef CONFIG_RCU_NOCB_CPU_ALL 1697#ifndef CONFIG_RCU_NOCB_CPU_ALL
1698 bool needwake;
1747 struct rcu_data *rdp; 1699 struct rcu_data *rdp;
1748 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1700 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1749 struct rcu_node *rnp; 1701 struct rcu_node *rnp;
@@ -1792,8 +1744,10 @@ static void rcu_prepare_for_idle(int cpu)
1792 rnp = rdp->mynode; 1744 rnp = rdp->mynode;
1793 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1745 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1794 smp_mb__after_unlock_lock(); 1746 smp_mb__after_unlock_lock();
1795 rcu_accelerate_cbs(rsp, rnp, rdp); 1747 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
1796 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1748 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1749 if (needwake)
1750 rcu_gp_kthread_wake(rsp);
1797 } 1751 }
1798#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 1752#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1799} 1753}
@@ -1855,7 +1809,7 @@ static void rcu_oom_notify_cpu(void *unused)
1855 struct rcu_data *rdp; 1809 struct rcu_data *rdp;
1856 1810
1857 for_each_rcu_flavor(rsp) { 1811 for_each_rcu_flavor(rsp) {
1858 rdp = __this_cpu_ptr(rsp->rda); 1812 rdp = raw_cpu_ptr(rsp->rda);
1859 if (rdp->qlen_lazy != 0) { 1813 if (rdp->qlen_lazy != 0) {
1860 atomic_inc(&oom_callback_count); 1814 atomic_inc(&oom_callback_count);
1861 rsp->call(&rdp->oom_head, rcu_oom_callback); 1815 rsp->call(&rdp->oom_head, rcu_oom_callback);
@@ -1997,7 +1951,7 @@ static void increment_cpu_stall_ticks(void)
1997 struct rcu_state *rsp; 1951 struct rcu_state *rsp;
1998 1952
1999 for_each_rcu_flavor(rsp) 1953 for_each_rcu_flavor(rsp)
2000 __this_cpu_ptr(rsp->rda)->ticks_this_gp++; 1954 raw_cpu_inc(rsp->rda->ticks_this_gp);
2001} 1955}
2002 1956
2003#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 1957#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
@@ -2068,19 +2022,6 @@ static int __init parse_rcu_nocb_poll(char *arg)
2068early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 2022early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2069 2023
2070/* 2024/*
2071 * Do any no-CBs CPUs need another grace period?
2072 *
2073 * Interrupts must be disabled. If the caller does not hold the root
2074 * rnp_node structure's ->lock, the results are advisory only.
2075 */
2076static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2077{
2078 struct rcu_node *rnp = rcu_get_root(rsp);
2079
2080 return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
2081}
2082
2083/*
2084 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended 2025 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
2085 * grace period. 2026 * grace period.
2086 */ 2027 */
@@ -2109,7 +2050,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2109} 2050}
2110 2051
2111#ifndef CONFIG_RCU_NOCB_CPU_ALL 2052#ifndef CONFIG_RCU_NOCB_CPU_ALL
2112/* Is the specified CPU a no-CPUs CPU? */ 2053/* Is the specified CPU a no-CBs CPU? */
2113bool rcu_is_nocb_cpu(int cpu) 2054bool rcu_is_nocb_cpu(int cpu)
2114{ 2055{
2115 if (have_rcu_nocb_mask) 2056 if (have_rcu_nocb_mask)
@@ -2243,12 +2184,15 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2243 unsigned long c; 2184 unsigned long c;
2244 bool d; 2185 bool d;
2245 unsigned long flags; 2186 unsigned long flags;
2187 bool needwake;
2246 struct rcu_node *rnp = rdp->mynode; 2188 struct rcu_node *rnp = rdp->mynode;
2247 2189
2248 raw_spin_lock_irqsave(&rnp->lock, flags); 2190 raw_spin_lock_irqsave(&rnp->lock, flags);
2249 smp_mb__after_unlock_lock(); 2191 smp_mb__after_unlock_lock();
2250 c = rcu_start_future_gp(rnp, rdp); 2192 needwake = rcu_start_future_gp(rnp, rdp, &c);
2251 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2193 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2194 if (needwake)
2195 rcu_gp_kthread_wake(rdp->rsp);
2252 2196
2253 /* 2197 /*
2254 * Wait for the grace period. Do so interruptibly to avoid messing 2198 * Wait for the grace period. Do so interruptibly to avoid messing
@@ -2402,11 +2346,6 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2402 2346
2403#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2347#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2404 2348
2405static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2406{
2407 return 0;
2408}
2409
2410static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2349static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2411{ 2350{
2412} 2351}
@@ -2465,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2465 * if an adaptive-ticks CPU is failing to respond to the current grace 2404 * if an adaptive-ticks CPU is failing to respond to the current grace
2466 * period and has not been idle from an RCU perspective, kick it. 2405 * period and has not been idle from an RCU perspective, kick it.
2467 */ 2406 */
2468static void rcu_kick_nohz_cpu(int cpu) 2407static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
2469{ 2408{
2470#ifdef CONFIG_NO_HZ_FULL 2409#ifdef CONFIG_NO_HZ_FULL
2471 if (tick_nohz_full_cpu(cpu)) 2410 if (tick_nohz_full_cpu(cpu))
@@ -2523,9 +2462,9 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2523 /* Record start of fully idle period. */ 2462 /* Record start of fully idle period. */
2524 j = jiffies; 2463 j = jiffies;
2525 ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; 2464 ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
2526 smp_mb__before_atomic_inc(); 2465 smp_mb__before_atomic();
2527 atomic_inc(&rdtp->dynticks_idle); 2466 atomic_inc(&rdtp->dynticks_idle);
2528 smp_mb__after_atomic_inc(); 2467 smp_mb__after_atomic();
2529 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); 2468 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
2530} 2469}
2531 2470
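
The smp_mb__before_atomic_inc()/smp_mb__after_atomic_inc() pairs become the operation-agnostic smp_mb__before_atomic()/smp_mb__after_atomic(); the intent is unchanged, namely full ordering around an atomic increment that by itself carries none. In portable C11 roughly the same shape is an explicit fence on either side of a relaxed read-modify-write (a sketch, not the kernel primitives):

/* Illustrative C11 equivalent of "full barrier; relaxed increment; full
 * barrier": the increment itself is unordered, the fences provide ordering. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong dynticks_idle;

static void idle_enter(void)
{
        atomic_thread_fence(memory_order_seq_cst);              /* before_atomic */
        atomic_fetch_add_explicit(&dynticks_idle, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);              /* after_atomic */
}

int main(void)
{
        idle_enter();
        printf("counter=%lu\n", atomic_load(&dynticks_idle));
        return 0;
}
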
@@ -2590,9 +2529,9 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2590 } 2529 }
2591 2530
2592 /* Record end of idle period. */ 2531 /* Record end of idle period. */
2593 smp_mb__before_atomic_inc(); 2532 smp_mb__before_atomic();
2594 atomic_inc(&rdtp->dynticks_idle); 2533 atomic_inc(&rdtp->dynticks_idle);
2595 smp_mb__after_atomic_inc(); 2534 smp_mb__after_atomic();
2596 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); 2535 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
2597 2536
2598 /* 2537 /*
@@ -2657,20 +2596,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2657} 2596}
2658 2597
2659/* 2598/*
2660 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2661 * timekeeping CPU.
2662 */
2663static void rcu_bind_gp_kthread(void)
2664{
2665 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2666
2667 if (cpu < 0 || cpu >= nr_cpu_ids)
2668 return;
2669 if (raw_smp_processor_id() != cpu)
2670 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2671}
2672
2673/*
2674 * Return a delay in jiffies based on the number of CPUs, rcu_node 2599 * Return a delay in jiffies based on the number of CPUs, rcu_node
2675 * leaf fanout, and jiffies tick rate. The idea is to allow larger 2600 * leaf fanout, and jiffies tick rate. The idea is to allow larger
2676 * systems more time to transition to full-idle state in order to 2601 * systems more time to transition to full-idle state in order to
@@ -2734,7 +2659,8 @@ static void rcu_sysidle(unsigned long j)
2734static void rcu_sysidle_cancel(void) 2659static void rcu_sysidle_cancel(void)
2735{ 2660{
2736 smp_mb(); 2661 smp_mb();
2737 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; 2662 if (full_sysidle_state > RCU_SYSIDLE_SHORT)
2663 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
2738} 2664}
2739 2665
2740/* 2666/*
@@ -2880,10 +2806,6 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2880 return false; 2806 return false;
2881} 2807}
2882 2808
2883static void rcu_bind_gp_kthread(void)
2884{
2885}
2886
2887static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 2809static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2888 unsigned long maxj) 2810 unsigned long maxj)
2889{ 2811{
@@ -2914,3 +2836,19 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
2914#endif /* #ifdef CONFIG_NO_HZ_FULL */ 2836#endif /* #ifdef CONFIG_NO_HZ_FULL */
2915 return 0; 2837 return 0;
2916} 2838}
2839
2840/*
2841 * Bind the grace-period kthread for the sysidle flavor of RCU to the
2842 * timekeeping CPU.
2843 */
2844static void rcu_bind_gp_kthread(void)
2845{
2846#ifdef CONFIG_NO_HZ_FULL
2847 int cpu = ACCESS_ONCE(tick_do_timer_cpu);
2848
2849 if (cpu < 0 || cpu >= nr_cpu_ids)
2850 return;
2851 if (raw_smp_processor_id() != cpu)
2852 set_cpus_allowed_ptr(current, cpumask_of(cpu));
2853#endif /* #ifdef CONFIG_NO_HZ_FULL */
2854}
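
rcu_bind_gp_kthread() is consolidated into a single definition whose body is compiled out unless CONFIG_NO_HZ_FULL; when enabled, it pins the grace-period kthread to the timekeeping CPU. The userspace equivalent of that affinity pin, a Linux-specific sketch using pthread_setaffinity_np(), looks like this:

/* Illustrative: pin the calling thread to one CPU, roughly what the
 * kernel's set_cpus_allowed_ptr(current, cpumask_of(cpu)) achieves. */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static int bind_self_to_cpu(int cpu)
{
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(cpu, &mask);
        return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
}

int main(void)
{
        int err = bind_self_to_cpu(0);

        if (err)
                fprintf(stderr, "pthread_setaffinity_np failed: %d\n", err);
        else
                printf("bound to CPU 0, now running on CPU %d\n", sched_getcpu());
        return err ? 1 : 0;
}
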
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 4c0a9b0af469..bc7883570530 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -200,12 +200,12 @@ void wait_rcu_gp(call_rcu_func_t crf)
200EXPORT_SYMBOL_GPL(wait_rcu_gp); 200EXPORT_SYMBOL_GPL(wait_rcu_gp);
201 201
202#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 202#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
203static inline void debug_init_rcu_head(struct rcu_head *head) 203void init_rcu_head(struct rcu_head *head)
204{ 204{
205 debug_object_init(head, &rcuhead_debug_descr); 205 debug_object_init(head, &rcuhead_debug_descr);
206} 206}
207 207
208static inline void debug_rcu_head_free(struct rcu_head *head) 208void destroy_rcu_head(struct rcu_head *head)
209{ 209{
210 debug_object_free(head, &rcuhead_debug_descr); 210 debug_object_free(head, &rcuhead_debug_descr);
211} 211}
@@ -320,6 +320,18 @@ int rcu_jiffies_till_stall_check(void)
320 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; 320 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
321} 321}
322 322
323void rcu_sysrq_start(void)
324{
325 if (!rcu_cpu_stall_suppress)
326 rcu_cpu_stall_suppress = 2;
327}
328
329void rcu_sysrq_end(void)
330{
331 if (rcu_cpu_stall_suppress == 2)
332 rcu_cpu_stall_suppress = 0;
333}
334
323static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) 335static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
324{ 336{
325 rcu_cpu_stall_suppress = 1; 337 rcu_cpu_stall_suppress = 1;
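
The new rcu_sysrq_start()/rcu_sysrq_end() pair treats rcu_cpu_stall_suppress as a small state machine: 0 leaves stall warnings enabled, the sysrq path sets 2 to mute them only for the duration of the dump, and the panic notifier just below sets 1 so a panic-time suppression is never undone by rcu_sysrq_end(). A standalone C model of that tri-state, with names of my own choosing:

#include <assert.h>

/* 0: warnings on, 1: suppressed for good (panic), 2: suppressed during a sysrq dump */
static int stall_suppress;

static void sysrq_start(void) { if (!stall_suppress) stall_suppress = 2; }
static void sysrq_end(void)   { if (stall_suppress == 2) stall_suppress = 0; }
static void on_panic(void)    { stall_suppress = 1; }

int main(void)
{
        sysrq_start();                  /* mute warnings while dumping state */
        assert(stall_suppress == 2);
        sysrq_end();                    /* back to normal */
        assert(stall_suppress == 0);

        on_panic();                     /* panic wins ... */
        sysrq_start();
        sysrq_end();                    /* ... and sysrq_end() must not re-enable warnings */
        assert(stall_suppress == 1);
        return 0;
}
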
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 662c83fc16b7..a3a9e240fcdb 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -388,15 +388,22 @@ static int __init reboot_setup(char *str)
388 break; 388 break;
389 389
390 case 's': 390 case 's':
391 if (isdigit(*(str+1))) 391 {
392 reboot_cpu = simple_strtoul(str+1, NULL, 0); 392 int rc;
393 else if (str[1] == 'm' && str[2] == 'p' && 393
394 isdigit(*(str+3))) 394 if (isdigit(*(str+1))) {
395 reboot_cpu = simple_strtoul(str+3, NULL, 0); 395 rc = kstrtoint(str+1, 0, &reboot_cpu);
396 else 396 if (rc)
397 return rc;
398 } else if (str[1] == 'm' && str[2] == 'p' &&
399 isdigit(*(str+3))) {
400 rc = kstrtoint(str+3, 0, &reboot_cpu);
401 if (rc)
402 return rc;
403 } else
397 reboot_mode = REBOOT_SOFT; 404 reboot_mode = REBOOT_SOFT;
398 break; 405 break;
399 406 }
400 case 'g': 407 case 'g':
401 reboot_mode = REBOOT_GPIO; 408 reboot_mode = REBOOT_GPIO;
402 break; 409 break;
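
The reboot= parser now propagates parse failures instead of silently accepting whatever simple_strtoul() managed to digest; kstrtoint() fails on empty input, trailing garbage and overflow, and returns a negative errno that reboot_setup() passes on. A rough userspace equivalent of that checked parse, using strtol() (the helper name is mine, and kstrtoint() additionally tolerates a single trailing newline, which this sketch does not):

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Roughly what kstrtoint() guarantees: whole-string parse, range-checked. */
static int parse_int_strict(const char *s, int base, int *out)
{
        char *end;
        long val;

        errno = 0;
        val = strtol(s, &end, base);
        if (errno || end == s || *end != '\0')  /* overflow, empty, or trailing junk */
                return -EINVAL;
        if (val < INT_MIN || val > INT_MAX)
                return -ERANGE;
        *out = (int)val;
        return 0;
}

int main(void)
{
        int cpu = 0;

        printf("\"12\"  -> %d\n", parse_int_strict("12", 0, &cpu));   /* 0, cpu == 12 */
        printf("\"12x\" -> %d\n", parse_int_strict("12x", 0, &cpu));  /* -EINVAL */
        return 0;
}
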
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 51dbac6a3633..e791130f85a7 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -186,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf,
186 186
187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ 187 /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */
188 if (*buf == '-') { 188 if (*buf == '-') {
189 res = simple_strtoull(buf + 1, &end, 10); 189 int rc = kstrtoull(buf + 1, 10, &res);
190 if (res != 1 || *end != '\0') 190
191 if (rc)
192 return rc;
193 if (res != 1)
191 return -EINVAL; 194 return -EINVAL;
192 *resp = RES_COUNTER_MAX; 195 *resp = RES_COUNTER_MAX;
193 return 0; 196 return 0;
diff --git a/kernel/resource.c b/kernel/resource.c
index 8957d686e29b..3c2237ac32db 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1288,13 +1288,10 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
1288 if (p->flags & IORESOURCE_BUSY) 1288 if (p->flags & IORESOURCE_BUSY)
1289 continue; 1289 continue;
1290 1290
1291 printk(KERN_WARNING "resource map sanity check conflict: " 1291 printk(KERN_WARNING "resource sanity check: requesting [mem %#010llx-%#010llx], which spans more than %s %pR\n",
1292 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
1293 (unsigned long long)addr, 1292 (unsigned long long)addr,
1294 (unsigned long long)(addr + size - 1), 1293 (unsigned long long)(addr + size - 1),
1295 (unsigned long long)p->start, 1294 p->name, p);
1296 (unsigned long long)p->end,
1297 p->name);
1298 err = -1; 1295 err = -1;
1299 break; 1296 break;
1300 } 1297 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d9d8ece46a15..bc1638b33449 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,6 +90,22 @@
90#define CREATE_TRACE_POINTS 90#define CREATE_TRACE_POINTS
91#include <trace/events/sched.h> 91#include <trace/events/sched.h>
92 92
93#ifdef smp_mb__before_atomic
94void __smp_mb__before_atomic(void)
95{
96 smp_mb__before_atomic();
97}
98EXPORT_SYMBOL(__smp_mb__before_atomic);
99#endif
100
101#ifdef smp_mb__after_atomic
102void __smp_mb__after_atomic(void)
103{
104 smp_mb__after_atomic();
105}
106EXPORT_SYMBOL(__smp_mb__after_atomic);
107#endif
108
93void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 109void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
94{ 110{
95 unsigned long delta; 111 unsigned long delta;
@@ -506,6 +522,71 @@ static inline void init_hrtick(void)
506#endif /* CONFIG_SCHED_HRTICK */ 522#endif /* CONFIG_SCHED_HRTICK */
507 523
508/* 524/*
525 * cmpxchg based fetch_or, macro so it works for different integer types
526 */
527#define fetch_or(ptr, val) \
528({ typeof(*(ptr)) __old, __val = *(ptr); \
529 for (;;) { \
530 __old = cmpxchg((ptr), __val, __val | (val)); \
531 if (__old == __val) \
532 break; \
533 __val = __old; \
534 } \
535 __old; \
536})
537
538#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
539/*
540 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
541 * this avoids any races wrt polling state changes and thereby avoids
542 * spurious IPIs.
543 */
544static bool set_nr_and_not_polling(struct task_struct *p)
545{
546 struct thread_info *ti = task_thread_info(p);
547 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
548}
549
550/*
551 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
552 *
553 * If this returns true, then the idle task promises to call
554 * sched_ttwu_pending() and reschedule soon.
555 */
556static bool set_nr_if_polling(struct task_struct *p)
557{
558 struct thread_info *ti = task_thread_info(p);
559 typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
560
561 for (;;) {
562 if (!(val & _TIF_POLLING_NRFLAG))
563 return false;
564 if (val & _TIF_NEED_RESCHED)
565 return true;
566 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
567 if (old == val)
568 break;
569 val = old;
570 }
571 return true;
572}
573
574#else
575static bool set_nr_and_not_polling(struct task_struct *p)
576{
577 set_tsk_need_resched(p);
578 return true;
579}
580
581#ifdef CONFIG_SMP
582static bool set_nr_if_polling(struct task_struct *p)
583{
584 return false;
585}
586#endif
587#endif
588
589/*
509 * resched_task - mark a task 'to be rescheduled now'. 590 * resched_task - mark a task 'to be rescheduled now'.
510 * 591 *
511 * On UP this means the setting of the need_resched flag, on SMP it 592 * On UP this means the setting of the need_resched flag, on SMP it
@@ -521,18 +602,18 @@ void resched_task(struct task_struct *p)
521 if (test_tsk_need_resched(p)) 602 if (test_tsk_need_resched(p))
522 return; 603 return;
523 604
524 set_tsk_need_resched(p);
525
526 cpu = task_cpu(p); 605 cpu = task_cpu(p);
606
527 if (cpu == smp_processor_id()) { 607 if (cpu == smp_processor_id()) {
608 set_tsk_need_resched(p);
528 set_preempt_need_resched(); 609 set_preempt_need_resched();
529 return; 610 return;
530 } 611 }
531 612
532 /* NEED_RESCHED must be visible before we test polling */ 613 if (set_nr_and_not_polling(p))
533 smp_mb();
534 if (!tsk_is_polling(p))
535 smp_send_reschedule(cpu); 614 smp_send_reschedule(cpu);
615 else
616 trace_sched_wake_idle_without_ipi(cpu);
536} 617}
537 618
538void resched_cpu(int cpu) 619void resched_cpu(int cpu)
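
fetch_or() and set_nr_and_not_polling() above make the reschedule request and the polling test one atomic operation: the caller sets TIF_NEED_RESCHED and learns from the returned old value whether the target had TIF_POLLING_NRFLAG set, in which case its idle loop will notice the flag by itself and no IPI is needed (the new trace_sched_wake_idle_without_ipi event records exactly those saved IPIs). A self-contained C11 model of the idiom, with made-up flag values standing in for the thread_info bits:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NEED_RESCHED    0x1     /* stand-in for _TIF_NEED_RESCHED */
#define POLLING         0x2     /* stand-in for _TIF_POLLING_NRFLAG */

/* cmpxchg-based fetch_or, mirroring the kernel macro's retry loop. */
static unsigned int fetch_or(atomic_uint *p, unsigned int val)
{
        unsigned int old = atomic_load(p);

        while (!atomic_compare_exchange_weak(p, &old, old | val))
                ;       /* on failure 'old' is refreshed with the current value */
        return old;
}

/* Returns true when the caller still has to send an IPI. */
static bool set_need_resched_and_not_polling(atomic_uint *flags)
{
        return !(fetch_or(flags, NEED_RESCHED) & POLLING);
}

int main(void)
{
        atomic_uint idle_cpu_flags = POLLING;   /* idle loop advertising it will notice the flag */
        atomic_uint busy_cpu_flags = 0;

        printf("polling target: send IPI? %d\n",
               set_need_resched_and_not_polling(&idle_cpu_flags));     /* 0: no IPI needed */
        printf("busy target:    send IPI? %d\n",
               set_need_resched_and_not_polling(&busy_cpu_flags));     /* 1: IPI needed */
        return 0;
}

The same shape, with an extra early-out when NEED_RESCHED is already set, is what set_nr_if_polling() uses for the remote wakeup path further down.
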
@@ -595,27 +676,10 @@ static void wake_up_idle_cpu(int cpu)
595 if (cpu == smp_processor_id()) 676 if (cpu == smp_processor_id())
596 return; 677 return;
597 678
598 /* 679 if (set_nr_and_not_polling(rq->idle))
599 * This is safe, as this function is called with the timer
600 * wheel base lock of (cpu) held. When the CPU is on the way
601 * to idle and has not yet set rq->curr to idle then it will
602 * be serialized on the timer wheel base lock and take the new
603 * timer into account automatically.
604 */
605 if (rq->curr != rq->idle)
606 return;
607
608 /*
609 * We can set TIF_RESCHED on the idle task of the other CPU
610 * lockless. The worst case is that the other CPU runs the
611 * idle task through an additional NOOP schedule()
612 */
613 set_tsk_need_resched(rq->idle);
614
615 /* NEED_RESCHED must be visible before we test polling */
616 smp_mb();
617 if (!tsk_is_polling(rq->idle))
618 smp_send_reschedule(cpu); 680 smp_send_reschedule(cpu);
681 else
682 trace_sched_wake_idle_without_ipi(cpu);
619} 683}
620 684
621static bool wake_up_full_nohz_cpu(int cpu) 685static bool wake_up_full_nohz_cpu(int cpu)
@@ -841,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
841 rq->clock_task += delta; 905 rq->clock_task += delta;
842 906
843#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 907#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
844 if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) 908 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
845 sched_rt_avg_update(rq, irq_delta + steal); 909 sched_rt_avg_update(rq, irq_delta + steal);
846#endif 910#endif
847} 911}
@@ -1320,7 +1384,7 @@ out:
1320 * leave kernel. 1384 * leave kernel.
1321 */ 1385 */
1322 if (p->mm && printk_ratelimit()) { 1386 if (p->mm && printk_ratelimit()) {
1323 printk_sched("process %d (%s) no longer affine to cpu%d\n", 1387 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1324 task_pid_nr(p), p->comm, cpu); 1388 task_pid_nr(p), p->comm, cpu);
1325 } 1389 }
1326 } 1390 }
@@ -1474,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1474} 1538}
1475 1539
1476#ifdef CONFIG_SMP 1540#ifdef CONFIG_SMP
1477static void sched_ttwu_pending(void) 1541void sched_ttwu_pending(void)
1478{ 1542{
1479 struct rq *rq = this_rq(); 1543 struct rq *rq = this_rq();
1480 struct llist_node *llist = llist_del_all(&rq->wake_list); 1544 struct llist_node *llist = llist_del_all(&rq->wake_list);
1481 struct task_struct *p; 1545 struct task_struct *p;
1546 unsigned long flags;
1482 1547
1483 raw_spin_lock(&rq->lock); 1548 if (!llist)
1549 return;
1550
1551 raw_spin_lock_irqsave(&rq->lock, flags);
1484 1552
1485 while (llist) { 1553 while (llist) {
1486 p = llist_entry(llist, struct task_struct, wake_entry); 1554 p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1488,7 +1556,7 @@ static void sched_ttwu_pending(void)
1488 ttwu_do_activate(rq, p, 0); 1556 ttwu_do_activate(rq, p, 0);
1489 } 1557 }
1490 1558
1491 raw_spin_unlock(&rq->lock); 1559 raw_spin_unlock_irqrestore(&rq->lock, flags);
1492} 1560}
1493 1561
1494void scheduler_ipi(void) 1562void scheduler_ipi(void)
@@ -1534,8 +1602,14 @@ void scheduler_ipi(void)
1534 1602
1535static void ttwu_queue_remote(struct task_struct *p, int cpu) 1603static void ttwu_queue_remote(struct task_struct *p, int cpu)
1536{ 1604{
1537 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) 1605 struct rq *rq = cpu_rq(cpu);
1538 smp_send_reschedule(cpu); 1606
1607 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1608 if (!set_nr_if_polling(rq->idle))
1609 smp_send_reschedule(cpu);
1610 else
1611 trace_sched_wake_idle_without_ipi(cpu);
1612 }
1539} 1613}
1540 1614
1541bool cpus_share_cache(int this_cpu, int that_cpu) 1615bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -2480,7 +2554,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
2480#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2554#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2481 defined(CONFIG_PREEMPT_TRACER)) 2555 defined(CONFIG_PREEMPT_TRACER))
2482 2556
2483void __kprobes preempt_count_add(int val) 2557void preempt_count_add(int val)
2484{ 2558{
2485#ifdef CONFIG_DEBUG_PREEMPT 2559#ifdef CONFIG_DEBUG_PREEMPT
2486 /* 2560 /*
@@ -2506,8 +2580,9 @@ void __kprobes preempt_count_add(int val)
2506 } 2580 }
2507} 2581}
2508EXPORT_SYMBOL(preempt_count_add); 2582EXPORT_SYMBOL(preempt_count_add);
2583NOKPROBE_SYMBOL(preempt_count_add);
2509 2584
2510void __kprobes preempt_count_sub(int val) 2585void preempt_count_sub(int val)
2511{ 2586{
2512#ifdef CONFIG_DEBUG_PREEMPT 2587#ifdef CONFIG_DEBUG_PREEMPT
2513 /* 2588 /*
@@ -2528,6 +2603,7 @@ void __kprobes preempt_count_sub(int val)
2528 __preempt_count_sub(val); 2603 __preempt_count_sub(val);
2529} 2604}
2530EXPORT_SYMBOL(preempt_count_sub); 2605EXPORT_SYMBOL(preempt_count_sub);
2606NOKPROBE_SYMBOL(preempt_count_sub);
2531 2607
2532#endif 2608#endif
2533 2609
@@ -2592,8 +2668,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
2592 if (likely(prev->sched_class == class && 2668 if (likely(prev->sched_class == class &&
2593 rq->nr_running == rq->cfs.h_nr_running)) { 2669 rq->nr_running == rq->cfs.h_nr_running)) {
2594 p = fair_sched_class.pick_next_task(rq, prev); 2670 p = fair_sched_class.pick_next_task(rq, prev);
2595 if (likely(p && p != RETRY_TASK)) 2671 if (unlikely(p == RETRY_TASK))
2596 return p; 2672 goto again;
2673
2674 /* assumes fair_sched_class->next == idle_sched_class */
2675 if (unlikely(!p))
2676 p = idle_sched_class.pick_next_task(rq, prev);
2677
2678 return p;
2597 } 2679 }
2598 2680
2599again: 2681again:
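
The fast path in pick_next_task() used to drop into the full class walk whenever pick_next_task_fair() came back empty; now it falls straight through to the idle class, which is only valid because the fair class is assumed to sit directly above idle in the class ordering (the comment in the hunk states that assumption). A toy standalone model of that shortcut:

#include <stdio.h>

struct task { const char *name; };

static struct task idle_task = { "swapper/0" };

/* Toy pick functions; returning NULL means "this class has nothing to run". */
static struct task *fair_pick(int nr_fair_runnable)
{
        static struct task t = { "cfs_task" };
        return nr_fair_runnable ? &t : NULL;
}

static struct task *idle_pick(void)
{
        return &idle_task;      /* the idle class always has something */
}

/* Fast path: only fair tasks are on the runqueue, so skip the full class walk. */
static struct task *pick_next_fast(int nr_fair_runnable)
{
        struct task *p = fair_pick(nr_fair_runnable);

        /* assumes the class right below fair is idle, as the hunk notes */
        return p ? p : idle_pick();
}

int main(void)
{
        printf("%s\n", pick_next_fast(1)->name);        /* cfs_task  */
        printf("%s\n", pick_next_fast(0)->name);        /* swapper/0 */
        return 0;
}
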
@@ -2804,6 +2886,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
2804 barrier(); 2886 barrier();
2805 } while (need_resched()); 2887 } while (need_resched());
2806} 2888}
2889NOKPROBE_SYMBOL(preempt_schedule);
2807EXPORT_SYMBOL(preempt_schedule); 2890EXPORT_SYMBOL(preempt_schedule);
2808#endif /* CONFIG_PREEMPT */ 2891#endif /* CONFIG_PREEMPT */
2809 2892
@@ -2996,7 +3079,7 @@ EXPORT_SYMBOL(set_user_nice);
2996int can_nice(const struct task_struct *p, const int nice) 3079int can_nice(const struct task_struct *p, const int nice)
2997{ 3080{
2998 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3081 /* convert nice value [19,-20] to rlimit style value [1,40] */
2999 int nice_rlim = 20 - nice; 3082 int nice_rlim = nice_to_rlimit(nice);
3000 3083
3001 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3084 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3002 capable(CAP_SYS_NICE)); 3085 capable(CAP_SYS_NICE));
@@ -3020,17 +3103,10 @@ SYSCALL_DEFINE1(nice, int, increment)
3020 * We don't have to worry. Conceptually one call occurs first 3103 * We don't have to worry. Conceptually one call occurs first
3021 * and we have a single winner. 3104 * and we have a single winner.
3022 */ 3105 */
3023 if (increment < -40) 3106 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3024 increment = -40;
3025 if (increment > 40)
3026 increment = 40;
3027
3028 nice = task_nice(current) + increment; 3107 nice = task_nice(current) + increment;
3029 if (nice < MIN_NICE)
3030 nice = MIN_NICE;
3031 if (nice > MAX_NICE)
3032 nice = MAX_NICE;
3033 3108
3109 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3034 if (increment < 0 && !can_nice(current, nice)) 3110 if (increment < 0 && !can_nice(current, nice))
3035 return -EPERM; 3111 return -EPERM;
3036 3112
@@ -3124,6 +3200,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3124 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3200 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3125 dl_se->dl_throttled = 0; 3201 dl_se->dl_throttled = 0;
3126 dl_se->dl_new = 1; 3202 dl_se->dl_new = 1;
3203 dl_se->dl_yielded = 0;
3127} 3204}
3128 3205
3129static void __setscheduler_params(struct task_struct *p, 3206static void __setscheduler_params(struct task_struct *p,
@@ -3188,17 +3265,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
3188 * We ask for the deadline not being zero, and greater or equal 3265 * We ask for the deadline not being zero, and greater or equal
3189 * than the runtime, as well as the period of being zero or 3266 * than the runtime, as well as the period of being zero or
3190 * greater than deadline. Furthermore, we have to be sure that 3267 * greater than deadline. Furthermore, we have to be sure that
3191 * user parameters are above the internal resolution (1us); we 3268 * user parameters are above the internal resolution of 1us (we
3192 * check sched_runtime only since it is always the smaller one. 3269 * check sched_runtime only since it is always the smaller one) and
3270 * below 2^63 ns (we have to check both sched_deadline and
3271 * sched_period, as the latter can be zero).
3193 */ 3272 */
3194static bool 3273static bool
3195__checkparam_dl(const struct sched_attr *attr) 3274__checkparam_dl(const struct sched_attr *attr)
3196{ 3275{
3197 return attr && attr->sched_deadline != 0 && 3276 /* deadline != 0 */
3198 (attr->sched_period == 0 || 3277 if (attr->sched_deadline == 0)
3199 (s64)(attr->sched_period - attr->sched_deadline) >= 0) && 3278 return false;
3200 (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && 3279
3201 attr->sched_runtime >= (2 << (DL_SCALE - 1)); 3280 /*
3281 * Since we truncate DL_SCALE bits, make sure we're at least
3282 * that big.
3283 */
3284 if (attr->sched_runtime < (1ULL << DL_SCALE))
3285 return false;
3286
3287 /*
3288 * Since we use the MSB for wrap-around and sign issues, make
3289 * sure it's not set (mind that period can be equal to zero).
3290 */
3291 if (attr->sched_deadline & (1ULL << 63) ||
3292 attr->sched_period & (1ULL << 63))
3293 return false;
3294
3295 /* runtime <= deadline <= period (if period != 0) */
3296 if ((attr->sched_period != 0 &&
3297 attr->sched_period < attr->sched_deadline) ||
3298 attr->sched_deadline < attr->sched_runtime)
3299 return false;
3300
3301 return true;
3202} 3302}
3203 3303
3204/* 3304/*
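
The rewritten __checkparam_dl() spells the SCHED_DEADLINE admission rules out one by one: a non-zero deadline, a runtime of at least 1<<DL_SCALE nanoseconds (the low bits that later get truncated), bit 63 clear in both deadline and period so signed wrap-around comparisons stay meaningful, and runtime <= deadline <= period, where a zero period means the period equals the deadline. A standalone version of the same predicate with a couple of worked parameter sets; the DL_SCALE value of 10 is my reading of the scheduler headers and should be treated as an assumption:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DL_SCALE 10     /* assumed value; the kernel truncates this many low bits */

struct dl_attr {
        uint64_t runtime;       /* nanoseconds */
        uint64_t deadline;
        uint64_t period;        /* 0 means "period == deadline" */
};

static bool checkparam_dl(const struct dl_attr *a)
{
        if (a->deadline == 0)
                return false;
        if (a->runtime < (1ULL << DL_SCALE))            /* below internal resolution */
                return false;
        if ((a->deadline | a->period) & (1ULL << 63))   /* MSB reserved for wrap/sign */
                return false;
        if ((a->period != 0 && a->period < a->deadline) ||
            a->deadline < a->runtime)                   /* runtime <= deadline <= period */
                return false;
        return true;
}

int main(void)
{
        struct dl_attr ok  = { .runtime = 10000000, .deadline = 30000000, .period = 100000000 };
        struct dl_attr bad = { .runtime = 30000000, .deadline = 10000000, .period = 100000000 };

        printf("10ms/30ms/100ms: %s\n", checkparam_dl(&ok)  ? "accepted" : "rejected");
        printf("30ms/10ms/100ms: %s\n", checkparam_dl(&bad) ? "accepted" : "rejected");
        return 0;
}
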
@@ -3596,13 +3696,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
3596 */ 3696 */
3597 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 3697 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3598 3698
3599out: 3699 return 0;
3600 return ret;
3601 3700
3602err_size: 3701err_size:
3603 put_user(sizeof(*attr), &uattr->size); 3702 put_user(sizeof(*attr), &uattr->size);
3604 ret = -E2BIG; 3703 return -E2BIG;
3605 goto out;
3606} 3704}
3607 3705
3608/** 3706/**
@@ -3639,6 +3737,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3639 * sys_sched_setattr - same as above, but with extended sched_attr 3737 * sys_sched_setattr - same as above, but with extended sched_attr
3640 * @pid: the pid in question. 3738 * @pid: the pid in question.
3641 * @uattr: structure containing the extended parameters. 3739 * @uattr: structure containing the extended parameters.
3740 * @flags: for future extension.
3642 */ 3741 */
3643SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 3742SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3644 unsigned int, flags) 3743 unsigned int, flags)
@@ -3650,8 +3749,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3650 if (!uattr || pid < 0 || flags) 3749 if (!uattr || pid < 0 || flags)
3651 return -EINVAL; 3750 return -EINVAL;
3652 3751
3653 if (sched_copy_attr(uattr, &attr)) 3752 retval = sched_copy_attr(uattr, &attr);
3654 return -EFAULT; 3753 if (retval)
3754 return retval;
3755
3756 if ((int)attr.sched_policy < 0)
3757 return -EINVAL;
3655 3758
3656 rcu_read_lock(); 3759 rcu_read_lock();
3657 retval = -ESRCH; 3760 retval = -ESRCH;
@@ -3701,7 +3804,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3701 */ 3804 */
3702SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3805SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3703{ 3806{
3704 struct sched_param lp; 3807 struct sched_param lp = { .sched_priority = 0 };
3705 struct task_struct *p; 3808 struct task_struct *p;
3706 int retval; 3809 int retval;
3707 3810
@@ -3718,11 +3821,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3718 if (retval) 3821 if (retval)
3719 goto out_unlock; 3822 goto out_unlock;
3720 3823
3721 if (task_has_dl_policy(p)) { 3824 if (task_has_rt_policy(p))
3722 retval = -EINVAL; 3825 lp.sched_priority = p->rt_priority;
3723 goto out_unlock;
3724 }
3725 lp.sched_priority = p->rt_priority;
3726 rcu_read_unlock(); 3826 rcu_read_unlock();
3727 3827
3728 /* 3828 /*
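
sched_getparam() now starts from struct sched_param lp = { .sched_priority = 0 } and only overwrites the priority for RT tasks, so SCHED_DEADLINE and SCHED_NORMAL tasks get a well-defined 0 copied out instead of the old -EINVAL (deadline) or whatever happened to be on the stack. A small sketch of why initializing the whole struct matters when it is later copied to the caller wholesale; the struct and helper below are illustrative only:

#include <stdio.h>

struct sched_param_like {
        int sched_priority;
        /* a real ABI struct may grow members; none of them must leak garbage */
};

static void fill_param(struct sched_param_like *lp, int is_rt, int rt_priority)
{
        if (is_rt)
                lp->sched_priority = rt_priority;
        /* non-RT: nothing written here, so the caller's initializer is what the user sees */
}

int main(void)
{
        /* a designated initializer zero-initializes every member not named */
        struct sched_param_like lp = { .sched_priority = 0 };

        fill_param(&lp, 0 /* a SCHED_DEADLINE or SCHED_NORMAL task */, 0);
        printf("reported priority: %d\n", lp.sched_priority);   /* always 0, never junk */
        return 0;
}
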
@@ -3760,7 +3860,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3760 3860
3761 for (; addr < end; addr++) { 3861 for (; addr < end; addr++) {
3762 if (*addr) 3862 if (*addr)
3763 goto err_size; 3863 return -EFBIG;
3764 } 3864 }
3765 3865
3766 attr->size = usize; 3866 attr->size = usize;
@@ -3770,12 +3870,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
3770 if (ret) 3870 if (ret)
3771 return -EFAULT; 3871 return -EFAULT;
3772 3872
3773out: 3873 return 0;
3774 return ret;
3775
3776err_size:
3777 ret = -E2BIG;
3778 goto out;
3779} 3874}
3780 3875
3781/** 3876/**
@@ -3783,6 +3878,7 @@ err_size:
3783 * @pid: the pid in question. 3878 * @pid: the pid in question.
3784 * @uattr: structure containing the extended parameters. 3879 * @uattr: structure containing the extended parameters.
3785 * @size: sizeof(attr) for fwd/bwd comp. 3880 * @size: sizeof(attr) for fwd/bwd comp.
3881 * @flags: for future extension.
3786 */ 3882 */
3787SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3883SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3788 unsigned int, size, unsigned int, flags) 3884 unsigned int, size, unsigned int, flags)
@@ -4145,7 +4241,7 @@ EXPORT_SYMBOL(yield);
4145 * false (0) if we failed to boost the target. 4241 * false (0) if we failed to boost the target.
4146 * -ESRCH if there's no task to yield to. 4242 * -ESRCH if there's no task to yield to.
4147 */ 4243 */
4148bool __sched yield_to(struct task_struct *p, bool preempt) 4244int __sched yield_to(struct task_struct *p, bool preempt)
4149{ 4245{
4150 struct task_struct *curr = current; 4246 struct task_struct *curr = current;
4151 struct rq *rq, *p_rq; 4247 struct rq *rq, *p_rq;
@@ -5039,11 +5135,20 @@ static struct notifier_block migration_notifier = {
5039 .priority = CPU_PRI_MIGRATION, 5135 .priority = CPU_PRI_MIGRATION,
5040}; 5136};
5041 5137
5138static void __cpuinit set_cpu_rq_start_time(void)
5139{
5140 int cpu = smp_processor_id();
5141 struct rq *rq = cpu_rq(cpu);
5142 rq->age_stamp = sched_clock_cpu(cpu);
5143}
5144
5042static int sched_cpu_active(struct notifier_block *nfb, 5145static int sched_cpu_active(struct notifier_block *nfb,
5043 unsigned long action, void *hcpu) 5146 unsigned long action, void *hcpu)
5044{ 5147{
5045 switch (action & ~CPU_TASKS_FROZEN) { 5148 switch (action & ~CPU_TASKS_FROZEN) {
5046 case CPU_STARTING: 5149 case CPU_STARTING:
5150 set_cpu_rq_start_time();
5151 return NOTIFY_OK;
5047 case CPU_DOWN_FAILED: 5152 case CPU_DOWN_FAILED:
5048 set_cpu_active((long)hcpu, true); 5153 set_cpu_active((long)hcpu, true);
5049 return NOTIFY_OK; 5154 return NOTIFY_OK;
@@ -5162,14 +5267,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5162 } 5267 }
5163 5268
5164 /* 5269 /*
5165 * Even though we initialize ->power to something semi-sane, 5270 * Even though we initialize ->capacity to something semi-sane,
5166 * we leave power_orig unset. This allows us to detect if 5271 * we leave capacity_orig unset. This allows us to detect if
5167 * domain iteration is still funny without causing /0 traps. 5272 * domain iteration is still funny without causing /0 traps.
5168 */ 5273 */
5169 if (!group->sgp->power_orig) { 5274 if (!group->sgc->capacity_orig) {
5170 printk(KERN_CONT "\n"); 5275 printk(KERN_CONT "\n");
5171 printk(KERN_ERR "ERROR: domain->cpu_power not " 5276 printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
5172 "set\n");
5173 break; 5277 break;
5174 } 5278 }
5175 5279
@@ -5191,9 +5295,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5191 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5295 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5192 5296
5193 printk(KERN_CONT " %s", str); 5297 printk(KERN_CONT " %s", str);
5194 if (group->sgp->power != SCHED_POWER_SCALE) { 5298 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
5195 printk(KERN_CONT " (cpu_power = %d)", 5299 printk(KERN_CONT " (cpu_capacity = %d)",
5196 group->sgp->power); 5300 group->sgc->capacity);
5197 } 5301 }
5198 5302
5199 group = group->next; 5303 group = group->next;
@@ -5251,8 +5355,9 @@ static int sd_degenerate(struct sched_domain *sd)
5251 SD_BALANCE_NEWIDLE | 5355 SD_BALANCE_NEWIDLE |
5252 SD_BALANCE_FORK | 5356 SD_BALANCE_FORK |
5253 SD_BALANCE_EXEC | 5357 SD_BALANCE_EXEC |
5254 SD_SHARE_CPUPOWER | 5358 SD_SHARE_CPUCAPACITY |
5255 SD_SHARE_PKG_RESOURCES)) { 5359 SD_SHARE_PKG_RESOURCES |
5360 SD_SHARE_POWERDOMAIN)) {
5256 if (sd->groups != sd->groups->next) 5361 if (sd->groups != sd->groups->next)
5257 return 0; 5362 return 0;
5258 } 5363 }
@@ -5281,9 +5386,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5281 SD_BALANCE_NEWIDLE | 5386 SD_BALANCE_NEWIDLE |
5282 SD_BALANCE_FORK | 5387 SD_BALANCE_FORK |
5283 SD_BALANCE_EXEC | 5388 SD_BALANCE_EXEC |
5284 SD_SHARE_CPUPOWER | 5389 SD_SHARE_CPUCAPACITY |
5285 SD_SHARE_PKG_RESOURCES | 5390 SD_SHARE_PKG_RESOURCES |
5286 SD_PREFER_SIBLING); 5391 SD_PREFER_SIBLING |
5392 SD_SHARE_POWERDOMAIN);
5287 if (nr_node_ids == 1) 5393 if (nr_node_ids == 1)
5288 pflags &= ~SD_SERIALIZE; 5394 pflags &= ~SD_SERIALIZE;
5289 } 5395 }
@@ -5405,7 +5511,7 @@ static struct root_domain *alloc_rootdomain(void)
5405 return rd; 5511 return rd;
5406} 5512}
5407 5513
5408static void free_sched_groups(struct sched_group *sg, int free_sgp) 5514static void free_sched_groups(struct sched_group *sg, int free_sgc)
5409{ 5515{
5410 struct sched_group *tmp, *first; 5516 struct sched_group *tmp, *first;
5411 5517
@@ -5416,8 +5522,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
5416 do { 5522 do {
5417 tmp = sg->next; 5523 tmp = sg->next;
5418 5524
5419 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) 5525 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
5420 kfree(sg->sgp); 5526 kfree(sg->sgc);
5421 5527
5422 kfree(sg); 5528 kfree(sg);
5423 sg = tmp; 5529 sg = tmp;
@@ -5435,7 +5541,7 @@ static void free_sched_domain(struct rcu_head *rcu)
5435 if (sd->flags & SD_OVERLAP) { 5541 if (sd->flags & SD_OVERLAP) {
5436 free_sched_groups(sd->groups, 1); 5542 free_sched_groups(sd->groups, 1);
5437 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5543 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5438 kfree(sd->groups->sgp); 5544 kfree(sd->groups->sgc);
5439 kfree(sd->groups); 5545 kfree(sd->groups);
5440 } 5546 }
5441 kfree(sd); 5547 kfree(sd);
@@ -5557,17 +5663,6 @@ static int __init isolated_cpu_setup(char *str)
5557 5663
5558__setup("isolcpus=", isolated_cpu_setup); 5664__setup("isolcpus=", isolated_cpu_setup);
5559 5665
5560static const struct cpumask *cpu_cpu_mask(int cpu)
5561{
5562 return cpumask_of_node(cpu_to_node(cpu));
5563}
5564
5565struct sd_data {
5566 struct sched_domain **__percpu sd;
5567 struct sched_group **__percpu sg;
5568 struct sched_group_power **__percpu sgp;
5569};
5570
5571struct s_data { 5666struct s_data {
5572 struct sched_domain ** __percpu sd; 5667 struct sched_domain ** __percpu sd;
5573 struct root_domain *rd; 5668 struct root_domain *rd;
@@ -5580,21 +5675,6 @@ enum s_alloc {
5580 sa_none, 5675 sa_none,
5581}; 5676};
5582 5677
5583struct sched_domain_topology_level;
5584
5585typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5586typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5587
5588#define SDTL_OVERLAP 0x01
5589
5590struct sched_domain_topology_level {
5591 sched_domain_init_f init;
5592 sched_domain_mask_f mask;
5593 int flags;
5594 int numa_level;
5595 struct sd_data data;
5596};
5597
5598/* 5678/*
5599 * Build an iteration mask that can exclude certain CPUs from the upwards 5679 * Build an iteration mask that can exclude certain CPUs from the upwards
5600 * domain traversal. 5680 * domain traversal.
@@ -5672,17 +5752,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5672 5752
5673 cpumask_or(covered, covered, sg_span); 5753 cpumask_or(covered, covered, sg_span);
5674 5754
5675 sg->sgp = *per_cpu_ptr(sdd->sgp, i); 5755 sg->sgc = *per_cpu_ptr(sdd->sgc, i);
5676 if (atomic_inc_return(&sg->sgp->ref) == 1) 5756 if (atomic_inc_return(&sg->sgc->ref) == 1)
5677 build_group_mask(sd, sg); 5757 build_group_mask(sd, sg);
5678 5758
5679 /* 5759 /*
5680 * Initialize sgp->power such that even if we mess up the 5760 * Initialize sgc->capacity such that even if we mess up the
5681 * domains and no possible iteration will get us here, we won't 5761 * domains and no possible iteration will get us here, we won't
5682 * die on a /0 trap. 5762 * die on a /0 trap.
5683 */ 5763 */
5684 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5764 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
5685 sg->sgp->power_orig = sg->sgp->power; 5765 sg->sgc->capacity_orig = sg->sgc->capacity;
5686 5766
5687 /* 5767 /*
5688 * Make sure the first group of this domain contains the 5768 * Make sure the first group of this domain contains the
@@ -5720,8 +5800,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5720 5800
5721 if (sg) { 5801 if (sg) {
5722 *sg = *per_cpu_ptr(sdd->sg, cpu); 5802 *sg = *per_cpu_ptr(sdd->sg, cpu);
5723 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); 5803 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
5724 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ 5804 atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
5725 } 5805 }
5726 5806
5727 return cpu; 5807 return cpu;
@@ -5730,7 +5810,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5730/* 5810/*
5731 * build_sched_groups will build a circular linked list of the groups 5811 * build_sched_groups will build a circular linked list of the groups
5732 * covered by the given span, and will set each group's ->cpumask correctly, 5812 * covered by the given span, and will set each group's ->cpumask correctly,
5733 * and ->cpu_power to 0. 5813 * and ->cpu_capacity to 0.
5734 * 5814 *
5735 * Assumes the sched_domain tree is fully constructed 5815 * Assumes the sched_domain tree is fully constructed
5736 */ 5816 */
@@ -5762,8 +5842,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5762 continue; 5842 continue;
5763 5843
5764 group = get_group(i, sdd, &sg); 5844 group = get_group(i, sdd, &sg);
5765 cpumask_clear(sched_group_cpus(sg));
5766 sg->sgp->power = 0;
5767 cpumask_setall(sched_group_mask(sg)); 5845 cpumask_setall(sched_group_mask(sg));
5768 5846
5769 for_each_cpu(j, span) { 5847 for_each_cpu(j, span) {
@@ -5786,16 +5864,16 @@ build_sched_groups(struct sched_domain *sd, int cpu)
5786} 5864}
5787 5865
5788/* 5866/*
5789 * Initialize sched groups cpu_power. 5867 * Initialize sched groups cpu_capacity.
5790 * 5868 *
5791 * cpu_power indicates the capacity of sched group, which is used while 5869 * cpu_capacity indicates the capacity of sched group, which is used while
5792 * distributing the load between different sched groups in a sched domain. 5870 * distributing the load between different sched groups in a sched domain.
5793 * Typically cpu_power for all the groups in a sched domain will be same unless 5871 * Typically cpu_capacity for all the groups in a sched domain will be same
5794 * there are asymmetries in the topology. If there are asymmetries, group 5872 * unless there are asymmetries in the topology. If there are asymmetries,
5795 * having more cpu_power will pickup more load compared to the group having 5873 * group having more cpu_capacity will pickup more load compared to the
5796 * less cpu_power. 5874 * group having less cpu_capacity.
5797 */ 5875 */
5798static void init_sched_groups_power(int cpu, struct sched_domain *sd) 5876static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
5799{ 5877{
5800 struct sched_group *sg = sd->groups; 5878 struct sched_group *sg = sd->groups;
5801 5879
@@ -5809,13 +5887,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5809 if (cpu != group_balance_cpu(sg)) 5887 if (cpu != group_balance_cpu(sg))
5810 return; 5888 return;
5811 5889
5812 update_group_power(sd, cpu); 5890 update_group_capacity(sd, cpu);
5813 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5891 atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
5814}
5815
5816int __weak arch_sd_sibling_asym_packing(void)
5817{
5818 return 0*SD_ASYM_PACKING;
5819} 5892}
5820 5893
5821/* 5894/*
@@ -5823,34 +5896,6 @@ int __weak arch_sd_sibling_asym_packing(void)
5823 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5896 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5824 */ 5897 */
5825 5898
5826#ifdef CONFIG_SCHED_DEBUG
5827# define SD_INIT_NAME(sd, type) sd->name = #type
5828#else
5829# define SD_INIT_NAME(sd, type) do { } while (0)
5830#endif
5831
5832#define SD_INIT_FUNC(type) \
5833static noinline struct sched_domain * \
5834sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5835{ \
5836 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5837 *sd = SD_##type##_INIT; \
5838 SD_INIT_NAME(sd, type); \
5839 sd->private = &tl->data; \
5840 return sd; \
5841}
5842
5843SD_INIT_FUNC(CPU)
5844#ifdef CONFIG_SCHED_SMT
5845 SD_INIT_FUNC(SIBLING)
5846#endif
5847#ifdef CONFIG_SCHED_MC
5848 SD_INIT_FUNC(MC)
5849#endif
5850#ifdef CONFIG_SCHED_BOOK
5851 SD_INIT_FUNC(BOOK)
5852#endif
5853
5854static int default_relax_domain_level = -1; 5899static int default_relax_domain_level = -1;
5855int sched_domain_level_max; 5900int sched_domain_level_max;
5856 5901
@@ -5934,101 +5979,158 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
5934 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 5979 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
5935 *per_cpu_ptr(sdd->sg, cpu) = NULL; 5980 *per_cpu_ptr(sdd->sg, cpu) = NULL;
5936 5981
5937 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) 5982 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
5938 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5983 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
5939} 5984}
5940 5985
5941#ifdef CONFIG_SCHED_SMT
5942static const struct cpumask *cpu_smt_mask(int cpu)
5943{
5944 return topology_thread_cpumask(cpu);
5945}
5946#endif
5947
5948/*
5949 * Topology list, bottom-up.
5950 */
5951static struct sched_domain_topology_level default_topology[] = {
5952#ifdef CONFIG_SCHED_SMT
5953 { sd_init_SIBLING, cpu_smt_mask, },
5954#endif
5955#ifdef CONFIG_SCHED_MC
5956 { sd_init_MC, cpu_coregroup_mask, },
5957#endif
5958#ifdef CONFIG_SCHED_BOOK
5959 { sd_init_BOOK, cpu_book_mask, },
5960#endif
5961 { sd_init_CPU, cpu_cpu_mask, },
5962 { NULL, },
5963};
5964
5965static struct sched_domain_topology_level *sched_domain_topology = default_topology;
5966
5967#define for_each_sd_topology(tl) \
5968 for (tl = sched_domain_topology; tl->init; tl++)
5969
5970#ifdef CONFIG_NUMA 5986#ifdef CONFIG_NUMA
5971
5972static int sched_domains_numa_levels; 5987static int sched_domains_numa_levels;
5973static int *sched_domains_numa_distance; 5988static int *sched_domains_numa_distance;
5974static struct cpumask ***sched_domains_numa_masks; 5989static struct cpumask ***sched_domains_numa_masks;
5975static int sched_domains_curr_level; 5990static int sched_domains_curr_level;
5991#endif
5976 5992
5977static inline int sd_local_flags(int level) 5993/*
5978{ 5994 * SD_flags allowed in topology descriptions.
5979 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 5995 *
5980 return 0; 5996 * SD_SHARE_CPUCAPACITY - describes SMT topologies
5981 5997 * SD_SHARE_PKG_RESOURCES - describes shared caches
5982 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 5998 * SD_NUMA - describes NUMA topologies
5983} 5999 * SD_SHARE_POWERDOMAIN - describes shared power domain
6000 *
6001 * Odd one out:
6002 * SD_ASYM_PACKING - describes SMT quirks
6003 */
6004#define TOPOLOGY_SD_FLAGS \
6005 (SD_SHARE_CPUCAPACITY | \
6006 SD_SHARE_PKG_RESOURCES | \
6007 SD_NUMA | \
6008 SD_ASYM_PACKING | \
6009 SD_SHARE_POWERDOMAIN)
5984 6010
5985static struct sched_domain * 6011static struct sched_domain *
5986sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 6012sd_init(struct sched_domain_topology_level *tl, int cpu)
5987{ 6013{
5988 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6014 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
5989 int level = tl->numa_level; 6015 int sd_weight, sd_flags = 0;
5990 int sd_weight = cpumask_weight( 6016
5991 sched_domains_numa_masks[level][cpu_to_node(cpu)]); 6017#ifdef CONFIG_NUMA
6018 /*
6019 * Ugly hack to pass state to sd_numa_mask()...
6020 */
6021 sched_domains_curr_level = tl->numa_level;
6022#endif
6023
6024 sd_weight = cpumask_weight(tl->mask(cpu));
6025
6026 if (tl->sd_flags)
6027 sd_flags = (*tl->sd_flags)();
6028 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
6029 "wrong sd_flags in topology description\n"))
6030 sd_flags &= ~TOPOLOGY_SD_FLAGS;
5992 6031
5993 *sd = (struct sched_domain){ 6032 *sd = (struct sched_domain){
5994 .min_interval = sd_weight, 6033 .min_interval = sd_weight,
5995 .max_interval = 2*sd_weight, 6034 .max_interval = 2*sd_weight,
5996 .busy_factor = 32, 6035 .busy_factor = 32,
5997 .imbalance_pct = 125, 6036 .imbalance_pct = 125,
5998 .cache_nice_tries = 2, 6037
5999 .busy_idx = 3, 6038 .cache_nice_tries = 0,
6000 .idle_idx = 2, 6039 .busy_idx = 0,
6040 .idle_idx = 0,
6001 .newidle_idx = 0, 6041 .newidle_idx = 0,
6002 .wake_idx = 0, 6042 .wake_idx = 0,
6003 .forkexec_idx = 0, 6043 .forkexec_idx = 0,
6004 6044
6005 .flags = 1*SD_LOAD_BALANCE 6045 .flags = 1*SD_LOAD_BALANCE
6006 | 1*SD_BALANCE_NEWIDLE 6046 | 1*SD_BALANCE_NEWIDLE
6007 | 0*SD_BALANCE_EXEC 6047 | 1*SD_BALANCE_EXEC
6008 | 0*SD_BALANCE_FORK 6048 | 1*SD_BALANCE_FORK
6009 | 0*SD_BALANCE_WAKE 6049 | 0*SD_BALANCE_WAKE
6010 | 0*SD_WAKE_AFFINE 6050 | 1*SD_WAKE_AFFINE
6011 | 0*SD_SHARE_CPUPOWER 6051 | 0*SD_SHARE_CPUCAPACITY
6012 | 0*SD_SHARE_PKG_RESOURCES 6052 | 0*SD_SHARE_PKG_RESOURCES
6013 | 1*SD_SERIALIZE 6053 | 0*SD_SERIALIZE
6014 | 0*SD_PREFER_SIBLING 6054 | 0*SD_PREFER_SIBLING
6015 | 1*SD_NUMA 6055 | 0*SD_NUMA
6016 | sd_local_flags(level) 6056 | sd_flags
6017 , 6057 ,
6058
6018 .last_balance = jiffies, 6059 .last_balance = jiffies,
6019 .balance_interval = sd_weight, 6060 .balance_interval = sd_weight,
6061 .smt_gain = 0,
6062 .max_newidle_lb_cost = 0,
6063 .next_decay_max_lb_cost = jiffies,
6064#ifdef CONFIG_SCHED_DEBUG
6065 .name = tl->name,
6066#endif
6020 }; 6067 };
6021 SD_INIT_NAME(sd, NUMA);
6022 sd->private = &tl->data;
6023 6068
6024 /* 6069 /*
6025 * Ugly hack to pass state to sd_numa_mask()... 6070 * Convert topological properties into behaviour.
6026 */ 6071 */
6027 sched_domains_curr_level = tl->numa_level; 6072
6073 if (sd->flags & SD_SHARE_CPUCAPACITY) {
6074 sd->imbalance_pct = 110;
6075 sd->smt_gain = 1178; /* ~15% */
6076
6077 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6078 sd->imbalance_pct = 117;
6079 sd->cache_nice_tries = 1;
6080 sd->busy_idx = 2;
6081
6082#ifdef CONFIG_NUMA
6083 } else if (sd->flags & SD_NUMA) {
6084 sd->cache_nice_tries = 2;
6085 sd->busy_idx = 3;
6086 sd->idle_idx = 2;
6087
6088 sd->flags |= SD_SERIALIZE;
6089 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
6090 sd->flags &= ~(SD_BALANCE_EXEC |
6091 SD_BALANCE_FORK |
6092 SD_WAKE_AFFINE);
6093 }
6094
6095#endif
6096 } else {
6097 sd->flags |= SD_PREFER_SIBLING;
6098 sd->cache_nice_tries = 1;
6099 sd->busy_idx = 2;
6100 sd->idle_idx = 1;
6101 }
6102
6103 sd->private = &tl->data;
6028 6104
6029 return sd; 6105 return sd;
6030} 6106}
6031 6107
6108/*
6109 * Topology list, bottom-up.
6110 */
6111static struct sched_domain_topology_level default_topology[] = {
6112#ifdef CONFIG_SCHED_SMT
6113 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
6114#endif
6115#ifdef CONFIG_SCHED_MC
6116 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
6117#endif
6118 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
6119 { NULL, },
6120};
6121
6122struct sched_domain_topology_level *sched_domain_topology = default_topology;
6123
6124#define for_each_sd_topology(tl) \
6125 for (tl = sched_domain_topology; tl->mask; tl++)
6126
6127void set_sched_topology(struct sched_domain_topology_level *tl)
6128{
6129 sched_domain_topology = tl;
6130}
6131
6132#ifdef CONFIG_NUMA
6133
6032static const struct cpumask *sd_numa_mask(int cpu) 6134static const struct cpumask *sd_numa_mask(int cpu)
6033{ 6135{
6034 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6136 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
@@ -6172,7 +6274,10 @@ static void sched_init_numa(void)
6172 } 6274 }
6173 } 6275 }
6174 6276
6175 tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 6277 /* Compute default topology size */
6278 for (i = 0; sched_domain_topology[i].mask; i++);
6279
6280 tl = kzalloc((i + level + 1) *
6176 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6281 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6177 if (!tl) 6282 if (!tl)
6178 return; 6283 return;
@@ -6180,18 +6285,19 @@ static void sched_init_numa(void)
6180 /* 6285 /*
6181 * Copy the default topology bits.. 6286 * Copy the default topology bits..
6182 */ 6287 */
6183 for (i = 0; default_topology[i].init; i++) 6288 for (i = 0; sched_domain_topology[i].mask; i++)
6184 tl[i] = default_topology[i]; 6289 tl[i] = sched_domain_topology[i];
6185 6290
6186 /* 6291 /*
6187 * .. and append 'j' levels of NUMA goodness. 6292 * .. and append 'j' levels of NUMA goodness.
6188 */ 6293 */
6189 for (j = 0; j < level; i++, j++) { 6294 for (j = 0; j < level; i++, j++) {
6190 tl[i] = (struct sched_domain_topology_level){ 6295 tl[i] = (struct sched_domain_topology_level){
6191 .init = sd_numa_init,
6192 .mask = sd_numa_mask, 6296 .mask = sd_numa_mask,
6297 .sd_flags = cpu_numa_flags,
6193 .flags = SDTL_OVERLAP, 6298 .flags = SDTL_OVERLAP,
6194 .numa_level = j, 6299 .numa_level = j,
6300 SD_INIT_NAME(NUMA)
6195 }; 6301 };
6196 } 6302 }
6197 6303
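
Taken together, the topology hunks replace the per-level sd_init_SIBLING/MC/BOOK/CPU constructors with one sd_init() that derives behaviour (imbalance_pct, cache_nice_tries, the busy/idle indices, SD_SERIALIZE and so on) from a per-level flags callback, whitelisted by TOPOLOGY_SD_FLAGS; a level is now described by a cpumask function, an optional sd_flags function and a debug name, and set_sched_topology() lets an architecture install its own table, which sched_init_numa() then extends with NUMA levels. A compact userspace model of the "table of levels with flag callbacks, walked bottom-up" shape; every name in it is mine:

#include <stdio.h>

#define SHARE_CPUCAPACITY   0x1 /* stand-in for SD_SHARE_CPUCAPACITY (SMT)   */
#define SHARE_PKG_RESOURCES 0x2 /* stand-in for SD_SHARE_PKG_RESOURCES (LLC) */

struct topo_level {
        const char *name;
        int (*sd_flags)(void);  /* optional: topology-derived behaviour flags */
};

static int smt_flags(void)  { return SHARE_CPUCAPACITY | SHARE_PKG_RESOURCES; }
static int core_flags(void) { return SHARE_PKG_RESOURCES; }

/* Bottom-up, like default_topology[]; a NULL name terminates the table. */
static struct topo_level my_topology[] = {
        { "SMT", smt_flags  },
        { "MC",  core_flags },
        { "DIE", NULL       },  /* no extra sharing flags at the package level */
        { NULL,  NULL       },
};

int main(void)
{
        for (struct topo_level *tl = my_topology; tl->name; tl++) {
                int flags = tl->sd_flags ? tl->sd_flags() : 0;

                /* sd_init() would turn these flags into imbalance_pct, idx values, ... */
                printf("level %-3s flags=%#x\n", tl->name, flags);
        }
        return 0;
}
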
@@ -6276,14 +6382,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6276 if (!sdd->sg) 6382 if (!sdd->sg)
6277 return -ENOMEM; 6383 return -ENOMEM;
6278 6384
6279 sdd->sgp = alloc_percpu(struct sched_group_power *); 6385 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
6280 if (!sdd->sgp) 6386 if (!sdd->sgc)
6281 return -ENOMEM; 6387 return -ENOMEM;
6282 6388
6283 for_each_cpu(j, cpu_map) { 6389 for_each_cpu(j, cpu_map) {
6284 struct sched_domain *sd; 6390 struct sched_domain *sd;
6285 struct sched_group *sg; 6391 struct sched_group *sg;
6286 struct sched_group_power *sgp; 6392 struct sched_group_capacity *sgc;
6287 6393
6288 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6394 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6289 GFP_KERNEL, cpu_to_node(j)); 6395 GFP_KERNEL, cpu_to_node(j));
@@ -6301,12 +6407,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6301 6407
6302 *per_cpu_ptr(sdd->sg, j) = sg; 6408 *per_cpu_ptr(sdd->sg, j) = sg;
6303 6409
6304 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), 6410 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
6305 GFP_KERNEL, cpu_to_node(j)); 6411 GFP_KERNEL, cpu_to_node(j));
6306 if (!sgp) 6412 if (!sgc)
6307 return -ENOMEM; 6413 return -ENOMEM;
6308 6414
6309 *per_cpu_ptr(sdd->sgp, j) = sgp; 6415 *per_cpu_ptr(sdd->sgc, j) = sgc;
6310 } 6416 }
6311 } 6417 }
6312 6418
@@ -6333,15 +6439,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
6333 6439
6334 if (sdd->sg) 6440 if (sdd->sg)
6335 kfree(*per_cpu_ptr(sdd->sg, j)); 6441 kfree(*per_cpu_ptr(sdd->sg, j));
6336 if (sdd->sgp) 6442 if (sdd->sgc)
6337 kfree(*per_cpu_ptr(sdd->sgp, j)); 6443 kfree(*per_cpu_ptr(sdd->sgc, j));
6338 } 6444 }
6339 free_percpu(sdd->sd); 6445 free_percpu(sdd->sd);
6340 sdd->sd = NULL; 6446 sdd->sd = NULL;
6341 free_percpu(sdd->sg); 6447 free_percpu(sdd->sg);
6342 sdd->sg = NULL; 6448 sdd->sg = NULL;
6343 free_percpu(sdd->sgp); 6449 free_percpu(sdd->sgc);
6344 sdd->sgp = NULL; 6450 sdd->sgc = NULL;
6345 } 6451 }
6346} 6452}
6347 6453
@@ -6349,7 +6455,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6349 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6455 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6350 struct sched_domain *child, int cpu) 6456 struct sched_domain *child, int cpu)
6351{ 6457{
6352 struct sched_domain *sd = tl->init(tl, cpu); 6458 struct sched_domain *sd = sd_init(tl, cpu);
6353 if (!sd) 6459 if (!sd)
6354 return child; 6460 return child;
6355 6461
@@ -6411,14 +6517,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6411 } 6517 }
6412 } 6518 }
6413 6519
6414 /* Calculate CPU power for physical packages and nodes */ 6520 /* Calculate CPU capacity for physical packages and nodes */
6415 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6521 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6416 if (!cpumask_test_cpu(i, cpu_map)) 6522 if (!cpumask_test_cpu(i, cpu_map))
6417 continue; 6523 continue;
6418 6524
6419 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6525 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6420 claim_allocations(i, sd); 6526 claim_allocations(i, sd);
6421 init_sched_groups_power(i, sd); 6527 init_sched_groups_capacity(i, sd);
6422 } 6528 }
6423 } 6529 }
6424 6530
@@ -6861,7 +6967,7 @@ void __init sched_init(void)
6861#ifdef CONFIG_SMP 6967#ifdef CONFIG_SMP
6862 rq->sd = NULL; 6968 rq->sd = NULL;
6863 rq->rd = NULL; 6969 rq->rd = NULL;
6864 rq->cpu_power = SCHED_POWER_SCALE; 6970 rq->cpu_capacity = SCHED_CAPACITY_SCALE;
6865 rq->post_schedule = 0; 6971 rq->post_schedule = 0;
6866 rq->active_balance = 0; 6972 rq->active_balance = 0;
6867 rq->next_balance = jiffies; 6973 rq->next_balance = jiffies;
@@ -6919,6 +7025,7 @@ void __init sched_init(void)
6919 if (cpu_isolated_map == NULL) 7025 if (cpu_isolated_map == NULL)
6920 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7026 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6921 idle_thread_set_boot_cpu(); 7027 idle_thread_set_boot_cpu();
7028 set_cpu_rq_start_time();
6922#endif 7029#endif
6923 init_sched_fair_class(); 7030 init_sched_fair_class();
6924 7031
@@ -7586,7 +7693,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7586static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 7693static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7587{ 7694{
7588 struct task_group *tg = css_tg(css); 7695 struct task_group *tg = css_tg(css);
7589 struct task_group *parent = css_tg(css_parent(css)); 7696 struct task_group *parent = css_tg(css->parent);
7590 7697
7591 if (parent) 7698 if (parent)
7592 sched_online_group(tg, parent); 7699 sched_online_group(tg, parent);
@@ -7717,8 +7824,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7717 /* restart the period timer (if active) to handle new period expiry */ 7824 /* restart the period timer (if active) to handle new period expiry */
7718 if (runtime_enabled && cfs_b->timer_active) { 7825 if (runtime_enabled && cfs_b->timer_active) {
7719 /* force a reprogram */ 7826 /* force a reprogram */
7720 cfs_b->timer_active = 0; 7827 __start_cfs_bandwidth(cfs_b, true);
7721 __start_cfs_bandwidth(cfs_b);
7722 } 7828 }
7723 raw_spin_unlock_irq(&cfs_b->lock); 7829 raw_spin_unlock_irq(&cfs_b->lock);
7724 7830
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index c143ee380e3a..9cf350c94ec4 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -46,7 +46,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk)
46 46
47static inline struct cpuacct *parent_ca(struct cpuacct *ca) 47static inline struct cpuacct *parent_ca(struct cpuacct *ca)
48{ 48{
49 return css_ca(css_parent(&ca->css)); 49 return css_ca(ca->css.parent);
50} 50}
51 51
52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); 52static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42b2d47..bd95963dae80 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/gfp.h> 14#include <linux/gfp.h>
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/slab.h>
16#include "cpudeadline.h" 17#include "cpudeadline.h"
17 18
18static inline int parent(int i) 19static inline int parent(int i)
@@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b)
39{ 40{
40 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; 41 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
41 42
42 swap(cp->elements[a], cp->elements[b]); 43 swap(cp->elements[a].cpu, cp->elements[b].cpu);
43 swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); 44 swap(cp->elements[a].dl , cp->elements[b].dl );
45
46 swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
44} 47}
45 48
46static void cpudl_heapify(struct cpudl *cp, int idx) 49static void cpudl_heapify(struct cpudl *cp, int idx)
@@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
140 WARN_ON(!cpu_present(cpu)); 143 WARN_ON(!cpu_present(cpu));
141 144
142 raw_spin_lock_irqsave(&cp->lock, flags); 145 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu]; 146 old_idx = cp->elements[cpu].idx;
144 if (!is_valid) { 147 if (!is_valid) {
145 /* remove item */ 148 /* remove item */
146 if (old_idx == IDX_INVALID) { 149 if (old_idx == IDX_INVALID) {
@@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
155 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; 158 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
156 cp->elements[old_idx].cpu = new_cpu; 159 cp->elements[old_idx].cpu = new_cpu;
157 cp->size--; 160 cp->size--;
158 cp->cpu_to_idx[new_cpu] = old_idx; 161 cp->elements[new_cpu].idx = old_idx;
159 cp->cpu_to_idx[cpu] = IDX_INVALID; 162 cp->elements[cpu].idx = IDX_INVALID;
160 while (old_idx > 0 && dl_time_before( 163 while (old_idx > 0 && dl_time_before(
161 cp->elements[parent(old_idx)].dl, 164 cp->elements[parent(old_idx)].dl,
162 cp->elements[old_idx].dl)) { 165 cp->elements[old_idx].dl)) {
@@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
173 cp->size++; 176 cp->size++;
174 cp->elements[cp->size - 1].dl = 0; 177 cp->elements[cp->size - 1].dl = 0;
175 cp->elements[cp->size - 1].cpu = cpu; 178 cp->elements[cp->size - 1].cpu = cpu;
176 cp->cpu_to_idx[cpu] = cp->size - 1; 179 cp->elements[cpu].idx = cp->size - 1;
177 cpudl_change_key(cp, cp->size - 1, dl); 180 cpudl_change_key(cp, cp->size - 1, dl);
178 cpumask_clear_cpu(cpu, cp->free_cpus); 181 cpumask_clear_cpu(cpu, cp->free_cpus);
179 } else { 182 } else {
@@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp)
195 memset(cp, 0, sizeof(*cp)); 198 memset(cp, 0, sizeof(*cp));
196 raw_spin_lock_init(&cp->lock); 199 raw_spin_lock_init(&cp->lock);
197 cp->size = 0; 200 cp->size = 0;
198 for (i = 0; i < NR_CPUS; i++) 201
199 cp->cpu_to_idx[i] = IDX_INVALID; 202 cp->elements = kcalloc(nr_cpu_ids,
200 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) 203 sizeof(struct cpudl_item),
204 GFP_KERNEL);
205 if (!cp->elements)
206 return -ENOMEM;
207
208 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
209 kfree(cp->elements);
201 return -ENOMEM; 210 return -ENOMEM;
211 }
212
213 for_each_possible_cpu(i)
214 cp->elements[i].idx = IDX_INVALID;
215
202 cpumask_setall(cp->free_cpus); 216 cpumask_setall(cp->free_cpus);
203 217
204 return 0; 218 return 0;
@@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp)
210 */ 224 */
211void cpudl_cleanup(struct cpudl *cp) 225void cpudl_cleanup(struct cpudl *cp)
212{ 226{
213 /* 227 free_cpumask_var(cp->free_cpus);
214 * nothing to do for the moment 228 kfree(cp->elements);
215 */
216} 229}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index a202789a412c..538c9796ad4a 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -5,17 +5,17 @@
5 5
6#define IDX_INVALID -1 6#define IDX_INVALID -1
7 7
8struct array_item { 8struct cpudl_item {
9 u64 dl; 9 u64 dl;
10 int cpu; 10 int cpu;
11 int idx;
11}; 12};
12 13
13struct cpudl { 14struct cpudl {
14 raw_spinlock_t lock; 15 raw_spinlock_t lock;
15 int size; 16 int size;
16 int cpu_to_idx[NR_CPUS];
17 struct array_item elements[NR_CPUS];
18 cpumask_var_t free_cpus; 17 cpumask_var_t free_cpus;
18 struct cpudl_item *elements;
19}; 19};
20 20
21 21
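
The cpudeadline changes trade two NR_CPUS-sized static arrays for a single array of cpudl_item sized by nr_cpu_ids and allocated in cpudl_init(), with the reverse heap index folded into the per-CPU element; cpudl_cleanup() now really frees things, and the cpupri diff below applies the same treatment to cpu_to_pri. A minimal userspace model of that allocate/initialize/tear-down pattern (the CPU count is faked with a constant):

#include <stdio.h>
#include <stdlib.h>

#define IDX_INVALID -1

struct cpudl_item {
        unsigned long long dl;
        int cpu;
        int idx;        /* reverse map: where this CPU currently sits in the heap */
};

struct cpudl {
        int size;
        struct cpudl_item *elements;    /* nr_cpu_ids entries, not NR_CPUS */
};

static int cpudl_init(struct cpudl *cp, int nr_cpu_ids)
{
        cp->size = 0;
        cp->elements = calloc(nr_cpu_ids, sizeof(*cp->elements));
        if (!cp->elements)
                return -1;      /* the kernel version returns -ENOMEM */

        for (int i = 0; i < nr_cpu_ids; i++)
                cp->elements[i].idx = IDX_INVALID;
        return 0;
}

static void cpudl_cleanup(struct cpudl *cp)
{
        free(cp->elements);     /* the real cleanup also frees the free_cpus mask */
        cp->elements = NULL;
}

int main(void)
{
        struct cpudl cp;

        if (cpudl_init(&cp, 8)) /* pretend this box has 8 possible CPUs */
                return 1;
        printf("cpu 3 heap index: %d\n", cp.elements[3].idx);   /* -1 */
        cpudl_cleanup(&cp);
        return 0;
}
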
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 8b836b376d91..981fcd7dc394 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -30,6 +30,7 @@
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/sched/rt.h> 32#include <linux/sched/rt.h>
33#include <linux/slab.h>
33#include "cpupri.h" 34#include "cpupri.h"
34 35
35/* Convert between a 140 based task->prio, and our 102 based cpupri */ 36/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 int idx = 0; 71 int idx = 0;
71 int task_pri = convert_prio(p->prio); 72 int task_pri = convert_prio(p->prio);
72 73
73 if (task_pri >= MAX_RT_PRIO) 74 BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
74 return 0;
75 75
76 for (idx = 0; idx < task_pri; idx++) { 76 for (idx = 0; idx < task_pri; idx++) {
77 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 77 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
@@ -165,7 +165,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
165 * do a write memory barrier, and then update the count, to 165 * do a write memory barrier, and then update the count, to
166 * make sure the vector is visible when count is set. 166 * make sure the vector is visible when count is set.
167 */ 167 */
168 smp_mb__before_atomic_inc(); 168 smp_mb__before_atomic();
169 atomic_inc(&(vec)->count); 169 atomic_inc(&(vec)->count);
170 do_mb = 1; 170 do_mb = 1;
171 } 171 }
@@ -185,14 +185,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
185 * the new priority vec. 185 * the new priority vec.
186 */ 186 */
187 if (do_mb) 187 if (do_mb)
188 smp_mb__after_atomic_inc(); 188 smp_mb__after_atomic();
189 189
190 /* 190 /*
191 * When removing from the vector, we decrement the counter first 191 * When removing from the vector, we decrement the counter first
192 * do a memory barrier and then clear the mask. 192 * do a memory barrier and then clear the mask.
193 */ 193 */
194 atomic_dec(&(vec)->count); 194 atomic_dec(&(vec)->count);
195 smp_mb__after_atomic_inc(); 195 smp_mb__after_atomic();
196 cpumask_clear_cpu(cpu, vec->mask); 196 cpumask_clear_cpu(cpu, vec->mask);
197 } 197 }
198 198
@@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp)
219 goto cleanup; 219 goto cleanup;
220 } 220 }
221 221
222 cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
223 if (!cp->cpu_to_pri)
224 goto cleanup;
225
222 for_each_possible_cpu(i) 226 for_each_possible_cpu(i)
223 cp->cpu_to_pri[i] = CPUPRI_INVALID; 227 cp->cpu_to_pri[i] = CPUPRI_INVALID;
228
224 return 0; 229 return 0;
225 230
226cleanup: 231cleanup:
@@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)
237{ 242{
238 int i; 243 int i;
239 244
245 kfree(cp->cpu_to_pri);
240 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) 246 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
241 free_cpumask_var(cp->pri_to_cpu[i].mask); 247 free_cpumask_var(cp->pri_to_cpu[i].mask);
242} 248}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..6b033347fdfd 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,7 +17,7 @@ struct cpupri_vec {
17 17
18struct cpupri { 18struct cpupri {
19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
20 int cpu_to_pri[NR_CPUS]; 20 int *cpu_to_pri;
21}; 21};
22 22
23#ifdef CONFIG_SMP 23#ifdef CONFIG_SMP
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a95097cb4591..72fdf06ef865 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -332,50 +332,50 @@ out:
332 * softirq as those do not count in task exec_runtime any more. 332 * softirq as those do not count in task exec_runtime any more.
333 */ 333 */
334static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 334static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
335 struct rq *rq) 335 struct rq *rq, int ticks)
336{ 336{
337 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 337 cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
338 u64 cputime = (__force u64) cputime_one_jiffy;
338 u64 *cpustat = kcpustat_this_cpu->cpustat; 339 u64 *cpustat = kcpustat_this_cpu->cpustat;
339 340
340 if (steal_account_process_tick()) 341 if (steal_account_process_tick())
341 return; 342 return;
342 343
344 cputime *= ticks;
345 scaled *= ticks;
346
343 if (irqtime_account_hi_update()) { 347 if (irqtime_account_hi_update()) {
344 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; 348 cpustat[CPUTIME_IRQ] += cputime;
345 } else if (irqtime_account_si_update()) { 349 } else if (irqtime_account_si_update()) {
346 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; 350 cpustat[CPUTIME_SOFTIRQ] += cputime;
347 } else if (this_cpu_ksoftirqd() == p) { 351 } else if (this_cpu_ksoftirqd() == p) {
348 /* 352 /*
349 * ksoftirqd time does not get accounted in cpu_softirq_time. 353 * ksoftirqd time does not get accounted in cpu_softirq_time.
350 * So, we have to handle it separately here. 354 * So, we have to handle it separately here.
351 * Also, p->stime needs to be updated for ksoftirqd. 355 * Also, p->stime needs to be updated for ksoftirqd.
352 */ 356 */
353 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 357 __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
354 CPUTIME_SOFTIRQ);
355 } else if (user_tick) { 358 } else if (user_tick) {
356 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 359 account_user_time(p, cputime, scaled);
357 } else if (p == rq->idle) { 360 } else if (p == rq->idle) {
358 account_idle_time(cputime_one_jiffy); 361 account_idle_time(cputime);
359 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 362 } else if (p->flags & PF_VCPU) { /* System time or guest time */
360 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 363 account_guest_time(p, cputime, scaled);
361 } else { 364 } else {
362 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 365 __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
363 CPUTIME_SYSTEM);
364 } 366 }
365} 367}
366 368
367static void irqtime_account_idle_ticks(int ticks) 369static void irqtime_account_idle_ticks(int ticks)
368{ 370{
369 int i;
370 struct rq *rq = this_rq(); 371 struct rq *rq = this_rq();
371 372
372 for (i = 0; i < ticks; i++) 373 irqtime_account_process_tick(current, 0, rq, ticks);
373 irqtime_account_process_tick(current, 0, rq);
374} 374}
375#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 375#else /* CONFIG_IRQ_TIME_ACCOUNTING */
376static inline void irqtime_account_idle_ticks(int ticks) {} 376static inline void irqtime_account_idle_ticks(int ticks) {}
377static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 377static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
378 struct rq *rq) {} 378 struct rq *rq, int nr_ticks) {}
379#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 379#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
380 380
381/* 381/*
@@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
464 return; 464 return;
465 465
466 if (sched_clock_irqtime) { 466 if (sched_clock_irqtime) {
467 irqtime_account_process_tick(p, user_tick, rq); 467 irqtime_account_process_tick(p, user_tick, rq, 1);
468 return; 468 return;
469 } 469 }
470 470
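The irqtime path above now scales the one-jiffy quantum by the number of pending ticks and accounts it in a single call, instead of the idle path looping over the per-tick helper. A plain C sketch of the batching, with made-up constants:

#include <stdio.h>
#include <stdint.h>

#define ONE_JIFFY_NS 1000000ULL         /* stand-in for cputime_one_jiffy */

static uint64_t idle_ns;

static void irqtime_account_ticks(int ticks)
{
        uint64_t cputime = ONE_JIFFY_NS;
        uint64_t scaled  = ONE_JIFFY_NS;        /* cputime_to_scaled() elided */

        cputime *= ticks;                       /* scale once ... */
        scaled  *= ticks;
        /* ... then classify as irq/softirq/user/idle exactly as before */
        idle_ns += cputime;
        (void)scaled;
}

int main(void)
{
        irqtime_account_ticks(5);               /* one call, not a 5-iteration loop */
        printf("idle: %llu ns\n", (unsigned long long)idle_ns);
        return 0;
}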
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b08095786cb8..fc4f98b1258f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
57 dl_b->dl_runtime = runtime; 57 dl_b->dl_runtime = runtime;
58} 58}
59 59
60extern unsigned long to_ratio(u64 period, u64 runtime);
61
62void init_dl_bw(struct dl_bw *dl_b) 60void init_dl_bw(struct dl_bw *dl_b)
63{ 61{
64 raw_spin_lock_init(&dl_b->lock); 62 raw_spin_lock_init(&dl_b->lock);
@@ -348,12 +346,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
348 * entity. 346 * entity.
349 */ 347 */
350 if (dl_time_before(dl_se->deadline, rq_clock(rq))) { 348 if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
351 static bool lag_once = false; 349 printk_deferred_once("sched: DL replenish lagged to much\n");
352
353 if (!lag_once) {
354 lag_once = true;
355 printk_sched("sched: DL replenish lagged to much\n");
356 }
357 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 350 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
358 dl_se->runtime = pi_se->dl_runtime; 351 dl_se->runtime = pi_se->dl_runtime;
359 } 352 }
@@ -513,14 +506,22 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
513 struct sched_dl_entity, 506 struct sched_dl_entity,
514 dl_timer); 507 dl_timer);
515 struct task_struct *p = dl_task_of(dl_se); 508 struct task_struct *p = dl_task_of(dl_se);
516 struct rq *rq = task_rq(p); 509 struct rq *rq;
510again:
511 rq = task_rq(p);
517 raw_spin_lock(&rq->lock); 512 raw_spin_lock(&rq->lock);
518 513
514 if (rq != task_rq(p)) {
515 /* Task was moved, retrying. */
516 raw_spin_unlock(&rq->lock);
517 goto again;
518 }
519
519 /* 520 /*
520 * We need to take care of possible races here. In fact, the 521 * We need to take care of possible races here. In fact, the
521 * task might have changed its scheduling policy to something 522 * task might have changed its scheduling policy to something
522 * different from SCHED_DEADLINE or changed its reservation 523 * different from SCHED_DEADLINE or changed its reservation
523 * parameters (through sched_setscheduler()). 524 * parameters (through sched_setattr()).
524 */ 525 */
525 if (!dl_task(p) || dl_se->dl_new) 526 if (!dl_task(p) || dl_se->dl_new)
526 goto unlock; 527 goto unlock;
@@ -528,6 +529,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
528 sched_clock_tick(); 529 sched_clock_tick();
529 update_rq_clock(rq); 530 update_rq_clock(rq);
530 dl_se->dl_throttled = 0; 531 dl_se->dl_throttled = 0;
532 dl_se->dl_yielded = 0;
531 if (p->on_rq) { 533 if (p->on_rq) {
532 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
533 if (task_has_dl_policy(rq->curr)) 535 if (task_has_dl_policy(rq->curr))
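The retry loop added to dl_task_timer() needs the lock of the runqueue the task is currently on, so it re-checks task_rq() after acquiring the lock and starts over if the task migrated in the meantime. A userspace model of that pattern, with pthread mutexes standing in for raw_spin_lock() and GCC __atomic loads for the kernel's plain re-reads; all names are illustrative:

#include <pthread.h>

struct rq_model {
        pthread_mutex_t lock;
};

struct task_model {
        struct rq_model *rq;    /* may change under us until we hold rq->lock */
};

struct rq_model *lock_task_rq(struct task_model *p)
{
        struct rq_model *rq;

again:
        rq = __atomic_load_n(&p->rq, __ATOMIC_RELAXED);
        pthread_mutex_lock(&rq->lock);
        if (rq != __atomic_load_n(&p->rq, __ATOMIC_RELAXED)) {
                /* Task was moved, retrying. */
                pthread_mutex_unlock(&rq->lock);
                goto again;
        }
        return rq;              /* caller unlocks rq->lock when done */
}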
@@ -740,7 +742,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
740 742
741 WARN_ON(!dl_prio(prio)); 743 WARN_ON(!dl_prio(prio));
742 dl_rq->dl_nr_running++; 744 dl_rq->dl_nr_running++;
743 inc_nr_running(rq_of_dl_rq(dl_rq)); 745 add_nr_running(rq_of_dl_rq(dl_rq), 1);
744 746
745 inc_dl_deadline(dl_rq, deadline); 747 inc_dl_deadline(dl_rq, deadline);
746 inc_dl_migration(dl_se, dl_rq); 748 inc_dl_migration(dl_se, dl_rq);
@@ -754,7 +756,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
754 WARN_ON(!dl_prio(prio)); 756 WARN_ON(!dl_prio(prio));
755 WARN_ON(!dl_rq->dl_nr_running); 757 WARN_ON(!dl_rq->dl_nr_running);
756 dl_rq->dl_nr_running--; 758 dl_rq->dl_nr_running--;
757 dec_nr_running(rq_of_dl_rq(dl_rq)); 759 sub_nr_running(rq_of_dl_rq(dl_rq), 1);
758 760
759 dec_dl_deadline(dl_rq, dl_se->deadline); 761 dec_dl_deadline(dl_rq, dl_se->deadline);
760 dec_dl_migration(dl_se, dl_rq); 762 dec_dl_migration(dl_se, dl_rq);
@@ -893,10 +895,10 @@ static void yield_task_dl(struct rq *rq)
893 * We make the task go to sleep until its current deadline by 895 * We make the task go to sleep until its current deadline by
894 * forcing its runtime to zero. This way, update_curr_dl() stops 896 * forcing its runtime to zero. This way, update_curr_dl() stops
895 * it and the bandwidth timer will wake it up and will give it 897 * it and the bandwidth timer will wake it up and will give it
896 * new scheduling parameters (thanks to dl_new=1). 898 * new scheduling parameters (thanks to dl_yielded=1).
897 */ 899 */
898 if (p->dl.runtime > 0) { 900 if (p->dl.runtime > 0) {
899 rq->curr->dl.dl_new = 1; 901 rq->curr->dl.dl_yielded = 1;
900 p->dl.runtime = 0; 902 p->dl.runtime = 0;
901 } 903 }
902 update_curr_dl(rq); 904 update_curr_dl(rq);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 695f9773bb60..627b3c34b821 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
608 608
609 avg_atom = p->se.sum_exec_runtime; 609 avg_atom = p->se.sum_exec_runtime;
610 if (nr_switches) 610 if (nr_switches)
611 do_div(avg_atom, nr_switches); 611 avg_atom = div64_ul(avg_atom, nr_switches);
612 else 612 else
613 avg_atom = -1LL; 613 avg_atom = -1LL;
614 614
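The switch to div64_ul() is because do_div() expects a 32-bit divisor while nr_switches here is a 64-bit count. In plain userspace C the same computation is simply 64-bit division; the values below are illustrative:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t sum_exec_runtime = 123456789012ULL;    /* ns, made up */
        uint64_t nr_switches = 4096;
        uint64_t avg_atom = nr_switches ? sum_exec_runtime / nr_switches
                                        : (uint64_t)-1;

        printf("avg_atom = %llu ns\n", (unsigned long long)avg_atom);
        return 0;
}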
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7570dd969c28..fea7d3335e1f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1017,7 +1017,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1017static unsigned long weighted_cpuload(const int cpu); 1017static unsigned long weighted_cpuload(const int cpu);
1018static unsigned long source_load(int cpu, int type); 1018static unsigned long source_load(int cpu, int type);
1019static unsigned long target_load(int cpu, int type); 1019static unsigned long target_load(int cpu, int type);
1020static unsigned long power_of(int cpu); 1020static unsigned long capacity_of(int cpu);
1021static long effective_load(struct task_group *tg, int cpu, long wl, long wg); 1021static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1022 1022
1023/* Cached statistics for all CPUs within a node */ 1023/* Cached statistics for all CPUs within a node */
@@ -1026,11 +1026,11 @@ struct numa_stats {
1026 unsigned long load; 1026 unsigned long load;
1027 1027
1028 /* Total compute capacity of CPUs on a node */ 1028 /* Total compute capacity of CPUs on a node */
1029 unsigned long power; 1029 unsigned long compute_capacity;
1030 1030
1031 /* Approximate capacity in terms of runnable tasks on a node */ 1031 /* Approximate capacity in terms of runnable tasks on a node */
1032 unsigned long capacity; 1032 unsigned long task_capacity;
1033 int has_capacity; 1033 int has_free_capacity;
1034}; 1034};
1035 1035
1036/* 1036/*
@@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1046 1046
1047 ns->nr_running += rq->nr_running; 1047 ns->nr_running += rq->nr_running;
1048 ns->load += weighted_cpuload(cpu); 1048 ns->load += weighted_cpuload(cpu);
1049 ns->power += power_of(cpu); 1049 ns->compute_capacity += capacity_of(cpu);
1050 1050
1051 cpus++; 1051 cpus++;
1052 } 1052 }
@@ -1056,15 +1056,16 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1056 * the @ns structure is NULL'ed and task_numa_compare() will 1056 * the @ns structure is NULL'ed and task_numa_compare() will
1057 * not find this node attractive. 1057 * not find this node attractive.
1058 * 1058 *
1059 * We'll either bail at !has_capacity, or we'll detect a huge imbalance 1059 * We'll either bail at !has_free_capacity, or we'll detect a huge
1060 * and bail there. 1060 * imbalance and bail there.
1061 */ 1061 */
1062 if (!cpus) 1062 if (!cpus)
1063 return; 1063 return;
1064 1064
1065 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; 1065 ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
1066 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); 1066 ns->task_capacity =
1067 ns->has_capacity = (ns->nr_running < ns->capacity); 1067 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
1068 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1068} 1069}
1069 1070
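A worked example of the per-node statistics above with the renamed capacity terms; SCHED_CAPACITY_SCALE is 1024, and the load/capacity inputs below are made up:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

int main(void)
{
        unsigned long load = 3072;              /* sum of weighted_cpuload() */
        unsigned long compute_capacity = 4096;  /* sum of capacity_of(), four full CPUs */
        unsigned long nr_running = 3;

        unsigned long scaled_load = load * SCHED_CAPACITY_SCALE / compute_capacity;
        /* DIV_ROUND_CLOSEST(compute_capacity, SCHED_CAPACITY_SCALE) */
        unsigned long task_capacity =
                (compute_capacity + SCHED_CAPACITY_SCALE / 2) / SCHED_CAPACITY_SCALE;
        int has_free_capacity = nr_running < task_capacity;

        printf("load=%lu task_capacity=%lu has_free_capacity=%d\n",
               scaled_load, task_capacity, has_free_capacity);  /* 768, 4, 1 */
        return 0;
}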
1070struct task_numa_env { 1071struct task_numa_env {
@@ -1095,6 +1096,34 @@ static void task_numa_assign(struct task_numa_env *env,
1095 env->best_cpu = env->dst_cpu; 1096 env->best_cpu = env->dst_cpu;
1096} 1097}
1097 1098
1099static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
1100 long src_load, long dst_load,
1101 struct task_numa_env *env)
1102{
1103 long imb, old_imb;
1104
1105 /* We care about the slope of the imbalance, not the direction. */
1106 if (dst_load < src_load)
1107 swap(dst_load, src_load);
1108
1109 /* Is the difference below the threshold? */
1110 imb = dst_load * 100 - src_load * env->imbalance_pct;
1111 if (imb <= 0)
1112 return false;
1113
1114 /*
1115 * The imbalance is above the allowed threshold.
1116 * Compare it with the old imbalance.
1117 */
1118 if (orig_dst_load < orig_src_load)
1119 swap(orig_dst_load, orig_src_load);
1120
1121 old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
1122
1123 /* Would this change make things worse? */
1124 return (imb > old_imb);
1125}
1126
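The new load_too_imbalanced() helper, lifted into a standalone sketch so the threshold logic can be exercised directly; an imbalance_pct of 125 means roughly "tolerate up to 25% imbalance", and the loads in main() are illustrative:

#include <stdio.h>

static void swap_long(long *a, long *b) { long t = *a; *a = *b; *b = t; }

static int load_too_imbalanced(long orig_src_load, long orig_dst_load,
                               long src_load, long dst_load, int imbalance_pct)
{
        long imb, old_imb;

        /* We care about the slope of the imbalance, not the direction. */
        if (dst_load < src_load)
                swap_long(&dst_load, &src_load);

        /* Is the difference below the threshold? */
        imb = dst_load * 100 - src_load * imbalance_pct;
        if (imb <= 0)
                return 0;

        /* Above the threshold: would the move make things worse than before? */
        if (orig_dst_load < orig_src_load)
                swap_long(&orig_dst_load, &orig_src_load);
        old_imb = orig_dst_load * 100 - orig_src_load * imbalance_pct;

        return imb > old_imb;
}

int main(void)
{
        /* Moving a load of 300 from a 1000-load node onto a 900-load node. */
        printf("%d\n", load_too_imbalanced(1000, 900, 1000 - 300, 900 + 300, 125));
        return 0;       /* prints 1: the move would worsen the imbalance */
}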
1098/* 1127/*
1099 * This checks if the overall compute and NUMA accesses of the system would 1128 * This checks if the overall compute and NUMA accesses of the system would
1100 * be improved if the source tasks was migrated to the target dst_cpu taking 1129 * be improved if the source tasks was migrated to the target dst_cpu taking
@@ -1107,7 +1136,8 @@ static void task_numa_compare(struct task_numa_env *env,
1107 struct rq *src_rq = cpu_rq(env->src_cpu); 1136 struct rq *src_rq = cpu_rq(env->src_cpu);
1108 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1137 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1109 struct task_struct *cur; 1138 struct task_struct *cur;
1110 long dst_load, src_load; 1139 long orig_src_load, src_load;
1140 long orig_dst_load, dst_load;
1111 long load; 1141 long load;
1112 long imp = (groupimp > 0) ? groupimp : taskimp; 1142 long imp = (groupimp > 0) ? groupimp : taskimp;
1113 1143
@@ -1166,8 +1196,8 @@ static void task_numa_compare(struct task_numa_env *env,
1166 1196
1167 if (!cur) { 1197 if (!cur) {
1168 /* Is there capacity at our destination? */ 1198 /* Is there capacity at our destination? */
1169 if (env->src_stats.has_capacity && 1199 if (env->src_stats.has_free_capacity &&
1170 !env->dst_stats.has_capacity) 1200 !env->dst_stats.has_free_capacity)
1171 goto unlock; 1201 goto unlock;
1172 1202
1173 goto balance; 1203 goto balance;
@@ -1181,13 +1211,13 @@ static void task_numa_compare(struct task_numa_env *env,
1181 * In the overloaded case, try and keep the load balanced. 1211 * In the overloaded case, try and keep the load balanced.
1182 */ 1212 */
1183balance: 1213balance:
1184 dst_load = env->dst_stats.load; 1214 orig_dst_load = env->dst_stats.load;
1185 src_load = env->src_stats.load; 1215 orig_src_load = env->src_stats.load;
1186 1216
1187 /* XXX missing power terms */ 1217 /* XXX missing capacity terms */
1188 load = task_h_load(env->p); 1218 load = task_h_load(env->p);
1189 dst_load += load; 1219 dst_load = orig_dst_load + load;
1190 src_load -= load; 1220 src_load = orig_src_load - load;
1191 1221
1192 if (cur) { 1222 if (cur) {
1193 load = task_h_load(cur); 1223 load = task_h_load(cur);
@@ -1195,11 +1225,8 @@ balance:
1195 src_load += load; 1225 src_load += load;
1196 } 1226 }
1197 1227
1198 /* make src_load the smaller */ 1228 if (load_too_imbalanced(orig_src_load, orig_dst_load,
1199 if (dst_load < src_load) 1229 src_load, dst_load, env))
1200 swap(dst_load, src_load);
1201
1202 if (src_load * env->imbalance_pct < dst_load * 100)
1203 goto unlock; 1230 goto unlock;
1204 1231
1205assign: 1232assign:
@@ -1275,8 +1302,8 @@ static int task_numa_migrate(struct task_struct *p)
1275 groupimp = group_weight(p, env.dst_nid) - groupweight; 1302 groupimp = group_weight(p, env.dst_nid) - groupweight;
1276 update_numa_stats(&env.dst_stats, env.dst_nid); 1303 update_numa_stats(&env.dst_stats, env.dst_nid);
1277 1304
1278 /* If the preferred nid has capacity, try to use it. */ 1305 /* If the preferred nid has free capacity, try to use it. */
1279 if (env.dst_stats.has_capacity) 1306 if (env.dst_stats.has_free_capacity)
1280 task_numa_find_cpu(&env, taskimp, groupimp); 1307 task_numa_find_cpu(&env, taskimp, groupimp);
1281 1308
1282 /* No space available on the preferred nid. Look elsewhere. */ 1309 /* No space available on the preferred nid. Look elsewhere. */
@@ -1301,7 +1328,16 @@ static int task_numa_migrate(struct task_struct *p)
1301 if (env.best_cpu == -1) 1328 if (env.best_cpu == -1)
1302 return -EAGAIN; 1329 return -EAGAIN;
1303 1330
1304 sched_setnuma(p, env.dst_nid); 1331 /*
1332 * If the task is part of a workload that spans multiple NUMA nodes,
1333 * and is migrating into one of the workload's active nodes, remember
1334 * this node as the task's preferred numa node, so the workload can
1335 * settle down.
1336 * A task that migrated to a second choice node will be better off
1337 * trying for a better one later. Do not set the preferred node here.
1338 */
1339 if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
1340 sched_setnuma(p, env.dst_nid);
1305 1341
1306 /* 1342 /*
1307 * Reset the scan period if the task is being rescheduled on an 1343 * Reset the scan period if the task is being rescheduled on an
@@ -1326,12 +1362,15 @@ static int task_numa_migrate(struct task_struct *p)
1326/* Attempt to migrate a task to a CPU on the preferred node. */ 1362/* Attempt to migrate a task to a CPU on the preferred node. */
1327static void numa_migrate_preferred(struct task_struct *p) 1363static void numa_migrate_preferred(struct task_struct *p)
1328{ 1364{
1365 unsigned long interval = HZ;
1366
1329 /* This task has no NUMA fault statistics yet */ 1367 /* This task has no NUMA fault statistics yet */
1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) 1368 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1331 return; 1369 return;
1332 1370
1333 /* Periodically retry migrating the task to the preferred node */ 1371 /* Periodically retry migrating the task to the preferred node */
1334 p->numa_migrate_retry = jiffies + HZ; 1372 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1373 p->numa_migrate_retry = jiffies + interval;
1335 1374
1336 /* Success if task is already running on preferred CPU */ 1375 /* Success if task is already running on preferred CPU */
1337 if (task_node(p) == p->numa_preferred_nid) 1376 if (task_node(p) == p->numa_preferred_nid)
@@ -1707,18 +1746,19 @@ no_join:
1707void task_numa_free(struct task_struct *p) 1746void task_numa_free(struct task_struct *p)
1708{ 1747{
1709 struct numa_group *grp = p->numa_group; 1748 struct numa_group *grp = p->numa_group;
1710 int i;
1711 void *numa_faults = p->numa_faults_memory; 1749 void *numa_faults = p->numa_faults_memory;
1750 unsigned long flags;
1751 int i;
1712 1752
1713 if (grp) { 1753 if (grp) {
1714 spin_lock_irq(&grp->lock); 1754 spin_lock_irqsave(&grp->lock, flags);
1715 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) 1755 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1716 grp->faults[i] -= p->numa_faults_memory[i]; 1756 grp->faults[i] -= p->numa_faults_memory[i];
1717 grp->total_faults -= p->total_numa_faults; 1757 grp->total_faults -= p->total_numa_faults;
1718 1758
1719 list_del(&p->numa_entry); 1759 list_del(&p->numa_entry);
1720 grp->nr_tasks--; 1760 grp->nr_tasks--;
1721 spin_unlock_irq(&grp->lock); 1761 spin_unlock_irqrestore(&grp->lock, flags);
1722 rcu_assign_pointer(p->numa_group, NULL); 1762 rcu_assign_pointer(p->numa_group, NULL);
1723 put_numa_group(grp); 1763 put_numa_group(grp);
1724 } 1764 }
@@ -1738,6 +1778,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1738 struct task_struct *p = current; 1778 struct task_struct *p = current;
1739 bool migrated = flags & TNF_MIGRATED; 1779 bool migrated = flags & TNF_MIGRATED;
1740 int cpu_node = task_node(current); 1780 int cpu_node = task_node(current);
1781 int local = !!(flags & TNF_FAULT_LOCAL);
1741 int priv; 1782 int priv;
1742 1783
1743 if (!numabalancing_enabled) 1784 if (!numabalancing_enabled)
@@ -1786,6 +1827,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1786 task_numa_group(p, last_cpupid, flags, &priv); 1827 task_numa_group(p, last_cpupid, flags, &priv);
1787 } 1828 }
1788 1829
1830 /*
1831 * If a workload spans multiple NUMA nodes, a shared fault that
1832 * occurs wholly within the set of nodes that the workload is
1833 * actively using should be counted as local. This allows the
1834 * scan rate to slow down when a workload has settled down.
1835 */
1836 if (!priv && !local && p->numa_group &&
1837 node_isset(cpu_node, p->numa_group->active_nodes) &&
1838 node_isset(mem_node, p->numa_group->active_nodes))
1839 local = 1;
1840
1789 task_numa_placement(p); 1841 task_numa_placement(p);
1790 1842
1791 /* 1843 /*
@@ -1800,7 +1852,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1800 1852
1801 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; 1853 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1802 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; 1854 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1803 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1855 p->numa_faults_locality[local] += pages;
1804} 1856}
1805 1857
1806static void reset_ptenuma_scan(struct task_struct *p) 1858static void reset_ptenuma_scan(struct task_struct *p)
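Sketch of the locality decision added in the task_numa_fault() hunk above: a shared fault whose CPU node and memory node both fall inside the group's active node set is counted as local, so the scan rate can back off once the workload settles. A plain bitmask stands in for nodemask_t, and the "trivially local" check below is only a rough model of TNF_FAULT_LOCAL:

#include <stdio.h>

static int fault_is_local(int cpu_node, int mem_node, int priv,
                          unsigned long active_nodes)
{
        if (cpu_node == mem_node)
                return 1;                               /* trivially local */
        if (!priv &&
            (active_nodes & (1UL << cpu_node)) &&
            (active_nodes & (1UL << mem_node)))
                return 1;                               /* local to the group's set */
        return 0;
}

int main(void)
{
        unsigned long active = (1UL << 0) | (1UL << 2); /* nodes 0 and 2 active */

        printf("%d\n", fault_is_local(0, 2, 0, active));        /* 1 */
        printf("%d\n", fault_is_local(0, 1, 0, active));        /* 0 */
        return 0;
}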
@@ -3129,7 +3181,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3129 */ 3181 */
3130 if (!cfs_b->timer_active) { 3182 if (!cfs_b->timer_active) {
3131 __refill_cfs_bandwidth_runtime(cfs_b); 3183 __refill_cfs_bandwidth_runtime(cfs_b);
3132 __start_cfs_bandwidth(cfs_b); 3184 __start_cfs_bandwidth(cfs_b, false);
3133 } 3185 }
3134 3186
3135 if (cfs_b->runtime > 0) { 3187 if (cfs_b->runtime > 0) {
@@ -3174,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3174 * has not truly expired. 3226 * has not truly expired.
3175 * 3227 *
3176 * Fortunately we can check determine whether this the case by checking 3228 * Fortunately we can check determine whether this the case by checking
3177 * whether the global deadline has advanced. 3229 * whether the global deadline has advanced. It is valid to compare
3230 * cfs_b->runtime_expires without any locks since we only care about
3231 * exact equality, so a partial write will still work.
3178 */ 3232 */
3179 3233
3180 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { 3234 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3181 /* extend local deadline, drift is bounded above by 2 ticks */ 3235 /* extend local deadline, drift is bounded above by 2 ticks */
3182 cfs_rq->runtime_expires += TICK_NSEC; 3236 cfs_rq->runtime_expires += TICK_NSEC;
3183 } else { 3237 } else {
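The predicate change in this hunk matters because cfs_b->runtime_expires may now be read without cfs_b->lock: an exact-equality test can only fail safely on a stale or partial value, whereas the old signed comparison could wrongly extend the local deadline. A small illustration with made-up values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t local_expires  = 3000; /* e.g. a stale view seen without the lock */
        uint64_t global_expires = 2000; /* cfs_b->runtime_expires has moved on */

        /* old check: "local deadline not behind the global one" */
        int old_check = (int64_t)(local_expires - global_expires) >= 0;
        /* new check: "still exactly the period we drew our runtime from" */
        int new_check = local_expires == global_expires;

        /* old=1 would extend the deadline; new=0 forces a fresh look instead */
        printf("old=%d new=%d\n", old_check, new_check);
        return 0;
}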
@@ -3301,14 +3355,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3301 } 3355 }
3302 3356
3303 if (!se) 3357 if (!se)
3304 rq->nr_running -= task_delta; 3358 sub_nr_running(rq, task_delta);
3305 3359
3306 cfs_rq->throttled = 1; 3360 cfs_rq->throttled = 1;
3307 cfs_rq->throttled_clock = rq_clock(rq); 3361 cfs_rq->throttled_clock = rq_clock(rq);
3308 raw_spin_lock(&cfs_b->lock); 3362 raw_spin_lock(&cfs_b->lock);
3309 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3363 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3310 if (!cfs_b->timer_active) 3364 if (!cfs_b->timer_active)
3311 __start_cfs_bandwidth(cfs_b); 3365 __start_cfs_bandwidth(cfs_b, false);
3312 raw_spin_unlock(&cfs_b->lock); 3366 raw_spin_unlock(&cfs_b->lock);
3313} 3367}
3314 3368
@@ -3352,7 +3406,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3352 } 3406 }
3353 3407
3354 if (!se) 3408 if (!se)
3355 rq->nr_running += task_delta; 3409 add_nr_running(rq, task_delta);
3356 3410
3357 /* determine whether we need to wake up potentially idle cpu */ 3411 /* determine whether we need to wake up potentially idle cpu */
3358 if (rq->curr == rq->idle && rq->cfs.nr_running) 3412 if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3406,21 +3460,21 @@ next:
3406static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) 3460static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3407{ 3461{
3408 u64 runtime, runtime_expires; 3462 u64 runtime, runtime_expires;
3409 int idle = 1, throttled; 3463 int throttled;
3410 3464
3411 raw_spin_lock(&cfs_b->lock);
3412 /* no need to continue the timer with no bandwidth constraint */ 3465 /* no need to continue the timer with no bandwidth constraint */
3413 if (cfs_b->quota == RUNTIME_INF) 3466 if (cfs_b->quota == RUNTIME_INF)
3414 goto out_unlock; 3467 goto out_deactivate;
3415 3468
3416 throttled = !list_empty(&cfs_b->throttled_cfs_rq); 3469 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3417 /* idle depends on !throttled (for the case of a large deficit) */
3418 idle = cfs_b->idle && !throttled;
3419 cfs_b->nr_periods += overrun; 3470 cfs_b->nr_periods += overrun;
3420 3471
3421 /* if we're going inactive then everything else can be deferred */ 3472 /*
3422 if (idle) 3473 * idle depends on !throttled (for the case of a large deficit), and if
3423 goto out_unlock; 3474 * we're going inactive then everything else can be deferred
3475 */
3476 if (cfs_b->idle && !throttled)
3477 goto out_deactivate;
3424 3478
3425 /* 3479 /*
3426 * if we have relooped after returning idle once, we need to update our 3480 * if we have relooped after returning idle once, we need to update our
@@ -3434,7 +3488,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3434 if (!throttled) { 3488 if (!throttled) {
3435 /* mark as potentially idle for the upcoming period */ 3489 /* mark as potentially idle for the upcoming period */
3436 cfs_b->idle = 1; 3490 cfs_b->idle = 1;
3437 goto out_unlock; 3491 return 0;
3438 } 3492 }
3439 3493
3440 /* account preceding periods in which throttling occurred */ 3494 /* account preceding periods in which throttling occurred */
@@ -3474,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3474 * timer to remain active while there are any throttled entities.) 3528 * timer to remain active while there are any throttled entities.)
3475 */ 3529 */
3476 cfs_b->idle = 0; 3530 cfs_b->idle = 0;
3477out_unlock:
3478 if (idle)
3479 cfs_b->timer_active = 0;
3480 raw_spin_unlock(&cfs_b->lock);
3481 3531
3482 return idle; 3532 return 0;
3533
3534out_deactivate:
3535 cfs_b->timer_active = 0;
3536 return 1;
3483} 3537}
3484 3538
3485/* a cfs_rq won't donate quota below this amount */ 3539/* a cfs_rq won't donate quota below this amount */
@@ -3656,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3656 int overrun; 3710 int overrun;
3657 int idle = 0; 3711 int idle = 0;
3658 3712
3713 raw_spin_lock(&cfs_b->lock);
3659 for (;;) { 3714 for (;;) {
3660 now = hrtimer_cb_get_time(timer); 3715 now = hrtimer_cb_get_time(timer);
3661 overrun = hrtimer_forward(timer, now, cfs_b->period); 3716 overrun = hrtimer_forward(timer, now, cfs_b->period);
@@ -3665,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3665 3720
3666 idle = do_sched_cfs_period_timer(cfs_b, overrun); 3721 idle = do_sched_cfs_period_timer(cfs_b, overrun);
3667 } 3722 }
3723 raw_spin_unlock(&cfs_b->lock);
3668 3724
3669 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 3725 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3670} 3726}
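Net effect of this hunk and the do_sched_cfs_period_timer() hunks above: the period timer takes cfs_b->lock once around the whole forward/overrun loop, and the helper runs with the lock already held. The shape of that refactor as a pthread sketch with invented names:

#include <pthread.h>

struct bw_model {
        pthread_mutex_t lock;
        int quota_is_inf;
};

/* Called with bw->lock held; returns non-zero when the timer can stop. */
int do_period(struct bw_model *bw, int overrun)
{
        if (bw->quota_is_inf)
                return 1;       /* ~ out_deactivate */
        /* ...refill and distribute runtime... */
        return 0;
}

int period_timer(struct bw_model *bw, int overruns)
{
        int idle = 0;

        pthread_mutex_lock(&bw->lock);          /* once, not per iteration */
        while (overruns-- > 0)
                idle = do_period(bw, 1);
        pthread_mutex_unlock(&bw->lock);

        return idle;
}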
@@ -3690,7 +3746,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3690} 3746}
3691 3747
3692/* requires cfs_b->lock, may release to reprogram timer */ 3748/* requires cfs_b->lock, may release to reprogram timer */
3693void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) 3749void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
3694{ 3750{
3695 /* 3751 /*
3696 * The timer may be active because we're trying to set a new bandwidth 3752 * The timer may be active because we're trying to set a new bandwidth
@@ -3705,7 +3761,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3705 cpu_relax(); 3761 cpu_relax();
3706 raw_spin_lock(&cfs_b->lock); 3762 raw_spin_lock(&cfs_b->lock);
3707 /* if someone else restarted the timer then we're done */ 3763 /* if someone else restarted the timer then we're done */
3708 if (cfs_b->timer_active) 3764 if (!force && cfs_b->timer_active)
3709 return; 3765 return;
3710 } 3766 }
3711 3767
@@ -3724,8 +3780,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3724 struct cfs_rq *cfs_rq; 3780 struct cfs_rq *cfs_rq;
3725 3781
3726 for_each_leaf_cfs_rq(rq, cfs_rq) { 3782 for_each_leaf_cfs_rq(rq, cfs_rq) {
3727 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3728
3729 if (!cfs_rq->runtime_enabled) 3783 if (!cfs_rq->runtime_enabled)
3730 continue; 3784 continue;
3731 3785
@@ -3733,7 +3787,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3733 * clock_task is not advancing so we just need to make sure 3787 * clock_task is not advancing so we just need to make sure
3734 * there's some valid quota amount 3788 * there's some valid quota amount
3735 */ 3789 */
3736 cfs_rq->runtime_remaining = cfs_b->quota; 3790 cfs_rq->runtime_remaining = 1;
3737 if (cfs_rq_throttled(cfs_rq)) 3791 if (cfs_rq_throttled(cfs_rq))
3738 unthrottle_cfs_rq(cfs_rq); 3792 unthrottle_cfs_rq(cfs_rq);
3739 } 3793 }
@@ -3884,7 +3938,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3884 3938
3885 if (!se) { 3939 if (!se) {
3886 update_rq_runnable_avg(rq, rq->nr_running); 3940 update_rq_runnable_avg(rq, rq->nr_running);
3887 inc_nr_running(rq); 3941 add_nr_running(rq, 1);
3888 } 3942 }
3889 hrtick_update(rq); 3943 hrtick_update(rq);
3890} 3944}
@@ -3944,7 +3998,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3944 } 3998 }
3945 3999
3946 if (!se) { 4000 if (!se) {
3947 dec_nr_running(rq); 4001 sub_nr_running(rq, 1);
3948 update_rq_runnable_avg(rq, 1); 4002 update_rq_runnable_avg(rq, 1);
3949 } 4003 }
3950 hrtick_update(rq); 4004 hrtick_update(rq);
@@ -3990,9 +4044,9 @@ static unsigned long target_load(int cpu, int type)
3990 return max(rq->cpu_load[type-1], total); 4044 return max(rq->cpu_load[type-1], total);
3991} 4045}
3992 4046
3993static unsigned long power_of(int cpu) 4047static unsigned long capacity_of(int cpu)
3994{ 4048{
3995 return cpu_rq(cpu)->cpu_power; 4049 return cpu_rq(cpu)->cpu_capacity;
3996} 4050}
3997 4051
3998static unsigned long cpu_avg_load_per_task(int cpu) 4052static unsigned long cpu_avg_load_per_task(int cpu)
@@ -4014,8 +4068,8 @@ static void record_wakee(struct task_struct *p)
4014 * about the boundary, really active task won't care 4068 * about the boundary, really active task won't care
4015 * about the loss. 4069 * about the loss.
4016 */ 4070 */
4017 if (jiffies > current->wakee_flip_decay_ts + HZ) { 4071 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
4018 current->wakee_flips = 0; 4072 current->wakee_flips >>= 1;
4019 current->wakee_flip_decay_ts = jiffies; 4073 current->wakee_flip_decay_ts = jiffies;
4020 } 4074 }
4021 4075
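record_wakee() now halves wakee_flips instead of zeroing it, and uses the wrap-safe time_after() test. A small model of both changes; time_after_() below mirrors the kernel's signed-difference definition:

#include <stdio.h>

static unsigned long jiffies;                   /* toy clock */
#define HZ_ 1000UL

static int time_after_(unsigned long a, unsigned long b)
{
        return (long)(b - a) < 0;               /* wrap-safe: a is after b */
}

struct wakee_model {
        unsigned int wakee_flips;
        unsigned long wakee_flip_decay_ts;
};

static void record_wakee_sketch(struct wakee_model *w)
{
        if (time_after_(jiffies, w->wakee_flip_decay_ts + HZ_)) {
                w->wakee_flips >>= 1;           /* decay, don't discard the history */
                w->wakee_flip_decay_ts = jiffies;
        }
        w->wakee_flips++;
}

int main(void)
{
        struct wakee_model w = { .wakee_flips = 8, .wakee_flip_decay_ts = 0 };

        jiffies = 1500;                         /* more than HZ later */
        record_wakee_sketch(&w);
        printf("%u\n", w.wakee_flips);          /* (8 >> 1) + 1 = 5 */
        return 0;
}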
@@ -4235,12 +4289,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4235 s64 this_eff_load, prev_eff_load; 4289 s64 this_eff_load, prev_eff_load;
4236 4290
4237 this_eff_load = 100; 4291 this_eff_load = 100;
4238 this_eff_load *= power_of(prev_cpu); 4292 this_eff_load *= capacity_of(prev_cpu);
4239 this_eff_load *= this_load + 4293 this_eff_load *= this_load +
4240 effective_load(tg, this_cpu, weight, weight); 4294 effective_load(tg, this_cpu, weight, weight);
4241 4295
4242 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; 4296 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4243 prev_eff_load *= power_of(this_cpu); 4297 prev_eff_load *= capacity_of(this_cpu);
4244 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); 4298 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4245 4299
4246 balanced = this_eff_load <= prev_eff_load; 4300 balanced = this_eff_load <= prev_eff_load;
@@ -4316,8 +4370,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4316 avg_load += load; 4370 avg_load += load;
4317 } 4371 }
4318 4372
4319 /* Adjust by relative CPU power of the group */ 4373 /* Adjust by relative CPU capacity of the group */
4320 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; 4374 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
4321 4375
4322 if (local_group) { 4376 if (local_group) {
4323 this_load = avg_load; 4377 this_load = avg_load;
@@ -4449,10 +4503,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4449 sd = tmp; 4503 sd = tmp;
4450 } 4504 }
4451 4505
4452 if (affine_sd) { 4506 if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4453 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 4507 prev_cpu = cpu;
4454 prev_cpu = cpu;
4455 4508
4509 if (sd_flag & SD_BALANCE_WAKE) {
4456 new_cpu = select_idle_sibling(p, prev_cpu); 4510 new_cpu = select_idle_sibling(p, prev_cpu);
4457 goto unlock; 4511 goto unlock;
4458 } 4512 }
@@ -4520,6 +4574,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4520 atomic_long_add(se->avg.load_avg_contrib, 4574 atomic_long_add(se->avg.load_avg_contrib,
4521 &cfs_rq->removed_load); 4575 &cfs_rq->removed_load);
4522 } 4576 }
4577
4578 /* We have migrated, no longer consider this task hot */
4579 se->exec_start = 0;
4523} 4580}
4524#endif /* CONFIG_SMP */ 4581#endif /* CONFIG_SMP */
4525 4582
@@ -4894,14 +4951,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
4894 * 4951 *
4895 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) 4952 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
4896 * 4953 *
4897 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the 4954 * C_i is the compute capacity of cpu i, typically it is the
4898 * fraction of 'recent' time available for SCHED_OTHER task execution. But it 4955 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
4899 * can also include other factors [XXX]. 4956 * can also include other factors [XXX].
4900 * 4957 *
4901 * To achieve this balance we define a measure of imbalance which follows 4958 * To achieve this balance we define a measure of imbalance which follows
4902 * directly from (1): 4959 * directly from (1):
4903 * 4960 *
4904 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4) 4961 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
4905 * 4962 *
4906 * We then move tasks around to minimize the imbalance. In the continuous 4963 * We then move tasks around to minimize the imbalance. In the continuous
4907 * function space it is obvious this converges, in the discrete case we get 4964 * function space it is obvious this converges, in the discrete case we get
@@ -5070,6 +5127,7 @@ task_hot(struct task_struct *p, u64 now)
5070/* Returns true if the destination node has incurred more faults */ 5127/* Returns true if the destination node has incurred more faults */
5071static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5128static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5072{ 5129{
5130 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5073 int src_nid, dst_nid; 5131 int src_nid, dst_nid;
5074 5132
5075 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || 5133 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5083,21 +5141,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
5083 if (src_nid == dst_nid) 5141 if (src_nid == dst_nid)
5084 return false; 5142 return false;
5085 5143
5086 /* Always encourage migration to the preferred node. */ 5144 if (numa_group) {
5087 if (dst_nid == p->numa_preferred_nid) 5145 /* Task is already in the group's interleave set. */
5088 return true; 5146 if (node_isset(src_nid, numa_group->active_nodes))
5147 return false;
5148
5149 /* Task is moving into the group's interleave set. */
5150 if (node_isset(dst_nid, numa_group->active_nodes))
5151 return true;
5152
5153 return group_faults(p, dst_nid) > group_faults(p, src_nid);
5154 }
5089 5155
5090 /* If both task and group weight improve, this move is a winner. */ 5156 /* Encourage migration to the preferred node. */
5091 if (task_weight(p, dst_nid) > task_weight(p, src_nid) && 5157 if (dst_nid == p->numa_preferred_nid)
5092 group_weight(p, dst_nid) > group_weight(p, src_nid))
5093 return true; 5158 return true;
5094 5159
5095 return false; 5160 return task_faults(p, dst_nid) > task_faults(p, src_nid);
5096} 5161}
5097 5162
5098 5163
5099static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 5164static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5100{ 5165{
5166 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5101 int src_nid, dst_nid; 5167 int src_nid, dst_nid;
5102 5168
5103 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5169 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5112,16 +5178,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5112 if (src_nid == dst_nid) 5178 if (src_nid == dst_nid)
5113 return false; 5179 return false;
5114 5180
5181 if (numa_group) {
5182 /* Task is moving within/into the group's interleave set. */
5183 if (node_isset(dst_nid, numa_group->active_nodes))
5184 return false;
5185
5186 /* Task is moving out of the group's interleave set. */
5187 if (node_isset(src_nid, numa_group->active_nodes))
5188 return true;
5189
5190 return group_faults(p, dst_nid) < group_faults(p, src_nid);
5191 }
5192
5115 /* Migrating away from the preferred node is always bad. */ 5193 /* Migrating away from the preferred node is always bad. */
5116 if (src_nid == p->numa_preferred_nid) 5194 if (src_nid == p->numa_preferred_nid)
5117 return true; 5195 return true;
5118 5196
5119 /* If either task or group weight get worse, don't do it. */ 5197 return task_faults(p, dst_nid) < task_faults(p, src_nid);
5120 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
5121 group_weight(p, dst_nid) < group_weight(p, src_nid))
5122 return true;
5123
5124 return false;
5125} 5198}
5126 5199
5127#else 5200#else
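A standalone sketch of the group-aware decisions above: for tasks in a numa_group the active node set is consulted first, then the per-node fault counts; tasks without a group fall back to the preferred node and task faults. The bitmask, the fault arrays and the "has a group" test (a non-empty active set) are stand-ins, not the kernel data structures:

#include <stdio.h>

static int in_set(unsigned long set, int nid) { return !!(set & (1UL << nid)); }

/* Returns 1 if moving from src_nid to dst_nid improves locality. */
static int improves_locality(int src_nid, int dst_nid,
                             unsigned long active_nodes,
                             const unsigned long *group_faults,
                             int preferred_nid,
                             const unsigned long *task_faults)
{
        if (src_nid == dst_nid)
                return 0;

        if (active_nodes) {                     /* task belongs to a numa_group */
                if (in_set(active_nodes, src_nid))
                        return 0;               /* already in the interleave set */
                if (in_set(active_nodes, dst_nid))
                        return 1;               /* moving into the interleave set */
                return group_faults[dst_nid] > group_faults[src_nid];
        }

        if (dst_nid == preferred_nid)
                return 1;
        return task_faults[dst_nid] > task_faults[src_nid];
}

int main(void)
{
        unsigned long group_faults[4] = { 10, 40, 5, 0 };
        unsigned long task_faults[4]  = { 10, 40, 5, 0 };

        /* Node 1 is in the active set: moving 0 -> 1 counts as an improvement. */
        printf("%d\n", improves_locality(0, 1, 1UL << 1, group_faults,
                                         -1, task_faults));
        return 0;
}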
@@ -5460,13 +5533,13 @@ struct sg_lb_stats {
5460 unsigned long group_load; /* Total load over the CPUs of the group */ 5533 unsigned long group_load; /* Total load over the CPUs of the group */
5461 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 5534 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5462 unsigned long load_per_task; 5535 unsigned long load_per_task;
5463 unsigned long group_power; 5536 unsigned long group_capacity;
5464 unsigned int sum_nr_running; /* Nr tasks running in the group */ 5537 unsigned int sum_nr_running; /* Nr tasks running in the group */
5465 unsigned int group_capacity; 5538 unsigned int group_capacity_factor;
5466 unsigned int idle_cpus; 5539 unsigned int idle_cpus;
5467 unsigned int group_weight; 5540 unsigned int group_weight;
5468 int group_imb; /* Is there an imbalance in the group ? */ 5541 int group_imb; /* Is there an imbalance in the group ? */
5469 int group_has_capacity; /* Is there extra capacity in the group? */ 5542 int group_has_free_capacity;
5470#ifdef CONFIG_NUMA_BALANCING 5543#ifdef CONFIG_NUMA_BALANCING
5471 unsigned int nr_numa_running; 5544 unsigned int nr_numa_running;
5472 unsigned int nr_preferred_running; 5545 unsigned int nr_preferred_running;
@@ -5481,7 +5554,7 @@ struct sd_lb_stats {
5481 struct sched_group *busiest; /* Busiest group in this sd */ 5554 struct sched_group *busiest; /* Busiest group in this sd */
5482 struct sched_group *local; /* Local group in this sd */ 5555 struct sched_group *local; /* Local group in this sd */
5483 unsigned long total_load; /* Total load of all groups in sd */ 5556 unsigned long total_load; /* Total load of all groups in sd */
5484 unsigned long total_pwr; /* Total power of all groups in sd */ 5557 unsigned long total_capacity; /* Total capacity of all groups in sd */
5485 unsigned long avg_load; /* Average load across all groups in sd */ 5558 unsigned long avg_load; /* Average load across all groups in sd */
5486 5559
5487 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ 5560 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -5500,7 +5573,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5500 .busiest = NULL, 5573 .busiest = NULL,
5501 .local = NULL, 5574 .local = NULL,
5502 .total_load = 0UL, 5575 .total_load = 0UL,
5503 .total_pwr = 0UL, 5576 .total_capacity = 0UL,
5504 .busiest_stat = { 5577 .busiest_stat = {
5505 .avg_load = 0UL, 5578 .avg_load = 0UL,
5506 }, 5579 },
@@ -5535,17 +5608,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
5535 return load_idx; 5608 return load_idx;
5536} 5609}
5537 5610
5538static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 5611static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
5539{ 5612{
5540 return SCHED_POWER_SCALE; 5613 return SCHED_CAPACITY_SCALE;
5541} 5614}
5542 5615
5543unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) 5616unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
5544{ 5617{
5545 return default_scale_freq_power(sd, cpu); 5618 return default_scale_capacity(sd, cpu);
5546} 5619}
5547 5620
5548static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 5621static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
5549{ 5622{
5550 unsigned long weight = sd->span_weight; 5623 unsigned long weight = sd->span_weight;
5551 unsigned long smt_gain = sd->smt_gain; 5624 unsigned long smt_gain = sd->smt_gain;
@@ -5555,15 +5628,16 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
5555 return smt_gain; 5628 return smt_gain;
5556} 5629}
5557 5630
5558unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) 5631unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
5559{ 5632{
5560 return default_scale_smt_power(sd, cpu); 5633 return default_scale_smt_capacity(sd, cpu);
5561} 5634}
5562 5635
5563static unsigned long scale_rt_power(int cpu) 5636static unsigned long scale_rt_capacity(int cpu)
5564{ 5637{
5565 struct rq *rq = cpu_rq(cpu); 5638 struct rq *rq = cpu_rq(cpu);
5566 u64 total, available, age_stamp, avg; 5639 u64 total, available, age_stamp, avg;
5640 s64 delta;
5567 5641
5568 /* 5642 /*
5569 * Since we're reading these variables without serialization make sure 5643 * Since we're reading these variables without serialization make sure
@@ -5572,74 +5646,78 @@ static unsigned long scale_rt_power(int cpu)
5572 age_stamp = ACCESS_ONCE(rq->age_stamp); 5646 age_stamp = ACCESS_ONCE(rq->age_stamp);
5573 avg = ACCESS_ONCE(rq->rt_avg); 5647 avg = ACCESS_ONCE(rq->rt_avg);
5574 5648
5575 total = sched_avg_period() + (rq_clock(rq) - age_stamp); 5649 delta = rq_clock(rq) - age_stamp;
5650 if (unlikely(delta < 0))
5651 delta = 0;
5652
5653 total = sched_avg_period() + delta;
5576 5654
5577 if (unlikely(total < avg)) { 5655 if (unlikely(total < avg)) {
5578 /* Ensures that power won't end up being negative */ 5656 /* Ensures that capacity won't end up being negative */
5579 available = 0; 5657 available = 0;
5580 } else { 5658 } else {
5581 available = total - avg; 5659 available = total - avg;
5582 } 5660 }
5583 5661
5584 if (unlikely((s64)total < SCHED_POWER_SCALE)) 5662 if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
5585 total = SCHED_POWER_SCALE; 5663 total = SCHED_CAPACITY_SCALE;
5586 5664
5587 total >>= SCHED_POWER_SHIFT; 5665 total >>= SCHED_CAPACITY_SHIFT;
5588 5666
5589 return div_u64(available, total); 5667 return div_u64(available, total);
5590} 5668}
5591 5669
5592static void update_cpu_power(struct sched_domain *sd, int cpu) 5670static void update_cpu_capacity(struct sched_domain *sd, int cpu)
5593{ 5671{
5594 unsigned long weight = sd->span_weight; 5672 unsigned long weight = sd->span_weight;
5595 unsigned long power = SCHED_POWER_SCALE; 5673 unsigned long capacity = SCHED_CAPACITY_SCALE;
5596 struct sched_group *sdg = sd->groups; 5674 struct sched_group *sdg = sd->groups;
5597 5675
5598 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 5676 if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
5599 if (sched_feat(ARCH_POWER)) 5677 if (sched_feat(ARCH_CAPACITY))
5600 power *= arch_scale_smt_power(sd, cpu); 5678 capacity *= arch_scale_smt_capacity(sd, cpu);
5601 else 5679 else
5602 power *= default_scale_smt_power(sd, cpu); 5680 capacity *= default_scale_smt_capacity(sd, cpu);
5603 5681
5604 power >>= SCHED_POWER_SHIFT; 5682 capacity >>= SCHED_CAPACITY_SHIFT;
5605 } 5683 }
5606 5684
5607 sdg->sgp->power_orig = power; 5685 sdg->sgc->capacity_orig = capacity;
5608 5686
5609 if (sched_feat(ARCH_POWER)) 5687 if (sched_feat(ARCH_CAPACITY))
5610 power *= arch_scale_freq_power(sd, cpu); 5688 capacity *= arch_scale_freq_capacity(sd, cpu);
5611 else 5689 else
5612 power *= default_scale_freq_power(sd, cpu); 5690 capacity *= default_scale_capacity(sd, cpu);
5613 5691
5614 power >>= SCHED_POWER_SHIFT; 5692 capacity >>= SCHED_CAPACITY_SHIFT;
5615 5693
5616 power *= scale_rt_power(cpu); 5694 capacity *= scale_rt_capacity(cpu);
5617 power >>= SCHED_POWER_SHIFT; 5695 capacity >>= SCHED_CAPACITY_SHIFT;
5618 5696
5619 if (!power) 5697 if (!capacity)
5620 power = 1; 5698 capacity = 1;
5621 5699
5622 cpu_rq(cpu)->cpu_power = power; 5700 cpu_rq(cpu)->cpu_capacity = capacity;
5623 sdg->sgp->power = power; 5701 sdg->sgc->capacity = capacity;
5624} 5702}
5625 5703
5626void update_group_power(struct sched_domain *sd, int cpu) 5704void update_group_capacity(struct sched_domain *sd, int cpu)
5627{ 5705{
5628 struct sched_domain *child = sd->child; 5706 struct sched_domain *child = sd->child;
5629 struct sched_group *group, *sdg = sd->groups; 5707 struct sched_group *group, *sdg = sd->groups;
5630 unsigned long power, power_orig; 5708 unsigned long capacity, capacity_orig;
5631 unsigned long interval; 5709 unsigned long interval;
5632 5710
5633 interval = msecs_to_jiffies(sd->balance_interval); 5711 interval = msecs_to_jiffies(sd->balance_interval);
5634 interval = clamp(interval, 1UL, max_load_balance_interval); 5712 interval = clamp(interval, 1UL, max_load_balance_interval);
5635 sdg->sgp->next_update = jiffies + interval; 5713 sdg->sgc->next_update = jiffies + interval;
5636 5714
5637 if (!child) { 5715 if (!child) {
5638 update_cpu_power(sd, cpu); 5716 update_cpu_capacity(sd, cpu);
5639 return; 5717 return;
5640 } 5718 }
5641 5719
5642 power_orig = power = 0; 5720 capacity_orig = capacity = 0;
5643 5721
5644 if (child->flags & SD_OVERLAP) { 5722 if (child->flags & SD_OVERLAP) {
5645 /* 5723 /*
@@ -5648,31 +5726,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
5648 */ 5726 */
5649 5727
5650 for_each_cpu(cpu, sched_group_cpus(sdg)) { 5728 for_each_cpu(cpu, sched_group_cpus(sdg)) {
5651 struct sched_group_power *sgp; 5729 struct sched_group_capacity *sgc;
5652 struct rq *rq = cpu_rq(cpu); 5730 struct rq *rq = cpu_rq(cpu);
5653 5731
5654 /* 5732 /*
5655 * build_sched_domains() -> init_sched_groups_power() 5733 * build_sched_domains() -> init_sched_groups_capacity()
5656 * gets here before we've attached the domains to the 5734 * gets here before we've attached the domains to the
5657 * runqueues. 5735 * runqueues.
5658 * 5736 *
5659 * Use power_of(), which is set irrespective of domains 5737 * Use capacity_of(), which is set irrespective of domains
5660 * in update_cpu_power(). 5738 * in update_cpu_capacity().
5661 * 5739 *
5662 * This avoids power/power_orig from being 0 and 5740 * This avoids capacity/capacity_orig from being 0 and
5663 * causing divide-by-zero issues on boot. 5741 * causing divide-by-zero issues on boot.
5664 * 5742 *
5665 * Runtime updates will correct power_orig. 5743 * Runtime updates will correct capacity_orig.
5666 */ 5744 */
5667 if (unlikely(!rq->sd)) { 5745 if (unlikely(!rq->sd)) {
5668 power_orig += power_of(cpu); 5746 capacity_orig += capacity_of(cpu);
5669 power += power_of(cpu); 5747 capacity += capacity_of(cpu);
5670 continue; 5748 continue;
5671 } 5749 }
5672 5750
5673 sgp = rq->sd->groups->sgp; 5751 sgc = rq->sd->groups->sgc;
5674 power_orig += sgp->power_orig; 5752 capacity_orig += sgc->capacity_orig;
5675 power += sgp->power; 5753 capacity += sgc->capacity;
5676 } 5754 }
5677 } else { 5755 } else {
5678 /* 5756 /*
@@ -5682,14 +5760,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
5682 5760
5683 group = child->groups; 5761 group = child->groups;
5684 do { 5762 do {
5685 power_orig += group->sgp->power_orig; 5763 capacity_orig += group->sgc->capacity_orig;
5686 power += group->sgp->power; 5764 capacity += group->sgc->capacity;
5687 group = group->next; 5765 group = group->next;
5688 } while (group != child->groups); 5766 } while (group != child->groups);
5689 } 5767 }
5690 5768
5691 sdg->sgp->power_orig = power_orig; 5769 sdg->sgc->capacity_orig = capacity_orig;
5692 sdg->sgp->power = power; 5770 sdg->sgc->capacity = capacity;
5693} 5771}
5694 5772
5695/* 5773/*
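A worked example of update_cpu_capacity()/scale_rt_capacity() with the renamed terms, for an SMT sibling domain: start from SCHED_CAPACITY_SCALE, apply the SMT and frequency factors, then scale by the fraction of the averaging period not consumed by RT/IRQ work. All inputs below are made up:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10
#define SCHED_CAPACITY_SCALE (1UL << SCHED_CAPACITY_SHIFT)

int main(void)
{
        unsigned long capacity = SCHED_CAPACITY_SCALE;
        unsigned long smt_factor  = 589;                /* ~1178/2: two HW threads per core */
        unsigned long freq_factor = SCHED_CAPACITY_SCALE;       /* full clock */

        /* rt scaling: share of the averaging period left after RT/IRQ time */
        unsigned long long total = 1000000, rt_avg = 250000;    /* ns, made up */
        unsigned long long avail = total > rt_avg ? total - rt_avg : 0;
        unsigned long rt_factor =
                (unsigned long)(avail / (total >> SCHED_CAPACITY_SHIFT));

        capacity = capacity * smt_factor  >> SCHED_CAPACITY_SHIFT;
        capacity = capacity * freq_factor >> SCHED_CAPACITY_SHIFT;
        capacity = capacity * rt_factor   >> SCHED_CAPACITY_SHIFT;
        if (!capacity)
                capacity = 1;

        printf("cpu_capacity = %lu\n", capacity);       /* 441 with these inputs */
        return 0;
}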
@@ -5703,15 +5781,15 @@ static inline int
5703fix_small_capacity(struct sched_domain *sd, struct sched_group *group) 5781fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
5704{ 5782{
5705 /* 5783 /*
5706 * Only siblings can have significantly less than SCHED_POWER_SCALE 5784 * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
5707 */ 5785 */
5708 if (!(sd->flags & SD_SHARE_CPUPOWER)) 5786 if (!(sd->flags & SD_SHARE_CPUCAPACITY))
5709 return 0; 5787 return 0;
5710 5788
5711 /* 5789 /*
5712 * If ~90% of the cpu_power is still there, we're good. 5790 * If ~90% of the cpu_capacity is still there, we're good.
5713 */ 5791 */
5714 if (group->sgp->power * 32 > group->sgp->power_orig * 29) 5792 if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
5715 return 1; 5793 return 1;
5716 5794
5717 return 0; 5795 return 0;
@@ -5748,34 +5826,35 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
5748 5826
5749static inline int sg_imbalanced(struct sched_group *group) 5827static inline int sg_imbalanced(struct sched_group *group)
5750{ 5828{
5751 return group->sgp->imbalance; 5829 return group->sgc->imbalance;
5752} 5830}
5753 5831
5754/* 5832/*
5755 * Compute the group capacity. 5833 * Compute the group capacity factor.
5756 * 5834 *
5757 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by 5835 * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
5758 * first dividing out the smt factor and computing the actual number of cores 5836 * first dividing out the smt factor and computing the actual number of cores
5759 * and limit power unit capacity with that. 5837 * and limit unit capacity with that.
5760 */ 5838 */
5761static inline int sg_capacity(struct lb_env *env, struct sched_group *group) 5839static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
5762{ 5840{
5763 unsigned int capacity, smt, cpus; 5841 unsigned int capacity_factor, smt, cpus;
5764 unsigned int power, power_orig; 5842 unsigned int capacity, capacity_orig;
5765 5843
5766 power = group->sgp->power; 5844 capacity = group->sgc->capacity;
5767 power_orig = group->sgp->power_orig; 5845 capacity_orig = group->sgc->capacity_orig;
5768 cpus = group->group_weight; 5846 cpus = group->group_weight;
5769 5847
5770 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ 5848 /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
5771 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); 5849 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
5772 capacity = cpus / smt; /* cores */ 5850 capacity_factor = cpus / smt; /* cores */
5773 5851
5774 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); 5852 capacity_factor = min_t(unsigned,
5775 if (!capacity) 5853 capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
5776 capacity = fix_small_capacity(env->sd, group); 5854 if (!capacity_factor)
5855 capacity_factor = fix_small_capacity(env->sd, group);
5777 5856
5778 return capacity; 5857 return capacity_factor;
5779} 5858}
5780 5859
5781/** 5860/**
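A worked example of sg_capacity_factor(): eight hardware threads on four SMT2 cores have a capacity_orig of roughly 4.6 * 1024, so naive rounding would report five "phantom" cores; dividing out the SMT factor first yields four. Numbers are illustrative:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024U
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d) (((n) + (d) / 2) / (d))

int main(void)
{
        unsigned int cpus = 8;                  /* hardware threads in the group */
        unsigned int capacity_orig = 4712;      /* 8 * 589, i.e. four SMT2 cores */
        unsigned int capacity = 4500;           /* after freq/rt scaling */

        unsigned int naive = DIV_ROUND_CLOSEST(capacity_orig, SCHED_CAPACITY_SCALE);

        /* smt := ceil(cpus * SCALE / capacity_orig), i.e. threads per core */
        unsigned int smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
        unsigned int factor = cpus / smt;       /* actual cores */

        if (factor > DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE))
                factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);

        printf("naive=%u capacity_factor=%u\n", naive, factor); /* naive=5, factor=4 */
        return 0;
}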
@@ -5815,9 +5894,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5815 sgs->idle_cpus++; 5894 sgs->idle_cpus++;
5816 } 5895 }
5817 5896
5818 /* Adjust by relative CPU power of the group */ 5897 /* Adjust by relative CPU capacity of the group */
5819 sgs->group_power = group->sgp->power; 5898 sgs->group_capacity = group->sgc->capacity;
5820 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; 5899 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
5821 5900
5822 if (sgs->sum_nr_running) 5901 if (sgs->sum_nr_running)
5823 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 5902 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
@@ -5825,10 +5904,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5825 sgs->group_weight = group->group_weight; 5904 sgs->group_weight = group->group_weight;
5826 5905
5827 sgs->group_imb = sg_imbalanced(group); 5906 sgs->group_imb = sg_imbalanced(group);
5828 sgs->group_capacity = sg_capacity(env, group); 5907 sgs->group_capacity_factor = sg_capacity_factor(env, group);
5829 5908
5830 if (sgs->group_capacity > sgs->sum_nr_running) 5909 if (sgs->group_capacity_factor > sgs->sum_nr_running)
5831 sgs->group_has_capacity = 1; 5910 sgs->group_has_free_capacity = 1;
5832} 5911}
5833 5912
5834/** 5913/**
@@ -5852,7 +5931,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5852 if (sgs->avg_load <= sds->busiest_stat.avg_load) 5931 if (sgs->avg_load <= sds->busiest_stat.avg_load)
5853 return false; 5932 return false;
5854 5933
5855 if (sgs->sum_nr_running > sgs->group_capacity) 5934 if (sgs->sum_nr_running > sgs->group_capacity_factor)
5856 return true; 5935 return true;
5857 5936
5858 if (sgs->group_imb) 5937 if (sgs->group_imb)
@@ -5932,8 +6011,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5932 sgs = &sds->local_stat; 6011 sgs = &sds->local_stat;
5933 6012
5934 if (env->idle != CPU_NEWLY_IDLE || 6013 if (env->idle != CPU_NEWLY_IDLE ||
5935 time_after_eq(jiffies, sg->sgp->next_update)) 6014 time_after_eq(jiffies, sg->sgc->next_update))
5936 update_group_power(env->sd, env->dst_cpu); 6015 update_group_capacity(env->sd, env->dst_cpu);
5937 } 6016 }
5938 6017
5939 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 6018 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
@@ -5943,17 +6022,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5943 6022
5944 /* 6023 /*
5945 * In case the child domain prefers tasks go to siblings 6024 * In case the child domain prefers tasks go to siblings
5946 * first, lower the sg capacity to one so that we'll try 6025 * first, lower the sg capacity factor to one so that we'll try
5947 * and move all the excess tasks away. We lower the capacity 6026 * and move all the excess tasks away. We lower the capacity
5948 * of a group only if the local group has the capacity to fit 6027 * of a group only if the local group has the capacity to fit
5949 * these excess tasks, i.e. nr_running < group_capacity. The 6028 * these excess tasks, i.e. nr_running < group_capacity_factor. The
5950 * extra check prevents the case where you always pull from the 6029 * extra check prevents the case where you always pull from the
5951 * heaviest group when it is already under-utilized (possible 6030 * heaviest group when it is already under-utilized (possible
5952 * with a large weight task outweighs the tasks on the system). 6031 * with a large weight task outweighs the tasks on the system).
5953 */ 6032 */
5954 if (prefer_sibling && sds->local && 6033 if (prefer_sibling && sds->local &&
5955 sds->local_stat.group_has_capacity) 6034 sds->local_stat.group_has_free_capacity)
5956 sgs->group_capacity = min(sgs->group_capacity, 1U); 6035 sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
5957 6036
5958 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 6037 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
5959 sds->busiest = sg; 6038 sds->busiest = sg;
@@ -5963,7 +6042,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
5963next_group: 6042next_group:
5964 /* Now, start updating sd_lb_stats */ 6043 /* Now, start updating sd_lb_stats */
5965 sds->total_load += sgs->group_load; 6044 sds->total_load += sgs->group_load;
5966 sds->total_pwr += sgs->group_power; 6045 sds->total_capacity += sgs->group_capacity;
5967 6046
5968 sg = sg->next; 6047 sg = sg->next;
5969 } while (sg != env->sd->groups); 6048 } while (sg != env->sd->groups);
@@ -6010,8 +6089,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6010 return 0; 6089 return 0;
6011 6090
6012 env->imbalance = DIV_ROUND_CLOSEST( 6091 env->imbalance = DIV_ROUND_CLOSEST(
6013 sds->busiest_stat.avg_load * sds->busiest_stat.group_power, 6092 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
6014 SCHED_POWER_SCALE); 6093 SCHED_CAPACITY_SCALE);
6015 6094
6016 return 1; 6095 return 1;
6017} 6096}
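For SD_ASYM_PACKING the imbalance is simply the busiest group's average load converted back from capacity-scaled units. A worked example with invented values; the rounding macro is spelled out so the arithmetic is visible.

    #define SCHED_CAPACITY_SCALE    1024UL
    #define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))

    static unsigned long asym_imbalance_example(void)
    {
        unsigned long avg_load = 1536;          /* busiest_stat.avg_load       */
        unsigned long group_capacity = 2048;    /* busiest_stat.group_capacity */

        /* 1536 * 2048 / 1024 == 3072 units of load to move */
        return DIV_ROUND_CLOSEST(avg_load * group_capacity,
                                 SCHED_CAPACITY_SCALE);
    }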
@@ -6026,7 +6105,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6026static inline 6105static inline
6027void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 6106void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6028{ 6107{
6029 unsigned long tmp, pwr_now = 0, pwr_move = 0; 6108 unsigned long tmp, capa_now = 0, capa_move = 0;
6030 unsigned int imbn = 2; 6109 unsigned int imbn = 2;
6031 unsigned long scaled_busy_load_per_task; 6110 unsigned long scaled_busy_load_per_task;
6032 struct sg_lb_stats *local, *busiest; 6111 struct sg_lb_stats *local, *busiest;
@@ -6040,8 +6119,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6040 imbn = 1; 6119 imbn = 1;
6041 6120
6042 scaled_busy_load_per_task = 6121 scaled_busy_load_per_task =
6043 (busiest->load_per_task * SCHED_POWER_SCALE) / 6122 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6044 busiest->group_power; 6123 busiest->group_capacity;
6045 6124
6046 if (busiest->avg_load + scaled_busy_load_per_task >= 6125 if (busiest->avg_load + scaled_busy_load_per_task >=
6047 local->avg_load + (scaled_busy_load_per_task * imbn)) { 6126 local->avg_load + (scaled_busy_load_per_task * imbn)) {
@@ -6051,38 +6130,38 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6051 6130
6052 /* 6131 /*
6053 * OK, we don't have enough imbalance to justify moving tasks, 6132 * OK, we don't have enough imbalance to justify moving tasks,
6054 * however we may be able to increase total CPU power used by 6133 * however we may be able to increase total CPU capacity used by
6055 * moving them. 6134 * moving them.
6056 */ 6135 */
6057 6136
6058 pwr_now += busiest->group_power * 6137 capa_now += busiest->group_capacity *
6059 min(busiest->load_per_task, busiest->avg_load); 6138 min(busiest->load_per_task, busiest->avg_load);
6060 pwr_now += local->group_power * 6139 capa_now += local->group_capacity *
6061 min(local->load_per_task, local->avg_load); 6140 min(local->load_per_task, local->avg_load);
6062 pwr_now /= SCHED_POWER_SCALE; 6141 capa_now /= SCHED_CAPACITY_SCALE;
6063 6142
6064 /* Amount of load we'd subtract */ 6143 /* Amount of load we'd subtract */
6065 if (busiest->avg_load > scaled_busy_load_per_task) { 6144 if (busiest->avg_load > scaled_busy_load_per_task) {
6066 pwr_move += busiest->group_power * 6145 capa_move += busiest->group_capacity *
6067 min(busiest->load_per_task, 6146 min(busiest->load_per_task,
6068 busiest->avg_load - scaled_busy_load_per_task); 6147 busiest->avg_load - scaled_busy_load_per_task);
6069 } 6148 }
6070 6149
6071 /* Amount of load we'd add */ 6150 /* Amount of load we'd add */
6072 if (busiest->avg_load * busiest->group_power < 6151 if (busiest->avg_load * busiest->group_capacity <
6073 busiest->load_per_task * SCHED_POWER_SCALE) { 6152 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
6074 tmp = (busiest->avg_load * busiest->group_power) / 6153 tmp = (busiest->avg_load * busiest->group_capacity) /
6075 local->group_power; 6154 local->group_capacity;
6076 } else { 6155 } else {
6077 tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / 6156 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6078 local->group_power; 6157 local->group_capacity;
6079 } 6158 }
6080 pwr_move += local->group_power * 6159 capa_move += local->group_capacity *
6081 min(local->load_per_task, local->avg_load + tmp); 6160 min(local->load_per_task, local->avg_load + tmp);
6082 pwr_move /= SCHED_POWER_SCALE; 6161 capa_move /= SCHED_CAPACITY_SCALE;
6083 6162
6084 /* Move if we gain throughput */ 6163 /* Move if we gain throughput */
6085 if (pwr_move > pwr_now) 6164 if (capa_move > capa_now)
6086 env->imbalance = busiest->load_per_task; 6165 env->imbalance = busiest->load_per_task;
6087} 6166}
6088 6167
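What the capa_now/capa_move comparison is really asking is whether moving one task raises the amount of capacity doing useful work. Below is a deliberately simplified back-of-the-envelope version of that question, with invented numbers and the scaled_busy_load_per_task corrections the real function applies left out.

    /* One busy CPU (capacity 1024) running two tasks of weight 512,
     * one idle CPU of equal capacity. */
    static int worth_moving_one_task(void)
    {
        unsigned long scale = 1024, busiest_cap = 1024, local_cap = 1024;
        unsigned long load_per_task = 512;

        unsigned long capa_now = (busiest_cap * load_per_task  /* min(512, avg 1024) */
                                + local_cap * 0)               /* min(512, avg 0)    */
                                 / scale;                      /* = 512  */

        unsigned long capa_move = (busiest_cap * load_per_task /* min(512, 512) */
                                 + local_cap * load_per_task)  /* min(512, 512) */
                                  / scale;                     /* = 1024 */

        return capa_move > capa_now;   /* yes: set env->imbalance = load_per_task */
    }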
@@ -6112,7 +6191,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6112 /* 6191 /*
6113 * In the presence of smp nice balancing, certain scenarios can have 6192 * In the presence of smp nice balancing, certain scenarios can have
6114 * max load less than avg load(as we skip the groups at or below 6193 * max load less than avg load(as we skip the groups at or below
6115 * its cpu_power, while calculating max_load..) 6194 * its cpu_capacity, while calculating max_load..)
6116 */ 6195 */
6117 if (busiest->avg_load <= sds->avg_load || 6196 if (busiest->avg_load <= sds->avg_load ||
6118 local->avg_load >= sds->avg_load) { 6197 local->avg_load >= sds->avg_load) {
@@ -6127,10 +6206,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6127 * have to drop below capacity to reach cpu-load equilibrium. 6206 * have to drop below capacity to reach cpu-load equilibrium.
6128 */ 6207 */
6129 load_above_capacity = 6208 load_above_capacity =
6130 (busiest->sum_nr_running - busiest->group_capacity); 6209 (busiest->sum_nr_running - busiest->group_capacity_factor);
6131 6210
6132 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); 6211 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
6133 load_above_capacity /= busiest->group_power; 6212 load_above_capacity /= busiest->group_capacity;
6134 } 6213 }
6135 6214
6136 /* 6215 /*
@@ -6145,9 +6224,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6145 6224
6146 /* How much load to actually move to equalise the imbalance */ 6225 /* How much load to actually move to equalise the imbalance */
6147 env->imbalance = min( 6226 env->imbalance = min(
6148 max_pull * busiest->group_power, 6227 max_pull * busiest->group_capacity,
6149 (sds->avg_load - local->avg_load) * local->group_power 6228 (sds->avg_load - local->avg_load) * local->group_capacity
6150 ) / SCHED_POWER_SCALE; 6229 ) / SCHED_CAPACITY_SCALE;
6151 6230
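Both operands of the min() above are load-times-capacity products, so the final division by SCHED_CAPACITY_SCALE brings the imbalance back to plain load units. A small sketch with invented numbers:

    #define SCHED_CAPACITY_SCALE 1024UL

    static unsigned long imbalance_example(void)
    {
        unsigned long max_pull = 256;           /* load the busiest group can shed */
        unsigned long busiest_capacity = 2048, local_capacity = 1024;
        unsigned long sds_avg_load = 900, local_avg_load = 500;

        unsigned long a = max_pull * busiest_capacity;                       /* 524288 */
        unsigned long b = (sds_avg_load - local_avg_load) * local_capacity;  /* 409600 */

        return (a < b ? a : b) / SCHED_CAPACITY_SCALE;                       /* 400    */
    }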
6152 /* 6231 /*
6153 * if *imbalance is less than the average load per runnable task 6232 * if *imbalance is less than the average load per runnable task
@@ -6201,7 +6280,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6201 if (!sds.busiest || busiest->sum_nr_running == 0) 6280 if (!sds.busiest || busiest->sum_nr_running == 0)
6202 goto out_balanced; 6281 goto out_balanced;
6203 6282
6204 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; 6283 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
6284 / sds.total_capacity;
6205 6285
6206 /* 6286 /*
6207 * If the busiest group is imbalanced the below checks don't 6287 * If the busiest group is imbalanced the below checks don't
@@ -6212,8 +6292,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6212 goto force_balance; 6292 goto force_balance;
6213 6293
6214 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 6294 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
6215 if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && 6295 if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
6216 !busiest->group_has_capacity) 6296 !busiest->group_has_free_capacity)
6217 goto force_balance; 6297 goto force_balance;
6218 6298
6219 /* 6299 /*
@@ -6267,11 +6347,11 @@ static struct rq *find_busiest_queue(struct lb_env *env,
6267 struct sched_group *group) 6347 struct sched_group *group)
6268{ 6348{
6269 struct rq *busiest = NULL, *rq; 6349 struct rq *busiest = NULL, *rq;
6270 unsigned long busiest_load = 0, busiest_power = 1; 6350 unsigned long busiest_load = 0, busiest_capacity = 1;
6271 int i; 6351 int i;
6272 6352
6273 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 6353 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6274 unsigned long power, capacity, wl; 6354 unsigned long capacity, capacity_factor, wl;
6275 enum fbq_type rt; 6355 enum fbq_type rt;
6276 6356
6277 rq = cpu_rq(i); 6357 rq = cpu_rq(i);
@@ -6299,34 +6379,34 @@ static struct rq *find_busiest_queue(struct lb_env *env,
6299 if (rt > env->fbq_type) 6379 if (rt > env->fbq_type)
6300 continue; 6380 continue;
6301 6381
6302 power = power_of(i); 6382 capacity = capacity_of(i);
6303 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); 6383 capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
6304 if (!capacity) 6384 if (!capacity_factor)
6305 capacity = fix_small_capacity(env->sd, group); 6385 capacity_factor = fix_small_capacity(env->sd, group);
6306 6386
6307 wl = weighted_cpuload(i); 6387 wl = weighted_cpuload(i);
6308 6388
6309 /* 6389 /*
6310 * When comparing with imbalance, use weighted_cpuload() 6390 * When comparing with imbalance, use weighted_cpuload()
6311 * which is not scaled with the cpu power. 6391 * which is not scaled with the cpu capacity.
6312 */ 6392 */
6313 if (capacity && rq->nr_running == 1 && wl > env->imbalance) 6393 if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
6314 continue; 6394 continue;
6315 6395
6316 /* 6396 /*
6317 * For the load comparisons with the other cpu's, consider 6397 * For the load comparisons with the other cpu's, consider
6318 * the weighted_cpuload() scaled with the cpu power, so that 6398 * the weighted_cpuload() scaled with the cpu capacity, so
6319 * the load can be moved away from the cpu that is potentially 6399 * that the load can be moved away from the cpu that is
6320 * running at a lower capacity. 6400 * potentially running at a lower capacity.
6321 * 6401 *
6322 * Thus we're looking for max(wl_i / power_i), crosswise 6402 * Thus we're looking for max(wl_i / capacity_i), crosswise
6323 * multiplication to rid ourselves of the division works out 6403 * multiplication to rid ourselves of the division works out
6324 * to: wl_i * power_j > wl_j * power_i; where j is our 6404 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
6325 * previous maximum. 6405 * our previous maximum.
6326 */ 6406 */
6327 if (wl * busiest_power > busiest_load * power) { 6407 if (wl * busiest_capacity > busiest_load * capacity) {
6328 busiest_load = wl; 6408 busiest_load = wl;
6329 busiest_power = power; 6409 busiest_capacity = capacity;
6330 busiest = rq; 6410 busiest = rq;
6331 } 6411 }
6332 } 6412 }
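The comment above is the usual trick for comparing two ratios without dividing: wl_i/capacity_i > wl_j/capacity_j is rewritten as wl_i*capacity_j > wl_j*capacity_i. A minimal illustration, with the types simplified:

    struct rq_candidate { unsigned long wl, capacity; };

    /* true if a has the larger load-to-capacity ratio, division-free */
    static int busier_than(struct rq_candidate a, struct rq_candidate b)
    {
        return a.wl * b.capacity > b.wl * a.capacity;
    }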
@@ -6534,7 +6614,7 @@ more_balance:
6534 * We failed to reach balance because of affinity. 6614 * We failed to reach balance because of affinity.
6535 */ 6615 */
6536 if (sd_parent) { 6616 if (sd_parent) {
6537 int *group_imbalance = &sd_parent->groups->sgp->imbalance; 6617 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6538 6618
6539 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6619 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6540 *group_imbalance = 1; 6620 *group_imbalance = 1;
@@ -6640,27 +6720,62 @@ out:
6640 return ld_moved; 6720 return ld_moved;
6641} 6721}
6642 6722
6723static inline unsigned long
6724get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
6725{
6726 unsigned long interval = sd->balance_interval;
6727
6728 if (cpu_busy)
6729 interval *= sd->busy_factor;
6730
6731 /* scale ms to jiffies */
6732 interval = msecs_to_jiffies(interval);
6733 interval = clamp(interval, 1UL, max_load_balance_interval);
6734
6735 return interval;
6736}
6737
6738static inline void
6739update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
6740{
6741 unsigned long interval, next;
6742
6743 interval = get_sd_balance_interval(sd, cpu_busy);
6744 next = sd->last_balance + interval;
6745
6746 if (time_after(*next_balance, next))
6747 *next_balance = next;
6748}
6749
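These two helpers centralize interval arithmetic that rebalance_domains() used to open-code further down: scale balance_interval by busy_factor when the CPU is busy, convert milliseconds to jiffies, clamp, and pull next_balance earlier when this domain is due sooner. A standalone model of that arithmetic; the HZ value and the clamp ceiling are invented stand-ins for msecs_to_jiffies() and max_load_balance_interval.

    #include <stdio.h>

    static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
    {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    int main(void)
    {
        unsigned long hz = 250, max_interval = 250;    /* illustrative only */
        unsigned long balance_interval_ms = 64, busy_factor = 32;
        int cpu_busy = 1;

        unsigned long ms = balance_interval_ms * (cpu_busy ? busy_factor : 1);
        unsigned long interval = clamp_ul(ms * hz / 1000, 1, max_interval);

        printf("interval = %lu jiffies\n", interval);  /* 2048ms -> 512, clamped to 250 */
        return 0;
    }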
6643/* 6750/*
6644 * idle_balance is called by schedule() if this_cpu is about to become 6751 * idle_balance is called by schedule() if this_cpu is about to become
6645 * idle. Attempts to pull tasks from other CPUs. 6752 * idle. Attempts to pull tasks from other CPUs.
6646 */ 6753 */
6647static int idle_balance(struct rq *this_rq) 6754static int idle_balance(struct rq *this_rq)
6648{ 6755{
6756 unsigned long next_balance = jiffies + HZ;
6757 int this_cpu = this_rq->cpu;
6649 struct sched_domain *sd; 6758 struct sched_domain *sd;
6650 int pulled_task = 0; 6759 int pulled_task = 0;
6651 unsigned long next_balance = jiffies + HZ;
6652 u64 curr_cost = 0; 6760 u64 curr_cost = 0;
6653 int this_cpu = this_rq->cpu;
6654 6761
6655 idle_enter_fair(this_rq); 6762 idle_enter_fair(this_rq);
6763
6656 /* 6764 /*
6657 * We must set idle_stamp _before_ calling idle_balance(), such that we 6765 * We must set idle_stamp _before_ calling idle_balance(), such that we
6658 * measure the duration of idle_balance() as idle time. 6766 * measure the duration of idle_balance() as idle time.
6659 */ 6767 */
6660 this_rq->idle_stamp = rq_clock(this_rq); 6768 this_rq->idle_stamp = rq_clock(this_rq);
6661 6769
6662 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6770 if (this_rq->avg_idle < sysctl_sched_migration_cost) {
6771 rcu_read_lock();
6772 sd = rcu_dereference_check_sched_domain(this_rq->sd);
6773 if (sd)
6774 update_next_balance(sd, 0, &next_balance);
6775 rcu_read_unlock();
6776
6663 goto out; 6777 goto out;
6778 }
6664 6779
6665 /* 6780 /*
6666 * Drop the rq->lock, but keep IRQ/preempt disabled. 6781 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6670,20 +6785,20 @@ static int idle_balance(struct rq *this_rq)
6670 update_blocked_averages(this_cpu); 6785 update_blocked_averages(this_cpu);
6671 rcu_read_lock(); 6786 rcu_read_lock();
6672 for_each_domain(this_cpu, sd) { 6787 for_each_domain(this_cpu, sd) {
6673 unsigned long interval;
6674 int continue_balancing = 1; 6788 int continue_balancing = 1;
6675 u64 t0, domain_cost; 6789 u64 t0, domain_cost;
6676 6790
6677 if (!(sd->flags & SD_LOAD_BALANCE)) 6791 if (!(sd->flags & SD_LOAD_BALANCE))
6678 continue; 6792 continue;
6679 6793
6680 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) 6794 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
6795 update_next_balance(sd, 0, &next_balance);
6681 break; 6796 break;
6797 }
6682 6798
6683 if (sd->flags & SD_BALANCE_NEWIDLE) { 6799 if (sd->flags & SD_BALANCE_NEWIDLE) {
6684 t0 = sched_clock_cpu(this_cpu); 6800 t0 = sched_clock_cpu(this_cpu);
6685 6801
6686 /* If we've pulled tasks over stop searching: */
6687 pulled_task = load_balance(this_cpu, this_rq, 6802 pulled_task = load_balance(this_cpu, this_rq,
6688 sd, CPU_NEWLY_IDLE, 6803 sd, CPU_NEWLY_IDLE,
6689 &continue_balancing); 6804 &continue_balancing);
@@ -6695,42 +6810,37 @@ static int idle_balance(struct rq *this_rq)
6695 curr_cost += domain_cost; 6810 curr_cost += domain_cost;
6696 } 6811 }
6697 6812
6698 interval = msecs_to_jiffies(sd->balance_interval); 6813 update_next_balance(sd, 0, &next_balance);
6699 if (time_after(next_balance, sd->last_balance + interval)) 6814
6700 next_balance = sd->last_balance + interval; 6815 /*
6701 if (pulled_task) 6816 * Stop searching for tasks to pull if there are
6817 * now runnable tasks on this rq.
6818 */
6819 if (pulled_task || this_rq->nr_running > 0)
6702 break; 6820 break;
6703 } 6821 }
6704 rcu_read_unlock(); 6822 rcu_read_unlock();
6705 6823
6706 raw_spin_lock(&this_rq->lock); 6824 raw_spin_lock(&this_rq->lock);
6707 6825
6826 if (curr_cost > this_rq->max_idle_balance_cost)
6827 this_rq->max_idle_balance_cost = curr_cost;
6828
6708 /* 6829 /*
6709 * While browsing the domains, we released the rq lock. 6830 * While browsing the domains, we released the rq lock, a task could
6710 * A task could have be enqueued in the meantime 6831 * have been enqueued in the meantime. Since we're not going idle,
6832 * pretend we pulled a task.
6711 */ 6833 */
6712 if (this_rq->cfs.h_nr_running && !pulled_task) { 6834 if (this_rq->cfs.h_nr_running && !pulled_task)
6713 pulled_task = 1; 6835 pulled_task = 1;
6714 goto out;
6715 }
6716 6836
6717 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6837out:
6718 /* 6838 /* Move the next balance forward */
6719 * We are going idle. next_balance may be set based on 6839 if (time_after(this_rq->next_balance, next_balance))
6720 * a busy processor. So reset next_balance.
6721 */
6722 this_rq->next_balance = next_balance; 6840 this_rq->next_balance = next_balance;
6723 }
6724 6841
6725 if (curr_cost > this_rq->max_idle_balance_cost)
6726 this_rq->max_idle_balance_cost = curr_cost;
6727
6728out:
6729 /* Is there a task of a high priority class? */ 6842 /* Is there a task of a high priority class? */
6730 if (this_rq->nr_running != this_rq->cfs.h_nr_running && 6843 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
6731 ((this_rq->stop && this_rq->stop->on_rq) ||
6732 this_rq->dl.dl_nr_running ||
6733 (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
6734 pulled_task = -1; 6844 pulled_task = -1;
6735 6845
6736 if (pulled_task) { 6846 if (pulled_task) {
@@ -6891,7 +7001,7 @@ static inline void set_cpu_sd_state_busy(void)
6891 goto unlock; 7001 goto unlock;
6892 sd->nohz_idle = 0; 7002 sd->nohz_idle = 0;
6893 7003
6894 atomic_inc(&sd->groups->sgp->nr_busy_cpus); 7004 atomic_inc(&sd->groups->sgc->nr_busy_cpus);
6895unlock: 7005unlock:
6896 rcu_read_unlock(); 7006 rcu_read_unlock();
6897} 7007}
@@ -6908,7 +7018,7 @@ void set_cpu_sd_state_idle(void)
6908 goto unlock; 7018 goto unlock;
6909 sd->nohz_idle = 1; 7019 sd->nohz_idle = 1;
6910 7020
6911 atomic_dec(&sd->groups->sgp->nr_busy_cpus); 7021 atomic_dec(&sd->groups->sgc->nr_busy_cpus);
6912unlock: 7022unlock:
6913 rcu_read_unlock(); 7023 rcu_read_unlock();
6914} 7024}
@@ -7011,16 +7121,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7011 break; 7121 break;
7012 } 7122 }
7013 7123
7014 interval = sd->balance_interval; 7124 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7015 if (idle != CPU_IDLE)
7016 interval *= sd->busy_factor;
7017
7018 /* scale ms to jiffies */
7019 interval = msecs_to_jiffies(interval);
7020 interval = clamp(interval, 1UL, max_load_balance_interval);
7021 7125
7022 need_serialize = sd->flags & SD_SERIALIZE; 7126 need_serialize = sd->flags & SD_SERIALIZE;
7023
7024 if (need_serialize) { 7127 if (need_serialize) {
7025 if (!spin_trylock(&balancing)) 7128 if (!spin_trylock(&balancing))
7026 goto out; 7129 goto out;
@@ -7036,6 +7139,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7036 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; 7139 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7037 } 7140 }
7038 sd->last_balance = jiffies; 7141 sd->last_balance = jiffies;
7142 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7039 } 7143 }
7040 if (need_serialize) 7144 if (need_serialize)
7041 spin_unlock(&balancing); 7145 spin_unlock(&balancing);
@@ -7093,12 +7197,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7093 7197
7094 rq = cpu_rq(balance_cpu); 7198 rq = cpu_rq(balance_cpu);
7095 7199
7096 raw_spin_lock_irq(&rq->lock); 7200 /*
7097 update_rq_clock(rq); 7201 * If time for next balance is due,
7098 update_idle_cpu_load(rq); 7202 * do the balance.
7099 raw_spin_unlock_irq(&rq->lock); 7203 */
7100 7204 if (time_after_eq(jiffies, rq->next_balance)) {
7101 rebalance_domains(rq, CPU_IDLE); 7205 raw_spin_lock_irq(&rq->lock);
7206 update_rq_clock(rq);
7207 update_idle_cpu_load(rq);
7208 raw_spin_unlock_irq(&rq->lock);
7209 rebalance_domains(rq, CPU_IDLE);
7210 }
7102 7211
7103 if (time_after(this_rq->next_balance, rq->next_balance)) 7212 if (time_after(this_rq->next_balance, rq->next_balance))
7104 this_rq->next_balance = rq->next_balance; 7213 this_rq->next_balance = rq->next_balance;
@@ -7113,7 +7222,7 @@ end:
7113 * of an idle cpu is the system. 7222 * of an idle cpu is the system.
7114 * - This rq has more than one task. 7223 * - This rq has more than one task.
7115 * - At any scheduler domain level, this cpu's scheduler group has multiple 7224 * - At any scheduler domain level, this cpu's scheduler group has multiple
7116 * busy cpu's exceeding the group's power. 7225 * busy cpu's exceeding the group's capacity.
7117 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 7226 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
7118 * domain span are idle. 7227 * domain span are idle.
7119 */ 7228 */
@@ -7121,7 +7230,7 @@ static inline int nohz_kick_needed(struct rq *rq)
7121{ 7230{
7122 unsigned long now = jiffies; 7231 unsigned long now = jiffies;
7123 struct sched_domain *sd; 7232 struct sched_domain *sd;
7124 struct sched_group_power *sgp; 7233 struct sched_group_capacity *sgc;
7125 int nr_busy, cpu = rq->cpu; 7234 int nr_busy, cpu = rq->cpu;
7126 7235
7127 if (unlikely(rq->idle_balance)) 7236 if (unlikely(rq->idle_balance))
@@ -7151,8 +7260,8 @@ static inline int nohz_kick_needed(struct rq *rq)
7151 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 7260 sd = rcu_dereference(per_cpu(sd_busy, cpu));
7152 7261
7153 if (sd) { 7262 if (sd) {
7154 sgp = sd->groups->sgp; 7263 sgc = sd->groups->sgc;
7155 nr_busy = atomic_read(&sgp->nr_busy_cpus); 7264 nr_busy = atomic_read(&sgc->nr_busy_cpus);
7156 7265
7157 if (nr_busy > 1) 7266 if (nr_busy > 1)
7158 goto need_kick_unlock; 7267 goto need_kick_unlock;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 5716929a2e3a..90284d117fe6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
37SCHED_FEAT(WAKEUP_PREEMPTION, true) 37SCHED_FEAT(WAKEUP_PREEMPTION, true)
38 38
39/* 39/*
40 * Use arch dependent cpu power functions 40 * Use arch dependent cpu capacity functions
41 */ 41 */
42SCHED_FEAT(ARCH_POWER, true) 42SCHED_FEAT(ARCH_CAPACITY, true)
43 43
44SCHED_FEAT(HRTICK, false) 44SCHED_FEAT(HRTICK, false)
45SCHED_FEAT(DOUBLE_TICK, false) 45SCHED_FEAT(DOUBLE_TICK, false)
46SCHED_FEAT(LB_BIAS, true) 46SCHED_FEAT(LB_BIAS, true)
47 47
48/* 48/*
49 * Decrement CPU power based on time not spent running tasks 49 * Decrement CPU capacity based on time not spent running tasks
50 */ 50 */
51SCHED_FEAT(NONTASK_POWER, true) 51SCHED_FEAT(NONTASK_CAPACITY, true)
52 52
53/* 53/*
54 * Queue remote wakeups on the target CPU and process them 54 * Queue remote wakeups on the target CPU and process them
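The renamed bits are consumed the same way as before, through sched_feat(); a hedged sketch of the call-site pattern, where both helper names are placeholders rather than real kernel symbols:

    if (sched_feat(ARCH_CAPACITY))
        use_arch_cpu_capacity_hook();      /* placeholder: arch-provided scaling */
    else
        use_default_capacity_scaling();    /* placeholder: generic scaling       */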
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f4390a079c7..cf009fb0bc25 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -12,6 +12,8 @@
12 12
13#include <trace/events/power.h> 13#include <trace/events/power.h>
14 14
15#include "sched.h"
16
15static int __read_mostly cpu_idle_force_poll; 17static int __read_mostly cpu_idle_force_poll;
16 18
17void cpu_idle_poll_ctrl(bool enable) 19void cpu_idle_poll_ctrl(bool enable)
@@ -67,24 +69,25 @@ void __weak arch_cpu_idle(void)
67 * cpuidle_idle_call - the main idle function 69 * cpuidle_idle_call - the main idle function
68 * 70 *
69 * NOTE: no locks or semaphores should be used here 71 * NOTE: no locks or semaphores should be used here
70 * return non-zero on failure 72 *
73 * On archs that support TIF_POLLING_NRFLAG, is called with polling
74 * set, and it returns with polling set. If it ever stops polling, it
75 * must clear the polling bit.
71 */ 76 */
72static int cpuidle_idle_call(void) 77static void cpuidle_idle_call(void)
73{ 78{
74 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); 79 struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
75 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); 80 struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
76 int next_state, entered_state, ret; 81 int next_state, entered_state;
77 bool broadcast; 82 bool broadcast;
78 83
79 /* 84 /*
80 * Check if the idle task must be rescheduled. If it is the 85 * Check if the idle task must be rescheduled. If it is the
81 * case, exit the function after re-enabling the local irq and 86 * case, exit the function after re-enabling the local irq.
82 * set again the polling flag
83 */ 87 */
84 if (current_clr_polling_and_test()) { 88 if (need_resched()) {
85 local_irq_enable(); 89 local_irq_enable();
86 __current_set_polling(); 90 return;
87 return 0;
88 } 91 }
89 92
90 /* 93 /*
@@ -101,104 +104,99 @@ static int cpuidle_idle_call(void)
101 rcu_idle_enter(); 104 rcu_idle_enter();
102 105
103 /* 106 /*
104 * Check if the cpuidle framework is ready, otherwise fallback 107 * Ask the cpuidle framework to choose a convenient idle state.
105 * to the default arch specific idle method 108 * Fall back to the default arch idle method on errors.
106 */ 109 */
107 ret = cpuidle_enabled(drv, dev); 110 next_state = cpuidle_select(drv, dev);
108 111 if (next_state < 0) {
109 if (!ret) { 112use_default:
110 /* 113 /*
111 * Ask the governor to choose an idle state it thinks 114 * We can't use the cpuidle framework, let's use the default
112 * it is convenient to go to. There is *always* a 115 * idle routine.
113 * convenient idle state
114 */ 116 */
115 next_state = cpuidle_select(drv, dev); 117 if (current_clr_polling_and_test())
116
117 /*
118 * The idle task must be scheduled, it is pointless to
119 * go to idle, just update no idle residency and get
120 * out of this function
121 */
122 if (current_clr_polling_and_test()) {
123 dev->last_residency = 0;
124 entered_state = next_state;
125 local_irq_enable(); 118 local_irq_enable();
126 } else { 119 else
127 broadcast = !!(drv->states[next_state].flags & 120 arch_cpu_idle();
128 CPUIDLE_FLAG_TIMER_STOP); 121
129 122 goto exit_idle;
130 if (broadcast)
131 /*
132 * Tell the time framework to switch
133 * to a broadcast timer because our
134 * local timer will be shutdown. If a
135 * local timer is used from another
136 * cpu as a broadcast timer, this call
137 * may fail if it is not available
138 */
139 ret = clockevents_notify(
140 CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
141 &dev->cpu);
142
143 if (!ret) {
144 trace_cpu_idle_rcuidle(next_state, dev->cpu);
145
146 /*
147 * Enter the idle state previously
148 * returned by the governor
149 * decision. This function will block
150 * until an interrupt occurs and will
151 * take care of re-enabling the local
152 * interrupts
153 */
154 entered_state = cpuidle_enter(drv, dev,
155 next_state);
156
157 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
158 dev->cpu);
159
160 if (broadcast)
161 clockevents_notify(
162 CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
163 &dev->cpu);
164
165 /*
166 * Give the governor an opportunity to reflect on the
167 * outcome
168 */
169 cpuidle_reflect(dev, entered_state);
170 }
171 }
172 } 123 }
173 124
125
126 /*
127 * The idle task must be scheduled, it is pointless to
128 * go to idle, just update no idle residency and get
129 * out of this function
130 */
131 if (current_clr_polling_and_test()) {
132 dev->last_residency = 0;
133 entered_state = next_state;
134 local_irq_enable();
135 goto exit_idle;
136 }
137
138 broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
139
174 /* 140 /*
175 * We can't use the cpuidle framework, let's use the default 141 * Tell the time framework to switch to a broadcast timer
176 * idle routine 142 * because our local timer will be shutdown. If a local timer
143 * is used from another cpu as a broadcast timer, this call may
144 * fail if it is not available
177 */ 145 */
178 if (ret) 146 if (broadcast &&
179 arch_cpu_idle(); 147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
148 goto use_default;
180 149
150 trace_cpu_idle_rcuidle(next_state, dev->cpu);
151
152 /*
153 * Enter the idle state previously returned by the governor decision.
154 * This function will block until an interrupt occurs and will take
155 * care of re-enabling the local interrupts
156 */
157 entered_state = cpuidle_enter(drv, dev, next_state);
158
159 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
160
161 if (broadcast)
162 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
163
164 /*
165 * Give the governor an opportunity to reflect on the outcome
166 */
167 cpuidle_reflect(dev, entered_state);
168
169exit_idle:
181 __current_set_polling(); 170 __current_set_polling();
182 171
183 /* 172 /*
184 * It is up to the idle functions to enable back the local 173 * It is up to the idle functions to reenable local interrupts
185 * interrupt
186 */ 174 */
187 if (WARN_ON_ONCE(irqs_disabled())) 175 if (WARN_ON_ONCE(irqs_disabled()))
188 local_irq_enable(); 176 local_irq_enable();
189 177
190 rcu_idle_exit(); 178 rcu_idle_exit();
191 start_critical_timings(); 179 start_critical_timings();
192
193 return 0;
194} 180}
195 181
196/* 182/*
197 * Generic idle loop implementation 183 * Generic idle loop implementation
184 *
185 * Called with polling cleared.
198 */ 186 */
199static void cpu_idle_loop(void) 187static void cpu_idle_loop(void)
200{ 188{
201 while (1) { 189 while (1) {
190 /*
191 * If the arch has a polling bit, we maintain an invariant:
192 *
193 * Our polling bit is clear if we're not scheduled (i.e. if
194 * rq->curr != rq->idle). This means that, if rq->idle has
195 * the polling bit set, then setting need_resched is
196 * guaranteed to cause the cpu to reschedule.
197 */
198
199 __current_set_polling();
202 tick_nohz_idle_enter(); 200 tick_nohz_idle_enter();
203 201
204 while (!need_resched()) { 202 while (!need_resched()) {
@@ -238,6 +236,17 @@ static void cpu_idle_loop(void)
238 */ 236 */
239 preempt_set_need_resched(); 237 preempt_set_need_resched();
240 tick_nohz_idle_exit(); 238 tick_nohz_idle_exit();
239 __current_clr_polling();
240
241 /*
242 * We promise to call sched_ttwu_pending and reschedule
243 * if need_resched is set while polling is set. That
244 * means that clearing polling needs to be visible
245 * before doing these things.
246 */
247 smp_mb__after_atomic();
248
249 sched_ttwu_pending();
241 schedule_preempt_disabled(); 250 schedule_preempt_disabled();
242 } 251 }
243} 252}
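The sequence added to the loop tail encodes an ordering requirement: the cleared polling bit must be globally visible before this CPU scans for remotely queued wakeups, otherwise a waker that still observed polling could skip its IPI while the idle CPU misses the freshly queued task. A sketch of the two sides of that protocol; the idle side is taken from the hunk above, the waker side is a paraphrase rather than a copy of core.c.

    /* idle CPU, tail of cpu_idle_loop() */
    __current_clr_polling();
    smp_mb__after_atomic();       /* publish the clear before the check below   */
    sched_ttwu_pending();         /* drain wakeups queued while we were polling */
    schedule_preempt_disabled();

    /* waking CPU (paraphrase): queue the task on the remote wake list, then
     * either rely on the still-set polling bit plus set_tsk_need_resched(),
     * or send the reschedule IPI if polling is already clear. */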
@@ -259,7 +268,6 @@ void cpu_startup_entry(enum cpuhp_state state)
259 */ 268 */
260 boot_init_stack_canary(); 269 boot_init_stack_canary();
261#endif 270#endif
262 __current_set_polling();
263 arch_cpu_idle_prepare(); 271 arch_cpu_idle_prepare();
264 cpu_idle_loop(); 272 cpu_idle_loop();
265} 273}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index bd2267ad404f..a49083192c64 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -79,6 +79,8 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
79 rt_rq->overloaded = 0; 79 rt_rq->overloaded = 0;
80 plist_head_init(&rt_rq->pushable_tasks); 80 plist_head_init(&rt_rq->pushable_tasks);
81#endif 81#endif
82 /* We start in dequeued state, because no RT tasks are queued */
83 rt_rq->rt_queued = 0;
82 84
83 rt_rq->rt_time = 0; 85 rt_rq->rt_time = 0;
84 rt_rq->rt_throttled = 0; 86 rt_rq->rt_throttled = 0;
@@ -112,6 +114,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
112 return rt_se->rt_rq; 114 return rt_se->rt_rq;
113} 115}
114 116
117static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
118{
119 struct rt_rq *rt_rq = rt_se->rt_rq;
120
121 return rt_rq->rq;
122}
123
115void free_rt_sched_group(struct task_group *tg) 124void free_rt_sched_group(struct task_group *tg)
116{ 125{
117 int i; 126 int i;
@@ -211,10 +220,16 @@ static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
211 return container_of(rt_rq, struct rq, rt); 220 return container_of(rt_rq, struct rq, rt);
212} 221}
213 222
214static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) 223static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
215{ 224{
216 struct task_struct *p = rt_task_of(rt_se); 225 struct task_struct *p = rt_task_of(rt_se);
217 struct rq *rq = task_rq(p); 226
227 return task_rq(p);
228}
229
230static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
231{
232 struct rq *rq = rq_of_rt_se(rt_se);
218 233
219 return &rq->rt; 234 return &rq->rt;
220} 235}
@@ -391,6 +406,9 @@ static inline void set_post_schedule(struct rq *rq)
391} 406}
392#endif /* CONFIG_SMP */ 407#endif /* CONFIG_SMP */
393 408
409static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
410static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
411
394static inline int on_rt_rq(struct sched_rt_entity *rt_se) 412static inline int on_rt_rq(struct sched_rt_entity *rt_se)
395{ 413{
396 return !list_empty(&rt_se->run_list); 414 return !list_empty(&rt_se->run_list);
@@ -452,8 +470,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
452 rt_se = rt_rq->tg->rt_se[cpu]; 470 rt_se = rt_rq->tg->rt_se[cpu];
453 471
454 if (rt_rq->rt_nr_running) { 472 if (rt_rq->rt_nr_running) {
455 if (rt_se && !on_rt_rq(rt_se)) 473 if (!rt_se)
474 enqueue_top_rt_rq(rt_rq);
475 else if (!on_rt_rq(rt_se))
456 enqueue_rt_entity(rt_se, false); 476 enqueue_rt_entity(rt_se, false);
477
457 if (rt_rq->highest_prio.curr < curr->prio) 478 if (rt_rq->highest_prio.curr < curr->prio)
458 resched_task(curr); 479 resched_task(curr);
459 } 480 }
@@ -466,10 +487,17 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
466 487
467 rt_se = rt_rq->tg->rt_se[cpu]; 488 rt_se = rt_rq->tg->rt_se[cpu];
468 489
469 if (rt_se && on_rt_rq(rt_se)) 490 if (!rt_se)
491 dequeue_top_rt_rq(rt_rq);
492 else if (on_rt_rq(rt_se))
470 dequeue_rt_entity(rt_se); 493 dequeue_rt_entity(rt_se);
471} 494}
472 495
496static inline int rt_rq_throttled(struct rt_rq *rt_rq)
497{
498 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
499}
500
473static int rt_se_boosted(struct sched_rt_entity *rt_se) 501static int rt_se_boosted(struct sched_rt_entity *rt_se)
474{ 502{
475 struct rt_rq *rt_rq = group_rt_rq(rt_se); 503 struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -532,12 +560,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
532 560
533static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 561static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
534{ 562{
535 if (rt_rq->rt_nr_running) 563 struct rq *rq = rq_of_rt_rq(rt_rq);
536 resched_task(rq_of_rt_rq(rt_rq)->curr); 564
565 if (!rt_rq->rt_nr_running)
566 return;
567
568 enqueue_top_rt_rq(rt_rq);
569 resched_task(rq->curr);
537} 570}
538 571
539static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 572static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
540{ 573{
574 dequeue_top_rt_rq(rt_rq);
575}
576
577static inline int rt_rq_throttled(struct rt_rq *rt_rq)
578{
579 return rt_rq->rt_throttled;
541} 580}
542 581
543static inline const struct cpumask *sched_rt_period_mask(void) 582static inline const struct cpumask *sched_rt_period_mask(void)
@@ -851,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
851 * but accrue some time due to boosting. 890 * but accrue some time due to boosting.
852 */ 891 */
853 if (likely(rt_b->rt_runtime)) { 892 if (likely(rt_b->rt_runtime)) {
854 static bool once = false;
855
856 rt_rq->rt_throttled = 1; 893 rt_rq->rt_throttled = 1;
857 894 printk_deferred_once("sched: RT throttling activated\n");
858 if (!once) {
859 once = true;
860 printk_sched("sched: RT throttling activated\n");
861 }
862 } else { 895 } else {
863 /* 896 /*
864 * In case we did anyway, make it go away, 897 * In case we did anyway, make it go away,
@@ -885,7 +918,6 @@ static void update_curr_rt(struct rq *rq)
885{ 918{
886 struct task_struct *curr = rq->curr; 919 struct task_struct *curr = rq->curr;
887 struct sched_rt_entity *rt_se = &curr->rt; 920 struct sched_rt_entity *rt_se = &curr->rt;
888 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
889 u64 delta_exec; 921 u64 delta_exec;
890 922
891 if (curr->sched_class != &rt_sched_class) 923 if (curr->sched_class != &rt_sched_class)
@@ -910,7 +942,7 @@ static void update_curr_rt(struct rq *rq)
910 return; 942 return;
911 943
912 for_each_sched_rt_entity(rt_se) { 944 for_each_sched_rt_entity(rt_se) {
913 rt_rq = rt_rq_of_se(rt_se); 945 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
914 946
915 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 947 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
916 raw_spin_lock(&rt_rq->rt_runtime_lock); 948 raw_spin_lock(&rt_rq->rt_runtime_lock);
@@ -922,6 +954,38 @@ static void update_curr_rt(struct rq *rq)
922 } 954 }
923} 955}
924 956
957static void
958dequeue_top_rt_rq(struct rt_rq *rt_rq)
959{
960 struct rq *rq = rq_of_rt_rq(rt_rq);
961
962 BUG_ON(&rq->rt != rt_rq);
963
964 if (!rt_rq->rt_queued)
965 return;
966
967 BUG_ON(!rq->nr_running);
968
969 sub_nr_running(rq, rt_rq->rt_nr_running);
970 rt_rq->rt_queued = 0;
971}
972
973static void
974enqueue_top_rt_rq(struct rt_rq *rt_rq)
975{
976 struct rq *rq = rq_of_rt_rq(rt_rq);
977
978 BUG_ON(&rq->rt != rt_rq);
979
980 if (rt_rq->rt_queued)
981 return;
982 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
983 return;
984
985 add_nr_running(rq, rt_rq->rt_nr_running);
986 rt_rq->rt_queued = 1;
987}
988
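With rt_queued in place, RT tasks reach rq->nr_running only through this helper pair, and they do so in bulk via add_nr_running()/sub_nr_running() rather than one increment per task. A toy model of the gating, with all structures simplified stand-ins for the real ones:

    struct toy_rt_rq { unsigned int rt_nr_running; int rt_queued, rt_throttled; };
    struct toy_rq    { unsigned int nr_running; struct toy_rt_rq rt; };

    static void toy_enqueue_top_rt_rq(struct toy_rq *rq)
    {
        struct toy_rt_rq *rt_rq = &rq->rt;

        if (rt_rq->rt_queued || rt_rq->rt_throttled || !rt_rq->rt_nr_running)
            return;
        rq->nr_running += rt_rq->rt_nr_running;   /* add_nr_running() above */
        rt_rq->rt_queued = 1;
    }

    static void toy_dequeue_top_rt_rq(struct toy_rq *rq)
    {
        struct toy_rt_rq *rt_rq = &rq->rt;

        if (!rt_rq->rt_queued)
            return;
        rq->nr_running -= rt_rq->rt_nr_running;   /* sub_nr_running() above */
        rt_rq->rt_queued = 0;
    }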
925#if defined CONFIG_SMP 989#if defined CONFIG_SMP
926 990
927static void 991static void
@@ -1045,12 +1109,23 @@ void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1045#endif /* CONFIG_RT_GROUP_SCHED */ 1109#endif /* CONFIG_RT_GROUP_SCHED */
1046 1110
1047static inline 1111static inline
1112unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1113{
1114 struct rt_rq *group_rq = group_rt_rq(rt_se);
1115
1116 if (group_rq)
1117 return group_rq->rt_nr_running;
1118 else
1119 return 1;
1120}
1121
1122static inline
1048void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 1123void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1049{ 1124{
1050 int prio = rt_se_prio(rt_se); 1125 int prio = rt_se_prio(rt_se);
1051 1126
1052 WARN_ON(!rt_prio(prio)); 1127 WARN_ON(!rt_prio(prio));
1053 rt_rq->rt_nr_running++; 1128 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1054 1129
1055 inc_rt_prio(rt_rq, prio); 1130 inc_rt_prio(rt_rq, prio);
1056 inc_rt_migration(rt_se, rt_rq); 1131 inc_rt_migration(rt_se, rt_rq);
@@ -1062,7 +1137,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1062{ 1137{
1063 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 1138 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1064 WARN_ON(!rt_rq->rt_nr_running); 1139 WARN_ON(!rt_rq->rt_nr_running);
1065 rt_rq->rt_nr_running--; 1140 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1066 1141
1067 dec_rt_prio(rt_rq, rt_se_prio(rt_se)); 1142 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1068 dec_rt_migration(rt_se, rt_rq); 1143 dec_rt_migration(rt_se, rt_rq);
@@ -1119,6 +1194,8 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1119 back = rt_se; 1194 back = rt_se;
1120 } 1195 }
1121 1196
1197 dequeue_top_rt_rq(rt_rq_of_se(back));
1198
1122 for (rt_se = back; rt_se; rt_se = rt_se->back) { 1199 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1123 if (on_rt_rq(rt_se)) 1200 if (on_rt_rq(rt_se))
1124 __dequeue_rt_entity(rt_se); 1201 __dequeue_rt_entity(rt_se);
@@ -1127,13 +1204,18 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1127 1204
1128static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) 1205static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1129{ 1206{
1207 struct rq *rq = rq_of_rt_se(rt_se);
1208
1130 dequeue_rt_stack(rt_se); 1209 dequeue_rt_stack(rt_se);
1131 for_each_sched_rt_entity(rt_se) 1210 for_each_sched_rt_entity(rt_se)
1132 __enqueue_rt_entity(rt_se, head); 1211 __enqueue_rt_entity(rt_se, head);
1212 enqueue_top_rt_rq(&rq->rt);
1133} 1213}
1134 1214
1135static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 1215static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1136{ 1216{
1217 struct rq *rq = rq_of_rt_se(rt_se);
1218
1137 dequeue_rt_stack(rt_se); 1219 dequeue_rt_stack(rt_se);
1138 1220
1139 for_each_sched_rt_entity(rt_se) { 1221 for_each_sched_rt_entity(rt_se) {
@@ -1142,6 +1224,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1142 if (rt_rq && rt_rq->rt_nr_running) 1224 if (rt_rq && rt_rq->rt_nr_running)
1143 __enqueue_rt_entity(rt_se, false); 1225 __enqueue_rt_entity(rt_se, false);
1144 } 1226 }
1227 enqueue_top_rt_rq(&rq->rt);
1145} 1228}
1146 1229
1147/* 1230/*
@@ -1159,8 +1242,6 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1159 1242
1160 if (!task_current(rq, p) && p->nr_cpus_allowed > 1) 1243 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1161 enqueue_pushable_task(rq, p); 1244 enqueue_pushable_task(rq, p);
1162
1163 inc_nr_running(rq);
1164} 1245}
1165 1246
1166static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 1247static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1171,8 +1252,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1171 dequeue_rt_entity(rt_se); 1252 dequeue_rt_entity(rt_se);
1172 1253
1173 dequeue_pushable_task(rq, p); 1254 dequeue_pushable_task(rq, p);
1174
1175 dec_nr_running(rq);
1176} 1255}
1177 1256
1178/* 1257/*
@@ -1377,10 +1456,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1377 if (prev->sched_class == &rt_sched_class) 1456 if (prev->sched_class == &rt_sched_class)
1378 update_curr_rt(rq); 1457 update_curr_rt(rq);
1379 1458
1380 if (!rt_rq->rt_nr_running) 1459 if (!rt_rq->rt_queued)
1381 return NULL;
1382
1383 if (rt_rq_throttled(rt_rq))
1384 return NULL; 1460 return NULL;
1385 1461
1386 put_prev_task(rq, prev); 1462 put_prev_task(rq, prev);
@@ -1892,9 +1968,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1892 */ 1968 */
1893 if (p->on_rq && rq->curr != p) { 1969 if (p->on_rq && rq->curr != p) {
1894#ifdef CONFIG_SMP 1970#ifdef CONFIG_SMP
1895 if (rq->rt.overloaded && push_rt_task(rq) && 1971 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
1896 /* Don't resched if we changed runqueues */ 1972 /* Don't resched if we changed runqueues */
1897 rq != task_rq(p)) 1973 push_rt_task(rq) && rq != task_rq(p))
1898 check_resched = 0; 1974 check_resched = 0;
1899#endif /* CONFIG_SMP */ 1975#endif /* CONFIG_SMP */
1900 if (check_resched && p->prio < rq->curr->prio) 1976 if (check_resched && p->prio < rq->curr->prio)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 456e492a3dca..31cc02ebc54e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -278,7 +278,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
278extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); 278extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
279 279
280extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); 280extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
281extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); 281extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
282extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); 282extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
283 283
284extern void free_rt_sched_group(struct task_group *tg); 284extern void free_rt_sched_group(struct task_group *tg);
@@ -409,6 +409,8 @@ struct rt_rq {
409 int overloaded; 409 int overloaded;
410 struct plist_head pushable_tasks; 410 struct plist_head pushable_tasks;
411#endif 411#endif
412 int rt_queued;
413
412 int rt_throttled; 414 int rt_throttled;
413 u64 rt_time; 415 u64 rt_time;
414 u64 rt_runtime; 416 u64 rt_runtime;
@@ -423,18 +425,6 @@ struct rt_rq {
423#endif 425#endif
424}; 426};
425 427
426#ifdef CONFIG_RT_GROUP_SCHED
427static inline int rt_rq_throttled(struct rt_rq *rt_rq)
428{
429 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
430}
431#else
432static inline int rt_rq_throttled(struct rt_rq *rt_rq)
433{
434 return rt_rq->rt_throttled;
435}
436#endif
437
438/* Deadline class' related fields in a runqueue */ 428/* Deadline class' related fields in a runqueue */
439struct dl_rq { 429struct dl_rq {
440 /* runqueue is an rbtree, ordered by deadline */ 430 /* runqueue is an rbtree, ordered by deadline */
@@ -577,7 +567,7 @@ struct rq {
577 struct root_domain *rd; 567 struct root_domain *rd;
578 struct sched_domain *sd; 568 struct sched_domain *sd;
579 569
580 unsigned long cpu_power; 570 unsigned long cpu_capacity;
581 571
582 unsigned char idle_balance; 572 unsigned char idle_balance;
583 /* For active balancing */ 573 /* For active balancing */
@@ -680,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
680 670
681#ifdef CONFIG_SMP 671#ifdef CONFIG_SMP
682 672
673extern void sched_ttwu_pending(void);
674
683#define rcu_dereference_check_sched_domain(p) \ 675#define rcu_dereference_check_sched_domain(p) \
684 rcu_dereference_check((p), \ 676 rcu_dereference_check((p), \
685 lockdep_is_held(&sched_domains_mutex)) 677 lockdep_is_held(&sched_domains_mutex))
@@ -738,15 +730,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
738DECLARE_PER_CPU(struct sched_domain *, sd_busy); 730DECLARE_PER_CPU(struct sched_domain *, sd_busy);
739DECLARE_PER_CPU(struct sched_domain *, sd_asym); 731DECLARE_PER_CPU(struct sched_domain *, sd_asym);
740 732
741struct sched_group_power { 733struct sched_group_capacity {
742 atomic_t ref; 734 atomic_t ref;
743 /* 735 /*
744 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 736 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
745 * single CPU. 737 * for a single CPU.
746 */ 738 */
747 unsigned int power, power_orig; 739 unsigned int capacity, capacity_orig;
748 unsigned long next_update; 740 unsigned long next_update;
749 int imbalance; /* XXX unrelated to power but shared group state */ 741 int imbalance; /* XXX unrelated to capacity but shared group state */
750 /* 742 /*
751 * Number of busy cpus in this group. 743 * Number of busy cpus in this group.
752 */ 744 */
@@ -760,7 +752,7 @@ struct sched_group {
760 atomic_t ref; 752 atomic_t ref;
761 753
762 unsigned int group_weight; 754 unsigned int group_weight;
763 struct sched_group_power *sgp; 755 struct sched_group_capacity *sgc;
764 756
765 /* 757 /*
766 * The CPUs this group covers. 758 * The CPUs this group covers.
@@ -783,7 +775,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
783 */ 775 */
784static inline struct cpumask *sched_group_mask(struct sched_group *sg) 776static inline struct cpumask *sched_group_mask(struct sched_group *sg)
785{ 777{
786 return to_cpumask(sg->sgp->cpumask); 778 return to_cpumask(sg->sgc->cpumask);
787} 779}
788 780
789/** 781/**
@@ -797,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
797 789
798extern int group_balance_cpu(struct sched_group *sg); 790extern int group_balance_cpu(struct sched_group *sg);
799 791
792#else
793
794static inline void sched_ttwu_pending(void) { }
795
800#endif /* CONFIG_SMP */ 796#endif /* CONFIG_SMP */
801 797
802#include "stats.h" 798#include "stats.h"
@@ -1177,7 +1173,7 @@ extern const struct sched_class idle_sched_class;
1177 1173
1178#ifdef CONFIG_SMP 1174#ifdef CONFIG_SMP
1179 1175
1180extern void update_group_power(struct sched_domain *sd, int cpu); 1176extern void update_group_capacity(struct sched_domain *sd, int cpu);
1181 1177
1182extern void trigger_load_balance(struct rq *rq); 1178extern void trigger_load_balance(struct rq *rq);
1183 1179
@@ -1216,12 +1212,14 @@ extern void update_idle_cpu_load(struct rq *this_rq);
1216 1212
1217extern void init_task_runnable_average(struct task_struct *p); 1213extern void init_task_runnable_average(struct task_struct *p);
1218 1214
1219static inline void inc_nr_running(struct rq *rq) 1215static inline void add_nr_running(struct rq *rq, unsigned count)
1220{ 1216{
1221 rq->nr_running++; 1217 unsigned prev_nr = rq->nr_running;
1218
1219 rq->nr_running = prev_nr + count;
1222 1220
1223#ifdef CONFIG_NO_HZ_FULL 1221#ifdef CONFIG_NO_HZ_FULL
1224 if (rq->nr_running == 2) { 1222 if (prev_nr < 2 && rq->nr_running >= 2) {
1225 if (tick_nohz_full_cpu(rq->cpu)) { 1223 if (tick_nohz_full_cpu(rq->cpu)) {
1226 /* Order rq->nr_running write against the IPI */ 1224 /* Order rq->nr_running write against the IPI */
1227 smp_wmb(); 1225 smp_wmb();
@@ -1231,9 +1229,9 @@ static inline void inc_nr_running(struct rq *rq)
1231#endif 1229#endif
1232} 1230}
1233 1231
1234static inline void dec_nr_running(struct rq *rq) 1232static inline void sub_nr_running(struct rq *rq, unsigned count)
1235{ 1233{
1236 rq->nr_running--; 1234 rq->nr_running -= count;
1237} 1235}
1238 1236
1239static inline void rq_last_tick_reset(struct rq *rq) 1237static inline void rq_last_tick_reset(struct rq *rq)
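Because callers such as enqueue_top_rt_rq() can now add several tasks in one call, the NO_HZ_FULL kick must fire on crossing the 1-to-2 boundary rather than on nr_running landing exactly on 2, which is what the prev_nr test buys. A tiny illustration:

    static int nohz_kick_fires(void)
    {
        unsigned prev_nr = 1, count = 3;                 /* bulk add of 3 RT tasks  */
        unsigned nr_running = prev_nr + count;           /* 4                       */

        int old_test = (nr_running == 2);                /* 0: kick would be missed */
        int new_test = (prev_nr < 2 && nr_running >= 2); /* 1: kick fires           */

        return new_test && !old_test;
    }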
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index d6ce65dde541..bfe0edadbfbb 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -41,13 +41,13 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
41static void 41static void
42enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 42enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
43{ 43{
44 inc_nr_running(rq); 44 add_nr_running(rq, 1);
45} 45}
46 46
47static void 47static void
48dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 48dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
49{ 49{
50 dec_nr_running(rq); 50 sub_nr_running(rq, 1);
51} 51}
52 52
53static void yield_task_stop(struct rq *rq) 53static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 7d50f794e248..0ffa20ae657b 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -394,7 +394,7 @@ EXPORT_SYMBOL(__wake_up_bit);
394 * 394 *
395 * In order for this to function properly, as it uses waitqueue_active() 395 * In order for this to function properly, as it uses waitqueue_active()
396 * internally, some kind of memory barrier must be done prior to calling 396 * internally, some kind of memory barrier must be done prior to calling
397 * this. Typically, this will be smp_mb__after_clear_bit(), but in some 397 * this. Typically, this will be smp_mb__after_atomic(), but in some
398 * cases where bitflags are manipulated non-atomically under a lock, one 398 * cases where bitflags are manipulated non-atomically under a lock, one
399 * may need to use a less regular barrier, such fs/inode.c's smp_mb(), 399 * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
400 * because spin_unlock() does not guarantee a memory barrier. 400 * because spin_unlock() does not guarantee a memory barrier.
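The signal.c hunk further down applies exactly this advice by adding an smp_mb() to task_clear_jobctl_trapping() before its wake_up_bit() call. The canonical waker-side sequence looks like the sketch below, where JOB_BIT and word are placeholders:

    clear_bit(JOB_BIT, &word);    /* the state change the waiter is watching    */
    smp_mb__after_atomic();       /* order it before waitqueue_active() inside  */
    wake_up_bit(&word, JOB_BIT);  /* __wake_up_bit(), so the wakeup isn't lost  */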
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b35c21503a36..301bbc24739c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -39,7 +39,7 @@
39 * is only needed for handling filters shared across tasks. 39 * is only needed for handling filters shared across tasks.
40 * @prev: points to a previously installed, or inherited, filter 40 * @prev: points to a previously installed, or inherited, filter
41 * @len: the number of instructions in the program 41 * @len: the number of instructions in the program
42 * @insns: the BPF program instructions to evaluate 42 * @insnsi: the BPF program instructions to evaluate
43 * 43 *
44 * seccomp_filter objects are organized in a tree linked via the @prev 44 * seccomp_filter objects are organized in a tree linked via the @prev
45 * pointer. For any task, it appears to be a singly-linked list starting 45 * pointer. For any task, it appears to be a singly-linked list starting
@@ -54,8 +54,7 @@
54struct seccomp_filter { 54struct seccomp_filter {
55 atomic_t usage; 55 atomic_t usage;
56 struct seccomp_filter *prev; 56 struct seccomp_filter *prev;
57 unsigned short len; /* Instruction count */ 57 struct sk_filter *prog;
58 struct sock_filter_int insnsi[];
59}; 58};
60 59
61/* Limit any path through the tree to 256KB worth of instructions. */ 60/* Limit any path through the tree to 256KB worth of instructions. */
@@ -104,60 +103,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
104 u32 k = ftest->k; 103 u32 k = ftest->k;
105 104
106 switch (code) { 105 switch (code) {
107 case BPF_S_LD_W_ABS: 106 case BPF_LD | BPF_W | BPF_ABS:
108 ftest->code = BPF_LDX | BPF_W | BPF_ABS; 107 ftest->code = BPF_LDX | BPF_W | BPF_ABS;
109 /* 32-bit aligned and not out of bounds. */ 108 /* 32-bit aligned and not out of bounds. */
110 if (k >= sizeof(struct seccomp_data) || k & 3) 109 if (k >= sizeof(struct seccomp_data) || k & 3)
111 return -EINVAL; 110 return -EINVAL;
112 continue; 111 continue;
113 case BPF_S_LD_W_LEN: 112 case BPF_LD | BPF_W | BPF_LEN:
114 ftest->code = BPF_LD | BPF_IMM; 113 ftest->code = BPF_LD | BPF_IMM;
115 ftest->k = sizeof(struct seccomp_data); 114 ftest->k = sizeof(struct seccomp_data);
116 continue; 115 continue;
117 case BPF_S_LDX_W_LEN: 116 case BPF_LDX | BPF_W | BPF_LEN:
118 ftest->code = BPF_LDX | BPF_IMM; 117 ftest->code = BPF_LDX | BPF_IMM;
119 ftest->k = sizeof(struct seccomp_data); 118 ftest->k = sizeof(struct seccomp_data);
120 continue; 119 continue;
121 /* Explicitly include allowed calls. */ 120 /* Explicitly include allowed calls. */
122 case BPF_S_RET_K: 121 case BPF_RET | BPF_K:
123 case BPF_S_RET_A: 122 case BPF_RET | BPF_A:
124 case BPF_S_ALU_ADD_K: 123 case BPF_ALU | BPF_ADD | BPF_K:
125 case BPF_S_ALU_ADD_X: 124 case BPF_ALU | BPF_ADD | BPF_X:
126 case BPF_S_ALU_SUB_K: 125 case BPF_ALU | BPF_SUB | BPF_K:
127 case BPF_S_ALU_SUB_X: 126 case BPF_ALU | BPF_SUB | BPF_X:
128 case BPF_S_ALU_MUL_K: 127 case BPF_ALU | BPF_MUL | BPF_K:
129 case BPF_S_ALU_MUL_X: 128 case BPF_ALU | BPF_MUL | BPF_X:
130 case BPF_S_ALU_DIV_X: 129 case BPF_ALU | BPF_DIV | BPF_K:
131 case BPF_S_ALU_AND_K: 130 case BPF_ALU | BPF_DIV | BPF_X:
132 case BPF_S_ALU_AND_X: 131 case BPF_ALU | BPF_AND | BPF_K:
133 case BPF_S_ALU_OR_K: 132 case BPF_ALU | BPF_AND | BPF_X:
134 case BPF_S_ALU_OR_X: 133 case BPF_ALU | BPF_OR | BPF_K:
135 case BPF_S_ALU_XOR_K: 134 case BPF_ALU | BPF_OR | BPF_X:
136 case BPF_S_ALU_XOR_X: 135 case BPF_ALU | BPF_XOR | BPF_K:
137 case BPF_S_ALU_LSH_K: 136 case BPF_ALU | BPF_XOR | BPF_X:
138 case BPF_S_ALU_LSH_X: 137 case BPF_ALU | BPF_LSH | BPF_K:
139 case BPF_S_ALU_RSH_K: 138 case BPF_ALU | BPF_LSH | BPF_X:
140 case BPF_S_ALU_RSH_X: 139 case BPF_ALU | BPF_RSH | BPF_K:
141 case BPF_S_ALU_NEG: 140 case BPF_ALU | BPF_RSH | BPF_X:
142 case BPF_S_LD_IMM: 141 case BPF_ALU | BPF_NEG:
143 case BPF_S_LDX_IMM: 142 case BPF_LD | BPF_IMM:
144 case BPF_S_MISC_TAX: 143 case BPF_LDX | BPF_IMM:
145 case BPF_S_MISC_TXA: 144 case BPF_MISC | BPF_TAX:
146 case BPF_S_ALU_DIV_K: 145 case BPF_MISC | BPF_TXA:
147 case BPF_S_LD_MEM: 146 case BPF_LD | BPF_MEM:
148 case BPF_S_LDX_MEM: 147 case BPF_LDX | BPF_MEM:
149 case BPF_S_ST: 148 case BPF_ST:
150 case BPF_S_STX: 149 case BPF_STX:
151 case BPF_S_JMP_JA: 150 case BPF_JMP | BPF_JA:
152 case BPF_S_JMP_JEQ_K: 151 case BPF_JMP | BPF_JEQ | BPF_K:
153 case BPF_S_JMP_JEQ_X: 152 case BPF_JMP | BPF_JEQ | BPF_X:
154 case BPF_S_JMP_JGE_K: 153 case BPF_JMP | BPF_JGE | BPF_K:
155 case BPF_S_JMP_JGE_X: 154 case BPF_JMP | BPF_JGE | BPF_X:
156 case BPF_S_JMP_JGT_K: 155 case BPF_JMP | BPF_JGT | BPF_K:
157 case BPF_S_JMP_JGT_X: 156 case BPF_JMP | BPF_JGT | BPF_X:
158 case BPF_S_JMP_JSET_K: 157 case BPF_JMP | BPF_JSET | BPF_K:
159 case BPF_S_JMP_JSET_X: 158 case BPF_JMP | BPF_JSET | BPF_X:
160 sk_decode_filter(ftest, ftest);
161 continue; 159 continue;
162 default: 160 default:
163 return -EINVAL; 161 return -EINVAL;
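The rewritten switch drops the kernel-internal BPF_S_* enum in favour of the raw classic-BPF encoding, where an opcode is the bitwise OR of an instruction class, a size or operation, and a mode or source field. One of the allowed opcodes decomposed, using the uapi macros from <linux/filter.h>:

    unsigned short code = BPF_LD | BPF_W | BPF_ABS;    /* == 0x20 */

    /* BPF_CLASS(code) == BPF_LD, BPF_SIZE(code) == BPF_W, BPF_MODE(code) == BPF_ABS.
     * seccomp_check_filter() rewrites only this case to BPF_LDX | BPF_W | BPF_ABS,
     * which the converter is expected to turn into a read of struct seccomp_data;
     * the bounds check against sizeof(struct seccomp_data) above exists for that. */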
@@ -189,7 +187,8 @@ static u32 seccomp_run_filters(int syscall)
189 * value always takes priority (ignoring the DATA). 187 * value always takes priority (ignoring the DATA).
190 */ 188 */
191 for (f = current->seccomp.filter; f; f = f->prev) { 189 for (f = current->seccomp.filter; f; f = f->prev) {
192 u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi); 190 u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd);
191
193 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 192 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
194 ret = cur_ret; 193 ret = cur_ret;
195 } 194 }
@@ -215,12 +214,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
215 return -EINVAL; 214 return -EINVAL;
216 215
217 for (filter = current->seccomp.filter; filter; filter = filter->prev) 216 for (filter = current->seccomp.filter; filter; filter = filter->prev)
218 total_insns += filter->len + 4; /* include a 4 instr penalty */ 217 total_insns += filter->prog->len + 4; /* include a 4 instr penalty */
219 if (total_insns > MAX_INSNS_PER_PATH) 218 if (total_insns > MAX_INSNS_PER_PATH)
220 return -ENOMEM; 219 return -ENOMEM;
221 220
222 /* 221 /*
223 * Installing a seccomp filter requires that the task have 222 * Installing a seccomp filter requires that the task has
224 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. 223 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
225 * This avoids scenarios where unprivileged tasks can affect the 224 * This avoids scenarios where unprivileged tasks can affect the
226 * behavior of privileged children. 225 * behavior of privileged children.
@@ -256,19 +255,25 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
256 255
257 /* Allocate a new seccomp_filter */ 256 /* Allocate a new seccomp_filter */
258 ret = -ENOMEM; 257 ret = -ENOMEM;
259 filter = kzalloc(sizeof(struct seccomp_filter) + 258 filter = kzalloc(sizeof(struct seccomp_filter),
260 sizeof(struct sock_filter_int) * new_len,
261 GFP_KERNEL|__GFP_NOWARN); 259 GFP_KERNEL|__GFP_NOWARN);
262 if (!filter) 260 if (!filter)
263 goto free_prog; 261 goto free_prog;
264 262
265 ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); 263 filter->prog = kzalloc(sk_filter_size(new_len),
266 if (ret) 264 GFP_KERNEL|__GFP_NOWARN);
265 if (!filter->prog)
267 goto free_filter; 266 goto free_filter;
267
268 ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
269 if (ret)
270 goto free_filter_prog;
268 kfree(fp); 271 kfree(fp);
269 272
270 atomic_set(&filter->usage, 1); 273 atomic_set(&filter->usage, 1);
271 filter->len = new_len; 274 filter->prog->len = new_len;
275
276 sk_filter_select_runtime(filter->prog);
272 277
273 /* 278 /*
274 * If there is an existing filter, make it the prev and don't drop its 279 * If there is an existing filter, make it the prev and don't drop its
@@ -278,6 +283,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
278 current->seccomp.filter = filter; 283 current->seccomp.filter = filter;
279 return 0; 284 return 0;
280 285
286free_filter_prog:
287 kfree(filter->prog);
281free_filter: 288free_filter:
282 kfree(filter); 289 kfree(filter);
283free_prog: 290free_prog:
@@ -330,6 +337,7 @@ void put_seccomp_filter(struct task_struct *tsk)
330 while (orig && atomic_dec_and_test(&orig->usage)) { 337 while (orig && atomic_dec_and_test(&orig->usage)) {
331 struct seccomp_filter *freeme = orig; 338 struct seccomp_filter *freeme = orig;
332 orig = orig->prev; 339 orig = orig->prev;
340 sk_filter_free(freeme->prog);
333 kfree(freeme); 341 kfree(freeme);
334 } 342 }
335} 343}
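
The seccomp.c hunks above switch attached filters to the internal BPF representation (sk_convert_filter() plus SK_RUN_FILTER()) while leaving the classic-BPF user ABI untouched. For orientation, a minimal userspace sketch of that ABI, assuming nothing beyond prctl(2) and the uapi headers; it installs a trivial allow-everything filter and is not part of this patch:

/*
 * Illustrative sketch only: attach an "allow all" seccomp filter.
 * The program is classic BPF; after this patch the kernel converts
 * it internally with sk_convert_filter() as shown above.
 */
#include <stdio.h>
#include <stddef.h>
#include <sys/prctl.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter insns[] = {
		/* Return SECCOMP_RET_ALLOW for every syscall. */
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(insns) / sizeof(insns[0]),
		.filter = insns,
	};

	/* Required so an unprivileged task may attach a filter. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		perror("PR_SET_NO_NEW_PRIVS");
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		perror("PR_SET_SECCOMP");
	return 0;
}
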
diff --git a/kernel/signal.c b/kernel/signal.c
index 6ea13c09ae56..a4077e90f19f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -277,6 +277,7 @@ void task_clear_jobctl_trapping(struct task_struct *task)
277{ 277{
278 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { 278 if (unlikely(task->jobctl & JOBCTL_TRAPPING)) {
279 task->jobctl &= ~JOBCTL_TRAPPING; 279 task->jobctl &= ~JOBCTL_TRAPPING;
280 smp_mb(); /* advised by wake_up_bit() */
280 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); 281 wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
281 } 282 }
282} 283}
@@ -705,11 +706,8 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state)
705 * Returns 1 if any signals were found. 706 * Returns 1 if any signals were found.
706 * 707 *
707 * All callers must be holding the siglock. 708 * All callers must be holding the siglock.
708 *
709 * This version takes a sigset mask and looks at all signals,
710 * not just those in the first mask word.
711 */ 709 */
712static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) 710static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s)
713{ 711{
714 struct sigqueue *q, *n; 712 struct sigqueue *q, *n;
715 sigset_t m; 713 sigset_t m;
@@ -727,29 +725,6 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
727 } 725 }
728 return 1; 726 return 1;
729} 727}
730/*
731 * Remove signals in mask from the pending set and queue.
732 * Returns 1 if any signals were found.
733 *
734 * All callers must be holding the siglock.
735 */
736static int rm_from_queue(unsigned long mask, struct sigpending *s)
737{
738 struct sigqueue *q, *n;
739
740 if (!sigtestsetmask(&s->signal, mask))
741 return 0;
742
743 sigdelsetmask(&s->signal, mask);
744 list_for_each_entry_safe(q, n, &s->list, list) {
745 if (q->info.si_signo < SIGRTMIN &&
746 (mask & sigmask(q->info.si_signo))) {
747 list_del_init(&q->list);
748 __sigqueue_free(q);
749 }
750 }
751 return 1;
752}
753 728
754static inline int is_si_special(const struct siginfo *info) 729static inline int is_si_special(const struct siginfo *info)
755{ 730{
@@ -861,6 +836,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
861{ 836{
862 struct signal_struct *signal = p->signal; 837 struct signal_struct *signal = p->signal;
863 struct task_struct *t; 838 struct task_struct *t;
839 sigset_t flush;
864 840
865 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { 841 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
866 if (signal->flags & SIGNAL_GROUP_COREDUMP) 842 if (signal->flags & SIGNAL_GROUP_COREDUMP)
@@ -872,26 +848,25 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
872 /* 848 /*
873 * This is a stop signal. Remove SIGCONT from all queues. 849 * This is a stop signal. Remove SIGCONT from all queues.
874 */ 850 */
875 rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); 851 siginitset(&flush, sigmask(SIGCONT));
876 t = p; 852 flush_sigqueue_mask(&flush, &signal->shared_pending);
877 do { 853 for_each_thread(p, t)
878 rm_from_queue(sigmask(SIGCONT), &t->pending); 854 flush_sigqueue_mask(&flush, &t->pending);
879 } while_each_thread(p, t);
880 } else if (sig == SIGCONT) { 855 } else if (sig == SIGCONT) {
881 unsigned int why; 856 unsigned int why;
882 /* 857 /*
883 * Remove all stop signals from all queues, wake all threads. 858 * Remove all stop signals from all queues, wake all threads.
884 */ 859 */
885 rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); 860 siginitset(&flush, SIG_KERNEL_STOP_MASK);
886 t = p; 861 flush_sigqueue_mask(&flush, &signal->shared_pending);
887 do { 862 for_each_thread(p, t) {
863 flush_sigqueue_mask(&flush, &t->pending);
888 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); 864 task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
889 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
890 if (likely(!(t->ptrace & PT_SEIZED))) 865 if (likely(!(t->ptrace & PT_SEIZED)))
891 wake_up_state(t, __TASK_STOPPED); 866 wake_up_state(t, __TASK_STOPPED);
892 else 867 else
893 ptrace_trap_notify(t); 868 ptrace_trap_notify(t);
894 } while_each_thread(p, t); 869 }
895 870
896 /* 871 /*
897 * Notify the parent with CLD_CONTINUED if we were stopped. 872 * Notify the parent with CLD_CONTINUED if we were stopped.
@@ -2854,7 +2829,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
2854 2829
2855 spin_lock_irq(&tsk->sighand->siglock); 2830 spin_lock_irq(&tsk->sighand->siglock);
2856 __set_task_blocked(tsk, &tsk->real_blocked); 2831 __set_task_blocked(tsk, &tsk->real_blocked);
2857 siginitset(&tsk->real_blocked, 0); 2832 sigemptyset(&tsk->real_blocked);
2858 sig = dequeue_signal(tsk, &mask, info); 2833 sig = dequeue_signal(tsk, &mask, info);
2859 } 2834 }
2860 spin_unlock_irq(&tsk->sighand->siglock); 2835 spin_unlock_irq(&tsk->sighand->siglock);
@@ -3091,18 +3066,39 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
3091} 3066}
3092#endif 3067#endif
3093 3068
3069/*
3070 * For kthreads only, must not be used if cloned with CLONE_SIGHAND
3071 */
3072void kernel_sigaction(int sig, __sighandler_t action)
3073{
3074 spin_lock_irq(&current->sighand->siglock);
3075 current->sighand->action[sig - 1].sa.sa_handler = action;
3076 if (action == SIG_IGN) {
3077 sigset_t mask;
3078
3079 sigemptyset(&mask);
3080 sigaddset(&mask, sig);
3081
3082 flush_sigqueue_mask(&mask, &current->signal->shared_pending);
3083 flush_sigqueue_mask(&mask, &current->pending);
3084 recalc_sigpending();
3085 }
3086 spin_unlock_irq(&current->sighand->siglock);
3087}
3088EXPORT_SYMBOL(kernel_sigaction);
3089
3094int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 3090int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3095{ 3091{
3096 struct task_struct *t = current; 3092 struct task_struct *p = current, *t;
3097 struct k_sigaction *k; 3093 struct k_sigaction *k;
3098 sigset_t mask; 3094 sigset_t mask;
3099 3095
3100 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) 3096 if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
3101 return -EINVAL; 3097 return -EINVAL;
3102 3098
3103 k = &t->sighand->action[sig-1]; 3099 k = &p->sighand->action[sig-1];
3104 3100
3105 spin_lock_irq(&current->sighand->siglock); 3101 spin_lock_irq(&p->sighand->siglock);
3106 if (oact) 3102 if (oact)
3107 *oact = *k; 3103 *oact = *k;
3108 3104
@@ -3121,21 +3117,20 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3121 * (for example, SIGCHLD), shall cause the pending signal to 3117 * (for example, SIGCHLD), shall cause the pending signal to
3122 * be discarded, whether or not it is blocked" 3118 * be discarded, whether or not it is blocked"
3123 */ 3119 */
3124 if (sig_handler_ignored(sig_handler(t, sig), sig)) { 3120 if (sig_handler_ignored(sig_handler(p, sig), sig)) {
3125 sigemptyset(&mask); 3121 sigemptyset(&mask);
3126 sigaddset(&mask, sig); 3122 sigaddset(&mask, sig);
3127 rm_from_queue_full(&mask, &t->signal->shared_pending); 3123 flush_sigqueue_mask(&mask, &p->signal->shared_pending);
3128 do { 3124 for_each_thread(p, t)
3129 rm_from_queue_full(&mask, &t->pending); 3125 flush_sigqueue_mask(&mask, &t->pending);
3130 } while_each_thread(current, t);
3131 } 3126 }
3132 } 3127 }
3133 3128
3134 spin_unlock_irq(&current->sighand->siglock); 3129 spin_unlock_irq(&p->sighand->siglock);
3135 return 0; 3130 return 0;
3136} 3131}
3137 3132
3138static int 3133static int
3139do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) 3134do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
3140{ 3135{
3141 stack_t oss; 3136 stack_t oss;
@@ -3496,7 +3491,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
3496} 3491}
3497#endif 3492#endif
3498 3493
3499#ifdef __ARCH_WANT_SYS_SGETMASK 3494#ifdef CONFIG_SGETMASK_SYSCALL
3500 3495
3501/* 3496/*
3502 * For backwards compatibility. Functionality superseded by sigprocmask. 3497 * For backwards compatibility. Functionality superseded by sigprocmask.
@@ -3517,7 +3512,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
3517 3512
3518 return old; 3513 return old;
3519} 3514}
3520#endif /* __ARCH_WANT_SGETMASK */ 3515#endif /* CONFIG_SGETMASK_SYSCALL */
3521 3516
3522#ifdef __ARCH_WANT_SYS_SIGNAL 3517#ifdef __ARCH_WANT_SYS_SIGNAL
3523/* 3518/*
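
The new kernel_sigaction() helper above is meant for kernel threads that own their signal state. A hedged sketch of how a module's kthread might use it to ignore SIGHUP; demo_thread_fn is a hypothetical name, and the declaration of kernel_sigaction() is assumed to come from <linux/signal.h> in the same series:

/*
 * Sketch, not part of this patch: a kthread ignoring SIGHUP via the
 * helper added above.  Per its comment this is only valid for kthreads,
 * which do not share a sighand with userspace (no CLONE_SIGHAND).
 */
#include <linux/kthread.h>
#include <linux/signal.h>
#include <linux/delay.h>

static int demo_thread_fn(void *unused)
{
	/* Flushes any queued SIGHUP and ignores future ones. */
	kernel_sigaction(SIGHUP, SIG_IGN);

	while (!kthread_should_stop())
		msleep_interruptible(1000);
	return 0;
}
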
diff --git a/kernel/smp.c b/kernel/smp.c
index 06d574e42c72..80c33f8de14f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
29 29
30static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); 30static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
31 31
32static void flush_smp_call_function_queue(bool warn_cpu_offline);
33
32static int 34static int
33hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 35hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
34{ 36{
@@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
51#ifdef CONFIG_HOTPLUG_CPU 53#ifdef CONFIG_HOTPLUG_CPU
52 case CPU_UP_CANCELED: 54 case CPU_UP_CANCELED:
53 case CPU_UP_CANCELED_FROZEN: 55 case CPU_UP_CANCELED_FROZEN:
56 /* Fall-through to the CPU_DEAD[_FROZEN] case. */
54 57
55 case CPU_DEAD: 58 case CPU_DEAD:
56 case CPU_DEAD_FROZEN: 59 case CPU_DEAD_FROZEN:
57 free_cpumask_var(cfd->cpumask); 60 free_cpumask_var(cfd->cpumask);
58 free_percpu(cfd->csd); 61 free_percpu(cfd->csd);
59 break; 62 break;
63
64 case CPU_DYING:
65 case CPU_DYING_FROZEN:
66 /*
67 * The IPIs for the smp-call-function callbacks queued by other
68 * CPUs might arrive late, either due to hardware latencies or
69 * because this CPU disabled interrupts (inside stop-machine)
70 * before the IPIs were sent. So flush out any pending callbacks
71 * explicitly (without waiting for the IPIs to arrive), to
72 * ensure that the outgoing CPU doesn't go offline with work
73 * still pending.
74 */
75 flush_smp_call_function_queue(false);
76 break;
60#endif 77#endif
61 }; 78 };
62 79
@@ -177,23 +194,59 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
177 return 0; 194 return 0;
178} 195}
179 196
180/* 197/**
181 * Invoked by arch to handle an IPI for call function single. Must be 198 * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
182 * called from the arch with interrupts disabled. 199 *
200 * Invoked by arch to handle an IPI for call function single.
201 * Must be called with interrupts disabled.
183 */ 202 */
184void generic_smp_call_function_single_interrupt(void) 203void generic_smp_call_function_single_interrupt(void)
185{ 204{
205 flush_smp_call_function_queue(true);
206}
207
208/**
209 * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
210 *
211 * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
212 * offline CPU. Skip this check if set to 'false'.
213 *
214 * Flush any pending smp-call-function callbacks queued on this CPU. This is
215 * invoked by the generic IPI handler, as well as by a CPU about to go offline,
216 * to ensure that all pending IPI callbacks are run before it goes completely
217 * offline.
218 *
219 * Loop through the call_single_queue and run all the queued callbacks.
220 * Must be called with interrupts disabled.
221 */
222static void flush_smp_call_function_queue(bool warn_cpu_offline)
223{
224 struct llist_head *head;
186 struct llist_node *entry; 225 struct llist_node *entry;
187 struct call_single_data *csd, *csd_next; 226 struct call_single_data *csd, *csd_next;
227 static bool warned;
188 228
189 /* 229 WARN_ON(!irqs_disabled());
190 * Shouldn't receive this interrupt on a cpu that is not yet online.
191 */
192 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
193 230
194 entry = llist_del_all(&__get_cpu_var(call_single_queue)); 231 head = &__get_cpu_var(call_single_queue);
232 entry = llist_del_all(head);
195 entry = llist_reverse_order(entry); 233 entry = llist_reverse_order(entry);
196 234
235 /* There shouldn't be any pending callbacks on an offline CPU. */
236 if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
237 !warned && !llist_empty(head))) {
238 warned = true;
239 WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
240
241 /*
242 * We don't have to use the _safe() variant here
243 * because we are not invoking the IPI handlers yet.
244 */
245 llist_for_each_entry(csd, entry, llist)
246 pr_warn("IPI callback %pS sent to offline CPU\n",
247 csd->func);
248 }
249
197 llist_for_each_entry_safe(csd, csd_next, entry, llist) { 250 llist_for_each_entry_safe(csd, csd_next, entry, llist) {
198 csd->func(csd->info); 251 csd->func(csd->info);
199 csd_unlock(csd); 252 csd_unlock(csd);
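
The smp.c hunks split the IPI path so flush_smp_call_function_queue() can also be run by a CPU on its way offline (CPU_DYING), draining callbacks whose IPIs may never arrive. Those callbacks are queued by the ordinary cross-call API; a short sketch of that producer side, using hypothetical demo_* names:

/*
 * Sketch, not part of this patch: queue work on another CPU.  The
 * callback lands on that CPU's call_single_queue and is executed by
 * flush_smp_call_function_queue(), either from the IPI handler or,
 * after this change, while the target CPU is dying.
 */
#include <linux/smp.h>
#include <linux/printk.h>

static void demo_remote_fn(void *info)
{
	/* Runs on the target CPU with interrupts disabled. */
	pr_info("demo callback on CPU %d\n", smp_processor_id());
}

static void demo_kick_cpu(int cpu)
{
	/* wait=1: block until the callback has finished on @cpu. */
	smp_call_function_single(cpu, demo_remote_fn, NULL, 1);
}
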
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 92f24f5e8d52..5918d227730f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -232,7 +232,6 @@ asmlinkage __visible void __do_softirq(void)
232 bool in_hardirq; 232 bool in_hardirq;
233 __u32 pending; 233 __u32 pending;
234 int softirq_bit; 234 int softirq_bit;
235 int cpu;
236 235
237 /* 236 /*
238 * Mask out PF_MEMALLOC s current task context is borrowed for the 237 * Mask out PF_MEMALLOC s current task context is borrowed for the
@@ -247,7 +246,6 @@ asmlinkage __visible void __do_softirq(void)
247 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); 246 __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
248 in_hardirq = lockdep_softirq_start(); 247 in_hardirq = lockdep_softirq_start();
249 248
250 cpu = smp_processor_id();
251restart: 249restart:
252 /* Reset the pending bitmask before enabling irqs */ 250 /* Reset the pending bitmask before enabling irqs */
253 set_softirq_pending(0); 251 set_softirq_pending(0);
@@ -276,11 +274,11 @@ restart:
276 prev_count, preempt_count()); 274 prev_count, preempt_count());
277 preempt_count_set(prev_count); 275 preempt_count_set(prev_count);
278 } 276 }
279 rcu_bh_qs(cpu);
280 h++; 277 h++;
281 pending >>= softirq_bit; 278 pending >>= softirq_bit;
282 } 279 }
283 280
281 rcu_bh_qs(smp_processor_id());
284 local_irq_disable(); 282 local_irq_disable();
285 283
286 pending = local_softirq_pending(); 284 pending = local_softirq_pending();
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 01fbae5b97b7..695f0c6cd169 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
307 * @cpu: cpu to stop 307 * @cpu: cpu to stop
308 * @fn: function to execute 308 * @fn: function to execute
309 * @arg: argument to @fn 309 * @arg: argument to @fn
310 * @work_buf: pointer to cpu_stop_work structure
310 * 311 *
311 * Similar to stop_one_cpu() but doesn't wait for completion. The 312 * Similar to stop_one_cpu() but doesn't wait for completion. The
312 * caller is responsible for ensuring @work_buf is currently unused 313 * caller is responsible for ensuring @work_buf is currently unused
diff --git a/kernel/sys.c b/kernel/sys.c
index fba0f29401ea..66a751ebf9d9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
250 else 250 else
251 p = current; 251 p = current;
252 if (p) { 252 if (p) {
253 niceval = 20 - task_nice(p); 253 niceval = nice_to_rlimit(task_nice(p));
254 if (niceval > retval) 254 if (niceval > retval)
255 retval = niceval; 255 retval = niceval;
256 } 256 }
@@ -261,7 +261,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
261 else 261 else
262 pgrp = task_pgrp(current); 262 pgrp = task_pgrp(current);
263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
264 niceval = 20 - task_nice(p); 264 niceval = nice_to_rlimit(task_nice(p));
265 if (niceval > retval) 265 if (niceval > retval)
266 retval = niceval; 266 retval = niceval;
267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
@@ -277,7 +277,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
277 277
278 do_each_thread(g, p) { 278 do_each_thread(g, p) {
279 if (uid_eq(task_uid(p), uid)) { 279 if (uid_eq(task_uid(p), uid)) {
280 niceval = 20 - task_nice(p); 280 niceval = nice_to_rlimit(task_nice(p));
281 if (niceval > retval) 281 if (niceval > retval)
282 retval = niceval; 282 retval = niceval;
283 } 283 }
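
The three sys.c hunks above are a pure cleanup: nice_to_rlimit() maps a nice value in [-20, 19] onto the RLIMIT_NICE-style range [1, 40], which is what the open-coded "20 - task_nice(p)" produced. A small userspace check of that equivalence; the local nice_to_rlimit() below mirrors the kernel helper and is an assumption, not a copy of it:

/*
 * Sketch: verify that nice_to_rlimit(nice) == 20 - nice over the whole
 * nice range.  MAX_NICE is 19 in the kernel.
 */
#include <assert.h>

#define MAX_NICE 19

static long nice_to_rlimit(long nice)
{
	return MAX_NICE - nice + 1;	/* assumed shape of the kernel helper */
}

int main(void)
{
	long nice;

	for (nice = -20; nice <= MAX_NICE; nice++)
		assert(nice_to_rlimit(nice) == 20 - nice);
	return 0;
}
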
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index bc8d1b74a6b9..36441b51b5df 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16);
135cond_syscall(sys_setresuid16); 135cond_syscall(sys_setresuid16);
136cond_syscall(sys_setreuid16); 136cond_syscall(sys_setreuid16);
137cond_syscall(sys_setuid16); 137cond_syscall(sys_setuid16);
138cond_syscall(sys_sgetmask);
139cond_syscall(sys_ssetmask);
138cond_syscall(sys_vm86old); 140cond_syscall(sys_vm86old);
139cond_syscall(sys_vm86); 141cond_syscall(sys_vm86);
140cond_syscall(sys_ipc); 142cond_syscall(sys_ipc);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 74f5b580fe34..75b22e22a72c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
136/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 136/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
137static int maxolduid = 65535; 137static int maxolduid = 65535;
138static int minolduid; 138static int minolduid;
139static int min_percpu_pagelist_fract = 8;
140 139
141static int ngroups_max = NGROUPS_MAX; 140static int ngroups_max = NGROUPS_MAX;
142static const int cap_last_cap = CAP_LAST_CAP; 141static const int cap_last_cap = CAP_LAST_CAP;
@@ -152,10 +151,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
152#ifdef CONFIG_SPARC 151#ifdef CONFIG_SPARC
153#endif 152#endif
154 153
155#ifdef CONFIG_SPARC64
156extern int sysctl_tsb_ratio;
157#endif
158
159#ifdef __hppa__ 154#ifdef __hppa__
160extern int pwrsw_enabled; 155extern int pwrsw_enabled;
161#endif 156#endif
@@ -173,6 +168,13 @@ extern int no_unaligned_warning;
173#endif 168#endif
174 169
175#ifdef CONFIG_PROC_SYSCTL 170#ifdef CONFIG_PROC_SYSCTL
171
172#define SYSCTL_WRITES_LEGACY -1
173#define SYSCTL_WRITES_WARN 0
174#define SYSCTL_WRITES_STRICT 1
175
176static int sysctl_writes_strict = SYSCTL_WRITES_WARN;
177
176static int proc_do_cad_pid(struct ctl_table *table, int write, 178static int proc_do_cad_pid(struct ctl_table *table, int write,
177 void __user *buffer, size_t *lenp, loff_t *ppos); 179 void __user *buffer, size_t *lenp, loff_t *ppos);
178static int proc_taint(struct ctl_table *table, int write, 180static int proc_taint(struct ctl_table *table, int write,
@@ -195,7 +197,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
195/* Note: sysrq code uses it's own private copy */ 197/* Note: sysrq code uses it's own private copy */
196static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; 198static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
197 199
198static int sysrq_sysctl_handler(ctl_table *table, int write, 200static int sysrq_sysctl_handler(struct ctl_table *table, int write,
199 void __user *buffer, size_t *lenp, 201 void __user *buffer, size_t *lenp,
200 loff_t *ppos) 202 loff_t *ppos)
201{ 203{
@@ -495,6 +497,15 @@ static struct ctl_table kern_table[] = {
495 .mode = 0644, 497 .mode = 0644,
496 .proc_handler = proc_taint, 498 .proc_handler = proc_taint,
497 }, 499 },
500 {
501 .procname = "sysctl_writes_strict",
502 .data = &sysctl_writes_strict,
503 .maxlen = sizeof(int),
504 .mode = 0644,
505 .proc_handler = proc_dointvec_minmax,
506 .extra1 = &neg_one,
507 .extra2 = &one,
508 },
498#endif 509#endif
499#ifdef CONFIG_LATENCYTOP 510#ifdef CONFIG_LATENCYTOP
500 { 511 {
@@ -643,7 +654,7 @@ static struct ctl_table kern_table[] = {
643 .extra2 = &one, 654 .extra2 = &one,
644 }, 655 },
645#endif 656#endif
646 657#ifdef CONFIG_UEVENT_HELPER
647 { 658 {
648 .procname = "hotplug", 659 .procname = "hotplug",
649 .data = &uevent_helper, 660 .data = &uevent_helper,
@@ -651,7 +662,7 @@ static struct ctl_table kern_table[] = {
651 .mode = 0644, 662 .mode = 0644,
652 .proc_handler = proc_dostring, 663 .proc_handler = proc_dostring,
653 }, 664 },
654 665#endif
655#ifdef CONFIG_CHR_DEV_SG 666#ifdef CONFIG_CHR_DEV_SG
656 { 667 {
657 .procname = "sg-big-buff", 668 .procname = "sg-big-buff",
@@ -849,6 +860,17 @@ static struct ctl_table kern_table[] = {
849 .extra1 = &zero, 860 .extra1 = &zero,
850 .extra2 = &one, 861 .extra2 = &one,
851 }, 862 },
863#ifdef CONFIG_SMP
864 {
865 .procname = "softlockup_all_cpu_backtrace",
866 .data = &sysctl_softlockup_all_cpu_backtrace,
867 .maxlen = sizeof(int),
868 .mode = 0644,
869 .proc_handler = proc_dointvec_minmax,
870 .extra1 = &zero,
871 .extra2 = &one,
872 },
873#endif /* CONFIG_SMP */
852 { 874 {
853 .procname = "nmi_watchdog", 875 .procname = "nmi_watchdog",
854 .data = &watchdog_user_enabled, 876 .data = &watchdog_user_enabled,
@@ -1305,7 +1327,7 @@ static struct ctl_table vm_table[] = {
1305 .maxlen = sizeof(percpu_pagelist_fraction), 1327 .maxlen = sizeof(percpu_pagelist_fraction),
1306 .mode = 0644, 1328 .mode = 0644,
1307 .proc_handler = percpu_pagelist_fraction_sysctl_handler, 1329 .proc_handler = percpu_pagelist_fraction_sysctl_handler,
1308 .extra1 = &min_percpu_pagelist_fract, 1330 .extra1 = &zero,
1309 }, 1331 },
1310#ifdef CONFIG_MMU 1332#ifdef CONFIG_MMU
1311 { 1333 {
@@ -1418,8 +1440,13 @@ static struct ctl_table vm_table[] = {
1418 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 1440 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
1419 { 1441 {
1420 .procname = "vdso_enabled", 1442 .procname = "vdso_enabled",
1443#ifdef CONFIG_X86_32
1444 .data = &vdso32_enabled,
1445 .maxlen = sizeof(vdso32_enabled),
1446#else
1421 .data = &vdso_enabled, 1447 .data = &vdso_enabled,
1422 .maxlen = sizeof(vdso_enabled), 1448 .maxlen = sizeof(vdso_enabled),
1449#endif
1423 .mode = 0644, 1450 .mode = 0644,
1424 .proc_handler = proc_dointvec, 1451 .proc_handler = proc_dointvec,
1425 .extra1 = &zero, 1452 .extra1 = &zero,
@@ -1698,8 +1725,8 @@ int __init sysctl_init(void)
1698 1725
1699#ifdef CONFIG_PROC_SYSCTL 1726#ifdef CONFIG_PROC_SYSCTL
1700 1727
1701static int _proc_do_string(void* data, int maxlen, int write, 1728static int _proc_do_string(char *data, int maxlen, int write,
1702 void __user *buffer, 1729 char __user *buffer,
1703 size_t *lenp, loff_t *ppos) 1730 size_t *lenp, loff_t *ppos)
1704{ 1731{
1705 size_t len; 1732 size_t len;
@@ -1712,21 +1739,30 @@ static int _proc_do_string(void* data, int maxlen, int write,
1712 } 1739 }
1713 1740
1714 if (write) { 1741 if (write) {
1715 len = 0; 1742 if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) {
1743 /* Only continue writes not past the end of buffer. */
1744 len = strlen(data);
1745 if (len > maxlen - 1)
1746 len = maxlen - 1;
1747
1748 if (*ppos > len)
1749 return 0;
1750 len = *ppos;
1751 } else {
1752 /* Start writing from beginning of buffer. */
1753 len = 0;
1754 }
1755
1756 *ppos += *lenp;
1716 p = buffer; 1757 p = buffer;
1717 while (len < *lenp) { 1758 while ((p - buffer) < *lenp && len < maxlen - 1) {
1718 if (get_user(c, p++)) 1759 if (get_user(c, p++))
1719 return -EFAULT; 1760 return -EFAULT;
1720 if (c == 0 || c == '\n') 1761 if (c == 0 || c == '\n')
1721 break; 1762 break;
1722 len++; 1763 data[len++] = c;
1723 } 1764 }
1724 if (len >= maxlen) 1765 data[len] = 0;
1725 len = maxlen-1;
1726 if(copy_from_user(data, buffer, len))
1727 return -EFAULT;
1728 ((char *) data)[len] = 0;
1729 *ppos += *lenp;
1730 } else { 1766 } else {
1731 len = strlen(data); 1767 len = strlen(data);
1732 if (len > maxlen) 1768 if (len > maxlen)
@@ -1743,10 +1779,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
1743 if (len > *lenp) 1779 if (len > *lenp)
1744 len = *lenp; 1780 len = *lenp;
1745 if (len) 1781 if (len)
1746 if(copy_to_user(buffer, data, len)) 1782 if (copy_to_user(buffer, data, len))
1747 return -EFAULT; 1783 return -EFAULT;
1748 if (len < *lenp) { 1784 if (len < *lenp) {
1749 if(put_user('\n', ((char __user *) buffer) + len)) 1785 if (put_user('\n', buffer + len))
1750 return -EFAULT; 1786 return -EFAULT;
1751 len++; 1787 len++;
1752 } 1788 }
@@ -1756,6 +1792,14 @@ static int _proc_do_string(void* data, int maxlen, int write,
1756 return 0; 1792 return 0;
1757} 1793}
1758 1794
1795static void warn_sysctl_write(struct ctl_table *table)
1796{
1797 pr_warn_once("%s wrote to %s when file position was not 0!\n"
1798 "This will not be supported in the future. To silence this\n"
1799 "warning, set kernel.sysctl_writes_strict = -1\n",
1800 current->comm, table->procname);
1801}
1802
1759/** 1803/**
1760 * proc_dostring - read a string sysctl 1804 * proc_dostring - read a string sysctl
1761 * @table: the sysctl table 1805 * @table: the sysctl table
@@ -1776,8 +1820,11 @@ static int _proc_do_string(void* data, int maxlen, int write,
1776int proc_dostring(struct ctl_table *table, int write, 1820int proc_dostring(struct ctl_table *table, int write,
1777 void __user *buffer, size_t *lenp, loff_t *ppos) 1821 void __user *buffer, size_t *lenp, loff_t *ppos)
1778{ 1822{
1779 return _proc_do_string(table->data, table->maxlen, write, 1823 if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN)
1780 buffer, lenp, ppos); 1824 warn_sysctl_write(table);
1825
1826 return _proc_do_string((char *)(table->data), table->maxlen, write,
1827 (char __user *)buffer, lenp, ppos);
1781} 1828}
1782 1829
1783static size_t proc_skip_spaces(char **buf) 1830static size_t proc_skip_spaces(char **buf)
@@ -1951,6 +1998,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
1951 conv = do_proc_dointvec_conv; 1998 conv = do_proc_dointvec_conv;
1952 1999
1953 if (write) { 2000 if (write) {
2001 if (*ppos) {
2002 switch (sysctl_writes_strict) {
2003 case SYSCTL_WRITES_STRICT:
2004 goto out;
2005 case SYSCTL_WRITES_WARN:
2006 warn_sysctl_write(table);
2007 break;
2008 default:
2009 break;
2010 }
2011 }
2012
1954 if (left > PAGE_SIZE - 1) 2013 if (left > PAGE_SIZE - 1)
1955 left = PAGE_SIZE - 1; 2014 left = PAGE_SIZE - 1;
1956 page = __get_free_page(GFP_TEMPORARY); 2015 page = __get_free_page(GFP_TEMPORARY);
@@ -2008,6 +2067,7 @@ free:
2008 return err ? : -EINVAL; 2067 return err ? : -EINVAL;
2009 } 2068 }
2010 *lenp -= left; 2069 *lenp -= left;
2070out:
2011 *ppos += *lenp; 2071 *ppos += *lenp;
2012 return err; 2072 return err;
2013} 2073}
@@ -2200,6 +2260,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2200 left = *lenp; 2260 left = *lenp;
2201 2261
2202 if (write) { 2262 if (write) {
2263 if (*ppos) {
2264 switch (sysctl_writes_strict) {
2265 case SYSCTL_WRITES_STRICT:
2266 goto out;
2267 case SYSCTL_WRITES_WARN:
2268 warn_sysctl_write(table);
2269 break;
2270 default:
2271 break;
2272 }
2273 }
2274
2203 if (left > PAGE_SIZE - 1) 2275 if (left > PAGE_SIZE - 1)
2204 left = PAGE_SIZE - 1; 2276 left = PAGE_SIZE - 1;
2205 page = __get_free_page(GFP_TEMPORARY); 2277 page = __get_free_page(GFP_TEMPORARY);
@@ -2255,6 +2327,7 @@ free:
2255 return err ? : -EINVAL; 2327 return err ? : -EINVAL;
2256 } 2328 }
2257 *lenp -= left; 2329 *lenp -= left;
2330out:
2258 *ppos += *lenp; 2331 *ppos += *lenp;
2259 return err; 2332 return err;
2260} 2333}
@@ -2501,11 +2574,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2501 bool first = 1; 2574 bool first = 1;
2502 size_t left = *lenp; 2575 size_t left = *lenp;
2503 unsigned long bitmap_len = table->maxlen; 2576 unsigned long bitmap_len = table->maxlen;
2504 unsigned long *bitmap = (unsigned long *) table->data; 2577 unsigned long *bitmap = *(unsigned long **) table->data;
2505 unsigned long *tmp_bitmap = NULL; 2578 unsigned long *tmp_bitmap = NULL;
2506 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; 2579 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
2507 2580
2508 if (!bitmap_len || !left || (*ppos && !write)) { 2581 if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
2509 *lenp = 0; 2582 *lenp = 0;
2510 return 0; 2583 return 0;
2511 } 2584 }
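
The sysctl.c changes above add kernel.sysctl_writes_strict (-1 legacy, 0 warn, 1 strict) and teach _proc_do_string() and the intvec/ulongvec handlers to respect the file position instead of silently restarting every write at offset 0. A userspace sketch of the case whose behavior changes; /proc/sys/kernel/hostname merely stands in for any string sysctl, and root is required to write it:

/*
 * Sketch, not part of this patch: two writes on one fd without
 * rewinding.  In legacy mode the second write restarts at offset 0;
 * in warn mode it also triggers the new "file position was not 0"
 * warning; in strict mode the string is continued at the offset.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/kernel/hostname", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "foo", 3) != 3)		/* file position is now 3 */
		perror("first write");
	if (write(fd, "bar", 3) != 3)		/* offset 3: mode-dependent result */
		perror("second write");
	close(fd);
	return 0;
}
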
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 88c9c65a430d..fe75444ae7ec 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
585 struct itimerspec *new_setting, 585 struct itimerspec *new_setting,
586 struct itimerspec *old_setting) 586 struct itimerspec *old_setting)
587{ 587{
588 ktime_t exp;
589
588 if (!rtcdev) 590 if (!rtcdev)
589 return -ENOTSUPP; 591 return -ENOTSUPP;
590 592
593 if (flags & ~TIMER_ABSTIME)
594 return -EINVAL;
595
591 if (old_setting) 596 if (old_setting)
592 alarm_timer_get(timr, old_setting); 597 alarm_timer_get(timr, old_setting);
593 598
@@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
597 602
598 /* start the timer */ 603 /* start the timer */
599 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); 604 timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
600 alarm_start(&timr->it.alarm.alarmtimer, 605 exp = timespec_to_ktime(new_setting->it_value);
601 timespec_to_ktime(new_setting->it_value)); 606 /* Convert (if necessary) to absolute time */
607 if (flags != TIMER_ABSTIME) {
608 ktime_t now;
609
610 now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
611 exp = ktime_add(now, exp);
612 }
613
614 alarm_start(&timr->it.alarm.alarmtimer, exp);
602 return 0; 615 return 0;
603} 616}
604 617
@@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
730 if (!alarmtimer_get_rtcdev()) 743 if (!alarmtimer_get_rtcdev())
731 return -ENOTSUPP; 744 return -ENOTSUPP;
732 745
746 if (flags & ~TIMER_ABSTIME)
747 return -EINVAL;
748
733 if (!capable(CAP_WAKE_ALARM)) 749 if (!capable(CAP_WAKE_ALARM))
734 return -EPERM; 750 return -EPERM;
735 751
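
The alarmtimer hunks reject unknown flags with -EINVAL and convert a relative it_value to an absolute expiry before calling alarm_start(), which previously treated every value as absolute. From userspace this is the posix-timers interface; a sketch arming a relative alarm timer, assuming CAP_WAKE_ALARM and an RTC-backed CLOCK_REALTIME_ALARM:

/*
 * Sketch, not part of this patch: arm a CLOCK_REALTIME_ALARM timer with
 * a relative 5 second expiry (flags == 0).  After this patch the kernel
 * converts that to absolute time itself; flags other than 0 or
 * TIMER_ABSTIME now fail with EINVAL.  Link with -lrt on older glibc.
 */
#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <time.h>
#include <unistd.h>

#ifndef CLOCK_REALTIME_ALARM
#define CLOCK_REALTIME_ALARM 8		/* from linux/time.h */
#endif

int main(void)
{
	timer_t t;
	struct itimerspec its;

	if (timer_create(CLOCK_REALTIME_ALARM, NULL, &t)) {
		perror("timer_create");	/* needs CAP_WAKE_ALARM */
		return 1;
	}

	memset(&its, 0, sizeof(its));
	its.it_value.tv_sec = 5;		/* relative expiry */
	if (timer_settime(t, 0, &its, NULL))
		perror("timer_settime");

	pause();				/* default delivery is SIGALRM */
	return 0;
}
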
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 419a52cecd20..33db43a39515 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -165,21 +165,21 @@ static inline void pps_set_freq(s64 freq)
165 165
166static inline int is_error_status(int status) 166static inline int is_error_status(int status)
167{ 167{
168 return (time_status & (STA_UNSYNC|STA_CLOCKERR)) 168 return (status & (STA_UNSYNC|STA_CLOCKERR))
169 /* PPS signal lost when either PPS time or 169 /* PPS signal lost when either PPS time or
170 * PPS frequency synchronization requested 170 * PPS frequency synchronization requested
171 */ 171 */
172 || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) 172 || ((status & (STA_PPSFREQ|STA_PPSTIME))
173 && !(time_status & STA_PPSSIGNAL)) 173 && !(status & STA_PPSSIGNAL))
174 /* PPS jitter exceeded when 174 /* PPS jitter exceeded when
175 * PPS time synchronization requested */ 175 * PPS time synchronization requested */
176 || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) 176 || ((status & (STA_PPSTIME|STA_PPSJITTER))
177 == (STA_PPSTIME|STA_PPSJITTER)) 177 == (STA_PPSTIME|STA_PPSJITTER))
178 /* PPS wander exceeded or calibration error when 178 /* PPS wander exceeded or calibration error when
179 * PPS frequency synchronization requested 179 * PPS frequency synchronization requested
180 */ 180 */
181 || ((time_status & STA_PPSFREQ) 181 || ((status & STA_PPSFREQ)
182 && (time_status & (STA_PPSWANDER|STA_PPSERROR))); 182 && (status & (STA_PPSWANDER|STA_PPSERROR)));
183} 183}
184 184
185static inline void pps_fill_timex(struct timex *txc) 185static inline void pps_fill_timex(struct timex *txc)
@@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
786 time_status |= STA_PPSERROR; 786 time_status |= STA_PPSERROR;
787 pps_errcnt++; 787 pps_errcnt++;
788 pps_dec_freq_interval(); 788 pps_dec_freq_interval();
789 pr_err("hardpps: PPSERROR: interval too long - %ld s\n", 789 printk_deferred(KERN_ERR
790 freq_norm.sec); 790 "hardpps: PPSERROR: interval too long - %ld s\n",
791 freq_norm.sec);
791 return 0; 792 return 0;
792 } 793 }
793 794
@@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
800 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); 801 delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
801 pps_freq = ftemp; 802 pps_freq = ftemp;
802 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { 803 if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
803 pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); 804 printk_deferred(KERN_WARNING
805 "hardpps: PPSWANDER: change=%ld\n", delta);
804 time_status |= STA_PPSWANDER; 806 time_status |= STA_PPSWANDER;
805 pps_stbcnt++; 807 pps_stbcnt++;
806 pps_dec_freq_interval(); 808 pps_dec_freq_interval();
@@ -844,8 +846,9 @@ static void hardpps_update_phase(long error)
844 * the time offset is updated. 846 * the time offset is updated.
845 */ 847 */
846 if (jitter > (pps_jitter << PPS_POPCORN)) { 848 if (jitter > (pps_jitter << PPS_POPCORN)) {
847 pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", 849 printk_deferred(KERN_WARNING
848 jitter, (pps_jitter << PPS_POPCORN)); 850 "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
851 jitter, (pps_jitter << PPS_POPCORN));
849 time_status |= STA_PPSJITTER; 852 time_status |= STA_PPSJITTER;
850 pps_jitcnt++; 853 pps_jitcnt++;
851 } else if (time_status & STA_PPSTIME) { 854 } else if (time_status & STA_PPSTIME) {
@@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
902 time_status |= STA_PPSJITTER; 905 time_status |= STA_PPSJITTER;
903 /* restart the frequency calibration interval */ 906 /* restart the frequency calibration interval */
904 pps_fbase = *raw_ts; 907 pps_fbase = *raw_ts;
905 pr_err("hardpps: PPSJITTER: bad pulse\n"); 908 printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n");
906 return; 909 return;
907 } 910 }
908 911
@@ -923,7 +926,10 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
923 926
924static int __init ntp_tick_adj_setup(char *str) 927static int __init ntp_tick_adj_setup(char *str)
925{ 928{
926 ntp_tick_adj = simple_strtol(str, NULL, 0); 929 int rc = kstrtol(str, 0, (long *)&ntp_tick_adj);
930
931 if (rc)
932 return rc;
927 ntp_tick_adj <<= NTP_SCALE_SHIFT; 933 ntp_tick_adj <<= NTP_SCALE_SHIFT;
928 934
929 return 1; 935 return 1;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 4d23dc4d8139..445106d2c729 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -49,13 +49,6 @@ static u64 notrace jiffy_sched_clock_read(void)
49 return (u64)(jiffies - INITIAL_JIFFIES); 49 return (u64)(jiffies - INITIAL_JIFFIES);
50} 50}
51 51
52static u32 __read_mostly (*read_sched_clock_32)(void);
53
54static u64 notrace read_sched_clock_32_wrapper(void)
55{
56 return read_sched_clock_32();
57}
58
59static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; 52static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
60 53
61static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) 54static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
@@ -176,12 +169,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
176 pr_debug("Registered %pF as sched_clock source\n", read); 169 pr_debug("Registered %pF as sched_clock source\n", read);
177} 170}
178 171
179void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
180{
181 read_sched_clock_32 = read;
182 sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
183}
184
185void __init sched_clock_postinit(void) 172void __init sched_clock_postinit(void)
186{ 173{
187 /* 174 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f7df8ea21707..32d8d6aaedb8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -852,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
852 struct timespec *delta) 852 struct timespec *delta)
853{ 853{
854 if (!timespec_valid_strict(delta)) { 854 if (!timespec_valid_strict(delta)) {
855 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " 855 printk_deferred(KERN_WARNING
856 "sleep delta value!\n"); 856 "__timekeeping_inject_sleeptime: Invalid "
857 "sleep delta value!\n");
857 return; 858 return;
858 } 859 }
859 tk_xtime_add(tk, delta); 860 tk_xtime_add(tk, delta);
@@ -1157,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1157 1158
1158 if (unlikely(tk->clock->maxadj && 1159 if (unlikely(tk->clock->maxadj &&
1159 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { 1160 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
1160 printk_once(KERN_WARNING 1161 printk_deferred_once(KERN_WARNING
1161 "Adjusting %s more than 11%% (%ld vs %ld)\n", 1162 "Adjusting %s more than 11%% (%ld vs %ld)\n",
1162 tk->clock->name, (long)tk->mult + adj, 1163 tk->clock->name, (long)tk->mult + adj,
1163 (long)tk->clock->mult + tk->clock->maxadj); 1164 (long)tk->clock->mult + tk->clock->maxadj);
diff --git a/kernel/torture.c b/kernel/torture.c
index acc9afc2f26e..40bb511cca48 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -335,13 +335,8 @@ static void torture_shuffle_tasks(void)
335 shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); 335 shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask);
336 if (shuffle_idle_cpu >= nr_cpu_ids) 336 if (shuffle_idle_cpu >= nr_cpu_ids)
337 shuffle_idle_cpu = -1; 337 shuffle_idle_cpu = -1;
338 if (shuffle_idle_cpu != -1) { 338 else
339 cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); 339 cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
340 if (cpumask_empty(shuffle_tmp_mask)) {
341 put_online_cpus();
342 return;
343 }
344 }
345 340
346 mutex_lock(&shuffle_task_mutex); 341 mutex_lock(&shuffle_task_mutex);
347 list_for_each_entry(stp, &shuffle_task_list, st_l) 342 list_for_each_entry(stp, &shuffle_task_list, st_l)
@@ -533,7 +528,11 @@ void stutter_wait(const char *title)
533 while (ACCESS_ONCE(stutter_pause_test) || 528 while (ACCESS_ONCE(stutter_pause_test) ||
534 (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { 529 (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
535 if (stutter_pause_test) 530 if (stutter_pause_test)
536 schedule_timeout_interruptible(1); 531 if (ACCESS_ONCE(stutter_pause_test) == 1)
532 schedule_timeout_interruptible(1);
533 else
534 while (ACCESS_ONCE(stutter_pause_test))
535 cond_resched();
537 else 536 else
538 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 537 schedule_timeout_interruptible(round_jiffies_relative(HZ));
539 torture_shutdown_absorb(title); 538 torture_shutdown_absorb(title);
@@ -550,7 +549,11 @@ static int torture_stutter(void *arg)
550 VERBOSE_TOROUT_STRING("torture_stutter task started"); 549 VERBOSE_TOROUT_STRING("torture_stutter task started");
551 do { 550 do {
552 if (!torture_must_stop()) { 551 if (!torture_must_stop()) {
553 schedule_timeout_interruptible(stutter); 552 if (stutter > 1) {
553 schedule_timeout_interruptible(stutter - 1);
554 ACCESS_ONCE(stutter_pause_test) = 2;
555 }
556 schedule_timeout_interruptible(1);
554 ACCESS_ONCE(stutter_pause_test) = 1; 557 ACCESS_ONCE(stutter_pause_test) = 1;
555 } 558 }
556 if (!torture_must_stop()) 559 if (!torture_must_stop())
@@ -596,21 +599,27 @@ static void torture_stutter_cleanup(void)
596 * The runnable parameter points to a flag that controls whether or not 599 * The runnable parameter points to a flag that controls whether or not
597 * the test is currently runnable. If there is no such flag, pass in NULL. 600 * the test is currently runnable. If there is no such flag, pass in NULL.
598 */ 601 */
599void __init torture_init_begin(char *ttype, bool v, int *runnable) 602bool torture_init_begin(char *ttype, bool v, int *runnable)
600{ 603{
601 mutex_lock(&fullstop_mutex); 604 mutex_lock(&fullstop_mutex);
605 if (torture_type != NULL) {
606 pr_alert("torture_init_begin: refusing %s init: %s running",
607 ttype, torture_type);
608 mutex_unlock(&fullstop_mutex);
609 return false;
610 }
602 torture_type = ttype; 611 torture_type = ttype;
603 verbose = v; 612 verbose = v;
604 torture_runnable = runnable; 613 torture_runnable = runnable;
605 fullstop = FULLSTOP_DONTSTOP; 614 fullstop = FULLSTOP_DONTSTOP;
606 615 return true;
607} 616}
608EXPORT_SYMBOL_GPL(torture_init_begin); 617EXPORT_SYMBOL_GPL(torture_init_begin);
609 618
610/* 619/*
611 * Tell the torture module that initialization is complete. 620 * Tell the torture module that initialization is complete.
612 */ 621 */
613void __init torture_init_end(void) 622void torture_init_end(void)
614{ 623{
615 mutex_unlock(&fullstop_mutex); 624 mutex_unlock(&fullstop_mutex);
616 register_reboot_notifier(&torture_shutdown_nb); 625 register_reboot_notifier(&torture_shutdown_nb);
@@ -642,6 +651,9 @@ bool torture_cleanup(void)
642 torture_shuffle_cleanup(); 651 torture_shuffle_cleanup();
643 torture_stutter_cleanup(); 652 torture_stutter_cleanup();
644 torture_onoff_cleanup(); 653 torture_onoff_cleanup();
654 mutex_lock(&fullstop_mutex);
655 torture_type = NULL;
656 mutex_unlock(&fullstop_mutex);
645 return false; 657 return false;
646} 658}
647EXPORT_SYMBOL_GPL(torture_cleanup); 659EXPORT_SYMBOL_GPL(torture_cleanup);
@@ -674,8 +686,10 @@ EXPORT_SYMBOL_GPL(torture_must_stop_irq);
674 */ 686 */
675void torture_kthread_stopping(char *title) 687void torture_kthread_stopping(char *title)
676{ 688{
677 if (verbose) 689 char buf[128];
678 VERBOSE_TOROUT_STRING(title); 690
691 snprintf(buf, sizeof(buf), "Stopping %s", title);
692 VERBOSE_TOROUT_STRING(buf);
679 while (!kthread_should_stop()) { 693 while (!kthread_should_stop()) {
680 torture_shutdown_absorb(title); 694 torture_shutdown_absorb(title);
681 schedule_timeout_uninterruptible(1); 695 schedule_timeout_uninterruptible(1);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8639819f6cef..d4409356f40d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -535,6 +535,36 @@ config MMIOTRACE_TEST
535 535
536 Say N, unless you absolutely know what you are doing. 536 Say N, unless you absolutely know what you are doing.
537 537
538config TRACEPOINT_BENCHMARK
539 bool "Add tracepoint that benchmarks tracepoints"
540 help
541 This option creates the tracepoint "benchmark:benchmark_event".
542 When the tracepoint is enabled, it kicks off a kernel thread that
543 goes into an infinite loop (calling cond_sched() to let other tasks
544 run), and calls the tracepoint. Each iteration will record the time
545 it took to write to the tracepoint and the next iteration that
546 data will be passed to the tracepoint itself. That is, the tracepoint
547 will report the time it took to do the previous tracepoint.
548 The string written to the tracepoint is a static string of 128 bytes
549 to keep the time the same. The initial string is simply a write of
550 "START". The second string records the cold cache time of the first
551 write which is not added to the rest of the calculations.
552
553 As it is a tight loop, it benchmarks as hot cache. That's fine because
554 we care most about hot paths that are probably in cache already.
555
556 An example of the output:
557
558 START
559 first=3672 [COLD CACHED]
560 last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712
561 last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337
562 last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064
563 last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411
564 last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389
565 last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666
566
567
538config RING_BUFFER_BENCHMARK 568config RING_BUFFER_BENCHMARK
539 tristate "Ring buffer benchmark stress tester" 569 tristate "Ring buffer benchmark stress tester"
540 depends on RING_BUFFER 570 depends on RING_BUFFER
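
The TRACEPOINT_BENCHMARK help text above reports last/first/max/min/avg/std/std^2 per iteration, keeping the cold-cache first sample out of the running figures. A userspace analog of that bookkeeping, hedged: this is not the kernel's trace_benchmark code, it just times clock_gettime() and maintains the same kind of running statistics (build with -lm):

/*
 * Sketch: time an operation repeatedly and keep last/first/max/min/avg
 * plus a variance derived from a running sum of squares, mirroring the
 * fields the benchmark tracepoint reports.
 */
#include <math.h>
#include <stdio.h>
#include <time.h>

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
	long long first = 0, last = 0, max = 0, min = -1;
	long long total = 0, total_sq = 0;
	int i, n = 0;

	for (i = 0; i < 1000; i++) {
		struct timespec scratch;
		long long t0, delta;

		t0 = now_ns();
		clock_gettime(CLOCK_MONOTONIC, &scratch);	/* the measured call */
		delta = now_ns() - t0;

		if (!first) {
			first = delta;	/* cold-cache sample, reported separately */
			continue;
		}
		last = delta;
		n++;
		total += delta;
		total_sq += delta * delta;
		if (delta > max)
			max = delta;
		if (min < 0 || delta < min)
			min = delta;
	}

	if (n) {
		long long avg = total / n;
		long long var = total_sq / n - avg * avg;

		printf("last=%lld first=%lld max=%lld min=%lld avg=%lld std^2=%lld std=%.0f\n",
		       last, first, max, min, avg, var, sqrt((double)var));
	}
	return 0;
}
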
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1378e84fbe39..2611613f14f1 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -17,6 +17,7 @@ ifdef CONFIG_TRACING_BRANCHES
17KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 17KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
18endif 18endif
19 19
20CFLAGS_trace_benchmark.o := -I$(src)
20CFLAGS_trace_events_filter.o := -I$(src) 21CFLAGS_trace_events_filter.o := -I$(src)
21 22
22obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o 23obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o
@@ -62,4 +63,6 @@ endif
62obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o 63obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
63obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o 64obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
64 65
66obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
67
65libftrace-y := ftrace.o 68libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4a54a25afa2f..ac9d1dad630b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,7 +62,7 @@
62#define FTRACE_HASH_DEFAULT_BITS 10 62#define FTRACE_HASH_DEFAULT_BITS 10
63#define FTRACE_HASH_MAX_BITS 12 63#define FTRACE_HASH_MAX_BITS 12
64 64
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) 65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
66 66
67#ifdef CONFIG_DYNAMIC_FTRACE 67#ifdef CONFIG_DYNAMIC_FTRACE
68#define INIT_REGEX_LOCK(opsname) \ 68#define INIT_REGEX_LOCK(opsname) \
@@ -103,7 +103,6 @@ static int ftrace_disabled __read_mostly;
103 103
104static DEFINE_MUTEX(ftrace_lock); 104static DEFINE_MUTEX(ftrace_lock);
105 105
106static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
107static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; 106static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
108static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 107static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
109ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 108ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
@@ -171,23 +170,6 @@ int ftrace_nr_registered_ops(void)
171 return cnt; 170 return cnt;
172} 171}
173 172
174static void
175ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
176 struct ftrace_ops *op, struct pt_regs *regs)
177{
178 int bit;
179
180 bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
181 if (bit < 0)
182 return;
183
184 do_for_each_ftrace_op(op, ftrace_global_list) {
185 op->func(ip, parent_ip, op, regs);
186 } while_for_each_ftrace_op(op);
187
188 trace_clear_recursion(bit);
189}
190
191static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 173static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
192 struct ftrace_ops *op, struct pt_regs *regs) 174 struct ftrace_ops *op, struct pt_regs *regs)
193{ 175{
@@ -237,43 +219,6 @@ static int control_ops_alloc(struct ftrace_ops *ops)
237 return 0; 219 return 0;
238} 220}
239 221
240static void update_global_ops(void)
241{
242 ftrace_func_t func = ftrace_global_list_func;
243 void *private = NULL;
244
245 /* The list has its own recursion protection. */
246 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
247
248 /*
249 * If there's only one function registered, then call that
250 * function directly. Otherwise, we need to iterate over the
251 * registered callers.
252 */
253 if (ftrace_global_list == &ftrace_list_end ||
254 ftrace_global_list->next == &ftrace_list_end) {
255 func = ftrace_global_list->func;
256 private = ftrace_global_list->private;
257 /*
258 * As we are calling the function directly.
259 * If it does not have recursion protection,
260 * the function_trace_op needs to be updated
261 * accordingly.
262 */
263 if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE))
264 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
265 }
266
267 /* If we filter on pids, update to use the pid function */
268 if (!list_empty(&ftrace_pids)) {
269 set_ftrace_pid_function(func);
270 func = ftrace_pid_func;
271 }
272
273 global_ops.func = func;
274 global_ops.private = private;
275}
276
277static void ftrace_sync(struct work_struct *work) 222static void ftrace_sync(struct work_struct *work)
278{ 223{
279 /* 224 /*
@@ -301,8 +246,6 @@ static void update_ftrace_function(void)
301{ 246{
302 ftrace_func_t func; 247 ftrace_func_t func;
303 248
304 update_global_ops();
305
306 /* 249 /*
307 * If we are at the end of the list and this ops is 250 * If we are at the end of the list and this ops is
308 * recursion safe and not dynamic and the arch supports passing ops, 251 * recursion safe and not dynamic and the arch supports passing ops,
@@ -314,10 +257,7 @@ static void update_ftrace_function(void)
314 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && 257 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
315 !FTRACE_FORCE_LIST_FUNC)) { 258 !FTRACE_FORCE_LIST_FUNC)) {
316 /* Set the ftrace_ops that the arch callback uses */ 259 /* Set the ftrace_ops that the arch callback uses */
317 if (ftrace_ops_list == &global_ops) 260 set_function_trace_op = ftrace_ops_list;
318 set_function_trace_op = ftrace_global_list;
319 else
320 set_function_trace_op = ftrace_ops_list;
321 func = ftrace_ops_list->func; 261 func = ftrace_ops_list->func;
322 } else { 262 } else {
323 /* Just use the default ftrace_ops */ 263 /* Just use the default ftrace_ops */
@@ -325,12 +265,12 @@ static void update_ftrace_function(void)
325 func = ftrace_ops_list_func; 265 func = ftrace_ops_list_func;
326 } 266 }
327 267
268 update_function_graph_func();
269
328 /* If there's no change, then do nothing more here */ 270 /* If there's no change, then do nothing more here */
329 if (ftrace_trace_function == func) 271 if (ftrace_trace_function == func)
330 return; 272 return;
331 273
332 update_function_graph_func();
333
334 /* 274 /*
335 * If we are using the list function, it doesn't care 275 * If we are using the list function, it doesn't care
336 * about the function_trace_ops. 276 * about the function_trace_ops.
@@ -373,6 +313,11 @@ static void update_ftrace_function(void)
373 ftrace_trace_function = func; 313 ftrace_trace_function = func;
374} 314}
375 315
316int using_ftrace_ops_list_func(void)
317{
318 return ftrace_trace_function == ftrace_ops_list_func;
319}
320
376static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) 321static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
377{ 322{
378 ops->next = *list; 323 ops->next = *list;
@@ -434,16 +379,9 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
434 if (ops->flags & FTRACE_OPS_FL_DELETED) 379 if (ops->flags & FTRACE_OPS_FL_DELETED)
435 return -EINVAL; 380 return -EINVAL;
436 381
437 if (FTRACE_WARN_ON(ops == &global_ops))
438 return -EINVAL;
439
440 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) 382 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
441 return -EBUSY; 383 return -EBUSY;
442 384
443 /* We don't support both control and global flags set. */
444 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
445 return -EINVAL;
446
447#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS 385#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
448 /* 386 /*
449 * If the ftrace_ops specifies SAVE_REGS, then it only can be used 387 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
@@ -461,10 +399,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
461 if (!core_kernel_data((unsigned long)ops)) 399 if (!core_kernel_data((unsigned long)ops))
462 ops->flags |= FTRACE_OPS_FL_DYNAMIC; 400 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
463 401
464 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 402 if (ops->flags & FTRACE_OPS_FL_CONTROL) {
465 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops);
466 ops->flags |= FTRACE_OPS_FL_ENABLED;
467 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
468 if (control_ops_alloc(ops)) 403 if (control_ops_alloc(ops))
469 return -ENOMEM; 404 return -ENOMEM;
470 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); 405 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
@@ -484,15 +419,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
484 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) 419 if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
485 return -EBUSY; 420 return -EBUSY;
486 421
487 if (FTRACE_WARN_ON(ops == &global_ops)) 422 if (ops->flags & FTRACE_OPS_FL_CONTROL) {
488 return -EINVAL;
489
490 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
491 ret = remove_ftrace_list_ops(&ftrace_global_list,
492 &global_ops, ops);
493 if (!ret)
494 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
495 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
496 ret = remove_ftrace_list_ops(&ftrace_control_list, 423 ret = remove_ftrace_list_ops(&ftrace_control_list,
497 &control_ops, ops); 424 &control_ops, ops);
498 } else 425 } else
@@ -895,7 +822,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
895 822
896 local_irq_save(flags); 823 local_irq_save(flags);
897 824
898 stat = &__get_cpu_var(ftrace_profile_stats); 825 stat = this_cpu_ptr(&ftrace_profile_stats);
899 if (!stat->hash || !ftrace_profile_enabled) 826 if (!stat->hash || !ftrace_profile_enabled)
900 goto out; 827 goto out;
901 828
@@ -926,7 +853,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
926 unsigned long flags; 853 unsigned long flags;
927 854
928 local_irq_save(flags); 855 local_irq_save(flags);
929 stat = &__get_cpu_var(ftrace_profile_stats); 856 stat = this_cpu_ptr(&ftrace_profile_stats);
930 if (!stat->hash || !ftrace_profile_enabled) 857 if (!stat->hash || !ftrace_profile_enabled)
931 goto out; 858 goto out;
932 859
@@ -1178,7 +1105,7 @@ struct ftrace_page {
1178static struct ftrace_page *ftrace_pages_start; 1105static struct ftrace_page *ftrace_pages_start;
1179static struct ftrace_page *ftrace_pages; 1106static struct ftrace_page *ftrace_pages;
1180 1107
1181static bool ftrace_hash_empty(struct ftrace_hash *hash) 1108static bool __always_inline ftrace_hash_empty(struct ftrace_hash *hash)
1182{ 1109{
1183 return !hash || !hash->count; 1110 return !hash || !hash->count;
1184} 1111}
@@ -1625,7 +1552,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1625 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip); 1552 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
1626 1553
1627 /* 1554 /*
1555 * If filter_hash is set, we want to match all functions
1556 * that are in the hash but not in the other hash.
1628 * 1557 *
1558 * If filter_hash is not set, then we are decrementing.
1559 * That means we match anything that is in the hash
1560 * and also in the other_hash. That is, we need to turn
1561 * off functions in the other hash because they are disabled
1562 * by this hash.
1629 */ 1563 */
1630 if (filter_hash && in_hash && !in_other_hash) 1564 if (filter_hash && in_hash && !in_other_hash)
1631 match = 1; 1565 match = 1;
@@ -1767,19 +1701,15 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1767 /* 1701 /*
1768 * If this record is being updated from a nop, then 1702 * If this record is being updated from a nop, then
1769 * return UPDATE_MAKE_CALL. 1703 * return UPDATE_MAKE_CALL.
1770 * Otherwise, if the EN flag is set, then return
1771 * UPDATE_MODIFY_CALL_REGS to tell the caller to convert
1772 * from the non-save regs, to a save regs function.
1773 * Otherwise, 1704 * Otherwise,
1774 * return UPDATE_MODIFY_CALL to tell the caller to convert 1705 * return UPDATE_MODIFY_CALL to tell the caller to convert
1775 * from the save regs, to a non-save regs function. 1706 * from the save regs, to a non-save regs function or
1707 * vice versa.
1776 */ 1708 */
1777 if (flag & FTRACE_FL_ENABLED) 1709 if (flag & FTRACE_FL_ENABLED)
1778 return FTRACE_UPDATE_MAKE_CALL; 1710 return FTRACE_UPDATE_MAKE_CALL;
1779 else if (rec->flags & FTRACE_FL_REGS_EN) 1711
1780 return FTRACE_UPDATE_MODIFY_CALL_REGS; 1712 return FTRACE_UPDATE_MODIFY_CALL;
1781 else
1782 return FTRACE_UPDATE_MODIFY_CALL;
1783 } 1713 }
1784 1714
1785 if (update) { 1715 if (update) {
@@ -1821,6 +1751,42 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1821 return ftrace_check_record(rec, enable, 0); 1751 return ftrace_check_record(rec, enable, 0);
1822} 1752}
1823 1753
1754/**
1755 * ftrace_get_addr_new - Get the call address to set to
1756 * @rec: The ftrace record descriptor
1757 *
1758 * If the record has the FTRACE_FL_REGS set, that means that it
1759 * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
1760 * is not set, then it wants to convert to the normal callback.
1761 *
1762 * Returns the address of the trampoline to set to
1763 */
1764unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
1765{
1766 if (rec->flags & FTRACE_FL_REGS)
1767 return (unsigned long)FTRACE_REGS_ADDR;
1768 else
1769 return (unsigned long)FTRACE_ADDR;
1770}
1771
1772/**
1773 * ftrace_get_addr_curr - Get the call address that is already there
1774 * @rec: The ftrace record descriptor
1775 *
1776 * The FTRACE_FL_REGS_EN is set when the record already points to
1777 * a function that saves all the regs. Basically the '_EN' version
1778 * represents the current state of the function.
1779 *
1780 * Returns the address of the trampoline that is currently being called
1781 */
1782unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
1783{
1784 if (rec->flags & FTRACE_FL_REGS_EN)
1785 return (unsigned long)FTRACE_REGS_ADDR;
1786 else
1787 return (unsigned long)FTRACE_ADDR;
1788}
1789
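To make the two helpers' roles concrete, here is a minimal sketch (not part of the patch; needs_regs_switch() is a hypothetical name) of how the "wanted" and "current" trampolines relate:

/*
 * Illustration only: a record has to switch trampolines when the
 * address it should call (FTRACE_FL_REGS) differs from the address
 * it is currently calling (FTRACE_FL_REGS_EN).
 */
static bool needs_regs_switch(struct dyn_ftrace *rec)
{
        return ftrace_get_addr_new(rec) != ftrace_get_addr_curr(rec);
}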
1824static int 1790static int
1825__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1791__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1826{ 1792{
@@ -1828,12 +1794,12 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1828 unsigned long ftrace_addr; 1794 unsigned long ftrace_addr;
1829 int ret; 1795 int ret;
1830 1796
1831 ret = ftrace_update_record(rec, enable); 1797 ftrace_addr = ftrace_get_addr_new(rec);
1832 1798
1833 if (rec->flags & FTRACE_FL_REGS) 1799 /* This needs to be done before we call ftrace_update_record */
1834 ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; 1800 ftrace_old_addr = ftrace_get_addr_curr(rec);
1835 else 1801
1836 ftrace_addr = (unsigned long)FTRACE_ADDR; 1802 ret = ftrace_update_record(rec, enable);
1837 1803
1838 switch (ret) { 1804 switch (ret) {
1839 case FTRACE_UPDATE_IGNORE: 1805 case FTRACE_UPDATE_IGNORE:
@@ -1845,13 +1811,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1845 case FTRACE_UPDATE_MAKE_NOP: 1811 case FTRACE_UPDATE_MAKE_NOP:
1846 return ftrace_make_nop(NULL, rec, ftrace_addr); 1812 return ftrace_make_nop(NULL, rec, ftrace_addr);
1847 1813
1848 case FTRACE_UPDATE_MODIFY_CALL_REGS:
1849 case FTRACE_UPDATE_MODIFY_CALL: 1814 case FTRACE_UPDATE_MODIFY_CALL:
1850 if (rec->flags & FTRACE_FL_REGS)
1851 ftrace_old_addr = (unsigned long)FTRACE_ADDR;
1852 else
1853 ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR;
1854
1855 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); 1815 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
1856 } 1816 }
1857 1817
@@ -2115,7 +2075,6 @@ static void ftrace_startup_enable(int command)
2115 2075
2116static int ftrace_startup(struct ftrace_ops *ops, int command) 2076static int ftrace_startup(struct ftrace_ops *ops, int command)
2117{ 2077{
2118 bool hash_enable = true;
2119 int ret; 2078 int ret;
2120 2079
2121 if (unlikely(ftrace_disabled)) 2080 if (unlikely(ftrace_disabled))
@@ -2128,18 +2087,9 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2128 ftrace_start_up++; 2087 ftrace_start_up++;
2129 command |= FTRACE_UPDATE_CALLS; 2088 command |= FTRACE_UPDATE_CALLS;
2130 2089
2131 /* ops marked global share the filter hashes */
2132 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
2133 ops = &global_ops;
2134 /* Don't update hash if global is already set */
2135 if (global_start_up)
2136 hash_enable = false;
2137 global_start_up++;
2138 }
2139
2140 ops->flags |= FTRACE_OPS_FL_ENABLED; 2090 ops->flags |= FTRACE_OPS_FL_ENABLED;
2141 if (hash_enable) 2091
2142 ftrace_hash_rec_enable(ops, 1); 2092 ftrace_hash_rec_enable(ops, 1);
2143 2093
2144 ftrace_startup_enable(command); 2094 ftrace_startup_enable(command);
2145 2095
@@ -2148,7 +2098,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2148 2098
2149static int ftrace_shutdown(struct ftrace_ops *ops, int command) 2099static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2150{ 2100{
2151 bool hash_disable = true;
2152 int ret; 2101 int ret;
2153 2102
2154 if (unlikely(ftrace_disabled)) 2103 if (unlikely(ftrace_disabled))
@@ -2166,21 +2115,9 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2166 */ 2115 */
2167 WARN_ON_ONCE(ftrace_start_up < 0); 2116 WARN_ON_ONCE(ftrace_start_up < 0);
2168 2117
2169 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 2118 ftrace_hash_rec_disable(ops, 1);
2170 ops = &global_ops;
2171 global_start_up--;
2172 WARN_ON_ONCE(global_start_up < 0);
2173 /* Don't update hash if global still has users */
2174 if (global_start_up) {
2175 WARN_ON_ONCE(!ftrace_start_up);
2176 hash_disable = false;
2177 }
2178 }
2179
2180 if (hash_disable)
2181 ftrace_hash_rec_disable(ops, 1);
2182 2119
2183 if (ops != &global_ops || !global_start_up) 2120 if (!global_start_up)
2184 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 2121 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
2185 2122
2186 command |= FTRACE_UPDATE_CALLS; 2123 command |= FTRACE_UPDATE_CALLS;
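With the FTRACE_OPS_FL_GLOBAL special case gone, every ops goes through the same startup/shutdown accounting. A hedged registration sketch, with my_callback and my_ops as illustrative names only:

static void my_callback(unsigned long ip, unsigned long parent_ip,
                        struct ftrace_ops *op, struct pt_regs *regs)
{
        /* runs for every function matched by my_ops' own filter hash */
}

static struct ftrace_ops my_ops = {
        .func   = my_callback,
        .flags  = FTRACE_OPS_FL_RECURSION_SAFE,
};

/*
 * register_ftrace_function(&my_ops) reaches ftrace_startup(), which now
 * unconditionally enables this ops' hash records;
 * unregister_ftrace_function(&my_ops) reaches ftrace_shutdown(), which
 * disables them symmetrically.
 */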
@@ -3524,10 +3461,6 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3524 struct ftrace_hash *hash; 3461 struct ftrace_hash *hash;
3525 int ret; 3462 int ret;
3526 3463
3527 /* All global ops uses the global ops filters */
3528 if (ops->flags & FTRACE_OPS_FL_GLOBAL)
3529 ops = &global_ops;
3530
3531 if (unlikely(ftrace_disabled)) 3464 if (unlikely(ftrace_disabled))
3532 return -ENODEV; 3465 return -ENODEV;
3533 3466
@@ -3639,8 +3572,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
3639} 3572}
3640EXPORT_SYMBOL_GPL(ftrace_set_notrace); 3573EXPORT_SYMBOL_GPL(ftrace_set_notrace);
3641/** 3574/**
3642 * ftrace_set_filter - set a function to filter on in ftrace 3575 * ftrace_set_global_filter - set a function to filter on with global tracers
3643 * @ops - the ops to set the filter with
3644 * @buf - the string that holds the function filter text. 3576 * @buf - the string that holds the function filter text.
3645 * @len - the length of the string. 3577 * @len - the length of the string.
3646 * @reset - non zero to reset all filters before applying this filter. 3578 * @reset - non zero to reset all filters before applying this filter.
@@ -3655,8 +3587,7 @@ void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
3655EXPORT_SYMBOL_GPL(ftrace_set_global_filter); 3587EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
3656 3588
3657/** 3589/**
3658 * ftrace_set_notrace - set a function to not trace in ftrace 3590 * ftrace_set_global_notrace - set a function to not trace with global tracers
3659 * @ops - the ops to set the notrace filter with
3660 * @buf - the string that holds the function notrace text. 3591 * @buf - the string that holds the function notrace text.
3661 * @len - the length of the string. 3592 * @len - the length of the string.
3662 * @reset - non zero to reset all filters before applying this filter. 3593 * @reset - non zero to reset all filters before applying this filter.
@@ -4443,6 +4374,34 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
4443 4374
4444#endif /* CONFIG_DYNAMIC_FTRACE */ 4375#endif /* CONFIG_DYNAMIC_FTRACE */
4445 4376
4377__init void ftrace_init_global_array_ops(struct trace_array *tr)
4378{
4379 tr->ops = &global_ops;
4380 tr->ops->private = tr;
4381}
4382
4383void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
4384{
4385 /* If we filter on pids, update to use the pid function */
4386 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
4387 if (WARN_ON(tr->ops->func != ftrace_stub))
4388 printk("ftrace ops had %pS for function\n",
4389 tr->ops->func);
4390 /* Only the top level instance does pid tracing */
4391 if (!list_empty(&ftrace_pids)) {
4392 set_ftrace_pid_function(func);
4393 func = ftrace_pid_func;
4394 }
4395 }
4396 tr->ops->func = func;
4397 tr->ops->private = tr;
4398}
4399
4400void ftrace_reset_array_ops(struct trace_array *tr)
4401{
4402 tr->ops->func = ftrace_stub;
4403}
4404
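A hedged sketch of how a per-instance tracer is expected to wire these helpers into its init/reset callbacks (illustrative names, not the verbatim function-tracer code):

static void my_func_call(unsigned long ip, unsigned long parent_ip,
                         struct ftrace_ops *op, struct pt_regs *regs)
{
        /* ftrace_init_array_ops() stored the trace_array in op->private */
        struct trace_array *tr = op->private;

        trace_printk("instance %p traced %ps\n", tr, (void *)ip);
}

static int my_tracer_init(struct trace_array *tr)
{
        ftrace_init_array_ops(tr, my_func_call);
        return 0;
}

static void my_tracer_reset(struct trace_array *tr)
{
        ftrace_reset_array_ops(tr);
}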
4446static void 4405static void
4447ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, 4406ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4448 struct ftrace_ops *op, struct pt_regs *regs) 4407 struct ftrace_ops *op, struct pt_regs *regs)
@@ -4501,9 +4460,16 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4501 */ 4460 */
4502 preempt_disable_notrace(); 4461 preempt_disable_notrace();
4503 do_for_each_ftrace_op(op, ftrace_ops_list) { 4462 do_for_each_ftrace_op(op, ftrace_ops_list) {
4504 if (ftrace_ops_test(op, ip, regs)) 4463 if (ftrace_ops_test(op, ip, regs)) {
4464 if (WARN_ON(!op->func)) {
4465 function_trace_stop = 1;
4466 printk("op=%p %pS\n", op, op);
4467 goto out;
4468 }
4505 op->func(ip, parent_ip, op, regs); 4469 op->func(ip, parent_ip, op, regs);
4470 }
4506 } while_for_each_ftrace_op(op); 4471 } while_for_each_ftrace_op(op);
4472out:
4507 preempt_enable_notrace(); 4473 preempt_enable_notrace();
4508 trace_clear_recursion(bit); 4474 trace_clear_recursion(bit);
4509} 4475}
@@ -4908,7 +4874,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
4908#ifdef CONFIG_FUNCTION_GRAPH_TRACER 4874#ifdef CONFIG_FUNCTION_GRAPH_TRACER
4909 4875
4910static int ftrace_graph_active; 4876static int ftrace_graph_active;
4911static struct notifier_block ftrace_suspend_notifier;
4912 4877
4913int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 4878int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
4914{ 4879{
@@ -5054,13 +5019,6 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
5054 return NOTIFY_DONE; 5019 return NOTIFY_DONE;
5055} 5020}
5056 5021
5057/* Just a place holder for function graph */
5058static struct ftrace_ops fgraph_ops __read_mostly = {
5059 .func = ftrace_stub,
5060 .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
5061 FTRACE_OPS_FL_RECURSION_SAFE,
5062};
5063
5064static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) 5022static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
5065{ 5023{
5066 if (!ftrace_ops_test(&global_ops, trace->func, NULL)) 5024 if (!ftrace_ops_test(&global_ops, trace->func, NULL))
@@ -5085,6 +5043,10 @@ static void update_function_graph_func(void)
5085 ftrace_graph_entry = ftrace_graph_entry_test; 5043 ftrace_graph_entry = ftrace_graph_entry_test;
5086} 5044}
5087 5045
5046static struct notifier_block ftrace_suspend_notifier = {
5047 .notifier_call = ftrace_suspend_notifier_call,
5048};
5049
5088int register_ftrace_graph(trace_func_graph_ret_t retfunc, 5050int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5089 trace_func_graph_ent_t entryfunc) 5051 trace_func_graph_ent_t entryfunc)
5090{ 5052{
@@ -5098,7 +5060,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5098 goto out; 5060 goto out;
5099 } 5061 }
5100 5062
5101 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
5102 register_pm_notifier(&ftrace_suspend_notifier); 5063 register_pm_notifier(&ftrace_suspend_notifier);
5103 5064
5104 ftrace_graph_active++; 5065 ftrace_graph_active++;
@@ -5120,7 +5081,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5120 ftrace_graph_entry = ftrace_graph_entry_test; 5081 ftrace_graph_entry = ftrace_graph_entry_test;
5121 update_function_graph_func(); 5082 update_function_graph_func();
5122 5083
5123 ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); 5084 /* Function graph doesn't use the .func field of global_ops */
5085 global_ops.flags |= FTRACE_OPS_FL_STUB;
5086
5087 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
5124 5088
5125out: 5089out:
5126 mutex_unlock(&ftrace_lock); 5090 mutex_unlock(&ftrace_lock);
@@ -5138,7 +5102,8 @@ void unregister_ftrace_graph(void)
5138 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 5102 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
5139 ftrace_graph_entry = ftrace_graph_entry_stub; 5103 ftrace_graph_entry = ftrace_graph_entry_stub;
5140 __ftrace_graph_entry = ftrace_graph_entry_stub; 5104 __ftrace_graph_entry = ftrace_graph_entry_stub;
5141 ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); 5105 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
5106 global_ops.flags &= ~FTRACE_OPS_FL_STUB;
5142 unregister_pm_notifier(&ftrace_suspend_notifier); 5107 unregister_pm_notifier(&ftrace_suspend_notifier);
5143 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5108 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5144 5109
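A hedged usage sketch of the graph-tracer entry points whose internals change above (my_entry and my_return are illustrative names):

static int my_entry(struct ftrace_graph_ent *trace)
{
        return 1;       /* nonzero: record this function's return too */
}

static void my_return(struct ftrace_graph_ret *trace)
{
        /* exit hook; timing is in trace->calltime and trace->rettime */
}

/*
 * register_ftrace_graph(my_return, my_entry) now piggybacks on
 * global_ops (temporarily flagged FTRACE_OPS_FL_STUB) instead of the
 * removed fgraph_ops; unregister_ftrace_graph() clears the flag again.
 */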
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c634868c2921..ff7027199a9a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -543,7 +543,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
543 * as data is added to any of the @buffer's cpu buffers. Otherwise 543 * as data is added to any of the @buffer's cpu buffers. Otherwise
544 * it will wait for data to be added to a specific cpu buffer. 544 * it will wait for data to be added to a specific cpu buffer.
545 */ 545 */
546void ring_buffer_wait(struct ring_buffer *buffer, int cpu) 546int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
547{ 547{
548 struct ring_buffer_per_cpu *cpu_buffer; 548 struct ring_buffer_per_cpu *cpu_buffer;
549 DEFINE_WAIT(wait); 549 DEFINE_WAIT(wait);
@@ -557,6 +557,8 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
557 if (cpu == RING_BUFFER_ALL_CPUS) 557 if (cpu == RING_BUFFER_ALL_CPUS)
558 work = &buffer->irq_work; 558 work = &buffer->irq_work;
559 else { 559 else {
560 if (!cpumask_test_cpu(cpu, buffer->cpumask))
561 return -ENODEV;
560 cpu_buffer = buffer->buffers[cpu]; 562 cpu_buffer = buffer->buffers[cpu];
561 work = &cpu_buffer->irq_work; 563 work = &cpu_buffer->irq_work;
562 } 564 }
@@ -591,6 +593,7 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
591 schedule(); 593 schedule();
592 594
593 finish_wait(&work->waiters, &wait); 595 finish_wait(&work->waiters, &wait);
596 return 0;
594} 597}
595 598
596/** 599/**
@@ -613,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
613 struct ring_buffer_per_cpu *cpu_buffer; 616 struct ring_buffer_per_cpu *cpu_buffer;
614 struct rb_irq_work *work; 617 struct rb_irq_work *work;
615 618
616 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
617 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
618 return POLLIN | POLLRDNORM;
619
620 if (cpu == RING_BUFFER_ALL_CPUS) 619 if (cpu == RING_BUFFER_ALL_CPUS)
621 work = &buffer->irq_work; 620 work = &buffer->irq_work;
622 else { 621 else {
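Because ring_buffer_wait() now returns an int, callers must check for -ENODEV on invalid CPUs. A minimal caller sketch (wait_for_cpu_data() is a hypothetical helper):

static int wait_for_cpu_data(struct ring_buffer *buffer, int cpu)
{
        int ret;

        /* -ENODEV means @cpu is not in the buffer's cpumask */
        ret = ring_buffer_wait(buffer, cpu);
        if (ret)
                return ret;

        /* woken up: data should be available on @cpu */
        return 0;
}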
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 737b0efa1a62..bda9621638cc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -275,7 +275,7 @@ int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
275} 275}
276EXPORT_SYMBOL_GPL(call_filter_check_discard); 276EXPORT_SYMBOL_GPL(call_filter_check_discard);
277 277
278cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) 278static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
279{ 279{
280 u64 ts; 280 u64 ts;
281 281
@@ -466,6 +466,12 @@ int __trace_puts(unsigned long ip, const char *str, int size)
466 struct print_entry *entry; 466 struct print_entry *entry;
467 unsigned long irq_flags; 467 unsigned long irq_flags;
468 int alloc; 468 int alloc;
469 int pc;
470
471 if (!(trace_flags & TRACE_ITER_PRINTK))
472 return 0;
473
474 pc = preempt_count();
469 475
470 if (unlikely(tracing_selftest_running || tracing_disabled)) 476 if (unlikely(tracing_selftest_running || tracing_disabled))
471 return 0; 477 return 0;
@@ -475,7 +481,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
475 local_save_flags(irq_flags); 481 local_save_flags(irq_flags);
476 buffer = global_trace.trace_buffer.buffer; 482 buffer = global_trace.trace_buffer.buffer;
477 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, 483 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
478 irq_flags, preempt_count()); 484 irq_flags, pc);
479 if (!event) 485 if (!event)
480 return 0; 486 return 0;
481 487
@@ -492,6 +498,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
492 entry->buf[size] = '\0'; 498 entry->buf[size] = '\0';
493 499
494 __buffer_unlock_commit(buffer, event); 500 __buffer_unlock_commit(buffer, event);
501 ftrace_trace_stack(buffer, irq_flags, 4, pc);
495 502
496 return size; 503 return size;
497} 504}
@@ -509,6 +516,12 @@ int __trace_bputs(unsigned long ip, const char *str)
509 struct bputs_entry *entry; 516 struct bputs_entry *entry;
510 unsigned long irq_flags; 517 unsigned long irq_flags;
511 int size = sizeof(struct bputs_entry); 518 int size = sizeof(struct bputs_entry);
519 int pc;
520
521 if (!(trace_flags & TRACE_ITER_PRINTK))
522 return 0;
523
524 pc = preempt_count();
512 525
513 if (unlikely(tracing_selftest_running || tracing_disabled)) 526 if (unlikely(tracing_selftest_running || tracing_disabled))
514 return 0; 527 return 0;
@@ -516,7 +529,7 @@ int __trace_bputs(unsigned long ip, const char *str)
516 local_save_flags(irq_flags); 529 local_save_flags(irq_flags);
517 buffer = global_trace.trace_buffer.buffer; 530 buffer = global_trace.trace_buffer.buffer;
518 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, 531 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
519 irq_flags, preempt_count()); 532 irq_flags, pc);
520 if (!event) 533 if (!event)
521 return 0; 534 return 0;
522 535
@@ -525,6 +538,7 @@ int __trace_bputs(unsigned long ip, const char *str)
525 entry->str = str; 538 entry->str = str;
526 539
527 __buffer_unlock_commit(buffer, event); 540 __buffer_unlock_commit(buffer, event);
541 ftrace_trace_stack(buffer, irq_flags, 4, pc);
528 542
529 return 1; 543 return 1;
530} 544}
@@ -599,7 +613,7 @@ static int alloc_snapshot(struct trace_array *tr)
599 return 0; 613 return 0;
600} 614}
601 615
602void free_snapshot(struct trace_array *tr) 616static void free_snapshot(struct trace_array *tr)
603{ 617{
604 /* 618 /*
605 * We don't free the ring buffer. instead, resize it because 619 * We don't free the ring buffer. instead, resize it because
@@ -963,27 +977,9 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
963 return cnt; 977 return cnt;
964} 978}
965 979
966/*
967 * ftrace_max_lock is used to protect the swapping of buffers
968 * when taking a max snapshot. The buffers themselves are
969 * protected by per_cpu spinlocks. But the action of the swap
970 * needs its own lock.
971 *
972 * This is defined as a arch_spinlock_t in order to help
973 * with performance when lockdep debugging is enabled.
974 *
975 * It is also used in other places outside the update_max_tr
976 * so it needs to be defined outside of the
977 * CONFIG_TRACER_MAX_TRACE.
978 */
979static arch_spinlock_t ftrace_max_lock =
980 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
981
982unsigned long __read_mostly tracing_thresh; 980unsigned long __read_mostly tracing_thresh;
983 981
984#ifdef CONFIG_TRACER_MAX_TRACE 982#ifdef CONFIG_TRACER_MAX_TRACE
985unsigned long __read_mostly tracing_max_latency;
986
987/* 983/*
988 * Copy the new maximum trace into the separate maximum-trace 984 * Copy the new maximum trace into the separate maximum-trace
989 * structure. (this way the maximum trace is permanently saved, 985 * structure. (this way the maximum trace is permanently saved,
@@ -1000,7 +996,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
1000 max_buf->cpu = cpu; 996 max_buf->cpu = cpu;
1001 max_buf->time_start = data->preempt_timestamp; 997 max_buf->time_start = data->preempt_timestamp;
1002 998
1003 max_data->saved_latency = tracing_max_latency; 999 max_data->saved_latency = tr->max_latency;
1004 max_data->critical_start = data->critical_start; 1000 max_data->critical_start = data->critical_start;
1005 max_data->critical_end = data->critical_end; 1001 max_data->critical_end = data->critical_end;
1006 1002
@@ -1048,14 +1044,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
1048 return; 1044 return;
1049 } 1045 }
1050 1046
1051 arch_spin_lock(&ftrace_max_lock); 1047 arch_spin_lock(&tr->max_lock);
1052 1048
1053 buf = tr->trace_buffer.buffer; 1049 buf = tr->trace_buffer.buffer;
1054 tr->trace_buffer.buffer = tr->max_buffer.buffer; 1050 tr->trace_buffer.buffer = tr->max_buffer.buffer;
1055 tr->max_buffer.buffer = buf; 1051 tr->max_buffer.buffer = buf;
1056 1052
1057 __update_max_tr(tr, tsk, cpu); 1053 __update_max_tr(tr, tsk, cpu);
1058 arch_spin_unlock(&ftrace_max_lock); 1054 arch_spin_unlock(&tr->max_lock);
1059} 1055}
1060 1056
1061/** 1057/**
@@ -1081,7 +1077,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
1081 return; 1077 return;
1082 } 1078 }
1083 1079
1084 arch_spin_lock(&ftrace_max_lock); 1080 arch_spin_lock(&tr->max_lock);
1085 1081
1086 ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); 1082 ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
1087 1083
@@ -1099,17 +1095,17 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
1099 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 1095 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
1100 1096
1101 __update_max_tr(tr, tsk, cpu); 1097 __update_max_tr(tr, tsk, cpu);
1102 arch_spin_unlock(&ftrace_max_lock); 1098 arch_spin_unlock(&tr->max_lock);
1103} 1099}
1104#endif /* CONFIG_TRACER_MAX_TRACE */ 1100#endif /* CONFIG_TRACER_MAX_TRACE */
1105 1101
1106static void default_wait_pipe(struct trace_iterator *iter) 1102static int wait_on_pipe(struct trace_iterator *iter)
1107{ 1103{
1108 /* Iterators are static, they should be filled or empty */ 1104 /* Iterators are static, they should be filled or empty */
1109 if (trace_buffer_iter(iter, iter->cpu_file)) 1105 if (trace_buffer_iter(iter, iter->cpu_file))
1110 return; 1106 return 0;
1111 1107
1112 ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); 1108 return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
1113} 1109}
1114 1110
1115#ifdef CONFIG_FTRACE_STARTUP_TEST 1111#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1220,8 +1216,6 @@ int register_tracer(struct tracer *type)
1220 else 1216 else
1221 if (!type->flags->opts) 1217 if (!type->flags->opts)
1222 type->flags->opts = dummy_tracer_opt; 1218 type->flags->opts = dummy_tracer_opt;
1223 if (!type->wait_pipe)
1224 type->wait_pipe = default_wait_pipe;
1225 1219
1226 ret = run_tracer_selftest(type); 1220 ret = run_tracer_selftest(type);
1227 if (ret < 0) 1221 if (ret < 0)
@@ -1305,22 +1299,71 @@ void tracing_reset_all_online_cpus(void)
1305 } 1299 }
1306} 1300}
1307 1301
1308#define SAVED_CMDLINES 128 1302#define SAVED_CMDLINES_DEFAULT 128
1309#define NO_CMDLINE_MAP UINT_MAX 1303#define NO_CMDLINE_MAP UINT_MAX
1310static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
1311static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
1312static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
1313static int cmdline_idx;
1314static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; 1304static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1305struct saved_cmdlines_buffer {
1306 unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
1307 unsigned *map_cmdline_to_pid;
1308 unsigned cmdline_num;
1309 int cmdline_idx;
1310 char *saved_cmdlines;
1311};
1312static struct saved_cmdlines_buffer *savedcmd;
1315 1313
1316/* temporary disable recording */ 1314/* temporary disable recording */
1317static atomic_t trace_record_cmdline_disabled __read_mostly; 1315static atomic_t trace_record_cmdline_disabled __read_mostly;
1318 1316
1319static void trace_init_cmdlines(void) 1317static inline char *get_saved_cmdlines(int idx)
1320{ 1318{
1321 memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); 1319 return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
1322 memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); 1320}
1323 cmdline_idx = 0; 1321
1322static inline void set_cmdline(int idx, const char *cmdline)
1323{
1324 memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN);
1325}
1326
1327static int allocate_cmdlines_buffer(unsigned int val,
1328 struct saved_cmdlines_buffer *s)
1329{
1330 s->map_cmdline_to_pid = kmalloc(val * sizeof(*s->map_cmdline_to_pid),
1331 GFP_KERNEL);
1332 if (!s->map_cmdline_to_pid)
1333 return -ENOMEM;
1334
1335 s->saved_cmdlines = kmalloc(val * TASK_COMM_LEN, GFP_KERNEL);
1336 if (!s->saved_cmdlines) {
1337 kfree(s->map_cmdline_to_pid);
1338 return -ENOMEM;
1339 }
1340
1341 s->cmdline_idx = 0;
1342 s->cmdline_num = val;
1343 memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
1344 sizeof(s->map_pid_to_cmdline));
1345 memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
1346 val * sizeof(*s->map_cmdline_to_pid));
1347
1348 return 0;
1349}
1350
1351static int trace_create_savedcmd(void)
1352{
1353 int ret;
1354
1355 savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
1356 if (!savedcmd)
1357 return -ENOMEM;
1358
1359 ret = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT, savedcmd);
1360 if (ret < 0) {
1361 kfree(savedcmd);
1362 savedcmd = NULL;
1363 return -ENOMEM;
1364 }
1365
1366 return 0;
1324} 1367}
1325 1368
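The flat layout behind get_saved_cmdlines() keeps every comm in a single allocation of TASK_COMM_LEN bytes per slot. A hedged debugging sketch (dump_saved_cmdlines() is a hypothetical helper) that walks it directly:

static void dump_saved_cmdlines(struct saved_cmdlines_buffer *s)
{
        unsigned int idx;

        for (idx = 0; idx < s->cmdline_num; idx++) {
                if (s->map_cmdline_to_pid[idx] == NO_CMDLINE_MAP)
                        continue;
                /* slot idx starts at offset idx * TASK_COMM_LEN */
                pr_info("slot %u: pid %u comm %s\n", idx,
                        s->map_cmdline_to_pid[idx],
                        &s->saved_cmdlines[idx * TASK_COMM_LEN]);
        }
}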
1326int is_tracing_stopped(void) 1369int is_tracing_stopped(void)
@@ -1353,7 +1396,7 @@ void tracing_start(void)
1353 } 1396 }
1354 1397
1355 /* Prevent the buffers from switching */ 1398 /* Prevent the buffers from switching */
1356 arch_spin_lock(&ftrace_max_lock); 1399 arch_spin_lock(&global_trace.max_lock);
1357 1400
1358 buffer = global_trace.trace_buffer.buffer; 1401 buffer = global_trace.trace_buffer.buffer;
1359 if (buffer) 1402 if (buffer)
@@ -1365,9 +1408,8 @@ void tracing_start(void)
1365 ring_buffer_record_enable(buffer); 1408 ring_buffer_record_enable(buffer);
1366#endif 1409#endif
1367 1410
1368 arch_spin_unlock(&ftrace_max_lock); 1411 arch_spin_unlock(&global_trace.max_lock);
1369 1412
1370 ftrace_start();
1371 out: 1413 out:
1372 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); 1414 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1373} 1415}
@@ -1414,13 +1456,12 @@ void tracing_stop(void)
1414 struct ring_buffer *buffer; 1456 struct ring_buffer *buffer;
1415 unsigned long flags; 1457 unsigned long flags;
1416 1458
1417 ftrace_stop();
1418 raw_spin_lock_irqsave(&global_trace.start_lock, flags); 1459 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1419 if (global_trace.stop_count++) 1460 if (global_trace.stop_count++)
1420 goto out; 1461 goto out;
1421 1462
1422 /* Prevent the buffers from switching */ 1463 /* Prevent the buffers from switching */
1423 arch_spin_lock(&ftrace_max_lock); 1464 arch_spin_lock(&global_trace.max_lock);
1424 1465
1425 buffer = global_trace.trace_buffer.buffer; 1466 buffer = global_trace.trace_buffer.buffer;
1426 if (buffer) 1467 if (buffer)
@@ -1432,7 +1473,7 @@ void tracing_stop(void)
1432 ring_buffer_record_disable(buffer); 1473 ring_buffer_record_disable(buffer);
1433#endif 1474#endif
1434 1475
1435 arch_spin_unlock(&ftrace_max_lock); 1476 arch_spin_unlock(&global_trace.max_lock);
1436 1477
1437 out: 1478 out:
1438 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); 1479 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
@@ -1461,12 +1502,12 @@ static void tracing_stop_tr(struct trace_array *tr)
1461 1502
1462void trace_stop_cmdline_recording(void); 1503void trace_stop_cmdline_recording(void);
1463 1504
1464static void trace_save_cmdline(struct task_struct *tsk) 1505static int trace_save_cmdline(struct task_struct *tsk)
1465{ 1506{
1466 unsigned pid, idx; 1507 unsigned pid, idx;
1467 1508
1468 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) 1509 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
1469 return; 1510 return 0;
1470 1511
1471 /* 1512 /*
1472 * It's not the end of the world if we don't get 1513 * It's not the end of the world if we don't get
@@ -1475,11 +1516,11 @@ static void trace_save_cmdline(struct task_struct *tsk)
1475 * so if we miss here, then better luck next time. 1516 * so if we miss here, then better luck next time.
1476 */ 1517 */
1477 if (!arch_spin_trylock(&trace_cmdline_lock)) 1518 if (!arch_spin_trylock(&trace_cmdline_lock))
1478 return; 1519 return 0;
1479 1520
1480 idx = map_pid_to_cmdline[tsk->pid]; 1521 idx = savedcmd->map_pid_to_cmdline[tsk->pid];
1481 if (idx == NO_CMDLINE_MAP) { 1522 if (idx == NO_CMDLINE_MAP) {
1482 idx = (cmdline_idx + 1) % SAVED_CMDLINES; 1523 idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
1483 1524
1484 /* 1525 /*
1485 * Check whether the cmdline buffer at idx has a pid 1526 * Check whether the cmdline buffer at idx has a pid
@@ -1487,22 +1528,24 @@ static void trace_save_cmdline(struct task_struct *tsk)
1487 * need to clear the map_pid_to_cmdline. Otherwise we 1528 * need to clear the map_pid_to_cmdline. Otherwise we
1488 * would read the new comm for the old pid. 1529 * would read the new comm for the old pid.
1489 */ 1530 */
1490 pid = map_cmdline_to_pid[idx]; 1531 pid = savedcmd->map_cmdline_to_pid[idx];
1491 if (pid != NO_CMDLINE_MAP) 1532 if (pid != NO_CMDLINE_MAP)
1492 map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; 1533 savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
1493 1534
1494 map_cmdline_to_pid[idx] = tsk->pid; 1535 savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
1495 map_pid_to_cmdline[tsk->pid] = idx; 1536 savedcmd->map_pid_to_cmdline[tsk->pid] = idx;
1496 1537
1497 cmdline_idx = idx; 1538 savedcmd->cmdline_idx = idx;
1498 } 1539 }
1499 1540
1500 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 1541 set_cmdline(idx, tsk->comm);
1501 1542
1502 arch_spin_unlock(&trace_cmdline_lock); 1543 arch_spin_unlock(&trace_cmdline_lock);
1544
1545 return 1;
1503} 1546}
1504 1547
1505void trace_find_cmdline(int pid, char comm[]) 1548static void __trace_find_cmdline(int pid, char comm[])
1506{ 1549{
1507 unsigned map; 1550 unsigned map;
1508 1551
@@ -1521,13 +1564,19 @@ void trace_find_cmdline(int pid, char comm[])
1521 return; 1564 return;
1522 } 1565 }
1523 1566
1524 preempt_disable(); 1567 map = savedcmd->map_pid_to_cmdline[pid];
1525 arch_spin_lock(&trace_cmdline_lock);
1526 map = map_pid_to_cmdline[pid];
1527 if (map != NO_CMDLINE_MAP) 1568 if (map != NO_CMDLINE_MAP)
1528 strcpy(comm, saved_cmdlines[map]); 1569 strcpy(comm, get_saved_cmdlines(map));
1529 else 1570 else
1530 strcpy(comm, "<...>"); 1571 strcpy(comm, "<...>");
1572}
1573
1574void trace_find_cmdline(int pid, char comm[])
1575{
1576 preempt_disable();
1577 arch_spin_lock(&trace_cmdline_lock);
1578
1579 __trace_find_cmdline(pid, comm);
1531 1580
1532 arch_spin_unlock(&trace_cmdline_lock); 1581 arch_spin_unlock(&trace_cmdline_lock);
1533 preempt_enable(); 1582 preempt_enable();
@@ -1541,9 +1590,8 @@ void tracing_record_cmdline(struct task_struct *tsk)
1541 if (!__this_cpu_read(trace_cmdline_save)) 1590 if (!__this_cpu_read(trace_cmdline_save))
1542 return; 1591 return;
1543 1592
1544 __this_cpu_write(trace_cmdline_save, false); 1593 if (trace_save_cmdline(tsk))
1545 1594 __this_cpu_write(trace_cmdline_save, false);
1546 trace_save_cmdline(tsk);
1547} 1595}
1548 1596
1549void 1597void
@@ -1746,7 +1794,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1746 */ 1794 */
1747 barrier(); 1795 barrier();
1748 if (use_stack == 1) { 1796 if (use_stack == 1) {
1749 trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; 1797 trace.entries = this_cpu_ptr(ftrace_stack.calls);
1750 trace.max_entries = FTRACE_STACK_MAX_ENTRIES; 1798 trace.max_entries = FTRACE_STACK_MAX_ENTRIES;
1751 1799
1752 if (regs) 1800 if (regs)
@@ -1995,7 +2043,21 @@ void trace_printk_init_buffers(void)
1995 if (alloc_percpu_trace_buffer()) 2043 if (alloc_percpu_trace_buffer())
1996 return; 2044 return;
1997 2045
1998 pr_info("ftrace: Allocated trace_printk buffers\n"); 2046 /* trace_printk() is for debug use only. Don't use it in production. */
2047
2048 pr_warning("\n**********************************************************\n");
2049 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
2050 pr_warning("** **\n");
2051 pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
2052 pr_warning("** **\n");
2053 pr_warning("** This means that this is a DEBUG kernel and it is **\n");
2054 pr_warning("** unsafe for produciton use. **\n");
2055 pr_warning("** **\n");
2056 pr_warning("** If you see this message and you are not debugging **\n");
2057 pr_warning("** the kernel, report this immediately to your vendor! **\n");
2058 pr_warning("** **\n");
2059 pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
2060 pr_warning("**********************************************************\n");
1999 2061
2000 /* Expand the buffers to set size */ 2062 /* Expand the buffers to set size */
2001 tracing_update_buffers(); 2063 tracing_update_buffers();
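For context, this banner is triggered by any compiled-in trace_printk() call. A minimal, hedged example of such a debug-only call (my_debug_hook() is an illustrative name):

static void my_debug_hook(void)
{
        /* written to the ftrace ring buffer, not the console */
        trace_printk("hit debug hook at %lu jiffies\n", jiffies);
}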
@@ -3333,7 +3395,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3333 mutex_lock(&tracing_cpumask_update_lock); 3395 mutex_lock(&tracing_cpumask_update_lock);
3334 3396
3335 local_irq_disable(); 3397 local_irq_disable();
3336 arch_spin_lock(&ftrace_max_lock); 3398 arch_spin_lock(&tr->max_lock);
3337 for_each_tracing_cpu(cpu) { 3399 for_each_tracing_cpu(cpu) {
3338 /* 3400 /*
3339 * Increase/decrease the disabled counter if we are 3401 * Increase/decrease the disabled counter if we are
@@ -3350,7 +3412,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
3350 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); 3412 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
3351 } 3413 }
3352 } 3414 }
3353 arch_spin_unlock(&ftrace_max_lock); 3415 arch_spin_unlock(&tr->max_lock);
3354 local_irq_enable(); 3416 local_irq_enable();
3355 3417
3356 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); 3418 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
@@ -3592,6 +3654,7 @@ static const char readme_msg[] =
3592 " trace_options\t\t- Set format or modify how tracing happens\n" 3654 " trace_options\t\t- Set format or modify how tracing happens\n"
3593 "\t\t\t Disable an option by adding a suffix 'no' to the\n" 3655 "\t\t\t Disable an option by adding a suffix 'no' to the\n"
3594 "\t\t\t option name\n" 3656 "\t\t\t option name\n"
3657 " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n"
3595#ifdef CONFIG_DYNAMIC_FTRACE 3658#ifdef CONFIG_DYNAMIC_FTRACE
3596 "\n available_filter_functions - list of functions that can be filtered on\n" 3659 "\n available_filter_functions - list of functions that can be filtered on\n"
3597 " set_ftrace_filter\t- echo function name in here to only trace these\n" 3660 " set_ftrace_filter\t- echo function name in here to only trace these\n"
@@ -3705,55 +3768,153 @@ static const struct file_operations tracing_readme_fops = {
3705 .llseek = generic_file_llseek, 3768 .llseek = generic_file_llseek,
3706}; 3769};
3707 3770
3771static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
3772{
3773 unsigned int *ptr = v;
3774
3775 if (*pos || m->count)
3776 ptr++;
3777
3778 (*pos)++;
3779
3780 for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num];
3781 ptr++) {
3782 if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
3783 continue;
3784
3785 return ptr;
3786 }
3787
3788 return NULL;
3789}
3790
3791static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
3792{
3793 void *v;
3794 loff_t l = 0;
3795
3796 preempt_disable();
3797 arch_spin_lock(&trace_cmdline_lock);
3798
3799 v = &savedcmd->map_cmdline_to_pid[0];
3800 while (l <= *pos) {
3801 v = saved_cmdlines_next(m, v, &l);
3802 if (!v)
3803 return NULL;
3804 }
3805
3806 return v;
3807}
3808
3809static void saved_cmdlines_stop(struct seq_file *m, void *v)
3810{
3811 arch_spin_unlock(&trace_cmdline_lock);
3812 preempt_enable();
3813}
3814
3815static int saved_cmdlines_show(struct seq_file *m, void *v)
3816{
3817 char buf[TASK_COMM_LEN];
3818 unsigned int *pid = v;
3819
3820 __trace_find_cmdline(*pid, buf);
3821 seq_printf(m, "%d %s\n", *pid, buf);
3822 return 0;
3823}
3824
3825static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
3826 .start = saved_cmdlines_start,
3827 .next = saved_cmdlines_next,
3828 .stop = saved_cmdlines_stop,
3829 .show = saved_cmdlines_show,
3830};
3831
3832static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
3833{
3834 if (tracing_disabled)
3835 return -ENODEV;
3836
3837 return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
3838}
3839
3840static const struct file_operations tracing_saved_cmdlines_fops = {
3841 .open = tracing_saved_cmdlines_open,
3842 .read = seq_read,
3843 .llseek = seq_lseek,
3844 .release = seq_release,
3845};
3846
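From userspace the new seq_file interface reads like any other tracing file, one "<pid> <comm>" pair per line. A hedged example reader, assuming the usual debugfs mount point:

#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/sys/kernel/debug/tracing/saved_cmdlines", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);    /* e.g. "1234 bash" */
        fclose(f);
        return 0;
}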
3708static ssize_t 3847static ssize_t
3709tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, 3848tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
3710 size_t cnt, loff_t *ppos) 3849 size_t cnt, loff_t *ppos)
3711{ 3850{
3712 char *buf_comm; 3851 char buf[64];
3713 char *file_buf; 3852 int r;
3714 char *buf; 3853
3715 int len = 0; 3854 arch_spin_lock(&trace_cmdline_lock);
3716 int pid; 3855 r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
3717 int i; 3856 arch_spin_unlock(&trace_cmdline_lock);
3718 3857
3719 file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); 3858 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3720 if (!file_buf) 3859}
3860
3861static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
3862{
3863 kfree(s->saved_cmdlines);
3864 kfree(s->map_cmdline_to_pid);
3865 kfree(s);
3866}
3867
3868static int tracing_resize_saved_cmdlines(unsigned int val)
3869{
3870 struct saved_cmdlines_buffer *s, *savedcmd_temp;
3871
3872 s = kmalloc(sizeof(*s), GFP_KERNEL);
3873 if (!s)
3721 return -ENOMEM; 3874 return -ENOMEM;
3722 3875
3723 buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); 3876 if (allocate_cmdlines_buffer(val, s) < 0) {
3724 if (!buf_comm) { 3877 kfree(s);
3725 kfree(file_buf);
3726 return -ENOMEM; 3878 return -ENOMEM;
3727 } 3879 }
3728 3880
3729 buf = file_buf; 3881 arch_spin_lock(&trace_cmdline_lock);
3882 savedcmd_temp = savedcmd;
3883 savedcmd = s;
3884 arch_spin_unlock(&trace_cmdline_lock);
3885 free_saved_cmdlines_buffer(savedcmd_temp);
3730 3886
3731 for (i = 0; i < SAVED_CMDLINES; i++) { 3887 return 0;
3732 int r; 3888}
3733 3889
3734 pid = map_cmdline_to_pid[i]; 3890static ssize_t
3735 if (pid == -1 || pid == NO_CMDLINE_MAP) 3891tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf,
3736 continue; 3892 size_t cnt, loff_t *ppos)
3893{
3894 unsigned long val;
3895 int ret;
3737 3896
3738 trace_find_cmdline(pid, buf_comm); 3897 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
3739 r = sprintf(buf, "%d %s\n", pid, buf_comm); 3898 if (ret)
3740 buf += r; 3899 return ret;
3741 len += r;
3742 }
3743 3900
3744 len = simple_read_from_buffer(ubuf, cnt, ppos, 3901 /* must have at least 1 entry and no more than PID_MAX_DEFAULT */
3745 file_buf, len); 3902 if (!val || val > PID_MAX_DEFAULT)
3903 return -EINVAL;
3746 3904
3747 kfree(file_buf); 3905 ret = tracing_resize_saved_cmdlines((unsigned int)val);
3748 kfree(buf_comm); 3906 if (ret < 0)
3907 return ret;
3749 3908
3750 return len; 3909 *ppos += cnt;
3910
3911 return cnt;
3751} 3912}
3752 3913
3753static const struct file_operations tracing_saved_cmdlines_fops = { 3914static const struct file_operations tracing_saved_cmdlines_size_fops = {
3754 .open = tracing_open_generic, 3915 .open = tracing_open_generic,
3755 .read = tracing_saved_cmdlines_read, 3916 .read = tracing_saved_cmdlines_size_read,
3756 .llseek = generic_file_llseek, 3917 .write = tracing_saved_cmdlines_size_write,
3757}; 3918};
3758 3919
3759static ssize_t 3920static ssize_t
@@ -4225,29 +4386,11 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
4225 return trace_poll(iter, filp, poll_table); 4386 return trace_poll(iter, filp, poll_table);
4226} 4387}
4227 4388
4228/*
4229 * This is a make-shift waitqueue.
4230 * A tracer might use this callback on some rare cases:
4231 *
4232 * 1) the current tracer might hold the runqueue lock when it wakes up
4233 * a reader, hence a deadlock (sched, function, and function graph tracers)
4234 * 2) the function tracers, trace all functions, we don't want
4235 * the overhead of calling wake_up and friends
4236 * (and tracing them too)
4237 *
4238 * Anyway, this is really very primitive wakeup.
4239 */
4240void poll_wait_pipe(struct trace_iterator *iter)
4241{
4242 set_current_state(TASK_INTERRUPTIBLE);
4243 /* sleep for 100 msecs, and try again. */
4244 schedule_timeout(HZ / 10);
4245}
4246
4247/* Must be called with trace_types_lock mutex held. */ 4389/* Must be called with trace_types_lock mutex held. */
4248static int tracing_wait_pipe(struct file *filp) 4390static int tracing_wait_pipe(struct file *filp)
4249{ 4391{
4250 struct trace_iterator *iter = filp->private_data; 4392 struct trace_iterator *iter = filp->private_data;
4393 int ret;
4251 4394
4252 while (trace_empty(iter)) { 4395 while (trace_empty(iter)) {
4253 4396
@@ -4255,15 +4398,6 @@ static int tracing_wait_pipe(struct file *filp)
4255 return -EAGAIN; 4398 return -EAGAIN;
4256 } 4399 }
4257 4400
4258 mutex_unlock(&iter->mutex);
4259
4260 iter->trace->wait_pipe(iter);
4261
4262 mutex_lock(&iter->mutex);
4263
4264 if (signal_pending(current))
4265 return -EINTR;
4266
4267 /* 4401 /*
4268 * We block until we read something and tracing is disabled. 4402 * We block until we read something and tracing is disabled.
4269 * We still block if tracing is disabled, but we have never 4403 * We still block if tracing is disabled, but we have never
@@ -4275,6 +4409,18 @@ static int tracing_wait_pipe(struct file *filp)
4275 */ 4409 */
4276 if (!tracing_is_on() && iter->pos) 4410 if (!tracing_is_on() && iter->pos)
4277 break; 4411 break;
4412
4413 mutex_unlock(&iter->mutex);
4414
4415 ret = wait_on_pipe(iter);
4416
4417 mutex_lock(&iter->mutex);
4418
4419 if (ret)
4420 return ret;
4421
4422 if (signal_pending(current))
4423 return -EINTR;
4278 } 4424 }
4279 4425
4280 return 1; 4426 return 1;
@@ -5197,8 +5343,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5197 goto out_unlock; 5343 goto out_unlock;
5198 } 5344 }
5199 mutex_unlock(&trace_types_lock); 5345 mutex_unlock(&trace_types_lock);
5200 iter->trace->wait_pipe(iter); 5346 ret = wait_on_pipe(iter);
5201 mutex_lock(&trace_types_lock); 5347 mutex_lock(&trace_types_lock);
5348 if (ret) {
5349 size = ret;
5350 goto out_unlock;
5351 }
5202 if (signal_pending(current)) { 5352 if (signal_pending(current)) {
5203 size = -EINTR; 5353 size = -EINTR;
5204 goto out_unlock; 5354 goto out_unlock;
@@ -5408,8 +5558,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5408 goto out; 5558 goto out;
5409 } 5559 }
5410 mutex_unlock(&trace_types_lock); 5560 mutex_unlock(&trace_types_lock);
5411 iter->trace->wait_pipe(iter); 5561 ret = wait_on_pipe(iter);
5412 mutex_lock(&trace_types_lock); 5562 mutex_lock(&trace_types_lock);
5563 if (ret)
5564 goto out;
5413 if (signal_pending(current)) { 5565 if (signal_pending(current)) {
5414 ret = -EINTR; 5566 ret = -EINTR;
5415 goto out; 5567 goto out;
@@ -6102,6 +6254,28 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
6102 return 0; 6254 return 0;
6103} 6255}
6104 6256
6257static void free_trace_buffer(struct trace_buffer *buf)
6258{
6259 if (buf->buffer) {
6260 ring_buffer_free(buf->buffer);
6261 buf->buffer = NULL;
6262 free_percpu(buf->data);
6263 buf->data = NULL;
6264 }
6265}
6266
6267static void free_trace_buffers(struct trace_array *tr)
6268{
6269 if (!tr)
6270 return;
6271
6272 free_trace_buffer(&tr->trace_buffer);
6273
6274#ifdef CONFIG_TRACER_MAX_TRACE
6275 free_trace_buffer(&tr->max_buffer);
6276#endif
6277}
6278
6105static int new_instance_create(const char *name) 6279static int new_instance_create(const char *name)
6106{ 6280{
6107 struct trace_array *tr; 6281 struct trace_array *tr;
@@ -6131,6 +6305,8 @@ static int new_instance_create(const char *name)
6131 6305
6132 raw_spin_lock_init(&tr->start_lock); 6306 raw_spin_lock_init(&tr->start_lock);
6133 6307
6308 tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
6309
6134 tr->current_trace = &nop_trace; 6310 tr->current_trace = &nop_trace;
6135 6311
6136 INIT_LIST_HEAD(&tr->systems); 6312 INIT_LIST_HEAD(&tr->systems);
@@ -6158,8 +6334,7 @@ static int new_instance_create(const char *name)
6158 return 0; 6334 return 0;
6159 6335
6160 out_free_tr: 6336 out_free_tr:
6161 if (tr->trace_buffer.buffer) 6337 free_trace_buffers(tr);
6162 ring_buffer_free(tr->trace_buffer.buffer);
6163 free_cpumask_var(tr->tracing_cpumask); 6338 free_cpumask_var(tr->tracing_cpumask);
6164 kfree(tr->name); 6339 kfree(tr->name);
6165 kfree(tr); 6340 kfree(tr);
@@ -6199,8 +6374,7 @@ static int instance_delete(const char *name)
6199 event_trace_del_tracer(tr); 6374 event_trace_del_tracer(tr);
6200 ftrace_destroy_function_files(tr); 6375 ftrace_destroy_function_files(tr);
6201 debugfs_remove_recursive(tr->dir); 6376 debugfs_remove_recursive(tr->dir);
6202 free_percpu(tr->trace_buffer.data); 6377 free_trace_buffers(tr);
6203 ring_buffer_free(tr->trace_buffer.buffer);
6204 6378
6205 kfree(tr->name); 6379 kfree(tr->name);
6206 kfree(tr); 6380 kfree(tr);
@@ -6328,6 +6502,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
6328 trace_create_file("tracing_on", 0644, d_tracer, 6502 trace_create_file("tracing_on", 0644, d_tracer,
6329 tr, &rb_simple_fops); 6503 tr, &rb_simple_fops);
6330 6504
6505#ifdef CONFIG_TRACER_MAX_TRACE
6506 trace_create_file("tracing_max_latency", 0644, d_tracer,
6507 &tr->max_latency, &tracing_max_lat_fops);
6508#endif
6509
6331 if (ftrace_create_function_files(tr, d_tracer)) 6510 if (ftrace_create_function_files(tr, d_tracer))
6332 WARN(1, "Could not allocate function filter files"); 6511 WARN(1, "Could not allocate function filter files");
6333 6512
@@ -6353,11 +6532,6 @@ static __init int tracer_init_debugfs(void)
6353 6532
6354 init_tracer_debugfs(&global_trace, d_tracer); 6533 init_tracer_debugfs(&global_trace, d_tracer);
6355 6534
6356#ifdef CONFIG_TRACER_MAX_TRACE
6357 trace_create_file("tracing_max_latency", 0644, d_tracer,
6358 &tracing_max_latency, &tracing_max_lat_fops);
6359#endif
6360
6361 trace_create_file("tracing_thresh", 0644, d_tracer, 6535 trace_create_file("tracing_thresh", 0644, d_tracer,
6362 &tracing_thresh, &tracing_max_lat_fops); 6536 &tracing_thresh, &tracing_max_lat_fops);
6363 6537
@@ -6367,6 +6541,9 @@ static __init int tracer_init_debugfs(void)
6367 trace_create_file("saved_cmdlines", 0444, d_tracer, 6541 trace_create_file("saved_cmdlines", 0444, d_tracer,
6368 NULL, &tracing_saved_cmdlines_fops); 6542 NULL, &tracing_saved_cmdlines_fops);
6369 6543
6544 trace_create_file("saved_cmdlines_size", 0644, d_tracer,
6545 NULL, &tracing_saved_cmdlines_size_fops);
6546
6370#ifdef CONFIG_DYNAMIC_FTRACE 6547#ifdef CONFIG_DYNAMIC_FTRACE
6371 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 6548 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
6372 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 6549 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -6603,18 +6780,19 @@ __init static int tracer_alloc_buffers(void)
6603 if (!temp_buffer) 6780 if (!temp_buffer)
6604 goto out_free_cpumask; 6781 goto out_free_cpumask;
6605 6782
6783 if (trace_create_savedcmd() < 0)
6784 goto out_free_temp_buffer;
6785
6606 /* TODO: make the number of buffers hot pluggable with CPUS */ 6786 /* TODO: make the number of buffers hot pluggable with CPUS */
6607 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { 6787 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
6608 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 6788 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
6609 WARN_ON(1); 6789 WARN_ON(1);
6610 goto out_free_temp_buffer; 6790 goto out_free_savedcmd;
6611 } 6791 }
6612 6792
6613 if (global_trace.buffer_disabled) 6793 if (global_trace.buffer_disabled)
6614 tracing_off(); 6794 tracing_off();
6615 6795
6616 trace_init_cmdlines();
6617
6618 if (trace_boot_clock) { 6796 if (trace_boot_clock) {
6619 ret = tracing_set_clock(&global_trace, trace_boot_clock); 6797 ret = tracing_set_clock(&global_trace, trace_boot_clock);
6620 if (ret < 0) 6798 if (ret < 0)
@@ -6629,6 +6807,10 @@ __init static int tracer_alloc_buffers(void)
6629 */ 6807 */
6630 global_trace.current_trace = &nop_trace; 6808 global_trace.current_trace = &nop_trace;
6631 6809
6810 global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
6811
6812 ftrace_init_global_array_ops(&global_trace);
6813
6632 register_tracer(&nop_trace); 6814 register_tracer(&nop_trace);
6633 6815
6634 /* All seems OK, enable tracing */ 6816 /* All seems OK, enable tracing */
@@ -6656,13 +6838,11 @@ __init static int tracer_alloc_buffers(void)
6656 6838
6657 return 0; 6839 return 0;
6658 6840
6841out_free_savedcmd:
6842 free_saved_cmdlines_buffer(savedcmd);
6659out_free_temp_buffer: 6843out_free_temp_buffer:
6660 ring_buffer_free(temp_buffer); 6844 ring_buffer_free(temp_buffer);
6661out_free_cpumask: 6845out_free_cpumask:
6662 free_percpu(global_trace.trace_buffer.data);
6663#ifdef CONFIG_TRACER_MAX_TRACE
6664 free_percpu(global_trace.max_buffer.data);
6665#endif
6666 free_cpumask_var(global_trace.tracing_cpumask); 6846 free_cpumask_var(global_trace.tracing_cpumask);
6667out_free_buffer_mask: 6847out_free_buffer_mask:
6668 free_cpumask_var(tracing_buffer_mask); 6848 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2e29d7ba5a52..9258f5a815db 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -190,7 +190,22 @@ struct trace_array {
190 */ 190 */
191 struct trace_buffer max_buffer; 191 struct trace_buffer max_buffer;
192 bool allocated_snapshot; 192 bool allocated_snapshot;
193 unsigned long max_latency;
193#endif 194#endif
195 /*
196 * max_lock is used to protect the swapping of buffers
197 * when taking a max snapshot. The buffers themselves are
198 * protected by per_cpu spinlocks. But the action of the swap
199 * needs its own lock.
200 *
201 * This is defined as a arch_spinlock_t in order to help
202 * with performance when lockdep debugging is enabled.
203 *
204 * It is also used in other places outside the update_max_tr
205 * so it needs to be defined outside of the
206 * CONFIG_TRACER_MAX_TRACE.
207 */
208 arch_spinlock_t max_lock;
194 int buffer_disabled; 209 int buffer_disabled;
195#ifdef CONFIG_FTRACE_SYSCALLS 210#ifdef CONFIG_FTRACE_SYSCALLS
196 int sys_refcount_enter; 211 int sys_refcount_enter;
@@ -237,6 +252,9 @@ static inline struct trace_array *top_trace_array(void)
237{ 252{
238 struct trace_array *tr; 253 struct trace_array *tr;
239 254
255 if (list_empty(&ftrace_trace_arrays))
256 return NULL;
257
240 tr = list_entry(ftrace_trace_arrays.prev, 258 tr = list_entry(ftrace_trace_arrays.prev,
241 typeof(*tr), list); 259 typeof(*tr), list);
242 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); 260 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
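Callers of top_trace_array() must now tolerate a NULL return when no trace arrays exist yet. A hedged caller sketch (my_event_init() is an illustrative name):

static int my_event_init(void)
{
        struct trace_array *tr = top_trace_array();

        if (!tr)        /* list still empty during early boot or teardown */
                return -ENODEV;

        return 0;
}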
@@ -323,7 +341,6 @@ struct tracer_flags {
323 * @stop: called when tracing is paused (echo 0 > tracing_enabled) 341 * @stop: called when tracing is paused (echo 0 > tracing_enabled)
324 * @open: called when the trace file is opened 342 * @open: called when the trace file is opened
325 * @pipe_open: called when the trace_pipe file is opened 343 * @pipe_open: called when the trace_pipe file is opened
326 * @wait_pipe: override how the user waits for traces on trace_pipe
327 * @close: called when the trace file is released 344 * @close: called when the trace file is released
328 * @pipe_close: called when the trace_pipe file is released 345 * @pipe_close: called when the trace_pipe file is released
329 * @read: override the default read callback on trace_pipe 346 * @read: override the default read callback on trace_pipe
@@ -342,7 +359,6 @@ struct tracer {
342 void (*stop)(struct trace_array *tr); 359 void (*stop)(struct trace_array *tr);
343 void (*open)(struct trace_iterator *iter); 360 void (*open)(struct trace_iterator *iter);
344 void (*pipe_open)(struct trace_iterator *iter); 361 void (*pipe_open)(struct trace_iterator *iter);
345 void (*wait_pipe)(struct trace_iterator *iter);
346 void (*close)(struct trace_iterator *iter); 362 void (*close)(struct trace_iterator *iter);
347 void (*pipe_close)(struct trace_iterator *iter); 363 void (*pipe_close)(struct trace_iterator *iter);
348 ssize_t (*read)(struct trace_iterator *iter, 364 ssize_t (*read)(struct trace_iterator *iter,
@@ -416,13 +432,7 @@ enum {
416 TRACE_FTRACE_IRQ_BIT, 432 TRACE_FTRACE_IRQ_BIT,
417 TRACE_FTRACE_SIRQ_BIT, 433 TRACE_FTRACE_SIRQ_BIT,
418 434
419 /* GLOBAL_BITs must be greater than FTRACE_BITs */ 435 /* INTERNAL_BITs must be greater than FTRACE_BITs */
420 TRACE_GLOBAL_BIT,
421 TRACE_GLOBAL_NMI_BIT,
422 TRACE_GLOBAL_IRQ_BIT,
423 TRACE_GLOBAL_SIRQ_BIT,
424
425 /* INTERNAL_BITs must be greater than GLOBAL_BITs */
426 TRACE_INTERNAL_BIT, 436 TRACE_INTERNAL_BIT,
427 TRACE_INTERNAL_NMI_BIT, 437 TRACE_INTERNAL_NMI_BIT,
428 TRACE_INTERNAL_IRQ_BIT, 438 TRACE_INTERNAL_IRQ_BIT,
@@ -449,9 +459,6 @@ enum {
449#define TRACE_FTRACE_START TRACE_FTRACE_BIT 459#define TRACE_FTRACE_START TRACE_FTRACE_BIT
450#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) 460#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
451 461
452#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
453#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
454
455#define TRACE_LIST_START TRACE_INTERNAL_BIT 462#define TRACE_LIST_START TRACE_INTERNAL_BIT
456#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) 463#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
457 464
@@ -560,8 +567,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
560 567
561void tracing_iter_reset(struct trace_iterator *iter, int cpu); 568void tracing_iter_reset(struct trace_iterator *iter, int cpu);
562 569
563void poll_wait_pipe(struct trace_iterator *iter);
564
565void tracing_sched_switch_trace(struct trace_array *tr, 570void tracing_sched_switch_trace(struct trace_array *tr,
566 struct task_struct *prev, 571 struct task_struct *prev,
567 struct task_struct *next, 572 struct task_struct *next,
@@ -608,8 +613,6 @@ extern unsigned long nsecs_to_usecs(unsigned long nsecs);
608extern unsigned long tracing_thresh; 613extern unsigned long tracing_thresh;
609 614
610#ifdef CONFIG_TRACER_MAX_TRACE 615#ifdef CONFIG_TRACER_MAX_TRACE
611extern unsigned long tracing_max_latency;
612
613void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 616void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
614void update_max_tr_single(struct trace_array *tr, 617void update_max_tr_single(struct trace_array *tr,
615 struct task_struct *tsk, int cpu); 618 struct task_struct *tsk, int cpu);
@@ -724,6 +727,8 @@ extern unsigned long trace_flags;
724#define TRACE_GRAPH_PRINT_PROC 0x8 727#define TRACE_GRAPH_PRINT_PROC 0x8
725#define TRACE_GRAPH_PRINT_DURATION 0x10 728#define TRACE_GRAPH_PRINT_DURATION 0x10
726#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 729#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
730#define TRACE_GRAPH_PRINT_IRQS 0x40
731#define TRACE_GRAPH_PRINT_TAIL 0x80
727#define TRACE_GRAPH_PRINT_FILL_SHIFT 28 732#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
728#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) 733#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
729 734
@@ -823,6 +828,10 @@ extern int ftrace_is_dead(void);
823int ftrace_create_function_files(struct trace_array *tr, 828int ftrace_create_function_files(struct trace_array *tr,
824 struct dentry *parent); 829 struct dentry *parent);
825void ftrace_destroy_function_files(struct trace_array *tr); 830void ftrace_destroy_function_files(struct trace_array *tr);
831void ftrace_init_global_array_ops(struct trace_array *tr);
832void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
833void ftrace_reset_array_ops(struct trace_array *tr);
834int using_ftrace_ops_list_func(void);
826#else 835#else
827static inline int ftrace_trace_task(struct task_struct *task) 836static inline int ftrace_trace_task(struct task_struct *task)
828{ 837{
@@ -836,6 +845,11 @@ ftrace_create_function_files(struct trace_array *tr,
836 return 0; 845 return 0;
837} 846}
838static inline void ftrace_destroy_function_files(struct trace_array *tr) { } 847static inline void ftrace_destroy_function_files(struct trace_array *tr) { }
848static inline __init void
849ftrace_init_global_array_ops(struct trace_array *tr) { }
850static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
851/* ftrace_func_t type is not defined, use macro instead of static inline */
852#define ftrace_init_array_ops(tr, func) do { } while (0)
839#endif /* CONFIG_FUNCTION_TRACER */ 853#endif /* CONFIG_FUNCTION_TRACER */
840 854
841#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) 855#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
new file mode 100644
index 000000000000..40a14cbcf8e0
--- /dev/null
+++ b/kernel/trace/trace_benchmark.c
@@ -0,0 +1,198 @@
1#include <linux/delay.h>
2#include <linux/module.h>
3#include <linux/kthread.h>
4#include <linux/trace_clock.h>
5
6#define CREATE_TRACE_POINTS
7#include "trace_benchmark.h"
8
9static struct task_struct *bm_event_thread;
10
11static char bm_str[BENCHMARK_EVENT_STRLEN] = "START";
12
13static u64 bm_total;
14static u64 bm_totalsq;
15static u64 bm_last;
16static u64 bm_max;
17static u64 bm_min;
18static u64 bm_first;
19static u64 bm_cnt;
20static u64 bm_stddev;
21static unsigned int bm_avg;
22static unsigned int bm_std;
23
24/*
25 * This gets called in a loop recording the time it took to write
26 * the tracepoint. What it writes is the time statistics of the last
27 * tracepoint write. As there is nothing to write the first time
28 * it simply writes "START". As the first write is cold cache and
29 * the rest is hot, we save off that time in bm_first and it is
30 * reported as "first", which is shown in the second write to the
31 * tracepoint. The "first" field is writen within the statics from
32 * then on but never changes.
33 */
34static void trace_do_benchmark(void)
35{
36 u64 start;
37 u64 stop;
38 u64 delta;
39 u64 stddev;
40 u64 seed;
41 u64 last_seed;
42 unsigned int avg;
43 unsigned int std = 0;
44
45 /* Only run if the tracepoint is actually active */
46 if (!trace_benchmark_event_enabled())
47 return;
48
49 local_irq_disable();
50 start = trace_clock_local();
51 trace_benchmark_event(bm_str);
52 stop = trace_clock_local();
53 local_irq_enable();
54
55 bm_cnt++;
56
57 delta = stop - start;
58
59 /*
60 * The first read is cold cached, keep it separate from the
61 * other calculations.
62 */
63 if (bm_cnt == 1) {
64 bm_first = delta;
65 scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
66 "first=%llu [COLD CACHED]", bm_first);
67 return;
68 }
69
70 bm_last = delta;
71
72 if (delta > bm_max)
73 bm_max = delta;
74 if (!bm_min || delta < bm_min)
75 bm_min = delta;
76
77 /*
78 * When bm_cnt is greater than UINT_MAX, it breaks the statistics
79 * accounting. Freeze the statistics when that happens.
80 * We should have enough data for the avg and stddev anyway.
81 */
82 if (bm_cnt > UINT_MAX) {
83 scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
84 "last=%llu first=%llu max=%llu min=%llu ** avg=%u std=%d std^2=%lld",
85 bm_last, bm_first, bm_max, bm_min, bm_avg, bm_std, bm_stddev);
86 return;
87 }
88
89 bm_total += delta;
90 bm_totalsq += delta * delta;
91
92
93 if (bm_cnt > 1) {
94 /*
95 * Apply Welford's method to calculate standard deviation:
96 * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
97 */
98 stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total;
99 do_div(stddev, (u32)bm_cnt);
100 do_div(stddev, (u32)bm_cnt - 1);
101 } else
102 stddev = 0;
103
104 delta = bm_total;
105 do_div(delta, bm_cnt);
106 avg = delta;
107
108 if (stddev > 0) {
109 int i = 0;
110 /*
111 * stddev is the square of standard deviation but
112 * we want the actual number. Use the average
113 * as our seed to find the std.
114 *
115 * The next try is:
116 * x = (x + N/x) / 2
117 *
118 * Where N is the squared number to find the square
119 * root of.
120 */
121 seed = avg;
122 do {
123 last_seed = seed;
124 seed = stddev;
125 if (!last_seed)
126 break;
127 do_div(seed, last_seed);
128 seed += last_seed;
129 do_div(seed, 2);
130 } while (i++ < 10 && last_seed != seed);
131
132 std = seed;
133 }
134
135 scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
136 "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld",
137 bm_last, bm_first, bm_max, bm_min, avg, std, stddev);
138
139 bm_std = std;
140 bm_avg = avg;
141 bm_stddev = stddev;
142}
143
144static int benchmark_event_kthread(void *arg)
145{
146 /* sleep a bit to make sure the tracepoint gets activated */
147 msleep(100);
148
149 while (!kthread_should_stop()) {
150
151 trace_do_benchmark();
152
153 /*
154 * We don't go to sleep, but let others
155 * run as well.
156 */
157 cond_resched();
158 }
159
160 return 0;
161}
162
163/*
164 * When the benchmark tracepoint is enabled, it calls this
165 * function and the thread that calls the tracepoint is created.
166 */
167void trace_benchmark_reg(void)
168{
169 bm_event_thread = kthread_run(benchmark_event_kthread,
170 NULL, "event_benchmark");
171 WARN_ON(!bm_event_thread);
172}
173
174/*
175 * When the benchmark tracepoint is disabled, it calls this
176 * function and the thread that calls the tracepoint is deleted
177 * and all the numbers are reset.
178 */
179void trace_benchmark_unreg(void)
180{
181 if (!bm_event_thread)
182 return;
183
184 kthread_stop(bm_event_thread);
185
186 strcpy(bm_str, "START");
187 bm_total = 0;
188 bm_totalsq = 0;
189 bm_last = 0;
190 bm_max = 0;
191 bm_min = 0;
192 bm_cnt = 0;
193 /* These don't need to be reset but reset them anyway */
194 bm_first = 0;
195 bm_std = 0;
196 bm_avg = 0;
197 bm_stddev = 0;
198}
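
The arithmetic in trace_do_benchmark() above boils down to two pieces: the sample variance computed from the running sums (bm_total, bm_totalsq) and an integer square root refined by the averaging iteration x = (x + N/x) / 2. A minimal user-space sketch of the same math, with the kernel's do_div() calls replaced by plain division and hypothetical delta values standing in for measured tracepoint writes:

#include <stdio.h>
#include <stdint.h>

/* Sample variance from running sums: s^2 = (n*sum(x^2) - (sum x)^2) / (n*(n-1)) */
static uint64_t variance(uint64_t n, uint64_t total, uint64_t totalsq)
{
	if (n < 2)
		return 0;
	return (n * totalsq - total * total) / (n * (n - 1));
}

/* Integer square root via the averaging iteration x = (x + N/x) / 2,
 * seeded with the average just as trace_do_benchmark() does. */
static uint64_t isqrt(uint64_t N, uint64_t seed)
{
	uint64_t last;
	int i = 0;

	if (!N || !seed)
		return 0;
	do {
		last = seed;
		seed = (N / last + last) / 2;
	} while (i++ < 10 && last != seed);

	return seed;
}

int main(void)
{
	/* Hypothetical per-write deltas in ns (illustrative only) */
	uint64_t deltas[] = { 120, 130, 125, 140, 128 };
	uint64_t n = 0, total = 0, totalsq = 0;
	unsigned int i;

	for (i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++) {
		n++;
		total += deltas[i];
		totalsq += deltas[i] * deltas[i];
	}

	uint64_t var = variance(n, total, totalsq);
	uint64_t avg = total / n;

	printf("avg=%llu std^2=%llu std=%llu\n",
	       (unsigned long long)avg, (unsigned long long)var,
	       (unsigned long long)isqrt(var, avg));
	return 0;
}

For the sample deltas this prints avg=128 std^2=54 std=7, the same quantities the scnprintf() format string in the kernel code reports.
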
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
new file mode 100644
index 000000000000..3c1df1df4e29
--- /dev/null
+++ b/kernel/trace/trace_benchmark.h
@@ -0,0 +1,41 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM benchmark
3
4#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_BENCHMARK_H
6
7#include <linux/tracepoint.h>
8
9extern void trace_benchmark_reg(void);
10extern void trace_benchmark_unreg(void);
11
12#define BENCHMARK_EVENT_STRLEN 128
13
14TRACE_EVENT_FN(benchmark_event,
15
16 TP_PROTO(const char *str),
17
18 TP_ARGS(str),
19
20 TP_STRUCT__entry(
21 __array( char, str, BENCHMARK_EVENT_STRLEN )
22 ),
23
24 TP_fast_assign(
25 memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
26 ),
27
28 TP_printk("%s", __entry->str),
29
30 trace_benchmark_reg, trace_benchmark_unreg
31);
32
33#endif /* _TRACE_BENCHMARK_H */
34
35#undef TRACE_INCLUDE_FILE
36#undef TRACE_INCLUDE_PATH
37#define TRACE_INCLUDE_PATH .
38#define TRACE_INCLUDE_FILE trace_benchmark
39
40/* This part must be outside protection */
41#include <trace/define_trace.h>
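
Because the event is declared with TRACE_EVENT_FN, the reg/unreg callbacks fire when the event is switched on or off from user space, which is what starts and stops the benchmark kthread. A hedged user-space sketch of toggling it through tracefs; the path assumes the usual debugfs mount at /sys/kernel/debug and is not part of the patch itself:

#include <stdio.h>

/* Enable (1) or disable (0) the benchmark_event tracepoint. Writing 1 ends
 * up in trace_benchmark_reg() and spawns the event_benchmark kthread;
 * writing 0 ends up in trace_benchmark_unreg() and stops it. */
static int set_benchmark_event(int on)
{
	const char *path =
		"/sys/kernel/debug/tracing/events/benchmark/benchmark_event/enable";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return -1;
	}
	fprintf(f, "%d\n", on);
	return fclose(f);
}

int main(void)
{
	return set_benchmark_event(1) ? 1 : 0;
}
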
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index c894614de14d..5d12bb407b44 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -248,8 +248,8 @@ void perf_trace_del(struct perf_event *p_event, int flags)
248 tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); 248 tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
249} 249}
250 250
251__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 251void *perf_trace_buf_prepare(int size, unsigned short type,
252 struct pt_regs *regs, int *rctxp) 252 struct pt_regs *regs, int *rctxp)
253{ 253{
254 struct trace_entry *entry; 254 struct trace_entry *entry;
255 unsigned long flags; 255 unsigned long flags;
@@ -281,6 +281,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
281 return raw_data; 281 return raw_data;
282} 282}
283EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 283EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
284NOKPROBE_SYMBOL(perf_trace_buf_prepare);
284 285
285#ifdef CONFIG_FUNCTION_TRACER 286#ifdef CONFIG_FUNCTION_TRACER
286static void 287static void
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3ddfd8f62c05..2de53628689f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -470,6 +470,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file)
470 470
471 list_del(&file->list); 471 list_del(&file->list);
472 remove_subsystem(file->system); 472 remove_subsystem(file->system);
473 free_event_filter(file->filter);
473 kmem_cache_free(file_cachep, file); 474 kmem_cache_free(file_cachep, file);
474} 475}
475 476
@@ -574,6 +575,9 @@ int trace_set_clr_event(const char *system, const char *event, int set)
574{ 575{
575 struct trace_array *tr = top_trace_array(); 576 struct trace_array *tr = top_trace_array();
576 577
578 if (!tr)
579 return -ENODEV;
580
577 return __ftrace_set_clr_event(tr, NULL, system, event, set); 581 return __ftrace_set_clr_event(tr, NULL, system, event, set);
578} 582}
579EXPORT_SYMBOL_GPL(trace_set_clr_event); 583EXPORT_SYMBOL_GPL(trace_set_clr_event);
@@ -2065,6 +2069,9 @@ event_enable_func(struct ftrace_hash *hash,
2065 bool enable; 2069 bool enable;
2066 int ret; 2070 int ret;
2067 2071
2072 if (!tr)
2073 return -ENODEV;
2074
2068 /* hash funcs only work with set_ftrace_filter */ 2075 /* hash funcs only work with set_ftrace_filter */
2069 if (!enabled || !param) 2076 if (!enabled || !param)
2070 return -EINVAL; 2077 return -EINVAL;
@@ -2396,6 +2403,9 @@ static __init int event_trace_enable(void)
2396 char *token; 2403 char *token;
2397 int ret; 2404 int ret;
2398 2405
2406 if (!tr)
2407 return -ENODEV;
2408
2399 for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { 2409 for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
2400 2410
2401 call = *iter; 2411 call = *iter;
@@ -2442,6 +2452,8 @@ static __init int event_trace_init(void)
2442 int ret; 2452 int ret;
2443 2453
2444 tr = top_trace_array(); 2454 tr = top_trace_array();
2455 if (!tr)
2456 return -ENODEV;
2445 2457
2446 d_tracer = tracing_init_dentry(); 2458 d_tracer = tracing_init_dentry();
2447 if (!d_tracer) 2459 if (!d_tracer)
@@ -2535,6 +2547,8 @@ static __init void event_trace_self_tests(void)
2535 int ret; 2547 int ret;
2536 2548
2537 tr = top_trace_array(); 2549 tr = top_trace_array();
2550 if (!tr)
2551 return;
2538 2552
2539 pr_info("Running tests on trace events:\n"); 2553 pr_info("Running tests on trace events:\n");
2540 2554
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index ffd56351b521..57f0ec962d2c 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -26,8 +26,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
26static void 26static void
27function_stack_trace_call(unsigned long ip, unsigned long parent_ip, 27function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
28 struct ftrace_ops *op, struct pt_regs *pt_regs); 28 struct ftrace_ops *op, struct pt_regs *pt_regs);
29static struct ftrace_ops trace_ops;
30static struct ftrace_ops trace_stack_ops;
31static struct tracer_flags func_flags; 29static struct tracer_flags func_flags;
32 30
33/* Our option */ 31/* Our option */
@@ -83,28 +81,24 @@ void ftrace_destroy_function_files(struct trace_array *tr)
83 81
84static int function_trace_init(struct trace_array *tr) 82static int function_trace_init(struct trace_array *tr)
85{ 83{
86 struct ftrace_ops *ops; 84 ftrace_func_t func;
87
88 if (tr->flags & TRACE_ARRAY_FL_GLOBAL) {
89 /* There's only one global tr */
90 if (!trace_ops.private) {
91 trace_ops.private = tr;
92 trace_stack_ops.private = tr;
93 }
94 85
95 if (func_flags.val & TRACE_FUNC_OPT_STACK) 86 /*
96 ops = &trace_stack_ops; 87 * Instance trace_arrays get their ops allocated
97 else 88 * at instance creation. Unless it failed
98 ops = &trace_ops; 89 * the allocation.
99 tr->ops = ops; 90 */
100 } else if (!tr->ops) { 91 if (!tr->ops)
101 /*
102 * Instance trace_arrays get their ops allocated
103 * at instance creation. Unless it failed
104 * the allocation.
105 */
106 return -ENOMEM; 92 return -ENOMEM;
107 } 93
94 /* Currently only the global instance can do stack tracing */
95 if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
96 func_flags.val & TRACE_FUNC_OPT_STACK)
97 func = function_stack_trace_call;
98 else
99 func = function_trace_call;
100
101 ftrace_init_array_ops(tr, func);
108 102
109 tr->trace_buffer.cpu = get_cpu(); 103 tr->trace_buffer.cpu = get_cpu();
110 put_cpu(); 104 put_cpu();
@@ -118,6 +112,7 @@ static void function_trace_reset(struct trace_array *tr)
118{ 112{
119 tracing_stop_function_trace(tr); 113 tracing_stop_function_trace(tr);
120 tracing_stop_cmdline_record(); 114 tracing_stop_cmdline_record();
115 ftrace_reset_array_ops(tr);
121} 116}
122 117
123static void function_trace_start(struct trace_array *tr) 118static void function_trace_start(struct trace_array *tr)
@@ -199,18 +194,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
199 local_irq_restore(flags); 194 local_irq_restore(flags);
200} 195}
201 196
202static struct ftrace_ops trace_ops __read_mostly =
203{
204 .func = function_trace_call,
205 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
206};
207
208static struct ftrace_ops trace_stack_ops __read_mostly =
209{
210 .func = function_stack_trace_call,
211 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
212};
213
214static struct tracer_opt func_opts[] = { 197static struct tracer_opt func_opts[] = {
215#ifdef CONFIG_STACKTRACE 198#ifdef CONFIG_STACKTRACE
216 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, 199 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
@@ -248,10 +231,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
248 unregister_ftrace_function(tr->ops); 231 unregister_ftrace_function(tr->ops);
249 232
250 if (set) { 233 if (set) {
251 tr->ops = &trace_stack_ops; 234 tr->ops->func = function_stack_trace_call;
252 register_ftrace_function(tr->ops); 235 register_ftrace_function(tr->ops);
253 } else { 236 } else {
254 tr->ops = &trace_ops; 237 tr->ops->func = function_trace_call;
255 register_ftrace_function(tr->ops); 238 register_ftrace_function(tr->ops);
256 } 239 }
257 240
@@ -269,7 +252,6 @@ static struct tracer function_trace __tracer_data =
269 .init = function_trace_init, 252 .init = function_trace_init,
270 .reset = function_trace_reset, 253 .reset = function_trace_reset,
271 .start = function_trace_start, 254 .start = function_trace_start,
272 .wait_pipe = poll_wait_pipe,
273 .flags = &func_flags, 255 .flags = &func_flags,
274 .set_flag = func_set_flag, 256 .set_flag = func_set_flag,
275 .allow_instances = true, 257 .allow_instances = true,
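
The trace_functions.c conversion shows the intended pattern for per-instance tracers: instead of a file-local static ftrace_ops carrying FTRACE_OPS_FL_GLOBAL, the tracer installs its callback on the instance's own tr->ops with ftrace_init_array_ops() in ->init and tears it down with ftrace_reset_array_ops() in ->reset. A hedged sketch of that shape for a hypothetical tracer (callback and function names are illustrative; it also assumes tr->ops->private points back at the trace_array, which is how the function tracer's callback finds its instance):

/* Sketch only -- not compilable outside the tracing core. */
static void my_trace_call(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *op, struct pt_regs *pt_regs)
{
	struct trace_array *tr = op->private;	/* assumed back pointer to the instance */

	(void)tr;	/* a real tracer would log ip/parent_ip into tr's buffer */
}

static int my_tracer_init(struct trace_array *tr)
{
	if (!tr->ops)				/* instance allocation failed */
		return -ENOMEM;

	ftrace_init_array_ops(tr, my_trace_call);
	return register_ftrace_function(tr->ops);
}

static void my_tracer_reset(struct trace_array *tr)
{
	unregister_ftrace_function(tr->ops);
	ftrace_reset_array_ops(tr);
}
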
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index deff11200261..4de3e57f723c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -38,15 +38,6 @@ struct fgraph_data {
38 38
39#define TRACE_GRAPH_INDENT 2 39#define TRACE_GRAPH_INDENT 2
40 40
41/* Flag options */
42#define TRACE_GRAPH_PRINT_OVERRUN 0x1
43#define TRACE_GRAPH_PRINT_CPU 0x2
44#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
45#define TRACE_GRAPH_PRINT_PROC 0x8
46#define TRACE_GRAPH_PRINT_DURATION 0x10
47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40
49
50static unsigned int max_depth; 41static unsigned int max_depth;
51 42
52static struct tracer_opt trace_opts[] = { 43static struct tracer_opt trace_opts[] = {
@@ -64,11 +55,13 @@ static struct tracer_opt trace_opts[] = {
64 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, 55 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
65 /* Display interrupts */ 56 /* Display interrupts */
66 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, 57 { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
58 /* Display function name after trailing } */
59 { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
67 { } /* Empty entry */ 60 { } /* Empty entry */
68}; 61};
69 62
70static struct tracer_flags tracer_flags = { 63static struct tracer_flags tracer_flags = {
71 /* Don't display overruns and proc by default */ 64 /* Don't display overruns, proc, or tail by default */
72 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | 65 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
73 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, 66 TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
74 .opts = trace_opts 67 .opts = trace_opts
@@ -1176,9 +1169,10 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1176 * If the return function does not have a matching entry, 1169 * If the return function does not have a matching entry,
1177 * then the entry was lost. Instead of just printing 1170 * then the entry was lost. Instead of just printing
1178 * the '}' and letting the user guess what function this 1171 * the '}' and letting the user guess what function this
1179 * belongs to, write out the function name. 1172 * belongs to, write out the function name. Always do
1173 * that if the funcgraph-tail option is enabled.
1180 */ 1174 */
1181 if (func_match) { 1175 if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) {
1182 ret = trace_seq_puts(s, "}\n"); 1176 ret = trace_seq_puts(s, "}\n");
1183 if (!ret) 1177 if (!ret)
1184 return TRACE_TYPE_PARTIAL_LINE; 1178 return TRACE_TYPE_PARTIAL_LINE;
@@ -1505,7 +1499,6 @@ static struct tracer graph_trace __tracer_data = {
1505 .pipe_open = graph_trace_open, 1499 .pipe_open = graph_trace_open,
1506 .close = graph_trace_close, 1500 .close = graph_trace_close,
1507 .pipe_close = graph_trace_close, 1501 .pipe_close = graph_trace_close,
1508 .wait_pipe = poll_wait_pipe,
1509 .init = graph_trace_init, 1502 .init = graph_trace_init,
1510 .reset = graph_trace_reset, 1503 .reset = graph_trace_reset,
1511 .print_line = print_graph_function, 1504 .print_line = print_graph_function,
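
The new funcgraph-tail option is toggled through the tracer's options while function_graph is the current tracer (echo funcgraph-tail into trace_options to enable, nofuncgraph-tail to disable). With it off, a nested return prints a bare closing brace; with it on, the brace is annotated with the function name, roughly as below (timings and function name are illustrative and the format is approximated, not taken from the patch):

 1)   2.074 us    |  }
 1)   2.074 us    |  } /* kmem_cache_free */
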
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 8ff02cbb892f..9bb104f748d0 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -151,12 +151,6 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
151 151
152 atomic_dec(&data->disabled); 152 atomic_dec(&data->disabled);
153} 153}
154
155static struct ftrace_ops trace_ops __read_mostly =
156{
157 .func = irqsoff_tracer_call,
158 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
159};
160#endif /* CONFIG_FUNCTION_TRACER */ 154#endif /* CONFIG_FUNCTION_TRACER */
161 155
162#ifdef CONFIG_FUNCTION_GRAPH_TRACER 156#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -176,7 +170,7 @@ irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
176 for_each_possible_cpu(cpu) 170 for_each_possible_cpu(cpu)
177 per_cpu(tracing_cpu, cpu) = 0; 171 per_cpu(tracing_cpu, cpu) = 0;
178 172
179 tracing_max_latency = 0; 173 tr->max_latency = 0;
180 tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); 174 tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
181 175
182 return start_irqsoff_tracer(irqsoff_trace, set); 176 return start_irqsoff_tracer(irqsoff_trace, set);
@@ -303,13 +297,13 @@ static void irqsoff_print_header(struct seq_file *s)
303/* 297/*
304 * Should this new latency be reported/recorded? 298 * Should this new latency be reported/recorded?
305 */ 299 */
306static int report_latency(cycle_t delta) 300static int report_latency(struct trace_array *tr, cycle_t delta)
307{ 301{
308 if (tracing_thresh) { 302 if (tracing_thresh) {
309 if (delta < tracing_thresh) 303 if (delta < tracing_thresh)
310 return 0; 304 return 0;
311 } else { 305 } else {
312 if (delta <= tracing_max_latency) 306 if (delta <= tr->max_latency)
313 return 0; 307 return 0;
314 } 308 }
315 return 1; 309 return 1;
@@ -333,13 +327,13 @@ check_critical_timing(struct trace_array *tr,
333 327
334 pc = preempt_count(); 328 pc = preempt_count();
335 329
336 if (!report_latency(delta)) 330 if (!report_latency(tr, delta))
337 goto out; 331 goto out;
338 332
339 raw_spin_lock_irqsave(&max_trace_lock, flags); 333 raw_spin_lock_irqsave(&max_trace_lock, flags);
340 334
341 /* check if we are still the max latency */ 335 /* check if we are still the max latency */
342 if (!report_latency(delta)) 336 if (!report_latency(tr, delta))
343 goto out_unlock; 337 goto out_unlock;
344 338
345 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 339 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
@@ -352,7 +346,7 @@ check_critical_timing(struct trace_array *tr,
352 data->critical_end = parent_ip; 346 data->critical_end = parent_ip;
353 347
354 if (likely(!is_tracing_stopped())) { 348 if (likely(!is_tracing_stopped())) {
355 tracing_max_latency = delta; 349 tr->max_latency = delta;
356 update_max_tr_single(tr, current, cpu); 350 update_max_tr_single(tr, current, cpu);
357 } 351 }
358 352
@@ -531,7 +525,7 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
531} 525}
532#endif /* CONFIG_PREEMPT_TRACER */ 526#endif /* CONFIG_PREEMPT_TRACER */
533 527
534static int register_irqsoff_function(int graph, int set) 528static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
535{ 529{
536 int ret; 530 int ret;
537 531
@@ -543,7 +537,7 @@ static int register_irqsoff_function(int graph, int set)
543 ret = register_ftrace_graph(&irqsoff_graph_return, 537 ret = register_ftrace_graph(&irqsoff_graph_return,
544 &irqsoff_graph_entry); 538 &irqsoff_graph_entry);
545 else 539 else
546 ret = register_ftrace_function(&trace_ops); 540 ret = register_ftrace_function(tr->ops);
547 541
548 if (!ret) 542 if (!ret)
549 function_enabled = true; 543 function_enabled = true;
@@ -551,7 +545,7 @@ static int register_irqsoff_function(int graph, int set)
551 return ret; 545 return ret;
552} 546}
553 547
554static void unregister_irqsoff_function(int graph) 548static void unregister_irqsoff_function(struct trace_array *tr, int graph)
555{ 549{
556 if (!function_enabled) 550 if (!function_enabled)
557 return; 551 return;
@@ -559,17 +553,17 @@ static void unregister_irqsoff_function(int graph)
559 if (graph) 553 if (graph)
560 unregister_ftrace_graph(); 554 unregister_ftrace_graph();
561 else 555 else
562 unregister_ftrace_function(&trace_ops); 556 unregister_ftrace_function(tr->ops);
563 557
564 function_enabled = false; 558 function_enabled = false;
565} 559}
566 560
567static void irqsoff_function_set(int set) 561static void irqsoff_function_set(struct trace_array *tr, int set)
568{ 562{
569 if (set) 563 if (set)
570 register_irqsoff_function(is_graph(), 1); 564 register_irqsoff_function(tr, is_graph(), 1);
571 else 565 else
572 unregister_irqsoff_function(is_graph()); 566 unregister_irqsoff_function(tr, is_graph());
573} 567}
574 568
575static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) 569static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -577,7 +571,7 @@ static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
577 struct tracer *tracer = tr->current_trace; 571 struct tracer *tracer = tr->current_trace;
578 572
579 if (mask & TRACE_ITER_FUNCTION) 573 if (mask & TRACE_ITER_FUNCTION)
580 irqsoff_function_set(set); 574 irqsoff_function_set(tr, set);
581 575
582 return trace_keep_overwrite(tracer, mask, set); 576 return trace_keep_overwrite(tracer, mask, set);
583} 577}
@@ -586,7 +580,7 @@ static int start_irqsoff_tracer(struct trace_array *tr, int graph)
586{ 580{
587 int ret; 581 int ret;
588 582
589 ret = register_irqsoff_function(graph, 0); 583 ret = register_irqsoff_function(tr, graph, 0);
590 584
591 if (!ret && tracing_is_enabled()) 585 if (!ret && tracing_is_enabled())
592 tracer_enabled = 1; 586 tracer_enabled = 1;
@@ -600,25 +594,37 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
600{ 594{
601 tracer_enabled = 0; 595 tracer_enabled = 0;
602 596
603 unregister_irqsoff_function(graph); 597 unregister_irqsoff_function(tr, graph);
604} 598}
605 599
606static void __irqsoff_tracer_init(struct trace_array *tr) 600static bool irqsoff_busy;
601
602static int __irqsoff_tracer_init(struct trace_array *tr)
607{ 603{
604 if (irqsoff_busy)
605 return -EBUSY;
606
608 save_flags = trace_flags; 607 save_flags = trace_flags;
609 608
610 /* non overwrite screws up the latency tracers */ 609 /* non overwrite screws up the latency tracers */
611 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); 610 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
612 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); 611 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
613 612
614 tracing_max_latency = 0; 613 tr->max_latency = 0;
615 irqsoff_trace = tr; 614 irqsoff_trace = tr;
616 /* make sure that the tracer is visible */ 615 /* make sure that the tracer is visible */
617 smp_wmb(); 616 smp_wmb();
618 tracing_reset_online_cpus(&tr->trace_buffer); 617 tracing_reset_online_cpus(&tr->trace_buffer);
619 618
620 if (start_irqsoff_tracer(tr, is_graph())) 619 ftrace_init_array_ops(tr, irqsoff_tracer_call);
620
621 /* Only toplevel instance supports graph tracing */
622 if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
623 is_graph())))
621 printk(KERN_ERR "failed to start irqsoff tracer\n"); 624 printk(KERN_ERR "failed to start irqsoff tracer\n");
625
626 irqsoff_busy = true;
627 return 0;
622} 628}
623 629
624static void irqsoff_tracer_reset(struct trace_array *tr) 630static void irqsoff_tracer_reset(struct trace_array *tr)
@@ -630,6 +636,9 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
630 636
631 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); 637 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
632 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); 638 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
639 ftrace_reset_array_ops(tr);
640
641 irqsoff_busy = false;
633} 642}
634 643
635static void irqsoff_tracer_start(struct trace_array *tr) 644static void irqsoff_tracer_start(struct trace_array *tr)
@@ -647,8 +656,7 @@ static int irqsoff_tracer_init(struct trace_array *tr)
647{ 656{
648 trace_type = TRACER_IRQS_OFF; 657 trace_type = TRACER_IRQS_OFF;
649 658
650 __irqsoff_tracer_init(tr); 659 return __irqsoff_tracer_init(tr);
651 return 0;
652} 660}
653static struct tracer irqsoff_tracer __read_mostly = 661static struct tracer irqsoff_tracer __read_mostly =
654{ 662{
@@ -668,6 +676,7 @@ static struct tracer irqsoff_tracer __read_mostly =
668#endif 676#endif
669 .open = irqsoff_trace_open, 677 .open = irqsoff_trace_open,
670 .close = irqsoff_trace_close, 678 .close = irqsoff_trace_close,
679 .allow_instances = true,
671 .use_max_tr = true, 680 .use_max_tr = true,
672}; 681};
673# define register_irqsoff(trace) register_tracer(&trace) 682# define register_irqsoff(trace) register_tracer(&trace)
@@ -680,8 +689,7 @@ static int preemptoff_tracer_init(struct trace_array *tr)
680{ 689{
681 trace_type = TRACER_PREEMPT_OFF; 690 trace_type = TRACER_PREEMPT_OFF;
682 691
683 __irqsoff_tracer_init(tr); 692 return __irqsoff_tracer_init(tr);
684 return 0;
685} 693}
686 694
687static struct tracer preemptoff_tracer __read_mostly = 695static struct tracer preemptoff_tracer __read_mostly =
@@ -702,6 +710,7 @@ static struct tracer preemptoff_tracer __read_mostly =
702#endif 710#endif
703 .open = irqsoff_trace_open, 711 .open = irqsoff_trace_open,
704 .close = irqsoff_trace_close, 712 .close = irqsoff_trace_close,
713 .allow_instances = true,
705 .use_max_tr = true, 714 .use_max_tr = true,
706}; 715};
707# define register_preemptoff(trace) register_tracer(&trace) 716# define register_preemptoff(trace) register_tracer(&trace)
@@ -716,8 +725,7 @@ static int preemptirqsoff_tracer_init(struct trace_array *tr)
716{ 725{
717 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; 726 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
718 727
719 __irqsoff_tracer_init(tr); 728 return __irqsoff_tracer_init(tr);
720 return 0;
721} 729}
722 730
723static struct tracer preemptirqsoff_tracer __read_mostly = 731static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -738,6 +746,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
738#endif 746#endif
739 .open = irqsoff_trace_open, 747 .open = irqsoff_trace_open,
740 .close = irqsoff_trace_close, 748 .close = irqsoff_trace_close,
749 .allow_instances = true,
741 .use_max_tr = true, 750 .use_max_tr = true,
742}; 751};
743 752
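
With .allow_instances set and the latency bookkeeping moved into tr->max_latency, the irqsoff family can now run inside a tracing instance, and the irqsoff_busy flag ensures only one instance uses it at a time (a second init attempt gets -EBUSY from __irqsoff_tracer_init()). A hedged user-space sketch of driving that; the paths assume the usual debugfs mount and are not part of the patch:

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

/* Create a tracing instance and select the irqsoff tracer in it. A second
 * instance trying the same tracer should see the write fail with EBUSY. */
int main(void)
{
	FILE *f;

	mkdir("/sys/kernel/debug/tracing/instances/lat", 0755);

	f = fopen("/sys/kernel/debug/tracing/instances/lat/current_tracer", "w");
	if (!f) {
		perror("fopen");
		return 1;
	}
	fputs("irqsoff\n", f);

	return fclose(f) ? 1 : 0;
}
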
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 903ae28962be..282f6e4e5539 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -40,27 +40,27 @@ struct trace_kprobe {
40 (sizeof(struct probe_arg) * (n))) 40 (sizeof(struct probe_arg) * (n)))
41 41
42 42
43static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk) 43static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
44{ 44{
45 return tk->rp.handler != NULL; 45 return tk->rp.handler != NULL;
46} 46}
47 47
48static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk) 48static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk)
49{ 49{
50 return tk->symbol ? tk->symbol : "unknown"; 50 return tk->symbol ? tk->symbol : "unknown";
51} 51}
52 52
53static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk) 53static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
54{ 54{
55 return tk->rp.kp.offset; 55 return tk->rp.kp.offset;
56} 56}
57 57
58static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk) 58static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
59{ 59{
60 return !!(kprobe_gone(&tk->rp.kp)); 60 return !!(kprobe_gone(&tk->rp.kp));
61} 61}
62 62
63static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk, 63static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
64 struct module *mod) 64 struct module *mod)
65{ 65{
66 int len = strlen(mod->name); 66 int len = strlen(mod->name);
@@ -68,7 +68,7 @@ static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk,
68 return strncmp(mod->name, name, len) == 0 && name[len] == ':'; 68 return strncmp(mod->name, name, len) == 0 && name[len] == ':';
69} 69}
70 70
71static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk) 71static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
72{ 72{
73 return !!strchr(trace_kprobe_symbol(tk), ':'); 73 return !!strchr(trace_kprobe_symbol(tk), ':');
74} 74}
@@ -132,19 +132,21 @@ struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
132 * Kprobes-specific fetch functions 132 * Kprobes-specific fetch functions
133 */ 133 */
134#define DEFINE_FETCH_stack(type) \ 134#define DEFINE_FETCH_stack(type) \
135static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ 135static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
136 void *offset, void *dest) \ 136 void *offset, void *dest) \
137{ \ 137{ \
138 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ 138 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
139 (unsigned int)((unsigned long)offset)); \ 139 (unsigned int)((unsigned long)offset)); \
140} 140} \
141NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type));
142
141DEFINE_BASIC_FETCH_FUNCS(stack) 143DEFINE_BASIC_FETCH_FUNCS(stack)
142/* No string on the stack entry */ 144/* No string on the stack entry */
143#define fetch_stack_string NULL 145#define fetch_stack_string NULL
144#define fetch_stack_string_size NULL 146#define fetch_stack_string_size NULL
145 147
146#define DEFINE_FETCH_memory(type) \ 148#define DEFINE_FETCH_memory(type) \
147static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ 149static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
148 void *addr, void *dest) \ 150 void *addr, void *dest) \
149{ \ 151{ \
150 type retval; \ 152 type retval; \
@@ -152,14 +154,16 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
152 *(type *)dest = 0; \ 154 *(type *)dest = 0; \
153 else \ 155 else \
154 *(type *)dest = retval; \ 156 *(type *)dest = retval; \
155} 157} \
158NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type));
159
156DEFINE_BASIC_FETCH_FUNCS(memory) 160DEFINE_BASIC_FETCH_FUNCS(memory)
157/* 161/*
158 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max 162 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
159 * length and relative data location. 163 * length and relative data location.
160 */ 164 */
161static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, 165static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
162 void *addr, void *dest) 166 void *addr, void *dest)
163{ 167{
164 long ret; 168 long ret;
165 int maxlen = get_rloc_len(*(u32 *)dest); 169 int maxlen = get_rloc_len(*(u32 *)dest);
@@ -193,10 +197,11 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
193 get_rloc_offs(*(u32 *)dest)); 197 get_rloc_offs(*(u32 *)dest));
194 } 198 }
195} 199}
200NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
196 201
197/* Return the length of string -- including null terminal byte */ 202/* Return the length of string -- including null terminal byte */
198static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, 203static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
199 void *addr, void *dest) 204 void *addr, void *dest)
200{ 205{
201 mm_segment_t old_fs; 206 mm_segment_t old_fs;
202 int ret, len = 0; 207 int ret, len = 0;
@@ -219,17 +224,19 @@ static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
219 else 224 else
220 *(u32 *)dest = len; 225 *(u32 *)dest = len;
221} 226}
227NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size));
222 228
223#define DEFINE_FETCH_symbol(type) \ 229#define DEFINE_FETCH_symbol(type) \
224__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \ 230void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\
225 void *data, void *dest) \
226{ \ 231{ \
227 struct symbol_cache *sc = data; \ 232 struct symbol_cache *sc = data; \
228 if (sc->addr) \ 233 if (sc->addr) \
229 fetch_memory_##type(regs, (void *)sc->addr, dest); \ 234 fetch_memory_##type(regs, (void *)sc->addr, dest); \
230 else \ 235 else \
231 *(type *)dest = 0; \ 236 *(type *)dest = 0; \
232} 237} \
238NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type));
239
233DEFINE_BASIC_FETCH_FUNCS(symbol) 240DEFINE_BASIC_FETCH_FUNCS(symbol)
234DEFINE_FETCH_symbol(string) 241DEFINE_FETCH_symbol(string)
235DEFINE_FETCH_symbol(string_size) 242DEFINE_FETCH_symbol(string_size)
@@ -907,7 +914,7 @@ static const struct file_operations kprobe_profile_ops = {
907}; 914};
908 915
909/* Kprobe handler */ 916/* Kprobe handler */
910static __kprobes void 917static nokprobe_inline void
911__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, 918__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
912 struct ftrace_event_file *ftrace_file) 919 struct ftrace_event_file *ftrace_file)
913{ 920{
@@ -943,7 +950,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
943 entry, irq_flags, pc, regs); 950 entry, irq_flags, pc, regs);
944} 951}
945 952
946static __kprobes void 953static void
947kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs) 954kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
948{ 955{
949 struct event_file_link *link; 956 struct event_file_link *link;
@@ -951,9 +958,10 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
951 list_for_each_entry_rcu(link, &tk->tp.files, list) 958 list_for_each_entry_rcu(link, &tk->tp.files, list)
952 __kprobe_trace_func(tk, regs, link->file); 959 __kprobe_trace_func(tk, regs, link->file);
953} 960}
961NOKPROBE_SYMBOL(kprobe_trace_func);
954 962
955/* Kretprobe handler */ 963/* Kretprobe handler */
956static __kprobes void 964static nokprobe_inline void
957__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, 965__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
958 struct pt_regs *regs, 966 struct pt_regs *regs,
959 struct ftrace_event_file *ftrace_file) 967 struct ftrace_event_file *ftrace_file)
@@ -991,7 +999,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
991 entry, irq_flags, pc, regs); 999 entry, irq_flags, pc, regs);
992} 1000}
993 1001
994static __kprobes void 1002static void
995kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, 1003kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
996 struct pt_regs *regs) 1004 struct pt_regs *regs)
997{ 1005{
@@ -1000,6 +1008,7 @@ kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1000 list_for_each_entry_rcu(link, &tk->tp.files, list) 1008 list_for_each_entry_rcu(link, &tk->tp.files, list)
1001 __kretprobe_trace_func(tk, ri, regs, link->file); 1009 __kretprobe_trace_func(tk, ri, regs, link->file);
1002} 1010}
1011NOKPROBE_SYMBOL(kretprobe_trace_func);
1003 1012
1004/* Event entry printers */ 1013/* Event entry printers */
1005static enum print_line_t 1014static enum print_line_t
@@ -1131,7 +1140,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1131#ifdef CONFIG_PERF_EVENTS 1140#ifdef CONFIG_PERF_EVENTS
1132 1141
1133/* Kprobe profile handler */ 1142/* Kprobe profile handler */
1134static __kprobes void 1143static void
1135kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) 1144kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1136{ 1145{
1137 struct ftrace_event_call *call = &tk->tp.call; 1146 struct ftrace_event_call *call = &tk->tp.call;
@@ -1158,9 +1167,10 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1158 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); 1167 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1159 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1168 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1160} 1169}
1170NOKPROBE_SYMBOL(kprobe_perf_func);
1161 1171
1162/* Kretprobe profile handler */ 1172/* Kretprobe profile handler */
1163static __kprobes void 1173static void
1164kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, 1174kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1165 struct pt_regs *regs) 1175 struct pt_regs *regs)
1166{ 1176{
@@ -1188,6 +1198,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1188 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); 1198 store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
1189 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); 1199 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1190} 1200}
1201NOKPROBE_SYMBOL(kretprobe_perf_func);
1191#endif /* CONFIG_PERF_EVENTS */ 1202#endif /* CONFIG_PERF_EVENTS */
1192 1203
1193/* 1204/*
@@ -1196,9 +1207,8 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1196 * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe 1207 * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
1197 * lockless, but we can't race with this __init function. 1208 * lockless, but we can't race with this __init function.
1198 */ 1209 */
1199static __kprobes 1210static int kprobe_register(struct ftrace_event_call *event,
1200int kprobe_register(struct ftrace_event_call *event, 1211 enum trace_reg type, void *data)
1201 enum trace_reg type, void *data)
1202{ 1212{
1203 struct trace_kprobe *tk = (struct trace_kprobe *)event->data; 1213 struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
1204 struct ftrace_event_file *file = data; 1214 struct ftrace_event_file *file = data;
@@ -1224,8 +1234,7 @@ int kprobe_register(struct ftrace_event_call *event,
1224 return 0; 1234 return 0;
1225} 1235}
1226 1236
1227static __kprobes 1237static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1228int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1229{ 1238{
1230 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); 1239 struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
1231 1240
@@ -1239,9 +1248,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1239#endif 1248#endif
1240	return 0;	/* We don't tweak the kernel, so just return 0 */ 1249	return 0;	/* We don't tweak the kernel, so just return 0 */
1241} 1250}
1251NOKPROBE_SYMBOL(kprobe_dispatcher);
1242 1252
1243static __kprobes 1253static int
1244int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) 1254kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1245{ 1255{
1246 struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); 1256 struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
1247 1257
@@ -1255,6 +1265,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1255#endif 1265#endif
1256	return 0;	/* We don't tweak the kernel, so just return 0 */ 1266	return 0;	/* We don't tweak the kernel, so just return 0 */
1257} 1267}
1268NOKPROBE_SYMBOL(kretprobe_dispatcher);
1258 1269
1259static struct trace_event_functions kretprobe_funcs = { 1270static struct trace_event_functions kretprobe_funcs = {
1260 .trace = print_kretprobe_event 1271 .trace = print_kretprobe_event
@@ -1377,6 +1388,9 @@ static __init int kprobe_trace_self_tests_init(void)
1377 struct trace_kprobe *tk; 1388 struct trace_kprobe *tk;
1378 struct ftrace_event_file *file; 1389 struct ftrace_event_file *file;
1379 1390
1391 if (tracing_is_disabled())
1392 return -ENODEV;
1393
1380 target = kprobe_trace_selftest_target; 1394 target = kprobe_trace_selftest_target;
1381 1395
1382 pr_info("Testing kprobe tracing: "); 1396 pr_info("Testing kprobe tracing: ");
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 69a5cc94c01a..fcf0a9e48916 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -91,7 +91,6 @@ struct tracer nop_trace __read_mostly =
91 .name = "nop", 91 .name = "nop",
92 .init = nop_trace_init, 92 .init = nop_trace_init,
93 .reset = nop_trace_reset, 93 .reset = nop_trace_reset,
94 .wait_pipe = poll_wait_pipe,
95#ifdef CONFIG_FTRACE_SELFTEST 94#ifdef CONFIG_FTRACE_SELFTEST
96 .selftest = trace_selftest_startup_nop, 95 .selftest = trace_selftest_startup_nop,
97#endif 96#endif
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a436de18aa99..f3dad80c20b2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -126,6 +126,34 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
126EXPORT_SYMBOL_GPL(trace_seq_printf); 126EXPORT_SYMBOL_GPL(trace_seq_printf);
127 127
128/** 128/**
129 * trace_seq_bitmask - put a list of longs as a bitmask print output
130 * @s: trace sequence descriptor
131 * @maskp: points to an array of unsigned longs that represent a bitmask
132 * @nmaskbits: The number of bits that are valid in @maskp
133 *
134 * It returns 0 if the output exceeds the buffer's free
135 * space, 1 otherwise.
136 *
137 * Writes an ASCII representation of the bitmask into @s.
138 */
139int
140trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
141 int nmaskbits)
142{
143 int len = (PAGE_SIZE - 1) - s->len;
144 int ret;
145
146 if (s->full || !len)
147 return 0;
148
149 ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
150 s->len += ret;
151
152 return 1;
153}
154EXPORT_SYMBOL_GPL(trace_seq_bitmask);
155
156/**
129 * trace_seq_vprintf - sequence printing of trace information 157 * trace_seq_vprintf - sequence printing of trace information
130 * @s: trace sequence descriptor 158 * @s: trace sequence descriptor
131 * @fmt: printf format string 159 * @fmt: printf format string
@@ -399,6 +427,19 @@ EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
399#endif 427#endif
400 428
401const char * 429const char *
430ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
431 unsigned int bitmask_size)
432{
433 const char *ret = p->buffer + p->len;
434
435 trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
436 trace_seq_putc(p, 0);
437
438 return ret;
439}
440EXPORT_SYMBOL_GPL(ftrace_print_bitmask_seq);
441
442const char *
402ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 443ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
403{ 444{
404 int i; 445 int i;
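
trace_seq_bitmask() and its ftrace_print_bitmask_seq() wrapper give trace output a way to render a bitmask (typically a cpumask) through bitmap_scnprintf() instead of open-coding hex dumps. A hedged kernel-side sketch of calling the lower-level helper directly (not compilable on its own; cpumask_bits() and nr_cpumask_bits are the standard cpumask accessors):

/* Render a cpumask into a trace_seq; bitmap_scnprintf() produces the usual
 * comma-separated hex mask format. */
static void trace_seq_show_cpus(struct trace_seq *s, const struct cpumask *mask)
{
	trace_seq_puts(s, "cpus=");
	trace_seq_bitmask(s, cpumask_bits(mask), nr_cpumask_bits);
	trace_seq_putc(s, '\n');
}
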
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 8364a421b4df..d4b9fc22cd27 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -37,13 +37,13 @@ const char *reserved_field_names[] = {
37 37
38/* Printing in basic type function template */ 38/* Printing in basic type function template */
39#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \ 39#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \
40__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 40int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
41 const char *name, \ 41 void *data, void *ent) \
42 void *data, void *ent) \
43{ \ 42{ \
44 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ 43 return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
45} \ 44} \
46const char PRINT_TYPE_FMT_NAME(type)[] = fmt; 45const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
46NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
47 47
48DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x") 48DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
49DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x") 49DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
@@ -55,9 +55,8 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
55DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld") 55DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
56 56
57/* Print type function for string type */ 57/* Print type function for string type */
58__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, 58int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
59 const char *name, 59 void *data, void *ent)
60 void *data, void *ent)
61{ 60{
62 int len = *(u32 *)data >> 16; 61 int len = *(u32 *)data >> 16;
63 62
@@ -67,6 +66,7 @@ __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
67 return trace_seq_printf(s, " %s=\"%s\"", name, 66 return trace_seq_printf(s, " %s=\"%s\"", name,
68 (const char *)get_loc_data(data, ent)); 67 (const char *)get_loc_data(data, ent));
69} 68}
69NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
70 70
71const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; 71const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
72 72
@@ -81,23 +81,24 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
81 81
82/* Data fetch function templates */ 82/* Data fetch function templates */
83#define DEFINE_FETCH_reg(type) \ 83#define DEFINE_FETCH_reg(type) \
84__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ 84void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \
85 void *offset, void *dest) \
86{ \ 85{ \
87 *(type *)dest = (type)regs_get_register(regs, \ 86 *(type *)dest = (type)regs_get_register(regs, \
88 (unsigned int)((unsigned long)offset)); \ 87 (unsigned int)((unsigned long)offset)); \
89} 88} \
89NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type));
90DEFINE_BASIC_FETCH_FUNCS(reg) 90DEFINE_BASIC_FETCH_FUNCS(reg)
91/* No string on the register */ 91/* No string on the register */
92#define fetch_reg_string NULL 92#define fetch_reg_string NULL
93#define fetch_reg_string_size NULL 93#define fetch_reg_string_size NULL
94 94
95#define DEFINE_FETCH_retval(type) \ 95#define DEFINE_FETCH_retval(type) \
96__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \ 96void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
97 void *dummy, void *dest) \ 97 void *dummy, void *dest) \
98{ \ 98{ \
99 *(type *)dest = (type)regs_return_value(regs); \ 99 *(type *)dest = (type)regs_return_value(regs); \
100} 100} \
101NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type));
101DEFINE_BASIC_FETCH_FUNCS(retval) 102DEFINE_BASIC_FETCH_FUNCS(retval)
102/* No string on the retval */ 103/* No string on the retval */
103#define fetch_retval_string NULL 104#define fetch_retval_string NULL
@@ -112,8 +113,8 @@ struct deref_fetch_param {
112}; 113};
113 114
114#define DEFINE_FETCH_deref(type) \ 115#define DEFINE_FETCH_deref(type) \
115__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \ 116void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
116 void *data, void *dest) \ 117 void *data, void *dest) \
117{ \ 118{ \
118 struct deref_fetch_param *dprm = data; \ 119 struct deref_fetch_param *dprm = data; \
119 unsigned long addr; \ 120 unsigned long addr; \
@@ -123,12 +124,13 @@ __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
123 dprm->fetch(regs, (void *)addr, dest); \ 124 dprm->fetch(regs, (void *)addr, dest); \
124 } else \ 125 } else \
125 *(type *)dest = 0; \ 126 *(type *)dest = 0; \
126} 127} \
128NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type));
127DEFINE_BASIC_FETCH_FUNCS(deref) 129DEFINE_BASIC_FETCH_FUNCS(deref)
128DEFINE_FETCH_deref(string) 130DEFINE_FETCH_deref(string)
129 131
130__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs, 132void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
131 void *data, void *dest) 133 void *data, void *dest)
132{ 134{
133 struct deref_fetch_param *dprm = data; 135 struct deref_fetch_param *dprm = data;
134 unsigned long addr; 136 unsigned long addr;
@@ -140,16 +142,18 @@ __kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
140 } else 142 } else
141 *(string_size *)dest = 0; 143 *(string_size *)dest = 0;
142} 144}
145NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size));
143 146
144static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) 147static void update_deref_fetch_param(struct deref_fetch_param *data)
145{ 148{
146 if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) 149 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
147 update_deref_fetch_param(data->orig.data); 150 update_deref_fetch_param(data->orig.data);
148 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) 151 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
149 update_symbol_cache(data->orig.data); 152 update_symbol_cache(data->orig.data);
150} 153}
154NOKPROBE_SYMBOL(update_deref_fetch_param);
151 155
152static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) 156static void free_deref_fetch_param(struct deref_fetch_param *data)
153{ 157{
154 if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) 158 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
155 free_deref_fetch_param(data->orig.data); 159 free_deref_fetch_param(data->orig.data);
@@ -157,6 +161,7 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
157 free_symbol_cache(data->orig.data); 161 free_symbol_cache(data->orig.data);
158 kfree(data); 162 kfree(data);
159} 163}
164NOKPROBE_SYMBOL(free_deref_fetch_param);
160 165
161/* Bitfield fetch function */ 166/* Bitfield fetch function */
162struct bitfield_fetch_param { 167struct bitfield_fetch_param {
@@ -166,8 +171,8 @@ struct bitfield_fetch_param {
166}; 171};
167 172
168#define DEFINE_FETCH_bitfield(type) \ 173#define DEFINE_FETCH_bitfield(type) \
169__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \ 174void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
170 void *data, void *dest) \ 175 void *data, void *dest) \
171{ \ 176{ \
172 struct bitfield_fetch_param *bprm = data; \ 177 struct bitfield_fetch_param *bprm = data; \
173 type buf = 0; \ 178 type buf = 0; \
@@ -177,13 +182,13 @@ __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
177 buf >>= bprm->low_shift; \ 182 buf >>= bprm->low_shift; \
178 } \ 183 } \
179 *(type *)dest = buf; \ 184 *(type *)dest = buf; \
180} 185} \
181 186NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type));
182DEFINE_BASIC_FETCH_FUNCS(bitfield) 187DEFINE_BASIC_FETCH_FUNCS(bitfield)
183#define fetch_bitfield_string NULL 188#define fetch_bitfield_string NULL
184#define fetch_bitfield_string_size NULL 189#define fetch_bitfield_string_size NULL
185 190
186static __kprobes void 191static void
187update_bitfield_fetch_param(struct bitfield_fetch_param *data) 192update_bitfield_fetch_param(struct bitfield_fetch_param *data)
188{ 193{
189 /* 194 /*
@@ -196,7 +201,7 @@ update_bitfield_fetch_param(struct bitfield_fetch_param *data)
196 update_symbol_cache(data->orig.data); 201 update_symbol_cache(data->orig.data);
197} 202}
198 203
199static __kprobes void 204static void
200free_bitfield_fetch_param(struct bitfield_fetch_param *data) 205free_bitfield_fetch_param(struct bitfield_fetch_param *data)
201{ 206{
202 /* 207 /*
@@ -255,17 +260,17 @@ fail:
255} 260}
256 261
257/* Special function : only accept unsigned long */ 262/* Special function : only accept unsigned long */
258static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs, 263static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest)
259 void *dummy, void *dest)
260{ 264{
261 *(unsigned long *)dest = kernel_stack_pointer(regs); 265 *(unsigned long *)dest = kernel_stack_pointer(regs);
262} 266}
267NOKPROBE_SYMBOL(fetch_kernel_stack_address);
263 268
264static __kprobes void fetch_user_stack_address(struct pt_regs *regs, 269static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest)
265 void *dummy, void *dest)
266{ 270{
267 *(unsigned long *)dest = user_stack_pointer(regs); 271 *(unsigned long *)dest = user_stack_pointer(regs);
268} 272}
273NOKPROBE_SYMBOL(fetch_user_stack_address);
269 274
270static fetch_func_t get_fetch_size_function(const struct fetch_type *type, 275static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
271 fetch_func_t orig_fn, 276 fetch_func_t orig_fn,
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index fb1ab5dfbd42..4f815fbce16d 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -81,13 +81,13 @@
81 */ 81 */
82#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) 82#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
83 83
84static inline void *get_rloc_data(u32 *dl) 84static nokprobe_inline void *get_rloc_data(u32 *dl)
85{ 85{
86 return (u8 *)dl + get_rloc_offs(*dl); 86 return (u8 *)dl + get_rloc_offs(*dl);
87} 87}
88 88
89/* For data_loc conversion */ 89/* For data_loc conversion */
90static inline void *get_loc_data(u32 *dl, void *ent) 90static nokprobe_inline void *get_loc_data(u32 *dl, void *ent)
91{ 91{
92 return (u8 *)ent + get_rloc_offs(*dl); 92 return (u8 *)ent + get_rloc_offs(*dl);
93} 93}
@@ -136,9 +136,8 @@ typedef u32 string_size;
136 136
137/* Printing in basic type function template */ 137/* Printing in basic type function template */
138#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \ 138#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \
139__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ 139int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
140 const char *name, \ 140 void *data, void *ent); \
141 void *data, void *ent); \
142extern const char PRINT_TYPE_FMT_NAME(type)[] 141extern const char PRINT_TYPE_FMT_NAME(type)[]
143 142
144DECLARE_BASIC_PRINT_TYPE_FUNC(u8); 143DECLARE_BASIC_PRINT_TYPE_FUNC(u8);
@@ -303,7 +302,7 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp)
303 return !!(tp->flags & TP_FLAG_REGISTERED); 302 return !!(tp->flags & TP_FLAG_REGISTERED);
304} 303}
305 304
306static inline __kprobes void call_fetch(struct fetch_param *fprm, 305static nokprobe_inline void call_fetch(struct fetch_param *fprm,
307 struct pt_regs *regs, void *dest) 306 struct pt_regs *regs, void *dest)
308{ 307{
309 return fprm->fn(regs, fprm->data, dest); 308 return fprm->fn(regs, fprm->data, dest);
@@ -351,7 +350,7 @@ extern ssize_t traceprobe_probes_write(struct file *file,
351extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); 350extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
352 351
353/* Sum up total data length for dynamic arrays (strings) */ 352/* Sum up total data length for dynamic arrays (strings) */
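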
354static inline __kprobes int 353static nokprobe_inline int
355__get_data_size(struct trace_probe *tp, struct pt_regs *regs) 354__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
356{ 355{
357 int i, ret = 0; 356 int i, ret = 0;
@@ -367,7 +366,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
367} 366}
368 367
369/* Store the value of each argument */ 368/* Store the value of each argument */
370static inline __kprobes void 369static nokprobe_inline void
371store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs, 370store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
372 u8 *data, int maxlen) 371 u8 *data, int maxlen)
373{ 372{
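For the small helpers kept in trace_probe.h, the diff uses nokprobe_inline rather than NOKPROBE_SYMBOL(). A hedged sketch (hypothetical helper; relies on the CONFIG_KPROBES definition in <linux/compiler.h>):

#include <linux/compiler.h>
#include <linux/types.h>

/* nokprobe_inline expands to __always_inline when CONFIG_KPROBES is set,
 * so the body is folded into callers that are themselves blacklisted
 * rather than being tagged with the old __kprobes section attribute. */
static nokprobe_inline void *example_loc_data(u32 *dl, void *ent)
{
        /* illustrative data_loc decoding: offset lives in the low 16 bits */
        return (u8 *)ent + (*dl & 0xffff);
}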
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e14da5e97a69..19bd8928ce94 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -130,15 +130,9 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
130 atomic_dec(&data->disabled); 130 atomic_dec(&data->disabled);
131 preempt_enable_notrace(); 131 preempt_enable_notrace();
132} 132}
133
134static struct ftrace_ops trace_ops __read_mostly =
135{
136 .func = wakeup_tracer_call,
137 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
138};
139#endif /* CONFIG_FUNCTION_TRACER */ 133#endif /* CONFIG_FUNCTION_TRACER */
140 134
141static int register_wakeup_function(int graph, int set) 135static int register_wakeup_function(struct trace_array *tr, int graph, int set)
142{ 136{
143 int ret; 137 int ret;
144 138
@@ -150,7 +144,7 @@ static int register_wakeup_function(int graph, int set)
150 ret = register_ftrace_graph(&wakeup_graph_return, 144 ret = register_ftrace_graph(&wakeup_graph_return,
151 &wakeup_graph_entry); 145 &wakeup_graph_entry);
152 else 146 else
153 ret = register_ftrace_function(&trace_ops); 147 ret = register_ftrace_function(tr->ops);
154 148
155 if (!ret) 149 if (!ret)
156 function_enabled = true; 150 function_enabled = true;
@@ -158,7 +152,7 @@ static int register_wakeup_function(int graph, int set)
158 return ret; 152 return ret;
159} 153}
160 154
161static void unregister_wakeup_function(int graph) 155static void unregister_wakeup_function(struct trace_array *tr, int graph)
162{ 156{
163 if (!function_enabled) 157 if (!function_enabled)
164 return; 158 return;
@@ -166,17 +160,17 @@ static void unregister_wakeup_function(int graph)
166 if (graph) 160 if (graph)
167 unregister_ftrace_graph(); 161 unregister_ftrace_graph();
168 else 162 else
169 unregister_ftrace_function(&trace_ops); 163 unregister_ftrace_function(tr->ops);
170 164
171 function_enabled = false; 165 function_enabled = false;
172} 166}
173 167
174static void wakeup_function_set(int set) 168static void wakeup_function_set(struct trace_array *tr, int set)
175{ 169{
176 if (set) 170 if (set)
177 register_wakeup_function(is_graph(), 1); 171 register_wakeup_function(tr, is_graph(), 1);
178 else 172 else
179 unregister_wakeup_function(is_graph()); 173 unregister_wakeup_function(tr, is_graph());
180} 174}
181 175
182static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) 176static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
@@ -184,16 +178,16 @@ static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
184 struct tracer *tracer = tr->current_trace; 178 struct tracer *tracer = tr->current_trace;
185 179
186 if (mask & TRACE_ITER_FUNCTION) 180 if (mask & TRACE_ITER_FUNCTION)
187 wakeup_function_set(set); 181 wakeup_function_set(tr, set);
188 182
189 return trace_keep_overwrite(tracer, mask, set); 183 return trace_keep_overwrite(tracer, mask, set);
190} 184}
191 185
192static int start_func_tracer(int graph) 186static int start_func_tracer(struct trace_array *tr, int graph)
193{ 187{
194 int ret; 188 int ret;
195 189
196 ret = register_wakeup_function(graph, 0); 190 ret = register_wakeup_function(tr, graph, 0);
197 191
198 if (!ret && tracing_is_enabled()) 192 if (!ret && tracing_is_enabled())
199 tracer_enabled = 1; 193 tracer_enabled = 1;
@@ -203,11 +197,11 @@ static int start_func_tracer(int graph)
203 return ret; 197 return ret;
204} 198}
205 199
206static void stop_func_tracer(int graph) 200static void stop_func_tracer(struct trace_array *tr, int graph)
207{ 201{
208 tracer_enabled = 0; 202 tracer_enabled = 0;
209 203
210 unregister_wakeup_function(graph); 204 unregister_wakeup_function(tr, graph);
211} 205}
212 206
213#ifdef CONFIG_FUNCTION_GRAPH_TRACER 207#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -221,12 +215,12 @@ wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
221 if (!(is_graph() ^ set)) 215 if (!(is_graph() ^ set))
222 return 0; 216 return 0;
223 217
224 stop_func_tracer(!set); 218 stop_func_tracer(tr, !set);
225 219
226 wakeup_reset(wakeup_trace); 220 wakeup_reset(wakeup_trace);
227 tracing_max_latency = 0; 221 tr->max_latency = 0;
228 222
229 return start_func_tracer(set); 223 return start_func_tracer(tr, set);
230} 224}
231 225
232static int wakeup_graph_entry(struct ftrace_graph_ent *trace) 226static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
@@ -350,13 +344,13 @@ static void wakeup_print_header(struct seq_file *s)
350/* 344/*
351 * Should this new latency be reported/recorded? 345 * Should this new latency be reported/recorded?
352 */ 346 */
353static int report_latency(cycle_t delta) 347static int report_latency(struct trace_array *tr, cycle_t delta)
354{ 348{
355 if (tracing_thresh) { 349 if (tracing_thresh) {
356 if (delta < tracing_thresh) 350 if (delta < tracing_thresh)
357 return 0; 351 return 0;
358 } else { 352 } else {
359 if (delta <= tracing_max_latency) 353 if (delta <= tr->max_latency)
360 return 0; 354 return 0;
361 } 355 }
362 return 1; 356 return 1;
@@ -424,11 +418,11 @@ probe_wakeup_sched_switch(void *ignore,
424 T1 = ftrace_now(cpu); 418 T1 = ftrace_now(cpu);
425 delta = T1-T0; 419 delta = T1-T0;
426 420
427 if (!report_latency(delta)) 421 if (!report_latency(wakeup_trace, delta))
428 goto out_unlock; 422 goto out_unlock;
429 423
430 if (likely(!is_tracing_stopped())) { 424 if (likely(!is_tracing_stopped())) {
431 tracing_max_latency = delta; 425 wakeup_trace->max_latency = delta;
432 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); 426 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
433 } 427 }
434 428
@@ -587,7 +581,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
587 */ 581 */
588 smp_wmb(); 582 smp_wmb();
589 583
590 if (start_func_tracer(is_graph())) 584 if (start_func_tracer(tr, is_graph()))
591 printk(KERN_ERR "failed to start wakeup tracer\n"); 585 printk(KERN_ERR "failed to start wakeup tracer\n");
592 586
593 return; 587 return;
@@ -600,13 +594,15 @@ fail_deprobe:
600static void stop_wakeup_tracer(struct trace_array *tr) 594static void stop_wakeup_tracer(struct trace_array *tr)
601{ 595{
602 tracer_enabled = 0; 596 tracer_enabled = 0;
603 stop_func_tracer(is_graph()); 597 stop_func_tracer(tr, is_graph());
604 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); 598 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
605 unregister_trace_sched_wakeup_new(probe_wakeup, NULL); 599 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
606 unregister_trace_sched_wakeup(probe_wakeup, NULL); 600 unregister_trace_sched_wakeup(probe_wakeup, NULL);
607 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); 601 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
608} 602}
609 603
604static bool wakeup_busy;
605
610static int __wakeup_tracer_init(struct trace_array *tr) 606static int __wakeup_tracer_init(struct trace_array *tr)
611{ 607{
612 save_flags = trace_flags; 608 save_flags = trace_flags;
@@ -615,14 +611,20 @@ static int __wakeup_tracer_init(struct trace_array *tr)
615 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); 611 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
616 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); 612 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
617 613
618 tracing_max_latency = 0; 614 tr->max_latency = 0;
619 wakeup_trace = tr; 615 wakeup_trace = tr;
616 ftrace_init_array_ops(tr, wakeup_tracer_call);
620 start_wakeup_tracer(tr); 617 start_wakeup_tracer(tr);
618
619 wakeup_busy = true;
621 return 0; 620 return 0;
622} 621}
623 622
624static int wakeup_tracer_init(struct trace_array *tr) 623static int wakeup_tracer_init(struct trace_array *tr)
625{ 624{
625 if (wakeup_busy)
626 return -EBUSY;
627
626 wakeup_dl = 0; 628 wakeup_dl = 0;
627 wakeup_rt = 0; 629 wakeup_rt = 0;
628 return __wakeup_tracer_init(tr); 630 return __wakeup_tracer_init(tr);
@@ -630,6 +632,9 @@ static int wakeup_tracer_init(struct trace_array *tr)
630 632
631static int wakeup_rt_tracer_init(struct trace_array *tr) 633static int wakeup_rt_tracer_init(struct trace_array *tr)
632{ 634{
635 if (wakeup_busy)
636 return -EBUSY;
637
633 wakeup_dl = 0; 638 wakeup_dl = 0;
634 wakeup_rt = 1; 639 wakeup_rt = 1;
635 return __wakeup_tracer_init(tr); 640 return __wakeup_tracer_init(tr);
@@ -637,6 +642,9 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
637 642
638static int wakeup_dl_tracer_init(struct trace_array *tr) 643static int wakeup_dl_tracer_init(struct trace_array *tr)
639{ 644{
645 if (wakeup_busy)
646 return -EBUSY;
647
640 wakeup_dl = 1; 648 wakeup_dl = 1;
641 wakeup_rt = 0; 649 wakeup_rt = 0;
642 return __wakeup_tracer_init(tr); 650 return __wakeup_tracer_init(tr);
@@ -653,6 +661,8 @@ static void wakeup_tracer_reset(struct trace_array *tr)
653 661
654 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); 662 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
655 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); 663 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
664 ftrace_reset_array_ops(tr);
665 wakeup_busy = false;
656} 666}
657 667
658static void wakeup_tracer_start(struct trace_array *tr) 668static void wakeup_tracer_start(struct trace_array *tr)
@@ -684,6 +694,7 @@ static struct tracer wakeup_tracer __read_mostly =
684#endif 694#endif
685 .open = wakeup_trace_open, 695 .open = wakeup_trace_open,
686 .close = wakeup_trace_close, 696 .close = wakeup_trace_close,
697 .allow_instances = true,
687 .use_max_tr = true, 698 .use_max_tr = true,
688}; 699};
689 700
@@ -694,7 +705,6 @@ static struct tracer wakeup_rt_tracer __read_mostly =
694 .reset = wakeup_tracer_reset, 705 .reset = wakeup_tracer_reset,
695 .start = wakeup_tracer_start, 706 .start = wakeup_tracer_start,
696 .stop = wakeup_tracer_stop, 707 .stop = wakeup_tracer_stop,
697 .wait_pipe = poll_wait_pipe,
698 .print_max = true, 708 .print_max = true,
699 .print_header = wakeup_print_header, 709 .print_header = wakeup_print_header,
700 .print_line = wakeup_print_line, 710 .print_line = wakeup_print_line,
@@ -706,6 +716,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
706#endif 716#endif
707 .open = wakeup_trace_open, 717 .open = wakeup_trace_open,
708 .close = wakeup_trace_close, 718 .close = wakeup_trace_close,
719 .allow_instances = true,
709 .use_max_tr = true, 720 .use_max_tr = true,
710}; 721};
711 722
@@ -716,7 +727,6 @@ static struct tracer wakeup_dl_tracer __read_mostly =
716 .reset = wakeup_tracer_reset, 727 .reset = wakeup_tracer_reset,
717 .start = wakeup_tracer_start, 728 .start = wakeup_tracer_start,
718 .stop = wakeup_tracer_stop, 729 .stop = wakeup_tracer_stop,
719 .wait_pipe = poll_wait_pipe,
720 .print_max = true, 730 .print_max = true,
721 .print_header = wakeup_print_header, 731 .print_header = wakeup_print_header,
722 .print_line = wakeup_print_line, 732 .print_line = wakeup_print_line,
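The trace_sched_wakeup.c changes above move the tracer from globals (tracing_max_latency and a file-local trace_ops) to per-instance state on struct trace_array (tr->max_latency, tr->ops) and refuse a second instance with -EBUSY. A condensed, hedged sketch of that shape, with hypothetical names and assuming CONFIG_TRACER_MAX_TRACE:

#include <linux/errno.h>
#include <linux/ftrace.h>
#include "trace.h"              /* struct trace_array, ftrace_init_array_ops() */

/* Hypothetical per-function callback; a real tracer records data here. */
static void example_tracer_call(unsigned long ip, unsigned long parent_ip,
                                struct ftrace_ops *op, struct pt_regs *pt_regs)
{
}

/* Only one trace_array instance may own this tracer at a time. */
static bool example_busy;

static int example_tracer_init(struct trace_array *tr)
{
        if (example_busy)
                return -EBUSY;

        tr->max_latency = 0;                    /* per-instance, not global */
        ftrace_init_array_ops(tr, example_tracer_call); /* populates tr->ops */
        example_busy = true;
        return 0;
}

static void example_tracer_reset(struct trace_array *tr)
{
        ftrace_reset_array_ops(tr);             /* detach tr->ops again */
        example_busy = false;
}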
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index e98fca60974f..5ef60499dc8e 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -65,7 +65,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
65 65
66 /* Don't allow flipping of max traces now */ 66 /* Don't allow flipping of max traces now */
67 local_irq_save(flags); 67 local_irq_save(flags);
68 arch_spin_lock(&ftrace_max_lock); 68 arch_spin_lock(&buf->tr->max_lock);
69 69
70 cnt = ring_buffer_entries(buf->buffer); 70 cnt = ring_buffer_entries(buf->buffer);
71 71
@@ -83,7 +83,7 @@ static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
83 break; 83 break;
84 } 84 }
85 tracing_on(); 85 tracing_on();
86 arch_spin_unlock(&ftrace_max_lock); 86 arch_spin_unlock(&buf->tr->max_lock);
87 local_irq_restore(flags); 87 local_irq_restore(flags);
88 88
89 if (count) 89 if (count)
@@ -161,11 +161,6 @@ static struct ftrace_ops test_probe3 = {
161 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 161 .flags = FTRACE_OPS_FL_RECURSION_SAFE,
162}; 162};
163 163
164static struct ftrace_ops test_global = {
165 .func = trace_selftest_test_global_func,
166 .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
167};
168
169static void print_counts(void) 164static void print_counts(void)
170{ 165{
171 printk("(%d %d %d %d %d) ", 166 printk("(%d %d %d %d %d) ",
@@ -185,7 +180,7 @@ static void reset_counts(void)
185 trace_selftest_test_dyn_cnt = 0; 180 trace_selftest_test_dyn_cnt = 0;
186} 181}
187 182
188static int trace_selftest_ops(int cnt) 183static int trace_selftest_ops(struct trace_array *tr, int cnt)
189{ 184{
190 int save_ftrace_enabled = ftrace_enabled; 185 int save_ftrace_enabled = ftrace_enabled;
191 struct ftrace_ops *dyn_ops; 186 struct ftrace_ops *dyn_ops;
@@ -220,7 +215,11 @@ static int trace_selftest_ops(int cnt)
220 register_ftrace_function(&test_probe1); 215 register_ftrace_function(&test_probe1);
221 register_ftrace_function(&test_probe2); 216 register_ftrace_function(&test_probe2);
222 register_ftrace_function(&test_probe3); 217 register_ftrace_function(&test_probe3);
223 register_ftrace_function(&test_global); 218 /* First time we are running with main function */
219 if (cnt > 1) {
220 ftrace_init_array_ops(tr, trace_selftest_test_global_func);
221 register_ftrace_function(tr->ops);
222 }
224 223
225 DYN_FTRACE_TEST_NAME(); 224 DYN_FTRACE_TEST_NAME();
226 225
@@ -232,8 +231,10 @@ static int trace_selftest_ops(int cnt)
232 goto out; 231 goto out;
233 if (trace_selftest_test_probe3_cnt != 1) 232 if (trace_selftest_test_probe3_cnt != 1)
234 goto out; 233 goto out;
235 if (trace_selftest_test_global_cnt == 0) 234 if (cnt > 1) {
236 goto out; 235 if (trace_selftest_test_global_cnt == 0)
236 goto out;
237 }
237 238
238 DYN_FTRACE_TEST_NAME2(); 239 DYN_FTRACE_TEST_NAME2();
239 240
@@ -269,8 +270,10 @@ static int trace_selftest_ops(int cnt)
269 goto out_free; 270 goto out_free;
270 if (trace_selftest_test_probe3_cnt != 3) 271 if (trace_selftest_test_probe3_cnt != 3)
271 goto out_free; 272 goto out_free;
272 if (trace_selftest_test_global_cnt == 0) 273 if (cnt > 1) {
273 goto out; 274 if (trace_selftest_test_global_cnt == 0)
275 goto out;
276 }
274 if (trace_selftest_test_dyn_cnt == 0) 277 if (trace_selftest_test_dyn_cnt == 0)
275 goto out_free; 278 goto out_free;
276 279
@@ -295,7 +298,9 @@ static int trace_selftest_ops(int cnt)
295 unregister_ftrace_function(&test_probe1); 298 unregister_ftrace_function(&test_probe1);
296 unregister_ftrace_function(&test_probe2); 299 unregister_ftrace_function(&test_probe2);
297 unregister_ftrace_function(&test_probe3); 300 unregister_ftrace_function(&test_probe3);
298 unregister_ftrace_function(&test_global); 301 if (cnt > 1)
302 unregister_ftrace_function(tr->ops);
303 ftrace_reset_array_ops(tr);
299 304
300 /* Make sure everything is off */ 305 /* Make sure everything is off */
301 reset_counts(); 306 reset_counts();
@@ -315,9 +320,9 @@ static int trace_selftest_ops(int cnt)
315} 320}
316 321
317/* Test dynamic code modification and ftrace filters */ 322/* Test dynamic code modification and ftrace filters */
318int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 323static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
319 struct trace_array *tr, 324 struct trace_array *tr,
320 int (*func)(void)) 325 int (*func)(void))
321{ 326{
322 int save_ftrace_enabled = ftrace_enabled; 327 int save_ftrace_enabled = ftrace_enabled;
323 unsigned long count; 328 unsigned long count;
@@ -388,7 +393,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
388 } 393 }
389 394
390 /* Test the ops with global tracing running */ 395 /* Test the ops with global tracing running */
391 ret = trace_selftest_ops(1); 396 ret = trace_selftest_ops(tr, 1);
392 trace->reset(tr); 397 trace->reset(tr);
393 398
394 out: 399 out:
@@ -399,7 +404,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
399 404
400 /* Test the ops with global tracing off */ 405 /* Test the ops with global tracing off */
401 if (!ret) 406 if (!ret)
402 ret = trace_selftest_ops(2); 407 ret = trace_selftest_ops(tr, 2);
403 408
404 return ret; 409 return ret;
405} 410}
@@ -802,7 +807,7 @@ out:
802int 807int
803trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) 808trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
804{ 809{
805 unsigned long save_max = tracing_max_latency; 810 unsigned long save_max = tr->max_latency;
806 unsigned long count; 811 unsigned long count;
807 int ret; 812 int ret;
808 813
@@ -814,7 +819,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
814 } 819 }
815 820
816 /* reset the max latency */ 821 /* reset the max latency */
817 tracing_max_latency = 0; 822 tr->max_latency = 0;
818 /* disable interrupts for a bit */ 823 /* disable interrupts for a bit */
819 local_irq_disable(); 824 local_irq_disable();
820 udelay(100); 825 udelay(100);
@@ -841,7 +846,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
841 ret = -1; 846 ret = -1;
842 } 847 }
843 848
844 tracing_max_latency = save_max; 849 tr->max_latency = save_max;
845 850
846 return ret; 851 return ret;
847} 852}
@@ -851,7 +856,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
851int 856int
852trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) 857trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
853{ 858{
854 unsigned long save_max = tracing_max_latency; 859 unsigned long save_max = tr->max_latency;
855 unsigned long count; 860 unsigned long count;
856 int ret; 861 int ret;
857 862
@@ -876,7 +881,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
876 } 881 }
877 882
878 /* reset the max latency */ 883 /* reset the max latency */
879 tracing_max_latency = 0; 884 tr->max_latency = 0;
880 /* disable preemption for a bit */ 885 /* disable preemption for a bit */
881 preempt_disable(); 886 preempt_disable();
882 udelay(100); 887 udelay(100);
@@ -903,7 +908,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
903 ret = -1; 908 ret = -1;
904 } 909 }
905 910
906 tracing_max_latency = save_max; 911 tr->max_latency = save_max;
907 912
908 return ret; 913 return ret;
909} 914}
@@ -913,7 +918,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
913int 918int
914trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) 919trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
915{ 920{
916 unsigned long save_max = tracing_max_latency; 921 unsigned long save_max = tr->max_latency;
917 unsigned long count; 922 unsigned long count;
918 int ret; 923 int ret;
919 924
@@ -938,7 +943,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
938 } 943 }
939 944
940 /* reset the max latency */ 945 /* reset the max latency */
941 tracing_max_latency = 0; 946 tr->max_latency = 0;
942 947
943 /* disable preemption and interrupts for a bit */ 948 /* disable preemption and interrupts for a bit */
944 preempt_disable(); 949 preempt_disable();
@@ -973,7 +978,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
973 } 978 }
974 979
975 /* do the test by disabling interrupts first this time */ 980 /* do the test by disabling interrupts first this time */
976 tracing_max_latency = 0; 981 tr->max_latency = 0;
977 tracing_start(); 982 tracing_start();
978 trace->start(tr); 983 trace->start(tr);
979 984
@@ -1004,7 +1009,7 @@ out:
1004 tracing_start(); 1009 tracing_start();
1005out_no_start: 1010out_no_start:
1006 trace->reset(tr); 1011 trace->reset(tr);
1007 tracing_max_latency = save_max; 1012 tr->max_latency = save_max;
1008 1013
1009 return ret; 1014 return ret;
1010} 1015}
@@ -1057,7 +1062,7 @@ static int trace_wakeup_test_thread(void *data)
1057int 1062int
1058trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) 1063trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1059{ 1064{
1060 unsigned long save_max = tracing_max_latency; 1065 unsigned long save_max = tr->max_latency;
1061 struct task_struct *p; 1066 struct task_struct *p;
1062 struct completion is_ready; 1067 struct completion is_ready;
1063 unsigned long count; 1068 unsigned long count;
@@ -1083,7 +1088,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1083 } 1088 }
1084 1089
1085 /* reset the max latency */ 1090 /* reset the max latency */
1086 tracing_max_latency = 0; 1091 tr->max_latency = 0;
1087 1092
1088 while (p->on_rq) { 1093 while (p->on_rq) {
1089 /* 1094 /*
@@ -1113,7 +1118,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1113 trace->reset(tr); 1118 trace->reset(tr);
1114 tracing_start(); 1119 tracing_start();
1115 1120
1116 tracing_max_latency = save_max; 1121 tr->max_latency = save_max;
1117 1122
1118 /* kill the thread */ 1123 /* kill the thread */
1119 kthread_stop(p); 1124 kthread_stop(p);
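The selftest conversions above follow the same per-instance move: each latency test now brackets its run with the array's own max_latency instead of the old global. A hedged sketch of that bracket (hypothetical test body):

#include "trace.h"      /* struct trace_array with per-instance max_latency */

static int example_latency_selftest(struct trace_array *tr)
{
        unsigned long save_max = tr->max_latency;
        int ret = 0;

        tr->max_latency = 0;            /* measure from a clean slate */
        /* ... provoke the latency under test and check the buffers ... */
        tr->max_latency = save_max;     /* leave the instance as it was found */
        return ret;
}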
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 21b320e5d163..8a4e5cb66a4c 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -51,11 +51,33 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
51int stack_tracer_enabled; 51int stack_tracer_enabled;
52static int last_stack_tracer_enabled; 52static int last_stack_tracer_enabled;
53 53
54static inline void print_max_stack(void)
55{
56 long i;
57 int size;
58
59 pr_emerg(" Depth Size Location (%d entries)\n"
60 " ----- ---- --------\n",
61 max_stack_trace.nr_entries - 1);
62
63 for (i = 0; i < max_stack_trace.nr_entries; i++) {
64 if (stack_dump_trace[i] == ULONG_MAX)
65 break;
66 if (i+1 == max_stack_trace.nr_entries ||
67 stack_dump_trace[i+1] == ULONG_MAX)
68 size = stack_dump_index[i];
69 else
70 size = stack_dump_index[i] - stack_dump_index[i+1];
71
72 pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i],
73 size, (void *)stack_dump_trace[i]);
74 }
75}
76
54static inline void 77static inline void
55check_stack(unsigned long ip, unsigned long *stack) 78check_stack(unsigned long ip, unsigned long *stack)
56{ 79{
57 unsigned long this_size, flags; 80 unsigned long this_size, flags; unsigned long *p, *top, *start;
58 unsigned long *p, *top, *start;
59 static int tracer_frame; 81 static int tracer_frame;
60 int frame_size = ACCESS_ONCE(tracer_frame); 82 int frame_size = ACCESS_ONCE(tracer_frame);
61 int i; 83 int i;
@@ -85,8 +107,12 @@ check_stack(unsigned long ip, unsigned long *stack)
85 107
86 max_stack_size = this_size; 108 max_stack_size = this_size;
87 109
88 max_stack_trace.nr_entries = 0; 110 max_stack_trace.nr_entries = 0;
89 max_stack_trace.skip = 3; 111
112 if (using_ftrace_ops_list_func())
113 max_stack_trace.skip = 4;
114 else
115 max_stack_trace.skip = 3;
90 116
91 save_stack_trace(&max_stack_trace); 117 save_stack_trace(&max_stack_trace);
92 118
@@ -145,8 +171,12 @@ check_stack(unsigned long ip, unsigned long *stack)
145 i++; 171 i++;
146 } 172 }
147 173
148 BUG_ON(current != &init_task && 174 if ((current != &init_task &&
149 *(end_of_stack(current)) != STACK_END_MAGIC); 175 *(end_of_stack(current)) != STACK_END_MAGIC)) {
176 print_max_stack();
177 BUG();
178 }
179
150 out: 180 out:
151 arch_spin_unlock(&max_stack_lock); 181 arch_spin_unlock(&max_stack_lock);
152 local_irq_restore(flags); 182 local_irq_restore(flags);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c082a7441345..3c9b97e6b1f4 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -108,8 +108,8 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
108 * Uprobes-specific fetch functions 108 * Uprobes-specific fetch functions
109 */ 109 */
110#define DEFINE_FETCH_stack(type) \ 110#define DEFINE_FETCH_stack(type) \
111static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ 111static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
112 void *offset, void *dest) \ 112 void *offset, void *dest) \
113{ \ 113{ \
114 *(type *)dest = (type)get_user_stack_nth(regs, \ 114 *(type *)dest = (type)get_user_stack_nth(regs, \
115 ((unsigned long)offset)); \ 115 ((unsigned long)offset)); \
@@ -120,8 +120,8 @@ DEFINE_BASIC_FETCH_FUNCS(stack)
120#define fetch_stack_string_size NULL 120#define fetch_stack_string_size NULL
121 121
122#define DEFINE_FETCH_memory(type) \ 122#define DEFINE_FETCH_memory(type) \
123static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ 123static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
124 void *addr, void *dest) \ 124 void *addr, void *dest) \
125{ \ 125{ \
126 type retval; \ 126 type retval; \
127 void __user *vaddr = (void __force __user *) addr; \ 127 void __user *vaddr = (void __force __user *) addr; \
@@ -136,8 +136,8 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
136 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max 136 * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
137 * length and relative data location. 137 * length and relative data location.
138 */ 138 */
139static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, 139static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
140 void *addr, void *dest) 140 void *addr, void *dest)
141{ 141{
142 long ret; 142 long ret;
143 u32 rloc = *(u32 *)dest; 143 u32 rloc = *(u32 *)dest;
@@ -158,8 +158,8 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
158 } 158 }
159} 159}
160 160
161static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, 161static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
162 void *addr, void *dest) 162 void *addr, void *dest)
163{ 163{
164 int len; 164 int len;
165 void __user *vaddr = (void __force __user *) addr; 165 void __user *vaddr = (void __force __user *) addr;
@@ -184,8 +184,8 @@ static unsigned long translate_user_vaddr(void *file_offset)
184} 184}
185 185
186#define DEFINE_FETCH_file_offset(type) \ 186#define DEFINE_FETCH_file_offset(type) \
187static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\ 187static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \
188 void *offset, void *dest) \ 188 void *offset, void *dest)\
189{ \ 189{ \
190 void *vaddr = (void *)translate_user_vaddr(offset); \ 190 void *vaddr = (void *)translate_user_vaddr(offset); \
191 \ 191 \
@@ -893,6 +893,9 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
893 int ret; 893 int ret;
894 894
895 if (file) { 895 if (file) {
896 if (tu->tp.flags & TP_FLAG_PROFILE)
897 return -EINTR;
898
896 link = kmalloc(sizeof(*link), GFP_KERNEL); 899 link = kmalloc(sizeof(*link), GFP_KERNEL);
897 if (!link) 900 if (!link)
898 return -ENOMEM; 901 return -ENOMEM;
@@ -901,29 +904,40 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
901 list_add_tail_rcu(&link->list, &tu->tp.files); 904 list_add_tail_rcu(&link->list, &tu->tp.files);
902 905
903 tu->tp.flags |= TP_FLAG_TRACE; 906 tu->tp.flags |= TP_FLAG_TRACE;
904 } else 907 } else {
905 tu->tp.flags |= TP_FLAG_PROFILE; 908 if (tu->tp.flags & TP_FLAG_TRACE)
909 return -EINTR;
906 910
907 ret = uprobe_buffer_enable(); 911 tu->tp.flags |= TP_FLAG_PROFILE;
908 if (ret < 0) 912 }
909 return ret;
910 913
911 WARN_ON(!uprobe_filter_is_empty(&tu->filter)); 914 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
912 915
913 if (enabled) 916 if (enabled)
914 return 0; 917 return 0;
915 918
919 ret = uprobe_buffer_enable();
920 if (ret)
921 goto err_flags;
922
916 tu->consumer.filter = filter; 923 tu->consumer.filter = filter;
917 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); 924 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
918 if (ret) { 925 if (ret)
919 if (file) { 926 goto err_buffer;
920 list_del(&link->list); 927
921 kfree(link); 928 return 0;
922 tu->tp.flags &= ~TP_FLAG_TRACE;
923 } else
924 tu->tp.flags &= ~TP_FLAG_PROFILE;
925 }
926 929
930 err_buffer:
931 uprobe_buffer_disable();
932
933 err_flags:
934 if (file) {
935 list_del(&link->list);
936 kfree(link);
937 tu->tp.flags &= ~TP_FLAG_TRACE;
938 } else {
939 tu->tp.flags &= ~TP_FLAG_PROFILE;
940 }
927 return ret; 941 return ret;
928} 942}
929 943
@@ -1009,56 +1023,60 @@ uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
1009 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); 1023 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
1010} 1024}
1011 1025
1012static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) 1026static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
1013{ 1027{
1014 bool done; 1028 bool done;
1015 1029
1016 write_lock(&tu->filter.rwlock); 1030 write_lock(&tu->filter.rwlock);
1017 if (event->hw.tp_target) { 1031 if (event->hw.tp_target) {
1018 /* 1032 list_del(&event->hw.tp_list);
1019 * event->parent != NULL means copy_process(), we can avoid
1020 * uprobe_apply(). current->mm must be probed and we can rely
1021 * on dup_mmap() which preserves the already installed bp's.
1022 *
1023 * attr.enable_on_exec means that exec/mmap will install the
1024 * breakpoints we need.
1025 */
1026 done = tu->filter.nr_systemwide || 1033 done = tu->filter.nr_systemwide ||
1027 event->parent || event->attr.enable_on_exec || 1034 (event->hw.tp_target->flags & PF_EXITING) ||
1028 uprobe_filter_event(tu, event); 1035 uprobe_filter_event(tu, event);
1029 list_add(&event->hw.tp_list, &tu->filter.perf_events);
1030 } else { 1036 } else {
1037 tu->filter.nr_systemwide--;
1031 done = tu->filter.nr_systemwide; 1038 done = tu->filter.nr_systemwide;
1032 tu->filter.nr_systemwide++;
1033 } 1039 }
1034 write_unlock(&tu->filter.rwlock); 1040 write_unlock(&tu->filter.rwlock);
1035 1041
1036 if (!done) 1042 if (!done)
1037 uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); 1043 return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
1038 1044
1039 return 0; 1045 return 0;
1040} 1046}
1041 1047
1042static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) 1048static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
1043{ 1049{
1044 bool done; 1050 bool done;
1051 int err;
1045 1052
1046 write_lock(&tu->filter.rwlock); 1053 write_lock(&tu->filter.rwlock);
1047 if (event->hw.tp_target) { 1054 if (event->hw.tp_target) {
1048 list_del(&event->hw.tp_list); 1055 /*
1056 * event->parent != NULL means copy_process(), we can avoid
1057 * uprobe_apply(). current->mm must be probed and we can rely
1058 * on dup_mmap() which preserves the already installed bp's.
1059 *
1060 * attr.enable_on_exec means that exec/mmap will install the
1061 * breakpoints we need.
1062 */
1049 done = tu->filter.nr_systemwide || 1063 done = tu->filter.nr_systemwide ||
1050 (event->hw.tp_target->flags & PF_EXITING) || 1064 event->parent || event->attr.enable_on_exec ||
1051 uprobe_filter_event(tu, event); 1065 uprobe_filter_event(tu, event);
1066 list_add(&event->hw.tp_list, &tu->filter.perf_events);
1052 } else { 1067 } else {
1053 tu->filter.nr_systemwide--;
1054 done = tu->filter.nr_systemwide; 1068 done = tu->filter.nr_systemwide;
1069 tu->filter.nr_systemwide++;
1055 } 1070 }
1056 write_unlock(&tu->filter.rwlock); 1071 write_unlock(&tu->filter.rwlock);
1057 1072
1058 if (!done) 1073 err = 0;
1059 uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); 1074 if (!done) {
1060 1075 err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
1061 return 0; 1076 if (err)
1077 uprobe_perf_close(tu, event);
1078 }
1079 return err;
1062} 1080}
1063 1081
1064static bool uprobe_perf_filter(struct uprobe_consumer *uc, 1082static bool uprobe_perf_filter(struct uprobe_consumer *uc,
@@ -1197,12 +1215,6 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
1197 1215
1198 current->utask->vaddr = (unsigned long) &udd; 1216 current->utask->vaddr = (unsigned long) &udd;
1199 1217
1200#ifdef CONFIG_PERF_EVENTS
1201 if ((tu->tp.flags & TP_FLAG_TRACE) == 0 &&
1202 !uprobe_perf_filter(&tu->consumer, 0, current->mm))
1203 return UPROBE_HANDLER_REMOVE;
1204#endif
1205
1206 if (WARN_ON_ONCE(!uprobe_cpu_buffer)) 1218 if (WARN_ON_ONCE(!uprobe_cpu_buffer))
1207 return 0; 1219 return 0;
1208 1220
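The probe_event_enable() rework above makes the trace and perf modes mutually exclusive up front and converts the open-coded failure cleanup into a reverse-order unwind through labels. A hedged, self-contained sketch of that unwind shape, with hypothetical stand-ins for the uprobe pieces:

#include <linux/errno.h>

/* Hypothetical stand-ins for the buffer/registration steps used above. */
static int example_buffer_enable(void)   { return 0; }
static void example_buffer_disable(void) { }
static int example_register(void)        { return 0; }
static void example_clear_flags(void)    { }

static int example_enable(void)
{
        int ret;

        ret = example_buffer_enable();  /* resource 1 */
        if (ret)
                goto err_flags;

        ret = example_register();       /* resource 2 */
        if (ret)
                goto err_buffer;

        return 0;

 err_buffer:
        example_buffer_disable();       /* undo resource 1 */
 err_flags:
        example_clear_flags();          /* undo the earlier flag/link setup */
        return ret;
}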
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 6620e5837ce2..3490407dc7b7 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -239,6 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
239 * tracepoint_probe_register - Connect a probe to a tracepoint 239 * tracepoint_probe_register - Connect a probe to a tracepoint
240 * @tp: tracepoint 240 * @tp: tracepoint
241 * @probe: probe handler 241 * @probe: probe handler
242 * @data: tracepoint data
242 * 243 *
243 * Returns 0 if ok, error value on error. 244 * Returns 0 if ok, error value on error.
244 * Note: if @tp is within a module, the caller is responsible for 245 * Note: if @tp is within a module, the caller is responsible for
@@ -264,6 +265,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
264 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint 265 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
265 * @tp: tracepoint 266 * @tp: tracepoint
266 * @probe: probe function pointer 267 * @probe: probe function pointer
268 * @data: tracepoint data
267 * 269 *
268 * Returns 0 if ok, error value on error. 270 * Returns 0 if ok, error value on error.
269 */ 271 */
@@ -490,33 +492,29 @@ static int sys_tracepoint_refcount;
490 492
491void syscall_regfunc(void) 493void syscall_regfunc(void)
492{ 494{
493 unsigned long flags; 495 struct task_struct *p, *t;
494 struct task_struct *g, *t;
495 496
496 if (!sys_tracepoint_refcount) { 497 if (!sys_tracepoint_refcount) {
497 read_lock_irqsave(&tasklist_lock, flags); 498 read_lock(&tasklist_lock);
498 do_each_thread(g, t) { 499 for_each_process_thread(p, t) {
499 /* Skip kernel threads. */ 500 set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
500 if (t->mm) 501 }
501 set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); 502 read_unlock(&tasklist_lock);
502 } while_each_thread(g, t);
503 read_unlock_irqrestore(&tasklist_lock, flags);
504 } 503 }
505 sys_tracepoint_refcount++; 504 sys_tracepoint_refcount++;
506} 505}
507 506
508void syscall_unregfunc(void) 507void syscall_unregfunc(void)
509{ 508{
510 unsigned long flags; 509 struct task_struct *p, *t;
511 struct task_struct *g, *t;
512 510
513 sys_tracepoint_refcount--; 511 sys_tracepoint_refcount--;
514 if (!sys_tracepoint_refcount) { 512 if (!sys_tracepoint_refcount) {
515 read_lock_irqsave(&tasklist_lock, flags); 513 read_lock(&tasklist_lock);
516 do_each_thread(g, t) { 514 for_each_process_thread(p, t) {
517 clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); 515 clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
518 } while_each_thread(g, t); 516 }
519 read_unlock_irqrestore(&tasklist_lock, flags); 517 read_unlock(&tasklist_lock);
520 } 518 }
521} 519}
522#endif 520#endif
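syscall_regfunc()/syscall_unregfunc() above switch from the do_each_thread()/while_each_thread() pair under read_lock_irqsave() to for_each_process_thread() under a plain read_lock(), and no longer skip kernel threads. A hedged sketch of that iteration style (illustrative flag argument):

#include <linux/sched.h>

static void example_mark_all_threads(int flag)
{
        struct task_struct *p, *t;

        read_lock(&tasklist_lock);
        for_each_process_thread(p, t)
                set_tsk_thread_flag(t, flag);   /* e.g. TIF_SYSCALL_TRACEPOINT */
        read_unlock(&tasklist_lock);
}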
diff --git a/kernel/user.c b/kernel/user.c
index 294fc6a94168..4efa39350e44 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
87struct user_struct root_user = { 87struct user_struct root_user = {
88 .__count = ATOMIC_INIT(1), 88 .__count = ATOMIC_INIT(1),
89 .processes = ATOMIC_INIT(1), 89 .processes = ATOMIC_INIT(1),
90 .files = ATOMIC_INIT(0),
91 .sigpending = ATOMIC_INIT(0), 90 .sigpending = ATOMIC_INIT(0),
92 .locked_shm = 0, 91 .locked_shm = 0,
93 .uid = GLOBAL_ROOT_UID, 92 .uid = GLOBAL_ROOT_UID,
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index bf71b4b2d632..fcc02560fd6b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -286,7 +286,7 @@ EXPORT_SYMBOL(from_kuid_munged);
286/** 286/**
287 * make_kgid - Map a user-namespace gid pair into a kgid. 287 * make_kgid - Map a user-namespace gid pair into a kgid.
288 * @ns: User namespace that the gid is in 288 * @ns: User namespace that the gid is in
289 * @uid: group identifier 289 * @gid: group identifier
290 * 290 *
291 * Maps a user-namespace gid pair into a kernel internal kgid, 291 * Maps a user-namespace gid pair into a kernel internal kgid,
292 * and returns that kgid. 292 * and returns that kgid.
@@ -482,7 +482,8 @@ static int projid_m_show(struct seq_file *seq, void *v)
482 return 0; 482 return 0;
483} 483}
484 484
485static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) 485static void *m_start(struct seq_file *seq, loff_t *ppos,
486 struct uid_gid_map *map)
486{ 487{
487 struct uid_gid_extent *extent = NULL; 488 struct uid_gid_extent *extent = NULL;
488 loff_t pos = *ppos; 489 loff_t pos = *ppos;
@@ -546,7 +547,8 @@ struct seq_operations proc_projid_seq_operations = {
546 .show = projid_m_show, 547 .show = projid_m_show,
547}; 548};
548 549
549static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) 550static bool mappings_overlap(struct uid_gid_map *new_map,
551 struct uid_gid_extent *extent)
550{ 552{
551 u32 upper_first, lower_first, upper_last, lower_last; 553 u32 upper_first, lower_first, upper_last, lower_last;
552 unsigned idx; 554 unsigned idx;
@@ -653,7 +655,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
653 ret = -EINVAL; 655 ret = -EINVAL;
654 pos = kbuf; 656 pos = kbuf;
655 new_map.nr_extents = 0; 657 new_map.nr_extents = 0;
656 for (;pos; pos = next_line) { 658 for (; pos; pos = next_line) {
657 extent = &new_map.extent[new_map.nr_extents]; 659 extent = &new_map.extent[new_map.nr_extents];
658 660
659 /* Find the end of line and ensure I don't look past it */ 661 /* Find the end of line and ensure I don't look past it */
@@ -687,13 +689,16 @@ static ssize_t map_write(struct file *file, const char __user *buf,
687 689
688 /* Verify we have been given valid starting values */ 690 /* Verify we have been given valid starting values */
689 if ((extent->first == (u32) -1) || 691 if ((extent->first == (u32) -1) ||
690 (extent->lower_first == (u32) -1 )) 692 (extent->lower_first == (u32) -1))
691 goto out; 693 goto out;
692 694
693 /* Verify count is not zero and does not cause the extent to wrap */ 695 /* Verify count is not zero and does not cause the
696 * extent to wrap
697 */
694 if ((extent->first + extent->count) <= extent->first) 698 if ((extent->first + extent->count) <= extent->first)
695 goto out; 699 goto out;
696 if ((extent->lower_first + extent->count) <= extent->lower_first) 700 if ((extent->lower_first + extent->count) <=
701 extent->lower_first)
697 goto out; 702 goto out;
698 703
699 /* Do the ranges in extent overlap any previous extents? */ 704 /* Do the ranges in extent overlap any previous extents? */
@@ -751,7 +756,8 @@ out:
751 return ret; 756 return ret;
752} 757}
753 758
754ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 759ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
760 size_t size, loff_t *ppos)
755{ 761{
756 struct seq_file *seq = file->private_data; 762 struct seq_file *seq = file->private_data;
757 struct user_namespace *ns = seq->private; 763 struct user_namespace *ns = seq->private;
@@ -767,7 +773,8 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
767 &ns->uid_map, &ns->parent->uid_map); 773 &ns->uid_map, &ns->parent->uid_map);
768} 774}
769 775
770ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 776ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
777 size_t size, loff_t *ppos)
771{ 778{
772 struct seq_file *seq = file->private_data; 779 struct seq_file *seq = file->private_data;
773 struct user_namespace *ns = seq->private; 780 struct user_namespace *ns = seq->private;
@@ -783,7 +790,8 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
783 &ns->gid_map, &ns->parent->gid_map); 790 &ns->gid_map, &ns->parent->gid_map);
784} 791}
785 792
786ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) 793ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
794 size_t size, loff_t *ppos)
787{ 795{
788 struct seq_file *seq = file->private_data; 796 struct seq_file *seq = file->private_data;
789 struct user_namespace *ns = seq->private; 797 struct user_namespace *ns = seq->private;
@@ -800,7 +808,7 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
800 &ns->projid_map, &ns->parent->projid_map); 808 &ns->projid_map, &ns->parent->projid_map);
801} 809}
802 810
803static bool new_idmap_permitted(const struct file *file, 811static bool new_idmap_permitted(const struct file *file,
804 struct user_namespace *ns, int cap_setid, 812 struct user_namespace *ns, int cap_setid,
805 struct uid_gid_map *new_map) 813 struct uid_gid_map *new_map)
806{ 814{
@@ -811,8 +819,7 @@ static bool new_idmap_permitted(const struct file *file,
811 kuid_t uid = make_kuid(ns->parent, id); 819 kuid_t uid = make_kuid(ns->parent, id);
812 if (uid_eq(uid, file->f_cred->fsuid)) 820 if (uid_eq(uid, file->f_cred->fsuid))
813 return true; 821 return true;
814 } 822 } else if (cap_setid == CAP_SETGID) {
815 else if (cap_setid == CAP_SETGID) {
816 kgid_t gid = make_kgid(ns->parent, id); 823 kgid_t gid = make_kgid(ns->parent, id);
817 if (gid_eq(gid, file->f_cred->fsgid)) 824 if (gid_eq(gid, file->f_cred->fsgid))
818 return true; 825 return true;
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 4f69f9a5e221..c8eac43267e9 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -17,7 +17,7 @@
17 17
18#ifdef CONFIG_PROC_SYSCTL 18#ifdef CONFIG_PROC_SYSCTL
19 19
20static void *get_uts(ctl_table *table, int write) 20static void *get_uts(struct ctl_table *table, int write)
21{ 21{
22 char *which = table->data; 22 char *which = table->data;
23 struct uts_namespace *uts_ns; 23 struct uts_namespace *uts_ns;
@@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write)
32 return which; 32 return which;
33} 33}
34 34
35static void put_uts(ctl_table *table, int write, void *which) 35static void put_uts(struct ctl_table *table, int write, void *which)
36{ 36{
37 if (!write) 37 if (!write)
38 up_read(&uts_sem); 38 up_read(&uts_sem);
@@ -44,14 +44,14 @@ static void put_uts(ctl_table *table, int write, void *which)
44 * Special case of dostring for the UTS structure. This has locks 44 * Special case of dostring for the UTS structure. This has locks
45 * to observe. Should this be in kernel/sys.c ???? 45 * to observe. Should this be in kernel/sys.c ????
46 */ 46 */
47static int proc_do_uts_string(ctl_table *table, int write, 47static int proc_do_uts_string(struct ctl_table *table, int write,
48 void __user *buffer, size_t *lenp, loff_t *ppos) 48 void __user *buffer, size_t *lenp, loff_t *ppos)
49{ 49{
50 struct ctl_table uts_table; 50 struct ctl_table uts_table;
51 int r; 51 int r;
52 memcpy(&uts_table, table, sizeof(uts_table)); 52 memcpy(&uts_table, table, sizeof(uts_table));
53 uts_table.data = get_uts(table, write); 53 uts_table.data = get_uts(table, write);
54 r = proc_dostring(&uts_table,write,buffer,lenp, ppos); 54 r = proc_dostring(&uts_table, write, buffer, lenp, ppos);
55 put_uts(table, write, uts_table.data); 55 put_uts(table, write, uts_table.data);
56 56
57 if (write) 57 if (write)
@@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void)
135 return 0; 135 return 0;
136} 136}
137 137
138__initcall(utsname_sysctl_init); 138device_initcall(utsname_sysctl_init);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 516203e665fc..c3319bd1b040 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,6 +31,12 @@
31 31
32int watchdog_user_enabled = 1; 32int watchdog_user_enabled = 1;
33int __read_mostly watchdog_thresh = 10; 33int __read_mostly watchdog_thresh = 10;
34#ifdef CONFIG_SMP
35int __read_mostly sysctl_softlockup_all_cpu_backtrace;
36#else
37#define sysctl_softlockup_all_cpu_backtrace 0
38#endif
39
34static int __read_mostly watchdog_running; 40static int __read_mostly watchdog_running;
35static u64 __read_mostly sample_period; 41static u64 __read_mostly sample_period;
36 42
@@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
47static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 53static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
48static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); 54static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
49#endif 55#endif
56static unsigned long soft_lockup_nmi_warn;
50 57
51/* boot commands */ 58/* boot commands */
52/* 59/*
@@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str)
95} 102}
96__setup("nosoftlockup", nosoftlockup_setup); 103__setup("nosoftlockup", nosoftlockup_setup);
97/* */ 104/* */
105#ifdef CONFIG_SMP
106static int __init softlockup_all_cpu_backtrace_setup(char *str)
107{
108 sysctl_softlockup_all_cpu_backtrace =
109 !!simple_strtol(str, NULL, 0);
110 return 1;
111}
112__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
113#endif
98 114
99/* 115/*
100 * Hard-lockup warnings should be triggered after just a few seconds. Soft- 116 * Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
271 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); 287 unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
272 struct pt_regs *regs = get_irq_regs(); 288 struct pt_regs *regs = get_irq_regs();
273 int duration; 289 int duration;
290 int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
274 291
275 /* kick the hardlockup detector */ 292 /* kick the hardlockup detector */
276 watchdog_interrupt_count(); 293 watchdog_interrupt_count();
@@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
317 if (__this_cpu_read(soft_watchdog_warn) == true) 334 if (__this_cpu_read(soft_watchdog_warn) == true)
318 return HRTIMER_RESTART; 335 return HRTIMER_RESTART;
319 336
337 if (softlockup_all_cpu_backtrace) {
338 /* Prevent multiple soft-lockup reports if one cpu is already
339 * engaged in dumping cpu back traces
340 */
341 if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
342 /* Someone else will report us. Let's give up */
343 __this_cpu_write(soft_watchdog_warn, true);
344 return HRTIMER_RESTART;
345 }
346 }
347
320 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 348 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
321 smp_processor_id(), duration, 349 smp_processor_id(), duration,
322 current->comm, task_pid_nr(current)); 350 current->comm, task_pid_nr(current));
@@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
327 else 355 else
328 dump_stack(); 356 dump_stack();
329 357
358 if (softlockup_all_cpu_backtrace) {
359 /* Avoid generating two back traces for current
360 * given that one is already made above
361 */
362 trigger_allbutself_cpu_backtrace();
363
364 clear_bit(0, &soft_lockup_nmi_warn);
365 /* Barrier to sync with other cpus */
366 smp_mb__after_atomic();
367 }
368
330 if (softlockup_panic) 369 if (softlockup_panic)
331 panic("softlockup: hung tasks"); 370 panic("softlockup: hung tasks");
332 __this_cpu_write(soft_watchdog_warn, true); 371 __this_cpu_write(soft_watchdog_warn, true);
@@ -527,10 +566,8 @@ static void update_timers_all_cpus(void)
527 int cpu; 566 int cpu;
528 567
529 get_online_cpus(); 568 get_online_cpus();
530 preempt_disable();
531 for_each_online_cpu(cpu) 569 for_each_online_cpu(cpu)
532 update_timers(cpu); 570 update_timers(cpu);
533 preempt_enable();
534 put_online_cpus(); 571 put_online_cpus();
535} 572}
536 573
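The watchdog changes above add a softlockup_all_cpu_backtrace= switch and use a single shared bit so only the first CPU that detects a soft lockup dumps the other CPUs' stacks. A hedged sketch of that gating, wrapping the helpers used in the hunk (hypothetical function and variable names):

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/nmi.h>          /* trigger_allbutself_cpu_backtrace() */

/* Bit 0 marks "a backtrace dump is already in progress". */
static unsigned long example_backtrace_lock;

static void example_report_lockup(void)
{
        if (test_and_set_bit(0, &example_backtrace_lock))
                return;         /* another CPU is already dumping; back off */

        trigger_allbutself_cpu_backtrace();     /* NMI backtrace of other CPUs */

        clear_bit(0, &example_backtrace_lock);
        smp_mb__after_atomic();                 /* barrier to sync with other CPUs */
}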
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0ee63af30bd1..35974ac69600 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -65,15 +65,12 @@ enum {
65 * be executing on any CPU. The pool behaves as an unbound one. 65 * be executing on any CPU. The pool behaves as an unbound one.
66 * 66 *
67 * Note that DISASSOCIATED should be flipped only while holding 67 * Note that DISASSOCIATED should be flipped only while holding
68 * manager_mutex to avoid changing binding state while 68 * attach_mutex to avoid changing binding state while
69 * create_worker() is in progress. 69 * worker_attach_to_pool() is in progress.
70 */ 70 */
71 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
72 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ 71 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
73 POOL_FREEZING = 1 << 3, /* freeze in progress */
74 72
75 /* worker flags */ 73 /* worker flags */
76 WORKER_STARTED = 1 << 0, /* started */
77 WORKER_DIE = 1 << 1, /* die die die */ 74 WORKER_DIE = 1 << 1, /* die die die */
78 WORKER_IDLE = 1 << 2, /* is idle */ 75 WORKER_IDLE = 1 << 2, /* is idle */
79 WORKER_PREP = 1 << 3, /* preparing to run works */ 76 WORKER_PREP = 1 << 3, /* preparing to run works */
@@ -100,10 +97,10 @@ enum {
100 97
101 /* 98 /*
102 * Rescue workers are used only on emergencies and shared by 99 * Rescue workers are used only on emergencies and shared by
103 * all cpus. Give -20. 100 * all cpus. Give MIN_NICE.
104 */ 101 */
105 RESCUER_NICE_LEVEL = -20, 102 RESCUER_NICE_LEVEL = MIN_NICE,
106 HIGHPRI_NICE_LEVEL = -20, 103 HIGHPRI_NICE_LEVEL = MIN_NICE,
107 104
108 WQ_NAME_LEN = 24, 105 WQ_NAME_LEN = 24,
109}; 106};
@@ -124,8 +121,7 @@ enum {
124 * cpu or grabbing pool->lock is enough for read access. If 121 * cpu or grabbing pool->lock is enough for read access. If
125 * POOL_DISASSOCIATED is set, it's identical to L. 122 * POOL_DISASSOCIATED is set, it's identical to L.
126 * 123 *
127 * MG: pool->manager_mutex and pool->lock protected. Writes require both 124 * A: pool->attach_mutex protected.
128 * locks. Reads can happen under either lock.
129 * 125 *
130 * PL: wq_pool_mutex protected. 126 * PL: wq_pool_mutex protected.
131 * 127 *
@@ -163,8 +159,11 @@ struct worker_pool {
163 159
164 /* see manage_workers() for details on the two manager mutexes */ 160 /* see manage_workers() for details on the two manager mutexes */
165 struct mutex manager_arb; /* manager arbitration */ 161 struct mutex manager_arb; /* manager arbitration */
166 struct mutex manager_mutex; /* manager exclusion */ 162 struct mutex attach_mutex; /* attach/detach exclusion */
167 struct idr worker_idr; /* MG: worker IDs and iteration */ 163 struct list_head workers; /* A: attached workers */
164 struct completion *detach_completion; /* all workers detached */
165
166 struct ida worker_ida; /* worker IDs for task name */
168 167
169 struct workqueue_attrs *attrs; /* I: worker attributes */ 168 struct workqueue_attrs *attrs; /* I: worker attributes */
170 struct hlist_node hash_node; /* PL: unbound_pool_hash node */ 169 struct hlist_node hash_node; /* PL: unbound_pool_hash node */
@@ -340,16 +339,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
340 lockdep_is_held(&wq->mutex), \ 339 lockdep_is_held(&wq->mutex), \
341 "sched RCU or wq->mutex should be held") 340 "sched RCU or wq->mutex should be held")
342 341
343#ifdef CONFIG_LOCKDEP
344#define assert_manager_or_pool_lock(pool) \
345 WARN_ONCE(debug_locks && \
346 !lockdep_is_held(&(pool)->manager_mutex) && \
347 !lockdep_is_held(&(pool)->lock), \
348 "pool->manager_mutex or ->lock should be held")
349#else
350#define assert_manager_or_pool_lock(pool) do { } while (0)
351#endif
352
353#define for_each_cpu_worker_pool(pool, cpu) \ 342#define for_each_cpu_worker_pool(pool, cpu) \
354 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ 343 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
355 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ 344 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
@@ -375,17 +364,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
375/** 364/**
376 * for_each_pool_worker - iterate through all workers of a worker_pool 365 * for_each_pool_worker - iterate through all workers of a worker_pool
377 * @worker: iteration cursor 366 * @worker: iteration cursor
378 * @wi: integer used for iteration
379 * @pool: worker_pool to iterate workers of 367 * @pool: worker_pool to iterate workers of
380 * 368 *
381 * This must be called with either @pool->manager_mutex or ->lock held. 369 * This must be called with @pool->attach_mutex.
382 * 370 *
383 * The if/else clause exists only for the lockdep assertion and can be 371 * The if/else clause exists only for the lockdep assertion and can be
384 * ignored. 372 * ignored.
385 */ 373 */
386#define for_each_pool_worker(worker, wi, pool) \ 374#define for_each_pool_worker(worker, pool) \
387 idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ 375 list_for_each_entry((worker), &(pool)->workers, node) \
388 if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ 376 if (({ lockdep_assert_held(&pool->attach_mutex); false; })) { } \
389 else 377 else
390 378
391/** 379/**
@@ -763,13 +751,6 @@ static bool need_to_create_worker(struct worker_pool *pool)
763 return need_more_worker(pool) && !may_start_working(pool); 751 return need_more_worker(pool) && !may_start_working(pool);
764} 752}
765 753
766/* Do I need to be the manager? */
767static bool need_to_manage_workers(struct worker_pool *pool)
768{
769 return need_to_create_worker(pool) ||
770 (pool->flags & POOL_MANAGE_WORKERS);
771}
772
773/* Do we have too many workers and should some go away? */ 754/* Do we have too many workers and should some go away? */
774static bool too_many_workers(struct worker_pool *pool) 755static bool too_many_workers(struct worker_pool *pool)
775{ 756{
@@ -791,8 +772,8 @@ static bool too_many_workers(struct worker_pool *pool)
791 * Wake up functions. 772 * Wake up functions.
792 */ 773 */
793 774
794/* Return the first worker. Safe with preemption disabled */ 775/* Return the first idle worker. Safe with preemption disabled */
795static struct worker *first_worker(struct worker_pool *pool) 776static struct worker *first_idle_worker(struct worker_pool *pool)
796{ 777{
797 if (unlikely(list_empty(&pool->idle_list))) 778 if (unlikely(list_empty(&pool->idle_list)))
798 return NULL; 779 return NULL;
@@ -811,7 +792,7 @@ static struct worker *first_worker(struct worker_pool *pool)
811 */ 792 */
812static void wake_up_worker(struct worker_pool *pool) 793static void wake_up_worker(struct worker_pool *pool)
813{ 794{
814 struct worker *worker = first_worker(pool); 795 struct worker *worker = first_idle_worker(pool);
815 796
816 if (likely(worker)) 797 if (likely(worker))
817 wake_up_process(worker->task); 798 wake_up_process(worker->task);
@@ -885,7 +866,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
885 */ 866 */
886 if (atomic_dec_and_test(&pool->nr_running) && 867 if (atomic_dec_and_test(&pool->nr_running) &&
887 !list_empty(&pool->worklist)) 868 !list_empty(&pool->worklist))
888 to_wakeup = first_worker(pool); 869 to_wakeup = first_idle_worker(pool);
889 return to_wakeup ? to_wakeup->task : NULL; 870 return to_wakeup ? to_wakeup->task : NULL;
890} 871}
891 872
@@ -1621,70 +1602,6 @@ static void worker_leave_idle(struct worker *worker)
1621 list_del_init(&worker->entry); 1602 list_del_init(&worker->entry);
1622} 1603}
1623 1604
1624/**
1625 * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
1626 * @pool: target worker_pool
1627 *
1628 * Bind %current to the cpu of @pool if it is associated and lock @pool.
1629 *
1630 * Works which are scheduled while the cpu is online must at least be
1631 * scheduled to a worker which is bound to the cpu so that if they are
1632 * flushed from cpu callbacks while cpu is going down, they are
1633 * guaranteed to execute on the cpu.
1634 *
1635 * This function is to be used by unbound workers and rescuers to bind
1636 * themselves to the target cpu and may race with cpu going down or
1637 * coming online. kthread_bind() can't be used because it may put the
1638 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1639 * verbatim as it's best effort and blocking and pool may be
1640 * [dis]associated in the meantime.
1641 *
1642 * This function tries set_cpus_allowed() and locks pool and verifies the
1643 * binding against %POOL_DISASSOCIATED which is set during
1644 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1645 * enters idle state or fetches works without dropping lock, it can
1646 * guarantee the scheduling requirement described in the first paragraph.
1647 *
1648 * CONTEXT:
1649 * Might sleep. Called without any lock but returns with pool->lock
1650 * held.
1651 *
1652 * Return:
1653 * %true if the associated pool is online (@worker is successfully
1654 * bound), %false if offline.
1655 */
1656static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
1657__acquires(&pool->lock)
1658{
1659 while (true) {
1660 /*
1661 * The following call may fail, succeed or succeed
1662 * without actually migrating the task to the cpu if
1663 * it races with cpu hotunplug operation. Verify
1664 * against POOL_DISASSOCIATED.
1665 */
1666 if (!(pool->flags & POOL_DISASSOCIATED))
1667 set_cpus_allowed_ptr(current, pool->attrs->cpumask);
1668
1669 spin_lock_irq(&pool->lock);
1670 if (pool->flags & POOL_DISASSOCIATED)
1671 return false;
1672 if (task_cpu(current) == pool->cpu &&
1673 cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
1674 return true;
1675 spin_unlock_irq(&pool->lock);
1676
1677 /*
1678 * We've raced with CPU hot[un]plug. Give it a breather
1679 * and retry migration. cond_resched() is required here;
1680 * otherwise, we might deadlock against cpu_stop trying to
1681 * bring down the CPU on non-preemptive kernel.
1682 */
1683 cpu_relax();
1684 cond_resched();
1685 }
1686}
1687
1688static struct worker *alloc_worker(void) 1605static struct worker *alloc_worker(void)
1689{ 1606{
1690 struct worker *worker; 1607 struct worker *worker;
@@ -1693,6 +1610,7 @@ static struct worker *alloc_worker(void)
1693 if (worker) { 1610 if (worker) {
1694 INIT_LIST_HEAD(&worker->entry); 1611 INIT_LIST_HEAD(&worker->entry);
1695 INIT_LIST_HEAD(&worker->scheduled); 1612 INIT_LIST_HEAD(&worker->scheduled);
1613 INIT_LIST_HEAD(&worker->node);
1696 /* on creation a worker is in !idle && prep state */ 1614 /* on creation a worker is in !idle && prep state */
1697 worker->flags = WORKER_PREP; 1615 worker->flags = WORKER_PREP;
1698 } 1616 }
@@ -1700,12 +1618,68 @@ static struct worker *alloc_worker(void)
1700} 1618}
1701 1619
1702/** 1620/**
1621 * worker_attach_to_pool() - attach a worker to a pool
1622 * @worker: worker to be attached
1623 * @pool: the target pool
1624 *
1625 * Attach @worker to @pool. Once attached, the %WORKER_UNBOUND flag and
1626 * cpu-binding of @worker are kept coordinated with the pool across
1627 * cpu-[un]hotplugs.
1628 */
1629static void worker_attach_to_pool(struct worker *worker,
1630 struct worker_pool *pool)
1631{
1632 mutex_lock(&pool->attach_mutex);
1633
1634 /*
1635 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1636 * online CPUs. It'll be re-applied when any of the CPUs come up.
1637 */
1638 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1639
1640 /*
1641 * The pool->attach_mutex ensures %POOL_DISASSOCIATED remains
1642 * stable across this function. See the comments above the
1643 * flag definition for details.
1644 */
1645 if (pool->flags & POOL_DISASSOCIATED)
1646 worker->flags |= WORKER_UNBOUND;
1647
1648 list_add_tail(&worker->node, &pool->workers);
1649
1650 mutex_unlock(&pool->attach_mutex);
1651}
1652
1653/**
1654 * worker_detach_from_pool() - detach a worker from its pool
1655 * @worker: worker which is attached to its pool
1656 * @pool: the pool @worker is attached to
1657 *
1658 * Undo the attachment made by worker_attach_to_pool(). The detaching
1659 * worker must not access @pool afterwards unless it holds another
1660 * reference to it.
1661 */
1662static void worker_detach_from_pool(struct worker *worker,
1663 struct worker_pool *pool)
1664{
1665 struct completion *detach_completion = NULL;
1666
1667 mutex_lock(&pool->attach_mutex);
1668 list_del(&worker->node);
1669 if (list_empty(&pool->workers))
1670 detach_completion = pool->detach_completion;
1671 mutex_unlock(&pool->attach_mutex);
1672
1673 if (detach_completion)
1674 complete(detach_completion);
1675}
1676
1677/**
1703 * create_worker - create a new workqueue worker 1678 * create_worker - create a new workqueue worker
1704 * @pool: pool the new worker will belong to 1679 * @pool: pool the new worker will belong to
1705 * 1680 *
1706 * Create a new worker which is bound to @pool. The returned worker 1681 * Create a new worker which is attached to @pool. The new worker must be
1707 * can be started by calling start_worker() or destroyed using 1682 * started by start_worker().
1708 * destroy_worker().
1709 * 1683 *
1710 * CONTEXT: 1684 * CONTEXT:
1711 * Might sleep. Does GFP_KERNEL allocations. 1685 * Might sleep. Does GFP_KERNEL allocations.
@@ -1719,19 +1693,8 @@ static struct worker *create_worker(struct worker_pool *pool)
1719 int id = -1; 1693 int id = -1;
1720 char id_buf[16]; 1694 char id_buf[16];
1721 1695
1722 lockdep_assert_held(&pool->manager_mutex); 1696 /* ID is needed to determine kthread name */
1723 1697 id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
1724 /*
1725 * ID is needed to determine kthread name. Allocate ID first
1726 * without installing the pointer.
1727 */
1728 idr_preload(GFP_KERNEL);
1729 spin_lock_irq(&pool->lock);
1730
1731 id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
1732
1733 spin_unlock_irq(&pool->lock);
1734 idr_preload_end();
1735 if (id < 0) 1698 if (id < 0)
1736 goto fail; 1699 goto fail;
1737 1700
@@ -1758,33 +1721,14 @@ static struct worker *create_worker(struct worker_pool *pool)
1758 /* prevent userland from meddling with cpumask of workqueue workers */ 1721 /* prevent userland from meddling with cpumask of workqueue workers */
1759 worker->task->flags |= PF_NO_SETAFFINITY; 1722 worker->task->flags |= PF_NO_SETAFFINITY;
1760 1723
1761 /* 1724 /* successful, attach the worker to the pool */
1762 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any 1725 worker_attach_to_pool(worker, pool);
1763 * online CPUs. It'll be re-applied when any of the CPUs come up.
1764 */
1765 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1766
1767 /*
1768 * The caller is responsible for ensuring %POOL_DISASSOCIATED
1769 * remains stable across this function. See the comments above the
1770 * flag definition for details.
1771 */
1772 if (pool->flags & POOL_DISASSOCIATED)
1773 worker->flags |= WORKER_UNBOUND;
1774
1775 /* successful, commit the pointer to idr */
1776 spin_lock_irq(&pool->lock);
1777 idr_replace(&pool->worker_idr, worker, worker->id);
1778 spin_unlock_irq(&pool->lock);
1779 1726
1780 return worker; 1727 return worker;
1781 1728
1782fail: 1729fail:
1783 if (id >= 0) { 1730 if (id >= 0)
1784 spin_lock_irq(&pool->lock); 1731 ida_simple_remove(&pool->worker_ida, id);
1785 idr_remove(&pool->worker_idr, id);
1786 spin_unlock_irq(&pool->lock);
1787 }
1788 kfree(worker); 1732 kfree(worker);
1789 return NULL; 1733 return NULL;
1790} 1734}
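create_worker() above drops the idr_preload()/idr_alloc() dance under pool->lock in favour of ida_simple_get()/ida_simple_remove(), which now only hand out the integer ID. A rough sketch of that allocation pattern follows, under the assumption that the ida was ida_init()'d at pool setup; the "example_*" names are illustrative, not workqueue symbols:

#include <linux/idr.h>
#include <linux/slab.h>

/*
 * Sketch of the ID-allocation pattern create_worker() switches to.  The
 * ida is assumed to have been ida_init()'d when the pool was set up;
 * "example_*" names are illustrative, not workqueue symbols.
 */
struct example_pool {
	struct ida worker_ida;
};

struct example_worker {
	int id;
};

static int example_make_worker(struct example_pool *pool)
{
	struct example_worker *worker;
	int id;

	/* no pool->lock or idr_preload() needed for the bare ID */
	id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
	if (id < 0)
		return id;

	worker = kzalloc(sizeof(*worker), GFP_KERNEL);
	if (!worker) {
		ida_simple_remove(&pool->worker_ida, id);	/* paired release */
		return -ENOMEM;
	}

	worker->id = id;
	/* ... name the kthread after @id and attach it to the pool ... */
	return 0;
}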
@@ -1800,7 +1744,6 @@ fail:
1800 */ 1744 */
1801static void start_worker(struct worker *worker) 1745static void start_worker(struct worker *worker)
1802{ 1746{
1803 worker->flags |= WORKER_STARTED;
1804 worker->pool->nr_workers++; 1747 worker->pool->nr_workers++;
1805 worker_enter_idle(worker); 1748 worker_enter_idle(worker);
1806 wake_up_process(worker->task); 1749 wake_up_process(worker->task);
@@ -1818,8 +1761,6 @@ static int create_and_start_worker(struct worker_pool *pool)
1818{ 1761{
1819 struct worker *worker; 1762 struct worker *worker;
1820 1763
1821 mutex_lock(&pool->manager_mutex);
1822
1823 worker = create_worker(pool); 1764 worker = create_worker(pool);
1824 if (worker) { 1765 if (worker) {
1825 spin_lock_irq(&pool->lock); 1766 spin_lock_irq(&pool->lock);
@@ -1827,8 +1768,6 @@ static int create_and_start_worker(struct worker_pool *pool)
1827 spin_unlock_irq(&pool->lock); 1768 spin_unlock_irq(&pool->lock);
1828 } 1769 }
1829 1770
1830 mutex_unlock(&pool->manager_mutex);
1831
1832 return worker ? 0 : -ENOMEM; 1771 return worker ? 0 : -ENOMEM;
1833} 1772}
1834 1773
@@ -1836,46 +1775,30 @@ static int create_and_start_worker(struct worker_pool *pool)
1836 * destroy_worker - destroy a workqueue worker 1775 * destroy_worker - destroy a workqueue worker
1837 * @worker: worker to be destroyed 1776 * @worker: worker to be destroyed
1838 * 1777 *
1839 * Destroy @worker and adjust @pool stats accordingly. 1778 * Destroy @worker and adjust @pool stats accordingly. The worker should
1779 * be idle.
1840 * 1780 *
1841 * CONTEXT: 1781 * CONTEXT:
1842 * spin_lock_irq(pool->lock) which is released and regrabbed. 1782 * spin_lock_irq(pool->lock).
1843 */ 1783 */
1844static void destroy_worker(struct worker *worker) 1784static void destroy_worker(struct worker *worker)
1845{ 1785{
1846 struct worker_pool *pool = worker->pool; 1786 struct worker_pool *pool = worker->pool;
1847 1787
1848 lockdep_assert_held(&pool->manager_mutex);
1849 lockdep_assert_held(&pool->lock); 1788 lockdep_assert_held(&pool->lock);
1850 1789
1851 /* sanity check frenzy */ 1790 /* sanity check frenzy */
1852 if (WARN_ON(worker->current_work) || 1791 if (WARN_ON(worker->current_work) ||
1853 WARN_ON(!list_empty(&worker->scheduled))) 1792 WARN_ON(!list_empty(&worker->scheduled)) ||
1793 WARN_ON(!(worker->flags & WORKER_IDLE)))
1854 return; 1794 return;
1855 1795
1856 if (worker->flags & WORKER_STARTED) 1796 pool->nr_workers--;
1857 pool->nr_workers--; 1797 pool->nr_idle--;
1858 if (worker->flags & WORKER_IDLE)
1859 pool->nr_idle--;
1860
1861 /*
1862 * Once WORKER_DIE is set, the kworker may destroy itself at any
1863 * point. Pin to ensure the task stays until we're done with it.
1864 */
1865 get_task_struct(worker->task);
1866 1798
1867 list_del_init(&worker->entry); 1799 list_del_init(&worker->entry);
1868 worker->flags |= WORKER_DIE; 1800 worker->flags |= WORKER_DIE;
1869 1801 wake_up_process(worker->task);
1870 idr_remove(&pool->worker_idr, worker->id);
1871
1872 spin_unlock_irq(&pool->lock);
1873
1874 kthread_stop(worker->task);
1875 put_task_struct(worker->task);
1876 kfree(worker);
1877
1878 spin_lock_irq(&pool->lock);
1879} 1802}
1880 1803
1881static void idle_worker_timeout(unsigned long __pool) 1804static void idle_worker_timeout(unsigned long __pool)
@@ -1884,7 +1807,7 @@ static void idle_worker_timeout(unsigned long __pool)
1884 1807
1885 spin_lock_irq(&pool->lock); 1808 spin_lock_irq(&pool->lock);
1886 1809
1887 if (too_many_workers(pool)) { 1810 while (too_many_workers(pool)) {
1888 struct worker *worker; 1811 struct worker *worker;
1889 unsigned long expires; 1812 unsigned long expires;
1890 1813
@@ -1892,13 +1815,12 @@ static void idle_worker_timeout(unsigned long __pool)
1892 worker = list_entry(pool->idle_list.prev, struct worker, entry); 1815 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1893 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1816 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1894 1817
1895 if (time_before(jiffies, expires)) 1818 if (time_before(jiffies, expires)) {
1896 mod_timer(&pool->idle_timer, expires); 1819 mod_timer(&pool->idle_timer, expires);
1897 else { 1820 break;
1898 /* it's been idle for too long, wake up manager */
1899 pool->flags |= POOL_MANAGE_WORKERS;
1900 wake_up_worker(pool);
1901 } 1821 }
1822
1823 destroy_worker(worker);
1902 } 1824 }
1903 1825
1904 spin_unlock_irq(&pool->lock); 1826 spin_unlock_irq(&pool->lock);
@@ -1916,6 +1838,12 @@ static void send_mayday(struct work_struct *work)
1916 1838
1917 /* mayday mayday mayday */ 1839 /* mayday mayday mayday */
1918 if (list_empty(&pwq->mayday_node)) { 1840 if (list_empty(&pwq->mayday_node)) {
1841 /*
1842 * If @pwq is for an unbound wq, its base ref may be put at
1843 * any time due to an attribute change. Pin @pwq until the
1844 * rescuer is done with it.
1845 */
1846 get_pwq(pwq);
1919 list_add_tail(&pwq->mayday_node, &wq->maydays); 1847 list_add_tail(&pwq->mayday_node, &wq->maydays);
1920 wake_up_process(wq->rescuer->task); 1848 wake_up_process(wq->rescuer->task);
1921 } 1849 }
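send_mayday() now pins the pool_workqueue before putting it on @wq->maydays, so an attribute change on an unbound workqueue cannot drop the pwq's base reference while the rescuer still has it queued; the rescuer drops that pin once it is done (see the put_pwq() added further down). A loose sketch of the pin-while-queued rule, where every "example_*" name is illustrative rather than a real workqueue symbol and the refcount helpers are only declared:

#include <linux/list.h>

/*
 * Sketch of the pin-while-queued rule: take a reference before the pwq
 * goes on the mayday list and drop it only after the rescuer has
 * processed it.  "example_*" names are illustrative; the refcount
 * helpers stand in for get_pwq()/put_pwq() and are not implemented here.
 */
struct example_pwq {
	struct list_head mayday_node;
};

void example_get_pwq(struct example_pwq *pwq);		/* stand-in for get_pwq() */
void example_put_pwq(struct example_pwq *pwq);		/* stand-in for put_pwq() */

static void example_queue_mayday(struct example_pwq *pwq,
				 struct list_head *maydays)
{
	if (list_empty(&pwq->mayday_node)) {
		example_get_pwq(pwq);			/* pin for the rescuer */
		list_add_tail(&pwq->mayday_node, maydays);
	}
}

static void example_rescue_one(struct example_pwq *pwq)
{
	/* ... process the pwq's pending work items ... */
	example_put_pwq(pwq);				/* drop the mayday pin */
}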
@@ -2011,44 +1939,6 @@ restart:
2011} 1939}
2012 1940
2013/** 1941/**
2014 * maybe_destroy_worker - destroy workers which have been idle for a while
2015 * @pool: pool to destroy workers for
2016 *
2017 * Destroy @pool workers which have been idle for longer than
2018 * IDLE_WORKER_TIMEOUT.
2019 *
2020 * LOCKING:
2021 * spin_lock_irq(pool->lock) which may be released and regrabbed
2022 * multiple times. Called only from manager.
2023 *
2024 * Return:
2025 * %false if no action was taken and pool->lock stayed locked, %true
2026 * otherwise.
2027 */
2028static bool maybe_destroy_workers(struct worker_pool *pool)
2029{
2030 bool ret = false;
2031
2032 while (too_many_workers(pool)) {
2033 struct worker *worker;
2034 unsigned long expires;
2035
2036 worker = list_entry(pool->idle_list.prev, struct worker, entry);
2037 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
2038
2039 if (time_before(jiffies, expires)) {
2040 mod_timer(&pool->idle_timer, expires);
2041 break;
2042 }
2043
2044 destroy_worker(worker);
2045 ret = true;
2046 }
2047
2048 return ret;
2049}
2050
2051/**
2052 * manage_workers - manage worker pool 1942 * manage_workers - manage worker pool
2053 * @worker: self 1943 * @worker: self
2054 * 1944 *
@@ -2077,8 +1967,6 @@ static bool manage_workers(struct worker *worker)
2077 bool ret = false; 1967 bool ret = false;
2078 1968
2079 /* 1969 /*
2080 * Managership is governed by two mutexes - manager_arb and
2081 * manager_mutex. manager_arb handles arbitration of manager role.
2082 * Anyone who successfully grabs manager_arb wins the arbitration 1970 * Anyone who successfully grabs manager_arb wins the arbitration
2083 * and becomes the manager. mutex_trylock() on pool->manager_arb 1971 * and becomes the manager. mutex_trylock() on pool->manager_arb
2084 * failure while holding pool->lock reliably indicates that someone 1972 * failure while holding pool->lock reliably indicates that someone
@@ -2087,40 +1975,12 @@ static bool manage_workers(struct worker *worker)
2087 * grabbing manager_arb is responsible for actually performing 1975 * grabbing manager_arb is responsible for actually performing
2088 * manager duties. If manager_arb is grabbed and released without 1976 * manager duties. If manager_arb is grabbed and released without
2089 * actual management, the pool may stall indefinitely. 1977 * actual management, the pool may stall indefinitely.
2090 *
2091 * manager_mutex is used for exclusion of actual management
2092 * operations. The holder of manager_mutex can be sure that none
2093 * of management operations, including creation and destruction of
2094 * workers, won't take place until the mutex is released. Because
2095 * manager_mutex doesn't interfere with manager role arbitration,
2096 * it is guaranteed that the pool's management, while may be
2097 * delayed, won't be disturbed by someone else grabbing
2098 * manager_mutex.
2099 */ 1978 */
2100 if (!mutex_trylock(&pool->manager_arb)) 1979 if (!mutex_trylock(&pool->manager_arb))
2101 return ret; 1980 return ret;
2102 1981
2103 /*
2104 * With manager arbitration won, manager_mutex would be free in
2105 * most cases. trylock first without dropping @pool->lock.
2106 */
2107 if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
2108 spin_unlock_irq(&pool->lock);
2109 mutex_lock(&pool->manager_mutex);
2110 spin_lock_irq(&pool->lock);
2111 ret = true;
2112 }
2113
2114 pool->flags &= ~POOL_MANAGE_WORKERS;
2115
2116 /*
2117 * Destroy and then create so that may_start_working() is true
2118 * on return.
2119 */
2120 ret |= maybe_destroy_workers(pool);
2121 ret |= maybe_create_worker(pool); 1982 ret |= maybe_create_worker(pool);
2122 1983
2123 mutex_unlock(&pool->manager_mutex);
2124 mutex_unlock(&pool->manager_arb); 1984 mutex_unlock(&pool->manager_arb);
2125 return ret; 1985 return ret;
2126} 1986}
@@ -2308,6 +2168,11 @@ woke_up:
2308 spin_unlock_irq(&pool->lock); 2168 spin_unlock_irq(&pool->lock);
2309 WARN_ON_ONCE(!list_empty(&worker->entry)); 2169 WARN_ON_ONCE(!list_empty(&worker->entry));
2310 worker->task->flags &= ~PF_WQ_WORKER; 2170 worker->task->flags &= ~PF_WQ_WORKER;
2171
2172 set_task_comm(worker->task, "kworker/dying");
2173 ida_simple_remove(&pool->worker_ida, worker->id);
2174 worker_detach_from_pool(worker, pool);
2175 kfree(worker);
2311 return 0; 2176 return 0;
2312 } 2177 }
2313 2178
@@ -2355,9 +2220,6 @@ recheck:
2355 2220
2356 worker_set_flags(worker, WORKER_PREP, false); 2221 worker_set_flags(worker, WORKER_PREP, false);
2357sleep: 2222sleep:
2358 if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
2359 goto recheck;
2360
2361 /* 2223 /*
2362 * pool->lock is held and there's no work to process and no need to 2224 * pool->lock is held and there's no work to process and no need to
2363 * manage, sleep. Workers are woken up only while holding 2225 * manage, sleep. Workers are woken up only while holding
@@ -2398,6 +2260,7 @@ static int rescuer_thread(void *__rescuer)
2398 struct worker *rescuer = __rescuer; 2260 struct worker *rescuer = __rescuer;
2399 struct workqueue_struct *wq = rescuer->rescue_wq; 2261 struct workqueue_struct *wq = rescuer->rescue_wq;
2400 struct list_head *scheduled = &rescuer->scheduled; 2262 struct list_head *scheduled = &rescuer->scheduled;
2263 bool should_stop;
2401 2264
2402 set_user_nice(current, RESCUER_NICE_LEVEL); 2265 set_user_nice(current, RESCUER_NICE_LEVEL);
2403 2266
@@ -2409,11 +2272,15 @@ static int rescuer_thread(void *__rescuer)
2409repeat: 2272repeat:
2410 set_current_state(TASK_INTERRUPTIBLE); 2273 set_current_state(TASK_INTERRUPTIBLE);
2411 2274
2412 if (kthread_should_stop()) { 2275 /*
2413 __set_current_state(TASK_RUNNING); 2276 * By the time the rescuer is requested to stop, the workqueue
2414 rescuer->task->flags &= ~PF_WQ_WORKER; 2277 * shouldn't have any work pending, but @wq->maydays may still have
2415 return 0; 2278 * pwq(s) queued. This can happen when non-rescuer workers consume
2416 } 2279 * all the work items before the rescuer gets to them. Go through
2280 * @wq->maydays processing before acting on should_stop so that the
2281 * list is always empty on exit.
2282 */
2283 should_stop = kthread_should_stop();
2417 2284
2418 /* see whether any pwq is asking for help */ 2285 /* see whether any pwq is asking for help */
2419 spin_lock_irq(&wq_mayday_lock); 2286 spin_lock_irq(&wq_mayday_lock);
@@ -2429,8 +2296,9 @@ repeat:
2429 2296
2430 spin_unlock_irq(&wq_mayday_lock); 2297 spin_unlock_irq(&wq_mayday_lock);
2431 2298
2432 /* migrate to the target cpu if possible */ 2299 worker_attach_to_pool(rescuer, pool);
2433 worker_maybe_bind_and_lock(pool); 2300
2301 spin_lock_irq(&pool->lock);
2434 rescuer->pool = pool; 2302 rescuer->pool = pool;
2435 2303
2436 /* 2304 /*
@@ -2443,6 +2311,17 @@ repeat:
2443 move_linked_works(work, scheduled, &n); 2311 move_linked_works(work, scheduled, &n);
2444 2312
2445 process_scheduled_works(rescuer); 2313 process_scheduled_works(rescuer);
2314 spin_unlock_irq(&pool->lock);
2315
2316 worker_detach_from_pool(rescuer, pool);
2317
2318 spin_lock_irq(&pool->lock);
2319
2320 /*
2321 * Put the reference grabbed by send_mayday(). @pool won't
2322 * go away while we're holding its lock.
2323 */
2324 put_pwq(pwq);
2446 2325
2447 /* 2326 /*
2448 * Leave this pool. If keep_working() is %true, notify a 2327 * Leave this pool. If keep_working() is %true, notify a
@@ -2459,6 +2338,12 @@ repeat:
2459 2338
2460 spin_unlock_irq(&wq_mayday_lock); 2339 spin_unlock_irq(&wq_mayday_lock);
2461 2340
2341 if (should_stop) {
2342 __set_current_state(TASK_RUNNING);
2343 rescuer->task->flags &= ~PF_WQ_WORKER;
2344 return 0;
2345 }
2346
2462 /* rescuers should never participate in concurrency management */ 2347 /* rescuers should never participate in concurrency management */
2463 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); 2348 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2464 schedule(); 2349 schedule();
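The rescuer hunks above change the exit ordering: kthread_should_stop() is sampled before the mayday list is drained and only acted on afterwards, so @wq->maydays is guaranteed empty when the rescuer returns. A condensed sketch of that drain-before-exit loop, assuming kernel-thread context; it illustrates the ordering only and is not the real rescuer_thread():

#include <linux/kthread.h>
#include <linux/sched.h>

/*
 * Condensed sketch of the drain-before-exit ordering: sample the stop
 * request first, empty the mayday list, and only then act on it.
 */
static int example_rescuer_fn(void *arg)
{
	for (;;) {
		bool should_stop;

		set_current_state(TASK_INTERRUPTIBLE);

		/* sampled before draining so the list is empty on exit */
		should_stop = kthread_should_stop();

		/* ... walk and process every queued mayday request ... */

		if (should_stop) {
			__set_current_state(TASK_RUNNING);
			return 0;
		}

		schedule();
	}
}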
@@ -3399,6 +3284,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
3399 } 3284 }
3400 } 3285 }
3401 3286
3287 dev_set_uevent_suppress(&wq_dev->dev, false);
3402 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); 3288 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
3403 return 0; 3289 return 0;
3404} 3290}
@@ -3527,9 +3413,10 @@ static int init_worker_pool(struct worker_pool *pool)
3527 (unsigned long)pool); 3413 (unsigned long)pool);
3528 3414
3529 mutex_init(&pool->manager_arb); 3415 mutex_init(&pool->manager_arb);
3530 mutex_init(&pool->manager_mutex); 3416 mutex_init(&pool->attach_mutex);
3531 idr_init(&pool->worker_idr); 3417 INIT_LIST_HEAD(&pool->workers);
3532 3418
3419 ida_init(&pool->worker_ida);
3533 INIT_HLIST_NODE(&pool->hash_node); 3420 INIT_HLIST_NODE(&pool->hash_node);
3534 pool->refcnt = 1; 3421 pool->refcnt = 1;
3535 3422
@@ -3544,7 +3431,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
3544{ 3431{
3545 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); 3432 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
3546 3433
3547 idr_destroy(&pool->worker_idr); 3434 ida_destroy(&pool->worker_ida);
3548 free_workqueue_attrs(pool->attrs); 3435 free_workqueue_attrs(pool->attrs);
3549 kfree(pool); 3436 kfree(pool);
3550} 3437}
@@ -3562,6 +3449,7 @@ static void rcu_free_pool(struct rcu_head *rcu)
3562 */ 3449 */
3563static void put_unbound_pool(struct worker_pool *pool) 3450static void put_unbound_pool(struct worker_pool *pool)
3564{ 3451{
3452 DECLARE_COMPLETION_ONSTACK(detach_completion);
3565 struct worker *worker; 3453 struct worker *worker;
3566 3454
3567 lockdep_assert_held(&wq_pool_mutex); 3455 lockdep_assert_held(&wq_pool_mutex);
@@ -3582,18 +3470,24 @@ static void put_unbound_pool(struct worker_pool *pool)
3582 /* 3470 /*
3583 * Become the manager and destroy all workers. Grabbing 3471 * Become the manager and destroy all workers. Grabbing
3584 * manager_arb prevents @pool's workers from blocking on 3472 * manager_arb prevents @pool's workers from blocking on
3585 * manager_mutex. 3473 * attach_mutex.
3586 */ 3474 */
3587 mutex_lock(&pool->manager_arb); 3475 mutex_lock(&pool->manager_arb);
3588 mutex_lock(&pool->manager_mutex);
3589 spin_lock_irq(&pool->lock);
3590 3476
3591 while ((worker = first_worker(pool))) 3477 spin_lock_irq(&pool->lock);
3478 while ((worker = first_idle_worker(pool)))
3592 destroy_worker(worker); 3479 destroy_worker(worker);
3593 WARN_ON(pool->nr_workers || pool->nr_idle); 3480 WARN_ON(pool->nr_workers || pool->nr_idle);
3594
3595 spin_unlock_irq(&pool->lock); 3481 spin_unlock_irq(&pool->lock);
3596 mutex_unlock(&pool->manager_mutex); 3482
3483 mutex_lock(&pool->attach_mutex);
3484 if (!list_empty(&pool->workers))
3485 pool->detach_completion = &detach_completion;
3486 mutex_unlock(&pool->attach_mutex);
3487
3488 if (pool->detach_completion)
3489 wait_for_completion(pool->detach_completion);
3490
3597 mutex_unlock(&pool->manager_arb); 3491 mutex_unlock(&pool->manager_arb);
3598 3492
3599 /* shut down the timers */ 3493 /* shut down the timers */
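put_unbound_pool() above no longer destroys workers synchronously via kthread_stop(); instead it arms pool->detach_completion under attach_mutex and waits, while the last worker to run worker_detach_from_pool() fires the completion. A small sketch of that handshake using illustrative "example_*" types (the real code lives in the two functions shown in this diff):

#include <linux/completion.h>
#include <linux/list.h>
#include <linux/mutex.h>

/*
 * Sketch of the detach handshake: the destroyer arms a completion under
 * the attach mutex if workers are still attached, and the last worker
 * to detach fires it.  "example_*" names are illustrative only.
 */
struct example_pool {
	struct mutex attach_mutex;
	struct list_head workers;
	struct completion *detach_completion;
};

static void example_detach(struct example_pool *pool, struct list_head *node)
{
	struct completion *done = NULL;

	mutex_lock(&pool->attach_mutex);
	list_del(node);
	if (list_empty(&pool->workers))
		done = pool->detach_completion;
	mutex_unlock(&pool->attach_mutex);

	if (done)
		complete(done);
}

static void example_destroy_pool(struct example_pool *pool)
{
	DECLARE_COMPLETION_ONSTACK(detach_completion);

	mutex_lock(&pool->attach_mutex);
	if (!list_empty(&pool->workers))
		pool->detach_completion = &detach_completion;
	mutex_unlock(&pool->attach_mutex);

	if (pool->detach_completion)
		wait_for_completion(pool->detach_completion);

	/* now no worker can be touching the pool; safe to free it */
}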
@@ -3639,9 +3533,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3639 if (!pool || init_worker_pool(pool) < 0) 3533 if (!pool || init_worker_pool(pool) < 0)
3640 goto fail; 3534 goto fail;
3641 3535
3642 if (workqueue_freezing)
3643 pool->flags |= POOL_FREEZING;
3644
3645 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ 3536 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
3646 copy_workqueue_attrs(pool->attrs, attrs); 3537 copy_workqueue_attrs(pool->attrs, attrs);
3647 3538
@@ -3748,7 +3639,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
3748 3639
3749 spin_lock_irq(&pwq->pool->lock); 3640 spin_lock_irq(&pwq->pool->lock);
3750 3641
3751 if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { 3642 /*
3643 * During [un]freezing, the caller is responsible for ensuring that
3644 * this function is called at least once after @workqueue_freezing
3645 * is updated and visible.
3646 */
3647 if (!freezable || !workqueue_freezing) {
3752 pwq->max_active = wq->saved_max_active; 3648 pwq->max_active = wq->saved_max_active;
3753 3649
3754 while (!list_empty(&pwq->delayed_works) && 3650 while (!list_empty(&pwq->delayed_works) &&
@@ -4080,17 +3976,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
4080 * Let's determine what needs to be done. If the target cpumask is 3976 * Let's determine what needs to be done. If the target cpumask is
4081 * different from wq's, we need to compare it to @pwq's and create 3977 * different from wq's, we need to compare it to @pwq's and create
4082 * a new one if they don't match. If the target cpumask equals 3978 * a new one if they don't match. If the target cpumask equals
4083 * wq's, the default pwq should be used. If @pwq is already the 3979 * wq's, the default pwq should be used.
4084 * default one, nothing to do; otherwise, install the default one.
4085 */ 3980 */
4086 if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { 3981 if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
4087 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) 3982 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
4088 goto out_unlock; 3983 goto out_unlock;
4089 } else { 3984 } else {
4090 if (pwq == wq->dfl_pwq) 3985 goto use_dfl_pwq;
4091 goto out_unlock;
4092 else
4093 goto use_dfl_pwq;
4094 } 3986 }
4095 3987
4096 mutex_unlock(&wq->mutex); 3988 mutex_unlock(&wq->mutex);
@@ -4098,9 +3990,10 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
4098 /* create a new pwq */ 3990 /* create a new pwq */
4099 pwq = alloc_unbound_pwq(wq, target_attrs); 3991 pwq = alloc_unbound_pwq(wq, target_attrs);
4100 if (!pwq) { 3992 if (!pwq) {
4101 pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", 3993 pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
4102 wq->name); 3994 wq->name);
4103 goto out_unlock; 3995 mutex_lock(&wq->mutex);
3996 goto use_dfl_pwq;
4104 } 3997 }
4105 3998
4106 /* 3999 /*
@@ -4575,28 +4468,27 @@ static void wq_unbind_fn(struct work_struct *work)
4575 int cpu = smp_processor_id(); 4468 int cpu = smp_processor_id();
4576 struct worker_pool *pool; 4469 struct worker_pool *pool;
4577 struct worker *worker; 4470 struct worker *worker;
4578 int wi;
4579 4471
4580 for_each_cpu_worker_pool(pool, cpu) { 4472 for_each_cpu_worker_pool(pool, cpu) {
4581 WARN_ON_ONCE(cpu != smp_processor_id()); 4473 WARN_ON_ONCE(cpu != smp_processor_id());
4582 4474
4583 mutex_lock(&pool->manager_mutex); 4475 mutex_lock(&pool->attach_mutex);
4584 spin_lock_irq(&pool->lock); 4476 spin_lock_irq(&pool->lock);
4585 4477
4586 /* 4478 /*
4587 * We've blocked all manager operations. Make all workers 4479 * We've blocked all attach/detach operations. Make all workers
4588 * unbound and set DISASSOCIATED. Before this, all workers 4480 * unbound and set DISASSOCIATED. Before this, all workers
4589 * except for the ones which are still executing works from 4481 * except for the ones which are still executing works from
4590 * before the last CPU down must be on the cpu. After 4482 * before the last CPU down must be on the cpu. After
4591 * this, they may become diasporas. 4483 * this, they may become diasporas.
4592 */ 4484 */
4593 for_each_pool_worker(worker, wi, pool) 4485 for_each_pool_worker(worker, pool)
4594 worker->flags |= WORKER_UNBOUND; 4486 worker->flags |= WORKER_UNBOUND;
4595 4487
4596 pool->flags |= POOL_DISASSOCIATED; 4488 pool->flags |= POOL_DISASSOCIATED;
4597 4489
4598 spin_unlock_irq(&pool->lock); 4490 spin_unlock_irq(&pool->lock);
4599 mutex_unlock(&pool->manager_mutex); 4491 mutex_unlock(&pool->attach_mutex);
4600 4492
4601 /* 4493 /*
4602 * Call schedule() so that we cross rq->lock and thus can 4494 * Call schedule() so that we cross rq->lock and thus can
@@ -4636,9 +4528,8 @@ static void wq_unbind_fn(struct work_struct *work)
4636static void rebind_workers(struct worker_pool *pool) 4528static void rebind_workers(struct worker_pool *pool)
4637{ 4529{
4638 struct worker *worker; 4530 struct worker *worker;
4639 int wi;
4640 4531
4641 lockdep_assert_held(&pool->manager_mutex); 4532 lockdep_assert_held(&pool->attach_mutex);
4642 4533
4643 /* 4534 /*
4644 * Restore CPU affinity of all workers. As all idle workers should 4535 * Restore CPU affinity of all workers. As all idle workers should
@@ -4647,13 +4538,13 @@ static void rebind_workers(struct worker_pool *pool)
4647 * of all workers first and then clear UNBOUND. As we're called 4538 * of all workers first and then clear UNBOUND. As we're called
4648 * from CPU_ONLINE, the following shouldn't fail. 4539 * from CPU_ONLINE, the following shouldn't fail.
4649 */ 4540 */
4650 for_each_pool_worker(worker, wi, pool) 4541 for_each_pool_worker(worker, pool)
4651 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 4542 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4652 pool->attrs->cpumask) < 0); 4543 pool->attrs->cpumask) < 0);
4653 4544
4654 spin_lock_irq(&pool->lock); 4545 spin_lock_irq(&pool->lock);
4655 4546
4656 for_each_pool_worker(worker, wi, pool) { 4547 for_each_pool_worker(worker, pool) {
4657 unsigned int worker_flags = worker->flags; 4548 unsigned int worker_flags = worker->flags;
4658 4549
4659 /* 4550 /*
@@ -4705,9 +4596,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4705{ 4596{
4706 static cpumask_t cpumask; 4597 static cpumask_t cpumask;
4707 struct worker *worker; 4598 struct worker *worker;
4708 int wi;
4709 4599
4710 lockdep_assert_held(&pool->manager_mutex); 4600 lockdep_assert_held(&pool->attach_mutex);
4711 4601
4712 /* is @cpu allowed for @pool? */ 4602 /* is @cpu allowed for @pool? */
4713 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) 4603 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
@@ -4719,7 +4609,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4719 return; 4609 return;
4720 4610
4721 /* as we're called from CPU_ONLINE, the following shouldn't fail */ 4611 /* as we're called from CPU_ONLINE, the following shouldn't fail */
4722 for_each_pool_worker(worker, wi, pool) 4612 for_each_pool_worker(worker, pool)
4723 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 4613 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4724 pool->attrs->cpumask) < 0); 4614 pool->attrs->cpumask) < 0);
4725} 4615}
@@ -4752,7 +4642,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4752 mutex_lock(&wq_pool_mutex); 4642 mutex_lock(&wq_pool_mutex);
4753 4643
4754 for_each_pool(pool, pi) { 4644 for_each_pool(pool, pi) {
4755 mutex_lock(&pool->manager_mutex); 4645 mutex_lock(&pool->attach_mutex);
4756 4646
4757 if (pool->cpu == cpu) { 4647 if (pool->cpu == cpu) {
4758 spin_lock_irq(&pool->lock); 4648 spin_lock_irq(&pool->lock);
@@ -4764,7 +4654,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4764 restore_unbound_workers_cpumask(pool, cpu); 4654 restore_unbound_workers_cpumask(pool, cpu);
4765 } 4655 }
4766 4656
4767 mutex_unlock(&pool->manager_mutex); 4657 mutex_unlock(&pool->attach_mutex);
4768 } 4658 }
4769 4659
4770 /* update NUMA affinity of unbound workqueues */ 4660 /* update NUMA affinity of unbound workqueues */
@@ -4863,24 +4753,14 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
4863 */ 4753 */
4864void freeze_workqueues_begin(void) 4754void freeze_workqueues_begin(void)
4865{ 4755{
4866 struct worker_pool *pool;
4867 struct workqueue_struct *wq; 4756 struct workqueue_struct *wq;
4868 struct pool_workqueue *pwq; 4757 struct pool_workqueue *pwq;
4869 int pi;
4870 4758
4871 mutex_lock(&wq_pool_mutex); 4759 mutex_lock(&wq_pool_mutex);
4872 4760
4873 WARN_ON_ONCE(workqueue_freezing); 4761 WARN_ON_ONCE(workqueue_freezing);
4874 workqueue_freezing = true; 4762 workqueue_freezing = true;
4875 4763
4876 /* set FREEZING */
4877 for_each_pool(pool, pi) {
4878 spin_lock_irq(&pool->lock);
4879 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
4880 pool->flags |= POOL_FREEZING;
4881 spin_unlock_irq(&pool->lock);
4882 }
4883
4884 list_for_each_entry(wq, &workqueues, list) { 4764 list_for_each_entry(wq, &workqueues, list) {
4885 mutex_lock(&wq->mutex); 4765 mutex_lock(&wq->mutex);
4886 for_each_pwq(pwq, wq) 4766 for_each_pwq(pwq, wq)
@@ -4950,21 +4830,13 @@ void thaw_workqueues(void)
4950{ 4830{
4951 struct workqueue_struct *wq; 4831 struct workqueue_struct *wq;
4952 struct pool_workqueue *pwq; 4832 struct pool_workqueue *pwq;
4953 struct worker_pool *pool;
4954 int pi;
4955 4833
4956 mutex_lock(&wq_pool_mutex); 4834 mutex_lock(&wq_pool_mutex);
4957 4835
4958 if (!workqueue_freezing) 4836 if (!workqueue_freezing)
4959 goto out_unlock; 4837 goto out_unlock;
4960 4838
4961 /* clear FREEZING */ 4839 workqueue_freezing = false;
4962 for_each_pool(pool, pi) {
4963 spin_lock_irq(&pool->lock);
4964 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
4965 pool->flags &= ~POOL_FREEZING;
4966 spin_unlock_irq(&pool->lock);
4967 }
4968 4840
4969 /* restore max_active and repopulate worklist */ 4841 /* restore max_active and repopulate worklist */
4970 list_for_each_entry(wq, &workqueues, list) { 4842 list_for_each_entry(wq, &workqueues, list) {
@@ -4974,7 +4846,6 @@ void thaw_workqueues(void)
4974 mutex_unlock(&wq->mutex); 4846 mutex_unlock(&wq->mutex);
4975 } 4847 }
4976 4848
4977 workqueue_freezing = false;
4978out_unlock: 4849out_unlock:
4979 mutex_unlock(&wq_pool_mutex); 4850 mutex_unlock(&wq_pool_mutex);
4980} 4851}
@@ -5009,7 +4880,7 @@ static void __init wq_numa_init(void)
5009 BUG_ON(!tbl); 4880 BUG_ON(!tbl);
5010 4881
5011 for_each_node(node) 4882 for_each_node(node)
5012 BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, 4883 BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
5013 node_online(node) ? node : NUMA_NO_NODE)); 4884 node_online(node) ? node : NUMA_NO_NODE));
5014 4885
5015 for_each_possible_cpu(cpu) { 4886 for_each_possible_cpu(cpu) {
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 7e2204db0b1a..45215870ac6c 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -37,6 +37,8 @@ struct worker {
37 struct task_struct *task; /* I: worker task */ 37 struct task_struct *task; /* I: worker task */
38 struct worker_pool *pool; /* I: the associated pool */ 38 struct worker_pool *pool; /* I: the associated pool */
39 /* L: for rescuers */ 39 /* L: for rescuers */
40 struct list_head node; /* A: anchored at pool->workers */
41 /* A: runs through worker->node */
40 42
41 unsigned long last_active; /* L: last active timestamp */ 43 unsigned long last_active; /* L: last active timestamp */
42 unsigned int flags; /* X: flags */ 44 unsigned int flags; /* X: flags */