path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 1
-rw-r--r--  kernel/Makefile | 3
-rw-r--r--  kernel/acct.c | 7
-rw-r--r--  kernel/async.c | 40
-rw-r--r--  kernel/audit.c | 22
-rw-r--r--  kernel/audit.h | 3
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/auditfilter.c | 4
-rw-r--r--  kernel/auditsc.c | 14
-rw-r--r--  kernel/capability.c | 24
-rw-r--r--  kernel/cgroup.c | 795
-rw-r--r--  kernel/compat.c | 84
-rw-r--r--  kernel/configs.c | 2
-rw-r--r--  kernel/cpu/Makefile | 1
-rw-r--r--  kernel/cpu/idle.c | 116
-rw-r--r--  kernel/cpuset.c | 158
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/events/core.c | 87
-rw-r--r--  kernel/events/internal.h | 2
-rw-r--r--  kernel/events/ring_buffer.c | 36
-rw-r--r--  kernel/events/uprobes.c | 300
-rw-r--r--  kernel/exit.c | 9
-rw-r--r--  kernel/extable.c | 6
-rw-r--r--  kernel/fork.c | 12
-rw-r--r--  kernel/futex.c | 46
-rw-r--r--  kernel/hrtimer.c | 33
-rw-r--r--  kernel/irq/irqdomain.c | 20
-rw-r--r--  kernel/irq/proc.c | 20
-rw-r--r--  kernel/kallsyms.c | 26
-rw-r--r--  kernel/kexec.c | 161
-rw-r--r--  kernel/kmod.c | 98
-rw-r--r--  kernel/kprobes.c | 19
-rw-r--r--  kernel/kthread.c | 111
-rw-r--r--  kernel/lockdep.c | 46
-rw-r--r--  kernel/modsign_certificate.S | 13
-rw-r--r--  kernel/module.c | 18
-rw-r--r--  kernel/mutex.c | 151
-rw-r--r--  kernel/nsproxy.c | 6
-rw-r--r--  kernel/panic.c | 6
-rw-r--r--  kernel/pid.c | 12
-rw-r--r--  kernel/pid_namespace.c | 7
-rw-r--r--  kernel/posix-cpu-timers.c | 76
-rw-r--r--  kernel/posix-timers.c | 121
-rw-r--r--  kernel/power/console.c | 116
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/suspend.c | 22
-rw-r--r--  kernel/printk.c | 177
-rw-r--r--  kernel/profile.c | 6
-rw-r--r--  kernel/ptrace.c | 80
-rw-r--r--  kernel/range.c | 3
-rw-r--r--  kernel/rcutree.c | 276
-rw-r--r--  kernel/rcutree.h | 43
-rw-r--r--  kernel/rcutree_plugin.h | 622
-rw-r--r--  kernel/rcutree_trace.c | 10
-rw-r--r--  kernel/relay.c | 14
-rw-r--r--  kernel/resource.c | 198
-rw-r--r--  kernel/rtmutex-tester.c | 5
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/clock.c | 26
-rw-r--r--  kernel/sched/core.c | 414
-rw-r--r--  kernel/sched/cpuacct.c | 296
-rw-r--r--  kernel/sched/cpuacct.h | 17
-rw-r--r--  kernel/sched/cputime.c | 240
-rw-r--r--  kernel/sched/fair.c | 158
-rw-r--r--  kernel/sched/features.h | 7
-rw-r--r--  kernel/sched/idle_task.c | 17
-rw-r--r--  kernel/sched/sched.h | 244
-rw-r--r--  kernel/sched/stats.c | 7
-rw-r--r--  kernel/seccomp.c | 2
-rw-r--r--  kernel/semaphore.c | 8
-rw-r--r--  kernel/signal.c | 18
-rw-r--r--  kernel/smp.c | 91
-rw-r--r--  kernel/smpboot.c | 14
-rw-r--r--  kernel/softirq.c | 25
-rw-r--r--  kernel/sys.c | 289
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/sysctl.c | 15
-rw-r--r--  kernel/test_kprobes.c | 2
-rw-r--r--  kernel/time.c | 11
-rw-r--r--  kernel/time/Kconfig | 80
-rw-r--r--  kernel/time/ntp.c | 105
-rw-r--r--  kernel/time/ntp_internal.h | 12
-rw-r--r--  kernel/time/tick-broadcast.c | 245
-rw-r--r--  kernel/time/tick-common.c | 7
-rw-r--r--  kernel/time/tick-internal.h | 5
-rw-r--r--  kernel/time/tick-sched.c | 300
-rw-r--r--  kernel/time/timekeeping.c | 396
-rw-r--r--  kernel/time/timer_list.c | 104
-rw-r--r--  kernel/timer.c | 159
-rw-r--r--  kernel/trace/Kconfig | 73
-rw-r--r--  kernel/trace/blktrace.c | 30
-rw-r--r--  kernel/trace/ftrace.c | 154
-rw-r--r--  kernel/trace/ring_buffer.c | 500
-rw-r--r--  kernel/trace/trace.c | 2261
-rw-r--r--  kernel/trace/trace.h | 151
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_clock.c | 10
-rw-r--r--  kernel/trace/trace_entries.h | 23
-rw-r--r--  kernel/trace/trace_events.c | 1397
-rw-r--r--  kernel/trace/trace_events_filter.c | 34
-rw-r--r--  kernel/trace/trace_export.c | 4
-rw-r--r--  kernel/trace/trace_functions.c | 207
-rw-r--r--  kernel/trace/trace_functions_graph.c | 12
-rw-r--r--  kernel/trace/trace_irqsoff.c | 90
-rw-r--r--  kernel/trace/trace_kdb.c | 12
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 12
-rw-r--r--  kernel/trace/trace_output.c | 119
-rw-r--r--  kernel/trace/trace_output.h | 4
-rw-r--r--  kernel/trace/trace_sched_switch.c | 8
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 93
-rw-r--r--  kernel/trace/trace_selftest.c | 51
-rw-r--r--  kernel/trace/trace_stack.c | 78
-rw-r--r--  kernel/trace/trace_stat.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 90
-rw-r--r--  kernel/trace/trace_uprobe.c | 203
-rw-r--r--  kernel/tracepoint.c | 21
-rw-r--r--  kernel/uid16.c | 55
-rw-r--r--  kernel/user.c | 4
-rw-r--r--  kernel/user_namespace.c | 39
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/watchdog.c | 5
-rw-r--r--  kernel/workqueue.c | 2946
-rw-r--r--  kernel/workqueue_internal.h | 19
123 files changed, 11266 insertions, 5063 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index ab4f1090f437..b3097bde4e9c 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -4,3 +4,4 @@
 config_data.h
 config_data.gz
 timeconst.h
+hz.bc
diff --git a/kernel/Makefile b/kernel/Makefile
index bbde5f1a4486..271fd3119af9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -24,6 +24,7 @@ endif
 
 obj-y += sched/
 obj-y += power/
+obj-y += cpu/
 
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -175,7 +176,7 @@ signing_key.priv signing_key.x509: x509.genkey
 	openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
 		-batch -x509 -config x509.genkey \
 		-outform DER -out signing_key.x509 \
-		-keyout signing_key.priv
+		-keyout signing_key.priv 2>&1
 	@echo "###"
 	@echo "### Key pair generated."
 	@echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index b9bd7f098ee5..8d6e145138bb 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -540,6 +540,12 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	ac.ac_swaps = encode_comp_t(0);
 
 	/*
+	 * Get freeze protection. If the fs is frozen, just skip the write
+	 * as we could deadlock the system otherwise.
+	 */
+	if (!file_start_write_trylock(file))
+		goto out;
+	/*
 	 * Kernel segment override to datasegment and write it
 	 * to the accounting file.
 	 */
@@ -554,6 +560,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 			sizeof(acct_t), &file->f_pos);
 	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
 	set_fs(fs);
+	file_end_write(file);
 out:
 	revert_creds(orig_cred);
 }
diff --git a/kernel/async.c b/kernel/async.c
index 8ddee2c3e5b0..61f023ce0228 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -73,7 +73,7 @@ struct async_entry {
 	struct list_head global_list;
 	struct work_struct work;
 	async_cookie_t cookie;
-	async_func_ptr *func;
+	async_func_t func;
 	void *data;
 	struct async_domain *domain;
 };
@@ -84,24 +84,20 @@ static atomic_t entry_count;
 
 static async_cookie_t lowest_in_progress(struct async_domain *domain)
 {
-	struct async_entry *first = NULL;
+	struct list_head *pending;
 	async_cookie_t ret = ASYNC_COOKIE_MAX;
 	unsigned long flags;
 
 	spin_lock_irqsave(&async_lock, flags);
 
-	if (domain) {
-		if (!list_empty(&domain->pending))
-			first = list_first_entry(&domain->pending,
-						 struct async_entry, domain_list);
-	} else {
-		if (!list_empty(&async_global_pending))
-			first = list_first_entry(&async_global_pending,
-						 struct async_entry, global_list);
-	}
+	if (domain)
+		pending = &domain->pending;
+	else
+		pending = &async_global_pending;
 
-	if (first)
-		ret = first->cookie;
+	if (!list_empty(pending))
+		ret = list_first_entry(pending, struct async_entry,
+				       domain_list)->cookie;
 
 	spin_unlock_irqrestore(&async_lock, flags);
 	return ret;
@@ -149,7 +145,7 @@ static void async_run_entry_fn(struct work_struct *work)
 	wake_up(&async_done);
 }
 
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
+static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
 {
 	struct async_entry *entry;
 	unsigned long flags;
@@ -169,13 +165,13 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
 		spin_unlock_irqrestore(&async_lock, flags);
 
 		/* low on memory.. run synchronously */
-		ptr(data, newcookie);
+		func(data, newcookie);
 		return newcookie;
 	}
 	INIT_LIST_HEAD(&entry->domain_list);
 	INIT_LIST_HEAD(&entry->global_list);
 	INIT_WORK(&entry->work, async_run_entry_fn);
-	entry->func = ptr;
+	entry->func = func;
 	entry->data = data;
 	entry->domain = domain;
 
@@ -202,21 +198,21 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
 
 /**
  * async_schedule - schedule a function for asynchronous execution
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
  * @data: data pointer to pass to the function
  *
  * Returns an async_cookie_t that may be used for checkpointing later.
  * Note: This function may be called from atomic or non-atomic contexts.
  */
-async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+async_cookie_t async_schedule(async_func_t func, void *data)
 {
-	return __async_schedule(ptr, data, &async_dfl_domain);
+	return __async_schedule(func, data, &async_dfl_domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule);
 
 /**
  * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
  * @data: data pointer to pass to the function
  * @domain: the domain
  *
@@ -226,10 +222,10 @@ EXPORT_SYMBOL_GPL(async_schedule);
  * synchronization domain is specified via @domain. Note: This function
  * may be called from atomic or non-atomic contexts.
  */
-async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
+async_cookie_t async_schedule_domain(async_func_t func, void *data,
 				     struct async_domain *domain)
 {
-	return __async_schedule(ptr, data, domain);
+	return __async_schedule(func, data, domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule_domain);
 
diff --git a/kernel/audit.c b/kernel/audit.c
index d596e5355f15..0b084fa44b1f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -58,7 +58,7 @@
 #ifdef CONFIG_SECURITY
 #include <linux/security.h>
 #endif
-#include <linux/netlink.h>
+#include <net/netlink.h>
 #include <linux/freezer.h>
 #include <linux/tty.h>
 #include <linux/pid_namespace.h>
@@ -660,14 +660,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	/* As soon as there's any sign of userspace auditd,
 	 * start kauditd to talk to it */
-	if (!kauditd_task)
+	if (!kauditd_task) {
 		kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
 		if (IS_ERR(kauditd_task)) {
 			err = PTR_ERR(kauditd_task);
 			kauditd_task = NULL;
 			return err;
 		}
+		}
 	}
-
 	loginuid = audit_get_loginuid(current);
 	sessionid = audit_get_sessionid(current);
 	security_task_getsecid(current, &sid);
@@ -910,7 +910,7 @@ static void audit_receive_skb(struct sk_buff *skb)
 {
 	struct nlmsghdr *nlh;
 	/*
-	 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
+	 * len MUST be signed for nlmsg_next to be able to dec it below 0
 	 * if the nlmsg_len was not aligned
 	 */
 	int len;
@@ -919,13 +919,13 @@ static void audit_receive_skb(struct sk_buff *skb)
 	nlh = nlmsg_hdr(skb);
 	len = skb->len;
 
-	while (NLMSG_OK(nlh, len)) {
+	while (nlmsg_ok(nlh, len)) {
 		err = audit_receive_msg(skb, nlh);
 		/* if err or if this message says it wants a response */
 		if (err || (nlh->nlmsg_flags & NLM_F_ACK))
 			netlink_ack(skb, nlh, err);
 
-		nlh = NLMSG_NEXT(nlh, len);
+		nlh = nlmsg_next(nlh, &len);
 	}
 }
 
@@ -1483,7 +1483,7 @@ void audit_log_end(struct audit_buffer *ab)
 		audit_log_lost("rate limit exceeded");
 	} else {
 		struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
-		nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+		nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
 
 		if (audit_pid) {
 			skb_queue_tail(&audit_skb_queue, ab->skb);
diff --git a/kernel/audit.h b/kernel/audit.h
index d51cba868e1b..11468d99dad0 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -59,10 +59,7 @@ struct audit_entry {
 	struct audit_krule rule;
 };
 
-#ifdef CONFIG_AUDIT
-extern int audit_enabled;
 extern int audit_ever_enabled;
-#endif
 
 extern int audit_pid;
 
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 642a89c4f3d6..a291aa23fb3f 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -617,9 +617,9 @@ void audit_trim_trees(void)
 		}
 		spin_unlock(&hash_lock);
 		trim_marked(tree);
-		put_tree(tree);
 		drop_collected_mounts(root_mnt);
 skip_it:
+		put_tree(tree);
 		mutex_lock(&audit_filter_mutex);
 	}
 	list_del(&cursor);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f9fc54bbe06f..267436826c3b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -594,6 +594,10 @@ exit_nofree:
 	return entry;
 
 exit_free:
+	if (entry->rule.watch)
+		audit_put_watch(entry->rule.watch); /* matches initial get */
+	if (entry->rule.tree)
+		audit_put_tree(entry->rule.tree); /* that's the temporary one */
 	audit_free_rule(entry);
 	return ERR_PTR(err);
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a371f857a0a9..c68229411a7c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1034,21 +1034,15 @@ static inline void audit_free_aux(struct audit_context *context)
 	}
 }
 
-static inline void audit_zero_context(struct audit_context *context,
-				      enum audit_state state)
-{
-	memset(context, 0, sizeof(*context));
-	context->state = state;
-	context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
-}
-
 static inline struct audit_context *audit_alloc_context(enum audit_state state)
 {
 	struct audit_context *context;
 
-	if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
 		return NULL;
-	audit_zero_context(context, state);
+	context->state = state;
+	context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
 	INIT_LIST_HEAD(&context->killed_trees);
 	INIT_LIST_HEAD(&context->names_list);
 	return context;
diff --git a/kernel/capability.c b/kernel/capability.c
index 493d97259484..f6c2ce5701e1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap)
 EXPORT_SYMBOL(ns_capable);
 
 /**
+ * file_ns_capable - Determine if the file's opener had a capability in effect
+ * @file: The file we want to check
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if task that opened the file had a capability in effect
+ * when the file was opened.
+ *
+ * This does not set PF_SUPERPRIV because the caller may not
+ * actually be privileged.
+ */
+bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap)
+{
+	if (WARN_ON_ONCE(!cap_valid(cap)))
+		return false;
+
+	if (security_capable(file->f_cred, ns, cap) == 0)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL(file_ns_capable);
+
+/**
  * capable - Determine if the current task has a superior capability in effect
  * @cap: The capability to be tested for
  *
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a32f9432666c..2a9926275f80 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -30,7 +30,6 @@
30#include <linux/cred.h> 30#include <linux/cred.h>
31#include <linux/ctype.h> 31#include <linux/ctype.h>
32#include <linux/errno.h> 32#include <linux/errno.h>
33#include <linux/fs.h>
34#include <linux/init_task.h> 33#include <linux/init_task.h>
35#include <linux/kernel.h> 34#include <linux/kernel.h>
36#include <linux/list.h> 35#include <linux/list.h>
@@ -59,7 +58,7 @@
59#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
60#include <linux/eventfd.h> 59#include <linux/eventfd.h>
61#include <linux/poll.h> 60#include <linux/poll.h>
62#include <linux/flex_array.h> /* used in cgroup_attach_proc */ 61#include <linux/flex_array.h> /* used in cgroup_attach_task */
63#include <linux/kthread.h> 62#include <linux/kthread.h>
64 63
65#include <linux/atomic.h> 64#include <linux/atomic.h>
@@ -83,7 +82,13 @@
83 * B happens only through cgroup_show_options() and using cgroup_root_mutex 82 * B happens only through cgroup_show_options() and using cgroup_root_mutex
84 * breaks it. 83 * breaks it.
85 */ 84 */
85#ifdef CONFIG_PROVE_RCU
86DEFINE_MUTEX(cgroup_mutex);
87EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */
88#else
86static DEFINE_MUTEX(cgroup_mutex); 89static DEFINE_MUTEX(cgroup_mutex);
90#endif
91
87static DEFINE_MUTEX(cgroup_root_mutex); 92static DEFINE_MUTEX(cgroup_root_mutex);
88 93
89/* 94/*
@@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
98#include <linux/cgroup_subsys.h> 103#include <linux/cgroup_subsys.h>
99}; 104};
100 105
101#define MAX_CGROUP_ROOT_NAMELEN 64
102
103/*
104 * A cgroupfs_root represents the root of a cgroup hierarchy,
105 * and may be associated with a superblock to form an active
106 * hierarchy
107 */
108struct cgroupfs_root {
109 struct super_block *sb;
110
111 /*
112 * The bitmask of subsystems intended to be attached to this
113 * hierarchy
114 */
115 unsigned long subsys_mask;
116
117 /* Unique id for this hierarchy. */
118 int hierarchy_id;
119
120 /* The bitmask of subsystems currently attached to this hierarchy */
121 unsigned long actual_subsys_mask;
122
123 /* A list running through the attached subsystems */
124 struct list_head subsys_list;
125
126 /* The root cgroup for this hierarchy */
127 struct cgroup top_cgroup;
128
129 /* Tracks how many cgroups are currently defined in hierarchy.*/
130 int number_of_cgroups;
131
132 /* A list running through the active hierarchies */
133 struct list_head root_list;
134
135 /* All cgroups on this root, cgroup_mutex protected */
136 struct list_head allcg_list;
137
138 /* Hierarchy-specific flags */
139 unsigned long flags;
140
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
144 /* The path to use for release notifications. */
145 char release_agent_path[PATH_MAX];
146
147 /* The name for this hierarchy - may be empty */
148 char name[MAX_CGROUP_ROOT_NAMELEN];
149};
150
151/* 106/*
152 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 107 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
153 * subsystems that are otherwise unattached - it never has more than a 108 * subsystems that are otherwise unattached - it never has more than a
@@ -162,6 +117,9 @@ struct cfent {
162 struct list_head node; 117 struct list_head node;
163 struct dentry *dentry; 118 struct dentry *dentry;
164 struct cftype *type; 119 struct cftype *type;
120
121 /* file xattrs */
122 struct simple_xattrs xattrs;
165}; 123};
166 124
167/* 125/*
@@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
238/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 196/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
239#define dummytop (&rootnode.top_cgroup) 197#define dummytop (&rootnode.top_cgroup)
240 198
199static struct cgroup_name root_cgroup_name = { .name = "/" };
200
241/* This flag indicates whether tasks in the fork and exit paths should 201/* This flag indicates whether tasks in the fork and exit paths should
242 * check for fork/exit handlers to call. This avoids us having to do 202 * check for fork/exit handlers to call. This avoids us having to do
243 * extra work in the fork/exit path if none of the subsystems need to 203 * extra work in the fork/exit path if none of the subsystems need to
@@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 209static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add); 210 struct cftype cfts[], bool is_add);
251 211
252#ifdef CONFIG_PROVE_LOCKING
253int cgroup_lock_is_held(void)
254{
255 return lockdep_is_held(&cgroup_mutex);
256}
257#else /* #ifdef CONFIG_PROVE_LOCKING */
258int cgroup_lock_is_held(void)
259{
260 return mutex_is_locked(&cgroup_mutex);
261}
262#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
263
264EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
265
266static int css_unbias_refcnt(int refcnt) 212static int css_unbias_refcnt(int refcnt)
267{ 213{
268 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; 214 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
@@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
282 return test_bit(CGRP_REMOVED, &cgrp->flags); 228 return test_bit(CGRP_REMOVED, &cgrp->flags);
283} 229}
284 230
285/* bits in struct cgroupfs_root flags field */ 231/**
286enum { 232 * cgroup_is_descendant - test ancestry
287 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 233 * @cgrp: the cgroup to be tested
288 ROOT_XATTR, /* supports extended attributes */ 234 * @ancestor: possible ancestor of @cgrp
289}; 235 *
236 * Test whether @cgrp is a descendant of @ancestor. It also returns %true
237 * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
238 * and @ancestor are accessible.
239 */
240bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
241{
242 while (cgrp) {
243 if (cgrp == ancestor)
244 return true;
245 cgrp = cgrp->parent;
246 }
247 return false;
248}
249EXPORT_SYMBOL_GPL(cgroup_is_descendant);
290 250
291static int cgroup_is_releasable(const struct cgroup *cgrp) 251static int cgroup_is_releasable(const struct cgroup *cgrp)
292{ 252{
@@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
327 return __d_cfe(dentry)->type; 287 return __d_cfe(dentry)->type;
328} 288}
329 289
290/**
291 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
292 * @cgrp: the cgroup to be checked for liveness
293 *
294 * On success, returns true; the mutex should be later unlocked. On
295 * failure returns false with no lock held.
296 */
297static bool cgroup_lock_live_group(struct cgroup *cgrp)
298{
299 mutex_lock(&cgroup_mutex);
300 if (cgroup_is_removed(cgrp)) {
301 mutex_unlock(&cgroup_mutex);
302 return false;
303 }
304 return true;
305}
306
330/* the list of cgroups eligible for automatic release. Protected by 307/* the list of cgroups eligible for automatic release. Protected by
331 * release_list_lock */ 308 * release_list_lock */
332static LIST_HEAD(release_list); 309static LIST_HEAD(release_list);
@@ -800,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
800 * update of a tasks cgroup pointer by cgroup_attach_task() 777 * update of a tasks cgroup pointer by cgroup_attach_task()
801 */ 778 */
802 779
803/**
804 * cgroup_lock - lock out any changes to cgroup structures
805 *
806 */
807void cgroup_lock(void)
808{
809 mutex_lock(&cgroup_mutex);
810}
811EXPORT_SYMBOL_GPL(cgroup_lock);
812
813/**
814 * cgroup_unlock - release lock on cgroup changes
815 *
816 * Undo the lock taken in a previous cgroup_lock() call.
817 */
818void cgroup_unlock(void)
819{
820 mutex_unlock(&cgroup_mutex);
821}
822EXPORT_SYMBOL_GPL(cgroup_unlock);
823
824/* 780/*
825 * A couple of forward declarations required, due to cyclic reference loop: 781 * A couple of forward declarations required, due to cyclic reference loop:
826 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> 782 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
@@ -859,6 +815,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
859 return inode; 815 return inode;
860} 816}
861 817
818static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
819{
820 struct cgroup_name *name;
821
822 name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
823 if (!name)
824 return NULL;
825 strcpy(name->name, dentry->d_name.name);
826 return name;
827}
828
862static void cgroup_free_fn(struct work_struct *work) 829static void cgroup_free_fn(struct work_struct *work)
863{ 830{
864 struct cgroup *cgrp = container_of(work, struct cgroup, free_work); 831 struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
@@ -875,8 +842,18 @@ static void cgroup_free_fn(struct work_struct *work)
875 mutex_unlock(&cgroup_mutex); 842 mutex_unlock(&cgroup_mutex);
876 843
877 /* 844 /*
845 * We get a ref to the parent's dentry, and put the ref when
846 * this cgroup is being freed, so it's guaranteed that the
847 * parent won't be destroyed before its children.
848 */
849 dput(cgrp->parent->dentry);
850
851 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
852
853 /*
878 * Drop the active superblock reference that we took when we 854 * Drop the active superblock reference that we took when we
879 * created the cgroup 855 * created the cgroup. This will free cgrp->root, if we are
856 * holding the last reference to @sb.
880 */ 857 */
881 deactivate_super(cgrp->root->sb); 858 deactivate_super(cgrp->root->sb);
882 859
@@ -888,7 +865,7 @@ static void cgroup_free_fn(struct work_struct *work)
888 865
889 simple_xattrs_free(&cgrp->xattrs); 866 simple_xattrs_free(&cgrp->xattrs);
890 867
891 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); 868 kfree(rcu_dereference_raw(cgrp->name));
892 kfree(cgrp); 869 kfree(cgrp);
893} 870}
894 871
@@ -910,13 +887,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
910 } else { 887 } else {
911 struct cfent *cfe = __d_cfe(dentry); 888 struct cfent *cfe = __d_cfe(dentry);
912 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 889 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
913 struct cftype *cft = cfe->type;
914 890
915 WARN_ONCE(!list_empty(&cfe->node) && 891 WARN_ONCE(!list_empty(&cfe->node) &&
916 cgrp != &cgrp->root->top_cgroup, 892 cgrp != &cgrp->root->top_cgroup,
917 "cfe still linked for %s\n", cfe->type->name); 893 "cfe still linked for %s\n", cfe->type->name);
894 simple_xattrs_free(&cfe->xattrs);
918 kfree(cfe); 895 kfree(cfe);
919 simple_xattrs_free(&cft->xattrs);
920 } 896 }
921 iput(inode); 897 iput(inode);
922} 898}
@@ -1108,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1108 mutex_lock(&cgroup_root_mutex); 1084 mutex_lock(&cgroup_root_mutex);
1109 for_each_subsys(root, ss) 1085 for_each_subsys(root, ss)
1110 seq_printf(seq, ",%s", ss->name); 1086 seq_printf(seq, ",%s", ss->name);
1111 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1087 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1088 seq_puts(seq, ",sane_behavior");
1089 if (root->flags & CGRP_ROOT_NOPREFIX)
1112 seq_puts(seq, ",noprefix"); 1090 seq_puts(seq, ",noprefix");
1113 if (test_bit(ROOT_XATTR, &root->flags)) 1091 if (root->flags & CGRP_ROOT_XATTR)
1114 seq_puts(seq, ",xattr"); 1092 seq_puts(seq, ",xattr");
1115 if (strlen(root->release_agent_path)) 1093 if (strlen(root->release_agent_path))
1116 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1094 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
@@ -1172,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1172 all_ss = true; 1150 all_ss = true;
1173 continue; 1151 continue;
1174 } 1152 }
1153 if (!strcmp(token, "__DEVEL__sane_behavior")) {
1154 opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1155 continue;
1156 }
1175 if (!strcmp(token, "noprefix")) { 1157 if (!strcmp(token, "noprefix")) {
1176 set_bit(ROOT_NOPREFIX, &opts->flags); 1158 opts->flags |= CGRP_ROOT_NOPREFIX;
1177 continue; 1159 continue;
1178 } 1160 }
1179 if (!strcmp(token, "clone_children")) { 1161 if (!strcmp(token, "clone_children")) {
@@ -1181,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1181 continue; 1163 continue;
1182 } 1164 }
1183 if (!strcmp(token, "xattr")) { 1165 if (!strcmp(token, "xattr")) {
1184 set_bit(ROOT_XATTR, &opts->flags); 1166 opts->flags |= CGRP_ROOT_XATTR;
1185 continue; 1167 continue;
1186 } 1168 }
1187 if (!strncmp(token, "release_agent=", 14)) { 1169 if (!strncmp(token, "release_agent=", 14)) {
@@ -1259,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1259 1241
1260 /* Consistency checks */ 1242 /* Consistency checks */
1261 1243
1244 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1245 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1246
1247 if (opts->flags & CGRP_ROOT_NOPREFIX) {
1248 pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
1249 return -EINVAL;
1250 }
1251
1252 if (opts->cpuset_clone_children) {
1253 pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
1254 return -EINVAL;
1255 }
1256 }
1257
1262 /* 1258 /*
1263 * Option noprefix was introduced just for backward compatibility 1259 * Option noprefix was introduced just for backward compatibility
1264 * with the old cpuset, so we allow noprefix only if mounting just 1260 * with the old cpuset, so we allow noprefix only if mounting just
1265 * the cpuset subsystem. 1261 * the cpuset subsystem.
1266 */ 1262 */
1267 if (test_bit(ROOT_NOPREFIX, &opts->flags) && 1263 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1268 (opts->subsys_mask & mask))
1269 return -EINVAL; 1264 return -EINVAL;
1270 1265
1271 1266
@@ -1336,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1336 struct cgroup_sb_opts opts; 1331 struct cgroup_sb_opts opts;
1337 unsigned long added_mask, removed_mask; 1332 unsigned long added_mask, removed_mask;
1338 1333
1334 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1335 pr_err("cgroup: sane_behavior: remount is not allowed\n");
1336 return -EINVAL;
1337 }
1338
1339 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1339 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1340 mutex_lock(&cgroup_mutex); 1340 mutex_lock(&cgroup_mutex);
1341 mutex_lock(&cgroup_root_mutex); 1341 mutex_lock(&cgroup_root_mutex);
@@ -1421,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1421 INIT_LIST_HEAD(&root->allcg_list); 1421 INIT_LIST_HEAD(&root->allcg_list);
1422 root->number_of_cgroups = 1; 1422 root->number_of_cgroups = 1;
1423 cgrp->root = root; 1423 cgrp->root = root;
1424 cgrp->top_cgroup = cgrp; 1424 cgrp->name = &root_cgroup_name;
1425 init_cgroup_housekeeping(cgrp); 1425 init_cgroup_housekeeping(cgrp);
1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1427} 1427}
@@ -1685,6 +1685,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1685 * any) is not needed 1685 * any) is not needed
1686 */ 1686 */
1687 cgroup_drop_root(opts.new_root); 1687 cgroup_drop_root(opts.new_root);
1688
1689 if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) &&
1690 root->flags != opts.flags) {
1691 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1692 ret = -EINVAL;
1693 goto drop_new_super;
1694 }
1695
1688 /* no subsys rebinding, so refcounts don't change */ 1696 /* no subsys rebinding, so refcounts don't change */
1689 drop_parsed_module_refcounts(opts.subsys_mask); 1697 drop_parsed_module_refcounts(opts.subsys_mask);
1690 } 1698 }
@@ -1769,49 +1777,48 @@ static struct kobject *cgroup_kobj;
1769 * @buf: the buffer to write the path into 1777 * @buf: the buffer to write the path into
1770 * @buflen: the length of the buffer 1778 * @buflen: the length of the buffer
1771 * 1779 *
1772 * Called with cgroup_mutex held or else with an RCU-protected cgroup 1780 * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
1773 * reference. Writes path of cgroup into buf. Returns 0 on success, 1781 *
1774 * -errno on error. 1782 * We can't generate cgroup path using dentry->d_name, as accessing
1783 * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
1784 * inode's i_mutex, while on the other hand cgroup_path() can be called
1785 * with some irq-safe spinlocks held.
1775 */ 1786 */
1776int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1787int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1777{ 1788{
1778 struct dentry *dentry = cgrp->dentry; 1789 int ret = -ENAMETOOLONG;
1779 char *start; 1790 char *start;
1780 1791
1781 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), 1792 if (!cgrp->parent) {
1782 "cgroup_path() called without proper locking"); 1793 if (strlcpy(buf, "/", buflen) >= buflen)
1783 1794 return -ENAMETOOLONG;
1784 if (cgrp == dummytop) {
1785 /*
1786 * Inactive subsystems have no dentry for their root
1787 * cgroup
1788 */
1789 strcpy(buf, "/");
1790 return 0; 1795 return 0;
1791 } 1796 }
1792 1797
1793 start = buf + buflen - 1; 1798 start = buf + buflen - 1;
1794
1795 *start = '\0'; 1799 *start = '\0';
1796 for (;;) {
1797 int len = dentry->d_name.len;
1798 1800
1801 rcu_read_lock();
1802 do {
1803 const char *name = cgroup_name(cgrp);
1804 int len;
1805
1806 len = strlen(name);
1799 if ((start -= len) < buf) 1807 if ((start -= len) < buf)
1800 return -ENAMETOOLONG; 1808 goto out;
1801 memcpy(start, dentry->d_name.name, len); 1809 memcpy(start, name, len);
1802 cgrp = cgrp->parent;
1803 if (!cgrp)
1804 break;
1805 1810
1806 dentry = cgrp->dentry;
1807 if (!cgrp->parent)
1808 continue;
1809 if (--start < buf) 1811 if (--start < buf)
1810 return -ENAMETOOLONG; 1812 goto out;
1811 *start = '/'; 1813 *start = '/';
1812 } 1814
1815 cgrp = cgrp->parent;
1816 } while (cgrp->parent);
1817 ret = 0;
1813 memmove(buf, start, buf + buflen - start); 1818 memmove(buf, start, buf + buflen - start);
1814 return 0; 1819out:
1820 rcu_read_unlock();
1821 return ret;
1815} 1822}
1816EXPORT_SYMBOL_GPL(cgroup_path); 1823EXPORT_SYMBOL_GPL(cgroup_path);
1817 1824
@@ -1900,7 +1907,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1900 * 1907 *
1901 * Must be called with cgroup_mutex and threadgroup locked. 1908 * Must be called with cgroup_mutex and threadgroup locked.
1902 */ 1909 */
1903static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1910static void cgroup_task_migrate(struct cgroup *oldcgrp,
1904 struct task_struct *tsk, struct css_set *newcg) 1911 struct task_struct *tsk, struct css_set *newcg)
1905{ 1912{
1906 struct css_set *oldcg; 1913 struct css_set *oldcg;
@@ -1933,121 +1940,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1933} 1940}
1934 1941
1935/** 1942/**
1936 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1943 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
1937 * @cgrp: the cgroup the task is attaching to
1938 * @tsk: the task to be attached
1939 *
1940 * Call with cgroup_mutex and threadgroup locked. May take task_lock of
1941 * @tsk during call.
1942 */
1943int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1944{
1945 int retval = 0;
1946 struct cgroup_subsys *ss, *failed_ss = NULL;
1947 struct cgroup *oldcgrp;
1948 struct cgroupfs_root *root = cgrp->root;
1949 struct cgroup_taskset tset = { };
1950 struct css_set *newcg;
1951
1952 /* @tsk either already exited or can't exit until the end */
1953 if (tsk->flags & PF_EXITING)
1954 return -ESRCH;
1955
1956 /* Nothing to do if the task is already in that cgroup */
1957 oldcgrp = task_cgroup_from_root(tsk, root);
1958 if (cgrp == oldcgrp)
1959 return 0;
1960
1961 tset.single.task = tsk;
1962 tset.single.cgrp = oldcgrp;
1963
1964 for_each_subsys(root, ss) {
1965 if (ss->can_attach) {
1966 retval = ss->can_attach(cgrp, &tset);
1967 if (retval) {
1968 /*
1969 * Remember on which subsystem the can_attach()
1970 * failed, so that we only call cancel_attach()
1971 * against the subsystems whose can_attach()
1972 * succeeded. (See below)
1973 */
1974 failed_ss = ss;
1975 goto out;
1976 }
1977 }
1978 }
1979
1980 newcg = find_css_set(tsk->cgroups, cgrp);
1981 if (!newcg) {
1982 retval = -ENOMEM;
1983 goto out;
1984 }
1985
1986 cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
1987
1988 for_each_subsys(root, ss) {
1989 if (ss->attach)
1990 ss->attach(cgrp, &tset);
1991 }
1992
1993out:
1994 if (retval) {
1995 for_each_subsys(root, ss) {
1996 if (ss == failed_ss)
1997 /*
1998 * This subsystem was the one that failed the
1999 * can_attach() check earlier, so we don't need
2000 * to call cancel_attach() against it or any
2001 * remaining subsystems.
2002 */
2003 break;
2004 if (ss->cancel_attach)
2005 ss->cancel_attach(cgrp, &tset);
2006 }
2007 }
2008 return retval;
2009}
2010
2011/**
2012 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2013 * @from: attach to all cgroups of a given task
2014 * @tsk: the task to be attached
2015 */
2016int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2017{
2018 struct cgroupfs_root *root;
2019 int retval = 0;
2020
2021 cgroup_lock();
2022 for_each_active_root(root) {
2023 struct cgroup *from_cg = task_cgroup_from_root(from, root);
2024
2025 retval = cgroup_attach_task(from_cg, tsk);
2026 if (retval)
2027 break;
2028 }
2029 cgroup_unlock();
2030
2031 return retval;
2032}
2033EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2034
2035/**
2036 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
2037 * @cgrp: the cgroup to attach to 1944 * @cgrp: the cgroup to attach to
2038 * @leader: the threadgroup leader task_struct of the group to be attached 1945 * @tsk: the task or the leader of the threadgroup to be attached
1946 * @threadgroup: attach the whole threadgroup?
2039 * 1947 *
2040 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1948 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
2041 * task_lock of each thread in leader's threadgroup individually in turn. 1949 * task_lock of @tsk or each thread in the threadgroup individually in turn.
2042 */ 1950 */
2043static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 1951static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
1952 bool threadgroup)
2044{ 1953{
2045 int retval, i, group_size; 1954 int retval, i, group_size;
2046 struct cgroup_subsys *ss, *failed_ss = NULL; 1955 struct cgroup_subsys *ss, *failed_ss = NULL;
2047 /* guaranteed to be initialized later, but the compiler needs this */
2048 struct cgroupfs_root *root = cgrp->root; 1956 struct cgroupfs_root *root = cgrp->root;
2049 /* threadgroup list cursor and array */ 1957 /* threadgroup list cursor and array */
2050 struct task_struct *tsk; 1958 struct task_struct *leader = tsk;
2051 struct task_and_cgroup *tc; 1959 struct task_and_cgroup *tc;
2052 struct flex_array *group; 1960 struct flex_array *group;
2053 struct cgroup_taskset tset = { }; 1961 struct cgroup_taskset tset = { };
@@ -2059,17 +1967,19 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2059 * group - group_rwsem prevents new threads from appearing, and if 1967 * group - group_rwsem prevents new threads from appearing, and if
2060 * threads exit, this will just be an over-estimate. 1968 * threads exit, this will just be an over-estimate.
2061 */ 1969 */
2062 group_size = get_nr_threads(leader); 1970 if (threadgroup)
1971 group_size = get_nr_threads(tsk);
1972 else
1973 group_size = 1;
2063 /* flex_array supports very large thread-groups better than kmalloc. */ 1974 /* flex_array supports very large thread-groups better than kmalloc. */
2064 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 1975 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
2065 if (!group) 1976 if (!group)
2066 return -ENOMEM; 1977 return -ENOMEM;
2067 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 1978 /* pre-allocate to guarantee space while iterating in rcu read-side. */
2068 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); 1979 retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
2069 if (retval) 1980 if (retval)
2070 goto out_free_group_list; 1981 goto out_free_group_list;
2071 1982
2072 tsk = leader;
2073 i = 0; 1983 i = 0;
2074 /* 1984 /*
2075 * Prevent freeing of tasks while we take a snapshot. Tasks that are 1985 * Prevent freeing of tasks while we take a snapshot. Tasks that are
@@ -2098,6 +2008,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2098 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2008 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2099 BUG_ON(retval != 0); 2009 BUG_ON(retval != 0);
2100 i++; 2010 i++;
2011
2012 if (!threadgroup)
2013 break;
2101 } while_each_thread(leader, tsk); 2014 } while_each_thread(leader, tsk);
2102 rcu_read_unlock(); 2015 rcu_read_unlock();
2103 /* remember the number of threads in the array for later. */ 2016 /* remember the number of threads in the array for later. */
@@ -2143,7 +2056,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2143 */ 2056 */
2144 for (i = 0; i < group_size; i++) { 2057 for (i = 0; i < group_size; i++) {
2145 tc = flex_array_get(group, i); 2058 tc = flex_array_get(group, i);
2146 cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); 2059 cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
2147 } 2060 }
2148 /* nothing is sensitive to fork() after this point. */ 2061 /* nothing is sensitive to fork() after this point. */
2149 2062
@@ -2224,11 +2137,11 @@ retry_find_task:
2224 tsk = tsk->group_leader; 2137 tsk = tsk->group_leader;
2225 2138
2226 /* 2139 /*
2227 * Workqueue threads may acquire PF_THREAD_BOUND and become 2140 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2228 * trapped in a cpuset, or RT worker may be born in a cgroup 2141 * trapped in a cpuset, or RT worker may be born in a cgroup
2229 * with no rt_runtime allocated. Just say no. 2142 * with no rt_runtime allocated. Just say no.
2230 */ 2143 */
2231 if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { 2144 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2232 ret = -EINVAL; 2145 ret = -EINVAL;
2233 rcu_read_unlock(); 2146 rcu_read_unlock();
2234 goto out_unlock_cgroup; 2147 goto out_unlock_cgroup;
@@ -2251,17 +2164,42 @@ retry_find_task:
2251 put_task_struct(tsk); 2164 put_task_struct(tsk);
2252 goto retry_find_task; 2165 goto retry_find_task;
2253 } 2166 }
2254 ret = cgroup_attach_proc(cgrp, tsk); 2167 }
2255 } else 2168
2256 ret = cgroup_attach_task(cgrp, tsk); 2169 ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2170
2257 threadgroup_unlock(tsk); 2171 threadgroup_unlock(tsk);
2258 2172
2259 put_task_struct(tsk); 2173 put_task_struct(tsk);
2260out_unlock_cgroup: 2174out_unlock_cgroup:
2261 cgroup_unlock(); 2175 mutex_unlock(&cgroup_mutex);
2262 return ret; 2176 return ret;
2263} 2177}
2264 2178
2179/**
2180 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2181 * @from: attach to all cgroups of a given task
2182 * @tsk: the task to be attached
2183 */
2184int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2185{
2186 struct cgroupfs_root *root;
2187 int retval = 0;
2188
2189 mutex_lock(&cgroup_mutex);
2190 for_each_active_root(root) {
2191 struct cgroup *from_cg = task_cgroup_from_root(from, root);
2192
2193 retval = cgroup_attach_task(from_cg, tsk, false);
2194 if (retval)
2195 break;
2196 }
2197 mutex_unlock(&cgroup_mutex);
2198
2199 return retval;
2200}
2201EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2202
2265static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2203static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2266{ 2204{
2267 return attach_task_by_pid(cgrp, pid, false); 2205 return attach_task_by_pid(cgrp, pid, false);
@@ -2272,24 +2210,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2272 return attach_task_by_pid(cgrp, tgid, true); 2210 return attach_task_by_pid(cgrp, tgid, true);
2273} 2211}
2274 2212
2275/**
2276 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
2277 * @cgrp: the cgroup to be checked for liveness
2278 *
2279 * On success, returns true; the lock should be later released with
2280 * cgroup_unlock(). On failure returns false with no lock held.
2281 */
2282bool cgroup_lock_live_group(struct cgroup *cgrp)
2283{
2284 mutex_lock(&cgroup_mutex);
2285 if (cgroup_is_removed(cgrp)) {
2286 mutex_unlock(&cgroup_mutex);
2287 return false;
2288 }
2289 return true;
2290}
2291EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
2292
2293static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2213static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2294 const char *buffer) 2214 const char *buffer)
2295{ 2215{
@@ -2301,7 +2221,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2301 mutex_lock(&cgroup_root_mutex); 2221 mutex_lock(&cgroup_root_mutex);
2302 strcpy(cgrp->root->release_agent_path, buffer); 2222 strcpy(cgrp->root->release_agent_path, buffer);
2303 mutex_unlock(&cgroup_root_mutex); 2223 mutex_unlock(&cgroup_root_mutex);
2304 cgroup_unlock(); 2224 mutex_unlock(&cgroup_mutex);
2305 return 0; 2225 return 0;
2306} 2226}
2307 2227
@@ -2312,7 +2232,14 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2312 return -ENODEV; 2232 return -ENODEV;
2313 seq_puts(seq, cgrp->root->release_agent_path); 2233 seq_puts(seq, cgrp->root->release_agent_path);
2314 seq_putc(seq, '\n'); 2234 seq_putc(seq, '\n');
2315 cgroup_unlock(); 2235 mutex_unlock(&cgroup_mutex);
2236 return 0;
2237}
2238
2239static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
2240 struct seq_file *seq)
2241{
2242 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2316 return 0; 2243 return 0;
2317} 2244}
2318 2245
@@ -2537,13 +2464,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
2537static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2464static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2538 struct inode *new_dir, struct dentry *new_dentry) 2465 struct inode *new_dir, struct dentry *new_dentry)
2539{ 2466{
2467 int ret;
2468 struct cgroup_name *name, *old_name;
2469 struct cgroup *cgrp;
2470
2471 /*
2472 * It's convinient to use parent dir's i_mutex to protected
2473 * cgrp->name.
2474 */
2475 lockdep_assert_held(&old_dir->i_mutex);
2476
2540 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2477 if (!S_ISDIR(old_dentry->d_inode->i_mode))
2541 return -ENOTDIR; 2478 return -ENOTDIR;
2542 if (new_dentry->d_inode) 2479 if (new_dentry->d_inode)
2543 return -EEXIST; 2480 return -EEXIST;
2544 if (old_dir != new_dir) 2481 if (old_dir != new_dir)
2545 return -EIO; 2482 return -EIO;
2546 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2483
2484 cgrp = __d_cgrp(old_dentry);
2485
2486 name = cgroup_alloc_name(new_dentry);
2487 if (!name)
2488 return -ENOMEM;
2489
2490 ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2491 if (ret) {
2492 kfree(name);
2493 return ret;
2494 }
2495
2496 old_name = cgrp->name;
2497 rcu_assign_pointer(cgrp->name, name);
2498
2499 kfree_rcu(old_name, rcu_head);
2500 return 0;
2547} 2501}
2548 2502
2549static struct simple_xattrs *__d_xattrs(struct dentry *dentry) 2503static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
@@ -2551,13 +2505,13 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2551 if (S_ISDIR(dentry->d_inode->i_mode)) 2505 if (S_ISDIR(dentry->d_inode->i_mode))
2552 return &__d_cgrp(dentry)->xattrs; 2506 return &__d_cgrp(dentry)->xattrs;
2553 else 2507 else
2554 return &__d_cft(dentry)->xattrs; 2508 return &__d_cfe(dentry)->xattrs;
2555} 2509}
2556 2510
2557static inline int xattr_enabled(struct dentry *dentry) 2511static inline int xattr_enabled(struct dentry *dentry)
2558{ 2512{
2559 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 2513 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2560 return test_bit(ROOT_XATTR, &root->flags); 2514 return root->flags & CGRP_ROOT_XATTR;
2561} 2515}
2562 2516
2563static bool is_valid_xattr(const char *name) 2517static bool is_valid_xattr(const char *name)
@@ -2727,9 +2681,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2727 umode_t mode; 2681 umode_t mode;
2728 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2682 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2729 2683
2730 simple_xattrs_init(&cft->xattrs); 2684 if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2731
2732 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2733 strcpy(name, subsys->name); 2685 strcpy(name, subsys->name);
2734 strcat(name, "."); 2686 strcat(name, ".");
2735 } 2687 }
@@ -2753,6 +2705,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2753 cfe->type = (void *)cft; 2705 cfe->type = (void *)cft;
2754 cfe->dentry = dentry; 2706 cfe->dentry = dentry;
2755 dentry->d_fsdata = cfe; 2707 dentry->d_fsdata = cfe;
2708 simple_xattrs_init(&cfe->xattrs);
2756 list_add_tail(&cfe->node, &parent->files); 2709 list_add_tail(&cfe->node, &parent->files);
2757 cfe = NULL; 2710 cfe = NULL;
2758 } 2711 }
@@ -2770,6 +2723,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2770 2723
2771 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2724 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2772 /* does cft->flags tell us to skip this file on @cgrp? */ 2725 /* does cft->flags tell us to skip this file on @cgrp? */
2726 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2727 continue;
2773 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2728 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2774 continue; 2729 continue;
2775 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2730 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
@@ -3300,6 +3255,34 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3300 return 0; 3255 return 0;
3301} 3256}
3302 3257
3258static void cgroup_transfer_one_task(struct task_struct *task,
3259 struct cgroup_scanner *scan)
3260{
3261 struct cgroup *new_cgroup = scan->data;
3262
3263 mutex_lock(&cgroup_mutex);
3264 cgroup_attach_task(new_cgroup, task, false);
3265 mutex_unlock(&cgroup_mutex);
3266}
3267
3268/**
3269 * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
3270 * @to: cgroup to which the tasks will be moved
3271 * @from: cgroup in which the tasks currently reside
3272 */
3273int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3274{
3275 struct cgroup_scanner scan;
3276
3277 scan.cg = from;
3278 scan.test_task = NULL; /* select all tasks in cgroup */
3279 scan.process_task = cgroup_transfer_one_task;
3280 scan.heap = NULL;
3281 scan.data = to;
3282
3283 return cgroup_scan_tasks(&scan);
3284}
3285
3303/* 3286/*
3304 * Stuff for reading the 'tasks'/'procs' files. 3287 * Stuff for reading the 'tasks'/'procs' files.
3305 * 3288 *
@@ -3362,35 +3345,14 @@ static void pidlist_free(void *p)
3362 else 3345 else
3363 kfree(p); 3346 kfree(p);
3364} 3347}
3365static void *pidlist_resize(void *p, int newcount)
3366{
3367 void *newlist;
3368 /* note: if new alloc fails, old p will still be valid either way */
3369 if (is_vmalloc_addr(p)) {
3370 newlist = vmalloc(newcount * sizeof(pid_t));
3371 if (!newlist)
3372 return NULL;
3373 memcpy(newlist, p, newcount * sizeof(pid_t));
3374 vfree(p);
3375 } else {
3376 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
3377 }
3378 return newlist;
3379}
3380 3348
3381/* 3349/*
3382 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3350 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3383 * If the new stripped list is sufficiently smaller and there's enough memory 3351 * Returns the number of unique elements.
3384 * to allocate a new buffer, will let go of the unneeded memory. Returns the
3385 * number of unique elements.
3386 */ 3352 */
3387/* is the size difference enough that we should re-allocate the array? */ 3353static int pidlist_uniq(pid_t *list, int length)
3388#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
3389static int pidlist_uniq(pid_t **p, int length)
3390{ 3354{
3391 int src, dest = 1; 3355 int src, dest = 1;
3392 pid_t *list = *p;
3393 pid_t *newlist;
3394 3356
3395 /* 3357 /*
3396 * we presume the 0th element is unique, so i starts at 1. trivial 3358 * we presume the 0th element is unique, so i starts at 1. trivial
@@ -3411,16 +3373,6 @@ static int pidlist_uniq(pid_t **p, int length)
3411 dest++; 3373 dest++;
3412 } 3374 }
3413after: 3375after:
3414 /*
3415 * if the length difference is large enough, we want to allocate a
3416 * smaller buffer to save memory. if this fails due to out of memory,
3417 * we'll just stay with what we've got.
3418 */
3419 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
3420 newlist = pidlist_resize(list, dest);
3421 if (newlist)
3422 *p = newlist;
3423 }
3424 return dest; 3376 return dest;
3425} 3377}
3426 3378
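With the realloc-on-shrink logic gone, pidlist_uniq() is a plain in-place de-duplication pass over an already-sorted array that returns the new length. A self-contained sketch of the same walk, assuming an int array and an illustrative helper name dedup_sorted():

#include <stdio.h>

/* Strip duplicates from a sorted array in place; returns the new length. */
static int dedup_sorted(int *list, int length)
{
	int src, dest = 1;

	if (length == 0)
		return 0;
	/* the 0th element is unique by definition */
	for (src = 1; src < length; src++) {
		if (list[src] == list[src - 1])
			continue;
		list[dest++] = list[src];
	}
	return dest;
}

int main(void)
{
	int pids[] = { 3, 3, 7, 7, 7, 12, 19, 19 };
	int n = dedup_sorted(pids, 8);

	for (int i = 0; i < n; i++)
		printf("%d\n", pids[i]);	/* prints 3 7 12 19 */
	return 0;
}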
@@ -3516,7 +3468,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3516 /* now sort & (if procs) strip out duplicates */ 3468 /* now sort & (if procs) strip out duplicates */
3517 sort(array, length, sizeof(pid_t), cmppid, NULL); 3469 sort(array, length, sizeof(pid_t), cmppid, NULL);
3518 if (type == CGROUP_FILE_PROCS) 3470 if (type == CGROUP_FILE_PROCS)
3519 length = pidlist_uniq(&array, length); 3471 length = pidlist_uniq(array, length);
3520 l = cgroup_pidlist_find(cgrp, type); 3472 l = cgroup_pidlist_find(cgrp, type);
3521 if (!l) { 3473 if (!l) {
3522 pidlist_free(array); 3474 pidlist_free(array);
@@ -3930,11 +3882,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3930 if (ret) 3882 if (ret)
3931 goto fail; 3883 goto fail;
3932 3884
3933 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { 3885 efile->f_op->poll(efile, &event->pt);
3934 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3935 ret = 0;
3936 goto fail;
3937 }
3938 3886
3939 /* 3887 /*
3940 * Events should be removed after rmdir of cgroup directory, but before 3888 * Events should be removed after rmdir of cgroup directory, but before
@@ -4016,10 +3964,16 @@ static struct cftype files[] = {
4016 }, 3964 },
4017 { 3965 {
4018 .name = "cgroup.clone_children", 3966 .name = "cgroup.clone_children",
3967 .flags = CFTYPE_INSANE,
4019 .read_u64 = cgroup_clone_children_read, 3968 .read_u64 = cgroup_clone_children_read,
4020 .write_u64 = cgroup_clone_children_write, 3969 .write_u64 = cgroup_clone_children_write,
4021 }, 3970 },
4022 { 3971 {
3972 .name = "cgroup.sane_behavior",
3973 .flags = CFTYPE_ONLY_ON_ROOT,
3974 .read_seq_string = cgroup_sane_behavior_show,
3975 },
3976 {
4023 .name = "release_agent", 3977 .name = "release_agent",
4024 .flags = CFTYPE_ONLY_ON_ROOT, 3978 .flags = CFTYPE_ONLY_ON_ROOT,
4025 .read_seq_string = cgroup_release_agent_show, 3979 .read_seq_string = cgroup_release_agent_show,
@@ -4131,17 +4085,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4131 if (!(css->flags & CSS_ONLINE)) 4085 if (!(css->flags & CSS_ONLINE))
4132 return; 4086 return;
4133 4087
4134 /* 4088 if (ss->css_offline)
4135 * css_offline() should be called with cgroup_mutex unlocked. See
4136 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
4137 * details. This temporary unlocking should go away once
4138 * cgroup_mutex is unexported from controllers.
4139 */
4140 if (ss->css_offline) {
4141 mutex_unlock(&cgroup_mutex);
4142 ss->css_offline(cgrp); 4089 ss->css_offline(cgrp);
4143 mutex_lock(&cgroup_mutex);
4144 }
4145 4090
4146 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4091 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
4147} 4092}
@@ -4158,6 +4103,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4158 umode_t mode) 4103 umode_t mode)
4159{ 4104{
4160 struct cgroup *cgrp; 4105 struct cgroup *cgrp;
4106 struct cgroup_name *name;
4161 struct cgroupfs_root *root = parent->root; 4107 struct cgroupfs_root *root = parent->root;
4162 int err = 0; 4108 int err = 0;
4163 struct cgroup_subsys *ss; 4109 struct cgroup_subsys *ss;
@@ -4168,9 +4114,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4168 if (!cgrp) 4114 if (!cgrp)
4169 return -ENOMEM; 4115 return -ENOMEM;
4170 4116
4117 name = cgroup_alloc_name(dentry);
4118 if (!name)
4119 goto err_free_cgrp;
4120 rcu_assign_pointer(cgrp->name, name);
4121
4171 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4122 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4172 if (cgrp->id < 0) 4123 if (cgrp->id < 0)
4173 goto err_free_cgrp; 4124 goto err_free_name;
4174 4125
4175 /* 4126 /*
4176 * Only live parents can have children. Note that the liveliness 4127 * Only live parents can have children. Note that the liveliness
@@ -4198,7 +4149,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4198 4149
4199 cgrp->parent = parent; 4150 cgrp->parent = parent;
4200 cgrp->root = parent->root; 4151 cgrp->root = parent->root;
4201 cgrp->top_cgroup = parent->top_cgroup;
4202 4152
4203 if (notify_on_release(parent)) 4153 if (notify_on_release(parent))
4204 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4154 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4241,6 +4191,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4241 for_each_subsys(root, ss) 4191 for_each_subsys(root, ss)
4242 dget(dentry); 4192 dget(dentry);
4243 4193
4194 /* hold a ref to the parent's dentry */
4195 dget(parent->dentry);
4196
4244 /* creation succeeded, notify subsystems */ 4197 /* creation succeeded, notify subsystems */
4245 for_each_subsys(root, ss) { 4198 for_each_subsys(root, ss) {
4246 err = online_css(ss, cgrp); 4199 err = online_css(ss, cgrp);
@@ -4276,6 +4229,8 @@ err_free_all:
4276 deactivate_super(sb); 4229 deactivate_super(sb);
4277err_free_id: 4230err_free_id:
4278 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4231 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4232err_free_name:
4233 kfree(rcu_dereference_raw(cgrp->name));
4279err_free_cgrp: 4234err_free_cgrp:
4280 kfree(cgrp); 4235 kfree(cgrp);
4281 return err; 4236 return err;
@@ -4295,56 +4250,13 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4295 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4250 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4296} 4251}
4297 4252
4298/*
4299 * Check the reference count on each subsystem. Since we already
4300 * established that there are no tasks in the cgroup, if the css refcount
4301 * is also 1, then there should be no outstanding references, so the
4302 * subsystem is safe to destroy. We scan across all subsystems rather than
4303 * using the per-hierarchy linked list of mounted subsystems since we can
4304 * be called via check_for_release() with no synchronization other than
4305 * RCU, and the subsystem linked list isn't RCU-safe.
4306 */
4307static int cgroup_has_css_refs(struct cgroup *cgrp)
4308{
4309 int i;
4310
4311 /*
4312 * We won't need to lock the subsys array, because the subsystems
4313 * we're concerned about aren't going anywhere since our cgroup root
4314 * has a reference on them.
4315 */
4316 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4317 struct cgroup_subsys *ss = subsys[i];
4318 struct cgroup_subsys_state *css;
4319
4320 /* Skip subsystems not present or not in this hierarchy */
4321 if (ss == NULL || ss->root != cgrp->root)
4322 continue;
4323
4324 css = cgrp->subsys[ss->subsys_id];
4325 /*
4326 * When called from check_for_release() it's possible
4327 * that by this point the cgroup has been removed
4328 * and the css deleted. But a false-positive doesn't
4329 * matter, since it can only happen if the cgroup
4330 * has been deleted and hence no longer needs the
4331 * release agent to be called anyway.
4332 */
4333 if (css && css_refcnt(css) > 1)
4334 return 1;
4335 }
4336 return 0;
4337}
4338
4339static int cgroup_destroy_locked(struct cgroup *cgrp) 4253static int cgroup_destroy_locked(struct cgroup *cgrp)
4340 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4254 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4341{ 4255{
4342 struct dentry *d = cgrp->dentry; 4256 struct dentry *d = cgrp->dentry;
4343 struct cgroup *parent = cgrp->parent; 4257 struct cgroup *parent = cgrp->parent;
4344 DEFINE_WAIT(wait);
4345 struct cgroup_event *event, *tmp; 4258 struct cgroup_event *event, *tmp;
4346 struct cgroup_subsys *ss; 4259 struct cgroup_subsys *ss;
4347 LIST_HEAD(tmp_list);
4348 4260
4349 lockdep_assert_held(&d->d_inode->i_mutex); 4261 lockdep_assert_held(&d->d_inode->i_mutex);
4350 lockdep_assert_held(&cgroup_mutex); 4262 lockdep_assert_held(&cgroup_mutex);
@@ -4468,7 +4380,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4468 * need to invoke fork callbacks here. */ 4380 * need to invoke fork callbacks here. */
4469 BUG_ON(!list_empty(&init_task.tasks)); 4381 BUG_ON(!list_empty(&init_task.tasks));
4470 4382
4471 ss->active = 1;
4472 BUG_ON(online_css(ss, dummytop)); 4383 BUG_ON(online_css(ss, dummytop));
4473 4384
4474 mutex_unlock(&cgroup_mutex); 4385 mutex_unlock(&cgroup_mutex);
@@ -4573,7 +4484,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4573 } 4484 }
4574 write_unlock(&css_set_lock); 4485 write_unlock(&css_set_lock);
4575 4486
4576 ss->active = 1;
4577 ret = online_css(ss, dummytop); 4487 ret = online_css(ss, dummytop);
4578 if (ret) 4488 if (ret)
4579 goto err_unload; 4489 goto err_unload;
@@ -4614,7 +4524,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4614 mutex_lock(&cgroup_mutex); 4524 mutex_lock(&cgroup_mutex);
4615 4525
4616 offline_css(ss, dummytop); 4526 offline_css(ss, dummytop);
4617 ss->active = 0;
4618 4527
4619 if (ss->use_id) 4528 if (ss->use_id)
4620 idr_destroy(&ss->idr); 4529 idr_destroy(&ss->idr);
@@ -4769,7 +4678,7 @@ out:
4769 */ 4678 */
4770 4679
4771/* TODO: Use a proper seq_file iterator */ 4680/* TODO: Use a proper seq_file iterator */
4772static int proc_cgroup_show(struct seq_file *m, void *v) 4681int proc_cgroup_show(struct seq_file *m, void *v)
4773{ 4682{
4774 struct pid *pid; 4683 struct pid *pid;
4775 struct task_struct *tsk; 4684 struct task_struct *tsk;
@@ -4821,19 +4730,6 @@ out:
4821 return retval; 4730 return retval;
4822} 4731}
4823 4732
4824static int cgroup_open(struct inode *inode, struct file *file)
4825{
4826 struct pid *pid = PROC_I(inode)->pid;
4827 return single_open(file, proc_cgroup_show, pid);
4828}
4829
4830const struct file_operations proc_cgroup_operations = {
4831 .open = cgroup_open,
4832 .read = seq_read,
4833 .llseek = seq_lseek,
4834 .release = single_release,
4835};
4836
4837/* Display information about each subsystem and each hierarchy */ 4733/* Display information about each subsystem and each hierarchy */
4838static int proc_cgroupstats_show(struct seq_file *m, void *v) 4734static int proc_cgroupstats_show(struct seq_file *m, void *v)
4839{ 4735{
@@ -4935,17 +4831,17 @@ void cgroup_post_fork(struct task_struct *child)
4935 * and addition to css_set. 4831 * and addition to css_set.
4936 */ 4832 */
4937 if (need_forkexit_callback) { 4833 if (need_forkexit_callback) {
4938 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4834 /*
4835 * fork/exit callbacks are supported only for builtin
4836 * subsystems, and the builtin section of the subsys
4837 * array is immutable, so we don't need to lock the
4838 * subsys array here. On the other hand, modular section
4839 * of the array can be freed at module unload, so we
4840 * can't touch that.
4841 */
4842 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4939 struct cgroup_subsys *ss = subsys[i]; 4843 struct cgroup_subsys *ss = subsys[i];
4940 4844
4941 /*
4942 * fork/exit callbacks are supported only for
4943 * builtin subsystems and we don't need further
4944 * synchronization as they never go away.
4945 */
4946 if (!ss || ss->module)
4947 continue;
4948
4949 if (ss->fork) 4845 if (ss->fork)
4950 ss->fork(child); 4846 ss->fork(child);
4951 } 4847 }
@@ -5010,13 +4906,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5010 tsk->cgroups = &init_css_set; 4906 tsk->cgroups = &init_css_set;
5011 4907
5012 if (run_callbacks && need_forkexit_callback) { 4908 if (run_callbacks && need_forkexit_callback) {
5013 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4909 /*
4910 * fork/exit callbacks are supported only for builtin
4911 * subsystems, see cgroup_post_fork() for details.
4912 */
4913 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
5014 struct cgroup_subsys *ss = subsys[i]; 4914 struct cgroup_subsys *ss = subsys[i];
5015 4915
5016 /* modular subsystems can't use callbacks */
5017 if (!ss || ss->module)
5018 continue;
5019
5020 if (ss->exit) { 4916 if (ss->exit) {
5021 struct cgroup *old_cgrp = 4917 struct cgroup *old_cgrp =
5022 rcu_dereference_raw(cg->subsys[i])->cgroup; 4918 rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -5030,44 +4926,19 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5030 put_css_set_taskexit(cg); 4926 put_css_set_taskexit(cg);
5031} 4927}
5032 4928
5033/**
5034 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
5035 * @cgrp: the cgroup in question
5036 * @task: the task in question
5037 *
5038 * See if @cgrp is a descendant of @task's cgroup in the appropriate
5039 * hierarchy.
5040 *
5041 * If we are sending in dummytop, then presumably we are creating
5042 * the top cgroup in the subsystem.
5043 *
5044 * Called only by the ns (nsproxy) cgroup.
5045 */
5046int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
5047{
5048 int ret;
5049 struct cgroup *target;
5050
5051 if (cgrp == dummytop)
5052 return 1;
5053
5054 target = task_cgroup_from_root(task, cgrp->root);
5055 while (cgrp != target && cgrp!= cgrp->top_cgroup)
5056 cgrp = cgrp->parent;
5057 ret = (cgrp == target);
5058 return ret;
5059}
5060
5061static void check_for_release(struct cgroup *cgrp) 4929static void check_for_release(struct cgroup *cgrp)
5062{ 4930{
5063 /* All of these checks rely on RCU to keep the cgroup 4931 /* All of these checks rely on RCU to keep the cgroup
5064 * structure alive */ 4932 * structure alive */
5065 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) 4933 if (cgroup_is_releasable(cgrp) &&
5066 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { 4934 !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
5067 /* Control Group is currently removeable. If it's not 4935 /*
4936 * Control Group is currently removeable. If it's not
5068 * already queued for a userspace notification, queue 4937 * already queued for a userspace notification, queue
5069 * it now */ 4938 * it now
4939 */
5070 int need_schedule_work = 0; 4940 int need_schedule_work = 0;
4941
5071 raw_spin_lock(&release_list_lock); 4942 raw_spin_lock(&release_list_lock);
5072 if (!cgroup_is_removed(cgrp) && 4943 if (!cgroup_is_removed(cgrp) &&
5073 list_empty(&cgrp->release_list)) { 4944 list_empty(&cgrp->release_list)) {
@@ -5100,24 +4971,11 @@ EXPORT_SYMBOL_GPL(__css_tryget);
5100/* Caller must verify that the css is not for root cgroup */ 4971/* Caller must verify that the css is not for root cgroup */
5101void __css_put(struct cgroup_subsys_state *css) 4972void __css_put(struct cgroup_subsys_state *css)
5102{ 4973{
5103 struct cgroup *cgrp = css->cgroup;
5104 int v; 4974 int v;
5105 4975
5106 rcu_read_lock();
5107 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); 4976 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
5108 4977 if (v == 0)
5109 switch (v) {
5110 case 1:
5111 if (notify_on_release(cgrp)) {
5112 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5113 check_for_release(cgrp);
5114 }
5115 break;
5116 case 0:
5117 schedule_work(&css->dput_work); 4978 schedule_work(&css->dput_work);
5118 break;
5119 }
5120 rcu_read_unlock();
5121} 4979}
5122EXPORT_SYMBOL_GPL(__css_put); 4980EXPORT_SYMBOL_GPL(__css_put);
5123 4981
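After this change __css_put() acts only when the unbiased count reaches zero, deferring the actual teardown to a work item. A user-space sketch of that drop-to-zero pattern with C11 atomics (obj, obj_put and release are illustrative names; the kernel operates on its biased css refcount and defers via schedule_work()):

#include <stdatomic.h>
#include <stdio.h>

struct obj {
	atomic_int refcnt;
};

static void release(struct obj *o)
{
	/* in the kernel this step is pushed off to a workqueue */
	printf("releasing object\n");
}

static void obj_put(struct obj *o)
{
	/* only the caller that drops the last reference performs the release */
	if (atomic_fetch_sub(&o->refcnt, 1) == 1)
		release(o);
}

int main(void)
{
	struct obj o = { .refcnt = 2 };

	obj_put(&o);	/* 2 -> 1: nothing to do       */
	obj_put(&o);	/* 1 -> 0: release() runs once */
	return 0;
}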
@@ -5416,55 +5274,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5416} 5274}
5417EXPORT_SYMBOL_GPL(css_lookup); 5275EXPORT_SYMBOL_GPL(css_lookup);
5418 5276
5419/**
5420 * css_get_next - lookup next cgroup under specified hierarchy.
5421 * @ss: pointer to subsystem
5422 * @id: current position of iteration.
5423 * @root: pointer to css. search tree under this.
5424 * @foundid: position of found object.
5425 *
5426 * Search next css under the specified hierarchy of rootid. Calling under
5427 * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
5428 */
5429struct cgroup_subsys_state *
5430css_get_next(struct cgroup_subsys *ss, int id,
5431 struct cgroup_subsys_state *root, int *foundid)
5432{
5433 struct cgroup_subsys_state *ret = NULL;
5434 struct css_id *tmp;
5435 int tmpid;
5436 int rootid = css_id(root);
5437 int depth = css_depth(root);
5438
5439 if (!rootid)
5440 return NULL;
5441
5442 BUG_ON(!ss->use_id);
5443 WARN_ON_ONCE(!rcu_read_lock_held());
5444
5445 /* fill start point for scan */
5446 tmpid = id;
5447 while (1) {
5448 /*
5449 * scan next entry from bitmap(tree), tmpid is updated after
5450 * idr_get_next().
5451 */
5452 tmp = idr_get_next(&ss->idr, &tmpid);
5453 if (!tmp)
5454 break;
5455 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
5456 ret = rcu_dereference(tmp->css);
5457 if (ret) {
5458 *foundid = tmpid;
5459 break;
5460 }
5461 }
5462 /* continue to scan from next id */
5463 tmpid = tmpid + 1;
5464 }
5465 return ret;
5466}
5467
5468/* 5277/*
5469 * get corresponding css from file open on cgroupfs directory 5278 * get corresponding css from file open on cgroupfs directory
5470 */ 5279 */
diff --git a/kernel/compat.c b/kernel/compat.c
index 19971d8c7299..0a09e481b70b 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -516,25 +516,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
516 return 0; 516 return 0;
517} 517}
518 518
519asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
520{
521 struct rusage r;
522 int ret;
523 mm_segment_t old_fs = get_fs();
524
525 set_fs(KERNEL_DS);
526 ret = sys_getrusage(who, (struct rusage __user *) &r);
527 set_fs(old_fs);
528
529 if (ret)
530 return ret;
531
532 if (put_compat_rusage(&r, ru))
533 return -EFAULT;
534
535 return 0;
536}
537
538COMPAT_SYSCALL_DEFINE4(wait4, 519COMPAT_SYSCALL_DEFINE4(wait4,
539 compat_pid_t, pid, 520 compat_pid_t, pid,
540 compat_uint_t __user *, stat_addr, 521 compat_uint_t __user *, stat_addr,
@@ -1138,71 +1119,6 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
1138} 1119}
1139#endif 1120#endif
1140 1121
1141struct compat_sysinfo {
1142 s32 uptime;
1143 u32 loads[3];
1144 u32 totalram;
1145 u32 freeram;
1146 u32 sharedram;
1147 u32 bufferram;
1148 u32 totalswap;
1149 u32 freeswap;
1150 u16 procs;
1151 u16 pad;
1152 u32 totalhigh;
1153 u32 freehigh;
1154 u32 mem_unit;
1155 char _f[20-2*sizeof(u32)-sizeof(int)];
1156};
1157
1158asmlinkage long
1159compat_sys_sysinfo(struct compat_sysinfo __user *info)
1160{
1161 struct sysinfo s;
1162
1163 do_sysinfo(&s);
1164
1165 /* Check to see if any memory value is too large for 32-bit and scale
1166 * down if needed
1167 */
1168 if ((s.totalram >> 32) || (s.totalswap >> 32)) {
1169 int bitcount = 0;
1170
1171 while (s.mem_unit < PAGE_SIZE) {
1172 s.mem_unit <<= 1;
1173 bitcount++;
1174 }
1175
1176 s.totalram >>= bitcount;
1177 s.freeram >>= bitcount;
1178 s.sharedram >>= bitcount;
1179 s.bufferram >>= bitcount;
1180 s.totalswap >>= bitcount;
1181 s.freeswap >>= bitcount;
1182 s.totalhigh >>= bitcount;
1183 s.freehigh >>= bitcount;
1184 }
1185
1186 if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
1187 __put_user (s.uptime, &info->uptime) ||
1188 __put_user (s.loads[0], &info->loads[0]) ||
1189 __put_user (s.loads[1], &info->loads[1]) ||
1190 __put_user (s.loads[2], &info->loads[2]) ||
1191 __put_user (s.totalram, &info->totalram) ||
1192 __put_user (s.freeram, &info->freeram) ||
1193 __put_user (s.sharedram, &info->sharedram) ||
1194 __put_user (s.bufferram, &info->bufferram) ||
1195 __put_user (s.totalswap, &info->totalswap) ||
1196 __put_user (s.freeswap, &info->freeswap) ||
1197 __put_user (s.procs, &info->procs) ||
1198 __put_user (s.totalhigh, &info->totalhigh) ||
1199 __put_user (s.freehigh, &info->freehigh) ||
1200 __put_user (s.mem_unit, &info->mem_unit))
1201 return -EFAULT;
1202
1203 return 0;
1204}
1205
1206COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, 1122COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
1207 compat_pid_t, pid, 1123 compat_pid_t, pid,
1208 struct compat_timespec __user *, interval) 1124 struct compat_timespec __user *, interval)
diff --git a/kernel/configs.c b/kernel/configs.c
index 42e8fa075eed..c18b1f1ae515 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,7 +79,7 @@ static int __init ikconfig_init(void)
79 if (!entry) 79 if (!entry)
80 return -ENOMEM; 80 return -ENOMEM;
81 81
82 entry->size = kernel_config_data_size; 82 proc_set_size(entry, kernel_config_data_size);
83 83
84 return 0; 84 return 0;
85} 85}
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
new file mode 100644
index 000000000000..59ab052ef7a0
--- /dev/null
+++ b/kernel/cpu/Makefile
@@ -0,0 +1 @@
obj-y = idle.o
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
new file mode 100644
index 000000000000..8b86c0c68edf
--- /dev/null
+++ b/kernel/cpu/idle.c
@@ -0,0 +1,116 @@
1/*
2 * Generic entry point for the idle threads
3 */
4#include <linux/sched.h>
5#include <linux/cpu.h>
6#include <linux/tick.h>
7#include <linux/mm.h>
8
9#include <asm/tlb.h>
10
11#include <trace/events/power.h>
12
13static int __read_mostly cpu_idle_force_poll;
14
15void cpu_idle_poll_ctrl(bool enable)
16{
17 if (enable) {
18 cpu_idle_force_poll++;
19 } else {
20 cpu_idle_force_poll--;
21 WARN_ON_ONCE(cpu_idle_force_poll < 0);
22 }
23}
24
25#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
26static int __init cpu_idle_poll_setup(char *__unused)
27{
28 cpu_idle_force_poll = 1;
29 return 1;
30}
31__setup("nohlt", cpu_idle_poll_setup);
32
33static int __init cpu_idle_nopoll_setup(char *__unused)
34{
35 cpu_idle_force_poll = 0;
36 return 1;
37}
38__setup("hlt", cpu_idle_nopoll_setup);
39#endif
40
41static inline int cpu_idle_poll(void)
42{
43 trace_cpu_idle_rcuidle(0, smp_processor_id());
44 local_irq_enable();
45 while (!need_resched())
46 cpu_relax();
47 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
48 return 1;
49}
50
51/* Weak implementations for optional arch specific functions */
52void __weak arch_cpu_idle_prepare(void) { }
53void __weak arch_cpu_idle_enter(void) { }
54void __weak arch_cpu_idle_exit(void) { }
55void __weak arch_cpu_idle_dead(void) { }
56void __weak arch_cpu_idle(void)
57{
58 cpu_idle_force_poll = 1;
59}
60
61/*
62 * Generic idle loop implementation
63 */
64static void cpu_idle_loop(void)
65{
66 while (1) {
67 tick_nohz_idle_enter();
68
69 while (!need_resched()) {
70 check_pgt_cache();
71 rmb();
72
73 if (cpu_is_offline(smp_processor_id()))
74 arch_cpu_idle_dead();
75
76 local_irq_disable();
77 arch_cpu_idle_enter();
78
79 /*
80 * In poll mode we reenable interrupts and spin.
81 *
82 * Also if we detected in the wakeup from idle
83 * path that the tick broadcast device expired
84 * for us, we don't want to go deep idle as we
85 * know that the IPI is going to arrive right
86 * away
87 */
88 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
89 cpu_idle_poll();
90 } else {
91 current_clr_polling();
92 if (!need_resched()) {
93 stop_critical_timings();
94 rcu_idle_enter();
95 arch_cpu_idle();
96 WARN_ON_ONCE(irqs_disabled());
97 rcu_idle_exit();
98 start_critical_timings();
99 } else {
100 local_irq_enable();
101 }
102 current_set_polling();
103 }
104 arch_cpu_idle_exit();
105 }
106 tick_nohz_idle_exit();
107 schedule_preempt_disabled();
108 }
109}
110
111void cpu_startup_entry(enum cpuhp_state state)
112{
113 current_set_polling();
114 arch_cpu_idle_prepare();
115 cpu_idle_loop();
116}
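The arch_cpu_idle*() hooks above rely on weak symbols: the generic file supplies a default body, and any architecture overrides it simply by defining a strong symbol of the same name. A minimal sketch of that linkage trick with the GCC/Clang attribute (arch_hook() is a made-up name, not a kernel interface):

#include <stdio.h>

/*
 * Default (weak) hook: used unless another object file linked into the
 * program provides a strong definition with the same name.
 */
void __attribute__((weak)) arch_hook(void)
{
	puts("default hook");
}

int main(void)
{
	arch_hook();	/* prints "default hook" unless overridden at link time */
	return 0;
}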
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4f9dfe43ecbd..64b3f791bbe5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -265,17 +265,6 @@ static DEFINE_MUTEX(cpuset_mutex);
265static DEFINE_MUTEX(callback_mutex); 265static DEFINE_MUTEX(callback_mutex);
266 266
267/* 267/*
268 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
269 * buffers. They are statically allocated to prevent using excess stack
270 * when calling cpuset_print_task_mems_allowed().
271 */
272#define CPUSET_NAME_LEN (128)
273#define CPUSET_NODELIST_LEN (256)
274static char cpuset_name[CPUSET_NAME_LEN];
275static char cpuset_nodelist[CPUSET_NODELIST_LEN];
276static DEFINE_SPINLOCK(cpuset_buffer_lock);
277
278/*
279 * CPU / memory hotplug is handled asynchronously. 268 * CPU / memory hotplug is handled asynchronously.
280 */ 269 */
281static struct workqueue_struct *cpuset_propagate_hotplug_wq; 270static struct workqueue_struct *cpuset_propagate_hotplug_wq;
@@ -780,25 +769,26 @@ static void rebuild_sched_domains_locked(void)
780 lockdep_assert_held(&cpuset_mutex); 769 lockdep_assert_held(&cpuset_mutex);
781 get_online_cpus(); 770 get_online_cpus();
782 771
772 /*
773 * We have raced with CPU hotplug. Don't do anything to avoid
774 * passing doms with offlined cpu to partition_sched_domains().
775 * Anyways, hotplug work item will rebuild sched domains.
776 */
777 if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
778 goto out;
779
783 /* Generate domain masks and attrs */ 780 /* Generate domain masks and attrs */
784 ndoms = generate_sched_domains(&doms, &attr); 781 ndoms = generate_sched_domains(&doms, &attr);
785 782
786 /* Have scheduler rebuild the domains */ 783 /* Have scheduler rebuild the domains */
787 partition_sched_domains(ndoms, doms, attr); 784 partition_sched_domains(ndoms, doms, attr);
788 785out:
789 put_online_cpus(); 786 put_online_cpus();
790} 787}
791#else /* !CONFIG_SMP */ 788#else /* !CONFIG_SMP */
792static void rebuild_sched_domains_locked(void) 789static void rebuild_sched_domains_locked(void)
793{ 790{
794} 791}
795
796static int generate_sched_domains(cpumask_var_t **domains,
797 struct sched_domain_attr **attributes)
798{
799 *domains = NULL;
800 return 1;
801}
802#endif /* CONFIG_SMP */ 792#endif /* CONFIG_SMP */
803 793
804void rebuild_sched_domains(void) 794void rebuild_sched_domains(void)
@@ -1388,16 +1378,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1388 1378
1389 cgroup_taskset_for_each(task, cgrp, tset) { 1379 cgroup_taskset_for_each(task, cgrp, tset) {
1390 /* 1380 /*
1391 * Kthreads bound to specific cpus cannot be moved to a new 1381 * Kthreads which disallow setaffinity shouldn't be moved
1392 * cpuset; we cannot change their cpu affinity and 1382 * to a new cpuset; we don't want to change their cpu
1393 * isolating such threads by their set of allowed nodes is 1383 * affinity and isolating such threads by their set of
1394 * unnecessary. Thus, cpusets are not applicable for such 1384 * allowed nodes is unnecessary. Thus, cpusets are not
1395 * threads. This prevents checking for success of 1385 * applicable for such threads. This prevents checking for
1396 * set_cpus_allowed_ptr() on all attached tasks before 1386 * success of set_cpus_allowed_ptr() on all attached tasks
1397 * cpus_allowed may be changed. 1387 * before cpus_allowed may be changed.
1398 */ 1388 */
1399 ret = -EINVAL; 1389 ret = -EINVAL;
1400 if (task->flags & PF_THREAD_BOUND) 1390 if (task->flags & PF_NO_SETAFFINITY)
1401 goto out_unlock; 1391 goto out_unlock;
1402 ret = security_task_setscheduler(task); 1392 ret = security_task_setscheduler(task);
1403 if (ret) 1393 if (ret)
@@ -2005,50 +1995,6 @@ int __init cpuset_init(void)
2005 return 0; 1995 return 0;
2006} 1996}
2007 1997
2008/**
2009 * cpuset_do_move_task - move a given task to another cpuset
2010 * @tsk: pointer to task_struct the task to move
2011 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
2012 *
2013 * Called by cgroup_scan_tasks() for each task in a cgroup.
2014 * Return nonzero to stop the walk through the tasks.
2015 */
2016static void cpuset_do_move_task(struct task_struct *tsk,
2017 struct cgroup_scanner *scan)
2018{
2019 struct cgroup *new_cgroup = scan->data;
2020
2021 cgroup_lock();
2022 cgroup_attach_task(new_cgroup, tsk);
2023 cgroup_unlock();
2024}
2025
2026/**
2027 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
2028 * @from: cpuset in which the tasks currently reside
2029 * @to: cpuset to which the tasks will be moved
2030 *
2031 * Called with cpuset_mutex held
2032 * callback_mutex must not be held, as cpuset_attach() will take it.
2033 *
2034 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
2035 * calling callback functions for each.
2036 */
2037static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
2038{
2039 struct cgroup_scanner scan;
2040
2041 scan.cg = from->css.cgroup;
2042 scan.test_task = NULL; /* select all tasks in cgroup */
2043 scan.process_task = cpuset_do_move_task;
2044 scan.heap = NULL;
2045 scan.data = to->css.cgroup;
2046
2047 if (cgroup_scan_tasks(&scan))
2048 printk(KERN_ERR "move_member_tasks_to_cpuset: "
2049 "cgroup_scan_tasks failed\n");
2050}
2051
2052/* 1998/*
2053 * If CPU and/or memory hotplug handlers, below, unplug any CPUs 1999 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
2054 * or memory nodes, we need to walk over the cpuset hierarchy, 2000 * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2069,7 +2015,12 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2069 nodes_empty(parent->mems_allowed)) 2015 nodes_empty(parent->mems_allowed))
2070 parent = parent_cs(parent); 2016 parent = parent_cs(parent);
2071 2017
2072 move_member_tasks_to_cpuset(cs, parent); 2018 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2019 rcu_read_lock();
2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n",
2021 cgroup_name(cs->css.cgroup));
2022 rcu_read_unlock();
2023 }
2073} 2024}
2074 2025
2075/** 2026/**
@@ -2222,17 +2173,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2222 flush_workqueue(cpuset_propagate_hotplug_wq); 2173 flush_workqueue(cpuset_propagate_hotplug_wq);
2223 2174
2224 /* rebuild sched domains if cpus_allowed has changed */ 2175 /* rebuild sched domains if cpus_allowed has changed */
2225 if (cpus_updated) { 2176 if (cpus_updated)
2226 struct sched_domain_attr *attr; 2177 rebuild_sched_domains();
2227 cpumask_var_t *doms;
2228 int ndoms;
2229
2230 mutex_lock(&cpuset_mutex);
2231 ndoms = generate_sched_domains(&doms, &attr);
2232 mutex_unlock(&cpuset_mutex);
2233
2234 partition_sched_domains(ndoms, doms, attr);
2235 }
2236} 2178}
2237 2179
2238void cpuset_update_active_cpus(bool cpu_online) 2180void cpuset_update_active_cpus(bool cpu_online)
@@ -2251,7 +2193,6 @@ void cpuset_update_active_cpus(bool cpu_online)
2251 schedule_work(&cpuset_hotplug_work); 2193 schedule_work(&cpuset_hotplug_work);
2252} 2194}
2253 2195
2254#ifdef CONFIG_MEMORY_HOTPLUG
2255/* 2196/*
2256 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. 2197 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
2257 * Call this routine anytime after node_states[N_MEMORY] changes. 2198 * Call this routine anytime after node_states[N_MEMORY] changes.
@@ -2263,20 +2204,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2263 schedule_work(&cpuset_hotplug_work); 2204 schedule_work(&cpuset_hotplug_work);
2264 return NOTIFY_OK; 2205 return NOTIFY_OK;
2265} 2206}
2266#endif 2207
2208static struct notifier_block cpuset_track_online_nodes_nb = {
2209 .notifier_call = cpuset_track_online_nodes,
2210 .priority = 10, /* ??! */
2211};
2267 2212
2268/** 2213/**
2269 * cpuset_init_smp - initialize cpus_allowed 2214 * cpuset_init_smp - initialize cpus_allowed
2270 * 2215 *
2271 * Description: Finish top cpuset after cpu, node maps are initialized 2216 * Description: Finish top cpuset after cpu, node maps are initialized
2272 **/ 2217 */
2273
2274void __init cpuset_init_smp(void) 2218void __init cpuset_init_smp(void)
2275{ 2219{
2276 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2220 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2277 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2221 top_cpuset.mems_allowed = node_states[N_MEMORY];
2278 2222
2279 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2223 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2280 2224
2281 cpuset_propagate_hotplug_wq = 2225 cpuset_propagate_hotplug_wq =
2282 alloc_ordered_workqueue("cpuset_hotplug", 0); 2226 alloc_ordered_workqueue("cpuset_hotplug", 0);
@@ -2592,6 +2536,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2592 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 2536 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2593} 2537}
2594 2538
2539#define CPUSET_NODELIST_LEN (256)
2540
2595/** 2541/**
2596 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2542 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2597 * @task: pointer to task_struct of some task. 2543 * @task: pointer to task_struct of some task.
@@ -2602,25 +2548,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2602 */ 2548 */
2603void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2549void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2604{ 2550{
2605 struct dentry *dentry; 2551 /* Statically allocated to prevent using excess stack. */
2552 static char cpuset_nodelist[CPUSET_NODELIST_LEN];
2553 static DEFINE_SPINLOCK(cpuset_buffer_lock);
2606 2554
2607 dentry = task_cs(tsk)->css.cgroup->dentry; 2555 struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
2608 spin_lock(&cpuset_buffer_lock);
2609 2556
2610 if (!dentry) { 2557 rcu_read_lock();
2611 strcpy(cpuset_name, "/"); 2558 spin_lock(&cpuset_buffer_lock);
2612 } else {
2613 spin_lock(&dentry->d_lock);
2614 strlcpy(cpuset_name, (const char *)dentry->d_name.name,
2615 CPUSET_NAME_LEN);
2616 spin_unlock(&dentry->d_lock);
2617 }
2618 2559
2619 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2560 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2620 tsk->mems_allowed); 2561 tsk->mems_allowed);
2621 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2562 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2622 tsk->comm, cpuset_name, cpuset_nodelist); 2563 tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
2564
2623 spin_unlock(&cpuset_buffer_lock); 2565 spin_unlock(&cpuset_buffer_lock);
2566 rcu_read_unlock();
2624} 2567}
2625 2568
2626/* 2569/*
@@ -2666,7 +2609,7 @@ void __cpuset_memory_pressure_bump(void)
2666 * and we take cpuset_mutex, keeping cpuset_attach() from changing it 2609 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
2667 * anyway. 2610 * anyway.
2668 */ 2611 */
2669static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2612int proc_cpuset_show(struct seq_file *m, void *unused_v)
2670{ 2613{
2671 struct pid *pid; 2614 struct pid *pid;
2672 struct task_struct *tsk; 2615 struct task_struct *tsk;
@@ -2700,19 +2643,6 @@ out_free:
2700out: 2643out:
2701 return retval; 2644 return retval;
2702} 2645}
2703
2704static int cpuset_open(struct inode *inode, struct file *file)
2705{
2706 struct pid *pid = PROC_I(inode)->pid;
2707 return single_open(file, proc_cpuset_show, pid);
2708}
2709
2710const struct file_operations proc_cpuset_operations = {
2711 .open = cpuset_open,
2712 .read = seq_read,
2713 .llseek = seq_lseek,
2714 .release = single_release,
2715};
2716#endif /* CONFIG_PROC_PID_CPUSET */ 2646#endif /* CONFIG_PROC_PID_CPUSET */
2717 2647
2718/* Display task mems_allowed in /proc/<pid>/status file. */ 2648/* Display task mems_allowed in /proc/<pid>/status file. */
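cpuset_print_task_mems_allowed() now keeps its scratch buffer static inside the function and serializes callers with a local lock instead of exporting file-scope buffers. A user-space analog of that pattern, with a pthread mutex standing in for the spinlock and illustrative names:

#include <pthread.h>
#include <stdio.h>

#define NODELIST_LEN 256

static void print_nodelist(const char *comm, const char *nodelist_src)
{
	/* static to avoid a large on-stack buffer; guarded for concurrent callers */
	static char buf[NODELIST_LEN];
	static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;

	pthread_mutex_lock(&buf_lock);
	snprintf(buf, sizeof(buf), "%s", nodelist_src);
	printf("%s mems_allowed=%s\n", comm, buf);
	pthread_mutex_unlock(&buf_lock);
}

int main(void)
{
	print_nodelist("bash", "0-3");
	return 0;
}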
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index c26278fd4851..0506d447aed2 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -775,7 +775,7 @@ static void sysrq_handle_dbg(int key)
775 775
776static struct sysrq_key_op sysrq_dbg_op = { 776static struct sysrq_key_op sysrq_dbg_op = {
777 .handler = sysrq_handle_dbg, 777 .handler = sysrq_handle_dbg,
778 .help_msg = "debug(G)", 778 .help_msg = "debug(g)",
779 .action_msg = "DEBUG", 779 .action_msg = "DEBUG",
780}; 780};
781#endif 781#endif
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b0cd86501c30..6b41c1899a8b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -18,6 +18,7 @@
18#include <linux/poll.h> 18#include <linux/poll.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/hash.h> 20#include <linux/hash.h>
21#include <linux/tick.h>
21#include <linux/sysfs.h> 22#include <linux/sysfs.h>
22#include <linux/dcache.h> 23#include <linux/dcache.h>
23#include <linux/percpu.h> 24#include <linux/percpu.h>
@@ -37,6 +38,7 @@
37#include <linux/ftrace_event.h> 38#include <linux/ftrace_event.h>
38#include <linux/hw_breakpoint.h> 39#include <linux/hw_breakpoint.h>
39#include <linux/mm_types.h> 40#include <linux/mm_types.h>
41#include <linux/cgroup.h>
40 42
41#include "internal.h" 43#include "internal.h"
42 44
@@ -234,6 +236,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
234#ifdef CONFIG_CGROUP_PERF 236#ifdef CONFIG_CGROUP_PERF
235 237
236/* 238/*
239 * perf_cgroup_info keeps track of time_enabled for a cgroup.
240 * This is a per-cpu dynamically allocated data structure.
241 */
242struct perf_cgroup_info {
243 u64 time;
244 u64 timestamp;
245};
246
247struct perf_cgroup {
248 struct cgroup_subsys_state css;
249 struct perf_cgroup_info __percpu *info;
250};
251
252/*
237 * Must ensure cgroup is pinned (css_get) before calling 253 * Must ensure cgroup is pinned (css_get) before calling
238 * this function. In other words, we cannot call this function 254 * this function. In other words, we cannot call this function
239 * if there is no cgroup event for the current CPU context. 255 * if there is no cgroup event for the current CPU context.
@@ -251,7 +267,22 @@ perf_cgroup_match(struct perf_event *event)
251 struct perf_event_context *ctx = event->ctx; 267 struct perf_event_context *ctx = event->ctx;
252 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 268 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
253 269
254 return !event->cgrp || event->cgrp == cpuctx->cgrp; 270 /* @event doesn't care about cgroup */
271 if (!event->cgrp)
272 return true;
273
274 /* wants specific cgroup scope but @cpuctx isn't associated with any */
275 if (!cpuctx->cgrp)
276 return false;
277
278 /*
279 * Cgroup scoping is recursive. An event enabled for a cgroup is
280 * also enabled for all its descendant cgroups. If @cpuctx's
281 * cgroup is a descendant of @event's (the test covers identity
282 * case), it's a match.
283 */
284 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
285 event->cgrp->css.cgroup);
255} 286}
256 287
257static inline bool perf_tryget_cgroup(struct perf_event *event) 288static inline bool perf_tryget_cgroup(struct perf_event *event)
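The rewritten perf_cgroup_match() makes cgroup scoping recursive by accepting any cpuctx cgroup that sits at or below the event's cgroup. A self-contained sketch of that ancestor walk on a hypothetical tree node with a parent pointer (not the kernel's cgroup type):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *parent;
};

/* Returns true if @n is @ancestor itself or lies below it in the tree. */
static bool is_descendant(const struct node *n, const struct node *ancestor)
{
	while (n) {
		if (n == ancestor)
			return true;
		n = n->parent;
	}
	return false;
}

int main(void)
{
	struct node root = { .parent = NULL };
	struct node child = { .parent = &root };
	struct node grandchild = { .parent = &child };

	printf("%d\n", is_descendant(&grandchild, &root));	/* 1 */
	printf("%d\n", is_descendant(&root, &grandchild));	/* 0 */
	return 0;
}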
@@ -655,8 +686,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
655 686
656 WARN_ON(!irqs_disabled()); 687 WARN_ON(!irqs_disabled());
657 688
658 if (list_empty(&cpuctx->rotation_list)) 689 if (list_empty(&cpuctx->rotation_list)) {
690 int was_empty = list_empty(head);
659 list_add(&cpuctx->rotation_list, head); 691 list_add(&cpuctx->rotation_list, head);
692 if (was_empty)
693 tick_nohz_full_kick();
694 }
660} 695}
661 696
662static void get_ctx(struct perf_event_context *ctx) 697static void get_ctx(struct perf_event_context *ctx)
@@ -961,9 +996,15 @@ static void perf_event__header_size(struct perf_event *event)
961 if (sample_type & PERF_SAMPLE_PERIOD) 996 if (sample_type & PERF_SAMPLE_PERIOD)
962 size += sizeof(data->period); 997 size += sizeof(data->period);
963 998
999 if (sample_type & PERF_SAMPLE_WEIGHT)
1000 size += sizeof(data->weight);
1001
964 if (sample_type & PERF_SAMPLE_READ) 1002 if (sample_type & PERF_SAMPLE_READ)
965 size += event->read_size; 1003 size += event->read_size;
966 1004
1005 if (sample_type & PERF_SAMPLE_DATA_SRC)
1006 size += sizeof(data->data_src.val);
1007
967 event->header_size = size; 1008 event->header_size = size;
968} 1009}
969 1010
@@ -2555,6 +2596,16 @@ done:
2555 list_del_init(&cpuctx->rotation_list); 2596 list_del_init(&cpuctx->rotation_list);
2556} 2597}
2557 2598
2599#ifdef CONFIG_NO_HZ_FULL
2600bool perf_event_can_stop_tick(void)
2601{
2602 if (list_empty(&__get_cpu_var(rotation_list)))
2603 return true;
2604 else
2605 return false;
2606}
2607#endif
2608
2558void perf_event_task_tick(void) 2609void perf_event_task_tick(void)
2559{ 2610{
2560 struct list_head *head = &__get_cpu_var(rotation_list); 2611 struct list_head *head = &__get_cpu_var(rotation_list);
@@ -4178,6 +4229,12 @@ void perf_output_sample(struct perf_output_handle *handle,
4178 perf_output_sample_ustack(handle, 4229 perf_output_sample_ustack(handle,
4179 data->stack_user_size, 4230 data->stack_user_size,
4180 data->regs_user.regs); 4231 data->regs_user.regs);
4232
4233 if (sample_type & PERF_SAMPLE_WEIGHT)
4234 perf_output_put(handle, data->weight);
4235
4236 if (sample_type & PERF_SAMPLE_DATA_SRC)
4237 perf_output_put(handle, data->data_src.val);
4181} 4238}
4182 4239
4183void perf_prepare_sample(struct perf_event_header *header, 4240void perf_prepare_sample(struct perf_event_header *header,
@@ -4434,12 +4491,15 @@ static void perf_event_task_event(struct perf_task_event *task_event)
4434 if (ctxn < 0) 4491 if (ctxn < 0)
4435 goto next; 4492 goto next;
4436 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 4493 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4494 if (ctx)
4495 perf_event_task_ctx(ctx, task_event);
4437 } 4496 }
4438 if (ctx)
4439 perf_event_task_ctx(ctx, task_event);
4440next: 4497next:
4441 put_cpu_ptr(pmu->pmu_cpu_context); 4498 put_cpu_ptr(pmu->pmu_cpu_context);
4442 } 4499 }
4500 if (task_event->task_ctx)
4501 perf_event_task_ctx(task_event->task_ctx, task_event);
4502
4443 rcu_read_unlock(); 4503 rcu_read_unlock();
4444} 4504}
4445 4505
@@ -4593,6 +4653,7 @@ void perf_event_comm(struct task_struct *task)
4593 struct perf_event_context *ctx; 4653 struct perf_event_context *ctx;
4594 int ctxn; 4654 int ctxn;
4595 4655
4656 rcu_read_lock();
4596 for_each_task_context_nr(ctxn) { 4657 for_each_task_context_nr(ctxn) {
4597 ctx = task->perf_event_ctxp[ctxn]; 4658 ctx = task->perf_event_ctxp[ctxn];
4598 if (!ctx) 4659 if (!ctx)
@@ -4600,6 +4661,7 @@ void perf_event_comm(struct task_struct *task)
4600 4661
4601 perf_event_enable_on_exec(ctx); 4662 perf_event_enable_on_exec(ctx);
4602 } 4663 }
4664 rcu_read_unlock();
4603 4665
4604 if (!atomic_read(&nr_comm_events)) 4666 if (!atomic_read(&nr_comm_events))
4605 return; 4667 return;
@@ -4734,7 +4796,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4734 } else { 4796 } else {
4735 if (arch_vma_name(mmap_event->vma)) { 4797 if (arch_vma_name(mmap_event->vma)) {
4736 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 4798 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4737 sizeof(tmp)); 4799 sizeof(tmp) - 1);
4800 tmp[sizeof(tmp) - 1] = '\0';
4738 goto got_name; 4801 goto got_name;
4739 } 4802 }
4740 4803
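The sizeof(tmp) - 1 change above addresses the classic strncpy() pitfall: when the source is at least as long as the destination, no terminating NUL is written. A small stand-alone illustration of the truncate-and-terminate idiom the fix adopts:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char tmp[8];
	const char *name = "a_rather_long_vma_name";

	/* strncpy() does not NUL-terminate when the source fills the buffer */
	strncpy(tmp, name, sizeof(tmp) - 1);
	tmp[sizeof(tmp) - 1] = '\0';	/* explicit termination, as in the fix */

	printf("%s\n", tmp);		/* prints "a_rathe" */
	return 0;
}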
@@ -4761,6 +4824,9 @@ got_name:
4761 mmap_event->file_name = name; 4824 mmap_event->file_name = name;
4762 mmap_event->file_size = size; 4825 mmap_event->file_size = size;
4763 4826
4827 if (!(vma->vm_flags & VM_EXEC))
4828 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
4829
4764 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4830 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4765 4831
4766 rcu_read_lock(); 4832 rcu_read_lock();
@@ -5327,7 +5393,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
5327 5393
5328static int perf_swevent_init(struct perf_event *event) 5394static int perf_swevent_init(struct perf_event *event)
5329{ 5395{
5330 int event_id = event->attr.config; 5396 u64 event_id = event->attr.config;
5331 5397
5332 if (event->attr.type != PERF_TYPE_SOFTWARE) 5398 if (event->attr.type != PERF_TYPE_SOFTWARE)
5333 return -ENOENT; 5399 return -ENOENT;
@@ -5647,6 +5713,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
5647 event->attr.sample_period = NSEC_PER_SEC / freq; 5713 event->attr.sample_period = NSEC_PER_SEC / freq;
5648 hwc->sample_period = event->attr.sample_period; 5714 hwc->sample_period = event->attr.sample_period;
5649 local64_set(&hwc->period_left, hwc->sample_period); 5715 local64_set(&hwc->period_left, hwc->sample_period);
5716 hwc->last_period = hwc->sample_period;
5650 event->attr.freq = 0; 5717 event->attr.freq = 0;
5651 } 5718 }
5652} 5719}
@@ -5982,6 +6049,7 @@ skip_type:
5982 if (pmu->pmu_cpu_context) 6049 if (pmu->pmu_cpu_context)
5983 goto got_cpu_context; 6050 goto got_cpu_context;
5984 6051
6052 ret = -ENOMEM;
5985 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 6053 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5986 if (!pmu->pmu_cpu_context) 6054 if (!pmu->pmu_cpu_context)
5987 goto free_dev; 6055 goto free_dev;
@@ -7509,12 +7577,5 @@ struct cgroup_subsys perf_subsys = {
7509 .css_free = perf_cgroup_css_free, 7577 .css_free = perf_cgroup_css_free,
7510 .exit = perf_cgroup_exit, 7578 .exit = perf_cgroup_exit,
7511 .attach = perf_cgroup_attach, 7579 .attach = perf_cgroup_attach,
7512
7513 /*
7514 * perf_event cgroup doesn't handle nesting correctly.
7515 * ctx->nr_cgroups adjustments should be propagated through the
7516 * cgroup hierarchy. Fix it and remove the following.
7517 */
7518 .broken_hierarchy = true,
7519}; 7580};
7520#endif /* CONFIG_CGROUP_PERF */ 7581#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c99a8b..eb675c4d59df 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -16,7 +16,7 @@ struct ring_buffer {
16 int page_order; /* allocation order */ 16 int page_order; /* allocation order */
17#endif 17#endif
18 int nr_pages; /* nr of data pages */ 18 int nr_pages; /* nr of data pages */
19 int writable; /* are we writable */ 19 int overwrite; /* can overwrite itself */
20 20
21 atomic_t poll; /* POLL_ for wakeups */ 21 atomic_t poll; /* POLL_ for wakeups */
22 22
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34ff3973..cd55144270b5 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -18,12 +18,24 @@
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, 18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head) 19 unsigned long offset, unsigned long head)
20{ 20{
21 unsigned long mask; 21 unsigned long sz = perf_data_size(rb);
22 unsigned long mask = sz - 1;
22 23
23 if (!rb->writable) 24 /*
25 * check if user-writable
26 * overwrite : over-write its own tail
27 * !overwrite: buffer possibly drops events.
28 */
29 if (rb->overwrite)
24 return true; 30 return true;
25 31
26 mask = perf_data_size(rb) - 1; 32 /*
33 * verify that payload is not bigger than buffer
34 * otherwise masking logic may fail to detect
35 * the "not enough space" condition
36 */
37 if ((head - offset) > sz)
38 return false;
27 39
28 offset = (offset - tail) & mask; 40 offset = (offset - tail) & mask;
29 head = (head - tail) & mask; 41 head = (head - tail) & mask;
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
212 rb->watermark = max_size / 2; 224 rb->watermark = max_size / 2;
213 225
214 if (flags & RING_BUFFER_WRITABLE) 226 if (flags & RING_BUFFER_WRITABLE)
215 rb->writable = 1; 227 rb->overwrite = 0;
228 else
229 rb->overwrite = 1;
216 230
217 atomic_set(&rb->refcount, 1); 231 atomic_set(&rb->refcount, 1);
218 232
@@ -312,11 +326,16 @@ void rb_free(struct ring_buffer *rb)
312} 326}
313 327
314#else 328#else
329static int data_page_nr(struct ring_buffer *rb)
330{
331 return rb->nr_pages << page_order(rb);
332}
315 333
316struct page * 334struct page *
317perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) 335perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
318{ 336{
319 if (pgoff > (1UL << page_order(rb))) 337 /* The '>' counts in the user page. */
338 if (pgoff > data_page_nr(rb))
320 return NULL; 339 return NULL;
321 340
322 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); 341 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
@@ -336,10 +355,11 @@ static void rb_free_work(struct work_struct *work)
336 int i, nr; 355 int i, nr;
337 356
338 rb = container_of(work, struct ring_buffer, work); 357 rb = container_of(work, struct ring_buffer, work);
339 nr = 1 << page_order(rb); 358 nr = data_page_nr(rb);
340 359
341 base = rb->user_page; 360 base = rb->user_page;
342 for (i = 0; i < nr + 1; i++) 361 /* The '<=' counts in the user page. */
362 for (i = 0; i <= nr; i++)
343 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 363 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
344 364
345 vfree(base); 365 vfree(base);
@@ -373,7 +393,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
373 rb->user_page = all_buf; 393 rb->user_page = all_buf;
374 rb->data_pages[0] = all_buf + PAGE_SIZE; 394 rb->data_pages[0] = all_buf + PAGE_SIZE;
375 rb->page_order = ilog2(nr_pages); 395 rb->page_order = ilog2(nr_pages);
376 rb->nr_pages = 1; 396 rb->nr_pages = !!nr_pages;
377 397
378 ring_buffer_init(rb, watermark, flags); 398 ring_buffer_init(rb, watermark, flags);
379 399
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a567c8c7ef31..f3569747d629 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -75,6 +75,15 @@ struct uprobe {
75 struct arch_uprobe arch; 75 struct arch_uprobe arch;
76}; 76};
77 77
78struct return_instance {
79 struct uprobe *uprobe;
80 unsigned long func;
81 unsigned long orig_ret_vaddr; /* original return address */
82 bool chained; /* true, if instance is nested */
83
84 struct return_instance *next; /* keep as stack */
85};
86
78/* 87/*
79 * valid_vma: Verify if the specified vma is an executable vma 88 * valid_vma: Verify if the specified vma is an executable vma
80 * Relax restrictions while unregistering: vm_flags might have 89 * Relax restrictions while unregistering: vm_flags might have
@@ -173,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
173 return *insn == UPROBE_SWBP_INSN; 182 return *insn == UPROBE_SWBP_INSN;
174} 183}
175 184
176static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) 185/**
186 * is_trap_insn - check if instruction is breakpoint instruction.
187 * @insn: instruction to be checked.
188 * Default implementation of is_trap_insn
189 * Returns true if @insn is a breakpoint instruction.
190 *
191 * This function is needed for the case where an architecture has multiple
192 * trap instructions (like powerpc).
193 */
194bool __weak is_trap_insn(uprobe_opcode_t *insn)
195{
196 return is_swbp_insn(insn);
197}
198
199static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
177{ 200{
178 void *kaddr = kmap_atomic(page); 201 void *kaddr = kmap_atomic(page);
179 memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); 202 memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
203 kunmap_atomic(kaddr);
204}
205
206static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
207{
208 void *kaddr = kmap_atomic(page);
209 memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
180 kunmap_atomic(kaddr); 210 kunmap_atomic(kaddr);
181} 211}
182 212
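The new copy_from_page()/copy_to_page() helpers reduce every kmap/memcpy pair to one call that masks the virtual address down to its offset within the page. A user-space sketch of that offset computation on a plain page-sized buffer (copy_to_buf() and the sample address are illustrative; kmap_atomic() has no equivalent outside the kernel):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Copy @len bytes into a page-sized buffer at @vaddr's offset inside its page. */
static void copy_to_buf(char *page_buf, unsigned long vaddr,
			const void *src, int len)
{
	memcpy(page_buf + (vaddr & ~PAGE_MASK), src, len);
}

int main(void)
{
	static char page_buf[PAGE_SIZE];
	unsigned long vaddr = 0x7f0000003010UL;		/* arbitrary address */

	copy_to_buf(page_buf, vaddr, "\xcc", 1);	/* e.g. one opcode byte */
	printf("offset in page: %lu\n", vaddr & ~PAGE_MASK);	/* 16 */
	return 0;
}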
@@ -185,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
185 uprobe_opcode_t old_opcode; 215 uprobe_opcode_t old_opcode;
186 bool is_swbp; 216 bool is_swbp;
187 217
188 copy_opcode(page, vaddr, &old_opcode); 218 /*
219 * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
220 * We do not check if it is any other 'trap variant' which could
221 * be conditional trap instruction such as the one powerpc supports.
222 *
223 * The logic is that we do not care if the underlying instruction
224 * is a trap variant; uprobes always wins over any other (gdb)
225 * breakpoint.
226 */
227 copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
189 is_swbp = is_swbp_insn(&old_opcode); 228 is_swbp = is_swbp_insn(&old_opcode);
190 229
191 if (is_swbp_insn(new_opcode)) { 230 if (is_swbp_insn(new_opcode)) {
@@ -204,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
204 * Expect the breakpoint instruction to be the smallest size instruction for 243 * Expect the breakpoint instruction to be the smallest size instruction for
205 * the architecture. If an arch has variable length instruction and the 244 * the architecture. If an arch has variable length instruction and the
206 * breakpoint instruction is not of the smallest length instruction 245 * breakpoint instruction is not of the smallest length instruction
207 * supported by that architecture then we need to modify is_swbp_at_addr and 246 * supported by that architecture then we need to modify is_trap_at_addr and
208 * write_opcode accordingly. This would never be a problem for archs that 247 * write_opcode accordingly. This would never be a problem for archs that
209 * have fixed length instructions. 248 * have fixed length instructions.
210 */ 249 */
@@ -225,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
225 uprobe_opcode_t opcode) 264 uprobe_opcode_t opcode)
226{ 265{
227 struct page *old_page, *new_page; 266 struct page *old_page, *new_page;
228 void *vaddr_old, *vaddr_new;
229 struct vm_area_struct *vma; 267 struct vm_area_struct *vma;
230 int ret; 268 int ret;
231 269
@@ -246,15 +284,8 @@ retry:
246 284
247 __SetPageUptodate(new_page); 285 __SetPageUptodate(new_page);
248 286
249 /* copy the page now that we've got it stable */ 287 copy_highpage(new_page, old_page);
250 vaddr_old = kmap_atomic(old_page); 288 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
251 vaddr_new = kmap_atomic(new_page);
252
253 memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
254 memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
255
256 kunmap_atomic(vaddr_new);
257 kunmap_atomic(vaddr_old);
258 289
259 ret = anon_vma_prepare(vma); 290 ret = anon_vma_prepare(vma);
260 if (ret) 291 if (ret)
@@ -477,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
477 unsigned long nbytes, loff_t offset) 508 unsigned long nbytes, loff_t offset)
478{ 509{
479 struct page *page; 510 struct page *page;
480 void *vaddr;
481 unsigned long off;
482 pgoff_t idx;
483
484 if (!filp)
485 return -EINVAL;
486 511
487 if (!mapping->a_ops->readpage) 512 if (!mapping->a_ops->readpage)
488 return -EIO; 513 return -EIO;
489
490 idx = offset >> PAGE_CACHE_SHIFT;
491 off = offset & ~PAGE_MASK;
492
493 /* 514 /*
494 * Ensure that the page that has the original instruction is 515 * Ensure that the page that has the original instruction is
495 * populated and in page-cache. 516 * populated and in page-cache.
496 */ 517 */
497 page = read_mapping_page(mapping, idx, filp); 518 page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
498 if (IS_ERR(page)) 519 if (IS_ERR(page))
499 return PTR_ERR(page); 520 return PTR_ERR(page);
500 521
501 vaddr = kmap_atomic(page); 522 copy_from_page(page, offset, insn, nbytes);
502 memcpy(insn, vaddr + off, nbytes);
503 kunmap_atomic(vaddr);
504 page_cache_release(page); 523 page_cache_release(page);
505 524
506 return 0; 525 return 0;
@@ -550,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
550 goto out; 569 goto out;
551 570
552 ret = -ENOTSUPP; 571 ret = -ENOTSUPP;
553 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) 572 if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
554 goto out; 573 goto out;
555 574
556 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); 575 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -758,7 +777,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
758 down_write(&mm->mmap_sem); 777 down_write(&mm->mmap_sem);
759 vma = find_vma(mm, info->vaddr); 778 vma = find_vma(mm, info->vaddr);
760 if (!vma || !valid_vma(vma, is_register) || 779 if (!vma || !valid_vma(vma, is_register) ||
761 vma->vm_file->f_mapping->host != uprobe->inode) 780 file_inode(vma->vm_file) != uprobe->inode)
762 goto unlock; 781 goto unlock;
763 782
764 if (vma->vm_start > info->vaddr || 783 if (vma->vm_start > info->vaddr ||
@@ -828,6 +847,10 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
828 struct uprobe *uprobe; 847 struct uprobe *uprobe;
829 int ret; 848 int ret;
830 849
850 /* Uprobe must have at least one set consumer */
851 if (!uc->handler && !uc->ret_handler)
852 return -EINVAL;
853
831 /* Racy, just to catch the obvious mistakes */ 854 /* Racy, just to catch the obvious mistakes */
832 if (offset > i_size_read(inode)) 855 if (offset > i_size_read(inode))
833 return -EINVAL; 856 return -EINVAL;
@@ -917,7 +940,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
917 loff_t offset; 940 loff_t offset;
918 941
919 if (!valid_vma(vma, false) || 942 if (!valid_vma(vma, false) ||
920 vma->vm_file->f_mapping->host != uprobe->inode) 943 file_inode(vma->vm_file) != uprobe->inode)
921 continue; 944 continue;
922 945
923 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; 946 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
@@ -1010,7 +1033,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
1010 if (no_uprobe_events() || !valid_vma(vma, true)) 1033 if (no_uprobe_events() || !valid_vma(vma, true))
1011 return 0; 1034 return 0;
1012 1035
1013 inode = vma->vm_file->f_mapping->host; 1036 inode = file_inode(vma->vm_file);
1014 if (!inode) 1037 if (!inode)
1015 return 0; 1038 return 0;
1016 1039
@@ -1041,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
1041 struct inode *inode; 1064 struct inode *inode;
1042 struct rb_node *n; 1065 struct rb_node *n;
1043 1066
1044 inode = vma->vm_file->f_mapping->host; 1067 inode = file_inode(vma->vm_file);
1045 1068
1046 min = vaddr_to_offset(vma, start); 1069 min = vaddr_to_offset(vma, start);
1047 max = min + (end - start) - 1; 1070 max = min + (end - start) - 1;
@@ -1114,6 +1137,7 @@ static struct xol_area *get_xol_area(void)
1114{ 1137{
1115 struct mm_struct *mm = current->mm; 1138 struct mm_struct *mm = current->mm;
1116 struct xol_area *area; 1139 struct xol_area *area;
1140 uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1117 1141
1118 area = mm->uprobes_state.xol_area; 1142 area = mm->uprobes_state.xol_area;
1119 if (area) 1143 if (area)
@@ -1131,7 +1155,12 @@ static struct xol_area *get_xol_area(void)
1131 if (!area->page) 1155 if (!area->page)
1132 goto free_bitmap; 1156 goto free_bitmap;
1133 1157
1158 /* allocate first slot of task's xol_area for the return probes */
1159 set_bit(0, area->bitmap);
1160 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1161 atomic_set(&area->slot_count, 1);
1134 init_waitqueue_head(&area->wq); 1162 init_waitqueue_head(&area->wq);
1163
1135 if (!xol_add_vma(area)) 1164 if (!xol_add_vma(area))
1136 return area; 1165 return area;
1137 1166
@@ -1216,9 +1245,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
1216static unsigned long xol_get_insn_slot(struct uprobe *uprobe) 1245static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1217{ 1246{
1218 struct xol_area *area; 1247 struct xol_area *area;
1219 unsigned long offset;
1220 unsigned long xol_vaddr; 1248 unsigned long xol_vaddr;
1221 void *vaddr;
1222 1249
1223 area = get_xol_area(); 1250 area = get_xol_area();
1224 if (!area) 1251 if (!area)
@@ -1229,10 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1229 return 0; 1256 return 0;
1230 1257
1231 /* Initialize the slot */ 1258 /* Initialize the slot */
1232 offset = xol_vaddr & ~PAGE_MASK; 1259 copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
1233 vaddr = kmap_atomic(area->page);
1234 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1235 kunmap_atomic(vaddr);
1236 /* 1260 /*
1237 * We probably need flush_icache_user_range() but it needs vma. 1261 * We probably need flush_icache_user_range() but it needs vma.
1238 * This should work on supported architectures too. 1262 * This should work on supported architectures too.
@@ -1298,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1298void uprobe_free_utask(struct task_struct *t) 1322void uprobe_free_utask(struct task_struct *t)
1299{ 1323{
1300 struct uprobe_task *utask = t->utask; 1324 struct uprobe_task *utask = t->utask;
1325 struct return_instance *ri, *tmp;
1301 1326
1302 if (!utask) 1327 if (!utask)
1303 return; 1328 return;
@@ -1305,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t)
1305 if (utask->active_uprobe) 1330 if (utask->active_uprobe)
1306 put_uprobe(utask->active_uprobe); 1331 put_uprobe(utask->active_uprobe);
1307 1332
1333 ri = utask->return_instances;
1334 while (ri) {
1335 tmp = ri;
1336 ri = ri->next;
1337
1338 put_uprobe(tmp->uprobe);
1339 kfree(tmp);
1340 }
1341
1308 xol_free_insn_slot(t); 1342 xol_free_insn_slot(t);
1309 kfree(utask); 1343 kfree(utask);
1310 t->utask = NULL; 1344 t->utask = NULL;
@@ -1333,6 +1367,93 @@ static struct uprobe_task *get_utask(void)
1333 return current->utask; 1367 return current->utask;
1334} 1368}
1335 1369
1370/*
1371 * Current area->vaddr notion assumes the trampoline address is always
1372 * equal to area->vaddr.
1373 *
1374 * Returns -1 in case the xol_area is not allocated.
1375 */
1376static unsigned long get_trampoline_vaddr(void)
1377{
1378 struct xol_area *area;
1379 unsigned long trampoline_vaddr = -1;
1380
1381 area = current->mm->uprobes_state.xol_area;
1382 smp_read_barrier_depends();
1383 if (area)
1384 trampoline_vaddr = area->vaddr;
1385
1386 return trampoline_vaddr;
1387}
1388
1389static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
1390{
1391 struct return_instance *ri;
1392 struct uprobe_task *utask;
1393 unsigned long orig_ret_vaddr, trampoline_vaddr;
1394 bool chained = false;
1395
1396 if (!get_xol_area())
1397 return;
1398
1399 utask = get_utask();
1400 if (!utask)
1401 return;
1402
1403 if (utask->depth >= MAX_URETPROBE_DEPTH) {
1404 printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
1405 " nestedness limit pid/tgid=%d/%d\n",
1406 current->pid, current->tgid);
1407 return;
1408 }
1409
1410 ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
1411 if (!ri)
1412 goto fail;
1413
1414 trampoline_vaddr = get_trampoline_vaddr();
1415 orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
1416 if (orig_ret_vaddr == -1)
1417 goto fail;
1418
1419 /*
1420 * We don't want to keep the trampoline address on the stack; rather, keep
1421 * the original return address of the first caller through all the
1422 * subsequent instances. This also makes breakpoint unwrapping easier.
1423 */
1424 if (orig_ret_vaddr == trampoline_vaddr) {
1425 if (!utask->return_instances) {
1426 /*
1427 * This situation is not possible. Likely we have an
1428 * attack from user-space.
1429 */
1430 pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
1431 current->pid, current->tgid);
1432 goto fail;
1433 }
1434
1435 chained = true;
1436 orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
1437 }
1438
1439 atomic_inc(&uprobe->ref);
1440 ri->uprobe = uprobe;
1441 ri->func = instruction_pointer(regs);
1442 ri->orig_ret_vaddr = orig_ret_vaddr;
1443 ri->chained = chained;
1444
1445 utask->depth++;
1446
1447 /* add instance to the stack */
1448 ri->next = utask->return_instances;
1449 utask->return_instances = ri;
1450
1451 return;
1452
1453 fail:
1454 kfree(ri);
1455}
1456
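
prepare_uretprobe() records each hijacked return address in a per-task stack of return_instance records. A sketch of the layout implied by the fields used here; the authoritative definition lives elsewhere in this file and may carry additional members:

struct return_instance {
	struct uprobe		*uprobe;	/* probe that owns this frame; a reference is held */
	unsigned long		func;		/* entry address of the probed function */
	unsigned long		orig_ret_vaddr;	/* return address displaced by the trampoline */
	bool			chained;	/* the displaced address was itself the trampoline */
	struct return_instance	*next;		/* older frames; utask->return_instances is the top */
};

The chained flag is what lets handle_trampoline() further down collapse nested hits back to the first caller's real return address.
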
1336/* Prepare to single-step probed instruction out of line. */ 1457/* Prepare to single-step probed instruction out of line. */
1337static int 1458static int
1338pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) 1459pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
@@ -1431,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
1431 clear_bit(MMF_HAS_UPROBES, &mm->flags); 1552 clear_bit(MMF_HAS_UPROBES, &mm->flags);
1432} 1553}
1433 1554
1434static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) 1555static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
1435{ 1556{
1436 struct page *page; 1557 struct page *page;
1437 uprobe_opcode_t opcode; 1558 uprobe_opcode_t opcode;
@@ -1449,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
1449 if (result < 0) 1570 if (result < 0)
1450 return result; 1571 return result;
1451 1572
1452 copy_opcode(page, vaddr, &opcode); 1573 copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
1453 put_page(page); 1574 put_page(page);
1454 out: 1575 out:
1455 return is_swbp_insn(&opcode); 1576 /* This needs to return true for any variant of the trap insn */
1577 return is_trap_insn(&opcode);
1456} 1578}
1457 1579
1458static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) 1580static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
@@ -1465,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1465 vma = find_vma(mm, bp_vaddr); 1587 vma = find_vma(mm, bp_vaddr);
1466 if (vma && vma->vm_start <= bp_vaddr) { 1588 if (vma && vma->vm_start <= bp_vaddr) {
1467 if (valid_vma(vma, false)) { 1589 if (valid_vma(vma, false)) {
1468 struct inode *inode = vma->vm_file->f_mapping->host; 1590 struct inode *inode = file_inode(vma->vm_file);
1469 loff_t offset = vaddr_to_offset(vma, bp_vaddr); 1591 loff_t offset = vaddr_to_offset(vma, bp_vaddr);
1470 1592
1471 uprobe = find_uprobe(inode, offset); 1593 uprobe = find_uprobe(inode, offset);
1472 } 1594 }
1473 1595
1474 if (!uprobe) 1596 if (!uprobe)
1475 *is_swbp = is_swbp_at_addr(mm, bp_vaddr); 1597 *is_swbp = is_trap_at_addr(mm, bp_vaddr);
1476 } else { 1598 } else {
1477 *is_swbp = -EFAULT; 1599 *is_swbp = -EFAULT;
1478 } 1600 }
@@ -1488,16 +1610,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1488{ 1610{
1489 struct uprobe_consumer *uc; 1611 struct uprobe_consumer *uc;
1490 int remove = UPROBE_HANDLER_REMOVE; 1612 int remove = UPROBE_HANDLER_REMOVE;
1613 bool need_prep = false; /* prepare return uprobe, when needed */
1491 1614
1492 down_read(&uprobe->register_rwsem); 1615 down_read(&uprobe->register_rwsem);
1493 for (uc = uprobe->consumers; uc; uc = uc->next) { 1616 for (uc = uprobe->consumers; uc; uc = uc->next) {
1494 int rc = uc->handler(uc, regs); 1617 int rc = 0;
1618
1619 if (uc->handler) {
1620 rc = uc->handler(uc, regs);
1621 WARN(rc & ~UPROBE_HANDLER_MASK,
1622 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1623 }
1624
1625 if (uc->ret_handler)
1626 need_prep = true;
1495 1627
1496 WARN(rc & ~UPROBE_HANDLER_MASK,
1497 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1498 remove &= rc; 1628 remove &= rc;
1499 } 1629 }
1500 1630
1631 if (need_prep && !remove)
1632 prepare_uretprobe(uprobe, regs); /* put bp at return */
1633
1501 if (remove && uprobe->consumers) { 1634 if (remove && uprobe->consumers) {
1502 WARN_ON(!uprobe_is_active(uprobe)); 1635 WARN_ON(!uprobe_is_active(uprobe));
1503 unapply_uprobe(uprobe, current->mm); 1636 unapply_uprobe(uprobe, current->mm);
@@ -1505,6 +1638,64 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1505 up_read(&uprobe->register_rwsem); 1638 up_read(&uprobe->register_rwsem);
1506} 1639}
1507 1640
1641static void
1642handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
1643{
1644 struct uprobe *uprobe = ri->uprobe;
1645 struct uprobe_consumer *uc;
1646
1647 down_read(&uprobe->register_rwsem);
1648 for (uc = uprobe->consumers; uc; uc = uc->next) {
1649 if (uc->ret_handler)
1650 uc->ret_handler(uc, ri->func, regs);
1651 }
1652 up_read(&uprobe->register_rwsem);
1653}
1654
1655static bool handle_trampoline(struct pt_regs *regs)
1656{
1657 struct uprobe_task *utask;
1658 struct return_instance *ri, *tmp;
1659 bool chained;
1660
1661 utask = current->utask;
1662 if (!utask)
1663 return false;
1664
1665 ri = utask->return_instances;
1666 if (!ri)
1667 return false;
1668
1669 /*
1670 * TODO: we should throw out return_instance's invalidated by
1671 * longjmp(), currently we assume that the probed function always
1672 * returns.
1673 */
1674 instruction_pointer_set(regs, ri->orig_ret_vaddr);
1675
1676 for (;;) {
1677 handle_uretprobe_chain(ri, regs);
1678
1679 chained = ri->chained;
1680 put_uprobe(ri->uprobe);
1681
1682 tmp = ri;
1683 ri = ri->next;
1684 kfree(tmp);
1685
1686 if (!chained)
1687 break;
1688
1689 utask->depth--;
1690
1691 BUG_ON(!ri);
1692 }
1693
1694 utask->return_instances = ri;
1695
1696 return true;
1697}
1698
1508/* 1699/*
1509 * Run handler and ask thread to singlestep. 1700 * Run handler and ask thread to singlestep.
1510 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1701 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1516,8 +1707,15 @@ static void handle_swbp(struct pt_regs *regs)
1516 int uninitialized_var(is_swbp); 1707 int uninitialized_var(is_swbp);
1517 1708
1518 bp_vaddr = uprobe_get_swbp_addr(regs); 1709 bp_vaddr = uprobe_get_swbp_addr(regs);
1519 uprobe = find_active_uprobe(bp_vaddr, &is_swbp); 1710 if (bp_vaddr == get_trampoline_vaddr()) {
1711 if (handle_trampoline(regs))
1712 return;
1713
1714 pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
1715 current->pid, current->tgid);
1716 }
1520 1717
1718 uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
1521 if (!uprobe) { 1719 if (!uprobe) {
1522 if (is_swbp > 0) { 1720 if (is_swbp > 0) {
1523 /* No matching uprobe; signal SIGTRAP. */ 1721 /* No matching uprobe; signal SIGTRAP. */
@@ -1616,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs)
1616 */ 1814 */
1617int uprobe_pre_sstep_notifier(struct pt_regs *regs) 1815int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1618{ 1816{
1619 if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags)) 1817 if (!current->mm)
1818 return 0;
1819
1820 if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
1821 (!current->utask || !current->utask->return_instances))
1620 return 0; 1822 return 0;
1621 1823
1622 set_thread_flag(TIF_UPROBE); 1824 set_thread_flag(TIF_UPROBE);
diff --git a/kernel/exit.c b/kernel/exit.c
index 51e485ca9935..af2eb3cbd499 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -835,7 +835,7 @@ void do_exit(long code)
835 /* 835 /*
836 * Make sure we are holding no locks: 836 * Make sure we are holding no locks:
837 */ 837 */
838 debug_check_no_locks_held(); 838 debug_check_no_locks_held(tsk);
839 /* 839 /*
840 * We can do this unlocked here. The futex code uses this flag 840 * We can do this unlocked here. The futex code uses this flag
841 * just to verify whether the pi state cleanup has been done 841 * just to verify whether the pi state cleanup has been done
@@ -847,7 +847,7 @@ void do_exit(long code)
847 exit_io_context(tsk); 847 exit_io_context(tsk);
848 848
849 if (tsk->splice_pipe) 849 if (tsk->splice_pipe)
850 __free_pipe_info(tsk->splice_pipe); 850 free_pipe_info(tsk->splice_pipe);
851 851
852 if (tsk->task_frag.page) 852 if (tsk->task_frag.page)
853 put_page(tsk->task_frag.page); 853 put_page(tsk->task_frag.page);
@@ -1629,9 +1629,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1629 } 1629 }
1630 1630
1631 put_pid(pid); 1631 put_pid(pid);
1632
1633 /* avoid REGPARM breakage on x86: */
1634 asmlinkage_protect(5, ret, which, upid, infop, options, ru);
1635 return ret; 1632 return ret;
1636} 1633}
1637 1634
@@ -1669,8 +1666,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1669 ret = do_wait(&wo); 1666 ret = do_wait(&wo);
1670 put_pid(pid); 1667 put_pid(pid);
1671 1668
1672 /* avoid REGPARM breakage on x86: */
1673 asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
1674 return ret; 1669 return ret;
1675} 1670}
1676 1671
diff --git a/kernel/extable.c b/kernel/extable.c
index fe35a634bf76..67460b93b1a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,10 +41,10 @@ u32 __initdata main_extable_sort_needed = 1;
41/* Sort the kernel's built-in exception table */ 41/* Sort the kernel's built-in exception table */
42void __init sort_main_extable(void) 42void __init sort_main_extable(void)
43{ 43{
44 if (main_extable_sort_needed) 44 if (main_extable_sort_needed) {
45 pr_notice("Sorting __ex_table...\n");
45 sort_extable(__start___ex_table, __stop___ex_table); 46 sort_extable(__start___ex_table, __stop___ex_table);
46 else 47 }
47 pr_notice("__ex_table already sorted, skipping sort\n");
48} 48}
49 49
50/* Given an address, look for it in the exception tables. */ 50/* Given an address, look for it in the exception tables. */
diff --git a/kernel/fork.c b/kernel/fork.c
index 8d932b1c9056..7d40687b1434 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1142 return ERR_PTR(-EINVAL); 1142 return ERR_PTR(-EINVAL);
1143 1143
1144 if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
1145 return ERR_PTR(-EINVAL);
1146
1144 /* 1147 /*
1145 * Thread groups must share signals as well, and detached threads 1148 * Thread groups must share signals as well, and detached threads
1146 * can only be started up within the thread group. 1149 * can only be started up within the thread group.
@@ -1230,7 +1233,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1230 1233
1231 p->utime = p->stime = p->gtime = 0; 1234 p->utime = p->stime = p->gtime = 0;
1232 p->utimescaled = p->stimescaled = 0; 1235 p->utimescaled = p->stimescaled = 0;
1233#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1236#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
1234 p->prev_cputime.utime = p->prev_cputime.stime = 0; 1237 p->prev_cputime.utime = p->prev_cputime.stime = 0;
1235#endif 1238#endif
1236#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1239#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -1674,10 +1677,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1674 int, tls_val) 1677 int, tls_val)
1675#endif 1678#endif
1676{ 1679{
1677 long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); 1680 return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
1678 asmlinkage_protect(5, ret, clone_flags, newsp,
1679 parent_tidptr, child_tidptr, tls_val);
1680 return ret;
1681} 1681}
1682#endif 1682#endif
1683 1683
@@ -1807,7 +1807,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1807 * If unsharing a user namespace must also unshare the thread. 1807 * If unsharing a user namespace must also unshare the thread.
1808 */ 1808 */
1809 if (unshare_flags & CLONE_NEWUSER) 1809 if (unshare_flags & CLONE_NEWUSER)
1810 unshare_flags |= CLONE_THREAD; 1810 unshare_flags |= CLONE_THREAD | CLONE_FS;
1811 /* 1811 /*
1812 * If unsharing a pid namespace must also unshare the thread. 1812 * If unsharing a pid namespace must also unshare the thread.
1813 */ 1813 */
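
The two fork.c hunks above mean a task can no longer enter a new user namespace while still sharing its fs_struct: clone() rejects the combination outright, and unshare(CLONE_NEWUSER) now implies CLONE_FS (and CLONE_THREAD), so the fs_struct is unshared along with the user namespace. A small, hypothetical user-space demonstration of the clone() side, not part of the patch:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int child_fn(void *arg)
{
	return 0;
}

int main(void)
{
	char *stack = malloc(64 * 1024);

	if (!stack)
		return 1;

	/*
	 * With this patch, copy_process() returns -EINVAL for
	 * CLONE_NEWUSER | CLONE_FS, so this clone() is expected to fail.
	 * (stack + size is the stack top, assuming it grows downwards.)
	 */
	if (clone(child_fn, stack + 64 * 1024,
		  CLONE_NEWUSER | CLONE_FS | SIGCHLD, NULL) == -1)
		printf("clone: %s (expected EINVAL)\n", strerror(errno));

	free(stack);
	return 0;
}
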
diff --git a/kernel/futex.c b/kernel/futex.c
index f0090a993dab..b26dcfc02c94 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -223,7 +223,8 @@ static void drop_futex_key_refs(union futex_key *key)
223 * @rw: mapping needs to be read/write (values: VERIFY_READ, 223 * @rw: mapping needs to be read/write (values: VERIFY_READ,
224 * VERIFY_WRITE) 224 * VERIFY_WRITE)
225 * 225 *
226 * Returns a negative error code or 0 226 * Return: a negative error code or 0
227 *
227 * The key words are stored in *key on success. 228 * The key words are stored in *key on success.
228 * 229 *
229 * For shared mappings, it's (page->index, file_inode(vma->vm_file), 230 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
@@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
705 * be "current" except in the case of requeue pi. 706 * be "current" except in the case of requeue pi.
706 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 707 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
707 * 708 *
708 * Returns: 709 * Return:
709 * 0 - ready to wait 710 * 0 - ready to wait;
710 * 1 - acquired the lock 711 * 1 - acquired the lock;
711 * <0 - error 712 * <0 - error
712 * 713 *
713 * The hb->lock and futex_key refs shall be held by the caller. 714 * The hb->lock and futex_key refs shall be held by the caller.
@@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1191 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. 1192 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1192 * hb1 and hb2 must be held by the caller. 1193 * hb1 and hb2 must be held by the caller.
1193 * 1194 *
1194 * Returns: 1195 * Return:
1195 * 0 - failed to acquire the lock atomicly 1196 * 0 - failed to acquire the lock atomically;
1196 * 1 - acquired the lock 1197 * 1 - acquired the lock;
1197 * <0 - error 1198 * <0 - error
1198 */ 1199 */
1199static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1200static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1254 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1255 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1255 * uaddr2 atomically on behalf of the top waiter. 1256 * uaddr2 atomically on behalf of the top waiter.
1256 * 1257 *
1257 * Returns: 1258 * Return:
1258 * >=0 - on success, the number of tasks requeued or woken 1259 * >=0 - on success, the number of tasks requeued or woken;
1259 * <0 - on error 1260 * <0 - on error
1260 */ 1261 */
1261static int futex_requeue(u32 __user *uaddr1, unsigned int flags, 1262static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1536 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must 1537 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1537 * be paired with exactly one earlier call to queue_me(). 1538 * be paired with exactly one earlier call to queue_me().
1538 * 1539 *
1539 * Returns: 1540 * Return:
1540 1 - if the futex_q was still queued (and we unqueued it) 1541 1 - if the futex_q was still queued (and we unqueued it);
1541 * 0 - if the futex_q was already removed by the waking thread 1542 * 0 - if the futex_q was already removed by the waking thread
1542 */ 1543 */
1543static int unqueue_me(struct futex_q *q) 1544static int unqueue_me(struct futex_q *q)
@@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart);
1707 * the pi_state owner as well as handle race conditions that may allow us to 1708 * the pi_state owner as well as handle race conditions that may allow us to
1708 * acquire the lock. Must be called with the hb lock held. 1709 * acquire the lock. Must be called with the hb lock held.
1709 * 1710 *
1710 * Returns: 1711 * Return:
1711 * 1 - success, lock taken 1712 * 1 - success, lock taken;
1712 * 0 - success, lock not taken 1713 * 0 - success, lock not taken;
1713 * <0 - on error (-EFAULT) 1714 * <0 - on error (-EFAULT)
1714 */ 1715 */
1715static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 1716static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1824 * Return with the hb lock held and a q.key reference on success, and unlocked 1825 * Return with the hb lock held and a q.key reference on success, and unlocked
1825 * with no q.key reference on failure. 1826 * with no q.key reference on failure.
1826 * 1827 *
1827 * Returns: 1828 * Return:
1828 * 0 - uaddr contains val and hb has been locked 1829 * 0 - uaddr contains val and hb has been locked;
1829 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 1830 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1830 */ 1831 */
1831static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 1832static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -2203,9 +2204,9 @@ pi_faulted:
2203 * the wakeup and return the appropriate error code to the caller. Must be 2204 * the wakeup and return the appropriate error code to the caller. Must be
2204 * called with the hb lock held. 2205 * called with the hb lock held.
2205 * 2206 *
2206 * Returns 2207 * Return:
2207 * 0 - no early wakeup detected 2208 * 0 = no early wakeup detected;
2208 * <0 - -ETIMEDOUT or -ERESTARTNOINTR 2209 * <0 = -ETIMEDOUT or -ERESTARTNOINTR
2209 */ 2210 */
2210static inline 2211static inline
2211int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, 2212int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2247 * @val: the expected value of uaddr 2248 * @val: the expected value of uaddr
2248 * @abs_time: absolute timeout 2249 * @abs_time: absolute timeout
2249 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all 2250 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2250 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2251 * @uaddr2: the pi futex we will take prior to returning to user-space 2251 * @uaddr2: the pi futex we will take prior to returning to user-space
2252 * 2252 *
2253 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2253 * The caller will wait on uaddr and will be requeued by futex_requeue() to
@@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2258 * there was a need to. 2258 * there was a need to.
2259 * 2259 *
2260 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2260 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2261 * via the following: 2261 * via the following--
2262 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2262 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2263 * 2) wakeup on uaddr2 after a requeue 2263 * 2) wakeup on uaddr2 after a requeue
2264 * 3) signal 2264 * 3) signal
@@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2276 * 2276 *
2277 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2277 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2278 * 2278 *
2279 * Returns: 2279 * Return:
2280 * 0 - On success 2280 * 0 - On success;
2281 * <0 - On error 2281 * <0 - On error
2282 */ 2282 */
2283static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, 2283static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cc47812d3feb..fd4b13b131f8 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -63,6 +63,7 @@
63DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 63DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
64{ 64{
65 65
66 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
66 .clock_base = 67 .clock_base =
67 { 68 {
68 { 69 {
@@ -83,6 +84,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
83 .get_time = &ktime_get_boottime, 84 .get_time = &ktime_get_boottime,
84 .resolution = KTIME_LOW_RES, 85 .resolution = KTIME_LOW_RES,
85 }, 86 },
87 {
88 .index = HRTIMER_BASE_TAI,
89 .clockid = CLOCK_TAI,
90 .get_time = &ktime_get_clocktai,
91 .resolution = KTIME_LOW_RES,
92 },
86 } 93 }
87}; 94};
88 95
@@ -90,6 +97,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
90 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, 97 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
91 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, 98 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
92 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, 99 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
100 [CLOCK_TAI] = HRTIMER_BASE_TAI,
93}; 101};
94 102
95static inline int hrtimer_clockid_to_base(clockid_t clock_id) 103static inline int hrtimer_clockid_to_base(clockid_t clock_id)
@@ -106,8 +114,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
106{ 114{
107 ktime_t xtim, mono, boot; 115 ktime_t xtim, mono, boot;
108 struct timespec xts, tom, slp; 116 struct timespec xts, tom, slp;
117 s32 tai_offset;
109 118
110 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); 119 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
120 tai_offset = timekeeping_get_tai_offset();
111 121
112 xtim = timespec_to_ktime(xts); 122 xtim = timespec_to_ktime(xts);
113 mono = ktime_add(xtim, timespec_to_ktime(tom)); 123 mono = ktime_add(xtim, timespec_to_ktime(tom));
@@ -115,6 +125,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
115 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; 125 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
116 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; 126 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
117 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; 127 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
128 base->clock_base[HRTIMER_BASE_TAI].softirq_time =
129 ktime_add(xtim, ktime_set(tai_offset, 0));
118} 130}
119 131
120/* 132/*
@@ -160,7 +172,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
160 */ 172 */
161static int hrtimer_get_target(int this_cpu, int pinned) 173static int hrtimer_get_target(int this_cpu, int pinned)
162{ 174{
163#ifdef CONFIG_NO_HZ 175#ifdef CONFIG_NO_HZ_COMMON
164 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) 176 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
165 return get_nohz_timer_target(); 177 return get_nohz_timer_target();
166#endif 178#endif
@@ -275,6 +287,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
275 } else { 287 } else {
276 unsigned long rem = do_div(nsec, NSEC_PER_SEC); 288 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
277 289
290 /* Make sure nsec fits into long */
291 if (unlikely(nsec > KTIME_SEC_MAX))
292 return (ktime_t){ .tv64 = KTIME_MAX };
293
278 tmp = ktime_set((long)nsec, rem); 294 tmp = ktime_set((long)nsec, rem);
279 } 295 }
280 296
@@ -651,8 +667,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
651{ 667{
652 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; 668 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
653 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; 669 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
670 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
654 671
655 return ktime_get_update_offsets(offs_real, offs_boot); 672 return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
656} 673}
657 674
658/* 675/*
@@ -1010,7 +1027,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1010 * @timer: the timer to be added 1027 * @timer: the timer to be added
1011 * @tim: expiry time 1028 * @tim: expiry time
1012 * @delta_ns: "slack" range for the timer 1029 * @delta_ns: "slack" range for the timer
1013 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 1030 * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
1031 * relative (HRTIMER_MODE_REL)
1014 * 1032 *
1015 * Returns: 1033 * Returns:
1016 * 0 on success 1034 * 0 on success
@@ -1027,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
1027 * hrtimer_start - (re)start an hrtimer on the current CPU 1045 * hrtimer_start - (re)start an hrtimer on the current CPU
1028 * @timer: the timer to be added 1046 * @timer: the timer to be added
1029 * @tim: expiry time 1047 * @tim: expiry time
1030 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 1048 * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
1049 * relative (HRTIMER_MODE_REL)
1031 * 1050 *
1032 * Returns: 1051 * Returns:
1033 * 0 on success 1052 * 0 on success
@@ -1106,7 +1125,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1106} 1125}
1107EXPORT_SYMBOL_GPL(hrtimer_get_remaining); 1126EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1108 1127
1109#ifdef CONFIG_NO_HZ 1128#ifdef CONFIG_NO_HZ_COMMON
1110/** 1129/**
1111 * hrtimer_get_next_event - get the time until next expiry event 1130 * hrtimer_get_next_event - get the time until next expiry event
1112 * 1131 *
@@ -1309,6 +1328,8 @@ retry:
1309 1328
1310 expires = ktime_sub(hrtimer_get_expires(timer), 1329 expires = ktime_sub(hrtimer_get_expires(timer),
1311 base->offset); 1330 base->offset);
1331 if (expires.tv64 < 0)
1332 expires.tv64 = KTIME_MAX;
1312 if (expires.tv64 < expires_next.tv64) 1333 if (expires.tv64 < expires_next.tv64)
1313 expires_next = expires; 1334 expires_next = expires;
1314 break; 1335 break;
@@ -1642,8 +1663,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1642 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1663 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1643 int i; 1664 int i;
1644 1665
1645 raw_spin_lock_init(&cpu_base->lock);
1646
1647 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1666 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1648 cpu_base->clock_base[i].cpu_base = cpu_base; 1667 cpu_base->clock_base[i].cpu_base = cpu_base;
1649 timerqueue_init_head(&cpu_base->clock_base[i].active); 1668 timerqueue_init_head(&cpu_base->clock_base[i].active);
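
With the new HRTIMER_BASE_TAI base and the CLOCK_TAI entry in the clock-to-base table, kernel code can arm an hrtimer against TAI like any other clock. A minimal hypothetical sketch; the callback and the one-second expiry are made up:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer tai_timer;

static enum hrtimer_restart tai_timer_fn(struct hrtimer *t)
{
	/* fires on a TAI timeline, which does not step across leap seconds */
	return HRTIMER_NORESTART;
}

static void arm_tai_timer(void)
{
	hrtimer_init(&tai_timer, CLOCK_TAI, HRTIMER_MODE_REL);
	tai_timer.function = tai_timer_fn;
	hrtimer_start(&tai_timer, ktime_set(1, 0), HRTIMER_MODE_REL);
}

The offs_tai offset added to hrtimer_update_base() is what keeps such timers correct when the TAI offset is changed at run time.
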
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 96f3a1d9c379..5a83dde8ca0c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -462,9 +462,23 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
462 if (domain->ops->map) { 462 if (domain->ops->map) {
463 ret = domain->ops->map(domain, virq, hwirq); 463 ret = domain->ops->map(domain, virq, hwirq);
464 if (ret != 0) { 464 if (ret != 0) {
465 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", 465 /*
466 virq, hwirq, ret); 466 * If map() returns -EPERM, this interrupt is protected
467 WARN_ON(1); 467 * by the firmware or some other service and shall not
468 * be mapped.
469 *
470 * Since on some platforms we blindly try to map everything
471 * we end up with a log full of backtraces.
472 *
473 * So instead, we silently fail on -EPERM, it is the
474 * responsibility of the PIC driver to display a relevant
475 * message if needed.
476 */
477 if (ret != -EPERM) {
478 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
479 virq, hwirq, ret);
480 WARN_ON(1);
481 }
468 irq_data->domain = NULL; 482 irq_data->domain = NULL;
469 irq_data->hwirq = 0; 483 irq_data->hwirq = 0;
470 goto err_unmap; 484 goto err_unmap;
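
The -EPERM special case above gives ->map() implementations a quiet way to refuse individual hwirqs. A hedged sketch of a PIC driver using it; the my_pic_* names and the "owned by firmware" test are hypothetical:

#include <linux/irq.h>
#include <linux/irqdomain.h>

static struct irq_chip my_pic_chip = {
	.name = "my-pic",	/* hypothetical chip, callbacks omitted */
};

static bool my_pic_hwirq_is_secure(irq_hw_number_t hwirq)
{
	return hwirq < 8;	/* hypothetical: first 8 lines owned by firmware */
}

static int my_pic_irq_map(struct irq_domain *d, unsigned int virq,
			  irq_hw_number_t hwirq)
{
	if (my_pic_hwirq_is_secure(hwirq))
		return -EPERM;	/* skipped silently by irq_domain_associate_many() */

	irq_set_chip_and_handler(virq, &my_pic_chip, handle_level_irq);
	return 0;
}

static const struct irq_domain_ops my_pic_domain_ops = {
	.map	= my_pic_irq_map,
	.xlate	= irq_domain_xlate_onecell,
};

Any other error code still takes the pr_err()/WARN_ON() path, so -EPERM is strictly a "not for Linux" signal rather than a general error return.
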
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 397db02209ed..19ed5c425c3b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
76static ssize_t write_irq_affinity(int type, struct file *file, 76static ssize_t write_irq_affinity(int type, struct file *file,
77 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
78{ 78{
79 unsigned int irq = (int)(long)PDE(file_inode(file))->data; 79 unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
80 cpumask_var_t new_value; 80 cpumask_var_t new_value;
81 int err; 81 int err;
82 82
@@ -131,17 +131,17 @@ static ssize_t irq_affinity_list_proc_write(struct file *file,
131 131
132static int irq_affinity_proc_open(struct inode *inode, struct file *file) 132static int irq_affinity_proc_open(struct inode *inode, struct file *file)
133{ 133{
134 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 134 return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
135} 135}
136 136
137static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) 137static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
138{ 138{
139 return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); 139 return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
140} 140}
141 141
142static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) 142static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
143{ 143{
144 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); 144 return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode));
145} 145}
146 146
147static const struct file_operations irq_affinity_proc_fops = { 147static const struct file_operations irq_affinity_proc_fops = {
@@ -212,7 +212,7 @@ out:
212 212
213static int default_affinity_open(struct inode *inode, struct file *file) 213static int default_affinity_open(struct inode *inode, struct file *file)
214{ 214{
215 return single_open(file, default_affinity_show, PDE(inode)->data); 215 return single_open(file, default_affinity_show, PDE_DATA(inode));
216} 216}
217 217
218static const struct file_operations default_affinity_proc_fops = { 218static const struct file_operations default_affinity_proc_fops = {
@@ -233,7 +233,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
233 233
234static int irq_node_proc_open(struct inode *inode, struct file *file) 234static int irq_node_proc_open(struct inode *inode, struct file *file)
235{ 235{
236 return single_open(file, irq_node_proc_show, PDE(inode)->data); 236 return single_open(file, irq_node_proc_show, PDE_DATA(inode));
237} 237}
238 238
239static const struct file_operations irq_node_proc_fops = { 239static const struct file_operations irq_node_proc_fops = {
@@ -256,7 +256,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
256 256
257static int irq_spurious_proc_open(struct inode *inode, struct file *file) 257static int irq_spurious_proc_open(struct inode *inode, struct file *file)
258{ 258{
259 return single_open(file, irq_spurious_proc_show, PDE(inode)->data); 259 return single_open(file, irq_spurious_proc_show, PDE_DATA(inode));
260} 260}
261 261
262static const struct file_operations irq_spurious_proc_fops = { 262static const struct file_operations irq_spurious_proc_fops = {
@@ -366,11 +366,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
366 366
367void unregister_handler_proc(unsigned int irq, struct irqaction *action) 367void unregister_handler_proc(unsigned int irq, struct irqaction *action)
368{ 368{
369 if (action->dir) { 369 proc_remove(action->dir);
370 struct irq_desc *desc = irq_to_desc(irq);
371
372 remove_proc_entry(action->dir->name, desc->dir);
373 }
374} 370}
375 371
376static void register_default_affinity_proc(void) 372static void register_default_affinity_proc(void)
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 2169feeba529..3127ad52cdb2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -84,9 +84,11 @@ static int is_ksym_addr(unsigned long addr)
84 84
85/* 85/*
86 * Expand a compressed symbol data into the resulting uncompressed string, 86 * Expand a compressed symbol data into the resulting uncompressed string,
87 * if uncompressed string is too long (>= maxlen), it will be truncated,
87 * given the offset to where the symbol is in the compressed stream. 88 * given the offset to where the symbol is in the compressed stream.
88 */ 89 */
89static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 90static unsigned int kallsyms_expand_symbol(unsigned int off,
91 char *result, size_t maxlen)
90{ 92{
91 int len, skipped_first = 0; 93 int len, skipped_first = 0;
92 const u8 *tptr, *data; 94 const u8 *tptr, *data;
@@ -113,15 +115,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
113 115
114 while (*tptr) { 116 while (*tptr) {
115 if (skipped_first) { 117 if (skipped_first) {
118 if (maxlen <= 1)
119 goto tail;
116 *result = *tptr; 120 *result = *tptr;
117 result++; 121 result++;
122 maxlen--;
118 } else 123 } else
119 skipped_first = 1; 124 skipped_first = 1;
120 tptr++; 125 tptr++;
121 } 126 }
122 } 127 }
123 128
124 *result = '\0'; 129tail:
130 if (maxlen)
131 *result = '\0';
125 132
126 /* Return to offset to the next symbol. */ 133 /* Return to offset to the next symbol. */
127 return off; 134 return off;
@@ -176,7 +183,7 @@ unsigned long kallsyms_lookup_name(const char *name)
176 unsigned int off; 183 unsigned int off;
177 184
178 for (i = 0, off = 0; i < kallsyms_num_syms; i++) { 185 for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
179 off = kallsyms_expand_symbol(off, namebuf); 186 off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
180 187
181 if (strcmp(namebuf, name) == 0) 188 if (strcmp(namebuf, name) == 0)
182 return kallsyms_addresses[i]; 189 return kallsyms_addresses[i];
@@ -195,7 +202,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
195 int ret; 202 int ret;
196 203
197 for (i = 0, off = 0; i < kallsyms_num_syms; i++) { 204 for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
198 off = kallsyms_expand_symbol(off, namebuf); 205 off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
199 ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); 206 ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
200 if (ret != 0) 207 if (ret != 0)
201 return ret; 208 return ret;
@@ -294,7 +301,8 @@ const char *kallsyms_lookup(unsigned long addr,
294 301
295 pos = get_symbol_pos(addr, symbolsize, offset); 302 pos = get_symbol_pos(addr, symbolsize, offset);
296 /* Grab name */ 303 /* Grab name */
297 kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); 304 kallsyms_expand_symbol(get_symbol_offset(pos),
305 namebuf, KSYM_NAME_LEN);
298 if (modname) 306 if (modname)
299 *modname = NULL; 307 *modname = NULL;
300 return namebuf; 308 return namebuf;
@@ -315,7 +323,8 @@ int lookup_symbol_name(unsigned long addr, char *symname)
315 323
316 pos = get_symbol_pos(addr, NULL, NULL); 324 pos = get_symbol_pos(addr, NULL, NULL);
317 /* Grab name */ 325 /* Grab name */
318 kallsyms_expand_symbol(get_symbol_offset(pos), symname); 326 kallsyms_expand_symbol(get_symbol_offset(pos),
327 symname, KSYM_NAME_LEN);
319 return 0; 328 return 0;
320 } 329 }
321 /* See if it's in a module. */ 330 /* See if it's in a module. */
@@ -333,7 +342,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
333 342
334 pos = get_symbol_pos(addr, size, offset); 343 pos = get_symbol_pos(addr, size, offset);
335 /* Grab name */ 344 /* Grab name */
336 kallsyms_expand_symbol(get_symbol_offset(pos), name); 345 kallsyms_expand_symbol(get_symbol_offset(pos),
346 name, KSYM_NAME_LEN);
337 modname[0] = '\0'; 347 modname[0] = '\0';
338 return 0; 348 return 0;
339 } 349 }
@@ -463,7 +473,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
463 473
464 iter->type = kallsyms_get_symbol_type(off); 474 iter->type = kallsyms_get_symbol_type(off);
465 475
466 off = kallsyms_expand_symbol(off, iter->name); 476 off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name));
467 477
468 return off - iter->nameoff; 478 return off - iter->nameoff;
469} 479}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bddd3d7a74b6..59f7b55ba745 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -55,7 +55,7 @@ struct resource crashk_res = {
55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
56}; 56};
57struct resource crashk_low_res = { 57struct resource crashk_low_res = {
58 .name = "Crash kernel low", 58 .name = "Crash kernel",
59 .start = 0, 59 .start = 0,
60 .end = 0, 60 .end = 0,
61 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 61 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
@@ -786,7 +786,7 @@ static int kimage_load_normal_segment(struct kimage *image,
786 struct kexec_segment *segment) 786 struct kexec_segment *segment)
787{ 787{
788 unsigned long maddr; 788 unsigned long maddr;
789 unsigned long ubytes, mbytes; 789 size_t ubytes, mbytes;
790 int result; 790 int result;
791 unsigned char __user *buf; 791 unsigned char __user *buf;
792 792
@@ -819,13 +819,9 @@ static int kimage_load_normal_segment(struct kimage *image,
819 /* Start with a clear page */ 819 /* Start with a clear page */
820 clear_page(ptr); 820 clear_page(ptr);
821 ptr += maddr & ~PAGE_MASK; 821 ptr += maddr & ~PAGE_MASK;
822 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 822 mchunk = min_t(size_t, mbytes,
823 if (mchunk > mbytes) 823 PAGE_SIZE - (maddr & ~PAGE_MASK));
824 mchunk = mbytes; 824 uchunk = min(ubytes, mchunk);
825
826 uchunk = mchunk;
827 if (uchunk > ubytes)
828 uchunk = ubytes;
829 825
830 result = copy_from_user(ptr, buf, uchunk); 826 result = copy_from_user(ptr, buf, uchunk);
831 kunmap(page); 827 kunmap(page);
@@ -850,7 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image,
850 * We do things a page at a time for the sake of kmap. 846 * We do things a page at a time for the sake of kmap.
851 */ 847 */
852 unsigned long maddr; 848 unsigned long maddr;
853 unsigned long ubytes, mbytes; 849 size_t ubytes, mbytes;
854 int result; 850 int result;
855 unsigned char __user *buf; 851 unsigned char __user *buf;
856 852
@@ -871,13 +867,10 @@ static int kimage_load_crash_segment(struct kimage *image,
871 } 867 }
872 ptr = kmap(page); 868 ptr = kmap(page);
873 ptr += maddr & ~PAGE_MASK; 869 ptr += maddr & ~PAGE_MASK;
874 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 870 mchunk = min_t(size_t, mbytes,
875 if (mchunk > mbytes) 871 PAGE_SIZE - (maddr & ~PAGE_MASK));
876 mchunk = mbytes; 872 uchunk = min(ubytes, mchunk);
877 873 if (mchunk > uchunk) {
878 uchunk = mchunk;
879 if (uchunk > ubytes) {
880 uchunk = ubytes;
881 /* Zero the trailing part of the page */ 874 /* Zero the trailing part of the page */
882 memset(ptr + uchunk, 0, mchunk - uchunk); 875 memset(ptr + uchunk, 0, mchunk - uchunk);
883 } 876 }
@@ -1118,12 +1111,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
1118{ 1111{
1119 unsigned long addr; 1112 unsigned long addr;
1120 1113
1121 for (addr = begin; addr < end; addr += PAGE_SIZE) { 1114 for (addr = begin; addr < end; addr += PAGE_SIZE)
1122 ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); 1115 free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
1123 init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1124 free_page((unsigned long)__va(addr));
1125 totalram_pages++;
1126 }
1127} 1116}
1128 1117
1129int crash_shrink_memory(unsigned long new_size) 1118int crash_shrink_memory(unsigned long new_size)
@@ -1368,35 +1357,114 @@ static int __init parse_crashkernel_simple(char *cmdline,
1368 return 0; 1357 return 0;
1369} 1358}
1370 1359
1360#define SUFFIX_HIGH 0
1361#define SUFFIX_LOW 1
1362#define SUFFIX_NULL 2
1363static __initdata char *suffix_tbl[] = {
1364 [SUFFIX_HIGH] = ",high",
1365 [SUFFIX_LOW] = ",low",
1366 [SUFFIX_NULL] = NULL,
1367};
1368
1371/* 1369/*
1372 * That function is the entry point for command line parsing and should be 1370 * That function parses "suffix" crashkernel command lines like
1373 * called from the arch-specific code. 1371 *
1372 * crashkernel=size,[high|low]
1373 *
1374 * It returns 0 on success and -EINVAL on failure.
1374 */ 1375 */
1376static int __init parse_crashkernel_suffix(char *cmdline,
1377 unsigned long long *crash_size,
1378 unsigned long long *crash_base,
1379 const char *suffix)
1380{
1381 char *cur = cmdline;
1382
1383 *crash_size = memparse(cmdline, &cur);
1384 if (cmdline == cur) {
1385 pr_warn("crashkernel: memory value expected\n");
1386 return -EINVAL;
1387 }
1388
1389 /* check with suffix */
1390 if (strncmp(cur, suffix, strlen(suffix))) {
1391 pr_warn("crashkernel: unrecognized char\n");
1392 return -EINVAL;
1393 }
1394 cur += strlen(suffix);
1395 if (*cur != ' ' && *cur != '\0') {
1396 pr_warn("crashkernel: unrecognized char\n");
1397 return -EINVAL;
1398 }
1399
1400 return 0;
1401}
1402
1403static __init char *get_last_crashkernel(char *cmdline,
1404 const char *name,
1405 const char *suffix)
1406{
1407 char *p = cmdline, *ck_cmdline = NULL;
1408
1409 /* find crashkernel and use the last one if there are more */
1410 p = strstr(p, name);
1411 while (p) {
1412 char *end_p = strchr(p, ' ');
1413 char *q;
1414
1415 if (!end_p)
1416 end_p = p + strlen(p);
1417
1418 if (!suffix) {
1419 int i;
1420
1421 /* skip the one with any known suffix */
1422 for (i = 0; suffix_tbl[i]; i++) {
1423 q = end_p - strlen(suffix_tbl[i]);
1424 if (!strncmp(q, suffix_tbl[i],
1425 strlen(suffix_tbl[i])))
1426 goto next;
1427 }
1428 ck_cmdline = p;
1429 } else {
1430 q = end_p - strlen(suffix);
1431 if (!strncmp(q, suffix, strlen(suffix)))
1432 ck_cmdline = p;
1433 }
1434next:
1435 p = strstr(p+1, name);
1436 }
1437
1438 if (!ck_cmdline)
1439 return NULL;
1440
1441 return ck_cmdline;
1442}
1443
1375static int __init __parse_crashkernel(char *cmdline, 1444static int __init __parse_crashkernel(char *cmdline,
1376 unsigned long long system_ram, 1445 unsigned long long system_ram,
1377 unsigned long long *crash_size, 1446 unsigned long long *crash_size,
1378 unsigned long long *crash_base, 1447 unsigned long long *crash_base,
1379 const char *name) 1448 const char *name,
1449 const char *suffix)
1380{ 1450{
1381 char *p = cmdline, *ck_cmdline = NULL;
1382 char *first_colon, *first_space; 1451 char *first_colon, *first_space;
1452 char *ck_cmdline;
1383 1453
1384 BUG_ON(!crash_size || !crash_base); 1454 BUG_ON(!crash_size || !crash_base);
1385 *crash_size = 0; 1455 *crash_size = 0;
1386 *crash_base = 0; 1456 *crash_base = 0;
1387 1457
1388 /* find crashkernel and use the last one if there are more */ 1458 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
1389 p = strstr(p, name);
1390 while (p) {
1391 ck_cmdline = p;
1392 p = strstr(p+1, name);
1393 }
1394 1459
1395 if (!ck_cmdline) 1460 if (!ck_cmdline)
1396 return -EINVAL; 1461 return -EINVAL;
1397 1462
1398 ck_cmdline += strlen(name); 1463 ck_cmdline += strlen(name);
1399 1464
1465 if (suffix)
1466 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1467 crash_base, suffix);
1400 /* 1468 /*
1401 * if the commandline contains a ':', then that's the extended 1469 * if the commandline contains a ':', then that's the extended
1402 * syntax -- if not, it must be the classic syntax 1470 * syntax -- if not, it must be the classic syntax
@@ -1413,13 +1481,26 @@ static int __init __parse_crashkernel(char *cmdline,
1413 return 0; 1481 return 0;
1414} 1482}
1415 1483
1484/*
1485 * That function is the entry point for command line parsing and should be
1486 * called from the arch-specific code.
1487 */
1416int __init parse_crashkernel(char *cmdline, 1488int __init parse_crashkernel(char *cmdline,
1417 unsigned long long system_ram, 1489 unsigned long long system_ram,
1418 unsigned long long *crash_size, 1490 unsigned long long *crash_size,
1419 unsigned long long *crash_base) 1491 unsigned long long *crash_base)
1420{ 1492{
1421 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, 1493 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1422 "crashkernel="); 1494 "crashkernel=", NULL);
1495}
1496
1497int __init parse_crashkernel_high(char *cmdline,
1498 unsigned long long system_ram,
1499 unsigned long long *crash_size,
1500 unsigned long long *crash_base)
1501{
1502 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1503 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
1423} 1504}
1424 1505
1425int __init parse_crashkernel_low(char *cmdline, 1506int __init parse_crashkernel_low(char *cmdline,
@@ -1428,7 +1509,7 @@ int __init parse_crashkernel_low(char *cmdline,
1428 unsigned long long *crash_base) 1509 unsigned long long *crash_base)
1429{ 1510{
1430 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, 1511 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1431 "crashkernel_low="); 1512 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
1432} 1513}
1433 1514
1434static void update_vmcoreinfo_note(void) 1515static void update_vmcoreinfo_note(void)
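
Together with the ",high"/",low" suffix table, the reworked parser lets a command line such as crashkernel=512M,high crashkernel=72M,low describe two separate reservations, each picked out by the matching helper. A hedged sketch of an arch-side caller, modelled loosely on how arch setup code consumes these helpers; the reservation details are elided:

static void __init reserve_crashkernel_sketch(void)
{
	unsigned long long high_size, high_base;
	unsigned long long low_size, low_base;

	/* matches the last "crashkernel=...,high" entry, if any */
	if (parse_crashkernel_high(boot_command_line, memblock_phys_mem_size(),
				   &high_size, &high_base) || !high_size)
		return;

	/* matches the last "crashkernel=...,low" entry, if any */
	if (parse_crashkernel_low(boot_command_line, memblock_phys_mem_size(),
				  &low_size, &low_base))
		low_size = 0;

	/*
	 * ... memblock-reserve high_size (possibly above 4G) and low_size
	 * below 4G, then publish them via crashk_res / crashk_low_res ...
	 */
}

Note that get_last_crashkernel() skips entries carrying a known suffix when called without one, so a plain crashkernel=128M on the same command line still parses independently of the suffixed entries.
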
@@ -1452,14 +1533,13 @@ void vmcoreinfo_append_str(const char *fmt, ...)
1452{ 1533{
1453 va_list args; 1534 va_list args;
1454 char buf[0x50]; 1535 char buf[0x50];
1455 int r; 1536 size_t r;
1456 1537
1457 va_start(args, fmt); 1538 va_start(args, fmt);
1458 r = vsnprintf(buf, sizeof(buf), fmt, args); 1539 r = vsnprintf(buf, sizeof(buf), fmt, args);
1459 va_end(args); 1540 va_end(args);
1460 1541
1461 if (r + vmcoreinfo_size > vmcoreinfo_max_size) 1542 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1462 r = vmcoreinfo_max_size - vmcoreinfo_size;
1463 1543
1464 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); 1544 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1465 1545
@@ -1489,7 +1569,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1489 VMCOREINFO_SYMBOL(swapper_pg_dir); 1569 VMCOREINFO_SYMBOL(swapper_pg_dir);
1490#endif 1570#endif
1491 VMCOREINFO_SYMBOL(_stext); 1571 VMCOREINFO_SYMBOL(_stext);
1492 VMCOREINFO_SYMBOL(vmlist); 1572 VMCOREINFO_SYMBOL(vmap_area_list);
1493 1573
1494#ifndef CONFIG_NEED_MULTIPLE_NODES 1574#ifndef CONFIG_NEED_MULTIPLE_NODES
1495 VMCOREINFO_SYMBOL(mem_map); 1575 VMCOREINFO_SYMBOL(mem_map);
@@ -1527,7 +1607,8 @@ static int __init crash_save_vmcoreinfo_init(void)
1527 VMCOREINFO_OFFSET(free_area, free_list); 1607 VMCOREINFO_OFFSET(free_area, free_list);
1528 VMCOREINFO_OFFSET(list_head, next); 1608 VMCOREINFO_OFFSET(list_head, next);
1529 VMCOREINFO_OFFSET(list_head, prev); 1609 VMCOREINFO_OFFSET(list_head, prev);
1530 VMCOREINFO_OFFSET(vm_struct, addr); 1610 VMCOREINFO_OFFSET(vmap_area, va_start);
1611 VMCOREINFO_OFFSET(vmap_area, list);
1531 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1612 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1532 log_buf_kexec_setup(); 1613 log_buf_kexec_setup();
1533 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1614 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 56dd34976d7b..1296e72e4161 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -77,6 +77,7 @@ static void free_modprobe_argv(struct subprocess_info *info)
77 77
78static int call_modprobe(char *module_name, int wait) 78static int call_modprobe(char *module_name, int wait)
79{ 79{
80 struct subprocess_info *info;
80 static char *envp[] = { 81 static char *envp[] = {
81 "HOME=/", 82 "HOME=/",
82 "TERM=linux", 83 "TERM=linux",
@@ -98,8 +99,15 @@ static int call_modprobe(char *module_name, int wait)
98 argv[3] = module_name; /* check free_modprobe_argv() */ 99 argv[3] = module_name; /* check free_modprobe_argv() */
99 argv[4] = NULL; 100 argv[4] = NULL;
100 101
101 return call_usermodehelper_fns(modprobe_path, argv, envp, 102 info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
102 wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); 103 NULL, free_modprobe_argv, NULL);
104 if (!info)
105 goto free_module_name;
106
107 return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
108
109free_module_name:
110 kfree(module_name);
103free_argv: 111free_argv:
104 kfree(argv); 112 kfree(argv);
105out: 113out:
@@ -502,14 +510,28 @@ static void helper_unlock(void)
502 * @argv: arg vector for process 510 * @argv: arg vector for process
503 * @envp: environment for process 511 * @envp: environment for process
504 * @gfp_mask: gfp mask for memory allocation 512 * @gfp_mask: gfp mask for memory allocation
513 * @cleanup: a cleanup function
514 * @init: an init function
515 * @data: arbitrary context sensitive data
505 * 516 *
506 * Returns either %NULL on allocation failure, or a subprocess_info 517 * Returns either %NULL on allocation failure, or a subprocess_info
507 * structure. This should be passed to call_usermodehelper_exec to 518 * structure. This should be passed to call_usermodehelper_exec to
508 * exec the process and free the structure. 519 * exec the process and free the structure.
520 *
521 * The init function is used to customize the helper process prior to
522 * exec. A non-zero return code causes the process to error out, exit,
523 * and return the failure to the calling process.
524 *
525 * The cleanup function is called just before the subprocess_info is about
526 * to be freed. This can be used for freeing the argv and envp. The
527 * function must be runnable in either a process context or the
528 * context in which call_usermodehelper_exec is called.
509 */ 529 */
510static
511struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, 530struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
512 char **envp, gfp_t gfp_mask) 531 char **envp, gfp_t gfp_mask,
532 int (*init)(struct subprocess_info *info, struct cred *new),
533 void (*cleanup)(struct subprocess_info *info),
534 void *data)
513{ 535{
514 struct subprocess_info *sub_info; 536 struct subprocess_info *sub_info;
515 sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); 537 sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
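
With init, cleanup and data now taken directly by call_usermodehelper_setup(), every caller follows the same two-step pattern that call_modprobe() uses above. A minimal hedged sketch; the argv handling and the my_* callback names are illustrative only:

#include <linux/kmod.h>
#include <linux/string.h>

static void my_umh_cleanup(struct subprocess_info *info)
{
	argv_free(info->argv);	/* e.g. when argv came from argv_split() */
}

static int run_my_helper(char **argv, char **envp)
{
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL,
					 NULL /* init */, my_umh_cleanup,
					 NULL /* data */);
	if (!info)
		return -ENOMEM;

	/* UMH_WAIT_PROC waits for the helper to exit and returns its status */
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}

Callers that need neither callback can instead use the plain call_usermodehelper() wrapper shown further down in this file.
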
@@ -520,50 +542,27 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
520 sub_info->path = path; 542 sub_info->path = path;
521 sub_info->argv = argv; 543 sub_info->argv = argv;
522 sub_info->envp = envp; 544 sub_info->envp = envp;
545
546 sub_info->cleanup = cleanup;
547 sub_info->init = init;
548 sub_info->data = data;
523 out: 549 out:
524 return sub_info; 550 return sub_info;
525} 551}
526 552EXPORT_SYMBOL(call_usermodehelper_setup);
527/**
528 * call_usermodehelper_setfns - set a cleanup/init function
529 * @info: a subprocess_info returned by call_usermodehelper_setup
530 * @cleanup: a cleanup function
531 * @init: an init function
532 * @data: arbitrary context sensitive data
533 *
534 * The init function is used to customize the helper process prior to
535 * exec. A non-zero return code causes the process to error out, exit,
536 * and return the failure to the calling process
537 *
538 * The cleanup function is just before ethe subprocess_info is about to
539 * be freed. This can be used for freeing the argv and envp. The
540 * Function must be runnable in either a process context or the
541 * context in which call_usermodehelper_exec is called.
542 */
543static
544void call_usermodehelper_setfns(struct subprocess_info *info,
545 int (*init)(struct subprocess_info *info, struct cred *new),
546 void (*cleanup)(struct subprocess_info *info),
547 void *data)
548{
549 info->cleanup = cleanup;
550 info->init = init;
551 info->data = data;
552}
553 553
554/** 554/**
555 * call_usermodehelper_exec - start a usermode application 555 * call_usermodehelper_exec - start a usermode application
 556 * @sub_info: information about the subprocess 556 * @sub_info: information about the subprocess
557 * @wait: wait for the application to finish and return status. 557 * @wait: wait for the application to finish and return status.
558 * when -1 don't wait at all, but you get no useful error back when 558 * when UMH_NO_WAIT don't wait at all, but you get no useful error back
559 * the program couldn't be exec'ed. This makes it safe to call 559 * when the program couldn't be exec'ed. This makes it safe to call
560 * from interrupt context. 560 * from interrupt context.
561 * 561 *
562 * Runs a user-space application. The application is started 562 * Runs a user-space application. The application is started
563 * asynchronously if wait is not set, and runs as a child of keventd. 563 * asynchronously if wait is not set, and runs as a child of keventd.
564 * (ie. it runs with full root capabilities). 564 * (ie. it runs with full root capabilities).
565 */ 565 */
566static
567int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) 566int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
568{ 567{
569 DECLARE_COMPLETION_ONSTACK(done); 568 DECLARE_COMPLETION_ONSTACK(done);
@@ -615,31 +614,34 @@ unlock:
615 helper_unlock(); 614 helper_unlock();
616 return retval; 615 return retval;
617} 616}
617EXPORT_SYMBOL(call_usermodehelper_exec);
618 618
619/* 619/**
620 * call_usermodehelper_fns() will not run the caller-provided cleanup function 620 * call_usermodehelper() - prepare and start a usermode application
621 * if a memory allocation failure is experienced. So the caller might need to 621 * @path: path to usermode executable
622 * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform 622 * @argv: arg vector for process
623 * the necessaary cleanup within the caller. 623 * @envp: environment for process
624 * @wait: wait for the application to finish and return status.
625 * when UMH_NO_WAIT don't wait at all, but you get no useful error back
626 * when the program couldn't be exec'ed. This makes it safe to call
627 * from interrupt context.
628 *
 629 * This function is equivalent to using call_usermodehelper_setup() and
630 * call_usermodehelper_exec().
624 */ 631 */
625int call_usermodehelper_fns( 632int call_usermodehelper(char *path, char **argv, char **envp, int wait)
626 char *path, char **argv, char **envp, int wait,
627 int (*init)(struct subprocess_info *info, struct cred *new),
628 void (*cleanup)(struct subprocess_info *), void *data)
629{ 633{
630 struct subprocess_info *info; 634 struct subprocess_info *info;
631 gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; 635 gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
632 636
633 info = call_usermodehelper_setup(path, argv, envp, gfp_mask); 637 info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
634 638 NULL, NULL, NULL);
635 if (info == NULL) 639 if (info == NULL)
636 return -ENOMEM; 640 return -ENOMEM;
637 641
638 call_usermodehelper_setfns(info, init, cleanup, data);
639
640 return call_usermodehelper_exec(info, wait); 642 return call_usermodehelper_exec(info, wait);
641} 643}
642EXPORT_SYMBOL(call_usermodehelper_fns); 644EXPORT_SYMBOL(call_usermodehelper);
643 645
644static int proc_cap_handler(struct ctl_table *table, int write, 646static int proc_cap_handler(struct ctl_table *table, int write,
645 void __user *buffer, size_t *lenp, loff_t *ppos) 647 void __user *buffer, size_t *lenp, loff_t *ppos)
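With call_usermodehelper_setup() and call_usermodehelper_exec() now exported and the init/cleanup/data hooks folded into setup, a former call_usermodehelper_fns() user converts roughly as sketched below. This is only an illustration of the API shown in the hunk above; the helper path, argv handling and the my_umh_cleanup() name are invented for the example.

/*
 * Illustrative sketch only: a hypothetical caller converted to the split
 * setup/exec API. Paths and argv contents are made up.
 */
#include <linux/errno.h>
#include <linux/kmod.h>
#include <linux/slab.h>

static void my_umh_cleanup(struct subprocess_info *info)
{
        kfree(info->argv);
}

static int run_hotplug_helper(char *agent_path, char **argv, char **envp)
{
        struct subprocess_info *info;

        info = call_usermodehelper_setup(agent_path, argv, envp, GFP_KERNEL,
                                         NULL, my_umh_cleanup, NULL);
        if (!info) {
                /* setup failed: the cleanup callback was never attached, free argv here */
                kfree(argv);
                return -ENOMEM;
        }

        /* exec consumes and frees info, invoking my_umh_cleanup on teardown */
        return call_usermodehelper_exec(info, UMH_WAIT_EXEC);
}

Note that, as in the call_modprobe() change above, the caller still has to free its arguments itself when setup fails, because the cleanup callback only runs once a subprocess_info has actually been allocated.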
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e35be53f6613..3fed7f0cbcdf 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -794,16 +794,16 @@ out:
794} 794}
795 795
796#ifdef CONFIG_SYSCTL 796#ifdef CONFIG_SYSCTL
797/* This should be called with kprobe_mutex locked */
798static void __kprobes optimize_all_kprobes(void) 797static void __kprobes optimize_all_kprobes(void)
799{ 798{
800 struct hlist_head *head; 799 struct hlist_head *head;
801 struct kprobe *p; 800 struct kprobe *p;
802 unsigned int i; 801 unsigned int i;
803 802
803 mutex_lock(&kprobe_mutex);
804 /* If optimization is already allowed, just return */ 804 /* If optimization is already allowed, just return */
805 if (kprobes_allow_optimization) 805 if (kprobes_allow_optimization)
806 return; 806 goto out;
807 807
808 kprobes_allow_optimization = true; 808 kprobes_allow_optimization = true;
809 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 809 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void)
813 optimize_kprobe(p); 813 optimize_kprobe(p);
814 } 814 }
815 printk(KERN_INFO "Kprobes globally optimized\n"); 815 printk(KERN_INFO "Kprobes globally optimized\n");
816out:
817 mutex_unlock(&kprobe_mutex);
816} 818}
817 819
818/* This should be called with kprobe_mutex locked */
819static void __kprobes unoptimize_all_kprobes(void) 820static void __kprobes unoptimize_all_kprobes(void)
820{ 821{
821 struct hlist_head *head; 822 struct hlist_head *head;
822 struct kprobe *p; 823 struct kprobe *p;
823 unsigned int i; 824 unsigned int i;
824 825
826 mutex_lock(&kprobe_mutex);
825 /* If optimization is already prohibited, just return */ 827 /* If optimization is already prohibited, just return */
826 if (!kprobes_allow_optimization) 828 if (!kprobes_allow_optimization) {
829 mutex_unlock(&kprobe_mutex);
827 return; 830 return;
831 }
828 832
829 kprobes_allow_optimization = false; 833 kprobes_allow_optimization = false;
830 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 834 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void)
834 unoptimize_kprobe(p, false); 838 unoptimize_kprobe(p, false);
835 } 839 }
836 } 840 }
841 mutex_unlock(&kprobe_mutex);
842
837 /* Wait for unoptimizing completion */ 843 /* Wait for unoptimizing completion */
838 wait_for_kprobe_optimizer(); 844 wait_for_kprobe_optimizer();
839 printk(KERN_INFO "Kprobes globally unoptimized\n"); 845 printk(KERN_INFO "Kprobes globally unoptimized\n");
840} 846}
841 847
848static DEFINE_MUTEX(kprobe_sysctl_mutex);
842int sysctl_kprobes_optimization; 849int sysctl_kprobes_optimization;
843int proc_kprobes_optimization_handler(struct ctl_table *table, int write, 850int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
844 void __user *buffer, size_t *length, 851 void __user *buffer, size_t *length,
@@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
846{ 853{
847 int ret; 854 int ret;
848 855
849 mutex_lock(&kprobe_mutex); 856 mutex_lock(&kprobe_sysctl_mutex);
850 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; 857 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
851 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 858 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
852 859
@@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
854 optimize_all_kprobes(); 861 optimize_all_kprobes();
855 else 862 else
856 unoptimize_all_kprobes(); 863 unoptimize_all_kprobes();
857 mutex_unlock(&kprobe_mutex); 864 mutex_unlock(&kprobe_sysctl_mutex);
858 865
859 return ret; 866 return ret;
860} 867}
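The kprobes hunk above moves kprobe_mutex acquisition into optimize_all_kprobes()/unoptimize_all_kprobes() and serializes the proc handler with a separate kprobe_sysctl_mutex. A minimal sketch of that locking shape, with invented names, might look like this:

/*
 * Minimal sketch (hypothetical names, not kprobes code): the proc handler
 * serializes concurrent writers with its own mutex, while the enable/disable
 * helper takes the subsystem mutex internally so other call sites stay safe
 * without the caller holding it.
 */
#include <linux/mutex.h>
#include <linux/sysctl.h>

static DEFINE_MUTEX(feature_mutex);        /* protects feature state */
static DEFINE_MUTEX(feature_sysctl_mutex); /* serializes sysctl writers */
static bool feature_enabled;
static int sysctl_feature;                 /* assumed to be the ctl_table's ->data */

static void feature_set(bool enable)
{
        mutex_lock(&feature_mutex);
        feature_enabled = enable;
        mutex_unlock(&feature_mutex);
}

static int feature_sysctl_handler(struct ctl_table *table, int write,
                                  void __user *buffer, size_t *lenp,
                                  loff_t *ppos)
{
        int ret;

        mutex_lock(&feature_sysctl_mutex);
        sysctl_feature = feature_enabled ? 1 : 0;
        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (!ret && write)
                feature_set(sysctl_feature != 0);
        mutex_unlock(&feature_sysctl_mutex);

        return ret;
}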
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 691dc2ef9baf..760e86df8c20 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -17,6 +17,7 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
19#include <linux/ptrace.h> 19#include <linux/ptrace.h>
20#include <linux/uaccess.h>
20#include <trace/events/sched.h> 21#include <trace/events/sched.h>
21 22
22static DEFINE_SPINLOCK(kthread_create_lock); 23static DEFINE_SPINLOCK(kthread_create_lock);
@@ -52,8 +53,21 @@ enum KTHREAD_BITS {
52 KTHREAD_IS_PARKED, 53 KTHREAD_IS_PARKED,
53}; 54};
54 55
55#define to_kthread(tsk) \ 56#define __to_kthread(vfork) \
56 container_of((tsk)->vfork_done, struct kthread, exited) 57 container_of(vfork, struct kthread, exited)
58
59static inline struct kthread *to_kthread(struct task_struct *k)
60{
61 return __to_kthread(k->vfork_done);
62}
63
64static struct kthread *to_live_kthread(struct task_struct *k)
65{
66 struct completion *vfork = ACCESS_ONCE(k->vfork_done);
67 if (likely(vfork))
68 return __to_kthread(vfork);
69 return NULL;
70}
57 71
58/** 72/**
59 * kthread_should_stop - should this kthread return now? 73 * kthread_should_stop - should this kthread return now?
@@ -122,14 +136,32 @@ void *kthread_data(struct task_struct *task)
122 return to_kthread(task)->data; 136 return to_kthread(task)->data;
123} 137}
124 138
139/**
140 * probe_kthread_data - speculative version of kthread_data()
141 * @task: possible kthread task in question
142 *
143 * @task could be a kthread task. Return the data value specified when it
144 * was created if accessible. If @task isn't a kthread task or its data is
145 * inaccessible for any reason, %NULL is returned. This function requires
146 * that @task itself is safe to dereference.
147 */
148void *probe_kthread_data(struct task_struct *task)
149{
150 struct kthread *kthread = to_kthread(task);
151 void *data = NULL;
152
153 probe_kernel_read(&data, &kthread->data, sizeof(data));
154 return data;
155}
156
125static void __kthread_parkme(struct kthread *self) 157static void __kthread_parkme(struct kthread *self)
126{ 158{
127 __set_current_state(TASK_INTERRUPTIBLE); 159 __set_current_state(TASK_PARKED);
128 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { 160 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
129 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) 161 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
130 complete(&self->parked); 162 complete(&self->parked);
131 schedule(); 163 schedule();
132 __set_current_state(TASK_INTERRUPTIBLE); 164 __set_current_state(TASK_PARKED);
133 } 165 }
134 clear_bit(KTHREAD_IS_PARKED, &self->flags); 166 clear_bit(KTHREAD_IS_PARKED, &self->flags);
135 __set_current_state(TASK_RUNNING); 167 __set_current_state(TASK_RUNNING);
@@ -256,11 +288,16 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
256} 288}
257EXPORT_SYMBOL(kthread_create_on_node); 289EXPORT_SYMBOL(kthread_create_on_node);
258 290
259static void __kthread_bind(struct task_struct *p, unsigned int cpu) 291static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
260{ 292{
293 /* Must have done schedule() in kthread() before we set_task_cpu */
294 if (!wait_task_inactive(p, state)) {
295 WARN_ON(1);
296 return;
297 }
261 /* It's safe because the task is inactive. */ 298 /* It's safe because the task is inactive. */
262 do_set_cpus_allowed(p, cpumask_of(cpu)); 299 do_set_cpus_allowed(p, cpumask_of(cpu));
263 p->flags |= PF_THREAD_BOUND; 300 p->flags |= PF_NO_SETAFFINITY;
264} 301}
265 302
266/** 303/**
@@ -274,12 +311,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)
274 */ 311 */
275void kthread_bind(struct task_struct *p, unsigned int cpu) 312void kthread_bind(struct task_struct *p, unsigned int cpu)
276{ 313{
277 /* Must have done schedule() in kthread() before we set_task_cpu */ 314 __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
278 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
279 WARN_ON(1);
280 return;
281 }
282 __kthread_bind(p, cpu);
283} 315}
284EXPORT_SYMBOL(kthread_bind); 316EXPORT_SYMBOL(kthread_bind);
285 317
@@ -311,17 +343,20 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
311 return p; 343 return p;
312} 344}
313 345
314static struct kthread *task_get_live_kthread(struct task_struct *k) 346static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
315{ 347{
316 struct kthread *kthread; 348 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
317 349 /*
318 get_task_struct(k); 350 * We clear the IS_PARKED bit here as we don't wait
319 kthread = to_kthread(k); 351 * until the task has left the park code. So if we'd
320 /* It might have exited */ 352 * park before that happens we'd see the IS_PARKED bit
321 barrier(); 353 * which might be about to be cleared.
322 if (k->vfork_done != NULL) 354 */
323 return kthread; 355 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
324 return NULL; 356 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
357 __kthread_bind(k, kthread->cpu, TASK_PARKED);
358 wake_up_state(k, TASK_PARKED);
359 }
325} 360}
326 361
327/** 362/**
@@ -334,23 +369,10 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)
334 */ 369 */
335void kthread_unpark(struct task_struct *k) 370void kthread_unpark(struct task_struct *k)
336{ 371{
337 struct kthread *kthread = task_get_live_kthread(k); 372 struct kthread *kthread = to_live_kthread(k);
338 373
339 if (kthread) { 374 if (kthread)
340 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 375 __kthread_unpark(k, kthread);
341 /*
342 * We clear the IS_PARKED bit here as we don't wait
343 * until the task has left the park code. So if we'd
344 * park before that happens we'd see the IS_PARKED bit
345 * which might be about to be cleared.
346 */
347 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
348 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
349 __kthread_bind(k, kthread->cpu);
350 wake_up_process(k);
351 }
352 }
353 put_task_struct(k);
354} 376}
355 377
356/** 378/**
@@ -367,7 +389,7 @@ void kthread_unpark(struct task_struct *k)
367 */ 389 */
368int kthread_park(struct task_struct *k) 390int kthread_park(struct task_struct *k)
369{ 391{
370 struct kthread *kthread = task_get_live_kthread(k); 392 struct kthread *kthread = to_live_kthread(k);
371 int ret = -ENOSYS; 393 int ret = -ENOSYS;
372 394
373 if (kthread) { 395 if (kthread) {
@@ -380,7 +402,6 @@ int kthread_park(struct task_struct *k)
380 } 402 }
381 ret = 0; 403 ret = 0;
382 } 404 }
383 put_task_struct(k);
384 return ret; 405 return ret;
385} 406}
386 407
@@ -401,21 +422,23 @@ int kthread_park(struct task_struct *k)
401 */ 422 */
402int kthread_stop(struct task_struct *k) 423int kthread_stop(struct task_struct *k)
403{ 424{
404 struct kthread *kthread = task_get_live_kthread(k); 425 struct kthread *kthread;
405 int ret; 426 int ret;
406 427
407 trace_sched_kthread_stop(k); 428 trace_sched_kthread_stop(k);
429
430 get_task_struct(k);
431 kthread = to_live_kthread(k);
408 if (kthread) { 432 if (kthread) {
409 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); 433 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
410 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 434 __kthread_unpark(k, kthread);
411 wake_up_process(k); 435 wake_up_process(k);
412 wait_for_completion(&kthread->exited); 436 wait_for_completion(&kthread->exited);
413 } 437 }
414 ret = k->exit_code; 438 ret = k->exit_code;
415
416 put_task_struct(k); 439 put_task_struct(k);
417 trace_sched_kthread_stop_ret(ret);
418 440
441 trace_sched_kthread_stop_ret(ret);
419 return ret; 442 return ret;
420} 443}
421EXPORT_SYMBOL(kthread_stop); 444EXPORT_SYMBOL(kthread_stop);
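With parked kthreads now sleeping in TASK_PARKED and kthread_stop() unparking a parked thread before stopping it, a thread function only needs the usual should-stop/should-park loop. A hedged sketch, where the per-iteration work is a placeholder:

/*
 * Sketch of a parkable kthread main loop (do_work() is hypothetical):
 * kthread_parkme() puts the thread into the TASK_PARKED sleep used above,
 * and the reworked kthread_unpark()/kthread_stop() wake it from that state.
 */
#include <linux/delay.h>
#include <linux/kthread.h>

static int my_worker_fn(void *data)
{
        while (!kthread_should_stop()) {
                if (kthread_should_park()) {
                        kthread_parkme();       /* sleeps in TASK_PARKED */
                        continue;
                }
                /* do_work(data); -- placeholder for the real payload */
                msleep_interruptible(100);
        }
        return 0;
}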
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 259db207b5d9..6a3bccba7e7d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -380,6 +380,13 @@ static int verbose(struct lock_class *class)
380unsigned long nr_stack_trace_entries; 380unsigned long nr_stack_trace_entries;
381static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; 381static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
382 382
383static void print_lockdep_off(const char *bug_msg)
384{
385 printk(KERN_DEBUG "%s\n", bug_msg);
386 printk(KERN_DEBUG "turning off the locking correctness validator.\n");
387 printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
388}
389
383static int save_trace(struct stack_trace *trace) 390static int save_trace(struct stack_trace *trace)
384{ 391{
385 trace->nr_entries = 0; 392 trace->nr_entries = 0;
@@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace)
409 if (!debug_locks_off_graph_unlock()) 416 if (!debug_locks_off_graph_unlock())
410 return 0; 417 return 0;
411 418
412 printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); 419 print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
413 printk("turning off the locking correctness validator.\n");
414 dump_stack(); 420 dump_stack();
415 421
416 return 0; 422 return 0;
@@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
763 } 769 }
764 raw_local_irq_restore(flags); 770 raw_local_irq_restore(flags);
765 771
766 printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); 772 print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
767 printk("turning off the locking correctness validator.\n");
768 dump_stack(); 773 dump_stack();
769 return NULL; 774 return NULL;
770 } 775 }
@@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void)
834 if (!debug_locks_off_graph_unlock()) 839 if (!debug_locks_off_graph_unlock())
835 return NULL; 840 return NULL;
836 841
837 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); 842 print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
838 printk("turning off the locking correctness validator.\n");
839 dump_stack(); 843 dump_stack();
840 return NULL; 844 return NULL;
841 } 845 }
@@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
2000 struct lock_class *class = hlock_class(hlock); 2004 struct lock_class *class = hlock_class(hlock);
2001 struct list_head *hash_head = chainhashentry(chain_key); 2005 struct list_head *hash_head = chainhashentry(chain_key);
2002 struct lock_chain *chain; 2006 struct lock_chain *chain;
2003 struct held_lock *hlock_curr, *hlock_next; 2007 struct held_lock *hlock_curr;
2004 int i, j; 2008 int i, j;
2005 2009
2006 /* 2010 /*
@@ -2048,8 +2052,7 @@ cache_hit:
2048 if (!debug_locks_off_graph_unlock()) 2052 if (!debug_locks_off_graph_unlock())
2049 return 0; 2053 return 0;
2050 2054
2051 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); 2055 print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
2052 printk("turning off the locking correctness validator.\n");
2053 dump_stack(); 2056 dump_stack();
2054 return 0; 2057 return 0;
2055 } 2058 }
@@ -2057,12 +2060,10 @@ cache_hit:
2057 chain->chain_key = chain_key; 2060 chain->chain_key = chain_key;
2058 chain->irq_context = hlock->irq_context; 2061 chain->irq_context = hlock->irq_context;
2059 /* Find the first held_lock of current chain */ 2062 /* Find the first held_lock of current chain */
2060 hlock_next = hlock;
2061 for (i = curr->lockdep_depth - 1; i >= 0; i--) { 2063 for (i = curr->lockdep_depth - 1; i >= 0; i--) {
2062 hlock_curr = curr->held_locks + i; 2064 hlock_curr = curr->held_locks + i;
2063 if (hlock_curr->irq_context != hlock_next->irq_context) 2065 if (hlock_curr->irq_context != hlock->irq_context)
2064 break; 2066 break;
2065 hlock_next = hlock;
2066 } 2067 }
2067 i++; 2068 i++;
2068 chain->depth = curr->lockdep_depth + 1 - i; 2069 chain->depth = curr->lockdep_depth + 1 - i;
@@ -3190,9 +3191,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3190#endif 3191#endif
3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { 3192 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
3192 debug_locks_off(); 3193 debug_locks_off();
3193 printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", 3194 print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
3195 printk(KERN_DEBUG "depth: %i max: %lu!\n",
3194 curr->lockdep_depth, MAX_LOCK_DEPTH); 3196 curr->lockdep_depth, MAX_LOCK_DEPTH);
3195 printk("turning off the locking correctness validator.\n");
3196 3197
3197 lockdep_print_held_locks(current); 3198 lockdep_print_held_locks(current);
3198 debug_show_all_locks(); 3199 debug_show_all_locks();
@@ -4088,7 +4089,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
4088} 4089}
4089EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); 4090EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
4090 4091
4091static void print_held_locks_bug(void) 4092static void print_held_locks_bug(struct task_struct *curr)
4092{ 4093{
4093 if (!debug_locks_off()) 4094 if (!debug_locks_off())
4094 return; 4095 return;
@@ -4097,21 +4098,22 @@ static void print_held_locks_bug(void)
4097 4098
4098 printk("\n"); 4099 printk("\n");
4099 printk("=====================================\n"); 4100 printk("=====================================\n");
4100 printk("[ BUG: %s/%d still has locks held! ]\n", 4101 printk("[ BUG: lock held at task exit time! ]\n");
4101 current->comm, task_pid_nr(current));
4102 print_kernel_ident(); 4102 print_kernel_ident();
4103 printk("-------------------------------------\n"); 4103 printk("-------------------------------------\n");
4104 lockdep_print_held_locks(current); 4104 printk("%s/%d is exiting with locks still held!\n",
4105 curr->comm, task_pid_nr(curr));
4106 lockdep_print_held_locks(curr);
4107
4105 printk("\nstack backtrace:\n"); 4108 printk("\nstack backtrace:\n");
4106 dump_stack(); 4109 dump_stack();
4107} 4110}
4108 4111
4109void debug_check_no_locks_held(void) 4112void debug_check_no_locks_held(struct task_struct *task)
4110{ 4113{
4111 if (unlikely(current->lockdep_depth > 0)) 4114 if (unlikely(task->lockdep_depth > 0))
4112 print_held_locks_bug(); 4115 print_held_locks_bug(task);
4113} 4116}
4114EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
4115 4117
4116void debug_show_all_locks(void) 4118void debug_show_all_locks(void)
4117{ 4119{
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
index 246b4c6e6135..4a9a86d12c8b 100644
--- a/kernel/modsign_certificate.S
+++ b/kernel/modsign_certificate.S
@@ -1,15 +1,8 @@
1/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ 1#include <linux/export.h>
2#ifndef SYMBOL_PREFIX
3#define ASM_SYMBOL(sym) sym
4#else
5#define PASTE2(x,y) x##y
6#define PASTE(x,y) PASTE2(x,y)
7#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
8#endif
9 2
10#define GLOBAL(name) \ 3#define GLOBAL(name) \
11 .globl ASM_SYMBOL(name); \ 4 .globl VMLINUX_SYMBOL(name); \
12 ASM_SYMBOL(name): 5 VMLINUX_SYMBOL(name):
13 6
14 .section ".init.data","aw" 7 .section ".init.data","aw"
15 8
diff --git a/kernel/module.c b/kernel/module.c
index 0925c9a71975..b049939177f6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1209,10 +1209,11 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1209 1209
1210 /* Since this should be found in kernel (which can't be removed), 1210 /* Since this should be found in kernel (which can't be removed),
1211 * no locking is necessary. */ 1211 * no locking is necessary. */
1212 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 1212 if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL,
1213 &crc, true, false)) 1213 &crc, true, false))
1214 BUG(); 1214 BUG();
1215 return check_version(sechdrs, versindex, "module_layout", mod, crc, 1215 return check_version(sechdrs, versindex,
1216 VMLINUX_SYMBOL_STR(module_layout), mod, crc,
1216 NULL); 1217 NULL);
1217} 1218}
1218 1219
@@ -1861,12 +1862,12 @@ static void free_module(struct module *mod)
1861{ 1862{
1862 trace_module_free(mod); 1863 trace_module_free(mod);
1863 1864
1864 /* Delete from various lists */
1865 mutex_lock(&module_mutex);
1866 stop_machine(__unlink_module, mod, NULL);
1867 mutex_unlock(&module_mutex);
1868 mod_sysfs_teardown(mod); 1865 mod_sysfs_teardown(mod);
1869 1866
 1867 /* We leave it in the list to prevent duplicate loads, but make sure
 1868 * that no one uses it while it's being deconstructed. */
1869 mod->state = MODULE_STATE_UNFORMED;
1870
1870 /* Remove dynamic debug info */ 1871 /* Remove dynamic debug info */
1871 ddebug_remove_module(mod->name); 1872 ddebug_remove_module(mod->name);
1872 1873
@@ -1879,6 +1880,11 @@ static void free_module(struct module *mod)
1879 /* Free any allocated parameters. */ 1880 /* Free any allocated parameters. */
1880 destroy_params(mod->kp, mod->num_kp); 1881 destroy_params(mod->kp, mod->num_kp);
1881 1882
1883 /* Now we can delete it from the lists */
1884 mutex_lock(&module_mutex);
1885 stop_machine(__unlink_module, mod, NULL);
1886 mutex_unlock(&module_mutex);
1887
1882 /* This may be NULL, but that's OK */ 1888 /* This may be NULL, but that's OK */
1883 unset_module_init_ro_nx(mod); 1889 unset_module_init_ro_nx(mod);
1884 module_free(mod, mod->module_init); 1890 module_free(mod, mod->module_init);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 52f23011b6e0..ad53a664f113 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -37,6 +37,12 @@
37# include <asm/mutex.h> 37# include <asm/mutex.h>
38#endif 38#endif
39 39
40/*
41 * A negative mutex count indicates that waiters are sleeping waiting for the
42 * mutex.
43 */
44#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
45
40void 46void
41__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) 47__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
42{ 48{
@@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
44 spin_lock_init(&lock->wait_lock); 50 spin_lock_init(&lock->wait_lock);
45 INIT_LIST_HEAD(&lock->wait_list); 51 INIT_LIST_HEAD(&lock->wait_list);
46 mutex_clear_owner(lock); 52 mutex_clear_owner(lock);
53#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
54 lock->spin_mlock = NULL;
55#endif
47 56
48 debug_mutex_init(lock, name, key); 57 debug_mutex_init(lock, name, key);
49} 58}
@@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock)
95EXPORT_SYMBOL(mutex_lock); 104EXPORT_SYMBOL(mutex_lock);
96#endif 105#endif
97 106
107#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
108/*
 109 * In order to avoid a stampede of mutex spinners all trying to acquire the
 110 * mutex more or less simultaneously, the spinners need to acquire an MCS
 111 * lock first before spinning on the owner field.
112 *
113 * We don't inline mspin_lock() so that perf can correctly account for the
114 * time spent in this lock function.
115 */
116struct mspin_node {
 117 struct mspin_node *next;
118 int locked; /* 1 if lock acquired */
119};
120#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
121
122static noinline
123void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
124{
125 struct mspin_node *prev;
126
127 /* Init node */
128 node->locked = 0;
129 node->next = NULL;
130
131 prev = xchg(lock, node);
132 if (likely(prev == NULL)) {
133 /* Lock acquired */
134 node->locked = 1;
135 return;
136 }
137 ACCESS_ONCE(prev->next) = node;
138 smp_wmb();
139 /* Wait until the lock holder passes the lock down */
140 while (!ACCESS_ONCE(node->locked))
141 arch_mutex_cpu_relax();
142}
143
144static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
145{
146 struct mspin_node *next = ACCESS_ONCE(node->next);
147
148 if (likely(!next)) {
149 /*
150 * Release the lock by setting it to NULL
151 */
152 if (cmpxchg(lock, node, NULL) == node)
153 return;
154 /* Wait until the next pointer is set */
155 while (!(next = ACCESS_ONCE(node->next)))
156 arch_mutex_cpu_relax();
157 }
158 ACCESS_ONCE(next->locked) = 1;
159 smp_wmb();
160}
161
162/*
163 * Mutex spinning code migrated from kernel/sched/core.c
164 */
165
166static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
167{
168 if (lock->owner != owner)
169 return false;
170
171 /*
172 * Ensure we emit the owner->on_cpu, dereference _after_ checking
173 * lock->owner still matches owner, if that fails, owner might
174 * point to free()d memory, if it still matches, the rcu_read_lock()
175 * ensures the memory stays valid.
176 */
177 barrier();
178
179 return owner->on_cpu;
180}
181
182/*
183 * Look out! "owner" is an entirely speculative pointer
184 * access and not reliable.
185 */
186static noinline
187int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
188{
189 rcu_read_lock();
190 while (owner_running(lock, owner)) {
191 if (need_resched())
192 break;
193
194 arch_mutex_cpu_relax();
195 }
196 rcu_read_unlock();
197
198 /*
 199 * We break out of the loop above on need_resched() and when the
 200 * owner changes, which is a sign of heavy contention. Return
201 * success only when lock->owner is NULL.
202 */
203 return lock->owner == NULL;
204}
205
206/*
207 * Initial check for entering the mutex spinning loop
208 */
209static inline int mutex_can_spin_on_owner(struct mutex *lock)
210{
211 int retval = 1;
212
213 rcu_read_lock();
214 if (lock->owner)
215 retval = lock->owner->on_cpu;
216 rcu_read_unlock();
217 /*
 218 * If lock->owner is not set, the owner may have just acquired the mutex
 219 * and not set the owner field yet, or the mutex may have been released.
220 */
221 return retval;
222}
223#endif
224
98static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 225static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
99 226
100/** 227/**
@@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
158 * 285 *
159 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock 286 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
160 * to serialize everything. 287 * to serialize everything.
288 *
 289 * The mutex spinners are queued up using an MCS lock so that only one
290 * spinner can compete for the mutex. However, if mutex spinning isn't
291 * going to happen, there is no point in going through the lock/unlock
292 * overhead.
161 */ 293 */
294 if (!mutex_can_spin_on_owner(lock))
295 goto slowpath;
162 296
163 for (;;) { 297 for (;;) {
164 struct task_struct *owner; 298 struct task_struct *owner;
299 struct mspin_node node;
165 300
166 /* 301 /*
167 * If there's an owner, wait for it to either 302 * If there's an owner, wait for it to either
168 * release the lock or go to sleep. 303 * release the lock or go to sleep.
169 */ 304 */
305 mspin_lock(MLOCK(lock), &node);
170 owner = ACCESS_ONCE(lock->owner); 306 owner = ACCESS_ONCE(lock->owner);
171 if (owner && !mutex_spin_on_owner(lock, owner)) 307 if (owner && !mutex_spin_on_owner(lock, owner)) {
308 mspin_unlock(MLOCK(lock), &node);
172 break; 309 break;
310 }
173 311
174 if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { 312 if ((atomic_read(&lock->count) == 1) &&
313 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
175 lock_acquired(&lock->dep_map, ip); 314 lock_acquired(&lock->dep_map, ip);
176 mutex_set_owner(lock); 315 mutex_set_owner(lock);
316 mspin_unlock(MLOCK(lock), &node);
177 preempt_enable(); 317 preempt_enable();
178 return 0; 318 return 0;
179 } 319 }
320 mspin_unlock(MLOCK(lock), &node);
180 321
181 /* 322 /*
182 * When there's no owner, we might have preempted between the 323 * When there's no owner, we might have preempted between the
@@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
195 */ 336 */
196 arch_mutex_cpu_relax(); 337 arch_mutex_cpu_relax();
197 } 338 }
339slowpath:
198#endif 340#endif
199 spin_lock_mutex(&lock->wait_lock, flags); 341 spin_lock_mutex(&lock->wait_lock, flags);
200 342
@@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
205 list_add_tail(&waiter.list, &lock->wait_list); 347 list_add_tail(&waiter.list, &lock->wait_list);
206 waiter.task = task; 348 waiter.task = task;
207 349
208 if (atomic_xchg(&lock->count, -1) == 1) 350 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
209 goto done; 351 goto done;
210 352
211 lock_contended(&lock->dep_map, ip); 353 lock_contended(&lock->dep_map, ip);
@@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
220 * that when we release the lock, we properly wake up the 362 * that when we release the lock, we properly wake up the
221 * other waiters: 363 * other waiters:
222 */ 364 */
223 if (atomic_xchg(&lock->count, -1) == 1) 365 if (MUTEX_SHOW_NO_WAITER(lock) &&
366 (atomic_xchg(&lock->count, -1) == 1))
224 break; 367 break;
225 368
226 /* 369 /*
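The MUTEX_SHOW_NO_WAITER() and atomic_read()-before-cmpxchg checks added above exist to avoid dirtying the lock's cacheline when the mutex clearly cannot be taken. The same idea on a toy atomic_t lock, purely for illustration:

/*
 * Illustrative only: "read before RMW" on a toy lock. The plain
 * atomic_read() is a cheap shared-cacheline probe; the cmpxchg (which
 * dirties the line) is attempted only when it can succeed, which is what
 * the count==1 and MUTEX_SHOW_NO_WAITER() checks buy in the hunk above.
 */
#include <linux/atomic.h>
#include <linux/types.h>

static inline bool toy_trylock(atomic_t *lock)
{
        /* 1 == unlocked, 0 == locked (mirrors the mutex fast path) */
        if (atomic_read(lock) != 1)
                return false;
        return atomic_cmpxchg(lock, 1, 0) == 1;
}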
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index afc0456f227a..364ceab15f0c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,7 +22,7 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h> 24#include <linux/ipc_namespace.h>
25#include <linux/proc_fs.h> 25#include <linux/proc_ns.h>
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/syscalls.h> 27#include <linux/syscalls.h>
28 28
@@ -241,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
241 const struct proc_ns_operations *ops; 241 const struct proc_ns_operations *ops;
242 struct task_struct *tsk = current; 242 struct task_struct *tsk = current;
243 struct nsproxy *new_nsproxy; 243 struct nsproxy *new_nsproxy;
244 struct proc_inode *ei; 244 struct proc_ns *ei;
245 struct file *file; 245 struct file *file;
246 int err; 246 int err;
247 247
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
250 return PTR_ERR(file); 250 return PTR_ERR(file);
251 251
252 err = -EINVAL; 252 err = -EINVAL;
253 ei = PROC_I(file_inode(file)); 253 ei = get_proc_ns(file_inode(file));
254 ops = ei->ns_ops; 254 ops = ei->ns_ops;
255 if (nstype && (ops->type != nstype)) 255 if (nstype && (ops->type != nstype))
256 goto out; 256 goto out;
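The setns() path above now fetches the namespace operations through get_proc_ns() instead of poking at the proc inode directly; the user-visible syscall is unchanged. For reference, a minimal userspace caller (the target PID and namespace type are arbitrary, and CAP_SYS_ADMIN is required):

/* Userspace view of the setns() path patched above. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/1/ns/uts", O_RDONLY);      /* any /proc/<pid>/ns/* file */

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (setns(fd, CLONE_NEWUTS) < 0) {              /* 0 would accept any ns type */
                perror("setns");
                close(fd);
                return 1;
        }
        close(fd);
        /* now running in the target task's UTS namespace */
        return 0;
}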
diff --git a/kernel/panic.c b/kernel/panic.c
index 7c57cc9eee2c..167ec097ce8b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -22,7 +22,6 @@
22#include <linux/sysrq.h> 22#include <linux/sysrq.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/nmi.h> 24#include <linux/nmi.h>
25#include <linux/dmi.h>
26 25
27#define PANIC_TIMER_STEP 100 26#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 27#define PANIC_BLINK_SPD 18
@@ -400,13 +399,8 @@ struct slowpath_args {
400static void warn_slowpath_common(const char *file, int line, void *caller, 399static void warn_slowpath_common(const char *file, int line, void *caller,
401 unsigned taint, struct slowpath_args *args) 400 unsigned taint, struct slowpath_args *args)
402{ 401{
403 const char *board;
404
405 printk(KERN_WARNING "------------[ cut here ]------------\n"); 402 printk(KERN_WARNING "------------[ cut here ]------------\n");
406 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); 403 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller);
407 board = dmi_get_system_info(DMI_PRODUCT_NAME);
408 if (board)
409 printk(KERN_WARNING "Hardware name: %s\n", board);
410 404
411 if (args) 405 if (args)
412 vprintk(args->fmt, args->args); 406 vprintk(args->fmt, args->args);
diff --git a/kernel/pid.c b/kernel/pid.c
index 047dc6264638..0db3e791a06d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_ns.h>
39#include <linux/proc_fs.h> 40#include <linux/proc_fs.h>
40 41
41#define pid_hashfn(nr, ns) \ 42#define pid_hashfn(nr, ns) \
@@ -51,9 +52,6 @@ int pid_max = PID_MAX_DEFAULT;
51int pid_max_min = RESERVED_PIDS + 1; 52int pid_max_min = RESERVED_PIDS + 1;
52int pid_max_max = PID_MAX_LIMIT; 53int pid_max_max = PID_MAX_LIMIT;
53 54
54#define BITS_PER_PAGE (PAGE_SIZE*8)
55#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
56
57static inline int mk_pid(struct pid_namespace *pid_ns, 55static inline int mk_pid(struct pid_namespace *pid_ns,
58 struct pidmap *map, int off) 56 struct pidmap *map, int off)
59{ 57{
@@ -183,15 +181,19 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
183 break; 181 break;
184 } 182 }
185 if (likely(atomic_read(&map->nr_free))) { 183 if (likely(atomic_read(&map->nr_free))) {
186 do { 184 for ( ; ; ) {
187 if (!test_and_set_bit(offset, map->page)) { 185 if (!test_and_set_bit(offset, map->page)) {
188 atomic_dec(&map->nr_free); 186 atomic_dec(&map->nr_free);
189 set_last_pid(pid_ns, last, pid); 187 set_last_pid(pid_ns, last, pid);
190 return pid; 188 return pid;
191 } 189 }
192 offset = find_next_offset(map, offset); 190 offset = find_next_offset(map, offset);
191 if (offset >= BITS_PER_PAGE)
192 break;
193 pid = mk_pid(pid_ns, map, offset); 193 pid = mk_pid(pid_ns, map, offset);
194 } while (offset < BITS_PER_PAGE && pid < pid_max); 194 if (pid >= pid_max)
195 break;
196 }
195 } 197 }
196 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 198 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
197 ++map; 199 ++map;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index c1c3dc1c6023..6917e8edb48e 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -15,12 +15,10 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/acct.h> 16#include <linux/acct.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/proc_fs.h> 18#include <linux/proc_ns.h>
19#include <linux/reboot.h> 19#include <linux/reboot.h>
20#include <linux/export.h> 20#include <linux/export.h>
21 21
22#define BITS_PER_PAGE (PAGE_SIZE*8)
23
24struct pid_cache { 22struct pid_cache {
25 int nr_ids; 23 int nr_ids;
26 char name[16]; 24 char name[16];
@@ -181,6 +179,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
181 int nr; 179 int nr;
182 int rc; 180 int rc;
183 struct task_struct *task, *me = current; 181 struct task_struct *task, *me = current;
182 int init_pids = thread_group_leader(me) ? 1 : 2;
184 183
185 /* Don't allow any more processes into the pid namespace */ 184 /* Don't allow any more processes into the pid namespace */
186 disable_pid_allocation(pid_ns); 185 disable_pid_allocation(pid_ns);
@@ -230,7 +229,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
230 */ 229 */
231 for (;;) { 230 for (;;) {
232 set_current_state(TASK_UNINTERRUPTIBLE); 231 set_current_state(TASK_UNINTERRUPTIBLE);
233 if (pid_ns->nr_hashed == 1) 232 if (pid_ns->nr_hashed == init_pids)
234 break; 233 break;
235 schedule(); 234 schedule();
236 } 235 }
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 8fd709c9bb58..42670e9b44e0 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -10,6 +10,8 @@
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12#include <linux/random.h> 12#include <linux/random.h>
13#include <linux/tick.h>
14#include <linux/workqueue.h>
13 15
14/* 16/*
15 * Called after updating RLIMIT_CPU to run cpu timer and update 17 * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -153,6 +155,21 @@ static void bump_cpu_timer(struct k_itimer *timer,
153 } 155 }
154} 156}
155 157
158/**
159 * task_cputime_zero - Check a task_cputime struct for all zero fields.
160 *
161 * @cputime: The struct to compare.
162 *
163 * Checks @cputime to see if all fields are zero. Returns true if all fields
164 * are zero, false if any field is nonzero.
165 */
166static inline int task_cputime_zero(const struct task_cputime *cputime)
167{
168 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
169 return 1;
170 return 0;
171}
172
156static inline cputime_t prof_ticks(struct task_struct *p) 173static inline cputime_t prof_ticks(struct task_struct *p)
157{ 174{
158 cputime_t utime, stime; 175 cputime_t utime, stime;
@@ -636,6 +653,37 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
636 return 0; 653 return 0;
637} 654}
638 655
656#ifdef CONFIG_NO_HZ_FULL
657static void nohz_kick_work_fn(struct work_struct *work)
658{
659 tick_nohz_full_kick_all();
660}
661
662static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
663
664/*
665 * We need the IPIs to be sent from sane process context.
666 * The posix cpu timers are always set with irqs disabled.
667 */
668static void posix_cpu_timer_kick_nohz(void)
669{
670 schedule_work(&nohz_kick_work);
671}
672
673bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
674{
675 if (!task_cputime_zero(&tsk->cputime_expires))
676 return false;
677
678 if (tsk->signal->cputimer.running)
679 return false;
680
681 return true;
682}
683#else
684static inline void posix_cpu_timer_kick_nohz(void) { }
685#endif
686
639/* 687/*
640 * Guts of sys_timer_settime for CPU timers. 688 * Guts of sys_timer_settime for CPU timers.
641 * This is called with the timer locked and interrupts disabled. 689 * This is called with the timer locked and interrupts disabled.
@@ -794,6 +842,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
794 sample_to_timespec(timer->it_clock, 842 sample_to_timespec(timer->it_clock,
795 old_incr, &old->it_interval); 843 old_incr, &old->it_interval);
796 } 844 }
845 if (!ret)
846 posix_cpu_timer_kick_nohz();
797 return ret; 847 return ret;
798} 848}
799 849
@@ -1008,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1008 } 1058 }
1009} 1059}
1010 1060
1011/**
1012 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1013 *
1014 * @cputime: The struct to compare.
1015 *
1016 * Checks @cputime to see if all fields are zero. Returns true if all fields
1017 * are zero, false if any field is nonzero.
1018 */
1019static inline int task_cputime_zero(const struct task_cputime *cputime)
1020{
1021 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
1022 return 1;
1023 return 0;
1024}
1025
1026/* 1061/*
1027 * Check for any per-thread CPU timers that have fired and move them 1062 * Check for any per-thread CPU timers that have fired and move them
1028 * off the tsk->*_timers list onto the firing list. Per-thread timers 1063 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1336,6 +1371,13 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1336 cpu_timer_fire(timer); 1371 cpu_timer_fire(timer);
1337 spin_unlock(&timer->it_lock); 1372 spin_unlock(&timer->it_lock);
1338 } 1373 }
1374
1375 /*
1376 * In case some timers were rescheduled after the queue got emptied,
1377 * wake up full dynticks CPUs.
1378 */
1379 if (tsk->signal->cputimer.running)
1380 posix_cpu_timer_kick_nohz();
1339} 1381}
1340 1382
1341/* 1383/*
@@ -1366,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1366 } 1408 }
1367 1409
1368 if (!*newval) 1410 if (!*newval)
1369 return; 1411 goto out;
1370 *newval += now.cpu; 1412 *newval += now.cpu;
1371 } 1413 }
1372 1414
@@ -1384,6 +1426,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1384 tsk->signal->cputime_expires.virt_exp = *newval; 1426 tsk->signal->cputime_expires.virt_exp = *newval;
1385 break; 1427 break;
1386 } 1428 }
1429out:
1430 posix_cpu_timer_kick_nohz();
1387} 1431}
1388 1432
1389static int do_cpu_nanosleep(const clockid_t which_clock, int flags, 1433static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
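The nohz kick added above runs whenever a process CPU timer is armed or still pending, so that full-dynticks CPUs restart their tick and keep accounting the task's CPU time. From user space that corresponds to an ordinary CLOCK_PROCESS_CPUTIME_ID timer; a small example with arbitrary values (link with -lrt on older glibc):

/* Userspace side of the path above: arming a process CPU-time timer,
 * which is what makes posix_cpu_timer_set() kick full-dynticks CPUs. */
#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        timer_t tid;
        struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
                                .sigev_signo  = SIGALRM };
        struct itimerspec its = {
                .it_value    = { .tv_sec = 2, .tv_nsec = 0 },   /* fire after 2s of CPU time */
                .it_interval = { .tv_sec = 0, .tv_nsec = 0 },
        };

        if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid) < 0) {
                perror("timer_create");
                return 1;
        }
        if (timer_settime(tid, 0, &its, NULL) < 0) {
                perror("timer_settime");
                return 1;
        }
        /* ... burn CPU; SIGALRM arrives after ~2s of process CPU time ... */
        return 0;
}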
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 6edbb2c55c22..424c2d4265c9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -40,38 +40,31 @@
40#include <linux/list.h> 40#include <linux/list.h>
41#include <linux/init.h> 41#include <linux/init.h>
42#include <linux/compiler.h> 42#include <linux/compiler.h>
43#include <linux/idr.h> 43#include <linux/hash.h>
44#include <linux/posix-clock.h> 44#include <linux/posix-clock.h>
45#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/wait.h> 47#include <linux/wait.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/export.h> 49#include <linux/export.h>
50#include <linux/hashtable.h>
50 51
51/* 52/*
52 * Management arrays for POSIX timers. Timers are kept in slab memory 53 * Management arrays for POSIX timers. Timers are now kept in static hash table
53 * Timer ids are allocated by an external routine that keeps track of the 54 * with 512 entries.
54 * id and the timer. The external interface is: 55 * Timer ids are allocated by local routine, which selects proper hash head by
55 * 56 * key, constructed from current->signal address and per signal struct counter.
56 * void *idr_find(struct idr *idp, int id); to find timer_id <id> 57 * This keeps timer ids unique per process, but now they can intersect between
57 * int idr_get_new(struct idr *idp, void *ptr); to get a new id and 58 * processes.
58 * related it to <ptr>
59 * void idr_remove(struct idr *idp, int id); to release <id>
60 * void idr_init(struct idr *idp); to initialize <idp>
61 * which we supply.
62 * The idr_get_new *may* call slab for more memory so it must not be
63 * called under a spin lock. Likewise idr_remore may release memory
64 * (but it may be ok to do this under a lock...).
65 * idr_find is just a memory look up and is quite fast. A -1 return
66 * indicates that the requested id does not exist.
67 */ 59 */
68 60
69/* 61/*
70 * Lets keep our timers in a slab cache :-) 62 * Lets keep our timers in a slab cache :-)
71 */ 63 */
72static struct kmem_cache *posix_timers_cache; 64static struct kmem_cache *posix_timers_cache;
73static struct idr posix_timers_id; 65
74static DEFINE_SPINLOCK(idr_lock); 66static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
67static DEFINE_SPINLOCK(hash_lock);
75 68
76/* 69/*
77 * we assume that the new SIGEV_THREAD_ID shares no bits with the other 70 * we assume that the new SIGEV_THREAD_ID shares no bits with the other
@@ -152,6 +145,56 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
152 __timr; \ 145 __timr; \
153}) 146})
154 147
148static int hash(struct signal_struct *sig, unsigned int nr)
149{
150 return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
151}
152
153static struct k_itimer *__posix_timers_find(struct hlist_head *head,
154 struct signal_struct *sig,
155 timer_t id)
156{
157 struct k_itimer *timer;
158
159 hlist_for_each_entry_rcu(timer, head, t_hash) {
160 if ((timer->it_signal == sig) && (timer->it_id == id))
161 return timer;
162 }
163 return NULL;
164}
165
166static struct k_itimer *posix_timer_by_id(timer_t id)
167{
168 struct signal_struct *sig = current->signal;
169 struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
170
171 return __posix_timers_find(head, sig, id);
172}
173
174static int posix_timer_add(struct k_itimer *timer)
175{
176 struct signal_struct *sig = current->signal;
177 int first_free_id = sig->posix_timer_id;
178 struct hlist_head *head;
179 int ret = -ENOENT;
180
181 do {
182 spin_lock(&hash_lock);
183 head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
184 if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
185 hlist_add_head_rcu(&timer->t_hash, head);
186 ret = sig->posix_timer_id;
187 }
188 if (++sig->posix_timer_id < 0)
189 sig->posix_timer_id = 0;
190 if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
191 /* Loop over all possible ids completed */
192 ret = -EAGAIN;
193 spin_unlock(&hash_lock);
194 } while (ret == -ENOENT);
195 return ret;
196}
197
155static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 198static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
156{ 199{
157 spin_unlock_irqrestore(&timr->it_lock, flags); 200 spin_unlock_irqrestore(&timr->it_lock, flags);
@@ -221,6 +264,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
221 return 0; 264 return 0;
222} 265}
223 266
267static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
268{
269 timekeeping_clocktai(tp);
270 return 0;
271}
224 272
225/* 273/*
226 * Initialize everything, well, just everything in Posix clocks/timers ;) 274 * Initialize everything, well, just everything in Posix clocks/timers ;)
@@ -261,6 +309,16 @@ static __init int init_posix_timers(void)
261 .clock_getres = posix_get_coarse_res, 309 .clock_getres = posix_get_coarse_res,
262 .clock_get = posix_get_monotonic_coarse, 310 .clock_get = posix_get_monotonic_coarse,
263 }; 311 };
312 struct k_clock clock_tai = {
313 .clock_getres = hrtimer_get_res,
314 .clock_get = posix_get_tai,
315 .nsleep = common_nsleep,
316 .nsleep_restart = hrtimer_nanosleep_restart,
317 .timer_create = common_timer_create,
318 .timer_set = common_timer_set,
319 .timer_get = common_timer_get,
320 .timer_del = common_timer_del,
321 };
264 struct k_clock clock_boottime = { 322 struct k_clock clock_boottime = {
265 .clock_getres = hrtimer_get_res, 323 .clock_getres = hrtimer_get_res,
266 .clock_get = posix_get_boottime, 324 .clock_get = posix_get_boottime,
@@ -278,11 +336,11 @@ static __init int init_posix_timers(void)
278 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); 336 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
279 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); 337 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
280 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); 338 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
339 posix_timers_register_clock(CLOCK_TAI, &clock_tai);
281 340
282 posix_timers_cache = kmem_cache_create("posix_timers_cache", 341 posix_timers_cache = kmem_cache_create("posix_timers_cache",
283 sizeof (struct k_itimer), 0, SLAB_PANIC, 342 sizeof (struct k_itimer), 0, SLAB_PANIC,
284 NULL); 343 NULL);
285 idr_init(&posix_timers_id);
286 return 0; 344 return 0;
287} 345}
288 346
@@ -504,9 +562,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
504{ 562{
505 if (it_id_set) { 563 if (it_id_set) {
506 unsigned long flags; 564 unsigned long flags;
507 spin_lock_irqsave(&idr_lock, flags); 565 spin_lock_irqsave(&hash_lock, flags);
508 idr_remove(&posix_timers_id, tmr->it_id); 566 hlist_del_rcu(&tmr->t_hash);
509 spin_unlock_irqrestore(&idr_lock, flags); 567 spin_unlock_irqrestore(&hash_lock, flags);
510 } 568 }
511 put_pid(tmr->it_pid); 569 put_pid(tmr->it_pid);
512 sigqueue_free(tmr->sigq); 570 sigqueue_free(tmr->sigq);
@@ -552,22 +610,11 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
552 return -EAGAIN; 610 return -EAGAIN;
553 611
554 spin_lock_init(&new_timer->it_lock); 612 spin_lock_init(&new_timer->it_lock);
555 613 new_timer_id = posix_timer_add(new_timer);
556 idr_preload(GFP_KERNEL); 614 if (new_timer_id < 0) {
557 spin_lock_irq(&idr_lock); 615 error = new_timer_id;
558 error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT);
559 spin_unlock_irq(&idr_lock);
560 idr_preload_end();
561 if (error < 0) {
562 /*
563 * Weird looking, but we return EAGAIN if the IDR is
564 * full (proper POSIX return value for this)
565 */
566 if (error == -ENOSPC)
567 error = -EAGAIN;
568 goto out; 616 goto out;
569 } 617 }
570 new_timer_id = error;
571 618
572 it_id_set = IT_ID_SET; 619 it_id_set = IT_ID_SET;
573 new_timer->it_id = (timer_t) new_timer_id; 620 new_timer->it_id = (timer_t) new_timer_id;
@@ -645,7 +692,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
645 return NULL; 692 return NULL;
646 693
647 rcu_read_lock(); 694 rcu_read_lock();
648 timr = idr_find(&posix_timers_id, (int)timer_id); 695 timr = posix_timer_by_id(timer_id);
649 if (timr) { 696 if (timr) {
650 spin_lock_irqsave(&timr->it_lock, *flags); 697 spin_lock_irqsave(&timr->it_lock, *flags);
651 if (timr->it_signal == current->signal) { 698 if (timr->it_signal == current->signal) {
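Besides the hash-table conversion, the hunk above also registers the new CLOCK_TAI posix clock. A userspace sketch that reads it next to CLOCK_REALTIME; the fallback #define is only a precaution for older userspace headers (the kernel uapi value is 11):

/* Reads the newly registered CLOCK_TAI next to CLOCK_REALTIME; once the TAI
 * offset has been set via adjtimex(), the difference between the two is the
 * current TAI-UTC delta. */
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_TAI
#define CLOCK_TAI 11    /* value used by the kernel's uapi headers */
#endif

int main(void)
{
        struct timespec real, tai;

        clock_gettime(CLOCK_REALTIME, &real);
        clock_gettime(CLOCK_TAI, &tai);
        printf("realtime: %ld.%09ld\n", (long)real.tv_sec, real.tv_nsec);
        printf("tai:      %ld.%09ld\n", (long)tai.tv_sec, tai.tv_nsec);
        return 0;
}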
diff --git a/kernel/power/console.c b/kernel/power/console.c
index b1dc456474b5..463aa6736751 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -4,6 +4,7 @@
4 * Originally from swsusp. 4 * Originally from swsusp.
5 */ 5 */
6 6
7#include <linux/console.h>
7#include <linux/vt_kern.h> 8#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h> 9#include <linux/kbd_kern.h>
9#include <linux/vt.h> 10#include <linux/vt.h>
@@ -14,8 +15,120 @@
14 15
15static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
16 17
18static DEFINE_MUTEX(vt_switch_mutex);
19
20struct pm_vt_switch {
21 struct list_head head;
22 struct device *dev;
23 bool required;
24};
25
26static LIST_HEAD(pm_vt_switch_list);
27
28
29/**
30 * pm_vt_switch_required - indicate VT switch at suspend requirements
31 * @dev: device
32 * @required: if true, caller needs VT switch at suspend/resume time
33 *
34 * The different console drivers may or may not require VT switches across
35 * suspend/resume, depending on how they handle restoring video state and
36 * what may be running.
37 *
38 * Drivers can indicate support for switchless suspend/resume, which can
39 * save time and flicker, by using this routine and passing 'false' as
40 * the argument. If any loaded driver needs VT switching, or the
41 * no_console_suspend argument has been passed on the command line, VT
42 * switches will occur.
43 */
44void pm_vt_switch_required(struct device *dev, bool required)
45{
46 struct pm_vt_switch *entry, *tmp;
47
48 mutex_lock(&vt_switch_mutex);
49 list_for_each_entry(tmp, &pm_vt_switch_list, head) {
50 if (tmp->dev == dev) {
51 /* already registered, update requirement */
52 tmp->required = required;
53 goto out;
54 }
55 }
56
57 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
58 if (!entry)
59 goto out;
60
61 entry->required = required;
62 entry->dev = dev;
63
64 list_add(&entry->head, &pm_vt_switch_list);
65out:
66 mutex_unlock(&vt_switch_mutex);
67}
68EXPORT_SYMBOL(pm_vt_switch_required);
69
70/**
71 * pm_vt_switch_unregister - stop tracking a device's VT switching needs
72 * @dev: device
73 *
74 * Remove @dev from the vt switch list.
75 */
76void pm_vt_switch_unregister(struct device *dev)
77{
78 struct pm_vt_switch *tmp;
79
80 mutex_lock(&vt_switch_mutex);
81 list_for_each_entry(tmp, &pm_vt_switch_list, head) {
82 if (tmp->dev == dev) {
83 list_del(&tmp->head);
84 break;
85 }
86 }
87 mutex_unlock(&vt_switch_mutex);
88}
89EXPORT_SYMBOL(pm_vt_switch_unregister);
90
91/*
 91 * There are three cases when a VT switch on suspend/resume is required:
93 * 1) no driver has indicated a requirement one way or another, so preserve
94 * the old behavior
95 * 2) console suspend is disabled, we want to see debug messages across
96 * suspend/resume
97 * 3) any registered driver indicates it needs a VT switch
98 *
99 * If none of these conditions is present, meaning we have at least one driver
100 * that doesn't need the switch, and none that do, we can avoid it to make
101 * resume look a little prettier (and suspend too, but that's usually hidden,
102 * e.g. when closing the lid on a laptop).
103 */
104static bool pm_vt_switch(void)
105{
106 struct pm_vt_switch *entry;
107 bool ret = true;
108
109 mutex_lock(&vt_switch_mutex);
110 if (list_empty(&pm_vt_switch_list))
111 goto out;
112
113 if (!console_suspend_enabled)
114 goto out;
115
116 list_for_each_entry(entry, &pm_vt_switch_list, head) {
117 if (entry->required)
118 goto out;
119 }
120
121 ret = false;
122out:
123 mutex_unlock(&vt_switch_mutex);
124 return ret;
125}
126
17int pm_prepare_console(void) 127int pm_prepare_console(void)
18{ 128{
129 if (!pm_vt_switch())
130 return 0;
131
19 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); 132 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
20 if (orig_fgconsole < 0) 133 if (orig_fgconsole < 0)
21 return 1; 134 return 1;
@@ -26,6 +139,9 @@ int pm_prepare_console(void)
26 139
27void pm_restore_console(void) 140void pm_restore_console(void)
28{ 141{
142 if (!pm_vt_switch())
143 return;
144
29 if (orig_fgconsole >= 0) { 145 if (orig_fgconsole >= 0) {
30 vt_move_to_console(orig_fgconsole, 0); 146 vt_move_to_console(orig_fgconsole, 0);
31 vt_kmsg_redirect(orig_kmsg); 147 vt_kmsg_redirect(orig_kmsg);
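A typical consumer of the new pm_vt_switch_required()/pm_vt_switch_unregister() hooks is a KMS graphics driver that restores its own video state on resume. A hypothetical driver sketch follows; the names and the assumption that the declarations live in linux/pm.h are for illustration only.

/*
 * Hypothetical driver sketch (names invented): a KMS driver that repaints
 * the screen itself on resume opts out of the suspend-time VT switch, and
 * unregisters again when it goes away.
 */
#include <linux/device.h>
#include <linux/pm.h>

static int mydrm_bind(struct device *dev)
{
        /* we restore video state ourselves, no VT switch needed */
        pm_vt_switch_required(dev, false);
        return 0;
}

static void mydrm_unbind(struct device *dev)
{
        pm_vt_switch_unregister(dev);
}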
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 68197a4e8fc9..7ef6866b521d 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -32,7 +32,7 @@ static void handle_poweroff(int key)
32 32
33static struct sysrq_key_op sysrq_poweroff_op = { 33static struct sysrq_key_op sysrq_poweroff_op = {
34 .handler = handle_poweroff, 34 .handler = handle_poweroff,
35 .help_msg = "powerOff", 35 .help_msg = "poweroff(o)",
36 .action_msg = "Power Off", 36 .action_msg = "Power Off",
37 .enable_mask = SYSRQ_ENABLE_BOOT, 37 .enable_mask = SYSRQ_ENABLE_BOOT,
38}; 38};
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index d4feda084a3a..bef86d121eb2 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -76,8 +76,20 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
76 76
77bool valid_state(suspend_state_t state) 77bool valid_state(suspend_state_t state)
78{ 78{
79 if (state == PM_SUSPEND_FREEZE) 79 if (state == PM_SUSPEND_FREEZE) {
80 return true; 80#ifdef CONFIG_PM_DEBUG
81 if (pm_test_level != TEST_NONE &&
82 pm_test_level != TEST_FREEZER &&
83 pm_test_level != TEST_DEVICES &&
84 pm_test_level != TEST_PLATFORM) {
85 printk(KERN_WARNING "Unsupported pm_test mode for "
86 "freeze state, please choose "
87 "none/freezer/devices/platform.\n");
88 return false;
89 }
90#endif
91 return true;
92 }
81 /* 93 /*
82 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel 94 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
83 * support and need to be valid to the lowlevel 95 * support and need to be valid to the lowlevel
@@ -184,6 +196,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
184 goto Platform_wake; 196 goto Platform_wake;
185 } 197 }
186 198
199 if (suspend_test(TEST_PLATFORM))
200 goto Platform_wake;
201
187 /* 202 /*
188 * PM_SUSPEND_FREEZE equals 203 * PM_SUSPEND_FREEZE equals
189 * frozen processes + suspended devices + idle processors. 204 * frozen processes + suspended devices + idle processors.
@@ -195,9 +210,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
195 goto Platform_wake; 210 goto Platform_wake;
196 } 211 }
197 212
198 if (suspend_test(TEST_PLATFORM))
199 goto Platform_wake;
200
201 error = disable_nonboot_cpus(); 213 error = disable_nonboot_cpus();
202 if (error || suspend_test(TEST_CPUS)) 214 if (error || suspend_test(TEST_CPUS))
203 goto Enable_cpus; 215 goto Enable_cpus;
diff --git a/kernel/printk.c b/kernel/printk.c
index 0b31715f335a..96dcfcd9a2d4 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -43,19 +43,13 @@
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h> 44#include <linux/poll.h>
45#include <linux/irq_work.h> 45#include <linux/irq_work.h>
46#include <linux/utsname.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
49#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
50#include <trace/events/printk.h> 51#include <trace/events/printk.h>
51 52
52/*
53 * Architectures can override it:
54 */
55void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
56{
57}
58
59/* printk's without a loglevel use this.. */ 53/* printk's without a loglevel use this.. */
60#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 54#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
61 55
@@ -63,8 +57,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
63#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ 57#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
64#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ 58#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
65 59
66DECLARE_WAIT_QUEUE_HEAD(log_wait);
67
68int console_printk[4] = { 60int console_printk[4] = {
69 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ 61 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
70 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 62 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
@@ -224,6 +216,7 @@ struct log {
224static DEFINE_RAW_SPINLOCK(logbuf_lock); 216static DEFINE_RAW_SPINLOCK(logbuf_lock);
225 217
226#ifdef CONFIG_PRINTK 218#ifdef CONFIG_PRINTK
219DECLARE_WAIT_QUEUE_HEAD(log_wait);
227/* the next printk record to read by syslog(READ) or /proc/kmsg */ 220/* the next printk record to read by syslog(READ) or /proc/kmsg */
228static u64 syslog_seq; 221static u64 syslog_seq;
229static u32 syslog_idx; 222static u32 syslog_idx;
@@ -609,7 +602,8 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
609 /* return error when data has vanished underneath us */ 602 /* return error when data has vanished underneath us */
610 if (user->seq < log_first_seq) 603 if (user->seq < log_first_seq)
611 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; 604 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
612 ret = POLLIN|POLLRDNORM; 605 else
606 ret = POLLIN|POLLRDNORM;
613 } 607 }
614 raw_spin_unlock_irq(&logbuf_lock); 608 raw_spin_unlock_irq(&logbuf_lock);
615 609
@@ -1266,7 +1260,7 @@ static void call_console_drivers(int level, const char *text, size_t len)
1266{ 1260{
1267 struct console *con; 1261 struct console *con;
1268 1262
1269 trace_console(text, 0, len, len); 1263 trace_console(text, len);
1270 1264
1271 if (level >= console_loglevel && !ignore_loglevel) 1265 if (level >= console_loglevel && !ignore_loglevel)
1272 return; 1266 return;
@@ -1724,6 +1718,29 @@ static size_t cont_print_text(char *text, size_t size) { return 0; }
1724 1718
1725#endif /* CONFIG_PRINTK */ 1719#endif /* CONFIG_PRINTK */
1726 1720
1721#ifdef CONFIG_EARLY_PRINTK
1722struct console *early_console;
1723
1724void early_vprintk(const char *fmt, va_list ap)
1725{
1726 if (early_console) {
1727 char buf[512];
1728 int n = vscnprintf(buf, sizeof(buf), fmt, ap);
1729
1730 early_console->write(early_console, buf, n);
1731 }
1732}
1733
1734asmlinkage void early_printk(const char *fmt, ...)
1735{
1736 va_list ap;
1737
1738 va_start(ap, fmt);
1739 early_vprintk(fmt, ap);
1740 va_end(ap);
1741}
1742#endif
1743
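Hedged sketch (not part of this diff): early_printk()/early_vprintk() above only emit output once an architecture points the global early_console at a console with a .write callback. The UART register, uart_early_write() and setup_my_early_console() below are hypothetical; real arch code does the equivalent from its early setup path, with the usual console/io headers included.

static void __iomem *early_uart_base;	/* hypothetical, mapped by arch code */

static void uart_early_write(struct console *con, const char *s, unsigned int n)
{
	while (n--)
		writeb(*s++, early_uart_base);	/* assumes a simple TX register */
}

static struct console early_uart_console = {
	.name	= "earlyuart",
	.write	= uart_early_write,
	.flags	= CON_PRINTBUFFER | CON_BOOT,
	.index	= -1,
};

static void __init setup_my_early_console(void)
{
	early_console = &early_uart_console;
	register_console(early_console);	/* also routes regular printk here */
}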
1727static int __add_preferred_console(char *name, int idx, char *options, 1744static int __add_preferred_console(char *name, int idx, char *options,
1728 char *brl_options) 1745 char *brl_options)
1729{ 1746{
@@ -1957,45 +1974,6 @@ int is_console_locked(void)
1957 return console_locked; 1974 return console_locked;
1958} 1975}
1959 1976
1960/*
1961 * Delayed printk version, for scheduler-internal messages:
1962 */
1963#define PRINTK_BUF_SIZE 512
1964
1965#define PRINTK_PENDING_WAKEUP 0x01
1966#define PRINTK_PENDING_SCHED 0x02
1967
1968static DEFINE_PER_CPU(int, printk_pending);
1969static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
1970
1971static void wake_up_klogd_work_func(struct irq_work *irq_work)
1972{
1973 int pending = __this_cpu_xchg(printk_pending, 0);
1974
1975 if (pending & PRINTK_PENDING_SCHED) {
1976 char *buf = __get_cpu_var(printk_sched_buf);
1977 printk(KERN_WARNING "[sched_delayed] %s", buf);
1978 }
1979
1980 if (pending & PRINTK_PENDING_WAKEUP)
1981 wake_up_interruptible(&log_wait);
1982}
1983
1984static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
1985 .func = wake_up_klogd_work_func,
1986 .flags = IRQ_WORK_LAZY,
1987};
1988
1989void wake_up_klogd(void)
1990{
1991 preempt_disable();
1992 if (waitqueue_active(&log_wait)) {
1993 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1994 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
1995 }
1996 preempt_enable();
1997}
1998
1999static void console_cont_flush(char *text, size_t size) 1977static void console_cont_flush(char *text, size_t size)
2000{ 1978{
2001 unsigned long flags; 1979 unsigned long flags;
@@ -2458,6 +2436,44 @@ static int __init printk_late_init(void)
2458late_initcall(printk_late_init); 2436late_initcall(printk_late_init);
2459 2437
2460#if defined CONFIG_PRINTK 2438#if defined CONFIG_PRINTK
2439/*
2440 * Delayed printk version, for scheduler-internal messages:
2441 */
2442#define PRINTK_BUF_SIZE 512
2443
2444#define PRINTK_PENDING_WAKEUP 0x01
2445#define PRINTK_PENDING_SCHED 0x02
2446
2447static DEFINE_PER_CPU(int, printk_pending);
2448static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
2449
2450static void wake_up_klogd_work_func(struct irq_work *irq_work)
2451{
2452 int pending = __this_cpu_xchg(printk_pending, 0);
2453
2454 if (pending & PRINTK_PENDING_SCHED) {
2455 char *buf = __get_cpu_var(printk_sched_buf);
2456 printk(KERN_WARNING "[sched_delayed] %s", buf);
2457 }
2458
2459 if (pending & PRINTK_PENDING_WAKEUP)
2460 wake_up_interruptible(&log_wait);
2461}
2462
2463static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
2464 .func = wake_up_klogd_work_func,
2465 .flags = IRQ_WORK_LAZY,
2466};
2467
2468void wake_up_klogd(void)
2469{
2470 preempt_disable();
2471 if (waitqueue_active(&log_wait)) {
2472 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
2473 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2474 }
2475 preempt_enable();
2476}
2461 2477
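Hedged usage sketch (not from this diff): printk_sched(), whose definition follows in the unchanged context below, is the variant scheduler-internal code is expected to call while holding scheduler locks; the message is staged in the per-CPU printk_sched_buf above and emitted later from irq_work context with a "[sched_delayed]" prefix. The caller below is hypothetical.

/* Hypothetical scheduler-side caller; real users live in kernel/sched/. */
static void my_report_clock_skew(int cpu, u64 delta_ns)
{
	printk_sched("clock skew of %llu ns detected on CPU %d\n",
		     (unsigned long long)delta_ns, cpu);
}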
2462int printk_sched(const char *fmt, ...) 2478int printk_sched(const char *fmt, ...)
2463{ 2479{
@@ -2834,4 +2850,65 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2834 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2850 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2835} 2851}
2836EXPORT_SYMBOL_GPL(kmsg_dump_rewind); 2852EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
2853
2854static char dump_stack_arch_desc_str[128];
2855
2856/**
2857 * dump_stack_set_arch_desc - set arch-specific str to show with task dumps
2858 * @fmt: printf-style format string
2859 * @...: arguments for the format string
2860 *
2861 * The configured string will be printed right after utsname during task
2862 * dumps. Usually used to add arch-specific system identifiers. If an
2863 * arch wants to make use of such an ID string, it should initialize this
2864 * as soon as possible during boot.
2865 */
2866void __init dump_stack_set_arch_desc(const char *fmt, ...)
2867{
2868 va_list args;
2869
2870 va_start(args, fmt);
2871 vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str),
2872 fmt, args);
2873 va_end(args);
2874}
2875
2876/**
2877 * dump_stack_print_info - print generic debug info for dump_stack()
2878 * @log_lvl: log level
2879 *
2880 * Arch-specific dump_stack() implementations can use this function to
2881 * print out the same debug information as the generic dump_stack().
2882 */
2883void dump_stack_print_info(const char *log_lvl)
2884{
2885 printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
2886 log_lvl, raw_smp_processor_id(), current->pid, current->comm,
2887 print_tainted(), init_utsname()->release,
2888 (int)strcspn(init_utsname()->version, " "),
2889 init_utsname()->version);
2890
2891 if (dump_stack_arch_desc_str[0] != '\0')
2892 printk("%sHardware name: %s\n",
2893 log_lvl, dump_stack_arch_desc_str);
2894
2895 print_worker_info(log_lvl, current);
2896}
2897
2898/**
2899 * show_regs_print_info - print generic debug info for show_regs()
2900 * @log_lvl: log level
2901 *
2902 * show_regs() implementations can use this function to print out generic
2903 * debug information.
2904 */
2905void show_regs_print_info(const char *log_lvl)
2906{
2907 dump_stack_print_info(log_lvl);
2908
2909 printk("%stask: %p ti: %p task.ti: %p\n",
2910 log_lvl, current, current_thread_info(),
2911 task_thread_info(current));
2912}
2913
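A hedged usage sketch (not from this diff): per the kerneldoc above, an architecture is expected to call dump_stack_set_arch_desc() once, early in boot, with whatever identifies the platform; the string then appears as the "Hardware name:" line printed by dump_stack_print_info(). The function name and strings below are made up.

static void __init my_arch_set_dump_desc(void)
{
	const char *vendor  = "ExampleVendor";		/* hypothetical */
	const char *product = "ExampleBoard rev B";	/* hypothetical */

	dump_stack_set_arch_desc("%s %s", vendor, product);
}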
2837#endif 2914#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index dc3384ee874e..0bf400737660 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -462,10 +462,10 @@ static const struct file_operations prof_cpu_mask_proc_fops = {
462 .write = prof_cpu_mask_proc_write, 462 .write = prof_cpu_mask_proc_write,
463}; 463};
464 464
465void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 465void create_prof_cpu_mask(void)
466{ 466{
467 /* create /proc/irq/prof_cpu_mask */ 467 /* create /proc/irq/prof_cpu_mask */
468 proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops); 468 proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops);
469} 469}
470 470
471/* 471/*
@@ -600,7 +600,7 @@ int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
600 NULL, &proc_profile_operations); 600 NULL, &proc_profile_operations);
601 if (!entry) 601 if (!entry)
602 return 0; 602 return 0;
603 entry->size = (1+prof_len) * sizeof(atomic_t); 603 proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
604 hotcpu_notifier(profile_cpu_callback, 0); 604 hotcpu_notifier(profile_cpu_callback, 0);
605 return 0; 605 return 0;
606} 606}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index acbd28424d81..17ae54da0ec2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -24,6 +24,7 @@
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h> 25#include <linux/hw_breakpoint.h>
26#include <linux/cn_proc.h> 26#include <linux/cn_proc.h>
27#include <linux/compat.h>
27 28
28 29
29static int ptrace_trapping_sleep_fn(void *flags) 30static int ptrace_trapping_sleep_fn(void *flags)
@@ -618,6 +619,81 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
618 return error; 619 return error;
619} 620}
620 621
622static int ptrace_peek_siginfo(struct task_struct *child,
623 unsigned long addr,
624 unsigned long data)
625{
626 struct ptrace_peeksiginfo_args arg;
627 struct sigpending *pending;
628 struct sigqueue *q;
629 int ret, i;
630
631 ret = copy_from_user(&arg, (void __user *) addr,
632 sizeof(struct ptrace_peeksiginfo_args));
633 if (ret)
634 return -EFAULT;
635
636 if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED)
637 return -EINVAL; /* unknown flags */
638
639 if (arg.nr < 0)
640 return -EINVAL;
641
642 if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
643 pending = &child->signal->shared_pending;
644 else
645 pending = &child->pending;
646
647 for (i = 0; i < arg.nr; ) {
648 siginfo_t info;
649 s32 off = arg.off + i;
650
651 spin_lock_irq(&child->sighand->siglock);
652 list_for_each_entry(q, &pending->list, list) {
653 if (!off--) {
654 copy_siginfo(&info, &q->info);
655 break;
656 }
657 }
658 spin_unlock_irq(&child->sighand->siglock);
659
660 if (off >= 0) /* beyond the end of the list */
661 break;
662
663#ifdef CONFIG_COMPAT
664 if (unlikely(is_compat_task())) {
665 compat_siginfo_t __user *uinfo = compat_ptr(data);
666
667 ret = copy_siginfo_to_user32(uinfo, &info);
668 ret |= __put_user(info.si_code, &uinfo->si_code);
669 } else
670#endif
671 {
672 siginfo_t __user *uinfo = (siginfo_t __user *) data;
673
674 ret = copy_siginfo_to_user(uinfo, &info);
675 ret |= __put_user(info.si_code, &uinfo->si_code);
676 }
677
678 if (ret) {
679 ret = -EFAULT;
680 break;
681 }
682
683 data += sizeof(siginfo_t);
684 i++;
685
686 if (signal_pending(current))
687 break;
688
689 cond_resched();
690 }
691
692 if (i > 0)
693 return i;
694
695 return ret;
696}
621 697
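Hedged user-space sketch (not part of this diff): how a tracer might use the new PTRACE_PEEKSIGINFO request implemented above to inspect up to four queued siginfo entries of a stopped tracee without dequeueing them. The fallback constants and struct mirror the new UAPI definitions; error handling is minimal and the tracee is assumed to be attached and stopped.

#include <stdio.h>
#include <stdint.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/ptrace.h>

#ifndef PTRACE_PEEKSIGINFO
#define PTRACE_PEEKSIGINFO		0x4209	/* value from the new UAPI header */
#define PTRACE_PEEKSIGINFO_SHARED	(1 << 0)
struct ptrace_peeksiginfo_args {
	uint64_t off;	/* queue position to start from */
	uint32_t flags;	/* 0 for per-thread queue, or PTRACE_PEEKSIGINFO_SHARED */
	int32_t  nr;	/* maximum number of entries to copy */
};
#endif

static void peek_pending_signals(pid_t pid)
{
	struct ptrace_peeksiginfo_args args = { .off = 0, .flags = 0, .nr = 4 };
	siginfo_t info[4];
	long n, i;

	n = ptrace(PTRACE_PEEKSIGINFO, pid, &args, info);
	for (i = 0; i < n; i++)
		printf("pending signal %d (si_code %d)\n",
		       info[i].si_signo, info[i].si_code);
}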
622#ifdef PTRACE_SINGLESTEP 698#ifdef PTRACE_SINGLESTEP
623#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) 699#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
@@ -748,6 +824,10 @@ int ptrace_request(struct task_struct *child, long request,
748 ret = put_user(child->ptrace_message, datalp); 824 ret = put_user(child->ptrace_message, datalp);
749 break; 825 break;
750 826
827 case PTRACE_PEEKSIGINFO:
828 ret = ptrace_peek_siginfo(child, addr, data);
829 break;
830
751 case PTRACE_GETSIGINFO: 831 case PTRACE_GETSIGINFO:
752 ret = ptrace_getsiginfo(child, &siginfo); 832 ret = ptrace_getsiginfo(child, &siginfo);
753 if (!ret) 833 if (!ret)
diff --git a/kernel/range.c b/kernel/range.c
index 9b8ae2d6ed68..071b0ab455cb 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -97,7 +97,8 @@ void subtract_range(struct range *range, int az, u64 start, u64 end)
97 range[i].end = range[j].end; 97 range[i].end = range[j].end;
98 range[i].start = end; 98 range[i].start = end;
99 } else { 99 } else {
100 printk(KERN_ERR "run of slot in ranges\n"); 100 pr_err("%s: run out of slot in ranges\n",
101 __func__);
101 } 102 }
102 range[j].end = start; 103 range[j].end = start;
103 continue; 104 continue;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 5b8ad827fd86..16ea67925015 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -64,7 +64,7 @@
64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
66 66
67#define RCU_STATE_INITIALIZER(sname, cr) { \ 67#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
68 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 69 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 70 .fqs_state = RCU_GP_IDLE, \
@@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
78 .name = #sname, \ 78 .name = #sname, \
79 .abbr = sabbr, \
79} 80}
80 81
81struct rcu_state rcu_sched_state = 82struct rcu_state rcu_sched_state =
82 RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); 83 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
83DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 84DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
84 85
85struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); 86struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
86DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 87DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
87 88
88static struct rcu_state *rcu_state; 89static struct rcu_state *rcu_state;
@@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
223module_param(jiffies_till_first_fqs, ulong, 0644); 224module_param(jiffies_till_first_fqs, ulong, 0644);
224module_param(jiffies_till_next_fqs, ulong, 0644); 225module_param(jiffies_till_next_fqs, ulong, 0644);
225 226
227static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
228 struct rcu_data *rdp);
226static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); 229static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
227static void force_quiescent_state(struct rcu_state *rsp); 230static void force_quiescent_state(struct rcu_state *rsp);
228static int rcu_pending(int cpu); 231static int rcu_pending(int cpu);
@@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
310 313
311 if (rcu_gp_in_progress(rsp)) 314 if (rcu_gp_in_progress(rsp))
312 return 0; /* No, a grace period is already in progress. */ 315 return 0; /* No, a grace period is already in progress. */
316 if (rcu_nocb_needs_gp(rsp))
317 return 1; /* Yes, a no-CBs CPU needs one. */
313 if (!rdp->nxttail[RCU_NEXT_TAIL]) 318 if (!rdp->nxttail[RCU_NEXT_TAIL])
314 return 0; /* No, this is a no-CBs (or offline) CPU. */ 319 return 0; /* No, this is a no-CBs (or offline) CPU. */
315 if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) 320 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
@@ -794,6 +799,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
794 rdp->offline_fqs++; 799 rdp->offline_fqs++;
795 return 1; 800 return 1;
796 } 801 }
802
803 /*
804 * There is a possibility that a CPU in adaptive-ticks state
805 * might run in the kernel with the scheduling-clock tick disabled
806 * for an extended time period. Invoke rcu_kick_nohz_cpu() to
 807 * force the CPU to restart the scheduling-clock tick if this
808 * CPU is in this state.
809 */
810 rcu_kick_nohz_cpu(rdp->cpu);
811
797 return 0; 812 return 0;
798} 813}
799 814
@@ -1035,10 +1050,11 @@ static void init_callback_list(struct rcu_data *rdp)
1035{ 1050{
1036 int i; 1051 int i;
1037 1052
1053 if (init_nocb_callback_list(rdp))
1054 return;
1038 rdp->nxtlist = NULL; 1055 rdp->nxtlist = NULL;
1039 for (i = 0; i < RCU_NEXT_SIZE; i++) 1056 for (i = 0; i < RCU_NEXT_SIZE; i++)
1040 rdp->nxttail[i] = &rdp->nxtlist; 1057 rdp->nxttail[i] = &rdp->nxtlist;
1041 init_nocb_callback_list(rdp);
1042} 1058}
1043 1059
1044/* 1060/*
@@ -1071,6 +1087,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1071} 1087}
1072 1088
1073/* 1089/*
1090 * Trace-event helper function for rcu_start_future_gp() and
1091 * rcu_nocb_wait_gp().
1092 */
1093static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1094 unsigned long c, char *s)
1095{
1096 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
1097 rnp->completed, c, rnp->level,
1098 rnp->grplo, rnp->grphi, s);
1099}
1100
1101/*
1102 * Start some future grace period, as needed to handle newly arrived
1103 * callbacks. The required future grace periods are recorded in each
1104 * rcu_node structure's ->need_future_gp field.
1105 *
1106 * The caller must hold the specified rcu_node structure's ->lock.
1107 */
1108static unsigned long __maybe_unused
1109rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1110{
1111 unsigned long c;
1112 int i;
1113 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1114
1115 /*
1116 * Pick up grace-period number for new callbacks. If this
1117 * grace period is already marked as needed, return to the caller.
1118 */
1119 c = rcu_cbs_completed(rdp->rsp, rnp);
1120 trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
1121 if (rnp->need_future_gp[c & 0x1]) {
1122 trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
1123 return c;
1124 }
1125
1126 /*
1127 * If either this rcu_node structure or the root rcu_node structure
1128 * believes that a grace period is in progress, then we must wait
1129 * for the one following, which is in "c". Because our request
1130 * will be noticed at the end of the current grace period, we don't
1131 * need to explicitly start one.
1132 */
1133 if (rnp->gpnum != rnp->completed ||
1134 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1135 rnp->need_future_gp[c & 0x1]++;
1136 trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
1137 return c;
1138 }
1139
1140 /*
1141 * There might be no grace period in progress. If we don't already
1142 * hold it, acquire the root rcu_node structure's lock in order to
1143 * start one (if needed).
1144 */
1145 if (rnp != rnp_root)
1146 raw_spin_lock(&rnp_root->lock);
1147
1148 /*
1149 * Get a new grace-period number. If there really is no grace
1150 * period in progress, it will be smaller than the one we obtained
1151 * earlier. Adjust callbacks as needed. Note that even no-CBs
1152 * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
1153 */
1154 c = rcu_cbs_completed(rdp->rsp, rnp_root);
1155 for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
1156 if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
1157 rdp->nxtcompleted[i] = c;
1158
1159 /*
1160 * If the need for the required grace period is already
1161 * recorded, trace and leave.
1162 */
1163 if (rnp_root->need_future_gp[c & 0x1]) {
1164 trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
1165 goto unlock_out;
1166 }
1167
1168 /* Record the need for the future grace period. */
1169 rnp_root->need_future_gp[c & 0x1]++;
1170
1171 /* If a grace period is not already in progress, start one. */
1172 if (rnp_root->gpnum != rnp_root->completed) {
1173 trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
1174 } else {
1175 trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
1176 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1177 }
1178unlock_out:
1179 if (rnp != rnp_root)
1180 raw_spin_unlock(&rnp_root->lock);
1181 return c;
1182}
1183
1184/*
1185 * Clean up any old requests for the just-ended grace period. Also return
1186 * whether any additional grace periods have been requested. Also invoke
1187 * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
1188 * waiting for this grace period to complete.
1189 */
1190static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1191{
1192 int c = rnp->completed;
1193 int needmore;
1194 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1195
1196 rcu_nocb_gp_cleanup(rsp, rnp);
1197 rnp->need_future_gp[c & 0x1] = 0;
1198 needmore = rnp->need_future_gp[(c + 1) & 0x1];
1199 trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
1200 return needmore;
1201}
1202
1203/*
1074 * If there is room, assign a ->completed number to any callbacks on 1204 * If there is room, assign a ->completed number to any callbacks on
1075 * this CPU that have not already been assigned. Also accelerate any 1205 * this CPU that have not already been assigned. Also accelerate any
1076 * callbacks that were previously assigned a ->completed number that has 1206 * callbacks that were previously assigned a ->completed number that has
@@ -1129,6 +1259,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1129 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; 1259 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
1130 rdp->nxtcompleted[i] = c; 1260 rdp->nxtcompleted[i] = c;
1131 } 1261 }
1262 /* Record any needed additional grace periods. */
1263 rcu_start_future_gp(rnp, rdp);
1132 1264
1133 /* Trace depending on how much we were able to accelerate. */ 1265 /* Trace depending on how much we were able to accelerate. */
1134 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1266 if (!*rdp->nxttail[RCU_WAIT_TAIL])
@@ -1308,9 +1440,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
1308 rdp = this_cpu_ptr(rsp->rda); 1440 rdp = this_cpu_ptr(rsp->rda);
1309 rcu_preempt_check_blocked_tasks(rnp); 1441 rcu_preempt_check_blocked_tasks(rnp);
1310 rnp->qsmask = rnp->qsmaskinit; 1442 rnp->qsmask = rnp->qsmaskinit;
1311 rnp->gpnum = rsp->gpnum; 1443 ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
1312 WARN_ON_ONCE(rnp->completed != rsp->completed); 1444 WARN_ON_ONCE(rnp->completed != rsp->completed);
1313 rnp->completed = rsp->completed; 1445 ACCESS_ONCE(rnp->completed) = rsp->completed;
1314 if (rnp == rdp->mynode) 1446 if (rnp == rdp->mynode)
1315 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1447 rcu_start_gp_per_cpu(rsp, rnp, rdp);
1316 rcu_preempt_boost_start_gp(rnp); 1448 rcu_preempt_boost_start_gp(rnp);
@@ -1319,7 +1451,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
1319 rnp->grphi, rnp->qsmask); 1451 rnp->grphi, rnp->qsmask);
1320 raw_spin_unlock_irq(&rnp->lock); 1452 raw_spin_unlock_irq(&rnp->lock);
1321#ifdef CONFIG_PROVE_RCU_DELAY 1453#ifdef CONFIG_PROVE_RCU_DELAY
1322 if ((random32() % (rcu_num_nodes * 8)) == 0) 1454 if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 &&
1455 system_state == SYSTEM_RUNNING)
1323 schedule_timeout_uninterruptible(2); 1456 schedule_timeout_uninterruptible(2);
1324#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ 1457#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1325 cond_resched(); 1458 cond_resched();
@@ -1361,6 +1494,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1361static void rcu_gp_cleanup(struct rcu_state *rsp) 1494static void rcu_gp_cleanup(struct rcu_state *rsp)
1362{ 1495{
1363 unsigned long gp_duration; 1496 unsigned long gp_duration;
1497 int nocb = 0;
1364 struct rcu_data *rdp; 1498 struct rcu_data *rdp;
1365 struct rcu_node *rnp = rcu_get_root(rsp); 1499 struct rcu_node *rnp = rcu_get_root(rsp);
1366 1500
@@ -1390,17 +1524,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1390 */ 1524 */
1391 rcu_for_each_node_breadth_first(rsp, rnp) { 1525 rcu_for_each_node_breadth_first(rsp, rnp) {
1392 raw_spin_lock_irq(&rnp->lock); 1526 raw_spin_lock_irq(&rnp->lock);
1393 rnp->completed = rsp->gpnum; 1527 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1528 rdp = this_cpu_ptr(rsp->rda);
1529 if (rnp == rdp->mynode)
1530 __rcu_process_gp_end(rsp, rnp, rdp);
1531 nocb += rcu_future_gp_cleanup(rsp, rnp);
1394 raw_spin_unlock_irq(&rnp->lock); 1532 raw_spin_unlock_irq(&rnp->lock);
1395 cond_resched(); 1533 cond_resched();
1396 } 1534 }
1397 rnp = rcu_get_root(rsp); 1535 rnp = rcu_get_root(rsp);
1398 raw_spin_lock_irq(&rnp->lock); 1536 raw_spin_lock_irq(&rnp->lock);
1537 rcu_nocb_gp_set(rnp, nocb);
1399 1538
1400 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1539 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1401 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1540 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1402 rsp->fqs_state = RCU_GP_IDLE; 1541 rsp->fqs_state = RCU_GP_IDLE;
1403 rdp = this_cpu_ptr(rsp->rda); 1542 rdp = this_cpu_ptr(rsp->rda);
1543 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
1404 if (cpu_needs_another_gp(rsp, rdp)) 1544 if (cpu_needs_another_gp(rsp, rdp))
1405 rsp->gp_flags = 1; 1545 rsp->gp_flags = 1;
1406 raw_spin_unlock_irq(&rnp->lock); 1546 raw_spin_unlock_irq(&rnp->lock);
@@ -1476,57 +1616,62 @@ static int __noreturn rcu_gp_kthread(void *arg)
1476/* 1616/*
1477 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1617 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1478 * in preparation for detecting the next grace period. The caller must hold 1618 * in preparation for detecting the next grace period. The caller must hold
1479 * the root node's ->lock, which is released before return. Hard irqs must 1619 * the root node's ->lock and hard irqs must be disabled.
1480 * be disabled.
1481 * 1620 *
1482 * Note that it is legal for a dying CPU (which is marked as offline) to 1621 * Note that it is legal for a dying CPU (which is marked as offline) to
1483 * invoke this function. This can happen when the dying CPU reports its 1622 * invoke this function. This can happen when the dying CPU reports its
1484 * quiescent state. 1623 * quiescent state.
1485 */ 1624 */
1486static void 1625static void
1487rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 1626rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1488 __releases(rcu_get_root(rsp)->lock) 1627 struct rcu_data *rdp)
1489{ 1628{
1490 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1629 if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
1491 struct rcu_node *rnp = rcu_get_root(rsp);
1492
1493 if (!rsp->gp_kthread ||
1494 !cpu_needs_another_gp(rsp, rdp)) {
1495 /* 1630 /*
1496 * Either we have not yet spawned the grace-period 1631 * Either we have not yet spawned the grace-period
1497 * task, this CPU does not need another grace period, 1632 * task, this CPU does not need another grace period,
1498 * or a grace period is already in progress. 1633 * or a grace period is already in progress.
1499 * Either way, don't start a new grace period. 1634 * Either way, don't start a new grace period.
1500 */ 1635 */
1501 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1502 return; 1636 return;
1503 } 1637 }
1504
1505 /*
1506 * Because there is no grace period in progress right now,
1507 * any callbacks we have up to this point will be satisfied
1508 * by the next grace period. So this is a good place to
1509 * assign a grace period number to recently posted callbacks.
1510 */
1511 rcu_accelerate_cbs(rsp, rnp, rdp);
1512
1513 rsp->gp_flags = RCU_GP_FLAG_INIT; 1638 rsp->gp_flags = RCU_GP_FLAG_INIT;
1514 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1515
1516 /* Ensure that CPU is aware of completion of last grace period. */
1517 rcu_process_gp_end(rsp, rdp);
1518 local_irq_restore(flags);
1519 1639
1520 /* Wake up rcu_gp_kthread() to start the grace period. */ 1640 /* Wake up rcu_gp_kthread() to start the grace period. */
1521 wake_up(&rsp->gp_wq); 1641 wake_up(&rsp->gp_wq);
1522} 1642}
1523 1643
1524/* 1644/*
1645 * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
1646 * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
1647 * is invoked indirectly from rcu_advance_cbs(), which would result in
1648 * endless recursion -- or would do so if it wasn't for the self-deadlock
1649 * that is encountered beforehand.
1650 */
1651static void
1652rcu_start_gp(struct rcu_state *rsp)
1653{
1654 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1655 struct rcu_node *rnp = rcu_get_root(rsp);
1656
1657 /*
1658 * If there is no grace period in progress right now, any
1659 * callbacks we have up to this point will be satisfied by the
1660 * next grace period. Also, advancing the callbacks reduces the
1661 * probability of false positives from cpu_needs_another_gp()
1662 * resulting in pointless grace periods. So, advance callbacks
1663 * then start the grace period!
1664 */
1665 rcu_advance_cbs(rsp, rnp, rdp);
1666 rcu_start_gp_advanced(rsp, rnp, rdp);
1667}
1668
1669/*
1525 * Report a full set of quiescent states to the specified rcu_state 1670 * Report a full set of quiescent states to the specified rcu_state
1526 * data structure. This involves cleaning up after the prior grace 1671 * data structure. This involves cleaning up after the prior grace
1527 * period and letting rcu_start_gp() start up the next grace period 1672 * period and letting rcu_start_gp() start up the next grace period
1528 * if one is needed. Note that the caller must hold rnp->lock, as 1673 * if one is needed. Note that the caller must hold rnp->lock, which
1529 * required by rcu_start_gp(), which will release it. 1674 * is released before return.
1530 */ 1675 */
1531static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 1676static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1532 __releases(rcu_get_root(rsp)->lock) 1677 __releases(rcu_get_root(rsp)->lock)
@@ -1685,7 +1830,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1685 struct rcu_node *rnp, struct rcu_data *rdp) 1830 struct rcu_node *rnp, struct rcu_data *rdp)
1686{ 1831{
1687 /* No-CBs CPUs do not have orphanable callbacks. */ 1832 /* No-CBs CPUs do not have orphanable callbacks. */
1688 if (is_nocb_cpu(rdp->cpu)) 1833 if (rcu_is_nocb_cpu(rdp->cpu))
1689 return; 1834 return;
1690 1835
1691 /* 1836 /*
@@ -2124,7 +2269,8 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2124 local_irq_save(flags); 2269 local_irq_save(flags);
2125 if (cpu_needs_another_gp(rsp, rdp)) { 2270 if (cpu_needs_another_gp(rsp, rdp)) {
2126 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ 2271 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2127 rcu_start_gp(rsp, flags); /* releases above lock */ 2272 rcu_start_gp(rsp);
2273 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2128 } else { 2274 } else {
2129 local_irq_restore(flags); 2275 local_irq_restore(flags);
2130 } 2276 }
@@ -2169,7 +2315,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
2169 2315
2170static void invoke_rcu_core(void) 2316static void invoke_rcu_core(void)
2171{ 2317{
2172 raise_softirq(RCU_SOFTIRQ); 2318 if (cpu_online(smp_processor_id()))
2319 raise_softirq(RCU_SOFTIRQ);
2173} 2320}
2174 2321
2175/* 2322/*
@@ -2204,11 +2351,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2204 2351
2205 /* Start a new grace period if one not already started. */ 2352 /* Start a new grace period if one not already started. */
2206 if (!rcu_gp_in_progress(rsp)) { 2353 if (!rcu_gp_in_progress(rsp)) {
2207 unsigned long nestflag;
2208 struct rcu_node *rnp_root = rcu_get_root(rsp); 2354 struct rcu_node *rnp_root = rcu_get_root(rsp);
2209 2355
2210 raw_spin_lock_irqsave(&rnp_root->lock, nestflag); 2356 raw_spin_lock(&rnp_root->lock);
2211 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ 2357 rcu_start_gp(rsp);
2358 raw_spin_unlock(&rnp_root->lock);
2212 } else { 2359 } else {
2213 /* Give the grace period a kick. */ 2360 /* Give the grace period a kick. */
2214 rdp->blimit = LONG_MAX; 2361 rdp->blimit = LONG_MAX;
@@ -2628,19 +2775,27 @@ static int rcu_pending(int cpu)
2628} 2775}
2629 2776
2630/* 2777/*
2631 * Check to see if any future RCU-related work will need to be done 2778 * Return true if the specified CPU has any callback. If all_lazy is
2632 * by the current CPU, even if none need be done immediately, returning 2779 * non-NULL, store an indication of whether all callbacks are lazy.
2633 * 1 if so. 2780 * (If there are no callbacks, all of them are deemed to be lazy.)
2634 */ 2781 */
2635static int rcu_cpu_has_callbacks(int cpu) 2782static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2636{ 2783{
2784 bool al = true;
2785 bool hc = false;
2786 struct rcu_data *rdp;
2637 struct rcu_state *rsp; 2787 struct rcu_state *rsp;
2638 2788
2639 /* RCU callbacks either ready or pending? */ 2789 for_each_rcu_flavor(rsp) {
2640 for_each_rcu_flavor(rsp) 2790 rdp = per_cpu_ptr(rsp->rda, cpu);
2641 if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) 2791 if (rdp->qlen != rdp->qlen_lazy)
2642 return 1; 2792 al = false;
2643 return 0; 2793 if (rdp->nxtlist)
2794 hc = true;
2795 }
2796 if (all_lazy)
2797 *all_lazy = al;
2798 return hc;
2644} 2799}
2645 2800
2646/* 2801/*
@@ -2747,10 +2902,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
2747 * corresponding CPU's preceding callbacks have been invoked. 2902 * corresponding CPU's preceding callbacks have been invoked.
2748 */ 2903 */
2749 for_each_possible_cpu(cpu) { 2904 for_each_possible_cpu(cpu) {
2750 if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) 2905 if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
2751 continue; 2906 continue;
2752 rdp = per_cpu_ptr(rsp->rda, cpu); 2907 rdp = per_cpu_ptr(rsp->rda, cpu);
2753 if (is_nocb_cpu(cpu)) { 2908 if (rcu_is_nocb_cpu(cpu)) {
2754 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 2909 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
2755 rsp->n_barrier_done); 2910 rsp->n_barrier_done);
2756 atomic_inc(&rsp->barrier_cpu_count); 2911 atomic_inc(&rsp->barrier_cpu_count);
@@ -2859,7 +3014,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2859 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3014 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2860 atomic_set(&rdp->dynticks->dynticks, 3015 atomic_set(&rdp->dynticks->dynticks,
2861 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 3016 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2862 rcu_prepare_for_idle_init(cpu);
2863 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 3017 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2864 3018
2865 /* Add CPU to rcu_node bitmasks. */ 3019 /* Add CPU to rcu_node bitmasks. */
@@ -2909,7 +3063,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2909 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 3063 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2910 struct rcu_node *rnp = rdp->mynode; 3064 struct rcu_node *rnp = rdp->mynode;
2911 struct rcu_state *rsp; 3065 struct rcu_state *rsp;
2912 int ret = NOTIFY_OK;
2913 3066
2914 trace_rcu_utilization("Start CPU hotplug"); 3067 trace_rcu_utilization("Start CPU hotplug");
2915 switch (action) { 3068 switch (action) {
@@ -2923,21 +3076,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2923 rcu_boost_kthread_setaffinity(rnp, -1); 3076 rcu_boost_kthread_setaffinity(rnp, -1);
2924 break; 3077 break;
2925 case CPU_DOWN_PREPARE: 3078 case CPU_DOWN_PREPARE:
2926 if (nocb_cpu_expendable(cpu)) 3079 rcu_boost_kthread_setaffinity(rnp, cpu);
2927 rcu_boost_kthread_setaffinity(rnp, cpu);
2928 else
2929 ret = NOTIFY_BAD;
2930 break; 3080 break;
2931 case CPU_DYING: 3081 case CPU_DYING:
2932 case CPU_DYING_FROZEN: 3082 case CPU_DYING_FROZEN:
2933 /*
2934 * The whole machine is "stopped" except this CPU, so we can
2935 * touch any data without introducing corruption. We send the
2936 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2937 */
2938 for_each_rcu_flavor(rsp) 3083 for_each_rcu_flavor(rsp)
2939 rcu_cleanup_dying_cpu(rsp); 3084 rcu_cleanup_dying_cpu(rsp);
2940 rcu_cleanup_after_idle(cpu);
2941 break; 3085 break;
2942 case CPU_DEAD: 3086 case CPU_DEAD:
2943 case CPU_DEAD_FROZEN: 3087 case CPU_DEAD_FROZEN:
@@ -2950,7 +3094,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2950 break; 3094 break;
2951 } 3095 }
2952 trace_rcu_utilization("End CPU hotplug"); 3096 trace_rcu_utilization("End CPU hotplug");
2953 return ret; 3097 return NOTIFY_OK;
2954} 3098}
2955 3099
2956/* 3100/*
@@ -3085,6 +3229,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3085 } 3229 }
3086 rnp->level = i; 3230 rnp->level = i;
3087 INIT_LIST_HEAD(&rnp->blkd_tasks); 3231 INIT_LIST_HEAD(&rnp->blkd_tasks);
3232 rcu_init_one_nocb(rnp);
3088 } 3233 }
3089 } 3234 }
3090 3235
@@ -3170,8 +3315,7 @@ void __init rcu_init(void)
3170 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 3315 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3171 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3316 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3172 __rcu_init_preempt(); 3317 __rcu_init_preempt();
3173 rcu_init_nocb(); 3318 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3174 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3175 3319
3176 /* 3320 /*
3177 * We don't need protection against CPU-hotplug here because 3321 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index c896b5045d9d..da77a8f57ff9 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -88,18 +88,13 @@ struct rcu_dynticks {
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
89 atomic_t dynticks; /* Even value for idle, else odd. */ 89 atomic_t dynticks; /* Even value for idle, else odd. */
90#ifdef CONFIG_RCU_FAST_NO_HZ 90#ifdef CONFIG_RCU_FAST_NO_HZ
91 int dyntick_drain; /* Prepare-for-idle state variable. */ 91 bool all_lazy; /* Are all CPU's CBs lazy? */
92 unsigned long dyntick_holdoff;
93 /* No retries for the jiffy of failure. */
94 struct timer_list idle_gp_timer;
95 /* Wake up CPU sleeping with callbacks. */
96 unsigned long idle_gp_timer_expires;
97 /* When to wake up CPU (for repost). */
98 bool idle_first_pass; /* First pass of attempt to go idle? */
99 unsigned long nonlazy_posted; 92 unsigned long nonlazy_posted;
100 /* # times non-lazy CBs posted to CPU. */ 93 /* # times non-lazy CBs posted to CPU. */
101 unsigned long nonlazy_posted_snap; 94 unsigned long nonlazy_posted_snap;
102 /* idle-period nonlazy_posted snapshot. */ 95 /* idle-period nonlazy_posted snapshot. */
96 unsigned long last_accelerate;
97 /* Last jiffy CBs were accelerated. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 98 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 99#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105}; 100};
@@ -134,9 +129,6 @@ struct rcu_node {
134 /* elements that need to drain to allow the */ 129 /* elements that need to drain to allow the */
135 /* current expedited grace period to */ 130 /* current expedited grace period to */
136 /* complete (only for TREE_PREEMPT_RCU). */ 131 /* complete (only for TREE_PREEMPT_RCU). */
137 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
138 /* Since this has meaning only for leaf */
139 /* rcu_node structures, 32 bits suffices. */
140 unsigned long qsmaskinit; 132 unsigned long qsmaskinit;
141 /* Per-GP initial value for qsmask & expmask. */ 133 /* Per-GP initial value for qsmask & expmask. */
142 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 134 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -196,6 +188,12 @@ struct rcu_node {
196 /* Refused to boost: not sure why, though. */ 188 /* Refused to boost: not sure why, though. */
197 /* This can happen due to race conditions. */ 189 /* This can happen due to race conditions. */
198#endif /* #ifdef CONFIG_RCU_BOOST */ 190#endif /* #ifdef CONFIG_RCU_BOOST */
191#ifdef CONFIG_RCU_NOCB_CPU
192 wait_queue_head_t nocb_gp_wq[2];
193 /* Place for rcu_nocb_kthread() to wait GP. */
194#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
195 int need_future_gp[2];
196 /* Counts of upcoming no-CB GP requests. */
199 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; 197 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
200} ____cacheline_internodealigned_in_smp; 198} ____cacheline_internodealigned_in_smp;
201 199
@@ -328,6 +326,11 @@ struct rcu_data {
328 struct task_struct *nocb_kthread; 326 struct task_struct *nocb_kthread;
329#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 327#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
330 328
329 /* 8) RCU CPU stall data. */
330#ifdef CONFIG_RCU_CPU_STALL_INFO
331 unsigned int softirq_snap; /* Snapshot of softirq activity. */
332#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
333
331 int cpu; 334 int cpu;
332 struct rcu_state *rsp; 335 struct rcu_state *rsp;
333}; 336};
@@ -375,12 +378,6 @@ struct rcu_state {
375 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 378 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
376 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 379 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
377 void (*func)(struct rcu_head *head)); 380 void (*func)(struct rcu_head *head));
378#ifdef CONFIG_RCU_NOCB_CPU
379 void (*call_remote)(struct rcu_head *head,
380 void (*func)(struct rcu_head *head));
381 /* call_rcu() flavor, but for */
382 /* placing on remote CPU. */
383#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
384 381
385 /* The following fields are guarded by the root rcu_node's lock. */ 382 /* The following fields are guarded by the root rcu_node's lock. */
386 383
@@ -443,6 +440,7 @@ struct rcu_state {
443 unsigned long gp_max; /* Maximum GP duration in */ 440 unsigned long gp_max; /* Maximum GP duration in */
444 /* jiffies. */ 441 /* jiffies. */
445 char *name; /* Name of structure. */ 442 char *name; /* Name of structure. */
443 char abbr; /* Abbreviated name. */
446 struct list_head flavors; /* List of RCU flavors. */ 444 struct list_head flavors; /* List of RCU flavors. */
447}; 445};
448 446
@@ -520,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
520 struct rcu_node *rnp); 518 struct rcu_node *rnp);
521#endif /* #ifdef CONFIG_RCU_BOOST */ 519#endif /* #ifdef CONFIG_RCU_BOOST */
522static void __cpuinit rcu_prepare_kthreads(int cpu); 520static void __cpuinit rcu_prepare_kthreads(int cpu);
523static void rcu_prepare_for_idle_init(int cpu);
524static void rcu_cleanup_after_idle(int cpu); 521static void rcu_cleanup_after_idle(int cpu);
525static void rcu_prepare_for_idle(int cpu); 522static void rcu_prepare_for_idle(int cpu);
526static void rcu_idle_count_callbacks_posted(void); 523static void rcu_idle_count_callbacks_posted(void);
@@ -529,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
529static void print_cpu_stall_info_end(void); 526static void print_cpu_stall_info_end(void);
530static void zero_cpu_stall_ticks(struct rcu_data *rdp); 527static void zero_cpu_stall_ticks(struct rcu_data *rdp);
531static void increment_cpu_stall_ticks(void); 528static void increment_cpu_stall_ticks(void);
532static bool is_nocb_cpu(int cpu); 529static int rcu_nocb_needs_gp(struct rcu_state *rsp);
530static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
531static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
532static void rcu_init_one_nocb(struct rcu_node *rnp);
533static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 533static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
534 bool lazy); 534 bool lazy);
535static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 535static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
536 struct rcu_data *rdp); 536 struct rcu_data *rdp);
537static bool nocb_cpu_expendable(int cpu);
538static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 537static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
539static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 538static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
540static void init_nocb_callback_list(struct rcu_data *rdp); 539static void rcu_kick_nohz_cpu(int cpu);
541static void __init rcu_init_nocb(void); 540static bool init_nocb_callback_list(struct rcu_data *rdp);
542 541
543#endif /* #ifndef RCU_TREE_NONCORE */ 542#endif /* #ifndef RCU_TREE_NONCORE */
544 543
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c1cc7e17ff9d..170814dc418f 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,6 +28,7 @@
28#include <linux/gfp.h> 28#include <linux/gfp.h>
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include <linux/tick.h>
31 32
32#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
33 34
@@ -85,11 +86,21 @@ static void __init rcu_bootup_announce_oddness(void)
85 if (nr_cpu_ids != NR_CPUS) 86 if (nr_cpu_ids != NR_CPUS)
86 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 87 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
87#ifdef CONFIG_RCU_NOCB_CPU 88#ifdef CONFIG_RCU_NOCB_CPU
89#ifndef CONFIG_RCU_NOCB_CPU_NONE
90 if (!have_rcu_nocb_mask) {
91 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
92 have_rcu_nocb_mask = true;
93 }
94#ifdef CONFIG_RCU_NOCB_CPU_ZERO
95 pr_info("\tExperimental no-CBs CPU 0\n");
96 cpumask_set_cpu(0, rcu_nocb_mask);
97#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
98#ifdef CONFIG_RCU_NOCB_CPU_ALL
99 pr_info("\tExperimental no-CBs for all CPUs\n");
100 cpumask_setall(rcu_nocb_mask);
101#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
102#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
88 if (have_rcu_nocb_mask) { 103 if (have_rcu_nocb_mask) {
89 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
90 cpumask_clear_cpu(0, rcu_nocb_mask);
91 pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
92 }
93 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 104 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
94 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); 105 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
95 if (rcu_nocb_poll) 106 if (rcu_nocb_poll)
@@ -101,7 +112,7 @@ static void __init rcu_bootup_announce_oddness(void)
101#ifdef CONFIG_TREE_PREEMPT_RCU 112#ifdef CONFIG_TREE_PREEMPT_RCU
102 113
103struct rcu_state rcu_preempt_state = 114struct rcu_state rcu_preempt_state =
104 RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); 115 RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
105DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 116DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
106static struct rcu_state *rcu_state = &rcu_preempt_state; 117static struct rcu_state *rcu_state = &rcu_preempt_state;
107 118
@@ -1533,14 +1544,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1533int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1544int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1534{ 1545{
1535 *delta_jiffies = ULONG_MAX; 1546 *delta_jiffies = ULONG_MAX;
1536 return rcu_cpu_has_callbacks(cpu); 1547 return rcu_cpu_has_callbacks(cpu, NULL);
1537}
1538
1539/*
1540 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1541 */
1542static void rcu_prepare_for_idle_init(int cpu)
1543{
1544} 1548}
1545 1549
1546/* 1550/*
@@ -1577,16 +1581,6 @@ static void rcu_idle_count_callbacks_posted(void)
1577 * 1581 *
1578 * The following three preprocessor symbols control this state machine: 1582 *
1579 * 1583 *
1580 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
1581 * to satisfy RCU. Beyond this point, it is better to incur a periodic
1582 * scheduling-clock interrupt than to loop through the state machine
1583 * at full power.
1584 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
1585 * optional if RCU does not need anything immediately from this
1586 * CPU, even if this CPU still has RCU callbacks queued. The first
1587 * times through the state machine are mandatory: we need to give
1588 * the state machine a chance to communicate a quiescent state
1589 * to the RCU core.
1590 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted 1584 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1591 * to sleep in dyntick-idle mode with RCU callbacks pending. This 1585 * to sleep in dyntick-idle mode with RCU callbacks pending. This
1592 * is sized to be roughly one RCU grace period. Those energy-efficiency 1586 * is sized to be roughly one RCU grace period. Those energy-efficiency
@@ -1602,186 +1596,108 @@ static void rcu_idle_count_callbacks_posted(void)
1602 * adjustment, they can be converted into kernel config parameters, though 1596 * adjustment, they can be converted into kernel config parameters, though
1603 * making the state machine smarter might be a better option. 1597 * making the state machine smarter might be a better option.
1604 */ 1598 */
1605#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
1606#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
1607#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ 1599#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1608#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1600#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1609 1601
1610extern int tick_nohz_enabled; 1602static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
1611 1603module_param(rcu_idle_gp_delay, int, 0644);
1612/* 1604static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
1613 * Does the specified flavor of RCU have non-lazy callbacks pending on 1605module_param(rcu_idle_lazy_gp_delay, int, 0644);
1614 * the specified CPU? Both RCU flavor and CPU are specified by the
1615 * rcu_data structure.
1616 */
1617static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
1618{
1619 return rdp->qlen != rdp->qlen_lazy;
1620}
1621 1606
1622#ifdef CONFIG_TREE_PREEMPT_RCU 1607extern int tick_nohz_enabled;
1623 1608
1624/* 1609/*
1625 * Are there non-lazy RCU-preempt callbacks? (There cannot be if there 1610 * Try to advance callbacks for all flavors of RCU on the current CPU.
1626 * is no RCU-preempt in the kernel.) 1611 * Afterwards, if there are any callbacks ready for immediate invocation,
1612 * return true.
1627 */ 1613 */
1628static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1614static bool rcu_try_advance_all_cbs(void)
1629{ 1615{
1630 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 1616 bool cbs_ready = false;
1631 1617 struct rcu_data *rdp;
1632 return __rcu_cpu_has_nonlazy_callbacks(rdp); 1618 struct rcu_node *rnp;
1633} 1619 struct rcu_state *rsp;
1634
1635#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1636 1620
1637static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1621 for_each_rcu_flavor(rsp) {
1638{ 1622 rdp = this_cpu_ptr(rsp->rda);
1639 return 0; 1623 rnp = rdp->mynode;
1640}
1641 1624
1642#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1625 /*
1626 * Don't bother checking unless a grace period has
1627 * completed since we last checked and there are
1628 * callbacks not yet ready to invoke.
1629 */
1630 if (rdp->completed != rnp->completed &&
1631 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1632 rcu_process_gp_end(rsp, rdp);
1643 1633
1644/* 1634 if (cpu_has_callbacks_ready_to_invoke(rdp))
1645 * Does any flavor of RCU have non-lazy callbacks on the specified CPU? 1635 cbs_ready = true;
1646 */ 1636 }
1647static bool rcu_cpu_has_nonlazy_callbacks(int cpu) 1637 return cbs_ready;
1648{
1649 return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
1650 __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
1651 rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
1652} 1638}
1653 1639
1654/* 1640/*
1655 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1641 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
1656 * callbacks on this CPU, (2) this CPU has not yet attempted to enter 1642 * to invoke. If the CPU has callbacks, try to advance them. Tell the
1657 * dyntick-idle mode, or (3) this CPU is in the process of attempting to 1643 * caller to set the timeout based on whether or not there are non-lazy
1658 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed 1644 * callbacks.
1659 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1660 * it is better to incur scheduling-clock interrupts than to spin
1661 * continuously for the same time duration!
1662 * 1645 *
1663 * The delta_jiffies argument is used to store the time when RCU is 1646 * The caller must have disabled interrupts.
1664 * going to need the CPU again if it still has callbacks. The reason
1665 * for this is that rcu_prepare_for_idle() might need to post a timer,
1666 * but if so, it will do so after tick_nohz_stop_sched_tick() has set
1667 * the wakeup time for this CPU. This means that RCU's timer can be
1668 * delayed until the wakeup time, which defeats the purpose of posting
1669 * a timer.
1670 */ 1647 */
1671int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1648int rcu_needs_cpu(int cpu, unsigned long *dj)
1672{ 1649{
1673 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1650 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1674 1651
1675 /* Flag a new idle sojourn to the idle-entry state machine. */ 1652 /* Snapshot to detect later posting of non-lazy callback. */
1676 rdtp->idle_first_pass = 1; 1653 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1654
1677 /* If no callbacks, RCU doesn't need the CPU. */ 1655 /* If no callbacks, RCU doesn't need the CPU. */
1678 if (!rcu_cpu_has_callbacks(cpu)) { 1656 if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
1679 *delta_jiffies = ULONG_MAX; 1657 *dj = ULONG_MAX;
1680 return 0; 1658 return 0;
1681 } 1659 }
1682 if (rdtp->dyntick_holdoff == jiffies) { 1660
1683 /* RCU recently tried and failed, so don't try again. */ 1661 /* Attempt to advance callbacks. */
1684 *delta_jiffies = 1; 1662 if (rcu_try_advance_all_cbs()) {
1663 /* Some ready to invoke, so initiate later invocation. */
1664 invoke_rcu_core();
1685 return 1; 1665 return 1;
1686 } 1666 }
1687 /* Set up for the possibility that RCU will post a timer. */ 1667 rdtp->last_accelerate = jiffies;
1688 if (rcu_cpu_has_nonlazy_callbacks(cpu)) { 1668
1689 *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, 1669 /* Request timer delay depending on laziness, and round. */
1690 RCU_IDLE_GP_DELAY) - jiffies; 1670 if (rdtp->all_lazy) {
1671 *dj = round_up(rcu_idle_gp_delay + jiffies,
1672 rcu_idle_gp_delay) - jiffies;
1691 } else { 1673 } else {
1692 *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; 1674 *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
1693 *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
1694 } 1675 }
1695 return 0; 1676 return 0;
1696} 1677}
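Editor's note: the rewritten rcu_needs_cpu() returns the wakeup delay through *dj by rounding an absolute jiffies target and converting back to a relative delay; one branch rounds up to a multiple of the short grace-period delay, the other pushes the long lazy delay out to a whole-second boundary via round_jiffies(). A small userspace sketch of the two rounding styles; the constants and the round_whole_second() stand-in are illustrative assumptions, not the kernel's tunables:

#include <stdio.h>

#define HZ 1000UL   /* illustrative tick rate, 1 jiffy = 1 ms */

/* Generic round-up-to-a-multiple; the kernel's round_up() requires a
 * power-of-two multiple, this form does not. */
static unsigned long round_up_mult(unsigned long x, unsigned long mult)
{
	return ((x + mult - 1) / mult) * mult;
}

/* Stand-in for round_jiffies(): push the target out to a whole second. */
static unsigned long round_whole_second(unsigned long j)
{
	return round_up_mult(j, HZ);
}

int main(void)
{
	unsigned long jiffies = 12345;          /* pretend current time       */
	unsigned long gp_delay = 4;             /* short, GP-length delay     */
	unsigned long lazy_delay = 6 * HZ;      /* long delay for lazy CBs    */

	/* Delta until the next multiple of gp_delay at least gp_delay away. */
	unsigned long dj_short = round_up_mult(jiffies + gp_delay, gp_delay) - jiffies;
	/* Delta until a whole-second boundary at least lazy_delay away. */
	unsigned long dj_long = round_whole_second(jiffies + lazy_delay) - jiffies;

	printf("short wakeup in %lu jiffies, long wakeup in %lu jiffies\n",
	       dj_short, dj_long);
	return 0;
}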
1697 1678
1698/* 1679/*
1699 * Handler for smp_call_function_single(). The only point of this 1680 * Prepare a CPU for idle from an RCU perspective. The first major task
1700 * handler is to wake the CPU up, so the handler does only tracing. 1681 * is to sense whether nohz mode has been enabled or disabled via sysfs.
1701 */ 1682 * The second major task is to check to see if a non-lazy callback has
1702void rcu_idle_demigrate(void *unused) 1683 * arrived at a CPU that previously had only lazy callbacks. The third
1703{ 1684 * major task is to accelerate (that is, assign grace-period numbers to)
1704 trace_rcu_prep_idle("Demigrate"); 1685 * any recently arrived callbacks.
1705}
1706
1707/*
1708 * Timer handler used to force CPU to start pushing its remaining RCU
1709 * callbacks in the case where it entered dyntick-idle mode with callbacks
1710 * pending. The handler doesn't really need to do anything because the
1711 * real work is done upon re-entry to idle, or by the next scheduling-clock
1712 * interrupt should idle not be re-entered.
1713 *
1714 * One special case: the timer gets migrated without awakening the CPU
1715 * on which the timer was scheduled. In this case, we must wake up
1716 * that CPU. We do so with smp_call_function_single().
1717 */
1718static void rcu_idle_gp_timer_func(unsigned long cpu_in)
1719{
1720 int cpu = (int)cpu_in;
1721
1722 trace_rcu_prep_idle("Timer");
1723 if (cpu != smp_processor_id())
1724 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
1725 else
1726 WARN_ON_ONCE(1); /* Getting here can hang the system... */
1727}
1728
1729/*
1730 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
1731 */
1732static void rcu_prepare_for_idle_init(int cpu)
1733{
1734 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1735
1736 rdtp->dyntick_holdoff = jiffies - 1;
1737 setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
1738 rdtp->idle_gp_timer_expires = jiffies - 1;
1739 rdtp->idle_first_pass = 1;
1740}
1741
1742/*
1743 * Clean up for exit from idle. Because we are exiting from idle, there
1744 * is no longer any point to ->idle_gp_timer, so cancel it. This will
1745 * do nothing if this timer is not active, so just cancel it unconditionally.
1746 */
1747static void rcu_cleanup_after_idle(int cpu)
1748{
1749 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1750
1751 del_timer(&rdtp->idle_gp_timer);
1752 trace_rcu_prep_idle("Cleanup after idle");
1753 rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
1754}
1755
1756/*
1757 * Check to see if any RCU-related work can be done by the current CPU,
1758 * and if so, schedule a softirq to get it done. This function is part
1759 * of the RCU implementation; it is -not- an exported member of the RCU API.
1760 *
1761 * The idea is for the current CPU to clear out all work required by the
1762 * RCU core for the current grace period, so that this CPU can be permitted
1763 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1764 * at the end of the grace period by whatever CPU ends the grace period.
1765 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
1766 * number of wakeups by a modest integer factor.
1767 *
1768 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1769 * disabled, we do one pass of force_quiescent_state(), then do an
1770 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1771 * later. The ->dyntick_drain field controls the sequencing.
1772 * 1686 *
1773 * The caller must have disabled interrupts. 1687 * The caller must have disabled interrupts.
1774 */ 1688 */
1775static void rcu_prepare_for_idle(int cpu) 1689static void rcu_prepare_for_idle(int cpu)
1776{ 1690{
1777 struct timer_list *tp; 1691 struct rcu_data *rdp;
1778 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1692 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1693 struct rcu_node *rnp;
1694 struct rcu_state *rsp;
1779 int tne; 1695 int tne;
1780 1696
1781 /* Handle nohz enablement switches conservatively. */ 1697 /* Handle nohz enablement switches conservatively. */
1782 tne = ACCESS_ONCE(tick_nohz_enabled); 1698 tne = ACCESS_ONCE(tick_nohz_enabled);
1783 if (tne != rdtp->tick_nohz_enabled_snap) { 1699 if (tne != rdtp->tick_nohz_enabled_snap) {
1784 if (rcu_cpu_has_callbacks(cpu)) 1700 if (rcu_cpu_has_callbacks(cpu, NULL))
1785 invoke_rcu_core(); /* force nohz to see update. */ 1701 invoke_rcu_core(); /* force nohz to see update. */
1786 rdtp->tick_nohz_enabled_snap = tne; 1702 rdtp->tick_nohz_enabled_snap = tne;
1787 return; 1703 return;
@@ -1789,125 +1705,56 @@ static void rcu_prepare_for_idle(int cpu)
1789 if (!tne) 1705 if (!tne)
1790 return; 1706 return;
1791 1707
1792 /* Adaptive-tick mode, where usermode execution is idle to RCU. */ 1708 /* If this is a no-CBs CPU, no callbacks, just return. */
1793 if (!is_idle_task(current)) { 1709 if (rcu_is_nocb_cpu(cpu))
1794 rdtp->dyntick_holdoff = jiffies - 1;
1795 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1796 trace_rcu_prep_idle("User dyntick with callbacks");
1797 rdtp->idle_gp_timer_expires =
1798 round_up(jiffies + RCU_IDLE_GP_DELAY,
1799 RCU_IDLE_GP_DELAY);
1800 } else if (rcu_cpu_has_callbacks(cpu)) {
1801 rdtp->idle_gp_timer_expires =
1802 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1803 trace_rcu_prep_idle("User dyntick with lazy callbacks");
1804 } else {
1805 return;
1806 }
1807 tp = &rdtp->idle_gp_timer;
1808 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1809 return; 1710 return;
1810 }
1811 1711
1812 /* 1712 /*
1813 * If this is an idle re-entry, for example, due to use of 1713 * If a non-lazy callback arrived at a CPU having only lazy
1814 * RCU_NONIDLE() or the new idle-loop tracing API within the idle 1714 * callbacks, invoke RCU core for the side-effect of recalculating
1815 * loop, then don't take any state-machine actions, unless the 1715 * idle duration on re-entry to idle.
1816 * momentary exit from idle queued additional non-lazy callbacks.
1817 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
1818 * pending.
1819 */ 1716 */
1820 if (!rdtp->idle_first_pass && 1717 if (rdtp->all_lazy &&
1821 (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { 1718 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1822 if (rcu_cpu_has_callbacks(cpu)) { 1719 invoke_rcu_core();
1823 tp = &rdtp->idle_gp_timer;
1824 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1825 }
1826 return; 1720 return;
1827 } 1721 }
1828 rdtp->idle_first_pass = 0;
1829 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
1830 1722
1831 /* 1723 /*
1832 * If there are no callbacks on this CPU, enter dyntick-idle mode. 1724 * If we have not yet accelerated this jiffy, accelerate all
1833 * Also reset state to avoid prejudicing later attempts. 1725 * callbacks on this CPU.
1834 */ 1726 */
1835 if (!rcu_cpu_has_callbacks(cpu)) { 1727 if (rdtp->last_accelerate == jiffies)
1836 rdtp->dyntick_holdoff = jiffies - 1;
1837 rdtp->dyntick_drain = 0;
1838 trace_rcu_prep_idle("No callbacks");
1839 return; 1728 return;
1729 rdtp->last_accelerate = jiffies;
1730 for_each_rcu_flavor(rsp) {
1731 rdp = per_cpu_ptr(rsp->rda, cpu);
1732 if (!*rdp->nxttail[RCU_DONE_TAIL])
1733 continue;
1734 rnp = rdp->mynode;
1735 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1736 rcu_accelerate_cbs(rsp, rnp, rdp);
1737 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1840 } 1738 }
1739}
1841 1740
1842 /* 1741/*
1843 * If in holdoff mode, just return. We will presumably have 1742 * Clean up for exit from idle. Attempt to advance callbacks based on
1844 * refrained from disabling the scheduling-clock tick. 1743 * any grace periods that elapsed while the CPU was idle, and if any
1845 */ 1744 * callbacks are now ready to invoke, initiate invocation.
1846 if (rdtp->dyntick_holdoff == jiffies) { 1745 */
1847 trace_rcu_prep_idle("In holdoff"); 1746static void rcu_cleanup_after_idle(int cpu)
1848 return; 1747{
1849 } 1748 struct rcu_data *rdp;
1749 struct rcu_state *rsp;
1850 1750
1851 /* Check and update the ->dyntick_drain sequencing. */ 1751 if (rcu_is_nocb_cpu(cpu))
1852 if (rdtp->dyntick_drain <= 0) {
1853 /* First time through, initialize the counter. */
1854 rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
1855 } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
1856 !rcu_pending(cpu) &&
1857 !local_softirq_pending()) {
1858 /* Can we go dyntick-idle despite still having callbacks? */
1859 rdtp->dyntick_drain = 0;
1860 rdtp->dyntick_holdoff = jiffies;
1861 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1862 trace_rcu_prep_idle("Dyntick with callbacks");
1863 rdtp->idle_gp_timer_expires =
1864 round_up(jiffies + RCU_IDLE_GP_DELAY,
1865 RCU_IDLE_GP_DELAY);
1866 } else {
1867 rdtp->idle_gp_timer_expires =
1868 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1869 trace_rcu_prep_idle("Dyntick with lazy callbacks");
1870 }
1871 tp = &rdtp->idle_gp_timer;
1872 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1873 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1874 return; /* Nothing more to do immediately. */
1875 } else if (--(rdtp->dyntick_drain) <= 0) {
1876 /* We have hit the limit, so time to give up. */
1877 rdtp->dyntick_holdoff = jiffies;
1878 trace_rcu_prep_idle("Begin holdoff");
1879 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
1880 return; 1752 return;
1881 } 1753 rcu_try_advance_all_cbs();
1882 1754 for_each_rcu_flavor(rsp) {
1883 /* 1755 rdp = per_cpu_ptr(rsp->rda, cpu);
1884 * Do one step of pushing the remaining RCU callbacks through 1756 if (cpu_has_callbacks_ready_to_invoke(rdp))
1885 * the RCU core state machine. 1757 invoke_rcu_core();
1886 */
1887#ifdef CONFIG_TREE_PREEMPT_RCU
1888 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
1889 rcu_preempt_qs(cpu);
1890 force_quiescent_state(&rcu_preempt_state);
1891 }
1892#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1893 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1894 rcu_sched_qs(cpu);
1895 force_quiescent_state(&rcu_sched_state);
1896 }
1897 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1898 rcu_bh_qs(cpu);
1899 force_quiescent_state(&rcu_bh_state);
1900 }
1901
1902 /*
1903 * If RCU callbacks are still pending, RCU still needs this CPU.
1904 * So try forcing the callbacks through the grace period.
1905 */
1906 if (rcu_cpu_has_callbacks(cpu)) {
1907 trace_rcu_prep_idle("More callbacks");
1908 invoke_rcu_core();
1909 } else {
1910 trace_rcu_prep_idle("Callbacks drained");
1911 } 1758 }
1912} 1759}
1913 1760
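Editor's note: the rewritten rcu_prepare_for_idle() above replaces the old holdoff/drain state machine with three cheap checks: kick the RCU core if a non-lazy callback arrived on a CPU that previously had only lazy ones, skip the work if the acceleration pass already ran this jiffy, and otherwise accelerate callbacks for each flavor. A minimal userspace model of that decision logic (field names follow the patch, the struct itself is illustrative):

#include <stdbool.h>
#include <stdio.h>

/* Toy per-CPU state mirroring the fields used by the new
 * rcu_prepare_for_idle(); this is a model, not kernel code. */
struct idle_state {
	bool all_lazy;                     /* only lazy CBs when idle began  */
	unsigned long nonlazy_posted;      /* running count of non-lazy CBs  */
	unsigned long nonlazy_posted_snap; /* count sampled at idle entry    */
	unsigned long last_accelerate;     /* jiffy of last acceleration     */
};

/* Returns true when the (more expensive) acceleration pass should run. */
static bool prepare_for_idle(struct idle_state *s, unsigned long jiffies,
			     bool *kick_core)
{
	*kick_core = false;

	/* Non-lazy work arrived on a CPU that thought it was all-lazy:
	 * ask the core to recompute the idle duration. */
	if (s->all_lazy && s->nonlazy_posted != s->nonlazy_posted_snap) {
		*kick_core = true;
		return false;
	}

	/* Throttle: accelerate callbacks at most once per jiffy. */
	if (s->last_accelerate == jiffies)
		return false;
	s->last_accelerate = jiffies;
	return true;
}

int main(void)
{
	struct idle_state s = { .all_lazy = true };
	bool kick;

	printf("accelerate=%d\n", prepare_for_idle(&s, 100, &kick)); /* 1 */
	printf("accelerate=%d\n", prepare_for_idle(&s, 100, &kick)); /* 0, same jiffy */
	s.nonlazy_posted++;                           /* a non-lazy callback arrives */
	prepare_for_idle(&s, 101, &kick);
	printf("kick core=%d\n", kick);                              /* 1 */
	return 0;
}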
@@ -2015,16 +1862,13 @@ early_initcall(rcu_register_oom_notifier);
2015static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 1862static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2016{ 1863{
2017 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1864 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2018 struct timer_list *tltp = &rdtp->idle_gp_timer; 1865 unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
2019 char c;
2020 1866
2021 c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; 1867 sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
2022 if (timer_pending(tltp)) 1868 rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
2023 sprintf(cp, "drain=%d %c timer=%lu", 1869 ulong2long(nlpd),
2024 rdtp->dyntick_drain, c, tltp->expires - jiffies); 1870 rdtp->all_lazy ? 'L' : '.',
2025 else 1871 rdtp->tick_nohz_enabled_snap ? '.' : 'D');
2026 sprintf(cp, "drain=%d %c timer not pending",
2027 rdtp->dyntick_drain, c);
2028} 1872}
2029 1873
2030#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 1874#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -2070,10 +1914,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2070 ticks_value = rsp->gpnum - rdp->gpnum; 1914 ticks_value = rsp->gpnum - rdp->gpnum;
2071 } 1915 }
2072 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1916 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2073 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", 1917 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
2074 cpu, ticks_value, ticks_title, 1918 cpu, ticks_value, ticks_title,
2075 atomic_read(&rdtp->dynticks) & 0xfff, 1919 atomic_read(&rdtp->dynticks) & 0xfff,
2076 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1920 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1921 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
2077 fast_no_hz); 1922 fast_no_hz);
2078} 1923}
2079 1924
@@ -2087,6 +1932,7 @@ static void print_cpu_stall_info_end(void)
2087static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1932static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2088{ 1933{
2089 rdp->ticks_this_gp = 0; 1934 rdp->ticks_this_gp = 0;
1935 rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
2090} 1936}
2091 1937
2092/* Increment ->ticks_this_gp for all flavors of RCU. */ 1938/* Increment ->ticks_this_gp for all flavors of RCU. */
@@ -2165,8 +2011,49 @@ static int __init parse_rcu_nocb_poll(char *arg)
2165} 2011}
2166early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 2012early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2167 2013
2014/*
2015 * Do any no-CBs CPUs need another grace period?
2016 *
2017 * Interrupts must be disabled. If the caller does not hold the root
2018 * rnp_node structure's ->lock, the results are advisory only.
2019 */
2020static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2021{
2022 struct rcu_node *rnp = rcu_get_root(rsp);
2023
2024 return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
2025}
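Editor's note: rcu_nocb_needs_gp(), rcu_nocb_gp_cleanup() and rcu_nocb_gp_set() all index two-element arrays by grace-period parity: a request for "the grace period after the one numbered completed" lives in slot (completed + 1) & 1, and finishing grace period N drains slot N & 1. A toy model of that two-slot bookkeeping (names are illustrative):

#include <stdio.h>

struct gp_node {
	unsigned long completed;   /* number of the last finished GP */
	int need_future_gp[2];     /* pending requests, by GP parity */
};

static void request_future_gp(struct gp_node *n)
{
	n->need_future_gp[(n->completed + 1) & 0x1]++;
}

static void complete_gp(struct gp_node *n)
{
	n->completed++;
	/* Everyone waiting on this parity slot would be woken here. */
	printf("GP %lu done, waking %d waiter(s)\n",
	       n->completed, n->need_future_gp[n->completed & 0x1]);
	n->need_future_gp[n->completed & 0x1] = 0;
}

int main(void)
{
	struct gp_node n = { 0, { 0, 0 } };

	request_future_gp(&n);   /* wants GP 1, lands in slot 1 */
	request_future_gp(&n);   /* wants GP 1, lands in slot 1 */
	complete_gp(&n);         /* GP 1 done, drains slot 1    */
	request_future_gp(&n);   /* wants GP 2, lands in slot 0 */
	complete_gp(&n);         /* GP 2 done, drains slot 0    */
	return 0;
}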
2026
2027/*
2028 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
2029 * grace period.
2030 */
2031static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2032{
2033 wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
2034}
2035
2036/*
2037 * Set the root rcu_node structure's ->need_future_gp field
2038 * based on the sum of those of all rcu_node structures. This does
2039 * double-count the root rcu_node structure's requests, but this
2040 * is necessary to handle the possibility of a rcu_nocb_kthread()
2041 * having awakened during the time that the rcu_node structures
2042 * were being updated for the end of the previous grace period.
2043 */
2044static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2045{
2046 rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
2047}
2048
2049static void rcu_init_one_nocb(struct rcu_node *rnp)
2050{
2051 init_waitqueue_head(&rnp->nocb_gp_wq[0]);
2052 init_waitqueue_head(&rnp->nocb_gp_wq[1]);
2053}
2054
2168/* Is the specified CPU a no-CBs CPU? */ 2055
2169static bool is_nocb_cpu(int cpu) 2056bool rcu_is_nocb_cpu(int cpu)
2170{ 2057{
2171 if (have_rcu_nocb_mask) 2058 if (have_rcu_nocb_mask)
2172 return cpumask_test_cpu(cpu, rcu_nocb_mask); 2059 return cpumask_test_cpu(cpu, rcu_nocb_mask);
@@ -2224,9 +2111,16 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2224 bool lazy) 2111 bool lazy)
2225{ 2112{
2226 2113
2227 if (!is_nocb_cpu(rdp->cpu)) 2114 if (!rcu_is_nocb_cpu(rdp->cpu))
2228 return 0; 2115 return 0;
2229 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); 2116 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2117 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2118 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2119 (unsigned long)rhp->func,
2120 rdp->qlen_lazy, rdp->qlen);
2121 else
2122 trace_rcu_callback(rdp->rsp->name, rhp,
2123 rdp->qlen_lazy, rdp->qlen);
2230 return 1; 2124 return 1;
2231} 2125}
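Editor's note: the added tracing in __call_rcu_nocb() distinguishes kfree_rcu() callbacks by applying __is_kfree_rcu_offset() to the callback pointer. The underlying trick is that kfree_rcu() stores the byte offset of the rcu_head inside the enclosing object where a function pointer would normally go, so a "function pointer" with a small value is really an offset. A userspace model of that encoding; the 4096 bound and the struct names are assumptions for illustration, not the kernel's definitions:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define KFREE_OFFSET_MAX 4096UL   /* assumed bound: below this, it is an offset */

struct rcu_head {
	void (*func)(struct rcu_head *);
};

struct widget {
	int payload;
	struct rcu_head rh;
};

static int is_kfree_offset(unsigned long v)
{
	return v < KFREE_OFFSET_MAX;
}

static void invoke(struct rcu_head *rhp)
{
	unsigned long v = (unsigned long)rhp->func;

	if (is_kfree_offset(v)) {
		/* Recover the start of the enclosing object and free it. */
		free((char *)rhp - v);
		printf("freed container (offset %lu)\n", v);
	} else {
		rhp->func(rhp);
	}
}

int main(void)
{
	struct widget *w = malloc(sizeof(*w));

	/* What kfree_rcu(w, rh) conceptually records: */
	w->rh.func = (void (*)(struct rcu_head *))offsetof(struct widget, rh);
	invoke(&w->rh);
	return 0;
}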
2232 2126
@@ -2241,7 +2135,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2241 long qll = rsp->qlen_lazy; 2135 long qll = rsp->qlen_lazy;
2242 2136
2243 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ 2137 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2244 if (!is_nocb_cpu(smp_processor_id())) 2138 if (!rcu_is_nocb_cpu(smp_processor_id()))
2245 return 0; 2139 return 0;
2246 rsp->qlen = 0; 2140 rsp->qlen = 0;
2247 rsp->qlen_lazy = 0; 2141 rsp->qlen_lazy = 0;
@@ -2265,95 +2159,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2265} 2159}
2266 2160
2267/* 2161/*
2268 * There must be at least one non-no-CBs CPU in operation at any given 2162 * If necessary, kick off a new grace period, and either way wait
2269 * time, because no-CBs CPUs are not capable of initiating grace periods 2163 * for a subsequent grace period to complete.
2270 * independently. This function therefore complains if the specified
2271 * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2272 * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2273 * but you have to have a base case!)
2274 */ 2164 */
2275static bool nocb_cpu_expendable(int cpu) 2165static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2276{ 2166{
2277 cpumask_var_t non_nocb_cpus; 2167 unsigned long c;
2278 int ret; 2168 bool d;
2169 unsigned long flags;
2170 struct rcu_node *rnp = rdp->mynode;
2171
2172 raw_spin_lock_irqsave(&rnp->lock, flags);
2173 c = rcu_start_future_gp(rnp, rdp);
2174 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2279 2175
2280 /* 2176 /*
2281 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, 2177 * Wait for the grace period. Do so interruptibly to avoid messing
2282 * then offlining this CPU is harmless. Let it happen. 2178 * up the load average.
2283 */ 2179 */
2284 if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) 2180 trace_rcu_future_gp(rnp, rdp, c, "StartWait");
2285 return 1; 2181 for (;;) {
2286 2182 wait_event_interruptible(
2287 /* If no memory, play it safe and keep the CPU around. */ 2183 rnp->nocb_gp_wq[c & 0x1],
2288 if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) 2184 (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
2289 return 0; 2185 if (likely(d))
2290 cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); 2186 break;
2291 cpumask_clear_cpu(cpu, non_nocb_cpus); 2187 flush_signals(current);
2292 ret = !cpumask_empty(non_nocb_cpus); 2188 trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
2293 free_cpumask_var(non_nocb_cpus); 2189 }
2294 return ret; 2190 trace_rcu_future_gp(rnp, rdp, c, "EndWait");
2295} 2191 smp_mb(); /* Ensure that CB invocation happens after GP end. */
2296
2297/*
2298 * Helper structure for remote registry of RCU callbacks.
2299 * This is needed for when a no-CBs CPU needs to start a grace period.
2300 * If it just invokes call_rcu(), the resulting callback will be queued,
2301 * which can result in deadlock.
2302 */
2303struct rcu_head_remote {
2304 struct rcu_head *rhp;
2305 call_rcu_func_t *crf;
2306 void (*func)(struct rcu_head *rhp);
2307};
2308
2309/*
2310 * Register a callback as specified by the rcu_head_remote struct.
2311 * This function is intended to be invoked via smp_call_function_single().
2312 */
2313static void call_rcu_local(void *arg)
2314{
2315 struct rcu_head_remote *rhrp =
2316 container_of(arg, struct rcu_head_remote, rhp);
2317
2318 rhrp->crf(rhrp->rhp, rhrp->func);
2319}
2320
2321/*
2322 * Set up an rcu_head_remote structure and the invoke call_rcu_local()
2323 * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2324 * smp_call_function_single().
2325 */
2326static void invoke_crf_remote(struct rcu_head *rhp,
2327 void (*func)(struct rcu_head *rhp),
2328 call_rcu_func_t crf)
2329{
2330 struct rcu_head_remote rhr;
2331
2332 rhr.rhp = rhp;
2333 rhr.crf = crf;
2334 rhr.func = func;
2335 smp_call_function_single(0, call_rcu_local, &rhr, 1);
2336}
2337
2338/*
2339 * Helper functions to be passed to wait_rcu_gp(), each of which
2340 * invokes invoke_crf_remote() to register a callback appropriately.
2341 */
2342static void __maybe_unused
2343call_rcu_preempt_remote(struct rcu_head *rhp,
2344 void (*func)(struct rcu_head *rhp))
2345{
2346 invoke_crf_remote(rhp, func, call_rcu);
2347}
2348static void call_rcu_bh_remote(struct rcu_head *rhp,
2349 void (*func)(struct rcu_head *rhp))
2350{
2351 invoke_crf_remote(rhp, func, call_rcu_bh);
2352}
2353static void call_rcu_sched_remote(struct rcu_head *rhp,
2354 void (*func)(struct rcu_head *rhp))
2355{
2356 invoke_crf_remote(rhp, func, call_rcu_sched);
2357} 2192}
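Editor's note: rcu_nocb_wait_gp() sleeps on the parity-indexed wait queue and, after every wakeup, retests ULONG_CMP_GE(rnp->completed, c), calling flush_signals() when woken without the condition being true. The comparison matters because the grace-period counter is free-running and may wrap. A small demo of the wraparound-safe test; the macro below is the usual formulation of the kernel's ULONG_CMP_GE(), but verify against your tree before relying on it:

#include <limits.h>
#include <stdio.h>

/* Wraparound-safe "a >= b" for free-running unsigned counters. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long c = ULONG_MAX - 1;   /* GP number the waiter wants  */
	unsigned long completed;

	completed = ULONG_MAX - 2;         /* not there yet               */
	printf("done? %d\n", ULONG_CMP_GE(completed, c) ? 1 : 0);   /* 0 */

	completed = ULONG_MAX;             /* reached it                  */
	printf("done? %d\n", ULONG_CMP_GE(completed, c) ? 1 : 0);   /* 1 */

	completed = 5;                     /* counter wrapped past it     */
	printf("done? %d\n", ULONG_CMP_GE(completed, c) ? 1 : 0);   /* 1 */

	/* A naive completed >= c would wrongly say "not done" here: */
	printf("naive: %d\n", completed >= c ? 1 : 0);              /* 0 */
	return 0;
}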
2358 2193
2359/* 2194/*
@@ -2390,7 +2225,7 @@ static int rcu_nocb_kthread(void *arg)
2390 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); 2225 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2391 ACCESS_ONCE(rdp->nocb_p_count) += c; 2226 ACCESS_ONCE(rdp->nocb_p_count) += c;
2392 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; 2227 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2393 wait_rcu_gp(rdp->rsp->call_remote); 2228 rcu_nocb_wait_gp(rdp);
2394 2229
2395 /* Each pass through the following loop invokes a callback. */ 2230 /* Each pass through the following loop invokes a callback. */
2396 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2231 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2436,36 +2271,40 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2436 return; 2271 return;
2437 for_each_cpu(cpu, rcu_nocb_mask) { 2272 for_each_cpu(cpu, rcu_nocb_mask) {
2438 rdp = per_cpu_ptr(rsp->rda, cpu); 2273 rdp = per_cpu_ptr(rsp->rda, cpu);
2439 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); 2274 t = kthread_run(rcu_nocb_kthread, rdp,
2275 "rcuo%c/%d", rsp->abbr, cpu);
2440 BUG_ON(IS_ERR(t)); 2276 BUG_ON(IS_ERR(t));
2441 ACCESS_ONCE(rdp->nocb_kthread) = t; 2277 ACCESS_ONCE(rdp->nocb_kthread) = t;
2442 } 2278 }
2443} 2279}
2444 2280
2445/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ 2281/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2446static void init_nocb_callback_list(struct rcu_data *rdp) 2282static bool init_nocb_callback_list(struct rcu_data *rdp)
2447{ 2283{
2448 if (rcu_nocb_mask == NULL || 2284 if (rcu_nocb_mask == NULL ||
2449 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) 2285 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2450 return; 2286 return false;
2451 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2287 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2288 return true;
2452} 2289}
2453 2290
2454/* Initialize the ->call_remote fields in the rcu_state structures. */ 2291#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2455static void __init rcu_init_nocb(void) 2292
2293static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2456{ 2294{
2457#ifdef CONFIG_PREEMPT_RCU 2295 return 0;
2458 rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2459#endif /* #ifdef CONFIG_PREEMPT_RCU */
2460 rcu_bh_state.call_remote = call_rcu_bh_remote;
2461 rcu_sched_state.call_remote = call_rcu_sched_remote;
2462} 2296}
2463 2297
2464#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2298static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2299{
2300}
2465 2301
2466static bool is_nocb_cpu(int cpu) 2302static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2303{
2304}
2305
2306static void rcu_init_one_nocb(struct rcu_node *rnp)
2467{ 2307{
2468 return false;
2469} 2308}
2470 2309
2471static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2310static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
@@ -2480,11 +2319,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2480 return 0; 2319 return 0;
2481} 2320}
2482 2321
2483static bool nocb_cpu_expendable(int cpu)
2484{
2485 return 1;
2486}
2487
2488static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2322static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2489{ 2323{
2490} 2324}
@@ -2493,12 +2327,26 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2493{ 2327{
2494} 2328}
2495 2329
2496static void init_nocb_callback_list(struct rcu_data *rdp) 2330static bool init_nocb_callback_list(struct rcu_data *rdp)
2497{ 2331{
2332 return false;
2498} 2333}
2499 2334
2500static void __init rcu_init_nocb(void) 2335#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
2336
2337/*
2338 * An adaptive-ticks CPU can potentially execute in kernel mode for an
2339 * arbitrarily long period of time with the scheduling-clock tick turned
2340 * off. RCU will be paying attention to this CPU because it is in the
2341 * kernel, but the CPU cannot be guaranteed to be executing the RCU state
2342 * machine because the scheduling-clock tick has been disabled. Therefore,
2343 * if an adaptive-ticks CPU is failing to respond to the current grace
2344 * period and has not been idle from an RCU perspective, kick it.
2345 */
2346static void rcu_kick_nohz_cpu(int cpu)
2501{ 2347{
2348#ifdef CONFIG_NO_HZ_FULL
2349 if (tick_nohz_full_cpu(cpu))
2350 smp_send_reschedule(cpu);
2351#endif /* #ifdef CONFIG_NO_HZ_FULL */
2502} 2352}
2503
2504#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0d095dcaa670..cf6c17412932 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,8 +46,6 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file, 49static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op) 50 const struct seq_operations *op)
53{ 51{
@@ -97,7 +95,7 @@ static const struct file_operations rcubarrier_fops = {
97 .open = rcubarrier_open, 95 .open = rcubarrier_open,
98 .read = seq_read, 96 .read = seq_read,
99 .llseek = no_llseek, 97 .llseek = no_llseek,
100 .release = seq_release, 98 .release = single_release,
101}; 99};
102 100
103#ifdef CONFIG_RCU_BOOST 101#ifdef CONFIG_RCU_BOOST
@@ -208,7 +206,7 @@ static const struct file_operations rcuexp_fops = {
208 .open = rcuexp_open, 206 .open = rcuexp_open,
209 .read = seq_read, 207 .read = seq_read,
210 .llseek = no_llseek, 208 .llseek = no_llseek,
211 .release = seq_release, 209 .release = single_release,
212}; 210};
213 211
214#ifdef CONFIG_RCU_BOOST 212#ifdef CONFIG_RCU_BOOST
@@ -308,7 +306,7 @@ static const struct file_operations rcuhier_fops = {
308 .open = rcuhier_open, 306 .open = rcuhier_open,
309 .read = seq_read, 307 .read = seq_read,
310 .llseek = no_llseek, 308 .llseek = no_llseek,
311 .release = seq_release, 309 .release = single_release,
312}; 310};
313 311
314static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) 312static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -350,7 +348,7 @@ static const struct file_operations rcugp_fops = {
350 .open = rcugp_open, 348 .open = rcugp_open,
351 .read = seq_read, 349 .read = seq_read,
352 .llseek = no_llseek, 350 .llseek = no_llseek,
353 .release = seq_release, 351 .release = single_release,
354}; 352};
355 353
356static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 354static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
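Editor's note: the rcutree_trace.c hunks switch .release from seq_release to single_release, presumably because the corresponding open paths use single_open(); the general rule is that single_open() allocates per-open seq_file state that only single_release() frees. A minimal, hypothetical debugfs module sketch of that pairing (builds only against a kernel tree; names are made up):

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *demo_dentry;

static int demo_show(struct seq_file *m, void *unused)
{
	seq_puts(m, "hello from single_open()\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	/* single_open() allocates state that only single_release() frees. */
	return single_open(file, demo_show, inode->i_private);
}

static const struct file_operations demo_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,   /* pairs with single_open() */
};

static int __init demo_init(void)
{
	demo_dentry = debugfs_create_file("single_open_demo", 0444,
					  NULL, NULL, &demo_fops);
	return demo_dentry ? 0 : -ENOMEM;   /* NULL-on-failure era API */
}

static void __exit demo_exit(void)
{
	debugfs_remove(demo_dentry);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");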
diff --git a/kernel/relay.c b/kernel/relay.c
index 01ab081ac53a..eef0d113b79e 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -588,7 +588,7 @@ struct rchan *relay_open(const char *base_filename,
588 chan->version = RELAYFS_CHANNEL_VERSION; 588 chan->version = RELAYFS_CHANNEL_VERSION;
589 chan->n_subbufs = n_subbufs; 589 chan->n_subbufs = n_subbufs;
590 chan->subbuf_size = subbuf_size; 590 chan->subbuf_size = subbuf_size;
591 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); 591 chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs);
592 chan->parent = parent; 592 chan->parent = parent;
593 chan->private_data = private_data; 593 chan->private_data = private_data;
594 if (base_filename) { 594 if (base_filename) {
@@ -1099,8 +1099,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
1099static int subbuf_read_actor(size_t read_start, 1099static int subbuf_read_actor(size_t read_start,
1100 struct rchan_buf *buf, 1100 struct rchan_buf *buf,
1101 size_t avail, 1101 size_t avail,
1102 read_descriptor_t *desc, 1102 read_descriptor_t *desc)
1103 read_actor_t actor)
1104{ 1103{
1105 void *from; 1104 void *from;
1106 int ret = 0; 1105 int ret = 0;
@@ -1121,15 +1120,13 @@ static int subbuf_read_actor(size_t read_start,
1121typedef int (*subbuf_actor_t) (size_t read_start, 1120typedef int (*subbuf_actor_t) (size_t read_start,
1122 struct rchan_buf *buf, 1121 struct rchan_buf *buf,
1123 size_t avail, 1122 size_t avail,
1124 read_descriptor_t *desc, 1123 read_descriptor_t *desc);
1125 read_actor_t actor);
1126 1124
1127/* 1125/*
1128 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries 1126 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
1129 */ 1127 */
1130static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, 1128static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1131 subbuf_actor_t subbuf_actor, 1129 subbuf_actor_t subbuf_actor,
1132 read_actor_t actor,
1133 read_descriptor_t *desc) 1130 read_descriptor_t *desc)
1134{ 1131{
1135 struct rchan_buf *buf = filp->private_data; 1132 struct rchan_buf *buf = filp->private_data;
@@ -1150,7 +1147,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1150 break; 1147 break;
1151 1148
1152 avail = min(desc->count, avail); 1149 avail = min(desc->count, avail);
1153 ret = subbuf_actor(read_start, buf, avail, desc, actor); 1150 ret = subbuf_actor(read_start, buf, avail, desc);
1154 if (desc->error < 0) 1151 if (desc->error < 0)
1155 break; 1152 break;
1156 1153
@@ -1174,8 +1171,7 @@ static ssize_t relay_file_read(struct file *filp,
1174 desc.count = count; 1171 desc.count = count;
1175 desc.arg.buf = buffer; 1172 desc.arg.buf = buffer;
1176 desc.error = 0; 1173 desc.error = 0;
1177 return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, 1174 return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
1178 NULL, &desc);
1179} 1175}
1180 1176
1181static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) 1177static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
diff --git a/kernel/resource.c b/kernel/resource.c
index 73f35d4b30b9..d7386986e10e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -21,6 +21,7 @@
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/pfn.h> 23#include <linux/pfn.h>
24#include <linux/mm.h>
24#include <asm/io.h> 25#include <asm/io.h>
25 26
26 27
@@ -50,6 +51,14 @@ struct resource_constraint {
50 51
51static DEFINE_RWLOCK(resource_lock); 52static DEFINE_RWLOCK(resource_lock);
52 53
54/*
55 * For memory hotplug, there is no way to free resource entries allocated
56 * by boot mem after the system is up. So for reusing the resource entry
57 * we need to remember the resource.
58 */
59static struct resource *bootmem_resource_free;
60static DEFINE_SPINLOCK(bootmem_resource_lock);
61
53static void *r_next(struct seq_file *m, void *v, loff_t *pos) 62static void *r_next(struct seq_file *m, void *v, loff_t *pos)
54{ 63{
55 struct resource *p = v; 64 struct resource *p = v;
@@ -151,6 +160,40 @@ __initcall(ioresources_init);
151 160
152#endif /* CONFIG_PROC_FS */ 161#endif /* CONFIG_PROC_FS */
153 162
163static void free_resource(struct resource *res)
164{
165 if (!res)
166 return;
167
168 if (!PageSlab(virt_to_head_page(res))) {
169 spin_lock(&bootmem_resource_lock);
170 res->sibling = bootmem_resource_free;
171 bootmem_resource_free = res;
172 spin_unlock(&bootmem_resource_lock);
173 } else {
174 kfree(res);
175 }
176}
177
178static struct resource *alloc_resource(gfp_t flags)
179{
180 struct resource *res = NULL;
181
182 spin_lock(&bootmem_resource_lock);
183 if (bootmem_resource_free) {
184 res = bootmem_resource_free;
185 bootmem_resource_free = res->sibling;
186 }
187 spin_unlock(&bootmem_resource_lock);
188
189 if (res)
190 memset(res, 0, sizeof(struct resource));
191 else
192 res = kzalloc(sizeof(struct resource), flags);
193
194 return res;
195}
196
154/* Return the conflict entry if you can't request it */ 197/* Return the conflict entry if you can't request it */
155static struct resource * __request_resource(struct resource *root, struct resource *new) 198static struct resource * __request_resource(struct resource *root, struct resource *new)
156{ 199{
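Editor's note: resource entries allocated from boot memory cannot go back through kfree(), so the free_resource()/alloc_resource() pair above parks them on a private free list and recycles them, falling back to kzalloc() otherwise. A userspace model of that recycle-instead-of-free pattern; the static array stands in for boot memory and calloc() for kzalloc():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
	int value;
	struct node *next;        /* doubles as the free-list link      */
	int from_pool;            /* stand-in for the PageSlab() test   */
};

static struct node boot_pool[4]; /* "bootmem": never handed to free() */
static struct node *free_list;   /* recycled pool entries             */

static void release_node(struct node *n)
{
	if (n->from_pool) {           /* cannot free(); park it instead */
		n->next = free_list;
		free_list = n;
	} else {
		free(n);
	}
}

static struct node *get_node(void)
{
	struct node *n = free_list;

	if (n) {                      /* prefer recycled pool entries   */
		int from_pool = n->from_pool;

		free_list = n->next;
		memset(n, 0, sizeof(*n));
		n->from_pool = from_pool;
	} else {
		n = calloc(1, sizeof(*n));   /* "kzalloc" fallback      */
	}
	return n;
}

int main(void)
{
	/* Seed the free list with the "boot-time" entries. */
	for (int i = 0; i < 4; i++) {
		boot_pool[i].from_pool = 1;
		release_node(&boot_pool[i]);
	}

	struct node *a = get_node();   /* comes from the pool           */
	struct node *b = get_node();
	printf("a from pool? %d, b from pool? %d\n", a->from_pool, b->from_pool);
	release_node(a);               /* back onto the free list       */
	release_node(b);
	return 0;
}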
@@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
706 write_unlock(&resource_lock); 749 write_unlock(&resource_lock);
707} 750}
708 751
709/** 752static int __adjust_resource(struct resource *res, resource_size_t start,
710 * adjust_resource - modify a resource's start and size 753 resource_size_t size)
711 * @res: resource to modify
712 * @start: new start value
713 * @size: new size
714 *
715 * Given an existing resource, change its start and size to match the
716 * arguments. Returns 0 on success, -EBUSY if it can't fit.
717 * Existing children of the resource are assumed to be immutable.
718 */
719int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
720{ 754{
721 struct resource *tmp, *parent = res->parent; 755 struct resource *tmp, *parent = res->parent;
722 resource_size_t end = start + size - 1; 756 resource_size_t end = start + size - 1;
723 int result = -EBUSY; 757 int result = -EBUSY;
724 758
725 write_lock(&resource_lock);
726
727 if (!parent) 759 if (!parent)
728 goto skip; 760 goto skip;
729 761
@@ -751,6 +783,26 @@ skip:
751 result = 0; 783 result = 0;
752 784
753 out: 785 out:
786 return result;
787}
788
789/**
790 * adjust_resource - modify a resource's start and size
791 * @res: resource to modify
792 * @start: new start value
793 * @size: new size
794 *
795 * Given an existing resource, change its start and size to match the
796 * arguments. Returns 0 on success, -EBUSY if it can't fit.
797 * Existing children of the resource are assumed to be immutable.
798 */
799int adjust_resource(struct resource *res, resource_size_t start,
800 resource_size_t size)
801{
802 int result;
803
804 write_lock(&resource_lock);
805 result = __adjust_resource(res, start, size);
754 write_unlock(&resource_lock); 806 write_unlock(&resource_lock);
755 return result; 807 return result;
756} 808}
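Editor's note: adjust_resource() keeps its locking and kernel-doc, while the new __adjust_resource() core can be reused by callers that already hold resource_lock (the hot-remove path added below does exactly that). A small sketch of this wrapper-plus-__core pattern using a pthread rwlock; names are made up, link with -pthread:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static long table_value;

/* Lock-free core: caller must hold table_lock for writing. */
static int __set_value(long v)
{
	if (v < 0)
		return -1;        /* stand-in for an -EBUSY-style failure */
	table_value = v;
	return 0;
}

/* Public wrapper: takes the lock, then calls the core. */
static int set_value(long v)
{
	int ret;

	pthread_rwlock_wrlock(&table_lock);
	ret = __set_value(v);
	pthread_rwlock_unlock(&table_lock);
	return ret;
}

/* A caller needing several updates in one critical section takes the
 * lock once and uses the __ helper directly. */
static int set_two_values(long a, long b)
{
	int ret;

	pthread_rwlock_wrlock(&table_lock);
	ret = __set_value(a);
	if (!ret)
		ret = __set_value(b);
	pthread_rwlock_unlock(&table_lock);
	return ret;
}

int main(void)
{
	set_value(1);
	set_two_values(2, 3);
	printf("final value %ld\n", table_value);
	return 0;
}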
@@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root,
762{ 814{
763 struct resource *parent = root; 815 struct resource *parent = root;
764 struct resource *conflict; 816 struct resource *conflict;
765 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); 817 struct resource *res = alloc_resource(GFP_ATOMIC);
766 struct resource *next_res = NULL; 818 struct resource *next_res = NULL;
767 819
768 if (!res) 820 if (!res)
@@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root,
787 /* conflict covered whole area */ 839 /* conflict covered whole area */
788 if (conflict->start <= res->start && 840 if (conflict->start <= res->start &&
789 conflict->end >= res->end) { 841 conflict->end >= res->end) {
790 kfree(res); 842 free_resource(res);
791 WARN_ON(next_res); 843 WARN_ON(next_res);
792 break; 844 break;
793 } 845 }
@@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root,
797 end = res->end; 849 end = res->end;
798 res->end = conflict->start - 1; 850 res->end = conflict->start - 1;
799 if (conflict->end < end) { 851 if (conflict->end < end) {
800 next_res = kzalloc(sizeof(*next_res), 852 next_res = alloc_resource(GFP_ATOMIC);
801 GFP_ATOMIC);
802 if (!next_res) { 853 if (!next_res) {
803 kfree(res); 854 free_resource(res);
804 break; 855 break;
805 } 856 }
806 next_res->name = name; 857 next_res->name = name;
@@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent,
890 const char *name, int flags) 941 const char *name, int flags)
891{ 942{
892 DECLARE_WAITQUEUE(wait, current); 943 DECLARE_WAITQUEUE(wait, current);
893 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 944 struct resource *res = alloc_resource(GFP_KERNEL);
894 945
895 if (!res) 946 if (!res)
896 return NULL; 947 return NULL;
@@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent,
924 continue; 975 continue;
925 } 976 }
926 /* Uhhuh, that didn't work out.. */ 977 /* Uhhuh, that didn't work out.. */
927 kfree(res); 978 free_resource(res);
928 res = NULL; 979 res = NULL;
929 break; 980 break;
930 } 981 }
@@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start,
958 return -EBUSY; 1009 return -EBUSY;
959 1010
960 release_resource(res); 1011 release_resource(res);
961 kfree(res); 1012 free_resource(res);
962 return 0; 1013 return 0;
963} 1014}
964EXPORT_SYMBOL(__check_region); 1015EXPORT_SYMBOL(__check_region);
@@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start,
998 write_unlock(&resource_lock); 1049 write_unlock(&resource_lock);
999 if (res->flags & IORESOURCE_MUXED) 1050 if (res->flags & IORESOURCE_MUXED)
1000 wake_up(&muxed_resource_wait); 1051 wake_up(&muxed_resource_wait);
1001 kfree(res); 1052 free_resource(res);
1002 return; 1053 return;
1003 } 1054 }
1004 p = &res->sibling; 1055 p = &res->sibling;
@@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start,
1012} 1063}
1013EXPORT_SYMBOL(__release_region); 1064EXPORT_SYMBOL(__release_region);
1014 1065
1066#ifdef CONFIG_MEMORY_HOTREMOVE
1067/**
1068 * release_mem_region_adjustable - release a previously reserved memory region
1069 * @parent: parent resource descriptor
1070 * @start: resource start address
1071 * @size: resource region size
1072 *
1073 * This interface is intended for memory hot-delete. The requested region
1074 * is released from a currently busy memory resource. The requested region
1075 * must either match exactly or fit into a single busy resource entry. In
1076 * the latter case, the remaining resource is adjusted accordingly.
1077 * Existing children of the busy memory resource must be immutable in the
1078 * request.
1079 *
1080 * Note:
1081 * - Additional release conditions, such as overlapping region, can be
1082 * supported after they are confirmed as valid cases.
1083 * - When a busy memory resource gets split into two entries, the code
1084 * assumes that all children remain in the lower address entry for
1085 * simplicity. Enhance this logic when necessary.
1086 */
1087int release_mem_region_adjustable(struct resource *parent,
1088 resource_size_t start, resource_size_t size)
1089{
1090 struct resource **p;
1091 struct resource *res;
1092 struct resource *new_res;
1093 resource_size_t end;
1094 int ret = -EINVAL;
1095
1096 end = start + size - 1;
1097 if ((start < parent->start) || (end > parent->end))
1098 return ret;
1099
1100 /* The alloc_resource() result gets checked later */
1101 new_res = alloc_resource(GFP_KERNEL);
1102
1103 p = &parent->child;
1104 write_lock(&resource_lock);
1105
1106 while ((res = *p)) {
1107 if (res->start >= end)
1108 break;
1109
1110 /* look for the next resource if it does not fit into */
1111 if (res->start > start || res->end < end) {
1112 p = &res->sibling;
1113 continue;
1114 }
1115
1116 if (!(res->flags & IORESOURCE_MEM))
1117 break;
1118
1119 if (!(res->flags & IORESOURCE_BUSY)) {
1120 p = &res->child;
1121 continue;
1122 }
1123
1124 /* found the target resource; let's adjust accordingly */
1125 if (res->start == start && res->end == end) {
1126 /* free the whole entry */
1127 *p = res->sibling;
1128 free_resource(res);
1129 ret = 0;
1130 } else if (res->start == start && res->end != end) {
1131 /* adjust the start */
1132 ret = __adjust_resource(res, end + 1,
1133 res->end - end);
1134 } else if (res->start != start && res->end == end) {
1135 /* adjust the end */
1136 ret = __adjust_resource(res, res->start,
1137 start - res->start);
1138 } else {
1139 /* split into two entries */
1140 if (!new_res) {
1141 ret = -ENOMEM;
1142 break;
1143 }
1144 new_res->name = res->name;
1145 new_res->start = end + 1;
1146 new_res->end = res->end;
1147 new_res->flags = res->flags;
1148 new_res->parent = res->parent;
1149 new_res->sibling = res->sibling;
1150 new_res->child = NULL;
1151
1152 ret = __adjust_resource(res, res->start,
1153 start - res->start);
1154 if (ret)
1155 break;
1156 res->sibling = new_res;
1157 new_res = NULL;
1158 }
1159
1160 break;
1161 }
1162
1163 write_unlock(&resource_lock);
1164 free_resource(new_res);
1165 return ret;
1166}
1167#endif /* CONFIG_MEMORY_HOTREMOVE */
1168
1015/* 1169/*
1016 * Managed region resource 1170 * Managed region resource
1017 */ 1171 */
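Editor's note: release_mem_region_adjustable() above handles four shapes of request against a single busy entry: an exact match frees the whole entry, a request flush with the start or the end trims the entry via __adjust_resource(), and an interior request splits it into two entries (which is why a spare resource is allocated up front). A toy classification of those cases on inclusive ranges:

#include <stdio.h>

struct range {
	unsigned long start, end;   /* inclusive */
};

enum action { FREE_WHOLE, TRIM_FRONT, TRIM_BACK, SPLIT, NO_FIT };

static enum action classify(const struct range *res,
			     unsigned long start, unsigned long end,
			     struct range *lo, struct range *hi)
{
	if (start < res->start || end > res->end)
		return NO_FIT;
	if (start == res->start && end == res->end)
		return FREE_WHOLE;
	if (start == res->start) {            /* keep the tail */
		lo->start = end + 1; lo->end = res->end;
		return TRIM_FRONT;
	}
	if (end == res->end) {                /* keep the head */
		lo->start = res->start; lo->end = start - 1;
		return TRIM_BACK;
	}
	/* Interior hole: keep both remainders (needs a second entry). */
	lo->start = res->start; lo->end = start - 1;
	hi->start = end + 1;    hi->end = res->end;
	return SPLIT;
}

int main(void)
{
	struct range res = { 0x1000, 0x1fff }, lo, hi;

	printf("%d\n", classify(&res, 0x1000, 0x1fff, &lo, &hi)); /* FREE_WHOLE */
	printf("%d\n", classify(&res, 0x1000, 0x17ff, &lo, &hi)); /* TRIM_FRONT */
	printf("%d\n", classify(&res, 0x1800, 0x1fff, &lo, &hi)); /* TRIM_BACK  */
	printf("%d\n", classify(&res, 0x1400, 0x17ff, &lo, &hi)); /* SPLIT      */
	return 0;
}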
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 7890b10084a7..1d96dd0d93c1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -14,6 +14,7 @@
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/stat.h>
17 18
18#include "rtmutex.h" 19#include "rtmutex.h"
19 20
@@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
366 return curr - buf; 367 return curr - buf;
367} 368}
368 369
369static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); 370static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
370static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); 371static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
371 372
372static struct bus_type rttest_subsys = { 373static struct bus_type rttest_subsys = {
373 .name = "rttest", 374 .name = "rttest",
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index f06d249e103b..deaf90e4a1de 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o 18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492df..c3ae1446461c 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
176 u64 this_clock, remote_clock; 176 u64 this_clock, remote_clock;
177 u64 *ptr, old_val, val; 177 u64 *ptr, old_val, val;
178 178
179#if BITS_PER_LONG != 64
180again:
181 /*
182 * Careful here: The local and the remote clock values need to
183 * be read out atomic as we need to compare the values and
184 * then update either the local or the remote side. So the
185 * cmpxchg64 below only protects one readout.
186 *
187 * We must reread via sched_clock_local() in the retry case on
188 * 32bit as an NMI could use sched_clock_local() via the
189 * tracer and hit between the readout of
190 * the low32bit and the high 32bit portion.
191 */
192 this_clock = sched_clock_local(my_scd);
193 /*
194 * We must enforce atomic readout on 32bit, otherwise the
195 * update on the remote cpu can hit inbetween the readout of
196 * the low32bit and the high 32bit portion.
197 */
198 remote_clock = cmpxchg64(&scd->clock, 0, 0);
199#else
200 /*
201 * On 64bit the read of [my]scd->clock is atomic versus the
202 * update, so we can avoid the above 32bit dance.
203 */
179 sched_clock_local(my_scd); 204 sched_clock_local(my_scd);
180again: 205again:
181 this_clock = my_scd->clock; 206 this_clock = my_scd->clock;
182 remote_clock = scd->clock; 207 remote_clock = scd->clock;
208#endif
183 209
184 /* 210 /*
185 * Use the opportunity that we have both locks 211 * Use the opportunity that we have both locks
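Editor's note: the 32-bit path added to sched_clock_remote() reads the remote clock with cmpxchg64(&scd->clock, 0, 0); a compare-and-swap returns the old value atomically, and storing 0 only happens when the clock already is 0, so the net effect is an atomic 64-bit load on machines where a plain load could tear between the two 32-bit halves. A userspace sketch of the same trick, assuming a compiler that provides __sync_val_compare_and_swap on 8-byte values:

#include <stdint.h>
#include <stdio.h>

static uint64_t clock_ns = 0x123456789abcdef0ULL;

/* Atomic 64-bit read: CAS with old == new == 0 returns the current
 * value in one atomic operation; if the value really is 0, swapping
 * in 0 is a no-op. */
static uint64_t atomic_read64(uint64_t *p)
{
	return __sync_val_compare_and_swap(p, 0, 0);
}

int main(void)
{
	printf("clock = %#llx\n",
	       (unsigned long long)atomic_read64(&clock_ns));
	return 0;
}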
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f12624a393c..58453b8272fd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -512,11 +512,6 @@ static inline void init_hrtick(void)
512 * the target CPU. 512 * the target CPU.
513 */ 513 */
514#ifdef CONFIG_SMP 514#ifdef CONFIG_SMP
515
516#ifndef tsk_is_polling
517#define tsk_is_polling(t) 0
518#endif
519
520void resched_task(struct task_struct *p) 515void resched_task(struct task_struct *p)
521{ 516{
522 int cpu; 517 int cpu;
@@ -549,7 +544,7 @@ void resched_cpu(int cpu)
549 raw_spin_unlock_irqrestore(&rq->lock, flags); 544 raw_spin_unlock_irqrestore(&rq->lock, flags);
550} 545}
551 546
552#ifdef CONFIG_NO_HZ 547#ifdef CONFIG_NO_HZ_COMMON
553/* 548/*
554 * In the semi idle case, use the nearest busy cpu for migrating timers 549 * In the semi idle case, use the nearest busy cpu for migrating timers
555 * from an idle cpu. This is good for power-savings. 550 * from an idle cpu. This is good for power-savings.
@@ -587,7 +582,7 @@ unlock:
587 * account when the CPU goes back to idle and evaluates the timer 582 * account when the CPU goes back to idle and evaluates the timer
588 * wheel for the next timer event. 583 * wheel for the next timer event.
589 */ 584 */
590void wake_up_idle_cpu(int cpu) 585static void wake_up_idle_cpu(int cpu)
591{ 586{
592 struct rq *rq = cpu_rq(cpu); 587 struct rq *rq = cpu_rq(cpu);
593 588
@@ -617,20 +612,56 @@ void wake_up_idle_cpu(int cpu)
617 smp_send_reschedule(cpu); 612 smp_send_reschedule(cpu);
618} 613}
619 614
615static bool wake_up_full_nohz_cpu(int cpu)
616{
617 if (tick_nohz_full_cpu(cpu)) {
618 if (cpu != smp_processor_id() ||
619 tick_nohz_tick_stopped())
620 smp_send_reschedule(cpu);
621 return true;
622 }
623
624 return false;
625}
626
627void wake_up_nohz_cpu(int cpu)
628{
629 if (!wake_up_full_nohz_cpu(cpu))
630 wake_up_idle_cpu(cpu);
631}
632
620static inline bool got_nohz_idle_kick(void) 633static inline bool got_nohz_idle_kick(void)
621{ 634{
622 int cpu = smp_processor_id(); 635 int cpu = smp_processor_id();
623 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 636 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
624} 637}
625 638
626#else /* CONFIG_NO_HZ */ 639#else /* CONFIG_NO_HZ_COMMON */
627 640
628static inline bool got_nohz_idle_kick(void) 641static inline bool got_nohz_idle_kick(void)
629{ 642{
630 return false; 643 return false;
631} 644}
632 645
633#endif /* CONFIG_NO_HZ */ 646#endif /* CONFIG_NO_HZ_COMMON */
647
648#ifdef CONFIG_NO_HZ_FULL
649bool sched_can_stop_tick(void)
650{
651 struct rq *rq;
652
653 rq = this_rq();
654
655 /* Make sure rq->nr_running update is visible after the IPI */
656 smp_rmb();
657
658 /* More than one running task need preemption */
659 if (rq->nr_running > 1)
660 return false;
661
662 return true;
663}
664#endif /* CONFIG_NO_HZ_FULL */
634 665
635void sched_avg_update(struct rq *rq) 666void sched_avg_update(struct rq *rq)
636{ 667{
@@ -1288,8 +1319,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1288static void 1319static void
1289ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1320ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1290{ 1321{
1291 trace_sched_wakeup(p, true);
1292 check_preempt_curr(rq, p, wake_flags); 1322 check_preempt_curr(rq, p, wake_flags);
1323 trace_sched_wakeup(p, true);
1293 1324
1294 p->state = TASK_RUNNING; 1325 p->state = TASK_RUNNING;
1295#ifdef CONFIG_SMP 1326#ifdef CONFIG_SMP
@@ -1362,7 +1393,8 @@ static void sched_ttwu_pending(void)
1362 1393
1363void scheduler_ipi(void) 1394void scheduler_ipi(void)
1364{ 1395{
1365 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 1396 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
1397 && !tick_nohz_full_cpu(smp_processor_id()))
1366 return; 1398 return;
1367 1399
1368 /* 1400 /*
@@ -1379,6 +1411,7 @@ void scheduler_ipi(void)
1379 * somewhat pessimize the simple resched case. 1411 * somewhat pessimize the simple resched case.
1380 */ 1412 */
1381 irq_enter(); 1413 irq_enter();
1414 tick_nohz_full_check();
1382 sched_ttwu_pending(); 1415 sched_ttwu_pending();
1383 1416
1384 /* 1417 /*
@@ -1498,8 +1531,10 @@ static void try_to_wake_up_local(struct task_struct *p)
1498{ 1531{
1499 struct rq *rq = task_rq(p); 1532 struct rq *rq = task_rq(p);
1500 1533
1501 BUG_ON(rq != this_rq()); 1534 if (WARN_ON_ONCE(rq != this_rq()) ||
1502 BUG_ON(p == current); 1535 WARN_ON_ONCE(p == current))
1536 return;
1537
1503 lockdep_assert_held(&rq->lock); 1538 lockdep_assert_held(&rq->lock);
1504 1539
1505 if (!raw_spin_trylock(&p->pi_lock)) { 1540 if (!raw_spin_trylock(&p->pi_lock)) {
@@ -1858,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1858 kprobe_flush_task(prev); 1893 kprobe_flush_task(prev);
1859 put_task_struct(prev); 1894 put_task_struct(prev);
1860 } 1895 }
1896
1897 tick_nohz_task_switch(current);
1861} 1898}
1862 1899
1863#ifdef CONFIG_SMP 1900#ifdef CONFIG_SMP
@@ -2121,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2121 return load >> FSHIFT; 2158 return load >> FSHIFT;
2122} 2159}
2123 2160
2124#ifdef CONFIG_NO_HZ 2161#ifdef CONFIG_NO_HZ_COMMON
2125/* 2162/*
2126 * Handle NO_HZ for the global load-average. 2163 * Handle NO_HZ for the global load-average.
2127 * 2164 *
@@ -2347,12 +2384,12 @@ static void calc_global_nohz(void)
2347 smp_wmb(); 2384 smp_wmb();
2348 calc_load_idx++; 2385 calc_load_idx++;
2349} 2386}
2350#else /* !CONFIG_NO_HZ */ 2387#else /* !CONFIG_NO_HZ_COMMON */
2351 2388
2352static inline long calc_load_fold_idle(void) { return 0; } 2389static inline long calc_load_fold_idle(void) { return 0; }
2353static inline void calc_global_nohz(void) { } 2390static inline void calc_global_nohz(void) { }
2354 2391
2355#endif /* CONFIG_NO_HZ */ 2392#endif /* CONFIG_NO_HZ_COMMON */
2356 2393
2357/* 2394/*
2358 * calc_load - update the avenrun load estimates 10 ticks after the 2395 * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2512,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2512 sched_avg_update(this_rq); 2549 sched_avg_update(this_rq);
2513} 2550}
2514 2551
2515#ifdef CONFIG_NO_HZ 2552#ifdef CONFIG_NO_HZ_COMMON
2516/* 2553/*
2517 * There is no sane way to deal with nohz on smp when using jiffies because the 2554 * There is no sane way to deal with nohz on smp when using jiffies because the
2518 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 2555 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -2572,7 +2609,7 @@ void update_cpu_load_nohz(void)
2572 } 2609 }
2573 raw_spin_unlock(&this_rq->lock); 2610 raw_spin_unlock(&this_rq->lock);
2574} 2611}
2575#endif /* CONFIG_NO_HZ */ 2612#endif /* CONFIG_NO_HZ_COMMON */
2576 2613
2577/* 2614/*
2578 * Called from scheduler_tick() 2615 * Called from scheduler_tick()
@@ -2699,8 +2736,35 @@ void scheduler_tick(void)
2699 rq->idle_balance = idle_cpu(cpu); 2736 rq->idle_balance = idle_cpu(cpu);
2700 trigger_load_balance(rq, cpu); 2737 trigger_load_balance(rq, cpu);
2701#endif 2738#endif
2739 rq_last_tick_reset(rq);
2702} 2740}
2703 2741
2742#ifdef CONFIG_NO_HZ_FULL
2743/**
2744 * scheduler_tick_max_deferment
2745 *
2746 * Keep at least one tick per second when a single
2747 * active task is running because the scheduler doesn't
2748 * yet completely support full dynticks environment.
2749 *
2750 * This makes sure that uptime, CFS vruntime, load
2751 * balancing, etc... continue to move forward, even
2752 * with a very low granularity.
2753 */
2754u64 scheduler_tick_max_deferment(void)
2755{
2756 struct rq *rq = this_rq();
2757 unsigned long next, now = ACCESS_ONCE(jiffies);
2758
2759 next = rq->last_sched_tick + HZ;
2760
2761 if (time_before_eq(next, now))
2762 return 0;
2763
2764 return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
2765}
2766#endif
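Editor's note: scheduler_tick_max_deferment() bounds how long the tick may stay off: one second (HZ jiffies) past the last scheduler tick, converted to nanoseconds, or zero if that point has already passed. A small arithmetic sketch of the same calculation; HZ here is an illustrative value:

#include <stdio.h>

#define HZ            1000UL      /* illustrative tick rate */
#define NSEC_PER_USEC 1000ULL

/* Wraparound-safe "a is before or equal to b" for jiffies counters. */
static int time_before_eq(unsigned long a, unsigned long b)
{
	return (long)(a - b) <= 0;
}

/* Mirrors the deferment calculation: the tick may be deferred until
 * one second (HZ jiffies) past the last scheduler tick. */
static unsigned long long max_deferment_ns(unsigned long last_sched_tick,
					   unsigned long now)
{
	unsigned long next = last_sched_tick + HZ;

	if (time_before_eq(next, now))
		return 0;
	/* jiffies -> usecs -> nsecs, as the patch does */
	return (unsigned long long)(next - now) * (1000000UL / HZ) * NSEC_PER_USEC;
}

int main(void)
{
	printf("%llu ns\n", max_deferment_ns(1000, 1500)); /* half a second left */
	printf("%llu ns\n", max_deferment_ns(1000, 2500)); /* already overdue: 0 */
	return 0;
}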
2767
2704notrace unsigned long get_parent_ip(unsigned long addr) 2768notrace unsigned long get_parent_ip(unsigned long addr)
2705{ 2769{
2706 if (in_lock_functions(addr)) { 2770 if (in_lock_functions(addr)) {
@@ -2997,51 +3061,6 @@ void __sched schedule_preempt_disabled(void)
2997 preempt_disable(); 3061 preempt_disable();
2998} 3062}
2999 3063
3000#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3001
3002static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
3003{
3004 if (lock->owner != owner)
3005 return false;
3006
3007 /*
3008 * Ensure we emit the owner->on_cpu, dereference _after_ checking
3009 * lock->owner still matches owner, if that fails, owner might
3010 * point to free()d memory, if it still matches, the rcu_read_lock()
3011 * ensures the memory stays valid.
3012 */
3013 barrier();
3014
3015 return owner->on_cpu;
3016}
3017
3018/*
3019 * Look out! "owner" is an entirely speculative pointer
3020 * access and not reliable.
3021 */
3022int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3023{
3024 if (!sched_feat(OWNER_SPIN))
3025 return 0;
3026
3027 rcu_read_lock();
3028 while (owner_running(lock, owner)) {
3029 if (need_resched())
3030 break;
3031
3032 arch_mutex_cpu_relax();
3033 }
3034 rcu_read_unlock();
3035
3036 /*
3037 * We break out the loop above on need_resched() and when the
3038 * owner changed, which is a sign for heavy contention. Return
3039 * success only when lock->owner is NULL.
3040 */
3041 return lock->owner == NULL;
3042}
3043#endif
3044
3045#ifdef CONFIG_PREEMPT 3064#ifdef CONFIG_PREEMPT
3046/* 3065/*
3047 * this is the entry point to schedule() from in-kernel preemption 3066 * this is the entry point to schedule() from in-kernel preemption
@@ -3082,11 +3101,13 @@ EXPORT_SYMBOL(preempt_schedule);
3082asmlinkage void __sched preempt_schedule_irq(void) 3101asmlinkage void __sched preempt_schedule_irq(void)
3083{ 3102{
3084 struct thread_info *ti = current_thread_info(); 3103 struct thread_info *ti = current_thread_info();
3104 enum ctx_state prev_state;
3085 3105
3086 /* Catch callers which need to be fixed */ 3106 /* Catch callers which need to be fixed */
3087 BUG_ON(ti->preempt_count || !irqs_disabled()); 3107 BUG_ON(ti->preempt_count || !irqs_disabled());
3088 3108
3089 user_exit(); 3109 prev_state = exception_enter();
3110
3090 do { 3111 do {
3091 add_preempt_count(PREEMPT_ACTIVE); 3112 add_preempt_count(PREEMPT_ACTIVE);
3092 local_irq_enable(); 3113 local_irq_enable();
@@ -3100,6 +3121,8 @@ asmlinkage void __sched preempt_schedule_irq(void)
3100 */ 3121 */
3101 barrier(); 3122 barrier();
3102 } while (need_resched()); 3123 } while (need_resched());
3124
3125 exception_exit(prev_state);
3103} 3126}
3104 3127
3105#endif /* CONFIG_PREEMPT */ 3128#endif /* CONFIG_PREEMPT */
@@ -4126,6 +4149,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4126 get_task_struct(p); 4149 get_task_struct(p);
4127 rcu_read_unlock(); 4150 rcu_read_unlock();
4128 4151
4152 if (p->flags & PF_NO_SETAFFINITY) {
4153 retval = -EINVAL;
4154 goto out_put_task;
4155 }
4129 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4156 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4130 retval = -ENOMEM; 4157 retval = -ENOMEM;
4131 goto out_put_task; 4158 goto out_put_task;
@@ -4626,6 +4653,7 @@ void sched_show_task(struct task_struct *p)
4626 task_pid_nr(p), ppid, 4653 task_pid_nr(p), ppid,
4627 (unsigned long)task_thread_info(p)->flags); 4654 (unsigned long)task_thread_info(p)->flags);
4628 4655
4656 print_worker_info(KERN_INFO, p);
4629 show_stack(p, NULL); 4657 show_stack(p, NULL);
4630} 4658}
4631 4659
@@ -4773,11 +4801,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4773 goto out; 4801 goto out;
4774 } 4802 }
4775 4803
4776 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
4777 ret = -EINVAL;
4778 goto out;
4779 }
4780
4781 do_set_cpus_allowed(p, new_mask); 4804 do_set_cpus_allowed(p, new_mask);
4782 4805
4783 /* Can the task run on the task's current CPU? If so, we're done */ 4806 /* Can the task run on the task's current CPU? If so, we're done */
@@ -4999,7 +5022,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
4999} 5022}
5000 5023
5001static int min_load_idx = 0; 5024static int min_load_idx = 0;
5002static int max_load_idx = CPU_LOAD_IDX_MAX; 5025static int max_load_idx = CPU_LOAD_IDX_MAX-1;
5003 5026
5004static void 5027static void
5005set_table_entry(struct ctl_table *entry, 5028set_table_entry(struct ctl_table *entry,
@@ -6248,7 +6271,7 @@ static void sched_init_numa(void)
6248 * 'level' contains the number of unique distances, excluding the 6271 * 'level' contains the number of unique distances, excluding the
6249 * identity distance node_distance(i,i). 6272 * identity distance node_distance(i,i).
6250 * 6273 *
6251 * The sched_domains_nume_distance[] array includes the actual distance 6274 * The sched_domains_numa_distance[] array includes the actual distance
6252 * numbers. 6275 * numbers.
6253 */ 6276 */
6254 6277
@@ -6861,11 +6884,15 @@ int in_sched_functions(unsigned long addr)
6861} 6884}
6862 6885
6863#ifdef CONFIG_CGROUP_SCHED 6886#ifdef CONFIG_CGROUP_SCHED
6887/*
6888 * Default task group.
6889 * Every task in system belongs to this group at bootup.
6890 */
6864struct task_group root_task_group; 6891struct task_group root_task_group;
6865LIST_HEAD(task_groups); 6892LIST_HEAD(task_groups);
6866#endif 6893#endif
6867 6894
6868DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 6895DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6869 6896
6870void __init sched_init(void) 6897void __init sched_init(void)
6871{ 6898{
@@ -6902,7 +6929,7 @@ void __init sched_init(void)
6902#endif /* CONFIG_RT_GROUP_SCHED */ 6929#endif /* CONFIG_RT_GROUP_SCHED */
6903#ifdef CONFIG_CPUMASK_OFFSTACK 6930#ifdef CONFIG_CPUMASK_OFFSTACK
6904 for_each_possible_cpu(i) { 6931 for_each_possible_cpu(i) {
6905 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 6932 per_cpu(load_balance_mask, i) = (void *)ptr;
6906 ptr += cpumask_size(); 6933 ptr += cpumask_size();
6907 } 6934 }
6908#endif /* CONFIG_CPUMASK_OFFSTACK */ 6935#endif /* CONFIG_CPUMASK_OFFSTACK */
@@ -6928,12 +6955,6 @@ void __init sched_init(void)
6928 6955
6929#endif /* CONFIG_CGROUP_SCHED */ 6956#endif /* CONFIG_CGROUP_SCHED */
6930 6957
6931#ifdef CONFIG_CGROUP_CPUACCT
6932 root_cpuacct.cpustat = &kernel_cpustat;
6933 root_cpuacct.cpuusage = alloc_percpu(u64);
6934 /* Too early, not expected to fail */
6935 BUG_ON(!root_cpuacct.cpuusage);
6936#endif
6937 for_each_possible_cpu(i) { 6958 for_each_possible_cpu(i) {
6938 struct rq *rq; 6959 struct rq *rq;
6939 6960
@@ -6997,9 +7018,12 @@ void __init sched_init(void)
6997 INIT_LIST_HEAD(&rq->cfs_tasks); 7018 INIT_LIST_HEAD(&rq->cfs_tasks);
6998 7019
6999 rq_attach_root(rq, &def_root_domain); 7020 rq_attach_root(rq, &def_root_domain);
7000#ifdef CONFIG_NO_HZ 7021#ifdef CONFIG_NO_HZ_COMMON
7001 rq->nohz_flags = 0; 7022 rq->nohz_flags = 0;
7002#endif 7023#endif
7024#ifdef CONFIG_NO_HZ_FULL
7025 rq->last_sched_tick = 0;
7026#endif
7003#endif 7027#endif
7004 init_rq_hrtick(rq); 7028 init_rq_hrtick(rq);
7005 atomic_set(&rq->nr_iowait, 0); 7029 atomic_set(&rq->nr_iowait, 0);
@@ -7455,7 +7479,7 @@ unlock:
7455 return err; 7479 return err;
7456} 7480}
7457 7481
7458int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7482static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7459{ 7483{
7460 u64 rt_runtime, rt_period; 7484 u64 rt_runtime, rt_period;
7461 7485
@@ -7467,7 +7491,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7467 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7491 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7468} 7492}
7469 7493
7470long sched_group_rt_runtime(struct task_group *tg) 7494static long sched_group_rt_runtime(struct task_group *tg)
7471{ 7495{
7472 u64 rt_runtime_us; 7496 u64 rt_runtime_us;
7473 7497
@@ -7479,7 +7503,7 @@ long sched_group_rt_runtime(struct task_group *tg)
7479 return rt_runtime_us; 7503 return rt_runtime_us;
7480} 7504}
7481 7505
7482int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7506static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7483{ 7507{
7484 u64 rt_runtime, rt_period; 7508 u64 rt_runtime, rt_period;
7485 7509
@@ -7492,7 +7516,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7492 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7516 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7493} 7517}
7494 7518
7495long sched_group_rt_period(struct task_group *tg) 7519static long sched_group_rt_period(struct task_group *tg)
7496{ 7520{
7497 u64 rt_period_us; 7521 u64 rt_period_us;
7498 7522
@@ -7527,7 +7551,7 @@ static int sched_rt_global_constraints(void)
7527 return ret; 7551 return ret;
7528} 7552}
7529 7553
7530int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7554static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7531{ 7555{
7532 /* Don't accept realtime tasks when there is no way for them to run */ 7556 /* Don't accept realtime tasks when there is no way for them to run */
7533 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7557 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
@@ -8035,226 +8059,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8035 8059
8036#endif /* CONFIG_CGROUP_SCHED */ 8060#endif /* CONFIG_CGROUP_SCHED */
8037 8061
8038#ifdef CONFIG_CGROUP_CPUACCT
8039
8040/*
8041 * CPU accounting code for task groups.
8042 *
8043 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
8044 * (balbir@in.ibm.com).
8045 */
8046
8047struct cpuacct root_cpuacct;
8048
8049/* create a new cpu accounting group */
8050static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
8051{
8052 struct cpuacct *ca;
8053
8054 if (!cgrp->parent)
8055 return &root_cpuacct.css;
8056
8057 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8058 if (!ca)
8059 goto out;
8060
8061 ca->cpuusage = alloc_percpu(u64);
8062 if (!ca->cpuusage)
8063 goto out_free_ca;
8064
8065 ca->cpustat = alloc_percpu(struct kernel_cpustat);
8066 if (!ca->cpustat)
8067 goto out_free_cpuusage;
8068
8069 return &ca->css;
8070
8071out_free_cpuusage:
8072 free_percpu(ca->cpuusage);
8073out_free_ca:
8074 kfree(ca);
8075out:
8076 return ERR_PTR(-ENOMEM);
8077}
8078
8079/* destroy an existing cpu accounting group */
8080static void cpuacct_css_free(struct cgroup *cgrp)
8081{
8082 struct cpuacct *ca = cgroup_ca(cgrp);
8083
8084 free_percpu(ca->cpustat);
8085 free_percpu(ca->cpuusage);
8086 kfree(ca);
8087}
8088
8089static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8090{
8091 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8092 u64 data;
8093
8094#ifndef CONFIG_64BIT
8095 /*
8096 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
8097 */
8098 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8099 data = *cpuusage;
8100 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8101#else
8102 data = *cpuusage;
8103#endif
8104
8105 return data;
8106}
8107
8108static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8109{
8110 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8111
8112#ifndef CONFIG_64BIT
8113 /*
8114 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8115 */
8116 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8117 *cpuusage = val;
8118 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8119#else
8120 *cpuusage = val;
8121#endif
8122}
8123
8124/* return total cpu usage (in nanoseconds) of a group */
8125static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8126{
8127 struct cpuacct *ca = cgroup_ca(cgrp);
8128 u64 totalcpuusage = 0;
8129 int i;
8130
8131 for_each_present_cpu(i)
8132 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8133
8134 return totalcpuusage;
8135}
8136
8137static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8138 u64 reset)
8139{
8140 struct cpuacct *ca = cgroup_ca(cgrp);
8141 int err = 0;
8142 int i;
8143
8144 if (reset) {
8145 err = -EINVAL;
8146 goto out;
8147 }
8148
8149 for_each_present_cpu(i)
8150 cpuacct_cpuusage_write(ca, i, 0);
8151
8152out:
8153 return err;
8154}
8155
8156static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8157 struct seq_file *m)
8158{
8159 struct cpuacct *ca = cgroup_ca(cgroup);
8160 u64 percpu;
8161 int i;
8162
8163 for_each_present_cpu(i) {
8164 percpu = cpuacct_cpuusage_read(ca, i);
8165 seq_printf(m, "%llu ", (unsigned long long) percpu);
8166 }
8167 seq_printf(m, "\n");
8168 return 0;
8169}
8170
8171static const char *cpuacct_stat_desc[] = {
8172 [CPUACCT_STAT_USER] = "user",
8173 [CPUACCT_STAT_SYSTEM] = "system",
8174};
8175
8176static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8177 struct cgroup_map_cb *cb)
8178{
8179 struct cpuacct *ca = cgroup_ca(cgrp);
8180 int cpu;
8181 s64 val = 0;
8182
8183 for_each_online_cpu(cpu) {
8184 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8185 val += kcpustat->cpustat[CPUTIME_USER];
8186 val += kcpustat->cpustat[CPUTIME_NICE];
8187 }
8188 val = cputime64_to_clock_t(val);
8189 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8190
8191 val = 0;
8192 for_each_online_cpu(cpu) {
8193 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8194 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8195 val += kcpustat->cpustat[CPUTIME_IRQ];
8196 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8197 }
8198
8199 val = cputime64_to_clock_t(val);
8200 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8201
8202 return 0;
8203}
8204
8205static struct cftype files[] = {
8206 {
8207 .name = "usage",
8208 .read_u64 = cpuusage_read,
8209 .write_u64 = cpuusage_write,
8210 },
8211 {
8212 .name = "usage_percpu",
8213 .read_seq_string = cpuacct_percpu_seq_read,
8214 },
8215 {
8216 .name = "stat",
8217 .read_map = cpuacct_stats_show,
8218 },
8219 { } /* terminate */
8220};
8221
8222/*
8223 * charge this task's execution time to its accounting group.
8224 *
8225 * called with rq->lock held.
8226 */
8227void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8228{
8229 struct cpuacct *ca;
8230 int cpu;
8231
8232 if (unlikely(!cpuacct_subsys.active))
8233 return;
8234
8235 cpu = task_cpu(tsk);
8236
8237 rcu_read_lock();
8238
8239 ca = task_ca(tsk);
8240
8241 for (; ca; ca = parent_ca(ca)) {
8242 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8243 *cpuusage += cputime;
8244 }
8245
8246 rcu_read_unlock();
8247}
8248
8249struct cgroup_subsys cpuacct_subsys = {
8250 .name = "cpuacct",
8251 .css_alloc = cpuacct_css_alloc,
8252 .css_free = cpuacct_css_free,
8253 .subsys_id = cpuacct_subsys_id,
8254 .base_cftypes = files,
8255};
8256#endif /* CONFIG_CGROUP_CPUACCT */
8257
8258void dump_cpu_task(int cpu) 8062void dump_cpu_task(int cpu)
8259{ 8063{
8260 pr_info("Task dump for CPU %d:\n", cpu); 8064 pr_info("Task dump for CPU %d:\n", cpu);
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 000000000000..dbb7e2cd95eb
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,296 @@
1#include <linux/cgroup.h>
2#include <linux/slab.h>
3#include <linux/percpu.h>
4#include <linux/spinlock.h>
5#include <linux/cpumask.h>
6#include <linux/seq_file.h>
7#include <linux/rcupdate.h>
8#include <linux/kernel_stat.h>
9#include <linux/err.h>
10
11#include "sched.h"
12
13/*
14 * CPU accounting code for task groups.
15 *
16 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
17 * (balbir@in.ibm.com).
18 */
19
20/* Time spent by the tasks of the cpu accounting group executing in ... */
21enum cpuacct_stat_index {
22 CPUACCT_STAT_USER, /* ... user mode */
23 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
24
25 CPUACCT_STAT_NSTATS,
26};
27
28/* track cpu usage of a group of tasks and its child groups */
29struct cpuacct {
30 struct cgroup_subsys_state css;
31 /* cpuusage holds pointer to a u64-type object on every cpu */
32 u64 __percpu *cpuusage;
33 struct kernel_cpustat __percpu *cpustat;
34};
35
36/* return cpu accounting group corresponding to this container */
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
40 struct cpuacct, css);
41}
42
43/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53}
54
55static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{
57 if (!ca->css.cgroup->parent)
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60}
61
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
63static struct cpuacct root_cpuacct = {
64 .cpustat = &kernel_cpustat,
65 .cpuusage = &root_cpuacct_cpuusage,
66};
67
68/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
70{
71 struct cpuacct *ca;
72
73 if (!cgrp->parent)
74 return &root_cpuacct.css;
75
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
77 if (!ca)
78 goto out;
79
80 ca->cpuusage = alloc_percpu(u64);
81 if (!ca->cpuusage)
82 goto out_free_ca;
83
84 ca->cpustat = alloc_percpu(struct kernel_cpustat);
85 if (!ca->cpustat)
86 goto out_free_cpuusage;
87
88 return &ca->css;
89
90out_free_cpuusage:
91 free_percpu(ca->cpuusage);
92out_free_ca:
93 kfree(ca);
94out:
95 return ERR_PTR(-ENOMEM);
96}
97
98/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp)
100{
101 struct cpuacct *ca = cgroup_ca(cgrp);
102
103 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage);
105 kfree(ca);
106}
107
108static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
109{
110 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
111 u64 data;
112
113#ifndef CONFIG_64BIT
114 /*
115 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
116 */
117 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
118 data = *cpuusage;
119 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
120#else
121 data = *cpuusage;
122#endif
123
124 return data;
125}
126
127static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
128{
129 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
130
131#ifndef CONFIG_64BIT
132 /*
133 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
134 */
135 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
136 *cpuusage = val;
137 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
138#else
139 *cpuusage = val;
140#endif
141}
142
143/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
145{
146 struct cpuacct *ca = cgroup_ca(cgrp);
147 u64 totalcpuusage = 0;
148 int i;
149
150 for_each_present_cpu(i)
151 totalcpuusage += cpuacct_cpuusage_read(ca, i);
152
153 return totalcpuusage;
154}
155
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
157 u64 reset)
158{
159 struct cpuacct *ca = cgroup_ca(cgrp);
160 int err = 0;
161 int i;
162
163 if (reset) {
164 err = -EINVAL;
165 goto out;
166 }
167
168 for_each_present_cpu(i)
169 cpuacct_cpuusage_write(ca, i, 0);
170
171out:
172 return err;
173}
174
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
176 struct seq_file *m)
177{
178 struct cpuacct *ca = cgroup_ca(cgroup);
179 u64 percpu;
180 int i;
181
182 for_each_present_cpu(i) {
183 percpu = cpuacct_cpuusage_read(ca, i);
184 seq_printf(m, "%llu ", (unsigned long long) percpu);
185 }
186 seq_printf(m, "\n");
187 return 0;
188}
189
190static const char * const cpuacct_stat_desc[] = {
191 [CPUACCT_STAT_USER] = "user",
192 [CPUACCT_STAT_SYSTEM] = "system",
193};
194
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
196 struct cgroup_map_cb *cb)
197{
198 struct cpuacct *ca = cgroup_ca(cgrp);
199 int cpu;
200 s64 val = 0;
201
202 for_each_online_cpu(cpu) {
203 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
204 val += kcpustat->cpustat[CPUTIME_USER];
205 val += kcpustat->cpustat[CPUTIME_NICE];
206 }
207 val = cputime64_to_clock_t(val);
208 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
209
210 val = 0;
211 for_each_online_cpu(cpu) {
212 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
213 val += kcpustat->cpustat[CPUTIME_SYSTEM];
214 val += kcpustat->cpustat[CPUTIME_IRQ];
215 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
216 }
217
218 val = cputime64_to_clock_t(val);
219 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
220
221 return 0;
222}
223
224static struct cftype files[] = {
225 {
226 .name = "usage",
227 .read_u64 = cpuusage_read,
228 .write_u64 = cpuusage_write,
229 },
230 {
231 .name = "usage_percpu",
232 .read_seq_string = cpuacct_percpu_seq_read,
233 },
234 {
235 .name = "stat",
236 .read_map = cpuacct_stats_show,
237 },
238 { } /* terminate */
239};
240
241/*
242 * charge this task's execution time to its accounting group.
243 *
244 * called with rq->lock held.
245 */
246void cpuacct_charge(struct task_struct *tsk, u64 cputime)
247{
248 struct cpuacct *ca;
249 int cpu;
250
251 cpu = task_cpu(tsk);
252
253 rcu_read_lock();
254
255 ca = task_ca(tsk);
256
257 while (true) {
258 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
259 *cpuusage += cputime;
260
261 ca = parent_ca(ca);
262 if (!ca)
263 break;
264 }
265
266 rcu_read_unlock();
267}
268
269/*
270 * Add user/system time to cpuacct.
271 *
272 * Note: it's the caller that updates the account of the root cgroup.
273 */
274void cpuacct_account_field(struct task_struct *p, int index, u64 val)
275{
276 struct kernel_cpustat *kcpustat;
277 struct cpuacct *ca;
278
279 rcu_read_lock();
280 ca = task_ca(p);
281 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca);
285 }
286 rcu_read_unlock();
287}
288
289struct cgroup_subsys cpuacct_subsys = {
290 .name = "cpuacct",
291 .css_alloc = cpuacct_css_alloc,
292 .css_free = cpuacct_css_free,
293 .subsys_id = cpuacct_subsys_id,
294 .base_cftypes = files,
295 .early_init = 1,
296};
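cpuacct_charge() above walks from the task's accounting group all the way up to the root, adding the run time at every level, while cpuacct_account_field() stops just below root_cpuacct because its caller has already updated the global kernel_cpustat. A self-contained model of those two walks, with a single counter per group in place of the per-cpu data and no RCU:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct group {
    const char *name;
    struct group *parent;    /* NULL for the root group */
    uint64_t usage;          /* stands in for the per-cpu cpuusage counter */
};

/* Charge every level, root included -- like cpuacct_charge(). */
static void charge(struct group *g, uint64_t delta)
{
    for (; g; g = g->parent)
        g->usage += delta;
}

/* Charge every level except the root -- like cpuacct_account_field(). */
static void charge_below_root(struct group *g, uint64_t delta)
{
    for (; g->parent; g = g->parent)
        g->usage += delta;
}

int main(void)
{
    struct group root = { "root", NULL,  0 };
    struct group a    = { "a",    &root, 0 };
    struct group ab   = { "a/b",  &a,    0 };

    charge(&ab, 1000);              /* a/b, a and root all gain 1000 */
    charge_below_root(&ab, 500);    /* only a/b and a gain the extra 500 */

    printf("root=%llu a=%llu a/b=%llu\n",
           (unsigned long long)root.usage,
           (unsigned long long)a.usage,
           (unsigned long long)ab.usage);
    return 0;
}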
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 000000000000..ed605624a5e7
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@
1#ifdef CONFIG_CGROUP_CPUACCT
2
3extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
4extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
5
6#else
7
8static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9{
10}
11
12static inline void
13cpuacct_account_field(struct task_struct *p, int index, u64 val)
14{
15}
16
17#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ed12cbb135f4..cc2dc3eea8a3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void)
115static inline void task_group_account_field(struct task_struct *p, int index, 115static inline void task_group_account_field(struct task_struct *p, int index,
116 u64 tmp) 116 u64 tmp)
117{ 117{
118#ifdef CONFIG_CGROUP_CPUACCT
119 struct kernel_cpustat *kcpustat;
120 struct cpuacct *ca;
121#endif
122 /* 118 /*
123 * Since all updates are sure to touch the root cgroup, we 119 * Since all updates are sure to touch the root cgroup, we
124 * get ourselves ahead and touch it first. If the root cgroup 120 * get ourselves ahead and touch it first. If the root cgroup
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
127 */ 123 */
128 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 124 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
129 125
130#ifdef CONFIG_CGROUP_CPUACCT 126 cpuacct_account_field(p, index, tmp);
131 if (unlikely(!cpuacct_subsys.active))
132 return;
133
134 rcu_read_lock();
135 ca = task_ca(p);
136 while (ca && (ca != &root_cpuacct)) {
137 kcpustat = this_cpu_ptr(ca->cpustat);
138 kcpustat->cpustat[index] += tmp;
139 ca = parent_ca(ca);
140 }
141 rcu_read_unlock();
142#endif
143} 127}
144 128
145/* 129/*
@@ -310,7 +294,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
310 294
311 t = tsk; 295 t = tsk;
312 do { 296 do {
313 task_cputime(tsk, &utime, &stime); 297 task_cputime(t, &utime, &stime);
314 times->utime += utime; 298 times->utime += utime;
315 times->stime += stime; 299 times->stime += stime;
316 times->sum_exec_runtime += task_sched_runtime(t); 300 times->sum_exec_runtime += task_sched_runtime(t);
@@ -388,7 +372,84 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
388 struct rq *rq) {} 372 struct rq *rq) {}
389#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 373#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
390 374
391#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 375/*
376 * Use precise platform statistics if available:
377 */
378#ifdef CONFIG_VIRT_CPU_ACCOUNTING
379
380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
381void vtime_task_switch(struct task_struct *prev)
382{
383 if (!vtime_accounting_enabled())
384 return;
385
386 if (is_idle_task(prev))
387 vtime_account_idle(prev);
388 else
389 vtime_account_system(prev);
390
391#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
392 vtime_account_user(prev);
393#endif
394 arch_vtime_task_switch(prev);
395}
396#endif
397
398/*
399 * Archs that account the whole time spent in the idle task
400 * (outside irq) as idle time can rely on this and just implement
401 * vtime_account_system() and vtime_account_idle(). Archs that
402 * have other meaning of the idle time (s390 only includes the
403 * time spent by the CPU when it's in low power mode) must override
404 * vtime_account().
405 */
406#ifndef __ARCH_HAS_VTIME_ACCOUNT
407void vtime_account_irq_enter(struct task_struct *tsk)
408{
409 if (!vtime_accounting_enabled())
410 return;
411
412 if (!in_interrupt()) {
413 /*
414 * If we interrupted user, context_tracking_in_user()
415 * is 1 because the context tracking don't hook
416 * on irq entry/exit. This way we know if
417 * we need to flush user time on kernel entry.
418 */
419 if (context_tracking_in_user()) {
420 vtime_account_user(tsk);
421 return;
422 }
423
424 if (is_idle_task(tsk)) {
425 vtime_account_idle(tsk);
426 return;
427 }
428 }
429 vtime_account_system(tsk);
430}
431EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
432#endif /* __ARCH_HAS_VTIME_ACCOUNT */
433#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
434
435
436#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
437void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
438{
439 *ut = p->utime;
440 *st = p->stime;
441}
442
443void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
444{
445 struct task_cputime cputime;
446
447 thread_group_cputime(p, &cputime);
448
449 *ut = cputime.utime;
450 *st = cputime.stime;
451}
452#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
392/* 453/*
393 * Account a single tick of cpu time. 454 * Account a single tick of cpu time.
394 * @p: the process that the cpu time gets accounted to 455 * @p: the process that the cpu time gets accounted to
@@ -443,96 +504,50 @@ void account_idle_ticks(unsigned long ticks)
443 504
444 account_idle_time(jiffies_to_cputime(ticks)); 505 account_idle_time(jiffies_to_cputime(ticks));
445} 506}
446#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
447
448/*
449 * Use precise platform statistics if available:
450 */
451#ifdef CONFIG_VIRT_CPU_ACCOUNTING
452void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
453{
454 *ut = p->utime;
455 *st = p->stime;
456}
457
458void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
459{
460 struct task_cputime cputime;
461
462 thread_group_cputime(p, &cputime);
463
464 *ut = cputime.utime;
465 *st = cputime.stime;
466}
467
468#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
469void vtime_task_switch(struct task_struct *prev)
470{
471 if (!vtime_accounting_enabled())
472 return;
473
474 if (is_idle_task(prev))
475 vtime_account_idle(prev);
476 else
477 vtime_account_system(prev);
478
479#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
480 vtime_account_user(prev);
481#endif
482 arch_vtime_task_switch(prev);
483}
484#endif
485 507
486/* 508/*
487 * Archs that account the whole time spent in the idle task 509 * Perform (stime * rtime) / total, but avoid multiplication overflow by
488 * (outside irq) as idle time can rely on this and just implement 510 * losing precision when the numbers are big.
489 * vtime_account_system() and vtime_account_idle(). Archs that
490 * have other meaning of the idle time (s390 only includes the
491 * time spent by the CPU when it's in low power mode) must override
492 * vtime_account().
493 */ 511 */
494#ifndef __ARCH_HAS_VTIME_ACCOUNT 512static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
495void vtime_account_irq_enter(struct task_struct *tsk)
496{ 513{
497 if (!vtime_accounting_enabled()) 514 u64 scaled;
498 return;
499 515
500 if (!in_interrupt()) { 516 for (;;) {
501 /* 517 /* Make sure "rtime" is the bigger of stime/rtime */
502 * If we interrupted user, context_tracking_in_user() 518 if (stime > rtime) {
503 * is 1 because the context tracking don't hook 519 u64 tmp = rtime; rtime = stime; stime = tmp;
504 * on irq entry/exit. This way we know if
505 * we need to flush user time on kernel entry.
506 */
507 if (context_tracking_in_user()) {
508 vtime_account_user(tsk);
509 return;
510 } 520 }
511 521
512 if (is_idle_task(tsk)) { 522 /* Make sure 'total' fits in 32 bits */
513 vtime_account_idle(tsk); 523 if (total >> 32)
514 return; 524 goto drop_precision;
515 }
516 }
517 vtime_account_system(tsk);
518}
519EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
520#endif /* __ARCH_HAS_VTIME_ACCOUNT */
521 525
522#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ 526 /* Does rtime (and thus stime) fit in 32 bits? */
527 if (!(rtime >> 32))
528 break;
523 529
524static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) 530 /* Can we just balance rtime/stime rather than dropping bits? */
525{ 531 if (stime >> 31)
526 u64 temp = (__force u64) rtime; 532 goto drop_precision;
527 533
528 temp *= (__force u64) stime; 534 /* We can grow stime and shrink rtime and try to make them both fit */
535 stime <<= 1;
536 rtime >>= 1;
537 continue;
529 538
530 if (sizeof(cputime_t) == 4) 539drop_precision:
531 temp = div_u64(temp, (__force u32) total); 540 /* We drop from rtime, it has more bits than stime */
532 else 541 rtime >>= 1;
533 temp = div64_u64(temp, (__force u64) total); 542 total >>= 1;
543 }
534 544
535 return (__force cputime_t) temp; 545 /*
546 * Make sure gcc understands that this is a 32x32->64 multiply,
547 * followed by a 64/32->64 divide.
548 */
549 scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
550 return (__force cputime_t) scaled;
536} 551}
537 552
538/* 553/*
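The rewritten scale_stime() above computes stime * rtime / total without 128-bit math: it keeps shifting bits out of the larger operands until the multiply fits in a 32x32->64 and the divisor fits in 32 bits, dropping precision from rtime/total only when it has to. The same loop as a standalone program, with uint64_t in place of cputime_t and plain division in place of div_u64(); the __int128 reference check is a GCC/Clang extension:

#include <stdint.h>
#include <stdio.h>

static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
    for (;;) {
        /* Make sure "rtime" is the bigger of stime/rtime */
        if (stime > rtime) {
            uint64_t tmp = rtime; rtime = stime; stime = tmp;
        }

        /* Make sure 'total' fits in 32 bits */
        if (total >> 32)
            goto drop_precision;

        /* Does rtime (and thus stime) fit in 32 bits? */
        if (!(rtime >> 32))
            break;

        /* Can we just balance rtime/stime rather than dropping bits? */
        if (stime >> 31)
            goto drop_precision;

        /* Grow stime and shrink rtime, trying to make both fit */
        stime <<= 1;
        rtime >>= 1;
        continue;

drop_precision:
        /* Drop from rtime; it has more bits to spare than stime */
        rtime >>= 1;
        total >>= 1;
    }

    /* 32x32->64 multiply followed by a 64/32->64 divide */
    return (uint64_t)(uint32_t)stime * (uint32_t)rtime / (uint32_t)total;
}

int main(void)
{
    uint64_t s = 123456789ULL, r = 987654321012ULL, t = 1234567890123ULL;
    unsigned __int128 exact = (unsigned __int128)s * r / t;

    printf("approx=%llu exact=%llu\n",
           (unsigned long long)scale_stime(s, r, t),
           (unsigned long long)exact);
    return 0;
}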
@@ -543,7 +558,13 @@ static void cputime_adjust(struct task_cputime *curr,
543 struct cputime *prev, 558 struct cputime *prev,
544 cputime_t *ut, cputime_t *st) 559 cputime_t *ut, cputime_t *st)
545{ 560{
546 cputime_t rtime, stime, total; 561 cputime_t rtime, stime, utime, total;
562
563 if (vtime_accounting_enabled()) {
564 *ut = curr->utime;
565 *st = curr->stime;
566 return;
567 }
547 568
548 stime = curr->stime; 569 stime = curr->stime;
549 total = stime + curr->utime; 570 total = stime + curr->utime;
@@ -560,10 +581,22 @@ static void cputime_adjust(struct task_cputime *curr,
560 */ 581 */
561 rtime = nsecs_to_cputime(curr->sum_exec_runtime); 582 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
562 583
563 if (total) 584 /*
564 stime = scale_stime(stime, rtime, total); 585 * Update userspace visible utime/stime values only if actual execution
565 else 586 * time is bigger than already exported. Note that can happen, that we
587 * provided bigger values due to scaling inaccuracy on big numbers.
588 */
589 if (prev->stime + prev->utime >= rtime)
590 goto out;
591
592 if (total) {
593 stime = scale_stime((__force u64)stime,
594 (__force u64)rtime, (__force u64)total);
595 utime = rtime - stime;
596 } else {
566 stime = rtime; 597 stime = rtime;
598 utime = 0;
599 }
567 600
568 /* 601 /*
569 * If the tick based count grows faster than the scheduler one, 602 * If the tick based count grows faster than the scheduler one,
@@ -571,8 +604,9 @@ static void cputime_adjust(struct task_cputime *curr,
571 * Let's enforce monotonicity. 604 * Let's enforce monotonicity.
572 */ 605 */
573 prev->stime = max(prev->stime, stime); 606 prev->stime = max(prev->stime, stime);
574 prev->utime = max(prev->utime, rtime - prev->stime); 607 prev->utime = max(prev->utime, utime);
575 608
609out:
576 *ut = prev->utime; 610 *ut = prev->utime;
577 *st = prev->stime; 611 *st = prev->stime;
578} 612}
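cputime_adjust() now derives utime from the scaled stime (utime = rtime - stime), bails out early when the previously exported pair already covers rtime, and clamps each field with max() so neither can move backwards between samples. A compact model of that flow, using plain 64-bit math and a simplified in-line scaling step:

#include <stdint.h>
#include <stdio.h>

struct prev_cputime { uint64_t utime, stime; };

static void cputime_adjust_sketch(uint64_t tick_utime, uint64_t tick_stime,
                                  uint64_t rtime, struct prev_cputime *prev,
                                  uint64_t *ut, uint64_t *st)
{
    uint64_t total = tick_utime + tick_stime;
    uint64_t stime, utime;

    /* Already exported at least rtime worth of time: keep the old values. */
    if (prev->stime + prev->utime >= rtime)
        goto out;

    if (total) {
        stime = tick_stime * rtime / total;    /* scale_stime(), simplified */
        utime = rtime - stime;
    } else {
        stime = rtime;
        utime = 0;
    }

    /* Enforce monotonicity per field. */
    if (stime > prev->stime)
        prev->stime = stime;
    if (utime > prev->utime)
        prev->utime = utime;
out:
    *ut = prev->utime;
    *st = prev->stime;
}

int main(void)
{
    struct prev_cputime prev = { 0, 0 };
    uint64_t ut, st;

    cputime_adjust_sketch(70, 30, 120, &prev, &ut, &st);
    printf("sample 1: utime=%llu stime=%llu\n",
           (unsigned long long)ut, (unsigned long long)st);

    /* A later reading with smaller apparent runtime must not regress. */
    cputime_adjust_sketch(70, 30, 110, &prev, &ut, &st);
    printf("sample 2: utime=%llu stime=%llu\n",
           (unsigned long long)ut, (unsigned long long)st);
    return 0;
}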
@@ -597,7 +631,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
597 thread_group_cputime(p, &cputime); 631 thread_group_cputime(p, &cputime);
598 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 632 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
599} 633}
600#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ 634#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
601 635
602#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 636#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
603static unsigned long long vtime_delta(struct task_struct *tsk) 637static unsigned long long vtime_delta(struct task_struct *tsk)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e5986fc5..c61a614465c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
431 * Scheduling class tree data structure manipulation methods: 431 * Scheduling class tree data structure manipulation methods:
432 */ 432 */
433 433
434static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) 434static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
435{ 435{
436 s64 delta = (s64)(vruntime - min_vruntime); 436 s64 delta = (s64)(vruntime - max_vruntime);
437 if (delta > 0) 437 if (delta > 0)
438 min_vruntime = vruntime; 438 max_vruntime = vruntime;
439 439
440 return min_vruntime; 440 return max_vruntime;
441} 441}
442 442
443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) 443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
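The max_vruntime() change above only renames the confusingly reused parameter; what makes both helpers work is the signed-difference comparison, which stays correct even after the unsigned vruntime counters wrap around. A standalone check of that property:

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

static uint64_t max_vruntime(uint64_t max_vruntime, uint64_t vruntime)
{
    int64_t delta = (int64_t)(vruntime - max_vruntime);    /* wrap-safe compare */

    if (delta > 0)
        max_vruntime = vruntime;
    return max_vruntime;
}

int main(void)
{
    /* "b" is logically ahead of "a" even though it wrapped past zero. */
    uint64_t a = UINT64_MAX - 5, b = 10;

    printf("max_vruntime = %" PRIu64 " (a plain 'a > b' would pick %" PRIu64 ")\n",
           max_vruntime(a, b), a);
    return 0;
}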
@@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
473 vruntime = min_vruntime(vruntime, se->vruntime); 473 vruntime = min_vruntime(vruntime, se->vruntime);
474 } 474 }
475 475
476 /* ensure we never gain time by being placed backwards. */
476 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 477 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477#ifndef CONFIG_64BIT 478#ifndef CONFIG_64BIT
478 smp_wmb(); 479 smp_wmb();
@@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
652} 653}
653 654
654/* 655/*
655 * We calculate the vruntime slice of a to be inserted task 656 * We calculate the vruntime slice of a to-be-inserted task.
656 * 657 *
657 * vs = s/w 658 * vs = s/w
658 */ 659 */
@@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 1563 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */ 1564 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1564} 1565}
1566
1567/*
1568 * Update the rq's load with the elapsed running time before entering
1569 * idle. if the last scheduled task is not a CFS task, idle_enter will
1570 * be the only way to update the runnable statistic.
1571 */
1572void idle_enter_fair(struct rq *this_rq)
1573{
1574 update_rq_runnable_avg(this_rq, 1);
1575}
1576
1577/*
1578 * Update the rq's load with the elapsed idle time before a task is
1579 * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
1580 * be the only way to update the runnable statistic.
1581 */
1582void idle_exit_fair(struct rq *this_rq)
1583{
1584 update_rq_runnable_avg(this_rq, 0);
1585}
1586
1565#else 1587#else
1566static inline void update_entity_load_avg(struct sched_entity *se, 1588static inline void update_entity_load_avg(struct sched_entity *se,
1567 int update_cfs_rq) {} 1589 int update_cfs_rq) {}
@@ -3874,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3874 int tsk_cache_hot = 0; 3896 int tsk_cache_hot = 0;
3875 /* 3897 /*
3876 * We do not migrate tasks that are: 3898 * We do not migrate tasks that are:
3877 * 1) running (obviously), or 3899 * 1) throttled_lb_pair, or
3878 * 2) cannot be migrated to this CPU due to cpus_allowed, or 3900 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3879 * 3) are cache-hot on their current CPU. 3901 * 3) running (obviously), or
3902 * 4) are cache-hot on their current CPU.
3880 */ 3903 */
3904 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3905 return 0;
3906
3881 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 3907 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3882 int new_dst_cpu; 3908 int cpu;
3883 3909
3884 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3910 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3885 3911
@@ -3894,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3894 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 3920 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3895 return 0; 3921 return 0;
3896 3922
3897 new_dst_cpu = cpumask_first_and(env->dst_grpmask, 3923 /* Prevent to re-select dst_cpu via env's cpus */
3898 tsk_cpus_allowed(p)); 3924 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
3899 if (new_dst_cpu < nr_cpu_ids) { 3925 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
3900 env->flags |= LBF_SOME_PINNED; 3926 env->flags |= LBF_SOME_PINNED;
3901 env->new_dst_cpu = new_dst_cpu; 3927 env->new_dst_cpu = cpu;
3928 break;
3929 }
3902 } 3930 }
3931
3903 return 0; 3932 return 0;
3904 } 3933 }
3905 3934
@@ -3920,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3920 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); 3949 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3921 if (!tsk_cache_hot || 3950 if (!tsk_cache_hot ||
3922 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 3951 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3923#ifdef CONFIG_SCHEDSTATS 3952
3924 if (tsk_cache_hot) { 3953 if (tsk_cache_hot) {
3925 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 3954 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3926 schedstat_inc(p, se.statistics.nr_forced_migrations); 3955 schedstat_inc(p, se.statistics.nr_forced_migrations);
3927 } 3956 }
3928#endif 3957
3929 return 1; 3958 return 1;
3930 } 3959 }
3931 3960
3932 if (tsk_cache_hot) { 3961 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3933 schedstat_inc(p, se.statistics.nr_failed_migrations_hot); 3962 return 0;
3934 return 0;
3935 }
3936 return 1;
3937} 3963}
3938 3964
3939/* 3965/*
@@ -3948,9 +3974,6 @@ static int move_one_task(struct lb_env *env)
3948 struct task_struct *p, *n; 3974 struct task_struct *p, *n;
3949 3975
3950 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 3976 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3951 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3952 continue;
3953
3954 if (!can_migrate_task(p, env)) 3977 if (!can_migrate_task(p, env))
3955 continue; 3978 continue;
3956 3979
@@ -4002,7 +4025,7 @@ static int move_tasks(struct lb_env *env)
4002 break; 4025 break;
4003 } 4026 }
4004 4027
4005 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 4028 if (!can_migrate_task(p, env))
4006 goto next; 4029 goto next;
4007 4030
4008 load = task_h_load(p); 4031 load = task_h_load(p);
@@ -4013,9 +4036,6 @@ static int move_tasks(struct lb_env *env)
4013 if ((load / 2) > env->imbalance) 4036 if ((load / 2) > env->imbalance)
4014 goto next; 4037 goto next;
4015 4038
4016 if (!can_migrate_task(p, env))
4017 goto next;
4018
4019 move_task(p, env); 4039 move_task(p, env);
4020 pulled++; 4040 pulled++;
4021 env->imbalance -= load; 4041 env->imbalance -= load;
@@ -4245,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
4245 return load_idx; 4265 return load_idx;
4246} 4266}
4247 4267
4248unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 4268static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4249{ 4269{
4250 return SCHED_POWER_SCALE; 4270 return SCHED_POWER_SCALE;
4251} 4271}
@@ -4255,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4255 return default_scale_freq_power(sd, cpu); 4275 return default_scale_freq_power(sd, cpu);
4256} 4276}
4257 4277
4258unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 4278static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4259{ 4279{
4260 unsigned long weight = sd->span_weight; 4280 unsigned long weight = sd->span_weight;
4261 unsigned long smt_gain = sd->smt_gain; 4281 unsigned long smt_gain = sd->smt_gain;
@@ -4270,7 +4290,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4270 return default_scale_smt_power(sd, cpu); 4290 return default_scale_smt_power(sd, cpu);
4271} 4291}
4272 4292
4273unsigned long scale_rt_power(int cpu) 4293static unsigned long scale_rt_power(int cpu)
4274{ 4294{
4275 struct rq *rq = cpu_rq(cpu); 4295 struct rq *rq = cpu_rq(cpu);
4276 u64 total, available, age_stamp, avg; 4296 u64 total, available, age_stamp, avg;
@@ -4960,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4960#define MAX_PINNED_INTERVAL 512 4980#define MAX_PINNED_INTERVAL 512
4961 4981
4962/* Working cpumask for load_balance and load_balance_newidle. */ 4982/* Working cpumask for load_balance and load_balance_newidle. */
4963DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4983DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
4964 4984
4965static int need_active_balance(struct lb_env *env) 4985static int need_active_balance(struct lb_env *env)
4966{ 4986{
@@ -4991,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4991 int *balance) 5011 int *balance)
4992{ 5012{
4993 int ld_moved, cur_ld_moved, active_balance = 0; 5013 int ld_moved, cur_ld_moved, active_balance = 0;
4994 int lb_iterations, max_lb_iterations;
4995 struct sched_group *group; 5014 struct sched_group *group;
4996 struct rq *busiest; 5015 struct rq *busiest;
4997 unsigned long flags; 5016 unsigned long flags;
4998 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 5017 struct cpumask *cpus = __get_cpu_var(load_balance_mask);
4999 5018
5000 struct lb_env env = { 5019 struct lb_env env = {
5001 .sd = sd, 5020 .sd = sd,
@@ -5007,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5007 .cpus = cpus, 5026 .cpus = cpus,
5008 }; 5027 };
5009 5028
5029 /*
5030 * For NEWLY_IDLE load_balancing, we don't need to consider
5031 * other cpus in our group
5032 */
5033 if (idle == CPU_NEWLY_IDLE)
5034 env.dst_grpmask = NULL;
5035
5010 cpumask_copy(cpus, cpu_active_mask); 5036 cpumask_copy(cpus, cpu_active_mask);
5011 max_lb_iterations = cpumask_weight(env.dst_grpmask);
5012 5037
5013 schedstat_inc(sd, lb_count[idle]); 5038 schedstat_inc(sd, lb_count[idle]);
5014 5039
@@ -5034,7 +5059,6 @@ redo:
5034 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 5059 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5035 5060
5036 ld_moved = 0; 5061 ld_moved = 0;
5037 lb_iterations = 1;
5038 if (busiest->nr_running > 1) { 5062 if (busiest->nr_running > 1) {
5039 /* 5063 /*
5040 * Attempt to move tasks. If find_busiest_group has found 5064 * Attempt to move tasks. If find_busiest_group has found
@@ -5061,17 +5085,17 @@ more_balance:
5061 double_rq_unlock(env.dst_rq, busiest); 5085 double_rq_unlock(env.dst_rq, busiest);
5062 local_irq_restore(flags); 5086 local_irq_restore(flags);
5063 5087
5064 if (env.flags & LBF_NEED_BREAK) {
5065 env.flags &= ~LBF_NEED_BREAK;
5066 goto more_balance;
5067 }
5068
5069 /* 5088 /*
5070 * some other cpu did the load balance for us. 5089 * some other cpu did the load balance for us.
5071 */ 5090 */
5072 if (cur_ld_moved && env.dst_cpu != smp_processor_id()) 5091 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5073 resched_cpu(env.dst_cpu); 5092 resched_cpu(env.dst_cpu);
5074 5093
5094 if (env.flags & LBF_NEED_BREAK) {
5095 env.flags &= ~LBF_NEED_BREAK;
5096 goto more_balance;
5097 }
5098
5075 /* 5099 /*
5076 * Revisit (affine) tasks on src_cpu that couldn't be moved to 5100 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5077 * us and move them to an alternate dst_cpu in our sched_group 5101 * us and move them to an alternate dst_cpu in our sched_group
@@ -5091,14 +5115,17 @@ more_balance:
5091 * moreover subsequent load balance cycles should correct the 5115 * moreover subsequent load balance cycles should correct the
5092 * excess load moved. 5116 * excess load moved.
5093 */ 5117 */
5094 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && 5118 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
5095 lb_iterations++ < max_lb_iterations) {
5096 5119
5097 env.dst_rq = cpu_rq(env.new_dst_cpu); 5120 env.dst_rq = cpu_rq(env.new_dst_cpu);
5098 env.dst_cpu = env.new_dst_cpu; 5121 env.dst_cpu = env.new_dst_cpu;
5099 env.flags &= ~LBF_SOME_PINNED; 5122 env.flags &= ~LBF_SOME_PINNED;
5100 env.loop = 0; 5123 env.loop = 0;
5101 env.loop_break = sched_nr_migrate_break; 5124 env.loop_break = sched_nr_migrate_break;
5125
5126 /* Prevent to re-select dst_cpu via env's cpus */
5127 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5128
5102 /* 5129 /*
5103 * Go back to "more_balance" rather than "redo" since we 5130 * Go back to "more_balance" rather than "redo" since we
5104 * need to continue with same src_cpu. 5131 * need to continue with same src_cpu.
@@ -5219,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5219 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5246 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5220 return; 5247 return;
5221 5248
5222 update_rq_runnable_avg(this_rq, 1);
5223
5224 /* 5249 /*
5225 * Drop the rq->lock, but keep IRQ/preempt disabled. 5250 * Drop the rq->lock, but keep IRQ/preempt disabled.
5226 */ 5251 */
@@ -5330,7 +5355,7 @@ out_unlock:
5330 return 0; 5355 return 0;
5331} 5356}
5332 5357
5333#ifdef CONFIG_NO_HZ 5358#ifdef CONFIG_NO_HZ_COMMON
5334/* 5359/*
5335 * idle load balancing details 5360 * idle load balancing details
5336 * - When one of the busy CPUs notice that there may be an idle rebalancing 5361 * - When one of the busy CPUs notice that there may be an idle rebalancing
@@ -5395,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void)
5395 struct sched_domain *sd; 5420 struct sched_domain *sd;
5396 int cpu = smp_processor_id(); 5421 int cpu = smp_processor_id();
5397 5422
5398 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5399 return;
5400 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
5401
5402 rcu_read_lock(); 5423 rcu_read_lock();
5403 for_each_domain(cpu, sd) 5424 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5425
5426 if (!sd || !sd->nohz_idle)
5427 goto unlock;
5428 sd->nohz_idle = 0;
5429
5430 for (; sd; sd = sd->parent)
5404 atomic_inc(&sd->groups->sgp->nr_busy_cpus); 5431 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5432unlock:
5405 rcu_read_unlock(); 5433 rcu_read_unlock();
5406} 5434}
5407 5435
@@ -5410,13 +5438,16 @@ void set_cpu_sd_state_idle(void)
5410 struct sched_domain *sd; 5438 struct sched_domain *sd;
5411 int cpu = smp_processor_id(); 5439 int cpu = smp_processor_id();
5412 5440
5413 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5414 return;
5415 set_bit(NOHZ_IDLE, nohz_flags(cpu));
5416
5417 rcu_read_lock(); 5441 rcu_read_lock();
5418 for_each_domain(cpu, sd) 5442 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5443
5444 if (!sd || sd->nohz_idle)
5445 goto unlock;
5446 sd->nohz_idle = 1;
5447
5448 for (; sd; sd = sd->parent)
5419 atomic_dec(&sd->groups->sgp->nr_busy_cpus); 5449 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5450unlock:
5420 rcu_read_unlock(); 5451 rcu_read_unlock();
5421} 5452}
5422 5453
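set_cpu_sd_state_busy()/set_cpu_sd_state_idle() now keep the idle flag on the bottom sched_domain (sd->nohz_idle) instead of a per-cpu NOHZ_IDLE bit, and only walk the parent domains to adjust nr_busy_cpus when that flag actually flips, presumably so the flag lives and dies with the domain data it guards. The toggle pattern, as a simplified single-threaded sketch:

#include <stdbool.h>
#include <stddef.h>

struct domain {
    struct domain *parent;
    bool nohz_idle;
    int nr_busy_cpus;    /* stands in for sd->groups->sgp->nr_busy_cpus */
};

static void cpu_goes_busy(struct domain *sd)
{
    if (!sd || !sd->nohz_idle)
        return;                 /* already counted as busy */
    sd->nohz_idle = false;

    for (; sd; sd = sd->parent)
        sd->nr_busy_cpus++;
}

static void cpu_goes_idle(struct domain *sd)
{
    if (!sd || sd->nohz_idle)
        return;                 /* already counted as idle */
    sd->nohz_idle = true;

    for (; sd; sd = sd->parent)
        sd->nr_busy_cpus--;
}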
@@ -5468,7 +5499,7 @@ void update_max_interval(void)
5468 * It checks each scheduling domain to see if it is due to be balanced, 5499 * It checks each scheduling domain to see if it is due to be balanced,
5469 * and initiates a balancing operation if so. 5500 * and initiates a balancing operation if so.
5470 * 5501 *
5471 * Balancing parameters are set up in arch_init_sched_domains. 5502 * Balancing parameters are set up in init_sched_domains.
5472 */ 5503 */
5473static void rebalance_domains(int cpu, enum cpu_idle_type idle) 5504static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5474{ 5505{
@@ -5506,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5506 if (time_after_eq(jiffies, sd->last_balance + interval)) { 5537 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5507 if (load_balance(cpu, rq, sd, idle, &balance)) { 5538 if (load_balance(cpu, rq, sd, idle, &balance)) {
5508 /* 5539 /*
5509 * We've pulled tasks over so either we're no 5540 * The LBF_SOME_PINNED logic could have changed
5510 * longer idle. 5541 * env->dst_cpu, so we can't know our idle
5542 * state even if we migrated tasks. Update it.
5511 */ 5543 */
5512 idle = CPU_NOT_IDLE; 5544 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
5513 } 5545 }
5514 sd->last_balance = jiffies; 5546 sd->last_balance = jiffies;
5515 } 5547 }
@@ -5540,9 +5572,9 @@ out:
5540 rq->next_balance = next_balance; 5572 rq->next_balance = next_balance;
5541} 5573}
5542 5574
5543#ifdef CONFIG_NO_HZ 5575#ifdef CONFIG_NO_HZ_COMMON
5544/* 5576/*
5545 * In CONFIG_NO_HZ case, the idle balance kickee will do the 5577 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
5546 * rebalancing for all the cpus for whom scheduler ticks are stopped. 5578 * rebalancing for all the cpus for whom scheduler ticks are stopped.
5547 */ 5579 */
5548static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) 5580static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
@@ -5685,7 +5717,7 @@ void trigger_load_balance(struct rq *rq, int cpu)
5685 if (time_after_eq(jiffies, rq->next_balance) && 5717 if (time_after_eq(jiffies, rq->next_balance) &&
5686 likely(!on_null_domain(cpu))) 5718 likely(!on_null_domain(cpu)))
5687 raise_softirq(SCHED_SOFTIRQ); 5719 raise_softirq(SCHED_SOFTIRQ);
5688#ifdef CONFIG_NO_HZ 5720#ifdef CONFIG_NO_HZ_COMMON
5689 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 5721 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
5690 nohz_balancer_kick(cpu); 5722 nohz_balancer_kick(cpu);
5691#endif 5723#endif
@@ -6155,7 +6187,7 @@ __init void init_sched_fair_class(void)
6155#ifdef CONFIG_SMP 6187#ifdef CONFIG_SMP
6156 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 6188 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
6157 6189
6158#ifdef CONFIG_NO_HZ 6190#ifdef CONFIG_NO_HZ_COMMON
6159 nohz.next_balance = jiffies; 6191 nohz.next_balance = jiffies;
6160 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 6192 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
6161 cpu_notifier(sched_ilb_notifier, 0); 6193 cpu_notifier(sched_ilb_notifier, 0);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b5395f..99399f8e4799 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false)
46SCHED_FEAT(LB_BIAS, true) 46SCHED_FEAT(LB_BIAS, true)
47 47
48/* 48/*
49 * Spin-wait on mutex acquisition when the mutex owner is running on
50 * another cpu -- assumes that when the owner is running, it will soon
51 * release the lock. Decreases scheduling overhead.
52 */
53SCHED_FEAT(OWNER_SPIN, true)
54
55/*
56 * Decrement CPU power based on time not spent running tasks 49 * Decrement CPU power based on time not spent running tasks
57 */ 50 */
58SCHED_FEAT(NONTASK_POWER, true) 51SCHED_FEAT(NONTASK_POWER, true)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae9..d8da01008d39 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,17 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks are never migrated */ 14 return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20 rq_last_tick_reset(rq);
21}
22
23static void post_schedule_idle(struct rq *rq)
24{
25 idle_enter_fair(rq);
26}
16#endif /* CONFIG_SMP */ 27#endif /* CONFIG_SMP */
17/* 28/*
18 * Idle tasks are unconditionally rescheduled: 29 * Idle tasks are unconditionally rescheduled:
@@ -25,6 +36,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
25static struct task_struct *pick_next_task_idle(struct rq *rq) 36static struct task_struct *pick_next_task_idle(struct rq *rq)
26{ 37{
27 schedstat_inc(rq, sched_goidle); 38 schedstat_inc(rq, sched_goidle);
39#ifdef CONFIG_SMP
40 /* Trigger the post schedule to do an idle_enter for CFS */
41 rq->post_schedule = 1;
42#endif
28 return rq->idle; 43 return rq->idle;
29} 44}
30 45
@@ -86,6 +101,8 @@ const struct sched_class idle_sched_class = {
86 101
87#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
88 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
104 .pre_schedule = pre_schedule_idle,
105 .post_schedule = post_schedule_idle,
89#endif 106#endif
90 107
91 .set_curr_task = set_curr_task_idle, 108 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfdf469f..ce39224d6155 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,8 +5,10 @@
5#include <linux/mutex.h> 5#include <linux/mutex.h>
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8#include <linux/tick.h>
8 9
9#include "cpupri.h" 10#include "cpupri.h"
11#include "cpuacct.h"
10 12
11extern __read_mostly int scheduler_running; 13extern __read_mostly int scheduler_running;
12 14
@@ -33,6 +35,31 @@ extern __read_mostly int scheduler_running;
33 */ 35 */
34#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 36#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
35 37
38/*
39 * Increase resolution of nice-level calculations for 64-bit architectures.
40 * The extra resolution improves shares distribution and load balancing of
41 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
42 * hierarchies, especially on larger systems. This is not a user-visible change
43 * and does not change the user-interface for setting shares/weights.
44 *
45 * We increase resolution only if we have enough bits to allow this increased
46 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
47 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
48 * increased costs.
49 */
50#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
51# define SCHED_LOAD_RESOLUTION 10
52# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
53# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
54#else
55# define SCHED_LOAD_RESOLUTION 0
56# define scale_load(w) (w)
57# define scale_load_down(w) (w)
58#endif
59
60#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
61#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
62
36#define NICE_0_LOAD SCHED_LOAD_SCALE 63#define NICE_0_LOAD SCHED_LOAD_SCALE
37#define NICE_0_SHIFT SCHED_LOAD_SHIFT 64#define NICE_0_SHIFT SCHED_LOAD_SHIFT
38 65
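The comment above explains the disabled 10-bit load-resolution bump; with SCHED_LOAD_RESOLUTION forced to 0, scale_load()/scale_load_down() are identities and NICE_0_LOAD stays at 1 << 10. A tiny illustration of what the bump would change if the #if 0 were lifted, with the values computed from the macros in this hunk:

#include <stdio.h>

int main(void)
{
    long nice0 = 1024;    /* user-visible NICE_0 weight */

    /* resolution 0 (current state): internal weight == user weight */
    printf("resolution 0:  internal=%ld scale=%ld\n",
           nice0 << 0, 1L << (10 + 0));      /* 1024, 1024 */

    /* resolution 10 (the disabled branch): 10 extra fractional bits */
    printf("resolution 10: internal=%ld scale=%ld\n",
           nice0 << 10, 1L << (10 + 10));    /* 1048576, 1048576 */
    return 0;
}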
@@ -154,11 +181,6 @@ struct task_group {
154#define MAX_SHARES (1UL << 18) 181#define MAX_SHARES (1UL << 18)
155#endif 182#endif
156 183
157/* Default task group.
158 * Every task in system belong to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *); 184typedef int (*tg_visitor)(struct task_group *, void *);
163 185
164extern int walk_tg_tree_from(struct task_group *from, 186extern int walk_tg_tree_from(struct task_group *from,
@@ -196,6 +218,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu, 218 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent); 219 struct sched_rt_entity *parent);
198 220
221extern struct task_group *sched_create_group(struct task_group *parent);
222extern void sched_online_group(struct task_group *tg,
223 struct task_group *parent);
224extern void sched_destroy_group(struct task_group *tg);
225extern void sched_offline_group(struct task_group *tg);
226
227extern void sched_move_task(struct task_struct *tsk);
228
229#ifdef CONFIG_FAIR_GROUP_SCHED
230extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
231#endif
232
199#else /* CONFIG_CGROUP_SCHED */ 233#else /* CONFIG_CGROUP_SCHED */
200 234
201struct cfs_bandwidth { }; 235struct cfs_bandwidth { };
@@ -372,10 +406,13 @@ struct rq {
372 #define CPU_LOAD_IDX_MAX 5 406 #define CPU_LOAD_IDX_MAX 5
373 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 407 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
374 unsigned long last_load_update_tick; 408 unsigned long last_load_update_tick;
375#ifdef CONFIG_NO_HZ 409#ifdef CONFIG_NO_HZ_COMMON
376 u64 nohz_stamp; 410 u64 nohz_stamp;
377 unsigned long nohz_flags; 411 unsigned long nohz_flags;
378#endif 412#endif
413#ifdef CONFIG_NO_HZ_FULL
414 unsigned long last_sched_tick;
415#endif
379 int skip_clock_update; 416 int skip_clock_update;
380 417
381 /* capture load from *all* tasks on this cpu: */ 418 /* capture load from *all* tasks on this cpu: */
@@ -547,6 +584,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
547DECLARE_PER_CPU(struct sched_domain *, sd_llc); 584DECLARE_PER_CPU(struct sched_domain *, sd_llc);
548DECLARE_PER_CPU(int, sd_llc_id); 585DECLARE_PER_CPU(int, sd_llc_id);
549 586
587struct sched_group_power {
588 atomic_t ref;
589 /*
590 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
591 * single CPU.
592 */
593 unsigned int power, power_orig;
594 unsigned long next_update;
595 /*
596 * Number of busy cpus in this group.
597 */
598 atomic_t nr_busy_cpus;
599
600 unsigned long cpumask[0]; /* iteration mask */
601};
602
603struct sched_group {
604 struct sched_group *next; /* Must be a circular list */
605 atomic_t ref;
606
607 unsigned int group_weight;
608 struct sched_group_power *sgp;
609
610 /*
611 * The CPUs this group covers.
612 *
613 * NOTE: this field is variable length. (Allocated dynamically
614 * by attaching extra space to the end of the structure,
615 * depending on how many CPUs the kernel has booted up with)
616 */
617 unsigned long cpumask[0];
618};
619
620static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
621{
622 return to_cpumask(sg->cpumask);
623}
624
625/*
626 * cpumask masking which cpus in the group are allowed to iterate up the domain
627 * tree.
628 */
629static inline struct cpumask *sched_group_mask(struct sched_group *sg)
630{
631 return to_cpumask(sg->sgp->cpumask);
632}
633
634/**
635 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
636 * @group: The group whose first cpu is to be returned.
637 */
638static inline unsigned int group_first_cpu(struct sched_group *group)
639{
640 return cpumask_first(sched_group_cpus(group));
641}
642
550extern int group_balance_cpu(struct sched_group *sg); 643extern int group_balance_cpu(struct sched_group *sg);
551 644
552#endif /* CONFIG_SMP */ 645#endif /* CONFIG_SMP */
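struct sched_group above ends in a zero-length cpumask[] so the CPU mask can be allocated inline, right behind the fixed fields, sized for however many CPUs the kernel booted with. A userspace sketch of the same trick using a C99 flexible array member and a plain unsigned long bitmap (the kernel uses the GNU cpumask[0] form and struct cpumask; __builtin_ctzl below is a GCC/Clang builtin playing the role of cpumask_first()):

#include <stdio.h>
#include <stdlib.h>

struct group {
    int weight;
    unsigned long cpumask[];    /* flexible array member, sized at alloc time */
};

static struct group *alloc_group(size_t mask_words)
{
    /* one allocation covers the struct and its trailing bitmap */
    return calloc(1, sizeof(struct group) + mask_words * sizeof(unsigned long));
}

int main(void)
{
    struct group *g = alloc_group(2);    /* room for 128 cpus on 64-bit */

    if (!g)
        return 1;
    g->cpumask[0] = 0x5;                 /* cpus 0 and 2 */
    printf("first cpu = %d\n", __builtin_ctzl(g->cpumask[0]));
    free(g);
    return 0;
}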
@@ -784,6 +877,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
784} 877}
785#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 878#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
786 879
880/*
881 * wake flags
882 */
883#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
884#define WF_FORK 0x02 /* child wakeup after fork */
885#define WF_MIGRATED 0x4 /* internal use, task got migrated */
787 886
788static inline void update_load_add(struct load_weight *lw, unsigned long inc) 887static inline void update_load_add(struct load_weight *lw, unsigned long inc)
789{ 888{
@@ -856,14 +955,61 @@ static const u32 prio_to_wmult[40] = {
856 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 955 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
857}; 956};
858 957
859/* Time spent by the tasks of the cpu accounting group executing in ... */ 958#define ENQUEUE_WAKEUP 1
860enum cpuacct_stat_index { 959#define ENQUEUE_HEAD 2
861 CPUACCT_STAT_USER, /* ... user mode */ 960#ifdef CONFIG_SMP
862 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 961#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
962#else
963#define ENQUEUE_WAKING 0
964#endif
863 965
864 CPUACCT_STAT_NSTATS, 966#define DEQUEUE_SLEEP 1
865};
866 967
968struct sched_class {
969 const struct sched_class *next;
970
971 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
972 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
973 void (*yield_task) (struct rq *rq);
974 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
975
976 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
977
978 struct task_struct * (*pick_next_task) (struct rq *rq);
979 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
980
981#ifdef CONFIG_SMP
982 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
983 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
984
985 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
986 void (*post_schedule) (struct rq *this_rq);
987 void (*task_waking) (struct task_struct *task);
988 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
989
990 void (*set_cpus_allowed)(struct task_struct *p,
991 const struct cpumask *newmask);
992
993 void (*rq_online)(struct rq *rq);
994 void (*rq_offline)(struct rq *rq);
995#endif
996
997 void (*set_curr_task) (struct rq *rq);
998 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
999 void (*task_fork) (struct task_struct *p);
1000
1001 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1002 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1003 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1004 int oldprio);
1005
1006 unsigned int (*get_rr_interval) (struct rq *rq,
1007 struct task_struct *task);
1008
1009#ifdef CONFIG_FAIR_GROUP_SCHED
1010 void (*task_move_group) (struct task_struct *p, int on_rq);
1011#endif
1012};
867 1013
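struct sched_class above is the scheduler's ops table: a chain of structures made of function pointers, consulted in priority order until one class handles the request (the kernel walks the real chain with for_each_class(), starting from sched_class_highest defined just below). A rough user-space sketch of that dispatch shape; every type and name here is invented for illustration:

#include <stdio.h>

struct task { int id; };

struct sched_class_demo {
    const struct sched_class_demo *next;
    const char *name;
    struct task *(*pick_next)(void);
};

static struct task fair_task = { 42 };

static struct task *rt_pick(void)   { return NULL; }        /* nothing runnable */
static struct task *fair_pick(void) { return &fair_task; }  /* always has work */

static const struct sched_class_demo fair_class = { NULL, "fair", fair_pick };
static const struct sched_class_demo rt_class   = { &fair_class, "rt", rt_pick };

/* Walk the chain highest-priority-first, the way pick_next_task() does. */
static struct task *pick_next_demo(void)
{
    const struct sched_class_demo *class;

    for (class = &rt_class; class; class = class->next) {
        struct task *p = class->pick_next();

        if (p) {
            printf("class '%s' supplied task %d\n", class->name, p->id);
            return p;
        }
    }
    return NULL;
}

int main(void)
{
    pick_next_demo();
    return 0;
}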
868#define sched_class_highest (&stop_sched_class) 1014#define sched_class_highest (&stop_sched_class)
869#define for_each_class(class) \ 1015#define for_each_class(class) \
@@ -877,9 +1023,23 @@ extern const struct sched_class idle_sched_class;
877 1023
878#ifdef CONFIG_SMP 1024#ifdef CONFIG_SMP
879 1025
1026extern void update_group_power(struct sched_domain *sd, int cpu);
1027
880extern void trigger_load_balance(struct rq *rq, int cpu); 1028extern void trigger_load_balance(struct rq *rq, int cpu);
881extern void idle_balance(int this_cpu, struct rq *this_rq); 1029extern void idle_balance(int this_cpu, struct rq *this_rq);
882 1030
1031/*
1032 * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
1033 * becomes useful in lb
1034 */
1035#if defined(CONFIG_FAIR_GROUP_SCHED)
1036extern void idle_enter_fair(struct rq *this_rq);
1037extern void idle_exit_fair(struct rq *this_rq);
1038#else
1039static inline void idle_enter_fair(struct rq *this_rq) {}
1040static inline void idle_exit_fair(struct rq *this_rq) {}
1041#endif
1042
883#else /* CONFIG_SMP */ 1043#else /* CONFIG_SMP */
884 1044
885static inline void idle_balance(int cpu, struct rq *rq) 1045static inline void idle_balance(int cpu, struct rq *rq)
@@ -891,7 +1051,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
891extern void sysrq_sched_debug_show(void); 1051extern void sysrq_sched_debug_show(void);
892extern void sched_init_granularity(void); 1052extern void sched_init_granularity(void);
893extern void update_max_interval(void); 1053extern void update_max_interval(void);
894extern void update_group_power(struct sched_domain *sd, int cpu);
895extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); 1054extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
896extern void init_sched_rt_class(void); 1055extern void init_sched_rt_class(void);
897extern void init_sched_fair_class(void); 1056extern void init_sched_fair_class(void);
@@ -904,45 +1063,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
904 1063
905extern void update_idle_cpu_load(struct rq *this_rq); 1064extern void update_idle_cpu_load(struct rq *this_rq);
906 1065
907#ifdef CONFIG_CGROUP_CPUACCT
908#include <linux/cgroup.h>
909/* track cpu usage of a group of tasks and its child groups */
910struct cpuacct {
911 struct cgroup_subsys_state css;
912 /* cpuusage holds pointer to a u64-type object on every cpu */
913 u64 __percpu *cpuusage;
914 struct kernel_cpustat __percpu *cpustat;
915};
916
917extern struct cgroup_subsys cpuacct_subsys;
918extern struct cpuacct root_cpuacct;
919
920/* return cpu accounting group corresponding to this container */
921static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
922{
923 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
924 struct cpuacct, css);
925}
926
927/* return cpu accounting group to which this task belongs */
928static inline struct cpuacct *task_ca(struct task_struct *tsk)
929{
930 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
931 struct cpuacct, css);
932}
933
934static inline struct cpuacct *parent_ca(struct cpuacct *ca)
935{
936 if (!ca || !ca->css.cgroup->parent)
937 return NULL;
938 return cgroup_ca(ca->css.cgroup->parent);
939}
940
941extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
942#else
943static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
944#endif
945
946#ifdef CONFIG_PARAVIRT 1066#ifdef CONFIG_PARAVIRT
947static inline u64 steal_ticks(u64 steal) 1067static inline u64 steal_ticks(u64 steal)
948{ 1068{
@@ -956,6 +1076,16 @@ static inline u64 steal_ticks(u64 steal)
956static inline void inc_nr_running(struct rq *rq) 1076static inline void inc_nr_running(struct rq *rq)
957{ 1077{
958 rq->nr_running++; 1078 rq->nr_running++;
1079
1080#ifdef CONFIG_NO_HZ_FULL
1081 if (rq->nr_running == 2) {
1082 if (tick_nohz_full_cpu(rq->cpu)) {
1083 /* Order rq->nr_running write against the IPI */
1084 smp_wmb();
1085 smp_send_reschedule(rq->cpu);
1086 }
1087 }
1088#endif
959} 1089}
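The NO_HZ_FULL branch in inc_nr_running() kicks a full-dynticks CPU once it gains a second runnable task, and the smp_wmb() orders the rq->nr_running store ahead of the reschedule IPI so the remote CPU sees the new count when the interrupt lands. A hedged user-space analogue of that store-then-notify ordering, using C11 release/acquire in place of smp_wmb() plus the IPI (all names invented; build with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;         /* plain data being published */
static atomic_int flag;     /* stands in for the IPI/kick */

static void *consumer(void *arg)
{
    /* Wait for the "kick"; acquire pairs with the release store below. */
    while (atomic_load_explicit(&flag, memory_order_acquire) == 0)
        ;
    printf("consumer sees payload = %d\n", payload);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, consumer, NULL);

    payload = 2;    /* like rq->nr_running reaching 2 */
    /* Release store orders the payload write before the notification,
     * playing the role of smp_wmb() + smp_send_reschedule(). */
    atomic_store_explicit(&flag, 1, memory_order_release);

    pthread_join(t, NULL);
    return 0;
}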
960 1090
961static inline void dec_nr_running(struct rq *rq) 1091static inline void dec_nr_running(struct rq *rq)
@@ -963,6 +1093,13 @@ static inline void dec_nr_running(struct rq *rq)
963 rq->nr_running--; 1093 rq->nr_running--;
964} 1094}
965 1095
1096static inline void rq_last_tick_reset(struct rq *rq)
1097{
1098#ifdef CONFIG_NO_HZ_FULL
1099 rq->last_sched_tick = jiffies;
1100#endif
1101}
1102
966extern void update_rq_clock(struct rq *rq); 1103extern void update_rq_clock(struct rq *rq);
967 1104
968extern void activate_task(struct rq *rq, struct task_struct *p, int flags); 1105extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1183,11 +1320,10 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1183 1320
1184extern void account_cfs_bandwidth_used(int enabled, int was_enabled); 1321extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1185 1322
1186#ifdef CONFIG_NO_HZ 1323#ifdef CONFIG_NO_HZ_COMMON
1187enum rq_nohz_flag_bits { 1324enum rq_nohz_flag_bits {
1188 NOHZ_TICK_STOPPED, 1325 NOHZ_TICK_STOPPED,
1189 NOHZ_BALANCE_KICK, 1326 NOHZ_BALANCE_KICK,
1190 NOHZ_IDLE,
1191}; 1327};
1192 1328
1193#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) 1329#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index e036eda1a9c9..da98af347e8b 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -130,16 +130,11 @@ static int schedstat_open(struct inode *inode, struct file *file)
130 return seq_open(file, &schedstat_sops); 130 return seq_open(file, &schedstat_sops);
131} 131}
132 132
133static int schedstat_release(struct inode *inode, struct file *file)
134{
135 return 0;
136};
137
138static const struct file_operations proc_schedstat_operations = { 133static const struct file_operations proc_schedstat_operations = {
139 .open = schedstat_open, 134 .open = schedstat_open,
140 .read = seq_read, 135 .read = seq_read,
141 .llseek = seq_lseek, 136 .llseek = seq_lseek,
142 .release = schedstat_release, 137 .release = seq_release,
143}; 138};
144 139
145static int __init proc_schedstat_init(void) 140static int __init proc_schedstat_init(void)
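The stats.c fix replaces a hand-rolled release handler that returned 0 without freeing anything with seq_release(), so the iterator state allocated by schedstat_open()'s seq_open() call is actually torn down on close. For single-record files the usual pairing is single_open()/single_release(); the module below is a hedged sketch of that idiom, not code from this series (file name and output are made up):

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
    seq_printf(m, "hello from a seq_file\n");
    return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
    return single_open(file, demo_show, NULL);
}

static const struct file_operations demo_fops = {
    .owner   = THIS_MODULE,
    .open    = demo_open,
    .read    = seq_read,
    .llseek  = seq_lseek,
    .release = single_release,    /* frees what single_open() set up */
};

static int __init demo_init(void)
{
    proc_create("seqfile_demo", 0444, NULL, &demo_fops);
    return 0;
}

static void __exit demo_exit(void)
{
    remove_proc_entry("seqfile_demo", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");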
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5af44b593770..b7a10048a32c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -160,6 +160,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
160 case BPF_S_ALU_AND_X: 160 case BPF_S_ALU_AND_X:
161 case BPF_S_ALU_OR_K: 161 case BPF_S_ALU_OR_K:
162 case BPF_S_ALU_OR_X: 162 case BPF_S_ALU_OR_X:
163 case BPF_S_ALU_XOR_K:
164 case BPF_S_ALU_XOR_X:
163 case BPF_S_ALU_LSH_K: 165 case BPF_S_ALU_LSH_K:
164 case BPF_S_ALU_LSH_X: 166 case BPF_S_ALU_LSH_X:
165 case BPF_S_ALU_RSH_K: 167 case BPF_S_ALU_RSH_K:
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 4567fc020fe3..6815171a4fff 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(up);
193struct semaphore_waiter { 193struct semaphore_waiter {
194 struct list_head list; 194 struct list_head list;
195 struct task_struct *task; 195 struct task_struct *task;
196 int up; 196 bool up;
197}; 197};
198 198
199/* 199/*
@@ -209,12 +209,12 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
209 209
210 list_add_tail(&waiter.list, &sem->wait_list); 210 list_add_tail(&waiter.list, &sem->wait_list);
211 waiter.task = task; 211 waiter.task = task;
212 waiter.up = 0; 212 waiter.up = false;
213 213
214 for (;;) { 214 for (;;) {
215 if (signal_pending_state(state, task)) 215 if (signal_pending_state(state, task))
216 goto interrupted; 216 goto interrupted;
217 if (timeout <= 0) 217 if (unlikely(timeout <= 0))
218 goto timed_out; 218 goto timed_out;
219 __set_task_state(task, state); 219 __set_task_state(task, state);
220 raw_spin_unlock_irq(&sem->lock); 220 raw_spin_unlock_irq(&sem->lock);
@@ -258,6 +258,6 @@ static noinline void __sched __up(struct semaphore *sem)
258 struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, 258 struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
259 struct semaphore_waiter, list); 259 struct semaphore_waiter, list);
260 list_del(&waiter->list); 260 list_del(&waiter->list);
261 waiter->up = 1; 261 waiter->up = true;
262 wake_up_process(waiter->task); 262 wake_up_process(waiter->task);
263} 263}
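Besides flipping the wakeup flag to bool, the hunk wraps the timeout check in unlikely(), a hint that the timed-out path is cold. In the kernel these hints boil down to __builtin_expect(); a small user-space sketch with the same macros (wait_step() is invented for the demo):

#include <stdio.h>

#define likely(x)    __builtin_expect(!!(x), 1)
#define unlikely(x)  __builtin_expect(!!(x), 0)

static long wait_step(long timeout)
{
    if (unlikely(timeout <= 0))    /* rare path: compiler lays it out cold */
        return -1;
    return timeout - 1;
}

int main(void)
{
    long t = 3;

    while ((t = wait_step(t)) >= 0)
        printf("tick, %ld left\n", t);
    return 0;
}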
diff --git a/kernel/signal.c b/kernel/signal.c
index 2ec870a4c3c4..113411bfe8b1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -32,6 +32,7 @@
32#include <linux/user_namespace.h> 32#include <linux/user_namespace.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/compat.h> 34#include <linux/compat.h>
35#include <linux/cn_proc.h>
35#define CREATE_TRACE_POINTS 36#define CREATE_TRACE_POINTS
36#include <trace/events/signal.h> 37#include <trace/events/signal.h>
37 38
@@ -485,6 +486,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)
485 if (force_default || ka->sa.sa_handler != SIG_IGN) 486 if (force_default || ka->sa.sa_handler != SIG_IGN)
486 ka->sa.sa_handler = SIG_DFL; 487 ka->sa.sa_handler = SIG_DFL;
487 ka->sa.sa_flags = 0; 488 ka->sa.sa_flags = 0;
489#ifdef __ARCH_HAS_SA_RESTORER
490 ka->sa.sa_restorer = NULL;
491#endif
488 sigemptyset(&ka->sa.sa_mask); 492 sigemptyset(&ka->sa.sa_mask);
489 ka++; 493 ka++;
490 } 494 }
@@ -851,12 +855,14 @@ static void ptrace_trap_notify(struct task_struct *t)
851 * Returns true if the signal should be actually delivered, otherwise 855 * Returns true if the signal should be actually delivered, otherwise
852 * it should be dropped. 856 * it should be dropped.
853 */ 857 */
854static int prepare_signal(int sig, struct task_struct *p, bool force) 858static bool prepare_signal(int sig, struct task_struct *p, bool force)
855{ 859{
856 struct signal_struct *signal = p->signal; 860 struct signal_struct *signal = p->signal;
857 struct task_struct *t; 861 struct task_struct *t;
858 862
859 if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) { 863 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
864 if (signal->flags & SIGNAL_GROUP_COREDUMP)
865 return sig == SIGKILL;
860 /* 866 /*
861 * The process is in the middle of dying, nothing to do. 867 * The process is in the middle of dying, nothing to do.
862 */ 868 */
@@ -1157,8 +1163,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1157static void print_fatal_signal(int signr) 1163static void print_fatal_signal(int signr)
1158{ 1164{
1159 struct pt_regs *regs = signal_pt_regs(); 1165 struct pt_regs *regs = signal_pt_regs();
1160 printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n", 1166 printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr);
1161 current->comm, task_pid_nr(current), signr);
1162 1167
1163#if defined(__i386__) && !defined(__arch_um__) 1168#if defined(__i386__) && !defined(__arch_um__)
1164 printk(KERN_INFO "code at %08lx: ", regs->ip); 1169 printk(KERN_INFO "code at %08lx: ", regs->ip);
@@ -2347,6 +2352,7 @@ relock:
2347 if (sig_kernel_coredump(signr)) { 2352 if (sig_kernel_coredump(signr)) {
2348 if (print_fatal_signals) 2353 if (print_fatal_signals)
2349 print_fatal_signal(info->si_signo); 2354 print_fatal_signal(info->si_signo);
2355 proc_coredump_connector(current);
2350 /* 2356 /*
2351 * If it was able to dump core, this kills all 2357 * If it was able to dump core, this kills all
2352 * other threads in the group and synchronizes with 2358 * other threads in the group and synchronizes with
@@ -2682,7 +2688,7 @@ static int do_sigpending(void *set, unsigned long sigsetsize)
2682/** 2688/**
2683 * sys_rt_sigpending - examine a pending signal that has been raised 2689 * sys_rt_sigpending - examine a pending signal that has been raised
2684 * while blocked 2690 * while blocked
2685 * @set: stores pending signals 2691 * @uset: stores pending signals
2686 * @sigsetsize: size of sigset_t type or larger 2692 * @sigsetsize: size of sigset_t type or larger
2687 */ 2693 */
2688SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) 2694SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
@@ -2945,7 +2951,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2945 2951
2946static int do_tkill(pid_t tgid, pid_t pid, int sig) 2952static int do_tkill(pid_t tgid, pid_t pid, int sig)
2947{ 2953{
2948 struct siginfo info; 2954 struct siginfo info = {};
2949 2955
2950 info.si_signo = sig; 2956 info.si_signo = sig;
2951 info.si_errno = 0; 2957 info.si_errno = 0;
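The do_tkill() change initializes the on-stack siginfo with `= {}`, so any field the function never assigns is zero rather than leftover stack contents by the time the structure is copied further. The snippet below illustrates the difference with an invented structure; note that the empty-brace form is a GNU extension, while portable C writes `= { 0 }`:

#include <stdio.h>

struct demo_info {
    int signo;
    int errno_;     /* underscore avoids the errno macro */
    int code;
    int pad[4];     /* fields the caller never touches */
};

static void fill_partial(struct demo_info *info)
{
    info->signo = 9;
    info->errno_ = 0;
    info->code = -6;    /* illustrative value only */
}

int main(void)
{
    struct demo_info a;             /* uninitialized: pad[] is indeterminate */
    struct demo_info b = { 0 };     /* every member starts at zero */

    fill_partial(&a);
    fill_partial(&b);
    printf("b.pad[0] = %d (a.pad[0] would be indeterminate)\n", b.pad[0]);
    return 0;
}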
diff --git a/kernel/smp.c b/kernel/smp.c
index 8e451f3ff51b..4dba0f7b72ad 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -100,16 +100,16 @@ void __init call_function_init(void)
100 * previous function call. For multi-cpu calls its even more interesting 100 * previous function call. For multi-cpu calls its even more interesting
101 * as we'll have to ensure no other cpu is observing our csd. 101 * as we'll have to ensure no other cpu is observing our csd.
102 */ 102 */
103static void csd_lock_wait(struct call_single_data *data) 103static void csd_lock_wait(struct call_single_data *csd)
104{ 104{
105 while (data->flags & CSD_FLAG_LOCK) 105 while (csd->flags & CSD_FLAG_LOCK)
106 cpu_relax(); 106 cpu_relax();
107} 107}
108 108
109static void csd_lock(struct call_single_data *data) 109static void csd_lock(struct call_single_data *csd)
110{ 110{
111 csd_lock_wait(data); 111 csd_lock_wait(csd);
112 data->flags = CSD_FLAG_LOCK; 112 csd->flags |= CSD_FLAG_LOCK;
113 113
114 /* 114 /*
115 * prevent CPU from reordering the above assignment 115 * prevent CPU from reordering the above assignment
@@ -119,16 +119,16 @@ static void csd_lock(struct call_single_data *data)
119 smp_mb(); 119 smp_mb();
120} 120}
121 121
122static void csd_unlock(struct call_single_data *data) 122static void csd_unlock(struct call_single_data *csd)
123{ 123{
124 WARN_ON(!(data->flags & CSD_FLAG_LOCK)); 124 WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
125 125
126 /* 126 /*
127 * ensure we're all done before releasing data: 127 * ensure we're all done before releasing data:
128 */ 128 */
129 smp_mb(); 129 smp_mb();
130 130
131 data->flags &= ~CSD_FLAG_LOCK; 131 csd->flags &= ~CSD_FLAG_LOCK;
132} 132}
133 133
134/* 134/*
@@ -137,7 +137,7 @@ static void csd_unlock(struct call_single_data *data)
137 * ->func, ->info, and ->flags set. 137 * ->func, ->info, and ->flags set.
138 */ 138 */
139static 139static
140void generic_exec_single(int cpu, struct call_single_data *data, int wait) 140void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
141{ 141{
142 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); 142 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
143 unsigned long flags; 143 unsigned long flags;
@@ -145,7 +145,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
145 145
146 raw_spin_lock_irqsave(&dst->lock, flags); 146 raw_spin_lock_irqsave(&dst->lock, flags);
147 ipi = list_empty(&dst->list); 147 ipi = list_empty(&dst->list);
148 list_add_tail(&data->list, &dst->list); 148 list_add_tail(&csd->list, &dst->list);
149 raw_spin_unlock_irqrestore(&dst->lock, flags); 149 raw_spin_unlock_irqrestore(&dst->lock, flags);
150 150
151 /* 151 /*
@@ -163,7 +163,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
163 arch_send_call_function_single_ipi(cpu); 163 arch_send_call_function_single_ipi(cpu);
164 164
165 if (wait) 165 if (wait)
166 csd_lock_wait(data); 166 csd_lock_wait(csd);
167} 167}
168 168
169/* 169/*
@@ -173,7 +173,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
173void generic_smp_call_function_single_interrupt(void) 173void generic_smp_call_function_single_interrupt(void)
174{ 174{
175 struct call_single_queue *q = &__get_cpu_var(call_single_queue); 175 struct call_single_queue *q = &__get_cpu_var(call_single_queue);
176 unsigned int data_flags;
177 LIST_HEAD(list); 176 LIST_HEAD(list);
178 177
179 /* 178 /*
@@ -186,25 +185,26 @@ void generic_smp_call_function_single_interrupt(void)
186 raw_spin_unlock(&q->lock); 185 raw_spin_unlock(&q->lock);
187 186
188 while (!list_empty(&list)) { 187 while (!list_empty(&list)) {
189 struct call_single_data *data; 188 struct call_single_data *csd;
189 unsigned int csd_flags;
190 190
191 data = list_entry(list.next, struct call_single_data, list); 191 csd = list_entry(list.next, struct call_single_data, list);
192 list_del(&data->list); 192 list_del(&csd->list);
193 193
194 /* 194 /*
195 * 'data' can be invalid after this call if flags == 0 195 * 'csd' can be invalid after this call if flags == 0
196 * (when called through generic_exec_single()), 196 * (when called through generic_exec_single()),
197 * so save them away before making the call: 197 * so save them away before making the call:
198 */ 198 */
199 data_flags = data->flags; 199 csd_flags = csd->flags;
200 200
201 data->func(data->info); 201 csd->func(csd->info);
202 202
203 /* 203 /*
204 * Unlocked CSDs are valid through generic_exec_single(): 204 * Unlocked CSDs are valid through generic_exec_single():
205 */ 205 */
206 if (data_flags & CSD_FLAG_LOCK) 206 if (csd_flags & CSD_FLAG_LOCK)
207 csd_unlock(data); 207 csd_unlock(csd);
208 } 208 }
209} 209}
210 210
@@ -249,16 +249,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
249 local_irq_restore(flags); 249 local_irq_restore(flags);
250 } else { 250 } else {
251 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { 251 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
252 struct call_single_data *data = &d; 252 struct call_single_data *csd = &d;
253 253
254 if (!wait) 254 if (!wait)
255 data = &__get_cpu_var(csd_data); 255 csd = &__get_cpu_var(csd_data);
256 256
257 csd_lock(data); 257 csd_lock(csd);
258 258
259 data->func = func; 259 csd->func = func;
260 data->info = info; 260 csd->info = info;
261 generic_exec_single(cpu, data, wait); 261 generic_exec_single(cpu, csd, wait);
262 } else { 262 } else {
263 err = -ENXIO; /* CPU not online */ 263 err = -ENXIO; /* CPU not online */
264 } 264 }
@@ -325,7 +325,7 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
325 * pre-allocated data structure. Useful for embedding @data inside 325 * pre-allocated data structure. Useful for embedding @data inside
326 * other structures, for instance. 326 * other structures, for instance.
327 */ 327 */
328void __smp_call_function_single(int cpu, struct call_single_data *data, 328void __smp_call_function_single(int cpu, struct call_single_data *csd,
329 int wait) 329 int wait)
330{ 330{
331 unsigned int this_cpu; 331 unsigned int this_cpu;
@@ -343,11 +343,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
343 343
344 if (cpu == this_cpu) { 344 if (cpu == this_cpu) {
345 local_irq_save(flags); 345 local_irq_save(flags);
346 data->func(data->info); 346 csd->func(csd->info);
347 local_irq_restore(flags); 347 local_irq_restore(flags);
348 } else { 348 } else {
349 csd_lock(data); 349 csd_lock(csd);
350 generic_exec_single(cpu, data, wait); 350 generic_exec_single(cpu, csd, wait);
351 } 351 }
352 put_cpu(); 352 put_cpu();
353} 353}
@@ -369,7 +369,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
369void smp_call_function_many(const struct cpumask *mask, 369void smp_call_function_many(const struct cpumask *mask,
370 smp_call_func_t func, void *info, bool wait) 370 smp_call_func_t func, void *info, bool wait)
371{ 371{
372 struct call_function_data *data; 372 struct call_function_data *cfd;
373 int cpu, next_cpu, this_cpu = smp_processor_id(); 373 int cpu, next_cpu, this_cpu = smp_processor_id();
374 374
375 /* 375 /*
@@ -401,24 +401,24 @@ void smp_call_function_many(const struct cpumask *mask,
401 return; 401 return;
402 } 402 }
403 403
404 data = &__get_cpu_var(cfd_data); 404 cfd = &__get_cpu_var(cfd_data);
405 405
406 cpumask_and(data->cpumask, mask, cpu_online_mask); 406 cpumask_and(cfd->cpumask, mask, cpu_online_mask);
407 cpumask_clear_cpu(this_cpu, data->cpumask); 407 cpumask_clear_cpu(this_cpu, cfd->cpumask);
408 408
409 /* Some callers race with other cpus changing the passed mask */ 409 /* Some callers race with other cpus changing the passed mask */
410 if (unlikely(!cpumask_weight(data->cpumask))) 410 if (unlikely(!cpumask_weight(cfd->cpumask)))
411 return; 411 return;
412 412
413 /* 413 /*
414 * After we put an entry into the list, data->cpumask 414 * After we put an entry into the list, cfd->cpumask may be cleared
415 * may be cleared again when another CPU sends another IPI for 415 * again when another CPU sends another IPI for a SMP function call, so
416 * a SMP function call, so data->cpumask will be zero. 416 * cfd->cpumask will be zero.
417 */ 417 */
418 cpumask_copy(data->cpumask_ipi, data->cpumask); 418 cpumask_copy(cfd->cpumask_ipi, cfd->cpumask);
419 419
420 for_each_cpu(cpu, data->cpumask) { 420 for_each_cpu(cpu, cfd->cpumask) {
421 struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); 421 struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
422 struct call_single_queue *dst = 422 struct call_single_queue *dst =
423 &per_cpu(call_single_queue, cpu); 423 &per_cpu(call_single_queue, cpu);
424 unsigned long flags; 424 unsigned long flags;
@@ -433,12 +433,13 @@ void smp_call_function_many(const struct cpumask *mask,
433 } 433 }
434 434
435 /* Send a message to all CPUs in the map */ 435 /* Send a message to all CPUs in the map */
436 arch_send_call_function_ipi_mask(data->cpumask_ipi); 436 arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
437 437
438 if (wait) { 438 if (wait) {
439 for_each_cpu(cpu, data->cpumask) { 439 for_each_cpu(cpu, cfd->cpumask) {
440 struct call_single_data *csd = 440 struct call_single_data *csd;
441 per_cpu_ptr(data->csd, cpu); 441
442 csd = per_cpu_ptr(cfd->csd, cpu);
442 csd_lock_wait(csd); 443 csd_lock_wait(csd);
443 } 444 }
444 } 445 }
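The smp.c hunks are mostly a rename from the ambiguous "data" to csd/cfd around the cross-CPU call plumbing; the exported interfaces keep their signatures. For context, a hedged module sketch of the consumer side, running a function on another CPU and waiting for it (CPU number, message and module names are arbitrary):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/smp.h>

static void remote_hello(void *info)
{
    /* Runs in IPI context on the target CPU: keep it short, no sleeping. */
    pr_info("hello from CPU %d, cookie=%lu\n",
            smp_processor_id(), *(unsigned long *)info);
}

static int __init csd_demo_init(void)
{
    unsigned long cookie = 42;
    int target = 1;    /* assumes a second online CPU */
    int ret;

    /* wait=1: return only after remote_hello() has finished remotely. */
    ret = smp_call_function_single(target, remote_hello, &cookie, 1);
    if (ret)
        pr_info("cpu %d not available: %d\n", target, ret);
    return 0;
}

static void __exit csd_demo_exit(void)
{
}

module_init(csd_demo_init);
module_exit(csd_demo_exit);
MODULE_LICENSE("GPL");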
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 8eaed9aa9cf0..02fc5c933673 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
185 } 185 }
186 get_task_struct(tsk); 186 get_task_struct(tsk);
187 *per_cpu_ptr(ht->store, cpu) = tsk; 187 *per_cpu_ptr(ht->store, cpu) = tsk;
188 if (ht->create) 188 if (ht->create) {
189 ht->create(cpu); 189 /*
190 * Make sure that the task has actually scheduled out
191 * into park position, before calling the create
192 * callback. At least the migration thread callback
193 * requires that the task is off the runqueue.
194 */
195 if (!wait_task_inactive(tsk, TASK_PARKED))
196 WARN_ON(1);
197 else
198 ht->create(cpu);
199 }
190 return 0; 200 return 0;
191} 201}
192 202
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 14d7758074aa..b5197dcb0dad 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -329,6 +329,19 @@ static inline void invoke_softirq(void)
329 wakeup_softirqd(); 329 wakeup_softirqd();
330} 330}
331 331
332static inline void tick_irq_exit(void)
333{
334#ifdef CONFIG_NO_HZ_COMMON
335 int cpu = smp_processor_id();
336
337 /* Make sure that timer wheel updates are propagated */
338 if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
339 if (!in_interrupt())
340 tick_nohz_irq_exit();
341 }
342#endif
343}
344
332/* 345/*
333 * Exit an interrupt context. Process softirqs if needed and possible: 346 * Exit an interrupt context. Process softirqs if needed and possible:
334 */ 347 */
@@ -346,11 +359,7 @@ void irq_exit(void)
346 if (!in_interrupt() && local_softirq_pending()) 359 if (!in_interrupt() && local_softirq_pending())
347 invoke_softirq(); 360 invoke_softirq();
348 361
349#ifdef CONFIG_NO_HZ 362 tick_irq_exit();
350 /* Make sure that timer wheel updates are propagated */
351 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
352 tick_nohz_irq_exit();
353#endif
354 rcu_irq_exit(); 363 rcu_irq_exit();
355} 364}
356 365
@@ -620,8 +629,7 @@ static void remote_softirq_receive(void *data)
620 unsigned long flags; 629 unsigned long flags;
621 int softirq; 630 int softirq;
622 631
623 softirq = cp->priv; 632 softirq = *(int *)cp->info;
624
625 local_irq_save(flags); 633 local_irq_save(flags);
626 __local_trigger(cp, softirq); 634 __local_trigger(cp, softirq);
627 local_irq_restore(flags); 635 local_irq_restore(flags);
@@ -631,9 +639,8 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
631{ 639{
632 if (cpu_online(cpu)) { 640 if (cpu_online(cpu)) {
633 cp->func = remote_softirq_receive; 641 cp->func = remote_softirq_receive;
634 cp->info = cp; 642 cp->info = &softirq;
635 cp->flags = 0; 643 cp->flags = 0;
636 cp->priv = softirq;
637 644
638 __smp_call_function_single(cpu, cp, 0); 645 __smp_call_function_single(cpu, cp, 0);
639 return 0; 646 return 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index 81f56445fba9..b95d3c72ba21 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -49,6 +49,11 @@
49#include <linux/user_namespace.h> 49#include <linux/user_namespace.h>
50#include <linux/binfmts.h> 50#include <linux/binfmts.h>
51 51
52#include <linux/sched.h>
53#include <linux/rcupdate.h>
54#include <linux/uidgid.h>
55#include <linux/cred.h>
56
52#include <linux/kmsg_dump.h> 57#include <linux/kmsg_dump.h>
53/* Move somewhere else to avoid recompiling? */ 58/* Move somewhere else to avoid recompiling? */
54#include <generated/utsrelease.h> 59#include <generated/utsrelease.h>
@@ -324,7 +329,6 @@ void kernel_restart_prepare(char *cmd)
324 system_state = SYSTEM_RESTART; 329 system_state = SYSTEM_RESTART;
325 usermodehelper_disable(); 330 usermodehelper_disable();
326 device_shutdown(); 331 device_shutdown();
327 syscore_shutdown();
328} 332}
329 333
330/** 334/**
@@ -370,6 +374,7 @@ void kernel_restart(char *cmd)
370{ 374{
371 kernel_restart_prepare(cmd); 375 kernel_restart_prepare(cmd);
372 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
377 syscore_shutdown();
373 if (!cmd) 378 if (!cmd)
374 printk(KERN_EMERG "Restarting system.\n"); 379 printk(KERN_EMERG "Restarting system.\n");
375 else 380 else
@@ -395,6 +400,7 @@ static void kernel_shutdown_prepare(enum system_states state)
395void kernel_halt(void) 400void kernel_halt(void)
396{ 401{
397 kernel_shutdown_prepare(SYSTEM_HALT); 402 kernel_shutdown_prepare(SYSTEM_HALT);
403 disable_nonboot_cpus();
398 syscore_shutdown(); 404 syscore_shutdown();
399 printk(KERN_EMERG "System halted.\n"); 405 printk(KERN_EMERG "System halted.\n");
400 kmsg_dump(KMSG_DUMP_HALT); 406 kmsg_dump(KMSG_DUMP_HALT);
@@ -1043,6 +1049,67 @@ change_okay:
1043 return old_fsgid; 1049 return old_fsgid;
1044} 1050}
1045 1051
1052/**
1053 * sys_getpid - return the thread group id of the current process
1054 *
1055 * Note, despite the name, this returns the tgid not the pid. The tgid and
1056 * the pid are identical unless CLONE_THREAD was specified on clone() in
1057 * which case the tgid is the same in all threads of the same group.
1058 *
1059 * This is SMP safe as current->tgid does not change.
1060 */
1061SYSCALL_DEFINE0(getpid)
1062{
1063 return task_tgid_vnr(current);
1064}
1065
1066/* Thread ID - the internal kernel "pid" */
1067SYSCALL_DEFINE0(gettid)
1068{
1069 return task_pid_vnr(current);
1070}
1071
1072/*
1073 * Accessing ->real_parent is not SMP-safe, it could
1074 * change from under us. However, we can use a stale
1075 * value of ->real_parent under rcu_read_lock(), see
1076 * release_task()->call_rcu(delayed_put_task_struct).
1077 */
1078SYSCALL_DEFINE0(getppid)
1079{
1080 int pid;
1081
1082 rcu_read_lock();
1083 pid = task_tgid_vnr(rcu_dereference(current->real_parent));
1084 rcu_read_unlock();
1085
1086 return pid;
1087}
1088
1089SYSCALL_DEFINE0(getuid)
1090{
1091 /* Only we change this so SMP safe */
1092 return from_kuid_munged(current_user_ns(), current_uid());
1093}
1094
1095SYSCALL_DEFINE0(geteuid)
1096{
1097 /* Only we change this so SMP safe */
1098 return from_kuid_munged(current_user_ns(), current_euid());
1099}
1100
1101SYSCALL_DEFINE0(getgid)
1102{
1103 /* Only we change this so SMP safe */
1104 return from_kgid_munged(current_user_ns(), current_gid());
1105}
1106
1107SYSCALL_DEFINE0(getegid)
1108{
1109 /* Only we change this so SMP safe */
1110 return from_kgid_munged(current_user_ns(), current_egid());
1111}
1112
1046void do_sys_times(struct tms *tms) 1113void do_sys_times(struct tms *tms)
1047{ 1114{
1048 cputime_t tgutime, tgstime, cutime, cstime; 1115 cputime_t tgutime, tgstime, cutime, cstime;
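Of the new syscall definitions, sys_getppid() is the one with a real pattern in it: current->real_parent can change concurrently, so it is sampled inside rcu_read_lock()/rcu_dereference(), trading possible staleness for a pointer that cannot be freed underneath the reader. The fragment below restates that read-side shape with placeholder types (struct foo and gp are invented; the matching updater would publish with rcu_assign_pointer() and defer freeing past a grace period):

#include <linux/rcupdate.h>

struct foo {
    int value;
};

static struct foo __rcu *gp;    /* published elsewhere via rcu_assign_pointer() */

static int read_value(void)
{
    struct foo *p;
    int val = -1;

    rcu_read_lock();                /* reader section: no blocking inside */
    p = rcu_dereference(gp);        /* load the protected pointer */
    if (p)
        val = p->value;             /* p stays valid until rcu_read_unlock() */
    rcu_read_unlock();

    return val;                     /* possibly stale, never dangling */
}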
@@ -1784,13 +1851,26 @@ SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
1784 return getrusage(current, who, ru); 1851 return getrusage(current, who, ru);
1785} 1852}
1786 1853
1854#ifdef CONFIG_COMPAT
1855COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
1856{
1857 struct rusage r;
1858
1859 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1860 who != RUSAGE_THREAD)
1861 return -EINVAL;
1862
1863 k_getrusage(current, who, &r);
1864 return put_compat_rusage(&r, ru);
1865}
1866#endif
1867
1787SYSCALL_DEFINE1(umask, int, mask) 1868SYSCALL_DEFINE1(umask, int, mask)
1788{ 1869{
1789 mask = xchg(&current->fs->umask, mask & S_IRWXUGO); 1870 mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
1790 return mask; 1871 return mask;
1791} 1872}
1792 1873
1793#ifdef CONFIG_CHECKPOINT_RESTORE
1794static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1874static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1795{ 1875{
1796 struct fd exe; 1876 struct fd exe;
@@ -1984,17 +2064,12 @@ out:
1984 return error; 2064 return error;
1985} 2065}
1986 2066
2067#ifdef CONFIG_CHECKPOINT_RESTORE
1987static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) 2068static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1988{ 2069{
1989 return put_user(me->clear_child_tid, tid_addr); 2070 return put_user(me->clear_child_tid, tid_addr);
1990} 2071}
1991 2072#else
1992#else /* CONFIG_CHECKPOINT_RESTORE */
1993static int prctl_set_mm(int opt, unsigned long addr,
1994 unsigned long arg4, unsigned long arg5)
1995{
1996 return -EINVAL;
1997}
1998static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) 2073static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1999{ 2074{
2000 return -EINVAL; 2075 return -EINVAL;
@@ -2185,9 +2260,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2185 2260
2186char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 2261char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2187 2262
2188static int __orderly_poweroff(void) 2263static int __orderly_poweroff(bool force)
2189{ 2264{
2190 int argc;
2191 char **argv; 2265 char **argv;
2192 static char *envp[] = { 2266 static char *envp[] = {
2193 "HOME=/", 2267 "HOME=/",
@@ -2196,20 +2270,40 @@ static int __orderly_poweroff(void)
2196 }; 2270 };
2197 int ret; 2271 int ret;
2198 2272
2199 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); 2273 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
2200 if (argv == NULL) { 2274 if (argv) {
2275 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
2276 argv_free(argv);
2277 } else {
2201 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2278 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2202 __func__, poweroff_cmd); 2279 __func__, poweroff_cmd);
2203 return -ENOMEM; 2280 ret = -ENOMEM;
2204 } 2281 }
2205 2282
2206 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, 2283 if (ret && force) {
2207 NULL, NULL, NULL); 2284 printk(KERN_WARNING "Failed to start orderly shutdown: "
2208 argv_free(argv); 2285 "forcing the issue\n");
2286 /*
2287 * I guess this should try to kick off some daemon to sync and
2288 * poweroff asap. Or not even bother syncing if we're doing an
2289 * emergency shutdown?
2290 */
2291 emergency_sync();
2292 kernel_power_off();
2293 }
2209 2294
2210 return ret; 2295 return ret;
2211} 2296}
2212 2297
2298static bool poweroff_force;
2299
2300static void poweroff_work_func(struct work_struct *work)
2301{
2302 __orderly_poweroff(poweroff_force);
2303}
2304
2305static DECLARE_WORK(poweroff_work, poweroff_work_func);
2306
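The block just above is the heart of the rework: orderly_poweroff() now only records the force flag and queues poweroff_work, and the usermode helper runs later from the workqueue in process context, where sleeping and GFP_KERNEL allocations are fine (note the matching GFP_ATOMIC to GFP_KERNEL switch in __orderly_poweroff()). A hedged standalone sketch of the same defer-to-a-workqueue shape, with all names invented:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

static void demo_work_func(struct work_struct *work)
{
    /* Process context: may sleep, allocate with GFP_KERNEL, etc. */
    pr_info("deferred work running\n");
}

static DECLARE_WORK(demo_work, demo_work_func);

/* Callable from almost any context: just queue and return. */
static void demo_trigger(void)
{
    schedule_work(&demo_work);
}

static int __init wq_demo_init(void)
{
    demo_trigger();
    return 0;
}

static void __exit wq_demo_exit(void)
{
    /* Cancel or wait out any pending run before the module goes away. */
    cancel_work_sync(&demo_work);
}

module_init(wq_demo_init);
module_exit(wq_demo_exit);
MODULE_LICENSE("GPL");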
2213/** 2307/**
2214 * orderly_poweroff - Trigger an orderly system poweroff 2308 * orderly_poweroff - Trigger an orderly system poweroff
2215 * @force: force poweroff if command execution fails 2309 * @force: force poweroff if command execution fails
@@ -2219,21 +2313,154 @@ static int __orderly_poweroff(void)
2219 */ 2313 */
2220int orderly_poweroff(bool force) 2314int orderly_poweroff(bool force)
2221{ 2315{
2222 int ret = __orderly_poweroff(); 2316 if (force) /* do not override the pending "true" */
2317 poweroff_force = true;
2318 schedule_work(&poweroff_work);
2319 return 0;
2320}
2321EXPORT_SYMBOL_GPL(orderly_poweroff);
2223 2322
2224 if (ret && force) { 2323/**
2225 printk(KERN_WARNING "Failed to start orderly shutdown: " 2324 * do_sysinfo - fill in sysinfo struct
2226 "forcing the issue\n"); 2325 * @info: pointer to buffer to fill
2326 */
2327static int do_sysinfo(struct sysinfo *info)
2328{
2329 unsigned long mem_total, sav_total;
2330 unsigned int mem_unit, bitcount;
2331 struct timespec tp;
2227 2332
2228 /* 2333 memset(info, 0, sizeof(struct sysinfo));
2229 * I guess this should try to kick off some daemon to sync and 2334
2230 * poweroff asap. Or not even bother syncing if we're doing an 2335 ktime_get_ts(&tp);
2231 * emergency shutdown? 2336 monotonic_to_bootbased(&tp);
2232 */ 2337 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
2233 emergency_sync(); 2338
2234 kernel_power_off(); 2339 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
2340
2341 info->procs = nr_threads;
2342
2343 si_meminfo(info);
2344 si_swapinfo(info);
2345
2346 /*
2347 * If the sum of all the available memory (i.e. ram + swap)
2348 * is less than can be stored in a 32 bit unsigned long then
2349 * we can be binary compatible with 2.2.x kernels. If not,
2350 * well, in that case 2.2.x was broken anyways...
2351 *
2352 * -Erik Andersen <andersee@debian.org>
2353 */
2354
2355 mem_total = info->totalram + info->totalswap;
2356 if (mem_total < info->totalram || mem_total < info->totalswap)
2357 goto out;
2358 bitcount = 0;
2359 mem_unit = info->mem_unit;
2360 while (mem_unit > 1) {
2361 bitcount++;
2362 mem_unit >>= 1;
2363 sav_total = mem_total;
2364 mem_total <<= 1;
2365 if (mem_total < sav_total)
2366 goto out;
2235 } 2367 }
2236 2368
2237 return ret; 2369 /*
2370 * If mem_total did not overflow, multiply all memory values by
2371 * info->mem_unit and set it to 1. This leaves things compatible
2372 * with 2.2.x, and also retains compatibility with earlier 2.4.x
2373 * kernels...
2374 */
2375
2376 info->mem_unit = 1;
2377 info->totalram <<= bitcount;
2378 info->freeram <<= bitcount;
2379 info->sharedram <<= bitcount;
2380 info->bufferram <<= bitcount;
2381 info->totalswap <<= bitcount;
2382 info->freeswap <<= bitcount;
2383 info->totalhigh <<= bitcount;
2384 info->freehigh <<= bitcount;
2385
2386out:
2387 return 0;
2238} 2388}
2239EXPORT_SYMBOL_GPL(orderly_poweroff); 2389
2390SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
2391{
2392 struct sysinfo val;
2393
2394 do_sysinfo(&val);
2395
2396 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
2397 return -EFAULT;
2398
2399 return 0;
2400}
2401
2402#ifdef CONFIG_COMPAT
2403struct compat_sysinfo {
2404 s32 uptime;
2405 u32 loads[3];
2406 u32 totalram;
2407 u32 freeram;
2408 u32 sharedram;
2409 u32 bufferram;
2410 u32 totalswap;
2411 u32 freeswap;
2412 u16 procs;
2413 u16 pad;
2414 u32 totalhigh;
2415 u32 freehigh;
2416 u32 mem_unit;
2417 char _f[20-2*sizeof(u32)-sizeof(int)];
2418};
2419
2420COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
2421{
2422 struct sysinfo s;
2423
2424 do_sysinfo(&s);
2425
2426 /* Check to see if any memory value is too large for 32-bit and scale
2427 * down if needed
2428 */
2429 if ((s.totalram >> 32) || (s.totalswap >> 32)) {
2430 int bitcount = 0;
2431
2432 while (s.mem_unit < PAGE_SIZE) {
2433 s.mem_unit <<= 1;
2434 bitcount++;
2435 }
2436
2437 s.totalram >>= bitcount;
2438 s.freeram >>= bitcount;
2439 s.sharedram >>= bitcount;
2440 s.bufferram >>= bitcount;
2441 s.totalswap >>= bitcount;
2442 s.freeswap >>= bitcount;
2443 s.totalhigh >>= bitcount;
2444 s.freehigh >>= bitcount;
2445 }
2446
2447 if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
2448 __put_user(s.uptime, &info->uptime) ||
2449 __put_user(s.loads[0], &info->loads[0]) ||
2450 __put_user(s.loads[1], &info->loads[1]) ||
2451 __put_user(s.loads[2], &info->loads[2]) ||
2452 __put_user(s.totalram, &info->totalram) ||
2453 __put_user(s.freeram, &info->freeram) ||
2454 __put_user(s.sharedram, &info->sharedram) ||
2455 __put_user(s.bufferram, &info->bufferram) ||
2456 __put_user(s.totalswap, &info->totalswap) ||
2457 __put_user(s.freeswap, &info->freeswap) ||
2458 __put_user(s.procs, &info->procs) ||
2459 __put_user(s.totalhigh, &info->totalhigh) ||
2460 __put_user(s.freehigh, &info->freehigh) ||
2461 __put_user(s.mem_unit, &info->mem_unit))
2462 return -EFAULT;
2463
2464 return 0;
2465}
2466#endif /* CONFIG_COMPAT */
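compat_sysinfo() has to fit 64-bit memory counters into 32-bit fields, so it grows mem_unit up to the page size and right-shifts every counter by the same number of bits; userspace can still recover byte counts as value * mem_unit. The same arithmetic as a runnable user-space demo (the totals are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* Pretend totals from a machine with ~8 TiB of RAM, in 1-byte units. */
    uint64_t totalram = 8ULL << 40;
    uint64_t totalswap = 1ULL << 40;
    uint32_t mem_unit = 1;
    const uint32_t page_size = 4096;
    int bitcount = 0;

    /* Mirror the kernel's loop: scale until the unit reaches PAGE_SIZE. */
    if ((totalram >> 32) || (totalswap >> 32)) {
        while (mem_unit < page_size) {
            mem_unit <<= 1;
            bitcount++;
        }
        totalram >>= bitcount;
        totalswap >>= bitcount;
    }

    printf("mem_unit=%u, totalram=%llu units (= %llu bytes)\n",
           mem_unit,
           (unsigned long long)totalram,
           (unsigned long long)(totalram * mem_unit));
    return 0;
}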
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 395084d4ce16..bfd6787b355a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -20,6 +20,7 @@ cond_syscall(sys_quotactl);
20cond_syscall(sys32_quotactl); 20cond_syscall(sys32_quotactl);
21cond_syscall(sys_acct); 21cond_syscall(sys_acct);
22cond_syscall(sys_lookup_dcookie); 22cond_syscall(sys_lookup_dcookie);
23cond_syscall(compat_sys_lookup_dcookie);
23cond_syscall(sys_swapon); 24cond_syscall(sys_swapon);
24cond_syscall(sys_swapoff); 25cond_syscall(sys_swapoff);
25cond_syscall(sys_kexec_load); 26cond_syscall(sys_kexec_load);
@@ -155,7 +156,7 @@ cond_syscall(compat_sys_process_vm_writev);
155cond_syscall(sys_pciconfig_read); 156cond_syscall(sys_pciconfig_read);
156cond_syscall(sys_pciconfig_write); 157cond_syscall(sys_pciconfig_write);
157cond_syscall(sys_pciconfig_iobase); 158cond_syscall(sys_pciconfig_iobase);
158cond_syscall(sys32_ipc); 159cond_syscall(compat_sys_s390_ipc);
159cond_syscall(ppc_rtas); 160cond_syscall(ppc_rtas);
160cond_syscall(sys_spu_run); 161cond_syscall(sys_spu_run);
161cond_syscall(sys_spu_create); 162cond_syscall(sys_spu_create);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index afc1dc60f3f8..9edcf456e0fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit;
106#endif 106#endif
107extern int pid_max; 107extern int pid_max;
108extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
109extern int sysctl_drop_caches;
110extern int percpu_pagelist_fraction; 109extern int percpu_pagelist_fraction;
111extern int compat_log; 110extern int compat_log;
112extern int latencytop_enabled; 111extern int latencytop_enabled;
@@ -1430,6 +1429,20 @@ static struct ctl_table vm_table[] = {
1430 .extra2 = &one, 1429 .extra2 = &one,
1431 }, 1430 },
1432#endif 1431#endif
1432 {
1433 .procname = "user_reserve_kbytes",
1434 .data = &sysctl_user_reserve_kbytes,
1435 .maxlen = sizeof(sysctl_user_reserve_kbytes),
1436 .mode = 0644,
1437 .proc_handler = proc_doulongvec_minmax,
1438 },
1439 {
1440 .procname = "admin_reserve_kbytes",
1441 .data = &sysctl_admin_reserve_kbytes,
1442 .maxlen = sizeof(sysctl_admin_reserve_kbytes),
1443 .mode = 0644,
1444 .proc_handler = proc_doulongvec_minmax,
1445 },
1433 { } 1446 { }
1434}; 1447};
1435 1448
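The two new vm_table entries expose sysctl_user_reserve_kbytes and sysctl_admin_reserve_kbytes through proc_doulongvec_minmax, which should surface them as 0644 files under /proc/sys/vm/ (the path is inferred from the table name; the backing variables live in mm/ and are not part of this hunk). A trivial reader under that assumption:

#include <stdio.h>

int main(void)
{
    const char *paths[] = {
        "/proc/sys/vm/user_reserve_kbytes",     /* assumed location */
        "/proc/sys/vm/admin_reserve_kbytes",
    };
    char buf[64];

    for (int i = 0; i < 2; i++) {
        FILE *f = fopen(paths[i], "r");

        if (!f) {
            perror(paths[i]);
            continue;
        }
        if (fgets(buf, sizeof(buf), f))
            printf("%s = %s", paths[i], buf);
        fclose(f);
    }
    return 0;
}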
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index f8b11a283171..12d6ebbfdd83 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -365,7 +365,7 @@ int init_test_probes(void)
365 target2 = kprobe_target2; 365 target2 = kprobe_target2;
366 366
367 do { 367 do {
368 rand1 = random32(); 368 rand1 = prandom_u32();
369 } while (rand1 <= div_factor); 369 } while (rand1 <= div_factor);
370 370
371 printk(KERN_INFO "Kprobe smoke test started\n"); 371 printk(KERN_INFO "Kprobe smoke test started\n");
diff --git a/kernel/time.c b/kernel/time.c
index f8342a41efa6..d3617dbd3dca 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -138,13 +138,14 @@ int persistent_clock_is_local;
138 */ 138 */
139static inline void warp_clock(void) 139static inline void warp_clock(void)
140{ 140{
141 struct timespec adjust; 141 if (sys_tz.tz_minuteswest != 0) {
142 struct timespec adjust;
142 143
143 adjust = current_kernel_time();
144 if (sys_tz.tz_minuteswest != 0)
145 persistent_clock_is_local = 1; 144 persistent_clock_is_local = 1;
146 adjust.tv_sec += sys_tz.tz_minuteswest * 60; 145 adjust.tv_sec = sys_tz.tz_minuteswest * 60;
147 do_settimeofday(&adjust); 146 adjust.tv_nsec = 0;
147 timekeeping_inject_offset(&adjust);
148 }
148} 149}
149 150
150/* 151/*
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 24510d84efd7..e4c07b0692bb 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -64,20 +64,88 @@ config GENERIC_CMOS_UPDATE
64if GENERIC_CLOCKEVENTS 64if GENERIC_CLOCKEVENTS
65menu "Timers subsystem" 65menu "Timers subsystem"
66 66
67# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is 67# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
68# only related to the tick functionality. Oneshot clockevent devices 68# only related to the tick functionality. Oneshot clockevent devices
69# are supported independ of this. 69# are supported independ of this.
70config TICK_ONESHOT 70config TICK_ONESHOT
71 bool 71 bool
72 72
73config NO_HZ 73config NO_HZ_COMMON
74 bool "Tickless System (Dynamic Ticks)" 74 bool
75 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS 75 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
76 select TICK_ONESHOT 76 select TICK_ONESHOT
77
78choice
79 prompt "Timer tick handling"
80 default NO_HZ_IDLE if NO_HZ
81
82config HZ_PERIODIC
83 bool "Periodic timer ticks (constant rate, no dynticks)"
84 help
85 This option keeps the tick running periodically at a constant
86 rate, even when the CPU doesn't need it.
87
88config NO_HZ_IDLE
89 bool "Idle dynticks system (tickless idle)"
90 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
91 select NO_HZ_COMMON
92 help
93 This option enables a tickless idle system: timer interrupts
94 will only trigger on an as-needed basis when the system is idle.
95 This is usually interesting for energy saving.
96
97 Most of the time you want to say Y here.
98
99config NO_HZ_FULL
100 bool "Full dynticks system (tickless)"
101 # NO_HZ_COMMON dependency
102 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
103 # We need at least one periodic CPU for timekeeping
104 depends on SMP
105 # RCU_USER_QS dependency
106 depends on HAVE_CONTEXT_TRACKING
107 # VIRT_CPU_ACCOUNTING_GEN dependency
108 depends on 64BIT
109 select NO_HZ_COMMON
110 select RCU_USER_QS
111 select RCU_NOCB_CPU
112 select VIRT_CPU_ACCOUNTING_GEN
113 select CONTEXT_TRACKING_FORCE
114 select IRQ_WORK
115 help
116 Adaptively try to shutdown the tick whenever possible, even when
117 the CPU is running tasks. Typically this requires running a single
118 task on the CPU. Chances for running tickless are maximized when
119 the task mostly runs in userspace and has few kernel activity.
120
121 You need to fill up the nohz_full boot parameter with the
122 desired range of dynticks CPUs.
123
124 This is implemented at the expense of some overhead in user <-> kernel
125 transitions: syscalls, exceptions and interrupts. Even when it's
126 dynamically off.
127
128 Say N.
129
130endchoice
131
132config NO_HZ_FULL_ALL
133 bool "Full dynticks system on all CPUs by default"
134 depends on NO_HZ_FULL
135 help
136 If the user doesn't pass the nohz_full boot option to
137 define the range of full dynticks CPUs, consider that all
138 CPUs in the system are full dynticks by default.
139 Note the boot CPU will still be kept outside the range to
140 handle the timekeeping duty.
141
142config NO_HZ
143 bool "Old Idle dynticks config"
144 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
77 help 145 help
78 This option enables a tickless system: timer interrupts will 146 This is the old config entry that enables dynticks idle.
79 only trigger on an as-needed basis both when the system is 147 We keep it around for a little while to enforce backward
80 busy and when the system is idle. 148 compatibility with older config files.
81 149
82config HIGH_RES_TIMERS 150config HIGH_RES_TIMERS
83 bool "High Resolution Timer Support" 151 bool "High Resolution Timer Support"
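Taken together, the new choice block means a configuration selects exactly one of HZ_PERIODIC, NO_HZ_IDLE or NO_HZ_FULL, while the surviving NO_HZ entry only maps old config files onto the idle-dynticks behaviour. For a full-dynticks kernel the CPUs still have to be named at boot: for example, passing nohz_full=1-7 keeps CPU 0 as the timekeeping/housekeeping CPU, or CONFIG_NO_HZ_FULL_ALL=y covers every CPU except the boot CPU by default.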
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 072bb066bb7d..12ff13a838c6 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -18,13 +18,14 @@
18#include <linux/rtc.h> 18#include <linux/rtc.h>
19 19
20#include "tick-internal.h" 20#include "tick-internal.h"
21#include "ntp_internal.h"
21 22
22/* 23/*
23 * NTP timekeeping variables: 24 * NTP timekeeping variables:
25 *
26 * Note: All of the NTP state is protected by the timekeeping locks.
24 */ 27 */
25 28
26DEFINE_RAW_SPINLOCK(ntp_lock);
27
28 29
29/* USER_HZ period (usecs): */ 30/* USER_HZ period (usecs): */
30unsigned long tick_usec = TICK_USEC; 31unsigned long tick_usec = TICK_USEC;
@@ -53,9 +54,6 @@ static int time_state = TIME_OK;
53/* clock status bits: */ 54/* clock status bits: */
54static int time_status = STA_UNSYNC; 55static int time_status = STA_UNSYNC;
55 56
56/* TAI offset (secs): */
57static long time_tai;
58
59/* time adjustment (nsecs): */ 57/* time adjustment (nsecs): */
60static s64 time_offset; 58static s64 time_offset;
61 59
@@ -134,8 +132,6 @@ static inline void pps_reset_freq_interval(void)
134 132
135/** 133/**
136 * pps_clear - Clears the PPS state variables 134 * pps_clear - Clears the PPS state variables
137 *
138 * Must be called while holding a write on the ntp_lock
139 */ 135 */
140static inline void pps_clear(void) 136static inline void pps_clear(void)
141{ 137{
@@ -150,8 +146,6 @@ static inline void pps_clear(void)
150/* Decrease pps_valid to indicate that another second has passed since 146/* Decrease pps_valid to indicate that another second has passed since
151 * the last PPS signal. When it reaches 0, indicate that PPS signal is 147 * the last PPS signal. When it reaches 0, indicate that PPS signal is
152 * missing. 148 * missing.
153 *
154 * Must be called while holding a write on the ntp_lock
155 */ 149 */
156static inline void pps_dec_valid(void) 150static inline void pps_dec_valid(void)
157{ 151{
@@ -346,10 +340,6 @@ static void ntp_update_offset(long offset)
346 */ 340 */
347void ntp_clear(void) 341void ntp_clear(void)
348{ 342{
349 unsigned long flags;
350
351 raw_spin_lock_irqsave(&ntp_lock, flags);
352
353 time_adjust = 0; /* stop active adjtime() */ 343 time_adjust = 0; /* stop active adjtime() */
354 time_status |= STA_UNSYNC; 344 time_status |= STA_UNSYNC;
355 time_maxerror = NTP_PHASE_LIMIT; 345 time_maxerror = NTP_PHASE_LIMIT;
@@ -362,20 +352,12 @@ void ntp_clear(void)
362 352
363 /* Clear PPS state variables */ 353 /* Clear PPS state variables */
364 pps_clear(); 354 pps_clear();
365 raw_spin_unlock_irqrestore(&ntp_lock, flags);
366
367} 355}
368 356
369 357
370u64 ntp_tick_length(void) 358u64 ntp_tick_length(void)
371{ 359{
372 unsigned long flags; 360 return tick_length;
373 s64 ret;
374
375 raw_spin_lock_irqsave(&ntp_lock, flags);
376 ret = tick_length;
377 raw_spin_unlock_irqrestore(&ntp_lock, flags);
378 return ret;
379} 361}
380 362
381 363
@@ -393,9 +375,6 @@ int second_overflow(unsigned long secs)
393{ 375{
394 s64 delta; 376 s64 delta;
395 int leap = 0; 377 int leap = 0;
396 unsigned long flags;
397
398 raw_spin_lock_irqsave(&ntp_lock, flags);
399 378
400 /* 379 /*
401 * Leap second processing. If in leap-insert state at the end of the 380 * Leap second processing. If in leap-insert state at the end of the
@@ -415,7 +394,6 @@ int second_overflow(unsigned long secs)
415 else if (secs % 86400 == 0) { 394 else if (secs % 86400 == 0) {
416 leap = -1; 395 leap = -1;
417 time_state = TIME_OOP; 396 time_state = TIME_OOP;
418 time_tai++;
419 printk(KERN_NOTICE 397 printk(KERN_NOTICE
420 "Clock: inserting leap second 23:59:60 UTC\n"); 398 "Clock: inserting leap second 23:59:60 UTC\n");
421 } 399 }
@@ -425,7 +403,6 @@ int second_overflow(unsigned long secs)
425 time_state = TIME_OK; 403 time_state = TIME_OK;
426 else if ((secs + 1) % 86400 == 0) { 404 else if ((secs + 1) % 86400 == 0) {
427 leap = 1; 405 leap = 1;
428 time_tai--;
429 time_state = TIME_WAIT; 406 time_state = TIME_WAIT;
430 printk(KERN_NOTICE 407 printk(KERN_NOTICE
431 "Clock: deleting leap second 23:59:59 UTC\n"); 408 "Clock: deleting leap second 23:59:59 UTC\n");
@@ -479,8 +456,6 @@ int second_overflow(unsigned long secs)
479 time_adjust = 0; 456 time_adjust = 0;
480 457
481out: 458out:
482 raw_spin_unlock_irqrestore(&ntp_lock, flags);
483
484 return leap; 459 return leap;
485} 460}
486 461
@@ -575,11 +550,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
575 time_status |= txc->status & ~STA_RONLY; 550 time_status |= txc->status & ~STA_RONLY;
576} 551}
577 552
578/* 553
579 * Called with ntp_lock held, so we can access and modify 554static inline void process_adjtimex_modes(struct timex *txc,
580 * all the global NTP state: 555 struct timespec *ts,
581 */ 556 s32 *time_tai)
582static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
583{ 557{
584 if (txc->modes & ADJ_STATUS) 558 if (txc->modes & ADJ_STATUS)
585 process_adj_status(txc, ts); 559 process_adj_status(txc, ts);
@@ -613,7 +587,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
613 } 587 }
614 588
615 if (txc->modes & ADJ_TAI && txc->constant > 0) 589 if (txc->modes & ADJ_TAI && txc->constant > 0)
616 time_tai = txc->constant; 590 *time_tai = txc->constant;
617 591
618 if (txc->modes & ADJ_OFFSET) 592 if (txc->modes & ADJ_OFFSET)
619 ntp_update_offset(txc->offset); 593 ntp_update_offset(txc->offset);
@@ -625,16 +599,13 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
625 ntp_update_frequency(); 599 ntp_update_frequency();
626} 600}
627 601
628/* 602
629 * adjtimex mainly allows reading (and writing, if superuser) of 603
630 * kernel time-keeping variables. used by xntpd. 604/**
605 * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex
631 */ 606 */
632int do_adjtimex(struct timex *txc) 607int ntp_validate_timex(struct timex *txc)
633{ 608{
634 struct timespec ts;
635 int result;
636
637 /* Validate the data before disabling interrupts */
638 if (txc->modes & ADJ_ADJTIME) { 609 if (txc->modes & ADJ_ADJTIME) {
639 /* singleshot must not be used with any other mode bits */ 610 /* singleshot must not be used with any other mode bits */
640 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) 611 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
@@ -646,7 +617,6 @@ int do_adjtimex(struct timex *txc)
646 /* In order to modify anything, you gotta be super-user! */ 617 /* In order to modify anything, you gotta be super-user! */
647 if (txc->modes && !capable(CAP_SYS_TIME)) 618 if (txc->modes && !capable(CAP_SYS_TIME))
648 return -EPERM; 619 return -EPERM;
649
650 /* 620 /*
651 * if the quartz is off by more than 10% then 621 * if the quartz is off by more than 10% then
652 * something is VERY wrong! 622 * something is VERY wrong!
@@ -657,22 +627,20 @@ int do_adjtimex(struct timex *txc)
657 return -EINVAL; 627 return -EINVAL;
658 } 628 }
659 629
660 if (txc->modes & ADJ_SETOFFSET) { 630 if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
661 struct timespec delta; 631 return -EPERM;
662 delta.tv_sec = txc->time.tv_sec;
663 delta.tv_nsec = txc->time.tv_usec;
664 if (!capable(CAP_SYS_TIME))
665 return -EPERM;
666 if (!(txc->modes & ADJ_NANO))
667 delta.tv_nsec *= 1000;
668 result = timekeeping_inject_offset(&delta);
669 if (result)
670 return result;
671 }
672 632
673 getnstimeofday(&ts); 633 return 0;
634}
674 635
675 raw_spin_lock_irq(&ntp_lock); 636
637/*
638 * adjtimex mainly allows reading (and writing, if superuser) of
639 * kernel time-keeping variables. used by xntpd.
640 */
641int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
642{
643 int result;
676 644
677 if (txc->modes & ADJ_ADJTIME) { 645 if (txc->modes & ADJ_ADJTIME) {
678 long save_adjust = time_adjust; 646 long save_adjust = time_adjust;
@@ -687,7 +655,7 @@ int do_adjtimex(struct timex *txc)
687 655
688 /* If there are input parameters, then process them: */ 656 /* If there are input parameters, then process them: */
689 if (txc->modes) 657 if (txc->modes)
690 process_adjtimex_modes(txc, &ts); 658 process_adjtimex_modes(txc, ts, time_tai);
691 659
692 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, 660 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
693 NTP_SCALE_SHIFT); 661 NTP_SCALE_SHIFT);
@@ -709,15 +677,13 @@ int do_adjtimex(struct timex *txc)
709 txc->precision = 1; 677 txc->precision = 1;
710 txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; 678 txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
711 txc->tick = tick_usec; 679 txc->tick = tick_usec;
712 txc->tai = time_tai; 680 txc->tai = *time_tai;
713 681
714 /* fill PPS status fields */ 682 /* fill PPS status fields */
715 pps_fill_timex(txc); 683 pps_fill_timex(txc);
716 684
717 raw_spin_unlock_irq(&ntp_lock); 685 txc->time.tv_sec = ts->tv_sec;
718 686 txc->time.tv_usec = ts->tv_nsec;
719 txc->time.tv_sec = ts.tv_sec;
720 txc->time.tv_usec = ts.tv_nsec;
721 if (!(time_status & STA_NANO)) 687 if (!(time_status & STA_NANO))
722 txc->time.tv_usec /= NSEC_PER_USEC; 688 txc->time.tv_usec /= NSEC_PER_USEC;
723 689
@@ -894,7 +860,7 @@ static void hardpps_update_phase(long error)
894} 860}
895 861
896/* 862/*
897 * hardpps() - discipline CPU clock oscillator to external PPS signal 863 * __hardpps() - discipline CPU clock oscillator to external PPS signal
898 * 864 *
899 * This routine is called at each PPS signal arrival in order to 865 * This routine is called at each PPS signal arrival in order to
900 * discipline the CPU clock oscillator to the PPS signal. It takes two 866 * discipline the CPU clock oscillator to the PPS signal. It takes two
@@ -905,15 +871,13 @@ static void hardpps_update_phase(long error)
905 * This code is based on David Mills's reference nanokernel 871 * This code is based on David Mills's reference nanokernel
906 * implementation. It was mostly rewritten but keeps the same idea. 872 * implementation. It was mostly rewritten but keeps the same idea.
907 */ 873 */
908void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) 874void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
909{ 875{
910 struct pps_normtime pts_norm, freq_norm; 876 struct pps_normtime pts_norm, freq_norm;
911 unsigned long flags; 877 unsigned long flags;
912 878
913 pts_norm = pps_normalize_ts(*phase_ts); 879 pts_norm = pps_normalize_ts(*phase_ts);
914 880
915 raw_spin_lock_irqsave(&ntp_lock, flags);
916
917 /* clear the error bits, they will be set again if needed */ 881 /* clear the error bits, they will be set again if needed */
918 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 882 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
919 883
@@ -925,7 +889,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
925 * just start the frequency interval */ 889 * just start the frequency interval */
926 if (unlikely(pps_fbase.tv_sec == 0)) { 890 if (unlikely(pps_fbase.tv_sec == 0)) {
927 pps_fbase = *raw_ts; 891 pps_fbase = *raw_ts;
928 raw_spin_unlock_irqrestore(&ntp_lock, flags);
929 return; 892 return;
930 } 893 }
931 894
@@ -940,7 +903,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
940 time_status |= STA_PPSJITTER; 903 time_status |= STA_PPSJITTER;
941 /* restart the frequency calibration interval */ 904 /* restart the frequency calibration interval */
942 pps_fbase = *raw_ts; 905 pps_fbase = *raw_ts;
943 raw_spin_unlock_irqrestore(&ntp_lock, flags);
944 pr_err("hardpps: PPSJITTER: bad pulse\n"); 906 pr_err("hardpps: PPSJITTER: bad pulse\n");
945 return; 907 return;
946 } 908 }
@@ -957,10 +919,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
957 919
958 hardpps_update_phase(pts_norm.nsec); 920 hardpps_update_phase(pts_norm.nsec);
959 921
960 raw_spin_unlock_irqrestore(&ntp_lock, flags);
961} 922}
962EXPORT_SYMBOL(hardpps);
963
964#endif /* CONFIG_NTP_PPS */ 923#endif /* CONFIG_NTP_PPS */
965 924
966static int __init ntp_tick_adj_setup(char *str) 925static int __init ntp_tick_adj_setup(char *str)
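In the do_adjtimex() hunks above, the ADJ_SETOFFSET handling (converting txc->time into a timespec delta and injecting it) is reduced to a plain capability check; the conversion and the timekeeping_inject_offset() call are expected to move to the timekeeping-side caller, which is not shown in this section. The user-space interface is unchanged. A minimal, runnable sketch of that path as seen through adjtimex(2); the delta is only illustrative, the fallback defines cover older libc headers, and CAP_SYS_TIME is required:

/* step_clock.c - step the system clock by a small delta via adjtimex().
 * Requires CAP_SYS_TIME; the offset used here is purely illustrative. */
#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

#ifndef ADJ_SETOFFSET
# define ADJ_SETOFFSET 0x0100
#endif
#ifndef ADJ_NANO
# define ADJ_NANO 0x2000
#endif

int main(void)
{
	struct timex txc;

	memset(&txc, 0, sizeof(txc));
	txc.modes = ADJ_SETOFFSET | ADJ_NANO;	/* time.tv_usec carries nanoseconds */
	txc.time.tv_sec = 0;
	txc.time.tv_usec = 500000;		/* step forward by 500 microseconds */

	if (adjtimex(&txc) == -1) {
		perror("adjtimex(ADJ_SETOFFSET)");
		return 1;
	}
	printf("clock stepped, kernel reports offset=%ld\n", (long)txc.offset);
	return 0;
}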
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
new file mode 100644
index 000000000000..1950cb4ca2a4
--- /dev/null
+++ b/kernel/time/ntp_internal.h
@@ -0,0 +1,12 @@
1#ifndef _LINUX_NTP_INTERNAL_H
2#define _LINUX_NTP_INTERNAL_H
3
4extern void ntp_init(void);
5extern void ntp_clear(void);
6/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
7extern u64 ntp_tick_length(void);
8extern int second_overflow(unsigned long secs);
9extern int ntp_validate_timex(struct timex *);
10extern int __do_adjtimex(struct timex *, struct timespec *, s32 *);
11extern void __hardpps(const struct timespec *, const struct timespec *);
12#endif /* _LINUX_NTP_INTERNAL_H */
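ntp_internal.h is the glue for this refactor: with ntp_lock gone, the lock-free workers __do_adjtimex() and __hardpps() plus ntp_validate_timex() are exported so the timekeeping core can serialize NTP state with its own locking. The wrappers themselves live in timekeeping.c and are outside this section; a hedged sketch of the shape such a wrapper takes, using the timekeeper_lock/timekeeper_seq pair introduced in the timekeeping.c hunks further down (the exact body is an assumption):

/* Sketch only: timekeeping-side wrapper around the relocked PPS helper.
 * The real wrapper may also refresh derived timekeeper state. */
void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&timekeeper_seq);

	__hardpps(phase_ts, raw_ts);	/* NTP state now serialized here */

	write_seqcount_end(&timekeeper_seq);
	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}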
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 2fb8cb88df8d..206bbfb34e09 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -28,9 +28,8 @@
28 */ 28 */
29 29
30static struct tick_device tick_broadcast_device; 30static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31static cpumask_var_t tick_broadcast_mask;
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static cpumask_var_t tmpmask;
33static DECLARE_BITMAP(tmpmask, NR_CPUS);
34static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); 33static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
35static int tick_broadcast_force; 34static int tick_broadcast_force;
36 35
@@ -50,7 +49,7 @@ struct tick_device *tick_get_broadcast_device(void)
50 49
51struct cpumask *tick_get_broadcast_mask(void) 50struct cpumask *tick_get_broadcast_mask(void)
52{ 51{
53 return to_cpumask(tick_broadcast_mask); 52 return tick_broadcast_mask;
54} 53}
55 54
56/* 55/*
@@ -67,15 +66,30 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
67 */ 66 */
68int tick_check_broadcast_device(struct clock_event_device *dev) 67int tick_check_broadcast_device(struct clock_event_device *dev)
69{ 68{
70 if ((tick_broadcast_device.evtdev && 69 struct clock_event_device *cur = tick_broadcast_device.evtdev;
70
71 if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
72 (tick_broadcast_device.evtdev &&
71 tick_broadcast_device.evtdev->rating >= dev->rating) || 73 tick_broadcast_device.evtdev->rating >= dev->rating) ||
72 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 74 (dev->features & CLOCK_EVT_FEAT_C3STOP))
73 return 0; 75 return 0;
74 76
75 clockevents_exchange_device(tick_broadcast_device.evtdev, dev); 77 clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
78 if (cur)
79 cur->event_handler = clockevents_handle_noop;
76 tick_broadcast_device.evtdev = dev; 80 tick_broadcast_device.evtdev = dev;
77 if (!cpumask_empty(tick_get_broadcast_mask())) 81 if (!cpumask_empty(tick_broadcast_mask))
78 tick_broadcast_start_periodic(dev); 82 tick_broadcast_start_periodic(dev);
83 /*
84 * Inform all cpus about this. We might be in a situation
85 * where we did not switch to oneshot mode because the per cpu
86 * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack
87 * of a oneshot capable broadcast device. Without that
 88 * notification the system stays stuck in periodic mode
89 * forever.
90 */
91 if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
92 tick_clock_notify();
79 return 1; 93 return 1;
80} 94}
81 95
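Pointing the displaced device's event_handler at clockevents_handle_noop() (here, and again in the tick-common.c shutdown hunk below) makes a late interrupt from the old broadcast device harmless instead of re-entering a stale handler. The helper is defined in the clockevents core rather than in this hunk; it is essentially an empty callback along these lines (a sketch, not the verbatim definition):

/* A handler that deliberately does nothing: late events from a replaced
 * or shut-down clock_event_device are simply ignored. */
void clockevents_handle_noop(struct clock_event_device *dev)
{
}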
@@ -123,7 +137,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
123 if (!tick_device_is_functional(dev)) { 137 if (!tick_device_is_functional(dev)) {
124 dev->event_handler = tick_handle_periodic; 138 dev->event_handler = tick_handle_periodic;
125 tick_device_setup_broadcast_func(dev); 139 tick_device_setup_broadcast_func(dev);
126 cpumask_set_cpu(cpu, tick_get_broadcast_mask()); 140 cpumask_set_cpu(cpu, tick_broadcast_mask);
127 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 141 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
128 ret = 1; 142 ret = 1;
129 } else { 143 } else {
@@ -134,7 +148,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
134 */ 148 */
135 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 149 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
136 int cpu = smp_processor_id(); 150 int cpu = smp_processor_id();
137 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 151 cpumask_clear_cpu(cpu, tick_broadcast_mask);
138 tick_broadcast_clear_oneshot(cpu); 152 tick_broadcast_clear_oneshot(cpu);
139 } else { 153 } else {
140 tick_device_setup_broadcast_func(dev); 154 tick_device_setup_broadcast_func(dev);
@@ -198,9 +212,8 @@ static void tick_do_periodic_broadcast(void)
198{ 212{
199 raw_spin_lock(&tick_broadcast_lock); 213 raw_spin_lock(&tick_broadcast_lock);
200 214
201 cpumask_and(to_cpumask(tmpmask), 215 cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
202 cpu_online_mask, tick_get_broadcast_mask()); 216 tick_do_broadcast(tmpmask);
203 tick_do_broadcast(to_cpumask(tmpmask));
204 217
205 raw_spin_unlock(&tick_broadcast_lock); 218 raw_spin_unlock(&tick_broadcast_lock);
206} 219}
@@ -263,13 +276,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
263 if (!tick_device_is_functional(dev)) 276 if (!tick_device_is_functional(dev))
264 goto out; 277 goto out;
265 278
266 bc_stopped = cpumask_empty(tick_get_broadcast_mask()); 279 bc_stopped = cpumask_empty(tick_broadcast_mask);
267 280
268 switch (*reason) { 281 switch (*reason) {
269 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 282 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
270 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 283 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
271 if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { 284 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
272 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
273 if (tick_broadcast_device.mode == 285 if (tick_broadcast_device.mode ==
274 TICKDEV_MODE_PERIODIC) 286 TICKDEV_MODE_PERIODIC)
275 clockevents_shutdown(dev); 287 clockevents_shutdown(dev);
@@ -279,8 +291,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
279 break; 291 break;
280 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 292 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
281 if (!tick_broadcast_force && 293 if (!tick_broadcast_force &&
282 cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { 294 cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
283 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
284 if (tick_broadcast_device.mode == 295 if (tick_broadcast_device.mode ==
285 TICKDEV_MODE_PERIODIC) 296 TICKDEV_MODE_PERIODIC)
286 tick_setup_periodic(dev, 0); 297 tick_setup_periodic(dev, 0);
@@ -288,7 +299,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
288 break; 299 break;
289 } 300 }
290 301
291 if (cpumask_empty(tick_get_broadcast_mask())) { 302 if (cpumask_empty(tick_broadcast_mask)) {
292 if (!bc_stopped) 303 if (!bc_stopped)
293 clockevents_shutdown(bc); 304 clockevents_shutdown(bc);
294 } else if (bc_stopped) { 305 } else if (bc_stopped) {
@@ -337,10 +348,10 @@ void tick_shutdown_broadcast(unsigned int *cpup)
337 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 348 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
338 349
339 bc = tick_broadcast_device.evtdev; 350 bc = tick_broadcast_device.evtdev;
340 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 351 cpumask_clear_cpu(cpu, tick_broadcast_mask);
341 352
342 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 353 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
343 if (bc && cpumask_empty(tick_get_broadcast_mask())) 354 if (bc && cpumask_empty(tick_broadcast_mask))
344 clockevents_shutdown(bc); 355 clockevents_shutdown(bc);
345 } 356 }
346 357
@@ -376,13 +387,13 @@ int tick_resume_broadcast(void)
376 387
377 switch (tick_broadcast_device.mode) { 388 switch (tick_broadcast_device.mode) {
378 case TICKDEV_MODE_PERIODIC: 389 case TICKDEV_MODE_PERIODIC:
379 if (!cpumask_empty(tick_get_broadcast_mask())) 390 if (!cpumask_empty(tick_broadcast_mask))
380 tick_broadcast_start_periodic(bc); 391 tick_broadcast_start_periodic(bc);
381 broadcast = cpumask_test_cpu(smp_processor_id(), 392 broadcast = cpumask_test_cpu(smp_processor_id(),
382 tick_get_broadcast_mask()); 393 tick_broadcast_mask);
383 break; 394 break;
384 case TICKDEV_MODE_ONESHOT: 395 case TICKDEV_MODE_ONESHOT:
385 if (!cpumask_empty(tick_get_broadcast_mask())) 396 if (!cpumask_empty(tick_broadcast_mask))
386 broadcast = tick_resume_broadcast_oneshot(bc); 397 broadcast = tick_resume_broadcast_oneshot(bc);
387 break; 398 break;
388 } 399 }
@@ -395,25 +406,58 @@ int tick_resume_broadcast(void)
395 406
396#ifdef CONFIG_TICK_ONESHOT 407#ifdef CONFIG_TICK_ONESHOT
397 408
398/* FIXME: use cpumask_var_t. */ 409static cpumask_var_t tick_broadcast_oneshot_mask;
399static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); 410static cpumask_var_t tick_broadcast_pending_mask;
411static cpumask_var_t tick_broadcast_force_mask;
400 412
401/* 413/*
402 * Exposed for debugging: see timer_list.c 414 * Exposed for debugging: see timer_list.c
403 */ 415 */
404struct cpumask *tick_get_broadcast_oneshot_mask(void) 416struct cpumask *tick_get_broadcast_oneshot_mask(void)
405{ 417{
406 return to_cpumask(tick_broadcast_oneshot_mask); 418 return tick_broadcast_oneshot_mask;
407} 419}
408 420
409static int tick_broadcast_set_event(ktime_t expires, int force) 421/*
422 * Called before going idle with interrupts disabled. Checks whether a
423 * broadcast event from the other core is about to happen. We detected
424 * that in tick_broadcast_oneshot_control(). The callsite can use this
425 * to avoid a deep idle transition as we are about to get the
426 * broadcast IPI right away.
427 */
428int tick_check_broadcast_expired(void)
410{ 429{
411 struct clock_event_device *bc = tick_broadcast_device.evtdev; 430 return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
431}
432
433/*
434 * Set broadcast interrupt affinity
435 */
436static void tick_broadcast_set_affinity(struct clock_event_device *bc,
437 const struct cpumask *cpumask)
438{
439 if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ))
440 return;
441
442 if (cpumask_equal(bc->cpumask, cpumask))
443 return;
444
445 bc->cpumask = cpumask;
446 irq_set_affinity(bc->irq, bc->cpumask);
447}
448
449static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
450 ktime_t expires, int force)
451{
452 int ret;
412 453
413 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) 454 if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
414 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 455 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
415 456
416 return clockevents_program_event(bc, expires, force); 457 ret = clockevents_program_event(bc, expires, force);
458 if (!ret)
459 tick_broadcast_set_affinity(bc, cpumask_of(cpu));
460 return ret;
417} 461}
418 462
419int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 463int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
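tick_broadcast_set_affinity() only retargets the broadcast interrupt when the device advertises CLOCK_EVT_FEAT_DYNIRQ and exposes its Linux IRQ number; after a successful program the interrupt then follows the CPU with the earliest pending event. A hedged sketch of the driver-side contract with a hypothetical device (the name, IRQ number and rating are invented; the callbacks are omitted):

/* Hypothetical clockevent device opting in to dynamic broadcast IRQ
 * affinity. Only .features and .irq matter to tick_broadcast_set_affinity(). */
static struct clock_event_device example_broadcast_clkevt = {
	.name		= "example-timer",
	.features	= CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_DYNIRQ,
	.irq		= 42,		/* hypothetical interrupt number */
	.rating		= 300,
	/* .set_mode and .set_next_event are omitted in this sketch */
};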
@@ -428,7 +472,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
428 */ 472 */
429void tick_check_oneshot_broadcast(int cpu) 473void tick_check_oneshot_broadcast(int cpu)
430{ 474{
431 if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { 475 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
432 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 476 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
433 477
434 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); 478 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@ -442,27 +486,39 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
442{ 486{
443 struct tick_device *td; 487 struct tick_device *td;
444 ktime_t now, next_event; 488 ktime_t now, next_event;
445 int cpu; 489 int cpu, next_cpu = 0;
446 490
447 raw_spin_lock(&tick_broadcast_lock); 491 raw_spin_lock(&tick_broadcast_lock);
448again: 492again:
449 dev->next_event.tv64 = KTIME_MAX; 493 dev->next_event.tv64 = KTIME_MAX;
450 next_event.tv64 = KTIME_MAX; 494 next_event.tv64 = KTIME_MAX;
451 cpumask_clear(to_cpumask(tmpmask)); 495 cpumask_clear(tmpmask);
452 now = ktime_get(); 496 now = ktime_get();
453 /* Find all expired events */ 497 /* Find all expired events */
454 for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { 498 for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
455 td = &per_cpu(tick_cpu_device, cpu); 499 td = &per_cpu(tick_cpu_device, cpu);
456 if (td->evtdev->next_event.tv64 <= now.tv64) 500 if (td->evtdev->next_event.tv64 <= now.tv64) {
457 cpumask_set_cpu(cpu, to_cpumask(tmpmask)); 501 cpumask_set_cpu(cpu, tmpmask);
458 else if (td->evtdev->next_event.tv64 < next_event.tv64) 502 /*
503 * Mark the remote cpu in the pending mask, so
504 * it can avoid reprogramming the cpu local
505 * timer in tick_broadcast_oneshot_control().
506 */
507 cpumask_set_cpu(cpu, tick_broadcast_pending_mask);
508 } else if (td->evtdev->next_event.tv64 < next_event.tv64) {
459 next_event.tv64 = td->evtdev->next_event.tv64; 509 next_event.tv64 = td->evtdev->next_event.tv64;
510 next_cpu = cpu;
511 }
460 } 512 }
461 513
514 /* Take care of enforced broadcast requests */
515 cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
516 cpumask_clear(tick_broadcast_force_mask);
517
462 /* 518 /*
463 * Wakeup the cpus which have an expired event. 519 * Wakeup the cpus which have an expired event.
464 */ 520 */
465 tick_do_broadcast(to_cpumask(tmpmask)); 521 tick_do_broadcast(tmpmask);
466 522
467 /* 523 /*
468 * Two reasons for reprogram: 524 * Two reasons for reprogram:
@@ -479,7 +535,7 @@ again:
479 * Rearm the broadcast device. If event expired, 535 * Rearm the broadcast device. If event expired,
480 * repeat the above 536 * repeat the above
481 */ 537 */
482 if (tick_broadcast_set_event(next_event, 0)) 538 if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
483 goto again; 539 goto again;
484 } 540 }
485 raw_spin_unlock(&tick_broadcast_lock); 541 raw_spin_unlock(&tick_broadcast_lock);
@@ -494,6 +550,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
494 struct clock_event_device *bc, *dev; 550 struct clock_event_device *bc, *dev;
495 struct tick_device *td; 551 struct tick_device *td;
496 unsigned long flags; 552 unsigned long flags;
553 ktime_t now;
497 int cpu; 554 int cpu;
498 555
499 /* 556 /*
@@ -518,21 +575,84 @@ void tick_broadcast_oneshot_control(unsigned long reason)
518 575
519 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 576 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
520 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 577 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
521 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 578 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
522 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); 579 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
523 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 580 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
524 if (dev->next_event.tv64 < bc->next_event.tv64) 581 /*
525 tick_broadcast_set_event(dev->next_event, 1); 582 * We only reprogram the broadcast timer if we
583 * did not mark ourself in the force mask and
584 * if the cpu local event is earlier than the
585 * broadcast event. If the current CPU is in
586 * the force mask, then we are going to be
587 * woken by the IPI right away.
588 */
589 if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
590 dev->next_event.tv64 < bc->next_event.tv64)
591 tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
526 } 592 }
527 } else { 593 } else {
528 if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 594 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
529 cpumask_clear_cpu(cpu,
530 tick_get_broadcast_oneshot_mask());
531 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 595 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
532 if (dev->next_event.tv64 != KTIME_MAX) 596 if (dev->next_event.tv64 == KTIME_MAX)
533 tick_program_event(dev->next_event, 1); 597 goto out;
598 /*
599 * The cpu which was handling the broadcast
600 * timer marked this cpu in the broadcast
601 * pending mask and fired the broadcast
602 * IPI. So we are going to handle the expired
603 * event anyway via the broadcast IPI
604 * handler. No need to reprogram the timer
605 * with an already expired event.
606 */
607 if (cpumask_test_and_clear_cpu(cpu,
608 tick_broadcast_pending_mask))
609 goto out;
610
611 /*
612 * If the pending bit is not set, then we are
613 * either the CPU handling the broadcast
614 * interrupt or we got woken by something else.
615 *
 616 * We are no longer in the broadcast mask, so
617 * if the cpu local expiry time is already
618 * reached, we would reprogram the cpu local
619 * timer with an already expired event.
620 *
621 * This can lead to a ping-pong when we return
 622 * to idle and therefore rearm the broadcast
623 * timer before the cpu local timer was able
624 * to fire. This happens because the forced
625 * reprogramming makes sure that the event
626 * will happen in the future and depending on
627 * the min_delta setting this might be far
628 * enough out that the ping-pong starts.
629 *
630 * If the cpu local next_event has expired
631 * then we know that the broadcast timer
632 * next_event has expired as well and
633 * broadcast is about to be handled. So we
634 * avoid reprogramming and enforce that the
635 * broadcast handler, which did not run yet,
636 * will invoke the cpu local handler.
637 *
638 * We cannot call the handler directly from
639 * here, because we might be in a NOHZ phase
640 * and we did not go through the irq_enter()
641 * nohz fixups.
642 */
643 now = ktime_get();
644 if (dev->next_event.tv64 <= now.tv64) {
645 cpumask_set_cpu(cpu, tick_broadcast_force_mask);
646 goto out;
647 }
648 /*
649 * We got woken by something else. Reprogram
650 * the cpu local timer device.
651 */
652 tick_program_event(dev->next_event, 1);
534 } 653 }
535 } 654 }
655out:
536 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 656 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
537} 657}
538 658
@@ -543,7 +663,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
543 */ 663 */
544static void tick_broadcast_clear_oneshot(int cpu) 664static void tick_broadcast_clear_oneshot(int cpu)
545{ 665{
546 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 666 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
547} 667}
548 668
549static void tick_broadcast_init_next_event(struct cpumask *mask, 669static void tick_broadcast_init_next_event(struct cpumask *mask,
@@ -573,7 +693,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
573 bc->event_handler = tick_handle_oneshot_broadcast; 693 bc->event_handler = tick_handle_oneshot_broadcast;
574 694
575 /* Take the do_timer update */ 695 /* Take the do_timer update */
576 tick_do_timer_cpu = cpu; 696 if (!tick_nohz_full_cpu(cpu))
697 tick_do_timer_cpu = cpu;
577 698
578 /* 699 /*
579 * We must be careful here. There might be other CPUs 700 * We must be careful here. There might be other CPUs
@@ -581,17 +702,16 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
581 * oneshot_mask bits for those and program the 702 * oneshot_mask bits for those and program the
582 * broadcast device to fire. 703 * broadcast device to fire.
583 */ 704 */
584 cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); 705 cpumask_copy(tmpmask, tick_broadcast_mask);
585 cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); 706 cpumask_clear_cpu(cpu, tmpmask);
586 cpumask_or(tick_get_broadcast_oneshot_mask(), 707 cpumask_or(tick_broadcast_oneshot_mask,
587 tick_get_broadcast_oneshot_mask(), 708 tick_broadcast_oneshot_mask, tmpmask);
588 to_cpumask(tmpmask));
589 709
590 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { 710 if (was_periodic && !cpumask_empty(tmpmask)) {
591 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 711 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
592 tick_broadcast_init_next_event(to_cpumask(tmpmask), 712 tick_broadcast_init_next_event(tmpmask,
593 tick_next_period); 713 tick_next_period);
594 tick_broadcast_set_event(tick_next_period, 1); 714 tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
595 } else 715 } else
596 bc->next_event.tv64 = KTIME_MAX; 716 bc->next_event.tv64 = KTIME_MAX;
597 } else { 717 } else {
@@ -639,7 +759,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
639 * Clear the broadcast mask flag for the dead cpu, but do not 759 * Clear the broadcast mask flag for the dead cpu, but do not
640 * stop the broadcast device! 760 * stop the broadcast device!
641 */ 761 */
642 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 762 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
643 763
644 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 764 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
645} 765}
@@ -663,3 +783,14 @@ bool tick_broadcast_oneshot_available(void)
663} 783}
664 784
665#endif 785#endif
786
787void __init tick_broadcast_init(void)
788{
789 alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
790 alloc_cpumask_var(&tmpmask, GFP_NOWAIT);
791#ifdef CONFIG_TICK_ONESHOT
792 alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
793 alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
794 alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
795#endif
796}
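tick_check_broadcast_expired() gives the idle path a cheap way to notice that this CPU is already in tick_broadcast_force_mask, meaning the broadcast IPI is about to arrive and a deep C-state would only add exit latency. The real caller sits in the generic idle loop outside this section; a hedged sketch of the intended use (example_idle_once() is hypothetical):

/* Sketch: consult the force mask before committing to deep idle. */
static void example_idle_once(void)
{
	local_irq_disable();

	if (tick_check_broadcast_expired()) {
		/* wakeup IPI is imminent: do not pay the deep-idle entry cost */
		local_irq_enable();
		cpu_relax();
		return;
	}

	arch_cpu_idle();	/* arch low-power idle, returns with irqs enabled */
}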
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b1600a6973f4..5d3fb100bc06 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -163,7 +163,10 @@ static void tick_setup_device(struct tick_device *td,
163 * this cpu: 163 * this cpu:
164 */ 164 */
165 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { 165 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
166 tick_do_timer_cpu = cpu; 166 if (!tick_nohz_full_cpu(cpu))
167 tick_do_timer_cpu = cpu;
168 else
169 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
167 tick_next_period = ktime_get(); 170 tick_next_period = ktime_get();
168 tick_period = ktime_set(0, NSEC_PER_SEC / HZ); 171 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
169 } 172 }
@@ -323,6 +326,7 @@ static void tick_shutdown(unsigned int *cpup)
323 */ 326 */
324 dev->mode = CLOCK_EVT_MODE_UNUSED; 327 dev->mode = CLOCK_EVT_MODE_UNUSED;
325 clockevents_exchange_device(dev, NULL); 328 clockevents_exchange_device(dev, NULL);
329 dev->event_handler = clockevents_handle_noop;
326 td->evtdev = NULL; 330 td->evtdev = NULL;
327 } 331 }
328 raw_spin_unlock_irqrestore(&tick_device_lock, flags); 332 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
@@ -416,4 +420,5 @@ static struct notifier_block tick_notifier = {
416void __init tick_init(void) 420void __init tick_init(void)
417{ 421{
418 clockevents_register_notifier(&tick_notifier); 422 clockevents_register_notifier(&tick_notifier);
423 tick_broadcast_init();
419} 424}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index cf3e59ed6dc0..f0299eae4602 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,6 +4,8 @@
4#include <linux/hrtimer.h> 4#include <linux/hrtimer.h>
5#include <linux/tick.h> 5#include <linux/tick.h>
6 6
7extern seqlock_t jiffies_lock;
8
7#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD 9#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
8 10
9#define TICK_DO_TIMER_NONE -1 11#define TICK_DO_TIMER_NONE -1
@@ -94,7 +96,7 @@ extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
94extern void tick_shutdown_broadcast(unsigned int *cpup); 96extern void tick_shutdown_broadcast(unsigned int *cpup);
95extern void tick_suspend_broadcast(void); 97extern void tick_suspend_broadcast(void);
96extern int tick_resume_broadcast(void); 98extern int tick_resume_broadcast(void);
97 99extern void tick_broadcast_init(void);
98extern void 100extern void
99tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); 101tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
100 102
@@ -119,6 +121,7 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
119static inline void tick_shutdown_broadcast(unsigned int *cpup) { } 121static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
120static inline void tick_suspend_broadcast(void) { } 122static inline void tick_suspend_broadcast(void) { }
121static inline int tick_resume_broadcast(void) { return 0; } 123static inline int tick_resume_broadcast(void) { return 0; }
124static inline void tick_broadcast_init(void) { }
122 125
123/* 126/*
124 * Set the periodic handler in non broadcast mode 127 * Set the periodic handler in non broadcast mode
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a19a39952c1b..bc67d4245e1d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -21,11 +21,15 @@
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/irq_work.h> 23#include <linux/irq_work.h>
24#include <linux/posix-timers.h>
25#include <linux/perf_event.h>
24 26
25#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
26 28
27#include "tick-internal.h" 29#include "tick-internal.h"
28 30
31#include <trace/events/timer.h>
32
29/* 33/*
30 * Per cpu nohz control structure 34 * Per cpu nohz control structure
31 */ 35 */
@@ -104,7 +108,7 @@ static void tick_sched_do_timer(ktime_t now)
104{ 108{
105 int cpu = smp_processor_id(); 109 int cpu = smp_processor_id();
106 110
107#ifdef CONFIG_NO_HZ 111#ifdef CONFIG_NO_HZ_COMMON
108 /* 112 /*
109 * Check if the do_timer duty was dropped. We don't care about 113 * Check if the do_timer duty was dropped. We don't care about
110 * concurrency: This happens only when the cpu in charge went 114 * concurrency: This happens only when the cpu in charge went
@@ -112,7 +116,8 @@ static void tick_sched_do_timer(ktime_t now)
112 * this duty, then the jiffies update is still serialized by 116 * this duty, then the jiffies update is still serialized by
113 * jiffies_lock. 117 * jiffies_lock.
114 */ 118 */
115 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) 119 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
120 && !tick_nohz_full_cpu(cpu))
116 tick_do_timer_cpu = cpu; 121 tick_do_timer_cpu = cpu;
117#endif 122#endif
118 123
@@ -123,7 +128,7 @@ static void tick_sched_do_timer(ktime_t now)
123 128
124static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) 129static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
125{ 130{
126#ifdef CONFIG_NO_HZ 131#ifdef CONFIG_NO_HZ_COMMON
127 /* 132 /*
128 * When we are idle and the tick is stopped, we have to touch 133 * When we are idle and the tick is stopped, we have to touch
129 * the watchdog as we might not schedule for a really long 134 * the watchdog as we might not schedule for a really long
@@ -142,10 +147,226 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
142 profile_tick(CPU_PROFILING); 147 profile_tick(CPU_PROFILING);
143} 148}
144 149
150#ifdef CONFIG_NO_HZ_FULL
151static cpumask_var_t nohz_full_mask;
152bool have_nohz_full_mask;
153
154static bool can_stop_full_tick(void)
155{
156 WARN_ON_ONCE(!irqs_disabled());
157
158 if (!sched_can_stop_tick()) {
159 trace_tick_stop(0, "more than 1 task in runqueue\n");
160 return false;
161 }
162
163 if (!posix_cpu_timers_can_stop_tick(current)) {
164 trace_tick_stop(0, "posix timers running\n");
165 return false;
166 }
167
168 if (!perf_event_can_stop_tick()) {
169 trace_tick_stop(0, "perf events running\n");
170 return false;
171 }
172
173 /* sched_clock_tick() needs us? */
174#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
175 /*
176 * TODO: kick full dynticks CPUs when
177 * sched_clock_stable is set.
178 */
179 if (!sched_clock_stable) {
180 trace_tick_stop(0, "unstable sched clock\n");
181 return false;
182 }
183#endif
184
185 return true;
186}
187
188static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
189
190/*
191 * Re-evaluate the need for the tick on the current CPU
192 * and restart it if necessary.
193 */
194void tick_nohz_full_check(void)
195{
196 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
197
198 if (tick_nohz_full_cpu(smp_processor_id())) {
199 if (ts->tick_stopped && !is_idle_task(current)) {
200 if (!can_stop_full_tick())
201 tick_nohz_restart_sched_tick(ts, ktime_get());
202 }
203 }
204}
205
206static void nohz_full_kick_work_func(struct irq_work *work)
207{
208 tick_nohz_full_check();
209}
210
211static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
212 .func = nohz_full_kick_work_func,
213};
214
215/*
216 * Kick the current CPU if it's full dynticks in order to force it to
217 * re-evaluate its dependency on the tick and restart it if necessary.
218 */
219void tick_nohz_full_kick(void)
220{
221 if (tick_nohz_full_cpu(smp_processor_id()))
222 irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
223}
224
225static void nohz_full_kick_ipi(void *info)
226{
227 tick_nohz_full_check();
228}
229
230/*
231 * Kick all full dynticks CPUs in order to force these to re-evaluate
232 * their dependency on the tick and restart it if necessary.
233 */
234void tick_nohz_full_kick_all(void)
235{
236 if (!have_nohz_full_mask)
237 return;
238
239 preempt_disable();
240 smp_call_function_many(nohz_full_mask,
241 nohz_full_kick_ipi, NULL, false);
242 preempt_enable();
243}
244
245/*
246 * Re-evaluate the need for the tick as we switch the current task.
247 * It might need the tick due to per task/process properties:
248 * perf events, posix cpu timers, ...
249 */
250void tick_nohz_task_switch(struct task_struct *tsk)
251{
252 unsigned long flags;
253
254 local_irq_save(flags);
255
256 if (!tick_nohz_full_cpu(smp_processor_id()))
257 goto out;
258
259 if (tick_nohz_tick_stopped() && !can_stop_full_tick())
260 tick_nohz_full_kick();
261
262out:
263 local_irq_restore(flags);
264}
265
266int tick_nohz_full_cpu(int cpu)
267{
268 if (!have_nohz_full_mask)
269 return 0;
270
271 return cpumask_test_cpu(cpu, nohz_full_mask);
272}
273
274/* Parse the boot-time nohz CPU list from the kernel parameters. */
275static int __init tick_nohz_full_setup(char *str)
276{
277 int cpu;
278
279 alloc_bootmem_cpumask_var(&nohz_full_mask);
280 if (cpulist_parse(str, nohz_full_mask) < 0) {
281 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
282 return 1;
283 }
284
285 cpu = smp_processor_id();
286 if (cpumask_test_cpu(cpu, nohz_full_mask)) {
287 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
288 cpumask_clear_cpu(cpu, nohz_full_mask);
289 }
290 have_nohz_full_mask = true;
291
292 return 1;
293}
294__setup("nohz_full=", tick_nohz_full_setup);
295
296static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
297 unsigned long action,
298 void *hcpu)
299{
300 unsigned int cpu = (unsigned long)hcpu;
301
302 switch (action & ~CPU_TASKS_FROZEN) {
303 case CPU_DOWN_PREPARE:
304 /*
305 * If we handle the timekeeping duty for full dynticks CPUs,
306 * we can't safely shutdown that CPU.
307 */
308 if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
309 return -EINVAL;
310 break;
311 }
312 return NOTIFY_OK;
313}
314
315/*
316 * Worst case string length in chunks of CPU range seems 2 steps
317 * separations: 0,2,4,6,...
318 * This is NR_CPUS + sizeof('\0')
319 */
320static char __initdata nohz_full_buf[NR_CPUS + 1];
321
322static int tick_nohz_init_all(void)
323{
324 int err = -1;
325
326#ifdef CONFIG_NO_HZ_FULL_ALL
327 if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) {
328 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
329 return err;
330 }
331 err = 0;
332 cpumask_setall(nohz_full_mask);
333 cpumask_clear_cpu(smp_processor_id(), nohz_full_mask);
334 have_nohz_full_mask = true;
335#endif
336 return err;
337}
338
339void __init tick_nohz_init(void)
340{
341 int cpu;
342
343 if (!have_nohz_full_mask) {
344 if (tick_nohz_init_all() < 0)
345 return;
346 }
347
348 cpu_notifier(tick_nohz_cpu_down_callback, 0);
349
350 /* Make sure full dynticks CPU are also RCU nocbs */
351 for_each_cpu(cpu, nohz_full_mask) {
352 if (!rcu_is_nocb_cpu(cpu)) {
353 pr_warning("NO_HZ: CPU %d is not RCU nocb: "
354 "cleared from nohz_full range", cpu);
355 cpumask_clear_cpu(cpu, nohz_full_mask);
356 }
357 }
358
359 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
360 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
361}
362#else
363#define have_nohz_full_mask (0)
364#endif
365
145/* 366/*
146 * NOHZ - aka dynamic tick functionality 367 * NOHZ - aka dynamic tick functionality
147 */ 368 */
148#ifdef CONFIG_NO_HZ 369#ifdef CONFIG_NO_HZ_COMMON
149/* 370/*
150 * NO HZ enabled ? 371 * NO HZ enabled ?
151 */ 372 */
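can_stop_full_tick() is a chain of per-subsystem veto checks (scheduler, posix CPU timers, perf, sched_clock stability), and tick_nohz_full_kick()/tick_nohz_full_kick_all() are how a subsystem forces those checks to be re-evaluated once its state changes. A hedged sketch of that pattern for a hypothetical subsystem; the example_* names are invented and the matching predicate call inside can_stop_full_tick() is not part of this patch:

/* Hypothetical subsystem cooperating with full dynticks. A real user would
 * also have its predicate consulted from can_stop_full_tick(). */
static atomic_t example_tick_users = ATOMIC_INIT(0);

bool example_can_stop_tick(void)
{
	return atomic_read(&example_tick_users) == 0;
}

void example_start(void)
{
	atomic_inc(&example_tick_users);
	/* some CPUs may already run tickless: force them to re-evaluate */
	tick_nohz_full_kick_all();
}

void example_stop(void)
{
	atomic_dec(&example_tick_users);
}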
@@ -345,11 +566,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
345 delta_jiffies = rcu_delta_jiffies; 566 delta_jiffies = rcu_delta_jiffies;
346 } 567 }
347 } 568 }
569
348 /* 570 /*
349 * Do not stop the tick, if we are only one off 571 * Do not stop the tick, if we are only one off (or less)
350 * or if the cpu is required for rcu 572 * or if the cpu is required for RCU:
351 */ 573 */
352 if (!ts->tick_stopped && delta_jiffies == 1) 574 if (!ts->tick_stopped && delta_jiffies <= 1)
353 goto out; 575 goto out;
354 576
355 /* Schedule the tick, if we are at least one jiffie off */ 577 /* Schedule the tick, if we are at least one jiffie off */
@@ -378,6 +600,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
378 time_delta = KTIME_MAX; 600 time_delta = KTIME_MAX;
379 } 601 }
380 602
603#ifdef CONFIG_NO_HZ_FULL
604 if (!ts->inidle) {
605 time_delta = min(time_delta,
606 scheduler_tick_max_deferment());
607 }
608#endif
609
381 /* 610 /*
382 * calculate the expiry time for the next timer wheel 611 * calculate the expiry time for the next timer wheel
383 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals 612 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
@@ -421,6 +650,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
421 650
422 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 651 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
423 ts->tick_stopped = 1; 652 ts->tick_stopped = 1;
653 trace_tick_stop(1, " ");
424 } 654 }
425 655
426 /* 656 /*
@@ -457,6 +687,24 @@ out:
457 return ret; 687 return ret;
458} 688}
459 689
690static void tick_nohz_full_stop_tick(struct tick_sched *ts)
691{
692#ifdef CONFIG_NO_HZ_FULL
693 int cpu = smp_processor_id();
694
695 if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
696 return;
697
698 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
699 return;
700
701 if (!can_stop_full_tick())
702 return;
703
704 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
705#endif
706}
707
460static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) 708static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
461{ 709{
462 /* 710 /*
@@ -482,13 +730,28 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
482 730
483 if (ratelimit < 10 && 731 if (ratelimit < 10 &&
484 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { 732 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
485 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 733 pr_warn("NOHZ: local_softirq_pending %02x\n",
486 (unsigned int) local_softirq_pending()); 734 (unsigned int) local_softirq_pending());
487 ratelimit++; 735 ratelimit++;
488 } 736 }
489 return false; 737 return false;
490 } 738 }
491 739
740 if (have_nohz_full_mask) {
741 /*
742 * Keep the tick alive to guarantee timekeeping progression
743 * if there are full dynticks CPUs around
744 */
745 if (tick_do_timer_cpu == cpu)
746 return false;
747 /*
748 * Boot safety: make sure the timekeeping duty has been
749 * assigned before entering dyntick-idle mode,
750 */
751 if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
752 return false;
753 }
754
492 return true; 755 return true;
493} 756}
494 757
@@ -568,12 +831,13 @@ void tick_nohz_irq_exit(void)
568{ 831{
569 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 832 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
570 833
571 if (!ts->inidle) 834 if (ts->inidle) {
572 return; 835 /* Cancel the timer because CPU already waken up from the C-states*/
573 836 menu_hrtimer_cancel();
574 /* Cancel the timer because CPU already waken up from the C-states*/ 837 __tick_nohz_idle_enter(ts);
575 menu_hrtimer_cancel(); 838 } else {
576 __tick_nohz_idle_enter(ts); 839 tick_nohz_full_stop_tick(ts);
840 }
577} 841}
578 842
579/** 843/**
@@ -802,7 +1066,7 @@ static inline void tick_check_nohz(int cpu)
802static inline void tick_nohz_switch_to_nohz(void) { } 1066static inline void tick_nohz_switch_to_nohz(void) { }
803static inline void tick_check_nohz(int cpu) { } 1067static inline void tick_check_nohz(int cpu) { }
804 1068
805#endif /* NO_HZ */ 1069#endif /* CONFIG_NO_HZ_COMMON */
806 1070
807/* 1071/*
808 * Called from irq_enter to notify about the possible interruption of idle() 1072 * Called from irq_enter to notify about the possible interruption of idle()
@@ -887,14 +1151,14 @@ void tick_setup_sched_timer(void)
887 now = ktime_get(); 1151 now = ktime_get();
888 } 1152 }
889 1153
890#ifdef CONFIG_NO_HZ 1154#ifdef CONFIG_NO_HZ_COMMON
891 if (tick_nohz_enabled) 1155 if (tick_nohz_enabled)
892 ts->nohz_mode = NOHZ_MODE_HIGHRES; 1156 ts->nohz_mode = NOHZ_MODE_HIGHRES;
893#endif 1157#endif
894} 1158}
895#endif /* HIGH_RES_TIMERS */ 1159#endif /* HIGH_RES_TIMERS */
896 1160
897#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS 1161#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
898void tick_cancel_sched_timer(int cpu) 1162void tick_cancel_sched_timer(int cpu)
899{ 1163{
900 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 1164 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9a0bc98fbe1d..98cd470bbe49 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -23,8 +23,13 @@
23#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
24#include <linux/pvclock_gtod.h> 24#include <linux/pvclock_gtod.h>
25 25
26#include "tick-internal.h"
27#include "ntp_internal.h"
26 28
27static struct timekeeper timekeeper; 29static struct timekeeper timekeeper;
30static DEFINE_RAW_SPINLOCK(timekeeper_lock);
31static seqcount_t timekeeper_seq;
32static struct timekeeper shadow_timekeeper;
28 33
29/* flag for if timekeeping is suspended */ 34/* flag for if timekeeping is suspended */
30int __read_mostly timekeeping_suspended; 35int __read_mostly timekeeping_suspended;
@@ -67,6 +72,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
67 tk->wall_to_monotonic = wtm; 72 tk->wall_to_monotonic = wtm;
68 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); 73 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
69 tk->offs_real = timespec_to_ktime(tmp); 74 tk->offs_real = timespec_to_ktime(tmp);
75 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0));
70} 76}
71 77
72static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) 78static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
@@ -96,7 +102,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
96 102
97 old_clock = tk->clock; 103 old_clock = tk->clock;
98 tk->clock = clock; 104 tk->clock = clock;
99 clock->cycle_last = clock->read(clock); 105 tk->cycle_last = clock->cycle_last = clock->read(clock);
100 106
101 /* Do the ns -> cycle conversion first, using original mult */ 107 /* Do the ns -> cycle conversion first, using original mult */
102 tmp = NTP_INTERVAL_LENGTH; 108 tmp = NTP_INTERVAL_LENGTH;
@@ -201,8 +207,6 @@ static void update_pvclock_gtod(struct timekeeper *tk)
201 207
202/** 208/**
203 * pvclock_gtod_register_notifier - register a pvclock timedata update listener 209 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
204 *
205 * Must hold write on timekeeper.lock
206 */ 210 */
207int pvclock_gtod_register_notifier(struct notifier_block *nb) 211int pvclock_gtod_register_notifier(struct notifier_block *nb)
208{ 212{
@@ -210,11 +214,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb)
210 unsigned long flags; 214 unsigned long flags;
211 int ret; 215 int ret;
212 216
213 write_seqlock_irqsave(&tk->lock, flags); 217 raw_spin_lock_irqsave(&timekeeper_lock, flags);
214 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); 218 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
215 /* update timekeeping data */
216 update_pvclock_gtod(tk); 219 update_pvclock_gtod(tk);
217 write_sequnlock_irqrestore(&tk->lock, flags); 220 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
218 221
219 return ret; 222 return ret;
220} 223}
@@ -223,25 +226,22 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
223/** 226/**
224 * pvclock_gtod_unregister_notifier - unregister a pvclock 227 * pvclock_gtod_unregister_notifier - unregister a pvclock
225 * timedata update listener 228 * timedata update listener
226 *
227 * Must hold write on timekeeper.lock
228 */ 229 */
229int pvclock_gtod_unregister_notifier(struct notifier_block *nb) 230int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
230{ 231{
231 struct timekeeper *tk = &timekeeper;
232 unsigned long flags; 232 unsigned long flags;
233 int ret; 233 int ret;
234 234
235 write_seqlock_irqsave(&tk->lock, flags); 235 raw_spin_lock_irqsave(&timekeeper_lock, flags);
236 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); 236 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
237 write_sequnlock_irqrestore(&tk->lock, flags); 237 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
238 238
239 return ret; 239 return ret;
240} 240}
241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); 241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
242 242
243/* must hold write on timekeeper.lock */ 243/* must hold timekeeper_lock */
244static void timekeeping_update(struct timekeeper *tk, bool clearntp) 244static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror)
245{ 245{
246 if (clearntp) { 246 if (clearntp) {
247 tk->ntp_error = 0; 247 tk->ntp_error = 0;
@@ -249,6 +249,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
249 } 249 }
250 update_vsyscall(tk); 250 update_vsyscall(tk);
251 update_pvclock_gtod(tk); 251 update_pvclock_gtod(tk);
252
253 if (mirror)
254 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
252} 255}
253 256
254/** 257/**
@@ -267,7 +270,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
267 clock = tk->clock; 270 clock = tk->clock;
268 cycle_now = clock->read(clock); 271 cycle_now = clock->read(clock);
269 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 272 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
270 clock->cycle_last = cycle_now; 273 tk->cycle_last = clock->cycle_last = cycle_now;
271 274
272 tk->xtime_nsec += cycle_delta * tk->mult; 275 tk->xtime_nsec += cycle_delta * tk->mult;
273 276
@@ -294,12 +297,12 @@ int __getnstimeofday(struct timespec *ts)
294 s64 nsecs = 0; 297 s64 nsecs = 0;
295 298
296 do { 299 do {
297 seq = read_seqbegin(&tk->lock); 300 seq = read_seqcount_begin(&timekeeper_seq);
298 301
299 ts->tv_sec = tk->xtime_sec; 302 ts->tv_sec = tk->xtime_sec;
300 nsecs = timekeeping_get_ns(tk); 303 nsecs = timekeeping_get_ns(tk);
301 304
302 } while (read_seqretry(&tk->lock, seq)); 305 } while (read_seqcount_retry(&timekeeper_seq, seq));
303 306
304 ts->tv_nsec = 0; 307 ts->tv_nsec = 0;
305 timespec_add_ns(ts, nsecs); 308 timespec_add_ns(ts, nsecs);
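Readers now sample timekeeper_seq with read_seqcount_begin()/read_seqcount_retry() and simply redo their reads if a writer (holding timekeeper_lock) raced with them; nothing ever blocks on the read side. A self-contained user-space illustration of the same retry discipline, written with plain C11 atomics rather than the kernel API; the sequentially consistent defaults used here are stronger than what the kernel relies on:

/* seqcount_demo.c - the read/retry discipline behind timekeeper_seq,
 * shown stand-alone. Not the kernel API, just the same idea. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;			/* odd value = write in progress */
static atomic_long tv_sec, tv_nsec;	/* the protected snapshot */

static void writer_update(long sec, long nsec)
{
	atomic_fetch_add(&seq, 1);	/* make the count odd */
	atomic_store(&tv_sec, sec);
	atomic_store(&tv_nsec, nsec);
	atomic_fetch_add(&seq, 1);	/* even again: publish */
}

static void reader_snapshot(long *sec, long *nsec)
{
	unsigned int start;

	do {
		start = atomic_load(&seq);
		*sec = atomic_load(&tv_sec);
		*nsec = atomic_load(&tv_nsec);
		/* retry if a writer was active or finished meanwhile */
	} while ((start & 1) || start != atomic_load(&seq));
}

int main(void)
{
	long s, ns;

	writer_update(1234, 500000000);
	reader_snapshot(&s, &ns);
	printf("snapshot: %ld.%09ld\n", s, ns);
	return 0;
}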
@@ -335,11 +338,11 @@ ktime_t ktime_get(void)
335 WARN_ON(timekeeping_suspended); 338 WARN_ON(timekeeping_suspended);
336 339
337 do { 340 do {
338 seq = read_seqbegin(&tk->lock); 341 seq = read_seqcount_begin(&timekeeper_seq);
339 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; 342 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
340 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; 343 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
341 344
342 } while (read_seqretry(&tk->lock, seq)); 345 } while (read_seqcount_retry(&timekeeper_seq, seq));
343 /* 346 /*
344 * Use ktime_set/ktime_add_ns to create a proper ktime on 347 * Use ktime_set/ktime_add_ns to create a proper ktime on
345 * 32-bit architectures without CONFIG_KTIME_SCALAR. 348 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -366,12 +369,12 @@ void ktime_get_ts(struct timespec *ts)
366 WARN_ON(timekeeping_suspended); 369 WARN_ON(timekeeping_suspended);
367 370
368 do { 371 do {
369 seq = read_seqbegin(&tk->lock); 372 seq = read_seqcount_begin(&timekeeper_seq);
370 ts->tv_sec = tk->xtime_sec; 373 ts->tv_sec = tk->xtime_sec;
371 nsec = timekeeping_get_ns(tk); 374 nsec = timekeeping_get_ns(tk);
372 tomono = tk->wall_to_monotonic; 375 tomono = tk->wall_to_monotonic;
373 376
374 } while (read_seqretry(&tk->lock, seq)); 377 } while (read_seqcount_retry(&timekeeper_seq, seq));
375 378
376 ts->tv_sec += tomono.tv_sec; 379 ts->tv_sec += tomono.tv_sec;
377 ts->tv_nsec = 0; 380 ts->tv_nsec = 0;
@@ -379,6 +382,50 @@ void ktime_get_ts(struct timespec *ts)
379} 382}
380EXPORT_SYMBOL_GPL(ktime_get_ts); 383EXPORT_SYMBOL_GPL(ktime_get_ts);
381 384
385
386/**
387 * timekeeping_clocktai - Returns the TAI time of day in a timespec
388 * @ts: pointer to the timespec to be set
389 *
390 * Returns the time of day in a timespec.
391 */
392void timekeeping_clocktai(struct timespec *ts)
393{
394 struct timekeeper *tk = &timekeeper;
395 unsigned long seq;
396 u64 nsecs;
397
398 WARN_ON(timekeeping_suspended);
399
400 do {
401 seq = read_seqcount_begin(&timekeeper_seq);
402
403 ts->tv_sec = tk->xtime_sec + tk->tai_offset;
404 nsecs = timekeeping_get_ns(tk);
405
406 } while (read_seqcount_retry(&timekeeper_seq, seq));
407
408 ts->tv_nsec = 0;
409 timespec_add_ns(ts, nsecs);
410
411}
412EXPORT_SYMBOL(timekeeping_clocktai);
413
414
415/**
416 * ktime_get_clocktai - Returns the TAI time of day in a ktime
417 *
418 * Returns the time of day in a ktime.
419 */
420ktime_t ktime_get_clocktai(void)
421{
422 struct timespec ts;
423
424 timekeeping_clocktai(&ts);
425 return timespec_to_ktime(ts);
426}
427EXPORT_SYMBOL(ktime_get_clocktai);
428
382#ifdef CONFIG_NTP_PPS 429#ifdef CONFIG_NTP_PPS
383 430
384/** 431/**
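timekeeping_clocktai() and ktime_get_clocktai() derive TAI by adding tai_offset to the UTC seconds under the usual seqcount retry loop; a posix clock added alongside this work exposes the same value to user space as CLOCK_TAI. A runnable check from user space (the fallback clockid covers libc headers that predate CLOCK_TAI; until an NTP daemon has set the TAI offset both clocks print the same seconds):

/* tai_vs_realtime.c - compare CLOCK_REALTIME (UTC) with CLOCK_TAI.
 * Link with -lrt on older glibc. */
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_TAI
# define CLOCK_TAI 11
#endif

int main(void)
{
	struct timespec utc, tai;

	if (clock_gettime(CLOCK_REALTIME, &utc) || clock_gettime(CLOCK_TAI, &tai)) {
		perror("clock_gettime");
		return 1;
	}
	printf("UTC: %lld.%09ld\n", (long long)utc.tv_sec, utc.tv_nsec);
	printf("TAI: %lld.%09ld\n", (long long)tai.tv_sec, tai.tv_nsec);
	printf("TAI-UTC: %lld s\n", (long long)(tai.tv_sec - utc.tv_sec));
	return 0;
}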
@@ -399,7 +446,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
399 WARN_ON_ONCE(timekeeping_suspended); 446 WARN_ON_ONCE(timekeeping_suspended);
400 447
401 do { 448 do {
402 seq = read_seqbegin(&tk->lock); 449 seq = read_seqcount_begin(&timekeeper_seq);
403 450
404 *ts_raw = tk->raw_time; 451 *ts_raw = tk->raw_time;
405 ts_real->tv_sec = tk->xtime_sec; 452 ts_real->tv_sec = tk->xtime_sec;
@@ -408,7 +455,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
408 nsecs_raw = timekeeping_get_ns_raw(tk); 455 nsecs_raw = timekeeping_get_ns_raw(tk);
409 nsecs_real = timekeeping_get_ns(tk); 456 nsecs_real = timekeeping_get_ns(tk);
410 457
411 } while (read_seqretry(&tk->lock, seq)); 458 } while (read_seqcount_retry(&timekeeper_seq, seq));
412 459
413 timespec_add_ns(ts_raw, nsecs_raw); 460 timespec_add_ns(ts_raw, nsecs_raw);
414 timespec_add_ns(ts_real, nsecs_real); 461 timespec_add_ns(ts_real, nsecs_real);
@@ -448,7 +495,8 @@ int do_settimeofday(const struct timespec *tv)
448 if (!timespec_valid_strict(tv)) 495 if (!timespec_valid_strict(tv))
449 return -EINVAL; 496 return -EINVAL;
450 497
451 write_seqlock_irqsave(&tk->lock, flags); 498 raw_spin_lock_irqsave(&timekeeper_lock, flags);
499 write_seqcount_begin(&timekeeper_seq);
452 500
453 timekeeping_forward_now(tk); 501 timekeeping_forward_now(tk);
454 502
@@ -460,9 +508,10 @@ int do_settimeofday(const struct timespec *tv)
460 508
461 tk_set_xtime(tk, tv); 509 tk_set_xtime(tk, tv);
462 510
463 timekeeping_update(tk, true); 511 timekeeping_update(tk, true, true);
464 512
465 write_sequnlock_irqrestore(&tk->lock, flags); 513 write_seqcount_end(&timekeeper_seq);
514 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
466 515
467 /* signal hrtimers about time change */ 516 /* signal hrtimers about time change */
468 clock_was_set(); 517 clock_was_set();
@@ -487,7 +536,8 @@ int timekeeping_inject_offset(struct timespec *ts)
487 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 536 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
488 return -EINVAL; 537 return -EINVAL;
489 538
490 write_seqlock_irqsave(&tk->lock, flags); 539 raw_spin_lock_irqsave(&timekeeper_lock, flags);
540 write_seqcount_begin(&timekeeper_seq);
491 541
492 timekeeping_forward_now(tk); 542 timekeeping_forward_now(tk);
493 543
@@ -502,9 +552,10 @@ int timekeeping_inject_offset(struct timespec *ts)
502 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 552 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
503 553
504error: /* even if we error out, we forwarded the time, so call update */ 554error: /* even if we error out, we forwarded the time, so call update */
505 timekeeping_update(tk, true); 555 timekeeping_update(tk, true, true);
506 556
507 write_sequnlock_irqrestore(&tk->lock, flags); 557 write_seqcount_end(&timekeeper_seq);
558 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
508 559
509 /* signal hrtimers about time change */ 560 /* signal hrtimers about time change */
510 clock_was_set(); 561 clock_was_set();
@@ -513,6 +564,52 @@ error: /* even if we error out, we forwarded the time, so call update */
513} 564}
514EXPORT_SYMBOL(timekeeping_inject_offset); 565EXPORT_SYMBOL(timekeeping_inject_offset);
515 566
567
568/**
569 * timekeeping_get_tai_offset - Returns current TAI offset from UTC
570 *
571 */
572s32 timekeeping_get_tai_offset(void)
573{
574 struct timekeeper *tk = &timekeeper;
575 unsigned int seq;
576 s32 ret;
577
578 do {
579 seq = read_seqcount_begin(&timekeeper_seq);
580 ret = tk->tai_offset;
581 } while (read_seqcount_retry(&timekeeper_seq, seq));
582
583 return ret;
584}
585
586/**
587 * __timekeeping_set_tai_offset - Lock free worker function
588 *
589 */
590static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
591{
592 tk->tai_offset = tai_offset;
593 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0));
594}
595
596/**
597 * timekeeping_set_tai_offset - Sets the current TAI offset from UTC
598 *
599 */
600void timekeeping_set_tai_offset(s32 tai_offset)
601{
602 struct timekeeper *tk = &timekeeper;
603 unsigned long flags;
604
605 raw_spin_lock_irqsave(&timekeeper_lock, flags);
606 write_seqcount_begin(&timekeeper_seq);
607 __timekeeping_set_tai_offset(tk, tai_offset);
608 write_seqcount_end(&timekeeper_seq);
609 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
610 clock_was_set();
611}
612
516/** 613/**
517 * change_clocksource - Swaps clocksources if a new one is available 614 * change_clocksource - Swaps clocksources if a new one is available
518 * 615 *
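timekeeping_set_tai_offset() updates tai_offset and offs_tai together under timekeeper_lock/timekeeper_seq and then signals clock_was_set(); the offset it manages is the same TAI-UTC value that __do_adjtimex() reports back in txc->tai in the ntp.c hunk above. Reading it from user space needs nothing more than a read-only adjtimex() call, as in this runnable example:

/* read_tai.c - query the kernel's current TAI-UTC offset in seconds. */
#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

int main(void)
{
	struct timex txc;

	memset(&txc, 0, sizeof(txc));
	txc.modes = 0;			/* read-only query, no privilege needed */

	if (adjtimex(&txc) == -1) {
		perror("adjtimex");
		return 1;
	}
	printf("TAI-UTC offset: %d s\n", txc.tai);
	return 0;
}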
@@ -526,7 +623,8 @@ static int change_clocksource(void *data)
526 623
527 new = (struct clocksource *) data; 624 new = (struct clocksource *) data;
528 625
529 write_seqlock_irqsave(&tk->lock, flags); 626 raw_spin_lock_irqsave(&timekeeper_lock, flags);
627 write_seqcount_begin(&timekeeper_seq);
530 628
531 timekeeping_forward_now(tk); 629 timekeeping_forward_now(tk);
532 if (!new->enable || new->enable(new) == 0) { 630 if (!new->enable || new->enable(new) == 0) {
@@ -535,9 +633,10 @@ static int change_clocksource(void *data)
535 if (old->disable) 633 if (old->disable)
536 old->disable(old); 634 old->disable(old);
537 } 635 }
538 timekeeping_update(tk, true); 636 timekeeping_update(tk, true, true);
539 637
540 write_sequnlock_irqrestore(&tk->lock, flags); 638 write_seqcount_end(&timekeeper_seq);
639 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
541 640
542 return 0; 641 return 0;
543} 642}
@@ -587,11 +686,11 @@ void getrawmonotonic(struct timespec *ts)
587 s64 nsecs; 686 s64 nsecs;
588 687
589 do { 688 do {
590 seq = read_seqbegin(&tk->lock); 689 seq = read_seqcount_begin(&timekeeper_seq);
591 nsecs = timekeeping_get_ns_raw(tk); 690 nsecs = timekeeping_get_ns_raw(tk);
592 *ts = tk->raw_time; 691 *ts = tk->raw_time;
593 692
594 } while (read_seqretry(&tk->lock, seq)); 693 } while (read_seqcount_retry(&timekeeper_seq, seq));
595 694
596 timespec_add_ns(ts, nsecs); 695 timespec_add_ns(ts, nsecs);
597} 696}
@@ -607,11 +706,11 @@ int timekeeping_valid_for_hres(void)
607 int ret; 706 int ret;
608 707
609 do { 708 do {
610 seq = read_seqbegin(&tk->lock); 709 seq = read_seqcount_begin(&timekeeper_seq);
611 710
612 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 711 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
613 712
614 } while (read_seqretry(&tk->lock, seq)); 713 } while (read_seqcount_retry(&timekeeper_seq, seq));
615 714
616 return ret; 715 return ret;
617} 716}
@@ -626,11 +725,11 @@ u64 timekeeping_max_deferment(void)
626 u64 ret; 725 u64 ret;
627 726
628 do { 727 do {
629 seq = read_seqbegin(&tk->lock); 728 seq = read_seqcount_begin(&timekeeper_seq);
630 729
631 ret = tk->clock->max_idle_ns; 730 ret = tk->clock->max_idle_ns;
632 731
633 } while (read_seqretry(&tk->lock, seq)); 732 } while (read_seqcount_retry(&timekeeper_seq, seq));
634 733
635 return ret; 734 return ret;
636} 735}
@@ -693,11 +792,10 @@ void __init timekeeping_init(void)
693 boot.tv_nsec = 0; 792 boot.tv_nsec = 0;
694 } 793 }
695 794
696 seqlock_init(&tk->lock); 795 raw_spin_lock_irqsave(&timekeeper_lock, flags);
697 796 write_seqcount_begin(&timekeeper_seq);
698 ntp_init(); 797 ntp_init();
699 798
700 write_seqlock_irqsave(&tk->lock, flags);
701 clock = clocksource_default_clock(); 799 clock = clocksource_default_clock();
702 if (clock->enable) 800 if (clock->enable)
703 clock->enable(clock); 801 clock->enable(clock);
@@ -716,7 +814,10 @@ void __init timekeeping_init(void)
716 tmp.tv_nsec = 0; 814 tmp.tv_nsec = 0;
717 tk_set_sleep_time(tk, tmp); 815 tk_set_sleep_time(tk, tmp);
718 816
719 write_sequnlock_irqrestore(&tk->lock, flags); 817 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
818
819 write_seqcount_end(&timekeeper_seq);
820 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
720} 821}
721 822
722/* time in seconds when suspend began */ 823/* time in seconds when suspend began */
@@ -764,15 +865,17 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
764 if (has_persistent_clock()) 865 if (has_persistent_clock())
765 return; 866 return;
766 867
767 write_seqlock_irqsave(&tk->lock, flags); 868 raw_spin_lock_irqsave(&timekeeper_lock, flags);
869 write_seqcount_begin(&timekeeper_seq);
768 870
769 timekeeping_forward_now(tk); 871 timekeeping_forward_now(tk);
770 872
771 __timekeeping_inject_sleeptime(tk, delta); 873 __timekeeping_inject_sleeptime(tk, delta);
772 874
773 timekeeping_update(tk, true); 875 timekeeping_update(tk, true, true);
774 876
775 write_sequnlock_irqrestore(&tk->lock, flags); 877 write_seqcount_end(&timekeeper_seq);
878 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
776 879
777 /* signal hrtimers about time change */ 880 /* signal hrtimers about time change */
778 clock_was_set(); 881 clock_was_set();
@@ -788,26 +891,72 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
788static void timekeeping_resume(void) 891static void timekeeping_resume(void)
789{ 892{
790 struct timekeeper *tk = &timekeeper; 893 struct timekeeper *tk = &timekeeper;
894 struct clocksource *clock = tk->clock;
791 unsigned long flags; 895 unsigned long flags;
792 struct timespec ts; 896 struct timespec ts_new, ts_delta;
897 cycle_t cycle_now, cycle_delta;
898 bool suspendtime_found = false;
793 899
794 read_persistent_clock(&ts); 900 read_persistent_clock(&ts_new);
795 901
796 clockevents_resume(); 902 clockevents_resume();
797 clocksource_resume(); 903 clocksource_resume();
798 904
799 write_seqlock_irqsave(&tk->lock, flags); 905 raw_spin_lock_irqsave(&timekeeper_lock, flags);
906 write_seqcount_begin(&timekeeper_seq);
907
908 /*
 909 * After the system resumes, we need to calculate the suspended time
 910 * and compensate the OS time for it. There are three sources that
 911 * could be used: a nonstop clocksource during suspend, the persistent
 912 * clock and the rtc device.
 913 *
 914 * A given platform may have one, two or all of them, and the
 915 * preference is:
 916 * suspend-nonstop clocksource -> persistent clock -> rtc
 917 * A less preferred source is only tried if there is no better
 918 * usable source. The rtc part is handled separately in rtc core code.
919 */
920 cycle_now = clock->read(clock);
921 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
922 cycle_now > clock->cycle_last) {
923 u64 num, max = ULLONG_MAX;
924 u32 mult = clock->mult;
925 u32 shift = clock->shift;
926 s64 nsec = 0;
927
928 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
800 929
801 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 930 /*
802 ts = timespec_sub(ts, timekeeping_suspend_time); 931 * "cycle_delta * mutl" may cause 64 bits overflow, if the
803 __timekeeping_inject_sleeptime(tk, &ts); 932 * suspended time is too long. In that case we need do the
933 * 64 bits math carefully
934 */
935 do_div(max, mult);
936 if (cycle_delta > max) {
937 num = div64_u64(cycle_delta, max);
938 nsec = (((u64) max * mult) >> shift) * num;
939 cycle_delta -= num * max;
940 }
941 nsec += ((u64) cycle_delta * mult) >> shift;
942
943 ts_delta = ns_to_timespec(nsec);
944 suspendtime_found = true;
945 } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) {
946 ts_delta = timespec_sub(ts_new, timekeeping_suspend_time);
947 suspendtime_found = true;
804 } 948 }
805 /* re-base the last cycle value */ 949
806 tk->clock->cycle_last = tk->clock->read(tk->clock); 950 if (suspendtime_found)
951 __timekeeping_inject_sleeptime(tk, &ts_delta);
952
953 /* Re-base the last cycle value */
954 tk->cycle_last = clock->cycle_last = cycle_now;
807 tk->ntp_error = 0; 955 tk->ntp_error = 0;
808 timekeeping_suspended = 0; 956 timekeeping_suspended = 0;
809 timekeeping_update(tk, false); 957 timekeeping_update(tk, false, true);
810 write_sequnlock_irqrestore(&tk->lock, flags); 958 write_seqcount_end(&timekeeper_seq);
959 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
811 960
812 touch_softlockup_watchdog(); 961 touch_softlockup_watchdog();
813 962
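The chunked multiply above can be exercised on its own: max is the largest cycle count whose product with mult still fits in 64 bits, so the suspended delta is converted max cycles at a time. A standalone user-space sketch of the same arithmetic (hypothetical cycles_to_ns() helper; the mult/shift values are made-up clocksource parameters):

#include <stdint.h>
#include <stdio.h>

/* (cycles * mult) >> shift, computed without overflowing 64 bits. */
static uint64_t cycles_to_ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	uint64_t max = UINT64_MAX / mult;	/* largest safe multiplicand */
	uint64_t nsec = 0;

	if (cycles > max) {
		uint64_t num = cycles / max;

		nsec = ((max * mult) >> shift) * num;
		cycles -= num * max;
	}
	return nsec + ((cycles * mult) >> shift);
}

int main(void)
{
	/* ~2 hours of a 2.5 GHz counter: a naive 64-bit multiply overflows. */
	uint64_t ns = cycles_to_ns(18000000000000ULL, 6710886, 24);

	printf("suspended for ~%llu seconds\n",
	       (unsigned long long)(ns / 1000000000ULL));
	return 0;
}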
@@ -826,7 +975,8 @@ static int timekeeping_suspend(void)
826 975
827 read_persistent_clock(&timekeeping_suspend_time); 976 read_persistent_clock(&timekeeping_suspend_time);
828 977
829 write_seqlock_irqsave(&tk->lock, flags); 978 raw_spin_lock_irqsave(&timekeeper_lock, flags);
979 write_seqcount_begin(&timekeeper_seq);
830 timekeeping_forward_now(tk); 980 timekeeping_forward_now(tk);
831 timekeeping_suspended = 1; 981 timekeeping_suspended = 1;
832 982
@@ -849,7 +999,8 @@ static int timekeeping_suspend(void)
849 timekeeping_suspend_time = 999 timekeeping_suspend_time =
850 timespec_add(timekeeping_suspend_time, delta_delta); 1000 timespec_add(timekeeping_suspend_time, delta_delta);
851 } 1001 }
852 write_sequnlock_irqrestore(&tk->lock, flags); 1002 write_seqcount_end(&timekeeper_seq);
1003 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
853 1004
854 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 1005 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
855 clocksource_suspend(); 1006 clocksource_suspend();
@@ -1099,6 +1250,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1099 tk_set_wall_to_mono(tk, 1250 tk_set_wall_to_mono(tk,
1100 timespec_sub(tk->wall_to_monotonic, ts)); 1251 timespec_sub(tk->wall_to_monotonic, ts));
1101 1252
1253 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1254
1102 clock_was_set_delayed(); 1255 clock_was_set_delayed();
1103 } 1256 }
1104 } 1257 }
@@ -1116,15 +1269,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1116static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, 1269static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1117 u32 shift) 1270 u32 shift)
1118{ 1271{
1272 cycle_t interval = tk->cycle_interval << shift;
1119 u64 raw_nsecs; 1273 u64 raw_nsecs;
1120 1274
 1121 /* If the offset is smaller than a shifted interval, do nothing */ 1275 /* If the offset is smaller than a shifted interval, do nothing */
1122 if (offset < tk->cycle_interval<<shift) 1276 if (offset < interval)
1123 return offset; 1277 return offset;
1124 1278
1125 /* Accumulate one shifted interval */ 1279 /* Accumulate one shifted interval */
1126 offset -= tk->cycle_interval << shift; 1280 offset -= interval;
1127 tk->clock->cycle_last += tk->cycle_interval << shift; 1281 tk->cycle_last += interval;
1128 1282
1129 tk->xtime_nsec += tk->xtime_interval << shift; 1283 tk->xtime_nsec += tk->xtime_interval << shift;
1130 accumulate_nsecs_to_secs(tk); 1284 accumulate_nsecs_to_secs(tk);
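logarithmic_accumulation() consumes a single interval << shift chunk per call; its caller (update_wall_time, partly outside this hunk) picks the largest shift that still fits the remaining offset and lowers it as the remainder shrinks, so a long backlog is absorbed in O(log n) steps rather than one tick at a time. A standalone sketch of that chunking idea only, with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t interval = 1000000;		/* cycles per tick (assumed) */
	uint64_t offset = 37 * interval + 123;	/* backlog to accumulate */
	unsigned int ticks = 0, shift = 0;

	/* Start with the largest power-of-two chunk that fits. */
	while (interval << (shift + 1) <= offset)
		shift++;

	while (offset >= interval) {
		while (interval << shift > offset)
			shift--;		/* shrink the chunk size */
		offset -= interval << shift;
		ticks += 1U << shift;		/* intervals accumulated */
	}
	printf("accumulated %u intervals, %llu cycles left\n",
	       ticks, (unsigned long long)offset);
	return 0;
}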
@@ -1181,27 +1335,28 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
1181static void update_wall_time(void) 1335static void update_wall_time(void)
1182{ 1336{
1183 struct clocksource *clock; 1337 struct clocksource *clock;
1184 struct timekeeper *tk = &timekeeper; 1338 struct timekeeper *real_tk = &timekeeper;
1339 struct timekeeper *tk = &shadow_timekeeper;
1185 cycle_t offset; 1340 cycle_t offset;
1186 int shift = 0, maxshift; 1341 int shift = 0, maxshift;
1187 unsigned long flags; 1342 unsigned long flags;
1188 1343
1189 write_seqlock_irqsave(&tk->lock, flags); 1344 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1190 1345
1191 /* Make sure we're fully resumed: */ 1346 /* Make sure we're fully resumed: */
1192 if (unlikely(timekeeping_suspended)) 1347 if (unlikely(timekeeping_suspended))
1193 goto out; 1348 goto out;
1194 1349
1195 clock = tk->clock; 1350 clock = real_tk->clock;
1196 1351
1197#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1352#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1198 offset = tk->cycle_interval; 1353 offset = real_tk->cycle_interval;
1199#else 1354#else
1200 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1355 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
1201#endif 1356#endif
1202 1357
1203 /* Check if there's really nothing to do */ 1358 /* Check if there's really nothing to do */
1204 if (offset < tk->cycle_interval) 1359 if (offset < real_tk->cycle_interval)
1205 goto out; 1360 goto out;
1206 1361
1207 /* 1362 /*
@@ -1238,11 +1393,24 @@ static void update_wall_time(void)
1238 */ 1393 */
1239 accumulate_nsecs_to_secs(tk); 1394 accumulate_nsecs_to_secs(tk);
1240 1395
1241 timekeeping_update(tk, false); 1396 write_seqcount_begin(&timekeeper_seq);
1242 1397 /* Update clock->cycle_last with the new value */
1398 clock->cycle_last = tk->cycle_last;
1399 /*
1400 * Update the real timekeeper.
1401 *
1402 * We could avoid this memcpy by switching pointers, but that
1403 * requires changes to all other timekeeper usage sites as
1404 * well, i.e. move the timekeeper pointer getter into the
1405 * spinlocked/seqcount protected sections. And we trade this
1406 * memcpy under the timekeeper_seq against one before we start
1407 * updating.
1408 */
1409 memcpy(real_tk, tk, sizeof(*tk));
1410 timekeeping_update(real_tk, false, false);
1411 write_seqcount_end(&timekeeper_seq);
1243out: 1412out:
1244 write_sequnlock_irqrestore(&tk->lock, flags); 1413 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1245
1246} 1414}
1247 1415
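update_wall_time() now shows the write side of the new locking scheme: timekeeper_lock serializes writers, the heavy lifting happens on the shadow copy, and only the final memcpy into the live structure sits inside the seqcount, so readers retry across a short window. A condensed kernel-style sketch of that pattern (hypothetical foo names), complementing the reader loop sketched earlier:

#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/types.h>

struct foo_state { u64 a, b; };

static struct foo_state foo_live, foo_shadow;
static DEFINE_RAW_SPINLOCK(foo_lock);	/* serializes writers */
static seqcount_t foo_seq;		/* guards readers of foo_live */

static void foo_update(u64 a, u64 b)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&foo_lock, flags);

	/* Expensive work on the shadow copy, outside the reader window. */
	foo_shadow.a = a;
	foo_shadow.b = b;

	write_seqcount_begin(&foo_seq);
	memcpy(&foo_live, &foo_shadow, sizeof(foo_live));
	write_seqcount_end(&foo_seq);

	raw_spin_unlock_irqrestore(&foo_lock, flags);
}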
1248/** 1416/**
@@ -1289,13 +1457,13 @@ void get_monotonic_boottime(struct timespec *ts)
1289 WARN_ON(timekeeping_suspended); 1457 WARN_ON(timekeeping_suspended);
1290 1458
1291 do { 1459 do {
1292 seq = read_seqbegin(&tk->lock); 1460 seq = read_seqcount_begin(&timekeeper_seq);
1293 ts->tv_sec = tk->xtime_sec; 1461 ts->tv_sec = tk->xtime_sec;
1294 nsec = timekeeping_get_ns(tk); 1462 nsec = timekeeping_get_ns(tk);
1295 tomono = tk->wall_to_monotonic; 1463 tomono = tk->wall_to_monotonic;
1296 sleep = tk->total_sleep_time; 1464 sleep = tk->total_sleep_time;
1297 1465
1298 } while (read_seqretry(&tk->lock, seq)); 1466 } while (read_seqcount_retry(&timekeeper_seq, seq));
1299 1467
1300 ts->tv_sec += tomono.tv_sec + sleep.tv_sec; 1468 ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
1301 ts->tv_nsec = 0; 1469 ts->tv_nsec = 0;
@@ -1354,10 +1522,10 @@ struct timespec current_kernel_time(void)
1354 unsigned long seq; 1522 unsigned long seq;
1355 1523
1356 do { 1524 do {
1357 seq = read_seqbegin(&tk->lock); 1525 seq = read_seqcount_begin(&timekeeper_seq);
1358 1526
1359 now = tk_xtime(tk); 1527 now = tk_xtime(tk);
1360 } while (read_seqretry(&tk->lock, seq)); 1528 } while (read_seqcount_retry(&timekeeper_seq, seq));
1361 1529
1362 return now; 1530 return now;
1363} 1531}
@@ -1370,11 +1538,11 @@ struct timespec get_monotonic_coarse(void)
1370 unsigned long seq; 1538 unsigned long seq;
1371 1539
1372 do { 1540 do {
1373 seq = read_seqbegin(&tk->lock); 1541 seq = read_seqcount_begin(&timekeeper_seq);
1374 1542
1375 now = tk_xtime(tk); 1543 now = tk_xtime(tk);
1376 mono = tk->wall_to_monotonic; 1544 mono = tk->wall_to_monotonic;
1377 } while (read_seqretry(&tk->lock, seq)); 1545 } while (read_seqcount_retry(&timekeeper_seq, seq));
1378 1546
1379 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1547 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1380 now.tv_nsec + mono.tv_nsec); 1548 now.tv_nsec + mono.tv_nsec);
@@ -1405,11 +1573,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1405 unsigned long seq; 1573 unsigned long seq;
1406 1574
1407 do { 1575 do {
1408 seq = read_seqbegin(&tk->lock); 1576 seq = read_seqcount_begin(&timekeeper_seq);
1409 *xtim = tk_xtime(tk); 1577 *xtim = tk_xtime(tk);
1410 *wtom = tk->wall_to_monotonic; 1578 *wtom = tk->wall_to_monotonic;
1411 *sleep = tk->total_sleep_time; 1579 *sleep = tk->total_sleep_time;
1412 } while (read_seqretry(&tk->lock, seq)); 1580 } while (read_seqcount_retry(&timekeeper_seq, seq));
1413} 1581}
1414 1582
1415#ifdef CONFIG_HIGH_RES_TIMERS 1583#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1421,7 +1589,8 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1421 * Returns current monotonic time and updates the offsets 1589 * Returns current monotonic time and updates the offsets
 1422 * Called from hrtimer_interrupt() or retrigger_next_event() 1590 * Called from hrtimer_interrupt() or retrigger_next_event()
1423 */ 1591 */
1424ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) 1592ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
1593 ktime_t *offs_tai)
1425{ 1594{
1426 struct timekeeper *tk = &timekeeper; 1595 struct timekeeper *tk = &timekeeper;
1427 ktime_t now; 1596 ktime_t now;
@@ -1429,14 +1598,15 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1429 u64 secs, nsecs; 1598 u64 secs, nsecs;
1430 1599
1431 do { 1600 do {
1432 seq = read_seqbegin(&tk->lock); 1601 seq = read_seqcount_begin(&timekeeper_seq);
1433 1602
1434 secs = tk->xtime_sec; 1603 secs = tk->xtime_sec;
1435 nsecs = timekeeping_get_ns(tk); 1604 nsecs = timekeeping_get_ns(tk);
1436 1605
1437 *offs_real = tk->offs_real; 1606 *offs_real = tk->offs_real;
1438 *offs_boot = tk->offs_boot; 1607 *offs_boot = tk->offs_boot;
1439 } while (read_seqretry(&tk->lock, seq)); 1608 *offs_tai = tk->offs_tai;
1609 } while (read_seqcount_retry(&timekeeper_seq, seq));
1440 1610
1441 now = ktime_add_ns(ktime_set(secs, 0), nsecs); 1611 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1442 now = ktime_sub(now, *offs_real); 1612 now = ktime_sub(now, *offs_real);
@@ -1454,15 +1624,79 @@ ktime_t ktime_get_monotonic_offset(void)
1454 struct timespec wtom; 1624 struct timespec wtom;
1455 1625
1456 do { 1626 do {
1457 seq = read_seqbegin(&tk->lock); 1627 seq = read_seqcount_begin(&timekeeper_seq);
1458 wtom = tk->wall_to_monotonic; 1628 wtom = tk->wall_to_monotonic;
1459 } while (read_seqretry(&tk->lock, seq)); 1629 } while (read_seqcount_retry(&timekeeper_seq, seq));
1460 1630
1461 return timespec_to_ktime(wtom); 1631 return timespec_to_ktime(wtom);
1462} 1632}
1463EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); 1633EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1464 1634
1465/** 1635/**
1636 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
1637 */
1638int do_adjtimex(struct timex *txc)
1639{
1640 struct timekeeper *tk = &timekeeper;
1641 unsigned long flags;
1642 struct timespec ts;
1643 s32 orig_tai, tai;
1644 int ret;
1645
1646 /* Validate the data before disabling interrupts */
1647 ret = ntp_validate_timex(txc);
1648 if (ret)
1649 return ret;
1650
1651 if (txc->modes & ADJ_SETOFFSET) {
1652 struct timespec delta;
1653 delta.tv_sec = txc->time.tv_sec;
1654 delta.tv_nsec = txc->time.tv_usec;
1655 if (!(txc->modes & ADJ_NANO))
1656 delta.tv_nsec *= 1000;
1657 ret = timekeeping_inject_offset(&delta);
1658 if (ret)
1659 return ret;
1660 }
1661
1662 getnstimeofday(&ts);
1663
1664 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1665 write_seqcount_begin(&timekeeper_seq);
1666
1667 orig_tai = tai = tk->tai_offset;
1668 ret = __do_adjtimex(txc, &ts, &tai);
1669
1670 if (tai != orig_tai) {
1671 __timekeeping_set_tai_offset(tk, tai);
1672 clock_was_set_delayed();
1673 }
1674 write_seqcount_end(&timekeeper_seq);
1675 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1676
1677 return ret;
1678}
1679
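The ADJ_SETOFFSET branch above is what user space reaches through adjtimex(2). A hedged user-space example that steps the clock forward by 1.5 ms (needs CAP_SYS_TIME; with ADJ_NANO the time.tv_usec field carries nanoseconds, per adjtimex(2)):

#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx;

	memset(&tx, 0, sizeof(tx));
	tx.modes = ADJ_SETOFFSET | ADJ_NANO;	/* tv_usec is nanoseconds */
	tx.time.tv_sec = 0;
	tx.time.tv_usec = 1500000;		/* +1.5 ms */

	if (adjtimex(&tx) < 0) {
		perror("adjtimex");
		return 1;
	}
	return 0;
}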
1680#ifdef CONFIG_NTP_PPS
1681/**
1682 * hardpps() - Accessor function to NTP __hardpps function
1683 */
1684void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
1685{
1686 unsigned long flags;
1687
1688 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1689 write_seqcount_begin(&timekeeper_seq);
1690
1691 __hardpps(phase_ts, raw_ts);
1692
1693 write_seqcount_end(&timekeeper_seq);
1694 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1695}
1696EXPORT_SYMBOL(hardpps);
1697#endif
1698
1699/**
1466 * xtime_update() - advances the timekeeping infrastructure 1700 * xtime_update() - advances the timekeeping infrastructure
1467 * @ticks: number of ticks, that have elapsed since the last call. 1701 * @ticks: number of ticks, that have elapsed since the last call.
1468 * 1702 *
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index af5a7e9f164b..3bdf28323012 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -20,6 +20,13 @@
20 20
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22 22
23
24struct timer_list_iter {
25 int cpu;
26 bool second_pass;
27 u64 now;
28};
29
23typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); 30typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
24 31
25DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); 32DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
@@ -133,7 +140,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
133 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 140 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
134 int i; 141 int i;
135 142
136 SEQ_printf(m, "\n");
137 SEQ_printf(m, "cpu: %d\n", cpu); 143 SEQ_printf(m, "cpu: %d\n", cpu);
138 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 144 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
139 SEQ_printf(m, " clock %d:\n", i); 145 SEQ_printf(m, " clock %d:\n", i);
@@ -187,6 +193,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
187 193
188#undef P 194#undef P
189#undef P_ns 195#undef P_ns
196 SEQ_printf(m, "\n");
190} 197}
191 198
192#ifdef CONFIG_GENERIC_CLOCKEVENTS 199#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -195,7 +202,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
195{ 202{
196 struct clock_event_device *dev = td->evtdev; 203 struct clock_event_device *dev = td->evtdev;
197 204
198 SEQ_printf(m, "\n");
199 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); 205 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
200 if (cpu < 0) 206 if (cpu < 0)
201 SEQ_printf(m, "Broadcast device\n"); 207 SEQ_printf(m, "Broadcast device\n");
@@ -230,12 +236,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
230 print_name_offset(m, dev->event_handler); 236 print_name_offset(m, dev->event_handler);
231 SEQ_printf(m, "\n"); 237 SEQ_printf(m, "\n");
232 SEQ_printf(m, " retries: %lu\n", dev->retries); 238 SEQ_printf(m, " retries: %lu\n", dev->retries);
239 SEQ_printf(m, "\n");
233} 240}
234 241
235static void timer_list_show_tickdevices(struct seq_file *m) 242static void timer_list_show_tickdevices_header(struct seq_file *m)
236{ 243{
237 int cpu;
238
239#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 244#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
240 print_tickdevice(m, tick_get_broadcast_device(), -1); 245 print_tickdevice(m, tick_get_broadcast_device(), -1);
241 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 246 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
@@ -246,47 +251,104 @@ static void timer_list_show_tickdevices(struct seq_file *m)
246#endif 251#endif
247 SEQ_printf(m, "\n"); 252 SEQ_printf(m, "\n");
248#endif 253#endif
249 for_each_online_cpu(cpu)
250 print_tickdevice(m, tick_get_device(cpu), cpu);
251 SEQ_printf(m, "\n");
252} 254}
253#else
254static void timer_list_show_tickdevices(struct seq_file *m) { }
255#endif 255#endif
256 256
257static inline void timer_list_header(struct seq_file *m, u64 now)
258{
259 SEQ_printf(m, "Timer List Version: v0.7\n");
260 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
261 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
262 SEQ_printf(m, "\n");
263}
264
257static int timer_list_show(struct seq_file *m, void *v) 265static int timer_list_show(struct seq_file *m, void *v)
258{ 266{
267 struct timer_list_iter *iter = v;
268 u64 now = ktime_to_ns(ktime_get());
269
270 if (iter->cpu == -1 && !iter->second_pass)
271 timer_list_header(m, now);
272 else if (!iter->second_pass)
273 print_cpu(m, iter->cpu, iter->now);
274#ifdef CONFIG_GENERIC_CLOCKEVENTS
275 else if (iter->cpu == -1 && iter->second_pass)
276 timer_list_show_tickdevices_header(m);
277 else
278 print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
279#endif
280 return 0;
281}
282
283void sysrq_timer_list_show(void)
284{
259 u64 now = ktime_to_ns(ktime_get()); 285 u64 now = ktime_to_ns(ktime_get());
260 int cpu; 286 int cpu;
261 287
262 SEQ_printf(m, "Timer List Version: v0.7\n"); 288 timer_list_header(NULL, now);
263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
265 289
266 for_each_online_cpu(cpu) 290 for_each_online_cpu(cpu)
267 print_cpu(m, cpu, now); 291 print_cpu(NULL, cpu, now);
268 292
269 SEQ_printf(m, "\n"); 293#ifdef CONFIG_GENERIC_CLOCKEVENTS
270 timer_list_show_tickdevices(m); 294 timer_list_show_tickdevices_header(NULL);
295 for_each_online_cpu(cpu)
296 print_tickdevice(NULL, tick_get_device(cpu), cpu);
297#endif
298 return;
299}
271 300
272 return 0; 301static void *timer_list_start(struct seq_file *file, loff_t *offset)
302{
303 struct timer_list_iter *iter = file->private;
304
305 if (!*offset) {
306 iter->cpu = -1;
307 iter->now = ktime_to_ns(ktime_get());
308 } else if (iter->cpu >= nr_cpu_ids) {
309#ifdef CONFIG_GENERIC_CLOCKEVENTS
310 if (!iter->second_pass) {
311 iter->cpu = -1;
312 iter->second_pass = true;
313 } else
314 return NULL;
315#else
316 return NULL;
317#endif
318 }
319 return iter;
273} 320}
274 321
275void sysrq_timer_list_show(void) 322static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
323{
324 struct timer_list_iter *iter = file->private;
325 iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
326 ++*offset;
327 return timer_list_start(file, offset);
328}
329
330static void timer_list_stop(struct seq_file *seq, void *v)
276{ 331{
277 timer_list_show(NULL, NULL);
278} 332}
279 333
334static const struct seq_operations timer_list_sops = {
335 .start = timer_list_start,
336 .next = timer_list_next,
337 .stop = timer_list_stop,
338 .show = timer_list_show,
339};
340
280static int timer_list_open(struct inode *inode, struct file *filp) 341static int timer_list_open(struct inode *inode, struct file *filp)
281{ 342{
282 return single_open(filp, timer_list_show, NULL); 343 return seq_open_private(filp, &timer_list_sops,
344 sizeof(struct timer_list_iter));
283} 345}
284 346
285static const struct file_operations timer_list_fops = { 347static const struct file_operations timer_list_fops = {
286 .open = timer_list_open, 348 .open = timer_list_open,
287 .read = seq_read, 349 .read = seq_read,
288 .llseek = seq_lseek, 350 .llseek = seq_lseek,
289 .release = single_release, 351 .release = seq_release_private,
290}; 352};
291 353
292static int __init init_timer_list_procfs(void) 354static int __init init_timer_list_procfs(void)
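The timer_list conversion above trades single_open() for a stateful seq_file iterator so output is produced record by record instead of in one big buffer. Reduced to its essentials, the pattern looks like this (hypothetical foo names; a sketch, not the timer_list code itself):

#include <linux/fs.h>
#include <linux/seq_file.h>

struct foo_iter { int idx; };

static void *foo_start(struct seq_file *m, loff_t *pos)
{
	struct foo_iter *iter = m->private;	/* allocated by seq_open_private() */

	iter->idx = *pos;
	return iter->idx < 4 ? iter : NULL;	/* four records, then stop */
}

static void *foo_next(struct seq_file *m, void *v, loff_t *pos)
{
	++*pos;
	return foo_start(m, pos);
}

static void foo_stop(struct seq_file *m, void *v) { }

static int foo_show(struct seq_file *m, void *v)
{
	struct foo_iter *iter = v;

	seq_printf(m, "record %d\n", iter->idx);
	return 0;
}

static const struct seq_operations foo_sops = {
	.start = foo_start,
	.next  = foo_next,
	.stop  = foo_stop,
	.show  = foo_show,
};

static int foo_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &foo_sops, sizeof(struct foo_iter));
}

static const struct file_operations foo_fops = {
	.open    = foo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};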
diff --git a/kernel/timer.c b/kernel/timer.c
index dbf7a78a1ef1..a860bba34412 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * linux/kernel/timer.c 2 * linux/kernel/timer.c
3 * 3 *
4 * Kernel internal timers, basic process system calls 4 * Kernel internal timers
5 * 5 *
6 * Copyright (C) 1991, 1992 Linus Torvalds 6 * Copyright (C) 1991, 1992 Linus Torvalds
7 * 7 *
@@ -41,6 +41,7 @@
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/sched/sysctl.h> 42#include <linux/sched/sysctl.h>
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/compat.h>
44 45
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
46#include <asm/unistd.h> 47#include <asm/unistd.h>
@@ -738,7 +739,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
738 739
739 cpu = smp_processor_id(); 740 cpu = smp_processor_id();
740 741
741#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 742#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
742 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) 743 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
743 cpu = get_nohz_timer_target(); 744 cpu = get_nohz_timer_target();
744#endif 745#endif
@@ -930,14 +931,14 @@ void add_timer_on(struct timer_list *timer, int cpu)
930 debug_activate(timer, timer->expires); 931 debug_activate(timer, timer->expires);
931 internal_add_timer(base, timer); 932 internal_add_timer(base, timer);
932 /* 933 /*
933 * Check whether the other CPU is idle and needs to be 934 * Check whether the other CPU is in dynticks mode and needs
934 * triggered to reevaluate the timer wheel when nohz is 935 * to be triggered to reevaluate the timer wheel.
935 * active. We are protected against the other CPU fiddling 936 * We are protected against the other CPU fiddling
936 * with the timer by holding the timer base lock. This also 937 * with the timer by holding the timer base lock. This also
937 * makes sure that a CPU on the way to idle can not evaluate 938 * makes sure that a CPU on the way to stop its tick can not
938 * the timer wheel. 939 * evaluate the timer wheel.
939 */ 940 */
940 wake_up_idle_cpu(cpu); 941 wake_up_nohz_cpu(cpu);
941 spin_unlock_irqrestore(&base->lock, flags); 942 spin_unlock_irqrestore(&base->lock, flags);
942} 943}
943EXPORT_SYMBOL_GPL(add_timer_on); 944EXPORT_SYMBOL_GPL(add_timer_on);
@@ -1188,7 +1189,7 @@ static inline void __run_timers(struct tvec_base *base)
1188 spin_unlock_irq(&base->lock); 1189 spin_unlock_irq(&base->lock);
1189} 1190}
1190 1191
1191#ifdef CONFIG_NO_HZ 1192#ifdef CONFIG_NO_HZ_COMMON
1192/* 1193/*
1193 * Find out when the next timer event is due to happen. This 1194 * Find out when the next timer event is due to happen. This
1194 * is used on S/390 to stop all activity when a CPU is idle. 1195 * is used on S/390 to stop all activity when a CPU is idle.
@@ -1395,61 +1396,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1395 1396
1396#endif 1397#endif
1397 1398
1398/**
1399 * sys_getpid - return the thread group id of the current process
1400 *
1401 * Note, despite the name, this returns the tgid not the pid. The tgid and
1402 * the pid are identical unless CLONE_THREAD was specified on clone() in
1403 * which case the tgid is the same in all threads of the same group.
1404 *
1405 * This is SMP safe as current->tgid does not change.
1406 */
1407SYSCALL_DEFINE0(getpid)
1408{
1409 return task_tgid_vnr(current);
1410}
1411
1412/*
1413 * Accessing ->real_parent is not SMP-safe, it could
1414 * change from under us. However, we can use a stale
1415 * value of ->real_parent under rcu_read_lock(), see
1416 * release_task()->call_rcu(delayed_put_task_struct).
1417 */
1418SYSCALL_DEFINE0(getppid)
1419{
1420 int pid;
1421
1422 rcu_read_lock();
1423 pid = task_tgid_vnr(rcu_dereference(current->real_parent));
1424 rcu_read_unlock();
1425
1426 return pid;
1427}
1428
1429SYSCALL_DEFINE0(getuid)
1430{
1431 /* Only we change this so SMP safe */
1432 return from_kuid_munged(current_user_ns(), current_uid());
1433}
1434
1435SYSCALL_DEFINE0(geteuid)
1436{
1437 /* Only we change this so SMP safe */
1438 return from_kuid_munged(current_user_ns(), current_euid());
1439}
1440
1441SYSCALL_DEFINE0(getgid)
1442{
1443 /* Only we change this so SMP safe */
1444 return from_kgid_munged(current_user_ns(), current_gid());
1445}
1446
1447SYSCALL_DEFINE0(getegid)
1448{
1449 /* Only we change this so SMP safe */
1450 return from_kgid_munged(current_user_ns(), current_egid());
1451}
1452
1453static void process_timeout(unsigned long __data) 1399static void process_timeout(unsigned long __data)
1454{ 1400{
1455 wake_up_process((struct task_struct *)__data); 1401 wake_up_process((struct task_struct *)__data);
@@ -1557,91 +1503,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1557} 1503}
1558EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1504EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1559 1505
1560/* Thread ID - the internal kernel "pid" */
1561SYSCALL_DEFINE0(gettid)
1562{
1563 return task_pid_vnr(current);
1564}
1565
1566/**
1567 * do_sysinfo - fill in sysinfo struct
1568 * @info: pointer to buffer to fill
1569 */
1570int do_sysinfo(struct sysinfo *info)
1571{
1572 unsigned long mem_total, sav_total;
1573 unsigned int mem_unit, bitcount;
1574 struct timespec tp;
1575
1576 memset(info, 0, sizeof(struct sysinfo));
1577
1578 ktime_get_ts(&tp);
1579 monotonic_to_bootbased(&tp);
1580 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1581
1582 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
1583
1584 info->procs = nr_threads;
1585
1586 si_meminfo(info);
1587 si_swapinfo(info);
1588
1589 /*
1590 * If the sum of all the available memory (i.e. ram + swap)
1591 * is less than can be stored in a 32 bit unsigned long then
1592 * we can be binary compatible with 2.2.x kernels. If not,
1593 * well, in that case 2.2.x was broken anyways...
1594 *
1595 * -Erik Andersen <andersee@debian.org>
1596 */
1597
1598 mem_total = info->totalram + info->totalswap;
1599 if (mem_total < info->totalram || mem_total < info->totalswap)
1600 goto out;
1601 bitcount = 0;
1602 mem_unit = info->mem_unit;
1603 while (mem_unit > 1) {
1604 bitcount++;
1605 mem_unit >>= 1;
1606 sav_total = mem_total;
1607 mem_total <<= 1;
1608 if (mem_total < sav_total)
1609 goto out;
1610 }
1611
1612 /*
1613 * If mem_total did not overflow, multiply all memory values by
1614 * info->mem_unit and set it to 1. This leaves things compatible
1615 * with 2.2.x, and also retains compatibility with earlier 2.4.x
1616 * kernels...
1617 */
1618
1619 info->mem_unit = 1;
1620 info->totalram <<= bitcount;
1621 info->freeram <<= bitcount;
1622 info->sharedram <<= bitcount;
1623 info->bufferram <<= bitcount;
1624 info->totalswap <<= bitcount;
1625 info->freeswap <<= bitcount;
1626 info->totalhigh <<= bitcount;
1627 info->freehigh <<= bitcount;
1628
1629out:
1630 return 0;
1631}
1632
1633SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
1634{
1635 struct sysinfo val;
1636
1637 do_sysinfo(&val);
1638
1639 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
1640 return -EFAULT;
1641
1642 return 0;
1643}
1644
1645static int __cpuinit init_timers_cpu(int cpu) 1506static int __cpuinit init_timers_cpu(int cpu)
1646{ 1507{
1647 int j; 1508 int j;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 192473b22799..5e9efd4b83a4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -176,6 +176,8 @@ config IRQSOFF_TRACER
176 select GENERIC_TRACER 176 select GENERIC_TRACER
177 select TRACER_MAX_TRACE 177 select TRACER_MAX_TRACE
178 select RING_BUFFER_ALLOW_SWAP 178 select RING_BUFFER_ALLOW_SWAP
179 select TRACER_SNAPSHOT
180 select TRACER_SNAPSHOT_PER_CPU_SWAP
179 help 181 help
180 This option measures the time spent in irqs-off critical 182 This option measures the time spent in irqs-off critical
181 sections, with microsecond accuracy. 183 sections, with microsecond accuracy.
@@ -198,6 +200,8 @@ config PREEMPT_TRACER
198 select GENERIC_TRACER 200 select GENERIC_TRACER
199 select TRACER_MAX_TRACE 201 select TRACER_MAX_TRACE
200 select RING_BUFFER_ALLOW_SWAP 202 select RING_BUFFER_ALLOW_SWAP
203 select TRACER_SNAPSHOT
204 select TRACER_SNAPSHOT_PER_CPU_SWAP
201 help 205 help
202 This option measures the time spent in preemption-off critical 206 This option measures the time spent in preemption-off critical
203 sections, with microsecond accuracy. 207 sections, with microsecond accuracy.
@@ -217,6 +221,7 @@ config SCHED_TRACER
217 select GENERIC_TRACER 221 select GENERIC_TRACER
218 select CONTEXT_SWITCH_TRACER 222 select CONTEXT_SWITCH_TRACER
219 select TRACER_MAX_TRACE 223 select TRACER_MAX_TRACE
224 select TRACER_SNAPSHOT
220 help 225 help
221 This tracer tracks the latency of the highest priority task 226 This tracer tracks the latency of the highest priority task
222 to be scheduled in, starting from the point it has woken up. 227 to be scheduled in, starting from the point it has woken up.
@@ -248,6 +253,27 @@ config TRACER_SNAPSHOT
248 echo 1 > /sys/kernel/debug/tracing/snapshot 253 echo 1 > /sys/kernel/debug/tracing/snapshot
249 cat snapshot 254 cat snapshot
250 255
256config TRACER_SNAPSHOT_PER_CPU_SWAP
257 bool "Allow snapshot to swap per CPU"
258 depends on TRACER_SNAPSHOT
259 select RING_BUFFER_ALLOW_SWAP
260 help
261 Allow doing a snapshot of a single CPU buffer instead of a
262 full swap (all buffers). If this is set, then the following is
263 allowed:
264
265 echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
266
 267 After that, only the tracing buffer for CPU 2 is swapped with
 268 the main tracing buffer, and the other CPU buffers remain unchanged.
 269
 270 When this is enabled, it adds a little more overhead to trace
 271 recording, as extra checks are needed to synchronize recording
 272 with swaps. This does not affect the performance of the overall
 273 system. It is enabled by default when the preempt or irq latency
 274 tracers are enabled, as those need to swap as well and already
 275 add that overhead (plus a lot more).
276
251config TRACE_BRANCH_PROFILING 277config TRACE_BRANCH_PROFILING
252 bool 278 bool
253 select GENERIC_TRACER 279 select GENERIC_TRACER
@@ -414,24 +440,28 @@ config PROBE_EVENTS
414 def_bool n 440 def_bool n
415 441
416config DYNAMIC_FTRACE 442config DYNAMIC_FTRACE
417 bool "enable/disable ftrace tracepoints dynamically" 443 bool "enable/disable function tracing dynamically"
418 depends on FUNCTION_TRACER 444 depends on FUNCTION_TRACER
419 depends on HAVE_DYNAMIC_FTRACE 445 depends on HAVE_DYNAMIC_FTRACE
420 default y 446 default y
421 help 447 help
422 This option will modify all the calls to ftrace dynamically 448 This option will modify all the calls to function tracing
423 (will patch them out of the binary image and replace them 449 dynamically (will patch them out of the binary image and
424 with a No-Op instruction) as they are called. A table is 450 replace them with a No-Op instruction) on boot up. During
425 created to dynamically enable them again. 451 compile time, a table is made of all the locations that ftrace
452 can function trace, and this table is linked into the kernel
453 image. When this is enabled, functions can be individually
454 enabled, and the functions not enabled will not affect
455 performance of the system.
456
457 See the files in /sys/kernel/debug/tracing:
458 available_filter_functions
459 set_ftrace_filter
460 set_ftrace_notrace
426 461
427 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but 462 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
428 otherwise has native performance as long as no tracing is active. 463 otherwise has native performance as long as no tracing is active.
429 464
430 The changes to the code are done by a kernel thread that
431 wakes up once a second and checks to see if any ftrace calls
432 were made. If so, it runs stop_machine (stops all CPUS)
433 and modifies the code to jump over the call to ftrace.
434
435config DYNAMIC_FTRACE_WITH_REGS 465config DYNAMIC_FTRACE_WITH_REGS
436 def_bool y 466 def_bool y
437 depends on DYNAMIC_FTRACE 467 depends on DYNAMIC_FTRACE
@@ -520,6 +550,29 @@ config RING_BUFFER_BENCHMARK
520 550
521 If unsure, say N. 551 If unsure, say N.
522 552
553config RING_BUFFER_STARTUP_TEST
554 bool "Ring buffer startup self test"
555 depends on RING_BUFFER
556 help
557 Run a simple self test on the ring buffer on boot up. Late in the
558 kernel boot sequence, the test will start that kicks off
559 a thread per cpu. Each thread will write various size events
560 into the ring buffer. Another thread is created to send IPIs
561 to each of the threads, where the IPI handler will also write
562 to the ring buffer, to test/stress the nesting ability.
563 If any anomalies are discovered, a warning will be displayed
564 and all ring buffers will be disabled.
565
566 The test runs for 10 seconds. This will slow your boot time
567 by at least 10 more seconds.
568
 569 At the end of the test, statistics and further checks are done.
 570 It will output the stats of each per-CPU buffer: what
571 was written, the sizes, what was read, what was lost, and
572 other similar details.
573
 574 If unsure, say N.
575
523endif # FTRACE 576endif # FTRACE
524 577
525endif # TRACING_SUPPORT 578endif # TRACING_SUPPORT
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9e5b8c272eec..ed58a3216a6d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
72 bool blk_tracer = blk_tracer_enabled; 72 bool blk_tracer = blk_tracer_enabled;
73 73
74 if (blk_tracer) { 74 if (blk_tracer) {
75 buffer = blk_tr->buffer; 75 buffer = blk_tr->trace_buffer.buffer;
76 pc = preempt_count(); 76 pc = preempt_count();
77 event = trace_buffer_lock_reserve(buffer, TRACE_BLK, 77 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
78 sizeof(*t) + len, 78 sizeof(*t) + len,
@@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
218 if (blk_tracer) { 218 if (blk_tracer) {
219 tracing_record_cmdline(current); 219 tracing_record_cmdline(current);
220 220
221 buffer = blk_tr->buffer; 221 buffer = blk_tr->trace_buffer.buffer;
222 pc = preempt_count(); 222 pc = preempt_count();
223 event = trace_buffer_lock_reserve(buffer, TRACE_BLK, 223 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
224 sizeof(*t) + pdu_len, 224 sizeof(*t) + pdu_len,
@@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore,
739 struct request_queue *q, 739 struct request_queue *q,
740 struct request *rq) 740 struct request *rq)
741{ 741{
742 struct blk_trace *bt = q->blk_trace;
743
744 /* if control ever passes through here, it's a request based driver */
745 if (unlikely(bt && !bt->rq_based))
746 bt->rq_based = true;
747
748 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 742 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
749} 743}
750 744
@@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore,
780 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); 774 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
781} 775}
782 776
783static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) 777static void blk_add_trace_bio_complete(void *ignore,
778 struct request_queue *q, struct bio *bio,
779 int error)
784{ 780{
785 struct request_queue *q;
786 struct blk_trace *bt;
787
788 if (!bio->bi_bdev)
789 return;
790
791 q = bdev_get_queue(bio->bi_bdev);
792 bt = q->blk_trace;
793
794 /*
795 * Request based drivers will generate both rq and bio completions.
796 * Ignore bio ones.
797 */
798 if (likely(!bt) || bt->rq_based)
799 return;
800
801 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); 781 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
802} 782}
803 783
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ab25b88aae56..8a5c017bb50c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -66,7 +66,7 @@
66 66
67static struct ftrace_ops ftrace_list_end __read_mostly = { 67static struct ftrace_ops ftrace_list_end __read_mostly = {
68 .func = ftrace_stub, 68 .func = ftrace_stub,
69 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 69 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
70}; 70};
71 71
72/* ftrace_enabled is a method to turn ftrace on or off */ 72/* ftrace_enabled is a method to turn ftrace on or off */
@@ -486,7 +486,6 @@ struct ftrace_profile_stat {
486#define PROFILES_PER_PAGE \ 486#define PROFILES_PER_PAGE \
487 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) 487 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
488 488
489static int ftrace_profile_bits __read_mostly;
490static int ftrace_profile_enabled __read_mostly; 489static int ftrace_profile_enabled __read_mostly;
491 490
492/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ 491/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
@@ -494,7 +493,8 @@ static DEFINE_MUTEX(ftrace_profile_lock);
494 493
495static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); 494static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
496 495
497#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ 496#define FTRACE_PROFILE_HASH_BITS 10
497#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)
498 498
499static void * 499static void *
500function_stat_next(void *v, int idx) 500function_stat_next(void *v, int idx)
@@ -676,7 +676,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
676 676
677 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); 677 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
678 678
679 for (i = 0; i < pages; i++) { 679 for (i = 1; i < pages; i++) {
680 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 680 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
681 if (!pg->next) 681 if (!pg->next)
682 goto out_free; 682 goto out_free;
@@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
694 free_page(tmp); 694 free_page(tmp);
695 } 695 }
696 696
697 free_page((unsigned long)stat->pages);
698 stat->pages = NULL; 697 stat->pages = NULL;
699 stat->start = NULL; 698 stat->start = NULL;
700 699
@@ -725,13 +724,6 @@ static int ftrace_profile_init_cpu(int cpu)
725 if (!stat->hash) 724 if (!stat->hash)
726 return -ENOMEM; 725 return -ENOMEM;
727 726
728 if (!ftrace_profile_bits) {
729 size--;
730
731 for (; size; size >>= 1)
732 ftrace_profile_bits++;
733 }
734
735 /* Preallocate the function profiling pages */ 727 /* Preallocate the function profiling pages */
736 if (ftrace_profile_pages_init(stat) < 0) { 728 if (ftrace_profile_pages_init(stat) < 0) {
737 kfree(stat->hash); 729 kfree(stat->hash);
@@ -764,7 +756,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
764 struct hlist_head *hhd; 756 struct hlist_head *hhd;
765 unsigned long key; 757 unsigned long key;
766 758
767 key = hash_long(ip, ftrace_profile_bits); 759 key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);
768 hhd = &stat->hash[key]; 760 hhd = &stat->hash[key];
769 761
770 if (hlist_empty(hhd)) 762 if (hlist_empty(hhd))
@@ -783,7 +775,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,
783{ 775{
784 unsigned long key; 776 unsigned long key;
785 777
786 key = hash_long(rec->ip, ftrace_profile_bits); 778 key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);
787 hlist_add_head_rcu(&rec->node, &stat->hash[key]); 779 hlist_add_head_rcu(&rec->node, &stat->hash[key]);
788} 780}
789 781
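Both hunks above keep the same fixed-bits hashing idiom: size the table as 1 << bits and pick a bucket with hash_long(key, bits). In isolation (hypothetical foo names):

#include <linux/hash.h>
#include <linux/list.h>

#define FOO_HASH_BITS	10
#define FOO_HASH_SIZE	(1 << FOO_HASH_BITS)

static struct hlist_head foo_hash[FOO_HASH_SIZE];

struct foo_rec {
	struct hlist_node node;
	unsigned long ip;
};

static void foo_add(struct foo_rec *rec)
{
	unsigned long key = hash_long(rec->ip, FOO_HASH_BITS);

	hlist_add_head(&rec->node, &foo_hash[key]);
}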
@@ -1053,6 +1045,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
1053 1045
1054static struct pid * const ftrace_swapper_pid = &init_struct_pid; 1046static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1055 1047
1048loff_t
1049ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
1050{
1051 loff_t ret;
1052
1053 if (file->f_mode & FMODE_READ)
1054 ret = seq_lseek(file, offset, whence);
1055 else
1056 file->f_pos = ret = 1;
1057
1058 return ret;
1059}
1060
1056#ifdef CONFIG_DYNAMIC_FTRACE 1061#ifdef CONFIG_DYNAMIC_FTRACE
1057 1062
1058#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1063#ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -1067,7 +1072,7 @@ struct ftrace_func_probe {
1067 unsigned long flags; 1072 unsigned long flags;
1068 unsigned long ip; 1073 unsigned long ip;
1069 void *data; 1074 void *data;
1070 struct rcu_head rcu; 1075 struct list_head free_list;
1071}; 1076};
1072 1077
1073struct ftrace_func_entry { 1078struct ftrace_func_entry {
@@ -1317,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1317 struct hlist_head *hhd; 1322 struct hlist_head *hhd;
1318 struct ftrace_hash *old_hash; 1323 struct ftrace_hash *old_hash;
1319 struct ftrace_hash *new_hash; 1324 struct ftrace_hash *new_hash;
1320 unsigned long key;
1321 int size = src->count; 1325 int size = src->count;
1322 int bits = 0; 1326 int bits = 0;
1323 int ret; 1327 int ret;
@@ -1360,10 +1364,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1360 for (i = 0; i < size; i++) { 1364 for (i = 0; i < size; i++) {
1361 hhd = &src->buckets[i]; 1365 hhd = &src->buckets[i];
1362 hlist_for_each_entry_safe(entry, tn, hhd, hlist) { 1366 hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
1363 if (bits > 0)
1364 key = hash_long(entry->ip, bits);
1365 else
1366 key = 0;
1367 remove_hash_entry(src, entry); 1367 remove_hash_entry(src, entry);
1368 __add_hash_entry(new_hash, entry); 1368 __add_hash_entry(new_hash, entry);
1369 } 1369 }
@@ -2613,7 +2613,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2613 * routine, you can use ftrace_filter_write() for the write 2613 * routine, you can use ftrace_filter_write() for the write
2614 * routine if @flag has FTRACE_ITER_FILTER set, or 2614 * routine if @flag has FTRACE_ITER_FILTER set, or
2615 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. 2615 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2616 * ftrace_regex_lseek() should be used as the lseek routine, and 2616 * ftrace_filter_lseek() should be used as the lseek routine, and
2617 * release must call ftrace_regex_release(). 2617 * release must call ftrace_regex_release().
2618 */ 2618 */
2619int 2619int
@@ -2697,19 +2697,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2697 inode, file); 2697 inode, file);
2698} 2698}
2699 2699
2700loff_t
2701ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
2702{
2703 loff_t ret;
2704
2705 if (file->f_mode & FMODE_READ)
2706 ret = seq_lseek(file, offset, whence);
2707 else
2708 file->f_pos = ret = 1;
2709
2710 return ret;
2711}
2712
2713static int ftrace_match(char *str, char *regex, int len, int type) 2700static int ftrace_match(char *str, char *regex, int len, int type)
2714{ 2701{
2715 int matched = 0; 2702 int matched = 0;
@@ -2974,28 +2961,27 @@ static void __disable_ftrace_function_probe(void)
2974} 2961}
2975 2962
2976 2963
2977static void ftrace_free_entry_rcu(struct rcu_head *rhp) 2964static void ftrace_free_entry(struct ftrace_func_probe *entry)
2978{ 2965{
2979 struct ftrace_func_probe *entry =
2980 container_of(rhp, struct ftrace_func_probe, rcu);
2981
2982 if (entry->ops->free) 2966 if (entry->ops->free)
2983 entry->ops->free(&entry->data); 2967 entry->ops->free(entry->ops, entry->ip, &entry->data);
2984 kfree(entry); 2968 kfree(entry);
2985} 2969}
2986 2970
2987
2988int 2971int
2989register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 2972register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2990 void *data) 2973 void *data)
2991{ 2974{
2992 struct ftrace_func_probe *entry; 2975 struct ftrace_func_probe *entry;
2976 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
2977 struct ftrace_hash *hash;
2993 struct ftrace_page *pg; 2978 struct ftrace_page *pg;
2994 struct dyn_ftrace *rec; 2979 struct dyn_ftrace *rec;
2995 int type, len, not; 2980 int type, len, not;
2996 unsigned long key; 2981 unsigned long key;
2997 int count = 0; 2982 int count = 0;
2998 char *search; 2983 char *search;
2984 int ret;
2999 2985
3000 type = filter_parse_regex(glob, strlen(glob), &search, &not); 2986 type = filter_parse_regex(glob, strlen(glob), &search, &not);
3001 len = strlen(search); 2987 len = strlen(search);
@@ -3006,8 +2992,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3006 2992
3007 mutex_lock(&ftrace_lock); 2993 mutex_lock(&ftrace_lock);
3008 2994
3009 if (unlikely(ftrace_disabled)) 2995 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
2996 if (!hash) {
2997 count = -ENOMEM;
2998 goto out_unlock;
2999 }
3000
3001 if (unlikely(ftrace_disabled)) {
3002 count = -ENODEV;
3010 goto out_unlock; 3003 goto out_unlock;
3004 }
3011 3005
3012 do_for_each_ftrace_rec(pg, rec) { 3006 do_for_each_ftrace_rec(pg, rec) {
3013 3007
@@ -3031,14 +3025,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3031 * for each function we find. We call the callback 3025 * for each function we find. We call the callback
3032 * to give the caller an opportunity to do so. 3026 * to give the caller an opportunity to do so.
3033 */ 3027 */
3034 if (ops->callback) { 3028 if (ops->init) {
3035 if (ops->callback(rec->ip, &entry->data) < 0) { 3029 if (ops->init(ops, rec->ip, &entry->data) < 0) {
3036 /* caller does not like this func */ 3030 /* caller does not like this func */
3037 kfree(entry); 3031 kfree(entry);
3038 continue; 3032 continue;
3039 } 3033 }
3040 } 3034 }
3041 3035
3036 ret = enter_record(hash, rec, 0);
3037 if (ret < 0) {
3038 kfree(entry);
3039 count = ret;
3040 goto out_unlock;
3041 }
3042
3042 entry->ops = ops; 3043 entry->ops = ops;
3043 entry->ip = rec->ip; 3044 entry->ip = rec->ip;
3044 3045
@@ -3046,10 +3047,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3046 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); 3047 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
3047 3048
3048 } while_for_each_ftrace_rec(); 3049 } while_for_each_ftrace_rec();
3050
3051 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3052 if (ret < 0)
3053 count = ret;
3054
3049 __enable_ftrace_function_probe(); 3055 __enable_ftrace_function_probe();
3050 3056
3051 out_unlock: 3057 out_unlock:
3052 mutex_unlock(&ftrace_lock); 3058 mutex_unlock(&ftrace_lock);
3059 free_ftrace_hash(hash);
3053 3060
3054 return count; 3061 return count;
3055} 3062}
@@ -3063,7 +3070,12 @@ static void
3063__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3070__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3064 void *data, int flags) 3071 void *data, int flags)
3065{ 3072{
3073 struct ftrace_func_entry *rec_entry;
3066 struct ftrace_func_probe *entry; 3074 struct ftrace_func_probe *entry;
3075 struct ftrace_func_probe *p;
3076 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
3077 struct list_head free_list;
3078 struct ftrace_hash *hash;
3067 struct hlist_node *tmp; 3079 struct hlist_node *tmp;
3068 char str[KSYM_SYMBOL_LEN]; 3080 char str[KSYM_SYMBOL_LEN];
3069 int type = MATCH_FULL; 3081 int type = MATCH_FULL;
@@ -3084,6 +3096,14 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3084 } 3096 }
3085 3097
3086 mutex_lock(&ftrace_lock); 3098 mutex_lock(&ftrace_lock);
3099
3100 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3101 if (!hash)
3102 /* Hmm, should report this somehow */
3103 goto out_unlock;
3104
3105 INIT_LIST_HEAD(&free_list);
3106
3087 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3107 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3088 struct hlist_head *hhd = &ftrace_func_hash[i]; 3108 struct hlist_head *hhd = &ftrace_func_hash[i];
3089 3109
@@ -3104,12 +3124,30 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3104 continue; 3124 continue;
3105 } 3125 }
3106 3126
3107 hlist_del(&entry->node); 3127 rec_entry = ftrace_lookup_ip(hash, entry->ip);
3108 call_rcu(&entry->rcu, ftrace_free_entry_rcu); 3128 /* It is possible more than one entry had this ip */
3129 if (rec_entry)
3130 free_hash_entry(hash, rec_entry);
3131
3132 hlist_del_rcu(&entry->node);
3133 list_add(&entry->free_list, &free_list);
3109 } 3134 }
3110 } 3135 }
3111 __disable_ftrace_function_probe(); 3136 __disable_ftrace_function_probe();
3137 /*
3138 * Remove after the disable is called. Otherwise, if the last
3139 * probe is removed, a null hash means *all enabled*.
3140 */
3141 ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3142 synchronize_sched();
3143 list_for_each_entry_safe(entry, p, &free_list, free_list) {
3144 list_del(&entry->free_list);
3145 ftrace_free_entry(entry);
3146 }
3147
3148 out_unlock:
3112 mutex_unlock(&ftrace_lock); 3149 mutex_unlock(&ftrace_lock);
3150 free_ftrace_hash(hash);
3113} 3151}
3114 3152
3115void 3153void
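The unregister path above uses the standard RCU removal sequence: unlink with hlist_del_rcu(), park the entries on a local list, wait out a grace period, then free. In outline (hypothetical foo names; synchronize_sched() as this code base still uses it):

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo_probe {
	struct hlist_node node;		/* hashed, read under RCU */
	struct list_head free_list;	/* local collection for freeing */
};

static void foo_remove_all(struct hlist_head *hhd)
{
	struct foo_probe *p, *n;
	struct hlist_node *tmp;
	LIST_HEAD(free_list);

	hlist_for_each_entry_safe(p, tmp, hhd, node) {
		hlist_del_rcu(&p->node);
		list_add(&p->free_list, &free_list);
	}

	synchronize_sched();		/* readers of the old list are done */

	list_for_each_entry_safe(p, n, &free_list, free_list) {
		list_del(&p->free_list);
		kfree(p);
	}
}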
@@ -3441,14 +3479,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
3441 3479
3442static int __init set_ftrace_notrace(char *str) 3480static int __init set_ftrace_notrace(char *str)
3443{ 3481{
3444 strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); 3482 strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
3445 return 1; 3483 return 1;
3446} 3484}
3447__setup("ftrace_notrace=", set_ftrace_notrace); 3485__setup("ftrace_notrace=", set_ftrace_notrace);
3448 3486
3449static int __init set_ftrace_filter(char *str) 3487static int __init set_ftrace_filter(char *str)
3450{ 3488{
3451 strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); 3489 strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
3452 return 1; 3490 return 1;
3453} 3491}
3454__setup("ftrace_filter=", set_ftrace_filter); 3492__setup("ftrace_filter=", set_ftrace_filter);
@@ -3571,7 +3609,7 @@ static const struct file_operations ftrace_filter_fops = {
3571 .open = ftrace_filter_open, 3609 .open = ftrace_filter_open,
3572 .read = seq_read, 3610 .read = seq_read,
3573 .write = ftrace_filter_write, 3611 .write = ftrace_filter_write,
3574 .llseek = ftrace_regex_lseek, 3612 .llseek = ftrace_filter_lseek,
3575 .release = ftrace_regex_release, 3613 .release = ftrace_regex_release,
3576}; 3614};
3577 3615
@@ -3579,7 +3617,7 @@ static const struct file_operations ftrace_notrace_fops = {
3579 .open = ftrace_notrace_open, 3617 .open = ftrace_notrace_open,
3580 .read = seq_read, 3618 .read = seq_read,
3581 .write = ftrace_notrace_write, 3619 .write = ftrace_notrace_write,
3582 .llseek = ftrace_regex_lseek, 3620 .llseek = ftrace_filter_lseek,
3583 .release = ftrace_regex_release, 3621 .release = ftrace_regex_release,
3584}; 3622};
3585 3623
@@ -3737,7 +3775,8 @@ out:
3737 if (fail) 3775 if (fail)
3738 return -EINVAL; 3776 return -EINVAL;
3739 3777
3740 ftrace_graph_filter_enabled = 1; 3778 ftrace_graph_filter_enabled = !!(*idx);
3779
3741 return 0; 3780 return 0;
3742} 3781}
3743 3782
@@ -3784,8 +3823,8 @@ static const struct file_operations ftrace_graph_fops = {
3784 .open = ftrace_graph_open, 3823 .open = ftrace_graph_open,
3785 .read = seq_read, 3824 .read = seq_read,
3786 .write = ftrace_graph_write, 3825 .write = ftrace_graph_write,
3826 .llseek = ftrace_filter_lseek,
3787 .release = ftrace_graph_release, 3827 .release = ftrace_graph_release,
3788 .llseek = seq_lseek,
3789}; 3828};
3790#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3829#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3791 3830
@@ -4131,7 +4170,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4131 preempt_disable_notrace(); 4170 preempt_disable_notrace();
4132 trace_recursion_set(TRACE_CONTROL_BIT); 4171 trace_recursion_set(TRACE_CONTROL_BIT);
4133 do_for_each_ftrace_op(op, ftrace_control_list) { 4172 do_for_each_ftrace_op(op, ftrace_control_list) {
4134 if (!ftrace_function_local_disabled(op) && 4173 if (!(op->flags & FTRACE_OPS_FL_STUB) &&
4174 !ftrace_function_local_disabled(op) &&
4135 ftrace_ops_test(op, ip)) 4175 ftrace_ops_test(op, ip))
4136 op->func(ip, parent_ip, op, regs); 4176 op->func(ip, parent_ip, op, regs);
4137 } while_for_each_ftrace_op(op); 4177 } while_for_each_ftrace_op(op);
@@ -4439,7 +4479,7 @@ static const struct file_operations ftrace_pid_fops = {
4439 .open = ftrace_pid_open, 4479 .open = ftrace_pid_open,
4440 .write = ftrace_pid_write, 4480 .write = ftrace_pid_write,
4441 .read = seq_read, 4481 .read = seq_read,
4442 .llseek = seq_lseek, 4482 .llseek = ftrace_filter_lseek,
4443 .release = ftrace_pid_release, 4483 .release = ftrace_pid_release,
4444}; 4484};
4445 4485
@@ -4555,12 +4595,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
4555 ftrace_startup_sysctl(); 4595 ftrace_startup_sysctl();
4556 4596
4557 /* we are starting ftrace again */ 4597 /* we are starting ftrace again */
4558 if (ftrace_ops_list != &ftrace_list_end) { 4598 if (ftrace_ops_list != &ftrace_list_end)
4559 if (ftrace_ops_list->next == &ftrace_list_end) 4599 update_ftrace_function();
4560 ftrace_trace_function = ftrace_ops_list->func;
4561 else
4562 ftrace_trace_function = ftrace_ops_list_func;
4563 }
4564 4600
4565 } else { 4601 } else {
4566 /* stopping ftrace calls (just send to ftrace_stub) */ 4602 /* stopping ftrace calls (just send to ftrace_stub) */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6989df2ba194..b59aea2c48c2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -8,13 +8,16 @@
8#include <linux/trace_clock.h> 8#include <linux/trace_clock.h>
9#include <linux/trace_seq.h> 9#include <linux/trace_seq.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/irq_work.h>
11#include <linux/debugfs.h> 12#include <linux/debugfs.h>
12#include <linux/uaccess.h> 13#include <linux/uaccess.h>
13#include <linux/hardirq.h> 14#include <linux/hardirq.h>
15#include <linux/kthread.h> /* for self test */
14#include <linux/kmemcheck.h> 16#include <linux/kmemcheck.h>
15#include <linux/module.h> 17#include <linux/module.h>
16#include <linux/percpu.h> 18#include <linux/percpu.h>
17#include <linux/mutex.h> 19#include <linux/mutex.h>
20#include <linux/delay.h>
18#include <linux/slab.h> 21#include <linux/slab.h>
19#include <linux/init.h> 22#include <linux/init.h>
20#include <linux/hash.h> 23#include <linux/hash.h>
@@ -444,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
444 return ret; 447 return ret;
445} 448}
446 449
450struct rb_irq_work {
451 struct irq_work work;
452 wait_queue_head_t waiters;
453 bool waiters_pending;
454};
455
447/* 456/*
448 * head_page == tail_page && head == tail then buffer is empty. 457 * head_page == tail_page && head == tail then buffer is empty.
449 */ 458 */
@@ -478,6 +487,8 @@ struct ring_buffer_per_cpu {
478 struct list_head new_pages; /* new pages to add */ 487 struct list_head new_pages; /* new pages to add */
479 struct work_struct update_pages_work; 488 struct work_struct update_pages_work;
480 struct completion update_done; 489 struct completion update_done;
490
491 struct rb_irq_work irq_work;
481}; 492};
482 493
483struct ring_buffer { 494struct ring_buffer {
@@ -497,6 +508,8 @@ struct ring_buffer {
497 struct notifier_block cpu_notify; 508 struct notifier_block cpu_notify;
498#endif 509#endif
499 u64 (*clock)(void); 510 u64 (*clock)(void);
511
512 struct rb_irq_work irq_work;
500}; 513};
501 514
502struct ring_buffer_iter { 515struct ring_buffer_iter {
@@ -508,6 +521,118 @@ struct ring_buffer_iter {
508 u64 read_stamp; 521 u64 read_stamp;
509}; 522};
510 523
524/*
525 * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
526 *
527 * This is the irq_work callback queued by a writer; it wakes up any
528 * task that is blocked on the ring buffer waiters queue.
529 */
530static void rb_wake_up_waiters(struct irq_work *work)
531{
532 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
533
534 wake_up_all(&rbwork->waiters);
535}
536
537/**
538 * ring_buffer_wait - wait for input to the ring buffer
539 * @buffer: buffer to wait on
540 * @cpu: the cpu buffer to wait on
541 *
542 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
543 * as data is added to any of the @buffer's cpu buffers. Otherwise
544 * it will wait for data to be added to a specific cpu buffer.
545 */
546void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
547{
548 struct ring_buffer_per_cpu *cpu_buffer;
549 DEFINE_WAIT(wait);
550 struct rb_irq_work *work;
551
552 /*
553 * Depending on what the caller is waiting for (data in any
554 * cpu buffer, or in one specific cpu buffer), put the
555 * caller on the appropriate wait queue.
556 */
557 if (cpu == RING_BUFFER_ALL_CPUS)
558 work = &buffer->irq_work;
559 else {
560 cpu_buffer = buffer->buffers[cpu];
561 work = &cpu_buffer->irq_work;
562 }
563
564
565 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
566
567 /*
568 * The events can happen in critical sections where
569 * checking a work queue can cause deadlocks.
570 * After adding a task to the queue, this flag is set
571 * only to notify events to try to wake up the queue
572 * using irq_work.
573 *
574 * We don't clear it even if the buffer is no longer
575 * empty. The flag only causes the next event to run
576 * irq_work to do the work queue wake up. The worst
577 * that can happen if we race with !trace_empty() is that
578 * an event will cause an irq_work to try to wake up
579 * an empty queue.
580 *
581 * There's no reason to protect this flag either, as
582 * the work queue and irq_work logic will do the necessary
583 * synchronization for the wake ups. The only thing
584 * that is necessary is that the wake up happens after
585 * a task has been queued. It's OK for spurious wake ups.
586 */
587 work->waiters_pending = true;
588
589 if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
590 (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
591 schedule();
592
593 finish_wait(&work->waiters, &wait);
594}
595
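For illustration only (not part of the patch): a minimal sketch of how a kernel-side reader could pair with the wait API added above. The consumer function and its pr_info() reporting are hypothetical; only ring_buffer_wait(), ring_buffer_consume() and ring_buffer_event_length() are existing APIs.

#include <linux/ring_buffer.h>
#include <linux/printk.h>

/* Hypothetical consumer: block until a writer commits, then drain @cpu. */
static void example_drain_cpu(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_event *event;
	unsigned long lost;
	u64 ts;

	/* Sleeps until rb_wakeups() queues the irq_work from a commit. */
	ring_buffer_wait(buffer, cpu);

	while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost)))
		pr_info("got %u byte event, %lu lost\n",
			ring_buffer_event_length(event), lost);
}
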
596/**
597 * ring_buffer_poll_wait - poll on buffer input
598 * @buffer: buffer to wait on
599 * @cpu: the cpu buffer to wait on
600 * @filp: the file descriptor
601 * @poll_table: The poll descriptor
602 *
603 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
604 * as data is added to any of the @buffer's cpu buffers. Otherwise
605 * it will wait for data to be added to a specific cpu buffer.
606 *
607 * Returns POLLIN | POLLRDNORM if data exists in the buffers,
608 * zero otherwise.
609 */
610int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
611 struct file *filp, poll_table *poll_table)
612{
613 struct ring_buffer_per_cpu *cpu_buffer;
614 struct rb_irq_work *work;
615
616 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
617 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
618 return POLLIN | POLLRDNORM;
619
620 if (cpu == RING_BUFFER_ALL_CPUS)
621 work = &buffer->irq_work;
622 else {
623 cpu_buffer = buffer->buffers[cpu];
624 work = &cpu_buffer->irq_work;
625 }
626
627 work->waiters_pending = true;
628 poll_wait(filp, &work->waiters, poll_table);
629
630 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
631 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
632 return POLLIN | POLLRDNORM;
633 return 0;
634}
635
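Another sketch, also not part of the patch: how a debugfs file's .poll method might forward to ring_buffer_poll_wait(). The example_priv layout and the function name are hypothetical.

#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ring_buffer.h>

struct example_priv {			/* hypothetical per-file state */
	struct ring_buffer	*buffer;
	int			cpu;	/* or RING_BUFFER_ALL_CPUS */
};

static unsigned int example_poll(struct file *filp, poll_table *pt)
{
	struct example_priv *p = filp->private_data;

	/* Registers @filp on the waiters queue and reports readability. */
	return ring_buffer_poll_wait(p->buffer, p->cpu, filp, pt);
}
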
511/* buffer may be either ring_buffer or ring_buffer_per_cpu */ 636/* buffer may be either ring_buffer or ring_buffer_per_cpu */
512#define RB_WARN_ON(b, cond) \ 637#define RB_WARN_ON(b, cond) \
513 ({ \ 638 ({ \
@@ -1063,6 +1188,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1063 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1188 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1064 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); 1189 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
1065 init_completion(&cpu_buffer->update_done); 1190 init_completion(&cpu_buffer->update_done);
1191 init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
1192 init_waitqueue_head(&cpu_buffer->irq_work.waiters);
1066 1193
1067 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1194 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1068 GFP_KERNEL, cpu_to_node(cpu)); 1195 GFP_KERNEL, cpu_to_node(cpu));
@@ -1158,6 +1285,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1158 buffer->clock = trace_clock_local; 1285 buffer->clock = trace_clock_local;
1159 buffer->reader_lock_key = key; 1286 buffer->reader_lock_key = key;
1160 1287
1288 init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
1289 init_waitqueue_head(&buffer->irq_work.waiters);
1290
1161 /* need at least two pages */ 1291 /* need at least two pages */
1162 if (nr_pages < 2) 1292 if (nr_pages < 2)
1163 nr_pages = 2; 1293 nr_pages = 2;
@@ -1553,11 +1683,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1553 if (!cpu_buffer->nr_pages_to_update) 1683 if (!cpu_buffer->nr_pages_to_update)
1554 continue; 1684 continue;
1555 1685
1556 if (cpu_online(cpu)) 1686 /* The update must run on the CPU that is being updated. */
1687 preempt_disable();
1688 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
1689 rb_update_pages(cpu_buffer);
1690 cpu_buffer->nr_pages_to_update = 0;
1691 } else {
1692 /*
1693 * Can not disable preemption for schedule_work_on()
1694 * on PREEMPT_RT.
1695 */
1696 preempt_enable();
1557 schedule_work_on(cpu, 1697 schedule_work_on(cpu,
1558 &cpu_buffer->update_pages_work); 1698 &cpu_buffer->update_pages_work);
1559 else 1699 preempt_disable();
1560 rb_update_pages(cpu_buffer); 1700 }
1701 preempt_enable();
1561 } 1702 }
1562 1703
1563 /* wait for all the updates to complete */ 1704 /* wait for all the updates to complete */
@@ -1595,12 +1736,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1595 1736
1596 get_online_cpus(); 1737 get_online_cpus();
1597 1738
1598 if (cpu_online(cpu_id)) { 1739 preempt_disable();
1740 /* The update must run on the CPU that is being updated. */
1741 if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
1742 rb_update_pages(cpu_buffer);
1743 else {
1744 /*
1745 * Can not disable preemption for schedule_work_on()
1746 * on PREEMPT_RT.
1747 */
1748 preempt_enable();
1599 schedule_work_on(cpu_id, 1749 schedule_work_on(cpu_id,
1600 &cpu_buffer->update_pages_work); 1750 &cpu_buffer->update_pages_work);
1601 wait_for_completion(&cpu_buffer->update_done); 1751 wait_for_completion(&cpu_buffer->update_done);
1602 } else 1752 preempt_disable();
1603 rb_update_pages(cpu_buffer); 1753 }
1754 preempt_enable();
1604 1755
1605 cpu_buffer->nr_pages_to_update = 0; 1756 cpu_buffer->nr_pages_to_update = 0;
1606 put_online_cpus(); 1757 put_online_cpus();
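For context only (not part of the patch): the general shape of the pattern used in both resize paths above, with hypothetical names. The update runs inline when the current CPU is the target (or the target is offline); otherwise it is handed to a work item pinned to that CPU, with preemption re-enabled first because schedule_work_on() cannot be called with preemption disabled on PREEMPT_RT.

#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/workqueue.h>

/* Hypothetical helper showing the pattern; @work is expected to perform
 * the same update as @local_update when it runs on @cpu. */
static void example_update_on_cpu(int cpu, struct work_struct *work,
				  void (*local_update)(void *), void *arg)
{
	preempt_disable();
	if (cpu == smp_processor_id() || !cpu_online(cpu)) {
		local_update(arg);
		preempt_enable();
		return;
	}
	/* Drop preemption before schedule_work_on() (PREEMPT_RT). */
	preempt_enable();
	schedule_work_on(cpu, work);
}
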
@@ -2612,6 +2763,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2612 rb_end_commit(cpu_buffer); 2763 rb_end_commit(cpu_buffer);
2613} 2764}
2614 2765
2766static __always_inline void
2767rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2768{
2769 if (buffer->irq_work.waiters_pending) {
2770 buffer->irq_work.waiters_pending = false;
2771		/* irq_work_queue() supplies its own memory barriers */
2772 irq_work_queue(&buffer->irq_work.work);
2773 }
2774
2775 if (cpu_buffer->irq_work.waiters_pending) {
2776 cpu_buffer->irq_work.waiters_pending = false;
2777		/* irq_work_queue() supplies its own memory barriers */
2778 irq_work_queue(&cpu_buffer->irq_work.work);
2779 }
2780}
2781
2615/** 2782/**
2616 * ring_buffer_unlock_commit - commit a reserved 2783 * ring_buffer_unlock_commit - commit a reserved
2617 * @buffer: The buffer to commit to 2784 * @buffer: The buffer to commit to
@@ -2631,6 +2798,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2631 2798
2632 rb_commit(cpu_buffer, event); 2799 rb_commit(cpu_buffer, event);
2633 2800
2801 rb_wakeups(buffer, cpu_buffer);
2802
2634 trace_recursive_unlock(); 2803 trace_recursive_unlock();
2635 2804
2636 preempt_enable_notrace(); 2805 preempt_enable_notrace();
@@ -2803,6 +2972,8 @@ int ring_buffer_write(struct ring_buffer *buffer,
2803 2972
2804 rb_commit(cpu_buffer, event); 2973 rb_commit(cpu_buffer, event);
2805 2974
2975 rb_wakeups(buffer, cpu_buffer);
2976
2806 ret = 0; 2977 ret = 0;
2807 out: 2978 out:
2808 preempt_enable_notrace(); 2979 preempt_enable_notrace();
@@ -4467,3 +4638,320 @@ static int rb_cpu_notify(struct notifier_block *self,
4467 return NOTIFY_OK; 4638 return NOTIFY_OK;
4468} 4639}
4469#endif 4640#endif
4641
4642#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
4643/*
4644 * This is a basic integrity check of the ring buffer.
4645 * Late in the boot cycle this test will run when configured in.
4646 * It will kick off a thread per CPU that will go into a loop
4647 * writing to the per cpu ring buffer various sizes of data.
4648 * Some of the data will be large items, some small.
4649 *
4650 * Another thread is created that goes into a spin, sending out
4651 * IPIs to the other CPUs to also write into the ring buffer.
4652 * This is to test the nesting ability of the buffer.
4653 *
4654 * Basic stats are recorded and reported. If something unexpected
4655 * happens in the ring buffer, a big warning
4656 * is displayed and all ring buffers are disabled.
4657 */
4658static struct task_struct *rb_threads[NR_CPUS] __initdata;
4659
4660struct rb_test_data {
4661 struct ring_buffer *buffer;
4662 unsigned long events;
4663 unsigned long bytes_written;
4664 unsigned long bytes_alloc;
4665 unsigned long bytes_dropped;
4666 unsigned long events_nested;
4667 unsigned long bytes_written_nested;
4668 unsigned long bytes_alloc_nested;
4669 unsigned long bytes_dropped_nested;
4670 int min_size_nested;
4671 int max_size_nested;
4672 int max_size;
4673 int min_size;
4674 int cpu;
4675 int cnt;
4676};
4677
4678static struct rb_test_data rb_data[NR_CPUS] __initdata;
4679
4680/* 1 meg per cpu */
4681#define RB_TEST_BUFFER_SIZE 1048576
4682
4683static char rb_string[] __initdata =
4684 "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
4685 "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
4686 "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
4687
4688static bool rb_test_started __initdata;
4689
4690struct rb_item {
4691 int size;
4692 char str[];
4693};
4694
4695static __init int rb_write_something(struct rb_test_data *data, bool nested)
4696{
4697 struct ring_buffer_event *event;
4698 struct rb_item *item;
4699 bool started;
4700 int event_len;
4701 int size;
4702 int len;
4703 int cnt;
4704
4705	/* Have nested writes different than what is written */
4706 cnt = data->cnt + (nested ? 27 : 0);
4707
4708 /* Multiply cnt by ~e, to make some unique increment */
4709	size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);
4710
4711 len = size + sizeof(struct rb_item);
4712
4713 started = rb_test_started;
4714 /* read rb_test_started before checking buffer enabled */
4715 smp_rmb();
4716
4717 event = ring_buffer_lock_reserve(data->buffer, len);
4718 if (!event) {
4719 /* Ignore dropped events before test starts. */
4720 if (started) {
4721 if (nested)
4722				data->bytes_dropped_nested += len;
4723			else
4724				data->bytes_dropped += len;
4725 }
4726 return len;
4727 }
4728
4729 event_len = ring_buffer_event_length(event);
4730
4731 if (RB_WARN_ON(data->buffer, event_len < len))
4732 goto out;
4733
4734 item = ring_buffer_event_data(event);
4735 item->size = size;
4736 memcpy(item->str, rb_string, size);
4737
4738 if (nested) {
4739 data->bytes_alloc_nested += event_len;
4740 data->bytes_written_nested += len;
4741 data->events_nested++;
4742 if (!data->min_size_nested || len < data->min_size_nested)
4743 data->min_size_nested = len;
4744 if (len > data->max_size_nested)
4745 data->max_size_nested = len;
4746 } else {
4747 data->bytes_alloc += event_len;
4748 data->bytes_written += len;
4749 data->events++;
4750 if (!data->min_size || len < data->min_size)
4751			data->min_size = len;
4752 if (len > data->max_size)
4753 data->max_size = len;
4754 }
4755
4756 out:
4757 ring_buffer_unlock_commit(data->buffer, event);
4758
4759 return 0;
4760}
4761
4762static __init int rb_test(void *arg)
4763{
4764 struct rb_test_data *data = arg;
4765
4766 while (!kthread_should_stop()) {
4767 rb_write_something(data, false);
4768 data->cnt++;
4769
4770 set_current_state(TASK_INTERRUPTIBLE);
4771 /* Now sleep between a min of 100-300us and a max of 1ms */
4772 usleep_range(((data->cnt % 3) + 1) * 100, 1000);
4773 }
4774
4775 return 0;
4776}
4777
4778static __init void rb_ipi(void *ignore)
4779{
4780 struct rb_test_data *data;
4781 int cpu = smp_processor_id();
4782
4783 data = &rb_data[cpu];
4784 rb_write_something(data, true);
4785}
4786
4787static __init int rb_hammer_test(void *arg)
4788{
4789 while (!kthread_should_stop()) {
4790
4791 /* Send an IPI to all cpus to write data! */
4792 smp_call_function(rb_ipi, NULL, 1);
4793 /* No sleep, but for non preempt, let others run */
4794 schedule();
4795 }
4796
4797 return 0;
4798}
4799
4800static __init int test_ringbuffer(void)
4801{
4802 struct task_struct *rb_hammer;
4803 struct ring_buffer *buffer;
4804 int cpu;
4805 int ret = 0;
4806
4807 pr_info("Running ring buffer tests...\n");
4808
4809 buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);
4810 if (WARN_ON(!buffer))
4811 return 0;
4812
4813 /* Disable buffer so that threads can't write to it yet */
4814 ring_buffer_record_off(buffer);
4815
4816 for_each_online_cpu(cpu) {
4817 rb_data[cpu].buffer = buffer;
4818 rb_data[cpu].cpu = cpu;
4819 rb_data[cpu].cnt = cpu;
4820 rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
4821 "rbtester/%d", cpu);
4822 if (WARN_ON(!rb_threads[cpu])) {
4823 pr_cont("FAILED\n");
4824 ret = -1;
4825 goto out_free;
4826 }
4827
4828 kthread_bind(rb_threads[cpu], cpu);
4829 wake_up_process(rb_threads[cpu]);
4830 }
4831
4832 /* Now create the rb hammer! */
4833 rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
4834 if (WARN_ON(!rb_hammer)) {
4835 pr_cont("FAILED\n");
4836 ret = -1;
4837 goto out_free;
4838 }
4839
4840 ring_buffer_record_on(buffer);
4841 /*
4842 * Show buffer is enabled before setting rb_test_started.
4843 * Yes there's a small race window where events could be
4844	 * dropped and the thread won't catch it. But when a ring
4845 * buffer gets enabled, there will always be some kind of
4846 * delay before other CPUs see it. Thus, we don't care about
4847 * those dropped events. We care about events dropped after
4848 * the threads see that the buffer is active.
4849 */
4850 smp_wmb();
4851 rb_test_started = true;
4852
4853 set_current_state(TASK_INTERRUPTIBLE);
4854	/* Just run for 10 seconds */
4855 schedule_timeout(10 * HZ);
4856
4857 kthread_stop(rb_hammer);
4858
4859 out_free:
4860 for_each_online_cpu(cpu) {
4861 if (!rb_threads[cpu])
4862 break;
4863 kthread_stop(rb_threads[cpu]);
4864 }
4865 if (ret) {
4866 ring_buffer_free(buffer);
4867 return ret;
4868 }
4869
4870 /* Report! */
4871 pr_info("finished\n");
4872 for_each_online_cpu(cpu) {
4873 struct ring_buffer_event *event;
4874 struct rb_test_data *data = &rb_data[cpu];
4875 struct rb_item *item;
4876 unsigned long total_events;
4877 unsigned long total_dropped;
4878 unsigned long total_written;
4879 unsigned long total_alloc;
4880 unsigned long total_read = 0;
4881 unsigned long total_size = 0;
4882 unsigned long total_len = 0;
4883 unsigned long total_lost = 0;
4884 unsigned long lost;
4885 int big_event_size;
4886 int small_event_size;
4887
4888 ret = -1;
4889
4890 total_events = data->events + data->events_nested;
4891 total_written = data->bytes_written + data->bytes_written_nested;
4892 total_alloc = data->bytes_alloc + data->bytes_alloc_nested;
4893 total_dropped = data->bytes_dropped + data->bytes_dropped_nested;
4894
4895 big_event_size = data->max_size + data->max_size_nested;
4896 small_event_size = data->min_size + data->min_size_nested;
4897
4898 pr_info("CPU %d:\n", cpu);
4899 pr_info(" events: %ld\n", total_events);
4900 pr_info(" dropped bytes: %ld\n", total_dropped);
4901 pr_info(" alloced bytes: %ld\n", total_alloc);
4902 pr_info(" written bytes: %ld\n", total_written);
4903 pr_info(" biggest event: %d\n", big_event_size);
4904 pr_info(" smallest event: %d\n", small_event_size);
4905
4906 if (RB_WARN_ON(buffer, total_dropped))
4907 break;
4908
4909 ret = 0;
4910
4911 while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
4912 total_lost += lost;
4913 item = ring_buffer_event_data(event);
4914 total_len += ring_buffer_event_length(event);
4915 total_size += item->size + sizeof(struct rb_item);
4916 if (memcmp(&item->str[0], rb_string, item->size) != 0) {
4917 pr_info("FAILED!\n");
4918 pr_info("buffer had: %.*s\n", item->size, item->str);
4919 pr_info("expected: %.*s\n", item->size, rb_string);
4920 RB_WARN_ON(buffer, 1);
4921 ret = -1;
4922 break;
4923 }
4924 total_read++;
4925 }
4926 if (ret)
4927 break;
4928
4929 ret = -1;
4930
4931 pr_info(" read events: %ld\n", total_read);
4932 pr_info(" lost events: %ld\n", total_lost);
4933 pr_info(" total events: %ld\n", total_lost + total_read);
4934 pr_info(" recorded len bytes: %ld\n", total_len);
4935 pr_info(" recorded size bytes: %ld\n", total_size);
4936 if (total_lost)
4937 pr_info(" With dropped events, record len and size may not match\n"
4938 " alloced and written from above\n");
4939 if (!total_lost) {
4940 if (RB_WARN_ON(buffer, total_len != total_alloc ||
4941 total_size != total_written))
4942 break;
4943 }
4944 if (RB_WARN_ON(buffer, total_lost + total_read != total_events))
4945 break;
4946
4947 ret = 0;
4948 }
4949 if (!ret)
4950 pr_info("Ring buffer PASSED!\n");
4951
4952 ring_buffer_free(buffer);
4953 return 0;
4954}
4955
4956late_initcall(test_ringbuffer);
4957#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c2e2c2310374..ae6fa2d1cdf7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * ring buffer based function tracer 2 * ring buffer based function tracer
3 * 3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> 5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 * 6 *
7 * Originally taken from the RT patch by: 7 * Originally taken from the RT patch by:
@@ -19,7 +19,6 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/irq_work.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
24#include <linux/pagemap.h> 23#include <linux/pagemap.h>
25#include <linux/hardirq.h> 24#include <linux/hardirq.h>
@@ -48,7 +47,7 @@
48 * On boot up, the ring buffer is set to the minimum size, so that 47 * On boot up, the ring buffer is set to the minimum size, so that
49 * we do not waste memory on systems that are not using tracing. 48 * we do not waste memory on systems that are not using tracing.
50 */ 49 */
51int ring_buffer_expanded; 50bool ring_buffer_expanded;
52 51
53/* 52/*
54 * We need to change this state when a selftest is running. 53 * We need to change this state when a selftest is running.
@@ -87,14 +86,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
87static DEFINE_PER_CPU(bool, trace_cmdline_save); 86static DEFINE_PER_CPU(bool, trace_cmdline_save);
88 87
89/* 88/*
90 * When a reader is waiting for data, then this variable is
91 * set to true.
92 */
93static bool trace_wakeup_needed;
94
95static struct irq_work trace_work_wakeup;
96
97/*
98 * Kill all tracing for good (never come back). 89 * Kill all tracing for good (never come back).
99 * It is initialized to 1 but will turn to zero if the initialization 90 * It is initialized to 1 but will turn to zero if the initialization
100 * of the tracer is successful. But that is the only place that sets 91 * of the tracer is successful. But that is the only place that sets
@@ -130,12 +121,14 @@ static int tracing_set_tracer(const char *buf);
130static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 121static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
131static char *default_bootup_tracer; 122static char *default_bootup_tracer;
132 123
124static bool allocate_snapshot;
125
133static int __init set_cmdline_ftrace(char *str) 126static int __init set_cmdline_ftrace(char *str)
134{ 127{
135 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); 128 strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
136 default_bootup_tracer = bootup_tracer_buf; 129 default_bootup_tracer = bootup_tracer_buf;
137 /* We are using ftrace early, expand it */ 130 /* We are using ftrace early, expand it */
138 ring_buffer_expanded = 1; 131 ring_buffer_expanded = true;
139 return 1; 132 return 1;
140} 133}
141__setup("ftrace=", set_cmdline_ftrace); 134__setup("ftrace=", set_cmdline_ftrace);
@@ -156,13 +149,22 @@ static int __init set_ftrace_dump_on_oops(char *str)
156} 149}
157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 150__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
158 151
152static int __init boot_alloc_snapshot(char *str)
153{
154 allocate_snapshot = true;
155 /* We also need the main ring buffer expanded */
156 ring_buffer_expanded = true;
157 return 1;
158}
159__setup("alloc_snapshot", boot_alloc_snapshot);
160
159 161
160static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; 162static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
161static char *trace_boot_options __initdata; 163static char *trace_boot_options __initdata;
162 164
163static int __init set_trace_boot_options(char *str) 165static int __init set_trace_boot_options(char *str)
164{ 166{
165 strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); 167 strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
166 trace_boot_options = trace_boot_options_buf; 168 trace_boot_options = trace_boot_options_buf;
167 return 0; 169 return 0;
168} 170}
@@ -189,7 +191,7 @@ unsigned long long ns2usecs(cycle_t nsec)
189 */ 191 */
190static struct trace_array global_trace; 192static struct trace_array global_trace;
191 193
192static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 194LIST_HEAD(ftrace_trace_arrays);
193 195
194int filter_current_check_discard(struct ring_buffer *buffer, 196int filter_current_check_discard(struct ring_buffer *buffer,
195 struct ftrace_event_call *call, void *rec, 197 struct ftrace_event_call *call, void *rec,
@@ -204,29 +206,15 @@ cycle_t ftrace_now(int cpu)
204 u64 ts; 206 u64 ts;
205 207
206 /* Early boot up does not have a buffer yet */ 208 /* Early boot up does not have a buffer yet */
207 if (!global_trace.buffer) 209 if (!global_trace.trace_buffer.buffer)
208 return trace_clock_local(); 210 return trace_clock_local();
209 211
210 ts = ring_buffer_time_stamp(global_trace.buffer, cpu); 212 ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu);
211 ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); 213 ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts);
212 214
213 return ts; 215 return ts;
214} 216}
215 217
216/*
217 * The max_tr is used to snapshot the global_trace when a maximum
218 * latency is reached. Some tracers will use this to store a maximum
219 * trace while it continues examining live traces.
220 *
221 * The buffers for the max_tr are set up the same as the global_trace.
222 * When a snapshot is taken, the link list of the max_tr is swapped
223 * with the link list of the global_trace and the buffers are reset for
224 * the global_trace so the tracing can continue.
225 */
226static struct trace_array max_tr;
227
228static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
229
230int tracing_is_enabled(void) 218int tracing_is_enabled(void)
231{ 219{
232 return tracing_is_on(); 220 return tracing_is_on();
@@ -249,9 +237,6 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
249/* trace_types holds a link list of available tracers. */ 237/* trace_types holds a link list of available tracers. */
250static struct tracer *trace_types __read_mostly; 238static struct tracer *trace_types __read_mostly;
251 239
252/* current_trace points to the tracer that is currently active */
253static struct tracer *current_trace __read_mostly = &nop_trace;
254
255/* 240/*
256 * trace_types_lock is used to protect the trace_types list. 241 * trace_types_lock is used to protect the trace_types list.
257 */ 242 */
@@ -285,13 +270,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
285 270
286static inline void trace_access_lock(int cpu) 271static inline void trace_access_lock(int cpu)
287{ 272{
288 if (cpu == TRACE_PIPE_ALL_CPU) { 273 if (cpu == RING_BUFFER_ALL_CPUS) {
289 /* gain it for accessing the whole ring buffer. */ 274 /* gain it for accessing the whole ring buffer. */
290 down_write(&all_cpu_access_lock); 275 down_write(&all_cpu_access_lock);
291 } else { 276 } else {
292 /* gain it for accessing a cpu ring buffer. */ 277 /* gain it for accessing a cpu ring buffer. */
293 278
294 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ 279 /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */
295 down_read(&all_cpu_access_lock); 280 down_read(&all_cpu_access_lock);
296 281
297 /* Secondly block other access to this @cpu ring buffer. */ 282 /* Secondly block other access to this @cpu ring buffer. */
@@ -301,7 +286,7 @@ static inline void trace_access_lock(int cpu)
301 286
302static inline void trace_access_unlock(int cpu) 287static inline void trace_access_unlock(int cpu)
303{ 288{
304 if (cpu == TRACE_PIPE_ALL_CPU) { 289 if (cpu == RING_BUFFER_ALL_CPUS) {
305 up_write(&all_cpu_access_lock); 290 up_write(&all_cpu_access_lock);
306 } else { 291 } else {
307 mutex_unlock(&per_cpu(cpu_access_lock, cpu)); 292 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
@@ -339,30 +324,11 @@ static inline void trace_access_lock_init(void)
339 324
340#endif 325#endif
341 326
342/* trace_wait is a waitqueue for tasks blocked on trace_poll */
343static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
344
345/* trace_flags holds trace_options default values */ 327/* trace_flags holds trace_options default values */
346unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 328unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
347 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 329 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
348 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | 330 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
349 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; 331 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
350
351static int trace_stop_count;
352static DEFINE_RAW_SPINLOCK(tracing_start_lock);
353
354/**
355 * trace_wake_up - wake up tasks waiting for trace input
356 *
357 * Schedules a delayed work to wake up any task that is blocked on the
358 * trace_wait queue. These is used with trace_poll for tasks polling the
359 * trace.
360 */
361static void trace_wake_up(struct irq_work *work)
362{
363 wake_up_all(&trace_wait);
364
365}
366 332
367/** 333/**
368 * tracing_on - enable tracing buffers 334 * tracing_on - enable tracing buffers
@@ -372,8 +338,8 @@ static void trace_wake_up(struct irq_work *work)
372 */ 338 */
373void tracing_on(void) 339void tracing_on(void)
374{ 340{
375 if (global_trace.buffer) 341 if (global_trace.trace_buffer.buffer)
376 ring_buffer_record_on(global_trace.buffer); 342 ring_buffer_record_on(global_trace.trace_buffer.buffer);
377 /* 343 /*
378 * This flag is only looked at when buffers haven't been 344 * This flag is only looked at when buffers haven't been
379 * allocated yet. We don't really care about the race 345 * allocated yet. We don't really care about the race
@@ -385,6 +351,196 @@ void tracing_on(void)
385EXPORT_SYMBOL_GPL(tracing_on); 351EXPORT_SYMBOL_GPL(tracing_on);
386 352
387/** 353/**
354 * __trace_puts - write a constant string into the trace buffer.
355 * @ip: The address of the caller
356 * @str: The constant string to write
357 * @size: The size of the string.
358 */
359int __trace_puts(unsigned long ip, const char *str, int size)
360{
361 struct ring_buffer_event *event;
362 struct ring_buffer *buffer;
363 struct print_entry *entry;
364 unsigned long irq_flags;
365 int alloc;
366
367 alloc = sizeof(*entry) + size + 2; /* possible \n added */
368
369 local_save_flags(irq_flags);
370 buffer = global_trace.trace_buffer.buffer;
371 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
372 irq_flags, preempt_count());
373 if (!event)
374 return 0;
375
376 entry = ring_buffer_event_data(event);
377 entry->ip = ip;
378
379 memcpy(&entry->buf, str, size);
380
381 /* Add a newline if necessary */
382 if (entry->buf[size - 1] != '\n') {
383 entry->buf[size] = '\n';
384 entry->buf[size + 1] = '\0';
385 } else
386 entry->buf[size] = '\0';
387
388 __buffer_unlock_commit(buffer, event);
389
390 return size;
391}
392EXPORT_SYMBOL_GPL(__trace_puts);
393
394/**
395 * __trace_bputs - write the pointer to a constant string into trace buffer
396 * @ip: The address of the caller
397 * @str: The constant string to write to the buffer to
398 */
399int __trace_bputs(unsigned long ip, const char *str)
400{
401 struct ring_buffer_event *event;
402 struct ring_buffer *buffer;
403 struct bputs_entry *entry;
404 unsigned long irq_flags;
405 int size = sizeof(struct bputs_entry);
406
407 local_save_flags(irq_flags);
408 buffer = global_trace.trace_buffer.buffer;
409 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
410 irq_flags, preempt_count());
411 if (!event)
412 return 0;
413
414 entry = ring_buffer_event_data(event);
415 entry->ip = ip;
416 entry->str = str;
417
418 __buffer_unlock_commit(buffer, event);
419
420 return 1;
421}
422EXPORT_SYMBOL_GPL(__trace_bputs);
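
As a usage note (not part of the patch): a sketch of calling the exported helper directly. The function name below is hypothetical, and the prototype is assumed to come from the matching header change elsewhere in this series; in-tree callers would normally go through a wrapper macro that passes _THIS_IP_ automatically.

#include <linux/kernel.h>

static void example_mark_slow_path(void)
{
	static const char msg[] = "example: hit the slow path\n";

	/* Writes the literal string straight into the trace buffer. */
	__trace_puts(_THIS_IP_, msg, sizeof(msg) - 1);
}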
423
424#ifdef CONFIG_TRACER_SNAPSHOT
425/**
426 * trace_snapshot - take a snapshot of the current buffer.
427 *
428 * This causes a swap between the snapshot buffer and the current live
429 * tracing buffer. You can use this to take snapshots of the live
430 * trace when some condition is triggered, but continue to trace.
431 *
432 * Note, make sure to allocate the snapshot either with
433 * tracing_snapshot_alloc(), or by doing it manually
434 * with: echo 1 > /sys/kernel/debug/tracing/snapshot
435 *
436 * If the snapshot buffer is not allocated, it will stop tracing.
437 * Basically making a permanent snapshot.
438 */
439void tracing_snapshot(void)
440{
441 struct trace_array *tr = &global_trace;
442 struct tracer *tracer = tr->current_trace;
443 unsigned long flags;
444
445 if (in_nmi()) {
446 internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
447 internal_trace_puts("*** snapshot is being ignored ***\n");
448 return;
449 }
450
451 if (!tr->allocated_snapshot) {
452 internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
453 internal_trace_puts("*** stopping trace here! ***\n");
454 tracing_off();
455 return;
456 }
457
458 /* Note, snapshot can not be used when the tracer uses it */
459 if (tracer->use_max_tr) {
460 internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
461 internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
462 return;
463 }
464
465 local_irq_save(flags);
466 update_max_tr(tr, current, smp_processor_id());
467 local_irq_restore(flags);
468}
469EXPORT_SYMBOL_GPL(tracing_snapshot);
470
471static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
472 struct trace_buffer *size_buf, int cpu_id);
473static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
474
475static int alloc_snapshot(struct trace_array *tr)
476{
477 int ret;
478
479 if (!tr->allocated_snapshot) {
480
481 /* allocate spare buffer */
482 ret = resize_buffer_duplicate_size(&tr->max_buffer,
483 &tr->trace_buffer, RING_BUFFER_ALL_CPUS);
484 if (ret < 0)
485 return ret;
486
487 tr->allocated_snapshot = true;
488 }
489
490 return 0;
491}
492
493void free_snapshot(struct trace_array *tr)
494{
495 /*
496	 * We don't free the ring buffer; instead, we resize it because
497	 * the max_tr ring buffer has some state (e.g. ring->clock) and
498	 * we want to preserve it.
499 */
500 ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
501 set_buffer_entries(&tr->max_buffer, 1);
502 tracing_reset_online_cpus(&tr->max_buffer);
503 tr->allocated_snapshot = false;
504}
505
506/**
507 * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
508 *
509 * This is similar to trace_snapshot(), but it will allocate the
510 * snapshot buffer if it isn't already allocated. Use this only
511 * where it is safe to sleep, as the allocation may sleep.
512 *
513 * This causes a swap between the snapshot buffer and the current live
514 * tracing buffer. You can use this to take snapshots of the live
515 * trace when some condition is triggered, but continue to trace.
516 */
517void tracing_snapshot_alloc(void)
518{
519 struct trace_array *tr = &global_trace;
520 int ret;
521
522 ret = alloc_snapshot(tr);
523 if (WARN_ON(ret < 0))
524 return;
525
526 tracing_snapshot();
527}
528EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
529#else
530void tracing_snapshot(void)
531{
532 WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
533}
534EXPORT_SYMBOL_GPL(tracing_snapshot);
535void tracing_snapshot_alloc(void)
536{
537 /* Give warning */
538 tracing_snapshot();
539}
540EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
541#endif /* CONFIG_TRACER_SNAPSHOT */
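
For illustration only (not part of the patch): how a driver might use the snapshot API documented above, with hypothetical function names and assuming the declarations from the matching header change in this series. Allocation happens once in a context that may sleep; the trigger can then be used from hot paths (but not from NMI, as the check above shows).

#include <linux/kernel.h>
#include <linux/init.h>

static int __init example_init(void)
{
	/* May sleep: allocates the spare buffer and takes a first snapshot. */
	tracing_snapshot_alloc();
	return 0;
}

static void example_on_error(void)
{
	/* Swaps the live buffer with the snapshot buffer; tracing continues. */
	tracing_snapshot();
}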
542
543/**
388 * tracing_off - turn off tracing buffers 544 * tracing_off - turn off tracing buffers
389 * 545 *
390 * This function stops the tracing buffers from recording data. 546 * This function stops the tracing buffers from recording data.
@@ -394,8 +550,8 @@ EXPORT_SYMBOL_GPL(tracing_on);
394 */ 550 */
395void tracing_off(void) 551void tracing_off(void)
396{ 552{
397 if (global_trace.buffer) 553 if (global_trace.trace_buffer.buffer)
398 ring_buffer_record_off(global_trace.buffer); 554 ring_buffer_record_off(global_trace.trace_buffer.buffer);
399 /* 555 /*
400 * This flag is only looked at when buffers haven't been 556 * This flag is only looked at when buffers haven't been
401 * allocated yet. We don't really care about the race 557 * allocated yet. We don't really care about the race
@@ -411,8 +567,8 @@ EXPORT_SYMBOL_GPL(tracing_off);
411 */ 567 */
412int tracing_is_on(void) 568int tracing_is_on(void)
413{ 569{
414 if (global_trace.buffer) 570 if (global_trace.trace_buffer.buffer)
415 return ring_buffer_record_is_on(global_trace.buffer); 571 return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);
416 return !global_trace.buffer_disabled; 572 return !global_trace.buffer_disabled;
417} 573}
418EXPORT_SYMBOL_GPL(tracing_is_on); 574EXPORT_SYMBOL_GPL(tracing_is_on);
@@ -479,6 +635,7 @@ static const char *trace_options[] = {
479 "disable_on_free", 635 "disable_on_free",
480 "irq-info", 636 "irq-info",
481 "markers", 637 "markers",
638 "function-trace",
482 NULL 639 NULL
483}; 640};
484 641
@@ -490,6 +647,8 @@ static struct {
490 { trace_clock_local, "local", 1 }, 647 { trace_clock_local, "local", 1 },
491 { trace_clock_global, "global", 1 }, 648 { trace_clock_global, "global", 1 },
492 { trace_clock_counter, "counter", 0 }, 649 { trace_clock_counter, "counter", 0 },
650 { trace_clock_jiffies, "uptime", 1 },
651 { trace_clock, "perf", 1 },
493 ARCH_TRACE_CLOCKS 652 ARCH_TRACE_CLOCKS
494}; 653};
495 654
@@ -670,13 +829,14 @@ unsigned long __read_mostly tracing_max_latency;
670static void 829static void
671__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 830__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
672{ 831{
673 struct trace_array_cpu *data = tr->data[cpu]; 832 struct trace_buffer *trace_buf = &tr->trace_buffer;
674 struct trace_array_cpu *max_data; 833 struct trace_buffer *max_buf = &tr->max_buffer;
834 struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
835 struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
675 836
676 max_tr.cpu = cpu; 837 max_buf->cpu = cpu;
677 max_tr.time_start = data->preempt_timestamp; 838 max_buf->time_start = data->preempt_timestamp;
678 839
679 max_data = max_tr.data[cpu];
680 max_data->saved_latency = tracing_max_latency; 840 max_data->saved_latency = tracing_max_latency;
681 max_data->critical_start = data->critical_start; 841 max_data->critical_start = data->critical_start;
682 max_data->critical_end = data->critical_end; 842 max_data->critical_end = data->critical_end;
@@ -704,23 +864,24 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
704void 864void
705update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 865update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
706{ 866{
707 struct ring_buffer *buf = tr->buffer; 867 struct ring_buffer *buf;
708 868
709 if (trace_stop_count) 869 if (tr->stop_count)
710 return; 870 return;
711 871
712 WARN_ON_ONCE(!irqs_disabled()); 872 WARN_ON_ONCE(!irqs_disabled());
713 873
714 if (!current_trace->allocated_snapshot) { 874 if (!tr->allocated_snapshot) {
715 /* Only the nop tracer should hit this when disabling */ 875 /* Only the nop tracer should hit this when disabling */
716 WARN_ON_ONCE(current_trace != &nop_trace); 876 WARN_ON_ONCE(tr->current_trace != &nop_trace);
717 return; 877 return;
718 } 878 }
719 879
720 arch_spin_lock(&ftrace_max_lock); 880 arch_spin_lock(&ftrace_max_lock);
721 881
722 tr->buffer = max_tr.buffer; 882 buf = tr->trace_buffer.buffer;
723 max_tr.buffer = buf; 883 tr->trace_buffer.buffer = tr->max_buffer.buffer;
884 tr->max_buffer.buffer = buf;
724 885
725 __update_max_tr(tr, tsk, cpu); 886 __update_max_tr(tr, tsk, cpu);
726 arch_spin_unlock(&ftrace_max_lock); 887 arch_spin_unlock(&ftrace_max_lock);
@@ -739,16 +900,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
739{ 900{
740 int ret; 901 int ret;
741 902
742 if (trace_stop_count) 903 if (tr->stop_count)
743 return; 904 return;
744 905
745 WARN_ON_ONCE(!irqs_disabled()); 906 WARN_ON_ONCE(!irqs_disabled());
746 if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) 907 if (!tr->allocated_snapshot) {
908 /* Only the nop tracer should hit this when disabling */
909 WARN_ON_ONCE(tr->current_trace != &nop_trace);
747 return; 910 return;
911 }
748 912
749 arch_spin_lock(&ftrace_max_lock); 913 arch_spin_lock(&ftrace_max_lock);
750 914
751 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 915 ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
752 916
753 if (ret == -EBUSY) { 917 if (ret == -EBUSY) {
754 /* 918 /*
@@ -757,7 +921,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
757 * the max trace buffer (no one writes directly to it) 921 * the max trace buffer (no one writes directly to it)
758 * and flag that it failed. 922 * and flag that it failed.
759 */ 923 */
760 trace_array_printk(&max_tr, _THIS_IP_, 924 trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
761 "Failed to swap buffers due to commit in progress\n"); 925 "Failed to swap buffers due to commit in progress\n");
762 } 926 }
763 927
@@ -770,37 +934,78 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
770 934
771static void default_wait_pipe(struct trace_iterator *iter) 935static void default_wait_pipe(struct trace_iterator *iter)
772{ 936{
773 DEFINE_WAIT(wait); 937 /* Iterators are static, they should be filled or empty */
938 if (trace_buffer_iter(iter, iter->cpu_file))
939 return;
774 940
775 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); 941 ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
942}
943
944#ifdef CONFIG_FTRACE_STARTUP_TEST
945static int run_tracer_selftest(struct tracer *type)
946{
947 struct trace_array *tr = &global_trace;
948 struct tracer *saved_tracer = tr->current_trace;
949 int ret;
950
951 if (!type->selftest || tracing_selftest_disabled)
952 return 0;
776 953
777 /* 954 /*
778 * The events can happen in critical sections where 955 * Run a selftest on this tracer.
779 * checking a work queue can cause deadlocks. 956 * Here we reset the trace buffer, and set the current
780 * After adding a task to the queue, this flag is set 957 * tracer to be this tracer. The tracer can then run some
781 * only to notify events to try to wake up the queue 958 * internal tracing to verify that everything is in order.
782 * using irq_work. 959 * If we fail, we do not register this tracer.
783 *
784 * We don't clear it even if the buffer is no longer
785 * empty. The flag only causes the next event to run
786 * irq_work to do the work queue wake up. The worse
787 * that can happen if we race with !trace_empty() is that
788 * an event will cause an irq_work to try to wake up
789 * an empty queue.
790 *
791 * There's no reason to protect this flag either, as
792 * the work queue and irq_work logic will do the necessary
793 * synchronization for the wake ups. The only thing
794 * that is necessary is that the wake up happens after
795 * a task has been queued. It's OK for spurious wake ups.
796 */ 960 */
797 trace_wakeup_needed = true; 961 tracing_reset_online_cpus(&tr->trace_buffer);
798 962
799 if (trace_empty(iter)) 963 tr->current_trace = type;
800 schedule(); 964
965#ifdef CONFIG_TRACER_MAX_TRACE
966 if (type->use_max_tr) {
967 /* If we expanded the buffers, make sure the max is expanded too */
968 if (ring_buffer_expanded)
969 ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
970 RING_BUFFER_ALL_CPUS);
971 tr->allocated_snapshot = true;
972 }
973#endif
801 974
802 finish_wait(&trace_wait, &wait); 975 /* the test is responsible for initializing and enabling */
976 pr_info("Testing tracer %s: ", type->name);
977 ret = type->selftest(type, tr);
978 /* the test is responsible for resetting too */
979 tr->current_trace = saved_tracer;
980 if (ret) {
981 printk(KERN_CONT "FAILED!\n");
982 /* Add the warning after printing 'FAILED' */
983 WARN_ON(1);
984 return -1;
985 }
986 /* Only reset on passing, to avoid touching corrupted buffers */
987 tracing_reset_online_cpus(&tr->trace_buffer);
988
989#ifdef CONFIG_TRACER_MAX_TRACE
990 if (type->use_max_tr) {
991 tr->allocated_snapshot = false;
992
993 /* Shrink the max buffer again */
994 if (ring_buffer_expanded)
995 ring_buffer_resize(tr->max_buffer.buffer, 1,
996 RING_BUFFER_ALL_CPUS);
997 }
998#endif
999
1000 printk(KERN_CONT "PASSED\n");
1001 return 0;
803} 1002}
1003#else
1004static inline int run_tracer_selftest(struct tracer *type)
1005{
1006 return 0;
1007}
1008#endif /* CONFIG_FTRACE_STARTUP_TEST */
804 1009
805/** 1010/**
806 * register_tracer - register a tracer with the ftrace system. 1011 * register_tracer - register a tracer with the ftrace system.
@@ -847,57 +1052,9 @@ int register_tracer(struct tracer *type)
847 if (!type->wait_pipe) 1052 if (!type->wait_pipe)
848 type->wait_pipe = default_wait_pipe; 1053 type->wait_pipe = default_wait_pipe;
849 1054
850 1055 ret = run_tracer_selftest(type);
851#ifdef CONFIG_FTRACE_STARTUP_TEST 1056 if (ret < 0)
852 if (type->selftest && !tracing_selftest_disabled) { 1057 goto out;
853 struct tracer *saved_tracer = current_trace;
854 struct trace_array *tr = &global_trace;
855
856 /*
857 * Run a selftest on this tracer.
858 * Here we reset the trace buffer, and set the current
859 * tracer to be this tracer. The tracer can then run some
860 * internal tracing to verify that everything is in order.
861 * If we fail, we do not register this tracer.
862 */
863 tracing_reset_online_cpus(tr);
864
865 current_trace = type;
866
867 if (type->use_max_tr) {
868 /* If we expanded the buffers, make sure the max is expanded too */
869 if (ring_buffer_expanded)
870 ring_buffer_resize(max_tr.buffer, trace_buf_size,
871 RING_BUFFER_ALL_CPUS);
872 type->allocated_snapshot = true;
873 }
874
875 /* the test is responsible for initializing and enabling */
876 pr_info("Testing tracer %s: ", type->name);
877 ret = type->selftest(type, tr);
878 /* the test is responsible for resetting too */
879 current_trace = saved_tracer;
880 if (ret) {
881 printk(KERN_CONT "FAILED!\n");
882 /* Add the warning after printing 'FAILED' */
883 WARN_ON(1);
884 goto out;
885 }
886 /* Only reset on passing, to avoid touching corrupted buffers */
887 tracing_reset_online_cpus(tr);
888
889 if (type->use_max_tr) {
890 type->allocated_snapshot = false;
891
892 /* Shrink the max buffer again */
893 if (ring_buffer_expanded)
894 ring_buffer_resize(max_tr.buffer, 1,
895 RING_BUFFER_ALL_CPUS);
896 }
897
898 printk(KERN_CONT "PASSED\n");
899 }
900#endif
901 1058
902 type->next = trace_types; 1059 type->next = trace_types;
903 trace_types = type; 1060 trace_types = type;
@@ -917,7 +1074,7 @@ int register_tracer(struct tracer *type)
917 tracing_set_tracer(type->name); 1074 tracing_set_tracer(type->name);
918 default_bootup_tracer = NULL; 1075 default_bootup_tracer = NULL;
919 /* disable other selftests, since this will break it. */ 1076 /* disable other selftests, since this will break it. */
920 tracing_selftest_disabled = 1; 1077 tracing_selftest_disabled = true;
921#ifdef CONFIG_FTRACE_STARTUP_TEST 1078#ifdef CONFIG_FTRACE_STARTUP_TEST
922 printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", 1079 printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
923 type->name); 1080 type->name);
@@ -927,9 +1084,9 @@ int register_tracer(struct tracer *type)
927 return ret; 1084 return ret;
928} 1085}
929 1086
930void tracing_reset(struct trace_array *tr, int cpu) 1087void tracing_reset(struct trace_buffer *buf, int cpu)
931{ 1088{
932 struct ring_buffer *buffer = tr->buffer; 1089 struct ring_buffer *buffer = buf->buffer;
933 1090
934 if (!buffer) 1091 if (!buffer)
935 return; 1092 return;
@@ -943,9 +1100,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
943 ring_buffer_record_enable(buffer); 1100 ring_buffer_record_enable(buffer);
944} 1101}
945 1102
946void tracing_reset_online_cpus(struct trace_array *tr) 1103void tracing_reset_online_cpus(struct trace_buffer *buf)
947{ 1104{
948 struct ring_buffer *buffer = tr->buffer; 1105 struct ring_buffer *buffer = buf->buffer;
949 int cpu; 1106 int cpu;
950 1107
951 if (!buffer) 1108 if (!buffer)
@@ -956,7 +1113,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
956 /* Make sure all commits have finished */ 1113 /* Make sure all commits have finished */
957 synchronize_sched(); 1114 synchronize_sched();
958 1115
959 tr->time_start = ftrace_now(tr->cpu); 1116 buf->time_start = ftrace_now(buf->cpu);
960 1117
961 for_each_online_cpu(cpu) 1118 for_each_online_cpu(cpu)
962 ring_buffer_reset_cpu(buffer, cpu); 1119 ring_buffer_reset_cpu(buffer, cpu);
@@ -966,12 +1123,21 @@ void tracing_reset_online_cpus(struct trace_array *tr)
966 1123
967void tracing_reset_current(int cpu) 1124void tracing_reset_current(int cpu)
968{ 1125{
969 tracing_reset(&global_trace, cpu); 1126 tracing_reset(&global_trace.trace_buffer, cpu);
970} 1127}
971 1128
972void tracing_reset_current_online_cpus(void) 1129void tracing_reset_all_online_cpus(void)
973{ 1130{
974 tracing_reset_online_cpus(&global_trace); 1131 struct trace_array *tr;
1132
1133 mutex_lock(&trace_types_lock);
1134 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
1135 tracing_reset_online_cpus(&tr->trace_buffer);
1136#ifdef CONFIG_TRACER_MAX_TRACE
1137 tracing_reset_online_cpus(&tr->max_buffer);
1138#endif
1139 }
1140 mutex_unlock(&trace_types_lock);
975} 1141}
976 1142
977#define SAVED_CMDLINES 128 1143#define SAVED_CMDLINES 128
@@ -994,7 +1160,7 @@ static void trace_init_cmdlines(void)
994 1160
995int is_tracing_stopped(void) 1161int is_tracing_stopped(void)
996{ 1162{
997 return trace_stop_count; 1163 return global_trace.stop_count;
998} 1164}
999 1165
1000/** 1166/**
@@ -1026,12 +1192,12 @@ void tracing_start(void)
1026 if (tracing_disabled) 1192 if (tracing_disabled)
1027 return; 1193 return;
1028 1194
1029 raw_spin_lock_irqsave(&tracing_start_lock, flags); 1195 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1030 if (--trace_stop_count) { 1196 if (--global_trace.stop_count) {
1031 if (trace_stop_count < 0) { 1197 if (global_trace.stop_count < 0) {
1032 /* Someone screwed up their debugging */ 1198 /* Someone screwed up their debugging */
1033 WARN_ON_ONCE(1); 1199 WARN_ON_ONCE(1);
1034 trace_stop_count = 0; 1200 global_trace.stop_count = 0;
1035 } 1201 }
1036 goto out; 1202 goto out;
1037 } 1203 }
@@ -1039,19 +1205,52 @@ void tracing_start(void)
1039 /* Prevent the buffers from switching */ 1205 /* Prevent the buffers from switching */
1040 arch_spin_lock(&ftrace_max_lock); 1206 arch_spin_lock(&ftrace_max_lock);
1041 1207
1042 buffer = global_trace.buffer; 1208 buffer = global_trace.trace_buffer.buffer;
1043 if (buffer) 1209 if (buffer)
1044 ring_buffer_record_enable(buffer); 1210 ring_buffer_record_enable(buffer);
1045 1211
1046 buffer = max_tr.buffer; 1212#ifdef CONFIG_TRACER_MAX_TRACE
1213 buffer = global_trace.max_buffer.buffer;
1047 if (buffer) 1214 if (buffer)
1048 ring_buffer_record_enable(buffer); 1215 ring_buffer_record_enable(buffer);
1216#endif
1049 1217
1050 arch_spin_unlock(&ftrace_max_lock); 1218 arch_spin_unlock(&ftrace_max_lock);
1051 1219
1052 ftrace_start(); 1220 ftrace_start();
1053 out: 1221 out:
1054 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 1222 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1223}
1224
1225static void tracing_start_tr(struct trace_array *tr)
1226{
1227 struct ring_buffer *buffer;
1228 unsigned long flags;
1229
1230 if (tracing_disabled)
1231 return;
1232
1233 /* If global, we need to also start the max tracer */
1234 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
1235 return tracing_start();
1236
1237 raw_spin_lock_irqsave(&tr->start_lock, flags);
1238
1239 if (--tr->stop_count) {
1240 if (tr->stop_count < 0) {
1241 /* Someone screwed up their debugging */
1242 WARN_ON_ONCE(1);
1243 tr->stop_count = 0;
1244 }
1245 goto out;
1246 }
1247
1248 buffer = tr->trace_buffer.buffer;
1249 if (buffer)
1250 ring_buffer_record_enable(buffer);
1251
1252 out:
1253 raw_spin_unlock_irqrestore(&tr->start_lock, flags);
1055} 1254}
1056 1255
1057/** 1256/**
@@ -1066,25 +1265,48 @@ void tracing_stop(void)
1066 unsigned long flags; 1265 unsigned long flags;
1067 1266
1068 ftrace_stop(); 1267 ftrace_stop();
1069 raw_spin_lock_irqsave(&tracing_start_lock, flags); 1268 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1070 if (trace_stop_count++) 1269 if (global_trace.stop_count++)
1071 goto out; 1270 goto out;
1072 1271
1073 /* Prevent the buffers from switching */ 1272 /* Prevent the buffers from switching */
1074 arch_spin_lock(&ftrace_max_lock); 1273 arch_spin_lock(&ftrace_max_lock);
1075 1274
1076 buffer = global_trace.buffer; 1275 buffer = global_trace.trace_buffer.buffer;
1077 if (buffer) 1276 if (buffer)
1078 ring_buffer_record_disable(buffer); 1277 ring_buffer_record_disable(buffer);
1079 1278
1080 buffer = max_tr.buffer; 1279#ifdef CONFIG_TRACER_MAX_TRACE
1280 buffer = global_trace.max_buffer.buffer;
1081 if (buffer) 1281 if (buffer)
1082 ring_buffer_record_disable(buffer); 1282 ring_buffer_record_disable(buffer);
1283#endif
1083 1284
1084 arch_spin_unlock(&ftrace_max_lock); 1285 arch_spin_unlock(&ftrace_max_lock);
1085 1286
1086 out: 1287 out:
1087 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 1288 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1289}
1290
1291static void tracing_stop_tr(struct trace_array *tr)
1292{
1293 struct ring_buffer *buffer;
1294 unsigned long flags;
1295
1296 /* If global, we need to also stop the max tracer */
1297 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
1298 return tracing_stop();
1299
1300 raw_spin_lock_irqsave(&tr->start_lock, flags);
1301 if (tr->stop_count++)
1302 goto out;
1303
1304 buffer = tr->trace_buffer.buffer;
1305 if (buffer)
1306 ring_buffer_record_disable(buffer);
1307
1308 out:
1309 raw_spin_unlock_irqrestore(&tr->start_lock, flags);
1088} 1310}
1089 1311
1090void trace_stop_cmdline_recording(void); 1312void trace_stop_cmdline_recording(void);
@@ -1217,11 +1439,6 @@ void
1217__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) 1439__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
1218{ 1440{
1219 __this_cpu_write(trace_cmdline_save, true); 1441 __this_cpu_write(trace_cmdline_save, true);
1220 if (trace_wakeup_needed) {
1221 trace_wakeup_needed = false;
1222 /* irq_work_queue() supplies it's own memory barriers */
1223 irq_work_queue(&trace_work_wakeup);
1224 }
1225 ring_buffer_unlock_commit(buffer, event); 1442 ring_buffer_unlock_commit(buffer, event);
1226} 1443}
1227 1444
@@ -1245,11 +1462,23 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1245EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); 1462EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1246 1463
1247struct ring_buffer_event * 1464struct ring_buffer_event *
1465trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
1466 struct ftrace_event_file *ftrace_file,
1467 int type, unsigned long len,
1468 unsigned long flags, int pc)
1469{
1470 *current_rb = ftrace_file->tr->trace_buffer.buffer;
1471 return trace_buffer_lock_reserve(*current_rb,
1472 type, len, flags, pc);
1473}
1474EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
1475
1476struct ring_buffer_event *
1248trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, 1477trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
1249 int type, unsigned long len, 1478 int type, unsigned long len,
1250 unsigned long flags, int pc) 1479 unsigned long flags, int pc)
1251{ 1480{
1252 *current_rb = global_trace.buffer; 1481 *current_rb = global_trace.trace_buffer.buffer;
1253 return trace_buffer_lock_reserve(*current_rb, 1482 return trace_buffer_lock_reserve(*current_rb,
1254 type, len, flags, pc); 1483 type, len, flags, pc);
1255} 1484}
@@ -1288,7 +1517,7 @@ trace_function(struct trace_array *tr,
1288 int pc) 1517 int pc)
1289{ 1518{
1290 struct ftrace_event_call *call = &event_function; 1519 struct ftrace_event_call *call = &event_function;
1291 struct ring_buffer *buffer = tr->buffer; 1520 struct ring_buffer *buffer = tr->trace_buffer.buffer;
1292 struct ring_buffer_event *event; 1521 struct ring_buffer_event *event;
1293 struct ftrace_entry *entry; 1522 struct ftrace_entry *entry;
1294 1523
@@ -1429,13 +1658,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1429void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 1658void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1430 int pc) 1659 int pc)
1431{ 1660{
1432 __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); 1661 __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);
1433} 1662}
1434 1663
1435/** 1664/**
1436 * trace_dump_stack - record a stack back trace in the trace buffer 1665 * trace_dump_stack - record a stack back trace in the trace buffer
1666 * @skip: Number of functions to skip (helper handlers)
1437 */ 1667 */
1438void trace_dump_stack(void) 1668void trace_dump_stack(int skip)
1439{ 1669{
1440 unsigned long flags; 1670 unsigned long flags;
1441 1671
@@ -1444,8 +1674,13 @@ void trace_dump_stack(void)
1444 1674
1445 local_save_flags(flags); 1675 local_save_flags(flags);
1446 1676
1447 /* skipping 3 traces, seems to get us at the caller of this function */ 1677 /*
1448 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); 1678 * Skip 3 more, seems to get us at the caller of
1679 * this function.
1680 */
1681 skip += 3;
1682 __ftrace_trace_stack(global_trace.trace_buffer.buffer,
1683 flags, skip, preempt_count(), NULL);
1449} 1684}
1450 1685
1451static DEFINE_PER_CPU(int, user_stack_count); 1686static DEFINE_PER_CPU(int, user_stack_count);
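
A quick sketch (not from the patch) of the new trace_dump_stack() calling convention: the caller passes how many of its own wrapper frames to skip, and the function adds three more internally as shown above. The caller name is hypothetical.

void my_debug_dump(void)			/* hypothetical caller */
{
	/*
	 * 0 = no extra frames beyond the three the helper already skips,
	 * so the recorded trace starts near my_debug_dump()'s caller.
	 */
	trace_dump_stack(0);
}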
@@ -1615,7 +1850,7 @@ void trace_printk_init_buffers(void)
1615 * directly here. If the global_trace.buffer is already 1850 * directly here. If the global_trace.buffer is already
1616 * allocated here, then this was called by module code. 1851 * allocated here, then this was called by module code.
1617 */ 1852 */
1618 if (global_trace.buffer) 1853 if (global_trace.trace_buffer.buffer)
1619 tracing_start_cmdline_record(); 1854 tracing_start_cmdline_record();
1620} 1855}
1621 1856
@@ -1675,7 +1910,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1675 1910
1676 local_save_flags(flags); 1911 local_save_flags(flags);
1677 size = sizeof(*entry) + sizeof(u32) * len; 1912 size = sizeof(*entry) + sizeof(u32) * len;
1678 buffer = tr->buffer; 1913 buffer = tr->trace_buffer.buffer;
1679 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, 1914 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1680 flags, pc); 1915 flags, pc);
1681 if (!event) 1916 if (!event)
@@ -1698,27 +1933,12 @@ out:
1698} 1933}
1699EXPORT_SYMBOL_GPL(trace_vbprintk); 1934EXPORT_SYMBOL_GPL(trace_vbprintk);
1700 1935
1701int trace_array_printk(struct trace_array *tr, 1936static int
1702 unsigned long ip, const char *fmt, ...) 1937__trace_array_vprintk(struct ring_buffer *buffer,
1703{ 1938 unsigned long ip, const char *fmt, va_list args)
1704 int ret;
1705 va_list ap;
1706
1707 if (!(trace_flags & TRACE_ITER_PRINTK))
1708 return 0;
1709
1710 va_start(ap, fmt);
1711 ret = trace_array_vprintk(tr, ip, fmt, ap);
1712 va_end(ap);
1713 return ret;
1714}
1715
1716int trace_array_vprintk(struct trace_array *tr,
1717 unsigned long ip, const char *fmt, va_list args)
1718{ 1939{
1719 struct ftrace_event_call *call = &event_print; 1940 struct ftrace_event_call *call = &event_print;
1720 struct ring_buffer_event *event; 1941 struct ring_buffer_event *event;
1721 struct ring_buffer *buffer;
1722 int len = 0, size, pc; 1942 int len = 0, size, pc;
1723 struct print_entry *entry; 1943 struct print_entry *entry;
1724 unsigned long flags; 1944 unsigned long flags;
@@ -1746,7 +1966,6 @@ int trace_array_vprintk(struct trace_array *tr,
1746 1966
1747 local_save_flags(flags); 1967 local_save_flags(flags);
1748 size = sizeof(*entry) + len + 1; 1968 size = sizeof(*entry) + len + 1;
1749 buffer = tr->buffer;
1750 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1969 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1751 flags, pc); 1970 flags, pc);
1752 if (!event) 1971 if (!event)
@@ -1767,6 +1986,42 @@ int trace_array_vprintk(struct trace_array *tr,
1767 return len; 1986 return len;
1768} 1987}
1769 1988
1989int trace_array_vprintk(struct trace_array *tr,
1990 unsigned long ip, const char *fmt, va_list args)
1991{
1992 return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
1993}
1994
1995int trace_array_printk(struct trace_array *tr,
1996 unsigned long ip, const char *fmt, ...)
1997{
1998 int ret;
1999 va_list ap;
2000
2001 if (!(trace_flags & TRACE_ITER_PRINTK))
2002 return 0;
2003
2004 va_start(ap, fmt);
2005 ret = trace_array_vprintk(tr, ip, fmt, ap);
2006 va_end(ap);
2007 return ret;
2008}
2009
2010int trace_array_printk_buf(struct ring_buffer *buffer,
2011 unsigned long ip, const char *fmt, ...)
2012{
2013 int ret;
2014 va_list ap;
2015
2016 if (!(trace_flags & TRACE_ITER_PRINTK))
2017 return 0;
2018
2019 va_start(ap, fmt);
2020 ret = __trace_array_vprintk(buffer, ip, fmt, ap);
2021 va_end(ap);
2022 return ret;
2023}
2024
1770int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 2025int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1771{ 2026{
1772 return trace_array_vprintk(&global_trace, ip, fmt, args); 2027 return trace_array_vprintk(&global_trace, ip, fmt, args);
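
A minimal usage sketch (not part of the patch) for the reworked printk helpers: trace_array_printk() writes into a specific trace_array's buffer, while the new trace_array_printk_buf() targets a raw ring buffer directly. The function and variable names below are illustrative.

static void my_report(struct trace_array *tr, int val)	/* hypothetical */
{
	/* No-op unless the "printk" trace option is set, as in the code above */
	trace_array_printk(tr, _THIS_IP_, "my value: %d\n", val);
}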
@@ -1792,7 +2047,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1792 if (buf_iter) 2047 if (buf_iter)
1793 event = ring_buffer_iter_peek(buf_iter, ts); 2048 event = ring_buffer_iter_peek(buf_iter, ts);
1794 else 2049 else
1795 event = ring_buffer_peek(iter->tr->buffer, cpu, ts, 2050 event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,
1796 lost_events); 2051 lost_events);
1797 2052
1798 if (event) { 2053 if (event) {
@@ -1807,7 +2062,7 @@ static struct trace_entry *
1807__find_next_entry(struct trace_iterator *iter, int *ent_cpu, 2062__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1808 unsigned long *missing_events, u64 *ent_ts) 2063 unsigned long *missing_events, u64 *ent_ts)
1809{ 2064{
1810 struct ring_buffer *buffer = iter->tr->buffer; 2065 struct ring_buffer *buffer = iter->trace_buffer->buffer;
1811 struct trace_entry *ent, *next = NULL; 2066 struct trace_entry *ent, *next = NULL;
1812 unsigned long lost_events = 0, next_lost = 0; 2067 unsigned long lost_events = 0, next_lost = 0;
1813 int cpu_file = iter->cpu_file; 2068 int cpu_file = iter->cpu_file;
@@ -1820,7 +2075,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1820 * If we are in a per_cpu trace file, don't bother by iterating over 2075 * If we are in a per_cpu trace file, don't bother by iterating over
1821 * all cpu and peek directly. 2076 * all cpu and peek directly.
1822 */ 2077 */
1823 if (cpu_file > TRACE_PIPE_ALL_CPU) { 2078 if (cpu_file > RING_BUFFER_ALL_CPUS) {
1824 if (ring_buffer_empty_cpu(buffer, cpu_file)) 2079 if (ring_buffer_empty_cpu(buffer, cpu_file))
1825 return NULL; 2080 return NULL;
1826 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); 2081 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
@@ -1884,7 +2139,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
1884 2139
1885static void trace_consume(struct trace_iterator *iter) 2140static void trace_consume(struct trace_iterator *iter)
1886{ 2141{
1887 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, 2142 ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,
1888 &iter->lost_events); 2143 &iter->lost_events);
1889} 2144}
1890 2145
@@ -1917,13 +2172,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1917 2172
1918void tracing_iter_reset(struct trace_iterator *iter, int cpu) 2173void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1919{ 2174{
1920 struct trace_array *tr = iter->tr;
1921 struct ring_buffer_event *event; 2175 struct ring_buffer_event *event;
1922 struct ring_buffer_iter *buf_iter; 2176 struct ring_buffer_iter *buf_iter;
1923 unsigned long entries = 0; 2177 unsigned long entries = 0;
1924 u64 ts; 2178 u64 ts;
1925 2179
1926 tr->data[cpu]->skipped_entries = 0; 2180 per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;
1927 2181
1928 buf_iter = trace_buffer_iter(iter, cpu); 2182 buf_iter = trace_buffer_iter(iter, cpu);
1929 if (!buf_iter) 2183 if (!buf_iter)
@@ -1937,13 +2191,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1937 * by the timestamp being before the start of the buffer. 2191 * by the timestamp being before the start of the buffer.
1938 */ 2192 */
1939 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { 2193 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
1940 if (ts >= iter->tr->time_start) 2194 if (ts >= iter->trace_buffer->time_start)
1941 break; 2195 break;
1942 entries++; 2196 entries++;
1943 ring_buffer_read(buf_iter, NULL); 2197 ring_buffer_read(buf_iter, NULL);
1944 } 2198 }
1945 2199
1946 tr->data[cpu]->skipped_entries = entries; 2200 per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;
1947} 2201}
1948 2202
1949/* 2203/*
@@ -1953,6 +2207,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1953static void *s_start(struct seq_file *m, loff_t *pos) 2207static void *s_start(struct seq_file *m, loff_t *pos)
1954{ 2208{
1955 struct trace_iterator *iter = m->private; 2209 struct trace_iterator *iter = m->private;
2210 struct trace_array *tr = iter->tr;
1956 int cpu_file = iter->cpu_file; 2211 int cpu_file = iter->cpu_file;
1957 void *p = NULL; 2212 void *p = NULL;
1958 loff_t l = 0; 2213 loff_t l = 0;
@@ -1965,12 +2220,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1965 * will point to the same string as current_trace->name. 2220 * will point to the same string as current_trace->name.
1966 */ 2221 */
1967 mutex_lock(&trace_types_lock); 2222 mutex_lock(&trace_types_lock);
1968 if (unlikely(current_trace && iter->trace->name != current_trace->name)) 2223 if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
1969 *iter->trace = *current_trace; 2224 *iter->trace = *tr->current_trace;
1970 mutex_unlock(&trace_types_lock); 2225 mutex_unlock(&trace_types_lock);
1971 2226
2227#ifdef CONFIG_TRACER_MAX_TRACE
1972 if (iter->snapshot && iter->trace->use_max_tr) 2228 if (iter->snapshot && iter->trace->use_max_tr)
1973 return ERR_PTR(-EBUSY); 2229 return ERR_PTR(-EBUSY);
2230#endif
1974 2231
1975 if (!iter->snapshot) 2232 if (!iter->snapshot)
1976 atomic_inc(&trace_record_cmdline_disabled); 2233 atomic_inc(&trace_record_cmdline_disabled);
@@ -1980,7 +2237,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1980 iter->cpu = 0; 2237 iter->cpu = 0;
1981 iter->idx = -1; 2238 iter->idx = -1;
1982 2239
1983 if (cpu_file == TRACE_PIPE_ALL_CPU) { 2240 if (cpu_file == RING_BUFFER_ALL_CPUS) {
1984 for_each_tracing_cpu(cpu) 2241 for_each_tracing_cpu(cpu)
1985 tracing_iter_reset(iter, cpu); 2242 tracing_iter_reset(iter, cpu);
1986 } else 2243 } else
@@ -2012,17 +2269,21 @@ static void s_stop(struct seq_file *m, void *p)
2012{ 2269{
2013 struct trace_iterator *iter = m->private; 2270 struct trace_iterator *iter = m->private;
2014 2271
2272#ifdef CONFIG_TRACER_MAX_TRACE
2015 if (iter->snapshot && iter->trace->use_max_tr) 2273 if (iter->snapshot && iter->trace->use_max_tr)
2016 return; 2274 return;
2275#endif
2017 2276
2018 if (!iter->snapshot) 2277 if (!iter->snapshot)
2019 atomic_dec(&trace_record_cmdline_disabled); 2278 atomic_dec(&trace_record_cmdline_disabled);
2279
2020 trace_access_unlock(iter->cpu_file); 2280 trace_access_unlock(iter->cpu_file);
2021 trace_event_read_unlock(); 2281 trace_event_read_unlock();
2022} 2282}
2023 2283
2024static void 2284static void
2025get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) 2285get_total_entries(struct trace_buffer *buf,
2286 unsigned long *total, unsigned long *entries)
2026{ 2287{
2027 unsigned long count; 2288 unsigned long count;
2028 int cpu; 2289 int cpu;
@@ -2031,19 +2292,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e
2031 *entries = 0; 2292 *entries = 0;
2032 2293
2033 for_each_tracing_cpu(cpu) { 2294 for_each_tracing_cpu(cpu) {
2034 count = ring_buffer_entries_cpu(tr->buffer, cpu); 2295 count = ring_buffer_entries_cpu(buf->buffer, cpu);
2035 /* 2296 /*
2036 * If this buffer has skipped entries, then we hold all 2297 * If this buffer has skipped entries, then we hold all
2037 * entries for the trace and we need to ignore the 2298 * entries for the trace and we need to ignore the
2038 * ones before the time stamp. 2299 * ones before the time stamp.
2039 */ 2300 */
2040 if (tr->data[cpu]->skipped_entries) { 2301 if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
2041 count -= tr->data[cpu]->skipped_entries; 2302 count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
2042 /* total is the same as the entries */ 2303 /* total is the same as the entries */
2043 *total += count; 2304 *total += count;
2044 } else 2305 } else
2045 *total += count + 2306 *total += count +
2046 ring_buffer_overrun_cpu(tr->buffer, cpu); 2307 ring_buffer_overrun_cpu(buf->buffer, cpu);
2047 *entries += count; 2308 *entries += count;
2048 } 2309 }
2049} 2310}
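
To summarize the access-pattern change this hunk (and the rest of the patch) applies: per-CPU trace data now lives in a real per-CPU allocation hung off struct trace_buffer, so lookups go through per_cpu_ptr() instead of indexing tr->data[cpu]. A small sketch with a hypothetical helper name:

static unsigned long my_entries_on_cpu(struct trace_buffer *buf, int cpu)
{
	/* buf->data is a per-CPU allocation, not an array of pointers */
	struct trace_array_cpu *data = per_cpu_ptr(buf->data, cpu);

	return data->entries;
}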
@@ -2060,27 +2321,27 @@ static void print_lat_help_header(struct seq_file *m)
2060 seq_puts(m, "# \\ / ||||| \\ | / \n"); 2321 seq_puts(m, "# \\ / ||||| \\ | / \n");
2061} 2322}
2062 2323
2063static void print_event_info(struct trace_array *tr, struct seq_file *m) 2324static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
2064{ 2325{
2065 unsigned long total; 2326 unsigned long total;
2066 unsigned long entries; 2327 unsigned long entries;
2067 2328
2068 get_total_entries(tr, &total, &entries); 2329 get_total_entries(buf, &total, &entries);
2069 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", 2330 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
2070 entries, total, num_online_cpus()); 2331 entries, total, num_online_cpus());
2071 seq_puts(m, "#\n"); 2332 seq_puts(m, "#\n");
2072} 2333}
2073 2334
2074static void print_func_help_header(struct trace_array *tr, struct seq_file *m) 2335static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
2075{ 2336{
2076 print_event_info(tr, m); 2337 print_event_info(buf, m);
2077 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 2338 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
2078 seq_puts(m, "# | | | | |\n"); 2339 seq_puts(m, "# | | | | |\n");
2079} 2340}
2080 2341
2081static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) 2342static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
2082{ 2343{
2083 print_event_info(tr, m); 2344 print_event_info(buf, m);
2084 seq_puts(m, "# _-----=> irqs-off\n"); 2345 seq_puts(m, "# _-----=> irqs-off\n");
2085 seq_puts(m, "# / _----=> need-resched\n"); 2346 seq_puts(m, "# / _----=> need-resched\n");
2086 seq_puts(m, "# | / _---=> hardirq/softirq\n"); 2347 seq_puts(m, "# | / _---=> hardirq/softirq\n");
@@ -2094,16 +2355,16 @@ void
2094print_trace_header(struct seq_file *m, struct trace_iterator *iter) 2355print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2095{ 2356{
2096 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 2357 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
2097 struct trace_array *tr = iter->tr; 2358 struct trace_buffer *buf = iter->trace_buffer;
2098 struct trace_array_cpu *data = tr->data[tr->cpu]; 2359 struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
2099 struct tracer *type = current_trace; 2360 struct tracer *type = iter->trace;
2100 unsigned long entries; 2361 unsigned long entries;
2101 unsigned long total; 2362 unsigned long total;
2102 const char *name = "preemption"; 2363 const char *name = "preemption";
2103 2364
2104 name = type->name; 2365 name = type->name;
2105 2366
2106 get_total_entries(tr, &total, &entries); 2367 get_total_entries(buf, &total, &entries);
2107 2368
2108 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 2369 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
2109 name, UTS_RELEASE); 2370 name, UTS_RELEASE);
@@ -2114,7 +2375,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2114 nsecs_to_usecs(data->saved_latency), 2375 nsecs_to_usecs(data->saved_latency),
2115 entries, 2376 entries,
2116 total, 2377 total,
2117 tr->cpu, 2378 buf->cpu,
2118#if defined(CONFIG_PREEMPT_NONE) 2379#if defined(CONFIG_PREEMPT_NONE)
2119 "server", 2380 "server",
2120#elif defined(CONFIG_PREEMPT_VOLUNTARY) 2381#elif defined(CONFIG_PREEMPT_VOLUNTARY)
@@ -2165,7 +2426,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
2165 if (cpumask_test_cpu(iter->cpu, iter->started)) 2426 if (cpumask_test_cpu(iter->cpu, iter->started))
2166 return; 2427 return;
2167 2428
2168 if (iter->tr->data[iter->cpu]->skipped_entries) 2429 if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
2169 return; 2430 return;
2170 2431
2171 cpumask_set_cpu(iter->cpu, iter->started); 2432 cpumask_set_cpu(iter->cpu, iter->started);
@@ -2288,14 +2549,14 @@ int trace_empty(struct trace_iterator *iter)
2288 int cpu; 2549 int cpu;
2289 2550
2290 /* If we are looking at one CPU buffer, only check that one */ 2551 /* If we are looking at one CPU buffer, only check that one */
2291 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { 2552 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
2292 cpu = iter->cpu_file; 2553 cpu = iter->cpu_file;
2293 buf_iter = trace_buffer_iter(iter, cpu); 2554 buf_iter = trace_buffer_iter(iter, cpu);
2294 if (buf_iter) { 2555 if (buf_iter) {
2295 if (!ring_buffer_iter_empty(buf_iter)) 2556 if (!ring_buffer_iter_empty(buf_iter))
2296 return 0; 2557 return 0;
2297 } else { 2558 } else {
2298 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2559 if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
2299 return 0; 2560 return 0;
2300 } 2561 }
2301 return 1; 2562 return 1;
@@ -2307,7 +2568,7 @@ int trace_empty(struct trace_iterator *iter)
2307 if (!ring_buffer_iter_empty(buf_iter)) 2568 if (!ring_buffer_iter_empty(buf_iter))
2308 return 0; 2569 return 0;
2309 } else { 2570 } else {
2310 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2571 if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
2311 return 0; 2572 return 0;
2312 } 2573 }
2313 } 2574 }
@@ -2331,6 +2592,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2331 return ret; 2592 return ret;
2332 } 2593 }
2333 2594
2595 if (iter->ent->type == TRACE_BPUTS &&
2596 trace_flags & TRACE_ITER_PRINTK &&
2597 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
2598 return trace_print_bputs_msg_only(iter);
2599
2334 if (iter->ent->type == TRACE_BPRINT && 2600 if (iter->ent->type == TRACE_BPRINT &&
2335 trace_flags & TRACE_ITER_PRINTK && 2601 trace_flags & TRACE_ITER_PRINTK &&
2336 trace_flags & TRACE_ITER_PRINTK_MSGONLY) 2602 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
@@ -2385,9 +2651,9 @@ void trace_default_header(struct seq_file *m)
2385 } else { 2651 } else {
2386 if (!(trace_flags & TRACE_ITER_VERBOSE)) { 2652 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
2387 if (trace_flags & TRACE_ITER_IRQ_INFO) 2653 if (trace_flags & TRACE_ITER_IRQ_INFO)
2388 print_func_help_header_irq(iter->tr, m); 2654 print_func_help_header_irq(iter->trace_buffer, m);
2389 else 2655 else
2390 print_func_help_header(iter->tr, m); 2656 print_func_help_header(iter->trace_buffer, m);
2391 } 2657 }
2392 } 2658 }
2393} 2659}
@@ -2400,6 +2666,50 @@ static void test_ftrace_alive(struct seq_file *m)
2400 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); 2666 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
2401} 2667}
2402 2668
2669#ifdef CONFIG_TRACER_MAX_TRACE
2670static void show_snapshot_main_help(struct seq_file *m)
2671{
2672 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
2673 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2674 seq_printf(m, "# Takes a snapshot of the main buffer.\n");
2675 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n");
2676 seq_printf(m, "# (Doesn't have to be '2', works with any number that\n");
2677 seq_printf(m, "# is not a '0' or '1')\n");

2678}
2679
2680static void show_snapshot_percpu_help(struct seq_file *m)
2681{
2682 seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
2683#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2684 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2685 seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n");
2686#else
2687 seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n");
2688 seq_printf(m, "# Must use main snapshot file to allocate.\n");
2689#endif
2690 seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n");
2691 seq_printf(m, "# (Doesn't have to be '2', works with any number that\n");
2692 seq_printf(m, "# is not a '0' or '1')\n");
2693}
2694
2695static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2696{
2697 if (iter->tr->allocated_snapshot)
2698 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
2699 else
2700 seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
2701
2702 seq_printf(m, "# Snapshot commands:\n");
2703 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
2704 show_snapshot_main_help(m);
2705 else
2706 show_snapshot_percpu_help(m);
2707}
2708#else
2709/* Should never be called */
2710static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
2711#endif
2712
2403static int s_show(struct seq_file *m, void *v) 2713static int s_show(struct seq_file *m, void *v)
2404{ 2714{
2405 struct trace_iterator *iter = v; 2715 struct trace_iterator *iter = v;
@@ -2411,7 +2721,9 @@ static int s_show(struct seq_file *m, void *v)
2411 seq_puts(m, "#\n"); 2721 seq_puts(m, "#\n");
2412 test_ftrace_alive(m); 2722 test_ftrace_alive(m);
2413 } 2723 }
2414 if (iter->trace && iter->trace->print_header) 2724 if (iter->snapshot && trace_empty(iter))
2725 print_snapshot_help(m, iter);
2726 else if (iter->trace && iter->trace->print_header)
2415 iter->trace->print_header(m); 2727 iter->trace->print_header(m);
2416 else 2728 else
2417 trace_default_header(m); 2729 trace_default_header(m);
@@ -2452,7 +2764,8 @@ static const struct seq_operations tracer_seq_ops = {
2452static struct trace_iterator * 2764static struct trace_iterator *
2453__tracing_open(struct inode *inode, struct file *file, bool snapshot) 2765__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2454{ 2766{
2455 long cpu_file = (long) inode->i_private; 2767 struct trace_cpu *tc = inode->i_private;
2768 struct trace_array *tr = tc->tr;
2456 struct trace_iterator *iter; 2769 struct trace_iterator *iter;
2457 int cpu; 2770 int cpu;
2458 2771
@@ -2477,26 +2790,31 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2477 if (!iter->trace) 2790 if (!iter->trace)
2478 goto fail; 2791 goto fail;
2479 2792
2480 *iter->trace = *current_trace; 2793 *iter->trace = *tr->current_trace;
2481 2794
2482 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) 2795 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
2483 goto fail; 2796 goto fail;
2484 2797
2485 if (current_trace->print_max || snapshot) 2798 iter->tr = tr;
2486 iter->tr = &max_tr; 2799
2800#ifdef CONFIG_TRACER_MAX_TRACE
2801 /* Currently only the top directory has a snapshot */
2802 if (tr->current_trace->print_max || snapshot)
2803 iter->trace_buffer = &tr->max_buffer;
2487 else 2804 else
2488 iter->tr = &global_trace; 2805#endif
2806 iter->trace_buffer = &tr->trace_buffer;
2489 iter->snapshot = snapshot; 2807 iter->snapshot = snapshot;
2490 iter->pos = -1; 2808 iter->pos = -1;
2491 mutex_init(&iter->mutex); 2809 mutex_init(&iter->mutex);
2492 iter->cpu_file = cpu_file; 2810 iter->cpu_file = tc->cpu;
2493 2811
2494 /* Notify the tracer early; before we stop tracing. */ 2812 /* Notify the tracer early; before we stop tracing. */
2495 if (iter->trace && iter->trace->open) 2813 if (iter->trace && iter->trace->open)
2496 iter->trace->open(iter); 2814 iter->trace->open(iter);
2497 2815
2498 /* Annotate start of buffers if we had overruns */ 2816 /* Annotate start of buffers if we had overruns */
2499 if (ring_buffer_overruns(iter->tr->buffer)) 2817 if (ring_buffer_overruns(iter->trace_buffer->buffer))
2500 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2818 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2501 2819
2502 /* Output in nanoseconds only if we are using a clock in nanoseconds. */ 2820 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -2505,12 +2823,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2505 2823
2506 /* stop the trace while dumping if we are not opening "snapshot" */ 2824 /* stop the trace while dumping if we are not opening "snapshot" */
2507 if (!iter->snapshot) 2825 if (!iter->snapshot)
2508 tracing_stop(); 2826 tracing_stop_tr(tr);
2509 2827
2510 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2828 if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
2511 for_each_tracing_cpu(cpu) { 2829 for_each_tracing_cpu(cpu) {
2512 iter->buffer_iter[cpu] = 2830 iter->buffer_iter[cpu] =
2513 ring_buffer_read_prepare(iter->tr->buffer, cpu); 2831 ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
2514 } 2832 }
2515 ring_buffer_read_prepare_sync(); 2833 ring_buffer_read_prepare_sync();
2516 for_each_tracing_cpu(cpu) { 2834 for_each_tracing_cpu(cpu) {
@@ -2520,12 +2838,14 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2520 } else { 2838 } else {
2521 cpu = iter->cpu_file; 2839 cpu = iter->cpu_file;
2522 iter->buffer_iter[cpu] = 2840 iter->buffer_iter[cpu] =
2523 ring_buffer_read_prepare(iter->tr->buffer, cpu); 2841 ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
2524 ring_buffer_read_prepare_sync(); 2842 ring_buffer_read_prepare_sync();
2525 ring_buffer_read_start(iter->buffer_iter[cpu]); 2843 ring_buffer_read_start(iter->buffer_iter[cpu]);
2526 tracing_iter_reset(iter, cpu); 2844 tracing_iter_reset(iter, cpu);
2527 } 2845 }
2528 2846
2847 tr->ref++;
2848
2529 mutex_unlock(&trace_types_lock); 2849 mutex_unlock(&trace_types_lock);
2530 2850
2531 return iter; 2851 return iter;
@@ -2552,14 +2872,20 @@ static int tracing_release(struct inode *inode, struct file *file)
2552{ 2872{
2553 struct seq_file *m = file->private_data; 2873 struct seq_file *m = file->private_data;
2554 struct trace_iterator *iter; 2874 struct trace_iterator *iter;
2875 struct trace_array *tr;
2555 int cpu; 2876 int cpu;
2556 2877
2557 if (!(file->f_mode & FMODE_READ)) 2878 if (!(file->f_mode & FMODE_READ))
2558 return 0; 2879 return 0;
2559 2880
2560 iter = m->private; 2881 iter = m->private;
2882 tr = iter->tr;
2561 2883
2562 mutex_lock(&trace_types_lock); 2884 mutex_lock(&trace_types_lock);
2885
2886 WARN_ON(!tr->ref);
2887 tr->ref--;
2888
2563 for_each_tracing_cpu(cpu) { 2889 for_each_tracing_cpu(cpu) {
2564 if (iter->buffer_iter[cpu]) 2890 if (iter->buffer_iter[cpu])
2565 ring_buffer_read_finish(iter->buffer_iter[cpu]); 2891 ring_buffer_read_finish(iter->buffer_iter[cpu]);
@@ -2570,7 +2896,7 @@ static int tracing_release(struct inode *inode, struct file *file)
2570 2896
2571 if (!iter->snapshot) 2897 if (!iter->snapshot)
2572 /* reenable tracing if it was previously enabled */ 2898 /* reenable tracing if it was previously enabled */
2573 tracing_start(); 2899 tracing_start_tr(tr);
2574 mutex_unlock(&trace_types_lock); 2900 mutex_unlock(&trace_types_lock);
2575 2901
2576 mutex_destroy(&iter->mutex); 2902 mutex_destroy(&iter->mutex);
@@ -2589,12 +2915,13 @@ static int tracing_open(struct inode *inode, struct file *file)
2589 /* If this file was open for write, then erase contents */ 2915 /* If this file was open for write, then erase contents */
2590 if ((file->f_mode & FMODE_WRITE) && 2916 if ((file->f_mode & FMODE_WRITE) &&
2591 (file->f_flags & O_TRUNC)) { 2917 (file->f_flags & O_TRUNC)) {
2592 long cpu = (long) inode->i_private; 2918 struct trace_cpu *tc = inode->i_private;
2919 struct trace_array *tr = tc->tr;
2593 2920
2594 if (cpu == TRACE_PIPE_ALL_CPU) 2921 if (tc->cpu == RING_BUFFER_ALL_CPUS)
2595 tracing_reset_online_cpus(&global_trace); 2922 tracing_reset_online_cpus(&tr->trace_buffer);
2596 else 2923 else
2597 tracing_reset(&global_trace, cpu); 2924 tracing_reset(&tr->trace_buffer, tc->cpu);
2598 } 2925 }
2599 2926
2600 if (file->f_mode & FMODE_READ) { 2927 if (file->f_mode & FMODE_READ) {
@@ -2741,8 +3068,9 @@ static ssize_t
2741tracing_cpumask_write(struct file *filp, const char __user *ubuf, 3068tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2742 size_t count, loff_t *ppos) 3069 size_t count, loff_t *ppos)
2743{ 3070{
2744 int err, cpu; 3071 struct trace_array *tr = filp->private_data;
2745 cpumask_var_t tracing_cpumask_new; 3072 cpumask_var_t tracing_cpumask_new;
3073 int err, cpu;
2746 3074
2747 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) 3075 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2748 return -ENOMEM; 3076 return -ENOMEM;
@@ -2762,13 +3090,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2762 */ 3090 */
2763 if (cpumask_test_cpu(cpu, tracing_cpumask) && 3091 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2764 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3092 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2765 atomic_inc(&global_trace.data[cpu]->disabled); 3093 atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
2766 ring_buffer_record_disable_cpu(global_trace.buffer, cpu); 3094 ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
2767 } 3095 }
2768 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 3096 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2769 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3097 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2770 atomic_dec(&global_trace.data[cpu]->disabled); 3098 atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
2771 ring_buffer_record_enable_cpu(global_trace.buffer, cpu); 3099 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
2772 } 3100 }
2773 } 3101 }
2774 arch_spin_unlock(&ftrace_max_lock); 3102 arch_spin_unlock(&ftrace_max_lock);
@@ -2797,12 +3125,13 @@ static const struct file_operations tracing_cpumask_fops = {
2797static int tracing_trace_options_show(struct seq_file *m, void *v) 3125static int tracing_trace_options_show(struct seq_file *m, void *v)
2798{ 3126{
2799 struct tracer_opt *trace_opts; 3127 struct tracer_opt *trace_opts;
3128 struct trace_array *tr = m->private;
2800 u32 tracer_flags; 3129 u32 tracer_flags;
2801 int i; 3130 int i;
2802 3131
2803 mutex_lock(&trace_types_lock); 3132 mutex_lock(&trace_types_lock);
2804 tracer_flags = current_trace->flags->val; 3133 tracer_flags = tr->current_trace->flags->val;
2805 trace_opts = current_trace->flags->opts; 3134 trace_opts = tr->current_trace->flags->opts;
2806 3135
2807 for (i = 0; trace_options[i]; i++) { 3136 for (i = 0; trace_options[i]; i++) {
2808 if (trace_flags & (1 << i)) 3137 if (trace_flags & (1 << i))
@@ -2857,11 +3186,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2857 return -EINVAL; 3186 return -EINVAL;
2858} 3187}
2859 3188
2860static void set_tracer_flags(unsigned int mask, int enabled) 3189/* Some tracers require overwrite to stay enabled */
3190int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
3191{
3192 if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
3193 return -1;
3194
3195 return 0;
3196}
3197
3198int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
2861{ 3199{
2862 /* do nothing if flag is already set */ 3200 /* do nothing if flag is already set */
2863 if (!!(trace_flags & mask) == !!enabled) 3201 if (!!(trace_flags & mask) == !!enabled)
2864 return; 3202 return 0;
3203
3204 /* Give the tracer a chance to approve the change */
3205 if (tr->current_trace->flag_changed)
3206 if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled))
3207 return -EINVAL;
2865 3208
2866 if (enabled) 3209 if (enabled)
2867 trace_flags |= mask; 3210 trace_flags |= mask;
@@ -2871,18 +3214,24 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2871 if (mask == TRACE_ITER_RECORD_CMD) 3214 if (mask == TRACE_ITER_RECORD_CMD)
2872 trace_event_enable_cmd_record(enabled); 3215 trace_event_enable_cmd_record(enabled);
2873 3216
2874 if (mask == TRACE_ITER_OVERWRITE) 3217 if (mask == TRACE_ITER_OVERWRITE) {
2875 ring_buffer_change_overwrite(global_trace.buffer, enabled); 3218 ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
3219#ifdef CONFIG_TRACER_MAX_TRACE
3220 ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
3221#endif
3222 }
2876 3223
2877 if (mask == TRACE_ITER_PRINTK) 3224 if (mask == TRACE_ITER_PRINTK)
2878 trace_printk_start_stop_comm(enabled); 3225 trace_printk_start_stop_comm(enabled);
3226
3227 return 0;
2879} 3228}
2880 3229
2881static int trace_set_options(char *option) 3230static int trace_set_options(struct trace_array *tr, char *option)
2882{ 3231{
2883 char *cmp; 3232 char *cmp;
2884 int neg = 0; 3233 int neg = 0;
2885 int ret = 0; 3234 int ret = -ENODEV;
2886 int i; 3235 int i;
2887 3236
2888 cmp = strstrip(option); 3237 cmp = strstrip(option);
@@ -2892,19 +3241,20 @@ static int trace_set_options(char *option)
2892 cmp += 2; 3241 cmp += 2;
2893 } 3242 }
2894 3243
3244 mutex_lock(&trace_types_lock);
3245
2895 for (i = 0; trace_options[i]; i++) { 3246 for (i = 0; trace_options[i]; i++) {
2896 if (strcmp(cmp, trace_options[i]) == 0) { 3247 if (strcmp(cmp, trace_options[i]) == 0) {
2897 set_tracer_flags(1 << i, !neg); 3248 ret = set_tracer_flag(tr, 1 << i, !neg);
2898 break; 3249 break;
2899 } 3250 }
2900 } 3251 }
2901 3252
2902 /* If no option could be set, test the specific tracer options */ 3253 /* If no option could be set, test the specific tracer options */
2903 if (!trace_options[i]) { 3254 if (!trace_options[i])
2904 mutex_lock(&trace_types_lock); 3255 ret = set_tracer_option(tr->current_trace, cmp, neg);
2905 ret = set_tracer_option(current_trace, cmp, neg); 3256
2906 mutex_unlock(&trace_types_lock); 3257 mutex_unlock(&trace_types_lock);
2907 }
2908 3258
2909 return ret; 3259 return ret;
2910} 3260}
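
For context (illustrative, not from this patch): a tracer that needs ring-buffer overwrite to stay enabled can point the new flag_changed callback at trace_keep_overwrite(), so set_tracer_flag() above rejects clearing the option while that tracer is active. All fields other than .flag_changed are hypothetical.

static struct tracer my_latency_tracer __read_mostly = {
	.name		= "my_latency",		/* hypothetical */
	.init		= my_latency_init,	/* hypothetical */
	.reset		= my_latency_reset,	/* hypothetical */
	.flag_changed	= trace_keep_overwrite,
	.use_max_tr	= true,
};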
@@ -2913,7 +3263,10 @@ static ssize_t
2913tracing_trace_options_write(struct file *filp, const char __user *ubuf, 3263tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2914 size_t cnt, loff_t *ppos) 3264 size_t cnt, loff_t *ppos)
2915{ 3265{
3266 struct seq_file *m = filp->private_data;
3267 struct trace_array *tr = m->private;
2916 char buf[64]; 3268 char buf[64];
3269 int ret;
2917 3270
2918 if (cnt >= sizeof(buf)) 3271 if (cnt >= sizeof(buf))
2919 return -EINVAL; 3272 return -EINVAL;
@@ -2923,7 +3276,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2923 3276
2924 buf[cnt] = 0; 3277 buf[cnt] = 0;
2925 3278
2926 trace_set_options(buf); 3279 ret = trace_set_options(tr, buf);
3280 if (ret < 0)
3281 return ret;
2927 3282
2928 *ppos += cnt; 3283 *ppos += cnt;
2929 3284
@@ -2934,7 +3289,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file)
2934{ 3289{
2935 if (tracing_disabled) 3290 if (tracing_disabled)
2936 return -ENODEV; 3291 return -ENODEV;
2937 return single_open(file, tracing_trace_options_show, NULL); 3292
3293 return single_open(file, tracing_trace_options_show, inode->i_private);
2938} 3294}
2939 3295
2940static const struct file_operations tracing_iter_fops = { 3296static const struct file_operations tracing_iter_fops = {
@@ -2947,20 +3303,84 @@ static const struct file_operations tracing_iter_fops = {
2947 3303
2948static const char readme_msg[] = 3304static const char readme_msg[] =
2949 "tracing mini-HOWTO:\n\n" 3305 "tracing mini-HOWTO:\n\n"
2950 "# mount -t debugfs nodev /sys/kernel/debug\n\n" 3306 "# echo 0 > tracing_on : quick way to disable tracing\n"
2951 "# cat /sys/kernel/debug/tracing/available_tracers\n" 3307 "# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
2952 "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" 3308 " Important files:\n"
2953 "# cat /sys/kernel/debug/tracing/current_tracer\n" 3309 " trace\t\t\t- The static contents of the buffer\n"
2954 "nop\n" 3310 "\t\t\t To clear the buffer write into this file: echo > trace\n"
2955 "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" 3311 " trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
2956 "# cat /sys/kernel/debug/tracing/current_tracer\n" 3312 " current_tracer\t- function and latency tracers\n"
2957 "wakeup\n" 3313 " available_tracers\t- list of configured tracers for current_tracer\n"
2958 "# cat /sys/kernel/debug/tracing/trace_options\n" 3314 " buffer_size_kb\t- view and modify size of per cpu buffer\n"
2959 "noprint-parent nosym-offset nosym-addr noverbose\n" 3315 " buffer_total_size_kb - view total size of all cpu buffers\n\n"
2960 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 3316 " trace_clock\t\t-change the clock used to order events\n"
2961 "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" 3317 " local: Per cpu clock but may not be synced across CPUs\n"
2962 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" 3318 " global: Synced across CPUs but slows tracing down.\n"
2963 "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" 3319 " counter: Not a clock, but just an increment\n"
3320 " uptime: Jiffy counter from time of boot\n"
3321 " perf: Same clock that perf events use\n"
3322#ifdef CONFIG_X86_64
3323 " x86-tsc: TSC cycle counter\n"
3324#endif
3325 "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
3326 " tracing_cpumask\t- Limit which CPUs to trace\n"
3327 " instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
3328 "\t\t\t Remove sub-buffer with rmdir\n"
3329 " trace_options\t\t- Set format or modify how tracing happens\n"
3330 "\t\t\t Disable an option by adding a suffix 'no' to the option name\n"
3331#ifdef CONFIG_DYNAMIC_FTRACE
3332 "\n available_filter_functions - list of functions that can be filtered on\n"
3333 " set_ftrace_filter\t- echo function name in here to only trace these functions\n"
3334 " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
3335 " modules: Can select a group via module\n"
3336 " Format: :mod:<module-name>\n"
3337 " example: echo :mod:ext3 > set_ftrace_filter\n"
3338 " triggers: a command to perform when function is hit\n"
3339 " Format: <function>:<trigger>[:count]\n"
3340 " trigger: traceon, traceoff\n"
3341 " enable_event:<system>:<event>\n"
3342 " disable_event:<system>:<event>\n"
3343#ifdef CONFIG_STACKTRACE
3344 " stacktrace\n"
3345#endif
3346#ifdef CONFIG_TRACER_SNAPSHOT
3347 " snapshot\n"
3348#endif
3349 " example: echo do_fault:traceoff > set_ftrace_filter\n"
3350 " echo do_trap:traceoff:3 > set_ftrace_filter\n"
3351 " The first one will disable tracing every time do_fault is hit\n"
3352 " The second will disable tracing at most 3 times when do_trap is hit\n"
3353 " The first time do trap is hit and it disables tracing, the counter\n"
3354 " will decrement to 2. If tracing is already disabled, the counter\n"
3355 " will not decrement. It only decrements when the trigger did work\n"
3356 " To remove trigger without count:\n"
3357 " echo '!<function>:<trigger> > set_ftrace_filter\n"
3358 " To remove trigger with a count:\n"
3359 " echo '!<function>:<trigger>:0 > set_ftrace_filter\n"
3360 " set_ftrace_notrace\t- echo function name in here to never trace.\n"
3361 " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
3362 " modules: Can select a group via module command :mod:\n"
3363 " Does not accept triggers\n"
3364#endif /* CONFIG_DYNAMIC_FTRACE */
3365#ifdef CONFIG_FUNCTION_TRACER
3366 " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n"
3367#endif
3368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3369 " set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
3370 " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
3371#endif
3372#ifdef CONFIG_TRACER_SNAPSHOT
3373 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n"
3374 "\t\t\t Read the contents for more information\n"
3375#endif
3376#ifdef CONFIG_STACKTRACE
3377 " stack_trace\t\t- Shows the max stack trace when active\n"
3378 " stack_max_size\t- Shows current max stack size that was traced\n"
3379 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n"
3380#ifdef CONFIG_DYNAMIC_FTRACE
3381 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n"
3382#endif
3383#endif /* CONFIG_STACKTRACE */
2964; 3384;
2965 3385
2966static ssize_t 3386static ssize_t
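
Since the new mini-HOWTO above no longer walks through a full command sequence, here is a small self-contained userspace sketch (not part of the patch) that drives the documented files; it assumes debugfs is mounted at /sys/kernel/debug and that the caller has permission to write there.

#include <stdio.h>

#define TRACING "/sys/kernel/debug/tracing/"

static int write_tracing_file(const char *name, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), TRACING "%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	write_tracing_file("trace", "");		/* clear the buffer */
	write_tracing_file("current_tracer", "function");
	write_tracing_file("tracing_on", "1");		/* quick way to re-enable tracing */
	/* ... run the workload of interest here ... */
	write_tracing_file("tracing_on", "0");		/* quick way to disable tracing */
	return 0;
}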
@@ -3032,11 +3452,12 @@ static ssize_t
3032tracing_set_trace_read(struct file *filp, char __user *ubuf, 3452tracing_set_trace_read(struct file *filp, char __user *ubuf,
3033 size_t cnt, loff_t *ppos) 3453 size_t cnt, loff_t *ppos)
3034{ 3454{
3455 struct trace_array *tr = filp->private_data;
3035 char buf[MAX_TRACER_SIZE+2]; 3456 char buf[MAX_TRACER_SIZE+2];
3036 int r; 3457 int r;
3037 3458
3038 mutex_lock(&trace_types_lock); 3459 mutex_lock(&trace_types_lock);
3039 r = sprintf(buf, "%s\n", current_trace->name); 3460 r = sprintf(buf, "%s\n", tr->current_trace->name);
3040 mutex_unlock(&trace_types_lock); 3461 mutex_unlock(&trace_types_lock);
3041 3462
3042 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3463 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3044,43 +3465,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
3044 3465
3045int tracer_init(struct tracer *t, struct trace_array *tr) 3466int tracer_init(struct tracer *t, struct trace_array *tr)
3046{ 3467{
3047 tracing_reset_online_cpus(tr); 3468 tracing_reset_online_cpus(&tr->trace_buffer);
3048 return t->init(tr); 3469 return t->init(tr);
3049} 3470}
3050 3471
3051static void set_buffer_entries(struct trace_array *tr, unsigned long val) 3472static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
3052{ 3473{
3053 int cpu; 3474 int cpu;
3475
3054 for_each_tracing_cpu(cpu) 3476 for_each_tracing_cpu(cpu)
3055 tr->data[cpu]->entries = val; 3477 per_cpu_ptr(buf->data, cpu)->entries = val;
3056} 3478}
3057 3479
3480#ifdef CONFIG_TRACER_MAX_TRACE
3058/* resize @tr's buffer to the size of @size_tr's entries */ 3481/* resize @tr's buffer to the size of @size_tr's entries */
3059static int resize_buffer_duplicate_size(struct trace_array *tr, 3482static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
3060 struct trace_array *size_tr, int cpu_id) 3483 struct trace_buffer *size_buf, int cpu_id)
3061{ 3484{
3062 int cpu, ret = 0; 3485 int cpu, ret = 0;
3063 3486
3064 if (cpu_id == RING_BUFFER_ALL_CPUS) { 3487 if (cpu_id == RING_BUFFER_ALL_CPUS) {
3065 for_each_tracing_cpu(cpu) { 3488 for_each_tracing_cpu(cpu) {
3066 ret = ring_buffer_resize(tr->buffer, 3489 ret = ring_buffer_resize(trace_buf->buffer,
3067 size_tr->data[cpu]->entries, cpu); 3490 per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
3068 if (ret < 0) 3491 if (ret < 0)
3069 break; 3492 break;
3070 tr->data[cpu]->entries = size_tr->data[cpu]->entries; 3493 per_cpu_ptr(trace_buf->data, cpu)->entries =
3494 per_cpu_ptr(size_buf->data, cpu)->entries;
3071 } 3495 }
3072 } else { 3496 } else {
3073 ret = ring_buffer_resize(tr->buffer, 3497 ret = ring_buffer_resize(trace_buf->buffer,
3074 size_tr->data[cpu_id]->entries, cpu_id); 3498 per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
3075 if (ret == 0) 3499 if (ret == 0)
3076 tr->data[cpu_id]->entries = 3500 per_cpu_ptr(trace_buf->data, cpu_id)->entries =
3077 size_tr->data[cpu_id]->entries; 3501 per_cpu_ptr(size_buf->data, cpu_id)->entries;
3078 } 3502 }
3079 3503
3080 return ret; 3504 return ret;
3081} 3505}
3506#endif /* CONFIG_TRACER_MAX_TRACE */
3082 3507
3083static int __tracing_resize_ring_buffer(unsigned long size, int cpu) 3508static int __tracing_resize_ring_buffer(struct trace_array *tr,
3509 unsigned long size, int cpu)
3084{ 3510{
3085 int ret; 3511 int ret;
3086 3512
@@ -3089,23 +3515,25 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3089 * we use the size that was given, and we can forget about 3515 * we use the size that was given, and we can forget about
3090 * expanding it later. 3516 * expanding it later.
3091 */ 3517 */
3092 ring_buffer_expanded = 1; 3518 ring_buffer_expanded = true;
3093 3519
3094 /* May be called before buffers are initialized */ 3520 /* May be called before buffers are initialized */
3095 if (!global_trace.buffer) 3521 if (!tr->trace_buffer.buffer)
3096 return 0; 3522 return 0;
3097 3523
3098 ret = ring_buffer_resize(global_trace.buffer, size, cpu); 3524 ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);
3099 if (ret < 0) 3525 if (ret < 0)
3100 return ret; 3526 return ret;
3101 3527
3102 if (!current_trace->use_max_tr) 3528#ifdef CONFIG_TRACER_MAX_TRACE
3529 if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
3530 !tr->current_trace->use_max_tr)
3103 goto out; 3531 goto out;
3104 3532
3105 ret = ring_buffer_resize(max_tr.buffer, size, cpu); 3533 ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
3106 if (ret < 0) { 3534 if (ret < 0) {
3107 int r = resize_buffer_duplicate_size(&global_trace, 3535 int r = resize_buffer_duplicate_size(&tr->trace_buffer,
3108 &global_trace, cpu); 3536 &tr->trace_buffer, cpu);
3109 if (r < 0) { 3537 if (r < 0) {
3110 /* 3538 /*
3111 * AARGH! We are left with different 3539 * AARGH! We are left with different
@@ -3128,20 +3556,23 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3128 } 3556 }
3129 3557
3130 if (cpu == RING_BUFFER_ALL_CPUS) 3558 if (cpu == RING_BUFFER_ALL_CPUS)
3131 set_buffer_entries(&max_tr, size); 3559 set_buffer_entries(&tr->max_buffer, size);
3132 else 3560 else
3133 max_tr.data[cpu]->entries = size; 3561 per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;
3134 3562
3135 out: 3563 out:
3564#endif /* CONFIG_TRACER_MAX_TRACE */
3565
3136 if (cpu == RING_BUFFER_ALL_CPUS) 3566 if (cpu == RING_BUFFER_ALL_CPUS)
3137 set_buffer_entries(&global_trace, size); 3567 set_buffer_entries(&tr->trace_buffer, size);
3138 else 3568 else
3139 global_trace.data[cpu]->entries = size; 3569 per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;
3140 3570
3141 return ret; 3571 return ret;
3142} 3572}
3143 3573
3144static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) 3574static ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
3575 unsigned long size, int cpu_id)
3145{ 3576{
3146 int ret = size; 3577 int ret = size;
3147 3578
@@ -3155,7 +3586,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
3155 } 3586 }
3156 } 3587 }
3157 3588
3158 ret = __tracing_resize_ring_buffer(size, cpu_id); 3589 ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
3159 if (ret < 0) 3590 if (ret < 0)
3160 ret = -ENOMEM; 3591 ret = -ENOMEM;
3161 3592
@@ -3182,7 +3613,7 @@ int tracing_update_buffers(void)
3182 3613
3183 mutex_lock(&trace_types_lock); 3614 mutex_lock(&trace_types_lock);
3184 if (!ring_buffer_expanded) 3615 if (!ring_buffer_expanded)
3185 ret = __tracing_resize_ring_buffer(trace_buf_size, 3616 ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
3186 RING_BUFFER_ALL_CPUS); 3617 RING_BUFFER_ALL_CPUS);
3187 mutex_unlock(&trace_types_lock); 3618 mutex_unlock(&trace_types_lock);
3188 3619
@@ -3192,7 +3623,7 @@ int tracing_update_buffers(void)
3192struct trace_option_dentry; 3623struct trace_option_dentry;
3193 3624
3194static struct trace_option_dentry * 3625static struct trace_option_dentry *
3195create_trace_option_files(struct tracer *tracer); 3626create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
3196 3627
3197static void 3628static void
3198destroy_trace_option_files(struct trace_option_dentry *topts); 3629destroy_trace_option_files(struct trace_option_dentry *topts);
@@ -3202,13 +3633,15 @@ static int tracing_set_tracer(const char *buf)
3202 static struct trace_option_dentry *topts; 3633 static struct trace_option_dentry *topts;
3203 struct trace_array *tr = &global_trace; 3634 struct trace_array *tr = &global_trace;
3204 struct tracer *t; 3635 struct tracer *t;
3636#ifdef CONFIG_TRACER_MAX_TRACE
3205 bool had_max_tr; 3637 bool had_max_tr;
3638#endif
3206 int ret = 0; 3639 int ret = 0;
3207 3640
3208 mutex_lock(&trace_types_lock); 3641 mutex_lock(&trace_types_lock);
3209 3642
3210 if (!ring_buffer_expanded) { 3643 if (!ring_buffer_expanded) {
3211 ret = __tracing_resize_ring_buffer(trace_buf_size, 3644 ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
3212 RING_BUFFER_ALL_CPUS); 3645 RING_BUFFER_ALL_CPUS);
3213 if (ret < 0) 3646 if (ret < 0)
3214 goto out; 3647 goto out;
@@ -3223,15 +3656,21 @@ static int tracing_set_tracer(const char *buf)
3223 ret = -EINVAL; 3656 ret = -EINVAL;
3224 goto out; 3657 goto out;
3225 } 3658 }
3226 if (t == current_trace) 3659 if (t == tr->current_trace)
3227 goto out; 3660 goto out;
3228 3661
3229 trace_branch_disable(); 3662 trace_branch_disable();
3230 if (current_trace->reset)
3231 current_trace->reset(tr);
3232 3663
3233 had_max_tr = current_trace->allocated_snapshot; 3664 tr->current_trace->enabled = false;
3234 current_trace = &nop_trace; 3665
3666 if (tr->current_trace->reset)
3667 tr->current_trace->reset(tr);
3668
3669 /* Current trace needs to be nop_trace before synchronize_sched */
3670 tr->current_trace = &nop_trace;
3671
3672#ifdef CONFIG_TRACER_MAX_TRACE
3673 had_max_tr = tr->allocated_snapshot;
3235 3674
3236 if (had_max_tr && !t->use_max_tr) { 3675 if (had_max_tr && !t->use_max_tr) {
3237 /* 3676 /*
@@ -3242,27 +3681,20 @@ static int tracing_set_tracer(const char *buf)
3242 * so a synchronized_sched() is sufficient. 3681 * so a synchronized_sched() is sufficient.
3243 */ 3682 */
3244 synchronize_sched(); 3683 synchronize_sched();
3245 /* 3684 free_snapshot(tr);
3246 * We don't free the ring buffer. instead, resize it because
3247 * The max_tr ring buffer has some state (e.g. ring->clock) and
3248 * we want preserve it.
3249 */
3250 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3251 set_buffer_entries(&max_tr, 1);
3252 tracing_reset_online_cpus(&max_tr);
3253 current_trace->allocated_snapshot = false;
3254 } 3685 }
3686#endif
3255 destroy_trace_option_files(topts); 3687 destroy_trace_option_files(topts);
3256 3688
3257 topts = create_trace_option_files(t); 3689 topts = create_trace_option_files(tr, t);
3690
3691#ifdef CONFIG_TRACER_MAX_TRACE
3258 if (t->use_max_tr && !had_max_tr) { 3692 if (t->use_max_tr && !had_max_tr) {
3259 /* we need to make per cpu buffer sizes equivalent */ 3693 ret = alloc_snapshot(tr);
3260 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3261 RING_BUFFER_ALL_CPUS);
3262 if (ret < 0) 3694 if (ret < 0)
3263 goto out; 3695 goto out;
3264 t->allocated_snapshot = true;
3265 } 3696 }
3697#endif
3266 3698
3267 if (t->init) { 3699 if (t->init) {
3268 ret = tracer_init(t, tr); 3700 ret = tracer_init(t, tr);
@@ -3270,7 +3702,8 @@ static int tracing_set_tracer(const char *buf)
3270 goto out; 3702 goto out;
3271 } 3703 }
3272 3704
3273 current_trace = t; 3705 tr->current_trace = t;
3706 tr->current_trace->enabled = true;
3274 trace_branch_enable(tr); 3707 trace_branch_enable(tr);
3275 out: 3708 out:
3276 mutex_unlock(&trace_types_lock); 3709 mutex_unlock(&trace_types_lock);
@@ -3344,7 +3777,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3344 3777
3345static int tracing_open_pipe(struct inode *inode, struct file *filp) 3778static int tracing_open_pipe(struct inode *inode, struct file *filp)
3346{ 3779{
3347 long cpu_file = (long) inode->i_private; 3780 struct trace_cpu *tc = inode->i_private;
3781 struct trace_array *tr = tc->tr;
3348 struct trace_iterator *iter; 3782 struct trace_iterator *iter;
3349 int ret = 0; 3783 int ret = 0;
3350 3784
@@ -3369,7 +3803,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3369 ret = -ENOMEM; 3803 ret = -ENOMEM;
3370 goto fail; 3804 goto fail;
3371 } 3805 }
3372 *iter->trace = *current_trace; 3806 *iter->trace = *tr->current_trace;
3373 3807
3374 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 3808 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3375 ret = -ENOMEM; 3809 ret = -ENOMEM;
@@ -3386,8 +3820,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3386 if (trace_clocks[trace_clock_id].in_ns) 3820 if (trace_clocks[trace_clock_id].in_ns)
3387 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 3821 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3388 3822
3389 iter->cpu_file = cpu_file; 3823 iter->cpu_file = tc->cpu;
3390 iter->tr = &global_trace; 3824 iter->tr = tc->tr;
3825 iter->trace_buffer = &tc->tr->trace_buffer;
3391 mutex_init(&iter->mutex); 3826 mutex_init(&iter->mutex);
3392 filp->private_data = iter; 3827 filp->private_data = iter;
3393 3828
@@ -3426,24 +3861,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
3426} 3861}
3427 3862
3428static unsigned int 3863static unsigned int
3429tracing_poll_pipe(struct file *filp, poll_table *poll_table) 3864trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
3430{ 3865{
3431 struct trace_iterator *iter = filp->private_data; 3866 /* Iterators are static, they should be filled or empty */
3867 if (trace_buffer_iter(iter, iter->cpu_file))
3868 return POLLIN | POLLRDNORM;
3432 3869
3433 if (trace_flags & TRACE_ITER_BLOCK) { 3870 if (trace_flags & TRACE_ITER_BLOCK)
3434 /* 3871 /*
3435 * Always select as readable when in blocking mode 3872 * Always select as readable when in blocking mode
3436 */ 3873 */
3437 return POLLIN | POLLRDNORM; 3874 return POLLIN | POLLRDNORM;
3438 } else { 3875 else
3439 if (!trace_empty(iter)) 3876 return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
3440 return POLLIN | POLLRDNORM; 3877 filp, poll_table);
3441 poll_wait(filp, &trace_wait, poll_table); 3878}
3442 if (!trace_empty(iter))
3443 return POLLIN | POLLRDNORM;
3444 3879
3445 return 0; 3880static unsigned int
3446 } 3881tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3882{
3883 struct trace_iterator *iter = filp->private_data;
3884
3885 return trace_poll(iter, filp, poll_table);
3447} 3886}
3448 3887
3449/* 3888/*
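
Illustrative only: because the poll logic now lives in the shared trace_poll() helper, any other file in trace.c whose private_data is a trace_iterator can delegate to it the same way tracing_poll_pipe() does above. The name below is hypothetical.

static unsigned int my_file_poll(struct file *filp, poll_table *poll_table)
{
	struct trace_iterator *iter = filp->private_data;

	return trace_poll(iter, filp, poll_table);
}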
@@ -3509,6 +3948,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3509 size_t cnt, loff_t *ppos) 3948 size_t cnt, loff_t *ppos)
3510{ 3949{
3511 struct trace_iterator *iter = filp->private_data; 3950 struct trace_iterator *iter = filp->private_data;
3951 struct trace_array *tr = iter->tr;
3512 ssize_t sret; 3952 ssize_t sret;
3513 3953
3514 /* return any leftover data */ 3954 /* return any leftover data */
@@ -3520,8 +3960,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3520 3960
3521 /* copy the tracer to avoid using a global lock all around */ 3961 /* copy the tracer to avoid using a global lock all around */
3522 mutex_lock(&trace_types_lock); 3962 mutex_lock(&trace_types_lock);
3523 if (unlikely(iter->trace->name != current_trace->name)) 3963 if (unlikely(iter->trace->name != tr->current_trace->name))
3524 *iter->trace = *current_trace; 3964 *iter->trace = *tr->current_trace;
3525 mutex_unlock(&trace_types_lock); 3965 mutex_unlock(&trace_types_lock);
3526 3966
3527 /* 3967 /*
@@ -3677,6 +4117,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3677 .ops = &tracing_pipe_buf_ops, 4117 .ops = &tracing_pipe_buf_ops,
3678 .spd_release = tracing_spd_release_pipe, 4118 .spd_release = tracing_spd_release_pipe,
3679 }; 4119 };
4120 struct trace_array *tr = iter->tr;
3680 ssize_t ret; 4121 ssize_t ret;
3681 size_t rem; 4122 size_t rem;
3682 unsigned int i; 4123 unsigned int i;
@@ -3686,8 +4127,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3686 4127
3687 /* copy the tracer to avoid using a global lock all around */ 4128 /* copy the tracer to avoid using a global lock all around */
3688 mutex_lock(&trace_types_lock); 4129 mutex_lock(&trace_types_lock);
3689 if (unlikely(iter->trace->name != current_trace->name)) 4130 if (unlikely(iter->trace->name != tr->current_trace->name))
3690 *iter->trace = *current_trace; 4131 *iter->trace = *tr->current_trace;
3691 mutex_unlock(&trace_types_lock); 4132 mutex_unlock(&trace_types_lock);
3692 4133
3693 mutex_lock(&iter->mutex); 4134 mutex_lock(&iter->mutex);
@@ -3749,43 +4190,19 @@ out_err:
3749 goto out; 4190 goto out;
3750} 4191}
3751 4192
3752struct ftrace_entries_info {
3753 struct trace_array *tr;
3754 int cpu;
3755};
3756
3757static int tracing_entries_open(struct inode *inode, struct file *filp)
3758{
3759 struct ftrace_entries_info *info;
3760
3761 if (tracing_disabled)
3762 return -ENODEV;
3763
3764 info = kzalloc(sizeof(*info), GFP_KERNEL);
3765 if (!info)
3766 return -ENOMEM;
3767
3768 info->tr = &global_trace;
3769 info->cpu = (unsigned long)inode->i_private;
3770
3771 filp->private_data = info;
3772
3773 return 0;
3774}
3775
3776static ssize_t 4193static ssize_t
3777tracing_entries_read(struct file *filp, char __user *ubuf, 4194tracing_entries_read(struct file *filp, char __user *ubuf,
3778 size_t cnt, loff_t *ppos) 4195 size_t cnt, loff_t *ppos)
3779{ 4196{
3780 struct ftrace_entries_info *info = filp->private_data; 4197 struct trace_cpu *tc = filp->private_data;
3781 struct trace_array *tr = info->tr; 4198 struct trace_array *tr = tc->tr;
3782 char buf[64]; 4199 char buf[64];
3783 int r = 0; 4200 int r = 0;
3784 ssize_t ret; 4201 ssize_t ret;
3785 4202
3786 mutex_lock(&trace_types_lock); 4203 mutex_lock(&trace_types_lock);
3787 4204
3788 if (info->cpu == RING_BUFFER_ALL_CPUS) { 4205 if (tc->cpu == RING_BUFFER_ALL_CPUS) {
3789 int cpu, buf_size_same; 4206 int cpu, buf_size_same;
3790 unsigned long size; 4207 unsigned long size;
3791 4208
@@ -3795,8 +4212,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
3795 for_each_tracing_cpu(cpu) { 4212 for_each_tracing_cpu(cpu) {
3796 /* fill in the size from first enabled cpu */ 4213 /* fill in the size from first enabled cpu */
3797 if (size == 0) 4214 if (size == 0)
3798 size = tr->data[cpu]->entries; 4215 size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries;
3799 if (size != tr->data[cpu]->entries) { 4216 if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {
3800 buf_size_same = 0; 4217 buf_size_same = 0;
3801 break; 4218 break;
3802 } 4219 }
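
The RING_BUFFER_ALL_CPUS branch above reports a single size only if every per-CPU buffer agrees, otherwise it falls back to printing "X". A small standalone model of that consistency check, with hypothetical per-CPU byte counts in place of per_cpu_ptr(tr->trace_buffer.data, cpu)->entries:

/* Report one size if every CPU ring buffer matches, else flag a mismatch. */
#include <stdio.h>

#define NR_CPUS 4

int main(void)
{
	unsigned long entries[NR_CPUS] = { 1441792, 1441792, 1441792, 1441792 };
	unsigned long size = 0;
	int cpu, buf_size_same = 1;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (size == 0)                 /* fill in the size from the first cpu */
			size = entries[cpu];
		if (size != entries[cpu]) {    /* any disagreement means no single answer */
			buf_size_same = 0;
			break;
		}
	}

	if (buf_size_same)
		printf("%lu\n", size >> 10);   /* KB, as buffer_size_kb reports */
	else
		printf("X\n");                 /* mirrors the 'X' the real file prints */
	return 0;
}
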
@@ -3812,7 +4229,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
3812 } else 4229 } else
3813 r = sprintf(buf, "X\n"); 4230 r = sprintf(buf, "X\n");
3814 } else 4231 } else
3815 r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); 4232 r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10);
3816 4233
3817 mutex_unlock(&trace_types_lock); 4234 mutex_unlock(&trace_types_lock);
3818 4235
@@ -3824,7 +4241,7 @@ static ssize_t
3824tracing_entries_write(struct file *filp, const char __user *ubuf, 4241tracing_entries_write(struct file *filp, const char __user *ubuf,
3825 size_t cnt, loff_t *ppos) 4242 size_t cnt, loff_t *ppos)
3826{ 4243{
3827 struct ftrace_entries_info *info = filp->private_data; 4244 struct trace_cpu *tc = filp->private_data;
3828 unsigned long val; 4245 unsigned long val;
3829 int ret; 4246 int ret;
3830 4247
@@ -3839,7 +4256,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3839 /* value is in KB */ 4256 /* value is in KB */
3840 val <<= 10; 4257 val <<= 10;
3841 4258
3842 ret = tracing_resize_ring_buffer(val, info->cpu); 4259 ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);
3843 if (ret < 0) 4260 if (ret < 0)
3844 return ret; 4261 return ret;
3845 4262
@@ -3848,16 +4265,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3848 return cnt; 4265 return cnt;
3849} 4266}
3850 4267
3851static int
3852tracing_entries_release(struct inode *inode, struct file *filp)
3853{
3854 struct ftrace_entries_info *info = filp->private_data;
3855
3856 kfree(info);
3857
3858 return 0;
3859}
3860
3861static ssize_t 4268static ssize_t
3862tracing_total_entries_read(struct file *filp, char __user *ubuf, 4269tracing_total_entries_read(struct file *filp, char __user *ubuf,
3863 size_t cnt, loff_t *ppos) 4270 size_t cnt, loff_t *ppos)
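
The deleted open/release pair above was only there to kzalloc and free a tiny tracker; with the per-CPU struct trace_cpu now embedded in the trace_array's per-cpu data, the file's private_data can simply borrow a pointer to state that already outlives the open. A rough userspace sketch of that idea (all names invented, offsets recovered container_of-style):

#include <stddef.h>

struct trace_cpu_model { void *tr; int cpu; };   /* lives inside the array for its whole lifetime */
struct cpu_data_model  { struct trace_cpu_model trace_cpu; unsigned long entries; };

static struct cpu_data_model per_cpu_data[4];    /* stands in for the per-cpu allocation */

/* "open": private_data is a borrowed pointer, so no release hook is needed. */
static void *entries_open(int cpu)
{
	return &per_cpu_data[cpu].trace_cpu;
}

static unsigned long entries_read(void *private_data)
{
	struct trace_cpu_model *tc = private_data;
	struct cpu_data_model *data =
		(struct cpu_data_model *)((char *)tc - offsetof(struct cpu_data_model, trace_cpu));
	return data->entries >> 10;              /* KB, as the real file reports */
}
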
@@ -3869,7 +4276,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
3869 4276
3870 mutex_lock(&trace_types_lock); 4277 mutex_lock(&trace_types_lock);
3871 for_each_tracing_cpu(cpu) { 4278 for_each_tracing_cpu(cpu) {
3872 size += tr->data[cpu]->entries >> 10; 4279 size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;
3873 if (!ring_buffer_expanded) 4280 if (!ring_buffer_expanded)
3874 expanded_size += trace_buf_size >> 10; 4281 expanded_size += trace_buf_size >> 10;
3875 } 4282 }
@@ -3899,11 +4306,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3899static int 4306static int
3900tracing_free_buffer_release(struct inode *inode, struct file *filp) 4307tracing_free_buffer_release(struct inode *inode, struct file *filp)
3901{ 4308{
4309 struct trace_array *tr = inode->i_private;
4310
3902 /* disable tracing ? */ 4311 /* disable tracing ? */
3903 if (trace_flags & TRACE_ITER_STOP_ON_FREE) 4312 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3904 tracing_off(); 4313 tracing_off();
3905 /* resize the ring buffer to 0 */ 4314 /* resize the ring buffer to 0 */
3906 tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); 4315 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
3907 4316
3908 return 0; 4317 return 0;
3909} 4318}
@@ -3972,7 +4381,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3972 4381
3973 local_save_flags(irq_flags); 4382 local_save_flags(irq_flags);
3974 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 4383 size = sizeof(*entry) + cnt + 2; /* possible \n added */
3975 buffer = global_trace.buffer; 4384 buffer = global_trace.trace_buffer.buffer;
3976 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 4385 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
3977 irq_flags, preempt_count()); 4386 irq_flags, preempt_count());
3978 if (!event) { 4387 if (!event) {
@@ -4014,13 +4423,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4014 4423
4015static int tracing_clock_show(struct seq_file *m, void *v) 4424static int tracing_clock_show(struct seq_file *m, void *v)
4016{ 4425{
4426 struct trace_array *tr = m->private;
4017 int i; 4427 int i;
4018 4428
4019 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) 4429 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
4020 seq_printf(m, 4430 seq_printf(m,
4021 "%s%s%s%s", i ? " " : "", 4431 "%s%s%s%s", i ? " " : "",
4022 i == trace_clock_id ? "[" : "", trace_clocks[i].name, 4432 i == tr->clock_id ? "[" : "", trace_clocks[i].name,
4023 i == trace_clock_id ? "]" : ""); 4433 i == tr->clock_id ? "]" : "");
4024 seq_putc(m, '\n'); 4434 seq_putc(m, '\n');
4025 4435
4026 return 0; 4436 return 0;
@@ -4029,6 +4439,8 @@ static int tracing_clock_show(struct seq_file *m, void *v)
4029static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 4439static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4030 size_t cnt, loff_t *fpos) 4440 size_t cnt, loff_t *fpos)
4031{ 4441{
4442 struct seq_file *m = filp->private_data;
4443 struct trace_array *tr = m->private;
4032 char buf[64]; 4444 char buf[64];
4033 const char *clockstr; 4445 const char *clockstr;
4034 int i; 4446 int i;
@@ -4050,20 +4462,23 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4050 if (i == ARRAY_SIZE(trace_clocks)) 4462 if (i == ARRAY_SIZE(trace_clocks))
4051 return -EINVAL; 4463 return -EINVAL;
4052 4464
4053 trace_clock_id = i;
4054
4055 mutex_lock(&trace_types_lock); 4465 mutex_lock(&trace_types_lock);
4056 4466
4057 ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); 4467 tr->clock_id = i;
4058 if (max_tr.buffer) 4468
4059 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); 4469 ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);
4060 4470
4061 /* 4471 /*
4062 * New clock may not be consistent with the previous clock. 4472 * New clock may not be consistent with the previous clock.
4063 * Reset the buffer so that it doesn't have incomparable timestamps. 4473 * Reset the buffer so that it doesn't have incomparable timestamps.
4064 */ 4474 */
4065 tracing_reset_online_cpus(&global_trace); 4475 tracing_reset_online_cpus(&global_trace.trace_buffer);
4066 tracing_reset_online_cpus(&max_tr); 4476
4477#ifdef CONFIG_TRACER_MAX_TRACE
4478 if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
4479 ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
4480 tracing_reset_online_cpus(&global_trace.max_buffer);
4481#endif
4067 4482
4068 mutex_unlock(&trace_types_lock); 4483 mutex_unlock(&trace_types_lock);
4069 4484
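
tracing_clock_write above resolves the user string against trace_clocks[], records the index on the trace_array, switches the ring buffer clock, and then resets the buffers because timestamps taken with different clocks are not comparable. A compilable sketch of just the lookup step, against an abbreviated, hypothetical clock table:

/* Resolve a user-supplied clock name against a fixed table. */
#include <stdio.h>
#include <string.h>

struct clock_entry { const char *name; int in_ns; };

static const struct clock_entry clocks[] = {
	{ "local",   1 },
	{ "global",  1 },
	{ "counter", 0 },
};

static int clock_index(const char *str)
{
	size_t i;

	for (i = 0; i < sizeof(clocks) / sizeof(clocks[0]); i++)
		if (strcmp(clocks[i].name, str) == 0)
			return (int)i;
	return -1;                       /* the caller maps this to -EINVAL */
}

int main(void)
{
	printf("%d %d\n", clock_index("global"), clock_index("bogus")); /* prints: 1 -1 */
	return 0;
}
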
@@ -4076,20 +4491,45 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
4076{ 4491{
4077 if (tracing_disabled) 4492 if (tracing_disabled)
4078 return -ENODEV; 4493 return -ENODEV;
4079 return single_open(file, tracing_clock_show, NULL); 4494
4495 return single_open(file, tracing_clock_show, inode->i_private);
4080} 4496}
4081 4497
4498struct ftrace_buffer_info {
4499 struct trace_iterator iter;
4500 void *spare;
4501 unsigned int read;
4502};
4503
4082#ifdef CONFIG_TRACER_SNAPSHOT 4504#ifdef CONFIG_TRACER_SNAPSHOT
4083static int tracing_snapshot_open(struct inode *inode, struct file *file) 4505static int tracing_snapshot_open(struct inode *inode, struct file *file)
4084{ 4506{
4507 struct trace_cpu *tc = inode->i_private;
4085 struct trace_iterator *iter; 4508 struct trace_iterator *iter;
4509 struct seq_file *m;
4086 int ret = 0; 4510 int ret = 0;
4087 4511
4088 if (file->f_mode & FMODE_READ) { 4512 if (file->f_mode & FMODE_READ) {
4089 iter = __tracing_open(inode, file, true); 4513 iter = __tracing_open(inode, file, true);
4090 if (IS_ERR(iter)) 4514 if (IS_ERR(iter))
4091 ret = PTR_ERR(iter); 4515 ret = PTR_ERR(iter);
4516 } else {
4517 /* Writes still need the seq_file to hold the private data */
4518 m = kzalloc(sizeof(*m), GFP_KERNEL);
4519 if (!m)
4520 return -ENOMEM;
4521 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
4522 if (!iter) {
4523 kfree(m);
4524 return -ENOMEM;
4525 }
4526 iter->tr = tc->tr;
4527 iter->trace_buffer = &tc->tr->max_buffer;
4528 iter->cpu_file = tc->cpu;
4529 m->private = iter;
4530 file->private_data = m;
4092 } 4531 }
4532
4093 return ret; 4533 return ret;
4094} 4534}
4095 4535
@@ -4097,6 +4537,9 @@ static ssize_t
4097tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, 4537tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4098 loff_t *ppos) 4538 loff_t *ppos)
4099{ 4539{
4540 struct seq_file *m = filp->private_data;
4541 struct trace_iterator *iter = m->private;
4542 struct trace_array *tr = iter->tr;
4100 unsigned long val; 4543 unsigned long val;
4101 int ret; 4544 int ret;
4102 4545
@@ -4110,42 +4553,48 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4110 4553
4111 mutex_lock(&trace_types_lock); 4554 mutex_lock(&trace_types_lock);
4112 4555
4113 if (current_trace->use_max_tr) { 4556 if (tr->current_trace->use_max_tr) {
4114 ret = -EBUSY; 4557 ret = -EBUSY;
4115 goto out; 4558 goto out;
4116 } 4559 }
4117 4560
4118 switch (val) { 4561 switch (val) {
4119 case 0: 4562 case 0:
4120 if (current_trace->allocated_snapshot) { 4563 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
4121 /* free spare buffer */ 4564 ret = -EINVAL;
4122 ring_buffer_resize(max_tr.buffer, 1, 4565 break;
4123 RING_BUFFER_ALL_CPUS);
4124 set_buffer_entries(&max_tr, 1);
4125 tracing_reset_online_cpus(&max_tr);
4126 current_trace->allocated_snapshot = false;
4127 } 4566 }
4567 if (tr->allocated_snapshot)
4568 free_snapshot(tr);
4128 break; 4569 break;
4129 case 1: 4570 case 1:
4130 if (!current_trace->allocated_snapshot) { 4571/* Only allow per-cpu swap if the ring buffer supports it */
4131 /* allocate spare buffer */ 4572#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
4132 ret = resize_buffer_duplicate_size(&max_tr, 4573 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
4133 &global_trace, RING_BUFFER_ALL_CPUS); 4574 ret = -EINVAL;
4575 break;
4576 }
4577#endif
4578 if (!tr->allocated_snapshot) {
4579 ret = alloc_snapshot(tr);
4134 if (ret < 0) 4580 if (ret < 0)
4135 break; 4581 break;
4136 current_trace->allocated_snapshot = true;
4137 } 4582 }
4138
4139 local_irq_disable(); 4583 local_irq_disable();
4140 /* Now, we're going to swap */ 4584 /* Now, we're going to swap */
4141 update_max_tr(&global_trace, current, smp_processor_id()); 4585 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
4586 update_max_tr(tr, current, smp_processor_id());
4587 else
4588 update_max_tr_single(tr, current, iter->cpu_file);
4142 local_irq_enable(); 4589 local_irq_enable();
4143 break; 4590 break;
4144 default: 4591 default:
4145 if (current_trace->allocated_snapshot) 4592 if (tr->allocated_snapshot) {
4146 tracing_reset_online_cpus(&max_tr); 4593 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
4147 else 4594 tracing_reset_online_cpus(&tr->max_buffer);
4148 ret = -EINVAL; 4595 else
4596 tracing_reset(&tr->max_buffer, iter->cpu_file);
4597 }
4149 break; 4598 break;
4150 } 4599 }
4151 4600
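
The rewritten switch above turns the snapshot file into a small state machine: writing 0 frees the spare buffer, 1 allocates it if needed and swaps it with the live buffer (per-CPU targets only where the ring buffer supports single-CPU swap), and any other value just clears the snapshot contents. A userspace model of that control flow, with booleans and counters standing in for the kernel helpers named in the comments:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool allocated;         /* tr->allocated_snapshot in the real code */
static bool per_cpu_target;    /* a single-CPU file rather than the whole buffer */
static bool per_cpu_swap_ok;   /* whether the ring buffer can swap one CPU */
static int  resets, swaps;

static int snapshot_write_model(unsigned long val)
{
	switch (val) {
	case 0:
		if (per_cpu_target)
			return -EINVAL;         /* cannot free just one CPU's spare */
		allocated = false;              /* free_snapshot() */
		break;
	case 1:
		if (per_cpu_target && !per_cpu_swap_ok)
			return -EINVAL;
		if (!allocated)
			allocated = true;       /* alloc_snapshot() */
		swaps++;                        /* update_max_tr() / update_max_tr_single() */
		break;
	default:
		if (allocated)
			resets++;               /* clear stale snapshot contents */
		break;
	}
	return 0;
}

int main(void)
{
	printf("%d %d %d swaps=%d\n", snapshot_write_model(1),
	       snapshot_write_model(5), snapshot_write_model(0), swaps);
	return 0;                               /* prints: 0 0 0 swaps=1 */
}
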
@@ -4157,6 +4606,51 @@ out:
4157 mutex_unlock(&trace_types_lock); 4606 mutex_unlock(&trace_types_lock);
4158 return ret; 4607 return ret;
4159} 4608}
4609
4610static int tracing_snapshot_release(struct inode *inode, struct file *file)
4611{
4612 struct seq_file *m = file->private_data;
4613
4614 if (file->f_mode & FMODE_READ)
4615 return tracing_release(inode, file);
4616
4617 /* If write only, the seq_file is just a stub */
4618 if (m)
4619 kfree(m->private);
4620 kfree(m);
4621
4622 return 0;
4623}
4624
4625static int tracing_buffers_open(struct inode *inode, struct file *filp);
4626static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
4627 size_t count, loff_t *ppos);
4628static int tracing_buffers_release(struct inode *inode, struct file *file);
4629static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4630 struct pipe_inode_info *pipe, size_t len, unsigned int flags);
4631
4632static int snapshot_raw_open(struct inode *inode, struct file *filp)
4633{
4634 struct ftrace_buffer_info *info;
4635 int ret;
4636
4637 ret = tracing_buffers_open(inode, filp);
4638 if (ret < 0)
4639 return ret;
4640
4641 info = filp->private_data;
4642
4643 if (info->iter.trace->use_max_tr) {
4644 tracing_buffers_release(inode, filp);
4645 return -EBUSY;
4646 }
4647
4648 info->iter.snapshot = true;
4649 info->iter.trace_buffer = &info->iter.tr->max_buffer;
4650
4651 return ret;
4652}
4653
4160#endif /* CONFIG_TRACER_SNAPSHOT */ 4654#endif /* CONFIG_TRACER_SNAPSHOT */
4161 4655
4162 4656
@@ -4184,10 +4678,9 @@ static const struct file_operations tracing_pipe_fops = {
4184}; 4678};
4185 4679
4186static const struct file_operations tracing_entries_fops = { 4680static const struct file_operations tracing_entries_fops = {
4187 .open = tracing_entries_open, 4681 .open = tracing_open_generic,
4188 .read = tracing_entries_read, 4682 .read = tracing_entries_read,
4189 .write = tracing_entries_write, 4683 .write = tracing_entries_write,
4190 .release = tracing_entries_release,
4191 .llseek = generic_file_llseek, 4684 .llseek = generic_file_llseek,
4192}; 4685};
4193 4686
@@ -4222,20 +4715,23 @@ static const struct file_operations snapshot_fops = {
4222 .read = seq_read, 4715 .read = seq_read,
4223 .write = tracing_snapshot_write, 4716 .write = tracing_snapshot_write,
4224 .llseek = tracing_seek, 4717 .llseek = tracing_seek,
4225 .release = tracing_release, 4718 .release = tracing_snapshot_release,
4226}; 4719};
4227#endif /* CONFIG_TRACER_SNAPSHOT */
4228 4720
4229struct ftrace_buffer_info { 4721static const struct file_operations snapshot_raw_fops = {
4230 struct trace_array *tr; 4722 .open = snapshot_raw_open,
4231 void *spare; 4723 .read = tracing_buffers_read,
4232 int cpu; 4724 .release = tracing_buffers_release,
4233 unsigned int read; 4725 .splice_read = tracing_buffers_splice_read,
4726 .llseek = no_llseek,
4234}; 4727};
4235 4728
4729#endif /* CONFIG_TRACER_SNAPSHOT */
4730
4236static int tracing_buffers_open(struct inode *inode, struct file *filp) 4731static int tracing_buffers_open(struct inode *inode, struct file *filp)
4237{ 4732{
4238 int cpu = (int)(long)inode->i_private; 4733 struct trace_cpu *tc = inode->i_private;
4734 struct trace_array *tr = tc->tr;
4239 struct ftrace_buffer_info *info; 4735 struct ftrace_buffer_info *info;
4240 4736
4241 if (tracing_disabled) 4737 if (tracing_disabled)
@@ -4245,72 +4741,131 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
4245 if (!info) 4741 if (!info)
4246 return -ENOMEM; 4742 return -ENOMEM;
4247 4743
4248 info->tr = &global_trace; 4744 mutex_lock(&trace_types_lock);
4249 info->cpu = cpu; 4745
4250 info->spare = NULL; 4746 tr->ref++;
4747
4748 info->iter.tr = tr;
4749 info->iter.cpu_file = tc->cpu;
4750 info->iter.trace = tr->current_trace;
4751 info->iter.trace_buffer = &tr->trace_buffer;
4752 info->spare = NULL;
4251 /* Force reading ring buffer for first read */ 4753 /* Force reading ring buffer for first read */
4252 info->read = (unsigned int)-1; 4754 info->read = (unsigned int)-1;
4253 4755
4254 filp->private_data = info; 4756 filp->private_data = info;
4255 4757
4758 mutex_unlock(&trace_types_lock);
4759
4256 return nonseekable_open(inode, filp); 4760 return nonseekable_open(inode, filp);
4257} 4761}
4258 4762
4763static unsigned int
4764tracing_buffers_poll(struct file *filp, poll_table *poll_table)
4765{
4766 struct ftrace_buffer_info *info = filp->private_data;
4767 struct trace_iterator *iter = &info->iter;
4768
4769 return trace_poll(iter, filp, poll_table);
4770}
4771
4259static ssize_t 4772static ssize_t
4260tracing_buffers_read(struct file *filp, char __user *ubuf, 4773tracing_buffers_read(struct file *filp, char __user *ubuf,
4261 size_t count, loff_t *ppos) 4774 size_t count, loff_t *ppos)
4262{ 4775{
4263 struct ftrace_buffer_info *info = filp->private_data; 4776 struct ftrace_buffer_info *info = filp->private_data;
4777 struct trace_iterator *iter = &info->iter;
4264 ssize_t ret; 4778 ssize_t ret;
4265 size_t size; 4779 ssize_t size;
4266 4780
4267 if (!count) 4781 if (!count)
4268 return 0; 4782 return 0;
4269 4783
4784 mutex_lock(&trace_types_lock);
4785
4786#ifdef CONFIG_TRACER_MAX_TRACE
4787 if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
4788 size = -EBUSY;
4789 goto out_unlock;
4790 }
4791#endif
4792
4270 if (!info->spare) 4793 if (!info->spare)
4271 info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); 4794 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
4795 iter->cpu_file);
4796 size = -ENOMEM;
4272 if (!info->spare) 4797 if (!info->spare)
4273 return -ENOMEM; 4798 goto out_unlock;
4274 4799
4275 /* Do we have previous read data to read? */ 4800 /* Do we have previous read data to read? */
4276 if (info->read < PAGE_SIZE) 4801 if (info->read < PAGE_SIZE)
4277 goto read; 4802 goto read;
4278 4803
4279 trace_access_lock(info->cpu); 4804 again:
4280 ret = ring_buffer_read_page(info->tr->buffer, 4805 trace_access_lock(iter->cpu_file);
4806 ret = ring_buffer_read_page(iter->trace_buffer->buffer,
4281 &info->spare, 4807 &info->spare,
4282 count, 4808 count,
4283 info->cpu, 0); 4809 iter->cpu_file, 0);
4284 trace_access_unlock(info->cpu); 4810 trace_access_unlock(iter->cpu_file);
4285 if (ret < 0)
4286 return 0;
4287 4811
4288 info->read = 0; 4812 if (ret < 0) {
4813 if (trace_empty(iter)) {
4814 if ((filp->f_flags & O_NONBLOCK)) {
4815 size = -EAGAIN;
4816 goto out_unlock;
4817 }
4818 mutex_unlock(&trace_types_lock);
4819 iter->trace->wait_pipe(iter);
4820 mutex_lock(&trace_types_lock);
4821 if (signal_pending(current)) {
4822 size = -EINTR;
4823 goto out_unlock;
4824 }
4825 goto again;
4826 }
4827 size = 0;
4828 goto out_unlock;
4829 }
4289 4830
4290read: 4831 info->read = 0;
4832 read:
4291 size = PAGE_SIZE - info->read; 4833 size = PAGE_SIZE - info->read;
4292 if (size > count) 4834 if (size > count)
4293 size = count; 4835 size = count;
4294 4836
4295 ret = copy_to_user(ubuf, info->spare + info->read, size); 4837 ret = copy_to_user(ubuf, info->spare + info->read, size);
4296 if (ret == size) 4838 if (ret == size) {
4297 return -EFAULT; 4839 size = -EFAULT;
4840 goto out_unlock;
4841 }
4298 size -= ret; 4842 size -= ret;
4299 4843
4300 *ppos += size; 4844 *ppos += size;
4301 info->read += size; 4845 info->read += size;
4302 4846
4847 out_unlock:
4848 mutex_unlock(&trace_types_lock);
4849
4303 return size; 4850 return size;
4304} 4851}
4305 4852
4306static int tracing_buffers_release(struct inode *inode, struct file *file) 4853static int tracing_buffers_release(struct inode *inode, struct file *file)
4307{ 4854{
4308 struct ftrace_buffer_info *info = file->private_data; 4855 struct ftrace_buffer_info *info = file->private_data;
4856 struct trace_iterator *iter = &info->iter;
4857
4858 mutex_lock(&trace_types_lock);
4859
4860 WARN_ON(!iter->tr->ref);
4861 iter->tr->ref--;
4309 4862
4310 if (info->spare) 4863 if (info->spare)
4311 ring_buffer_free_read_page(info->tr->buffer, info->spare); 4864 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
4312 kfree(info); 4865 kfree(info);
4313 4866
4867 mutex_unlock(&trace_types_lock);
4868
4314 return 0; 4869 return 0;
4315} 4870}
4316 4871
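
tracing_buffers_read above gains a blocking retry loop: if the page read comes back empty, it drops trace_types_lock, waits via iter->trace->wait_pipe(), re-takes the lock, bails out on a pending signal, and otherwise goes around again; O_NONBLOCK callers get -EAGAIN instead. A pthread sketch of the same shape, where a condition variable plays the role of wait_pipe() and a flag stands in for signal_pending():

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t types_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  data_ready = PTHREAD_COND_INITIALIZER;
static int buffered;                  /* pages available to read */
static bool interrupted;              /* stands in for signal_pending() */

static int read_one(bool nonblock)
{
	int ret;

	pthread_mutex_lock(&types_lock);
again:
	if (buffered > 0) {
		buffered--;               /* consume one page worth of data */
		ret = 1;
	} else if (nonblock) {
		ret = -EAGAIN;
	} else {
		/* lock is released while sleeping, exactly like dropping trace_types_lock */
		pthread_cond_wait(&data_ready, &types_lock);
		if (interrupted)
			ret = -EINTR;
		else
			goto again;
	}
	pthread_mutex_unlock(&types_lock);
	return ret;
}

A writer in this model would bump buffered and pthread_cond_signal(&data_ready), mirroring the ring buffer waking trace_pipe_raw readers.
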
@@ -4375,6 +4930,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4375 unsigned int flags) 4930 unsigned int flags)
4376{ 4931{
4377 struct ftrace_buffer_info *info = file->private_data; 4932 struct ftrace_buffer_info *info = file->private_data;
4933 struct trace_iterator *iter = &info->iter;
4378 struct partial_page partial_def[PIPE_DEF_BUFFERS]; 4934 struct partial_page partial_def[PIPE_DEF_BUFFERS];
4379 struct page *pages_def[PIPE_DEF_BUFFERS]; 4935 struct page *pages_def[PIPE_DEF_BUFFERS];
4380 struct splice_pipe_desc spd = { 4936 struct splice_pipe_desc spd = {
@@ -4387,10 +4943,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4387 }; 4943 };
4388 struct buffer_ref *ref; 4944 struct buffer_ref *ref;
4389 int entries, size, i; 4945 int entries, size, i;
4390 size_t ret; 4946 ssize_t ret;
4391 4947
4392 if (splice_grow_spd(pipe, &spd)) 4948 mutex_lock(&trace_types_lock);
4393 return -ENOMEM; 4949
4950#ifdef CONFIG_TRACER_MAX_TRACE
4951 if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
4952 ret = -EBUSY;
4953 goto out;
4954 }
4955#endif
4956
4957 if (splice_grow_spd(pipe, &spd)) {
4958 ret = -ENOMEM;
4959 goto out;
4960 }
4394 4961
4395 if (*ppos & (PAGE_SIZE - 1)) { 4962 if (*ppos & (PAGE_SIZE - 1)) {
4396 ret = -EINVAL; 4963 ret = -EINVAL;
@@ -4405,8 +4972,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4405 len &= PAGE_MASK; 4972 len &= PAGE_MASK;
4406 } 4973 }
4407 4974
4408 trace_access_lock(info->cpu); 4975 again:
4409 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 4976 trace_access_lock(iter->cpu_file);
4977 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
4410 4978
4411 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { 4979 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
4412 struct page *page; 4980 struct page *page;
@@ -4417,15 +4985,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4417 break; 4985 break;
4418 4986
4419 ref->ref = 1; 4987 ref->ref = 1;
4420 ref->buffer = info->tr->buffer; 4988 ref->buffer = iter->trace_buffer->buffer;
4421 ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); 4989 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
4422 if (!ref->page) { 4990 if (!ref->page) {
4423 kfree(ref); 4991 kfree(ref);
4424 break; 4992 break;
4425 } 4993 }
4426 4994
4427 r = ring_buffer_read_page(ref->buffer, &ref->page, 4995 r = ring_buffer_read_page(ref->buffer, &ref->page,
4428 len, info->cpu, 1); 4996 len, iter->cpu_file, 1);
4429 if (r < 0) { 4997 if (r < 0) {
4430 ring_buffer_free_read_page(ref->buffer, ref->page); 4998 ring_buffer_free_read_page(ref->buffer, ref->page);
4431 kfree(ref); 4999 kfree(ref);
@@ -4449,31 +5017,40 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4449 spd.nr_pages++; 5017 spd.nr_pages++;
4450 *ppos += PAGE_SIZE; 5018 *ppos += PAGE_SIZE;
4451 5019
4452 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 5020 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
4453 } 5021 }
4454 5022
4455 trace_access_unlock(info->cpu); 5023 trace_access_unlock(iter->cpu_file);
4456 spd.nr_pages = i; 5024 spd.nr_pages = i;
4457 5025
4458 /* did we read anything? */ 5026 /* did we read anything? */
4459 if (!spd.nr_pages) { 5027 if (!spd.nr_pages) {
4460 if (flags & SPLICE_F_NONBLOCK) 5028 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
4461 ret = -EAGAIN; 5029 ret = -EAGAIN;
4462 else 5030 goto out;
4463 ret = 0; 5031 }
4464 /* TODO: block */ 5032 mutex_unlock(&trace_types_lock);
4465 goto out; 5033 iter->trace->wait_pipe(iter);
5034 mutex_lock(&trace_types_lock);
5035 if (signal_pending(current)) {
5036 ret = -EINTR;
5037 goto out;
5038 }
5039 goto again;
4466 } 5040 }
4467 5041
4468 ret = splice_to_pipe(pipe, &spd); 5042 ret = splice_to_pipe(pipe, &spd);
4469 splice_shrink_spd(&spd); 5043 splice_shrink_spd(&spd);
4470out: 5044out:
5045 mutex_unlock(&trace_types_lock);
5046
4471 return ret; 5047 return ret;
4472} 5048}
4473 5049
4474static const struct file_operations tracing_buffers_fops = { 5050static const struct file_operations tracing_buffers_fops = {
4475 .open = tracing_buffers_open, 5051 .open = tracing_buffers_open,
4476 .read = tracing_buffers_read, 5052 .read = tracing_buffers_read,
5053 .poll = tracing_buffers_poll,
4477 .release = tracing_buffers_release, 5054 .release = tracing_buffers_release,
4478 .splice_read = tracing_buffers_splice_read, 5055 .splice_read = tracing_buffers_splice_read,
4479 .llseek = no_llseek, 5056 .llseek = no_llseek,
@@ -4483,12 +5060,14 @@ static ssize_t
4483tracing_stats_read(struct file *filp, char __user *ubuf, 5060tracing_stats_read(struct file *filp, char __user *ubuf,
4484 size_t count, loff_t *ppos) 5061 size_t count, loff_t *ppos)
4485{ 5062{
4486 unsigned long cpu = (unsigned long)filp->private_data; 5063 struct trace_cpu *tc = filp->private_data;
4487 struct trace_array *tr = &global_trace; 5064 struct trace_array *tr = tc->tr;
5065 struct trace_buffer *trace_buf = &tr->trace_buffer;
4488 struct trace_seq *s; 5066 struct trace_seq *s;
4489 unsigned long cnt; 5067 unsigned long cnt;
4490 unsigned long long t; 5068 unsigned long long t;
4491 unsigned long usec_rem; 5069 unsigned long usec_rem;
5070 int cpu = tc->cpu;
4492 5071
4493 s = kmalloc(sizeof(*s), GFP_KERNEL); 5072 s = kmalloc(sizeof(*s), GFP_KERNEL);
4494 if (!s) 5073 if (!s)
@@ -4496,41 +5075,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4496 5075
4497 trace_seq_init(s); 5076 trace_seq_init(s);
4498 5077
4499 cnt = ring_buffer_entries_cpu(tr->buffer, cpu); 5078 cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu);
4500 trace_seq_printf(s, "entries: %ld\n", cnt); 5079 trace_seq_printf(s, "entries: %ld\n", cnt);
4501 5080
4502 cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); 5081 cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu);
4503 trace_seq_printf(s, "overrun: %ld\n", cnt); 5082 trace_seq_printf(s, "overrun: %ld\n", cnt);
4504 5083
4505 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 5084 cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu);
4506 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 5085 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
4507 5086
4508 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); 5087 cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
4509 trace_seq_printf(s, "bytes: %ld\n", cnt); 5088 trace_seq_printf(s, "bytes: %ld\n", cnt);
4510 5089
4511 if (trace_clocks[trace_clock_id].in_ns) { 5090 if (trace_clocks[trace_clock_id].in_ns) {
4512 /* local or global for trace_clock */ 5091 /* local or global for trace_clock */
4513 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); 5092 t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
4514 usec_rem = do_div(t, USEC_PER_SEC); 5093 usec_rem = do_div(t, USEC_PER_SEC);
4515 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", 5094 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
4516 t, usec_rem); 5095 t, usec_rem);
4517 5096
4518 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); 5097 t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
4519 usec_rem = do_div(t, USEC_PER_SEC); 5098 usec_rem = do_div(t, USEC_PER_SEC);
4520 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); 5099 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4521 } else { 5100 } else {
4522 /* counter or tsc mode for trace_clock */ 5101 /* counter or tsc mode for trace_clock */
4523 trace_seq_printf(s, "oldest event ts: %llu\n", 5102 trace_seq_printf(s, "oldest event ts: %llu\n",
4524 ring_buffer_oldest_event_ts(tr->buffer, cpu)); 5103 ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
4525 5104
4526 trace_seq_printf(s, "now ts: %llu\n", 5105 trace_seq_printf(s, "now ts: %llu\n",
4527 ring_buffer_time_stamp(tr->buffer, cpu)); 5106 ring_buffer_time_stamp(trace_buf->buffer, cpu));
4528 } 5107 }
4529 5108
4530 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); 5109 cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
4531 trace_seq_printf(s, "dropped events: %ld\n", cnt); 5110 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4532 5111
4533 cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); 5112 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
4534 trace_seq_printf(s, "read events: %ld\n", cnt); 5113 trace_seq_printf(s, "read events: %ld\n", cnt);
4535 5114
4536 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 5115 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
@@ -4582,60 +5161,161 @@ static const struct file_operations tracing_dyn_info_fops = {
4582 .read = tracing_read_dyn_info, 5161 .read = tracing_read_dyn_info,
4583 .llseek = generic_file_llseek, 5162 .llseek = generic_file_llseek,
4584}; 5163};
4585#endif 5164#endif /* CONFIG_DYNAMIC_FTRACE */
4586 5165
4587static struct dentry *d_tracer; 5166#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
5167static void
5168ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
5169{
5170 tracing_snapshot();
5171}
4588 5172
4589struct dentry *tracing_init_dentry(void) 5173static void
5174ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
5175{
5176 unsigned long *count = (long *)data;
5177
5178 if (!*count)
5179 return;
5180
5181 if (*count != -1)
5182 (*count)--;
5183
5184 tracing_snapshot();
5185}
5186
5187static int
5188ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
5189 struct ftrace_probe_ops *ops, void *data)
5190{
5191 long count = (long)data;
5192
5193 seq_printf(m, "%ps:", (void *)ip);
5194
5195 seq_printf(m, "snapshot");
5196
5197 if (count == -1)
5198 seq_printf(m, ":unlimited\n");
5199 else
5200 seq_printf(m, ":count=%ld\n", count);
5201
5202 return 0;
5203}
5204
5205static struct ftrace_probe_ops snapshot_probe_ops = {
5206 .func = ftrace_snapshot,
5207 .print = ftrace_snapshot_print,
5208};
5209
5210static struct ftrace_probe_ops snapshot_count_probe_ops = {
5211 .func = ftrace_count_snapshot,
5212 .print = ftrace_snapshot_print,
5213};
5214
5215static int
5216ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
5217 char *glob, char *cmd, char *param, int enable)
5218{
5219 struct ftrace_probe_ops *ops;
5220 void *count = (void *)-1;
5221 char *number;
5222 int ret;
5223
5224 /* hash funcs only work with set_ftrace_filter */
5225 if (!enable)
5226 return -EINVAL;
5227
5228 ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
5229
5230 if (glob[0] == '!') {
5231 unregister_ftrace_function_probe_func(glob+1, ops);
5232 return 0;
5233 }
5234
5235 if (!param)
5236 goto out_reg;
5237
5238 number = strsep(&param, ":");
5239
5240 if (!strlen(number))
5241 goto out_reg;
5242
5243 /*
5244 * We use the callback data field (which is a pointer)
5245 * as our counter.
5246 */
5247 ret = kstrtoul(number, 0, (unsigned long *)&count);
5248 if (ret)
5249 return ret;
5250
5251 out_reg:
5252 ret = register_ftrace_function_probe(glob, ops, count);
5253
5254 if (ret >= 0)
5255 alloc_snapshot(&global_trace);
5256
5257 return ret < 0 ? ret : 0;
5258}
5259
5260static struct ftrace_func_command ftrace_snapshot_cmd = {
5261 .name = "snapshot",
5262 .func = ftrace_trace_snapshot_callback,
5263};
5264
5265static int register_snapshot_cmd(void)
4590{ 5266{
4591 static int once; 5267 return register_ftrace_command(&ftrace_snapshot_cmd);
5268}
5269#else
5270static inline int register_snapshot_cmd(void) { return 0; }
5271#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
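
The block above registers a "snapshot" command for set_ftrace_filter, so that "func:snapshot" or "func:snapshot:N" arms a probe whose data pointer doubles as a countdown (missing count means unlimited, encoded as -1). A small standalone model of the parse-and-count-down behaviour, with invented names and printf in place of tracing_snapshot():

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long parse_count(char *param)        /* param may be NULL or "" */
{
	if (!param || !*param)
		return -1;                      /* unlimited */
	return strtol(param, NULL, 0);
}

static void probe_hit(long *count)
{
	if (*count == 0)
		return;                         /* budget exhausted: no more snapshots */
	if (*count != -1)
		(*count)--;
	printf("snapshot taken, remaining %ld\n", *count);
}

int main(void)
{
	char cmd[] = "snapshot:2";
	char *p = cmd;
	char *name = strsep(&p, ":");           /* "snapshot", p now points at "2" */
	long count = parse_count(p);
	int i;

	for (i = 0; i < 4; i++)
		probe_hit(&count);              /* only the first two hits fire */
	printf("%s done, count=%ld\n", name, count);
	return 0;
}

Storing the remaining count directly in the probe's data pointer, as the real callback does, avoids a separate allocation per registered probe.
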
4592 5272
4593 if (d_tracer) 5273struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
4594 return d_tracer; 5274{
5275 if (tr->dir)
5276 return tr->dir;
4595 5277
4596 if (!debugfs_initialized()) 5278 if (!debugfs_initialized())
4597 return NULL; 5279 return NULL;
4598 5280
4599 d_tracer = debugfs_create_dir("tracing", NULL); 5281 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
5282 tr->dir = debugfs_create_dir("tracing", NULL);
4600 5283
4601 if (!d_tracer && !once) { 5284 if (!tr->dir)
4602 once = 1; 5285 pr_warn_once("Could not create debugfs directory 'tracing'\n");
4603 pr_warning("Could not create debugfs directory 'tracing'\n");
4604 return NULL;
4605 }
4606 5286
4607 return d_tracer; 5287 return tr->dir;
4608} 5288}
4609 5289
4610static struct dentry *d_percpu; 5290struct dentry *tracing_init_dentry(void)
5291{
5292 return tracing_init_dentry_tr(&global_trace);
5293}
4611 5294
4612static struct dentry *tracing_dentry_percpu(void) 5295static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
4613{ 5296{
4614 static int once;
4615 struct dentry *d_tracer; 5297 struct dentry *d_tracer;
4616 5298
4617 if (d_percpu) 5299 if (tr->percpu_dir)
4618 return d_percpu; 5300 return tr->percpu_dir;
4619
4620 d_tracer = tracing_init_dentry();
4621 5301
5302 d_tracer = tracing_init_dentry_tr(tr);
4622 if (!d_tracer) 5303 if (!d_tracer)
4623 return NULL; 5304 return NULL;
4624 5305
4625 d_percpu = debugfs_create_dir("per_cpu", d_tracer); 5306 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
4626 5307
4627 if (!d_percpu && !once) { 5308 WARN_ONCE(!tr->percpu_dir,
4628 once = 1; 5309 "Could not create debugfs directory 'per_cpu/%d'\n", cpu);
4629 pr_warning("Could not create debugfs directory 'per_cpu'\n");
4630 return NULL;
4631 }
4632 5310
4633 return d_percpu; 5311 return tr->percpu_dir;
4634} 5312}
4635 5313
4636static void tracing_init_debugfs_percpu(long cpu) 5314static void
5315tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
4637{ 5316{
4638 struct dentry *d_percpu = tracing_dentry_percpu(); 5317 struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
5318 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
4639 struct dentry *d_cpu; 5319 struct dentry *d_cpu;
4640 char cpu_dir[30]; /* 30 characters should be more than enough */ 5320 char cpu_dir[30]; /* 30 characters should be more than enough */
4641 5321
@@ -4651,20 +5331,28 @@ static void tracing_init_debugfs_percpu(long cpu)
4651 5331
4652 /* per cpu trace_pipe */ 5332 /* per cpu trace_pipe */
4653 trace_create_file("trace_pipe", 0444, d_cpu, 5333 trace_create_file("trace_pipe", 0444, d_cpu,
4654 (void *) cpu, &tracing_pipe_fops); 5334 (void *)&data->trace_cpu, &tracing_pipe_fops);
4655 5335
4656 /* per cpu trace */ 5336 /* per cpu trace */
4657 trace_create_file("trace", 0644, d_cpu, 5337 trace_create_file("trace", 0644, d_cpu,
4658 (void *) cpu, &tracing_fops); 5338 (void *)&data->trace_cpu, &tracing_fops);
4659 5339
4660 trace_create_file("trace_pipe_raw", 0444, d_cpu, 5340 trace_create_file("trace_pipe_raw", 0444, d_cpu,
4661 (void *) cpu, &tracing_buffers_fops); 5341 (void *)&data->trace_cpu, &tracing_buffers_fops);
4662 5342
4663 trace_create_file("stats", 0444, d_cpu, 5343 trace_create_file("stats", 0444, d_cpu,
4664 (void *) cpu, &tracing_stats_fops); 5344 (void *)&data->trace_cpu, &tracing_stats_fops);
4665 5345
4666 trace_create_file("buffer_size_kb", 0444, d_cpu, 5346 trace_create_file("buffer_size_kb", 0444, d_cpu,
4667 (void *) cpu, &tracing_entries_fops); 5347 (void *)&data->trace_cpu, &tracing_entries_fops);
5348
5349#ifdef CONFIG_TRACER_SNAPSHOT
5350 trace_create_file("snapshot", 0644, d_cpu,
5351 (void *)&data->trace_cpu, &snapshot_fops);
5352
5353 trace_create_file("snapshot_raw", 0444, d_cpu,
5354 (void *)&data->trace_cpu, &snapshot_raw_fops);
5355#endif
4668} 5356}
4669 5357
4670#ifdef CONFIG_FTRACE_SELFTEST 5358#ifdef CONFIG_FTRACE_SELFTEST
@@ -4675,6 +5363,7 @@ static void tracing_init_debugfs_percpu(long cpu)
4675struct trace_option_dentry { 5363struct trace_option_dentry {
4676 struct tracer_opt *opt; 5364 struct tracer_opt *opt;
4677 struct tracer_flags *flags; 5365 struct tracer_flags *flags;
5366 struct trace_array *tr;
4678 struct dentry *entry; 5367 struct dentry *entry;
4679}; 5368};
4680 5369
@@ -4710,7 +5399,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
4710 5399
4711 if (!!(topt->flags->val & topt->opt->bit) != val) { 5400 if (!!(topt->flags->val & topt->opt->bit) != val) {
4712 mutex_lock(&trace_types_lock); 5401 mutex_lock(&trace_types_lock);
4713 ret = __set_tracer_option(current_trace, topt->flags, 5402 ret = __set_tracer_option(topt->tr->current_trace, topt->flags,
4714 topt->opt, !val); 5403 topt->opt, !val);
4715 mutex_unlock(&trace_types_lock); 5404 mutex_unlock(&trace_types_lock);
4716 if (ret) 5405 if (ret)
@@ -4749,6 +5438,7 @@ static ssize_t
4749trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, 5438trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4750 loff_t *ppos) 5439 loff_t *ppos)
4751{ 5440{
5441 struct trace_array *tr = &global_trace;
4752 long index = (long)filp->private_data; 5442 long index = (long)filp->private_data;
4753 unsigned long val; 5443 unsigned long val;
4754 int ret; 5444 int ret;
@@ -4759,7 +5449,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4759 5449
4760 if (val != 0 && val != 1) 5450 if (val != 0 && val != 1)
4761 return -EINVAL; 5451 return -EINVAL;
4762 set_tracer_flags(1 << index, val); 5452
5453 mutex_lock(&trace_types_lock);
5454 ret = set_tracer_flag(tr, 1 << index, val);
5455 mutex_unlock(&trace_types_lock);
5456
5457 if (ret < 0)
5458 return ret;
4763 5459
4764 *ppos += cnt; 5460 *ppos += cnt;
4765 5461
@@ -4789,40 +5485,41 @@ struct dentry *trace_create_file(const char *name,
4789} 5485}
4790 5486
4791 5487
4792static struct dentry *trace_options_init_dentry(void) 5488static struct dentry *trace_options_init_dentry(struct trace_array *tr)
4793{ 5489{
4794 struct dentry *d_tracer; 5490 struct dentry *d_tracer;
4795 static struct dentry *t_options;
4796 5491
4797 if (t_options) 5492 if (tr->options)
4798 return t_options; 5493 return tr->options;
4799 5494
4800 d_tracer = tracing_init_dentry(); 5495 d_tracer = tracing_init_dentry_tr(tr);
4801 if (!d_tracer) 5496 if (!d_tracer)
4802 return NULL; 5497 return NULL;
4803 5498
4804 t_options = debugfs_create_dir("options", d_tracer); 5499 tr->options = debugfs_create_dir("options", d_tracer);
4805 if (!t_options) { 5500 if (!tr->options) {
4806 pr_warning("Could not create debugfs directory 'options'\n"); 5501 pr_warning("Could not create debugfs directory 'options'\n");
4807 return NULL; 5502 return NULL;
4808 } 5503 }
4809 5504
4810 return t_options; 5505 return tr->options;
4811} 5506}
4812 5507
4813static void 5508static void
4814create_trace_option_file(struct trace_option_dentry *topt, 5509create_trace_option_file(struct trace_array *tr,
5510 struct trace_option_dentry *topt,
4815 struct tracer_flags *flags, 5511 struct tracer_flags *flags,
4816 struct tracer_opt *opt) 5512 struct tracer_opt *opt)
4817{ 5513{
4818 struct dentry *t_options; 5514 struct dentry *t_options;
4819 5515
4820 t_options = trace_options_init_dentry(); 5516 t_options = trace_options_init_dentry(tr);
4821 if (!t_options) 5517 if (!t_options)
4822 return; 5518 return;
4823 5519
4824 topt->flags = flags; 5520 topt->flags = flags;
4825 topt->opt = opt; 5521 topt->opt = opt;
5522 topt->tr = tr;
4826 5523
4827 topt->entry = trace_create_file(opt->name, 0644, t_options, topt, 5524 topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
4828 &trace_options_fops); 5525 &trace_options_fops);
@@ -4830,7 +5527,7 @@ create_trace_option_file(struct trace_option_dentry *topt,
4830} 5527}
4831 5528
4832static struct trace_option_dentry * 5529static struct trace_option_dentry *
4833create_trace_option_files(struct tracer *tracer) 5530create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
4834{ 5531{
4835 struct trace_option_dentry *topts; 5532 struct trace_option_dentry *topts;
4836 struct tracer_flags *flags; 5533 struct tracer_flags *flags;
@@ -4855,7 +5552,7 @@ create_trace_option_files(struct tracer *tracer)
4855 return NULL; 5552 return NULL;
4856 5553
4857 for (cnt = 0; opts[cnt].name; cnt++) 5554 for (cnt = 0; opts[cnt].name; cnt++)
4858 create_trace_option_file(&topts[cnt], flags, 5555 create_trace_option_file(tr, &topts[cnt], flags,
4859 &opts[cnt]); 5556 &opts[cnt]);
4860 5557
4861 return topts; 5558 return topts;
@@ -4878,11 +5575,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
4878} 5575}
4879 5576
4880static struct dentry * 5577static struct dentry *
4881create_trace_option_core_file(const char *option, long index) 5578create_trace_option_core_file(struct trace_array *tr,
5579 const char *option, long index)
4882{ 5580{
4883 struct dentry *t_options; 5581 struct dentry *t_options;
4884 5582
4885 t_options = trace_options_init_dentry(); 5583 t_options = trace_options_init_dentry(tr);
4886 if (!t_options) 5584 if (!t_options)
4887 return NULL; 5585 return NULL;
4888 5586
@@ -4890,17 +5588,17 @@ create_trace_option_core_file(const char *option, long index)
4890 &trace_options_core_fops); 5588 &trace_options_core_fops);
4891} 5589}
4892 5590
4893static __init void create_trace_options_dir(void) 5591static __init void create_trace_options_dir(struct trace_array *tr)
4894{ 5592{
4895 struct dentry *t_options; 5593 struct dentry *t_options;
4896 int i; 5594 int i;
4897 5595
4898 t_options = trace_options_init_dentry(); 5596 t_options = trace_options_init_dentry(tr);
4899 if (!t_options) 5597 if (!t_options)
4900 return; 5598 return;
4901 5599
4902 for (i = 0; trace_options[i]; i++) 5600 for (i = 0; trace_options[i]; i++)
4903 create_trace_option_core_file(trace_options[i], i); 5601 create_trace_option_core_file(tr, trace_options[i], i);
4904} 5602}
4905 5603
4906static ssize_t 5604static ssize_t
@@ -4908,7 +5606,7 @@ rb_simple_read(struct file *filp, char __user *ubuf,
4908 size_t cnt, loff_t *ppos) 5606 size_t cnt, loff_t *ppos)
4909{ 5607{
4910 struct trace_array *tr = filp->private_data; 5608 struct trace_array *tr = filp->private_data;
4911 struct ring_buffer *buffer = tr->buffer; 5609 struct ring_buffer *buffer = tr->trace_buffer.buffer;
4912 char buf[64]; 5610 char buf[64];
4913 int r; 5611 int r;
4914 5612
@@ -4927,7 +5625,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
4927 size_t cnt, loff_t *ppos) 5625 size_t cnt, loff_t *ppos)
4928{ 5626{
4929 struct trace_array *tr = filp->private_data; 5627 struct trace_array *tr = filp->private_data;
4930 struct ring_buffer *buffer = tr->buffer; 5628 struct ring_buffer *buffer = tr->trace_buffer.buffer;
4931 unsigned long val; 5629 unsigned long val;
4932 int ret; 5630 int ret;
4933 5631
@@ -4939,12 +5637,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
4939 mutex_lock(&trace_types_lock); 5637 mutex_lock(&trace_types_lock);
4940 if (val) { 5638 if (val) {
4941 ring_buffer_record_on(buffer); 5639 ring_buffer_record_on(buffer);
4942 if (current_trace->start) 5640 if (tr->current_trace->start)
4943 current_trace->start(tr); 5641 tr->current_trace->start(tr);
4944 } else { 5642 } else {
4945 ring_buffer_record_off(buffer); 5643 ring_buffer_record_off(buffer);
4946 if (current_trace->stop) 5644 if (tr->current_trace->stop)
4947 current_trace->stop(tr); 5645 tr->current_trace->stop(tr);
4948 } 5646 }
4949 mutex_unlock(&trace_types_lock); 5647 mutex_unlock(&trace_types_lock);
4950 } 5648 }
@@ -4961,23 +5659,310 @@ static const struct file_operations rb_simple_fops = {
4961 .llseek = default_llseek, 5659 .llseek = default_llseek,
4962}; 5660};
4963 5661
5662struct dentry *trace_instance_dir;
5663
5664static void
5665init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
5666
5667static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf)
5668{
5669 int cpu;
5670
5671 for_each_tracing_cpu(cpu) {
5672 memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu));
5673 per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu;
5674 per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr;
5675 }
5676}
5677
5678static int
5679allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
5680{
5681 enum ring_buffer_flags rb_flags;
5682
5683 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
5684
5685 buf->buffer = ring_buffer_alloc(size, rb_flags);
5686 if (!buf->buffer)
5687 return -ENOMEM;
5688
5689 buf->data = alloc_percpu(struct trace_array_cpu);
5690 if (!buf->data) {
5691 ring_buffer_free(buf->buffer);
5692 return -ENOMEM;
5693 }
5694
5695 init_trace_buffers(tr, buf);
5696
5697 /* Allocate the first page for all buffers */
5698 set_buffer_entries(&tr->trace_buffer,
5699 ring_buffer_size(tr->trace_buffer.buffer, 0));
5700
5701 return 0;
5702}
5703
5704static int allocate_trace_buffers(struct trace_array *tr, int size)
5705{
5706 int ret;
5707
5708 ret = allocate_trace_buffer(tr, &tr->trace_buffer, size);
5709 if (ret)
5710 return ret;
5711
5712#ifdef CONFIG_TRACER_MAX_TRACE
5713 ret = allocate_trace_buffer(tr, &tr->max_buffer,
5714 allocate_snapshot ? size : 1);
5715 if (WARN_ON(ret)) {
5716 ring_buffer_free(tr->trace_buffer.buffer);
5717 free_percpu(tr->trace_buffer.data);
5718 return -ENOMEM;
5719 }
5720 tr->allocated_snapshot = allocate_snapshot;
5721
5722 /*
5723 * Only the top level trace array gets its snapshot allocated
5724 * from the kernel command line.
5725 */
5726 allocate_snapshot = false;
5727#endif
5728 return 0;
5729}
5730
5731static int new_instance_create(const char *name)
5732{
5733 struct trace_array *tr;
5734 int ret;
5735
5736 mutex_lock(&trace_types_lock);
5737
5738 ret = -EEXIST;
5739 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
5740 if (tr->name && strcmp(tr->name, name) == 0)
5741 goto out_unlock;
5742 }
5743
5744 ret = -ENOMEM;
5745 tr = kzalloc(sizeof(*tr), GFP_KERNEL);
5746 if (!tr)
5747 goto out_unlock;
5748
5749 tr->name = kstrdup(name, GFP_KERNEL);
5750 if (!tr->name)
5751 goto out_free_tr;
5752
5753 raw_spin_lock_init(&tr->start_lock);
5754
5755 tr->current_trace = &nop_trace;
5756
5757 INIT_LIST_HEAD(&tr->systems);
5758 INIT_LIST_HEAD(&tr->events);
5759
5760 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
5761 goto out_free_tr;
5762
5763 /* Holder for file callbacks */
5764 tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
5765 tr->trace_cpu.tr = tr;
5766
5767 tr->dir = debugfs_create_dir(name, trace_instance_dir);
5768 if (!tr->dir)
5769 goto out_free_tr;
5770
5771 ret = event_trace_add_tracer(tr->dir, tr);
5772 if (ret)
5773 goto out_free_tr;
5774
5775 init_tracer_debugfs(tr, tr->dir);
5776
5777 list_add(&tr->list, &ftrace_trace_arrays);
5778
5779 mutex_unlock(&trace_types_lock);
5780
5781 return 0;
5782
5783 out_free_tr:
5784 if (tr->trace_buffer.buffer)
5785 ring_buffer_free(tr->trace_buffer.buffer);
5786 kfree(tr->name);
5787 kfree(tr);
5788
5789 out_unlock:
5790 mutex_unlock(&trace_types_lock);
5791
5792 return ret;
5793
5794}
5795
5796static int instance_delete(const char *name)
5797{
5798 struct trace_array *tr;
5799 int found = 0;
5800 int ret;
5801
5802 mutex_lock(&trace_types_lock);
5803
5804 ret = -ENODEV;
5805 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
5806 if (tr->name && strcmp(tr->name, name) == 0) {
5807 found = 1;
5808 break;
5809 }
5810 }
5811 if (!found)
5812 goto out_unlock;
5813
5814 ret = -EBUSY;
5815 if (tr->ref)
5816 goto out_unlock;
5817
5818 list_del(&tr->list);
5819
5820 event_trace_del_tracer(tr);
5821 debugfs_remove_recursive(tr->dir);
5822 free_percpu(tr->trace_buffer.data);
5823 ring_buffer_free(tr->trace_buffer.buffer);
5824
5825 kfree(tr->name);
5826 kfree(tr);
5827
5828 ret = 0;
5829
5830 out_unlock:
5831 mutex_unlock(&trace_types_lock);
5832
5833 return ret;
5834}
5835
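
new_instance_create() and instance_delete() above manage the named multi-buffer instances under trace_types_lock: a duplicate name fails with -EEXIST, a half-built instance is torn down on error, and an instance with live readers (tr->ref) refuses to die with -EBUSY. A userspace model of that registry using a singly linked list and errno-style returns (the caller is assumed to hold the equivalent lock; none of these names are kernel API):

#include <errno.h>
#include <stdlib.h>
#include <string.h>

struct instance {
	char *name;
	int ref;                   /* bumped by open, dropped by release */
	struct instance *next;
};

static struct instance *instances;   /* walked only under the caller's lock */

static int instance_create(const char *name)
{
	struct instance *it;

	for (it = instances; it; it = it->next)
		if (strcmp(it->name, name) == 0)
			return -EEXIST;

	it = calloc(1, sizeof(*it));
	if (!it || !(it->name = strdup(name))) {
		free(it);
		return -ENOMEM;
	}
	it->next = instances;
	instances = it;
	return 0;
}

static int instance_delete_model(const char *name)
{
	struct instance **p, *it;

	for (p = &instances; (it = *p); p = &it->next) {
		if (strcmp(it->name, name) != 0)
			continue;
		if (it->ref)
			return -EBUSY;     /* a trace_pipe_raw reader is still attached */
		*p = it->next;
		free(it->name);
		free(it);
		return 0;
	}
	return -ENODEV;
}

The ref check is what makes the mkdir/rmdir interface safe: tracing_buffers_open() takes the reference, so rmdir on an instance someone is still splicing from is simply refused.
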
5836static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
5837{
5838 struct dentry *parent;
5839 int ret;
5840
5841 /* Paranoid: Make sure the parent is the "instances" directory */
5842 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
5843 if (WARN_ON_ONCE(parent != trace_instance_dir))
5844 return -ENOENT;
5845
5846 /*
5847 * The inode mutex is locked, but debugfs_create_dir() will also
5848 * take the mutex. As the instances directory can not be destroyed
5849 * or changed in any other way, it is safe to unlock it, and
5850 * let the dentry try. If two users try to make the same dir at
5851 * the same time, then the new_instance_create() will determine the
5852 * winner.
5853 */
5854 mutex_unlock(&inode->i_mutex);
5855
5856 ret = new_instance_create(dentry->d_iname);
5857
5858 mutex_lock(&inode->i_mutex);
5859
5860 return ret;
5861}
5862
5863static int instance_rmdir(struct inode *inode, struct dentry *dentry)
5864{
5865 struct dentry *parent;
5866 int ret;
5867
5868 /* Paranoid: Make sure the parent is the "instances" directory */
5869 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
5870 if (WARN_ON_ONCE(parent != trace_instance_dir))
5871 return -ENOENT;
5872
5873 /* The caller did a dget() on dentry */
5874 mutex_unlock(&dentry->d_inode->i_mutex);
5875
5876 /*
5877 * The inode mutex is locked, but debugfs_create_dir() will also
5878 * take the mutex. As the instances directory can not be destroyed
5879 * or changed in any other way, it is safe to unlock it, and
5880 * let the dentry try. If two users try to make the same dir at
5881 * the same time, then the instance_delete() will determine the
5882 * winner.
5883 */
5884 mutex_unlock(&inode->i_mutex);
5885
5886 ret = instance_delete(dentry->d_iname);
5887
5888 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
5889 mutex_lock(&dentry->d_inode->i_mutex);
5890
5891 return ret;
5892}
5893
5894static const struct inode_operations instance_dir_inode_operations = {
5895 .lookup = simple_lookup,
5896 .mkdir = instance_mkdir,
5897 .rmdir = instance_rmdir,
5898};
5899
5900static __init void create_trace_instances(struct dentry *d_tracer)
5901{
5902 trace_instance_dir = debugfs_create_dir("instances", d_tracer);
5903 if (WARN_ON(!trace_instance_dir))
5904 return;
5905
5906 /* Hijack the dir inode operations, to allow mkdir */
5907 trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
5908}
5909
5910static void
5911init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
5912{
5913 int cpu;
5914
5915 trace_create_file("trace_options", 0644, d_tracer,
5916 tr, &tracing_iter_fops);
5917
5918 trace_create_file("trace", 0644, d_tracer,
5919 (void *)&tr->trace_cpu, &tracing_fops);
5920
5921 trace_create_file("trace_pipe", 0444, d_tracer,
5922 (void *)&tr->trace_cpu, &tracing_pipe_fops);
5923
5924 trace_create_file("buffer_size_kb", 0644, d_tracer,
5925 (void *)&tr->trace_cpu, &tracing_entries_fops);
5926
5927 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5928 tr, &tracing_total_entries_fops);
5929
5930 trace_create_file("free_buffer", 0644, d_tracer,
5931 tr, &tracing_free_buffer_fops);
5932
5933 trace_create_file("trace_marker", 0220, d_tracer,
5934 tr, &tracing_mark_fops);
5935
5936 trace_create_file("trace_clock", 0644, d_tracer, tr,
5937 &trace_clock_fops);
5938
5939 trace_create_file("tracing_on", 0644, d_tracer,
5940 tr, &rb_simple_fops);
5941
5942#ifdef CONFIG_TRACER_SNAPSHOT
5943 trace_create_file("snapshot", 0644, d_tracer,
5944 (void *)&tr->trace_cpu, &snapshot_fops);
5945#endif
5946
5947 for_each_tracing_cpu(cpu)
5948 tracing_init_debugfs_percpu(tr, cpu);
5949
5950}
5951
4964static __init int tracer_init_debugfs(void) 5952static __init int tracer_init_debugfs(void)
4965{ 5953{
4966 struct dentry *d_tracer; 5954 struct dentry *d_tracer;
4967 int cpu;
4968 5955
4969 trace_access_lock_init(); 5956 trace_access_lock_init();
4970 5957
4971 d_tracer = tracing_init_dentry(); 5958 d_tracer = tracing_init_dentry();
5959 if (!d_tracer)
5960 return 0;
4972 5961
4973 trace_create_file("trace_options", 0644, d_tracer, 5962 init_tracer_debugfs(&global_trace, d_tracer);
4974 NULL, &tracing_iter_fops);
4975 5963
4976 trace_create_file("tracing_cpumask", 0644, d_tracer, 5964 trace_create_file("tracing_cpumask", 0644, d_tracer,
4977 NULL, &tracing_cpumask_fops); 5965 &global_trace, &tracing_cpumask_fops);
4978
4979 trace_create_file("trace", 0644, d_tracer,
4980 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
4981 5966
4982 trace_create_file("available_tracers", 0444, d_tracer, 5967 trace_create_file("available_tracers", 0444, d_tracer,
4983 &global_trace, &show_traces_fops); 5968 &global_trace, &show_traces_fops);
@@ -4996,44 +5981,17 @@ static __init int tracer_init_debugfs(void)
4996 trace_create_file("README", 0444, d_tracer, 5981 trace_create_file("README", 0444, d_tracer,
4997 NULL, &tracing_readme_fops); 5982 NULL, &tracing_readme_fops);
4998 5983
4999 trace_create_file("trace_pipe", 0444, d_tracer,
5000 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
5001
5002 trace_create_file("buffer_size_kb", 0644, d_tracer,
5003 (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);
5004
5005 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5006 &global_trace, &tracing_total_entries_fops);
5007
5008 trace_create_file("free_buffer", 0644, d_tracer,
5009 &global_trace, &tracing_free_buffer_fops);
5010
5011 trace_create_file("trace_marker", 0220, d_tracer,
5012 NULL, &tracing_mark_fops);
5013
5014 trace_create_file("saved_cmdlines", 0444, d_tracer, 5984 trace_create_file("saved_cmdlines", 0444, d_tracer,
5015 NULL, &tracing_saved_cmdlines_fops); 5985 NULL, &tracing_saved_cmdlines_fops);
5016 5986
5017 trace_create_file("trace_clock", 0644, d_tracer, NULL,
5018 &trace_clock_fops);
5019
5020 trace_create_file("tracing_on", 0644, d_tracer,
5021 &global_trace, &rb_simple_fops);
5022
5023#ifdef CONFIG_DYNAMIC_FTRACE 5987#ifdef CONFIG_DYNAMIC_FTRACE
5024 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 5988 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
5025 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 5989 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
5026#endif 5990#endif
5027 5991
5028#ifdef CONFIG_TRACER_SNAPSHOT 5992 create_trace_instances(d_tracer);
5029 trace_create_file("snapshot", 0644, d_tracer,
5030 (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
5031#endif
5032
5033 create_trace_options_dir();
5034 5993
5035 for_each_tracing_cpu(cpu) 5994 create_trace_options_dir(&global_trace);
5036 tracing_init_debugfs_percpu(cpu);
5037 5995
5038 return 0; 5996 return 0;
5039} 5997}
@@ -5089,8 +6047,8 @@ void
5089trace_printk_seq(struct trace_seq *s) 6047trace_printk_seq(struct trace_seq *s)
5090{ 6048{
5091 /* Probably should print a warning here. */ 6049 /* Probably should print a warning here. */
5092 if (s->len >= 1000) 6050 if (s->len >= TRACE_MAX_PRINT)
5093 s->len = 1000; 6051 s->len = TRACE_MAX_PRINT;
5094 6052
5095 /* should be zero ended, but we are paranoid. */ 6053 /* should be zero ended, but we are paranoid. */
5096 s->buffer[s->len] = 0; 6054 s->buffer[s->len] = 0;
@@ -5103,46 +6061,43 @@ trace_printk_seq(struct trace_seq *s)
5103void trace_init_global_iter(struct trace_iterator *iter) 6061void trace_init_global_iter(struct trace_iterator *iter)
5104{ 6062{
5105 iter->tr = &global_trace; 6063 iter->tr = &global_trace;
5106 iter->trace = current_trace; 6064 iter->trace = iter->tr->current_trace;
5107 iter->cpu_file = TRACE_PIPE_ALL_CPU; 6065 iter->cpu_file = RING_BUFFER_ALL_CPUS;
6066 iter->trace_buffer = &global_trace.trace_buffer;
5108} 6067}
5109 6068
5110static void 6069void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
5111__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5112{ 6070{
5113 static arch_spinlock_t ftrace_dump_lock =
5114 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
5115 /* use static because iter can be a bit big for the stack */ 6071 /* use static because iter can be a bit big for the stack */
5116 static struct trace_iterator iter; 6072 static struct trace_iterator iter;
6073 static atomic_t dump_running;
5117 unsigned int old_userobj; 6074 unsigned int old_userobj;
5118 static int dump_ran;
5119 unsigned long flags; 6075 unsigned long flags;
5120 int cnt = 0, cpu; 6076 int cnt = 0, cpu;
5121 6077
5122 /* only one dump */ 6078 /* Only allow one dump user at a time. */
5123 local_irq_save(flags); 6079 if (atomic_inc_return(&dump_running) != 1) {
5124 arch_spin_lock(&ftrace_dump_lock); 6080 atomic_dec(&dump_running);
5125 if (dump_ran) 6081 return;
5126 goto out; 6082 }
5127
5128 dump_ran = 1;
5129 6083
6084 /*
6085 * Always turn off tracing when we dump.
6086 * We don't need to show trace output of what happens
6087 * between multiple crashes.
6088 *
6089 * If the user does a sysrq-z, then they can re-enable
6090 * tracing with echo 1 > tracing_on.
6091 */
5130 tracing_off(); 6092 tracing_off();
5131 6093
5132 /* Did function tracer already get disabled? */ 6094 local_irq_save(flags);
5133 if (ftrace_is_dead()) {
5134 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
5135 printk("# MAY BE MISSING FUNCTION EVENTS\n");
5136 }
5137
5138 if (disable_tracing)
5139 ftrace_kill();
5140 6095
5141 /* Simulate the iterator */ 6096 /* Simulate the iterator */
5142 trace_init_global_iter(&iter); 6097 trace_init_global_iter(&iter);
5143 6098
5144 for_each_tracing_cpu(cpu) { 6099 for_each_tracing_cpu(cpu) {
5145 atomic_inc(&iter.tr->data[cpu]->disabled); 6100 atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);
5146 } 6101 }
5147 6102
5148 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 6103 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
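The arch_spinlock_t/dump_ran pair is replaced by one atomic counter: only the caller that takes dump_running from 0 to 1 gets to dump, everyone else decrements and leaves. A rough userspace equivalent of that guard using C11 atomics (atomic_fetch_add() returns the old value, so old + 1 plays the role of the kernel's atomic_inc_return()):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int dump_running;

/* Only the caller that moves the counter from 0 to 1 may proceed. */
static void dump_once(const char *who)
{
        if (atomic_fetch_add(&dump_running, 1) + 1 != 1) {
                atomic_fetch_sub(&dump_running, 1);
                printf("%s: dump already in progress, bailing\n", who);
                return;
        }
        printf("%s: dumping the buffers...\n", who);
        /* ... walk the ring buffers here ... */
        atomic_fetch_sub(&dump_running, 1);
}

int main(void)
{
        dump_once("cpu0");
        dump_once("cpu1");
        return 0;
}

Unlike the old dump_ran flag, the counter is released at the end of the dump, so a later sysrq-z can dump again.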
@@ -5152,7 +6107,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5152 6107
5153 switch (oops_dump_mode) { 6108 switch (oops_dump_mode) {
5154 case DUMP_ALL: 6109 case DUMP_ALL:
5155 iter.cpu_file = TRACE_PIPE_ALL_CPU; 6110 iter.cpu_file = RING_BUFFER_ALL_CPUS;
5156 break; 6111 break;
5157 case DUMP_ORIG: 6112 case DUMP_ORIG:
5158 iter.cpu_file = raw_smp_processor_id(); 6113 iter.cpu_file = raw_smp_processor_id();
@@ -5161,11 +6116,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5161 goto out_enable; 6116 goto out_enable;
5162 default: 6117 default:
5163 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); 6118 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
5164 iter.cpu_file = TRACE_PIPE_ALL_CPU; 6119 iter.cpu_file = RING_BUFFER_ALL_CPUS;
5165 } 6120 }
5166 6121
5167 printk(KERN_TRACE "Dumping ftrace buffer:\n"); 6122 printk(KERN_TRACE "Dumping ftrace buffer:\n");
5168 6123
6124 /* Did function tracer already get disabled? */
6125 if (ftrace_is_dead()) {
6126 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
6127 printk("# MAY BE MISSING FUNCTION EVENTS\n");
6128 }
6129
5169 /* 6130 /*
5170 * We need to stop all tracing on all CPUS to read the 6131 * We need to stop all tracing on all CPUS to read the
5171 * the next buffer. This is a bit expensive, but is 6132 * the next buffer. This is a bit expensive, but is
@@ -5205,33 +6166,19 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5205 printk(KERN_TRACE "---------------------------------\n"); 6166 printk(KERN_TRACE "---------------------------------\n");
5206 6167
5207 out_enable: 6168 out_enable:
5208 /* Re-enable tracing if requested */ 6169 trace_flags |= old_userobj;
5209 if (!disable_tracing) {
5210 trace_flags |= old_userobj;
5211 6170
5212 for_each_tracing_cpu(cpu) { 6171 for_each_tracing_cpu(cpu) {
5213 atomic_dec(&iter.tr->data[cpu]->disabled); 6172 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
5214 }
5215 tracing_on();
5216 } 6173 }
5217 6174 atomic_dec(&dump_running);
5218 out:
5219 arch_spin_unlock(&ftrace_dump_lock);
5220 local_irq_restore(flags); 6175 local_irq_restore(flags);
5221} 6176}
5222
5223/* By default: disable tracing after the dump */
5224void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
5225{
5226 __ftrace_dump(true, oops_dump_mode);
5227}
5228EXPORT_SYMBOL_GPL(ftrace_dump); 6177EXPORT_SYMBOL_GPL(ftrace_dump);
5229 6178
5230__init static int tracer_alloc_buffers(void) 6179__init static int tracer_alloc_buffers(void)
5231{ 6180{
5232 int ring_buf_size; 6181 int ring_buf_size;
5233 enum ring_buffer_flags rb_flags;
5234 int i;
5235 int ret = -ENOMEM; 6182 int ret = -ENOMEM;
5236 6183
5237 6184
@@ -5252,49 +6199,27 @@ __init static int tracer_alloc_buffers(void)
5252 else 6199 else
5253 ring_buf_size = 1; 6200 ring_buf_size = 1;
5254 6201
5255 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
5256
5257 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 6202 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
5258 cpumask_copy(tracing_cpumask, cpu_all_mask); 6203 cpumask_copy(tracing_cpumask, cpu_all_mask);
5259 6204
6205 raw_spin_lock_init(&global_trace.start_lock);
6206
5260 /* TODO: make the number of buffers hot pluggable with CPUS */ 6207 /* TODO: make the number of buffers hot pluggable with CPUS */
5261 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); 6208 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
5262 if (!global_trace.buffer) {
5263 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 6209 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
5264 WARN_ON(1); 6210 WARN_ON(1);
5265 goto out_free_cpumask; 6211 goto out_free_cpumask;
5266 } 6212 }
6213
5267 if (global_trace.buffer_disabled) 6214 if (global_trace.buffer_disabled)
5268 tracing_off(); 6215 tracing_off();
5269 6216
5270
5271#ifdef CONFIG_TRACER_MAX_TRACE
5272 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
5273 if (!max_tr.buffer) {
5274 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
5275 WARN_ON(1);
5276 ring_buffer_free(global_trace.buffer);
5277 goto out_free_cpumask;
5278 }
5279#endif
5280
5281 /* Allocate the first page for all buffers */
5282 for_each_tracing_cpu(i) {
5283 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
5284 max_tr.data[i] = &per_cpu(max_tr_data, i);
5285 }
5286
5287 set_buffer_entries(&global_trace,
5288 ring_buffer_size(global_trace.buffer, 0));
5289#ifdef CONFIG_TRACER_MAX_TRACE
5290 set_buffer_entries(&max_tr, 1);
5291#endif
5292
5293 trace_init_cmdlines(); 6217 trace_init_cmdlines();
5294 init_irq_work(&trace_work_wakeup, trace_wake_up);
5295 6218
5296 register_tracer(&nop_trace); 6219 register_tracer(&nop_trace);
5297 6220
6221 global_trace.current_trace = &nop_trace;
6222
5298 /* All seems OK, enable tracing */ 6223 /* All seems OK, enable tracing */
5299 tracing_disabled = 0; 6224 tracing_disabled = 0;
5300 6225
@@ -5303,16 +6228,32 @@ __init static int tracer_alloc_buffers(void)
5303 6228
5304 register_die_notifier(&trace_die_notifier); 6229 register_die_notifier(&trace_die_notifier);
5305 6230
6231 global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
6232
6233 /* Holder for file callbacks */
6234 global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
6235 global_trace.trace_cpu.tr = &global_trace;
6236
6237 INIT_LIST_HEAD(&global_trace.systems);
6238 INIT_LIST_HEAD(&global_trace.events);
6239 list_add(&global_trace.list, &ftrace_trace_arrays);
6240
5306 while (trace_boot_options) { 6241 while (trace_boot_options) {
5307 char *option; 6242 char *option;
5308 6243
5309 option = strsep(&trace_boot_options, ","); 6244 option = strsep(&trace_boot_options, ",");
5310 trace_set_options(option); 6245 trace_set_options(&global_trace, option);
5311 } 6246 }
5312 6247
6248 register_snapshot_cmd();
6249
5313 return 0; 6250 return 0;
5314 6251
5315out_free_cpumask: 6252out_free_cpumask:
6253 free_percpu(global_trace.trace_buffer.data);
6254#ifdef CONFIG_TRACER_MAX_TRACE
6255 free_percpu(global_trace.max_buffer.data);
6256#endif
5316 free_cpumask_var(tracing_cpumask); 6257 free_cpumask_var(tracing_cpumask);
5317out_free_buffer_mask: 6258out_free_buffer_mask:
5318 free_cpumask_var(tracing_buffer_mask); 6259 free_cpumask_var(tracing_buffer_mask);
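trace_set_options() now takes the target trace_array, but the boot-string handling is unchanged: trace_boot_options is split on commas with strsep() and each token is applied in turn. A self-contained sketch of that loop; the option strings and the set_option() helper are placeholders, not real tracing options:

#define _DEFAULT_SOURCE                  /* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

/* Hand each comma-separated token to a per-array option setter. */
static void set_option(const char *tr_name, const char *opt)
{
        printf("%s: set option '%s'\n", tr_name, opt);
}

int main(void)
{
        char  boot_options[] = "optA,optB,nooptC";   /* example string only */
        char *rest = boot_options;
        char *option;

        while ((option = strsep(&rest, ",")) != NULL)
                set_option("global_trace", option);
        return 0;
}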
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 57d7e5397d56..711ca7d3e7f1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,6 +13,11 @@
13#include <linux/trace_seq.h> 13#include <linux/trace_seq.h>
14#include <linux/ftrace_event.h> 14#include <linux/ftrace_event.h>
15 15
16#ifdef CONFIG_FTRACE_SYSCALLS
17#include <asm/unistd.h> /* For NR_SYSCALLS */
18#include <asm/syscall.h> /* some archs define it here */
19#endif
20
16enum trace_type { 21enum trace_type {
17 __TRACE_FIRST_TYPE = 0, 22 __TRACE_FIRST_TYPE = 0,
18 23
@@ -29,6 +34,7 @@ enum trace_type {
29 TRACE_GRAPH_ENT, 34 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK, 35 TRACE_USER_STACK,
31 TRACE_BLK, 36 TRACE_BLK,
37 TRACE_BPUTS,
32 38
33 __TRACE_LAST_TYPE, 39 __TRACE_LAST_TYPE,
34}; 40};
@@ -103,11 +109,6 @@ struct kretprobe_trace_entry_head {
103 unsigned long ret_ip; 109 unsigned long ret_ip;
104}; 110};
105 111
106struct uprobe_trace_entry_head {
107 struct trace_entry ent;
108 unsigned long ip;
109};
110
111/* 112/*
112 * trace_flag_type is an enumeration that holds different 113 * trace_flag_type is an enumeration that holds different
113 * states when a trace occurs. These are: 114 * states when a trace occurs. These are:
@@ -127,12 +128,21 @@ enum trace_flag_type {
127 128
128#define TRACE_BUF_SIZE 1024 129#define TRACE_BUF_SIZE 1024
129 130
131struct trace_array;
132
133struct trace_cpu {
134 struct trace_array *tr;
135 struct dentry *dir;
136 int cpu;
137};
138
130/* 139/*
131 * The CPU trace array - it consists of thousands of trace entries 140 * The CPU trace array - it consists of thousands of trace entries
132 * plus some other descriptor data: (for example which task started 141 * plus some other descriptor data: (for example which task started
133 * the trace, etc.) 142 * the trace, etc.)
134 */ 143 */
135struct trace_array_cpu { 144struct trace_array_cpu {
145 struct trace_cpu trace_cpu;
136 atomic_t disabled; 146 atomic_t disabled;
137 void *buffer_page; /* ring buffer spare */ 147 void *buffer_page; /* ring buffer spare */
138 148
@@ -151,20 +161,83 @@ struct trace_array_cpu {
151 char comm[TASK_COMM_LEN]; 161 char comm[TASK_COMM_LEN];
152}; 162};
153 163
164struct tracer;
165
166struct trace_buffer {
167 struct trace_array *tr;
168 struct ring_buffer *buffer;
169 struct trace_array_cpu __percpu *data;
170 cycle_t time_start;
171 int cpu;
172};
173
154/* 174/*
155 * The trace array - an array of per-CPU trace arrays. This is the 175 * The trace array - an array of per-CPU trace arrays. This is the
156 * highest level data structure that individual tracers deal with. 176 * highest level data structure that individual tracers deal with.
157 * They have on/off state as well: 177 * They have on/off state as well:
158 */ 178 */
159struct trace_array { 179struct trace_array {
160 struct ring_buffer *buffer; 180 struct list_head list;
161 int cpu; 181 char *name;
182 struct trace_buffer trace_buffer;
183#ifdef CONFIG_TRACER_MAX_TRACE
184 /*
185 * The max_buffer is used to snapshot the trace when a maximum
186 * latency is reached, or when the user initiates a snapshot.
187 * Some tracers will use this to store a maximum trace while
188 * it continues examining live traces.
189 *
190 * The buffers for the max_buffer are set up the same as the trace_buffer
191 * When a snapshot is taken, the buffer of the max_buffer is swapped
192 * with the buffer of the trace_buffer and the buffers are reset for
193 * the trace_buffer so the tracing can continue.
194 */
195 struct trace_buffer max_buffer;
196 bool allocated_snapshot;
197#endif
162 int buffer_disabled; 198 int buffer_disabled;
163 cycle_t time_start; 199 struct trace_cpu trace_cpu; /* place holder */
200#ifdef CONFIG_FTRACE_SYSCALLS
201 int sys_refcount_enter;
202 int sys_refcount_exit;
203 DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
204 DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
205#endif
206 int stop_count;
207 int clock_id;
208 struct tracer *current_trace;
209 unsigned int flags;
210 raw_spinlock_t start_lock;
211 struct dentry *dir;
212 struct dentry *options;
213 struct dentry *percpu_dir;
214 struct dentry *event_dir;
215 struct list_head systems;
216 struct list_head events;
164 struct task_struct *waiter; 217 struct task_struct *waiter;
165 struct trace_array_cpu *data[NR_CPUS]; 218 int ref;
219};
220
221enum {
222 TRACE_ARRAY_FL_GLOBAL = (1 << 0)
166}; 223};
167 224
225extern struct list_head ftrace_trace_arrays;
226
227/*
228 * The global tracer (top) should be the first trace array added,
229 * but we check the flag anyway.
230 */
231static inline struct trace_array *top_trace_array(void)
232{
233 struct trace_array *tr;
234
235 tr = list_entry(ftrace_trace_arrays.prev,
236 typeof(*tr), list);
237 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
238 return tr;
239}
240
168#define FTRACE_CMP_TYPE(var, type) \ 241#define FTRACE_CMP_TYPE(var, type) \
169 __builtin_types_compatible_p(typeof(var), type *) 242 __builtin_types_compatible_p(typeof(var), type *)
170 243
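top_trace_array() leans on insertion order: the global tracer is added to ftrace_trace_arrays first, and since list_add() inserts at the head, the oldest entry ends up at head->prev, which list_entry() (container_of()) turns back into the enclosing trace_array. A userspace sketch of the same pointer trick with a hand-rolled circular list; this is not the kernel's <linux/list.h>, only enough of it to show the arithmetic:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct trace_array_like {
        const char      *name;
        struct list_head list;
};

/* Insert right after the head, like the kernel's list_add(). */
static void list_add_head(struct list_head *entry, struct list_head *head)
{
        entry->next = head->next;
        entry->prev = head;
        head->next->prev = entry;
        head->next = entry;
}

int main(void)
{
        struct list_head arrays = { &arrays, &arrays };
        struct trace_array_like global = { .name = "global" };
        struct trace_array_like inst   = { .name = "instance1" };

        list_add_head(&global.list, &arrays);  /* added first ... */
        list_add_head(&inst.list, &arrays);    /* ... later adds push it toward the tail */

        /* The oldest entry is head->prev, exactly what top_trace_array() relies on. */
        struct trace_array_like *top =
                container_of(arrays.prev, struct trace_array_like, list);
        printf("top trace array: %s\n", top->name);
        return 0;
}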
@@ -200,6 +273,7 @@ extern void __ftrace_bad_type(void);
200 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 273 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
201 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 274 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
202 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ 275 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
276 IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
203 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 277 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
204 TRACE_MMIO_RW); \ 278 TRACE_MMIO_RW); \
205 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 279 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
@@ -283,11 +357,16 @@ struct tracer {
283 enum print_line_t (*print_line)(struct trace_iterator *iter); 357 enum print_line_t (*print_line)(struct trace_iterator *iter);
284 /* If you handled the flag setting, return 0 */ 358 /* If you handled the flag setting, return 0 */
285 int (*set_flag)(u32 old_flags, u32 bit, int set); 359 int (*set_flag)(u32 old_flags, u32 bit, int set);
360 /* Return 0 if OK with change, else return non-zero */
361 int (*flag_changed)(struct tracer *tracer,
362 u32 mask, int set);
286 struct tracer *next; 363 struct tracer *next;
287 struct tracer_flags *flags; 364 struct tracer_flags *flags;
288 bool print_max; 365 bool print_max;
366 bool enabled;
367#ifdef CONFIG_TRACER_MAX_TRACE
289 bool use_max_tr; 368 bool use_max_tr;
290 bool allocated_snapshot; 369#endif
291}; 370};
292 371
293 372
@@ -423,8 +502,6 @@ static __always_inline void trace_clear_recursion(int bit)
423 current->trace_recursion = val; 502 current->trace_recursion = val;
424} 503}
425 504
426#define TRACE_PIPE_ALL_CPU -1
427
428static inline struct ring_buffer_iter * 505static inline struct ring_buffer_iter *
429trace_buffer_iter(struct trace_iterator *iter, int cpu) 506trace_buffer_iter(struct trace_iterator *iter, int cpu)
430{ 507{
@@ -435,10 +512,10 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
435 512
436int tracer_init(struct tracer *t, struct trace_array *tr); 513int tracer_init(struct tracer *t, struct trace_array *tr);
437int tracing_is_enabled(void); 514int tracing_is_enabled(void);
438void tracing_reset(struct trace_array *tr, int cpu); 515void tracing_reset(struct trace_buffer *buf, int cpu);
439void tracing_reset_online_cpus(struct trace_array *tr); 516void tracing_reset_online_cpus(struct trace_buffer *buf);
440void tracing_reset_current(int cpu); 517void tracing_reset_current(int cpu);
441void tracing_reset_current_online_cpus(void); 518void tracing_reset_all_online_cpus(void);
442int tracing_open_generic(struct inode *inode, struct file *filp); 519int tracing_open_generic(struct inode *inode, struct file *filp);
443struct dentry *trace_create_file(const char *name, 520struct dentry *trace_create_file(const char *name,
444 umode_t mode, 521 umode_t mode,
@@ -446,6 +523,7 @@ struct dentry *trace_create_file(const char *name,
446 void *data, 523 void *data,
447 const struct file_operations *fops); 524 const struct file_operations *fops);
448 525
526struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
449struct dentry *tracing_init_dentry(void); 527struct dentry *tracing_init_dentry(void);
450 528
451struct ring_buffer_event; 529struct ring_buffer_event;
@@ -579,7 +657,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
579#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 657#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
580extern int DYN_FTRACE_TEST_NAME2(void); 658extern int DYN_FTRACE_TEST_NAME2(void);
581 659
582extern int ring_buffer_expanded; 660extern bool ring_buffer_expanded;
583extern bool tracing_selftest_disabled; 661extern bool tracing_selftest_disabled;
584DECLARE_PER_CPU(int, ftrace_cpu_disabled); 662DECLARE_PER_CPU(int, ftrace_cpu_disabled);
585 663
@@ -615,6 +693,8 @@ trace_array_vprintk(struct trace_array *tr,
615 unsigned long ip, const char *fmt, va_list args); 693 unsigned long ip, const char *fmt, va_list args);
616int trace_array_printk(struct trace_array *tr, 694int trace_array_printk(struct trace_array *tr,
617 unsigned long ip, const char *fmt, ...); 695 unsigned long ip, const char *fmt, ...);
696int trace_array_printk_buf(struct ring_buffer *buffer,
697 unsigned long ip, const char *fmt, ...);
618void trace_printk_seq(struct trace_seq *s); 698void trace_printk_seq(struct trace_seq *s);
619enum print_line_t print_trace_line(struct trace_iterator *iter); 699enum print_line_t print_trace_line(struct trace_iterator *iter);
620 700
@@ -782,6 +862,7 @@ enum trace_iterator_flags {
782 TRACE_ITER_STOP_ON_FREE = 0x400000, 862 TRACE_ITER_STOP_ON_FREE = 0x400000,
783 TRACE_ITER_IRQ_INFO = 0x800000, 863 TRACE_ITER_IRQ_INFO = 0x800000,
784 TRACE_ITER_MARKERS = 0x1000000, 864 TRACE_ITER_MARKERS = 0x1000000,
865 TRACE_ITER_FUNCTION = 0x2000000,
785}; 866};
786 867
787/* 868/*
@@ -828,8 +909,8 @@ enum {
828 909
829struct ftrace_event_field { 910struct ftrace_event_field {
830 struct list_head link; 911 struct list_head link;
831 char *name; 912 const char *name;
832 char *type; 913 const char *type;
833 int filter_type; 914 int filter_type;
834 int offset; 915 int offset;
835 int size; 916 int size;
@@ -847,12 +928,19 @@ struct event_filter {
847struct event_subsystem { 928struct event_subsystem {
848 struct list_head list; 929 struct list_head list;
849 const char *name; 930 const char *name;
850 struct dentry *entry;
851 struct event_filter *filter; 931 struct event_filter *filter;
852 int nr_events;
853 int ref_count; 932 int ref_count;
854}; 933};
855 934
935struct ftrace_subsystem_dir {
936 struct list_head list;
937 struct event_subsystem *subsystem;
938 struct trace_array *tr;
939 struct dentry *entry;
940 int ref_count;
941 int nr_events;
942};
943
856#define FILTER_PRED_INVALID ((unsigned short)-1) 944#define FILTER_PRED_INVALID ((unsigned short)-1)
857#define FILTER_PRED_IS_RIGHT (1 << 15) 945#define FILTER_PRED_IS_RIGHT (1 << 15)
858#define FILTER_PRED_FOLD (1 << 15) 946#define FILTER_PRED_FOLD (1 << 15)
@@ -902,22 +990,20 @@ struct filter_pred {
902 unsigned short right; 990 unsigned short right;
903}; 991};
904 992
905extern struct list_head ftrace_common_fields;
906
907extern enum regex_type 993extern enum regex_type
908filter_parse_regex(char *buff, int len, char **search, int *not); 994filter_parse_regex(char *buff, int len, char **search, int *not);
909extern void print_event_filter(struct ftrace_event_call *call, 995extern void print_event_filter(struct ftrace_event_call *call,
910 struct trace_seq *s); 996 struct trace_seq *s);
911extern int apply_event_filter(struct ftrace_event_call *call, 997extern int apply_event_filter(struct ftrace_event_call *call,
912 char *filter_string); 998 char *filter_string);
913extern int apply_subsystem_event_filter(struct event_subsystem *system, 999extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
914 char *filter_string); 1000 char *filter_string);
915extern void print_subsystem_event_filter(struct event_subsystem *system, 1001extern void print_subsystem_event_filter(struct event_subsystem *system,
916 struct trace_seq *s); 1002 struct trace_seq *s);
917extern int filter_assign_type(const char *type); 1003extern int filter_assign_type(const char *type);
918 1004
919struct list_head * 1005struct ftrace_event_field *
920trace_get_fields(struct ftrace_event_call *event_call); 1006trace_find_event_field(struct ftrace_event_call *call, char *name);
921 1007
922static inline int 1008static inline int
923filter_check_discard(struct ftrace_event_call *call, void *rec, 1009filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -934,6 +1020,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
934} 1020}
935 1021
936extern void trace_event_enable_cmd_record(bool enable); 1022extern void trace_event_enable_cmd_record(bool enable);
1023extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
1024extern int event_trace_del_tracer(struct trace_array *tr);
937 1025
938extern struct mutex event_mutex; 1026extern struct mutex event_mutex;
939extern struct list_head ftrace_events; 1027extern struct list_head ftrace_events;
@@ -943,6 +1031,19 @@ extern const char *__stop___trace_bprintk_fmt[];
943 1031
944void trace_printk_init_buffers(void); 1032void trace_printk_init_buffers(void);
945void trace_printk_start_comm(void); 1033void trace_printk_start_comm(void);
1034int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
1035int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
1036
1037/*
1038 * Normal trace_printk() and friends allocates special buffers
1039 * to do the manipulation, as well as saves the print formats
1040 * into sections to display. But the trace infrastructure wants
1041 * to use these without the added overhead at the price of being
1042 * a bit slower (used mainly for warnings, where we don't care
1043 * about performance). The internal_trace_puts() is for such
1044 * a purpose.
1045 */
1046#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
946 1047
947#undef FTRACE_ENTRY 1048#undef FTRACE_ENTRY
948#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 1049#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
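internal_trace_puts() skips the trace_printk() machinery by expanding, at the call site, into __trace_puts() with the caller's instruction pointer and a strlen() the compiler can fold for string literals. The same "capture the call site in a macro" idea in portable C, using __FILE__/__LINE__ in place of _THIS_IP_ (illustrative only, not the kernel API):

#include <stdio.h>
#include <string.h>

/* Low-level writer: takes an explicit call-site tag and a length. */
static int low_level_puts(const char *site, const char *str, size_t len)
{
        return printf("[%s] %.*s", site, (int)len, str);
}

#define STR_(x) #x
#define STR(x)  STR_(x)

/* Expanded where it is used, so __FILE__/__LINE__ name the caller and
 * strlen() on a string literal is folded at compile time. */
#define internal_puts(str) \
        low_level_puts(__FILE__ ":" STR(__LINE__), (str), strlen(str))

int main(void)
{
        internal_puts("hello from the call site\n");
        return 0;
}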
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 95e96842ed29..d594da0dc03c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
32{ 32{
33 struct ftrace_event_call *call = &event_branch; 33 struct ftrace_event_call *call = &event_branch;
34 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
35 struct trace_array_cpu *data;
35 struct ring_buffer_event *event; 36 struct ring_buffer_event *event;
36 struct trace_branch *entry; 37 struct trace_branch *entry;
37 struct ring_buffer *buffer; 38 struct ring_buffer *buffer;
@@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
51 52
52 local_irq_save(flags); 53 local_irq_save(flags);
53 cpu = raw_smp_processor_id(); 54 cpu = raw_smp_processor_id();
54 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) 55 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
56 if (atomic_inc_return(&data->disabled) != 1)
55 goto out; 57 goto out;
56 58
57 pc = preempt_count(); 59 pc = preempt_count();
58 buffer = tr->buffer; 60 buffer = tr->trace_buffer.buffer;
59 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, 61 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
60 sizeof(*entry), flags, pc); 62 sizeof(*entry), flags, pc);
61 if (!event) 63 if (!event)
@@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
80 __buffer_unlock_commit(buffer, event); 82 __buffer_unlock_commit(buffer, event);
81 83
82 out: 84 out:
83 atomic_dec(&tr->data[cpu]->disabled); 85 atomic_dec(&data->disabled);
84 local_irq_restore(flags); 86 local_irq_restore(flags);
85} 87}
86 88
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index aa8f5f48dae6..26dc348332b7 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -57,6 +57,16 @@ u64 notrace trace_clock(void)
57 return local_clock(); 57 return local_clock();
58} 58}
59 59
60/*
61 * trace_jiffy_clock(): Simply use jiffies as a clock counter.
62 */
63u64 notrace trace_clock_jiffies(void)
64{
65 u64 jiffy = jiffies - INITIAL_JIFFIES;
66
67 /* Return nsecs */
68 return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
69}
60 70
61/* 71/*
62 * trace_clock_global(): special globally coherent trace clock 72 * trace_clock_global(): special globally coherent trace clock
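trace_clock_jiffies() converts a jiffy delta into nanoseconds via jiffies_to_usecs() * 1000. A minimal userspace version of that conversion, assuming a fixed tick rate that divides one million evenly (the real jiffies_to_usecs() also handles HZ values that do not):

#include <stdint.h>
#include <stdio.h>

#define TICK_HZ 250ULL                   /* example tick rate, not the kernel's HZ */

/* Microseconds per tick, then scale to nanoseconds like the trace clock. */
static uint64_t ticks_to_nsecs(uint64_t ticks)
{
        uint64_t usecs = ticks * (1000000ULL / TICK_HZ);

        return usecs * 1000ULL;
}

int main(void)
{
        uint64_t ticks = 1234;           /* e.g. jiffies - INITIAL_JIFFIES */

        printf("%llu ticks = %llu ns\n",
               (unsigned long long)ticks,
               (unsigned long long)ticks_to_nsecs(ticks));
        return 0;
}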
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 4108e1250ca2..e2d027ac66a2 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry,
223 __dynamic_array( u32, buf ) 223 __dynamic_array( u32, buf )
224 ), 224 ),
225 225
226 F_printk("%08lx fmt:%p", 226 F_printk("%pf: %s",
227 __entry->ip, __entry->fmt), 227 (void *)__entry->ip, __entry->fmt),
228 228
229 FILTER_OTHER 229 FILTER_OTHER
230); 230);
@@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry,
238 __dynamic_array( char, buf ) 238 __dynamic_array( char, buf )
239 ), 239 ),
240 240
241 F_printk("%08lx %s", 241 F_printk("%pf: %s",
242 __entry->ip, __entry->buf), 242 (void *)__entry->ip, __entry->buf),
243
244 FILTER_OTHER
245);
246
247FTRACE_ENTRY(bputs, bputs_entry,
248
249 TRACE_BPUTS,
250
251 F_STRUCT(
252 __field( unsigned long, ip )
253 __field( const char *, str )
254 ),
255
256 F_printk("%pf: %s",
257 (void *)__entry->ip, __entry->str),
243 258
244 FILTER_OTHER 259 FILTER_OTHER
245); 260);
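The new bputs entry follows the FTRACE_ENTRY() X-macro convention of this header: one description expands into a struct layout, a print format and filter information, depending on how FTRACE_ENTRY is defined before the header is included. A toy version of the pattern, with invented entry names and formats:

#include <stdio.h>

/* One place describes every entry ... */
#define TOY_ENTRIES(E)                          \
        E(bputs_like, "ip=%lu str=%s")          \
        E(print_like, "ip=%lu buf=%s")

/* ... and each expansion decides what to generate from the description. */
#define DEFINE_ENUM(name, fmt)  TOY_##name,
enum toy_type { TOY_ENTRIES(DEFINE_ENUM) TOY_MAX };

#define DEFINE_FMT(name, fmt)   [TOY_##name] = (fmt),
static const char *const toy_fmt[TOY_MAX] = { TOY_ENTRIES(DEFINE_FMT) };

int main(void)
{
        for (int i = 0; i < TOY_MAX; i++)
                printf("entry %d prints with \"%s\"\n", i, toy_fmt[i]);
        return 0;
}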
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 57e9b284250c..53582e982e51 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -34,9 +34,27 @@ char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage); 34EXPORT_SYMBOL_GPL(event_storage);
35 35
36LIST_HEAD(ftrace_events); 36LIST_HEAD(ftrace_events);
37LIST_HEAD(ftrace_common_fields); 37static LIST_HEAD(ftrace_common_fields);
38 38
39struct list_head * 39#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
40
41static struct kmem_cache *field_cachep;
42static struct kmem_cache *file_cachep;
43
44/* Double loops, do not use break, only goto's work */
45#define do_for_each_event_file(tr, file) \
46 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
47 list_for_each_entry(file, &tr->events, list)
48
49#define do_for_each_event_file_safe(tr, file) \
50 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
51 struct ftrace_event_file *___n; \
52 list_for_each_entry_safe(file, ___n, &tr->events, list)
53
54#define while_for_each_event_file() \
55 }
56
57static struct list_head *
40trace_get_fields(struct ftrace_event_call *event_call) 58trace_get_fields(struct ftrace_event_call *event_call)
41{ 59{
42 if (!event_call->class->get_fields) 60 if (!event_call->class->get_fields)
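do_for_each_event_file() hides a nested walk over every trace_array and the event files hanging off it, which is why the comment above insists on goto rather than break: break would only leave the inner list_for_each_entry(). A small standalone illustration of that pitfall, with counted loops standing in for the two lists:

#include <stdio.h>

#define do_for_each_item(i, j)                  \
        for (int i = 0; i < 3; i++) {           \
                for (int j = 0; j < 3; j++)

#define while_for_each_item()  }

int main(void)
{
        /* A break here would only end the inner loop; goto stops both. */
        do_for_each_item(tr, file) {
                printf("visiting tr=%d file=%d\n", tr, file);
                if (tr == 1 && file == 1)
                        goto done;
        } while_for_each_item();
 done:
        printf("stopped early via goto\n");
        return 0;
}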
@@ -44,23 +62,45 @@ trace_get_fields(struct ftrace_event_call *event_call)
44 return event_call->class->get_fields(event_call); 62 return event_call->class->get_fields(event_call);
45} 63}
46 64
65static struct ftrace_event_field *
66__find_event_field(struct list_head *head, char *name)
67{
68 struct ftrace_event_field *field;
69
70 list_for_each_entry(field, head, link) {
71 if (!strcmp(field->name, name))
72 return field;
73 }
74
75 return NULL;
76}
77
78struct ftrace_event_field *
79trace_find_event_field(struct ftrace_event_call *call, char *name)
80{
81 struct ftrace_event_field *field;
82 struct list_head *head;
83
84 field = __find_event_field(&ftrace_common_fields, name);
85 if (field)
86 return field;
87
88 head = trace_get_fields(call);
89 return __find_event_field(head, name);
90}
91
47static int __trace_define_field(struct list_head *head, const char *type, 92static int __trace_define_field(struct list_head *head, const char *type,
48 const char *name, int offset, int size, 93 const char *name, int offset, int size,
49 int is_signed, int filter_type) 94 int is_signed, int filter_type)
50{ 95{
51 struct ftrace_event_field *field; 96 struct ftrace_event_field *field;
52 97
53 field = kzalloc(sizeof(*field), GFP_KERNEL); 98 field = kmem_cache_alloc(field_cachep, GFP_TRACE);
54 if (!field) 99 if (!field)
55 goto err; 100 goto err;
56 101
57 field->name = kstrdup(name, GFP_KERNEL); 102 field->name = name;
58 if (!field->name) 103 field->type = type;
59 goto err;
60
61 field->type = kstrdup(type, GFP_KERNEL);
62 if (!field->type)
63 goto err;
64 104
65 if (filter_type == FILTER_OTHER) 105 if (filter_type == FILTER_OTHER)
66 field->filter_type = filter_assign_type(type); 106 field->filter_type = filter_assign_type(type);
@@ -76,9 +116,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
76 return 0; 116 return 0;
77 117
78err: 118err:
79 if (field) 119 kmem_cache_free(field_cachep, field);
80 kfree(field->name);
81 kfree(field);
82 120
83 return -ENOMEM; 121 return -ENOMEM;
84} 122}
@@ -120,7 +158,7 @@ static int trace_define_common_fields(void)
120 return ret; 158 return ret;
121} 159}
122 160
123void trace_destroy_fields(struct ftrace_event_call *call) 161static void trace_destroy_fields(struct ftrace_event_call *call)
124{ 162{
125 struct ftrace_event_field *field, *next; 163 struct ftrace_event_field *field, *next;
126 struct list_head *head; 164 struct list_head *head;
@@ -128,9 +166,7 @@ void trace_destroy_fields(struct ftrace_event_call *call)
128 head = trace_get_fields(call); 166 head = trace_get_fields(call);
129 list_for_each_entry_safe(field, next, head, link) { 167 list_for_each_entry_safe(field, next, head, link) {
130 list_del(&field->link); 168 list_del(&field->link);
131 kfree(field->type); 169 kmem_cache_free(field_cachep, field);
132 kfree(field->name);
133 kfree(field);
134 } 170 }
135} 171}
136 172
@@ -149,15 +185,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init);
149int ftrace_event_reg(struct ftrace_event_call *call, 185int ftrace_event_reg(struct ftrace_event_call *call,
150 enum trace_reg type, void *data) 186 enum trace_reg type, void *data)
151{ 187{
188 struct ftrace_event_file *file = data;
189
152 switch (type) { 190 switch (type) {
153 case TRACE_REG_REGISTER: 191 case TRACE_REG_REGISTER:
154 return tracepoint_probe_register(call->name, 192 return tracepoint_probe_register(call->name,
155 call->class->probe, 193 call->class->probe,
156 call); 194 file);
157 case TRACE_REG_UNREGISTER: 195 case TRACE_REG_UNREGISTER:
158 tracepoint_probe_unregister(call->name, 196 tracepoint_probe_unregister(call->name,
159 call->class->probe, 197 call->class->probe,
160 call); 198 file);
161 return 0; 199 return 0;
162 200
163#ifdef CONFIG_PERF_EVENTS 201#ifdef CONFIG_PERF_EVENTS
@@ -183,54 +221,100 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg);
183 221
184void trace_event_enable_cmd_record(bool enable) 222void trace_event_enable_cmd_record(bool enable)
185{ 223{
186 struct ftrace_event_call *call; 224 struct ftrace_event_file *file;
225 struct trace_array *tr;
187 226
188 mutex_lock(&event_mutex); 227 mutex_lock(&event_mutex);
189 list_for_each_entry(call, &ftrace_events, list) { 228 do_for_each_event_file(tr, file) {
190 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) 229
230 if (!(file->flags & FTRACE_EVENT_FL_ENABLED))
191 continue; 231 continue;
192 232
193 if (enable) { 233 if (enable) {
194 tracing_start_cmdline_record(); 234 tracing_start_cmdline_record();
195 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 235 set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
196 } else { 236 } else {
197 tracing_stop_cmdline_record(); 237 tracing_stop_cmdline_record();
198 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 238 clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
199 } 239 }
200 } 240 } while_for_each_event_file();
201 mutex_unlock(&event_mutex); 241 mutex_unlock(&event_mutex);
202} 242}
203 243
204static int ftrace_event_enable_disable(struct ftrace_event_call *call, 244static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
205 int enable) 245 int enable, int soft_disable)
206{ 246{
247 struct ftrace_event_call *call = file->event_call;
207 int ret = 0; 248 int ret = 0;
249 int disable;
208 250
209 switch (enable) { 251 switch (enable) {
210 case 0: 252 case 0:
211 if (call->flags & TRACE_EVENT_FL_ENABLED) { 253 /*
212 call->flags &= ~TRACE_EVENT_FL_ENABLED; 254 * When soft_disable is set and enable is cleared, we want
213 if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { 255 * to clear the SOFT_DISABLED flag but leave the event in the
256 * state that it was. That is, if the event was enabled and
257 * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED
258 * is set we do not want the event to be enabled before we
259 * clear the bit.
260 *
261 * When soft_disable is not set but the SOFT_MODE flag is,
262 * we do nothing. Do not disable the tracepoint, otherwise
263 * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
264 */
265 if (soft_disable) {
266 disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
267 clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
268 } else
269 disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE);
270
271 if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) {
272 clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
273 if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
214 tracing_stop_cmdline_record(); 274 tracing_stop_cmdline_record();
215 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 275 clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
216 } 276 }
217 call->class->reg(call, TRACE_REG_UNREGISTER, NULL); 277 call->class->reg(call, TRACE_REG_UNREGISTER, file);
218 } 278 }
279 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */
280 if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
281 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
219 break; 282 break;
220 case 1: 283 case 1:
221 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { 284 /*
285 * When soft_disable is set and enable is set, we want to
286 * register the tracepoint for the event, but leave the event
287 * as is. That means, if the event was already enabled, we do
288 * nothing (but set SOFT_MODE). If the event is disabled, we
289 * set SOFT_DISABLED before enabling the event tracepoint, so
290 * it still seems to be disabled.
291 */
292 if (!soft_disable)
293 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
294 else
295 set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
296
297 if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
298
299 /* Keep the event disabled, when going to SOFT_MODE. */
300 if (soft_disable)
301 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
302
222 if (trace_flags & TRACE_ITER_RECORD_CMD) { 303 if (trace_flags & TRACE_ITER_RECORD_CMD) {
223 tracing_start_cmdline_record(); 304 tracing_start_cmdline_record();
224 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 305 set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
225 } 306 }
226 ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); 307 ret = call->class->reg(call, TRACE_REG_REGISTER, file);
227 if (ret) { 308 if (ret) {
228 tracing_stop_cmdline_record(); 309 tracing_stop_cmdline_record();
229 pr_info("event trace: Could not enable event " 310 pr_info("event trace: Could not enable event "
230 "%s\n", call->name); 311 "%s\n", call->name);
231 break; 312 break;
232 } 313 }
233 call->flags |= TRACE_EVENT_FL_ENABLED; 314 set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
315
316 /* WAS_ENABLED gets set but never cleared. */
317 call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
234 } 318 }
235 break; 319 break;
236 } 320 }
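The comments above describe three interacting bits: ENABLED (the tracepoint is registered), SOFT_MODE (something such as a trigger wants the event kept registered) and SOFT_DISABLED (the event is registered but must not record). The sketch below is a compact model of the transitions those comments describe, with shortened flag names; it mirrors the stated rules, not the kernel implementation itself:

#include <stdio.h>

enum {
        FL_ENABLED       = 1 << 0,   /* tracepoint registered */
        FL_SOFT_MODE     = 1 << 1,   /* kept registered on someone's behalf */
        FL_SOFT_DISABLED = 1 << 2,   /* registered, but must not record */
};

static unsigned int soft_enable(unsigned int flags)
{
        flags |= FL_SOFT_MODE;
        if (!(flags & FL_ENABLED))
                flags |= FL_SOFT_DISABLED | FL_ENABLED;  /* register, but stay quiet */
        return flags;
}

static unsigned int soft_disable(unsigned int flags)
{
        int only_soft = flags & FL_SOFT_DISABLED;        /* nobody hard-enabled it */

        flags &= ~FL_SOFT_MODE;
        if (only_soft && (flags & FL_ENABLED))
                flags &= ~FL_ENABLED;                    /* drop the registration */
        return flags;
}

int main(void)
{
        unsigned int flags = 0;

        flags = soft_enable(flags);
        printf("after soft enable:  %#x\n", flags);
        flags = soft_disable(flags);
        printf("after soft disable: %#x\n", flags);
        return 0;
}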
@@ -238,13 +322,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
238 return ret; 322 return ret;
239} 323}
240 324
241static void ftrace_clear_events(void) 325static int ftrace_event_enable_disable(struct ftrace_event_file *file,
326 int enable)
242{ 327{
243 struct ftrace_event_call *call; 328 return __ftrace_event_enable_disable(file, enable, 0);
329}
330
331static void ftrace_clear_events(struct trace_array *tr)
332{
333 struct ftrace_event_file *file;
244 334
245 mutex_lock(&event_mutex); 335 mutex_lock(&event_mutex);
246 list_for_each_entry(call, &ftrace_events, list) { 336 list_for_each_entry(file, &tr->events, list) {
247 ftrace_event_enable_disable(call, 0); 337 ftrace_event_enable_disable(file, 0);
248 } 338 }
249 mutex_unlock(&event_mutex); 339 mutex_unlock(&event_mutex);
250} 340}
@@ -257,11 +347,12 @@ static void __put_system(struct event_subsystem *system)
257 if (--system->ref_count) 347 if (--system->ref_count)
258 return; 348 return;
259 349
350 list_del(&system->list);
351
260 if (filter) { 352 if (filter) {
261 kfree(filter->filter_string); 353 kfree(filter->filter_string);
262 kfree(filter); 354 kfree(filter);
263 } 355 }
264 kfree(system->name);
265 kfree(system); 356 kfree(system);
266} 357}
267 358
@@ -271,24 +362,45 @@ static void __get_system(struct event_subsystem *system)
271 system->ref_count++; 362 system->ref_count++;
272} 363}
273 364
274static void put_system(struct event_subsystem *system) 365static void __get_system_dir(struct ftrace_subsystem_dir *dir)
366{
367 WARN_ON_ONCE(dir->ref_count == 0);
368 dir->ref_count++;
369 __get_system(dir->subsystem);
370}
371
372static void __put_system_dir(struct ftrace_subsystem_dir *dir)
373{
374 WARN_ON_ONCE(dir->ref_count == 0);
375 /* If the subsystem is about to be freed, the dir must be too */
376 WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1);
377
378 __put_system(dir->subsystem);
379 if (!--dir->ref_count)
380 kfree(dir);
381}
382
383static void put_system(struct ftrace_subsystem_dir *dir)
275{ 384{
276 mutex_lock(&event_mutex); 385 mutex_lock(&event_mutex);
277 __put_system(system); 386 __put_system_dir(dir);
278 mutex_unlock(&event_mutex); 387 mutex_unlock(&event_mutex);
279} 388}
280 389
281/* 390/*
282 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 391 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
283 */ 392 */
284static int __ftrace_set_clr_event(const char *match, const char *sub, 393static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
285 const char *event, int set) 394 const char *sub, const char *event, int set)
286{ 395{
396 struct ftrace_event_file *file;
287 struct ftrace_event_call *call; 397 struct ftrace_event_call *call;
288 int ret = -EINVAL; 398 int ret = -EINVAL;
289 399
290 mutex_lock(&event_mutex); 400 mutex_lock(&event_mutex);
291 list_for_each_entry(call, &ftrace_events, list) { 401 list_for_each_entry(file, &tr->events, list) {
402
403 call = file->event_call;
292 404
293 if (!call->name || !call->class || !call->class->reg) 405 if (!call->name || !call->class || !call->class->reg)
294 continue; 406 continue;
@@ -307,7 +419,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
307 if (event && strcmp(event, call->name) != 0) 419 if (event && strcmp(event, call->name) != 0)
308 continue; 420 continue;
309 421
310 ftrace_event_enable_disable(call, set); 422 ftrace_event_enable_disable(file, set);
311 423
312 ret = 0; 424 ret = 0;
313 } 425 }
@@ -316,7 +428,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
316 return ret; 428 return ret;
317} 429}
318 430
319static int ftrace_set_clr_event(char *buf, int set) 431static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
320{ 432{
321 char *event = NULL, *sub = NULL, *match; 433 char *event = NULL, *sub = NULL, *match;
322 434
@@ -344,7 +456,7 @@ static int ftrace_set_clr_event(char *buf, int set)
344 event = NULL; 456 event = NULL;
345 } 457 }
346 458
347 return __ftrace_set_clr_event(match, sub, event, set); 459 return __ftrace_set_clr_event(tr, match, sub, event, set);
348} 460}
349 461
350/** 462/**
@@ -361,7 +473,9 @@ static int ftrace_set_clr_event(char *buf, int set)
361 */ 473 */
362int trace_set_clr_event(const char *system, const char *event, int set) 474int trace_set_clr_event(const char *system, const char *event, int set)
363{ 475{
364 return __ftrace_set_clr_event(NULL, system, event, set); 476 struct trace_array *tr = top_trace_array();
477
478 return __ftrace_set_clr_event(tr, NULL, system, event, set);
365} 479}
366EXPORT_SYMBOL_GPL(trace_set_clr_event); 480EXPORT_SYMBOL_GPL(trace_set_clr_event);
367 481
@@ -373,6 +487,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
373 size_t cnt, loff_t *ppos) 487 size_t cnt, loff_t *ppos)
374{ 488{
375 struct trace_parser parser; 489 struct trace_parser parser;
490 struct seq_file *m = file->private_data;
491 struct trace_array *tr = m->private;
376 ssize_t read, ret; 492 ssize_t read, ret;
377 493
378 if (!cnt) 494 if (!cnt)
@@ -395,7 +511,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
395 511
396 parser.buffer[parser.idx] = 0; 512 parser.buffer[parser.idx] = 0;
397 513
398 ret = ftrace_set_clr_event(parser.buffer + !set, set); 514 ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);
399 if (ret) 515 if (ret)
400 goto out_put; 516 goto out_put;
401 } 517 }
@@ -411,17 +527,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
411static void * 527static void *
412t_next(struct seq_file *m, void *v, loff_t *pos) 528t_next(struct seq_file *m, void *v, loff_t *pos)
413{ 529{
414 struct ftrace_event_call *call = v; 530 struct ftrace_event_file *file = v;
531 struct ftrace_event_call *call;
532 struct trace_array *tr = m->private;
415 533
416 (*pos)++; 534 (*pos)++;
417 535
418 list_for_each_entry_continue(call, &ftrace_events, list) { 536 list_for_each_entry_continue(file, &tr->events, list) {
537 call = file->event_call;
419 /* 538 /*
420 * The ftrace subsystem is for showing formats only. 539 * The ftrace subsystem is for showing formats only.
421 * They can not be enabled or disabled via the event files. 540 * They can not be enabled or disabled via the event files.
422 */ 541 */
423 if (call->class && call->class->reg) 542 if (call->class && call->class->reg)
424 return call; 543 return file;
425 } 544 }
426 545
427 return NULL; 546 return NULL;
@@ -429,30 +548,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
429 548
430static void *t_start(struct seq_file *m, loff_t *pos) 549static void *t_start(struct seq_file *m, loff_t *pos)
431{ 550{
432 struct ftrace_event_call *call; 551 struct ftrace_event_file *file;
552 struct trace_array *tr = m->private;
433 loff_t l; 553 loff_t l;
434 554
435 mutex_lock(&event_mutex); 555 mutex_lock(&event_mutex);
436 556
437 call = list_entry(&ftrace_events, struct ftrace_event_call, list); 557 file = list_entry(&tr->events, struct ftrace_event_file, list);
438 for (l = 0; l <= *pos; ) { 558 for (l = 0; l <= *pos; ) {
439 call = t_next(m, call, &l); 559 file = t_next(m, file, &l);
440 if (!call) 560 if (!file)
441 break; 561 break;
442 } 562 }
443 return call; 563 return file;
444} 564}
445 565
446static void * 566static void *
447s_next(struct seq_file *m, void *v, loff_t *pos) 567s_next(struct seq_file *m, void *v, loff_t *pos)
448{ 568{
449 struct ftrace_event_call *call = v; 569 struct ftrace_event_file *file = v;
570 struct trace_array *tr = m->private;
450 571
451 (*pos)++; 572 (*pos)++;
452 573
453 list_for_each_entry_continue(call, &ftrace_events, list) { 574 list_for_each_entry_continue(file, &tr->events, list) {
454 if (call->flags & TRACE_EVENT_FL_ENABLED) 575 if (file->flags & FTRACE_EVENT_FL_ENABLED)
455 return call; 576 return file;
456 } 577 }
457 578
458 return NULL; 579 return NULL;
@@ -460,23 +581,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
460 581
461static void *s_start(struct seq_file *m, loff_t *pos) 582static void *s_start(struct seq_file *m, loff_t *pos)
462{ 583{
463 struct ftrace_event_call *call; 584 struct ftrace_event_file *file;
585 struct trace_array *tr = m->private;
464 loff_t l; 586 loff_t l;
465 587
466 mutex_lock(&event_mutex); 588 mutex_lock(&event_mutex);
467 589
468 call = list_entry(&ftrace_events, struct ftrace_event_call, list); 590 file = list_entry(&tr->events, struct ftrace_event_file, list);
469 for (l = 0; l <= *pos; ) { 591 for (l = 0; l <= *pos; ) {
470 call = s_next(m, call, &l); 592 file = s_next(m, file, &l);
471 if (!call) 593 if (!file)
472 break; 594 break;
473 } 595 }
474 return call; 596 return file;
475} 597}
476 598
477static int t_show(struct seq_file *m, void *v) 599static int t_show(struct seq_file *m, void *v)
478{ 600{
479 struct ftrace_event_call *call = v; 601 struct ftrace_event_file *file = v;
602 struct ftrace_event_call *call = file->event_call;
480 603
481 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) 604 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
482 seq_printf(m, "%s:", call->class->system); 605 seq_printf(m, "%s:", call->class->system);
@@ -494,25 +617,31 @@ static ssize_t
494event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 617event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
495 loff_t *ppos) 618 loff_t *ppos)
496{ 619{
497 struct ftrace_event_call *call = filp->private_data; 620 struct ftrace_event_file *file = filp->private_data;
498 char *buf; 621 char *buf;
499 622
500 if (call->flags & TRACE_EVENT_FL_ENABLED) 623 if (file->flags & FTRACE_EVENT_FL_ENABLED) {
501 buf = "1\n"; 624 if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)
502 else 625 buf = "0*\n";
626 else
627 buf = "1\n";
628 } else
503 buf = "0\n"; 629 buf = "0\n";
504 630
505 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); 631 return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
506} 632}
507 633
508static ssize_t 634static ssize_t
509event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 635event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
510 loff_t *ppos) 636 loff_t *ppos)
511{ 637{
512 struct ftrace_event_call *call = filp->private_data; 638 struct ftrace_event_file *file = filp->private_data;
513 unsigned long val; 639 unsigned long val;
514 int ret; 640 int ret;
515 641
642 if (!file)
643 return -EINVAL;
644
516 ret = kstrtoul_from_user(ubuf, cnt, 10, &val); 645 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
517 if (ret) 646 if (ret)
518 return ret; 647 return ret;
@@ -525,7 +654,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
525 case 0: 654 case 0:
526 case 1: 655 case 1:
527 mutex_lock(&event_mutex); 656 mutex_lock(&event_mutex);
528 ret = ftrace_event_enable_disable(call, val); 657 ret = ftrace_event_enable_disable(file, val);
529 mutex_unlock(&event_mutex); 658 mutex_unlock(&event_mutex);
530 break; 659 break;
531 660
@@ -543,14 +672,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
543 loff_t *ppos) 672 loff_t *ppos)
544{ 673{
545 const char set_to_char[4] = { '?', '0', '1', 'X' }; 674 const char set_to_char[4] = { '?', '0', '1', 'X' };
546 struct event_subsystem *system = filp->private_data; 675 struct ftrace_subsystem_dir *dir = filp->private_data;
676 struct event_subsystem *system = dir->subsystem;
547 struct ftrace_event_call *call; 677 struct ftrace_event_call *call;
678 struct ftrace_event_file *file;
679 struct trace_array *tr = dir->tr;
548 char buf[2]; 680 char buf[2];
549 int set = 0; 681 int set = 0;
550 int ret; 682 int ret;
551 683
552 mutex_lock(&event_mutex); 684 mutex_lock(&event_mutex);
553 list_for_each_entry(call, &ftrace_events, list) { 685 list_for_each_entry(file, &tr->events, list) {
686 call = file->event_call;
554 if (!call->name || !call->class || !call->class->reg) 687 if (!call->name || !call->class || !call->class->reg)
555 continue; 688 continue;
556 689
@@ -562,7 +695,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
562 * or if all events or cleared, or if we have 695 * or if all events or cleared, or if we have
563 * a mixture. 696 * a mixture.
564 */ 697 */
565 set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); 698 set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));
566 699
567 /* 700 /*
568 * If we have a mixture, no need to look further. 701 * If we have a mixture, no need to look further.
@@ -584,7 +717,8 @@ static ssize_t
584system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 717system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
585 loff_t *ppos) 718 loff_t *ppos)
586{ 719{
587 struct event_subsystem *system = filp->private_data; 720 struct ftrace_subsystem_dir *dir = filp->private_data;
721 struct event_subsystem *system = dir->subsystem;
588 const char *name = NULL; 722 const char *name = NULL;
589 unsigned long val; 723 unsigned long val;
590 ssize_t ret; 724 ssize_t ret;
@@ -607,7 +741,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
607 if (system) 741 if (system)
608 name = system->name; 742 name = system->name;
609 743
610 ret = __ftrace_set_clr_event(NULL, name, NULL, val); 744 ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
611 if (ret) 745 if (ret)
612 goto out; 746 goto out;
613 747
@@ -845,43 +979,75 @@ static LIST_HEAD(event_subsystems);
845static int subsystem_open(struct inode *inode, struct file *filp) 979static int subsystem_open(struct inode *inode, struct file *filp)
846{ 980{
847 struct event_subsystem *system = NULL; 981 struct event_subsystem *system = NULL;
982 struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */
983 struct trace_array *tr;
848 int ret; 984 int ret;
849 985
850 if (!inode->i_private)
851 goto skip_search;
852
853 /* Make sure the system still exists */ 986 /* Make sure the system still exists */
854 mutex_lock(&event_mutex); 987 mutex_lock(&event_mutex);
855 list_for_each_entry(system, &event_subsystems, list) { 988 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
856 if (system == inode->i_private) { 989 list_for_each_entry(dir, &tr->systems, list) {
857 /* Don't open systems with no events */ 990 if (dir == inode->i_private) {
858 if (!system->nr_events) { 991 /* Don't open systems with no events */
859 system = NULL; 992 if (dir->nr_events) {
860 break; 993 __get_system_dir(dir);
994 system = dir->subsystem;
995 }
996 goto exit_loop;
861 } 997 }
862 __get_system(system);
863 break;
864 } 998 }
865 } 999 }
1000 exit_loop:
866 mutex_unlock(&event_mutex); 1001 mutex_unlock(&event_mutex);
867 1002
868 if (system != inode->i_private) 1003 if (!system)
869 return -ENODEV; 1004 return -ENODEV;
870 1005
871 skip_search: 1006 /* Some versions of gcc think dir can be uninitialized here */
1007 WARN_ON(!dir);
1008
872 ret = tracing_open_generic(inode, filp); 1009 ret = tracing_open_generic(inode, filp);
873 if (ret < 0 && system) 1010 if (ret < 0)
874 put_system(system); 1011 put_system(dir);
1012
1013 return ret;
1014}
1015
1016static int system_tr_open(struct inode *inode, struct file *filp)
1017{
1018 struct ftrace_subsystem_dir *dir;
1019 struct trace_array *tr = inode->i_private;
1020 int ret;
1021
1022 /* Make a temporary dir that has no system but points to tr */
1023 dir = kzalloc(sizeof(*dir), GFP_KERNEL);
1024 if (!dir)
1025 return -ENOMEM;
1026
1027 dir->tr = tr;
1028
1029 ret = tracing_open_generic(inode, filp);
1030 if (ret < 0)
1031 kfree(dir);
1032
1033 filp->private_data = dir;
875 1034
876 return ret; 1035 return ret;
877} 1036}
878 1037
879static int subsystem_release(struct inode *inode, struct file *file) 1038static int subsystem_release(struct inode *inode, struct file *file)
880{ 1039{
881 struct event_subsystem *system = inode->i_private; 1040 struct ftrace_subsystem_dir *dir = file->private_data;
882 1041
883 if (system) 1042 /*
884 put_system(system); 1043 * If dir->subsystem is NULL, then this is a temporary
1044 * descriptor that was made for a trace_array to enable
1045 * all subsystems.
1046 */
1047 if (dir->subsystem)
1048 put_system(dir);
1049 else
1050 kfree(dir);
885 1051
886 return 0; 1052 return 0;
887} 1053}
@@ -890,7 +1056,8 @@ static ssize_t
890subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 1056subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
891 loff_t *ppos) 1057 loff_t *ppos)
892{ 1058{
893 struct event_subsystem *system = filp->private_data; 1059 struct ftrace_subsystem_dir *dir = filp->private_data;
1060 struct event_subsystem *system = dir->subsystem;
894 struct trace_seq *s; 1061 struct trace_seq *s;
895 int r; 1062 int r;
896 1063
@@ -915,7 +1082,7 @@ static ssize_t
915subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, 1082subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
916 loff_t *ppos) 1083 loff_t *ppos)
917{ 1084{
918 struct event_subsystem *system = filp->private_data; 1085 struct ftrace_subsystem_dir *dir = filp->private_data;
919 char *buf; 1086 char *buf;
920 int err; 1087 int err;
921 1088
@@ -932,7 +1099,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
932 } 1099 }
933 buf[cnt] = '\0'; 1100 buf[cnt] = '\0';
934 1101
935 err = apply_subsystem_event_filter(system, buf); 1102 err = apply_subsystem_event_filter(dir, buf);
936 free_page((unsigned long) buf); 1103 free_page((unsigned long) buf);
937 if (err < 0) 1104 if (err < 0)
938 return err; 1105 return err;
@@ -1041,30 +1208,35 @@ static const struct file_operations ftrace_system_enable_fops = {
1041 .release = subsystem_release, 1208 .release = subsystem_release,
1042}; 1209};
1043 1210
1211static const struct file_operations ftrace_tr_enable_fops = {
1212 .open = system_tr_open,
1213 .read = system_enable_read,
1214 .write = system_enable_write,
1215 .llseek = default_llseek,
1216 .release = subsystem_release,
1217};
1218
1044static const struct file_operations ftrace_show_header_fops = { 1219static const struct file_operations ftrace_show_header_fops = {
1045 .open = tracing_open_generic, 1220 .open = tracing_open_generic,
1046 .read = show_header, 1221 .read = show_header,
1047 .llseek = default_llseek, 1222 .llseek = default_llseek,
1048}; 1223};
1049 1224
1050static struct dentry *event_trace_events_dir(void) 1225static int
1226ftrace_event_open(struct inode *inode, struct file *file,
1227 const struct seq_operations *seq_ops)
1051{ 1228{
1052 static struct dentry *d_tracer; 1229 struct seq_file *m;
1053 static struct dentry *d_events; 1230 int ret;
1054
1055 if (d_events)
1056 return d_events;
1057
1058 d_tracer = tracing_init_dentry();
1059 if (!d_tracer)
1060 return NULL;
1061 1231
1062 d_events = debugfs_create_dir("events", d_tracer); 1232 ret = seq_open(file, seq_ops);
1063 if (!d_events) 1233 if (ret < 0)
1064 pr_warning("Could not create debugfs " 1234 return ret;
1065 "'events' directory\n"); 1235 m = file->private_data;
1236 /* copy tr over to seq ops */
1237 m->private = inode->i_private;
1066 1238
1067 return d_events; 1239 return ret;
1068} 1240}
1069 1241
1070static int 1242static int
@@ -1072,117 +1244,165 @@ ftrace_event_avail_open(struct inode *inode, struct file *file)
1072{ 1244{
1073 const struct seq_operations *seq_ops = &show_event_seq_ops; 1245 const struct seq_operations *seq_ops = &show_event_seq_ops;
1074 1246
1075 return seq_open(file, seq_ops); 1247 return ftrace_event_open(inode, file, seq_ops);
1076} 1248}
1077 1249
1078static int 1250static int
1079ftrace_event_set_open(struct inode *inode, struct file *file) 1251ftrace_event_set_open(struct inode *inode, struct file *file)
1080{ 1252{
1081 const struct seq_operations *seq_ops = &show_set_event_seq_ops; 1253 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1254 struct trace_array *tr = inode->i_private;
1082 1255
1083 if ((file->f_mode & FMODE_WRITE) && 1256 if ((file->f_mode & FMODE_WRITE) &&
1084 (file->f_flags & O_TRUNC)) 1257 (file->f_flags & O_TRUNC))
1085 ftrace_clear_events(); 1258 ftrace_clear_events(tr);
1086 1259
1087 return seq_open(file, seq_ops); 1260 return ftrace_event_open(inode, file, seq_ops);
1261}
1262
1263static struct event_subsystem *
1264create_new_subsystem(const char *name)
1265{
1266 struct event_subsystem *system;
1267
1268 /* need to create new entry */
1269 system = kmalloc(sizeof(*system), GFP_KERNEL);
1270 if (!system)
1271 return NULL;
1272
1273 system->ref_count = 1;
1274 system->name = name;
1275
1276 system->filter = NULL;
1277
1278 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
1279 if (!system->filter)
1280 goto out_free;
1281
1282 list_add(&system->list, &event_subsystems);
1283
1284 return system;
1285
1286 out_free:
1287 kfree(system);
1288 return NULL;
1088} 1289}
1089 1290
1090static struct dentry * 1291static struct dentry *
1091event_subsystem_dir(const char *name, struct dentry *d_events) 1292event_subsystem_dir(struct trace_array *tr, const char *name,
1293 struct ftrace_event_file *file, struct dentry *parent)
1092{ 1294{
1295 struct ftrace_subsystem_dir *dir;
1093 struct event_subsystem *system; 1296 struct event_subsystem *system;
1094 struct dentry *entry; 1297 struct dentry *entry;
1095 1298
1096 /* First see if we did not already create this dir */ 1299 /* First see if we did not already create this dir */
1097 list_for_each_entry(system, &event_subsystems, list) { 1300 list_for_each_entry(dir, &tr->systems, list) {
1301 system = dir->subsystem;
1098 if (strcmp(system->name, name) == 0) { 1302 if (strcmp(system->name, name) == 0) {
1099 system->nr_events++; 1303 dir->nr_events++;
1100 return system->entry; 1304 file->system = dir;
1305 return dir->entry;
1101 } 1306 }
1102 } 1307 }
1103 1308
1104 /* need to create new entry */ 1309 /* Now see if the system itself exists. */
1105 system = kmalloc(sizeof(*system), GFP_KERNEL); 1310 list_for_each_entry(system, &event_subsystems, list) {
1106 if (!system) { 1311 if (strcmp(system->name, name) == 0)
1107 pr_warning("No memory to create event subsystem %s\n", 1312 break;
1108 name);
1109 return d_events;
1110 } 1313 }
1314 /* Reset system variable when not found */
1315 if (&system->list == &event_subsystems)
1316 system = NULL;
1111 1317
1112 system->entry = debugfs_create_dir(name, d_events); 1318 dir = kmalloc(sizeof(*dir), GFP_KERNEL);
1113 if (!system->entry) { 1319 if (!dir)
1114 pr_warning("Could not create event subsystem %s\n", 1320 goto out_fail;
1115 name);
1116 kfree(system);
1117 return d_events;
1118 }
1119 1321
1120 system->nr_events = 1; 1322 if (!system) {
1121 system->ref_count = 1; 1323 system = create_new_subsystem(name);
1122 system->name = kstrdup(name, GFP_KERNEL); 1324 if (!system)
1123 if (!system->name) { 1325 goto out_free;
1124 debugfs_remove(system->entry); 1326 } else
1125 kfree(system); 1327 __get_system(system);
1126 return d_events; 1328
1329 dir->entry = debugfs_create_dir(name, parent);
1330 if (!dir->entry) {
1331 pr_warning("Failed to create system directory %s\n", name);
1332 __put_system(system);
1333 goto out_free;
1127 } 1334 }
1128 1335
1129 list_add(&system->list, &event_subsystems); 1336 dir->tr = tr;
1130 1337 dir->ref_count = 1;
1131 system->filter = NULL; 1338 dir->nr_events = 1;
1132 1339 dir->subsystem = system;
1133 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); 1340 file->system = dir;
1134 if (!system->filter) {
1135 pr_warning("Could not allocate filter for subsystem "
1136 "'%s'\n", name);
1137 return system->entry;
1138 }
1139 1341
1140 entry = debugfs_create_file("filter", 0644, system->entry, system, 1342 entry = debugfs_create_file("filter", 0644, dir->entry, dir,
1141 &ftrace_subsystem_filter_fops); 1343 &ftrace_subsystem_filter_fops);
1142 if (!entry) { 1344 if (!entry) {
1143 kfree(system->filter); 1345 kfree(system->filter);
1144 system->filter = NULL; 1346 system->filter = NULL;
1145 pr_warning("Could not create debugfs " 1347 pr_warning("Could not create debugfs '%s/filter' entry\n", name);
1146 "'%s/filter' entry\n", name);
1147 } 1348 }
1148 1349
1149 trace_create_file("enable", 0644, system->entry, system, 1350 trace_create_file("enable", 0644, dir->entry, dir,
1150 &ftrace_system_enable_fops); 1351 &ftrace_system_enable_fops);
1151 1352
1152 return system->entry; 1353 list_add(&dir->list, &tr->systems);
1354
1355 return dir->entry;
1356
1357 out_free:
1358 kfree(dir);
1359 out_fail:
1360 /* Only print this message if failed on memory allocation */
1361 if (!dir || !system)
1362 pr_warning("No memory to create event subsystem %s\n",
1363 name);
1364 return NULL;
1153} 1365}
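
The "Reset system variable when not found" check above relies on a list_for_each_entry() property: when the loop runs to completion without a break, the cursor's embedded list_head ends up pointing at the list head itself. A generic illustration of the idiom, not tied to this file:

        struct event_subsystem *system;

        list_for_each_entry(system, &event_subsystems, list) {
                if (strcmp(system->name, name) == 0)
                        break;                          /* found: system is valid */
        }
        if (&system->list == &event_subsystems)         /* loop finished: no match */
                system = NULL;
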
1154 1366
1155static int 1367static int
1156event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, 1368event_create_dir(struct dentry *parent,
1369 struct ftrace_event_file *file,
1157 const struct file_operations *id, 1370 const struct file_operations *id,
1158 const struct file_operations *enable, 1371 const struct file_operations *enable,
1159 const struct file_operations *filter, 1372 const struct file_operations *filter,
1160 const struct file_operations *format) 1373 const struct file_operations *format)
1161{ 1374{
1375 struct ftrace_event_call *call = file->event_call;
1376 struct trace_array *tr = file->tr;
1162 struct list_head *head; 1377 struct list_head *head;
1378 struct dentry *d_events;
1163 int ret; 1379 int ret;
1164 1380
1165 /* 1381 /*
1166 * If the trace point header did not define TRACE_SYSTEM 1382 * If the trace point header did not define TRACE_SYSTEM
1167 * then the system would be called "TRACE_SYSTEM". 1383 * then the system would be called "TRACE_SYSTEM".
1168 */ 1384 */
1169 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) 1385 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) {
1170 d_events = event_subsystem_dir(call->class->system, d_events); 1386 d_events = event_subsystem_dir(tr, call->class->system, file, parent);
1171 1387 if (!d_events)
1172 call->dir = debugfs_create_dir(call->name, d_events); 1388 return -ENOMEM;
1173 if (!call->dir) { 1389 } else
1174 pr_warning("Could not create debugfs " 1390 d_events = parent;
1175 "'%s' directory\n", call->name); 1391
1392 file->dir = debugfs_create_dir(call->name, d_events);
1393 if (!file->dir) {
1394 pr_warning("Could not create debugfs '%s' directory\n",
1395 call->name);
1176 return -1; 1396 return -1;
1177 } 1397 }
1178 1398
1179 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) 1399 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
1180 trace_create_file("enable", 0644, call->dir, call, 1400 trace_create_file("enable", 0644, file->dir, file,
1181 enable); 1401 enable);
1182 1402
1183#ifdef CONFIG_PERF_EVENTS 1403#ifdef CONFIG_PERF_EVENTS
1184 if (call->event.type && call->class->reg) 1404 if (call->event.type && call->class->reg)
1185 trace_create_file("id", 0444, call->dir, call, 1405 trace_create_file("id", 0444, file->dir, call,
1186 id); 1406 id);
1187#endif 1407#endif
1188 1408
@@ -1196,23 +1416,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1196 if (ret < 0) { 1416 if (ret < 0) {
1197 pr_warning("Could not initialize trace point" 1417 pr_warning("Could not initialize trace point"
1198 " events/%s\n", call->name); 1418 " events/%s\n", call->name);
1199 return ret; 1419 return -1;
1200 } 1420 }
1201 } 1421 }
1202 trace_create_file("filter", 0644, call->dir, call, 1422 trace_create_file("filter", 0644, file->dir, call,
1203 filter); 1423 filter);
1204 1424
1205 trace_create_file("format", 0444, call->dir, call, 1425 trace_create_file("format", 0444, file->dir, call,
1206 format); 1426 format);
1207 1427
1208 return 0; 1428 return 0;
1209} 1429}
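
With the control files now hanging off file->dir instead of call->dir, every trace_array gets its own copy of the event hierarchy. An illustrative sketch of the resulting debugfs layout (paths assume the usual /sys/kernel/debug/tracing mount; names in angle brackets are placeholders):

        /*
         *  tracing/                        <- top trace array
         *  tracing/instances/<name>/       <- additional instances
         *      events/
         *          enable                  <- ftrace_tr_enable_fops (whole array)
         *          <system>/
         *              enable              <- ftrace_system_enable_fops
         *              filter              <- ftrace_subsystem_filter_fops
         *              <event>/
         *                  enable          <- per ftrace_event_file
         *                  id              <- only with CONFIG_PERF_EVENTS
         *                  filter
         *                  format
         */
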
1210 1430
1431static void remove_subsystem(struct ftrace_subsystem_dir *dir)
1432{
1433 if (!dir)
1434 return;
1435
1436 if (!--dir->nr_events) {
1437 debugfs_remove_recursive(dir->entry);
1438 list_del(&dir->list);
1439 __put_system_dir(dir);
1440 }
1441}
1442
1443static void remove_event_from_tracers(struct ftrace_event_call *call)
1444{
1445 struct ftrace_event_file *file;
1446 struct trace_array *tr;
1447
1448 do_for_each_event_file_safe(tr, file) {
1449
1450 if (file->event_call != call)
1451 continue;
1452
1453 list_del(&file->list);
1454 debugfs_remove_recursive(file->dir);
1455 remove_subsystem(file->system);
1456 kmem_cache_free(file_cachep, file);
1457
1458 /*
1459 * The do_for_each_event_file_safe() is
1460 * a double loop. After finding the call for this
1461 * trace_array, we use break to jump to the next
1462 * trace_array.
1463 */
1464 break;
1465 } while_for_each_event_file();
1466}
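
The comment about the "double loop" refers to helper macros from trace.h. A sketch close to their definition, shown here for orientation: the outer loop walks ftrace_trace_arrays, the inner loop walks that array's event files, so the break above only advances to the next trace_array.

        #define do_for_each_event_file(tr, file)                        \
                list_for_each_entry(tr, &ftrace_trace_arrays, list) {   \
                        list_for_each_entry(file, &tr->events, list)

        #define do_for_each_event_file_safe(tr, file)                   \
                list_for_each_entry(tr, &ftrace_trace_arrays, list) {   \
                        struct ftrace_event_file *___n;                 \
                        list_for_each_entry_safe(file, ___n, &tr->events, list)

        #define while_for_each_event_file()                             \
                }
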
1467
1211static void event_remove(struct ftrace_event_call *call) 1468static void event_remove(struct ftrace_event_call *call)
1212{ 1469{
1213 ftrace_event_enable_disable(call, 0); 1470 struct trace_array *tr;
1471 struct ftrace_event_file *file;
1472
1473 do_for_each_event_file(tr, file) {
1474 if (file->event_call != call)
1475 continue;
1476 ftrace_event_enable_disable(file, 0);
1477 /*
1478 * The do_for_each_event_file() is
1479 * a double loop. After finding the call for this
1480 * trace_array, we use break to jump to the next
1481 * trace_array.
1482 */
1483 break;
1484 } while_for_each_event_file();
1485
1214 if (call->event.funcs) 1486 if (call->event.funcs)
1215 __unregister_ftrace_event(&call->event); 1487 __unregister_ftrace_event(&call->event);
1488 remove_event_from_tracers(call);
1216 list_del(&call->list); 1489 list_del(&call->list);
1217} 1490}
1218 1491
@@ -1234,82 +1507,99 @@ static int event_init(struct ftrace_event_call *call)
1234} 1507}
1235 1508
1236static int 1509static int
1237__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, 1510__register_event(struct ftrace_event_call *call, struct module *mod)
1238 const struct file_operations *id,
1239 const struct file_operations *enable,
1240 const struct file_operations *filter,
1241 const struct file_operations *format)
1242{ 1511{
1243 struct dentry *d_events;
1244 int ret; 1512 int ret;
1245 1513
1246 ret = event_init(call); 1514 ret = event_init(call);
1247 if (ret < 0) 1515 if (ret < 0)
1248 return ret; 1516 return ret;
1249 1517
1250 d_events = event_trace_events_dir(); 1518 list_add(&call->list, &ftrace_events);
1251 if (!d_events)
1252 return -ENOENT;
1253
1254 ret = event_create_dir(call, d_events, id, enable, filter, format);
1255 if (!ret)
1256 list_add(&call->list, &ftrace_events);
1257 call->mod = mod; 1519 call->mod = mod;
1258 1520
1259 return ret; 1521 return 0;
1522}
1523
1524/* Add an event to a trace directory */
1525static int
1526__trace_add_new_event(struct ftrace_event_call *call,
1527 struct trace_array *tr,
1528 const struct file_operations *id,
1529 const struct file_operations *enable,
1530 const struct file_operations *filter,
1531 const struct file_operations *format)
1532{
1533 struct ftrace_event_file *file;
1534
1535 file = kmem_cache_alloc(file_cachep, GFP_TRACE);
1536 if (!file)
1537 return -ENOMEM;
1538
1539 file->event_call = call;
1540 file->tr = tr;
1541 list_add(&file->list, &tr->events);
1542
1543 return event_create_dir(tr->event_dir, file, id, enable, filter, format);
1260} 1544}
1261 1545
1546/*
 1547 * Just create a descriptor for early init. A descriptor is required
1548 * for enabling events at boot. We want to enable events before
1549 * the filesystem is initialized.
1550 */
1551static __init int
1552__trace_early_add_new_event(struct ftrace_event_call *call,
1553 struct trace_array *tr)
1554{
1555 struct ftrace_event_file *file;
1556
1557 file = kmem_cache_alloc(file_cachep, GFP_TRACE);
1558 if (!file)
1559 return -ENOMEM;
1560
1561 file->event_call = call;
1562 file->tr = tr;
1563 list_add(&file->list, &tr->events);
1564
1565 return 0;
1566}
1567
1568struct ftrace_module_file_ops;
1569static void __add_event_to_tracers(struct ftrace_event_call *call,
1570 struct ftrace_module_file_ops *file_ops);
1571
1262/* Add an additional event_call dynamically */ 1572/* Add an additional event_call dynamically */
1263int trace_add_event_call(struct ftrace_event_call *call) 1573int trace_add_event_call(struct ftrace_event_call *call)
1264{ 1574{
1265 int ret; 1575 int ret;
1266 mutex_lock(&event_mutex); 1576 mutex_lock(&event_mutex);
1267 ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1268 &ftrace_enable_fops,
1269 &ftrace_event_filter_fops,
1270 &ftrace_event_format_fops);
1271 mutex_unlock(&event_mutex);
1272 return ret;
1273}
1274 1577
1275static void remove_subsystem_dir(const char *name) 1578 ret = __register_event(call, NULL);
1276{ 1579 if (ret >= 0)
1277 struct event_subsystem *system; 1580 __add_event_to_tracers(call, NULL);
1278
1279 if (strcmp(name, TRACE_SYSTEM) == 0)
1280 return;
1281 1581
1282 list_for_each_entry(system, &event_subsystems, list) { 1582 mutex_unlock(&event_mutex);
1283 if (strcmp(system->name, name) == 0) { 1583 return ret;
1284 if (!--system->nr_events) {
1285 debugfs_remove_recursive(system->entry);
1286 list_del(&system->list);
1287 __put_system(system);
1288 }
1289 break;
1290 }
1291 }
1292} 1584}
1293 1585
1294/* 1586/*
1295 * Must be called under locking both of event_mutex and trace_event_mutex. 1587 * Must be called under locking both of event_mutex and trace_event_sem.
1296 */ 1588 */
1297static void __trace_remove_event_call(struct ftrace_event_call *call) 1589static void __trace_remove_event_call(struct ftrace_event_call *call)
1298{ 1590{
1299 event_remove(call); 1591 event_remove(call);
1300 trace_destroy_fields(call); 1592 trace_destroy_fields(call);
1301 destroy_preds(call); 1593 destroy_preds(call);
1302 debugfs_remove_recursive(call->dir);
1303 remove_subsystem_dir(call->class->system);
1304} 1594}
1305 1595
1306/* Remove an event_call */ 1596/* Remove an event_call */
1307void trace_remove_event_call(struct ftrace_event_call *call) 1597void trace_remove_event_call(struct ftrace_event_call *call)
1308{ 1598{
1309 mutex_lock(&event_mutex); 1599 mutex_lock(&event_mutex);
1310 down_write(&trace_event_mutex); 1600 down_write(&trace_event_sem);
1311 __trace_remove_event_call(call); 1601 __trace_remove_event_call(call);
1312 up_write(&trace_event_mutex); 1602 up_write(&trace_event_sem);
1313 mutex_unlock(&event_mutex); 1603 mutex_unlock(&event_mutex);
1314} 1604}
1315 1605
@@ -1336,6 +1626,26 @@ struct ftrace_module_file_ops {
1336}; 1626};
1337 1627
1338static struct ftrace_module_file_ops * 1628static struct ftrace_module_file_ops *
1629find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1630{
1631 /*
1632 * As event_calls are added in groups by module,
1633 * when we find one file_ops, we don't need to search for
1634 * each call in that module, as the rest should be the
1635 * same. Only search for a new one if the last one did
1636 * not match.
1637 */
1638 if (file_ops && mod == file_ops->mod)
1639 return file_ops;
1640
1641 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1642 if (file_ops->mod == mod)
1643 return file_ops;
1644 }
1645 return NULL;
1646}
1647
1648static struct ftrace_module_file_ops *
1339trace_create_file_ops(struct module *mod) 1649trace_create_file_ops(struct module *mod)
1340{ 1650{
1341 struct ftrace_module_file_ops *file_ops; 1651 struct ftrace_module_file_ops *file_ops;
@@ -1386,9 +1696,8 @@ static void trace_module_add_events(struct module *mod)
1386 return; 1696 return;
1387 1697
1388 for_each_event(call, start, end) { 1698 for_each_event(call, start, end) {
1389 __trace_add_event_call(*call, mod, 1699 __register_event(*call, mod);
1390 &file_ops->id, &file_ops->enable, 1700 __add_event_to_tracers(*call, file_ops);
1391 &file_ops->filter, &file_ops->format);
1392 } 1701 }
1393} 1702}
1394 1703
@@ -1396,12 +1705,13 @@ static void trace_module_remove_events(struct module *mod)
1396{ 1705{
1397 struct ftrace_module_file_ops *file_ops; 1706 struct ftrace_module_file_ops *file_ops;
1398 struct ftrace_event_call *call, *p; 1707 struct ftrace_event_call *call, *p;
1399 bool found = false; 1708 bool clear_trace = false;
1400 1709
1401 down_write(&trace_event_mutex); 1710 down_write(&trace_event_sem);
1402 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1711 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1403 if (call->mod == mod) { 1712 if (call->mod == mod) {
1404 found = true; 1713 if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
1714 clear_trace = true;
1405 __trace_remove_event_call(call); 1715 __trace_remove_event_call(call);
1406 } 1716 }
1407 } 1717 }
@@ -1415,14 +1725,18 @@ static void trace_module_remove_events(struct module *mod)
1415 list_del(&file_ops->list); 1725 list_del(&file_ops->list);
1416 kfree(file_ops); 1726 kfree(file_ops);
1417 } 1727 }
1728 up_write(&trace_event_sem);
1418 1729
1419 /* 1730 /*
1420 * It is safest to reset the ring buffer if the module being unloaded 1731 * It is safest to reset the ring buffer if the module being unloaded
1421 * registered any events. 1732 * registered any events that were used. The only worry is if
1733 * a new module gets loaded, and takes on the same id as the events
1734 * of this module. When printing out the buffer, traced events left
1735 * over from this module may be passed to the new module events and
1736 * unexpected results may occur.
1422 */ 1737 */
1423 if (found) 1738 if (clear_trace)
1424 tracing_reset_current_online_cpus(); 1739 tracing_reset_all_online_cpus();
1425 up_write(&trace_event_mutex);
1426} 1740}
1427 1741
1428static int trace_module_notify(struct notifier_block *self, 1742static int trace_module_notify(struct notifier_block *self,
@@ -1443,14 +1757,433 @@ static int trace_module_notify(struct notifier_block *self,
1443 1757
1444 return 0; 1758 return 0;
1445} 1759}
1760
1761static int
1762__trace_add_new_mod_event(struct ftrace_event_call *call,
1763 struct trace_array *tr,
1764 struct ftrace_module_file_ops *file_ops)
1765{
1766 return __trace_add_new_event(call, tr,
1767 &file_ops->id, &file_ops->enable,
1768 &file_ops->filter, &file_ops->format);
1769}
1770
1446#else 1771#else
1447static int trace_module_notify(struct notifier_block *self, 1772static inline struct ftrace_module_file_ops *
1448 unsigned long val, void *data) 1773find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1774{
1775 return NULL;
1776}
1777static inline int trace_module_notify(struct notifier_block *self,
1778 unsigned long val, void *data)
1449{ 1779{
1450 return 0; 1780 return 0;
1451} 1781}
1782static inline int
1783__trace_add_new_mod_event(struct ftrace_event_call *call,
1784 struct trace_array *tr,
1785 struct ftrace_module_file_ops *file_ops)
1786{
1787 return -ENODEV;
1788}
1452#endif /* CONFIG_MODULES */ 1789#endif /* CONFIG_MODULES */
1453 1790
1791/* Create a new event directory structure for a trace directory. */
1792static void
1793__trace_add_event_dirs(struct trace_array *tr)
1794{
1795 struct ftrace_module_file_ops *file_ops = NULL;
1796 struct ftrace_event_call *call;
1797 int ret;
1798
1799 list_for_each_entry(call, &ftrace_events, list) {
1800 if (call->mod) {
1801 /*
1802 * Directories for events by modules need to
1803 * keep module ref counts when opened (as we don't
1804 * want the module to disappear when reading one
1805 * of these files). The file_ops keep account of
1806 * the module ref count.
1807 */
1808 file_ops = find_ftrace_file_ops(file_ops, call->mod);
1809 if (!file_ops)
1810 continue; /* Warn? */
1811 ret = __trace_add_new_mod_event(call, tr, file_ops);
1812 if (ret < 0)
1813 pr_warning("Could not create directory for event %s\n",
1814 call->name);
1815 continue;
1816 }
1817 ret = __trace_add_new_event(call, tr,
1818 &ftrace_event_id_fops,
1819 &ftrace_enable_fops,
1820 &ftrace_event_filter_fops,
1821 &ftrace_event_format_fops);
1822 if (ret < 0)
1823 pr_warning("Could not create directory for event %s\n",
1824 call->name);
1825 }
1826}
1827
1828#ifdef CONFIG_DYNAMIC_FTRACE
1829
1830/* Avoid typos */
1831#define ENABLE_EVENT_STR "enable_event"
1832#define DISABLE_EVENT_STR "disable_event"
1833
1834struct event_probe_data {
1835 struct ftrace_event_file *file;
1836 unsigned long count;
1837 int ref;
1838 bool enable;
1839};
1840
1841static struct ftrace_event_file *
1842find_event_file(struct trace_array *tr, const char *system, const char *event)
1843{
1844 struct ftrace_event_file *file;
1845 struct ftrace_event_call *call;
1846
1847 list_for_each_entry(file, &tr->events, list) {
1848
1849 call = file->event_call;
1850
1851 if (!call->name || !call->class || !call->class->reg)
1852 continue;
1853
1854 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
1855 continue;
1856
1857 if (strcmp(event, call->name) == 0 &&
1858 strcmp(system, call->class->system) == 0)
1859 return file;
1860 }
1861 return NULL;
1862}
1863
1864static void
1865event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
1866{
1867 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1868 struct event_probe_data *data = *pdata;
1869
1870 if (!data)
1871 return;
1872
1873 if (data->enable)
1874 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
1875 else
1876 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
1877}
1878
1879static void
1880event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data)
1881{
1882 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1883 struct event_probe_data *data = *pdata;
1884
1885 if (!data)
1886 return;
1887
1888 if (!data->count)
1889 return;
1890
1891 /* Skip if the event is in a state we want to switch to */
1892 if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
1893 return;
1894
1895 if (data->count != -1)
1896 (data->count)--;
1897
1898 event_enable_probe(ip, parent_ip, _data);
1899}
1900
1901static int
1902event_enable_print(struct seq_file *m, unsigned long ip,
1903 struct ftrace_probe_ops *ops, void *_data)
1904{
1905 struct event_probe_data *data = _data;
1906
1907 seq_printf(m, "%ps:", (void *)ip);
1908
1909 seq_printf(m, "%s:%s:%s",
1910 data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
1911 data->file->event_call->class->system,
1912 data->file->event_call->name);
1913
1914 if (data->count == -1)
1915 seq_printf(m, ":unlimited\n");
1916 else
1917 seq_printf(m, ":count=%ld\n", data->count);
1918
1919 return 0;
1920}
1921
1922static int
1923event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip,
1924 void **_data)
1925{
1926 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1927 struct event_probe_data *data = *pdata;
1928
1929 data->ref++;
1930 return 0;
1931}
1932
1933static void
1934event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip,
1935 void **_data)
1936{
1937 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1938 struct event_probe_data *data = *pdata;
1939
1940 if (WARN_ON_ONCE(data->ref <= 0))
1941 return;
1942
1943 data->ref--;
1944 if (!data->ref) {
1945 /* Remove the SOFT_MODE flag */
1946 __ftrace_event_enable_disable(data->file, 0, 1);
1947 module_put(data->file->event_call->mod);
1948 kfree(data);
1949 }
1950 *pdata = NULL;
1951}
1952
1953static struct ftrace_probe_ops event_enable_probe_ops = {
1954 .func = event_enable_probe,
1955 .print = event_enable_print,
1956 .init = event_enable_init,
1957 .free = event_enable_free,
1958};
1959
1960static struct ftrace_probe_ops event_enable_count_probe_ops = {
1961 .func = event_enable_count_probe,
1962 .print = event_enable_print,
1963 .init = event_enable_init,
1964 .free = event_enable_free,
1965};
1966
1967static struct ftrace_probe_ops event_disable_probe_ops = {
1968 .func = event_enable_probe,
1969 .print = event_enable_print,
1970 .init = event_enable_init,
1971 .free = event_enable_free,
1972};
1973
1974static struct ftrace_probe_ops event_disable_count_probe_ops = {
1975 .func = event_enable_count_probe,
1976 .print = event_enable_print,
1977 .init = event_enable_init,
1978 .free = event_enable_free,
1979};
1980
1981static int
1982event_enable_func(struct ftrace_hash *hash,
1983 char *glob, char *cmd, char *param, int enabled)
1984{
1985 struct trace_array *tr = top_trace_array();
1986 struct ftrace_event_file *file;
1987 struct ftrace_probe_ops *ops;
1988 struct event_probe_data *data;
1989 const char *system;
1990 const char *event;
1991 char *number;
1992 bool enable;
1993 int ret;
1994
1995 /* hash funcs only work with set_ftrace_filter */
1996 if (!enabled)
1997 return -EINVAL;
1998
1999 if (!param)
2000 return -EINVAL;
2001
2002 system = strsep(&param, ":");
2003 if (!param)
2004 return -EINVAL;
2005
2006 event = strsep(&param, ":");
2007
2008 mutex_lock(&event_mutex);
2009
2010 ret = -EINVAL;
2011 file = find_event_file(tr, system, event);
2012 if (!file)
2013 goto out;
2014
2015 enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
2016
2017 if (enable)
2018 ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops;
2019 else
2020 ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
2021
2022 if (glob[0] == '!') {
2023 unregister_ftrace_function_probe_func(glob+1, ops);
2024 ret = 0;
2025 goto out;
2026 }
2027
2028 ret = -ENOMEM;
2029 data = kzalloc(sizeof(*data), GFP_KERNEL);
2030 if (!data)
2031 goto out;
2032
2033 data->enable = enable;
2034 data->count = -1;
2035 data->file = file;
2036
2037 if (!param)
2038 goto out_reg;
2039
2040 number = strsep(&param, ":");
2041
2042 ret = -EINVAL;
2043 if (!strlen(number))
2044 goto out_free;
2045
2046 /*
2047 * We use the callback data field (which is a pointer)
2048 * as our counter.
2049 */
2050 ret = kstrtoul(number, 0, &data->count);
2051 if (ret)
2052 goto out_free;
2053
2054 out_reg:
2055 /* Don't let event modules unload while probe registered */
2056 ret = try_module_get(file->event_call->mod);
2057 if (!ret)
2058 goto out_free;
2059
2060 ret = __ftrace_event_enable_disable(file, 1, 1);
2061 if (ret < 0)
2062 goto out_put;
2063 ret = register_ftrace_function_probe(glob, ops, data);
2064 if (!ret)
2065 goto out_disable;
2066 out:
2067 mutex_unlock(&event_mutex);
2068 return ret;
2069
2070 out_disable:
2071 __ftrace_event_enable_disable(file, 0, 1);
2072 out_put:
2073 module_put(file->event_call->mod);
2074 out_free:
2075 kfree(data);
2076 goto out;
2077}
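
event_enable_func() runs when something like "schedule:enable_event:sched:sched_switch:5" is written to set_ftrace_filter: glob is the function pattern, cmd is "enable_event" or "disable_event", and param carries the rest. A self-contained illustration of the strsep()/kstrtoul() parsing used above (the system, event and count values are made up for the example):

        char buf[] = "sched:sched_switch:5";    /* what param would look like */
        char *param = buf;
        const char *system, *event;
        char *number;
        unsigned long count = -1;               /* -1 means "fire on every hit" */

        system = strsep(&param, ":");           /* "sched"        */
        event  = strsep(&param, ":");           /* "sched_switch" */
        number = strsep(&param, ":");           /* "5"; NULL when no count given */
        if (number && *number) {
                if (kstrtoul(number, 0, &count))
                        count = -1;             /* parse error: treat as unlimited */
        }
        /* count == 5 for this input */
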
2078
2079static struct ftrace_func_command event_enable_cmd = {
2080 .name = ENABLE_EVENT_STR,
2081 .func = event_enable_func,
2082};
2083
2084static struct ftrace_func_command event_disable_cmd = {
2085 .name = DISABLE_EVENT_STR,
2086 .func = event_enable_func,
2087};
2088
2089static __init int register_event_cmds(void)
2090{
2091 int ret;
2092
2093 ret = register_ftrace_command(&event_enable_cmd);
2094 if (WARN_ON(ret < 0))
2095 return ret;
2096 ret = register_ftrace_command(&event_disable_cmd);
2097 if (WARN_ON(ret < 0))
2098 unregister_ftrace_command(&event_enable_cmd);
2099 return ret;
2100}
2101#else
2102static inline int register_event_cmds(void) { return 0; }
2103#endif /* CONFIG_DYNAMIC_FTRACE */
2104
2105/*
2106 * The top level array has already had its ftrace_event_file
2107 * descriptors created in order to allow for early events to
2108 * be recorded. This function is called after the debugfs has been
2109 * initialized, and we now have to create the files associated
2110 * to the events.
2111 */
2112static __init void
2113__trace_early_add_event_dirs(struct trace_array *tr)
2114{
2115 struct ftrace_event_file *file;
2116 int ret;
2117
2118
2119 list_for_each_entry(file, &tr->events, list) {
2120 ret = event_create_dir(tr->event_dir, file,
2121 &ftrace_event_id_fops,
2122 &ftrace_enable_fops,
2123 &ftrace_event_filter_fops,
2124 &ftrace_event_format_fops);
2125 if (ret < 0)
2126 pr_warning("Could not create directory for event %s\n",
2127 file->event_call->name);
2128 }
2129}
2130
2131/*
 2132 * For early boot up, the top trace array needs to have
2133 * a list of events that can be enabled. This must be done before
2134 * the filesystem is set up in order to allow events to be traced
2135 * early.
2136 */
2137static __init void
2138__trace_early_add_events(struct trace_array *tr)
2139{
2140 struct ftrace_event_call *call;
2141 int ret;
2142
2143 list_for_each_entry(call, &ftrace_events, list) {
2144 /* Early boot up should not have any modules loaded */
2145 if (WARN_ON_ONCE(call->mod))
2146 continue;
2147
2148 ret = __trace_early_add_new_event(call, tr);
2149 if (ret < 0)
2150 pr_warning("Could not create early event %s\n",
2151 call->name);
2152 }
2153}
2154
2155/* Remove the event directory structure for a trace directory. */
2156static void
2157__trace_remove_event_dirs(struct trace_array *tr)
2158{
2159 struct ftrace_event_file *file, *next;
2160
2161 list_for_each_entry_safe(file, next, &tr->events, list) {
2162 list_del(&file->list);
2163 debugfs_remove_recursive(file->dir);
2164 remove_subsystem(file->system);
2165 kmem_cache_free(file_cachep, file);
2166 }
2167}
2168
2169static void
2170__add_event_to_tracers(struct ftrace_event_call *call,
2171 struct ftrace_module_file_ops *file_ops)
2172{
2173 struct trace_array *tr;
2174
2175 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
2176 if (file_ops)
2177 __trace_add_new_mod_event(call, tr, file_ops);
2178 else
2179 __trace_add_new_event(call, tr,
2180 &ftrace_event_id_fops,
2181 &ftrace_enable_fops,
2182 &ftrace_event_filter_fops,
2183 &ftrace_event_format_fops);
2184 }
2185}
2186
1454static struct notifier_block trace_module_nb = { 2187static struct notifier_block trace_module_nb = {
1455 .notifier_call = trace_module_notify, 2188 .notifier_call = trace_module_notify,
1456 .priority = 0, 2189 .priority = 0,
@@ -1464,15 +2197,135 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1464static __init int setup_trace_event(char *str) 2197static __init int setup_trace_event(char *str)
1465{ 2198{
1466 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); 2199 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
1467 ring_buffer_expanded = 1; 2200 ring_buffer_expanded = true;
1468 tracing_selftest_disabled = 1; 2201 tracing_selftest_disabled = true;
1469 2202
1470 return 1; 2203 return 1;
1471} 2204}
1472__setup("trace_event=", setup_trace_event); 2205__setup("trace_event=", setup_trace_event);
1473 2206
2207/* Expects to have event_mutex held when called */
2208static int
2209create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
2210{
2211 struct dentry *d_events;
2212 struct dentry *entry;
2213
2214 entry = debugfs_create_file("set_event", 0644, parent,
2215 tr, &ftrace_set_event_fops);
2216 if (!entry) {
2217 pr_warning("Could not create debugfs 'set_event' entry\n");
2218 return -ENOMEM;
2219 }
2220
2221 d_events = debugfs_create_dir("events", parent);
2222 if (!d_events) {
2223 pr_warning("Could not create debugfs 'events' directory\n");
2224 return -ENOMEM;
2225 }
2226
2227 /* ring buffer internal formats */
2228 trace_create_file("header_page", 0444, d_events,
2229 ring_buffer_print_page_header,
2230 &ftrace_show_header_fops);
2231
2232 trace_create_file("header_event", 0444, d_events,
2233 ring_buffer_print_entry_header,
2234 &ftrace_show_header_fops);
2235
2236 trace_create_file("enable", 0644, d_events,
2237 tr, &ftrace_tr_enable_fops);
2238
2239 tr->event_dir = d_events;
2240
2241 return 0;
2242}
2243
2244/**
 2245 * event_trace_add_tracer - add an instance of a trace_array to events
2246 * @parent: The parent dentry to place the files/directories for events in
2247 * @tr: The trace array associated with these events
2248 *
2249 * When a new instance is created, it needs to set up its events
2250 * directory, as well as other files associated with events. It also
 2251 * creates the event hierarchy in the @parent/events directory.
2252 *
2253 * Returns 0 on success.
2254 */
2255int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
2256{
2257 int ret;
2258
2259 mutex_lock(&event_mutex);
2260
2261 ret = create_event_toplevel_files(parent, tr);
2262 if (ret)
2263 goto out_unlock;
2264
2265 down_write(&trace_event_sem);
2266 __trace_add_event_dirs(tr);
2267 up_write(&trace_event_sem);
2268
2269 out_unlock:
2270 mutex_unlock(&event_mutex);
2271
2272 return ret;
2273}
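
Both the add and remove paths above keep the same lock ordering: event_mutex is taken first, then trace_event_sem is held for write around the directory walk, matching trace_remove_event_call() earlier in the file. Reduced to its skeleton:

        mutex_lock(&event_mutex);
        down_write(&trace_event_sem);
        /* create or tear down this trace_array's per-event directories */
        up_write(&trace_event_sem);
        mutex_unlock(&event_mutex);
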
2274
2275/*
2276 * The top trace array already had its file descriptors created.
2277 * Now the files themselves need to be created.
2278 */
2279static __init int
2280early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
2281{
2282 int ret;
2283
2284 mutex_lock(&event_mutex);
2285
2286 ret = create_event_toplevel_files(parent, tr);
2287 if (ret)
2288 goto out_unlock;
2289
2290 down_write(&trace_event_sem);
2291 __trace_early_add_event_dirs(tr);
2292 up_write(&trace_event_sem);
2293
2294 out_unlock:
2295 mutex_unlock(&event_mutex);
2296
2297 return ret;
2298}
2299
2300int event_trace_del_tracer(struct trace_array *tr)
2301{
2302 /* Disable any running events */
2303 __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
2304
2305 mutex_lock(&event_mutex);
2306
2307 down_write(&trace_event_sem);
2308 __trace_remove_event_dirs(tr);
2309 debugfs_remove_recursive(tr->event_dir);
2310 up_write(&trace_event_sem);
2311
2312 tr->event_dir = NULL;
2313
2314 mutex_unlock(&event_mutex);
2315
2316 return 0;
2317}
2318
2319static __init int event_trace_memsetup(void)
2320{
2321 field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
2322 file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
2323 return 0;
2324}
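
KMEM_CACHE() is shorthand for kmem_cache_create() keyed on the struct's name, size and alignment; SLAB_PANIC makes boot fail loudly if the cache cannot be set up, which is why the return values go unchecked here. The file_cachep line expands to roughly:

        file_cachep = kmem_cache_create("ftrace_event_file",
                                        sizeof(struct ftrace_event_file),
                                        __alignof__(struct ftrace_event_file),
                                        SLAB_PANIC, NULL);
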
2325
1474static __init int event_trace_enable(void) 2326static __init int event_trace_enable(void)
1475{ 2327{
2328 struct trace_array *tr = top_trace_array();
1476 struct ftrace_event_call **iter, *call; 2329 struct ftrace_event_call **iter, *call;
1477 char *buf = bootup_event_buf; 2330 char *buf = bootup_event_buf;
1478 char *token; 2331 char *token;
@@ -1486,6 +2339,14 @@ static __init int event_trace_enable(void)
1486 list_add(&call->list, &ftrace_events); 2339 list_add(&call->list, &ftrace_events);
1487 } 2340 }
1488 2341
2342 /*
2343 * We need the top trace array to have a working set of trace
2344 * points at early init, before the debug files and directories
2345 * are created. Create the file entries now, and attach them
2346 * to the actual file dentries later.
2347 */
2348 __trace_early_add_events(tr);
2349
1489 while (true) { 2350 while (true) {
1490 token = strsep(&buf, ","); 2351 token = strsep(&buf, ",");
1491 2352
@@ -1494,73 +2355,43 @@ static __init int event_trace_enable(void)
1494 if (!*token) 2355 if (!*token)
1495 continue; 2356 continue;
1496 2357
1497 ret = ftrace_set_clr_event(token, 1); 2358 ret = ftrace_set_clr_event(tr, token, 1);
1498 if (ret) 2359 if (ret)
1499 pr_warn("Failed to enable trace event: %s\n", token); 2360 pr_warn("Failed to enable trace event: %s\n", token);
1500 } 2361 }
1501 2362
1502 trace_printk_start_comm(); 2363 trace_printk_start_comm();
1503 2364
2365 register_event_cmds();
2366
1504 return 0; 2367 return 0;
1505} 2368}
1506 2369
1507static __init int event_trace_init(void) 2370static __init int event_trace_init(void)
1508{ 2371{
1509 struct ftrace_event_call *call; 2372 struct trace_array *tr;
1510 struct dentry *d_tracer; 2373 struct dentry *d_tracer;
1511 struct dentry *entry; 2374 struct dentry *entry;
1512 struct dentry *d_events;
1513 int ret; 2375 int ret;
1514 2376
2377 tr = top_trace_array();
2378
1515 d_tracer = tracing_init_dentry(); 2379 d_tracer = tracing_init_dentry();
1516 if (!d_tracer) 2380 if (!d_tracer)
1517 return 0; 2381 return 0;
1518 2382
1519 entry = debugfs_create_file("available_events", 0444, d_tracer, 2383 entry = debugfs_create_file("available_events", 0444, d_tracer,
1520 NULL, &ftrace_avail_fops); 2384 tr, &ftrace_avail_fops);
1521 if (!entry) 2385 if (!entry)
1522 pr_warning("Could not create debugfs " 2386 pr_warning("Could not create debugfs "
1523 "'available_events' entry\n"); 2387 "'available_events' entry\n");
1524 2388
1525 entry = debugfs_create_file("set_event", 0644, d_tracer,
1526 NULL, &ftrace_set_event_fops);
1527 if (!entry)
1528 pr_warning("Could not create debugfs "
1529 "'set_event' entry\n");
1530
1531 d_events = event_trace_events_dir();
1532 if (!d_events)
1533 return 0;
1534
1535 /* ring buffer internal formats */
1536 trace_create_file("header_page", 0444, d_events,
1537 ring_buffer_print_page_header,
1538 &ftrace_show_header_fops);
1539
1540 trace_create_file("header_event", 0444, d_events,
1541 ring_buffer_print_entry_header,
1542 &ftrace_show_header_fops);
1543
1544 trace_create_file("enable", 0644, d_events,
1545 NULL, &ftrace_system_enable_fops);
1546
1547 if (trace_define_common_fields()) 2389 if (trace_define_common_fields())
1548 pr_warning("tracing: Failed to allocate common fields"); 2390 pr_warning("tracing: Failed to allocate common fields");
1549 2391
1550 /* 2392 ret = early_event_add_tracer(d_tracer, tr);
1551 * Early initialization already enabled ftrace event. 2393 if (ret)
1552 * Now it's only necessary to create the event directory. 2394 return ret;
1553 */
1554 list_for_each_entry(call, &ftrace_events, list) {
1555
1556 ret = event_create_dir(call, d_events,
1557 &ftrace_event_id_fops,
1558 &ftrace_enable_fops,
1559 &ftrace_event_filter_fops,
1560 &ftrace_event_format_fops);
1561 if (ret < 0)
1562 event_remove(call);
1563 }
1564 2395
1565 ret = register_module_notifier(&trace_module_nb); 2396 ret = register_module_notifier(&trace_module_nb);
1566 if (ret) 2397 if (ret)
@@ -1568,6 +2399,7 @@ static __init int event_trace_init(void)
1568 2399
1569 return 0; 2400 return 0;
1570} 2401}
2402early_initcall(event_trace_memsetup);
1571core_initcall(event_trace_enable); 2403core_initcall(event_trace_enable);
1572fs_initcall(event_trace_init); 2404fs_initcall(event_trace_init);
1573 2405
@@ -1627,13 +2459,20 @@ static __init void event_test_stuff(void)
1627 */ 2459 */
1628static __init void event_trace_self_tests(void) 2460static __init void event_trace_self_tests(void)
1629{ 2461{
2462 struct ftrace_subsystem_dir *dir;
2463 struct ftrace_event_file *file;
1630 struct ftrace_event_call *call; 2464 struct ftrace_event_call *call;
1631 struct event_subsystem *system; 2465 struct event_subsystem *system;
2466 struct trace_array *tr;
1632 int ret; 2467 int ret;
1633 2468
2469 tr = top_trace_array();
2470
1634 pr_info("Running tests on trace events:\n"); 2471 pr_info("Running tests on trace events:\n");
1635 2472
1636 list_for_each_entry(call, &ftrace_events, list) { 2473 list_for_each_entry(file, &tr->events, list) {
2474
2475 call = file->event_call;
1637 2476
1638 /* Only test those that have a probe */ 2477 /* Only test those that have a probe */
1639 if (!call->class || !call->class->probe) 2478 if (!call->class || !call->class->probe)
@@ -1657,15 +2496,15 @@ static __init void event_trace_self_tests(void)
1657 * If an event is already enabled, someone is using 2496 * If an event is already enabled, someone is using
1658 * it and the self test should not be on. 2497 * it and the self test should not be on.
1659 */ 2498 */
1660 if (call->flags & TRACE_EVENT_FL_ENABLED) { 2499 if (file->flags & FTRACE_EVENT_FL_ENABLED) {
1661 pr_warning("Enabled event during self test!\n"); 2500 pr_warning("Enabled event during self test!\n");
1662 WARN_ON_ONCE(1); 2501 WARN_ON_ONCE(1);
1663 continue; 2502 continue;
1664 } 2503 }
1665 2504
1666 ftrace_event_enable_disable(call, 1); 2505 ftrace_event_enable_disable(file, 1);
1667 event_test_stuff(); 2506 event_test_stuff();
1668 ftrace_event_enable_disable(call, 0); 2507 ftrace_event_enable_disable(file, 0);
1669 2508
1670 pr_cont("OK\n"); 2509 pr_cont("OK\n");
1671 } 2510 }
@@ -1674,7 +2513,9 @@ static __init void event_trace_self_tests(void)
1674 2513
1675 pr_info("Running tests on trace event systems:\n"); 2514 pr_info("Running tests on trace event systems:\n");
1676 2515
1677 list_for_each_entry(system, &event_subsystems, list) { 2516 list_for_each_entry(dir, &tr->systems, list) {
2517
2518 system = dir->subsystem;
1678 2519
1679 /* the ftrace system is special, skip it */ 2520 /* the ftrace system is special, skip it */
1680 if (strcmp(system->name, "ftrace") == 0) 2521 if (strcmp(system->name, "ftrace") == 0)
@@ -1682,7 +2523,7 @@ static __init void event_trace_self_tests(void)
1682 2523
1683 pr_info("Testing event system %s: ", system->name); 2524 pr_info("Testing event system %s: ", system->name);
1684 2525
1685 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); 2526 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
1686 if (WARN_ON_ONCE(ret)) { 2527 if (WARN_ON_ONCE(ret)) {
1687 pr_warning("error enabling system %s\n", 2528 pr_warning("error enabling system %s\n",
1688 system->name); 2529 system->name);
@@ -1691,7 +2532,7 @@ static __init void event_trace_self_tests(void)
1691 2532
1692 event_test_stuff(); 2533 event_test_stuff();
1693 2534
1694 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); 2535 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
1695 if (WARN_ON_ONCE(ret)) { 2536 if (WARN_ON_ONCE(ret)) {
1696 pr_warning("error disabling system %s\n", 2537 pr_warning("error disabling system %s\n",
1697 system->name); 2538 system->name);
@@ -1706,7 +2547,7 @@ static __init void event_trace_self_tests(void)
1706 pr_info("Running tests on all trace events:\n"); 2547 pr_info("Running tests on all trace events:\n");
1707 pr_info("Testing all events: "); 2548 pr_info("Testing all events: ");
1708 2549
1709 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); 2550 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
1710 if (WARN_ON_ONCE(ret)) { 2551 if (WARN_ON_ONCE(ret)) {
1711 pr_warning("error enabling all events\n"); 2552 pr_warning("error enabling all events\n");
1712 return; 2553 return;
@@ -1715,7 +2556,7 @@ static __init void event_trace_self_tests(void)
1715 event_test_stuff(); 2556 event_test_stuff();
1716 2557
1717 /* reset sysname */ 2558 /* reset sysname */
1718 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); 2559 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
1719 if (WARN_ON_ONCE(ret)) { 2560 if (WARN_ON_ONCE(ret)) {
1720 pr_warning("error disabling all events\n"); 2561 pr_warning("error disabling all events\n");
1721 return; 2562 return;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e5b0ca8b8d4d..a6361178de5a 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system,
658 mutex_unlock(&event_mutex); 658 mutex_unlock(&event_mutex);
659} 659}
660 660
661static struct ftrace_event_field *
662__find_event_field(struct list_head *head, char *name)
663{
664 struct ftrace_event_field *field;
665
666 list_for_each_entry(field, head, link) {
667 if (!strcmp(field->name, name))
668 return field;
669 }
670
671 return NULL;
672}
673
674static struct ftrace_event_field *
675find_event_field(struct ftrace_event_call *call, char *name)
676{
677 struct ftrace_event_field *field;
678 struct list_head *head;
679
680 field = __find_event_field(&ftrace_common_fields, name);
681 if (field)
682 return field;
683
684 head = trace_get_fields(call);
685 return __find_event_field(head, name);
686}
687
688static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 661static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
689{ 662{
690 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); 663 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
@@ -1337,7 +1310,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
1337 return NULL; 1310 return NULL;
1338 } 1311 }
1339 1312
1340 field = find_event_field(call, operand1); 1313 field = trace_find_event_field(call, operand1);
1341 if (!field) { 1314 if (!field) {
1342 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); 1315 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
1343 return NULL; 1316 return NULL;
@@ -1907,16 +1880,17 @@ out_unlock:
1907 return err; 1880 return err;
1908} 1881}
1909 1882
1910int apply_subsystem_event_filter(struct event_subsystem *system, 1883int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1911 char *filter_string) 1884 char *filter_string)
1912{ 1885{
1886 struct event_subsystem *system = dir->subsystem;
1913 struct event_filter *filter; 1887 struct event_filter *filter;
1914 int err = 0; 1888 int err = 0;
1915 1889
1916 mutex_lock(&event_mutex); 1890 mutex_lock(&event_mutex);
1917 1891
1918 /* Make sure the system still has events */ 1892 /* Make sure the system still has events */
1919 if (!system->nr_events) { 1893 if (!dir->nr_events) {
1920 err = -ENODEV; 1894 err = -ENODEV;
1921 goto out_unlock; 1895 goto out_unlock;
1922 } 1896 }
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e039906b037d..d21a74670088 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void) \
129 129
130#undef FTRACE_ENTRY 130#undef FTRACE_ENTRY
131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ 131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
132int \ 132static int __init \
133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ 133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
134{ \ 134{ \
135 struct struct_name field; \ 135 struct struct_name field; \
@@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ 168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
169 regfn) \ 169 regfn) \
170 \ 170 \
171struct ftrace_event_class event_class_ftrace_##call = { \ 171struct ftrace_event_class __refdata event_class_ftrace_##call = { \
172 .system = __stringify(TRACE_SYSTEM), \ 172 .system = __stringify(TRACE_SYSTEM), \
173 .define_fields = ftrace_define_fields_##call, \ 173 .define_fields = ftrace_define_fields_##call, \
174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
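
Marking ftrace_define_fields_##name as __init lets its text be discarded after boot, but event_class_ftrace_##call is ordinary runtime data that stores a pointer to it, which would normally trigger a modpost section-mismatch warning. __refdata annotates the structure so that reference is accepted. A hypothetical instance of the pattern (the "example" names are illustrative only):

        static int __init ftrace_define_fields_example(struct ftrace_event_call *call);

        struct ftrace_event_class __refdata event_class_ftrace_example = {
                .system         = "ftrace",
                .define_fields  = ftrace_define_fields_example,
        };
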
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 601152523326..c4d6d7191988 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void);
28static int function_trace_init(struct trace_array *tr) 28static int function_trace_init(struct trace_array *tr)
29{ 29{
30 func_trace = tr; 30 func_trace = tr;
31 tr->cpu = get_cpu(); 31 tr->trace_buffer.cpu = get_cpu();
32 put_cpu(); 32 put_cpu();
33 33
34 tracing_start_cmdline_record(); 34 tracing_start_cmdline_record();
@@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr)
44 44
45static void function_trace_start(struct trace_array *tr) 45static void function_trace_start(struct trace_array *tr)
46{ 46{
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(&tr->trace_buffer);
48} 48}
49 49
50/* Our option */ 50/* Our option */
@@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
76 goto out; 76 goto out;
77 77
78 cpu = smp_processor_id(); 78 cpu = smp_processor_id();
79 data = tr->data[cpu]; 79 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
80 if (!atomic_read(&data->disabled)) { 80 if (!atomic_read(&data->disabled)) {
81 local_save_flags(flags); 81 local_save_flags(flags);
82 trace_function(tr, ip, parent_ip, flags, pc); 82 trace_function(tr, ip, parent_ip, flags, pc);
@@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
107 */ 107 */
108 local_irq_save(flags); 108 local_irq_save(flags);
109 cpu = raw_smp_processor_id(); 109 cpu = raw_smp_processor_id();
110 data = tr->data[cpu]; 110 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
111 disabled = atomic_inc_return(&data->disabled); 111 disabled = atomic_inc_return(&data->disabled);
112 112
113 if (likely(disabled == 1)) { 113 if (likely(disabled == 1)) {
@@ -214,66 +214,89 @@ static struct tracer function_trace __read_mostly =
214}; 214};
215 215
216#ifdef CONFIG_DYNAMIC_FTRACE 216#ifdef CONFIG_DYNAMIC_FTRACE
217static void 217static int update_count(void **data)
218ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
219{ 218{
220 long *count = (long *)data; 219 unsigned long *count = (long *)data;
221
222 if (tracing_is_on())
223 return;
224 220
225 if (!*count) 221 if (!*count)
226 return; 222 return 0;
227 223
228 if (*count != -1) 224 if (*count != -1)
229 (*count)--; 225 (*count)--;
230 226
231 tracing_on(); 227 return 1;
232} 228}
233 229
234static void 230static void
235ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) 231ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
236{ 232{
237 long *count = (long *)data; 233 if (tracing_is_on())
234 return;
235
236 if (update_count(data))
237 tracing_on();
238}
238 239
240static void
241ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
242{
239 if (!tracing_is_on()) 243 if (!tracing_is_on())
240 return; 244 return;
241 245
242 if (!*count) 246 if (update_count(data))
247 tracing_off();
248}
249
250static void
251ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
252{
253 if (tracing_is_on())
243 return; 254 return;
244 255
245 if (*count != -1) 256 tracing_on();
246 (*count)--; 257}
258
259static void
260ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
261{
262 if (!tracing_is_on())
263 return;
247 264
248 tracing_off(); 265 tracing_off();
249} 266}
250 267
251static int 268/*
252ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 269 * Skip 4:
253 struct ftrace_probe_ops *ops, void *data); 270 * ftrace_stacktrace()
271 * function_trace_probe_call()
272 * ftrace_ops_list_func()
273 * ftrace_call()
274 */
275#define STACK_SKIP 4
254 276
255static struct ftrace_probe_ops traceon_probe_ops = { 277static void
256 .func = ftrace_traceon, 278ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
257 .print = ftrace_trace_onoff_print, 279{
258}; 280 trace_dump_stack(STACK_SKIP);
281}
259 282
260static struct ftrace_probe_ops traceoff_probe_ops = { 283static void
261 .func = ftrace_traceoff, 284ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
262 .print = ftrace_trace_onoff_print, 285{
263}; 286 if (!tracing_is_on())
287 return;
288
289 if (update_count(data))
290 trace_dump_stack(STACK_SKIP);
291}
264 292
265static int 293static int
266ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 294ftrace_probe_print(const char *name, struct seq_file *m,
267 struct ftrace_probe_ops *ops, void *data) 295 unsigned long ip, void *data)
268{ 296{
269 long count = (long)data; 297 long count = (long)data;
270 298
271 seq_printf(m, "%ps:", (void *)ip); 299 seq_printf(m, "%ps:%s", (void *)ip, name);
272
273 if (ops == &traceon_probe_ops)
274 seq_printf(m, "traceon");
275 else
276 seq_printf(m, "traceoff");
277 300
278 if (count == -1) 301 if (count == -1)
279 seq_printf(m, ":unlimited\n"); 302 seq_printf(m, ":unlimited\n");
@@ -284,26 +307,61 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
284} 307}
285 308
286static int 309static int
287ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) 310ftrace_traceon_print(struct seq_file *m, unsigned long ip,
311 struct ftrace_probe_ops *ops, void *data)
288{ 312{
289 struct ftrace_probe_ops *ops; 313 return ftrace_probe_print("traceon", m, ip, data);
290 314}
291 /* we register both traceon and traceoff to this callback */
292 if (strcmp(cmd, "traceon") == 0)
293 ops = &traceon_probe_ops;
294 else
295 ops = &traceoff_probe_ops;
296 315
297 unregister_ftrace_function_probe_func(glob, ops); 316static int
317ftrace_traceoff_print(struct seq_file *m, unsigned long ip,
318 struct ftrace_probe_ops *ops, void *data)
319{
320 return ftrace_probe_print("traceoff", m, ip, data);
321}
298 322
299 return 0; 323static int
324ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
325 struct ftrace_probe_ops *ops, void *data)
326{
327 return ftrace_probe_print("stacktrace", m, ip, data);
300} 328}
301 329
330static struct ftrace_probe_ops traceon_count_probe_ops = {
331 .func = ftrace_traceon_count,
332 .print = ftrace_traceon_print,
333};
334
335static struct ftrace_probe_ops traceoff_count_probe_ops = {
336 .func = ftrace_traceoff_count,
337 .print = ftrace_traceoff_print,
338};
339
340static struct ftrace_probe_ops stacktrace_count_probe_ops = {
341 .func = ftrace_stacktrace_count,
342 .print = ftrace_stacktrace_print,
343};
344
345static struct ftrace_probe_ops traceon_probe_ops = {
346 .func = ftrace_traceon,
347 .print = ftrace_traceon_print,
348};
349
350static struct ftrace_probe_ops traceoff_probe_ops = {
351 .func = ftrace_traceoff,
352 .print = ftrace_traceoff_print,
353};
354
355static struct ftrace_probe_ops stacktrace_probe_ops = {
356 .func = ftrace_stacktrace,
357 .print = ftrace_stacktrace_print,
358};
359
302static int 360static int
303ftrace_trace_onoff_callback(struct ftrace_hash *hash, 361ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
304 char *glob, char *cmd, char *param, int enable) 362 struct ftrace_hash *hash, char *glob,
363 char *cmd, char *param, int enable)
305{ 364{
306 struct ftrace_probe_ops *ops;
307 void *count = (void *)-1; 365 void *count = (void *)-1;
308 char *number; 366 char *number;
309 int ret; 367 int ret;
@@ -312,14 +370,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
312 if (!enable) 370 if (!enable)
313 return -EINVAL; 371 return -EINVAL;
314 372
315 if (glob[0] == '!') 373 if (glob[0] == '!') {
316 return ftrace_trace_onoff_unreg(glob+1, cmd, param); 374 unregister_ftrace_function_probe_func(glob+1, ops);
317 375 return 0;
318 /* we register both traceon and traceoff to this callback */ 376 }
319 if (strcmp(cmd, "traceon") == 0)
320 ops = &traceon_probe_ops;
321 else
322 ops = &traceoff_probe_ops;
323 377
324 if (!param) 378 if (!param)
325 goto out_reg; 379 goto out_reg;
@@ -343,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
343 return ret < 0 ? ret : 0; 397 return ret < 0 ? ret : 0;
344} 398}
345 399
400static int
401ftrace_trace_onoff_callback(struct ftrace_hash *hash,
402 char *glob, char *cmd, char *param, int enable)
403{
404 struct ftrace_probe_ops *ops;
405
406 /* we register both traceon and traceoff to this callback */
407 if (strcmp(cmd, "traceon") == 0)
408 ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
409 else
410 ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
411
412 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
413 param, enable);
414}
415
416static int
417ftrace_stacktrace_callback(struct ftrace_hash *hash,
418 char *glob, char *cmd, char *param, int enable)
419{
420 struct ftrace_probe_ops *ops;
421
422 ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
423
424 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
425 param, enable);
426}
427
346static struct ftrace_func_command ftrace_traceon_cmd = { 428static struct ftrace_func_command ftrace_traceon_cmd = {
347 .name = "traceon", 429 .name = "traceon",
348 .func = ftrace_trace_onoff_callback, 430 .func = ftrace_trace_onoff_callback,
@@ -353,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = {
353 .func = ftrace_trace_onoff_callback, 435 .func = ftrace_trace_onoff_callback,
354}; 436};
355 437
438static struct ftrace_func_command ftrace_stacktrace_cmd = {
439 .name = "stacktrace",
440 .func = ftrace_stacktrace_callback,
441};
442
356static int __init init_func_cmd_traceon(void) 443static int __init init_func_cmd_traceon(void)
357{ 444{
358 int ret; 445 int ret;
@@ -364,6 +451,12 @@ static int __init init_func_cmd_traceon(void)
364 ret = register_ftrace_command(&ftrace_traceon_cmd); 451 ret = register_ftrace_command(&ftrace_traceon_cmd);
365 if (ret) 452 if (ret)
366 unregister_ftrace_command(&ftrace_traceoff_cmd); 453 unregister_ftrace_command(&ftrace_traceoff_cmd);
454
455 ret = register_ftrace_command(&ftrace_stacktrace_cmd);
456 if (ret) {
457 unregister_ftrace_command(&ftrace_traceoff_cmd);
458 unregister_ftrace_command(&ftrace_traceon_cmd);
459 }
367 return ret; 460 return ret;
368} 461}
369#else 462#else
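
ftrace_stacktrace_callback() picks the counted or the plain probe_ops depending on whether a ":count" parameter was supplied. A usage sketch for the new command (the function name and count are only examples):

        /*
         *   echo 'kfree:stacktrace:2' > set_ftrace_filter
         *
         * arms stacktrace_count_probe_ops on kfree(); while tracing is on,
         * ftrace_stacktrace_count() calls trace_dump_stack(STACK_SKIP) on the
         * next two hits, with update_count() decrementing the counter, and
         *
         *   echo '!kfree:stacktrace' > set_ftrace_filter
         *
         * removes the probe again via unregister_ftrace_function_probe_func().
         */
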
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 39ada66389cc..8388bc99f2ee 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr,
218{ 218{
219 struct ftrace_event_call *call = &event_funcgraph_entry; 219 struct ftrace_event_call *call = &event_funcgraph_entry;
220 struct ring_buffer_event *event; 220 struct ring_buffer_event *event;
221 struct ring_buffer *buffer = tr->buffer; 221 struct ring_buffer *buffer = tr->trace_buffer.buffer;
222 struct ftrace_graph_ent_entry *entry; 222 struct ftrace_graph_ent_entry *entry;
223 223
224 if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) 224 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
265 265
266 local_irq_save(flags); 266 local_irq_save(flags);
267 cpu = raw_smp_processor_id(); 267 cpu = raw_smp_processor_id();
268 data = tr->data[cpu]; 268 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
269 disabled = atomic_inc_return(&data->disabled); 269 disabled = atomic_inc_return(&data->disabled);
270 if (likely(disabled == 1)) { 270 if (likely(disabled == 1)) {
271 pc = preempt_count(); 271 pc = preempt_count();
@@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr,
323{ 323{
324 struct ftrace_event_call *call = &event_funcgraph_exit; 324 struct ftrace_event_call *call = &event_funcgraph_exit;
325 struct ring_buffer_event *event; 325 struct ring_buffer_event *event;
326 struct ring_buffer *buffer = tr->buffer; 326 struct ring_buffer *buffer = tr->trace_buffer.buffer;
327 struct ftrace_graph_ret_entry *entry; 327 struct ftrace_graph_ret_entry *entry;
328 328
329 if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) 329 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
350 350
351 local_irq_save(flags); 351 local_irq_save(flags);
352 cpu = raw_smp_processor_id(); 352 cpu = raw_smp_processor_id();
353 data = tr->data[cpu]; 353 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
354 disabled = atomic_inc_return(&data->disabled); 354 disabled = atomic_inc_return(&data->disabled);
355 if (likely(disabled == 1)) { 355 if (likely(disabled == 1)) {
356 pc = preempt_count(); 356 pc = preempt_count();
@@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter,
560 * We need to consume the current entry to see 560 * We need to consume the current entry to see
561 * the next one. 561 * the next one.
562 */ 562 */
563 ring_buffer_consume(iter->tr->buffer, iter->cpu, 563 ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,
564 NULL, NULL); 564 NULL, NULL);
565 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 565 event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,
566 NULL, NULL); 566 NULL, NULL);
567 } 567 }
568 568
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 713a2cac4881..b19d065a28cb 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -32,7 +32,8 @@ enum {
32 32
33static int trace_type __read_mostly; 33static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_flags;
36static bool function_enabled;
36 37
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph); 38static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph); 39static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr,
121 if (!irqs_disabled_flags(*flags)) 122 if (!irqs_disabled_flags(*flags))
122 return 0; 123 return 0;
123 124
124 *data = tr->data[cpu]; 125 *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
125 disabled = atomic_inc_return(&(*data)->disabled); 126 disabled = atomic_inc_return(&(*data)->disabled);
126 127
127 if (likely(disabled == 1)) 128 if (likely(disabled == 1))
@@ -175,7 +176,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
175 per_cpu(tracing_cpu, cpu) = 0; 176 per_cpu(tracing_cpu, cpu) = 0;
176 177
177 tracing_max_latency = 0; 178 tracing_max_latency = 0;
178 tracing_reset_online_cpus(irqsoff_trace); 179 tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
179 180
180 return start_irqsoff_tracer(irqsoff_trace, set); 181 return start_irqsoff_tracer(irqsoff_trace, set);
181} 182}
@@ -380,7 +381,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
380 if (per_cpu(tracing_cpu, cpu)) 381 if (per_cpu(tracing_cpu, cpu))
381 return; 382 return;
382 383
383 data = tr->data[cpu]; 384 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
384 385
385 if (unlikely(!data) || atomic_read(&data->disabled)) 386 if (unlikely(!data) || atomic_read(&data->disabled))
386 return; 387 return;
@@ -418,7 +419,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
418 if (!tracer_enabled) 419 if (!tracer_enabled)
419 return; 420 return;
420 421
421 data = tr->data[cpu]; 422 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
422 423
423 if (unlikely(!data) || 424 if (unlikely(!data) ||
424 !data->critical_start || atomic_read(&data->disabled)) 425 !data->critical_start || atomic_read(&data->disabled))
@@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
528} 529}
529#endif /* CONFIG_PREEMPT_TRACER */ 530#endif /* CONFIG_PREEMPT_TRACER */
530 531
531static int start_irqsoff_tracer(struct trace_array *tr, int graph) 532static int register_irqsoff_function(int graph, int set)
532{ 533{
533 int ret = 0; 534 int ret;
534 535
535 if (!graph) 536 /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
536 ret = register_ftrace_function(&trace_ops); 537 if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
537 else 538 return 0;
539
540 if (graph)
538 ret = register_ftrace_graph(&irqsoff_graph_return, 541 ret = register_ftrace_graph(&irqsoff_graph_return,
539 &irqsoff_graph_entry); 542 &irqsoff_graph_entry);
543 else
544 ret = register_ftrace_function(&trace_ops);
545
546 if (!ret)
547 function_enabled = true;
548
549 return ret;
550}
551
552static void unregister_irqsoff_function(int graph)
553{
554 if (!function_enabled)
555 return;
556
557 if (graph)
558 unregister_ftrace_graph();
559 else
560 unregister_ftrace_function(&trace_ops);
561
562 function_enabled = false;
563}
564
565static void irqsoff_function_set(int set)
566{
567 if (set)
568 register_irqsoff_function(is_graph(), 1);
569 else
570 unregister_irqsoff_function(is_graph());
571}
572
573static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set)
574{
575 if (mask & TRACE_ITER_FUNCTION)
576 irqsoff_function_set(set);
577
578 return trace_keep_overwrite(tracer, mask, set);
579}
580
581static int start_irqsoff_tracer(struct trace_array *tr, int graph)
582{
583 int ret;
584
585 ret = register_irqsoff_function(graph, 0);
540 586
541 if (!ret && tracing_is_enabled()) 587 if (!ret && tracing_is_enabled())
542 tracer_enabled = 1; 588 tracer_enabled = 1;
@@ -550,22 +596,22 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
550{ 596{
551 tracer_enabled = 0; 597 tracer_enabled = 0;
552 598
553 if (!graph) 599 unregister_irqsoff_function(graph);
554 unregister_ftrace_function(&trace_ops);
555 else
556 unregister_ftrace_graph();
557} 600}
558 601
559static void __irqsoff_tracer_init(struct trace_array *tr) 602static void __irqsoff_tracer_init(struct trace_array *tr)
560{ 603{
561 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; 604 save_flags = trace_flags;
562 trace_flags |= TRACE_ITER_LATENCY_FMT; 605
606 /* non overwrite screws up the latency tracers */
607 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
608 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
563 609
564 tracing_max_latency = 0; 610 tracing_max_latency = 0;
565 irqsoff_trace = tr; 611 irqsoff_trace = tr;
566 /* make sure that the tracer is visible */ 612 /* make sure that the tracer is visible */
567 smp_wmb(); 613 smp_wmb();
568 tracing_reset_online_cpus(tr); 614 tracing_reset_online_cpus(&tr->trace_buffer);
569 615
570 if (start_irqsoff_tracer(tr, is_graph())) 616 if (start_irqsoff_tracer(tr, is_graph()))
571 printk(KERN_ERR "failed to start irqsoff tracer\n"); 617 printk(KERN_ERR "failed to start irqsoff tracer\n");
@@ -573,10 +619,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
573 619
574static void irqsoff_tracer_reset(struct trace_array *tr) 620static void irqsoff_tracer_reset(struct trace_array *tr)
575{ 621{
622 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
623 int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
624
576 stop_irqsoff_tracer(tr, is_graph()); 625 stop_irqsoff_tracer(tr, is_graph());
577 626
578 if (!save_lat_flag) 627 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
579 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 628 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
580} 629}
581 630
582static void irqsoff_tracer_start(struct trace_array *tr) 631static void irqsoff_tracer_start(struct trace_array *tr)
@@ -609,6 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly =
609 .print_line = irqsoff_print_line, 658 .print_line = irqsoff_print_line,
610 .flags = &tracer_flags, 659 .flags = &tracer_flags,
611 .set_flag = irqsoff_set_flag, 660 .set_flag = irqsoff_set_flag,
661 .flag_changed = irqsoff_flag_changed,
612#ifdef CONFIG_FTRACE_SELFTEST 662#ifdef CONFIG_FTRACE_SELFTEST
613 .selftest = trace_selftest_startup_irqsoff, 663 .selftest = trace_selftest_startup_irqsoff,
614#endif 664#endif
@@ -642,6 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly =
642 .print_line = irqsoff_print_line, 692 .print_line = irqsoff_print_line,
643 .flags = &tracer_flags, 693 .flags = &tracer_flags,
644 .set_flag = irqsoff_set_flag, 694 .set_flag = irqsoff_set_flag,
695 .flag_changed = irqsoff_flag_changed,
645#ifdef CONFIG_FTRACE_SELFTEST 696#ifdef CONFIG_FTRACE_SELFTEST
646 .selftest = trace_selftest_startup_preemptoff, 697 .selftest = trace_selftest_startup_preemptoff,
647#endif 698#endif
@@ -677,6 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
677 .print_line = irqsoff_print_line, 728 .print_line = irqsoff_print_line,
678 .flags = &tracer_flags, 729 .flags = &tracer_flags,
679 .set_flag = irqsoff_set_flag, 730 .set_flag = irqsoff_set_flag,
731 .flag_changed = irqsoff_flag_changed,
680#ifdef CONFIG_FTRACE_SELFTEST 732#ifdef CONFIG_FTRACE_SELFTEST
681 .selftest = trace_selftest_startup_preemptirqsoff, 733 .selftest = trace_selftest_startup_preemptirqsoff,
682#endif 734#endif
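
The new register_irqsoff_function()/unregister_irqsoff_function() pair and the flag_changed callback above make function-probe registration idempotent and tie it to the TRACE_ITER_FUNCTION flag, so toggling the flag at run time attaches or detaches the probe exactly once (trace_sched_wakeup.c below gets the same treatment). The stand-alone sketch that follows models that enable-once/disable-once flow; every name in it is an illustrative stand-in, not a kernel symbol.

/*
 * Sketch of the enable-once/disable-once pattern introduced by
 * register_irqsoff_function()/unregister_irqsoff_function() above.
 */
#include <stdbool.h>
#include <stdio.h>

#define ITER_FUNCTION 0x1       /* models TRACE_ITER_FUNCTION */

static unsigned int flags;      /* models trace_flags          */
static bool function_enabled;   /* probe currently registered? */

static int register_probe(int graph, int set)
{
	/* 'set' is nonzero when the FUNCTION flag is about to be set */
	if (function_enabled || (!set && !(flags & ITER_FUNCTION)))
		return 0;                       /* nothing to do */

	printf("register %s probe\n", graph ? "graph" : "function");
	function_enabled = true;
	return 0;
}

static void unregister_probe(int graph)
{
	if (!function_enabled)
		return;
	printf("unregister %s probe\n", graph ? "graph" : "function");
	function_enabled = false;
}

/* models the new ->flag_changed callback */
static void flag_changed(unsigned int mask, int set)
{
	if (!(mask & ITER_FUNCTION))
		return;
	if (set)
		register_probe(0, 1);
	else
		unregister_probe(0);
}

int main(void)
{
	flag_changed(ITER_FUNCTION, 1);  /* attaches the probe     */
	flag_changed(ITER_FUNCTION, 1);  /* second call is a no-op */
	flag_changed(ITER_FUNCTION, 0);  /* detaches it again      */
	return 0;
}
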
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 3c5c5dfea0b3..bd90e1b06088 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
26 trace_init_global_iter(&iter); 26 trace_init_global_iter(&iter);
27 27
28 for_each_tracing_cpu(cpu) { 28 for_each_tracing_cpu(cpu) {
29 atomic_inc(&iter.tr->data[cpu]->disabled); 29 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
30 } 30 }
31 31
32 old_userobj = trace_flags; 32 old_userobj = trace_flags;
@@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
43 iter.iter_flags |= TRACE_FILE_LAT_FMT; 43 iter.iter_flags |= TRACE_FILE_LAT_FMT;
44 iter.pos = -1; 44 iter.pos = -1;
45 45
46 if (cpu_file == TRACE_PIPE_ALL_CPU) { 46 if (cpu_file == RING_BUFFER_ALL_CPUS) {
47 for_each_tracing_cpu(cpu) { 47 for_each_tracing_cpu(cpu) {
48 iter.buffer_iter[cpu] = 48 iter.buffer_iter[cpu] =
49 ring_buffer_read_prepare(iter.tr->buffer, cpu); 49 ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
50 ring_buffer_read_start(iter.buffer_iter[cpu]); 50 ring_buffer_read_start(iter.buffer_iter[cpu]);
51 tracing_iter_reset(&iter, cpu); 51 tracing_iter_reset(&iter, cpu);
52 } 52 }
53 } else { 53 } else {
54 iter.cpu_file = cpu_file; 54 iter.cpu_file = cpu_file;
55 iter.buffer_iter[cpu_file] = 55 iter.buffer_iter[cpu_file] =
56 ring_buffer_read_prepare(iter.tr->buffer, cpu_file); 56 ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
57 ring_buffer_read_start(iter.buffer_iter[cpu_file]); 57 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
58 tracing_iter_reset(&iter, cpu_file); 58 tracing_iter_reset(&iter, cpu_file);
59 } 59 }
@@ -83,7 +83,7 @@ out:
83 trace_flags = old_userobj; 83 trace_flags = old_userobj;
84 84
85 for_each_tracing_cpu(cpu) { 85 for_each_tracing_cpu(cpu) {
86 atomic_dec(&iter.tr->data[cpu]->disabled); 86 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
87 } 87 }
88 88
89 for_each_tracing_cpu(cpu) 89 for_each_tracing_cpu(cpu)
@@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv)
115 !cpu_online(cpu_file)) 115 !cpu_online(cpu_file))
116 return KDB_BADINT; 116 return KDB_BADINT;
117 } else { 117 } else {
118 cpu_file = TRACE_PIPE_ALL_CPU; 118 cpu_file = RING_BUFFER_ALL_CPUS;
119 } 119 }
120 120
121 kdb_trap_printk++; 121 kdb_trap_printk++;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fd3c8aae55e5..a5e8f4878bfa 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr)
31 overrun_detected = false; 31 overrun_detected = false;
32 prev_overruns = 0; 32 prev_overruns = 0;
33 33
34 tracing_reset_online_cpus(tr); 34 tracing_reset_online_cpus(&tr->trace_buffer);
35} 35}
36 36
37static int mmio_trace_init(struct trace_array *tr) 37static int mmio_trace_init(struct trace_array *tr)
@@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter)
128static unsigned long count_overruns(struct trace_iterator *iter) 128static unsigned long count_overruns(struct trace_iterator *iter)
129{ 129{
130 unsigned long cnt = atomic_xchg(&dropped_count, 0); 130 unsigned long cnt = atomic_xchg(&dropped_count, 0);
131 unsigned long over = ring_buffer_overruns(iter->tr->buffer); 131 unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);
132 132
133 if (over > prev_overruns) 133 if (over > prev_overruns)
134 cnt += over - prev_overruns; 134 cnt += over - prev_overruns;
@@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
309 struct mmiotrace_rw *rw) 309 struct mmiotrace_rw *rw)
310{ 310{
311 struct ftrace_event_call *call = &event_mmiotrace_rw; 311 struct ftrace_event_call *call = &event_mmiotrace_rw;
312 struct ring_buffer *buffer = tr->buffer; 312 struct ring_buffer *buffer = tr->trace_buffer.buffer;
313 struct ring_buffer_event *event; 313 struct ring_buffer_event *event;
314 struct trace_mmiotrace_rw *entry; 314 struct trace_mmiotrace_rw *entry;
315 int pc = preempt_count(); 315 int pc = preempt_count();
@@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
330void mmio_trace_rw(struct mmiotrace_rw *rw) 330void mmio_trace_rw(struct mmiotrace_rw *rw)
331{ 331{
332 struct trace_array *tr = mmio_trace_array; 332 struct trace_array *tr = mmio_trace_array;
333 struct trace_array_cpu *data = tr->data[smp_processor_id()]; 333 struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
334 __trace_mmiotrace_rw(tr, data, rw); 334 __trace_mmiotrace_rw(tr, data, rw);
335} 335}
336 336
@@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
339 struct mmiotrace_map *map) 339 struct mmiotrace_map *map)
340{ 340{
341 struct ftrace_event_call *call = &event_mmiotrace_map; 341 struct ftrace_event_call *call = &event_mmiotrace_map;
342 struct ring_buffer *buffer = tr->buffer; 342 struct ring_buffer *buffer = tr->trace_buffer.buffer;
343 struct ring_buffer_event *event; 343 struct ring_buffer_event *event;
344 struct trace_mmiotrace_map *entry; 344 struct trace_mmiotrace_map *entry;
345 int pc = preempt_count(); 345 int pc = preempt_count();
@@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
363 struct trace_array_cpu *data; 363 struct trace_array_cpu *data;
364 364
365 preempt_disable(); 365 preempt_disable();
366 data = tr->data[smp_processor_id()]; 366 data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
367 __trace_mmiotrace_map(tr, data, map); 367 __trace_mmiotrace_map(tr, data, map);
368 preempt_enable(); 368 preempt_enable();
369} 369}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 697e88d13907..bb922d9ee51b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
14/* must be a power of 2 */ 14/* must be a power of 2 */
15#define EVENT_HASHSIZE 128 15#define EVENT_HASHSIZE 128
16 16
17DECLARE_RWSEM(trace_event_mutex); 17DECLARE_RWSEM(trace_event_sem);
18 18
19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
20 20
@@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
37 return ret; 37 return ret;
38} 38}
39 39
40enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
41{
42 struct trace_seq *s = &iter->seq;
43 struct trace_entry *entry = iter->ent;
44 struct bputs_entry *field;
45 int ret;
46
47 trace_assign_type(field, entry);
48
49 ret = trace_seq_puts(s, field->str);
50 if (!ret)
51 return TRACE_TYPE_PARTIAL_LINE;
52
53 return TRACE_TYPE_HANDLED;
54}
55
40enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 56enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
41{ 57{
42 struct trace_seq *s = &iter->seq; 58 struct trace_seq *s = &iter->seq;
@@ -397,6 +413,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
397} 413}
398EXPORT_SYMBOL(ftrace_print_hex_seq); 414EXPORT_SYMBOL(ftrace_print_hex_seq);
399 415
416int ftrace_raw_output_prep(struct trace_iterator *iter,
417 struct trace_event *trace_event)
418{
419 struct ftrace_event_call *event;
420 struct trace_seq *s = &iter->seq;
421 struct trace_seq *p = &iter->tmp_seq;
422 struct trace_entry *entry;
423 int ret;
424
425 event = container_of(trace_event, struct ftrace_event_call, event);
426 entry = iter->ent;
427
428 if (entry->type != event->event.type) {
429 WARN_ON_ONCE(1);
430 return TRACE_TYPE_UNHANDLED;
431 }
432
433 trace_seq_init(p);
434 ret = trace_seq_printf(s, "%s: ", event->name);
435 if (!ret)
436 return TRACE_TYPE_PARTIAL_LINE;
437
438 return 0;
439}
440EXPORT_SYMBOL(ftrace_raw_output_prep);
441
400#ifdef CONFIG_KRETPROBES 442#ifdef CONFIG_KRETPROBES
401static inline const char *kretprobed(const char *name) 443static inline const char *kretprobed(const char *name)
402{ 444{
@@ -617,7 +659,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
617{ 659{
618 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; 660 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
619 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; 661 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
620 unsigned long long abs_ts = iter->ts - iter->tr->time_start; 662 unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;
621 unsigned long long rel_ts = next_ts - iter->ts; 663 unsigned long long rel_ts = next_ts - iter->ts;
622 struct trace_seq *s = &iter->seq; 664 struct trace_seq *s = &iter->seq;
623 665
@@ -783,12 +825,12 @@ static int trace_search_list(struct list_head **list)
783 825
784void trace_event_read_lock(void) 826void trace_event_read_lock(void)
785{ 827{
786 down_read(&trace_event_mutex); 828 down_read(&trace_event_sem);
787} 829}
788 830
789void trace_event_read_unlock(void) 831void trace_event_read_unlock(void)
790{ 832{
791 up_read(&trace_event_mutex); 833 up_read(&trace_event_sem);
792} 834}
793 835
794/** 836/**
@@ -811,7 +853,7 @@ int register_ftrace_event(struct trace_event *event)
811 unsigned key; 853 unsigned key;
812 int ret = 0; 854 int ret = 0;
813 855
814 down_write(&trace_event_mutex); 856 down_write(&trace_event_sem);
815 857
816 if (WARN_ON(!event)) 858 if (WARN_ON(!event))
817 goto out; 859 goto out;
@@ -866,14 +908,14 @@ int register_ftrace_event(struct trace_event *event)
866 908
867 ret = event->type; 909 ret = event->type;
868 out: 910 out:
869 up_write(&trace_event_mutex); 911 up_write(&trace_event_sem);
870 912
871 return ret; 913 return ret;
872} 914}
873EXPORT_SYMBOL_GPL(register_ftrace_event); 915EXPORT_SYMBOL_GPL(register_ftrace_event);
874 916
875/* 917/*
876 * Used by module code with the trace_event_mutex held for write. 918 * Used by module code with the trace_event_sem held for write.
877 */ 919 */
878int __unregister_ftrace_event(struct trace_event *event) 920int __unregister_ftrace_event(struct trace_event *event)
879{ 921{
@@ -888,9 +930,9 @@ int __unregister_ftrace_event(struct trace_event *event)
888 */ 930 */
889int unregister_ftrace_event(struct trace_event *event) 931int unregister_ftrace_event(struct trace_event *event)
890{ 932{
891 down_write(&trace_event_mutex); 933 down_write(&trace_event_sem);
892 __unregister_ftrace_event(event); 934 __unregister_ftrace_event(event);
893 up_write(&trace_event_mutex); 935 up_write(&trace_event_sem);
894 936
895 return 0; 937 return 0;
896} 938}
@@ -1217,6 +1259,64 @@ static struct trace_event trace_user_stack_event = {
1217 .funcs = &trace_user_stack_funcs, 1259 .funcs = &trace_user_stack_funcs,
1218}; 1260};
1219 1261
1262/* TRACE_BPUTS */
1263static enum print_line_t
1264trace_bputs_print(struct trace_iterator *iter, int flags,
1265 struct trace_event *event)
1266{
1267 struct trace_entry *entry = iter->ent;
1268 struct trace_seq *s = &iter->seq;
1269 struct bputs_entry *field;
1270
1271 trace_assign_type(field, entry);
1272
1273 if (!seq_print_ip_sym(s, field->ip, flags))
1274 goto partial;
1275
1276 if (!trace_seq_puts(s, ": "))
1277 goto partial;
1278
1279 if (!trace_seq_puts(s, field->str))
1280 goto partial;
1281
1282 return TRACE_TYPE_HANDLED;
1283
1284 partial:
1285 return TRACE_TYPE_PARTIAL_LINE;
1286}
1287
1288
1289static enum print_line_t
1290trace_bputs_raw(struct trace_iterator *iter, int flags,
1291 struct trace_event *event)
1292{
1293 struct bputs_entry *field;
1294 struct trace_seq *s = &iter->seq;
1295
1296 trace_assign_type(field, iter->ent);
1297
1298 if (!trace_seq_printf(s, ": %lx : ", field->ip))
1299 goto partial;
1300
1301 if (!trace_seq_puts(s, field->str))
1302 goto partial;
1303
1304 return TRACE_TYPE_HANDLED;
1305
1306 partial:
1307 return TRACE_TYPE_PARTIAL_LINE;
1308}
1309
1310static struct trace_event_functions trace_bputs_funcs = {
1311 .trace = trace_bputs_print,
1312 .raw = trace_bputs_raw,
1313};
1314
1315static struct trace_event trace_bputs_event = {
1316 .type = TRACE_BPUTS,
1317 .funcs = &trace_bputs_funcs,
1318};
1319
1220/* TRACE_BPRINT */ 1320/* TRACE_BPRINT */
1221static enum print_line_t 1321static enum print_line_t
1222trace_bprint_print(struct trace_iterator *iter, int flags, 1322trace_bprint_print(struct trace_iterator *iter, int flags,
@@ -1329,6 +1429,7 @@ static struct trace_event *events[] __initdata = {
1329 &trace_wake_event, 1429 &trace_wake_event,
1330 &trace_stack_event, 1430 &trace_stack_event,
1331 &trace_user_stack_event, 1431 &trace_user_stack_event,
1432 &trace_bputs_event,
1332 &trace_bprint_event, 1433 &trace_bprint_event,
1333 &trace_print_event, 1434 &trace_print_event,
1334 NULL 1435 NULL
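
The TRACE_BPUTS support above adds a struct trace_event whose trace_event_functions supply a human-readable printer and a raw printer, then registers it through the static events[] table. The sketch below is a minimal user-space model of that event-type/printer-table pairing; all types, callbacks, and the type number are invented for illustration and are not the kernel's.

/*
 * Minimal model of the trace_event / trace_event_functions pairing
 * used by the new TRACE_BPUTS entry above: one event type, a table of
 * printers, and a caller that picks .trace or .raw output.
 */
#include <stdio.h>

struct entry {                  /* stands in for struct bputs_entry */
	unsigned long ip;
	const char *str;
};

struct event_functions {
	int (*trace)(const struct entry *e);
	int (*raw)(const struct entry *e);
};

struct event {
	int type;
	const struct event_functions *funcs;
};

static int bputs_trace(const struct entry *e)
{
	return printf("%#lx: %s", e->ip, e->str);   /* readable line */
}

static int bputs_raw(const struct entry *e)
{
	return printf(": %lx : %s", e->ip, e->str); /* raw line */
}

static const struct event_functions bputs_funcs = {
	.trace = bputs_trace,
	.raw   = bputs_raw,
};

static const struct event bputs_event = { .type = 42, .funcs = &bputs_funcs };

int main(void)
{
	struct entry e = { .ip = 0xdeadbeefUL, .str = "hello trace\n" };

	bputs_event.funcs->trace(&e);
	bputs_event.funcs->raw(&e);
	return 0;
}
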
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index c038eba0492b..127a9d8c8357 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -5,6 +5,8 @@
5#include "trace.h" 5#include "trace.h"
6 6
7extern enum print_line_t 7extern enum print_line_t
8trace_print_bputs_msg_only(struct trace_iterator *iter);
9extern enum print_line_t
8trace_print_bprintk_msg_only(struct trace_iterator *iter); 10trace_print_bprintk_msg_only(struct trace_iterator *iter);
9extern enum print_line_t 11extern enum print_line_t
10trace_print_printk_msg_only(struct trace_iterator *iter); 12trace_print_printk_msg_only(struct trace_iterator *iter);
@@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
31 33
32/* used by module unregistering */ 34/* used by module unregistering */
33extern int __unregister_ftrace_event(struct trace_event *event); 35extern int __unregister_ftrace_event(struct trace_event *event);
34extern struct rw_semaphore trace_event_mutex; 36extern struct rw_semaphore trace_event_sem;
35 37
36#define MAX_MEMHEX_BYTES 8 38#define MAX_MEMHEX_BYTES 8
37#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) 39#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3374c792ccd8..4e98e3b257a3 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
28 unsigned long flags, int pc) 28 unsigned long flags, int pc)
29{ 29{
30 struct ftrace_event_call *call = &event_context_switch; 30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->buffer; 31 struct ring_buffer *buffer = tr->trace_buffer.buffer;
32 struct ring_buffer_event *event; 32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry; 33 struct ctx_switch_entry *entry;
34 34
@@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
69 pc = preempt_count(); 69 pc = preempt_count();
70 local_irq_save(flags); 70 local_irq_save(flags);
71 cpu = raw_smp_processor_id(); 71 cpu = raw_smp_processor_id();
72 data = ctx_trace->data[cpu]; 72 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
73 73
74 if (likely(!atomic_read(&data->disabled))) 74 if (likely(!atomic_read(&data->disabled)))
75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); 75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
@@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
86 struct ftrace_event_call *call = &event_wakeup; 86 struct ftrace_event_call *call = &event_wakeup;
87 struct ring_buffer_event *event; 87 struct ring_buffer_event *event;
88 struct ctx_switch_entry *entry; 88 struct ctx_switch_entry *entry;
89 struct ring_buffer *buffer = tr->buffer; 89 struct ring_buffer *buffer = tr->trace_buffer.buffer;
90 90
91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, 91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
92 sizeof(*entry), flags, pc); 92 sizeof(*entry), flags, pc);
@@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
123 pc = preempt_count(); 123 pc = preempt_count();
124 local_irq_save(flags); 124 local_irq_save(flags);
125 cpu = raw_smp_processor_id(); 125 cpu = raw_smp_processor_id();
126 data = ctx_trace->data[cpu]; 126 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
127 127
128 if (likely(!atomic_read(&data->disabled))) 128 if (likely(!atomic_read(&data->disabled)))
129 tracing_sched_wakeup_trace(ctx_trace, wakee, current, 129 tracing_sched_wakeup_trace(ctx_trace, wakee, current,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 75aa97fbe1a1..fee77e15d815 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -36,7 +36,8 @@ static void __wakeup_reset(struct trace_array *tr);
36static int wakeup_graph_entry(struct ftrace_graph_ent *trace); 36static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace); 37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
38 38
39static int save_lat_flag; 39static int save_flags;
40static bool function_enabled;
40 41
41#define TRACE_DISPLAY_GRAPH 1 42#define TRACE_DISPLAY_GRAPH 1
42 43
@@ -89,7 +90,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
89 if (cpu != wakeup_current_cpu) 90 if (cpu != wakeup_current_cpu)
90 goto out_enable; 91 goto out_enable;
91 92
92 *data = tr->data[cpu]; 93 *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
93 disabled = atomic_inc_return(&(*data)->disabled); 94 disabled = atomic_inc_return(&(*data)->disabled);
94 if (unlikely(disabled != 1)) 95 if (unlikely(disabled != 1))
95 goto out; 96 goto out;
@@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly =
134}; 135};
135#endif /* CONFIG_FUNCTION_TRACER */ 136#endif /* CONFIG_FUNCTION_TRACER */
136 137
137static int start_func_tracer(int graph) 138static int register_wakeup_function(int graph, int set)
138{ 139{
139 int ret; 140 int ret;
140 141
141 if (!graph) 142 /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
142 ret = register_ftrace_function(&trace_ops); 143 if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
143 else 144 return 0;
145
146 if (graph)
144 ret = register_ftrace_graph(&wakeup_graph_return, 147 ret = register_ftrace_graph(&wakeup_graph_return,
145 &wakeup_graph_entry); 148 &wakeup_graph_entry);
149 else
150 ret = register_ftrace_function(&trace_ops);
151
152 if (!ret)
153 function_enabled = true;
154
155 return ret;
156}
157
158static void unregister_wakeup_function(int graph)
159{
160 if (!function_enabled)
161 return;
162
163 if (graph)
164 unregister_ftrace_graph();
165 else
166 unregister_ftrace_function(&trace_ops);
167
168 function_enabled = false;
169}
170
171static void wakeup_function_set(int set)
172{
173 if (set)
174 register_wakeup_function(is_graph(), 1);
175 else
176 unregister_wakeup_function(is_graph());
177}
178
179static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set)
180{
181 if (mask & TRACE_ITER_FUNCTION)
182 wakeup_function_set(set);
183
184 return trace_keep_overwrite(tracer, mask, set);
185}
186
187static int start_func_tracer(int graph)
188{
189 int ret;
190
191 ret = register_wakeup_function(graph, 0);
146 192
147 if (!ret && tracing_is_enabled()) 193 if (!ret && tracing_is_enabled())
148 tracer_enabled = 1; 194 tracer_enabled = 1;
@@ -156,10 +202,7 @@ static void stop_func_tracer(int graph)
156{ 202{
157 tracer_enabled = 0; 203 tracer_enabled = 0;
158 204
159 if (!graph) 205 unregister_wakeup_function(graph);
160 unregister_ftrace_function(&trace_ops);
161 else
162 unregister_ftrace_graph();
163} 206}
164 207
165#ifdef CONFIG_FUNCTION_GRAPH_TRACER 208#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -353,7 +396,7 @@ probe_wakeup_sched_switch(void *ignore,
353 396
354 /* disable local data, not wakeup_cpu data */ 397 /* disable local data, not wakeup_cpu data */
355 cpu = raw_smp_processor_id(); 398 cpu = raw_smp_processor_id();
356 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 399 disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
357 if (likely(disabled != 1)) 400 if (likely(disabled != 1))
358 goto out; 401 goto out;
359 402
@@ -365,7 +408,7 @@ probe_wakeup_sched_switch(void *ignore,
365 goto out_unlock; 408 goto out_unlock;
366 409
367 /* The task we are waiting for is waking up */ 410 /* The task we are waiting for is waking up */
368 data = wakeup_trace->data[wakeup_cpu]; 411 data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
369 412
370 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 413 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
371 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 414 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
@@ -387,7 +430,7 @@ out_unlock:
387 arch_spin_unlock(&wakeup_lock); 430 arch_spin_unlock(&wakeup_lock);
388 local_irq_restore(flags); 431 local_irq_restore(flags);
389out: 432out:
390 atomic_dec(&wakeup_trace->data[cpu]->disabled); 433 atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
391} 434}
392 435
393static void __wakeup_reset(struct trace_array *tr) 436static void __wakeup_reset(struct trace_array *tr)
@@ -405,7 +448,7 @@ static void wakeup_reset(struct trace_array *tr)
405{ 448{
406 unsigned long flags; 449 unsigned long flags;
407 450
408 tracing_reset_online_cpus(tr); 451 tracing_reset_online_cpus(&tr->trace_buffer);
409 452
410 local_irq_save(flags); 453 local_irq_save(flags);
411 arch_spin_lock(&wakeup_lock); 454 arch_spin_lock(&wakeup_lock);
@@ -435,7 +478,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
435 return; 478 return;
436 479
437 pc = preempt_count(); 480 pc = preempt_count();
438 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 481 disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
439 if (unlikely(disabled != 1)) 482 if (unlikely(disabled != 1))
440 goto out; 483 goto out;
441 484
@@ -458,7 +501,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
458 501
459 local_save_flags(flags); 502 local_save_flags(flags);
460 503
461 data = wakeup_trace->data[wakeup_cpu]; 504 data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
462 data->preempt_timestamp = ftrace_now(cpu); 505 data->preempt_timestamp = ftrace_now(cpu);
463 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); 506 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
464 507
@@ -472,7 +515,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
472out_locked: 515out_locked:
473 arch_spin_unlock(&wakeup_lock); 516 arch_spin_unlock(&wakeup_lock);
474out: 517out:
475 atomic_dec(&wakeup_trace->data[cpu]->disabled); 518 atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
476} 519}
477 520
478static void start_wakeup_tracer(struct trace_array *tr) 521static void start_wakeup_tracer(struct trace_array *tr)
@@ -540,8 +583,11 @@ static void stop_wakeup_tracer(struct trace_array *tr)
540 583
541static int __wakeup_tracer_init(struct trace_array *tr) 584static int __wakeup_tracer_init(struct trace_array *tr)
542{ 585{
543 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; 586 save_flags = trace_flags;
544 trace_flags |= TRACE_ITER_LATENCY_FMT; 587
588 /* non overwrite screws up the latency tracers */
589 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
590 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
545 591
546 tracing_max_latency = 0; 592 tracing_max_latency = 0;
547 wakeup_trace = tr; 593 wakeup_trace = tr;
@@ -563,12 +609,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
563 609
564static void wakeup_tracer_reset(struct trace_array *tr) 610static void wakeup_tracer_reset(struct trace_array *tr)
565{ 611{
612 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
613 int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
614
566 stop_wakeup_tracer(tr); 615 stop_wakeup_tracer(tr);
567 /* make sure we put back any tasks we are tracing */ 616 /* make sure we put back any tasks we are tracing */
568 wakeup_reset(tr); 617 wakeup_reset(tr);
569 618
570 if (!save_lat_flag) 619 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
571 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 620 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
572} 621}
573 622
574static void wakeup_tracer_start(struct trace_array *tr) 623static void wakeup_tracer_start(struct trace_array *tr)
@@ -594,6 +643,7 @@ static struct tracer wakeup_tracer __read_mostly =
594 .print_line = wakeup_print_line, 643 .print_line = wakeup_print_line,
595 .flags = &tracer_flags, 644 .flags = &tracer_flags,
596 .set_flag = wakeup_set_flag, 645 .set_flag = wakeup_set_flag,
646 .flag_changed = wakeup_flag_changed,
597#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
598 .selftest = trace_selftest_startup_wakeup, 648 .selftest = trace_selftest_startup_wakeup,
599#endif 649#endif
@@ -615,6 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
615 .print_line = wakeup_print_line, 665 .print_line = wakeup_print_line,
616 .flags = &tracer_flags, 666 .flags = &tracer_flags,
617 .set_flag = wakeup_set_flag, 667 .set_flag = wakeup_set_flag,
668 .flag_changed = wakeup_flag_changed,
618#ifdef CONFIG_FTRACE_SELFTEST 669#ifdef CONFIG_FTRACE_SELFTEST
619 .selftest = trace_selftest_startup_wakeup, 670 .selftest = trace_selftest_startup_wakeup,
620#endif 671#endif
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 51c819c12c29..55e2cf66967b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)
21 return 0; 21 return 0;
22} 22}
23 23
24static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) 24static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
25{ 25{
26 struct ring_buffer_event *event; 26 struct ring_buffer_event *event;
27 struct trace_entry *entry; 27 struct trace_entry *entry;
28 unsigned int loops = 0; 28 unsigned int loops = 0;
29 29
30 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { 30 while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {
31 entry = ring_buffer_event_data(event); 31 entry = ring_buffer_event_data(event);
32 32
33 /* 33 /*
@@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
58 * Test the trace buffer to see if all the elements 58 * Test the trace buffer to see if all the elements
59 * are still sane. 59 * are still sane.
60 */ 60 */
61static int trace_test_buffer(struct trace_array *tr, unsigned long *count) 61static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
62{ 62{
63 unsigned long flags, cnt = 0; 63 unsigned long flags, cnt = 0;
64 int cpu, ret = 0; 64 int cpu, ret = 0;
@@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
67 local_irq_save(flags); 67 local_irq_save(flags);
68 arch_spin_lock(&ftrace_max_lock); 68 arch_spin_lock(&ftrace_max_lock);
69 69
70 cnt = ring_buffer_entries(tr->buffer); 70 cnt = ring_buffer_entries(buf->buffer);
71 71
72 /* 72 /*
73 * The trace_test_buffer_cpu runs a while loop to consume all data. 73 * The trace_test_buffer_cpu runs a while loop to consume all data.
@@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
78 */ 78 */
79 tracing_off(); 79 tracing_off();
80 for_each_possible_cpu(cpu) { 80 for_each_possible_cpu(cpu) {
81 ret = trace_test_buffer_cpu(tr, cpu); 81 ret = trace_test_buffer_cpu(buf, cpu);
82 if (ret) 82 if (ret)
83 break; 83 break;
84 } 84 }
@@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
355 msleep(100); 355 msleep(100);
356 356
357 /* we should have nothing in the buffer */ 357 /* we should have nothing in the buffer */
358 ret = trace_test_buffer(tr, &count); 358 ret = trace_test_buffer(&tr->trace_buffer, &count);
359 if (ret) 359 if (ret)
360 goto out; 360 goto out;
361 361
@@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
376 ftrace_enabled = 0; 376 ftrace_enabled = 0;
377 377
378 /* check the trace buffer */ 378 /* check the trace buffer */
379 ret = trace_test_buffer(tr, &count); 379 ret = trace_test_buffer(&tr->trace_buffer, &count);
380 tracing_start(); 380 tracing_start();
381 381
382 /* we should only have one item */ 382 /* we should only have one item */
@@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
666 ftrace_enabled = 0; 666 ftrace_enabled = 0;
667 667
668 /* check the trace buffer */ 668 /* check the trace buffer */
669 ret = trace_test_buffer(tr, &count); 669 ret = trace_test_buffer(&tr->trace_buffer, &count);
670 trace->reset(tr); 670 trace->reset(tr);
671 tracing_start(); 671 tracing_start();
672 672
@@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
703/* Maximum number of functions to trace before diagnosing a hang */ 703/* Maximum number of functions to trace before diagnosing a hang */
704#define GRAPH_MAX_FUNC_TEST 100000000 704#define GRAPH_MAX_FUNC_TEST 100000000
705 705
706static void
707__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
708static unsigned int graph_hang_thresh; 706static unsigned int graph_hang_thresh;
709 707
710/* Wrap the real function entry probe to avoid possible hanging */ 708/* Wrap the real function entry probe to avoid possible hanging */
@@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
714 if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { 712 if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
715 ftrace_graph_stop(); 713 ftrace_graph_stop();
716 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 714 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
717 if (ftrace_dump_on_oops) 715 if (ftrace_dump_on_oops) {
718 __ftrace_dump(false, DUMP_ALL); 716 ftrace_dump(DUMP_ALL);
717 /* ftrace_dump() disables tracing */
718 tracing_on();
719 }
719 return 0; 720 return 0;
720 } 721 }
721 722
@@ -737,7 +738,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
737 * Simulate the init() callback but we attach a watchdog callback 738 * Simulate the init() callback but we attach a watchdog callback
738 * to detect and recover from possible hangs 739 * to detect and recover from possible hangs
739 */ 740 */
740 tracing_reset_online_cpus(tr); 741 tracing_reset_online_cpus(&tr->trace_buffer);
741 set_graph_array(tr); 742 set_graph_array(tr);
742 ret = register_ftrace_graph(&trace_graph_return, 743 ret = register_ftrace_graph(&trace_graph_return,
743 &trace_graph_entry_watchdog); 744 &trace_graph_entry_watchdog);
@@ -760,7 +761,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
760 tracing_stop(); 761 tracing_stop();
761 762
762 /* check the trace buffer */ 763 /* check the trace buffer */
763 ret = trace_test_buffer(tr, &count); 764 ret = trace_test_buffer(&tr->trace_buffer, &count);
764 765
765 trace->reset(tr); 766 trace->reset(tr);
766 tracing_start(); 767 tracing_start();
@@ -815,9 +816,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
815 /* stop the tracing. */ 816 /* stop the tracing. */
816 tracing_stop(); 817 tracing_stop();
817 /* check both trace buffers */ 818 /* check both trace buffers */
818 ret = trace_test_buffer(tr, NULL); 819 ret = trace_test_buffer(&tr->trace_buffer, NULL);
819 if (!ret) 820 if (!ret)
820 ret = trace_test_buffer(&max_tr, &count); 821 ret = trace_test_buffer(&tr->max_buffer, &count);
821 trace->reset(tr); 822 trace->reset(tr);
822 tracing_start(); 823 tracing_start();
823 824
@@ -877,9 +878,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
877 /* stop the tracing. */ 878 /* stop the tracing. */
878 tracing_stop(); 879 tracing_stop();
879 /* check both trace buffers */ 880 /* check both trace buffers */
880 ret = trace_test_buffer(tr, NULL); 881 ret = trace_test_buffer(&tr->trace_buffer, NULL);
881 if (!ret) 882 if (!ret)
882 ret = trace_test_buffer(&max_tr, &count); 883 ret = trace_test_buffer(&tr->max_buffer, &count);
883 trace->reset(tr); 884 trace->reset(tr);
884 tracing_start(); 885 tracing_start();
885 886
@@ -943,11 +944,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
943 /* stop the tracing. */ 944 /* stop the tracing. */
944 tracing_stop(); 945 tracing_stop();
945 /* check both trace buffers */ 946 /* check both trace buffers */
946 ret = trace_test_buffer(tr, NULL); 947 ret = trace_test_buffer(&tr->trace_buffer, NULL);
947 if (ret) 948 if (ret)
948 goto out; 949 goto out;
949 950
950 ret = trace_test_buffer(&max_tr, &count); 951 ret = trace_test_buffer(&tr->max_buffer, &count);
951 if (ret) 952 if (ret)
952 goto out; 953 goto out;
953 954
@@ -973,11 +974,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
973 /* stop the tracing. */ 974 /* stop the tracing. */
974 tracing_stop(); 975 tracing_stop();
975 /* check both trace buffers */ 976 /* check both trace buffers */
976 ret = trace_test_buffer(tr, NULL); 977 ret = trace_test_buffer(&tr->trace_buffer, NULL);
977 if (ret) 978 if (ret)
978 goto out; 979 goto out;
979 980
980 ret = trace_test_buffer(&max_tr, &count); 981 ret = trace_test_buffer(&tr->max_buffer, &count);
981 982
982 if (!ret && !count) { 983 if (!ret && !count) {
983 printk(KERN_CONT ".. no entries found .."); 984 printk(KERN_CONT ".. no entries found ..");
@@ -1084,10 +1085,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1084 /* stop the tracing. */ 1085 /* stop the tracing. */
1085 tracing_stop(); 1086 tracing_stop();
1086 /* check both trace buffers */ 1087 /* check both trace buffers */
1087 ret = trace_test_buffer(tr, NULL); 1088 ret = trace_test_buffer(&tr->trace_buffer, NULL);
1088 printk("ret = %d\n", ret); 1089 printk("ret = %d\n", ret);
1089 if (!ret) 1090 if (!ret)
1090 ret = trace_test_buffer(&max_tr, &count); 1091 ret = trace_test_buffer(&tr->max_buffer, &count);
1091 1092
1092 1093
1093 trace->reset(tr); 1094 trace->reset(tr);
@@ -1126,7 +1127,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
1126 /* stop the tracing. */ 1127 /* stop the tracing. */
1127 tracing_stop(); 1128 tracing_stop();
1128 /* check the trace buffer */ 1129 /* check the trace buffer */
1129 ret = trace_test_buffer(tr, &count); 1130 ret = trace_test_buffer(&tr->trace_buffer, &count);
1130 trace->reset(tr); 1131 trace->reset(tr);
1131 tracing_start(); 1132 tracing_start();
1132 1133
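
Passing a struct trace_buffer into trace_test_buffer() (instead of the whole trace_array) lets the selftests above run the same checker over both the live buffer and the max/snapshot buffer. The sketch below models that refactoring with simplified stand-in types; the names and counts are illustrative only.

/*
 * Why the checker now takes the buffer rather than the trace array:
 * the same routine can be pointed at either tr->trace_buffer or
 * tr->max_buffer.
 */
#include <stdio.h>

struct buf {
	const char *name;
	unsigned long entries;
};

struct array {
	struct buf trace_buffer;    /* live buffer      */
	struct buf max_buffer;      /* latency snapshot */
};

/* old form took the array; now any buffer can be handed in */
static int test_buffer(const struct buf *b, unsigned long *count)
{
	if (count)
		*count = b->entries;
	printf("checked %s: %lu entries\n", b->name, b->entries);
	return 0;
}

int main(void)
{
	struct array tr = {
		.trace_buffer = { "trace_buffer", 120 },
		.max_buffer   = { "max_buffer",   7   },
	};
	unsigned long count;

	test_buffer(&tr.trace_buffer, NULL);
	test_buffer(&tr.max_buffer, &count);
	printf("max buffer count=%lu\n", count);
	return 0;
}
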
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 42ca822fc701..b20428c5efe2 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -20,13 +20,24 @@
20 20
21#define STACK_TRACE_ENTRIES 500 21#define STACK_TRACE_ENTRIES 500
22 22
23#ifdef CC_USING_FENTRY
24# define fentry 1
25#else
26# define fentry 0
27#endif
28
23static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = 29static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
24 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; 30 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
25static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; 31static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
26 32
33/*
34 * Reserve one entry for the passed in ip. This will allow
35 * us to remove most or all of the stack size overhead
36 * added by the stack tracer itself.
37 */
27static struct stack_trace max_stack_trace = { 38static struct stack_trace max_stack_trace = {
28 .max_entries = STACK_TRACE_ENTRIES, 39 .max_entries = STACK_TRACE_ENTRIES - 1,
29 .entries = stack_dump_trace, 40 .entries = &stack_dump_trace[1],
30}; 41};
31 42
32static unsigned long max_stack_size; 43static unsigned long max_stack_size;
@@ -39,25 +50,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
39int stack_tracer_enabled; 50int stack_tracer_enabled;
40static int last_stack_tracer_enabled; 51static int last_stack_tracer_enabled;
41 52
42static inline void check_stack(void) 53static inline void
54check_stack(unsigned long ip, unsigned long *stack)
43{ 55{
44 unsigned long this_size, flags; 56 unsigned long this_size, flags;
45 unsigned long *p, *top, *start; 57 unsigned long *p, *top, *start;
58 static int tracer_frame;
59 int frame_size = ACCESS_ONCE(tracer_frame);
46 int i; 60 int i;
47 61
48 this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); 62 this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
49 this_size = THREAD_SIZE - this_size; 63 this_size = THREAD_SIZE - this_size;
64 /* Remove the frame of the tracer */
65 this_size -= frame_size;
50 66
51 if (this_size <= max_stack_size) 67 if (this_size <= max_stack_size)
52 return; 68 return;
53 69
54 /* we do not handle interrupt stacks yet */ 70 /* we do not handle interrupt stacks yet */
55 if (!object_is_on_stack(&this_size)) 71 if (!object_is_on_stack(stack))
56 return; 72 return;
57 73
58 local_irq_save(flags); 74 local_irq_save(flags);
59 arch_spin_lock(&max_stack_lock); 75 arch_spin_lock(&max_stack_lock);
60 76
77 /* In case another CPU set the tracer_frame on us */
78 if (unlikely(!frame_size))
79 this_size -= tracer_frame;
80
61 /* a race could have already updated it */ 81 /* a race could have already updated it */
62 if (this_size <= max_stack_size) 82 if (this_size <= max_stack_size)
63 goto out; 83 goto out;
@@ -70,10 +90,18 @@ static inline void check_stack(void)
70 save_stack_trace(&max_stack_trace); 90 save_stack_trace(&max_stack_trace);
71 91
72 /* 92 /*
93 * Add the passed in ip from the function tracer.
94 * Searching for this on the stack will skip over
95 * most of the overhead from the stack tracer itself.
96 */
97 stack_dump_trace[0] = ip;
98 max_stack_trace.nr_entries++;
99
100 /*
73 * Now find where in the stack these are. 101 * Now find where in the stack these are.
74 */ 102 */
75 i = 0; 103 i = 0;
76 start = &this_size; 104 start = stack;
77 top = (unsigned long *) 105 top = (unsigned long *)
78 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); 106 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
79 107
@@ -97,6 +125,18 @@ static inline void check_stack(void)
97 found = 1; 125 found = 1;
98 /* Start the search from here */ 126 /* Start the search from here */
99 start = p + 1; 127 start = p + 1;
128 /*
129 * We do not want to show the overhead
130 * of the stack tracer stack in the
131 * max stack. If we haven't figured
132 * out what that is, then figure it out
133 * now.
134 */
135 if (unlikely(!tracer_frame) && i == 1) {
136 tracer_frame = (p - stack) *
137 sizeof(unsigned long);
138 max_stack_size -= tracer_frame;
139 }
100 } 140 }
101 } 141 }
102 142
@@ -113,6 +153,7 @@ static void
113stack_trace_call(unsigned long ip, unsigned long parent_ip, 153stack_trace_call(unsigned long ip, unsigned long parent_ip,
114 struct ftrace_ops *op, struct pt_regs *pt_regs) 154 struct ftrace_ops *op, struct pt_regs *pt_regs)
115{ 155{
156 unsigned long stack;
116 int cpu; 157 int cpu;
117 158
118 preempt_disable_notrace(); 159 preempt_disable_notrace();
@@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
122 if (per_cpu(trace_active, cpu)++ != 0) 163 if (per_cpu(trace_active, cpu)++ != 0)
123 goto out; 164 goto out;
124 165
125 check_stack(); 166 /*
167 * When fentry is used, the traced function does not get
168 * its stack frame set up, and we lose the parent.
169 * The ip is pretty useless because the function tracer
170 * was called before that function set up its stack frame.
171 * In this case, we use the parent ip.
172 *
173 * By adding the return address of either the parent ip
174 * or the current ip we can disregard most of the stack usage
175 * caused by the stack tracer itself.
176 *
177 * The function tracer always reports the address of where the
178 * mcount call was, but the stack will hold the return address.
179 */
180 if (fentry)
181 ip = parent_ip;
182 else
183 ip += MCOUNT_INSN_SIZE;
184
185 check_stack(ip, &stack);
126 186
127 out: 187 out:
128 per_cpu(trace_active, cpu)--; 188 per_cpu(trace_active, cpu)--;
@@ -322,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = {
322 .open = stack_trace_filter_open, 382 .open = stack_trace_filter_open,
323 .read = seq_read, 383 .read = seq_read,
324 .write = ftrace_filter_write, 384 .write = ftrace_filter_write,
325 .llseek = ftrace_regex_lseek, 385 .llseek = ftrace_filter_lseek,
326 .release = ftrace_regex_release, 386 .release = ftrace_regex_release,
327}; 387};
328 388
@@ -371,6 +431,8 @@ static __init int stack_trace_init(void)
371 struct dentry *d_tracer; 431 struct dentry *d_tracer;
372 432
373 d_tracer = tracing_init_dentry(); 433 d_tracer = tracing_init_dentry();
434 if (!d_tracer)
435 return 0;
374 436
375 trace_create_file("stack_max_size", 0644, d_tracer, 437 trace_create_file("stack_max_size", 0644, d_tracer,
376 &max_stack_size, &stack_max_size_fops); 438 &max_stack_size, &stack_max_size_fops);
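
The check_stack() changes above compute stack usage as THREAD_SIZE minus the stack pointer's offset within its THREAD_SIZE-aligned region, then subtract the stack tracer's own frame once its size has been learned from the first in-stack match (with the traced ip, or the parent ip under fentry, seeded as entry 0). The stand-alone sketch below reproduces only that arithmetic, with made-up values for THREAD_SIZE, the stack pointer, and the tracer frame; it is not the kernel implementation.

/*
 * Model of the stack-depth arithmetic in check_stack(): the stack
 * grows down inside a THREAD_SIZE-aligned region, so bytes in use are
 * THREAD_SIZE minus the stack pointer's offset in that region, less
 * the tracer's own frame once it has been measured.
 */
#include <stdio.h>

#define THREAD_SIZE 8192UL      /* assumed thread-stack size */

static unsigned long tracer_frame;   /* learned on first hit */

static unsigned long stack_in_use(unsigned long sp)
{
	unsigned long this_size = sp & (THREAD_SIZE - 1); /* offset in region */

	this_size = THREAD_SIZE - this_size;  /* bytes in use below the top */
	this_size -= tracer_frame;            /* drop the tracer's overhead */
	return this_size;
}

int main(void)
{
	unsigned long sp = 0xffffc90000407e40UL;  /* pretend stack pointer */

	printf("raw usage:      %lu bytes\n", stack_in_use(sp));

	/* pretend the first in-stack match showed 3 words of tracer frame */
	tracer_frame = 3 * sizeof(unsigned long);
	printf("adjusted usage: %lu bytes\n", stack_in_use(sp));
	return 0;
}
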
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 96cffb269e73..847f88a6194b 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -307,6 +307,8 @@ static int tracing_stat_init(void)
307 struct dentry *d_tracing; 307 struct dentry *d_tracing;
308 308
309 d_tracing = tracing_init_dentry(); 309 d_tracing = tracing_init_dentry();
310 if (!d_tracing)
311 return 0;
310 312
311 stat_dir = debugfs_create_dir("trace_stat", d_tracing); 313 stat_dir = debugfs_create_dir("trace_stat", d_tracing);
312 if (!stat_dir) 314 if (!stat_dir)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7a809e321058..8f2ac73c7a5f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -12,10 +12,6 @@
12#include "trace.h" 12#include "trace.h"
13 13
14static DEFINE_MUTEX(syscall_trace_lock); 14static DEFINE_MUTEX(syscall_trace_lock);
15static int sys_refcount_enter;
16static int sys_refcount_exit;
17static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
18static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
19 15
20static int syscall_enter_register(struct ftrace_event_call *event, 16static int syscall_enter_register(struct ftrace_event_call *event,
21 enum trace_reg type, void *data); 17 enum trace_reg type, void *data);
@@ -41,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
41 /* 37 /*
42 * Only compare after the "sys" prefix. Archs that use 38 * Only compare after the "sys" prefix. Archs that use
43 * syscall wrappers may have syscalls symbols aliases prefixed 39 * syscall wrappers may have syscalls symbols aliases prefixed
44 * with "SyS" instead of "sys", leading to an unwanted 40 * with ".SyS" or ".sys" instead of "sys", leading to an unwanted
45 * mismatch. 41 * mismatch.
46 */ 42 */
47 return !strcmp(sym + 3, name + 3); 43 return !strcmp(sym + 3, name + 3);
@@ -265,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
265 kfree(call->print_fmt); 261 kfree(call->print_fmt);
266} 262}
267 263
268static int syscall_enter_define_fields(struct ftrace_event_call *call) 264static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
269{ 265{
270 struct syscall_trace_enter trace; 266 struct syscall_trace_enter trace;
271 struct syscall_metadata *meta = call->data; 267 struct syscall_metadata *meta = call->data;
@@ -288,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call)
288 return ret; 284 return ret;
289} 285}
290 286
291static int syscall_exit_define_fields(struct ftrace_event_call *call) 287static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
292{ 288{
293 struct syscall_trace_exit trace; 289 struct syscall_trace_exit trace;
294 int ret; 290 int ret;
@@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
303 return ret; 299 return ret;
304} 300}
305 301
306static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 302static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
307{ 303{
304 struct trace_array *tr = data;
308 struct syscall_trace_enter *entry; 305 struct syscall_trace_enter *entry;
309 struct syscall_metadata *sys_data; 306 struct syscall_metadata *sys_data;
310 struct ring_buffer_event *event; 307 struct ring_buffer_event *event;
@@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
315 syscall_nr = trace_get_syscall_nr(current, regs); 312 syscall_nr = trace_get_syscall_nr(current, regs);
316 if (syscall_nr < 0) 313 if (syscall_nr < 0)
317 return; 314 return;
318 if (!test_bit(syscall_nr, enabled_enter_syscalls)) 315 if (!test_bit(syscall_nr, tr->enabled_enter_syscalls))
319 return; 316 return;
320 317
321 sys_data = syscall_nr_to_meta(syscall_nr); 318 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
324 321
325 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 322 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
326 323
327 event = trace_current_buffer_lock_reserve(&buffer, 324 buffer = tr->trace_buffer.buffer;
325 event = trace_buffer_lock_reserve(buffer,
328 sys_data->enter_event->event.type, size, 0, 0); 326 sys_data->enter_event->event.type, size, 0, 0);
329 if (!event) 327 if (!event)
330 return; 328 return;
@@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
338 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 336 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
339} 337}
340 338
341static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 339static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
342{ 340{
341 struct trace_array *tr = data;
343 struct syscall_trace_exit *entry; 342 struct syscall_trace_exit *entry;
344 struct syscall_metadata *sys_data; 343 struct syscall_metadata *sys_data;
345 struct ring_buffer_event *event; 344 struct ring_buffer_event *event;
@@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
349 syscall_nr = trace_get_syscall_nr(current, regs); 348 syscall_nr = trace_get_syscall_nr(current, regs);
350 if (syscall_nr < 0) 349 if (syscall_nr < 0)
351 return; 350 return;
352 if (!test_bit(syscall_nr, enabled_exit_syscalls)) 351 if (!test_bit(syscall_nr, tr->enabled_exit_syscalls))
353 return; 352 return;
354 353
355 sys_data = syscall_nr_to_meta(syscall_nr); 354 sys_data = syscall_nr_to_meta(syscall_nr);
356 if (!sys_data) 355 if (!sys_data)
357 return; 356 return;
358 357
359 event = trace_current_buffer_lock_reserve(&buffer, 358 buffer = tr->trace_buffer.buffer;
359 event = trace_buffer_lock_reserve(buffer,
360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0); 360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
361 if (!event) 361 if (!event)
362 return; 362 return;
@@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
370 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 370 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
371} 371}
372 372
373static int reg_event_syscall_enter(struct ftrace_event_call *call) 373static int reg_event_syscall_enter(struct ftrace_event_file *file,
374 struct ftrace_event_call *call)
374{ 375{
376 struct trace_array *tr = file->tr;
375 int ret = 0; 377 int ret = 0;
376 int num; 378 int num;
377 379
@@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call)
379 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 381 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
380 return -ENOSYS; 382 return -ENOSYS;
381 mutex_lock(&syscall_trace_lock); 383 mutex_lock(&syscall_trace_lock);
382 if (!sys_refcount_enter) 384 if (!tr->sys_refcount_enter)
383 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); 385 ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
384 if (!ret) { 386 if (!ret) {
385 set_bit(num, enabled_enter_syscalls); 387 set_bit(num, tr->enabled_enter_syscalls);
386 sys_refcount_enter++; 388 tr->sys_refcount_enter++;
387 } 389 }
388 mutex_unlock(&syscall_trace_lock); 390 mutex_unlock(&syscall_trace_lock);
389 return ret; 391 return ret;
390} 392}
391 393
392static void unreg_event_syscall_enter(struct ftrace_event_call *call) 394static void unreg_event_syscall_enter(struct ftrace_event_file *file,
395 struct ftrace_event_call *call)
393{ 396{
397 struct trace_array *tr = file->tr;
394 int num; 398 int num;
395 399
396 num = ((struct syscall_metadata *)call->data)->syscall_nr; 400 num = ((struct syscall_metadata *)call->data)->syscall_nr;
397 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 401 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
398 return; 402 return;
399 mutex_lock(&syscall_trace_lock); 403 mutex_lock(&syscall_trace_lock);
400 sys_refcount_enter--; 404 tr->sys_refcount_enter--;
401 clear_bit(num, enabled_enter_syscalls); 405 clear_bit(num, tr->enabled_enter_syscalls);
402 if (!sys_refcount_enter) 406 if (!tr->sys_refcount_enter)
403 unregister_trace_sys_enter(ftrace_syscall_enter, NULL); 407 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
404 mutex_unlock(&syscall_trace_lock); 408 mutex_unlock(&syscall_trace_lock);
405} 409}
406 410
407static int reg_event_syscall_exit(struct ftrace_event_call *call) 411static int reg_event_syscall_exit(struct ftrace_event_file *file,
412 struct ftrace_event_call *call)
408{ 413{
414 struct trace_array *tr = file->tr;
409 int ret = 0; 415 int ret = 0;
410 int num; 416 int num;
411 417
@@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call)
413 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 419 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
414 return -ENOSYS; 420 return -ENOSYS;
415 mutex_lock(&syscall_trace_lock); 421 mutex_lock(&syscall_trace_lock);
416 if (!sys_refcount_exit) 422 if (!tr->sys_refcount_exit)
417 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); 423 ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
418 if (!ret) { 424 if (!ret) {
419 set_bit(num, enabled_exit_syscalls); 425 set_bit(num, tr->enabled_exit_syscalls);
420 sys_refcount_exit++; 426 tr->sys_refcount_exit++;
421 } 427 }
422 mutex_unlock(&syscall_trace_lock); 428 mutex_unlock(&syscall_trace_lock);
423 return ret; 429 return ret;
424} 430}
425 431
426static void unreg_event_syscall_exit(struct ftrace_event_call *call) 432static void unreg_event_syscall_exit(struct ftrace_event_file *file,
433 struct ftrace_event_call *call)
427{ 434{
435 struct trace_array *tr = file->tr;
428 int num; 436 int num;
429 437
430 num = ((struct syscall_metadata *)call->data)->syscall_nr; 438 num = ((struct syscall_metadata *)call->data)->syscall_nr;
431 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 439 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
432 return; 440 return;
433 mutex_lock(&syscall_trace_lock); 441 mutex_lock(&syscall_trace_lock);
434 sys_refcount_exit--; 442 tr->sys_refcount_exit--;
435 clear_bit(num, enabled_exit_syscalls); 443 clear_bit(num, tr->enabled_exit_syscalls);
436 if (!sys_refcount_exit) 444 if (!tr->sys_refcount_exit)
437 unregister_trace_sys_exit(ftrace_syscall_exit, NULL); 445 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
438 mutex_unlock(&syscall_trace_lock); 446 mutex_unlock(&syscall_trace_lock);
439} 447}
440 448
@@ -471,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
471 .trace = print_syscall_exit, 479 .trace = print_syscall_exit,
472}; 480};
473 481
474struct ftrace_event_class event_class_syscall_enter = { 482struct ftrace_event_class __refdata event_class_syscall_enter = {
475 .system = "syscalls", 483 .system = "syscalls",
476 .reg = syscall_enter_register, 484 .reg = syscall_enter_register,
477 .define_fields = syscall_enter_define_fields, 485 .define_fields = syscall_enter_define_fields,
@@ -479,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = {
479 .raw_init = init_syscall_trace, 487 .raw_init = init_syscall_trace,
480}; 488};
481 489
482struct ftrace_event_class event_class_syscall_exit = { 490struct ftrace_event_class __refdata event_class_syscall_exit = {
483 .system = "syscalls", 491 .system = "syscalls",
484 .reg = syscall_exit_register, 492 .reg = syscall_exit_register,
485 .define_fields = syscall_exit_define_fields, 493 .define_fields = syscall_exit_define_fields,
@@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
685static int syscall_enter_register(struct ftrace_event_call *event, 693static int syscall_enter_register(struct ftrace_event_call *event,
686 enum trace_reg type, void *data) 694 enum trace_reg type, void *data)
687{ 695{
696 struct ftrace_event_file *file = data;
697
688 switch (type) { 698 switch (type) {
689 case TRACE_REG_REGISTER: 699 case TRACE_REG_REGISTER:
690 return reg_event_syscall_enter(event); 700 return reg_event_syscall_enter(file, event);
691 case TRACE_REG_UNREGISTER: 701 case TRACE_REG_UNREGISTER:
692 unreg_event_syscall_enter(event); 702 unreg_event_syscall_enter(file, event);
693 return 0; 703 return 0;
694 704
695#ifdef CONFIG_PERF_EVENTS 705#ifdef CONFIG_PERF_EVENTS
@@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event,
711static int syscall_exit_register(struct ftrace_event_call *event, 721static int syscall_exit_register(struct ftrace_event_call *event,
712 enum trace_reg type, void *data) 722 enum trace_reg type, void *data)
713{ 723{
724 struct ftrace_event_file *file = data;
725
714 switch (type) { 726 switch (type) {
715 case TRACE_REG_REGISTER: 727 case TRACE_REG_REGISTER:
716 return reg_event_syscall_exit(event); 728 return reg_event_syscall_exit(file, event);
717 case TRACE_REG_UNREGISTER: 729 case TRACE_REG_UNREGISTER:
718 unreg_event_syscall_exit(event); 730 unreg_event_syscall_exit(file, event);
719 return 0; 731 return 0;
720 732
721#ifdef CONFIG_PERF_EVENTS 733#ifdef CONFIG_PERF_EVENTS
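
The trace_syscalls.c hunks above replace the file-scope enabled_*_syscalls bitmaps and sys_refcount_* counters with per-trace_array state, passing the trace_array through the tracepoint callback's data pointer. Below is a compilable userspace sketch of that per-instance state, assuming struct trace_array gains the four fields the hunks reference; the NR_SYSCALLS value and bitmap helpers are stand-ins, not the kernel's definitions.

#include <stdio.h>
#include <string.h>
#include <limits.h>

#define NR_SYSCALLS	440	/* stand-in for the arch's NR_syscalls */
#define BITS_PER_LONG	(sizeof(unsigned long) * CHAR_BIT)
#define BITMAP_LONGS(n)	(((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* reduced stand-in for the fields the patch reads through tr-> */
struct trace_array {
	int		sys_refcount_enter;
	int		sys_refcount_exit;
	unsigned long	enabled_enter_syscalls[BITMAP_LONGS(NR_SYSCALLS)];
	unsigned long	enabled_exit_syscalls[BITMAP_LONGS(NR_SYSCALLS)];
};

static void set_bit_ul(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static int test_bit_ul(int nr, const unsigned long *map)
{
	return !!(map[nr / BITS_PER_LONG] & (1UL << (nr % BITS_PER_LONG)));
}

int main(void)
{
	struct trace_array tr;

	memset(&tr, 0, sizeof(tr));

	/* what reg_event_syscall_enter() now does for one instance */
	set_bit_ul(42, tr.enabled_enter_syscalls);
	tr.sys_refcount_enter++;

	/* what ftrace_syscall_enter() now tests per instance */
	printf("syscall 42 enabled: %d\n", test_bit_ul(42, tr.enabled_enter_syscalls));
	printf("syscall 43 enabled: %d\n", test_bit_ul(43, tr.enabled_enter_syscalls));
	return 0;
}
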
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8dad2a92dee9..32494fb0ee64 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -28,6 +28,18 @@
28 28
29#define UPROBE_EVENT_SYSTEM "uprobes" 29#define UPROBE_EVENT_SYSTEM "uprobes"
30 30
31struct uprobe_trace_entry_head {
32 struct trace_entry ent;
33 unsigned long vaddr[];
34};
35
36#define SIZEOF_TRACE_ENTRY(is_return) \
37 (sizeof(struct uprobe_trace_entry_head) + \
38 sizeof(unsigned long) * (is_return ? 2 : 1))
39
40#define DATAOF_TRACE_ENTRY(entry, is_return) \
41 ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
42
31struct trace_uprobe_filter { 43struct trace_uprobe_filter {
32 rwlock_t rwlock; 44 rwlock_t rwlock;
33 int nr_systemwide; 45 int nr_systemwide;
@@ -64,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock);
64static LIST_HEAD(uprobe_list); 76static LIST_HEAD(uprobe_list);
65 77
66static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 78static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
79static int uretprobe_dispatcher(struct uprobe_consumer *con,
80 unsigned long func, struct pt_regs *regs);
67 81
68static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) 82static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
69{ 83{
@@ -77,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
77 return !filter->nr_systemwide && list_empty(&filter->perf_events); 91 return !filter->nr_systemwide && list_empty(&filter->perf_events);
78} 92}
79 93
94static inline bool is_ret_probe(struct trace_uprobe *tu)
95{
96 return tu->consumer.ret_handler != NULL;
97}
98
80/* 99/*
81 * Allocate new trace_uprobe and initialize it (including uprobes). 100 * Allocate new trace_uprobe and initialize it (including uprobes).
82 */ 101 */
83static struct trace_uprobe * 102static struct trace_uprobe *
84alloc_trace_uprobe(const char *group, const char *event, int nargs) 103alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
85{ 104{
86 struct trace_uprobe *tu; 105 struct trace_uprobe *tu;
87 106
@@ -106,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
106 125
107 INIT_LIST_HEAD(&tu->list); 126 INIT_LIST_HEAD(&tu->list);
108 tu->consumer.handler = uprobe_dispatcher; 127 tu->consumer.handler = uprobe_dispatcher;
128 if (is_ret)
129 tu->consumer.ret_handler = uretprobe_dispatcher;
109 init_trace_uprobe_filter(&tu->filter); 130 init_trace_uprobe_filter(&tu->filter);
110 return tu; 131 return tu;
111 132
@@ -180,7 +201,7 @@ end:
180 201
181/* 202/*
182 * Argument syntax: 203 * Argument syntax:
183 * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] 204 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS]
184 * 205 *
185 * - Remove uprobe: -:[GRP/]EVENT 206 * - Remove uprobe: -:[GRP/]EVENT
186 */ 207 */
@@ -192,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv)
192 char buf[MAX_EVENT_NAME_LEN]; 213 char buf[MAX_EVENT_NAME_LEN];
193 struct path path; 214 struct path path;
194 unsigned long offset; 215 unsigned long offset;
195 bool is_delete; 216 bool is_delete, is_return;
196 int i, ret; 217 int i, ret;
197 218
198 inode = NULL; 219 inode = NULL;
199 ret = 0; 220 ret = 0;
200 is_delete = false; 221 is_delete = false;
222 is_return = false;
201 event = NULL; 223 event = NULL;
202 group = NULL; 224 group = NULL;
203 225
204 /* argc must be >= 1 */ 226 /* argc must be >= 1 */
205 if (argv[0][0] == '-') 227 if (argv[0][0] == '-')
206 is_delete = true; 228 is_delete = true;
229 else if (argv[0][0] == 'r')
230 is_return = true;
207 else if (argv[0][0] != 'p') { 231 else if (argv[0][0] != 'p') {
208 pr_info("Probe definition must be started with 'p' or '-'.\n"); 232 pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");
209 return -EINVAL; 233 return -EINVAL;
210 } 234 }
211 235
@@ -303,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv)
303 kfree(tail); 327 kfree(tail);
304 } 328 }
305 329
306 tu = alloc_trace_uprobe(group, event, argc); 330 tu = alloc_trace_uprobe(group, event, argc, is_return);
307 if (IS_ERR(tu)) { 331 if (IS_ERR(tu)) {
308 pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); 332 pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
309 ret = PTR_ERR(tu); 333 ret = PTR_ERR(tu);
@@ -414,9 +438,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)
414static int probes_seq_show(struct seq_file *m, void *v) 438static int probes_seq_show(struct seq_file *m, void *v)
415{ 439{
416 struct trace_uprobe *tu = v; 440 struct trace_uprobe *tu = v;
441 char c = is_ret_probe(tu) ? 'r' : 'p';
417 int i; 442 int i;
418 443
419 seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); 444 seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name);
420 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); 445 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
421 446
422 for (i = 0; i < tu->nr_args; i++) 447 for (i = 0; i < tu->nr_args; i++)
@@ -485,65 +510,81 @@ static const struct file_operations uprobe_profile_ops = {
485 .release = seq_release, 510 .release = seq_release,
486}; 511};
487 512
488/* uprobe handler */ 513static void uprobe_trace_print(struct trace_uprobe *tu,
489static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) 514 unsigned long func, struct pt_regs *regs)
490{ 515{
491 struct uprobe_trace_entry_head *entry; 516 struct uprobe_trace_entry_head *entry;
492 struct ring_buffer_event *event; 517 struct ring_buffer_event *event;
493 struct ring_buffer *buffer; 518 struct ring_buffer *buffer;
494 u8 *data; 519 void *data;
495 int size, i, pc; 520 int size, i;
496 unsigned long irq_flags;
497 struct ftrace_event_call *call = &tu->call; 521 struct ftrace_event_call *call = &tu->call;
498 522
499 local_save_flags(irq_flags); 523 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
500 pc = preempt_count();
501
502 size = sizeof(*entry) + tu->size;
503
504 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 524 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
505 size, irq_flags, pc); 525 size + tu->size, 0, 0);
506 if (!event) 526 if (!event)
507 return 0; 527 return;
508 528
509 entry = ring_buffer_event_data(event); 529 entry = ring_buffer_event_data(event);
510 entry->ip = instruction_pointer(task_pt_regs(current)); 530 if (is_ret_probe(tu)) {
511 data = (u8 *)&entry[1]; 531 entry->vaddr[0] = func;
532 entry->vaddr[1] = instruction_pointer(regs);
533 data = DATAOF_TRACE_ENTRY(entry, true);
534 } else {
535 entry->vaddr[0] = instruction_pointer(regs);
536 data = DATAOF_TRACE_ENTRY(entry, false);
537 }
538
512 for (i = 0; i < tu->nr_args; i++) 539 for (i = 0; i < tu->nr_args; i++)
513 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 540 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
514 541
515 if (!filter_current_check_discard(buffer, call, entry, event)) 542 if (!filter_current_check_discard(buffer, call, entry, event))
516 trace_buffer_unlock_commit(buffer, event, irq_flags, pc); 543 trace_buffer_unlock_commit(buffer, event, 0, 0);
544}
517 545
546/* uprobe handler */
547static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
548{
549 if (!is_ret_probe(tu))
550 uprobe_trace_print(tu, 0, regs);
518 return 0; 551 return 0;
519} 552}
520 553
554static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
555 struct pt_regs *regs)
556{
557 uprobe_trace_print(tu, func, regs);
558}
559
521/* Event entry printers */ 560/* Event entry printers */
522static enum print_line_t 561static enum print_line_t
523print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) 562print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
524{ 563{
525 struct uprobe_trace_entry_head *field; 564 struct uprobe_trace_entry_head *entry;
526 struct trace_seq *s = &iter->seq; 565 struct trace_seq *s = &iter->seq;
527 struct trace_uprobe *tu; 566 struct trace_uprobe *tu;
528 u8 *data; 567 u8 *data;
529 int i; 568 int i;
530 569
531 field = (struct uprobe_trace_entry_head *)iter->ent; 570 entry = (struct uprobe_trace_entry_head *)iter->ent;
532 tu = container_of(event, struct trace_uprobe, call.event); 571 tu = container_of(event, struct trace_uprobe, call.event);
533 572
534 if (!trace_seq_printf(s, "%s: (", tu->call.name)) 573 if (is_ret_probe(tu)) {
535 goto partial; 574 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name,
536 575 entry->vaddr[1], entry->vaddr[0]))
537 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) 576 goto partial;
538 goto partial; 577 data = DATAOF_TRACE_ENTRY(entry, true);
539 578 } else {
540 if (!trace_seq_puts(s, ")")) 579 if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name,
541 goto partial; 580 entry->vaddr[0]))
581 goto partial;
582 data = DATAOF_TRACE_ENTRY(entry, false);
583 }
542 584
543 data = (u8 *)&field[1];
544 for (i = 0; i < tu->nr_args; i++) { 585 for (i = 0; i < tu->nr_args; i++) {
545 if (!tu->args[i].type->print(s, tu->args[i].name, 586 if (!tu->args[i].type->print(s, tu->args[i].name,
546 data + tu->args[i].offset, field)) 587 data + tu->args[i].offset, entry))
547 goto partial; 588 goto partial;
548 } 589 }
549 590
@@ -595,16 +636,23 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag)
595 636
596static int uprobe_event_define_fields(struct ftrace_event_call *event_call) 637static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
597{ 638{
598 int ret, i; 639 int ret, i, size;
599 struct uprobe_trace_entry_head field; 640 struct uprobe_trace_entry_head field;
600 struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data; 641 struct trace_uprobe *tu = event_call->data;
601 642
602 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 643 if (is_ret_probe(tu)) {
644 DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
645 DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0);
646 size = SIZEOF_TRACE_ENTRY(true);
647 } else {
648 DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
649 size = SIZEOF_TRACE_ENTRY(false);
650 }
603 /* Set argument names as fields */ 651 /* Set argument names as fields */
604 for (i = 0; i < tu->nr_args; i++) { 652 for (i = 0; i < tu->nr_args; i++) {
605 ret = trace_define_field(event_call, tu->args[i].type->fmttype, 653 ret = trace_define_field(event_call, tu->args[i].type->fmttype,
606 tu->args[i].name, 654 tu->args[i].name,
607 sizeof(field) + tu->args[i].offset, 655 size + tu->args[i].offset,
608 tu->args[i].type->size, 656 tu->args[i].type->size,
609 tu->args[i].type->is_signed, 657 tu->args[i].type->is_signed,
610 FILTER_OTHER); 658 FILTER_OTHER);
@@ -622,8 +670,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
622 int i; 670 int i;
623 int pos = 0; 671 int pos = 0;
624 672
625 fmt = "(%lx)"; 673 if (is_ret_probe(tu)) {
626 arg = "REC->" FIELD_STRING_IP; 674 fmt = "(%lx <- %lx)";
675 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
676 } else {
677 fmt = "(%lx)";
678 arg = "REC->" FIELD_STRING_IP;
679 }
627 680
628 /* When len=0, we just calculate the needed length */ 681 /* When len=0, we just calculate the needed length */
629 682
@@ -752,49 +805,68 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
752 return ret; 805 return ret;
753} 806}
754 807
755/* uprobe profile handler */ 808static void uprobe_perf_print(struct trace_uprobe *tu,
756static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) 809 unsigned long func, struct pt_regs *regs)
757{ 810{
758 struct ftrace_event_call *call = &tu->call; 811 struct ftrace_event_call *call = &tu->call;
759 struct uprobe_trace_entry_head *entry; 812 struct uprobe_trace_entry_head *entry;
760 struct hlist_head *head; 813 struct hlist_head *head;
761 u8 *data; 814 void *data;
762 int size, __size, i; 815 int size, rctx, i;
763 int rctx;
764 816
765 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) 817 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
766 return UPROBE_HANDLER_REMOVE; 818 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
767
768 __size = sizeof(*entry) + tu->size;
769 size = ALIGN(__size + sizeof(u32), sizeof(u64));
770 size -= sizeof(u32);
771 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) 819 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
772 return 0; 820 return;
773 821
774 preempt_disable(); 822 preempt_disable();
823 head = this_cpu_ptr(call->perf_events);
824 if (hlist_empty(head))
825 goto out;
775 826
776 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 827 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
777 if (!entry) 828 if (!entry)
778 goto out; 829 goto out;
779 830
780 entry->ip = instruction_pointer(task_pt_regs(current)); 831 if (is_ret_probe(tu)) {
781 data = (u8 *)&entry[1]; 832 entry->vaddr[0] = func;
833 entry->vaddr[1] = instruction_pointer(regs);
834 data = DATAOF_TRACE_ENTRY(entry, true);
835 } else {
836 entry->vaddr[0] = instruction_pointer(regs);
837 data = DATAOF_TRACE_ENTRY(entry, false);
838 }
839
782 for (i = 0; i < tu->nr_args; i++) 840 for (i = 0; i < tu->nr_args; i++)
783 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 841 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
784 842
785 head = this_cpu_ptr(call->perf_events); 843 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
786 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
787
788 out: 844 out:
789 preempt_enable(); 845 preempt_enable();
846}
847
848/* uprobe profile handler */
849static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
850{
851 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
852 return UPROBE_HANDLER_REMOVE;
853
854 if (!is_ret_probe(tu))
855 uprobe_perf_print(tu, 0, regs);
790 return 0; 856 return 0;
791} 857}
858
859static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
860 struct pt_regs *regs)
861{
862 uprobe_perf_print(tu, func, regs);
863}
792#endif /* CONFIG_PERF_EVENTS */ 864#endif /* CONFIG_PERF_EVENTS */
793 865
794static 866static
795int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) 867int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
796{ 868{
797 struct trace_uprobe *tu = (struct trace_uprobe *)event->data; 869 struct trace_uprobe *tu = event->data;
798 870
799 switch (type) { 871 switch (type) {
800 case TRACE_REG_REGISTER: 872 case TRACE_REG_REGISTER:
@@ -843,6 +915,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
843 return ret; 915 return ret;
844} 916}
845 917
918static int uretprobe_dispatcher(struct uprobe_consumer *con,
919 unsigned long func, struct pt_regs *regs)
920{
921 struct trace_uprobe *tu;
922
923 tu = container_of(con, struct trace_uprobe, consumer);
924
925 if (tu->flags & TP_FLAG_TRACE)
926 uretprobe_trace_func(tu, func, regs);
927
928#ifdef CONFIG_PERF_EVENTS
929 if (tu->flags & TP_FLAG_PROFILE)
930 uretprobe_perf_func(tu, func, regs);
931#endif
932 return 0;
933}
934
846static struct trace_event_functions uprobe_funcs = { 935static struct trace_event_functions uprobe_funcs = {
847 .trace = print_uprobe_event 936 .trace = print_uprobe_event
848}; 937};
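
The uretprobe changes above record one vaddr (the probed address) for 'p' events and two (called function and return address) for 'r' events, locating the fetched arguments with SIZEOF_TRACE_ENTRY()/DATAOF_TRACE_ENTRY(). The following standalone sketch reproduces that arithmetic with a mocked struct trace_entry, so the absolute sizes are illustrative only; the relative layout is the point.

#include <stdio.h>
#include <stdbool.h>

/* mocked header; the real struct trace_entry lives in the kernel */
struct trace_entry { unsigned short type; unsigned char flags, preempt_count; int pid; };

struct uprobe_trace_entry_head {
	struct trace_entry	ent;
	unsigned long		vaddr[];
};

#define SIZEOF_TRACE_ENTRY(is_return)			\
	(sizeof(struct uprobe_trace_entry_head) +	\
	 sizeof(unsigned long) * ((is_return) ? 2 : 1))

/* char * used here for portable pointer arithmetic */
#define DATAOF_TRACE_ENTRY(entry, is_return)		\
	((void *)((char *)(entry) + SIZEOF_TRACE_ENTRY(is_return)))

int main(void)
{
	unsigned char buf[64];
	struct uprobe_trace_entry_head *entry = (void *)buf;

	printf("entry probe:  header %zu bytes, args at offset %td\n",
	       SIZEOF_TRACE_ENTRY(false),
	       (char *)DATAOF_TRACE_ENTRY(entry, false) - (char *)entry);
	printf("return probe: header %zu bytes, args at offset %td\n",
	       SIZEOF_TRACE_ENTRY(true),
	       (char *)DATAOF_TRACE_ENTRY(entry, true) - (char *)entry);
	return 0;
}
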
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 0c05a4592047..29f26540e9c9 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -112,7 +112,8 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry,
112 int nr_probes = 0; 112 int nr_probes = 0;
113 struct tracepoint_func *old, *new; 113 struct tracepoint_func *old, *new;
114 114
115 WARN_ON(!probe); 115 if (WARN_ON(!probe))
116 return ERR_PTR(-EINVAL);
116 117
117 debug_print_probes(entry); 118 debug_print_probes(entry);
118 old = entry->funcs; 119 old = entry->funcs;
@@ -152,13 +153,18 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
152 153
153 debug_print_probes(entry); 154 debug_print_probes(entry);
154 /* (N -> M), (N > 1, M >= 0) probes */ 155 /* (N -> M), (N > 1, M >= 0) probes */
155 for (nr_probes = 0; old[nr_probes].func; nr_probes++) { 156 if (probe) {
156 if (!probe || 157 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
157 (old[nr_probes].func == probe && 158 if (old[nr_probes].func == probe &&
158 old[nr_probes].data == data)) 159 old[nr_probes].data == data)
159 nr_del++; 160 nr_del++;
161 }
160 } 162 }
161 163
164 /*
165 * If probe is NULL, then nr_probes = nr_del = 0, and then the
166 * entire entry will be removed.
167 */
162 if (nr_probes - nr_del == 0) { 168 if (nr_probes - nr_del == 0) {
163 /* N -> 0, (N > 1) */ 169 /* N -> 0, (N > 1) */
164 entry->funcs = NULL; 170 entry->funcs = NULL;
@@ -173,8 +179,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
173 if (new == NULL) 179 if (new == NULL)
174 return ERR_PTR(-ENOMEM); 180 return ERR_PTR(-ENOMEM);
175 for (i = 0; old[i].func; i++) 181 for (i = 0; old[i].func; i++)
176 if (probe && 182 if (old[i].func != probe || old[i].data != data)
177 (old[i].func != probe || old[i].data != data))
178 new[j++] = old[i]; 183 new[j++] = old[i];
179 new[nr_probes - nr_del].func = NULL; 184 new[nr_probes - nr_del].func = NULL;
180 entry->refcount = nr_probes - nr_del; 185 entry->refcount = nr_probes - nr_del;
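
After the tracepoint.c change above, passing a NULL probe removes every probe registered on the entry: the counting loop is skipped, nr_probes == nr_del == 0, and the whole funcs array is released. A minimal standalone model of just that accounting (the kernel's reallocation and refcounting are omitted):

#include <stdio.h>
#include <stddef.h>

typedef void (*probe_fn)(void);

struct tracepoint_func { probe_fn func; void *data; };

static void probe_a(void) { }
static void probe_b(void) { }

/* Returns how many probes would remain after removing (probe, data). */
static int remaining_probes(const struct tracepoint_func *old,
			    probe_fn probe, void *data)
{
	int nr_probes = 0, nr_del = 0;

	if (probe) {
		for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
			if (old[nr_probes].func == probe &&
			    old[nr_probes].data == data)
				nr_del++;
		}
	}
	/* probe == NULL: nr_probes == nr_del == 0, so everything goes */
	return nr_probes - nr_del;
}

int main(void)
{
	struct tracepoint_func funcs[] = {
		{ probe_a, NULL }, { probe_b, NULL }, { NULL, NULL },
	};

	printf("remove probe_a: %d left\n", remaining_probes(funcs, probe_a, NULL));
	printf("remove NULL:    %d left\n", remaining_probes(funcs, NULL, NULL));
	return 0;
}
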
diff --git a/kernel/uid16.c b/kernel/uid16.c
index d7948eb10225..f6c83d7ef000 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -18,67 +18,43 @@
18 18
19SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) 19SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
20{ 20{
21 long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); 21 return sys_chown(filename, low2highuid(user), low2highgid(group));
22 /* avoid REGPARM breakage on x86: */
23 asmlinkage_protect(3, ret, filename, user, group);
24 return ret;
25} 22}
26 23
27SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) 24SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
28{ 25{
29 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); 26 return sys_lchown(filename, low2highuid(user), low2highgid(group));
30 /* avoid REGPARM breakage on x86: */
31 asmlinkage_protect(3, ret, filename, user, group);
32 return ret;
33} 27}
34 28
35SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) 29SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
36{ 30{
37 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); 31 return sys_fchown(fd, low2highuid(user), low2highgid(group));
38 /* avoid REGPARM breakage on x86: */
39 asmlinkage_protect(3, ret, fd, user, group);
40 return ret;
41} 32}
42 33
43SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) 34SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
44{ 35{
45 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); 36 return sys_setregid(low2highgid(rgid), low2highgid(egid));
46 /* avoid REGPARM breakage on x86: */
47 asmlinkage_protect(2, ret, rgid, egid);
48 return ret;
49} 37}
50 38
51SYSCALL_DEFINE1(setgid16, old_gid_t, gid) 39SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
52{ 40{
53 long ret = sys_setgid(low2highgid(gid)); 41 return sys_setgid(low2highgid(gid));
54 /* avoid REGPARM breakage on x86: */
55 asmlinkage_protect(1, ret, gid);
56 return ret;
57} 42}
58 43
59SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) 44SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
60{ 45{
61 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); 46 return sys_setreuid(low2highuid(ruid), low2highuid(euid));
62 /* avoid REGPARM breakage on x86: */
63 asmlinkage_protect(2, ret, ruid, euid);
64 return ret;
65} 47}
66 48
67SYSCALL_DEFINE1(setuid16, old_uid_t, uid) 49SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
68{ 50{
69 long ret = sys_setuid(low2highuid(uid)); 51 return sys_setuid(low2highuid(uid));
70 /* avoid REGPARM breakage on x86: */
71 asmlinkage_protect(1, ret, uid);
72 return ret;
73} 52}
74 53
75SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) 54SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
76{ 55{
77 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), 56 return sys_setresuid(low2highuid(ruid), low2highuid(euid),
78 low2highuid(suid)); 57 low2highuid(suid));
79 /* avoid REGPARM breakage on x86: */
80 asmlinkage_protect(3, ret, ruid, euid, suid);
81 return ret;
82} 58}
83 59
84SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) 60SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp)
@@ -100,11 +76,8 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid
100 76
101SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) 77SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
102{ 78{
103 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), 79 return sys_setresgid(low2highgid(rgid), low2highgid(egid),
104 low2highgid(sgid)); 80 low2highgid(sgid));
105 /* avoid REGPARM breakage on x86: */
106 asmlinkage_protect(3, ret, rgid, egid, sgid);
107 return ret;
108} 81}
109 82
110 83
@@ -127,18 +100,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid
127 100
128SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) 101SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
129{ 102{
130 long ret = sys_setfsuid(low2highuid(uid)); 103 return sys_setfsuid(low2highuid(uid));
131 /* avoid REGPARM breakage on x86: */
132 asmlinkage_protect(1, ret, uid);
133 return ret;
134} 104}
135 105
136SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) 106SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
137{ 107{
138 long ret = sys_setfsgid(low2highgid(gid)); 108 return sys_setfsgid(low2highgid(gid));
139 /* avoid REGPARM breakage on x86: */
140 asmlinkage_protect(1, ret, gid);
141 return ret;
142} 109}
143 110
144static int groups16_to_user(old_gid_t __user *grouplist, 111static int groups16_to_user(old_gid_t __user *grouplist,
diff --git a/kernel/user.c b/kernel/user.c
index e81978e8c03b..69b4c3d48cde 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/export.h> 17#include <linux/export.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include <linux/proc_fs.h> 19#include <linux/proc_ns.h>
20 20
21/* 21/*
22 * userns count is 1 for root user, 1 for init_uts_ns, 22 * userns count is 1 for root user, 1 for init_uts_ns,
@@ -51,6 +51,8 @@ struct user_namespace init_user_ns = {
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
54 .may_mount_sysfs = true,
55 .may_mount_proc = true,
54}; 56};
55EXPORT_SYMBOL_GPL(init_user_ns); 57EXPORT_SYMBOL_GPL(init_user_ns);
56 58
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 8b650837083e..d8c30db06c5b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,7 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/proc_fs.h> 12#include <linux/proc_ns.h>
13#include <linux/highuid.h> 13#include <linux/highuid.h>
14#include <linux/cred.h> 14#include <linux/cred.h>
15#include <linux/securebits.h> 15#include <linux/securebits.h>
@@ -21,10 +21,12 @@
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/projid.h> 23#include <linux/projid.h>
24#include <linux/fs_struct.h>
24 25
25static struct kmem_cache *user_ns_cachep __read_mostly; 26static struct kmem_cache *user_ns_cachep __read_mostly;
26 27
27static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 28static bool new_idmap_permitted(const struct file *file,
29 struct user_namespace *ns, int cap_setid,
28 struct uid_gid_map *map); 30 struct uid_gid_map *map);
29 31
30static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) 32static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
@@ -60,6 +62,15 @@ int create_user_ns(struct cred *new)
60 kgid_t group = new->egid; 62 kgid_t group = new->egid;
61 int ret; 63 int ret;
62 64
65 /*
 66 * Verify that we cannot violate the file-access policy
 67 * specified by the root directory, by verifying that the
 68 * root directory is at the root of the mount namespace,
 69 * which allows all files to be accessed.
70 */
71 if (current_chrooted())
72 return -EPERM;
73
63 /* The creator needs a mapping in the parent user namespace 74 /* The creator needs a mapping in the parent user namespace
64 * or else we won't be able to reasonably tell userspace who 75 * or else we won't be able to reasonably tell userspace who
65 * created a user_namespace. 76 * created a user_namespace.
@@ -86,6 +97,8 @@ int create_user_ns(struct cred *new)
86 97
87 set_cred_user_ns(new, ns); 98 set_cred_user_ns(new, ns);
88 99
100 update_mnt_policy(ns);
101
89 return 0; 102 return 0;
90} 103}
91 104
@@ -600,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf,
600 if (map->nr_extents != 0) 613 if (map->nr_extents != 0)
601 goto out; 614 goto out;
602 615
603 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID 616 /*
604 * over the user namespace in order to set the id mapping. 617 * Adjusting namespace settings requires capabilities on the target.
605 */ 618 */
606 if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) 619 if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
607 goto out; 620 goto out;
608 621
609 /* Get a buffer */ 622 /* Get a buffer */
@@ -688,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
688 701
689 ret = -EPERM; 702 ret = -EPERM;
690 /* Validate the user is allowed to use user id's mapped to. */ 703 /* Validate the user is allowed to use user id's mapped to. */
691 if (!new_idmap_permitted(ns, cap_setid, &new_map)) 704 if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
692 goto out; 705 goto out;
693 706
694 /* Map the lower ids from the parent user namespace to the 707 /* Map the lower ids from the parent user namespace to the
@@ -775,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
775 &ns->projid_map, &ns->parent->projid_map); 788 &ns->projid_map, &ns->parent->projid_map);
776} 789}
777 790
778static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 791static bool new_idmap_permitted(const struct file *file,
792 struct user_namespace *ns, int cap_setid,
779 struct uid_gid_map *new_map) 793 struct uid_gid_map *new_map)
780{ 794{
781 /* Allow mapping to your own filesystem ids */ 795 /* Allow mapping to your own filesystem ids */
@@ -783,12 +797,12 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
783 u32 id = new_map->extent[0].lower_first; 797 u32 id = new_map->extent[0].lower_first;
784 if (cap_setid == CAP_SETUID) { 798 if (cap_setid == CAP_SETUID) {
785 kuid_t uid = make_kuid(ns->parent, id); 799 kuid_t uid = make_kuid(ns->parent, id);
786 if (uid_eq(uid, current_fsuid())) 800 if (uid_eq(uid, file->f_cred->fsuid))
787 return true; 801 return true;
788 } 802 }
789 else if (cap_setid == CAP_SETGID) { 803 else if (cap_setid == CAP_SETGID) {
790 kgid_t gid = make_kgid(ns->parent, id); 804 kgid_t gid = make_kgid(ns->parent, id);
791 if (gid_eq(gid, current_fsgid())) 805 if (gid_eq(gid, file->f_cred->fsgid))
792 return true; 806 return true;
793 } 807 }
794 } 808 }
@@ -799,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
799 813
800 /* Allow the specified ids if we have the appropriate capability 814 /* Allow the specified ids if we have the appropriate capability
801 * (CAP_SETUID or CAP_SETGID) over the parent user namespace. 815 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
 816 * The opener of the id file must also have had the appropriate capability.
802 */ 817 */
803 if (ns_capable(ns->parent, cap_setid)) 818 if (ns_capable(ns->parent, cap_setid) &&
819 file_ns_capable(file, ns->parent, cap_setid))
804 return true; 820 return true;
805 821
806 return false; 822 return false;
@@ -837,6 +853,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
837 if (atomic_read(&current->mm->mm_users) > 1) 853 if (atomic_read(&current->mm->mm_users) > 1)
838 return -EINVAL; 854 return -EINVAL;
839 855
856 if (current->fs->users != 1)
857 return -EINVAL;
858
840 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 859 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
841 return -EPERM; 860 return -EPERM;
842 861
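
The map_write()/new_idmap_permitted() changes above tie the permission checks to the credentials of whoever opened the map file (file->f_cred) rather than to current. A reduced, hypothetical model of the "map a single id to your own fsuid" fast path is sketched below; kuid_t, the gid variant, and the capability fallback are omitted, and the struct layouts are simplified stand-ins.

#include <stdio.h>
#include <stdbool.h>

struct uid_gid_extent { unsigned first, lower_first, count; };
struct uid_gid_map { unsigned nr_extents; struct uid_gid_extent extent[5]; };
struct cred { unsigned fsuid; };
struct file { struct cred f_cred; };	/* opener's creds, not current's */

static bool new_uidmap_permitted(const struct file *file,
				 const struct uid_gid_map *new_map)
{
	/* Allow mapping a single id to the opener's own fsuid. */
	if (new_map->nr_extents == 1 && new_map->extent[0].count == 1 &&
	    new_map->extent[0].lower_first == file->f_cred.fsuid)
		return true;
	return false;
}

int main(void)
{
	struct file f = { .f_cred = { .fsuid = 1000 } };
	struct uid_gid_map own  = { 1, { { 0, 1000, 1 } } };
	struct uid_gid_map root = { 1, { { 0, 0, 1 } } };

	printf("map to own fsuid: %s\n", new_uidmap_permitted(&f, &own)  ? "allowed" : "denied");
	printf("map to uid 0:     %s\n", new_uidmap_permitted(&f, &root) ? "allowed" : "denied");
	return 0;
}
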
diff --git a/kernel/utsname.c b/kernel/utsname.c
index a47fc5de3113..2fc8576efaa8 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,7 +15,7 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/proc_fs.h> 18#include <linux/proc_ns.h>
19 19
20static struct uts_namespace *create_uts_ns(void) 20static struct uts_namespace *create_uts_ns(void)
21{ 21{
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4a944676358e..05039e348f07 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -517,6 +517,11 @@ int proc_dowatchdog(struct ctl_table *table, int write,
517 return ret; 517 return ret;
518 518
519 set_sample_period(); 519 set_sample_period();
520 /*
 521 * Watchdog threads shouldn't be started if the watchdog is
 522 * disabled; the 'watchdog_disabled' check in the
 523 * watchdog_*_all_cpus() functions takes care of this.
524 */
520 if (watchdog_enabled && watchdog_thresh) 525 if (watchdog_enabled && watchdog_thresh)
521 watchdog_enable_all_cpus(); 526 watchdog_enable_all_cpus();
522 else 527 else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 81f2457811eb..4aa9f5bc6b2d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,7 +41,12 @@
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/jhash.h>
44#include <linux/hashtable.h> 45#include <linux/hashtable.h>
46#include <linux/rculist.h>
47#include <linux/nodemask.h>
48#include <linux/moduleparam.h>
49#include <linux/uaccess.h>
45 50
46#include "workqueue_internal.h" 51#include "workqueue_internal.h"
47 52
@@ -58,12 +63,11 @@ enum {
58 * %WORKER_UNBOUND set and concurrency management disabled, and may 63 * %WORKER_UNBOUND set and concurrency management disabled, and may
59 * be executing on any CPU. The pool behaves as an unbound one. 64 * be executing on any CPU. The pool behaves as an unbound one.
60 * 65 *
61 * Note that DISASSOCIATED can be flipped only while holding 66 * Note that DISASSOCIATED should be flipped only while holding
62 * assoc_mutex to avoid changing binding state while 67 * manager_mutex to avoid changing binding state while
63 * create_worker() is in progress. 68 * create_worker() is in progress.
64 */ 69 */
65 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 70 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
66 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
67 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ 71 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
68 POOL_FREEZING = 1 << 3, /* freeze in progress */ 72 POOL_FREEZING = 1 << 3, /* freeze in progress */
69 73
@@ -74,12 +78,14 @@ enum {
74 WORKER_PREP = 1 << 3, /* preparing to run works */ 78 WORKER_PREP = 1 << 3, /* preparing to run works */
75 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 79 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
76 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 80 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
81 WORKER_REBOUND = 1 << 8, /* worker was rebound */
77 82
78 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | 83 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE |
79 WORKER_CPU_INTENSIVE, 84 WORKER_UNBOUND | WORKER_REBOUND,
80 85
81 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ 86 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
82 87
88 UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
83 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 89 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
84 90
85 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 91 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
@@ -97,6 +103,8 @@ enum {
97 */ 103 */
98 RESCUER_NICE_LEVEL = -20, 104 RESCUER_NICE_LEVEL = -20,
99 HIGHPRI_NICE_LEVEL = -20, 105 HIGHPRI_NICE_LEVEL = -20,
106
107 WQ_NAME_LEN = 24,
100}; 108};
101 109
102/* 110/*
@@ -115,16 +123,26 @@ enum {
115 * cpu or grabbing pool->lock is enough for read access. If 123 * cpu or grabbing pool->lock is enough for read access. If
116 * POOL_DISASSOCIATED is set, it's identical to L. 124 * POOL_DISASSOCIATED is set, it's identical to L.
117 * 125 *
118 * F: wq->flush_mutex protected. 126 * MG: pool->manager_mutex and pool->lock protected. Writes require both
127 * locks. Reads can happen under either lock.
128 *
129 * PL: wq_pool_mutex protected.
130 *
131 * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
132 *
133 * WQ: wq->mutex protected.
119 * 134 *
120 * W: workqueue_lock protected. 135 * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
136 *
137 * MD: wq_mayday_lock protected.
121 */ 138 */
122 139
123/* struct worker is defined in workqueue_internal.h */ 140/* struct worker is defined in workqueue_internal.h */
124 141
125struct worker_pool { 142struct worker_pool {
126 spinlock_t lock; /* the pool lock */ 143 spinlock_t lock; /* the pool lock */
127 unsigned int cpu; /* I: the associated cpu */ 144 int cpu; /* I: the associated cpu */
145 int node; /* I: the associated node ID */
128 int id; /* I: pool ID */ 146 int id; /* I: pool ID */
129 unsigned int flags; /* X: flags */ 147 unsigned int flags; /* X: flags */
130 148
@@ -138,12 +156,18 @@ struct worker_pool {
138 struct timer_list idle_timer; /* L: worker idle timeout */ 156 struct timer_list idle_timer; /* L: worker idle timeout */
139 struct timer_list mayday_timer; /* L: SOS timer for workers */ 157 struct timer_list mayday_timer; /* L: SOS timer for workers */
140 158
 141 /* workers are chained either in busy_hash or idle_list */ 159 /* a worker is either on busy_hash or idle_list, or the manager */
142 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); 160 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
143 /* L: hash of busy workers */ 161 /* L: hash of busy workers */
144 162
145 struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ 163 /* see manage_workers() for details on the two manager mutexes */
146 struct ida worker_ida; /* L: for worker IDs */ 164 struct mutex manager_arb; /* manager arbitration */
165 struct mutex manager_mutex; /* manager exclusion */
166 struct idr worker_idr; /* MG: worker IDs and iteration */
167
168 struct workqueue_attrs *attrs; /* I: worker attributes */
169 struct hlist_node hash_node; /* PL: unbound_pool_hash node */
170 int refcnt; /* PL: refcnt for unbound pools */
147 171
148 /* 172 /*
149 * The current concurrency level. As it's likely to be accessed 173 * The current concurrency level. As it's likely to be accessed
@@ -151,6 +175,12 @@ struct worker_pool {
151 * cacheline. 175 * cacheline.
152 */ 176 */
153 atomic_t nr_running ____cacheline_aligned_in_smp; 177 atomic_t nr_running ____cacheline_aligned_in_smp;
178
179 /*
180 * Destruction of pool is sched-RCU protected to allow dereferences
181 * from get_work_pool().
182 */
183 struct rcu_head rcu;
154} ____cacheline_aligned_in_smp; 184} ____cacheline_aligned_in_smp;
155 185
156/* 186/*
@@ -164,75 +194,107 @@ struct pool_workqueue {
164 struct workqueue_struct *wq; /* I: the owning workqueue */ 194 struct workqueue_struct *wq; /* I: the owning workqueue */
165 int work_color; /* L: current color */ 195 int work_color; /* L: current color */
166 int flush_color; /* L: flushing color */ 196 int flush_color; /* L: flushing color */
197 int refcnt; /* L: reference count */
167 int nr_in_flight[WORK_NR_COLORS]; 198 int nr_in_flight[WORK_NR_COLORS];
168 /* L: nr of in_flight works */ 199 /* L: nr of in_flight works */
169 int nr_active; /* L: nr of active works */ 200 int nr_active; /* L: nr of active works */
170 int max_active; /* L: max active works */ 201 int max_active; /* L: max active works */
171 struct list_head delayed_works; /* L: delayed works */ 202 struct list_head delayed_works; /* L: delayed works */
172}; 203 struct list_head pwqs_node; /* WR: node on wq->pwqs */
204 struct list_head mayday_node; /* MD: node on wq->maydays */
205
206 /*
207 * Release of unbound pwq is punted to system_wq. See put_pwq()
208 * and pwq_unbound_release_workfn() for details. pool_workqueue
209 * itself is also sched-RCU protected so that the first pwq can be
210 * determined without grabbing wq->mutex.
211 */
212 struct work_struct unbound_release_work;
213 struct rcu_head rcu;
214} __aligned(1 << WORK_STRUCT_FLAG_BITS);
173 215
174/* 216/*
175 * Structure used to wait for workqueue flush. 217 * Structure used to wait for workqueue flush.
176 */ 218 */
177struct wq_flusher { 219struct wq_flusher {
178 struct list_head list; /* F: list of flushers */ 220 struct list_head list; /* WQ: list of flushers */
179 int flush_color; /* F: flush color waiting for */ 221 int flush_color; /* WQ: flush color waiting for */
180 struct completion done; /* flush completion */ 222 struct completion done; /* flush completion */
181}; 223};
182 224
183/* 225struct wq_device;
184 * All cpumasks are assumed to be always set on UP and thus can't be
185 * used to determine whether there's something to be done.
186 */
187#ifdef CONFIG_SMP
188typedef cpumask_var_t mayday_mask_t;
189#define mayday_test_and_set_cpu(cpu, mask) \
190 cpumask_test_and_set_cpu((cpu), (mask))
191#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
192#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
193#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp))
194#define free_mayday_mask(mask) free_cpumask_var((mask))
195#else
196typedef unsigned long mayday_mask_t;
197#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
198#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
199#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
200#define alloc_mayday_mask(maskp, gfp) true
201#define free_mayday_mask(mask) do { } while (0)
202#endif
203 226
204/* 227/*
205 * The externally visible workqueue abstraction is an array of 228 * The externally visible workqueue. It relays the issued work items to
206 * per-CPU workqueues: 229 * the appropriate worker_pool through its pool_workqueues.
207 */ 230 */
208struct workqueue_struct { 231struct workqueue_struct {
209 unsigned int flags; /* W: WQ_* flags */ 232 struct list_head pwqs; /* WR: all pwqs of this wq */
210 union { 233 struct list_head list; /* PL: list of all workqueues */
211 struct pool_workqueue __percpu *pcpu; 234
212 struct pool_workqueue *single; 235 struct mutex mutex; /* protects this wq */
213 unsigned long v; 236 int work_color; /* WQ: current work color */
214 } pool_wq; /* I: pwq's */ 237 int flush_color; /* WQ: current flush color */
215 struct list_head list; /* W: list of all workqueues */
216
217 struct mutex flush_mutex; /* protects wq flushing */
218 int work_color; /* F: current work color */
219 int flush_color; /* F: current flush color */
220 atomic_t nr_pwqs_to_flush; /* flush in progress */ 238 atomic_t nr_pwqs_to_flush; /* flush in progress */
221 struct wq_flusher *first_flusher; /* F: first flusher */ 239 struct wq_flusher *first_flusher; /* WQ: first flusher */
222 struct list_head flusher_queue; /* F: flush waiters */ 240 struct list_head flusher_queue; /* WQ: flush waiters */
223 struct list_head flusher_overflow; /* F: flush overflow list */ 241 struct list_head flusher_overflow; /* WQ: flush overflow list */
224 242
225 mayday_mask_t mayday_mask; /* cpus requesting rescue */ 243 struct list_head maydays; /* MD: pwqs requesting rescue */
226 struct worker *rescuer; /* I: rescue worker */ 244 struct worker *rescuer; /* I: rescue worker */
227 245
228 int nr_drainers; /* W: drain in progress */ 246 int nr_drainers; /* WQ: drain in progress */
229 int saved_max_active; /* W: saved pwq max_active */ 247 int saved_max_active; /* WQ: saved pwq max_active */
248
249 struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */
250 struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */
251
252#ifdef CONFIG_SYSFS
253 struct wq_device *wq_dev; /* I: for sysfs interface */
254#endif
230#ifdef CONFIG_LOCKDEP 255#ifdef CONFIG_LOCKDEP
231 struct lockdep_map lockdep_map; 256 struct lockdep_map lockdep_map;
232#endif 257#endif
233 char name[]; /* I: workqueue name */ 258 char name[WQ_NAME_LEN]; /* I: workqueue name */
259
260 /* hot fields used during command issue, aligned to cacheline */
261 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
262 struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
263 struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
234}; 264};
235 265
266static struct kmem_cache *pwq_cache;
267
268static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
269static cpumask_var_t *wq_numa_possible_cpumask;
270 /* possible CPUs of each node */
271
272static bool wq_disable_numa;
273module_param_named(disable_numa, wq_disable_numa, bool, 0444);
274
275static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
276
277/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
278static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
279
280static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
281static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
282
283static LIST_HEAD(workqueues); /* PL: list of all workqueues */
284static bool workqueue_freezing; /* PL: have wqs started freezing? */
285
286/* the per-cpu worker pools */
287static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
288 cpu_worker_pools);
289
290static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
291
292/* PL: hash of all unbound pools keyed by pool->attrs */
293static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
294
295/* I: attributes used when instantiating standard unbound pools on demand */
296static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
297
236struct workqueue_struct *system_wq __read_mostly; 298struct workqueue_struct *system_wq __read_mostly;
237EXPORT_SYMBOL_GPL(system_wq); 299EXPORT_SYMBOL_GPL(system_wq);
238struct workqueue_struct *system_highpri_wq __read_mostly; 300struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -244,64 +306,87 @@ EXPORT_SYMBOL_GPL(system_unbound_wq);
244struct workqueue_struct *system_freezable_wq __read_mostly; 306struct workqueue_struct *system_freezable_wq __read_mostly;
245EXPORT_SYMBOL_GPL(system_freezable_wq); 307EXPORT_SYMBOL_GPL(system_freezable_wq);
246 308
309static int worker_thread(void *__worker);
310static void copy_workqueue_attrs(struct workqueue_attrs *to,
311 const struct workqueue_attrs *from);
312
247#define CREATE_TRACE_POINTS 313#define CREATE_TRACE_POINTS
248#include <trace/events/workqueue.h> 314#include <trace/events/workqueue.h>
249 315
250#define for_each_std_worker_pool(pool, cpu) \ 316#define assert_rcu_or_pool_mutex() \
251 for ((pool) = &std_worker_pools(cpu)[0]; \ 317 rcu_lockdep_assert(rcu_read_lock_sched_held() || \
252 (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) 318 lockdep_is_held(&wq_pool_mutex), \
319 "sched RCU or wq_pool_mutex should be held")
253 320
254#define for_each_busy_worker(worker, i, pool) \ 321#define assert_rcu_or_wq_mutex(wq) \
255 hash_for_each(pool->busy_hash, i, worker, hentry) 322 rcu_lockdep_assert(rcu_read_lock_sched_held() || \
323 lockdep_is_held(&wq->mutex), \
324 "sched RCU or wq->mutex should be held")
256 325
257static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, 326#ifdef CONFIG_LOCKDEP
258 unsigned int sw) 327#define assert_manager_or_pool_lock(pool) \
259{ 328 WARN_ONCE(debug_locks && \
260 if (cpu < nr_cpu_ids) { 329 !lockdep_is_held(&(pool)->manager_mutex) && \
261 if (sw & 1) { 330 !lockdep_is_held(&(pool)->lock), \
262 cpu = cpumask_next(cpu, mask); 331 "pool->manager_mutex or ->lock should be held")
263 if (cpu < nr_cpu_ids) 332#else
264 return cpu; 333#define assert_manager_or_pool_lock(pool) do { } while (0)
265 } 334#endif
266 if (sw & 2)
267 return WORK_CPU_UNBOUND;
268 }
269 return WORK_CPU_END;
270}
271 335
272static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, 336#define for_each_cpu_worker_pool(pool, cpu) \
273 struct workqueue_struct *wq) 337 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
274{ 338 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
275 return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); 339 (pool)++)
276}
277 340
278/* 341/**
279 * CPU iterators 342 * for_each_pool - iterate through all worker_pools in the system
343 * @pool: iteration cursor
344 * @pi: integer used for iteration
280 * 345 *
281 * An extra cpu number is defined using an invalid cpu number 346 * This must be called either with wq_pool_mutex held or sched RCU read
282 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any 347 * locked. If the pool needs to be used beyond the locking in effect, the
283 * specific CPU. The following iterators are similar to for_each_*_cpu() 348 * caller is responsible for guaranteeing that the pool stays online.
284 * iterators but also considers the unbound CPU.
285 * 349 *
286 * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND 350 * The if/else clause exists only for the lockdep assertion and can be
287 * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND 351 * ignored.
288 * for_each_pwq_cpu() : possible CPUs for bound workqueues,
289 * WORK_CPU_UNBOUND for unbound workqueues
290 */ 352 */
291#define for_each_wq_cpu(cpu) \ 353#define for_each_pool(pool, pi) \
292 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ 354 idr_for_each_entry(&worker_pool_idr, pool, pi) \
293 (cpu) < WORK_CPU_END; \ 355 if (({ assert_rcu_or_pool_mutex(); false; })) { } \
294 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) 356 else
295 357
296#define for_each_online_wq_cpu(cpu) \ 358/**
297 for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \ 359 * for_each_pool_worker - iterate through all workers of a worker_pool
298 (cpu) < WORK_CPU_END; \ 360 * @worker: iteration cursor
299 (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) 361 * @wi: integer used for iteration
362 * @pool: worker_pool to iterate workers of
363 *
364 * This must be called with either @pool->manager_mutex or ->lock held.
365 *
366 * The if/else clause exists only for the lockdep assertion and can be
367 * ignored.
368 */
369#define for_each_pool_worker(worker, wi, pool) \
370 idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \
371 if (({ assert_manager_or_pool_lock((pool)); false; })) { } \
372 else
300 373
301#define for_each_pwq_cpu(cpu, wq) \ 374/**
302 for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \ 375 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
303 (cpu) < WORK_CPU_END; \ 376 * @pwq: iteration cursor
304 (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq))) 377 * @wq: the target workqueue
378 *
379 * This must be called either with wq->mutex held or sched RCU read locked.
380 * If the pwq needs to be used beyond the locking in effect, the caller is
381 * responsible for guaranteeing that the pwq stays online.
382 *
383 * The if/else clause exists only for the lockdep assertion and can be
384 * ignored.
385 */
386#define for_each_pwq(pwq, wq) \
387 list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \
388 if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
389 else
305 390
306#ifdef CONFIG_DEBUG_OBJECTS_WORK 391#ifdef CONFIG_DEBUG_OBJECTS_WORK
307 392
@@ -419,76 +504,35 @@ static inline void debug_work_activate(struct work_struct *work) { }
419static inline void debug_work_deactivate(struct work_struct *work) { } 504static inline void debug_work_deactivate(struct work_struct *work) { }
420#endif 505#endif
421 506
422/* Serializes the accesses to the list of workqueues. */
423static DEFINE_SPINLOCK(workqueue_lock);
424static LIST_HEAD(workqueues);
425static bool workqueue_freezing; /* W: have wqs started freezing? */
426
427/*
428 * The CPU and unbound standard worker pools. The unbound ones have
429 * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
430 */
431static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
432 cpu_std_worker_pools);
433static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
434
435/* idr of all pools */
436static DEFINE_MUTEX(worker_pool_idr_mutex);
437static DEFINE_IDR(worker_pool_idr);
438
439static int worker_thread(void *__worker);
440
441static struct worker_pool *std_worker_pools(int cpu)
442{
443 if (cpu != WORK_CPU_UNBOUND)
444 return per_cpu(cpu_std_worker_pools, cpu);
445 else
446 return unbound_std_worker_pools;
447}
448
449static int std_worker_pool_pri(struct worker_pool *pool)
450{
451 return pool - std_worker_pools(pool->cpu);
452}
453
454/* allocate ID and assign it to @pool */ 507/* allocate ID and assign it to @pool */
455static int worker_pool_assign_id(struct worker_pool *pool) 508static int worker_pool_assign_id(struct worker_pool *pool)
456{ 509{
457 int ret; 510 int ret;
458 511
459 mutex_lock(&worker_pool_idr_mutex); 512 lockdep_assert_held(&wq_pool_mutex);
460 idr_pre_get(&worker_pool_idr, GFP_KERNEL);
461 ret = idr_get_new(&worker_pool_idr, pool, &pool->id);
462 mutex_unlock(&worker_pool_idr_mutex);
463 513
514 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
515 if (ret >= 0) {
516 pool->id = ret;
517 return 0;
518 }
464 return ret; 519 return ret;
465} 520}
466 521
467/* 522/**
468 * Lookup worker_pool by id. The idr currently is built during boot and 523 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
469 * never modified. Don't worry about locking for now. 524 * @wq: the target workqueue
525 * @node: the node ID
526 *
527 * This must be called either with pwq_lock held or sched RCU read locked.
528 * If the pwq needs to be used beyond the locking in effect, the caller is
529 * responsible for guaranteeing that the pwq stays online.
470 */ 530 */
471static struct worker_pool *worker_pool_by_id(int pool_id) 531static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
532 int node)
472{ 533{
473 return idr_find(&worker_pool_idr, pool_id); 534 assert_rcu_or_wq_mutex(wq);
474} 535 return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
475
476static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
477{
478 struct worker_pool *pools = std_worker_pools(cpu);
479
480 return &pools[highpri];
481}
482
483static struct pool_workqueue *get_pwq(unsigned int cpu,
484 struct workqueue_struct *wq)
485{
486 if (!(wq->flags & WQ_UNBOUND)) {
487 if (likely(cpu < nr_cpu_ids))
488 return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
489 } else if (likely(cpu == WORK_CPU_UNBOUND))
490 return wq->pool_wq.single;
491 return NULL;
492} 536}
493 537
494static unsigned int work_color_to_flags(int color) 538static unsigned int work_color_to_flags(int color)
@@ -530,7 +574,7 @@ static int work_next_color(int color)
530static inline void set_work_data(struct work_struct *work, unsigned long data, 574static inline void set_work_data(struct work_struct *work, unsigned long data,
531 unsigned long flags) 575 unsigned long flags)
532{ 576{
533 BUG_ON(!work_pending(work)); 577 WARN_ON_ONCE(!work_pending(work));
534 atomic_long_set(&work->data, data | flags | work_static(work)); 578 atomic_long_set(&work->data, data | flags | work_static(work));
535} 579}
536 580
@@ -582,13 +626,23 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
582 * @work: the work item of interest 626 * @work: the work item of interest
583 * 627 *
584 * Return the worker_pool @work was last associated with. %NULL if none. 628 * Return the worker_pool @work was last associated with. %NULL if none.
629 *
 630 * Pools are created and destroyed under wq_pool_mutex, and read access
 631 * is allowed under sched-RCU read lock. As such, this function should be
632 * called under wq_pool_mutex or with preemption disabled.
633 *
634 * All fields of the returned pool are accessible as long as the above
635 * mentioned locking is in effect. If the returned pool needs to be used
636 * beyond the critical section, the caller is responsible for ensuring the
637 * returned pool is and stays online.
585 */ 638 */
586static struct worker_pool *get_work_pool(struct work_struct *work) 639static struct worker_pool *get_work_pool(struct work_struct *work)
587{ 640{
588 unsigned long data = atomic_long_read(&work->data); 641 unsigned long data = atomic_long_read(&work->data);
589 struct worker_pool *pool;
590 int pool_id; 642 int pool_id;
591 643
644 assert_rcu_or_pool_mutex();
645
592 if (data & WORK_STRUCT_PWQ) 646 if (data & WORK_STRUCT_PWQ)
593 return ((struct pool_workqueue *) 647 return ((struct pool_workqueue *)
594 (data & WORK_STRUCT_WQ_DATA_MASK))->pool; 648 (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
@@ -597,9 +651,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
597 if (pool_id == WORK_OFFQ_POOL_NONE) 651 if (pool_id == WORK_OFFQ_POOL_NONE)
598 return NULL; 652 return NULL;
599 653
600 pool = worker_pool_by_id(pool_id); 654 return idr_find(&worker_pool_idr, pool_id);
601 WARN_ON_ONCE(!pool);
602 return pool;
603} 655}
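
To make the contract above concrete, a caller that only needs to peek at the pool can rely on sched-RCU rather than wq_pool_mutex. Illustrative sketch only; report_last_pool() is a made-up name:

static void report_last_pool(struct work_struct *work)
{
	struct worker_pool *pool;

	rcu_read_lock_sched();		/* keeps the pool from being freed */
	pool = get_work_pool(work);
	if (pool)
		pr_info("work %p last ran on pool %d\n", work, pool->id);
	rcu_read_unlock_sched();	/* pool must not be used past this point */
}
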
604 656
605/** 657/**
@@ -688,7 +740,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)
688/* Do we have too many workers and should some go away? */ 740/* Do we have too many workers and should some go away? */
689static bool too_many_workers(struct worker_pool *pool) 741static bool too_many_workers(struct worker_pool *pool)
690{ 742{
691 bool managing = pool->flags & POOL_MANAGING_WORKERS; 743 bool managing = mutex_is_locked(&pool->manager_arb);
692 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 744 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
693 int nr_busy = pool->nr_workers - nr_idle; 745 int nr_busy = pool->nr_workers - nr_idle;
694 746
@@ -743,7 +795,7 @@ static void wake_up_worker(struct worker_pool *pool)
743 * CONTEXT: 795 * CONTEXT:
744 * spin_lock_irq(rq->lock) 796 * spin_lock_irq(rq->lock)
745 */ 797 */
746void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) 798void wq_worker_waking_up(struct task_struct *task, int cpu)
747{ 799{
748 struct worker *worker = kthread_data(task); 800 struct worker *worker = kthread_data(task);
749 801
@@ -768,8 +820,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
768 * RETURNS: 820 * RETURNS:
769 * Worker task on @cpu to wake up, %NULL if none. 821 * Worker task on @cpu to wake up, %NULL if none.
770 */ 822 */
771struct task_struct *wq_worker_sleeping(struct task_struct *task, 823struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
772 unsigned int cpu)
773{ 824{
774 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 825 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
775 struct worker_pool *pool; 826 struct worker_pool *pool;
@@ -785,7 +836,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
785 pool = worker->pool; 836 pool = worker->pool;
786 837
787 /* this can only happen on the local cpu */ 838 /* this can only happen on the local cpu */
788 BUG_ON(cpu != raw_smp_processor_id()); 839 if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
840 return NULL;
789 841
790 /* 842 /*
791 * The counterpart of the following dec_and_test, implied mb, 843 * The counterpart of the following dec_and_test, implied mb,
@@ -890,13 +942,12 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
890 * recycled work item as currently executing and make it wait until the 942 * recycled work item as currently executing and make it wait until the
891 * current execution finishes, introducing an unwanted dependency. 943 * current execution finishes, introducing an unwanted dependency.
892 * 944 *
893 * This function checks the work item address, work function and workqueue 945 * This function checks the work item address and work function to avoid
894 * to avoid false positives. Note that this isn't complete as one may 946 * false positives. Note that this isn't complete as one may construct a
895 * construct a work function which can introduce dependency onto itself 947 * work function which can introduce dependency onto itself through a
896 * through a recycled work item. Well, if somebody wants to shoot oneself 948 * recycled work item. Well, if somebody wants to shoot oneself in the
897 * in the foot that badly, there's only so much we can do, and if such 949 * foot that badly, there's only so much we can do, and if such deadlock
898 * deadlock actually occurs, it should be easy to locate the culprit work 950 * actually occurs, it should be easy to locate the culprit work function.
899 * function.
900 * 951 *
901 * CONTEXT: 952 * CONTEXT:
902 * spin_lock_irq(pool->lock). 953 * spin_lock_irq(pool->lock).
@@ -960,6 +1011,64 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
960 *nextp = n; 1011 *nextp = n;
961} 1012}
962 1013
1014/**
1015 * get_pwq - get an extra reference on the specified pool_workqueue
1016 * @pwq: pool_workqueue to get
1017 *
1018 * Obtain an extra reference on @pwq. The caller should guarantee that
1019 * @pwq has positive refcnt and be holding the matching pool->lock.
1020 */
1021static void get_pwq(struct pool_workqueue *pwq)
1022{
1023 lockdep_assert_held(&pwq->pool->lock);
1024 WARN_ON_ONCE(pwq->refcnt <= 0);
1025 pwq->refcnt++;
1026}
1027
1028/**
1029 * put_pwq - put a pool_workqueue reference
1030 * @pwq: pool_workqueue to put
1031 *
1032 * Drop a reference of @pwq. If its refcnt reaches zero, schedule its
1033 * destruction. The caller should be holding the matching pool->lock.
1034 */
1035static void put_pwq(struct pool_workqueue *pwq)
1036{
1037 lockdep_assert_held(&pwq->pool->lock);
1038 if (likely(--pwq->refcnt))
1039 return;
1040 if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
1041 return;
1042 /*
1043 * @pwq can't be released under pool->lock, bounce to
1044 * pwq_unbound_release_workfn(). This never recurses on the same
1045 * pool->lock as this path is taken only for unbound workqueues and
1046 * the release work item is scheduled on a per-cpu workqueue. To
1047 * avoid lockdep warning, unbound pool->locks are given lockdep
1048 * subclass of 1 in get_unbound_pool().
1049 */
1050 schedule_work(&pwq->unbound_release_work);
1051}
1052
1053/**
1054 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
1055 * @pwq: pool_workqueue to put (can be %NULL)
1056 *
1057 * put_pwq() with locking. This function also allows %NULL @pwq.
1058 */
1059static void put_pwq_unlocked(struct pool_workqueue *pwq)
1060{
1061 if (pwq) {
1062 /*
1063 * As both pwqs and pools are sched-RCU protected, the
1064 * following lock operations are safe.
1065 */
1066 spin_lock_irq(&pwq->pool->lock);
1067 put_pwq(pwq);
1068 spin_unlock_irq(&pwq->pool->lock);
1069 }
1070}
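
These three helpers give every queued work item a reference on its pool_workqueue: insert_work() takes one with get_pwq() after pool->lock is acquired, and pwq_dec_nr_in_flight() drops it once the item has run, so an unbound pwq can only be released when nothing is queued on or running from it. A condensed sketch of the pairing, assuming the helpers above (not a real code path by itself):

static void pwq_ref_pairing_sketch(struct pool_workqueue *pwq,
				   struct work_struct *work)
{
	spin_lock_irq(&pwq->pool->lock);
	get_pwq(pwq);				/* queued item pins the pwq */
	list_add_tail(&work->entry, &pwq->pool->worklist);
	spin_unlock_irq(&pwq->pool->lock);

	/* ... a worker picks the item up and runs it ... */

	spin_lock_irq(&pwq->pool->lock);
	/* drops the reference taken above via put_pwq() */
	pwq_dec_nr_in_flight(pwq, get_work_color(work));
	spin_unlock_irq(&pwq->pool->lock);
}
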
1071
963static void pwq_activate_delayed_work(struct work_struct *work) 1072static void pwq_activate_delayed_work(struct work_struct *work)
964{ 1073{
965 struct pool_workqueue *pwq = get_work_pwq(work); 1074 struct pool_workqueue *pwq = get_work_pwq(work);
@@ -991,9 +1100,9 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
991 */ 1100 */
992static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) 1101static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
993{ 1102{
994 /* ignore uncolored works */ 1103 /* uncolored work items don't participate in flushing or nr_active */
995 if (color == WORK_NO_COLOR) 1104 if (color == WORK_NO_COLOR)
996 return; 1105 goto out_put;
997 1106
998 pwq->nr_in_flight[color]--; 1107 pwq->nr_in_flight[color]--;
999 1108
@@ -1006,11 +1115,11 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1006 1115
1007 /* is flush in progress and are we at the flushing tip? */ 1116 /* is flush in progress and are we at the flushing tip? */
1008 if (likely(pwq->flush_color != color)) 1117 if (likely(pwq->flush_color != color))
1009 return; 1118 goto out_put;
1010 1119
1011 /* are there still in-flight works? */ 1120 /* are there still in-flight works? */
1012 if (pwq->nr_in_flight[color]) 1121 if (pwq->nr_in_flight[color])
1013 return; 1122 goto out_put;
1014 1123
1015 /* this pwq is done, clear flush_color */ 1124 /* this pwq is done, clear flush_color */
1016 pwq->flush_color = -1; 1125 pwq->flush_color = -1;
@@ -1021,6 +1130,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1021 */ 1130 */
1022 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) 1131 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1023 complete(&pwq->wq->first_flusher->done); 1132 complete(&pwq->wq->first_flusher->done);
1133out_put:
1134 put_pwq(pwq);
1024} 1135}
1025 1136
1026/** 1137/**
@@ -1143,11 +1254,12 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
1143 /* we own @work, set data and link */ 1254 /* we own @work, set data and link */
1144 set_work_pwq(work, pwq, extra_flags); 1255 set_work_pwq(work, pwq, extra_flags);
1145 list_add_tail(&work->entry, head); 1256 list_add_tail(&work->entry, head);
1257 get_pwq(pwq);
1146 1258
1147 /* 1259 /*
1148 * Ensure either worker_sched_deactivated() sees the above 1260 * Ensure either wq_worker_sleeping() sees the above
1149 * list_add_tail() or we see zero nr_running to avoid workers 1261 * list_add_tail() or we see zero nr_running to avoid workers lying
1150 * lying around lazily while there are works to be processed. 1262 * around lazily while there are works to be processed.
1151 */ 1263 */
1152 smp_mb(); 1264 smp_mb();
1153 1265
@@ -1171,10 +1283,11 @@ static bool is_chained_work(struct workqueue_struct *wq)
1171 return worker && worker->current_pwq->wq == wq; 1283 return worker && worker->current_pwq->wq == wq;
1172} 1284}
1173 1285
1174static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 1286static void __queue_work(int cpu, struct workqueue_struct *wq,
1175 struct work_struct *work) 1287 struct work_struct *work)
1176{ 1288{
1177 struct pool_workqueue *pwq; 1289 struct pool_workqueue *pwq;
1290 struct worker_pool *last_pool;
1178 struct list_head *worklist; 1291 struct list_head *worklist;
1179 unsigned int work_flags; 1292 unsigned int work_flags;
1180 unsigned int req_cpu = cpu; 1293 unsigned int req_cpu = cpu;
@@ -1190,48 +1303,62 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1190 debug_work_activate(work); 1303 debug_work_activate(work);
1191 1304
1192 /* if dying, only works from the same workqueue are allowed */ 1305 /* if dying, only works from the same workqueue are allowed */
1193 if (unlikely(wq->flags & WQ_DRAINING) && 1306 if (unlikely(wq->flags & __WQ_DRAINING) &&
1194 WARN_ON_ONCE(!is_chained_work(wq))) 1307 WARN_ON_ONCE(!is_chained_work(wq)))
1195 return; 1308 return;
1309retry:
1310 if (req_cpu == WORK_CPU_UNBOUND)
1311 cpu = raw_smp_processor_id();
1196 1312
1197 /* determine the pwq to use */ 1313 /* pwq which will be used unless @work is executing elsewhere */
1198 if (!(wq->flags & WQ_UNBOUND)) { 1314 if (!(wq->flags & WQ_UNBOUND))
1199 struct worker_pool *last_pool; 1315 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
1200 1316 else
1201 if (cpu == WORK_CPU_UNBOUND) 1317 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
1202 cpu = raw_smp_processor_id();
1203
1204 /*
1205 * It's multi cpu. If @work was previously on a different
1206 * cpu, it might still be running there, in which case the
1207 * work needs to be queued on that cpu to guarantee
1208 * non-reentrancy.
1209 */
1210 pwq = get_pwq(cpu, wq);
1211 last_pool = get_work_pool(work);
1212 1318
1213 if (last_pool && last_pool != pwq->pool) { 1319 /*
1214 struct worker *worker; 1320 * If @work was previously on a different pool, it might still be
1321 * running there, in which case the work needs to be queued on that
1322 * pool to guarantee non-reentrancy.
1323 */
1324 last_pool = get_work_pool(work);
1325 if (last_pool && last_pool != pwq->pool) {
1326 struct worker *worker;
1215 1327
1216 spin_lock(&last_pool->lock); 1328 spin_lock(&last_pool->lock);
1217 1329
1218 worker = find_worker_executing_work(last_pool, work); 1330 worker = find_worker_executing_work(last_pool, work);
1219 1331
1220 if (worker && worker->current_pwq->wq == wq) { 1332 if (worker && worker->current_pwq->wq == wq) {
1221 pwq = get_pwq(last_pool->cpu, wq); 1333 pwq = worker->current_pwq;
1222 } else {
1223 /* meh... not running there, queue here */
1224 spin_unlock(&last_pool->lock);
1225 spin_lock(&pwq->pool->lock);
1226 }
1227 } else { 1334 } else {
1335 /* meh... not running there, queue here */
1336 spin_unlock(&last_pool->lock);
1228 spin_lock(&pwq->pool->lock); 1337 spin_lock(&pwq->pool->lock);
1229 } 1338 }
1230 } else { 1339 } else {
1231 pwq = get_pwq(WORK_CPU_UNBOUND, wq);
1232 spin_lock(&pwq->pool->lock); 1340 spin_lock(&pwq->pool->lock);
1233 } 1341 }
1234 1342
1343 /*
1344 * pwq is determined and locked. For unbound pools, we could have
1345 * raced with pwq release and it could already be dead. If its
 1346 * refcnt is zero, repeat pwq selection. Note that a pwq never dies
 1347 * without another pwq replacing it in the numa_pwq_tbl or while
 1348 * work items are executing on it, so the retrying is guaranteed to
 1349 * make forward progress.
1350 */
1351 if (unlikely(!pwq->refcnt)) {
1352 if (wq->flags & WQ_UNBOUND) {
1353 spin_unlock(&pwq->pool->lock);
1354 cpu_relax();
1355 goto retry;
1356 }
1357 /* oops */
1358 WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
1359 wq->name, cpu);
1360 }
1361
1235 /* pwq determined, queue */ 1362 /* pwq determined, queue */
1236 trace_workqueue_queue_work(req_cpu, pwq, work); 1363 trace_workqueue_queue_work(req_cpu, pwq, work);
1237 1364
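
Stripped to its control flow, the rewritten queueing path is an optimistic retry loop: pick a pwq without any wq-wide lock, grab its pool->lock, and if the pwq turned out to be dying (zero refcnt, which can only happen for unbound workqueues) back off and pick again. A reduced sketch; select_pwq() is a made-up stand-in for the per-cpu/NUMA lookup shown above:

retry:
	pwq = select_pwq(wq, cpu);		/* cpu_pwqs or numa_pwq_tbl[] */
	spin_lock(&pwq->pool->lock);
	if (unlikely(!pwq->refcnt) && (wq->flags & WQ_UNBOUND)) {
		spin_unlock(&pwq->pool->lock);
		cpu_relax();
		goto retry;		/* a replacement pwq is guaranteed to appear */
	}
	insert_work(pwq, work, worklist, work_flags);
	spin_unlock(&pwq->pool->lock);
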
@@ -1286,22 +1413,6 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
1286} 1413}
1287EXPORT_SYMBOL_GPL(queue_work_on); 1414EXPORT_SYMBOL_GPL(queue_work_on);
1288 1415
1289/**
1290 * queue_work - queue work on a workqueue
1291 * @wq: workqueue to use
1292 * @work: work to queue
1293 *
1294 * Returns %false if @work was already on a queue, %true otherwise.
1295 *
1296 * We queue the work to the CPU on which it was submitted, but if the CPU dies
1297 * it can be processed by another CPU.
1298 */
1299bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
1300{
1301 return queue_work_on(WORK_CPU_UNBOUND, wq, work);
1302}
1303EXPORT_SYMBOL_GPL(queue_work);
1304
1305void delayed_work_timer_fn(unsigned long __data) 1416void delayed_work_timer_fn(unsigned long __data)
1306{ 1417{
1307 struct delayed_work *dwork = (struct delayed_work *)__data; 1418 struct delayed_work *dwork = (struct delayed_work *)__data;
@@ -1377,21 +1488,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1377EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1488EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1378 1489
1379/** 1490/**
1380 * queue_delayed_work - queue work on a workqueue after delay
1381 * @wq: workqueue to use
1382 * @dwork: delayable work to queue
1383 * @delay: number of jiffies to wait before queueing
1384 *
1385 * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
1386 */
1387bool queue_delayed_work(struct workqueue_struct *wq,
1388 struct delayed_work *dwork, unsigned long delay)
1389{
1390 return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1391}
1392EXPORT_SYMBOL_GPL(queue_delayed_work);
1393
1394/**
1395 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU 1491 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
1396 * @cpu: CPU number to execute work on 1492 * @cpu: CPU number to execute work on
1397 * @wq: workqueue to use 1493 * @wq: workqueue to use
@@ -1430,21 +1526,6 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
1430EXPORT_SYMBOL_GPL(mod_delayed_work_on); 1526EXPORT_SYMBOL_GPL(mod_delayed_work_on);
1431 1527
1432/** 1528/**
1433 * mod_delayed_work - modify delay of or queue a delayed work
1434 * @wq: workqueue to use
1435 * @dwork: work to queue
1436 * @delay: number of jiffies to wait before queueing
1437 *
1438 * mod_delayed_work_on() on local CPU.
1439 */
1440bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
1441 unsigned long delay)
1442{
1443 return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1444}
1445EXPORT_SYMBOL_GPL(mod_delayed_work);
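
queue_work(), queue_delayed_work() and mod_delayed_work() disappear from workqueue.c because they are one-line wrappers around their _on() variants with WORK_CPU_UNBOUND; presumably they live on as static inlines in include/linux/workqueue.h (the header change is not shown here). Their equivalents are simply:

static inline bool queue_work(struct workqueue_struct *wq,
			      struct work_struct *work)
{
	return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}

static inline bool queue_delayed_work(struct workqueue_struct *wq,
				      struct delayed_work *dwork,
				      unsigned long delay)
{
	return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}

static inline bool mod_delayed_work(struct workqueue_struct *wq,
				    struct delayed_work *dwork,
				    unsigned long delay)
{
	return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}
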
1446
1447/**
1448 * worker_enter_idle - enter idle state 1529 * worker_enter_idle - enter idle state
1449 * @worker: worker which is entering idle state 1530 * @worker: worker which is entering idle state
1450 * 1531 *
@@ -1458,9 +1539,10 @@ static void worker_enter_idle(struct worker *worker)
1458{ 1539{
1459 struct worker_pool *pool = worker->pool; 1540 struct worker_pool *pool = worker->pool;
1460 1541
1461 BUG_ON(worker->flags & WORKER_IDLE); 1542 if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
1462 BUG_ON(!list_empty(&worker->entry) && 1543 WARN_ON_ONCE(!list_empty(&worker->entry) &&
1463 (worker->hentry.next || worker->hentry.pprev)); 1544 (worker->hentry.next || worker->hentry.pprev)))
1545 return;
1464 1546
1465 /* can't use worker_set_flags(), also called from start_worker() */ 1547 /* can't use worker_set_flags(), also called from start_worker() */
1466 worker->flags |= WORKER_IDLE; 1548 worker->flags |= WORKER_IDLE;
@@ -1497,22 +1579,25 @@ static void worker_leave_idle(struct worker *worker)
1497{ 1579{
1498 struct worker_pool *pool = worker->pool; 1580 struct worker_pool *pool = worker->pool;
1499 1581
1500 BUG_ON(!(worker->flags & WORKER_IDLE)); 1582 if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
1583 return;
1501 worker_clr_flags(worker, WORKER_IDLE); 1584 worker_clr_flags(worker, WORKER_IDLE);
1502 pool->nr_idle--; 1585 pool->nr_idle--;
1503 list_del_init(&worker->entry); 1586 list_del_init(&worker->entry);
1504} 1587}
1505 1588
1506/** 1589/**
1507 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool 1590 * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
1508 * @worker: self 1591 * @pool: target worker_pool
1592 *
1593 * Bind %current to the cpu of @pool if it is associated and lock @pool.
1509 * 1594 *
1510 * Works which are scheduled while the cpu is online must at least be 1595 * Works which are scheduled while the cpu is online must at least be
1511 * scheduled to a worker which is bound to the cpu so that if they are 1596 * scheduled to a worker which is bound to the cpu so that if they are
1512 * flushed from cpu callbacks while cpu is going down, they are 1597 * flushed from cpu callbacks while cpu is going down, they are
1513 * guaranteed to execute on the cpu. 1598 * guaranteed to execute on the cpu.
1514 * 1599 *
1515 * This function is to be used by rogue workers and rescuers to bind 1600 * This function is to be used by unbound workers and rescuers to bind
1516 * themselves to the target cpu and may race with cpu going down or 1601 * themselves to the target cpu and may race with cpu going down or
1517 * coming online. kthread_bind() can't be used because it may put the 1602 * coming online. kthread_bind() can't be used because it may put the
1518 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used 1603 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
@@ -1533,12 +1618,9 @@ static void worker_leave_idle(struct worker *worker)
1533 * %true if the associated pool is online (@worker is successfully 1618 * %true if the associated pool is online (@worker is successfully
1534 * bound), %false if offline. 1619 * bound), %false if offline.
1535 */ 1620 */
1536static bool worker_maybe_bind_and_lock(struct worker *worker) 1621static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
1537__acquires(&pool->lock) 1622__acquires(&pool->lock)
1538{ 1623{
1539 struct worker_pool *pool = worker->pool;
1540 struct task_struct *task = worker->task;
1541
1542 while (true) { 1624 while (true) {
1543 /* 1625 /*
1544 * The following call may fail, succeed or succeed 1626 * The following call may fail, succeed or succeed
@@ -1547,14 +1629,13 @@ __acquires(&pool->lock)
1547 * against POOL_DISASSOCIATED. 1629 * against POOL_DISASSOCIATED.
1548 */ 1630 */
1549 if (!(pool->flags & POOL_DISASSOCIATED)) 1631 if (!(pool->flags & POOL_DISASSOCIATED))
1550 set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); 1632 set_cpus_allowed_ptr(current, pool->attrs->cpumask);
1551 1633
1552 spin_lock_irq(&pool->lock); 1634 spin_lock_irq(&pool->lock);
1553 if (pool->flags & POOL_DISASSOCIATED) 1635 if (pool->flags & POOL_DISASSOCIATED)
1554 return false; 1636 return false;
1555 if (task_cpu(task) == pool->cpu && 1637 if (task_cpu(current) == pool->cpu &&
1556 cpumask_equal(&current->cpus_allowed, 1638 cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
1557 get_cpu_mask(pool->cpu)))
1558 return true; 1639 return true;
1559 spin_unlock_irq(&pool->lock); 1640 spin_unlock_irq(&pool->lock);
1560 1641
@@ -1569,108 +1650,6 @@ __acquires(&pool->lock)
1569 } 1650 }
1570} 1651}
1571 1652
1572/*
1573 * Rebind an idle @worker to its CPU. worker_thread() will test
1574 * list_empty(@worker->entry) before leaving idle and call this function.
1575 */
1576static void idle_worker_rebind(struct worker *worker)
1577{
1578 /* CPU may go down again inbetween, clear UNBOUND only on success */
1579 if (worker_maybe_bind_and_lock(worker))
1580 worker_clr_flags(worker, WORKER_UNBOUND);
1581
1582 /* rebind complete, become available again */
1583 list_add(&worker->entry, &worker->pool->idle_list);
1584 spin_unlock_irq(&worker->pool->lock);
1585}
1586
1587/*
1588 * Function for @worker->rebind.work used to rebind unbound busy workers to
1589 * the associated cpu which is coming back online. This is scheduled by
1590 * cpu up but can race with other cpu hotplug operations and may be
1591 * executed twice without intervening cpu down.
1592 */
1593static void busy_worker_rebind_fn(struct work_struct *work)
1594{
1595 struct worker *worker = container_of(work, struct worker, rebind_work);
1596
1597 if (worker_maybe_bind_and_lock(worker))
1598 worker_clr_flags(worker, WORKER_UNBOUND);
1599
1600 spin_unlock_irq(&worker->pool->lock);
1601}
1602
1603/**
1604 * rebind_workers - rebind all workers of a pool to the associated CPU
1605 * @pool: pool of interest
1606 *
1607 * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding
1608 * is different for idle and busy ones.
1609 *
1610 * Idle ones will be removed from the idle_list and woken up. They will
1611 * add themselves back after completing rebind. This ensures that the
1612 * idle_list doesn't contain any unbound workers when re-bound busy workers
1613 * try to perform local wake-ups for concurrency management.
1614 *
1615 * Busy workers can rebind after they finish their current work items.
1616 * Queueing the rebind work item at the head of the scheduled list is
1617 * enough. Note that nr_running will be properly bumped as busy workers
1618 * rebind.
1619 *
1620 * On return, all non-manager workers are scheduled for rebind - see
1621 * manage_workers() for the manager special case. Any idle worker
1622 * including the manager will not appear on @idle_list until rebind is
1623 * complete, making local wake-ups safe.
1624 */
1625static void rebind_workers(struct worker_pool *pool)
1626{
1627 struct worker *worker, *n;
1628 int i;
1629
1630 lockdep_assert_held(&pool->assoc_mutex);
1631 lockdep_assert_held(&pool->lock);
1632
1633 /* dequeue and kick idle ones */
1634 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1635 /*
1636 * idle workers should be off @pool->idle_list until rebind
1637 * is complete to avoid receiving premature local wake-ups.
1638 */
1639 list_del_init(&worker->entry);
1640
1641 /*
1642 * worker_thread() will see the above dequeuing and call
1643 * idle_worker_rebind().
1644 */
1645 wake_up_process(worker->task);
1646 }
1647
1648 /* rebind busy workers */
1649 for_each_busy_worker(worker, i, pool) {
1650 struct work_struct *rebind_work = &worker->rebind_work;
1651 struct workqueue_struct *wq;
1652
1653 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1654 work_data_bits(rebind_work)))
1655 continue;
1656
1657 debug_work_activate(rebind_work);
1658
1659 /*
1660 * wq doesn't really matter but let's keep @worker->pool
1661 * and @pwq->pool consistent for sanity.
1662 */
1663 if (std_worker_pool_pri(worker->pool))
1664 wq = system_highpri_wq;
1665 else
1666 wq = system_wq;
1667
1668 insert_work(get_pwq(pool->cpu, wq), rebind_work,
1669 worker->scheduled.next,
1670 work_color_to_flags(WORK_NO_COLOR));
1671 }
1672}
1673
1674static struct worker *alloc_worker(void) 1653static struct worker *alloc_worker(void)
1675{ 1654{
1676 struct worker *worker; 1655 struct worker *worker;
@@ -1679,7 +1658,6 @@ static struct worker *alloc_worker(void)
1679 if (worker) { 1658 if (worker) {
1680 INIT_LIST_HEAD(&worker->entry); 1659 INIT_LIST_HEAD(&worker->entry);
1681 INIT_LIST_HEAD(&worker->scheduled); 1660 INIT_LIST_HEAD(&worker->scheduled);
1682 INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
1683 /* on creation a worker is in !idle && prep state */ 1661 /* on creation a worker is in !idle && prep state */
1684 worker->flags = WORKER_PREP; 1662 worker->flags = WORKER_PREP;
1685 } 1663 }
@@ -1702,18 +1680,25 @@ static struct worker *alloc_worker(void)
1702 */ 1680 */
1703static struct worker *create_worker(struct worker_pool *pool) 1681static struct worker *create_worker(struct worker_pool *pool)
1704{ 1682{
1705 const char *pri = std_worker_pool_pri(pool) ? "H" : "";
1706 struct worker *worker = NULL; 1683 struct worker *worker = NULL;
1707 int id = -1; 1684 int id = -1;
1685 char id_buf[16];
1686
1687 lockdep_assert_held(&pool->manager_mutex);
1708 1688
1689 /*
1690 * ID is needed to determine kthread name. Allocate ID first
1691 * without installing the pointer.
1692 */
1693 idr_preload(GFP_KERNEL);
1709 spin_lock_irq(&pool->lock); 1694 spin_lock_irq(&pool->lock);
1710 while (ida_get_new(&pool->worker_ida, &id)) { 1695
1711 spin_unlock_irq(&pool->lock); 1696 id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
1712 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) 1697
1713 goto fail;
1714 spin_lock_irq(&pool->lock);
1715 }
1716 spin_unlock_irq(&pool->lock); 1698 spin_unlock_irq(&pool->lock);
1699 idr_preload_end();
1700 if (id < 0)
1701 goto fail;
1717 1702
1718 worker = alloc_worker(); 1703 worker = alloc_worker();
1719 if (!worker) 1704 if (!worker)
@@ -1722,40 +1707,46 @@ static struct worker *create_worker(struct worker_pool *pool)
1722 worker->pool = pool; 1707 worker->pool = pool;
1723 worker->id = id; 1708 worker->id = id;
1724 1709
1725 if (pool->cpu != WORK_CPU_UNBOUND) 1710 if (pool->cpu >= 0)
1726 worker->task = kthread_create_on_node(worker_thread, 1711 snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
1727 worker, cpu_to_node(pool->cpu), 1712 pool->attrs->nice < 0 ? "H" : "");
1728 "kworker/%u:%d%s", pool->cpu, id, pri);
1729 else 1713 else
1730 worker->task = kthread_create(worker_thread, worker, 1714 snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
1731 "kworker/u:%d%s", id, pri); 1715
1716 worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
1717 "kworker/%s", id_buf);
1732 if (IS_ERR(worker->task)) 1718 if (IS_ERR(worker->task))
1733 goto fail; 1719 goto fail;
1734 1720
1735 if (std_worker_pool_pri(pool)) 1721 /*
1736 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); 1722 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1723 * online CPUs. It'll be re-applied when any of the CPUs come up.
1724 */
1725 set_user_nice(worker->task, pool->attrs->nice);
1726 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1727
1728 /* prevent userland from meddling with cpumask of workqueue workers */
1729 worker->task->flags |= PF_NO_SETAFFINITY;
1737 1730
1738 /* 1731 /*
1739 * Determine CPU binding of the new worker depending on 1732 * The caller is responsible for ensuring %POOL_DISASSOCIATED
1740 * %POOL_DISASSOCIATED. The caller is responsible for ensuring the 1733 * remains stable across this function. See the comments above the
1741 * flag remains stable across this function. See the comments 1734 * flag definition for details.
1742 * above the flag definition for details.
1743 *
1744 * As an unbound worker may later become a regular one if CPU comes
1745 * online, make sure every worker has %PF_THREAD_BOUND set.
1746 */ 1735 */
1747 if (!(pool->flags & POOL_DISASSOCIATED)) { 1736 if (pool->flags & POOL_DISASSOCIATED)
1748 kthread_bind(worker->task, pool->cpu);
1749 } else {
1750 worker->task->flags |= PF_THREAD_BOUND;
1751 worker->flags |= WORKER_UNBOUND; 1737 worker->flags |= WORKER_UNBOUND;
1752 } 1738
1739 /* successful, commit the pointer to idr */
1740 spin_lock_irq(&pool->lock);
1741 idr_replace(&pool->worker_idr, worker, worker->id);
1742 spin_unlock_irq(&pool->lock);
1753 1743
1754 return worker; 1744 return worker;
1745
1755fail: 1746fail:
1756 if (id >= 0) { 1747 if (id >= 0) {
1757 spin_lock_irq(&pool->lock); 1748 spin_lock_irq(&pool->lock);
1758 ida_remove(&pool->worker_ida, id); 1749 idr_remove(&pool->worker_idr, id);
1759 spin_unlock_irq(&pool->lock); 1750 spin_unlock_irq(&pool->lock);
1760 } 1751 }
1761 kfree(worker); 1752 kfree(worker);
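
The ID handling above is the standard idiom for allocating from an idr while holding a spinlock: idr_preload(GFP_KERNEL) charges the per-cpu preload cache while sleeping is still allowed, idr_alloc(..., GFP_NOWAIT) then reserves the ID under pool->lock with a NULL placeholder, and idr_replace() publishes the real pointer only once the worker is fully constructed. The same three-step pattern in generic form; struct foo, foo_lock and foo_idr are made-up names:

static int foo_register(struct foo *f)
{
	int id;

	idr_preload(GFP_KERNEL);		/* may sleep, fills preload cache */
	spin_lock_irq(&foo_lock);
	id = idr_alloc(&foo_idr, NULL, 0, 0, GFP_NOWAIT);	/* reserve ID */
	spin_unlock_irq(&foo_lock);
	idr_preload_end();
	if (id < 0)
		return id;

	f->id = id;				/* finish setup outside the lock */

	spin_lock_irq(&foo_lock);
	idr_replace(&foo_idr, f, id);		/* commit the real pointer */
	spin_unlock_irq(&foo_lock);
	return 0;
}
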
@@ -1780,6 +1771,30 @@ static void start_worker(struct worker *worker)
1780} 1771}
1781 1772
1782/** 1773/**
1774 * create_and_start_worker - create and start a worker for a pool
1775 * @pool: the target pool
1776 *
1777 * Grab the managership of @pool and create and start a new worker for it.
1778 */
1779static int create_and_start_worker(struct worker_pool *pool)
1780{
1781 struct worker *worker;
1782
1783 mutex_lock(&pool->manager_mutex);
1784
1785 worker = create_worker(pool);
1786 if (worker) {
1787 spin_lock_irq(&pool->lock);
1788 start_worker(worker);
1789 spin_unlock_irq(&pool->lock);
1790 }
1791
1792 mutex_unlock(&pool->manager_mutex);
1793
1794 return worker ? 0 : -ENOMEM;
1795}
1796
1797/**
1783 * destroy_worker - destroy a workqueue worker 1798 * destroy_worker - destroy a workqueue worker
1784 * @worker: worker to be destroyed 1799 * @worker: worker to be destroyed
1785 * 1800 *
@@ -1791,11 +1806,14 @@ static void start_worker(struct worker *worker)
1791static void destroy_worker(struct worker *worker) 1806static void destroy_worker(struct worker *worker)
1792{ 1807{
1793 struct worker_pool *pool = worker->pool; 1808 struct worker_pool *pool = worker->pool;
1794 int id = worker->id; 1809
1810 lockdep_assert_held(&pool->manager_mutex);
1811 lockdep_assert_held(&pool->lock);
1795 1812
1796 /* sanity check frenzy */ 1813 /* sanity check frenzy */
1797 BUG_ON(worker->current_work); 1814 if (WARN_ON(worker->current_work) ||
1798 BUG_ON(!list_empty(&worker->scheduled)); 1815 WARN_ON(!list_empty(&worker->scheduled)))
1816 return;
1799 1817
1800 if (worker->flags & WORKER_STARTED) 1818 if (worker->flags & WORKER_STARTED)
1801 pool->nr_workers--; 1819 pool->nr_workers--;
@@ -1805,13 +1823,14 @@ static void destroy_worker(struct worker *worker)
1805 list_del_init(&worker->entry); 1823 list_del_init(&worker->entry);
1806 worker->flags |= WORKER_DIE; 1824 worker->flags |= WORKER_DIE;
1807 1825
1826 idr_remove(&pool->worker_idr, worker->id);
1827
1808 spin_unlock_irq(&pool->lock); 1828 spin_unlock_irq(&pool->lock);
1809 1829
1810 kthread_stop(worker->task); 1830 kthread_stop(worker->task);
1811 kfree(worker); 1831 kfree(worker);
1812 1832
1813 spin_lock_irq(&pool->lock); 1833 spin_lock_irq(&pool->lock);
1814 ida_remove(&pool->worker_ida, id);
1815} 1834}
1816 1835
1817static void idle_worker_timeout(unsigned long __pool) 1836static void idle_worker_timeout(unsigned long __pool)
@@ -1840,23 +1859,21 @@ static void idle_worker_timeout(unsigned long __pool)
1840 spin_unlock_irq(&pool->lock); 1859 spin_unlock_irq(&pool->lock);
1841} 1860}
1842 1861
1843static bool send_mayday(struct work_struct *work) 1862static void send_mayday(struct work_struct *work)
1844{ 1863{
1845 struct pool_workqueue *pwq = get_work_pwq(work); 1864 struct pool_workqueue *pwq = get_work_pwq(work);
1846 struct workqueue_struct *wq = pwq->wq; 1865 struct workqueue_struct *wq = pwq->wq;
1847 unsigned int cpu;
1848 1866
1849 if (!(wq->flags & WQ_RESCUER)) 1867 lockdep_assert_held(&wq_mayday_lock);
1850 return false; 1868
1869 if (!wq->rescuer)
1870 return;
1851 1871
1852 /* mayday mayday mayday */ 1872 /* mayday mayday mayday */
1853 cpu = pwq->pool->cpu; 1873 if (list_empty(&pwq->mayday_node)) {
1854 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1874 list_add_tail(&pwq->mayday_node, &wq->maydays);
1855 if (cpu == WORK_CPU_UNBOUND)
1856 cpu = 0;
1857 if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1858 wake_up_process(wq->rescuer->task); 1875 wake_up_process(wq->rescuer->task);
1859 return true; 1876 }
1860} 1877}
1861 1878
1862static void pool_mayday_timeout(unsigned long __pool) 1879static void pool_mayday_timeout(unsigned long __pool)
@@ -1864,7 +1881,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1864 struct worker_pool *pool = (void *)__pool; 1881 struct worker_pool *pool = (void *)__pool;
1865 struct work_struct *work; 1882 struct work_struct *work;
1866 1883
1867 spin_lock_irq(&pool->lock); 1884 spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */
1885 spin_lock(&pool->lock);
1868 1886
1869 if (need_to_create_worker(pool)) { 1887 if (need_to_create_worker(pool)) {
1870 /* 1888 /*
@@ -1877,7 +1895,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1877 send_mayday(work); 1895 send_mayday(work);
1878 } 1896 }
1879 1897
1880 spin_unlock_irq(&pool->lock); 1898 spin_unlock(&pool->lock);
1899 spin_unlock_irq(&wq_mayday_lock);
1881 1900
1882 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1901 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1883} 1902}
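
The per-cpu mayday_mask is gone: a starving pwq now puts itself on wq->maydays under wq_mayday_lock, and the list_empty() test keeps it from being queued twice while an earlier mayday is still pending (the rescuer re-arms it with list_del_init()). The same queue-once idiom in isolation, with made-up names (struct node, pending_list, queue_lock, consumer_task):

static void request_help(struct node *n)
{
	spin_lock(&queue_lock);
	/* list_empty() works because the consumer uses list_del_init() */
	if (list_empty(&n->link)) {
		list_add_tail(&n->link, &pending_list);
		wake_up_process(consumer_task);
	}
	spin_unlock(&queue_lock);
}
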
@@ -1892,8 +1911,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1892 * sent to all rescuers with works scheduled on @pool to resolve 1911 * sent to all rescuers with works scheduled on @pool to resolve
1893 * possible allocation deadlock. 1912 * possible allocation deadlock.
1894 * 1913 *
1895 * On return, need_to_create_worker() is guaranteed to be false and 1914 * On return, need_to_create_worker() is guaranteed to be %false and
1896 * may_start_working() true. 1915 * may_start_working() %true.
1897 * 1916 *
1898 * LOCKING: 1917 * LOCKING:
1899 * spin_lock_irq(pool->lock) which may be released and regrabbed 1918 * spin_lock_irq(pool->lock) which may be released and regrabbed
@@ -1901,7 +1920,7 @@ static void pool_mayday_timeout(unsigned long __pool)
1901 * manager. 1920 * manager.
1902 * 1921 *
1903 * RETURNS: 1922 * RETURNS:
1904 * false if no action was taken and pool->lock stayed locked, true 1923 * %false if no action was taken and pool->lock stayed locked, %true
1905 * otherwise. 1924 * otherwise.
1906 */ 1925 */
1907static bool maybe_create_worker(struct worker_pool *pool) 1926static bool maybe_create_worker(struct worker_pool *pool)
@@ -1924,7 +1943,8 @@ restart:
1924 del_timer_sync(&pool->mayday_timer); 1943 del_timer_sync(&pool->mayday_timer);
1925 spin_lock_irq(&pool->lock); 1944 spin_lock_irq(&pool->lock);
1926 start_worker(worker); 1945 start_worker(worker);
1927 BUG_ON(need_to_create_worker(pool)); 1946 if (WARN_ON_ONCE(need_to_create_worker(pool)))
1947 goto restart;
1928 return true; 1948 return true;
1929 } 1949 }
1930 1950
@@ -1957,7 +1977,7 @@ restart:
1957 * multiple times. Called only from manager. 1977 * multiple times. Called only from manager.
1958 * 1978 *
1959 * RETURNS: 1979 * RETURNS:
1960 * false if no action was taken and pool->lock stayed locked, true 1980 * %false if no action was taken and pool->lock stayed locked, %true
1961 * otherwise. 1981 * otherwise.
1962 */ 1982 */
1963static bool maybe_destroy_workers(struct worker_pool *pool) 1983static bool maybe_destroy_workers(struct worker_pool *pool)
@@ -2008,42 +2028,37 @@ static bool manage_workers(struct worker *worker)
2008 struct worker_pool *pool = worker->pool; 2028 struct worker_pool *pool = worker->pool;
2009 bool ret = false; 2029 bool ret = false;
2010 2030
2011 if (pool->flags & POOL_MANAGING_WORKERS) 2031 /*
2032 * Managership is governed by two mutexes - manager_arb and
2033 * manager_mutex. manager_arb handles arbitration of manager role.
2034 * Anyone who successfully grabs manager_arb wins the arbitration
2035 * and becomes the manager. mutex_trylock() on pool->manager_arb
2036 * failure while holding pool->lock reliably indicates that someone
2037 * else is managing the pool and the worker which failed trylock
2038 * can proceed to executing work items. This means that anyone
2039 * grabbing manager_arb is responsible for actually performing
2040 * manager duties. If manager_arb is grabbed and released without
2041 * actual management, the pool may stall indefinitely.
2042 *
2043 * manager_mutex is used for exclusion of actual management
 2044 * operations. The holder of manager_mutex can be sure that no
 2045 * management operations, including creation and destruction of
 2046 * workers, will take place until the mutex is released. Because
 2047 * manager_mutex doesn't interfere with manager role arbitration,
 2048 * it is guaranteed that the pool's management, while it may be
2049 * delayed, won't be disturbed by someone else grabbing
2050 * manager_mutex.
2051 */
2052 if (!mutex_trylock(&pool->manager_arb))
2012 return ret; 2053 return ret;
2013 2054
2014 pool->flags |= POOL_MANAGING_WORKERS;
2015
2016 /* 2055 /*
2017 * To simplify both worker management and CPU hotplug, hold off 2056 * With manager arbitration won, manager_mutex would be free in
2018 * management while hotplug is in progress. CPU hotplug path can't 2057 * most cases. trylock first without dropping @pool->lock.
2019 * grab %POOL_MANAGING_WORKERS to achieve this because that can
2020 * lead to idle worker depletion (all become busy thinking someone
2021 * else is managing) which in turn can result in deadlock under
2022 * extreme circumstances. Use @pool->assoc_mutex to synchronize
2023 * manager against CPU hotplug.
2024 *
2025 * assoc_mutex would always be free unless CPU hotplug is in
2026 * progress. trylock first without dropping @pool->lock.
2027 */ 2058 */
2028 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { 2059 if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
2029 spin_unlock_irq(&pool->lock); 2060 spin_unlock_irq(&pool->lock);
2030 mutex_lock(&pool->assoc_mutex); 2061 mutex_lock(&pool->manager_mutex);
2031 /*
2032 * CPU hotplug could have happened while we were waiting
2033 * for assoc_mutex. Hotplug itself can't handle us
2034 * because manager isn't either on idle or busy list, and
2035 * @pool's state and ours could have deviated.
2036 *
2037 * As hotplug is now excluded via assoc_mutex, we can
2038 * simply try to bind. It will succeed or fail depending
2039 * on @pool's current state. Try it and adjust
2040 * %WORKER_UNBOUND accordingly.
2041 */
2042 if (worker_maybe_bind_and_lock(worker))
2043 worker->flags &= ~WORKER_UNBOUND;
2044 else
2045 worker->flags |= WORKER_UNBOUND;
2046
2047 ret = true; 2062 ret = true;
2048 } 2063 }
2049 2064
@@ -2056,8 +2071,8 @@ static bool manage_workers(struct worker *worker)
2056 ret |= maybe_destroy_workers(pool); 2071 ret |= maybe_destroy_workers(pool);
2057 ret |= maybe_create_worker(pool); 2072 ret |= maybe_create_worker(pool);
2058 2073
2059 pool->flags &= ~POOL_MANAGING_WORKERS; 2074 mutex_unlock(&pool->manager_mutex);
2060 mutex_unlock(&pool->assoc_mutex); 2075 mutex_unlock(&pool->manager_arb);
2061 return ret; 2076 return ret;
2062} 2077}
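
The long comment above boils down to a split between role arbitration and operation exclusion: a failed mutex_trylock(&pool->manager_arb) under pool->lock means somebody else is already managing and this worker can go back to executing work items, while manager_mutex is what actually excludes concurrent management (worker creation/destruction, CPU hotplug). A generic sketch of the two-mutex shape, ignoring the pool->lock juggling shown above; foo_pool and foo_manage() are made up:

static bool foo_try_manage(struct foo_pool *pool)
{
	if (!mutex_trylock(&pool->manager_arb))
		return false;			/* somebody else won the role */

	mutex_lock(&pool->manager_mutex);	/* exclude other management ops */
	foo_manage(pool);			/* e.g. create/destroy workers */
	mutex_unlock(&pool->manager_mutex);

	mutex_unlock(&pool->manager_arb);
	return true;
}
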
2063 2078
@@ -2183,6 +2198,7 @@ __acquires(&pool->lock)
2183 worker->current_work = NULL; 2198 worker->current_work = NULL;
2184 worker->current_func = NULL; 2199 worker->current_func = NULL;
2185 worker->current_pwq = NULL; 2200 worker->current_pwq = NULL;
2201 worker->desc_valid = false;
2186 pwq_dec_nr_in_flight(pwq, work_color); 2202 pwq_dec_nr_in_flight(pwq, work_color);
2187} 2203}
2188 2204
@@ -2211,11 +2227,11 @@ static void process_scheduled_works(struct worker *worker)
2211 * worker_thread - the worker thread function 2227 * worker_thread - the worker thread function
2212 * @__worker: self 2228 * @__worker: self
2213 * 2229 *
2214 * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools 2230 * The worker thread function. All workers belong to a worker_pool -
2215 * of these per each cpu. These workers process all works regardless of 2231 * either a per-cpu one or dynamic unbound one. These workers process all
2216 * their specific target workqueue. The only exception is works which 2232 * work items regardless of their specific target workqueue. The only
2217 * belong to workqueues with a rescuer which will be explained in 2233 * exception is work items which belong to workqueues with a rescuer which
2218 * rescuer_thread(). 2234 * will be explained in rescuer_thread().
2219 */ 2235 */
2220static int worker_thread(void *__worker) 2236static int worker_thread(void *__worker)
2221{ 2237{
@@ -2227,19 +2243,12 @@ static int worker_thread(void *__worker)
2227woke_up: 2243woke_up:
2228 spin_lock_irq(&pool->lock); 2244 spin_lock_irq(&pool->lock);
2229 2245
2230 /* we are off idle list if destruction or rebind is requested */ 2246 /* am I supposed to die? */
2231 if (unlikely(list_empty(&worker->entry))) { 2247 if (unlikely(worker->flags & WORKER_DIE)) {
2232 spin_unlock_irq(&pool->lock); 2248 spin_unlock_irq(&pool->lock);
2233 2249 WARN_ON_ONCE(!list_empty(&worker->entry));
2234 /* if DIE is set, destruction is requested */ 2250 worker->task->flags &= ~PF_WQ_WORKER;
2235 if (worker->flags & WORKER_DIE) { 2251 return 0;
2236 worker->task->flags &= ~PF_WQ_WORKER;
2237 return 0;
2238 }
2239
2240 /* otherwise, rebind */
2241 idle_worker_rebind(worker);
2242 goto woke_up;
2243 } 2252 }
2244 2253
2245 worker_leave_idle(worker); 2254 worker_leave_idle(worker);
@@ -2257,14 +2266,16 @@ recheck:
2257 * preparing to process a work or actually processing it. 2266 * preparing to process a work or actually processing it.
2258 * Make sure nobody diddled with it while I was sleeping. 2267 * Make sure nobody diddled with it while I was sleeping.
2259 */ 2268 */
2260 BUG_ON(!list_empty(&worker->scheduled)); 2269 WARN_ON_ONCE(!list_empty(&worker->scheduled));
2261 2270
2262 /* 2271 /*
2263 * When control reaches this point, we're guaranteed to have 2272 * Finish PREP stage. We're guaranteed to have at least one idle
2264 * at least one idle worker or that someone else has already 2273 * worker or that someone else has already assumed the manager
2265 * assumed the manager role. 2274 * role. This is where @worker starts participating in concurrency
2275 * management if applicable and concurrency management is restored
2276 * after being rebound. See rebind_workers() for details.
2266 */ 2277 */
2267 worker_clr_flags(worker, WORKER_PREP); 2278 worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
2268 2279
2269 do { 2280 do {
2270 struct work_struct *work = 2281 struct work_struct *work =
@@ -2306,7 +2317,7 @@ sleep:
2306 * @__rescuer: self 2317 * @__rescuer: self
2307 * 2318 *
2308 * Workqueue rescuer thread function. There's one rescuer for each 2319 * Workqueue rescuer thread function. There's one rescuer for each
2309 * workqueue which has WQ_RESCUER set. 2320 * workqueue which has WQ_MEM_RECLAIM set.
2310 * 2321 *
2311 * Regular work processing on a pool may block trying to create a new 2322 * Regular work processing on a pool may block trying to create a new
2312 * worker which uses GFP_KERNEL allocation which has slight chance of 2323 * worker which uses GFP_KERNEL allocation which has slight chance of
@@ -2325,8 +2336,6 @@ static int rescuer_thread(void *__rescuer)
2325 struct worker *rescuer = __rescuer; 2336 struct worker *rescuer = __rescuer;
2326 struct workqueue_struct *wq = rescuer->rescue_wq; 2337 struct workqueue_struct *wq = rescuer->rescue_wq;
2327 struct list_head *scheduled = &rescuer->scheduled; 2338 struct list_head *scheduled = &rescuer->scheduled;
2328 bool is_unbound = wq->flags & WQ_UNBOUND;
2329 unsigned int cpu;
2330 2339
2331 set_user_nice(current, RESCUER_NICE_LEVEL); 2340 set_user_nice(current, RESCUER_NICE_LEVEL);
2332 2341
@@ -2344,28 +2353,29 @@ repeat:
2344 return 0; 2353 return 0;
2345 } 2354 }
2346 2355
2347 /* 2356 /* see whether any pwq is asking for help */
2348 * See whether any cpu is asking for help. Unbounded 2357 spin_lock_irq(&wq_mayday_lock);
2349 * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. 2358
2350 */ 2359 while (!list_empty(&wq->maydays)) {
2351 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2360 struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
2352 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2361 struct pool_workqueue, mayday_node);
2353 struct pool_workqueue *pwq = get_pwq(tcpu, wq);
2354 struct worker_pool *pool = pwq->pool; 2362 struct worker_pool *pool = pwq->pool;
2355 struct work_struct *work, *n; 2363 struct work_struct *work, *n;
2356 2364
2357 __set_current_state(TASK_RUNNING); 2365 __set_current_state(TASK_RUNNING);
2358 mayday_clear_cpu(cpu, wq->mayday_mask); 2366 list_del_init(&pwq->mayday_node);
2367
2368 spin_unlock_irq(&wq_mayday_lock);
2359 2369
2360 /* migrate to the target cpu if possible */ 2370 /* migrate to the target cpu if possible */
2371 worker_maybe_bind_and_lock(pool);
2361 rescuer->pool = pool; 2372 rescuer->pool = pool;
2362 worker_maybe_bind_and_lock(rescuer);
2363 2373
2364 /* 2374 /*
2365 * Slurp in all works issued via this workqueue and 2375 * Slurp in all works issued via this workqueue and
2366 * process'em. 2376 * process'em.
2367 */ 2377 */
2368 BUG_ON(!list_empty(&rescuer->scheduled)); 2378 WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
2369 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2379 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2370 if (get_work_pwq(work) == pwq) 2380 if (get_work_pwq(work) == pwq)
2371 move_linked_works(work, scheduled, &n); 2381 move_linked_works(work, scheduled, &n);
@@ -2380,9 +2390,13 @@ repeat:
2380 if (keep_working(pool)) 2390 if (keep_working(pool))
2381 wake_up_worker(pool); 2391 wake_up_worker(pool);
2382 2392
2383 spin_unlock_irq(&pool->lock); 2393 rescuer->pool = NULL;
2394 spin_unlock(&pool->lock);
2395 spin_lock(&wq_mayday_lock);
2384 } 2396 }
2385 2397
2398 spin_unlock_irq(&wq_mayday_lock);
2399
2386 /* rescuers should never participate in concurrency management */ 2400 /* rescuers should never participate in concurrency management */
2387 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); 2401 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2388 schedule(); 2402 schedule();
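
On the consuming side the rescuer now pops pwqs off wq->maydays one at a time: detach the first entry with list_del_init() under wq_mayday_lock, drop the lock, bind to the pwq's pool and process the matching work items, then retake the lock and look again. Reduced to its locking skeleton (illustrative, with simplified irq handling):

	spin_lock_irq(&wq_mayday_lock);
	while (!list_empty(&wq->maydays)) {
		struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
					struct pool_workqueue, mayday_node);

		list_del_init(&pwq->mayday_node);	/* re-arms send_mayday() */
		spin_unlock_irq(&wq_mayday_lock);

		/* bind to pwq->pool and drain the work items belonging to pwq */

		spin_lock_irq(&wq_mayday_lock);
	}
	spin_unlock_irq(&wq_mayday_lock);
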
@@ -2486,7 +2500,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
2486 * advanced to @work_color. 2500 * advanced to @work_color.
2487 * 2501 *
2488 * CONTEXT: 2502 * CONTEXT:
2489 * mutex_lock(wq->flush_mutex). 2503 * mutex_lock(wq->mutex).
2490 * 2504 *
2491 * RETURNS: 2505 * RETURNS:
2492 * %true if @flush_color >= 0 and there's something to flush. %false 2506 * %true if @flush_color >= 0 and there's something to flush. %false
@@ -2496,21 +2510,20 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2496 int flush_color, int work_color) 2510 int flush_color, int work_color)
2497{ 2511{
2498 bool wait = false; 2512 bool wait = false;
2499 unsigned int cpu; 2513 struct pool_workqueue *pwq;
2500 2514
2501 if (flush_color >= 0) { 2515 if (flush_color >= 0) {
2502 BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); 2516 WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
2503 atomic_set(&wq->nr_pwqs_to_flush, 1); 2517 atomic_set(&wq->nr_pwqs_to_flush, 1);
2504 } 2518 }
2505 2519
2506 for_each_pwq_cpu(cpu, wq) { 2520 for_each_pwq(pwq, wq) {
2507 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2508 struct worker_pool *pool = pwq->pool; 2521 struct worker_pool *pool = pwq->pool;
2509 2522
2510 spin_lock_irq(&pool->lock); 2523 spin_lock_irq(&pool->lock);
2511 2524
2512 if (flush_color >= 0) { 2525 if (flush_color >= 0) {
2513 BUG_ON(pwq->flush_color != -1); 2526 WARN_ON_ONCE(pwq->flush_color != -1);
2514 2527
2515 if (pwq->nr_in_flight[flush_color]) { 2528 if (pwq->nr_in_flight[flush_color]) {
2516 pwq->flush_color = flush_color; 2529 pwq->flush_color = flush_color;
@@ -2520,7 +2533,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2520 } 2533 }
2521 2534
2522 if (work_color >= 0) { 2535 if (work_color >= 0) {
2523 BUG_ON(work_color != work_next_color(pwq->work_color)); 2536 WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
2524 pwq->work_color = work_color; 2537 pwq->work_color = work_color;
2525 } 2538 }
2526 2539
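
Under the hood, the flush machinery is a biased completion counter: flush_workqueue_prep_pwqs() sets nr_pwqs_to_flush to 1 as a bias, adds one for every pwq that still has work of the flush color in flight, and the bias is dropped once all pwqs have been armed (in the tail of this function, not shown here); pwq_dec_nr_in_flight() above completes first_flusher->done when the count reaches zero. The bare pattern, detached from workqueues; flush_count and flush_done are made-up names:

static atomic_t flush_count;
static DECLARE_COMPLETION(flush_done);

/* flusher side: count busy targets plus a +1 bias so the completion
 * cannot fire while counting is still in progress */
static void start_flush(int nr_busy_targets)
{
	atomic_set(&flush_count, nr_busy_targets + 1);
	if (atomic_dec_and_test(&flush_count))	/* drop the bias */
		complete(&flush_done);		/* nothing was in flight */
	wait_for_completion(&flush_done);
}

/* target side: called once per target as its last in-flight item retires */
static void target_done(void)
{
	if (atomic_dec_and_test(&flush_count))
		complete(&flush_done);
}
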
@@ -2537,11 +2550,8 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2537 * flush_workqueue - ensure that any scheduled work has run to completion. 2550 * flush_workqueue - ensure that any scheduled work has run to completion.
2538 * @wq: workqueue to flush 2551 * @wq: workqueue to flush
2539 * 2552 *
2540 * Forces execution of the workqueue and blocks until its completion. 2553 * This function sleeps until all work items which were queued on entry
2541 * This is typically used in driver shutdown handlers. 2554 * have finished execution, but it is not livelocked by new incoming ones.
2542 *
2543 * We sleep until all works which were queued on entry have been handled,
2544 * but we are not livelocked by new incoming ones.
2545 */ 2555 */
2546void flush_workqueue(struct workqueue_struct *wq) 2556void flush_workqueue(struct workqueue_struct *wq)
2547{ 2557{
@@ -2555,7 +2565,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2555 lock_map_acquire(&wq->lockdep_map); 2565 lock_map_acquire(&wq->lockdep_map);
2556 lock_map_release(&wq->lockdep_map); 2566 lock_map_release(&wq->lockdep_map);
2557 2567
2558 mutex_lock(&wq->flush_mutex); 2568 mutex_lock(&wq->mutex);
2559 2569
2560 /* 2570 /*
2561 * Start-to-wait phase 2571 * Start-to-wait phase
@@ -2568,13 +2578,13 @@ void flush_workqueue(struct workqueue_struct *wq)
2568 * becomes our flush_color and work_color is advanced 2578 * becomes our flush_color and work_color is advanced
2569 * by one. 2579 * by one.
2570 */ 2580 */
2571 BUG_ON(!list_empty(&wq->flusher_overflow)); 2581 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
2572 this_flusher.flush_color = wq->work_color; 2582 this_flusher.flush_color = wq->work_color;
2573 wq->work_color = next_color; 2583 wq->work_color = next_color;
2574 2584
2575 if (!wq->first_flusher) { 2585 if (!wq->first_flusher) {
2576 /* no flush in progress, become the first flusher */ 2586 /* no flush in progress, become the first flusher */
2577 BUG_ON(wq->flush_color != this_flusher.flush_color); 2587 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
2578 2588
2579 wq->first_flusher = &this_flusher; 2589 wq->first_flusher = &this_flusher;
2580 2590
@@ -2587,7 +2597,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2587 } 2597 }
2588 } else { 2598 } else {
2589 /* wait in queue */ 2599 /* wait in queue */
2590 BUG_ON(wq->flush_color == this_flusher.flush_color); 2600 WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
2591 list_add_tail(&this_flusher.list, &wq->flusher_queue); 2601 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2592 flush_workqueue_prep_pwqs(wq, -1, wq->work_color); 2602 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2593 } 2603 }
@@ -2600,7 +2610,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2600 list_add_tail(&this_flusher.list, &wq->flusher_overflow); 2610 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2601 } 2611 }
2602 2612
2603 mutex_unlock(&wq->flush_mutex); 2613 mutex_unlock(&wq->mutex);
2604 2614
2605 wait_for_completion(&this_flusher.done); 2615 wait_for_completion(&this_flusher.done);
2606 2616
@@ -2613,7 +2623,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2613 if (wq->first_flusher != &this_flusher) 2623 if (wq->first_flusher != &this_flusher)
2614 return; 2624 return;
2615 2625
2616 mutex_lock(&wq->flush_mutex); 2626 mutex_lock(&wq->mutex);
2617 2627
2618 /* we might have raced, check again with mutex held */ 2628 /* we might have raced, check again with mutex held */
2619 if (wq->first_flusher != &this_flusher) 2629 if (wq->first_flusher != &this_flusher)
@@ -2621,8 +2631,8 @@ void flush_workqueue(struct workqueue_struct *wq)
2621 2631
2622 wq->first_flusher = NULL; 2632 wq->first_flusher = NULL;
2623 2633
2624 BUG_ON(!list_empty(&this_flusher.list)); 2634 WARN_ON_ONCE(!list_empty(&this_flusher.list));
2625 BUG_ON(wq->flush_color != this_flusher.flush_color); 2635 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
2626 2636
2627 while (true) { 2637 while (true) {
2628 struct wq_flusher *next, *tmp; 2638 struct wq_flusher *next, *tmp;
@@ -2635,8 +2645,8 @@ void flush_workqueue(struct workqueue_struct *wq)
2635 complete(&next->done); 2645 complete(&next->done);
2636 } 2646 }
2637 2647
2638 BUG_ON(!list_empty(&wq->flusher_overflow) && 2648 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
2639 wq->flush_color != work_next_color(wq->work_color)); 2649 wq->flush_color != work_next_color(wq->work_color));
2640 2650
2641 /* this flush_color is finished, advance by one */ 2651 /* this flush_color is finished, advance by one */
2642 wq->flush_color = work_next_color(wq->flush_color); 2652 wq->flush_color = work_next_color(wq->flush_color);
@@ -2660,7 +2670,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2660 } 2670 }
2661 2671
2662 if (list_empty(&wq->flusher_queue)) { 2672 if (list_empty(&wq->flusher_queue)) {
2663 BUG_ON(wq->flush_color != wq->work_color); 2673 WARN_ON_ONCE(wq->flush_color != wq->work_color);
2664 break; 2674 break;
2665 } 2675 }
2666 2676
@@ -2668,8 +2678,8 @@ void flush_workqueue(struct workqueue_struct *wq)
2668 * Need to flush more colors. Make the next flusher 2678 * Need to flush more colors. Make the next flusher
2669 * the new first flusher and arm pwqs. 2679 * the new first flusher and arm pwqs.
2670 */ 2680 */
2671 BUG_ON(wq->flush_color == wq->work_color); 2681 WARN_ON_ONCE(wq->flush_color == wq->work_color);
2672 BUG_ON(wq->flush_color != next->flush_color); 2682 WARN_ON_ONCE(wq->flush_color != next->flush_color);
2673 2683
2674 list_del_init(&next->list); 2684 list_del_init(&next->list);
2675 wq->first_flusher = next; 2685 wq->first_flusher = next;
@@ -2685,7 +2695,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2685 } 2695 }
2686 2696
2687out_unlock: 2697out_unlock:
2688 mutex_unlock(&wq->flush_mutex); 2698 mutex_unlock(&wq->mutex);
2689} 2699}
2690EXPORT_SYMBOL_GPL(flush_workqueue); 2700EXPORT_SYMBOL_GPL(flush_workqueue);
2691 2701
@@ -2703,22 +2713,23 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
2703void drain_workqueue(struct workqueue_struct *wq) 2713void drain_workqueue(struct workqueue_struct *wq)
2704{ 2714{
2705 unsigned int flush_cnt = 0; 2715 unsigned int flush_cnt = 0;
2706 unsigned int cpu; 2716 struct pool_workqueue *pwq;
2707 2717
2708 /* 2718 /*
2709 * __queue_work() needs to test whether there are drainers, is much 2719 * __queue_work() needs to test whether there are drainers, is much
2710 * hotter than drain_workqueue() and already looks at @wq->flags. 2720 * hotter than drain_workqueue() and already looks at @wq->flags.
2711 * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. 2721 * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
2712 */ 2722 */
2713 spin_lock(&workqueue_lock); 2723 mutex_lock(&wq->mutex);
2714 if (!wq->nr_drainers++) 2724 if (!wq->nr_drainers++)
2715 wq->flags |= WQ_DRAINING; 2725 wq->flags |= __WQ_DRAINING;
2716 spin_unlock(&workqueue_lock); 2726 mutex_unlock(&wq->mutex);
2717reflush: 2727reflush:
2718 flush_workqueue(wq); 2728 flush_workqueue(wq);
2719 2729
2720 for_each_pwq_cpu(cpu, wq) { 2730 mutex_lock(&wq->mutex);
2721 struct pool_workqueue *pwq = get_pwq(cpu, wq); 2731
2732 for_each_pwq(pwq, wq) {
2722 bool drained; 2733 bool drained;
2723 2734
2724 spin_lock_irq(&pwq->pool->lock); 2735 spin_lock_irq(&pwq->pool->lock);
@@ -2730,15 +2741,16 @@ reflush:
2730 2741
2731 if (++flush_cnt == 10 || 2742 if (++flush_cnt == 10 ||
2732 (flush_cnt % 100 == 0 && flush_cnt <= 1000)) 2743 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2733 pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", 2744 pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",
2734 wq->name, flush_cnt); 2745 wq->name, flush_cnt);
2746
2747 mutex_unlock(&wq->mutex);
2735 goto reflush; 2748 goto reflush;
2736 } 2749 }
2737 2750
2738 spin_lock(&workqueue_lock);
2739 if (!--wq->nr_drainers) 2751 if (!--wq->nr_drainers)
2740 wq->flags &= ~WQ_DRAINING; 2752 wq->flags &= ~__WQ_DRAINING;
2741 spin_unlock(&workqueue_lock); 2753 mutex_unlock(&wq->mutex);
2742} 2754}
2743EXPORT_SYMBOL_GPL(drain_workqueue); 2755EXPORT_SYMBOL_GPL(drain_workqueue);
2744 2756
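drain_workqueue() goes further than a single flush: while __WQ_DRAINING is set it keeps re-flushing, so work items that requeue themselves are also waited out (with the warning above firing if that takes too many passes). A hedged sketch of the pattern it is meant for; the names and the stop flag are hypothetical:

#include <linux/workqueue.h>
#include <linux/compiler.h>

static struct workqueue_struct *my_wq;
static bool stopping;

static void requeue_fn(struct work_struct *work)
{
	/* chain-queue until told to stop */
	if (!ACCESS_ONCE(stopping))
		queue_work(my_wq, work);
}

static DECLARE_WORK(requeue_work, requeue_fn);

static void example_drain(void)
{
	ACCESS_ONCE(stopping) = true;

	/* repeatedly flushes until my_wq has no work left, queued or chained */
	drain_workqueue(my_wq);
}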
@@ -2749,11 +2761,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2749 struct pool_workqueue *pwq; 2761 struct pool_workqueue *pwq;
2750 2762
2751 might_sleep(); 2763 might_sleep();
2764
2765 local_irq_disable();
2752 pool = get_work_pool(work); 2766 pool = get_work_pool(work);
2753 if (!pool) 2767 if (!pool) {
2768 local_irq_enable();
2754 return false; 2769 return false;
2770 }
2755 2771
2756 spin_lock_irq(&pool->lock); 2772 spin_lock(&pool->lock);
2757 /* see the comment in try_to_grab_pending() with the same code */ 2773 /* see the comment in try_to_grab_pending() with the same code */
2758 pwq = get_work_pwq(work); 2774 pwq = get_work_pwq(work);
2759 if (pwq) { 2775 if (pwq) {
@@ -2775,7 +2791,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2775 * flusher is not running on the same workqueue by verifying write 2791 * flusher is not running on the same workqueue by verifying write
2776 * access. 2792 * access.
2777 */ 2793 */
2778 if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) 2794 if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
2779 lock_map_acquire(&pwq->wq->lockdep_map); 2795 lock_map_acquire(&pwq->wq->lockdep_map);
2780 else 2796 else
2781 lock_map_acquire_read(&pwq->wq->lockdep_map); 2797 lock_map_acquire_read(&pwq->wq->lockdep_map);
@@ -2932,66 +2948,6 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
2932EXPORT_SYMBOL(cancel_delayed_work_sync); 2948EXPORT_SYMBOL(cancel_delayed_work_sync);
2933 2949
2934/** 2950/**
2935 * schedule_work_on - put work task on a specific cpu
2936 * @cpu: cpu to put the work task on
2937 * @work: job to be done
2938 *
2939 * This puts a job on a specific cpu
2940 */
2941bool schedule_work_on(int cpu, struct work_struct *work)
2942{
2943 return queue_work_on(cpu, system_wq, work);
2944}
2945EXPORT_SYMBOL(schedule_work_on);
2946
2947/**
2948 * schedule_work - put work task in global workqueue
2949 * @work: job to be done
2950 *
2951 * Returns %false if @work was already on the kernel-global workqueue and
2952 * %true otherwise.
2953 *
2954 * This puts a job in the kernel-global workqueue if it was not already
2955 * queued and leaves it in the same position on the kernel-global
2956 * workqueue otherwise.
2957 */
2958bool schedule_work(struct work_struct *work)
2959{
2960 return queue_work(system_wq, work);
2961}
2962EXPORT_SYMBOL(schedule_work);
2963
2964/**
2965 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2966 * @cpu: cpu to use
2967 * @dwork: job to be done
2968 * @delay: number of jiffies to wait
2969 *
2970 * After waiting for a given time this puts a job in the kernel-global
2971 * workqueue on the specified CPU.
2972 */
2973bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
2974 unsigned long delay)
2975{
2976 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
2977}
2978EXPORT_SYMBOL(schedule_delayed_work_on);
2979
2980/**
2981 * schedule_delayed_work - put work task in global workqueue after delay
2982 * @dwork: job to be done
2983 * @delay: number of jiffies to wait or 0 for immediate execution
2984 *
2985 * After waiting for a given time this puts a job in the kernel-global
2986 * workqueue.
2987 */
2988bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
2989{
2990 return queue_delayed_work(system_wq, dwork, delay);
2991}
2992EXPORT_SYMBOL(schedule_delayed_work);
2993
2994/**
2995 * schedule_on_each_cpu - execute a function synchronously on each online CPU 2951 * schedule_on_each_cpu - execute a function synchronously on each online CPU
2996 * @func: the function to call 2952 * @func: the function to call
2997 * 2953 *
@@ -3084,51 +3040,1025 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
3084} 3040}
3085EXPORT_SYMBOL_GPL(execute_in_process_context); 3041EXPORT_SYMBOL_GPL(execute_in_process_context);
3086 3042
3087int keventd_up(void) 3043#ifdef CONFIG_SYSFS
3044/*
 3045 * Workqueues with the WQ_SYSFS flag set are visible to userland via
3046 * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
3047 * following attributes.
3048 *
3049 * per_cpu RO bool : whether the workqueue is per-cpu or unbound
3050 * max_active RW int : maximum number of in-flight work items
3051 *
3052 * Unbound workqueues have the following extra attributes.
3053 *
3054 * id RO int : the associated pool ID
3055 * nice RW int : nice value of the workers
3056 * cpumask RW mask : bitmask of allowed CPUs for the workers
3057 */
3058struct wq_device {
3059 struct workqueue_struct *wq;
3060 struct device dev;
3061};
3062
3063static struct workqueue_struct *dev_to_wq(struct device *dev)
3064{
3065 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
3066
3067 return wq_dev->wq;
3068}
3069
3070static ssize_t wq_per_cpu_show(struct device *dev,
3071 struct device_attribute *attr, char *buf)
3072{
3073 struct workqueue_struct *wq = dev_to_wq(dev);
3074
3075 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
3076}
3077
3078static ssize_t wq_max_active_show(struct device *dev,
3079 struct device_attribute *attr, char *buf)
3080{
3081 struct workqueue_struct *wq = dev_to_wq(dev);
3082
3083 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
3084}
3085
3086static ssize_t wq_max_active_store(struct device *dev,
3087 struct device_attribute *attr,
3088 const char *buf, size_t count)
3088{ 3089{
3089 return system_wq != NULL; 3090 struct workqueue_struct *wq = dev_to_wq(dev);
3091 int val;
3092
3093 if (sscanf(buf, "%d", &val) != 1 || val <= 0)
3094 return -EINVAL;
3095
3096 workqueue_set_max_active(wq, val);
3097 return count;
3090} 3098}
3091 3099
3092static int alloc_pwqs(struct workqueue_struct *wq) 3100static struct device_attribute wq_sysfs_attrs[] = {
3101 __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL),
3102 __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store),
3103 __ATTR_NULL,
3104};
3105
3106static ssize_t wq_pool_ids_show(struct device *dev,
3107 struct device_attribute *attr, char *buf)
3093{ 3108{
3109 struct workqueue_struct *wq = dev_to_wq(dev);
3110 const char *delim = "";
3111 int node, written = 0;
3112
3113 rcu_read_lock_sched();
3114 for_each_node(node) {
3115 written += scnprintf(buf + written, PAGE_SIZE - written,
3116 "%s%d:%d", delim, node,
3117 unbound_pwq_by_node(wq, node)->pool->id);
3118 delim = " ";
3119 }
3120 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3121 rcu_read_unlock_sched();
3122
3123 return written;
3124}
3125
3126static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
3127 char *buf)
3128{
3129 struct workqueue_struct *wq = dev_to_wq(dev);
3130 int written;
3131
3132 mutex_lock(&wq->mutex);
3133 written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
3134 mutex_unlock(&wq->mutex);
3135
3136 return written;
3137}
3138
3139/* prepare workqueue_attrs for sysfs store operations */
3140static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
3141{
3142 struct workqueue_attrs *attrs;
3143
3144 attrs = alloc_workqueue_attrs(GFP_KERNEL);
3145 if (!attrs)
3146 return NULL;
3147
3148 mutex_lock(&wq->mutex);
3149 copy_workqueue_attrs(attrs, wq->unbound_attrs);
3150 mutex_unlock(&wq->mutex);
3151 return attrs;
3152}
3153
3154static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
3155 const char *buf, size_t count)
3156{
3157 struct workqueue_struct *wq = dev_to_wq(dev);
3158 struct workqueue_attrs *attrs;
3159 int ret;
3160
3161 attrs = wq_sysfs_prep_attrs(wq);
3162 if (!attrs)
3163 return -ENOMEM;
3164
3165 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
3166 attrs->nice >= -20 && attrs->nice <= 19)
3167 ret = apply_workqueue_attrs(wq, attrs);
3168 else
3169 ret = -EINVAL;
3170
3171 free_workqueue_attrs(attrs);
3172 return ret ?: count;
3173}
3174
3175static ssize_t wq_cpumask_show(struct device *dev,
3176 struct device_attribute *attr, char *buf)
3177{
3178 struct workqueue_struct *wq = dev_to_wq(dev);
3179 int written;
3180
3181 mutex_lock(&wq->mutex);
3182 written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask);
3183 mutex_unlock(&wq->mutex);
3184
3185 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3186 return written;
3187}
3188
3189static ssize_t wq_cpumask_store(struct device *dev,
3190 struct device_attribute *attr,
3191 const char *buf, size_t count)
3192{
3193 struct workqueue_struct *wq = dev_to_wq(dev);
3194 struct workqueue_attrs *attrs;
3195 int ret;
3196
3197 attrs = wq_sysfs_prep_attrs(wq);
3198 if (!attrs)
3199 return -ENOMEM;
3200
3201 ret = cpumask_parse(buf, attrs->cpumask);
3202 if (!ret)
3203 ret = apply_workqueue_attrs(wq, attrs);
3204
3205 free_workqueue_attrs(attrs);
3206 return ret ?: count;
3207}
3208
3209static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
3210 char *buf)
3211{
3212 struct workqueue_struct *wq = dev_to_wq(dev);
3213 int written;
3214
3215 mutex_lock(&wq->mutex);
3216 written = scnprintf(buf, PAGE_SIZE, "%d\n",
3217 !wq->unbound_attrs->no_numa);
3218 mutex_unlock(&wq->mutex);
3219
3220 return written;
3221}
3222
3223static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
3224 const char *buf, size_t count)
3225{
3226 struct workqueue_struct *wq = dev_to_wq(dev);
3227 struct workqueue_attrs *attrs;
3228 int v, ret;
3229
3230 attrs = wq_sysfs_prep_attrs(wq);
3231 if (!attrs)
3232 return -ENOMEM;
3233
3234 ret = -EINVAL;
3235 if (sscanf(buf, "%d", &v) == 1) {
3236 attrs->no_numa = !v;
3237 ret = apply_workqueue_attrs(wq, attrs);
3238 }
3239
3240 free_workqueue_attrs(attrs);
3241 return ret ?: count;
3242}
3243
3244static struct device_attribute wq_sysfs_unbound_attrs[] = {
3245 __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
3246 __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
3247 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
3248 __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
3249 __ATTR_NULL,
3250};
3251
3252static struct bus_type wq_subsys = {
3253 .name = "workqueue",
3254 .dev_attrs = wq_sysfs_attrs,
3255};
3256
3257static int __init wq_sysfs_init(void)
3258{
3259 return subsys_virtual_register(&wq_subsys, NULL);
3260}
3261core_initcall(wq_sysfs_init);
3262
3263static void wq_device_release(struct device *dev)
3264{
3265 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
3266
3267 kfree(wq_dev);
3268}
3269
3270/**
3271 * workqueue_sysfs_register - make a workqueue visible in sysfs
3272 * @wq: the workqueue to register
3273 *
3274 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
3275 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
3276 * which is the preferred method.
3277 *
3278 * Workqueue user should use this function directly iff it wants to apply
3279 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
3280 * apply_workqueue_attrs() may race against userland updating the
3281 * attributes.
3282 *
3283 * Returns 0 on success, -errno on failure.
3284 */
3285int workqueue_sysfs_register(struct workqueue_struct *wq)
3286{
3287 struct wq_device *wq_dev;
3288 int ret;
3289
3094 /* 3290 /*
 3095 * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. 3291 * Adjusting max_active or creating new pwqs by applying
3096 * Make sure that the alignment isn't lower than that of 3292 * attributes breaks ordering guarantee. Disallow exposing ordered
3097 * unsigned long long. 3293 * workqueues.
3098 */ 3294 */
3099 const size_t size = sizeof(struct pool_workqueue); 3295 if (WARN_ON(wq->flags & __WQ_ORDERED))
3100 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 3296 return -EINVAL;
3101 __alignof__(unsigned long long));
3102 3297
3103 if (!(wq->flags & WQ_UNBOUND)) 3298 wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
3104 wq->pool_wq.pcpu = __alloc_percpu(size, align); 3299 if (!wq_dev)
3105 else { 3300 return -ENOMEM;
3106 void *ptr; 3301
3302 wq_dev->wq = wq;
3303 wq_dev->dev.bus = &wq_subsys;
3304 wq_dev->dev.init_name = wq->name;
3305 wq_dev->dev.release = wq_device_release;
3306
3307 /*
3308 * unbound_attrs are created separately. Suppress uevent until
3309 * everything is ready.
3310 */
3311 dev_set_uevent_suppress(&wq_dev->dev, true);
3312
3313 ret = device_register(&wq_dev->dev);
3314 if (ret) {
3315 kfree(wq_dev);
3316 wq->wq_dev = NULL;
3317 return ret;
3318 }
3319
3320 if (wq->flags & WQ_UNBOUND) {
3321 struct device_attribute *attr;
3322
3323 for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
3324 ret = device_create_file(&wq_dev->dev, attr);
3325 if (ret) {
3326 device_unregister(&wq_dev->dev);
3327 wq->wq_dev = NULL;
3328 return ret;
3329 }
3330 }
3331 }
3332
3333 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
3334 return 0;
3335}
3336
3337/**
3338 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
3339 * @wq: the workqueue to unregister
3340 *
3341 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
3342 */
3343static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
3344{
3345 struct wq_device *wq_dev = wq->wq_dev;
3346
3347 if (!wq->wq_dev)
3348 return;
3349
3350 wq->wq_dev = NULL;
3351 device_unregister(&wq_dev->dev);
3352}
3353#else /* CONFIG_SYSFS */
3354static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
3355#endif /* CONFIG_SYSFS */
3356
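Putting the sysfs interface above together: a workqueue only appears under /sys/bus/workqueue/devices/ when it is created with WQ_SYSFS (or registered explicitly via workqueue_sysfs_register()). A minimal sketch with a hypothetical workqueue name; writes to the resulting max_active, nice and cpumask files land in the store callbacks defined above:

#include <linux/workqueue.h>
#include <linux/errno.h>

static struct workqueue_struct *stats_wq;

static int example_init(void)
{
	/*
	 * Shows up as /sys/bus/workqueue/devices/example_stats with
	 * per_cpu and max_active; being WQ_UNBOUND it also gets
	 * pool_ids, nice, cpumask and numa.
	 */
	stats_wq = alloc_workqueue("example_stats",
				   WQ_UNBOUND | WQ_SYSFS, 0);
	return stats_wq ? 0 : -ENOMEM;
}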
3357/**
3358 * free_workqueue_attrs - free a workqueue_attrs
3359 * @attrs: workqueue_attrs to free
3360 *
3361 * Undo alloc_workqueue_attrs().
3362 */
3363void free_workqueue_attrs(struct workqueue_attrs *attrs)
3364{
3365 if (attrs) {
3366 free_cpumask_var(attrs->cpumask);
3367 kfree(attrs);
3368 }
3369}
3370
3371/**
3372 * alloc_workqueue_attrs - allocate a workqueue_attrs
3373 * @gfp_mask: allocation mask to use
3374 *
3375 * Allocate a new workqueue_attrs, initialize with default settings and
3376 * return it. Returns NULL on failure.
3377 */
3378struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
3379{
3380 struct workqueue_attrs *attrs;
3381
3382 attrs = kzalloc(sizeof(*attrs), gfp_mask);
3383 if (!attrs)
3384 goto fail;
3385 if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
3386 goto fail;
3387
3388 cpumask_copy(attrs->cpumask, cpu_possible_mask);
3389 return attrs;
3390fail:
3391 free_workqueue_attrs(attrs);
3392 return NULL;
3393}
3394
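alloc_workqueue_attrs(), apply_workqueue_attrs() and free_workqueue_attrs() are the in-kernel counterpart of the sysfs knobs above. A hedged sketch of tuning an unbound workqueue from code; the nice value and CPU choice are arbitrary examples:

#include <linux/workqueue.h>
#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static int example_tune(struct workqueue_struct *unbound_wq)
{
	struct workqueue_attrs *attrs;
	int ret;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs)
		return -ENOMEM;

	attrs->nice = -5;				/* higher-priority workers */
	cpumask_copy(attrs->cpumask, cpumask_of(0));	/* restrict to CPU 0 */

	/* only valid for WQ_UNBOUND, non-ordered workqueues */
	ret = apply_workqueue_attrs(unbound_wq, attrs);

	/* apply_workqueue_attrs() copies @attrs, so it can be freed here */
	free_workqueue_attrs(attrs);
	return ret;
}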
3395static void copy_workqueue_attrs(struct workqueue_attrs *to,
3396 const struct workqueue_attrs *from)
3397{
3398 to->nice = from->nice;
3399 cpumask_copy(to->cpumask, from->cpumask);
3400}
3401
3402/* hash value of the content of @attr */
3403static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
3404{
3405 u32 hash = 0;
3406
3407 hash = jhash_1word(attrs->nice, hash);
3408 hash = jhash(cpumask_bits(attrs->cpumask),
3409 BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
3410 return hash;
3411}
3412
3413/* content equality test */
3414static bool wqattrs_equal(const struct workqueue_attrs *a,
3415 const struct workqueue_attrs *b)
3416{
3417 if (a->nice != b->nice)
3418 return false;
3419 if (!cpumask_equal(a->cpumask, b->cpumask))
3420 return false;
3421 return true;
3422}
3423
3424/**
3425 * init_worker_pool - initialize a newly zalloc'd worker_pool
3426 * @pool: worker_pool to initialize
3427 *
 3428 * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs.
3429 * Returns 0 on success, -errno on failure. Even on failure, all fields
3430 * inside @pool proper are initialized and put_unbound_pool() can be called
3431 * on @pool safely to release it.
3432 */
3433static int init_worker_pool(struct worker_pool *pool)
3434{
3435 spin_lock_init(&pool->lock);
3436 pool->id = -1;
3437 pool->cpu = -1;
3438 pool->node = NUMA_NO_NODE;
3439 pool->flags |= POOL_DISASSOCIATED;
3440 INIT_LIST_HEAD(&pool->worklist);
3441 INIT_LIST_HEAD(&pool->idle_list);
3442 hash_init(pool->busy_hash);
3443
3444 init_timer_deferrable(&pool->idle_timer);
3445 pool->idle_timer.function = idle_worker_timeout;
3446 pool->idle_timer.data = (unsigned long)pool;
3447
3448 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3449 (unsigned long)pool);
3450
3451 mutex_init(&pool->manager_arb);
3452 mutex_init(&pool->manager_mutex);
3453 idr_init(&pool->worker_idr);
3454
3455 INIT_HLIST_NODE(&pool->hash_node);
3456 pool->refcnt = 1;
3457
3458 /* shouldn't fail above this point */
3459 pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
3460 if (!pool->attrs)
3461 return -ENOMEM;
3462 return 0;
3463}
3464
3465static void rcu_free_pool(struct rcu_head *rcu)
3466{
3467 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
3468
3469 idr_destroy(&pool->worker_idr);
3470 free_workqueue_attrs(pool->attrs);
3471 kfree(pool);
3472}
3473
3474/**
3475 * put_unbound_pool - put a worker_pool
3476 * @pool: worker_pool to put
3477 *
3478 * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
3479 * safe manner. get_unbound_pool() calls this function on its failure path
3480 * and this function should be able to release pools which went through,
3481 * successfully or not, init_worker_pool().
3482 *
3483 * Should be called with wq_pool_mutex held.
3484 */
3485static void put_unbound_pool(struct worker_pool *pool)
3486{
3487 struct worker *worker;
3488
3489 lockdep_assert_held(&wq_pool_mutex);
3490
3491 if (--pool->refcnt)
3492 return;
3493
3494 /* sanity checks */
3495 if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) ||
3496 WARN_ON(!list_empty(&pool->worklist)))
3497 return;
3498
3499 /* release id and unhash */
3500 if (pool->id >= 0)
3501 idr_remove(&worker_pool_idr, pool->id);
3502 hash_del(&pool->hash_node);
3503
3504 /*
3505 * Become the manager and destroy all workers. Grabbing
3506 * manager_arb prevents @pool's workers from blocking on
3507 * manager_mutex.
3508 */
3509 mutex_lock(&pool->manager_arb);
3510 mutex_lock(&pool->manager_mutex);
3511 spin_lock_irq(&pool->lock);
3512
3513 while ((worker = first_worker(pool)))
3514 destroy_worker(worker);
3515 WARN_ON(pool->nr_workers || pool->nr_idle);
3516
3517 spin_unlock_irq(&pool->lock);
3518 mutex_unlock(&pool->manager_mutex);
3519 mutex_unlock(&pool->manager_arb);
3520
3521 /* shut down the timers */
3522 del_timer_sync(&pool->idle_timer);
3523 del_timer_sync(&pool->mayday_timer);
3524
3525 /* sched-RCU protected to allow dereferences from get_work_pool() */
3526 call_rcu_sched(&pool->rcu, rcu_free_pool);
3527}
3528
3529/**
3530 * get_unbound_pool - get a worker_pool with the specified attributes
3531 * @attrs: the attributes of the worker_pool to get
3532 *
3533 * Obtain a worker_pool which has the same attributes as @attrs, bump the
3534 * reference count and return it. If there already is a matching
3535 * worker_pool, it will be used; otherwise, this function attempts to
3536 * create a new one. On failure, returns NULL.
3537 *
3538 * Should be called with wq_pool_mutex held.
3539 */
3540static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3541{
3542 u32 hash = wqattrs_hash(attrs);
3543 struct worker_pool *pool;
3544 int node;
3545
3546 lockdep_assert_held(&wq_pool_mutex);
3547
3548 /* do we already have a matching pool? */
3549 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
3550 if (wqattrs_equal(pool->attrs, attrs)) {
3551 pool->refcnt++;
3552 goto out_unlock;
3553 }
3554 }
3555
3556 /* nope, create a new one */
3557 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
3558 if (!pool || init_worker_pool(pool) < 0)
3559 goto fail;
3560
3561 if (workqueue_freezing)
3562 pool->flags |= POOL_FREEZING;
3563
3564 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
3565 copy_workqueue_attrs(pool->attrs, attrs);
3566
3567 /* if cpumask is contained inside a NUMA node, we belong to that node */
3568 if (wq_numa_enabled) {
3569 for_each_node(node) {
3570 if (cpumask_subset(pool->attrs->cpumask,
3571 wq_numa_possible_cpumask[node])) {
3572 pool->node = node;
3573 break;
3574 }
3575 }
3576 }
3577
3578 if (worker_pool_assign_id(pool) < 0)
3579 goto fail;
3580
3581 /* create and start the initial worker */
3582 if (create_and_start_worker(pool) < 0)
3583 goto fail;
3584
3585 /* install */
3586 hash_add(unbound_pool_hash, &pool->hash_node, hash);
3587out_unlock:
3588 return pool;
3589fail:
3590 if (pool)
3591 put_unbound_pool(pool);
3592 return NULL;
3593}
3594
3595static void rcu_free_pwq(struct rcu_head *rcu)
3596{
3597 kmem_cache_free(pwq_cache,
3598 container_of(rcu, struct pool_workqueue, rcu));
3599}
3600
3601/*
3602 * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
3603 * and needs to be destroyed.
3604 */
3605static void pwq_unbound_release_workfn(struct work_struct *work)
3606{
3607 struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
3608 unbound_release_work);
3609 struct workqueue_struct *wq = pwq->wq;
3610 struct worker_pool *pool = pwq->pool;
3611 bool is_last;
3612
3613 if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
3614 return;
3615
3616 /*
3617 * Unlink @pwq. Synchronization against wq->mutex isn't strictly
3618 * necessary on release but do it anyway. It's easier to verify
3619 * and consistent with the linking path.
3620 */
3621 mutex_lock(&wq->mutex);
3622 list_del_rcu(&pwq->pwqs_node);
3623 is_last = list_empty(&wq->pwqs);
3624 mutex_unlock(&wq->mutex);
3625
3626 mutex_lock(&wq_pool_mutex);
3627 put_unbound_pool(pool);
3628 mutex_unlock(&wq_pool_mutex);
3629
3630 call_rcu_sched(&pwq->rcu, rcu_free_pwq);
3631
3632 /*
3633 * If we're the last pwq going away, @wq is already dead and no one
3634 * is gonna access it anymore. Free it.
3635 */
3636 if (is_last) {
3637 free_workqueue_attrs(wq->unbound_attrs);
3638 kfree(wq);
3639 }
3640}
3641
3642/**
3643 * pwq_adjust_max_active - update a pwq's max_active to the current setting
3644 * @pwq: target pool_workqueue
3645 *
3646 * If @pwq isn't freezing, set @pwq->max_active to the associated
3647 * workqueue's saved_max_active and activate delayed work items
3648 * accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
3649 */
3650static void pwq_adjust_max_active(struct pool_workqueue *pwq)
3651{
3652 struct workqueue_struct *wq = pwq->wq;
3653 bool freezable = wq->flags & WQ_FREEZABLE;
3654
3655 /* for @wq->saved_max_active */
3656 lockdep_assert_held(&wq->mutex);
3657
3658 /* fast exit for non-freezable wqs */
3659 if (!freezable && pwq->max_active == wq->saved_max_active)
3660 return;
3661
3662 spin_lock_irq(&pwq->pool->lock);
3663
3664 if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) {
3665 pwq->max_active = wq->saved_max_active;
3666
3667 while (!list_empty(&pwq->delayed_works) &&
3668 pwq->nr_active < pwq->max_active)
3669 pwq_activate_first_delayed(pwq);
3107 3670
3108 /* 3671 /*
 3109 * Allocate enough room to align pwq and put an extra 3672 * Need to kick a worker after a thaw or when an unbound wq's
3110 * pointer at the end pointing back to the originally 3673 * max_active is bumped. It's a slow path. Do it always.
3111 * allocated pointer which will be used for free.
3112 */ 3674 */
3113 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); 3675 wake_up_worker(pwq->pool);
3114 if (ptr) { 3676 } else {
3115 wq->pool_wq.single = PTR_ALIGN(ptr, align); 3677 pwq->max_active = 0;
3116 *(void **)(wq->pool_wq.single + 1) = ptr; 3678 }
3679
3680 spin_unlock_irq(&pwq->pool->lock);
3681}
3682
3683/* initialize newly alloced @pwq which is associated with @wq and @pool */
3684static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
3685 struct worker_pool *pool)
3686{
3687 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3688
3689 memset(pwq, 0, sizeof(*pwq));
3690
3691 pwq->pool = pool;
3692 pwq->wq = wq;
3693 pwq->flush_color = -1;
3694 pwq->refcnt = 1;
3695 INIT_LIST_HEAD(&pwq->delayed_works);
3696 INIT_LIST_HEAD(&pwq->pwqs_node);
3697 INIT_LIST_HEAD(&pwq->mayday_node);
3698 INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
3699}
3700
3701/* sync @pwq with the current state of its associated wq and link it */
3702static void link_pwq(struct pool_workqueue *pwq)
3703{
3704 struct workqueue_struct *wq = pwq->wq;
3705
3706 lockdep_assert_held(&wq->mutex);
3707
3708 /* may be called multiple times, ignore if already linked */
3709 if (!list_empty(&pwq->pwqs_node))
3710 return;
3711
3712 /*
3713 * Set the matching work_color. This is synchronized with
3714 * wq->mutex to avoid confusing flush_workqueue().
3715 */
3716 pwq->work_color = wq->work_color;
3717
3718 /* sync max_active to the current setting */
3719 pwq_adjust_max_active(pwq);
3720
3721 /* link in @pwq */
3722 list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
3723}
3724
3725/* obtain a pool matching @attr and create a pwq associating the pool and @wq */
3726static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
3727 const struct workqueue_attrs *attrs)
3728{
3729 struct worker_pool *pool;
3730 struct pool_workqueue *pwq;
3731
3732 lockdep_assert_held(&wq_pool_mutex);
3733
3734 pool = get_unbound_pool(attrs);
3735 if (!pool)
3736 return NULL;
3737
3738 pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
3739 if (!pwq) {
3740 put_unbound_pool(pool);
3741 return NULL;
3742 }
3743
3744 init_pwq(pwq, wq, pool);
3745 return pwq;
3746}
3747
3748/* undo alloc_unbound_pwq(), used only in the error path */
3749static void free_unbound_pwq(struct pool_workqueue *pwq)
3750{
3751 lockdep_assert_held(&wq_pool_mutex);
3752
3753 if (pwq) {
3754 put_unbound_pool(pwq->pool);
3755 kmem_cache_free(pwq_cache, pwq);
3756 }
3757}
3758
3759/**
3760 * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
3761 * @attrs: the wq_attrs of interest
3762 * @node: the target NUMA node
3763 * @cpu_going_down: if >= 0, the CPU to consider as offline
3764 * @cpumask: outarg, the resulting cpumask
3765 *
3766 * Calculate the cpumask a workqueue with @attrs should use on @node. If
3767 * @cpu_going_down is >= 0, that cpu is considered offline during
3768 * calculation. The result is stored in @cpumask. This function returns
3769 * %true if the resulting @cpumask is different from @attrs->cpumask,
3770 * %false if equal.
3771 *
3772 * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
3773 * enabled and @node has online CPUs requested by @attrs, the returned
3774 * cpumask is the intersection of the possible CPUs of @node and
3775 * @attrs->cpumask.
3776 *
3777 * The caller is responsible for ensuring that the cpumask of @node stays
3778 * stable.
3779 */
3780static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
3781 int cpu_going_down, cpumask_t *cpumask)
3782{
3783 if (!wq_numa_enabled || attrs->no_numa)
3784 goto use_dfl;
3785
3786 /* does @node have any online CPUs @attrs wants? */
3787 cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
3788 if (cpu_going_down >= 0)
3789 cpumask_clear_cpu(cpu_going_down, cpumask);
3790
3791 if (cpumask_empty(cpumask))
3792 goto use_dfl;
3793
3794 /* yeap, return possible CPUs in @node that @attrs wants */
3795 cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
3796 return !cpumask_equal(cpumask, attrs->cpumask);
3797
3798use_dfl:
3799 cpumask_copy(cpumask, attrs->cpumask);
3800 return false;
3801}
3802
3803/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
3804static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
3805 int node,
3806 struct pool_workqueue *pwq)
3807{
3808 struct pool_workqueue *old_pwq;
3809
3810 lockdep_assert_held(&wq->mutex);
3811
3812 /* link_pwq() can handle duplicate calls */
3813 link_pwq(pwq);
3814
3815 old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
3816 rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
3817 return old_pwq;
3818}
3819
3820/**
3821 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
3822 * @wq: the target workqueue
3823 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
3824 *
3825 * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
3826 * machines, this function maps a separate pwq to each NUMA node with
 3827 * possible CPUs in @attrs->cpumask so that work items are affine to the
3828 * NUMA node it was issued on. Older pwqs are released as in-flight work
3829 * items finish. Note that a work item which repeatedly requeues itself
3830 * back-to-back will stay on its current pwq.
3831 *
3832 * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on
3833 * failure.
3834 */
3835int apply_workqueue_attrs(struct workqueue_struct *wq,
3836 const struct workqueue_attrs *attrs)
3837{
3838 struct workqueue_attrs *new_attrs, *tmp_attrs;
3839 struct pool_workqueue **pwq_tbl, *dfl_pwq;
3840 int node, ret;
3841
3842 /* only unbound workqueues can change attributes */
3843 if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
3844 return -EINVAL;
3845
3846 /* creating multiple pwqs breaks ordering guarantee */
3847 if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
3848 return -EINVAL;
3849
3850 pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
3851 new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3852 tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3853 if (!pwq_tbl || !new_attrs || !tmp_attrs)
3854 goto enomem;
3855
3856 /* make a copy of @attrs and sanitize it */
3857 copy_workqueue_attrs(new_attrs, attrs);
3858 cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
3859
3860 /*
3861 * We may create multiple pwqs with differing cpumasks. Make a
3862 * copy of @new_attrs which will be modified and used to obtain
3863 * pools.
3864 */
3865 copy_workqueue_attrs(tmp_attrs, new_attrs);
3866
3867 /*
3868 * CPUs should stay stable across pwq creations and installations.
3869 * Pin CPUs, determine the target cpumask for each node and create
3870 * pwqs accordingly.
3871 */
3872 get_online_cpus();
3873
3874 mutex_lock(&wq_pool_mutex);
3875
3876 /*
3877 * If something goes wrong during CPU up/down, we'll fall back to
3878 * the default pwq covering whole @attrs->cpumask. Always create
3879 * it even if we don't use it immediately.
3880 */
3881 dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
3882 if (!dfl_pwq)
3883 goto enomem_pwq;
3884
3885 for_each_node(node) {
3886 if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
3887 pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
3888 if (!pwq_tbl[node])
3889 goto enomem_pwq;
3890 } else {
3891 dfl_pwq->refcnt++;
3892 pwq_tbl[node] = dfl_pwq;
3117 } 3893 }
3118 } 3894 }
3119 3895
3120 /* just in case, make sure it's actually aligned */ 3896 mutex_unlock(&wq_pool_mutex);
3121 BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); 3897
3122 return wq->pool_wq.v ? 0 : -ENOMEM; 3898 /* all pwqs have been created successfully, let's install'em */
3899 mutex_lock(&wq->mutex);
3900
3901 copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
3902
3903 /* save the previous pwq and install the new one */
3904 for_each_node(node)
3905 pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
3906
3907 /* @dfl_pwq might not have been used, ensure it's linked */
3908 link_pwq(dfl_pwq);
3909 swap(wq->dfl_pwq, dfl_pwq);
3910
3911 mutex_unlock(&wq->mutex);
3912
3913 /* put the old pwqs */
3914 for_each_node(node)
3915 put_pwq_unlocked(pwq_tbl[node]);
3916 put_pwq_unlocked(dfl_pwq);
3917
3918 put_online_cpus();
3919 ret = 0;
3920 /* fall through */
3921out_free:
3922 free_workqueue_attrs(tmp_attrs);
3923 free_workqueue_attrs(new_attrs);
3924 kfree(pwq_tbl);
3925 return ret;
3926
3927enomem_pwq:
3928 free_unbound_pwq(dfl_pwq);
3929 for_each_node(node)
3930 if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
3931 free_unbound_pwq(pwq_tbl[node]);
3932 mutex_unlock(&wq_pool_mutex);
3933 put_online_cpus();
3934enomem:
3935 ret = -ENOMEM;
3936 goto out_free;
3123} 3937}
3124 3938
3125static void free_pwqs(struct workqueue_struct *wq) 3939/**
3940 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
3941 * @wq: the target workqueue
3942 * @cpu: the CPU coming up or going down
3943 * @online: whether @cpu is coming up or going down
3944 *
3945 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
3946 * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of
3947 * @wq accordingly.
3948 *
3949 * If NUMA affinity can't be adjusted due to memory allocation failure, it
3950 * falls back to @wq->dfl_pwq which may not be optimal but is always
3951 * correct.
3952 *
3953 * Note that when the last allowed CPU of a NUMA node goes offline for a
3954 * workqueue with a cpumask spanning multiple nodes, the workers which were
3955 * already executing the work items for the workqueue will lose their CPU
3956 * affinity and may execute on any CPU. This is similar to how per-cpu
3957 * workqueues behave on CPU_DOWN. If a workqueue user wants strict
3958 * affinity, it's the user's responsibility to flush the work item from
3959 * CPU_DOWN_PREPARE.
3960 */
3961static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
3962 bool online)
3126{ 3963{
3127 if (!(wq->flags & WQ_UNBOUND)) 3964 int node = cpu_to_node(cpu);
3128 free_percpu(wq->pool_wq.pcpu); 3965 int cpu_off = online ? -1 : cpu;
3129 else if (wq->pool_wq.single) { 3966 struct pool_workqueue *old_pwq = NULL, *pwq;
3130 /* the pointer to free is stored right after the pwq */ 3967 struct workqueue_attrs *target_attrs;
3131 kfree(*(void **)(wq->pool_wq.single + 1)); 3968 cpumask_t *cpumask;
3969
3970 lockdep_assert_held(&wq_pool_mutex);
3971
3972 if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
3973 return;
3974
3975 /*
3976 * We don't wanna alloc/free wq_attrs for each wq for each CPU.
3977 * Let's use a preallocated one. The following buf is protected by
3978 * CPU hotplug exclusion.
3979 */
3980 target_attrs = wq_update_unbound_numa_attrs_buf;
3981 cpumask = target_attrs->cpumask;
3982
3983 mutex_lock(&wq->mutex);
3984 if (wq->unbound_attrs->no_numa)
3985 goto out_unlock;
3986
3987 copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
3988 pwq = unbound_pwq_by_node(wq, node);
3989
3990 /*
3991 * Let's determine what needs to be done. If the target cpumask is
3992 * different from wq's, we need to compare it to @pwq's and create
3993 * a new one if they don't match. If the target cpumask equals
3994 * wq's, the default pwq should be used. If @pwq is already the
3995 * default one, nothing to do; otherwise, install the default one.
3996 */
3997 if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
3998 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
3999 goto out_unlock;
4000 } else {
4001 if (pwq == wq->dfl_pwq)
4002 goto out_unlock;
4003 else
4004 goto use_dfl_pwq;
4005 }
4006
4007 mutex_unlock(&wq->mutex);
4008
4009 /* create a new pwq */
4010 pwq = alloc_unbound_pwq(wq, target_attrs);
4011 if (!pwq) {
4012 pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
4013 wq->name);
4014 goto out_unlock;
4015 }
4016
4017 /*
4018 * Install the new pwq. As this function is called only from CPU
4019 * hotplug callbacks and applying a new attrs is wrapped with
4020 * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
4021 * inbetween.
4022 */
4023 mutex_lock(&wq->mutex);
4024 old_pwq = numa_pwq_tbl_install(wq, node, pwq);
4025 goto out_unlock;
4026
4027use_dfl_pwq:
4028 spin_lock_irq(&wq->dfl_pwq->pool->lock);
4029 get_pwq(wq->dfl_pwq);
4030 spin_unlock_irq(&wq->dfl_pwq->pool->lock);
4031 old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
4032out_unlock:
4033 mutex_unlock(&wq->mutex);
4034 put_pwq_unlocked(old_pwq);
4035}
4036
4037static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4038{
4039 bool highpri = wq->flags & WQ_HIGHPRI;
4040 int cpu;
4041
4042 if (!(wq->flags & WQ_UNBOUND)) {
4043 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
4044 if (!wq->cpu_pwqs)
4045 return -ENOMEM;
4046
4047 for_each_possible_cpu(cpu) {
4048 struct pool_workqueue *pwq =
4049 per_cpu_ptr(wq->cpu_pwqs, cpu);
4050 struct worker_pool *cpu_pools =
4051 per_cpu(cpu_worker_pools, cpu);
4052
4053 init_pwq(pwq, wq, &cpu_pools[highpri]);
4054
4055 mutex_lock(&wq->mutex);
4056 link_pwq(pwq);
4057 mutex_unlock(&wq->mutex);
4058 }
4059 return 0;
4060 } else {
4061 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
3132 } 4062 }
3133} 4063}
3134 4064
@@ -3150,30 +4080,28 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3150 struct lock_class_key *key, 4080 struct lock_class_key *key,
3151 const char *lock_name, ...) 4081 const char *lock_name, ...)
3152{ 4082{
3153 va_list args, args1; 4083 size_t tbl_size = 0;
4084 va_list args;
3154 struct workqueue_struct *wq; 4085 struct workqueue_struct *wq;
3155 unsigned int cpu; 4086 struct pool_workqueue *pwq;
3156 size_t namelen;
3157 4087
3158 /* determine namelen, allocate wq and format name */ 4088 /* allocate wq and format name */
3159 va_start(args, lock_name); 4089 if (flags & WQ_UNBOUND)
3160 va_copy(args1, args); 4090 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
3161 namelen = vsnprintf(NULL, 0, fmt, args) + 1;
3162 4091
3163 wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); 4092 wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
3164 if (!wq) 4093 if (!wq)
3165 goto err; 4094 return NULL;
3166 4095
3167 vsnprintf(wq->name, namelen, fmt, args1); 4096 if (flags & WQ_UNBOUND) {
3168 va_end(args); 4097 wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3169 va_end(args1); 4098 if (!wq->unbound_attrs)
4099 goto err_free_wq;
4100 }
3170 4101
3171 /* 4102 va_start(args, lock_name);
3172 * Workqueues which may be used during memory reclaim should 4103 vsnprintf(wq->name, sizeof(wq->name), fmt, args);
3173 * have a rescuer to guarantee forward progress. 4104 va_end(args);
3174 */
3175 if (flags & WQ_MEM_RECLAIM)
3176 flags |= WQ_RESCUER;
3177 4105
3178 max_active = max_active ?: WQ_DFL_ACTIVE; 4106 max_active = max_active ?: WQ_DFL_ACTIVE;
3179 max_active = wq_clamp_max_active(max_active, flags, wq->name); 4107 max_active = wq_clamp_max_active(max_active, flags, wq->name);
@@ -3181,71 +4109,70 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3181 /* init wq */ 4109 /* init wq */
3182 wq->flags = flags; 4110 wq->flags = flags;
3183 wq->saved_max_active = max_active; 4111 wq->saved_max_active = max_active;
3184 mutex_init(&wq->flush_mutex); 4112 mutex_init(&wq->mutex);
3185 atomic_set(&wq->nr_pwqs_to_flush, 0); 4113 atomic_set(&wq->nr_pwqs_to_flush, 0);
4114 INIT_LIST_HEAD(&wq->pwqs);
3186 INIT_LIST_HEAD(&wq->flusher_queue); 4115 INIT_LIST_HEAD(&wq->flusher_queue);
3187 INIT_LIST_HEAD(&wq->flusher_overflow); 4116 INIT_LIST_HEAD(&wq->flusher_overflow);
4117 INIT_LIST_HEAD(&wq->maydays);
3188 4118
3189 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 4119 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3190 INIT_LIST_HEAD(&wq->list); 4120 INIT_LIST_HEAD(&wq->list);
3191 4121
3192 if (alloc_pwqs(wq) < 0) 4122 if (alloc_and_link_pwqs(wq) < 0)
3193 goto err; 4123 goto err_free_wq;
3194
3195 for_each_pwq_cpu(cpu, wq) {
3196 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3197
3198 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3199 pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI);
3200 pwq->wq = wq;
3201 pwq->flush_color = -1;
3202 pwq->max_active = max_active;
3203 INIT_LIST_HEAD(&pwq->delayed_works);
3204 }
3205 4124
3206 if (flags & WQ_RESCUER) { 4125 /*
4126 * Workqueues which may be used during memory reclaim should
4127 * have a rescuer to guarantee forward progress.
4128 */
4129 if (flags & WQ_MEM_RECLAIM) {
3207 struct worker *rescuer; 4130 struct worker *rescuer;
3208 4131
3209 if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) 4132 rescuer = alloc_worker();
3210 goto err;
3211
3212 wq->rescuer = rescuer = alloc_worker();
3213 if (!rescuer) 4133 if (!rescuer)
3214 goto err; 4134 goto err_destroy;
3215 4135
3216 rescuer->rescue_wq = wq; 4136 rescuer->rescue_wq = wq;
3217 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", 4137 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
3218 wq->name); 4138 wq->name);
3219 if (IS_ERR(rescuer->task)) 4139 if (IS_ERR(rescuer->task)) {
3220 goto err; 4140 kfree(rescuer);
4141 goto err_destroy;
4142 }
3221 4143
3222 rescuer->task->flags |= PF_THREAD_BOUND; 4144 wq->rescuer = rescuer;
4145 rescuer->task->flags |= PF_NO_SETAFFINITY;
3223 wake_up_process(rescuer->task); 4146 wake_up_process(rescuer->task);
3224 } 4147 }
3225 4148
4149 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
4150 goto err_destroy;
4151
3226 /* 4152 /*
3227 * workqueue_lock protects global freeze state and workqueues 4153 * wq_pool_mutex protects global freeze state and workqueues list.
3228 * list. Grab it, set max_active accordingly and add the new 4154 * Grab it, adjust max_active and add the new @wq to workqueues
3229 * workqueue to workqueues list. 4155 * list.
3230 */ 4156 */
3231 spin_lock(&workqueue_lock); 4157 mutex_lock(&wq_pool_mutex);
3232 4158
3233 if (workqueue_freezing && wq->flags & WQ_FREEZABLE) 4159 mutex_lock(&wq->mutex);
3234 for_each_pwq_cpu(cpu, wq) 4160 for_each_pwq(pwq, wq)
3235 get_pwq(cpu, wq)->max_active = 0; 4161 pwq_adjust_max_active(pwq);
4162 mutex_unlock(&wq->mutex);
3236 4163
3237 list_add(&wq->list, &workqueues); 4164 list_add(&wq->list, &workqueues);
3238 4165
3239 spin_unlock(&workqueue_lock); 4166 mutex_unlock(&wq_pool_mutex);
3240 4167
3241 return wq; 4168 return wq;
3242err: 4169
3243 if (wq) { 4170err_free_wq:
3244 free_pwqs(wq); 4171 free_workqueue_attrs(wq->unbound_attrs);
3245 free_mayday_mask(wq->mayday_mask); 4172 kfree(wq);
3246 kfree(wq->rescuer); 4173 return NULL;
3247 kfree(wq); 4174err_destroy:
3248 } 4175 destroy_workqueue(wq);
3249 return NULL; 4176 return NULL;
3250} 4177}
3251EXPORT_SYMBOL_GPL(__alloc_workqueue_key); 4178EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
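__alloc_workqueue_key() is normally reached through the alloc_workqueue() macro. A caller-side sketch of the common lifecycle under the reworked path; the names are hypothetical, and note that WQ_MEM_RECLAIM alone now requests the rescuer (the separate WQ_RESCUER flag is gone):

#include <linux/workqueue.h>
#include <linux/errno.h>

static void io_retry_fn(struct work_struct *work)
{
	/* work that may run on the memory-reclaim path */
}

static DECLARE_WORK(io_retry_work, io_retry_fn);
static struct workqueue_struct *io_wq;

static int example_setup(void)
{
	/* WQ_MEM_RECLAIM guarantees a rescuer for forward progress */
	io_wq = alloc_workqueue("example_io",
				WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
	if (!io_wq)
		return -ENOMEM;

	queue_work(io_wq, &io_retry_work);
	return 0;
}

static void example_teardown(void)
{
	/* drains the queue, then releases the pwqs and the wq itself */
	destroy_workqueue(io_wq);
}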
@@ -3258,60 +4185,78 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
3258 */ 4185 */
3259void destroy_workqueue(struct workqueue_struct *wq) 4186void destroy_workqueue(struct workqueue_struct *wq)
3260{ 4187{
3261 unsigned int cpu; 4188 struct pool_workqueue *pwq;
4189 int node;
3262 4190
3263 /* drain it before proceeding with destruction */ 4191 /* drain it before proceeding with destruction */
3264 drain_workqueue(wq); 4192 drain_workqueue(wq);
3265 4193
4194 /* sanity checks */
4195 mutex_lock(&wq->mutex);
4196 for_each_pwq(pwq, wq) {
4197 int i;
4198
4199 for (i = 0; i < WORK_NR_COLORS; i++) {
4200 if (WARN_ON(pwq->nr_in_flight[i])) {
4201 mutex_unlock(&wq->mutex);
4202 return;
4203 }
4204 }
4205
4206 if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) ||
4207 WARN_ON(pwq->nr_active) ||
4208 WARN_ON(!list_empty(&pwq->delayed_works))) {
4209 mutex_unlock(&wq->mutex);
4210 return;
4211 }
4212 }
4213 mutex_unlock(&wq->mutex);
4214
3266 /* 4215 /*
3267 * wq list is used to freeze wq, remove from list after 4216 * wq list is used to freeze wq, remove from list after
3268 * flushing is complete in case freeze races us. 4217 * flushing is complete in case freeze races us.
3269 */ 4218 */
3270 spin_lock(&workqueue_lock); 4219 mutex_lock(&wq_pool_mutex);
3271 list_del(&wq->list); 4220 list_del_init(&wq->list);
3272 spin_unlock(&workqueue_lock); 4221 mutex_unlock(&wq_pool_mutex);
3273 4222
3274 /* sanity check */ 4223 workqueue_sysfs_unregister(wq);
3275 for_each_pwq_cpu(cpu, wq) {
3276 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3277 int i;
3278
3279 for (i = 0; i < WORK_NR_COLORS; i++)
3280 BUG_ON(pwq->nr_in_flight[i]);
3281 BUG_ON(pwq->nr_active);
3282 BUG_ON(!list_empty(&pwq->delayed_works));
3283 }
3284 4224
3285 if (wq->flags & WQ_RESCUER) { 4225 if (wq->rescuer) {
3286 kthread_stop(wq->rescuer->task); 4226 kthread_stop(wq->rescuer->task);
3287 free_mayday_mask(wq->mayday_mask);
3288 kfree(wq->rescuer); 4227 kfree(wq->rescuer);
4228 wq->rescuer = NULL;
3289 } 4229 }
3290 4230
3291 free_pwqs(wq); 4231 if (!(wq->flags & WQ_UNBOUND)) {
3292 kfree(wq); 4232 /*
3293} 4233 * The base ref is never dropped on per-cpu pwqs. Directly
3294EXPORT_SYMBOL_GPL(destroy_workqueue); 4234 * free the pwqs and wq.
3295 4235 */
3296/** 4236 free_percpu(wq->cpu_pwqs);
3297 * pwq_set_max_active - adjust max_active of a pwq 4237 kfree(wq);
3298 * @pwq: target pool_workqueue 4238 } else {
3299 * @max_active: new max_active value. 4239 /*
3300 * 4240 * We're the sole accessor of @wq at this point. Directly
3301 * Set @pwq->max_active to @max_active and activate delayed works if 4241 * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
3302 * increased. 4242 * @wq will be freed when the last pwq is released.
3303 * 4243 */
3304 * CONTEXT: 4244 for_each_node(node) {
3305 * spin_lock_irq(pool->lock). 4245 pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
3306 */ 4246 RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
3307static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) 4247 put_pwq_unlocked(pwq);
3308{ 4248 }
3309 pwq->max_active = max_active;
3310 4249
3311 while (!list_empty(&pwq->delayed_works) && 4250 /*
3312 pwq->nr_active < pwq->max_active) 4251 * Put dfl_pwq. @wq may be freed any time after dfl_pwq is
3313 pwq_activate_first_delayed(pwq); 4252 * put. Don't access it afterwards.
4253 */
4254 pwq = wq->dfl_pwq;
4255 wq->dfl_pwq = NULL;
4256 put_pwq_unlocked(pwq);
4257 }
3314} 4258}
4259EXPORT_SYMBOL_GPL(destroy_workqueue);
3315 4260
3316/** 4261/**
3317 * workqueue_set_max_active - adjust max_active of a workqueue 4262 * workqueue_set_max_active - adjust max_active of a workqueue
@@ -3325,30 +4270,37 @@ static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
3325 */ 4270 */
3326void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) 4271void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3327{ 4272{
3328 unsigned int cpu; 4273 struct pool_workqueue *pwq;
4274
4275 /* disallow meddling with max_active for ordered workqueues */
4276 if (WARN_ON(wq->flags & __WQ_ORDERED))
4277 return;
3329 4278
3330 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); 4279 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
3331 4280
3332 spin_lock(&workqueue_lock); 4281 mutex_lock(&wq->mutex);
3333 4282
3334 wq->saved_max_active = max_active; 4283 wq->saved_max_active = max_active;
3335 4284
3336 for_each_pwq_cpu(cpu, wq) { 4285 for_each_pwq(pwq, wq)
3337 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4286 pwq_adjust_max_active(pwq);
3338 struct worker_pool *pool = pwq->pool;
3339 4287
3340 spin_lock_irq(&pool->lock); 4288 mutex_unlock(&wq->mutex);
3341 4289}
3342 if (!(wq->flags & WQ_FREEZABLE) || 4290EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3343 !(pool->flags & POOL_FREEZING))
3344 pwq_set_max_active(pwq, max_active);
3345 4291
3346 spin_unlock_irq(&pool->lock); 4292/**
3347 } 4293 * current_is_workqueue_rescuer - is %current workqueue rescuer?
4294 *
4295 * Determine whether %current is a workqueue rescuer. Can be used from
4296 * work functions to determine whether it's being run off the rescuer task.
4297 */
4298bool current_is_workqueue_rescuer(void)
4299{
4300 struct worker *worker = current_wq_worker();
3348 4301
3349 spin_unlock(&workqueue_lock); 4302 return worker && worker->rescue_wq;
3350} 4303}
3351EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3352 4304
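Two small user-facing pieces sit in this hunk: workqueue_set_max_active(), which now refuses ordered workqueues, and the new current_is_workqueue_rescuer() helper. A hedged sketch of how a driver might use them; the function names are hypothetical:

#include <linux/workqueue.h>

static void reclaim_fn(struct work_struct *work)
{
	/*
	 * When running off the rescuer, this is the only thread making
	 * progress for the whole workqueue, so skip optional extras.
	 */
	if (current_is_workqueue_rescuer())
		return;

	/* ... normal, possibly memory-hungry processing ... */
}

static void example_throttle(struct workqueue_struct *wq)
{
	/* rejected with a WARN_ON() for __WQ_ORDERED workqueues */
	workqueue_set_max_active(wq, 4);
}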
3353/** 4305/**
3354 * workqueue_congested - test whether a workqueue is congested 4306 * workqueue_congested - test whether a workqueue is congested
@@ -3362,11 +4314,22 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3362 * RETURNS: 4314 * RETURNS:
3363 * %true if congested, %false otherwise. 4315 * %true if congested, %false otherwise.
3364 */ 4316 */
3365bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) 4317bool workqueue_congested(int cpu, struct workqueue_struct *wq)
3366{ 4318{
3367 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4319 struct pool_workqueue *pwq;
4320 bool ret;
4321
4322 rcu_read_lock_sched();
4323
4324 if (!(wq->flags & WQ_UNBOUND))
4325 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
4326 else
4327 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
3368 4328
3369 return !list_empty(&pwq->delayed_works); 4329 ret = !list_empty(&pwq->delayed_works);
4330 rcu_read_unlock_sched();
4331
4332 return ret;
3370} 4333}
3371EXPORT_SYMBOL_GPL(workqueue_congested); 4334EXPORT_SYMBOL_GPL(workqueue_congested);
3372 4335
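workqueue_congested() now answers per-CPU for bound workqueues and per-NUMA-node for unbound ones. A hedged sketch of an opportunistic producer; at this point in the series the caller is expected to pass a real CPU id (the WORK_CPU_UNBOUND shortcut is not handled here):

#include <linux/workqueue.h>

static bool example_try_queue_on(int cpu, struct workqueue_struct *wq,
				 struct work_struct *work)
{
	/* true means the relevant pwq already has delayed (throttled) work */
	if (workqueue_congested(cpu, wq))
		return false;	/* caller backs off or picks another CPU */

	return queue_work_on(cpu, wq, work);
}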
@@ -3383,24 +4346,104 @@ EXPORT_SYMBOL_GPL(workqueue_congested);
3383 */ 4346 */
3384unsigned int work_busy(struct work_struct *work) 4347unsigned int work_busy(struct work_struct *work)
3385{ 4348{
3386 struct worker_pool *pool = get_work_pool(work); 4349 struct worker_pool *pool;
3387 unsigned long flags; 4350 unsigned long flags;
3388 unsigned int ret = 0; 4351 unsigned int ret = 0;
3389 4352
3390 if (work_pending(work)) 4353 if (work_pending(work))
3391 ret |= WORK_BUSY_PENDING; 4354 ret |= WORK_BUSY_PENDING;
3392 4355
4356 local_irq_save(flags);
4357 pool = get_work_pool(work);
3393 if (pool) { 4358 if (pool) {
3394 spin_lock_irqsave(&pool->lock, flags); 4359 spin_lock(&pool->lock);
3395 if (find_worker_executing_work(pool, work)) 4360 if (find_worker_executing_work(pool, work))
3396 ret |= WORK_BUSY_RUNNING; 4361 ret |= WORK_BUSY_RUNNING;
3397 spin_unlock_irqrestore(&pool->lock, flags); 4362 spin_unlock(&pool->lock);
3398 } 4363 }
4364 local_irq_restore(flags);
3399 4365
3400 return ret; 4366 return ret;
3401} 4367}
3402EXPORT_SYMBOL_GPL(work_busy); 4368EXPORT_SYMBOL_GPL(work_busy);
3403 4369
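work_busy() returns a bitmask of WORK_BUSY_PENDING and WORK_BUSY_RUNNING; its answer is advisory since the state can change immediately afterwards. A small sketch with a hypothetical reporting helper:

#include <linux/workqueue.h>
#include <linux/printk.h>

static void example_report(struct work_struct *work)
{
	unsigned int state = work_busy(work);

	/* advisory only: the state may change right after the call */
	pr_info("work %p: pending=%d running=%d\n", work,
		!!(state & WORK_BUSY_PENDING),
		!!(state & WORK_BUSY_RUNNING));
}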
4370/**
4371 * set_worker_desc - set description for the current work item
4372 * @fmt: printf-style format string
4373 * @...: arguments for the format string
4374 *
4375 * This function can be called by a running work function to describe what
4376 * the work item is about. If the worker task gets dumped, this
4377 * information will be printed out together to help debugging. The
4378 * description can be at most WORKER_DESC_LEN including the trailing '\0'.
4379 */
4380void set_worker_desc(const char *fmt, ...)
4381{
4382 struct worker *worker = current_wq_worker();
4383 va_list args;
4384
4385 if (worker) {
4386 va_start(args, fmt);
4387 vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
4388 va_end(args);
4389 worker->desc_valid = true;
4390 }
4391}
4392
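set_worker_desc() is meant to be called from inside a work function so that print_worker_info() below can show what the worker was doing when its task gets dumped. A hedged sketch; the request structure is hypothetical:

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct example_req {
	struct work_struct work;
	int id;
};

static void example_req_fn(struct work_struct *work)
{
	struct example_req *req = container_of(work, struct example_req, work);

	/* shown by print_worker_info() if this worker's task gets dumped */
	set_worker_desc("example req %d", req->id);

	/* ... process the request ... */
}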
4393/**
4394 * print_worker_info - print out worker information and description
4395 * @log_lvl: the log level to use when printing
4396 * @task: target task
4397 *
4398 * If @task is a worker and currently executing a work item, print out the
4399 * name of the workqueue being serviced and worker description set with
4400 * set_worker_desc() by the currently executing work item.
4401 *
4402 * This function can be safely called on any task as long as the
4403 * task_struct itself is accessible. While safe, this function isn't
 4404 * synchronized and may print out mixed-up or garbage data of limited length.
4405 */
4406void print_worker_info(const char *log_lvl, struct task_struct *task)
4407{
4408 work_func_t *fn = NULL;
4409 char name[WQ_NAME_LEN] = { };
4410 char desc[WORKER_DESC_LEN] = { };
4411 struct pool_workqueue *pwq = NULL;
4412 struct workqueue_struct *wq = NULL;
4413 bool desc_valid = false;
4414 struct worker *worker;
4415
4416 if (!(task->flags & PF_WQ_WORKER))
4417 return;
4418
4419 /*
4420 * This function is called without any synchronization and @task
4421 * could be in any state. Be careful with dereferences.
4422 */
4423 worker = probe_kthread_data(task);
4424
4425 /*
4426 * Carefully copy the associated workqueue's workfn and name. Keep
4427 * the original last '\0' in case the original contains garbage.
4428 */
4429 probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
4430 probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
4431 probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
4432 probe_kernel_read(name, wq->name, sizeof(name) - 1);
4433
4434 /* copy worker description */
4435 probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid));
4436 if (desc_valid)
4437 probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
4438
4439 if (fn || name[0] || desc[0]) {
4440 printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
4441 if (desc[0])
4442 pr_cont(" (%s)", desc);
4443 pr_cont("\n");
4444 }
4445}
4446
3404/* 4447/*
3405 * CPU hotplug. 4448 * CPU hotplug.
3406 * 4449 *
@@ -3421,53 +4464,153 @@ static void wq_unbind_fn(struct work_struct *work)
3421 int cpu = smp_processor_id(); 4464 int cpu = smp_processor_id();
3422 struct worker_pool *pool; 4465 struct worker_pool *pool;
3423 struct worker *worker; 4466 struct worker *worker;
3424 int i; 4467 int wi;
3425 4468
3426 for_each_std_worker_pool(pool, cpu) { 4469 for_each_cpu_worker_pool(pool, cpu) {
3427 BUG_ON(cpu != smp_processor_id()); 4470 WARN_ON_ONCE(cpu != smp_processor_id());
3428 4471
3429 mutex_lock(&pool->assoc_mutex); 4472 mutex_lock(&pool->manager_mutex);
3430 spin_lock_irq(&pool->lock); 4473 spin_lock_irq(&pool->lock);
3431 4474
3432 /* 4475 /*
3433 * We've claimed all manager positions. Make all workers 4476 * We've blocked all manager operations. Make all workers
3434 * unbound and set DISASSOCIATED. Before this, all workers 4477 * unbound and set DISASSOCIATED. Before this, all workers
3435 * except for the ones which are still executing works from 4478 * except for the ones which are still executing works from
3436 * before the last CPU down must be on the cpu. After 4479 * before the last CPU down must be on the cpu. After
3437 * this, they may become diasporas. 4480 * this, they may become diasporas.
3438 */ 4481 */
3439 list_for_each_entry(worker, &pool->idle_list, entry) 4482 for_each_pool_worker(worker, wi, pool)
3440 worker->flags |= WORKER_UNBOUND;
3441
3442 for_each_busy_worker(worker, i, pool)
3443 worker->flags |= WORKER_UNBOUND; 4483 worker->flags |= WORKER_UNBOUND;
3444 4484
3445 pool->flags |= POOL_DISASSOCIATED; 4485 pool->flags |= POOL_DISASSOCIATED;
3446 4486
3447 spin_unlock_irq(&pool->lock); 4487 spin_unlock_irq(&pool->lock);
3448 mutex_unlock(&pool->assoc_mutex); 4488 mutex_unlock(&pool->manager_mutex);
4489
4490 /*
4491 * Call schedule() so that we cross rq->lock and thus can
4492 * guarantee sched callbacks see the %WORKER_UNBOUND flag.
4493 * This is necessary as scheduler callbacks may be invoked
4494 * from other cpus.
4495 */
4496 schedule();
4497
4498 /*
4499 * Sched callbacks are disabled now. Zap nr_running.
4500 * After this, nr_running stays zero and need_more_worker()
4501 * and keep_working() are always true as long as the
4502 * worklist is not empty. This pool now behaves as an
4503 * unbound (in terms of concurrency management) pool which
4504 * are served by workers tied to the pool.
4505 */
4506 atomic_set(&pool->nr_running, 0);
4507
4508 /*
4509 * With concurrency management just turned off, a busy
4510 * worker blocking could lead to lengthy stalls. Kick off
4511 * unbound chain execution of currently pending work items.
4512 */
4513 spin_lock_irq(&pool->lock);
4514 wake_up_worker(pool);
4515 spin_unlock_irq(&pool->lock);
3449 } 4516 }
4517}
3450 4518
3451 /* 4519/**
3452 * Call schedule() so that we cross rq->lock and thus can guarantee 4520 * rebind_workers - rebind all workers of a pool to the associated CPU
3453 * sched callbacks see the %WORKER_UNBOUND flag. This is necessary 4521 * @pool: pool of interest
3454 * as scheduler callbacks may be invoked from other cpus. 4522 *
3455 */ 4523 * @pool->cpu is coming online. Rebind all workers to the CPU.
3456 schedule(); 4524 */
4525static void rebind_workers(struct worker_pool *pool)
4526{
4527 struct worker *worker;
4528 int wi;
4529
4530 lockdep_assert_held(&pool->manager_mutex);
3457 4531
3458 /* 4532 /*
3459 * Sched callbacks are disabled now. Zap nr_running. After this, 4533 * Restore CPU affinity of all workers. As all idle workers should
3460 * nr_running stays zero and need_more_worker() and keep_working() 4534 * be on the run-queue of the associated CPU before any local
3461  * nr_running stays zero and need_more_worker() and keep_working()      4535  * wake-ups for concurrency management happen, restore CPU affinity
3462 * @cpu now behave as unbound (in terms of concurrency management) 4536 * of all workers first and then clear UNBOUND. As we're called
3463 * pools which are served by workers tied to the CPU. 4537 * from CPU_ONLINE, the following shouldn't fail.
3464 *
3465 * On return from this function, the current worker would trigger
3466 * unbound chain execution of pending work items if other workers
3467 * didn't already.
3468 */ 4538 */
3469 for_each_std_worker_pool(pool, cpu) 4539 for_each_pool_worker(worker, wi, pool)
3470 atomic_set(&pool->nr_running, 0); 4540 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4541 pool->attrs->cpumask) < 0);
4542
4543 spin_lock_irq(&pool->lock);
4544
4545 for_each_pool_worker(worker, wi, pool) {
4546 unsigned int worker_flags = worker->flags;
4547
4548 /*
4549 * A bound idle worker should actually be on the runqueue
4550 * of the associated CPU for local wake-ups targeting it to
4551 * work. Kick all idle workers so that they migrate to the
4552 * associated CPU. Doing this in the same loop as
4553 * replacing UNBOUND with REBOUND is safe as no worker will
4554 * be bound before @pool->lock is released.
4555 */
4556 if (worker_flags & WORKER_IDLE)
4557 wake_up_process(worker->task);
4558
4559 /*
4560 * We want to clear UNBOUND but can't directly call
4561 * worker_clr_flags() or adjust nr_running. Atomically
4562 * replace UNBOUND with another NOT_RUNNING flag REBOUND.
4563 * @worker will clear REBOUND using worker_clr_flags() when
4564 * it initiates the next execution cycle thus restoring
4565 * concurrency management. Note that when or whether
4566 * @worker clears REBOUND doesn't affect correctness.
4567 *
4568 * ACCESS_ONCE() is necessary because @worker->flags may be
4569 * tested without holding any lock in
4570 * wq_worker_waking_up(). Without it, NOT_RUNNING test may
4571 * fail incorrectly leading to premature concurrency
4572 * management operations.
4573 */
4574 WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
4575 worker_flags |= WORKER_REBOUND;
4576 worker_flags &= ~WORKER_UNBOUND;
4577 ACCESS_ONCE(worker->flags) = worker_flags;
4578 }
4579
4580 spin_unlock_irq(&pool->lock);
4581}
4582
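The UNBOUND -> REBOUND handoff above computes the new flag word privately and publishes it with a single store, so a lockless reader (wq_worker_waking_up() tests worker->flags without the pool lock) never observes a window with neither NOT_RUNNING flag set. A userspace sketch of that publish pattern; the flag values and the C11 atomics are stand-ins for illustration, not the kernel's flags or its ACCESS_ONCE():

#include <stdatomic.h>
#include <stdio.h>

#define F_UNBOUND  (1u << 0)    /* stands in for WORKER_UNBOUND */
#define F_REBOUND  (1u << 1)    /* stands in for WORKER_REBOUND */

static _Atomic unsigned int worker_flags = F_UNBOUND;

static void rebind_one(void)
{
        unsigned int flags = atomic_load_explicit(&worker_flags,
                                                  memory_order_relaxed);

        /* compute the new value privately ... */
        flags |= F_REBOUND;
        flags &= ~F_UNBOUND;

        /* ... then publish it in one store, as ACCESS_ONCE(worker->flags) = flags does */
        atomic_store_explicit(&worker_flags, flags, memory_order_relaxed);
}

int main(void)
{
        rebind_one();
        printf("flags after rebind: %#x\n",
               atomic_load_explicit(&worker_flags, memory_order_relaxed));
        return 0;
}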
4583/**
4584 * restore_unbound_workers_cpumask - restore cpumask of unbound workers
4585 * @pool: unbound pool of interest
4586 * @cpu: the CPU which is coming up
4587 *
4588 * An unbound pool may end up with a cpumask which doesn't have any online
 4589  * CPUs. When a worker of such a pool gets scheduled, the scheduler resets
4590 * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any
4591 * online CPU before, cpus_allowed of all its workers should be restored.
4592 */
4593static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4594{
4595 static cpumask_t cpumask;
4596 struct worker *worker;
4597 int wi;
4598
4599 lockdep_assert_held(&pool->manager_mutex);
4600
4601 /* is @cpu allowed for @pool? */
4602 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
4603 return;
4604
4605 /* is @cpu the only online CPU? */
4606 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
4607 if (cpumask_weight(&cpumask) != 1)
4608 return;
4609
4610 /* as we're called from CPU_ONLINE, the following shouldn't fail */
4611 for_each_pool_worker(worker, wi, pool)
4612 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4613 pool->attrs->cpumask) < 0);
3471} 4614}
3472 4615
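restore_unbound_workers_cpumask() only acts when the incoming CPU is the first CPU of the pool's cpumask to come online, i.e. when the intersection of the pool's mask and the online mask has weight 1. A userspace sketch of that test with a 64-bit word in place of cpumask_t; the helper name is invented, and __builtin_popcountll is the GCC/Clang builtin:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool pool_needs_cpumask_restore(uint64_t pool_mask,
                                       uint64_t online_mask, int cpu)
{
        /* is @cpu allowed for the pool at all? */
        if (!(pool_mask & (1ULL << cpu)))
                return false;

        /* is @cpu the only online CPU in the pool's mask? */
        return __builtin_popcountll(pool_mask & online_mask) == 1;
}

int main(void)
{
        /* pool allows CPUs 2 and 3; only CPU 2 is online (it just came up) */
        printf("%d\n", pool_needs_cpumask_restore(0xc, 0x4, 2));  /* 1 */
        /* CPU 3 comes up while CPU 2 is already online: nothing to restore */
        printf("%d\n", pool_needs_cpumask_restore(0xc, 0xc, 3));  /* 0 */
        return 0;
}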
3473/* 4616/*
@@ -3478,39 +4621,46 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3478 unsigned long action, 4621 unsigned long action,
3479 void *hcpu) 4622 void *hcpu)
3480{ 4623{
3481 unsigned int cpu = (unsigned long)hcpu; 4624 int cpu = (unsigned long)hcpu;
3482 struct worker_pool *pool; 4625 struct worker_pool *pool;
4626 struct workqueue_struct *wq;
4627 int pi;
3483 4628
3484 switch (action & ~CPU_TASKS_FROZEN) { 4629 switch (action & ~CPU_TASKS_FROZEN) {
3485 case CPU_UP_PREPARE: 4630 case CPU_UP_PREPARE:
3486 for_each_std_worker_pool(pool, cpu) { 4631 for_each_cpu_worker_pool(pool, cpu) {
3487 struct worker *worker;
3488
3489 if (pool->nr_workers) 4632 if (pool->nr_workers)
3490 continue; 4633 continue;
3491 4634 if (create_and_start_worker(pool) < 0)
3492 worker = create_worker(pool);
3493 if (!worker)
3494 return NOTIFY_BAD; 4635 return NOTIFY_BAD;
3495
3496 spin_lock_irq(&pool->lock);
3497 start_worker(worker);
3498 spin_unlock_irq(&pool->lock);
3499 } 4636 }
3500 break; 4637 break;
3501 4638
3502 case CPU_DOWN_FAILED: 4639 case CPU_DOWN_FAILED:
3503 case CPU_ONLINE: 4640 case CPU_ONLINE:
3504 for_each_std_worker_pool(pool, cpu) { 4641 mutex_lock(&wq_pool_mutex);
3505 mutex_lock(&pool->assoc_mutex);
3506 spin_lock_irq(&pool->lock);
3507 4642
3508 pool->flags &= ~POOL_DISASSOCIATED; 4643 for_each_pool(pool, pi) {
3509 rebind_workers(pool); 4644 mutex_lock(&pool->manager_mutex);
4645
4646 if (pool->cpu == cpu) {
4647 spin_lock_irq(&pool->lock);
4648 pool->flags &= ~POOL_DISASSOCIATED;
4649 spin_unlock_irq(&pool->lock);
3510 4650
3511 spin_unlock_irq(&pool->lock); 4651 rebind_workers(pool);
3512 mutex_unlock(&pool->assoc_mutex); 4652 } else if (pool->cpu < 0) {
4653 restore_unbound_workers_cpumask(pool, cpu);
4654 }
4655
4656 mutex_unlock(&pool->manager_mutex);
3513 } 4657 }
4658
4659 /* update NUMA affinity of unbound workqueues */
4660 list_for_each_entry(wq, &workqueues, list)
4661 wq_update_unbound_numa(wq, cpu, true);
4662
4663 mutex_unlock(&wq_pool_mutex);
3514 break; 4664 break;
3515 } 4665 }
3516 return NOTIFY_OK; 4666 return NOTIFY_OK;
@@ -3524,14 +4674,23 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3524 unsigned long action, 4674 unsigned long action,
3525 void *hcpu) 4675 void *hcpu)
3526{ 4676{
3527 unsigned int cpu = (unsigned long)hcpu; 4677 int cpu = (unsigned long)hcpu;
3528 struct work_struct unbind_work; 4678 struct work_struct unbind_work;
4679 struct workqueue_struct *wq;
3529 4680
3530 switch (action & ~CPU_TASKS_FROZEN) { 4681 switch (action & ~CPU_TASKS_FROZEN) {
3531 case CPU_DOWN_PREPARE: 4682 case CPU_DOWN_PREPARE:
3532 /* unbinding should happen on the local CPU */ 4683 /* unbinding per-cpu workers should happen on the local CPU */
3533 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); 4684 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
3534 queue_work_on(cpu, system_highpri_wq, &unbind_work); 4685 queue_work_on(cpu, system_highpri_wq, &unbind_work);
4686
4687 /* update NUMA affinity of unbound workqueues */
4688 mutex_lock(&wq_pool_mutex);
4689 list_for_each_entry(wq, &workqueues, list)
4690 wq_update_unbound_numa(wq, cpu, false);
4691 mutex_unlock(&wq_pool_mutex);
4692
4693 /* wait for per-cpu unbinding to finish */
3535 flush_work(&unbind_work); 4694 flush_work(&unbind_work);
3536 break; 4695 break;
3537 } 4696 }
@@ -3564,7 +4723,7 @@ static void work_for_cpu_fn(struct work_struct *work)
3564 * It is up to the caller to ensure that the cpu doesn't go offline. 4723 * It is up to the caller to ensure that the cpu doesn't go offline.
3565 * The caller must not hold any locks which would prevent @fn from completing. 4724 * The caller must not hold any locks which would prevent @fn from completing.
3566 */ 4725 */
3567long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) 4726long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
3568{ 4727{
3569 struct work_for_cpu wfc = { .fn = fn, .arg = arg }; 4728 struct work_for_cpu wfc = { .fn = fn, .arg = arg };
3570 4729
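work_on_cpu() runs fn(arg) on the target CPU and waits for its return value by queueing a work item there and flushing it. A rough userspace analogue of that contract (not the kernel implementation) that pins a helper thread with glibc's pthread_setaffinity_np() and joins it; build with -pthread:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

struct work_for_cpu {
        long (*fn)(void *);
        void *arg;
        long ret;
        int cpu;
};

static void *work_for_cpu_fn(void *data)
{
        struct work_for_cpu *wfc = data;
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(wfc->cpu, &set);
        pthread_setaffinity_np(pthread_self(), sizeof(set), &set);

        wfc->ret = wfc->fn(wfc->arg);
        return NULL;
}

static long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
        struct work_for_cpu wfc = { .fn = fn, .arg = arg, .cpu = cpu };
        pthread_t t;

        pthread_create(&t, NULL, work_for_cpu_fn, &wfc);
        pthread_join(t, NULL);          /* analogous to flush_work() */
        return wfc.ret;
}

static long where_am_i(void *arg)
{
        (void)arg;
        return sched_getcpu();
}

int main(void)
{
        printf("ran on CPU %ld\n", work_on_cpu(0, where_am_i, NULL));
        return 0;
}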
@@ -3582,44 +4741,40 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3582 * freeze_workqueues_begin - begin freezing workqueues 4741 * freeze_workqueues_begin - begin freezing workqueues
3583 * 4742 *
3584 * Start freezing workqueues. After this function returns, all freezable 4743 * Start freezing workqueues. After this function returns, all freezable
3585 * workqueues will queue new works to their frozen_works list instead of 4744 * workqueues will queue new works to their delayed_works list instead of
3586 * pool->worklist. 4745 * pool->worklist.
3587 * 4746 *
3588 * CONTEXT: 4747 * CONTEXT:
3589 * Grabs and releases workqueue_lock and pool->lock's. 4748 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
3590 */ 4749 */
3591void freeze_workqueues_begin(void) 4750void freeze_workqueues_begin(void)
3592{ 4751{
3593 unsigned int cpu; 4752 struct worker_pool *pool;
4753 struct workqueue_struct *wq;
4754 struct pool_workqueue *pwq;
4755 int pi;
3594 4756
3595 spin_lock(&workqueue_lock); 4757 mutex_lock(&wq_pool_mutex);
3596 4758
3597 BUG_ON(workqueue_freezing); 4759 WARN_ON_ONCE(workqueue_freezing);
3598 workqueue_freezing = true; 4760 workqueue_freezing = true;
3599 4761
3600 for_each_wq_cpu(cpu) { 4762 /* set FREEZING */
3601 struct worker_pool *pool; 4763 for_each_pool(pool, pi) {
3602 struct workqueue_struct *wq; 4764 spin_lock_irq(&pool->lock);
3603 4765 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3604 for_each_std_worker_pool(pool, cpu) { 4766 pool->flags |= POOL_FREEZING;
3605 spin_lock_irq(&pool->lock); 4767 spin_unlock_irq(&pool->lock);
3606 4768 }
3607 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3608 pool->flags |= POOL_FREEZING;
3609
3610 list_for_each_entry(wq, &workqueues, list) {
3611 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3612
3613 if (pwq && pwq->pool == pool &&
3614 (wq->flags & WQ_FREEZABLE))
3615 pwq->max_active = 0;
3616 }
3617 4769
3618 spin_unlock_irq(&pool->lock); 4770 list_for_each_entry(wq, &workqueues, list) {
3619 } 4771 mutex_lock(&wq->mutex);
4772 for_each_pwq(pwq, wq)
4773 pwq_adjust_max_active(pwq);
4774 mutex_unlock(&wq->mutex);
3620 } 4775 }
3621 4776
3622 spin_unlock(&workqueue_lock); 4777 mutex_unlock(&wq_pool_mutex);
3623} 4778}
3624 4779
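freeze_workqueues_begin() and thaw_workqueues() both funnel through pwq_adjust_max_active(): freezing forces a freezable pwq's max_active to 0 so new work parks on the delayed list, and thawing restores the saved value and promotes the parked items. A toy userspace model of that bookkeeping; the struct and helper names are invented, and the real code does this per pool_workqueue under pool->lock:

#include <stdbool.h>
#include <stdio.h>

struct toy_pwq {
        int saved_max_active;   /* what the user asked for */
        int max_active;         /* forced to 0 while frozen */
        int nr_active;          /* items currently runnable */
        int nr_delayed;         /* items parked while frozen/over limit */
        bool frozen;
};

static void toy_adjust_max_active(struct toy_pwq *pwq)
{
        pwq->max_active = pwq->frozen ? 0 : pwq->saved_max_active;

        /* on thaw, promote delayed items up to the restored limit */
        while (!pwq->frozen && pwq->nr_delayed &&
               pwq->nr_active < pwq->max_active) {
                pwq->nr_delayed--;
                pwq->nr_active++;
        }
}

static void toy_queue(struct toy_pwq *pwq)
{
        if (pwq->nr_active < pwq->max_active)
                pwq->nr_active++;
        else
                pwq->nr_delayed++;
}

int main(void)
{
        struct toy_pwq pwq = { .saved_max_active = 2, .max_active = 2 };

        pwq.frozen = true;
        toy_adjust_max_active(&pwq);    /* freeze: max_active -> 0 */
        toy_queue(&pwq);
        toy_queue(&pwq);
        printf("frozen: active=%d delayed=%d\n", pwq.nr_active, pwq.nr_delayed);

        pwq.frozen = false;
        toy_adjust_max_active(&pwq);    /* thaw: restore and repopulate */
        printf("thawed: active=%d delayed=%d\n", pwq.nr_active, pwq.nr_delayed);
        return 0;
}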
3625/** 4780/**
@@ -3629,7 +4784,7 @@ void freeze_workqueues_begin(void)
3629 * between freeze_workqueues_begin() and thaw_workqueues(). 4784 * between freeze_workqueues_begin() and thaw_workqueues().
3630 * 4785 *
3631 * CONTEXT: 4786 * CONTEXT:
3632 * Grabs and releases workqueue_lock. 4787 * Grabs and releases wq_pool_mutex.
3633 * 4788 *
3634 * RETURNS: 4789 * RETURNS:
3635 * %true if some freezable workqueues are still busy. %false if freezing 4790 * %true if some freezable workqueues are still busy. %false if freezing
@@ -3637,34 +4792,34 @@ void freeze_workqueues_begin(void)
3637 */ 4792 */
3638bool freeze_workqueues_busy(void) 4793bool freeze_workqueues_busy(void)
3639{ 4794{
3640 unsigned int cpu;
3641 bool busy = false; 4795 bool busy = false;
4796 struct workqueue_struct *wq;
4797 struct pool_workqueue *pwq;
3642 4798
3643 spin_lock(&workqueue_lock); 4799 mutex_lock(&wq_pool_mutex);
3644 4800
3645 BUG_ON(!workqueue_freezing); 4801 WARN_ON_ONCE(!workqueue_freezing);
3646 4802
3647 for_each_wq_cpu(cpu) { 4803 list_for_each_entry(wq, &workqueues, list) {
3648 struct workqueue_struct *wq; 4804 if (!(wq->flags & WQ_FREEZABLE))
4805 continue;
3649 /* 4806 /*
3650 * nr_active is monotonically decreasing. It's safe 4807 * nr_active is monotonically decreasing. It's safe
3651 * to peek without lock. 4808 * to peek without lock.
3652 */ 4809 */
3653 list_for_each_entry(wq, &workqueues, list) { 4810 rcu_read_lock_sched();
3654 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4811 for_each_pwq(pwq, wq) {
3655 4812 WARN_ON_ONCE(pwq->nr_active < 0);
3656 if (!pwq || !(wq->flags & WQ_FREEZABLE))
3657 continue;
3658
3659 BUG_ON(pwq->nr_active < 0);
3660 if (pwq->nr_active) { 4813 if (pwq->nr_active) {
3661 busy = true; 4814 busy = true;
4815 rcu_read_unlock_sched();
3662 goto out_unlock; 4816 goto out_unlock;
3663 } 4817 }
3664 } 4818 }
4819 rcu_read_unlock_sched();
3665 } 4820 }
3666out_unlock: 4821out_unlock:
3667 spin_unlock(&workqueue_lock); 4822 mutex_unlock(&wq_pool_mutex);
3668 return busy; 4823 return busy;
3669} 4824}
3670 4825
@@ -3675,104 +4830,141 @@ out_unlock:
3675 * frozen works are transferred to their respective pool worklists. 4830 * frozen works are transferred to their respective pool worklists.
3676 * 4831 *
3677 * CONTEXT: 4832 * CONTEXT:
3678 * Grabs and releases workqueue_lock and pool->lock's. 4833 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
3679 */ 4834 */
3680void thaw_workqueues(void) 4835void thaw_workqueues(void)
3681{ 4836{
3682 unsigned int cpu; 4837 struct workqueue_struct *wq;
4838 struct pool_workqueue *pwq;
4839 struct worker_pool *pool;
4840 int pi;
3683 4841
3684 spin_lock(&workqueue_lock); 4842 mutex_lock(&wq_pool_mutex);
3685 4843
3686 if (!workqueue_freezing) 4844 if (!workqueue_freezing)
3687 goto out_unlock; 4845 goto out_unlock;
3688 4846
3689 for_each_wq_cpu(cpu) { 4847 /* clear FREEZING */
3690 struct worker_pool *pool; 4848 for_each_pool(pool, pi) {
3691 struct workqueue_struct *wq; 4849 spin_lock_irq(&pool->lock);
4850 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
4851 pool->flags &= ~POOL_FREEZING;
4852 spin_unlock_irq(&pool->lock);
4853 }
3692 4854
3693 for_each_std_worker_pool(pool, cpu) { 4855 /* restore max_active and repopulate worklist */
3694 spin_lock_irq(&pool->lock); 4856 list_for_each_entry(wq, &workqueues, list) {
4857 mutex_lock(&wq->mutex);
4858 for_each_pwq(pwq, wq)
4859 pwq_adjust_max_active(pwq);
4860 mutex_unlock(&wq->mutex);
4861 }
3695 4862
3696 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); 4863 workqueue_freezing = false;
3697 pool->flags &= ~POOL_FREEZING; 4864out_unlock:
4865 mutex_unlock(&wq_pool_mutex);
4866}
4867#endif /* CONFIG_FREEZER */
3698 4868
3699 list_for_each_entry(wq, &workqueues, list) { 4869static void __init wq_numa_init(void)
3700 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4870{
4871 cpumask_var_t *tbl;
4872 int node, cpu;
3701 4873
3702 if (!pwq || pwq->pool != pool || 4874 /* determine NUMA pwq table len - highest node id + 1 */
3703 !(wq->flags & WQ_FREEZABLE)) 4875 for_each_node(node)
3704 continue; 4876 wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
3705 4877
3706 /* restore max_active and repopulate worklist */ 4878 if (num_possible_nodes() <= 1)
3707 pwq_set_max_active(pwq, wq->saved_max_active); 4879 return;
3708 }
3709 4880
3710 wake_up_worker(pool); 4881 if (wq_disable_numa) {
4882 pr_info("workqueue: NUMA affinity support disabled\n");
4883 return;
4884 }
4885
4886 wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
4887 BUG_ON(!wq_update_unbound_numa_attrs_buf);
3711 4888
3712 spin_unlock_irq(&pool->lock); 4889 /*
4890 * We want masks of possible CPUs of each node which isn't readily
4891 * available. Build one from cpu_to_node() which should have been
4892 * fully initialized by now.
4893 */
4894 tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL);
4895 BUG_ON(!tbl);
4896
4897 for_each_node(node)
4898 BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node));
4899
4900 for_each_possible_cpu(cpu) {
4901 node = cpu_to_node(cpu);
4902 if (WARN_ON(node == NUMA_NO_NODE)) {
4903 pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
4904 /* happens iff arch is bonkers, let's just proceed */
4905 return;
3713 } 4906 }
4907 cpumask_set_cpu(cpu, tbl[node]);
3714 } 4908 }
3715 4909
3716 workqueue_freezing = false; 4910 wq_numa_possible_cpumask = tbl;
3717out_unlock: 4911 wq_numa_enabled = true;
3718 spin_unlock(&workqueue_lock);
3719} 4912}
3720#endif /* CONFIG_FREEZER */
3721 4913
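wq_numa_init() sizes the per-node table from the highest node id plus one and then buckets every possible CPU into its node's mask via cpu_to_node(). A small userspace sketch of that construction, with a made-up cpu_to_node[] mapping and plain bitmasks in place of cpumask_var_t:

#include <stdio.h>

#define NR_CPUS 8

static const int cpu_to_node[NR_CPUS] = { 0, 0, 1, 1, 0, 0, 3, 3 };

int main(void)
{
        int tbl_len = 0;
        int cpu;

        /* table length = highest node id referenced + 1 (nodes may be sparse) */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                if (cpu_to_node[cpu] + 1 > tbl_len)
                        tbl_len = cpu_to_node[cpu] + 1;

        /* per-node CPU masks, one bit per possible CPU */
        unsigned int tbl[tbl_len];
        for (int node = 0; node < tbl_len; node++)
                tbl[node] = 0;
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                tbl[cpu_to_node[cpu]] |= 1u << cpu;

        for (int node = 0; node < tbl_len; node++)
                printf("node %d: cpumask %#x\n", node, tbl[node]);
        return 0;
}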
3722static int __init init_workqueues(void) 4914static int __init init_workqueues(void)
3723{ 4915{
3724 unsigned int cpu; 4916 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
4917 int i, cpu;
3725 4918
3726 /* make sure we have enough bits for OFFQ pool ID */ 4919 /* make sure we have enough bits for OFFQ pool ID */
3727 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < 4920 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
3728 WORK_CPU_END * NR_STD_WORKER_POOLS); 4921 WORK_CPU_END * NR_STD_WORKER_POOLS);
3729 4922
4923 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
4924
4925 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
4926
3730 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); 4927 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3731 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); 4928 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3732 4929
4930 wq_numa_init();
4931
3733 /* initialize CPU pools */ 4932 /* initialize CPU pools */
3734 for_each_wq_cpu(cpu) { 4933 for_each_possible_cpu(cpu) {
3735 struct worker_pool *pool; 4934 struct worker_pool *pool;
3736 4935
3737 for_each_std_worker_pool(pool, cpu) { 4936 i = 0;
3738 spin_lock_init(&pool->lock); 4937 for_each_cpu_worker_pool(pool, cpu) {
4938 BUG_ON(init_worker_pool(pool));
3739 pool->cpu = cpu; 4939 pool->cpu = cpu;
3740 pool->flags |= POOL_DISASSOCIATED; 4940 cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
3741 INIT_LIST_HEAD(&pool->worklist); 4941 pool->attrs->nice = std_nice[i++];
3742 INIT_LIST_HEAD(&pool->idle_list); 4942 pool->node = cpu_to_node(cpu);
3743 hash_init(pool->busy_hash);
3744
3745 init_timer_deferrable(&pool->idle_timer);
3746 pool->idle_timer.function = idle_worker_timeout;
3747 pool->idle_timer.data = (unsigned long)pool;
3748
3749 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3750 (unsigned long)pool);
3751
3752 mutex_init(&pool->assoc_mutex);
3753 ida_init(&pool->worker_ida);
3754 4943
3755 /* alloc pool ID */ 4944 /* alloc pool ID */
4945 mutex_lock(&wq_pool_mutex);
3756 BUG_ON(worker_pool_assign_id(pool)); 4946 BUG_ON(worker_pool_assign_id(pool));
4947 mutex_unlock(&wq_pool_mutex);
3757 } 4948 }
3758 } 4949 }
3759 4950
3760 /* create the initial worker */ 4951 /* create the initial worker */
3761 for_each_online_wq_cpu(cpu) { 4952 for_each_online_cpu(cpu) {
3762 struct worker_pool *pool; 4953 struct worker_pool *pool;
3763 4954
3764 for_each_std_worker_pool(pool, cpu) { 4955 for_each_cpu_worker_pool(pool, cpu) {
3765 struct worker *worker; 4956 pool->flags &= ~POOL_DISASSOCIATED;
4957 BUG_ON(create_and_start_worker(pool) < 0);
4958 }
4959 }
3766 4960
3767 if (cpu != WORK_CPU_UNBOUND) 4961 /* create default unbound wq attrs */
3768 pool->flags &= ~POOL_DISASSOCIATED; 4962 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
4963 struct workqueue_attrs *attrs;
3769 4964
3770 worker = create_worker(pool); 4965 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
3771 BUG_ON(!worker); 4966 attrs->nice = std_nice[i];
3772 spin_lock_irq(&pool->lock); 4967 unbound_std_wq_attrs[i] = attrs;
3773 start_worker(worker);
3774 spin_unlock_irq(&pool->lock);
3775 }
3776 } 4968 }
3777 4969
3778 system_wq = alloc_workqueue("events", 0, 0); 4970 system_wq = alloc_workqueue("events", 0, 0);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 07650264ec15..ad83c96b2ece 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -29,16 +29,24 @@ struct worker {
29 struct work_struct *current_work; /* L: work being processed */ 29 struct work_struct *current_work; /* L: work being processed */
30 work_func_t current_func; /* L: current_work's fn */ 30 work_func_t current_func; /* L: current_work's fn */
31 struct pool_workqueue *current_pwq; /* L: current_work's pwq */ 31 struct pool_workqueue *current_pwq; /* L: current_work's pwq */
32 bool desc_valid; /* ->desc is valid */
32 struct list_head scheduled; /* L: scheduled works */ 33 struct list_head scheduled; /* L: scheduled works */
34
35 /* 64 bytes boundary on 64bit, 32 on 32bit */
36
33 struct task_struct *task; /* I: worker task */ 37 struct task_struct *task; /* I: worker task */
34 struct worker_pool *pool; /* I: the associated pool */ 38 struct worker_pool *pool; /* I: the associated pool */
35 /* 64 bytes boundary on 64bit, 32 on 32bit */ 39 /* L: for rescuers */
40
36 unsigned long last_active; /* L: last active timestamp */ 41 unsigned long last_active; /* L: last active timestamp */
37 unsigned int flags; /* X: flags */ 42 unsigned int flags; /* X: flags */
38 int id; /* I: worker id */ 43 int id; /* I: worker id */
39 44
40 /* for rebinding worker to CPU */ 45 /*
41         struct work_struct      rebind_work;    /* L: for busy worker */        46         * Opaque string set with set_worker_desc(). Printed out with task
47 * dump for debugging - WARN, BUG, panic or sysrq.
48 */
49 char desc[WORKER_DESC_LEN];
42 50
43 /* used only by rescuers to point to the target workqueue */ 51 /* used only by rescuers to point to the target workqueue */
44 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ 52 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
@@ -58,8 +66,7 @@ static inline struct worker *current_wq_worker(void)
58 * Scheduler hooks for concurrency managed workqueue. Only to be used from 66 * Scheduler hooks for concurrency managed workqueue. Only to be used from
59 * sched.c and workqueue.c. 67 * sched.c and workqueue.c.
60 */ 68 */
61void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); 69void wq_worker_waking_up(struct task_struct *task, int cpu);
62struct task_struct *wq_worker_sleeping(struct task_struct *task, 70struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
63 unsigned int cpu);
64 71
65#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ 72#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */