path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/async.c | 76
-rw-r--r--  kernel/audit.c | 51
-rw-r--r--  kernel/audit_tree.c | 29
-rw-r--r--  kernel/audit_watch.c | 25
-rw-r--r--  kernel/cgroup.c | 53
-rw-r--r--  kernel/cpu.c | 2
-rw-r--r--  kernel/cpuset.c | 130
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 11
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 106
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 1
-rw-r--r--  kernel/events/callchain.c | 9
-rw-r--r--  kernel/events/core.c | 79
-rw-r--r--  kernel/events/internal.h | 3
-rw-r--r--  kernel/events/uprobes.c | 622
-rw-r--r--  kernel/exit.c | 8
-rw-r--r--  kernel/fork.c | 52
-rw-r--r--  kernel/futex.c | 17
-rw-r--r--  kernel/irq/handle.c | 7
-rw-r--r--  kernel/irq/irqdomain.c | 370
-rw-r--r--  kernel/irq/manage.c | 38
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kmod.c | 37
-rw-r--r--  kernel/kthread.c | 88
-rw-r--r--  kernel/panic.c | 8
-rw-r--r--  kernel/power/Kconfig | 4
-rw-r--r--  kernel/power/hibernate.c | 50
-rw-r--r--  kernel/power/main.c | 45
-rw-r--r--  kernel/power/power.h | 3
-rw-r--r--  kernel/power/suspend.c | 3
-rw-r--r--  kernel/power/swap.c | 82
-rw-r--r--  kernel/power/user.c | 2
-rw-r--r--  kernel/power/wakelock.c | 7
-rw-r--r--  kernel/printk.c | 276
-rw-r--r--  kernel/rcupdate.c | 44
-rw-r--r--  kernel/rcutiny.c | 4
-rw-r--r--  kernel/rcutiny_plugin.h | 56
-rw-r--r--  kernel/rcutorture.c | 72
-rw-r--r--  kernel/rcutree.c | 478
-rw-r--r--  kernel/rcutree.h | 46
-rw-r--r--  kernel/rcutree_plugin.h | 223
-rw-r--r--  kernel/rcutree_trace.c | 148
-rw-r--r--  kernel/resource.c | 37
-rw-r--r--  kernel/sched/core.c | 133
-rw-r--r--  kernel/sched/cpupri.c | 10
-rw-r--r--  kernel/sched/fair.c | 153
-rw-r--r--  kernel/sched/rt.c | 13
-rw-r--r--  kernel/sched/sched.h | 31
-rw-r--r--  kernel/sched/stop_task.c | 22
-rw-r--r--  kernel/signal.c | 15
-rw-r--r--  kernel/smp.c | 20
-rw-r--r--  kernel/smpboot.h | 2
-rw-r--r--  kernel/softirq.c | 9
-rw-r--r--  kernel/sys.c | 57
-rw-r--r--  kernel/sysctl.c | 69
-rw-r--r--  kernel/sysctl_binary.c | 2
-rw-r--r--  kernel/task_work.c | 95
-rw-r--r--  kernel/taskstats.c | 5
-rw-r--r--  kernel/time.c | 2
-rw-r--r--  kernel/time/Kconfig | 4
-rw-r--r--  kernel/time/alarmtimer.c | 118
-rw-r--r--  kernel/time/jiffies.c | 32
-rw-r--r--  kernel/time/ntp.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 552
-rw-r--r--  kernel/timer.c | 9
-rw-r--r--  kernel/trace/ftrace.c | 8
-rw-r--r--  kernel/trace/ring_buffer.c | 4
-rw-r--r--  kernel/trace/trace.c | 40
-rw-r--r--  kernel/trace/trace.h | 8
-rw-r--r--  kernel/trace/trace_event_perf.c | 2
-rw-r--r--  kernel/trace/trace_functions.c | 36
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_kprobe.c | 6
-rw-r--r--  kernel/trace/trace_output.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 8
-rw-r--r--  kernel/trace/trace_uprobe.c | 2
-rw-r--r--  kernel/workqueue.c | 1208
78 files changed, 3421 insertions(+), 2670 deletions(-)
diff --git a/kernel/async.c b/kernel/async.c
index bd0c168a3bbe..9d3118384858 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -62,8 +62,10 @@ static async_cookie_t next_cookie = 1;
 #define MAX_WORK	32768
 
 static LIST_HEAD(async_pending);
-static LIST_HEAD(async_running);
+static ASYNC_DOMAIN(async_running);
+static LIST_HEAD(async_domains);
 static DEFINE_SPINLOCK(async_lock);
+static DEFINE_MUTEX(async_register_mutex);
 
 struct async_entry {
 	struct list_head	list;
@@ -71,7 +73,7 @@ struct async_entry {
 	async_cookie_t		cookie;
 	async_func_ptr		*func;
 	void			*data;
-	struct list_head	*running;
+	struct async_domain	*running;
 };
 
 static DECLARE_WAIT_QUEUE_HEAD(async_done);
@@ -82,13 +84,12 @@ static atomic_t entry_count;
 /*
  * MUST be called with the lock held!
  */
-static async_cookie_t __lowest_in_progress(struct list_head *running)
+static async_cookie_t __lowest_in_progress(struct async_domain *running)
 {
 	struct async_entry *entry;
 
-	if (!list_empty(running)) {
-		entry = list_first_entry(running,
-					 struct async_entry, list);
+	if (!list_empty(&running->domain)) {
+		entry = list_first_entry(&running->domain, typeof(*entry), list);
 		return entry->cookie;
 	}
 
@@ -99,7 +100,7 @@ static async_cookie_t __lowest_in_progress(struct list_head *running)
 	return next_cookie;	/* "infinity" value */
 }
 
-static async_cookie_t lowest_in_progress(struct list_head *running)
+static async_cookie_t lowest_in_progress(struct async_domain *running)
 {
 	unsigned long flags;
 	async_cookie_t ret;
@@ -119,10 +120,11 @@ static void async_run_entry_fn(struct work_struct *work)
 		container_of(work, struct async_entry, work);
 	unsigned long flags;
 	ktime_t uninitialized_var(calltime), delta, rettime;
+	struct async_domain *running = entry->running;
 
 	/* 1) move self to the running queue */
 	spin_lock_irqsave(&async_lock, flags);
-	list_move_tail(&entry->list, entry->running);
+	list_move_tail(&entry->list, &running->domain);
 	spin_unlock_irqrestore(&async_lock, flags);
 
 	/* 2) run (and print duration) */
@@ -145,6 +147,8 @@ static void async_run_entry_fn(struct work_struct *work)
 	/* 3) remove self from the running queue */
 	spin_lock_irqsave(&async_lock, flags);
 	list_del(&entry->list);
+	if (running->registered && --running->count == 0)
+		list_del_init(&running->node);
 
 	/* 4) free the entry */
 	kfree(entry);
@@ -156,7 +160,7 @@ static void async_run_entry_fn(struct work_struct *work)
 	wake_up(&async_done);
 }
 
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running)
 {
 	struct async_entry *entry;
 	unsigned long flags;
@@ -187,6 +191,8 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
 	spin_lock_irqsave(&async_lock, flags);
 	newcookie = entry->cookie = next_cookie++;
 	list_add_tail(&entry->list, &async_pending);
+	if (running->registered && running->count++ == 0)
+		list_add_tail(&running->node, &async_domains);
 	atomic_inc(&entry_count);
 	spin_unlock_irqrestore(&async_lock, flags);
 
@@ -223,7 +229,7 @@ EXPORT_SYMBOL_GPL(async_schedule);
  * Note: This function may be called from atomic or non-atomic contexts.
  */
 async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
-				     struct list_head *running)
+				     struct async_domain *running)
 {
 	return __async_schedule(ptr, data, running);
 }
@@ -236,22 +242,52 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
  */
 void async_synchronize_full(void)
 {
+	mutex_lock(&async_register_mutex);
 	do {
-		async_synchronize_cookie(next_cookie);
-	} while (!list_empty(&async_running) || !list_empty(&async_pending));
+		struct async_domain *domain = NULL;
+
+		spin_lock_irq(&async_lock);
+		if (!list_empty(&async_domains))
+			domain = list_first_entry(&async_domains, typeof(*domain), node);
+		spin_unlock_irq(&async_lock);
+
+		async_synchronize_cookie_domain(next_cookie, domain);
+	} while (!list_empty(&async_domains));
+	mutex_unlock(&async_register_mutex);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full);
 
 /**
+ * async_unregister_domain - ensure no more anonymous waiters on this domain
+ * @domain: idle domain to flush out of any async_synchronize_full instances
+ *
+ * async_synchronize_{cookie|full}_domain() are not flushed since callers
+ * of these routines should know the lifetime of @domain
+ *
+ * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
+ */
+void async_unregister_domain(struct async_domain *domain)
+{
+	mutex_lock(&async_register_mutex);
+	spin_lock_irq(&async_lock);
+	WARN_ON(!domain->registered || !list_empty(&domain->node) ||
+		!list_empty(&domain->domain));
+	domain->registered = 0;
+	spin_unlock_irq(&async_lock);
+	mutex_unlock(&async_register_mutex);
+}
+EXPORT_SYMBOL_GPL(async_unregister_domain);
+
+/**
  * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @list: running list to synchronize on
+ * @domain: running list to synchronize on
  *
  * This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list have been done.
+ * synchronization domain specified by the running list @domain have been done.
  */
-void async_synchronize_full_domain(struct list_head *list)
+void async_synchronize_full_domain(struct async_domain *domain)
 {
-	async_synchronize_cookie_domain(next_cookie, list);
+	async_synchronize_cookie_domain(next_cookie, domain);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
 
@@ -261,14 +297,16 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
  * @running: running list to synchronize on
  *
  * This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list submitted
+ * synchronization domain specified by running list @running submitted
  * prior to @cookie have been done.
  */
-void async_synchronize_cookie_domain(async_cookie_t cookie,
-				     struct list_head *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running)
 {
 	ktime_t uninitialized_var(starttime), delta, endtime;
 
+	if (!running)
+		return;
+
 	if (initcall_debug && system_state == SYSTEM_BOOTING) {
 		printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
 		starttime = ktime_get();
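
The hunks above convert the async synchronization domains from bare list_heads into struct async_domain, and add registration so async_synchronize_full() can find anonymous domains. A minimal caller-side sketch of the reworked API is below; the probe function, device array, and domain name are hypothetical placeholders, not part of this patch.

	#include <linux/async.h>

	/* A registered domain: async_synchronize_full() will also flush it. */
	static ASYNC_DOMAIN(my_probe_domain);

	static void my_probe_one(void *data, async_cookie_t cookie)
	{
		/* slow, independent per-device initialization */
	}

	static void my_probe_all(void **devs, int n)
	{
		int i;

		for (i = 0; i < n; i++)
			async_schedule_domain(my_probe_one, devs[i], &my_probe_domain);

		/* wait only for work scheduled in this domain */
		async_synchronize_full_domain(&my_probe_domain);
	}

	static void my_probe_exit(void)
	{
		/* stop async_synchronize_full() from looking at a dying domain */
		async_unregister_domain(&my_probe_domain);
	}

Declaring the domain with ASYNC_DOMAIN_EXCLUSIVE() instead would keep it out of the global async_domains list entirely, which the new kernel-doc above recommends over flushing.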
diff --git a/kernel/audit.c b/kernel/audit.c
index 1c7f2c61416b..ea3b7b6191c7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -384,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb)
 static void audit_printk_skb(struct sk_buff *skb)
 {
 	struct nlmsghdr *nlh = nlmsg_hdr(skb);
-	char *data = NLMSG_DATA(nlh);
+	char *data = nlmsg_data(nlh);
 
 	if (nlh->nlmsg_type != AUDIT_EOE) {
 		if (printk_ratelimit())
@@ -516,14 +516,15 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
 	if (!skb)
 		return NULL;
 
-	nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
-	data = NLMSG_DATA(nlh);
+	nlh = nlmsg_put(skb, pid, seq, t, size, flags);
+	if (!nlh)
+		goto out_kfree_skb;
+	data = nlmsg_data(nlh);
 	memcpy(data, payload, size);
 	return skb;
 
-nlmsg_failure:			/* Used by NLMSG_NEW */
-	if (skb)
-		kfree_skb(skb);
+out_kfree_skb:
+	kfree_skb(skb);
 	return NULL;
 }
 
@@ -680,7 +681,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	sessionid = audit_get_sessionid(current);
 	security_task_getsecid(current, &sid);
 	seq  = nlh->nlmsg_seq;
-	data = NLMSG_DATA(nlh);
+	data = nlmsg_data(nlh);
 
 	switch (msg_type) {
 	case AUDIT_GET:
@@ -961,14 +962,17 @@ static void audit_receive(struct sk_buff *skb)
 static int __init audit_init(void)
 {
 	int i;
+	struct netlink_kernel_cfg cfg = {
+		.input	= audit_receive,
+	};
 
 	if (audit_initialized == AUDIT_DISABLED)
 		return 0;
 
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
-	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
-					   audit_receive, NULL, THIS_MODULE);
+	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT,
+					   THIS_MODULE, &cfg);
 	if (!audit_sock)
 		audit_panic("cannot initialize netlink socket");
 	else
@@ -1060,13 +1064,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
 
 	ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
 	if (!ab->skb)
-		goto nlmsg_failure;
+		goto err;
 
-	nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
+	nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
+	if (!nlh)
+		goto out_kfree_skb;
 
 	return ab;
 
-nlmsg_failure:			/* Used by NLMSG_NEW */
+out_kfree_skb:
 	kfree_skb(ab->skb);
 	ab->skb = NULL;
 err:
@@ -1450,6 +1456,27 @@ void audit_log_key(struct audit_buffer *ab, char *key)
 }
 
 /**
+ * audit_log_link_denied - report a link restriction denial
+ * @operation: specific link opreation
+ * @link: the path that triggered the restriction
+ */
+void audit_log_link_denied(const char *operation, struct path *link)
+{
+	struct audit_buffer *ab;
+
+	ab = audit_log_start(current->audit_context, GFP_KERNEL,
+			     AUDIT_ANOM_LINK);
+	audit_log_format(ab, "op=%s action=denied", operation);
+	audit_log_format(ab, " pid=%d comm=", current->pid);
+	audit_log_untrustedstring(ab, current->comm);
+	audit_log_d_path(ab, " path=", link);
+	audit_log_format(ab, " dev=");
+	audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id);
+	audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino);
+	audit_log_end(ab);
+}
+
+/**
  * audit_log_end - end one audit record
  * @ab: the audit_buffer
  *
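
The audit hunks above replace the old NLMSG_NEW()/NLMSG_DATA() macros (which relied on a hidden nlmsg_failure goto label) with explicit nlmsg_put()/nlmsg_data() error handling. A small, self-contained sketch of that pattern follows; the helper name and payload are illustrative only, not part of the patch.

	#include <net/netlink.h>

	/* Illustrative reply builder using the nlmsg_put() pattern. */
	static struct sk_buff *build_reply(int pid, int seq, int type,
					   const void *payload, size_t size)
	{
		struct sk_buff *skb;
		struct nlmsghdr *nlh;

		skb = nlmsg_new(size, GFP_KERNEL);
		if (!skb)
			return NULL;

		/* nlmsg_put() returns NULL on overrun instead of jumping to a
		 * hidden nlmsg_failure label the way NLMSG_NEW() did. */
		nlh = nlmsg_put(skb, pid, seq, type, size, 0);
		if (!nlh) {
			kfree_skb(skb);
			return NULL;
		}
		memcpy(nlmsg_data(nlh), payload, size);
		return skb;
	}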
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 5bf0790497e7..ed206fd88cca 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -250,7 +250,6 @@ static void untag_chunk(struct node *p)
 		spin_unlock(&hash_lock);
 		spin_unlock(&entry->lock);
 		fsnotify_destroy_mark(entry);
-		fsnotify_put_mark(entry);
 		goto out;
 	}
 
@@ -259,7 +258,7 @@ static void untag_chunk(struct node *p)
 
 	fsnotify_duplicate_mark(&new->mark, entry);
 	if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
-		free_chunk(new);
+		fsnotify_put_mark(&new->mark);
 		goto Fallback;
 	}
 
@@ -293,7 +292,7 @@ static void untag_chunk(struct node *p)
 	spin_unlock(&hash_lock);
 	spin_unlock(&entry->lock);
 	fsnotify_destroy_mark(entry);
-	fsnotify_put_mark(entry);
+	fsnotify_put_mark(&new->mark);	/* drop initial reference */
 	goto out;
 
 Fallback:
@@ -322,7 +321,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 
 	entry = &chunk->mark;
 	if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
-		free_chunk(chunk);
+		fsnotify_put_mark(entry);
 		return -ENOSPC;
 	}
 
@@ -347,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
 	insert_hash(chunk);
 	spin_unlock(&hash_lock);
 	spin_unlock(&entry->lock);
+	fsnotify_put_mark(entry);	/* drop initial reference */
 	return 0;
 }
 
@@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	fsnotify_duplicate_mark(chunk_entry, old_entry);
 	if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
 		spin_unlock(&old_entry->lock);
-		free_chunk(chunk);
+		fsnotify_put_mark(chunk_entry);
 		fsnotify_put_mark(old_entry);
 		return -ENOSPC;
 	}
@@ -444,8 +444,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	spin_unlock(&chunk_entry->lock);
 	spin_unlock(&old_entry->lock);
 	fsnotify_destroy_mark(old_entry);
+	fsnotify_put_mark(chunk_entry);	/* drop initial reference */
 	fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
-	fsnotify_put_mark(old_entry); /* and kill it */
 	return 0;
 }
 
@@ -595,7 +595,7 @@ void audit_trim_trees(void)
 
 		root_mnt = collect_mounts(&path);
 		path_put(&path);
-		if (!root_mnt)
+		if (IS_ERR(root_mnt))
 			goto skip_it;
 
 		spin_lock(&hash_lock);
@@ -669,8 +669,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
 		goto Err;
 	mnt = collect_mounts(&path);
 	path_put(&path);
-	if (!mnt) {
-		err = -ENOMEM;
+	if (IS_ERR(mnt)) {
+		err = PTR_ERR(mnt);
 		goto Err;
 	}
 
@@ -719,8 +719,8 @@ int audit_tag_tree(char *old, char *new)
 		return err;
 	tagged = collect_mounts(&path2);
 	path_put(&path2);
-	if (!tagged)
-		return -ENOMEM;
+	if (IS_ERR(tagged))
+		return PTR_ERR(tagged);
 
 	err = kern_path(old, 0, &path1);
 	if (err) {
@@ -916,7 +916,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
 	struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
 
 	evict_chunk(chunk);
-	fsnotify_put_mark(entry);
+
+	/*
+	 * We are guaranteed to have at least one reference to the mark from
+	 * either the inode or the caller of fsnotify_destroy_mark().
+	 */
+	BUG_ON(atomic_read(&entry->refcnt) < 1);
 }
 
 static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e683869365d9..3823281401b5 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -355,34 +355,15 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 /* Get path information necessary for adding watches. */
 static int audit_get_nd(struct audit_watch *watch, struct path *parent)
 {
-	struct nameidata nd;
-	struct dentry *d;
-	int err;
-
-	err = kern_path_parent(watch->path, &nd);
-	if (err)
-		return err;
-
-	if (nd.last_type != LAST_NORM) {
-		path_put(&nd.path);
-		return -EINVAL;
-	}
-
-	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-	d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
-	if (IS_ERR(d)) {
-		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-		path_put(&nd.path);
+	struct dentry *d = kern_path_locked(watch->path, parent);
+	if (IS_ERR(d))
 		return PTR_ERR(d);
-	}
+	mutex_unlock(&parent->dentry->d_inode->i_mutex);
 	if (d->d_inode) {
 		/* update watch filter fields */
 		watch->dev = d->d_inode->i_sb->s_dev;
 		watch->ino = d->d_inode->i_ino;
 	}
-	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-
-	*parent = nd.path;
 	dput(d);
 	return 0;
 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b303dfc7dce0..79818507e444 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -822,7 +822,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
  */
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
-static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
 static const struct inode_operations cgroup_dir_inode_operations;
@@ -954,7 +954,7 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 
 		dget(d);
 		d_delete(d);
-		simple_unlink(d->d_inode, d);
+		simple_unlink(cgrp->dentry->d_inode, d);
 		list_del_init(&cfe->node);
 		dput(d);
 
@@ -1068,28 +1068,24 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(cgrp->subsys[i]);
 			BUG_ON(!dummytop->subsys[i]);
 			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
-			mutex_lock(&ss->hierarchy_mutex);
 			cgrp->subsys[i] = dummytop->subsys[i];
 			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
 				ss->bind(cgrp);
-			mutex_unlock(&ss->hierarchy_mutex);
 			/* refcount was already taken, and we're keeping it */
 		} else if (bit & removed_bits) {
 			/* We're removing this subsystem */
 			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
-			mutex_lock(&ss->hierarchy_mutex);
 			if (ss->bind)
 				ss->bind(dummytop);
 			dummytop->subsys[i]->cgroup = dummytop;
 			cgrp->subsys[i] = NULL;
 			subsys[i]->root = &rootnode;
 			list_move(&ss->sibling, &rootnode.subsys_list);
-			mutex_unlock(&ss->hierarchy_mutex);
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
 		} else if (bit & final_bits) {
@@ -1587,7 +1583,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	opts.new_root = new_root;
 
 	/* Locate an existing or new sb for this hierarchy */
-	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
+	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
 	if (IS_ERR(sb)) {
 		ret = PTR_ERR(sb);
 		cgroup_drop_root(opts.new_root);
@@ -2570,7 +2566,7 @@ static const struct inode_operations cgroup_dir_inode_operations = {
 	.rename = cgroup_rename,
 };
 
-static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 {
 	if (dentry->d_name.len > NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -3915,37 +3911,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
 }
 
-static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
-{
-	/* We need to take each hierarchy_mutex in a consistent order */
-	int i;
-
-	/*
-	 * No worry about a race with rebind_subsystems that might mess up the
-	 * locking order, since both parties are under cgroup_mutex.
-	 */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
-		if (ss == NULL)
-			continue;
-		if (ss->root == root)
-			mutex_lock(&ss->hierarchy_mutex);
-	}
-}
-
-static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
-{
-	int i;
-
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
-		if (ss == NULL)
-			continue;
-		if (ss->root == root)
-			mutex_unlock(&ss->hierarchy_mutex);
-	}
-}
-
 /*
  * cgroup_create - create a cgroup
  * @parent: cgroup that will be parent of the new cgroup
@@ -4006,9 +3971,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			ss->post_clone(cgrp);
 	}
 
-	cgroup_lock_hierarchy(root);
 	list_add(&cgrp->sibling, &cgrp->parent->children);
-	cgroup_unlock_hierarchy(root);
 	root->number_of_cgroups++;
 
 	err = cgroup_create_dir(cgrp, dentry, mode);
@@ -4035,9 +3998,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
  err_remove:
 
-	cgroup_lock_hierarchy(root);
 	list_del(&cgrp->sibling);
-	cgroup_unlock_hierarchy(root);
 	root->number_of_cgroups--;
 
  err_destroy:
@@ -4245,10 +4206,8 @@ again:
 	list_del_init(&cgrp->release_list);
 	raw_spin_unlock(&release_list_lock);
 
-	cgroup_lock_hierarchy(cgrp->root);
 	/* delete this cgroup from parent->children */
 	list_del_init(&cgrp->sibling);
-	cgroup_unlock_hierarchy(cgrp->root);
 
 	list_del_init(&cgrp->allcg_node);
 
@@ -4322,8 +4281,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	 * need to invoke fork callbacks here. */
 	BUG_ON(!list_empty(&init_task.tasks));
 
-	mutex_init(&ss->hierarchy_mutex);
-	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
 	ss->active = 1;
 
 	/* this function shouldn't be used with modular subsystems, since they
@@ -4450,8 +4407,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	}
 	write_unlock(&css_set_lock);
 
-	mutex_init(&ss->hierarchy_mutex);
-	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
 	ss->active = 1;
 
 	/* success! */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a4eb5227a19e..14d32588cccd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu)
 
 	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
 		mutex_lock(&zonelists_mutex);
-		build_all_zonelists(NULL);
+		build_all_zonelists(NULL, NULL);
 		mutex_unlock(&zonelists_mutex);
 	}
 #endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c8bd652dd12..f33c7153b6d7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -147,6 +147,12 @@ typedef enum {
 	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
+/* the type of hotplug event */
+enum hotplug_event {
+	CPUSET_CPU_OFFLINE,
+	CPUSET_MEM_OFFLINE,
+};
+
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 }
 
 /*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
+ * Helper function to traverse cpusets.
+ * It can be used to walk the cpuset tree from top to bottom, completing
+ * one layer before dropping down to the next (thus always processing a
+ * node before any of its children).
+ */
+static struct cpuset *cpuset_next(struct list_head *queue)
+{
+	struct cpuset *cp;
+	struct cpuset *child;	/* scans child cpusets of cp */
+	struct cgroup *cont;
+
+	if (list_empty(queue))
+		return NULL;
+
+	cp = list_first_entry(queue, struct cpuset, stack_list);
+	list_del(queue->next);
+	list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+		child = cgroup_cs(cont);
+		list_add_tail(&child->stack_list, queue);
+	}
+
+	return cp;
+}
+
+
+/*
+ * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
+ * online/offline) and update the cpusets accordingly.
+ * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
+ * cpuset must be moved to a parent cpuset.
  *
  * Called with cgroup_mutex held.  We take callback_mutex to modify
  * cpus_allowed and mems_allowed.
@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 * before dropping down to the next.  It always processes a node before
 * any of its children.
 *
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'.  But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
+ * if all present pages from a node are offlined.
 */
-static void scan_for_empty_cpusets(struct cpuset *root)
+static void
+scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
 {
 	LIST_HEAD(queue);
 	struct cpuset *cp;		/* scans cpusets being updated */
-	struct cpuset *child;		/* scans child cpusets of cp */
-	struct cgroup *cont;
 	static nodemask_t oldmems;	/* protected by cgroup_mutex */
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
-	while (!list_empty(&queue)) {
-		cp = list_first_entry(&queue, struct cpuset, stack_list);
-		list_del(queue.next);
-		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
-			child = cgroup_cs(cont);
-			list_add_tail(&child->stack_list, &queue);
+	switch (event) {
+	case CPUSET_CPU_OFFLINE:
+		while ((cp = cpuset_next(&queue)) != NULL) {
+
+			/* Continue past cpusets with all cpus online */
+			if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
+				continue;
+
+			/* Remove offline cpus from this cpuset. */
+			mutex_lock(&callback_mutex);
+			cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+							cpu_active_mask);
+			mutex_unlock(&callback_mutex);
+
+			/* Move tasks from the empty cpuset to a parent */
+			if (cpumask_empty(cp->cpus_allowed))
+				remove_tasks_in_empty_cpuset(cp);
+			else
+				update_tasks_cpumask(cp, NULL);
 		}
+		break;
 
-		/* Continue past cpusets with all cpus, mems online */
-		if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
-		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
-			continue;
+	case CPUSET_MEM_OFFLINE:
+		while ((cp = cpuset_next(&queue)) != NULL) {
 
-		oldmems = cp->mems_allowed;
+			/* Continue past cpusets with all mems online */
+			if (nodes_subset(cp->mems_allowed,
+					node_states[N_HIGH_MEMORY]))
+				continue;
 
-		/* Remove offline cpus and mems from this cpuset. */
-		mutex_lock(&callback_mutex);
-		cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-			    cpu_active_mask);
-		nodes_and(cp->mems_allowed, cp->mems_allowed,
-			  node_states[N_HIGH_MEMORY]);
-		mutex_unlock(&callback_mutex);
+			oldmems = cp->mems_allowed;
+
+			/* Remove offline mems from this cpuset. */
+			mutex_lock(&callback_mutex);
+			nodes_and(cp->mems_allowed, cp->mems_allowed,
+						node_states[N_HIGH_MEMORY]);
+			mutex_unlock(&callback_mutex);
 
-		/* Move tasks from the empty cpuset to a parent */
-		if (cpumask_empty(cp->cpus_allowed) ||
-		    nodes_empty(cp->mems_allowed))
-			remove_tasks_in_empty_cpuset(cp);
-		else {
-			update_tasks_cpumask(cp, NULL);
-			update_tasks_nodemask(cp, &oldmems, NULL);
+			/* Move tasks from the empty cpuset to a parent */
+			if (nodes_empty(cp->mems_allowed))
+				remove_tasks_in_empty_cpuset(cp);
+			else
+				update_tasks_nodemask(cp, &oldmems, NULL);
 		}
 	}
 }
@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 * (of no affect) on systems that are actively using CPU hotplug
 * but making no active use of cpusets.
 *
+ * The only exception to this is suspend/resume, where we don't
+ * modify cpusets at all.
+ *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_active_mask on each CPU hotplug (cpuhp) event.
 *
 * Called within get_online_cpus().  Needs to call cgroup_lock()
 * before calling generate_sched_domains().
+ *
+ * @cpu_online: Indicates whether this is a CPU online event (true) or
+ * a CPU offline event (false).
 */
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
 	mutex_lock(&callback_mutex);
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	mutex_unlock(&callback_mutex);
-	scan_for_empty_cpusets(&top_cpuset);
+
+	if (!cpu_online)
+		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
+
 	ndoms = generate_sched_domains(&doms, &attr);
 	cgroup_unlock();
 
@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
 /*
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
 */
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 	case MEM_OFFLINE:
 		/*
 		 * needn't update top_cpuset.mems_allowed explicitly because
-		 * scan_for_empty_cpusets() will update it.
+		 * scan_cpusets_upon_hotplug() will update it.
 		 */
-		scan_for_empty_cpusets(&top_cpuset);
+		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
 		break;
 	default:
 		break;
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8b68ce78ff17..be7b33b73d30 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -12,6 +12,7 @@
 #include <linux/kdb.h>
 #include <linux/kdebug.h>
 #include <linux/export.h>
+#include <linux/hardirq.h>
 #include "kdb_private.h"
 #include "../debug_core.h"
 
@@ -52,6 +53,9 @@ int kdb_stub(struct kgdb_state *ks)
 	if (atomic_read(&kgdb_setting_breakpoint))
 		reason = KDB_REASON_KEYBOARD;
 
+	if (in_nmi())
+		reason = KDB_REASON_NMI;
+
 	for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
 		if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
 			reason = KDB_REASON_BREAK;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index bb9520f0f6ff..0a69d2adc4f3 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -715,9 +715,6 @@ kdb_printit:
 	/* check for having reached the LINES number of printed lines */
 	if (kdb_nextline == linecount) {
 		char buf1[16] = "";
-#if defined(CONFIG_SMP)
-		char buf2[32];
-#endif
 
 		/* Watch out for recursion here.  Any routine that calls
 		 * kdb_printf will come back through here.  And kdb_read
@@ -732,14 +729,6 @@ kdb_printit:
 		if (moreprompt == NULL)
 			moreprompt = "more> ";
 
-#if defined(CONFIG_SMP)
-		if (strchr(moreprompt, '%')) {
-			sprintf(buf2, moreprompt, get_cpu());
-			put_cpu();
-			moreprompt = buf2;
-		}
-#endif
-
 		kdb_input_flush();
 		c = console_drivers;
 
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 67b847dfa2bb..31df1706b9a9 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -14,6 +14,7 @@
 #include <linux/ctype.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/kmsg_dump.h>
 #include <linux/reboot.h>
 #include <linux/sched.h>
 #include <linux/sysrq.h>
@@ -138,11 +139,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
 static char *__env[] = {
 #if defined(CONFIG_SMP)
 "PROMPT=[%d]kdb> ",
- "MOREPROMPT=[%d]more> ",
 #else
 "PROMPT=kdb> ",
- "MOREPROMPT=more> ",
 #endif
+ "MOREPROMPT=more> ",
 "RADIX=16",
 "MDCOUNT=8",			/* lines of md output */
 KDB_PLATFORM_ENV,
@@ -1235,18 +1235,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
 		*cmdbuf = '\0';
 		*(cmd_hist[cmd_head]) = '\0';
 
-		if (KDB_FLAG(ONLY_DO_DUMP)) {
-			/* kdb is off but a catastrophic error requires a dump.
-			 * Take the dump and reboot.
-			 * Turn on logging so the kdb output appears in the log
-			 * buffer in the dump.
-			 */
-			const char *setargs[] = { "set", "LOGGING", "1" };
-			kdb_set(2, setargs);
-			kdb_reboot(0, NULL);
-			/*NOTREACHED*/
-		}
-
 do_full_getstr:
 #if defined(CONFIG_SMP)
 		snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
@@ -2040,8 +2028,15 @@ static int kdb_env(int argc, const char **argv)
 */
 static int kdb_dmesg(int argc, const char **argv)
 {
-	char *syslog_data[4], *start, *end, c = '\0', *p;
-	int diag, logging, logsize, lines = 0, adjust = 0, n;
+	int diag;
+	int logging;
+	int lines = 0;
+	int adjust = 0;
+	int n = 0;
+	int skip = 0;
+	struct kmsg_dumper dumper = { .active = 1 };
+	size_t len;
+	char buf[201];
 
 	if (argc > 2)
 		return KDB_ARGCOUNT;
@@ -2064,22 +2059,10 @@ static int kdb_dmesg(int argc, const char **argv)
 		kdb_set(2, setargs);
 	}
 
-	/* syslog_data[0,1] physical start, end+1.  syslog_data[2,3]
-	 * logical start, end+1. */
-	kdb_syslog_data(syslog_data);
-	if (syslog_data[2] == syslog_data[3])
-		return 0;
-	logsize = syslog_data[1] - syslog_data[0];
-	start = syslog_data[2];
-	end = syslog_data[3];
-#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
-	for (n = 0, p = start; p < end; ++p) {
-		c = *KDB_WRAP(p);
-		if (c == '\n')
-			++n;
-	}
-	if (c != '\n')
-		++n;
+	kmsg_dump_rewind_nolock(&dumper);
+	while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
+		n++;
+
 	if (lines < 0) {
 		if (adjust >= n)
 			kdb_printf("buffer only contains %d lines, nothing "
@@ -2087,21 +2070,11 @@ static int kdb_dmesg(int argc, const char **argv)
 		else if (adjust - lines >= n)
 			kdb_printf("buffer only contains %d lines, last %d "
 				   "lines printed\n", n, n - adjust);
-		if (adjust) {
-			for (; start < end && adjust; ++start) {
-				if (*KDB_WRAP(start) == '\n')
-					--adjust;
-			}
-			if (start < end)
-				++start;
-		}
-		for (p = start; p < end && lines; ++p) {
-			if (*KDB_WRAP(p) == '\n')
-				++lines;
-		}
-		end = p;
+		skip = adjust;
+		lines = abs(lines);
 	} else if (lines > 0) {
-		int skip = n - (adjust + lines);
+		skip = n - lines - adjust;
+		lines = abs(lines);
 		if (adjust >= n) {
 			kdb_printf("buffer only contains %d lines, "
 				   "nothing printed\n", n);
@@ -2112,35 +2085,24 @@ static int kdb_dmesg(int argc, const char **argv)
 			kdb_printf("buffer only contains %d lines, first "
 				   "%d lines printed\n", n, lines);
 		}
-		for (; start < end && skip; ++start) {
-			if (*KDB_WRAP(start) == '\n')
-				--skip;
-		}
-		for (p = start; p < end && lines; ++p) {
-			if (*KDB_WRAP(p) == '\n')
-				--lines;
-		}
-		end = p;
+	} else {
+		lines = n;
 	}
-	/* Do a line at a time (max 200 chars) to reduce protocol overhead */
-	c = '\n';
-	while (start != end) {
-		char buf[201];
-		p = buf;
-		if (KDB_FLAG(CMD_INTERRUPT))
-			return 0;
-		while (start < end && (c = *KDB_WRAP(start)) &&
-		       (p - buf) < sizeof(buf)-1) {
-			++start;
-			*p++ = c;
-			if (c == '\n')
-				break;
+
+	if (skip >= n || skip < 0)
+		return 0;
+
+	kmsg_dump_rewind_nolock(&dumper);
+	while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
+		if (skip) {
+			skip--;
+			continue;
 		}
-		*p = '\0';
-		kdb_printf("%s", buf);
+		if (!lines--)
+			break;
+
+		kdb_printf("%.*s\n", (int)len - 1, buf);
 	}
-	if (c != '\n')
-		kdb_printf("\n");
 
 	return 0;
 }
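
The kdb_dmesg() rework above drops the private kdb_syslog_data() ring-buffer walk in favour of the generic kmsg_dumper iterator. A hedged, self-contained sketch of that two-pass pattern is below; dump_last_lines() and emit() are hypothetical stand-ins for kdb_dmesg()/kdb_printf(), not part of the patch.

	#include <linux/kmsg_dump.h>

	/* Sketch: count the lines in the printk buffer, then walk it again
	 * and emit only the last 'wanted' lines. */
	static void dump_last_lines(int wanted, void (*emit)(const char *, int))
	{
		struct kmsg_dumper dumper = { .active = 1 };
		char buf[201];
		size_t len;
		int n = 0, skip;

		/* Pass 1: how many lines are in the buffer? */
		kmsg_dump_rewind_nolock(&dumper);
		while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL))
			n++;

		skip = (wanted < n) ? n - wanted : 0;

		/* Pass 2: skip the older lines, print the rest. */
		kmsg_dump_rewind_nolock(&dumper);
		while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) {
			if (skip) {
				skip--;
				continue;
			}
			emit(buf, (int)len);
		}
	}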
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 47c4e56e513b..392ec6a25844 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -205,7 +205,6 @@ extern char kdb_grep_string[];
 extern int kdb_grep_leading;
 extern int kdb_grep_trailing;
 extern char *kdb_cmds[];
-extern void kdb_syslog_data(char *syslog_data[]);
 extern unsigned long kdb_task_state_string(const char *);
 extern char kdb_task_state_char (const struct task_struct *);
 extern unsigned long kdb_task_state(const struct task_struct *p,
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6581a040f399..98d4597f43d6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -153,7 +153,8 @@ put_callchain_entry(int rctx)
 	put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
 }
 
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
 	int rctx;
 	struct perf_callchain_entry *entry;
@@ -178,6 +179,12 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	}
 
 	if (regs) {
+		/*
+		 * Disallow cross-task user callchains.
+		 */
+		if (event->ctx->task && event->ctx->task != current)
+			goto exit_put;
+
 		perf_callchain_store(entry, PERF_CONTEXT_USER);
 		perf_callchain_user(entry, regs);
 	}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d7d71d6ec972..b7935fcec7d9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1645,6 +1645,8 @@ perf_install_in_context(struct perf_event_context *ctx,
1645 lockdep_assert_held(&ctx->mutex); 1645 lockdep_assert_held(&ctx->mutex);
1646 1646
1647 event->ctx = ctx; 1647 event->ctx = ctx;
1648 if (event->cpu != -1)
1649 event->cpu = cpu;
1648 1650
1649 if (!task) { 1651 if (!task) {
1650 /* 1652 /*
@@ -4037,7 +4039,7 @@ void perf_prepare_sample(struct perf_event_header *header,
4037 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 4039 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4038 int size = 1; 4040 int size = 1;
4039 4041
4040 data->callchain = perf_callchain(regs); 4042 data->callchain = perf_callchain(event, regs);
4041 4043
4042 if (data->callchain) 4044 if (data->callchain)
4043 size += data->callchain->nr; 4045 size += data->callchain->nr;
@@ -5207,7 +5209,8 @@ static int perf_tp_event_match(struct perf_event *event,
5207} 5209}
5208 5210
5209void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, 5211void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5210 struct pt_regs *regs, struct hlist_head *head, int rctx) 5212 struct pt_regs *regs, struct hlist_head *head, int rctx,
5213 struct task_struct *task)
5211{ 5214{
5212 struct perf_sample_data data; 5215 struct perf_sample_data data;
5213 struct perf_event *event; 5216 struct perf_event *event;
@@ -5226,6 +5229,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5226 perf_swevent_event(event, count, &data, regs); 5229 perf_swevent_event(event, count, &data, regs);
5227 } 5230 }
5228 5231
5232 /*
5233 * If we got specified a target task, also iterate its context and
5234 * deliver this event there too.
5235 */
5236 if (task && task != current) {
5237 struct perf_event_context *ctx;
5238 struct trace_entry *entry = record;
5239
5240 rcu_read_lock();
5241 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
5242 if (!ctx)
5243 goto unlock;
5244
5245 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5246 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5247 continue;
5248 if (event->attr.config != entry->type)
5249 continue;
5250 if (perf_tp_event_match(event, &data, regs))
5251 perf_swevent_event(event, count, &data, regs);
5252 }
5253unlock:
5254 rcu_read_unlock();
5255 }
5256
5229 perf_swevent_put_recursion_context(rctx); 5257 perf_swevent_put_recursion_context(rctx);
5230} 5258}
5231EXPORT_SYMBOL_GPL(perf_tp_event); 5259EXPORT_SYMBOL_GPL(perf_tp_event);
@@ -6252,6 +6280,8 @@ SYSCALL_DEFINE5(perf_event_open,
6252 } 6280 }
6253 } 6281 }
6254 6282
6283 get_online_cpus();
6284
6255 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, 6285 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6256 NULL, NULL); 6286 NULL, NULL);
6257 if (IS_ERR(event)) { 6287 if (IS_ERR(event)) {
@@ -6304,7 +6334,7 @@ SYSCALL_DEFINE5(perf_event_open,
6304 /* 6334 /*
6305 * Get the target context (task or percpu): 6335 * Get the target context (task or percpu):
6306 */ 6336 */
6307 ctx = find_get_context(pmu, task, cpu); 6337 ctx = find_get_context(pmu, task, event->cpu);
6308 if (IS_ERR(ctx)) { 6338 if (IS_ERR(ctx)) {
6309 err = PTR_ERR(ctx); 6339 err = PTR_ERR(ctx);
6310 goto err_alloc; 6340 goto err_alloc;
@@ -6377,20 +6407,23 @@ SYSCALL_DEFINE5(perf_event_open,
6377 mutex_lock(&ctx->mutex); 6407 mutex_lock(&ctx->mutex);
6378 6408
6379 if (move_group) { 6409 if (move_group) {
6380 perf_install_in_context(ctx, group_leader, cpu); 6410 synchronize_rcu();
6411 perf_install_in_context(ctx, group_leader, event->cpu);
6381 get_ctx(ctx); 6412 get_ctx(ctx);
6382 list_for_each_entry(sibling, &group_leader->sibling_list, 6413 list_for_each_entry(sibling, &group_leader->sibling_list,
6383 group_entry) { 6414 group_entry) {
6384 perf_install_in_context(ctx, sibling, cpu); 6415 perf_install_in_context(ctx, sibling, event->cpu);
6385 get_ctx(ctx); 6416 get_ctx(ctx);
6386 } 6417 }
6387 } 6418 }
6388 6419
6389 perf_install_in_context(ctx, event, cpu); 6420 perf_install_in_context(ctx, event, event->cpu);
6390 ++ctx->generation; 6421 ++ctx->generation;
6391 perf_unpin_context(ctx); 6422 perf_unpin_context(ctx);
6392 mutex_unlock(&ctx->mutex); 6423 mutex_unlock(&ctx->mutex);
6393 6424
6425 put_online_cpus();
6426
6394 event->owner = current; 6427 event->owner = current;
6395 6428
6396 mutex_lock(&current->perf_event_mutex); 6429 mutex_lock(&current->perf_event_mutex);
@@ -6419,6 +6452,7 @@ err_context:
6419err_alloc: 6452err_alloc:
6420 free_event(event); 6453 free_event(event);
6421err_task: 6454err_task:
6455 put_online_cpus();
6422 if (task) 6456 if (task)
6423 put_task_struct(task); 6457 put_task_struct(task);
6424err_group_fd: 6458err_group_fd:
@@ -6479,6 +6513,39 @@ err:
6479} 6513}
6480EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); 6514EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
6481 6515
6516void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
6517{
6518 struct perf_event_context *src_ctx;
6519 struct perf_event_context *dst_ctx;
6520 struct perf_event *event, *tmp;
6521 LIST_HEAD(events);
6522
6523 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
6524 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
6525
6526 mutex_lock(&src_ctx->mutex);
6527 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
6528 event_entry) {
6529 perf_remove_from_context(event);
6530 put_ctx(src_ctx);
6531 list_add(&event->event_entry, &events);
6532 }
6533 mutex_unlock(&src_ctx->mutex);
6534
6535 synchronize_rcu();
6536
6537 mutex_lock(&dst_ctx->mutex);
6538 list_for_each_entry_safe(event, tmp, &events, event_entry) {
6539 list_del(&event->event_entry);
6540 if (event->state >= PERF_EVENT_STATE_OFF)
6541 event->state = PERF_EVENT_STATE_INACTIVE;
6542 perf_install_in_context(dst_ctx, event, dst_cpu);
6543 get_ctx(dst_ctx);
6544 }
6545 mutex_unlock(&dst_ctx->mutex);
6546}
6547EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
6548
6482static void sync_child_event(struct perf_event *child_event, 6549static void sync_child_event(struct perf_event *child_event,
6483 struct task_struct *child) 6550 struct task_struct *child)
6484{ 6551{
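
The perf_pmu_migrate_context() helper added above detaches every event from one CPU's context, waits for RCU readers, and re-installs the events on another CPU. A minimal sketch of how a hypothetical package-scope PMU driver might use it from a CPU-hotplug path; the driver structure, the callback name and the way the target CPU is picked are assumptions for illustration, not part of this patch:

#include <linux/cpumask.h>
#include <linux/perf_event.h>
#include <linux/topology.h>

/* Hypothetical package-scope PMU driver state (illustrative only). */
struct example_pmu {
	struct pmu pmu;
	int active_cpu;		/* CPU currently carrying this PMU's events */
};

/* Called when 'dying_cpu' goes offline (hotplug callback name assumed). */
static void example_pmu_cpu_offline(struct example_pmu *epmu, int dying_cpu)
{
	int target;

	if (dying_cpu != epmu->active_cpu)
		return;

	/* Pick another online CPU in the same package, if any is left. */
	target = cpumask_any_but(topology_core_cpumask(dying_cpu), dying_cpu);
	if (target >= nr_cpu_ids)
		return;

	/* Move every event from the dying CPU's context to the target. */
	perf_pmu_migrate_context(&epmu->pmu, dying_cpu, target);
	epmu->active_cpu = target;
}
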
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index b0b107f90afc..a096c19f2c2a 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -101,7 +101,8 @@ __output_copy(struct perf_output_handle *handle,
101} 101}
102 102
103/* Callchain handling */ 103/* Callchain handling */
104extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); 104extern struct perf_callchain_entry *
105perf_callchain(struct perf_event *event, struct pt_regs *regs);
105extern int get_callchain_buffers(void); 106extern int get_callchain_buffers(void);
106extern void put_callchain_buffers(void); 107extern void put_callchain_buffers(void);
107 108
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 985be4d80fe8..c08a22d02f72 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -32,19 +32,36 @@
32#include <linux/swap.h> /* try_to_free_swap */ 32#include <linux/swap.h> /* try_to_free_swap */
33#include <linux/ptrace.h> /* user_enable_single_step */ 33#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */ 34#include <linux/kdebug.h> /* notifier mechanism */
35#include "../../mm/internal.h" /* munlock_vma_page */
35 36
36#include <linux/uprobes.h> 37#include <linux/uprobes.h>
37 38
38#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES) 39#define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
39#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE 40#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
40 41
41static struct srcu_struct uprobes_srcu;
42static struct rb_root uprobes_tree = RB_ROOT; 42static struct rb_root uprobes_tree = RB_ROOT;
43 43
44static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ 44static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
45 45
46#define UPROBES_HASH_SZ 13 46#define UPROBES_HASH_SZ 13
47 47
48/*
49 * We need separate register/unregister and mmap/munmap lock hashes because
50 * of mmap_sem nesting.
51 *
52 * uprobe_register() needs to install probes on (potentially) all processes
 53 * and thus needs to acquire multiple mmap_sems (consecutively, not
54 * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
55 * for the particular process doing the mmap.
56 *
57 * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
58 * because of lock order against i_mmap_mutex. This means there's a hole in
59 * the register vma iteration where a mmap() can happen.
60 *
61 * Thus uprobe_register() can race with uprobe_mmap() and we can try and
62 * install a probe where one is already installed.
63 */
64
48/* serialize (un)register */ 65/* serialize (un)register */
49static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; 66static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
50 67
@@ -61,17 +78,6 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
61 */ 78 */
62static atomic_t uprobe_events = ATOMIC_INIT(0); 79static atomic_t uprobe_events = ATOMIC_INIT(0);
63 80
64/*
65 * Maintain a temporary per vma info that can be used to search if a vma
66 * has already been handled. This structure is introduced since extending
67 * vm_area_struct wasnt recommended.
68 */
69struct vma_info {
70 struct list_head probe_list;
71 struct mm_struct *mm;
72 loff_t vaddr;
73};
74
75struct uprobe { 81struct uprobe {
76 struct rb_node rb_node; /* node in the rb tree */ 82 struct rb_node rb_node; /* node in the rb tree */
77 atomic_t ref; 83 atomic_t ref;
@@ -100,20 +106,21 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register)
100 if (!is_register) 106 if (!is_register)
101 return true; 107 return true;
102 108
103 if ((vma->vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) == (VM_READ|VM_EXEC)) 109 if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED))
110 == (VM_READ|VM_EXEC))
104 return true; 111 return true;
105 112
106 return false; 113 return false;
107} 114}
108 115
109static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) 116static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
110{ 117{
111 loff_t vaddr; 118 return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
112 119}
113 vaddr = vma->vm_start + offset;
114 vaddr -= vma->vm_pgoff << PAGE_SHIFT;
115 120
116 return vaddr; 121static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
122{
123 return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
117} 124}
118 125
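
offset_to_vaddr() and vaddr_to_offset() above are pure conversions between a file offset and the user virtual address inside a vma, with vm_pgoff widened to loff_t before the shift so large file offsets survive on 32-bit. A stand-alone sketch of the same arithmetic with toy values (plain user-space C; the struct and page size stand in for the kernel's types):

#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SHIFT 12	/* 4 KiB pages, as on most architectures */

struct toy_vma {
	uint64_t vm_start;	/* first mapped virtual address */
	uint64_t vm_pgoff;	/* file offset of vm_start, in pages */
};

static uint64_t offset_to_vaddr(const struct toy_vma *vma, uint64_t offset)
{
	return vma->vm_start + offset - (vma->vm_pgoff << TOY_PAGE_SHIFT);
}

static uint64_t vaddr_to_offset(const struct toy_vma *vma, uint64_t vaddr)
{
	return (vma->vm_pgoff << TOY_PAGE_SHIFT) + (vaddr - vma->vm_start);
}

int main(void)
{
	/* Text segment mapped at 0x400000, starting at file offset 0x1000. */
	struct toy_vma vma = { .vm_start = 0x400000, .vm_pgoff = 1 };
	uint64_t off = 0x1234;		/* probe offset inside the file */
	uint64_t va = offset_to_vaddr(&vma, off);

	printf("offset 0x%llx -> vaddr 0x%llx -> offset 0x%llx\n",
	       (unsigned long long)off, (unsigned long long)va,
	       (unsigned long long)vaddr_to_offset(&vma, va));
	return 0;
}
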
119/** 126/**
@@ -121,41 +128,27 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset)
121 * based on replace_page in mm/ksm.c 128 * based on replace_page in mm/ksm.c
122 * 129 *
123 * @vma: vma that holds the pte pointing to page 130 * @vma: vma that holds the pte pointing to page
131 * @addr: address the old @page is mapped at
124 * @page: the cowed page we are replacing by kpage 132 * @page: the cowed page we are replacing by kpage
125 * @kpage: the modified page we replace page by 133 * @kpage: the modified page we replace page by
126 * 134 *
127 * Returns 0 on success, -EFAULT on failure. 135 * Returns 0 on success, -EFAULT on failure.
128 */ 136 */
129static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) 137static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
138 struct page *page, struct page *kpage)
130{ 139{
131 struct mm_struct *mm = vma->vm_mm; 140 struct mm_struct *mm = vma->vm_mm;
132 pgd_t *pgd;
133 pud_t *pud;
134 pmd_t *pmd;
135 pte_t *ptep;
136 spinlock_t *ptl; 141 spinlock_t *ptl;
137 unsigned long addr; 142 pte_t *ptep;
138 int err = -EFAULT; 143 int err;
139
140 addr = page_address_in_vma(page, vma);
141 if (addr == -EFAULT)
142 goto out;
143
144 pgd = pgd_offset(mm, addr);
145 if (!pgd_present(*pgd))
146 goto out;
147
148 pud = pud_offset(pgd, addr);
149 if (!pud_present(*pud))
150 goto out;
151 144
152 pmd = pmd_offset(pud, addr); 145 /* For try_to_free_swap() and munlock_vma_page() below */
153 if (!pmd_present(*pmd)) 146 lock_page(page);
154 goto out;
155 147
156 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 148 err = -EAGAIN;
149 ptep = page_check_address(page, mm, addr, &ptl, 0);
157 if (!ptep) 150 if (!ptep)
158 goto out; 151 goto unlock;
159 152
160 get_page(kpage); 153 get_page(kpage);
161 page_add_new_anon_rmap(kpage, vma, addr); 154 page_add_new_anon_rmap(kpage, vma, addr);
@@ -172,11 +165,15 @@ static int __replace_page(struct vm_area_struct *vma, struct page *page, struct
172 page_remove_rmap(page); 165 page_remove_rmap(page);
173 if (!page_mapped(page)) 166 if (!page_mapped(page))
174 try_to_free_swap(page); 167 try_to_free_swap(page);
175 put_page(page);
176 pte_unmap_unlock(ptep, ptl); 168 pte_unmap_unlock(ptep, ptl);
177 err = 0;
178 169
179out: 170 if (vma->vm_flags & VM_LOCKED)
171 munlock_vma_page(page);
172 put_page(page);
173
174 err = 0;
175 unlock:
176 unlock_page(page);
180 return err; 177 return err;
181} 178}
182 179
@@ -218,79 +215,46 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
218 unsigned long vaddr, uprobe_opcode_t opcode) 215 unsigned long vaddr, uprobe_opcode_t opcode)
219{ 216{
220 struct page *old_page, *new_page; 217 struct page *old_page, *new_page;
221 struct address_space *mapping;
222 void *vaddr_old, *vaddr_new; 218 void *vaddr_old, *vaddr_new;
223 struct vm_area_struct *vma; 219 struct vm_area_struct *vma;
224 struct uprobe *uprobe;
225 loff_t addr;
226 int ret; 220 int ret;
227 221
222retry:
228 /* Read the page with vaddr into memory */ 223 /* Read the page with vaddr into memory */
229 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); 224 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma);
230 if (ret <= 0) 225 if (ret <= 0)
231 return ret; 226 return ret;
232 227
233 ret = -EINVAL;
234
235 /*
236 * We are interested in text pages only. Our pages of interest
237 * should be mapped for read and execute only. We desist from
238 * adding probes in write mapped pages since the breakpoints
239 * might end up in the file copy.
240 */
241 if (!valid_vma(vma, is_swbp_insn(&opcode)))
242 goto put_out;
243
244 uprobe = container_of(auprobe, struct uprobe, arch);
245 mapping = uprobe->inode->i_mapping;
246 if (mapping != vma->vm_file->f_mapping)
247 goto put_out;
248
249 addr = vma_address(vma, uprobe->offset);
250 if (vaddr != (unsigned long)addr)
251 goto put_out;
252
253 ret = -ENOMEM; 228 ret = -ENOMEM;
254 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); 229 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
255 if (!new_page) 230 if (!new_page)
256 goto put_out; 231 goto put_old;
257 232
258 __SetPageUptodate(new_page); 233 __SetPageUptodate(new_page);
259 234
260 /*
261 * lock page will serialize against do_wp_page()'s
262 * PageAnon() handling
263 */
264 lock_page(old_page);
265 /* copy the page now that we've got it stable */ 235 /* copy the page now that we've got it stable */
266 vaddr_old = kmap_atomic(old_page); 236 vaddr_old = kmap_atomic(old_page);
267 vaddr_new = kmap_atomic(new_page); 237 vaddr_new = kmap_atomic(new_page);
268 238
269 memcpy(vaddr_new, vaddr_old, PAGE_SIZE); 239 memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
270 240 memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
271 /* poke the new insn in, ASSUMES we don't cross page boundary */
272 vaddr &= ~PAGE_MASK;
273 BUG_ON(vaddr + UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
274 memcpy(vaddr_new + vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
275 241
276 kunmap_atomic(vaddr_new); 242 kunmap_atomic(vaddr_new);
277 kunmap_atomic(vaddr_old); 243 kunmap_atomic(vaddr_old);
278 244
279 ret = anon_vma_prepare(vma); 245 ret = anon_vma_prepare(vma);
280 if (ret) 246 if (ret)
281 goto unlock_out; 247 goto put_new;
282 248
283 lock_page(new_page); 249 ret = __replace_page(vma, vaddr, old_page, new_page);
284 ret = __replace_page(vma, old_page, new_page);
285 unlock_page(new_page);
286 250
287unlock_out: 251put_new:
288 unlock_page(old_page);
289 page_cache_release(new_page); 252 page_cache_release(new_page);
290 253put_old:
291put_out:
292 put_page(old_page); 254 put_page(old_page);
293 255
256 if (unlikely(ret == -EAGAIN))
257 goto retry;
294 return ret; 258 return ret;
295} 259}
296 260
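
write_opcode() now restarts from the top when __replace_page() returns -EAGAIN, i.e. when the pte for the pinned page could no longer be found by the time the replacement was attempted. A tiny stand-alone sketch of that retry shape, with the kernel internals replaced by a stub that fails transiently (all names illustrative):

#include <errno.h>
#include <stdio.h>

/* Stub standing in for __replace_page(): fails transiently twice. */
static int replace_page_stub(void)
{
	static int transient_failures = 2;

	if (transient_failures-- > 0)
		return -EAGAIN;	/* lost a race, caller should retry */
	return 0;
}

static int write_opcode_like(void)
{
	int ret;

retry:
	/* ... re-pin the page and rebuild the copy on every attempt ... */
	ret = replace_page_stub();
	if (ret == -EAGAIN)
		goto retry;
	return ret;
}

int main(void)
{
	printf("write_opcode_like() = %d\n", write_opcode_like());
	return 0;
}
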
@@ -312,7 +276,7 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_
312 void *vaddr_new; 276 void *vaddr_new;
313 int ret; 277 int ret;
314 278
315 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &page, NULL); 279 ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
316 if (ret <= 0) 280 if (ret <= 0)
317 return ret; 281 return ret;
318 282
@@ -333,10 +297,20 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
333 uprobe_opcode_t opcode; 297 uprobe_opcode_t opcode;
334 int result; 298 int result;
335 299
300 if (current->mm == mm) {
301 pagefault_disable();
302 result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
303 sizeof(opcode));
304 pagefault_enable();
305
306 if (likely(result == 0))
307 goto out;
308 }
309
336 result = read_opcode(mm, vaddr, &opcode); 310 result = read_opcode(mm, vaddr, &opcode);
337 if (result) 311 if (result)
338 return result; 312 return result;
339 313out:
340 if (is_swbp_insn(&opcode)) 314 if (is_swbp_insn(&opcode))
341 return 1; 315 return 1;
342 316
@@ -355,7 +329,9 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
355int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 329int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
356{ 330{
357 int result; 331 int result;
358 332 /*
333 * See the comment near uprobes_hash().
334 */
359 result = is_swbp_at_addr(mm, vaddr); 335 result = is_swbp_at_addr(mm, vaddr);
360 if (result == 1) 336 if (result == 1)
361 return -EEXIST; 337 return -EEXIST;
@@ -520,7 +496,6 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
520 uprobe->inode = igrab(inode); 496 uprobe->inode = igrab(inode);
521 uprobe->offset = offset; 497 uprobe->offset = offset;
522 init_rwsem(&uprobe->consumer_rwsem); 498 init_rwsem(&uprobe->consumer_rwsem);
523 INIT_LIST_HEAD(&uprobe->pending_list);
524 499
525 /* add to uprobes_tree, sorted on inode:offset */ 500 /* add to uprobes_tree, sorted on inode:offset */
526 cur_uprobe = insert_uprobe(uprobe); 501 cur_uprobe = insert_uprobe(uprobe);
@@ -588,20 +563,22 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
588} 563}
589 564
590static int 565static int
591__copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *insn, 566__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
592 unsigned long nbytes, unsigned long offset) 567 unsigned long nbytes, loff_t offset)
593{ 568{
594 struct file *filp = vma->vm_file;
595 struct page *page; 569 struct page *page;
596 void *vaddr; 570 void *vaddr;
597 unsigned long off1; 571 unsigned long off;
598 unsigned long idx; 572 pgoff_t idx;
599 573
600 if (!filp) 574 if (!filp)
601 return -EINVAL; 575 return -EINVAL;
602 576
603 idx = (unsigned long)(offset >> PAGE_CACHE_SHIFT); 577 if (!mapping->a_ops->readpage)
604 off1 = offset &= ~PAGE_MASK; 578 return -EIO;
579
580 idx = offset >> PAGE_CACHE_SHIFT;
581 off = offset & ~PAGE_MASK;
605 582
606 /* 583 /*
607 * Ensure that the page that has the original instruction is 584 * Ensure that the page that has the original instruction is
@@ -612,22 +589,20 @@ __copy_insn(struct address_space *mapping, struct vm_area_struct *vma, char *ins
612 return PTR_ERR(page); 589 return PTR_ERR(page);
613 590
614 vaddr = kmap_atomic(page); 591 vaddr = kmap_atomic(page);
615 memcpy(insn, vaddr + off1, nbytes); 592 memcpy(insn, vaddr + off, nbytes);
616 kunmap_atomic(vaddr); 593 kunmap_atomic(vaddr);
617 page_cache_release(page); 594 page_cache_release(page);
618 595
619 return 0; 596 return 0;
620} 597}
621 598
622static int 599static int copy_insn(struct uprobe *uprobe, struct file *filp)
623copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
624{ 600{
625 struct address_space *mapping; 601 struct address_space *mapping;
626 unsigned long nbytes; 602 unsigned long nbytes;
627 int bytes; 603 int bytes;
628 604
629 addr &= ~PAGE_MASK; 605 nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
630 nbytes = PAGE_SIZE - addr;
631 mapping = uprobe->inode->i_mapping; 606 mapping = uprobe->inode->i_mapping;
632 607
633 /* Instruction at end of binary; copy only available bytes */ 608 /* Instruction at end of binary; copy only available bytes */
@@ -638,13 +613,13 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
638 613
639 /* Instruction at the page-boundary; copy bytes in second page */ 614 /* Instruction at the page-boundary; copy bytes in second page */
640 if (nbytes < bytes) { 615 if (nbytes < bytes) {
641 if (__copy_insn(mapping, vma, uprobe->arch.insn + nbytes, 616 int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
642 bytes - nbytes, uprobe->offset + nbytes)) 617 bytes - nbytes, uprobe->offset + nbytes);
643 return -ENOMEM; 618 if (err)
644 619 return err;
645 bytes = nbytes; 620 bytes = nbytes;
646 } 621 }
647 return __copy_insn(mapping, vma, uprobe->arch.insn, bytes, uprobe->offset); 622 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
648} 623}
649 624
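
copy_insn() now derives the first chunk size straight from the probe's file offset: PAGE_SIZE minus the offset within its page is what can be read from that page, and anything beyond it is fetched from the following page by the second __copy_insn() call. A small stand-alone illustration of that split, assuming 4 KiB pages and toy numbers:

#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SIZE 4096u
#define TOY_PAGE_MASK (~(TOY_PAGE_SIZE - 1))

int main(void)
{
	uint64_t offset = 0xffa;	/* probe 6 bytes before a page boundary */
	unsigned int insn_len = 16;	/* bytes we want to copy */

	/* Room left in the page that contains 'offset'. */
	unsigned int first = TOY_PAGE_SIZE - (unsigned int)(offset & ~TOY_PAGE_MASK);

	if (first >= insn_len) {
		printf("single copy: %u bytes from offset 0x%llx\n",
		       insn_len, (unsigned long long)offset);
	} else {
		printf("split copy: %u bytes from 0x%llx, then %u bytes from 0x%llx\n",
		       first, (unsigned long long)offset,
		       insn_len - first, (unsigned long long)(offset + first));
	}
	return 0;
}
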
650/* 625/*
@@ -672,9 +647,8 @@ copy_insn(struct uprobe *uprobe, struct vm_area_struct *vma, unsigned long addr)
672 */ 647 */
673static int 648static int
674install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, 649install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
675 struct vm_area_struct *vma, loff_t vaddr) 650 struct vm_area_struct *vma, unsigned long vaddr)
676{ 651{
677 unsigned long addr;
678 int ret; 652 int ret;
679 653
680 /* 654 /*
@@ -687,20 +661,22 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
687 if (!uprobe->consumers) 661 if (!uprobe->consumers)
688 return -EEXIST; 662 return -EEXIST;
689 663
690 addr = (unsigned long)vaddr;
691
692 if (!(uprobe->flags & UPROBE_COPY_INSN)) { 664 if (!(uprobe->flags & UPROBE_COPY_INSN)) {
693 ret = copy_insn(uprobe, vma, addr); 665 ret = copy_insn(uprobe, vma->vm_file);
694 if (ret) 666 if (ret)
695 return ret; 667 return ret;
696 668
697 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) 669 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
698 return -EEXIST; 670 return -ENOTSUPP;
699 671
700 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm); 672 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
701 if (ret) 673 if (ret)
702 return ret; 674 return ret;
703 675
676 /* write_opcode() assumes we don't cross page boundary */
677 BUG_ON((uprobe->offset & ~PAGE_MASK) +
678 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
679
704 uprobe->flags |= UPROBE_COPY_INSN; 680 uprobe->flags |= UPROBE_COPY_INSN;
705 } 681 }
706 682
@@ -713,7 +689,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
713 * Hence increment before and decrement on failure. 689 * Hence increment before and decrement on failure.
714 */ 690 */
715 atomic_inc(&mm->uprobes_state.count); 691 atomic_inc(&mm->uprobes_state.count);
716 ret = set_swbp(&uprobe->arch, mm, addr); 692 ret = set_swbp(&uprobe->arch, mm, vaddr);
717 if (ret) 693 if (ret)
718 atomic_dec(&mm->uprobes_state.count); 694 atomic_dec(&mm->uprobes_state.count);
719 695
@@ -721,27 +697,21 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
721} 697}
722 698
723static void 699static void
724remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, loff_t vaddr) 700remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
725{ 701{
726 if (!set_orig_insn(&uprobe->arch, mm, (unsigned long)vaddr, true)) 702 if (!set_orig_insn(&uprobe->arch, mm, vaddr, true))
727 atomic_dec(&mm->uprobes_state.count); 703 atomic_dec(&mm->uprobes_state.count);
728} 704}
729 705
730/* 706/*
731 * There could be threads that have hit the breakpoint and are entering the 707 * There could be threads that have already hit the breakpoint. They
732 * notifier code and trying to acquire the uprobes_treelock. The thread 708 * will recheck the current insn and restart if find_uprobe() fails.
733 * calling delete_uprobe() that is removing the uprobe from the rb_tree can 709 * See find_active_uprobe().
734 * race with these threads and might acquire the uprobes_treelock compared
735 * to some of the breakpoint hit threads. In such a case, the breakpoint
736 * hit threads will not find the uprobe. The current unregistering thread
737 * waits till all other threads have hit a breakpoint, to acquire the
738 * uprobes_treelock before the uprobe is removed from the rbtree.
739 */ 710 */
740static void delete_uprobe(struct uprobe *uprobe) 711static void delete_uprobe(struct uprobe *uprobe)
741{ 712{
742 unsigned long flags; 713 unsigned long flags;
743 714
744 synchronize_srcu(&uprobes_srcu);
745 spin_lock_irqsave(&uprobes_treelock, flags); 715 spin_lock_irqsave(&uprobes_treelock, flags);
746 rb_erase(&uprobe->rb_node, &uprobes_tree); 716 rb_erase(&uprobe->rb_node, &uprobes_tree);
747 spin_unlock_irqrestore(&uprobes_treelock, flags); 717 spin_unlock_irqrestore(&uprobes_treelock, flags);
@@ -750,139 +720,136 @@ static void delete_uprobe(struct uprobe *uprobe)
750 atomic_dec(&uprobe_events); 720 atomic_dec(&uprobe_events);
751} 721}
752 722
753static struct vma_info * 723struct map_info {
754__find_next_vma_info(struct address_space *mapping, struct list_head *head, 724 struct map_info *next;
755 struct vma_info *vi, loff_t offset, bool is_register) 725 struct mm_struct *mm;
726 unsigned long vaddr;
727};
728
729static inline struct map_info *free_map_info(struct map_info *info)
730{
731 struct map_info *next = info->next;
732 kfree(info);
733 return next;
734}
735
736static struct map_info *
737build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
756{ 738{
739 unsigned long pgoff = offset >> PAGE_SHIFT;
757 struct prio_tree_iter iter; 740 struct prio_tree_iter iter;
758 struct vm_area_struct *vma; 741 struct vm_area_struct *vma;
759 struct vma_info *tmpvi; 742 struct map_info *curr = NULL;
760 unsigned long pgoff; 743 struct map_info *prev = NULL;
761 int existing_vma; 744 struct map_info *info;
762 loff_t vaddr; 745 int more = 0;
763
764 pgoff = offset >> PAGE_SHIFT;
765 746
747 again:
748 mutex_lock(&mapping->i_mmap_mutex);
766 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 749 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
767 if (!valid_vma(vma, is_register)) 750 if (!valid_vma(vma, is_register))
768 continue; 751 continue;
769 752
770 existing_vma = 0; 753 if (!prev && !more) {
771 vaddr = vma_address(vma, offset); 754 /*
772 755 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
773 list_for_each_entry(tmpvi, head, probe_list) { 756 * reclaim. This is optimistic, no harm done if it fails.
774 if (tmpvi->mm == vma->vm_mm && tmpvi->vaddr == vaddr) { 757 */
775 existing_vma = 1; 758 prev = kmalloc(sizeof(struct map_info),
776 break; 759 GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
777 } 760 if (prev)
761 prev->next = NULL;
778 } 762 }
779 763 if (!prev) {
780 /* 764 more++;
781 * Another vma needs a probe to be installed. However skip 765 continue;
782 * installing the probe if the vma is about to be unlinked.
783 */
784 if (!existing_vma && atomic_inc_not_zero(&vma->vm_mm->mm_users)) {
785 vi->mm = vma->vm_mm;
786 vi->vaddr = vaddr;
787 list_add(&vi->probe_list, head);
788
789 return vi;
790 } 766 }
791 }
792
793 return NULL;
794}
795 767
796/* 768 if (!atomic_inc_not_zero(&vma->vm_mm->mm_users))
797 * Iterate in the rmap prio tree and find a vma where a probe has not 769 continue;
798 * yet been inserted.
799 */
800static struct vma_info *
801find_next_vma_info(struct address_space *mapping, struct list_head *head,
802 loff_t offset, bool is_register)
803{
804 struct vma_info *vi, *retvi;
805 770
806 vi = kzalloc(sizeof(struct vma_info), GFP_KERNEL); 771 info = prev;
807 if (!vi) 772 prev = prev->next;
808 return ERR_PTR(-ENOMEM); 773 info->next = curr;
774 curr = info;
809 775
810 mutex_lock(&mapping->i_mmap_mutex); 776 info->mm = vma->vm_mm;
811 retvi = __find_next_vma_info(mapping, head, vi, offset, is_register); 777 info->vaddr = offset_to_vaddr(vma, offset);
778 }
812 mutex_unlock(&mapping->i_mmap_mutex); 779 mutex_unlock(&mapping->i_mmap_mutex);
813 780
814 if (!retvi) 781 if (!more)
815 kfree(vi); 782 goto out;
783
784 prev = curr;
785 while (curr) {
786 mmput(curr->mm);
787 curr = curr->next;
788 }
816 789
817 return retvi; 790 do {
791 info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
792 if (!info) {
793 curr = ERR_PTR(-ENOMEM);
794 goto out;
795 }
796 info->next = prev;
797 prev = info;
798 } while (--more);
799
800 goto again;
801 out:
802 while (prev)
803 prev = free_map_info(prev);
804 return curr;
818} 805}
819 806
820static int register_for_each_vma(struct uprobe *uprobe, bool is_register) 807static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
821{ 808{
822 struct list_head try_list; 809 struct map_info *info;
823 struct vm_area_struct *vma; 810 int err = 0;
824 struct address_space *mapping;
825 struct vma_info *vi, *tmpvi;
826 struct mm_struct *mm;
827 loff_t vaddr;
828 int ret;
829
830 mapping = uprobe->inode->i_mapping;
831 INIT_LIST_HEAD(&try_list);
832 811
833 ret = 0; 812 info = build_map_info(uprobe->inode->i_mapping,
813 uprobe->offset, is_register);
814 if (IS_ERR(info))
815 return PTR_ERR(info);
834 816
835 for (;;) { 817 while (info) {
836 vi = find_next_vma_info(mapping, &try_list, uprobe->offset, is_register); 818 struct mm_struct *mm = info->mm;
837 if (!vi) 819 struct vm_area_struct *vma;
838 break;
839 820
840 if (IS_ERR(vi)) { 821 if (err)
841 ret = PTR_ERR(vi); 822 goto free;
842 break;
843 }
844 823
845 mm = vi->mm; 824 down_write(&mm->mmap_sem);
846 down_read(&mm->mmap_sem); 825 vma = find_vma(mm, info->vaddr);
847 vma = find_vma(mm, (unsigned long)vi->vaddr); 826 if (!vma || !valid_vma(vma, is_register) ||
848 if (!vma || !valid_vma(vma, is_register)) { 827 vma->vm_file->f_mapping->host != uprobe->inode)
849 list_del(&vi->probe_list); 828 goto unlock;
850 kfree(vi);
851 up_read(&mm->mmap_sem);
852 mmput(mm);
853 continue;
854 }
855 vaddr = vma_address(vma, uprobe->offset);
856 if (vma->vm_file->f_mapping->host != uprobe->inode ||
857 vaddr != vi->vaddr) {
858 list_del(&vi->probe_list);
859 kfree(vi);
860 up_read(&mm->mmap_sem);
861 mmput(mm);
862 continue;
863 }
864 829
865 if (is_register) 830 if (vma->vm_start > info->vaddr ||
866 ret = install_breakpoint(uprobe, mm, vma, vi->vaddr); 831 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
867 else 832 goto unlock;
868 remove_breakpoint(uprobe, mm, vi->vaddr);
869 833
870 up_read(&mm->mmap_sem);
871 mmput(mm);
872 if (is_register) { 834 if (is_register) {
873 if (ret && ret == -EEXIST) 835 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
874 ret = 0; 836 /*
875 if (ret) 837 * We can race against uprobe_mmap(), see the
 876 break; 838 * comment near uprobes_hash().
839 */
840 if (err == -EEXIST)
841 err = 0;
842 } else {
843 remove_breakpoint(uprobe, mm, info->vaddr);
877 } 844 }
845 unlock:
846 up_write(&mm->mmap_sem);
847 free:
848 mmput(mm);
849 info = free_map_info(info);
878 } 850 }
879 851
880 list_for_each_entry_safe(vi, tmpvi, &try_list, probe_list) { 852 return err;
881 list_del(&vi->probe_list);
882 kfree(vi);
883 }
884
885 return ret;
886} 853}
887 854
888static int __uprobe_register(struct uprobe *uprobe) 855static int __uprobe_register(struct uprobe *uprobe)
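
register_for_each_vma() now takes its worklist from build_map_info() above, which may only allocate with GFP_NOWAIT while i_mmap_mutex is held (a blocking allocation could recurse into reclaim and back into the same mutex); vmas it could not get a node for are merely counted, the missing nodes are allocated with GFP_KERNEL after the mutex is dropped, and the tree is walked again. A stand-alone sketch of that two-pass pattern, with a budget-limited allocator standing in for GFP_NOWAIT (all names illustrative):

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int value; };

/* Stand-in for GFP_NOWAIT: only a small reserve of allocations succeeds. */
static int nowait_budget = 2;

static struct node *alloc_nowait(void)
{
	return nowait_budget-- > 0 ? calloc(1, sizeof(struct node)) : NULL;
}

int main(void)
{
	int items[] = { 10, 20, 30, 40, 50 };	/* the "vmas" to record */
	int n = 5, i, more;
	struct node *curr = NULL, *prev = NULL, *info;

again:
	/* --- pass over the data "under the lock": no blocking allocations --- */
	curr = NULL;
	more = 0;
	for (i = 0; i < n; i++) {
		if (!prev && !more)
			prev = alloc_nowait();
		if (!prev) {		/* no node available: just count it */
			more++;
			continue;
		}
		info = prev;		/* take a node from the spare pool */
		prev = prev->next;
		info->next = curr;
		curr = info;
		info->value = items[i];
	}
	/* --- "lock" dropped --- */
	if (more) {
		/* Reuse what was collected as the spare pool, top it up with
		 * blocking allocations, then scan again from scratch. */
		prev = curr;
		while (more--) {
			info = calloc(1, sizeof(struct node));
			if (!info)
				return 1;
			info->next = prev;
			prev = info;
		}
		goto again;
	}

	for (info = curr; info; info = info->next)
		printf("collected %d\n", info->value);
	while (curr) {			/* free_map_info()-style consumption */
		info = curr->next;
		free(curr);
		curr = info;
	}
	while (prev) {
		info = prev->next;
		free(prev);
		prev = info;
	}
	return 0;
}
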
@@ -977,59 +944,66 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
977 put_uprobe(uprobe); 944 put_uprobe(uprobe);
978} 945}
979 946
980/* 947static struct rb_node *
981 * Of all the nodes that correspond to the given inode, return the node 948find_node_in_range(struct inode *inode, loff_t min, loff_t max)
982 * with the least offset.
983 */
984static struct rb_node *find_least_offset_node(struct inode *inode)
985{ 949{
986 struct uprobe u = { .inode = inode, .offset = 0};
987 struct rb_node *n = uprobes_tree.rb_node; 950 struct rb_node *n = uprobes_tree.rb_node;
988 struct rb_node *close_node = NULL;
989 struct uprobe *uprobe;
990 int match;
991 951
992 while (n) { 952 while (n) {
993 uprobe = rb_entry(n, struct uprobe, rb_node); 953 struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
994 match = match_uprobe(&u, uprobe);
995 954
996 if (uprobe->inode == inode) 955 if (inode < u->inode) {
997 close_node = n;
998
999 if (!match)
1000 return close_node;
1001
1002 if (match < 0)
1003 n = n->rb_left; 956 n = n->rb_left;
1004 else 957 } else if (inode > u->inode) {
1005 n = n->rb_right; 958 n = n->rb_right;
959 } else {
960 if (max < u->offset)
961 n = n->rb_left;
962 else if (min > u->offset)
963 n = n->rb_right;
964 else
965 break;
966 }
1006 } 967 }
1007 968
1008 return close_node; 969 return n;
1009} 970}
1010 971
1011/* 972/*
1012 * For a given inode, build a list of probes that need to be inserted. 973 * For a given range in vma, build a list of probes that need to be inserted.
1013 */ 974 */
1014static void build_probe_list(struct inode *inode, struct list_head *head) 975static void build_probe_list(struct inode *inode,
976 struct vm_area_struct *vma,
977 unsigned long start, unsigned long end,
978 struct list_head *head)
1015{ 979{
1016 struct uprobe *uprobe; 980 loff_t min, max;
1017 unsigned long flags; 981 unsigned long flags;
1018 struct rb_node *n; 982 struct rb_node *n, *t;
1019 983 struct uprobe *u;
1020 spin_lock_irqsave(&uprobes_treelock, flags);
1021
1022 n = find_least_offset_node(inode);
1023 984
1024 for (; n; n = rb_next(n)) { 985 INIT_LIST_HEAD(head);
1025 uprobe = rb_entry(n, struct uprobe, rb_node); 986 min = vaddr_to_offset(vma, start);
1026 if (uprobe->inode != inode) 987 max = min + (end - start) - 1;
1027 break;
1028 988
1029 list_add(&uprobe->pending_list, head); 989 spin_lock_irqsave(&uprobes_treelock, flags);
1030 atomic_inc(&uprobe->ref); 990 n = find_node_in_range(inode, min, max);
991 if (n) {
992 for (t = n; t; t = rb_prev(t)) {
993 u = rb_entry(t, struct uprobe, rb_node);
994 if (u->inode != inode || u->offset < min)
995 break;
996 list_add(&u->pending_list, head);
997 atomic_inc(&u->ref);
998 }
999 for (t = n; (t = rb_next(t)); ) {
1000 u = rb_entry(t, struct uprobe, rb_node);
1001 if (u->inode != inode || u->offset > max)
1002 break;
1003 list_add(&u->pending_list, head);
1004 atomic_inc(&u->ref);
1005 }
1031 } 1006 }
1032
1033 spin_unlock_irqrestore(&uprobes_treelock, flags); 1007 spin_unlock_irqrestore(&uprobes_treelock, flags);
1034} 1008}
1035 1009
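
find_node_in_range() descends the rbtree to any probe whose offset lies inside [min, max], and the two loops in build_probe_list() then walk rb_prev()/rb_next() from that node to collect the rest of the range. The same find-one-then-expand idea on a plain sorted array, with a binary search standing in for the rbtree descent (stand-alone C, toy offsets):

#include <stdio.h>

/* Probe offsets for one inode, kept sorted (the rbtree's in-order view). */
static const long offsets[] = { 0x40, 0x100, 0x180, 0x1f0, 0x400, 0x900 };
static const int nr = sizeof(offsets) / sizeof(offsets[0]);

/* Return the index of any element in [min, max], or -1 if none exists. */
static int find_in_range(long min, long max)
{
	int lo = 0, hi = nr - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;

		if (offsets[mid] > max)
			hi = mid - 1;
		else if (offsets[mid] < min)
			lo = mid + 1;
		else
			return mid;	/* any hit inside the range will do */
	}
	return -1;
}

int main(void)
{
	long min = 0x100, max = 0x3ff;	/* offsets covered by one vma */
	int n = find_in_range(min, max);
	int t;

	if (n < 0)
		return 0;
	/* Expand left then right from the hit, like the two loops over
	 * rb_prev()/rb_next() in build_probe_list(). */
	for (t = n; t >= 0 && offsets[t] >= min; t--)
		printf("probe at 0x%lx\n", offsets[t]);
	for (t = n + 1; t < nr && offsets[t] <= max; t++)
		printf("probe at 0x%lx\n", offsets[t]);
	return 0;
}
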
@@ -1059,28 +1033,21 @@ int uprobe_mmap(struct vm_area_struct *vma)
1059 if (!inode) 1033 if (!inode)
1060 return 0; 1034 return 0;
1061 1035
1062 INIT_LIST_HEAD(&tmp_list);
1063 mutex_lock(uprobes_mmap_hash(inode)); 1036 mutex_lock(uprobes_mmap_hash(inode));
1064 build_probe_list(inode, &tmp_list); 1037 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
1065 1038
1066 ret = 0; 1039 ret = 0;
1067 count = 0; 1040 count = 0;
1068 1041
1069 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1042 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1070 loff_t vaddr;
1071
1072 list_del(&uprobe->pending_list);
1073 if (!ret) { 1043 if (!ret) {
1074 vaddr = vma_address(vma, uprobe->offset); 1044 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1075
1076 if (vaddr < vma->vm_start || vaddr >= vma->vm_end) {
1077 put_uprobe(uprobe);
1078 continue;
1079 }
1080 1045
1081 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1046 ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1082 1047 /*
1083 /* Ignore double add: */ 1048 * We can race against uprobe_register(), see the
1049 * comment near uprobe_hash().
1050 */
1084 if (ret == -EEXIST) { 1051 if (ret == -EEXIST) {
1085 ret = 0; 1052 ret = 0;
1086 1053
@@ -1121,6 +1088,9 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1121 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1088 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
1122 return; 1089 return;
1123 1090
1091 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1092 return;
1093
1124 if (!atomic_read(&vma->vm_mm->uprobes_state.count)) 1094 if (!atomic_read(&vma->vm_mm->uprobes_state.count))
1125 return; 1095 return;
1126 1096
@@ -1128,24 +1098,17 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1128 if (!inode) 1098 if (!inode)
1129 return; 1099 return;
1130 1100
1131 INIT_LIST_HEAD(&tmp_list);
1132 mutex_lock(uprobes_mmap_hash(inode)); 1101 mutex_lock(uprobes_mmap_hash(inode));
1133 build_probe_list(inode, &tmp_list); 1102 build_probe_list(inode, vma, start, end, &tmp_list);
1134 1103
1135 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1104 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1136 loff_t vaddr; 1105 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1137 1106 /*
1138 list_del(&uprobe->pending_list); 1107 * An unregister could have removed the probe before
1139 vaddr = vma_address(vma, uprobe->offset); 1108 * unmap. So check before we decrement the count.
1140 1109 */
1141 if (vaddr >= start && vaddr < end) { 1110 if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
1142 /* 1111 atomic_dec(&vma->vm_mm->uprobes_state.count);
1143 * An unregister could have removed the probe before
1144 * unmap. So check before we decrement the count.
1145 */
1146 if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
1147 atomic_dec(&vma->vm_mm->uprobes_state.count);
1148 }
1149 put_uprobe(uprobe); 1112 put_uprobe(uprobe);
1150 } 1113 }
1151 mutex_unlock(uprobes_mmap_hash(inode)); 1114 mutex_unlock(uprobes_mmap_hash(inode));
@@ -1378,9 +1341,6 @@ void uprobe_free_utask(struct task_struct *t)
1378{ 1341{
1379 struct uprobe_task *utask = t->utask; 1342 struct uprobe_task *utask = t->utask;
1380 1343
1381 if (t->uprobe_srcu_id != -1)
1382 srcu_read_unlock_raw(&uprobes_srcu, t->uprobe_srcu_id);
1383
1384 if (!utask) 1344 if (!utask)
1385 return; 1345 return;
1386 1346
@@ -1398,7 +1358,6 @@ void uprobe_free_utask(struct task_struct *t)
1398void uprobe_copy_process(struct task_struct *t) 1358void uprobe_copy_process(struct task_struct *t)
1399{ 1359{
1400 t->utask = NULL; 1360 t->utask = NULL;
1401 t->uprobe_srcu_id = -1;
1402} 1361}
1403 1362
1404/* 1363/*
@@ -1417,7 +1376,6 @@ static struct uprobe_task *add_utask(void)
1417 if (unlikely(!utask)) 1376 if (unlikely(!utask))
1418 return NULL; 1377 return NULL;
1419 1378
1420 utask->active_uprobe = NULL;
1421 current->utask = utask; 1379 current->utask = utask;
1422 return utask; 1380 return utask;
1423} 1381}
@@ -1479,41 +1437,61 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
1479 return false; 1437 return false;
1480} 1438}
1481 1439
1440static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1441{
1442 struct mm_struct *mm = current->mm;
1443 struct uprobe *uprobe = NULL;
1444 struct vm_area_struct *vma;
1445
1446 down_read(&mm->mmap_sem);
1447 vma = find_vma(mm, bp_vaddr);
1448 if (vma && vma->vm_start <= bp_vaddr) {
1449 if (valid_vma(vma, false)) {
1450 struct inode *inode = vma->vm_file->f_mapping->host;
1451 loff_t offset = vaddr_to_offset(vma, bp_vaddr);
1452
1453 uprobe = find_uprobe(inode, offset);
1454 }
1455
1456 if (!uprobe)
1457 *is_swbp = is_swbp_at_addr(mm, bp_vaddr);
1458 } else {
1459 *is_swbp = -EFAULT;
1460 }
1461 up_read(&mm->mmap_sem);
1462
1463 return uprobe;
1464}
1465
1482/* 1466/*
1483 * Run handler and ask thread to singlestep. 1467 * Run handler and ask thread to singlestep.
1484 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1468 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
1485 */ 1469 */
1486static void handle_swbp(struct pt_regs *regs) 1470static void handle_swbp(struct pt_regs *regs)
1487{ 1471{
1488 struct vm_area_struct *vma;
1489 struct uprobe_task *utask; 1472 struct uprobe_task *utask;
1490 struct uprobe *uprobe; 1473 struct uprobe *uprobe;
1491 struct mm_struct *mm;
1492 unsigned long bp_vaddr; 1474 unsigned long bp_vaddr;
1475 int uninitialized_var(is_swbp);
1493 1476
1494 uprobe = NULL;
1495 bp_vaddr = uprobe_get_swbp_addr(regs); 1477 bp_vaddr = uprobe_get_swbp_addr(regs);
1496 mm = current->mm; 1478 uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
1497 down_read(&mm->mmap_sem);
1498 vma = find_vma(mm, bp_vaddr);
1499
1500 if (vma && vma->vm_start <= bp_vaddr && valid_vma(vma, false)) {
1501 struct inode *inode;
1502 loff_t offset;
1503
1504 inode = vma->vm_file->f_mapping->host;
1505 offset = bp_vaddr - vma->vm_start;
1506 offset += (vma->vm_pgoff << PAGE_SHIFT);
1507 uprobe = find_uprobe(inode, offset);
1508 }
1509
1510 srcu_read_unlock_raw(&uprobes_srcu, current->uprobe_srcu_id);
1511 current->uprobe_srcu_id = -1;
1512 up_read(&mm->mmap_sem);
1513 1479
1514 if (!uprobe) { 1480 if (!uprobe) {
1515 /* No matching uprobe; signal SIGTRAP. */ 1481 if (is_swbp > 0) {
1516 send_sig(SIGTRAP, current, 0); 1482 /* No matching uprobe; signal SIGTRAP. */
1483 send_sig(SIGTRAP, current, 0);
1484 } else {
1485 /*
1486 * Either we raced with uprobe_unregister() or we can't
1487 * access this memory. The latter is only possible if
1488 * another thread plays with our ->mm. In both cases
1489 * we can simply restart. If this vma was unmapped we
1490 * can pretend this insn was not executed yet and get
1491 * the (correct) SIGSEGV after restart.
1492 */
1493 instruction_pointer_set(regs, bp_vaddr);
1494 }
1517 return; 1495 return;
1518 } 1496 }
1519 1497
@@ -1620,7 +1598,6 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1620 utask->state = UTASK_BP_HIT; 1598 utask->state = UTASK_BP_HIT;
1621 1599
1622 set_thread_flag(TIF_UPROBE); 1600 set_thread_flag(TIF_UPROBE);
1623 current->uprobe_srcu_id = srcu_read_lock_raw(&uprobes_srcu);
1624 1601
1625 return 1; 1602 return 1;
1626} 1603}
@@ -1655,7 +1632,6 @@ static int __init init_uprobes(void)
1655 mutex_init(&uprobes_mutex[i]); 1632 mutex_init(&uprobes_mutex[i]);
1656 mutex_init(&uprobes_mmap_mutex[i]); 1633 mutex_init(&uprobes_mmap_mutex[i]);
1657 } 1634 }
1658 init_srcu_struct(&uprobes_srcu);
1659 1635
1660 return register_die_notifier(&uprobe_exception_nb); 1636 return register_die_notifier(&uprobe_exception_nb);
1661} 1637}
diff --git a/kernel/exit.c b/kernel/exit.c
index 2f59cc334516..f65345f9e5bb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -483,7 +483,7 @@ static void close_files(struct files_struct * files)
483 rcu_read_unlock(); 483 rcu_read_unlock();
484 for (;;) { 484 for (;;) {
485 unsigned long set; 485 unsigned long set;
486 i = j * __NFDBITS; 486 i = j * BITS_PER_LONG;
487 if (i >= fdt->max_fds) 487 if (i >= fdt->max_fds)
488 break; 488 break;
489 set = fdt->open_fds[j++]; 489 set = fdt->open_fds[j++];
@@ -953,14 +953,11 @@ void do_exit(long code)
953 exit_signals(tsk); /* sets PF_EXITING */ 953 exit_signals(tsk); /* sets PF_EXITING */
954 /* 954 /*
955 * tsk->flags are checked in the futex code to protect against 955 * tsk->flags are checked in the futex code to protect against
956 * an exiting task cleaning up the robust pi futexes, and in 956 * an exiting task cleaning up the robust pi futexes.
957 * task_work_add() to avoid the race with exit_task_work().
958 */ 957 */
959 smp_mb(); 958 smp_mb();
960 raw_spin_unlock_wait(&tsk->pi_lock); 959 raw_spin_unlock_wait(&tsk->pi_lock);
961 960
962 exit_task_work(tsk);
963
964 if (unlikely(in_atomic())) 961 if (unlikely(in_atomic()))
965 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 962 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
966 current->comm, task_pid_nr(current), 963 current->comm, task_pid_nr(current),
@@ -995,6 +992,7 @@ void do_exit(long code)
995 exit_shm(tsk); 992 exit_shm(tsk);
996 exit_files(tsk); 993 exit_files(tsk);
997 exit_fs(tsk); 994 exit_fs(tsk);
995 exit_task_work(tsk);
998 check_stack_usage(); 996 check_stack_usage();
999 exit_thread(); 997 exit_thread();
1000 998
diff --git a/kernel/fork.c b/kernel/fork.c
index f00e319d8376..2c8857e12855 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -114,6 +114,10 @@ int nr_processes(void)
114 return total; 114 return total;
115} 115}
116 116
117void __weak arch_release_task_struct(struct task_struct *tsk)
118{
119}
120
117#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 121#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
118static struct kmem_cache *task_struct_cachep; 122static struct kmem_cache *task_struct_cachep;
119 123
@@ -122,17 +126,17 @@ static inline struct task_struct *alloc_task_struct_node(int node)
122 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); 126 return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
123} 127}
124 128
125void __weak arch_release_task_struct(struct task_struct *tsk) { }
126
127static inline void free_task_struct(struct task_struct *tsk) 129static inline void free_task_struct(struct task_struct *tsk)
128{ 130{
129 arch_release_task_struct(tsk);
130 kmem_cache_free(task_struct_cachep, tsk); 131 kmem_cache_free(task_struct_cachep, tsk);
131} 132}
132#endif 133#endif
133 134
135void __weak arch_release_thread_info(struct thread_info *ti)
136{
137}
138
134#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR 139#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
135void __weak arch_release_thread_info(struct thread_info *ti) { }
136 140
137/* 141/*
138 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a 142 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@ -150,7 +154,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
150 154
151static inline void free_thread_info(struct thread_info *ti) 155static inline void free_thread_info(struct thread_info *ti)
152{ 156{
153 arch_release_thread_info(ti);
154 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 157 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
155} 158}
156# else 159# else
@@ -164,7 +167,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
164 167
165static void free_thread_info(struct thread_info *ti) 168static void free_thread_info(struct thread_info *ti)
166{ 169{
167 arch_release_thread_info(ti);
168 kmem_cache_free(thread_info_cache, ti); 170 kmem_cache_free(thread_info_cache, ti);
169} 171}
170 172
@@ -205,10 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account)
205void free_task(struct task_struct *tsk) 207void free_task(struct task_struct *tsk)
206{ 208{
207 account_kernel_stack(tsk->stack, -1); 209 account_kernel_stack(tsk->stack, -1);
210 arch_release_thread_info(tsk->stack);
208 free_thread_info(tsk->stack); 211 free_thread_info(tsk->stack);
209 rt_mutex_debug_task_free(tsk); 212 rt_mutex_debug_task_free(tsk);
210 ftrace_graph_exit_task(tsk); 213 ftrace_graph_exit_task(tsk);
211 put_seccomp_filter(tsk); 214 put_seccomp_filter(tsk);
215 arch_release_task_struct(tsk);
212 free_task_struct(tsk); 216 free_task_struct(tsk);
213} 217}
214EXPORT_SYMBOL(free_task); 218EXPORT_SYMBOL(free_task);
@@ -298,23 +302,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
298 return NULL; 302 return NULL;
299 303
300 ti = alloc_thread_info_node(tsk, node); 304 ti = alloc_thread_info_node(tsk, node);
301 if (!ti) { 305 if (!ti)
302 free_task_struct(tsk); 306 goto free_tsk;
303 return NULL;
304 }
305 307
306 err = arch_dup_task_struct(tsk, orig); 308 err = arch_dup_task_struct(tsk, orig);
309 if (err)
310 goto free_ti;
307 311
308 /*
309 * We defer looking at err, because we will need this setup
310 * for the clean up path to work correctly.
311 */
312 tsk->stack = ti; 312 tsk->stack = ti;
313 setup_thread_stack(tsk, orig);
314
315 if (err)
316 goto out;
317 313
314 setup_thread_stack(tsk, orig);
318 clear_user_return_notifier(tsk); 315 clear_user_return_notifier(tsk);
319 clear_tsk_need_resched(tsk); 316 clear_tsk_need_resched(tsk);
320 stackend = end_of_stack(tsk); 317 stackend = end_of_stack(tsk);
@@ -338,8 +335,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
338 335
339 return tsk; 336 return tsk;
340 337
341out: 338free_ti:
342 free_thread_info(ti); 339 free_thread_info(ti);
340free_tsk:
343 free_task_struct(tsk); 341 free_task_struct(tsk);
344 return NULL; 342 return NULL;
345} 343}
@@ -383,16 +381,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
383 struct file *file; 381 struct file *file;
384 382
385 if (mpnt->vm_flags & VM_DONTCOPY) { 383 if (mpnt->vm_flags & VM_DONTCOPY) {
386 long pages = vma_pages(mpnt);
387 mm->total_vm -= pages;
388 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 384 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
389 -pages); 385 -vma_pages(mpnt));
390 continue; 386 continue;
391 } 387 }
392 charge = 0; 388 charge = 0;
393 if (mpnt->vm_flags & VM_ACCOUNT) { 389 if (mpnt->vm_flags & VM_ACCOUNT) {
394 unsigned long len; 390 unsigned long len = vma_pages(mpnt);
395 len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 391
396 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ 392 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
397 goto fail_nomem; 393 goto fail_nomem;
398 charge = len; 394 charge = len;
@@ -459,8 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
459 if (retval) 455 if (retval)
460 goto out; 456 goto out;
461 457
462 if (file && uprobe_mmap(tmp)) 458 if (file)
463 goto out; 459 uprobe_mmap(tmp);
464 } 460 }
465 /* a new mm has just been created */ 461 /* a new mm has just been created */
466 arch_dup_mmap(oldmm, mm); 462 arch_dup_mmap(oldmm, mm);
@@ -1310,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1310#ifdef CONFIG_DEBUG_MUTEXES 1306#ifdef CONFIG_DEBUG_MUTEXES
1311 p->blocked_on = NULL; /* not blocked yet */ 1307 p->blocked_on = NULL; /* not blocked yet */
1312#endif 1308#endif
1313#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1309#ifdef CONFIG_MEMCG
1314 p->memcg_batch.do_batch = 0; 1310 p->memcg_batch.do_batch = 0;
1315 p->memcg_batch.memcg = NULL; 1311 p->memcg_batch.memcg = NULL;
1316#endif 1312#endif
@@ -1420,7 +1416,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1420 */ 1416 */
1421 p->group_leader = p; 1417 p->group_leader = p;
1422 INIT_LIST_HEAD(&p->thread_group); 1418 INIT_LIST_HEAD(&p->thread_group);
1423 INIT_HLIST_HEAD(&p->task_works); 1419 p->task_works = NULL;
1424 1420
1425 /* Now that the task is set up, run cgroup callbacks if 1421 /* Now that the task is set up, run cgroup callbacks if
1426 * necessary. We need to run them before the task is visible 1422 * necessary. We need to run them before the task is visible
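
The fork.c hunks above turn arch_release_task_struct() and arch_release_thread_info() into single __weak no-op defaults and call them once from free_task() instead of from each free_*() helper. A stand-alone reminder of how a weak default behaves (user-space C using the same GCC attribute that backs the kernel's __weak; the names here only mirror the kernel's):

#include <stdio.h>

struct task { int pid; };

/* Weak default: does nothing, like the generic kernel stub. An architecture
 * that needs a hook provides a non-weak definition of the same symbol in its
 * own object file, and the linker picks that one instead. */
__attribute__((weak)) void arch_release_task(struct task *t)
{
}

static void free_task(struct task *t)
{
	arch_release_task(t);		/* hook runs here, or no-ops */
	printf("freeing task %d\n", t->pid);
}

int main(void)
{
	struct task t = { .pid = 42 };

	free_task(&t);
	return 0;
}
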
diff --git a/kernel/futex.c b/kernel/futex.c
index e2b0fb9a0b3b..3717e7b306e0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2231 * @uaddr2: the pi futex we will take prior to returning to user-space 2231 * @uaddr2: the pi futex we will take prior to returning to user-space
2232 * 2232 *
2233 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2233 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2234 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and 2234 * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
2235 * complete the acquisition of the rt_mutex prior to returning to userspace. 2235 * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
2236 * This ensures the rt_mutex maintains an owner when it has waiters; without 2236 * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
2237 * one, the pi logic wouldn't know which task to boost/deboost, if there was a 2237 * without one, the pi logic would not know which task to boost/deboost, if
2238 * need to. 2238 * there was a need to.
2239 * 2239 *
2240 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2240 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2241 * via the following: 2241 * via the following:
@@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2272 struct futex_q q = futex_q_init; 2272 struct futex_q q = futex_q_init;
2273 int res, ret; 2273 int res, ret;
2274 2274
2275 if (uaddr == uaddr2)
2276 return -EINVAL;
2277
2275 if (!bitset) 2278 if (!bitset)
2276 return -EINVAL; 2279 return -EINVAL;
2277 2280
@@ -2343,7 +2346,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2343 * signal. futex_unlock_pi() will not destroy the lock_ptr nor 2346 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2344 * the pi_state. 2347 * the pi_state.
2345 */ 2348 */
2346 WARN_ON(!&q.pi_state); 2349 WARN_ON(!q.pi_state);
2347 pi_mutex = &q.pi_state->pi_mutex; 2350 pi_mutex = &q.pi_state->pi_mutex;
2348 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); 2351 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2349 debug_rt_mutex_free_waiter(&rt_waiter); 2352 debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2370,7 +2373,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2370 * fault, unlock the rt_mutex and return the fault to userspace. 2373 * fault, unlock the rt_mutex and return the fault to userspace.
2371 */ 2374 */
2372 if (ret == -EFAULT) { 2375 if (ret == -EFAULT) {
2373 if (rt_mutex_owner(pi_mutex) == current) 2376 if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
2374 rt_mutex_unlock(pi_mutex); 2377 rt_mutex_unlock(pi_mutex);
2375 } else if (ret == -EINTR) { 2378 } else if (ret == -EINTR) {
2376 /* 2379 /*
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index bdb180325551..131ca176b497 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,7 +133,7 @@ irqreturn_t
133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) 133handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
134{ 134{
135 irqreturn_t retval = IRQ_NONE; 135 irqreturn_t retval = IRQ_NONE;
136 unsigned int random = 0, irq = desc->irq_data.irq; 136 unsigned int flags = 0, irq = desc->irq_data.irq;
137 137
138 do { 138 do {
139 irqreturn_t res; 139 irqreturn_t res;
@@ -161,7 +161,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
161 161
162 /* Fall through to add to randomness */ 162 /* Fall through to add to randomness */
163 case IRQ_HANDLED: 163 case IRQ_HANDLED:
164 random |= action->flags; 164 flags |= action->flags;
165 break; 165 break;
166 166
167 default: 167 default:
@@ -172,8 +172,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
172 action = action->next; 172 action = action->next;
173 } while (action); 173 } while (action);
174 174
175 if (random & IRQF_SAMPLE_RANDOM) 175 add_interrupt_randomness(irq, flags);
176 add_interrupt_randomness(irq);
177 176
178 if (!noirqdebug) 177 if (!noirqdebug)
179 note_interrupt(irq, desc, retval); 178 note_interrupt(irq, desc, retval);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 41c1564103f1..49a77727db42 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -10,6 +10,7 @@
10#include <linux/mutex.h> 10#include <linux/mutex.h>
11#include <linux/of.h> 11#include <linux/of.h>
12#include <linux/of_address.h> 12#include <linux/of_address.h>
13#include <linux/topology.h>
13#include <linux/seq_file.h> 14#include <linux/seq_file.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/smp.h> 16#include <linux/smp.h>
@@ -45,7 +46,8 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node,
45{ 46{
46 struct irq_domain *domain; 47 struct irq_domain *domain;
47 48
48 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 49 domain = kzalloc_node(sizeof(*domain), GFP_KERNEL,
50 of_node_to_nid(of_node));
49 if (WARN_ON(!domain)) 51 if (WARN_ON(!domain))
50 return NULL; 52 return NULL;
51 53
@@ -138,6 +140,36 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
138} 140}
139 141
140/** 142/**
143 * irq_domain_add_simple() - Allocate and register a simple irq_domain.
144 * @of_node: pointer to interrupt controller's device tree node.
145 * @size: total number of irqs in mapping
146 * @first_irq: first number of irq block assigned to the domain
147 * @ops: map/unmap domain callbacks
148 * @host_data: Controller private data pointer
149 *
 150 * Allocates a legacy irq_domain if first_irq is positive or a linear
151 * domain otherwise.
152 *
153 * This is intended to implement the expected behaviour for most
154 * interrupt controllers which is that a linear mapping should
155 * normally be used unless the system requires a legacy mapping in
156 * order to support supplying interrupt numbers during non-DT
157 * registration of devices.
158 */
159struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
160 unsigned int size,
161 unsigned int first_irq,
162 const struct irq_domain_ops *ops,
163 void *host_data)
164{
165 if (first_irq > 0)
166 return irq_domain_add_legacy(of_node, size, first_irq, 0,
167 ops, host_data);
168 else
169 return irq_domain_add_linear(of_node, size, ops, host_data);
170}
171
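
irq_domain_add_simple() above just dispatches: a positive first_irq selects a legacy domain over the pre-allocated irq block, otherwise a linear domain is created and virqs are allocated on demand. A rough sketch of how an interrupt-controller driver might call it at probe time; the driver function, the hwirq count of 32 and the use of irq_domain_simple_ops are assumptions for illustration, not part of this patch:

#include <linux/errno.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

/* Hypothetical irqchip probe path (illustrative only). */
static int example_intc_probe(struct device_node *np, int first_irq)
{
	struct irq_domain *domain;

	/*
	 * With first_irq > 0 (board code handed us a fixed irq base) this
	 * becomes a legacy domain; with first_irq == 0 it is linear and
	 * virqs are created on demand by irq_create_mapping().
	 */
	domain = irq_domain_add_simple(np, 32, first_irq,
				       &irq_domain_simple_ops, NULL);
	if (!domain)
		return -ENOMEM;

	return 0;
}
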
172/**
141 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. 173 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
142 * @of_node: pointer to interrupt controller's device tree node. 174 * @of_node: pointer to interrupt controller's device tree node.
143 * @size: total number of irqs in legacy mapping 175 * @size: total number of irqs in legacy mapping
@@ -203,7 +235,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
203 * one can then use irq_create_mapping() to 235 * one can then use irq_create_mapping() to
204 * explicitly change them 236 * explicitly change them
205 */ 237 */
206 ops->map(domain, irq, hwirq); 238 if (ops->map)
239 ops->map(domain, irq, hwirq);
207 240
208 /* Clear norequest flags */ 241 /* Clear norequest flags */
209 irq_clear_status_flags(irq, IRQ_NOREQUEST); 242 irq_clear_status_flags(irq, IRQ_NOREQUEST);
@@ -215,7 +248,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
215EXPORT_SYMBOL_GPL(irq_domain_add_legacy); 248EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
216 249
217/** 250/**
218 * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. 251 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
219 * @of_node: pointer to interrupt controller's device tree node. 252 * @of_node: pointer to interrupt controller's device tree node.
220 * @size: Number of interrupts in the domain. 253 * @size: Number of interrupts in the domain.
221 * @ops: map/unmap domain callbacks 254 * @ops: map/unmap domain callbacks
@@ -229,7 +262,8 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
229 struct irq_domain *domain; 262 struct irq_domain *domain;
230 unsigned int *revmap; 263 unsigned int *revmap;
231 264
232 revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); 265 revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL,
266 of_node_to_nid(of_node));
233 if (WARN_ON(!revmap)) 267 if (WARN_ON(!revmap))
234 return NULL; 268 return NULL;
235 269
@@ -330,24 +364,112 @@ void irq_set_default_host(struct irq_domain *domain)
330} 364}
331EXPORT_SYMBOL_GPL(irq_set_default_host); 365EXPORT_SYMBOL_GPL(irq_set_default_host);
332 366
333static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, 367static void irq_domain_disassociate_many(struct irq_domain *domain,
334 irq_hw_number_t hwirq) 368 unsigned int irq_base, int count)
335{ 369{
336 struct irq_data *irq_data = irq_get_irq_data(virq); 370 /*
371 * disassociate in reverse order;
372 * not strictly necessary, but nice for unwinding
373 */
374 while (count--) {
375 int irq = irq_base + count;
376 struct irq_data *irq_data = irq_get_irq_data(irq);
377 irq_hw_number_t hwirq = irq_data->hwirq;
378
379 if (WARN_ON(!irq_data || irq_data->domain != domain))
380 continue;
381
382 irq_set_status_flags(irq, IRQ_NOREQUEST);
383
384 /* remove chip and handler */
385 irq_set_chip_and_handler(irq, NULL, NULL);
386
387 /* Make sure it's completed */
388 synchronize_irq(irq);
389
390 /* Tell the PIC about it */
391 if (domain->ops->unmap)
392 domain->ops->unmap(domain, irq);
393 smp_mb();
337 394
338 irq_data->hwirq = hwirq;
339 irq_data->domain = domain;
340 if (domain->ops->map(domain, virq, hwirq)) {
341 pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq);
342 irq_data->domain = NULL; 395 irq_data->domain = NULL;
343 irq_data->hwirq = 0; 396 irq_data->hwirq = 0;
344 return -1; 397
398 /* Clear reverse map */
399 switch(domain->revmap_type) {
400 case IRQ_DOMAIN_MAP_LINEAR:
401 if (hwirq < domain->revmap_data.linear.size)
402 domain->revmap_data.linear.revmap[hwirq] = 0;
403 break;
404 case IRQ_DOMAIN_MAP_TREE:
405 mutex_lock(&revmap_trees_mutex);
406 radix_tree_delete(&domain->revmap_data.tree, hwirq);
407 mutex_unlock(&revmap_trees_mutex);
408 break;
409 }
345 } 410 }
411}
412
413int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
414 irq_hw_number_t hwirq_base, int count)
415{
416 unsigned int virq = irq_base;
417 irq_hw_number_t hwirq = hwirq_base;
418 int i, ret;
419
420 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
421 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
422
423 for (i = 0; i < count; i++) {
424 struct irq_data *irq_data = irq_get_irq_data(virq + i);
425
426 if (WARN(!irq_data, "error: irq_desc not allocated; "
427 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
428 return -EINVAL;
429 if (WARN(irq_data->domain, "error: irq_desc already associated; "
430 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
431 return -EINVAL;
432 };
433
434 for (i = 0; i < count; i++, virq++, hwirq++) {
435 struct irq_data *irq_data = irq_get_irq_data(virq);
436
437 irq_data->hwirq = hwirq;
438 irq_data->domain = domain;
439 if (domain->ops->map) {
440 ret = domain->ops->map(domain, virq, hwirq);
441 if (ret != 0) {
442 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
443 virq, hwirq, ret);
444 WARN_ON(1);
445 irq_data->domain = NULL;
446 irq_data->hwirq = 0;
447 goto err_unmap;
448 }
449 }
346 450
347 irq_clear_status_flags(virq, IRQ_NOREQUEST); 451 switch (domain->revmap_type) {
452 case IRQ_DOMAIN_MAP_LINEAR:
453 if (hwirq < domain->revmap_data.linear.size)
454 domain->revmap_data.linear.revmap[hwirq] = virq;
455 break;
456 case IRQ_DOMAIN_MAP_TREE:
457 mutex_lock(&revmap_trees_mutex);
458 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
459 mutex_unlock(&revmap_trees_mutex);
460 break;
461 }
462
463 irq_clear_status_flags(virq, IRQ_NOREQUEST);
464 }
348 465
349 return 0; 466 return 0;
467
468 err_unmap:
469 irq_domain_disassociate_many(domain, irq_base, i);
470 return -EINVAL;
350} 471}
472EXPORT_SYMBOL_GPL(irq_domain_associate_many);
351 473
352/** 474/**
353 * irq_create_direct_mapping() - Allocate an irq for direct mapping 475 * irq_create_direct_mapping() - Allocate an irq for direct mapping
@@ -364,10 +486,10 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
364 if (domain == NULL) 486 if (domain == NULL)
365 domain = irq_default_domain; 487 domain = irq_default_domain;
366 488
367 BUG_ON(domain == NULL); 489 if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP))
368 WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); 490 return 0;
369 491
370 virq = irq_alloc_desc_from(1, 0); 492 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
371 if (!virq) { 493 if (!virq) {
372 pr_debug("create_direct virq allocation failed\n"); 494 pr_debug("create_direct virq allocation failed\n");
373 return 0; 495 return 0;
@@ -380,7 +502,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
380 } 502 }
381 pr_debug("create_direct obtained virq %d\n", virq); 503 pr_debug("create_direct obtained virq %d\n", virq);
382 504
383 if (irq_setup_virq(domain, virq, virq)) { 505 if (irq_domain_associate(domain, virq, virq)) {
384 irq_free_desc(virq); 506 irq_free_desc(virq);
385 return 0; 507 return 0;
386 } 508 }
@@ -433,27 +555,64 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
433 hint = hwirq % nr_irqs; 555 hint = hwirq % nr_irqs;
434 if (hint == 0) 556 if (hint == 0)
435 hint++; 557 hint++;
436 virq = irq_alloc_desc_from(hint, 0); 558 virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
437 if (virq <= 0) 559 if (virq <= 0)
438 virq = irq_alloc_desc_from(1, 0); 560 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
439 if (virq <= 0) { 561 if (virq <= 0) {
440 pr_debug("-> virq allocation failed\n"); 562 pr_debug("-> virq allocation failed\n");
441 return 0; 563 return 0;
442 } 564 }
443 565
444 if (irq_setup_virq(domain, virq, hwirq)) { 566 if (irq_domain_associate(domain, virq, hwirq)) {
445 if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) 567 irq_free_desc(virq);
446 irq_free_desc(virq);
447 return 0; 568 return 0;
448 } 569 }
449 570
450 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", 571 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
451 hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); 572 hwirq, of_node_full_name(domain->of_node), virq);
452 573
453 return virq; 574 return virq;
454} 575}
455EXPORT_SYMBOL_GPL(irq_create_mapping); 576EXPORT_SYMBOL_GPL(irq_create_mapping);
456 577
578/**
579 * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
580 * @domain: domain owning the interrupt range
581 * @irq_base: beginning of linux IRQ range
582 * @hwirq_base: beginning of hardware IRQ range
583 * @count: Number of interrupts to map
584 *
585 * This routine is used for allocating and mapping a range of hardware
586 * irqs to linux irqs where the linux irq numbers are at pre-defined
587 * locations. For use by controllers that already have static mappings
588 * to insert in to the domain.
589 *
590 * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
591 * domain insertion.
592 *
593 * 0 is returned upon success, while any failure to establish a static
594 * mapping is treated as an error.
595 */
596int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
597 irq_hw_number_t hwirq_base, int count)
598{
599 int ret;
600
601 ret = irq_alloc_descs(irq_base, irq_base, count,
602 of_node_to_nid(domain->of_node));
603 if (unlikely(ret < 0))
604 return ret;
605
606 ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count);
607 if (unlikely(ret < 0)) {
608 irq_free_descs(irq_base, count);
609 return ret;
610 }
611
612 return 0;
613}
614EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
615
457unsigned int irq_create_of_mapping(struct device_node *controller, 616unsigned int irq_create_of_mapping(struct device_node *controller,
458 const u32 *intspec, unsigned int intsize) 617 const u32 *intspec, unsigned int intsize)
459{ 618{
@@ -477,7 +636,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
477 return intspec[0]; 636 return intspec[0];
478#endif 637#endif
479 pr_warning("no irq domain found for %s !\n", 638 pr_warning("no irq domain found for %s !\n",
480 controller->full_name); 639 of_node_full_name(controller));
481 return 0; 640 return 0;
482 } 641 }
483 642
@@ -511,7 +670,6 @@ void irq_dispose_mapping(unsigned int virq)
511{ 670{
512 struct irq_data *irq_data = irq_get_irq_data(virq); 671 struct irq_data *irq_data = irq_get_irq_data(virq);
513 struct irq_domain *domain; 672 struct irq_domain *domain;
514 irq_hw_number_t hwirq;
515 673
516 if (!virq || !irq_data) 674 if (!virq || !irq_data)
517 return; 675 return;
@@ -524,33 +682,7 @@ void irq_dispose_mapping(unsigned int virq)
524 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) 682 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
525 return; 683 return;
526 684
527 irq_set_status_flags(virq, IRQ_NOREQUEST); 685 irq_domain_disassociate_many(domain, virq, 1);
528
529 /* remove chip and handler */
530 irq_set_chip_and_handler(virq, NULL, NULL);
531
532 /* Make sure it's completed */
533 synchronize_irq(virq);
534
535 /* Tell the PIC about it */
536 if (domain->ops->unmap)
537 domain->ops->unmap(domain, virq);
538 smp_mb();
539
540 /* Clear reverse map */
541 hwirq = irq_data->hwirq;
542 switch(domain->revmap_type) {
543 case IRQ_DOMAIN_MAP_LINEAR:
544 if (hwirq < domain->revmap_data.linear.size)
545 domain->revmap_data.linear.revmap[hwirq] = 0;
546 break;
547 case IRQ_DOMAIN_MAP_TREE:
548 mutex_lock(&revmap_trees_mutex);
549 radix_tree_delete(&domain->revmap_data.tree, hwirq);
550 mutex_unlock(&revmap_trees_mutex);
551 break;
552 }
553
554 irq_free_desc(virq); 686 irq_free_desc(virq);
555} 687}
556EXPORT_SYMBOL_GPL(irq_dispose_mapping); 688EXPORT_SYMBOL_GPL(irq_dispose_mapping);
@@ -559,16 +691,11 @@ EXPORT_SYMBOL_GPL(irq_dispose_mapping);
559 * irq_find_mapping() - Find a linux irq from an hw irq number. 691 * irq_find_mapping() - Find a linux irq from an hw irq number.
560 * @domain: domain owning this hardware interrupt 692 * @domain: domain owning this hardware interrupt
561 * @hwirq: hardware irq number in that domain space 693 * @hwirq: hardware irq number in that domain space
562 *
563 * This is a slow path, for use by generic code. It's expected that an
564 * irq controller implementation directly calls the appropriate low level
565 * mapping function.
566 */ 694 */
567unsigned int irq_find_mapping(struct irq_domain *domain, 695unsigned int irq_find_mapping(struct irq_domain *domain,
568 irq_hw_number_t hwirq) 696 irq_hw_number_t hwirq)
569{ 697{
570 unsigned int i; 698 struct irq_data *data;
571 unsigned int hint = hwirq % nr_irqs;
572 699
573 /* Look for default domain if necessary */ 700
574 if (domain == NULL) 701 if (domain == NULL)
@@ -576,115 +703,47 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
576 if (domain == NULL) 703 if (domain == NULL)
577 return 0; 704 return 0;
578 705
579 /* legacy -> bail early */ 706 switch (domain->revmap_type) {
580 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) 707 case IRQ_DOMAIN_MAP_LEGACY:
581 return irq_domain_legacy_revmap(domain, hwirq); 708 return irq_domain_legacy_revmap(domain, hwirq);
582 709 case IRQ_DOMAIN_MAP_LINEAR:
583 /* Slow path does a linear search of the map */ 710 return irq_linear_revmap(domain, hwirq);
584 if (hint == 0) 711 case IRQ_DOMAIN_MAP_TREE:
585 hint = 1; 712 rcu_read_lock();
586 i = hint; 713 data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
587 do { 714 rcu_read_unlock();
588 struct irq_data *data = irq_get_irq_data(i); 715 if (data)
716 return data->irq;
717 break;
718 case IRQ_DOMAIN_MAP_NOMAP:
719 data = irq_get_irq_data(hwirq);
589 if (data && (data->domain == domain) && (data->hwirq == hwirq)) 720 if (data && (data->domain == domain) && (data->hwirq == hwirq))
590 return i; 721 return hwirq;
591 i++; 722 break;
592 if (i >= nr_irqs) 723 }
593 i = 1; 724
594 } while(i != hint);
595 return 0; 725 return 0;
596} 726}
597EXPORT_SYMBOL_GPL(irq_find_mapping); 727EXPORT_SYMBOL_GPL(irq_find_mapping);
598 728
599/** 729/**
600 * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number.
601 * @domain: domain owning this hardware interrupt
602 * @hwirq: hardware irq number in that domain space
603 *
604 * This is a fast path, for use by irq controller code that uses radix tree
605 * revmaps
606 */
607unsigned int irq_radix_revmap_lookup(struct irq_domain *domain,
608 irq_hw_number_t hwirq)
609{
610 struct irq_data *irq_data;
611
612 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
613 return irq_find_mapping(domain, hwirq);
614
615 /*
616 * Freeing an irq can delete nodes along the path to
617 * do the lookup via call_rcu.
618 */
619 rcu_read_lock();
620 irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
621 rcu_read_unlock();
622
623 /*
624 * If found in radix tree, then fine.
625 * Else fallback to linear lookup - this should not happen in practice
626 * as it means that we failed to insert the node in the radix tree.
627 */
628 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);
629}
630EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup);
631
632/**
633 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping.
634 * @domain: domain owning this hardware interrupt
635 * @virq: linux irq number
636 * @hwirq: hardware irq number in that domain space
637 *
638 * This is for use by irq controllers that use a radix tree reverse
639 * mapping for fast lookup.
640 */
641void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq,
642 irq_hw_number_t hwirq)
643{
644 struct irq_data *irq_data = irq_get_irq_data(virq);
645
646 if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
647 return;
648
649 if (virq) {
650 mutex_lock(&revmap_trees_mutex);
651 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
652 mutex_unlock(&revmap_trees_mutex);
653 }
654}
655EXPORT_SYMBOL_GPL(irq_radix_revmap_insert);
656
657/**
658 * irq_linear_revmap() - Find a linux irq from a hw irq number. 730 * irq_linear_revmap() - Find a linux irq from a hw irq number.
659 * @domain: domain owning this hardware interrupt 731 * @domain: domain owning this hardware interrupt
660 * @hwirq: hardware irq number in that domain space 732 * @hwirq: hardware irq number in that domain space
661 * 733 *
662 * This is a fast path, for use by irq controller code that uses linear 734 * This is a fast path that can be called directly by irq controller code to
663 * revmaps. It does fallback to the slow path if the revmap doesn't exist 735 * save a handful of instructions.
664 * yet and will create the revmap entry with appropriate locking
665 */ 736 */
666unsigned int irq_linear_revmap(struct irq_domain *domain, 737unsigned int irq_linear_revmap(struct irq_domain *domain,
667 irq_hw_number_t hwirq) 738 irq_hw_number_t hwirq)
668{ 739{
669 unsigned int *revmap; 740 BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR);
670
671 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR))
672 return irq_find_mapping(domain, hwirq);
673 741
674 /* Check revmap bounds */ 742 /* Check revmap bounds; complain if exceeded */
675 if (unlikely(hwirq >= domain->revmap_data.linear.size)) 743 if (WARN_ON(hwirq >= domain->revmap_data.linear.size))
676 return irq_find_mapping(domain, hwirq); 744 return 0;
677
678 /* Check if revmap was allocated */
679 revmap = domain->revmap_data.linear.revmap;
680 if (unlikely(revmap == NULL))
681 return irq_find_mapping(domain, hwirq);
682
683 /* Fill up revmap with slow path if no mapping found */
684 if (unlikely(!revmap[hwirq]))
685 revmap[hwirq] = irq_find_mapping(domain, hwirq);
686 745
687 return revmap[hwirq]; 746 return domain->revmap_data.linear.revmap[hwirq];
688} 747}
689EXPORT_SYMBOL_GPL(irq_linear_revmap); 748EXPORT_SYMBOL_GPL(irq_linear_revmap);
690 749
@@ -725,8 +784,8 @@ static int virq_debug_show(struct seq_file *m, void *private)
725 data = irq_desc_get_chip_data(desc); 784 data = irq_desc_get_chip_data(desc);
726 seq_printf(m, data ? "0x%p " : " %p ", data); 785 seq_printf(m, data ? "0x%p " : " %p ", data);
727 786
728 if (desc->irq_data.domain && desc->irq_data.domain->of_node) 787 if (desc->irq_data.domain)
729 p = desc->irq_data.domain->of_node->full_name; 788 p = of_node_full_name(desc->irq_data.domain->of_node);
730 else 789 else
731 p = none; 790 p = none;
732 seq_printf(m, "%s\n", p); 791 seq_printf(m, "%s\n", p);
@@ -761,12 +820,6 @@ static int __init irq_debugfs_init(void)
761__initcall(irq_debugfs_init); 820__initcall(irq_debugfs_init);
762#endif /* CONFIG_IRQ_DOMAIN_DEBUG */ 821#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
763 822
764static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
765 irq_hw_number_t hwirq)
766{
767 return 0;
768}
769
770/** 823/**
771 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings 824 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
772 * 825 *
@@ -829,7 +882,6 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d,
829EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); 882EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
830 883
831const struct irq_domain_ops irq_domain_simple_ops = { 884const struct irq_domain_ops irq_domain_simple_ops = {
832 .map = irq_domain_simple_map,
833 .xlate = irq_domain_xlate_onetwocell, 885 .xlate = irq_domain_xlate_onetwocell,
834}; 886};
835EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 887EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
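For reference, a minimal sketch of how a controller driver might use the reworked association API above (irq_domain_add_linear(), irq_create_strict_mappings(), irq_linear_revmap()); the foo_* names, FOO_IRQ_BASE and the interrupt counts are invented for illustration and are not part of this patch:

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

#define FOO_NR_IRQS	32	/* hypothetical controller size */
#define FOO_IRQ_BASE	64	/* hypothetical pre-allocated Linux irq base */

static int foo_irq_map(struct irq_domain *d, unsigned int virq,
		       irq_hw_number_t hwirq)
{
	/* per-irq setup; a real driver would install its own irq_chip */
	irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
	return 0;
}

static const struct irq_domain_ops foo_domain_ops = {
	.map	= foo_irq_map,
	.xlate	= irq_domain_xlate_onecell,
};

static struct irq_domain *foo_domain;

static int foo_init(struct device_node *np)
{
	foo_domain = irq_domain_add_linear(np, FOO_NR_IRQS,
					   &foo_domain_ops, NULL);
	if (!foo_domain)
		return -ENOMEM;

	/* pin hwirqs 0..15 onto the fixed Linux irq range */
	return irq_create_strict_mappings(foo_domain, FOO_IRQ_BASE, 0, 16);
}

static void foo_handle_hwirq(unsigned long hwirq)
{
	/* fast-path reverse map; now returns 0 (and WARNs) if out of range */
	unsigned int virq = irq_linear_revmap(foo_domain, hwirq);

	if (virq)
		generic_handle_irq(virq);
}

The same lookup is also reachable through the generic irq_find_mapping() switch above; irq_linear_revmap() simply skips that dispatch for linear domains.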
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8c548232ba39..4c69326aa773 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -781,7 +781,7 @@ static void wake_threads_waitq(struct irq_desc *desc)
781 wake_up(&desc->wait_for_threads); 781 wake_up(&desc->wait_for_threads);
782} 782}
783 783
784static void irq_thread_dtor(struct task_work *unused) 784static void irq_thread_dtor(struct callback_head *unused)
785{ 785{
786 struct task_struct *tsk = current; 786 struct task_struct *tsk = current;
787 struct irq_desc *desc; 787 struct irq_desc *desc;
@@ -813,7 +813,7 @@ static void irq_thread_dtor(struct task_work *unused)
813 */ 813 */
814static int irq_thread(void *data) 814static int irq_thread(void *data)
815{ 815{
816 struct task_work on_exit_work; 816 struct callback_head on_exit_work;
817 static const struct sched_param param = { 817 static const struct sched_param param = {
818 .sched_priority = MAX_USER_RT_PRIO/2, 818 .sched_priority = MAX_USER_RT_PRIO/2,
819 }; 819 };
@@ -830,7 +830,7 @@ static int irq_thread(void *data)
830 830
831 sched_setscheduler(current, SCHED_FIFO, &param); 831 sched_setscheduler(current, SCHED_FIFO, &param);
832 832
833 init_task_work(&on_exit_work, irq_thread_dtor, NULL); 833 init_task_work(&on_exit_work, irq_thread_dtor);
834 task_work_add(current, &on_exit_work, false); 834 task_work_add(current, &on_exit_work, false);
835 835
836 while (!irq_wait_for_interrupt(action)) { 836 while (!irq_wait_for_interrupt(action)) {
@@ -893,22 +893,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
893 return -ENOSYS; 893 return -ENOSYS;
894 if (!try_module_get(desc->owner)) 894 if (!try_module_get(desc->owner))
895 return -ENODEV; 895 return -ENODEV;
896 /*
897 * Some drivers like serial.c use request_irq() heavily,
898 * so we have to be careful not to interfere with a
899 * running system.
900 */
901 if (new->flags & IRQF_SAMPLE_RANDOM) {
902 /*
903 * This function might sleep, we want to call it first,
904 * outside of the atomic block.
905 * Yes, this might clear the entropy pool if the wrong
906 * driver is attempted to be loaded, without actually
907 * installing a new handler, but is this really a problem,
908 * only the sysadmin is able to do this.
909 */
910 rand_initialize_irq(irq);
911 }
912 896
913 /* 897 /*
914 * Check whether the interrupt nests into another interrupt 898 * Check whether the interrupt nests into another interrupt
@@ -960,6 +944,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
960 } 944 }
961 945
962 /* 946 /*
947 * Drivers are often written to work w/o knowledge about the
948 * underlying irq chip implementation, so a request for a
949 * threaded irq without a primary hard irq context handler
950 * requires the ONESHOT flag to be set. Some irq chips like
951 * MSI based interrupts are per se one shot safe. Check the
952 * chip flags, so we can avoid the unmask dance at the end of
953 * the threaded handler for those.
954 */
955 if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
956 new->flags &= ~IRQF_ONESHOT;
957
958 /*
963 * The following block of code has to be executed atomically 959 * The following block of code has to be executed atomically
964 */ 960 */
965 raw_spin_lock_irqsave(&desc->lock, flags); 961 raw_spin_lock_irqsave(&desc->lock, flags);
@@ -1033,7 +1029,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1033 */ 1029 */
1034 new->thread_mask = 1 << ffz(thread_mask); 1030 new->thread_mask = 1 << ffz(thread_mask);
1035 1031
1036 } else if (new->handler == irq_default_primary_handler) { 1032 } else if (new->handler == irq_default_primary_handler &&
1033 !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
1037 /* 1034 /*
1038 * The interrupt was requested with handler = NULL, so 1035 * The interrupt was requested with handler = NULL, so
1039 * we use the default primary handler for it. But it 1036 * we use the default primary handler for it. But it
@@ -1354,7 +1351,6 @@ EXPORT_SYMBOL(free_irq);
1354 * Flags: 1351 * Flags:
1355 * 1352 *
1356 * IRQF_SHARED Interrupt is shared 1353 * IRQF_SHARED Interrupt is shared
1357 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1358 * IRQF_TRIGGER_* Specify active edge(s) or level 1354 * IRQF_TRIGGER_* Specify active edge(s) or level
1359 * 1355 *
1360 */ 1356 */
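A hedged illustration of the case the IRQCHIP_ONESHOT_SAFE handling above targets: a request with no primary handler, where the core normally insists on IRQF_ONESHOT and now silently drops it again for one-shot-safe chips such as MSI. The foo_* names are invented:

#include <linux/interrupt.h>

static irqreturn_t foo_irq_thread(int irq, void *dev_id)
{
	/* sleepable handling runs in the irq thread */
	return IRQ_HANDLED;
}

static int foo_request_irq(unsigned int irq, void *dev)
{
	/*
	 * handler == NULL means the default primary handler is used, so
	 * IRQF_ONESHOT must be passed; __setup_irq() clears it again when
	 * the chip advertises IRQCHIP_ONESHOT_SAFE.
	 */
	return request_threaded_irq(irq, NULL, foo_irq_thread,
				    IRQF_ONESHOT, "foo", dev);
}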
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4e2e472f6aeb..0668d58d6413 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1424,7 +1424,7 @@ static void update_vmcoreinfo_note(void)
1424 1424
1425void crash_save_vmcoreinfo(void) 1425void crash_save_vmcoreinfo(void)
1426{ 1426{
1427 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); 1427 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1428 update_vmcoreinfo_note(); 1428 update_vmcoreinfo_note();
1429} 1429}
1430 1430
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ff2c7cb86d77..6f99aead66c6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,6 +45,13 @@ extern int max_threads;
45 45
46static struct workqueue_struct *khelper_wq; 46static struct workqueue_struct *khelper_wq;
47 47
48/*
49 * kmod_thread_locker is used for deadlock avoidance. There is no explicit
50 * locking to protect this global - it is private to the singleton khelper
51 * thread and should only ever be modified by that thread.
52 */
53static const struct task_struct *kmod_thread_locker;
54
48#define CAP_BSET (void *)1 55#define CAP_BSET (void *)1
49#define CAP_PI (void *)2 56#define CAP_PI (void *)2
50 57
@@ -221,6 +228,13 @@ fail:
221 return 0; 228 return 0;
222} 229}
223 230
231static int call_helper(void *data)
232{
233 /* Worker thread started blocking khelper thread. */
234 kmod_thread_locker = current;
235 return ____call_usermodehelper(data);
236}
237
224static void call_usermodehelper_freeinfo(struct subprocess_info *info) 238static void call_usermodehelper_freeinfo(struct subprocess_info *info)
225{ 239{
226 if (info->cleanup) 240 if (info->cleanup)
@@ -295,9 +309,12 @@ static void __call_usermodehelper(struct work_struct *work)
295 if (wait == UMH_WAIT_PROC) 309 if (wait == UMH_WAIT_PROC)
296 pid = kernel_thread(wait_for_helper, sub_info, 310 pid = kernel_thread(wait_for_helper, sub_info,
297 CLONE_FS | CLONE_FILES | SIGCHLD); 311 CLONE_FS | CLONE_FILES | SIGCHLD);
298 else 312 else {
299 pid = kernel_thread(____call_usermodehelper, sub_info, 313 pid = kernel_thread(call_helper, sub_info,
300 CLONE_VFORK | SIGCHLD); 314 CLONE_VFORK | SIGCHLD);
315 /* Worker thread stopped blocking khelper thread. */
316 kmod_thread_locker = NULL;
317 }
301 318
302 switch (wait) { 319 switch (wait) {
303 case UMH_NO_WAIT: 320 case UMH_NO_WAIT:
@@ -548,6 +565,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
548 retval = -EBUSY; 565 retval = -EBUSY;
549 goto out; 566 goto out;
550 } 567 }
568 /*
569 * A worker thread created with the CLONE_VFORK flag must not wait for
570 * the khelper thread at the wait_for_completion() below, because the
571 * khelper thread is already waiting for that thread at
572 * wait_for_completion() in do_fork().
573 */
574 if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
575 retval = -EBUSY;
576 goto out;
577 }
551 578
552 sub_info->complete = &done; 579 sub_info->complete = &done;
553 sub_info->wait = wait; 580 sub_info->wait = wait;
@@ -577,6 +604,12 @@ unlock:
577 return retval; 604 return retval;
578} 605}
579 606
607/*
608 * call_usermodehelper_fns() will not run the caller-provided cleanup function
609 * if a memory allocation failure is experienced. So the caller might need to
610 * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform
611 * the necessary cleanup within the caller.
612 */
580int call_usermodehelper_fns( 613int call_usermodehelper_fns(
581 char *path, char **argv, char **envp, int wait, 614 char *path, char **argv, char **envp, int wait,
582 int (*init)(struct subprocess_info *info, struct cred *new), 615 int (*init)(struct subprocess_info *info, struct cred *new),
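A sketch of the -ENOMEM caveat documented above; it assumes the cleanup/data tail of the call_usermodehelper_fns() prototype (not visible in this hunk) and uses made-up foo_* names and a made-up helper path:

#include <linux/kmod.h>
#include <linux/slab.h>

static void foo_umh_cleanup(struct subprocess_info *info)
{
	kfree(info->data);
}

static int foo_run_helper(void)
{
	char *argv[] = { "/sbin/foo-helper", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	void *data;
	int ret;

	data = kzalloc(64, GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_PROC,
				      NULL, foo_umh_cleanup, data);
	if (ret == -ENOMEM)
		kfree(data);	/* cleanup() was never called in this case */
	return ret;
}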
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de633702e..b579af57ea10 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -360,16 +360,12 @@ repeat:
360 struct kthread_work, node); 360 struct kthread_work, node);
361 list_del_init(&work->node); 361 list_del_init(&work->node);
362 } 362 }
363 worker->current_work = work;
363 spin_unlock_irq(&worker->lock); 364 spin_unlock_irq(&worker->lock);
364 365
365 if (work) { 366 if (work) {
366 __set_current_state(TASK_RUNNING); 367 __set_current_state(TASK_RUNNING);
367 work->func(work); 368 work->func(work);
368 smp_wmb(); /* wmb worker-b0 paired with flush-b1 */
369 work->done_seq = work->queue_seq;
370 smp_mb(); /* mb worker-b1 paired with flush-b0 */
371 if (atomic_read(&work->flushing))
372 wake_up_all(&work->done);
373 } else if (!freezing(current)) 369 } else if (!freezing(current))
374 schedule(); 370 schedule();
375 371
@@ -378,6 +374,19 @@ repeat:
378} 374}
379EXPORT_SYMBOL_GPL(kthread_worker_fn); 375EXPORT_SYMBOL_GPL(kthread_worker_fn);
380 376
377/* insert @work before @pos in @worker */
378static void insert_kthread_work(struct kthread_worker *worker,
379 struct kthread_work *work,
380 struct list_head *pos)
381{
382 lockdep_assert_held(&worker->lock);
383
384 list_add_tail(&work->node, pos);
385 work->worker = worker;
386 if (likely(worker->task))
387 wake_up_process(worker->task);
388}
389
381/** 390/**
382 * queue_kthread_work - queue a kthread_work 391 * queue_kthread_work - queue a kthread_work
383 * @worker: target kthread_worker 392 * @worker: target kthread_worker
@@ -395,10 +404,7 @@ bool queue_kthread_work(struct kthread_worker *worker,
395 404
396 spin_lock_irqsave(&worker->lock, flags); 405 spin_lock_irqsave(&worker->lock, flags);
397 if (list_empty(&work->node)) { 406 if (list_empty(&work->node)) {
398 list_add_tail(&work->node, &worker->work_list); 407 insert_kthread_work(worker, work, &worker->work_list);
399 work->queue_seq++;
400 if (likely(worker->task))
401 wake_up_process(worker->task);
402 ret = true; 408 ret = true;
403 } 409 }
404 spin_unlock_irqrestore(&worker->lock, flags); 410 spin_unlock_irqrestore(&worker->lock, flags);
@@ -406,6 +412,18 @@ bool queue_kthread_work(struct kthread_worker *worker,
406} 412}
407EXPORT_SYMBOL_GPL(queue_kthread_work); 413EXPORT_SYMBOL_GPL(queue_kthread_work);
408 414
415struct kthread_flush_work {
416 struct kthread_work work;
417 struct completion done;
418};
419
420static void kthread_flush_work_fn(struct kthread_work *work)
421{
422 struct kthread_flush_work *fwork =
423 container_of(work, struct kthread_flush_work, work);
424 complete(&fwork->done);
425}
426
409/** 427/**
410 * flush_kthread_work - flush a kthread_work 428 * flush_kthread_work - flush a kthread_work
411 * @work: work to flush 429 * @work: work to flush
@@ -414,39 +432,37 @@ EXPORT_SYMBOL_GPL(queue_kthread_work);
414 */ 432 */
415void flush_kthread_work(struct kthread_work *work) 433void flush_kthread_work(struct kthread_work *work)
416{ 434{
417 int seq = work->queue_seq; 435 struct kthread_flush_work fwork = {
418 436 KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
419 atomic_inc(&work->flushing); 437 COMPLETION_INITIALIZER_ONSTACK(fwork.done),
438 };
439 struct kthread_worker *worker;
440 bool noop = false;
420 441
421 /* 442retry:
422 * mb flush-b0 paired with worker-b1, to make sure either 443 worker = work->worker;
423 * worker sees the above increment or we see done_seq update. 444 if (!worker)
424 */ 445 return;
425 smp_mb__after_atomic_inc();
426 446
427 /* A - B <= 0 tests whether B is in front of A regardless of overflow */ 447 spin_lock_irq(&worker->lock);
428 wait_event(work->done, seq - work->done_seq <= 0); 448 if (work->worker != worker) {
429 atomic_dec(&work->flushing); 449 spin_unlock_irq(&worker->lock);
450 goto retry;
451 }
430 452
431 /* 453 if (!list_empty(&work->node))
432 * rmb flush-b1 paired with worker-b0, to make sure our caller 454 insert_kthread_work(worker, &fwork.work, work->node.next);
433 * sees every change made by work->func(). 455 else if (worker->current_work == work)
434 */ 456 insert_kthread_work(worker, &fwork.work, worker->work_list.next);
435 smp_mb__after_atomic_dec(); 457 else
436} 458 noop = true;
437EXPORT_SYMBOL_GPL(flush_kthread_work);
438 459
439struct kthread_flush_work { 460 spin_unlock_irq(&worker->lock);
440 struct kthread_work work;
441 struct completion done;
442};
443 461
444static void kthread_flush_work_fn(struct kthread_work *work) 462 if (!noop)
445{ 463 wait_for_completion(&fwork.done);
446 struct kthread_flush_work *fwork =
447 container_of(work, struct kthread_flush_work, work);
448 complete(&fwork->done);
449} 464}
465EXPORT_SYMBOL_GPL(flush_kthread_work);
450 466
451/** 467/**
452 * flush_kthread_worker - flush all current works on a kthread_worker 468 * flush_kthread_worker - flush all current works on a kthread_worker
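For orientation, a minimal kthread_worker round trip against the API whose flush path is reworked above; error handling is trimmed and the foo_* names are invented:

#include <linux/err.h>
#include <linux/kthread.h>

static void foo_work_fn(struct kthread_work *work)
{
	/* executed in the worker thread */
}

static int foo_demo(void)
{
	struct kthread_worker worker;
	struct kthread_work work;
	struct task_struct *task;

	init_kthread_worker(&worker);
	task = kthread_run(kthread_worker_fn, &worker, "foo_worker");
	if (IS_ERR(task))
		return PTR_ERR(task);

	init_kthread_work(&work, foo_work_fn);
	queue_kthread_work(&worker, &work);

	/*
	 * With the rework above this queues a kthread_flush_work behind
	 * @work (or behind the currently running work) and waits on its
	 * completion instead of the old queue_seq/done_seq dance.
	 */
	flush_kthread_work(&work);

	flush_kthread_worker(&worker);
	kthread_stop(task);
	return 0;
}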
diff --git a/kernel/panic.c b/kernel/panic.c
index d2a5f4ecc6dd..e1b2822fff97 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -75,6 +75,14 @@ void panic(const char *fmt, ...)
75 int state = 0; 75 int state = 0;
76 76
77 /* 77 /*
78 * Disable local interrupts. This will prevent panic_smp_self_stop
79 * from deadlocking the first cpu that invokes the panic, since
80 * there is nothing to prevent an interrupt handler (that runs
81 * after the panic_lock is acquired) from invoking panic again.
82 */
83 local_irq_disable();
84
85 /*
78 * It's possible to come here directly from a panic-assertion and 86 * It's possible to come here directly from a panic-assertion and
79 * not have preempt disabled. Some functions called from here want 87 * not have preempt disabled. Some functions called from here want
80 * preempt to be disabled. No point enabling it later though... 88 * preempt to be disabled. No point enabling it later though...
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 8f9b4eb974e0..a70518c9d82f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -175,7 +175,7 @@ config PM_TEST_SUSPEND
175 You probably want to have your system's RTC driver statically 175 You probably want to have your system's RTC driver statically
176 linked, ensuring that it's available when this test runs. 176 linked, ensuring that it's available when this test runs.
177 177
178config CAN_PM_TRACE 178config PM_SLEEP_DEBUG
179 def_bool y 179 def_bool y
180 depends on PM_DEBUG && PM_SLEEP 180 depends on PM_DEBUG && PM_SLEEP
181 181
@@ -196,7 +196,7 @@ config PM_TRACE
196 196
197config PM_TRACE_RTC 197config PM_TRACE_RTC
198 bool "Suspend/resume event tracing" 198 bool "Suspend/resume event tracing"
199 depends on CAN_PM_TRACE 199 depends on PM_SLEEP_DEBUG
200 depends on X86 200 depends on X86
201 select PM_TRACE 201 select PM_TRACE
202 ---help--- 202 ---help---
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8b53db38a279..b26f5f1e773e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -5,6 +5,7 @@
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
8 * 9 *
9 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
10 */ 11 */
@@ -27,7 +28,6 @@
27#include <linux/syscore_ops.h> 28#include <linux/syscore_ops.h>
28#include <linux/ctype.h> 29#include <linux/ctype.h>
29#include <linux/genhd.h> 30#include <linux/genhd.h>
30#include <scsi/scsi_scan.h>
31 31
32#include "power.h" 32#include "power.h"
33 33
@@ -46,6 +46,9 @@ enum {
46 HIBERNATION_PLATFORM, 46 HIBERNATION_PLATFORM,
47 HIBERNATION_SHUTDOWN, 47 HIBERNATION_SHUTDOWN,
48 HIBERNATION_REBOOT, 48 HIBERNATION_REBOOT,
49#ifdef CONFIG_SUSPEND
50 HIBERNATION_SUSPEND,
51#endif
49 /* keep last */ 52 /* keep last */
50 __HIBERNATION_AFTER_LAST 53 __HIBERNATION_AFTER_LAST
51}; 54};
@@ -354,6 +357,7 @@ int hibernation_snapshot(int platform_mode)
354 } 357 }
355 358
356 suspend_console(); 359 suspend_console();
360 ftrace_stop();
357 pm_restrict_gfp_mask(); 361 pm_restrict_gfp_mask();
358 362
359 error = dpm_suspend(PMSG_FREEZE); 363 error = dpm_suspend(PMSG_FREEZE);
@@ -379,6 +383,7 @@ int hibernation_snapshot(int platform_mode)
379 if (error || !in_suspend) 383 if (error || !in_suspend)
380 pm_restore_gfp_mask(); 384 pm_restore_gfp_mask();
381 385
386 ftrace_start();
382 resume_console(); 387 resume_console();
383 dpm_complete(msg); 388 dpm_complete(msg);
384 389
@@ -481,6 +486,7 @@ int hibernation_restore(int platform_mode)
481 486
482 pm_prepare_console(); 487 pm_prepare_console();
483 suspend_console(); 488 suspend_console();
489 ftrace_stop();
484 pm_restrict_gfp_mask(); 490 pm_restrict_gfp_mask();
485 error = dpm_suspend_start(PMSG_QUIESCE); 491 error = dpm_suspend_start(PMSG_QUIESCE);
486 if (!error) { 492 if (!error) {
@@ -488,6 +494,7 @@ int hibernation_restore(int platform_mode)
488 dpm_resume_end(PMSG_RECOVER); 494 dpm_resume_end(PMSG_RECOVER);
489 } 495 }
490 pm_restore_gfp_mask(); 496 pm_restore_gfp_mask();
497 ftrace_start();
491 resume_console(); 498 resume_console();
492 pm_restore_console(); 499 pm_restore_console();
493 return error; 500 return error;
@@ -514,6 +521,7 @@ int hibernation_platform_enter(void)
514 521
515 entering_platform_hibernation = true; 522 entering_platform_hibernation = true;
516 suspend_console(); 523 suspend_console();
524 ftrace_stop();
517 error = dpm_suspend_start(PMSG_HIBERNATE); 525 error = dpm_suspend_start(PMSG_HIBERNATE);
518 if (error) { 526 if (error) {
519 if (hibernation_ops->recover) 527 if (hibernation_ops->recover)
@@ -557,6 +565,7 @@ int hibernation_platform_enter(void)
557 Resume_devices: 565 Resume_devices:
558 entering_platform_hibernation = false; 566 entering_platform_hibernation = false;
559 dpm_resume_end(PMSG_RESTORE); 567 dpm_resume_end(PMSG_RESTORE);
568 ftrace_start();
560 resume_console(); 569 resume_console();
561 570
562 Close: 571 Close:
@@ -574,6 +583,10 @@ int hibernation_platform_enter(void)
574 */ 583 */
575static void power_down(void) 584static void power_down(void)
576{ 585{
586#ifdef CONFIG_SUSPEND
587 int error;
588#endif
589
577 switch (hibernation_mode) { 590 switch (hibernation_mode) {
578 case HIBERNATION_REBOOT: 591 case HIBERNATION_REBOOT:
579 kernel_restart(NULL); 592 kernel_restart(NULL);
@@ -583,6 +596,25 @@ static void power_down(void)
583 case HIBERNATION_SHUTDOWN: 596 case HIBERNATION_SHUTDOWN:
584 kernel_power_off(); 597 kernel_power_off();
585 break; 598 break;
599#ifdef CONFIG_SUSPEND
600 case HIBERNATION_SUSPEND:
601 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
602 if (error) {
603 if (hibernation_ops)
604 hibernation_mode = HIBERNATION_PLATFORM;
605 else
606 hibernation_mode = HIBERNATION_SHUTDOWN;
607 power_down();
608 }
609 /*
610 * Restore swap signature.
611 */
612 error = swsusp_unmark();
613 if (error)
614 printk(KERN_ERR "PM: Swap will be unusable! "
615 "Try swapon -a.\n");
616 return;
617#endif
586 } 618 }
587 kernel_halt(); 619 kernel_halt();
588 /* 620 /*
@@ -748,13 +780,6 @@ static int software_resume(void)
748 async_synchronize_full(); 780 async_synchronize_full();
749 } 781 }
750 782
751 /*
752 * We can't depend on SCSI devices being available after loading
753 * one of their modules until scsi_complete_async_scans() is
754 * called and the resume device usually is a SCSI one.
755 */
756 scsi_complete_async_scans();
757
758 swsusp_resume_device = name_to_dev_t(resume_file); 783 swsusp_resume_device = name_to_dev_t(resume_file);
759 if (!swsusp_resume_device) { 784 if (!swsusp_resume_device) {
760 error = -ENODEV; 785 error = -ENODEV;
@@ -827,6 +852,9 @@ static const char * const hibernation_modes[] = {
827 [HIBERNATION_PLATFORM] = "platform", 852 [HIBERNATION_PLATFORM] = "platform",
828 [HIBERNATION_SHUTDOWN] = "shutdown", 853 [HIBERNATION_SHUTDOWN] = "shutdown",
829 [HIBERNATION_REBOOT] = "reboot", 854 [HIBERNATION_REBOOT] = "reboot",
855#ifdef CONFIG_SUSPEND
856 [HIBERNATION_SUSPEND] = "suspend",
857#endif
830}; 858};
831 859
832/* 860/*
@@ -867,6 +895,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
867 switch (i) { 895 switch (i) {
868 case HIBERNATION_SHUTDOWN: 896 case HIBERNATION_SHUTDOWN:
869 case HIBERNATION_REBOOT: 897 case HIBERNATION_REBOOT:
898#ifdef CONFIG_SUSPEND
899 case HIBERNATION_SUSPEND:
900#endif
870 break; 901 break;
871 case HIBERNATION_PLATFORM: 902 case HIBERNATION_PLATFORM:
872 if (hibernation_ops) 903 if (hibernation_ops)
@@ -907,6 +938,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
907 switch (mode) { 938 switch (mode) {
908 case HIBERNATION_SHUTDOWN: 939 case HIBERNATION_SHUTDOWN:
909 case HIBERNATION_REBOOT: 940 case HIBERNATION_REBOOT:
941#ifdef CONFIG_SUSPEND
942 case HIBERNATION_SUSPEND:
943#endif
910 hibernation_mode = mode; 944 hibernation_mode = mode;
911 break; 945 break;
912 case HIBERNATION_PLATFORM: 946 case HIBERNATION_PLATFORM:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 428f8a034e96..f458238109cc 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -235,6 +235,47 @@ late_initcall(pm_debugfs_init);
235 235
236#endif /* CONFIG_PM_SLEEP */ 236#endif /* CONFIG_PM_SLEEP */
237 237
238#ifdef CONFIG_PM_SLEEP_DEBUG
239/*
240 * pm_print_times: print time taken by devices to suspend and resume.
241 *
242 * show() returns whether printing of suspend and resume times is enabled.
243 * store() accepts 0 or 1. 0 disables printing and 1 enables it.
244 */
245bool pm_print_times_enabled;
246
247static ssize_t pm_print_times_show(struct kobject *kobj,
248 struct kobj_attribute *attr, char *buf)
249{
250 return sprintf(buf, "%d\n", pm_print_times_enabled);
251}
252
253static ssize_t pm_print_times_store(struct kobject *kobj,
254 struct kobj_attribute *attr,
255 const char *buf, size_t n)
256{
257 unsigned long val;
258
259 if (kstrtoul(buf, 10, &val))
260 return -EINVAL;
261
262 if (val > 1)
263 return -EINVAL;
264
265 pm_print_times_enabled = !!val;
266 return n;
267}
268
269power_attr(pm_print_times);
270
271static inline void pm_print_times_init(void)
272{
273 pm_print_times_enabled = !!initcall_debug;
274}
275#else /* !CONFIG_PM_SLEEP_DEBUG */
276static inline void pm_print_times_init(void) {}
277#endif /* CONFIG_PM_SLEEP_DEBUG */
278
238struct kobject *power_kobj; 279struct kobject *power_kobj;
239 280
240/** 281/**
@@ -531,6 +572,9 @@ static struct attribute * g[] = {
531#ifdef CONFIG_PM_DEBUG 572#ifdef CONFIG_PM_DEBUG
532 &pm_test_attr.attr, 573 &pm_test_attr.attr,
533#endif 574#endif
575#ifdef CONFIG_PM_SLEEP_DEBUG
576 &pm_print_times_attr.attr,
577#endif
534#endif 578#endif
535 NULL, 579 NULL,
536}; 580};
@@ -566,6 +610,7 @@ static int __init pm_init(void)
566 error = sysfs_create_group(power_kobj, &attr_group); 610 error = sysfs_create_group(power_kobj, &attr_group);
567 if (error) 611 if (error)
568 return error; 612 return error;
613 pm_print_times_init();
569 return pm_autosleep_init(); 614 return pm_autosleep_init();
570} 615}
571 616
diff --git a/kernel/power/power.h b/kernel/power/power.h
index b0bd4beaebfe..7d4b7ffb3c1d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -156,6 +156,9 @@ extern void swsusp_free(void);
156extern int swsusp_read(unsigned int *flags_p); 156extern int swsusp_read(unsigned int *flags_p);
157extern int swsusp_write(unsigned int flags); 157extern int swsusp_write(unsigned int flags);
158extern void swsusp_close(fmode_t); 158extern void swsusp_close(fmode_t);
159#ifdef CONFIG_SUSPEND
160extern int swsusp_unmark(void);
161#endif
159 162
160/* kernel/power/block_io.c */ 163/* kernel/power/block_io.c */
161extern struct block_device *hib_resume_bdev; 164extern struct block_device *hib_resume_bdev;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 396d262b8fd0..c8b7446b27df 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -24,6 +24,7 @@
24#include <linux/export.h> 24#include <linux/export.h>
25#include <linux/suspend.h> 25#include <linux/suspend.h>
26#include <linux/syscore_ops.h> 26#include <linux/syscore_ops.h>
27#include <linux/ftrace.h>
27#include <trace/events/power.h> 28#include <trace/events/power.h>
28 29
29#include "power.h" 30#include "power.h"
@@ -212,6 +213,7 @@ int suspend_devices_and_enter(suspend_state_t state)
212 goto Close; 213 goto Close;
213 } 214 }
214 suspend_console(); 215 suspend_console();
216 ftrace_stop();
215 suspend_test_start(); 217 suspend_test_start();
216 error = dpm_suspend_start(PMSG_SUSPEND); 218 error = dpm_suspend_start(PMSG_SUSPEND);
217 if (error) { 219 if (error) {
@@ -231,6 +233,7 @@ int suspend_devices_and_enter(suspend_state_t state)
231 suspend_test_start(); 233 suspend_test_start();
232 dpm_resume_end(PMSG_RESUME); 234 dpm_resume_end(PMSG_RESUME);
233 suspend_test_finish("resume devices"); 235 suspend_test_finish("resume devices");
236 ftrace_start();
234 resume_console(); 237 resume_console();
235 Close: 238 Close:
236 if (suspend_ops->end) 239 if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11e22c068e8b..3c9d764eb0d8 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -448,9 +448,9 @@ static int save_image(struct swap_map_handle *handle,
448 struct timeval start; 448 struct timeval start;
449 struct timeval stop; 449 struct timeval stop;
450 450
451 printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ", 451 printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
452 nr_to_write); 452 nr_to_write);
453 m = nr_to_write / 100; 453 m = nr_to_write / 10;
454 if (!m) 454 if (!m)
455 m = 1; 455 m = 1;
456 nr_pages = 0; 456 nr_pages = 0;
@@ -464,7 +464,8 @@ static int save_image(struct swap_map_handle *handle,
464 if (ret) 464 if (ret)
465 break; 465 break;
466 if (!(nr_pages % m)) 466 if (!(nr_pages % m))
467 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 467 printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
468 nr_pages / m * 10);
468 nr_pages++; 469 nr_pages++;
469 } 470 }
470 err2 = hib_wait_on_bio_chain(&bio); 471 err2 = hib_wait_on_bio_chain(&bio);
@@ -472,9 +473,7 @@ static int save_image(struct swap_map_handle *handle,
472 if (!ret) 473 if (!ret)
473 ret = err2; 474 ret = err2;
474 if (!ret) 475 if (!ret)
475 printk(KERN_CONT "\b\b\b\bdone\n"); 476 printk(KERN_INFO "PM: Image saving done.\n");
476 else
477 printk(KERN_CONT "\n");
478 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 477 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
479 return ret; 478 return ret;
480} 479}
@@ -668,9 +667,9 @@ static int save_image_lzo(struct swap_map_handle *handle,
668 667
669 printk(KERN_INFO 668 printk(KERN_INFO
670 "PM: Using %u thread(s) for compression.\n" 669 "PM: Using %u thread(s) for compression.\n"
671 "PM: Compressing and saving image data (%u pages) ... ", 670 "PM: Compressing and saving image data (%u pages)...\n",
672 nr_threads, nr_to_write); 671 nr_threads, nr_to_write);
673 m = nr_to_write / 100; 672 m = nr_to_write / 10;
674 if (!m) 673 if (!m)
675 m = 1; 674 m = 1;
676 nr_pages = 0; 675 nr_pages = 0;
@@ -690,8 +689,10 @@ static int save_image_lzo(struct swap_map_handle *handle,
690 data_of(*snapshot), PAGE_SIZE); 689 data_of(*snapshot), PAGE_SIZE);
691 690
692 if (!(nr_pages % m)) 691 if (!(nr_pages % m))
693 printk(KERN_CONT "\b\b\b\b%3d%%", 692 printk(KERN_INFO
694 nr_pages / m); 693 "PM: Image saving progress: "
694 "%3d%%\n",
695 nr_pages / m * 10);
695 nr_pages++; 696 nr_pages++;
696 } 697 }
697 if (!off) 698 if (!off)
@@ -761,11 +762,8 @@ out_finish:
761 do_gettimeofday(&stop); 762 do_gettimeofday(&stop);
762 if (!ret) 763 if (!ret)
763 ret = err2; 764 ret = err2;
764 if (!ret) { 765 if (!ret)
765 printk(KERN_CONT "\b\b\b\bdone\n"); 766 printk(KERN_INFO "PM: Image saving done.\n");
766 } else {
767 printk(KERN_CONT "\n");
768 }
769 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 767 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
770out_clean: 768out_clean:
771 if (crc) { 769 if (crc) {
@@ -973,9 +971,9 @@ static int load_image(struct swap_map_handle *handle,
973 int err2; 971 int err2;
974 unsigned nr_pages; 972 unsigned nr_pages;
975 973
976 printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ", 974 printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
977 nr_to_read); 975 nr_to_read);
978 m = nr_to_read / 100; 976 m = nr_to_read / 10;
979 if (!m) 977 if (!m)
980 m = 1; 978 m = 1;
981 nr_pages = 0; 979 nr_pages = 0;
@@ -993,7 +991,8 @@ static int load_image(struct swap_map_handle *handle,
993 if (ret) 991 if (ret)
994 break; 992 break;
995 if (!(nr_pages % m)) 993 if (!(nr_pages % m))
996 printk("\b\b\b\b%3d%%", nr_pages / m); 994 printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
995 nr_pages / m * 10);
997 nr_pages++; 996 nr_pages++;
998 } 997 }
999 err2 = hib_wait_on_bio_chain(&bio); 998 err2 = hib_wait_on_bio_chain(&bio);
@@ -1001,12 +1000,11 @@ static int load_image(struct swap_map_handle *handle,
1001 if (!ret) 1000 if (!ret)
1002 ret = err2; 1001 ret = err2;
1003 if (!ret) { 1002 if (!ret) {
1004 printk("\b\b\b\bdone\n"); 1003 printk(KERN_INFO "PM: Image loading done.\n");
1005 snapshot_write_finalize(snapshot); 1004 snapshot_write_finalize(snapshot);
1006 if (!snapshot_image_loaded(snapshot)) 1005 if (!snapshot_image_loaded(snapshot))
1007 ret = -ENODATA; 1006 ret = -ENODATA;
1008 } else 1007 }
1009 printk("\n");
1010 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1008 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
1011 return ret; 1009 return ret;
1012} 1010}
@@ -1185,9 +1183,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
1185 1183
1186 printk(KERN_INFO 1184 printk(KERN_INFO
1187 "PM: Using %u thread(s) for decompression.\n" 1185 "PM: Using %u thread(s) for decompression.\n"
1188 "PM: Loading and decompressing image data (%u pages) ... ", 1186 "PM: Loading and decompressing image data (%u pages)...\n",
1189 nr_threads, nr_to_read); 1187 nr_threads, nr_to_read);
1190 m = nr_to_read / 100; 1188 m = nr_to_read / 10;
1191 if (!m) 1189 if (!m)
1192 m = 1; 1190 m = 1;
1193 nr_pages = 0; 1191 nr_pages = 0;
@@ -1319,7 +1317,10 @@ static int load_image_lzo(struct swap_map_handle *handle,
1319 data[thr].unc + off, PAGE_SIZE); 1317 data[thr].unc + off, PAGE_SIZE);
1320 1318
1321 if (!(nr_pages % m)) 1319 if (!(nr_pages % m))
1322 printk("\b\b\b\b%3d%%", nr_pages / m); 1320 printk(KERN_INFO
1321 "PM: Image loading progress: "
1322 "%3d%%\n",
1323 nr_pages / m * 10);
1323 nr_pages++; 1324 nr_pages++;
1324 1325
1325 ret = snapshot_write_next(snapshot); 1326 ret = snapshot_write_next(snapshot);
@@ -1344,7 +1345,7 @@ out_finish:
1344 } 1345 }
1345 do_gettimeofday(&stop); 1346 do_gettimeofday(&stop);
1346 if (!ret) { 1347 if (!ret) {
1347 printk("\b\b\b\bdone\n"); 1348 printk(KERN_INFO "PM: Image loading done.\n");
1348 snapshot_write_finalize(snapshot); 1349 snapshot_write_finalize(snapshot);
1349 if (!snapshot_image_loaded(snapshot)) 1350 if (!snapshot_image_loaded(snapshot))
1350 ret = -ENODATA; 1351 ret = -ENODATA;
@@ -1357,8 +1358,7 @@ out_finish:
1357 } 1358 }
1358 } 1359 }
1359 } 1360 }
1360 } else 1361 }
1361 printk("\n");
1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1362 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
1363out_clean: 1363out_clean:
1364 for (i = 0; i < ring_size; i++) 1364 for (i = 0; i < ring_size; i++)
@@ -1472,6 +1472,34 @@ void swsusp_close(fmode_t mode)
1472 blkdev_put(hib_resume_bdev, mode); 1472 blkdev_put(hib_resume_bdev, mode);
1473} 1473}
1474 1474
1475/**
1476 * swsusp_unmark - Unmark swsusp signature in the resume device
1477 */
1478
1479#ifdef CONFIG_SUSPEND
1480int swsusp_unmark(void)
1481{
1482 int error;
1483
1484 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
1485 if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) {
1486 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
1487 error = hib_bio_write_page(swsusp_resume_block,
1488 swsusp_header, NULL);
1489 } else {
1490 printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
1491 error = -ENODEV;
1492 }
1493
1494 /*
1495 * We just returned from suspend, we don't need the image any more.
1496 */
1497 free_all_swap_pages(root_swap);
1498
1499 return error;
1500}
1501#endif
1502
1475static int swsusp_header_init(void) 1503static int swsusp_header_init(void)
1476{ 1504{
1477 swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); 1505 swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 91b0fd021a95..4ed81e74f86f 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,7 +24,6 @@
24#include <linux/console.h> 24#include <linux/console.h>
25#include <linux/cpu.h> 25#include <linux/cpu.h>
26#include <linux/freezer.h> 26#include <linux/freezer.h>
27#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
30 29
@@ -84,7 +83,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
84 * appear. 83 * appear.
85 */ 84 */
86 wait_for_device_probe(); 85 wait_for_device_probe();
87 scsi_complete_async_scans();
88 86
89 data->swap = -1; 87 data->swap = -1;
90 data->mode = O_WRONLY; 88 data->mode = O_WRONLY;
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index c8fba3380076..8f50de394d22 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -9,6 +9,7 @@
9 * manipulate wakelocks on Android. 9 * manipulate wakelocks on Android.
10 */ 10 */
11 11
12#include <linux/capability.h>
12#include <linux/ctype.h> 13#include <linux/ctype.h>
13#include <linux/device.h> 14#include <linux/device.h>
14#include <linux/err.h> 15#include <linux/err.h>
@@ -188,6 +189,9 @@ int pm_wake_lock(const char *buf)
188 size_t len; 189 size_t len;
189 int ret = 0; 190 int ret = 0;
190 191
192 if (!capable(CAP_BLOCK_SUSPEND))
193 return -EPERM;
194
191 while (*str && !isspace(*str)) 195 while (*str && !isspace(*str))
192 str++; 196 str++;
193 197
@@ -231,6 +235,9 @@ int pm_wake_unlock(const char *buf)
231 size_t len; 235 size_t len;
232 int ret = 0; 236 int ret = 0;
233 237
238 if (!capable(CAP_BLOCK_SUSPEND))
239 return -EPERM;
240
234 len = strlen(buf); 241 len = strlen(buf);
235 if (!len) 242 if (!len)
236 return -EINVAL; 243 return -EINVAL;
diff --git a/kernel/printk.c b/kernel/printk.c
index 177fa49357a5..66a2ea37b576 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -216,6 +216,7 @@ struct log {
216 */ 216 */
217static DEFINE_RAW_SPINLOCK(logbuf_lock); 217static DEFINE_RAW_SPINLOCK(logbuf_lock);
218 218
219#ifdef CONFIG_PRINTK
219/* the next printk record to read by syslog(READ) or /proc/kmsg */ 220/* the next printk record to read by syslog(READ) or /proc/kmsg */
220static u64 syslog_seq; 221static u64 syslog_seq;
221static u32 syslog_idx; 222static u32 syslog_idx;
@@ -228,14 +229,19 @@ static u32 log_first_idx;
228 229
229/* index and sequence number of the next record to store in the buffer */ 230/* index and sequence number of the next record to store in the buffer */
230static u64 log_next_seq; 231static u64 log_next_seq;
231#ifdef CONFIG_PRINTK
232static u32 log_next_idx; 232static u32 log_next_idx;
233 233
234/* the next printk record to write to the console */
235static u64 console_seq;
236static u32 console_idx;
237static enum log_flags console_prev;
238
234/* the next printk record to read after the last 'clear' command */ 239/* the next printk record to read after the last 'clear' command */
235static u64 clear_seq; 240static u64 clear_seq;
236static u32 clear_idx; 241static u32 clear_idx;
237 242
238#define LOG_LINE_MAX 1024 243#define PREFIX_MAX 32
244#define LOG_LINE_MAX 1024 - PREFIX_MAX
239 245
240/* record buffer */ 246/* record buffer */
241#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 247#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -360,6 +366,7 @@ static void log_store(int facility, int level,
360struct devkmsg_user { 366struct devkmsg_user {
361 u64 seq; 367 u64 seq;
362 u32 idx; 368 u32 idx;
369 enum log_flags prev;
363 struct mutex lock; 370 struct mutex lock;
364 char buf[8192]; 371 char buf[8192];
365}; 372};
@@ -382,8 +389,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
382 389
383 line = buf; 390 line = buf;
384 for (i = 0; i < count; i++) { 391 for (i = 0; i < count; i++) {
385 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) 392 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
393 ret = -EFAULT;
386 goto out; 394 goto out;
395 }
387 line += iv[i].iov_len; 396 line += iv[i].iov_len;
388 } 397 }
389 398
@@ -425,6 +434,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
425 struct log *msg; 434 struct log *msg;
426 u64 ts_usec; 435 u64 ts_usec;
427 size_t i; 436 size_t i;
437 char cont = '-';
428 size_t len; 438 size_t len;
429 ssize_t ret; 439 ssize_t ret;
430 440
@@ -462,8 +472,25 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
462 msg = log_from_idx(user->idx); 472 msg = log_from_idx(user->idx);
463 ts_usec = msg->ts_nsec; 473 ts_usec = msg->ts_nsec;
464 do_div(ts_usec, 1000); 474 do_div(ts_usec, 1000);
465 len = sprintf(user->buf, "%u,%llu,%llu;", 475
466 (msg->facility << 3) | msg->level, user->seq, ts_usec); 476 /*
477 * If we couldn't merge continuation line fragments during the print,
478 * export the stored flags to allow an optional external merge of the
479 * records. Merging the records isn't always correct, like
480 * when we hit a race during printing. In most cases though, it produces
481 * more readable output. 'c' in the record flags marks the first
482 * fragment of a line, '+' the following.
483 */
484 if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
485 cont = 'c';
486 else if ((msg->flags & LOG_CONT) ||
487 ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
488 cont = '+';
489
490 len = sprintf(user->buf, "%u,%llu,%llu,%c;",
491 (msg->facility << 3) | msg->level,
492 user->seq, ts_usec, cont);
493 user->prev = msg->flags;
467 494
468 /* escape non-printable characters */ 495 /* escape non-printable characters */
469 for (i = 0; i < msg->text_len; i++) { 496 for (i = 0; i < msg->text_len; i++) {
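With the extra flag field, a read from /dev/kmsg now yields records of the form "prio,seq,timestamp_usec,flag;text", where the flag is '-' for a self-contained line, 'c' for the first stored fragment of a continuation line and '+' for the following ones. Hypothetical sample output (all values and text invented):

6,117,361822,-;a complete single-line record
4,118,362005,c;first stored fragment of a long line
4,119,362012,+;continuation of that same line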
@@ -646,6 +673,15 @@ void log_buf_kexec_setup(void)
646 VMCOREINFO_SYMBOL(log_buf_len); 673 VMCOREINFO_SYMBOL(log_buf_len);
647 VMCOREINFO_SYMBOL(log_first_idx); 674 VMCOREINFO_SYMBOL(log_first_idx);
648 VMCOREINFO_SYMBOL(log_next_idx); 675 VMCOREINFO_SYMBOL(log_next_idx);
676 /*
677 * Export struct log size and field offsets. User space tools can
678 * parse it and detect any changes to structure down the line.
679 */
680 VMCOREINFO_STRUCT_SIZE(log);
681 VMCOREINFO_OFFSET(log, ts_nsec);
682 VMCOREINFO_OFFSET(log, len);
683 VMCOREINFO_OFFSET(log, text_len);
684 VMCOREINFO_OFFSET(log, dict_len);
649} 685}
650#endif 686#endif
651 687
@@ -876,7 +912,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,
876 912
877 if (buf) { 913 if (buf) {
878 if (print_prefix(msg, syslog, NULL) + 914 if (print_prefix(msg, syslog, NULL) +
879 text_len + 1>= size - len) 915 text_len + 1 >= size - len)
880 break; 916 break;
881 917
882 if (prefix) 918 if (prefix)
@@ -907,7 +943,7 @@ static int syslog_print(char __user *buf, int size)
907 struct log *msg; 943 struct log *msg;
908 int len = 0; 944 int len = 0;
909 945
910 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 946 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
911 if (!text) 947 if (!text)
912 return -ENOMEM; 948 return -ENOMEM;
913 949
@@ -930,7 +966,8 @@ static int syslog_print(char __user *buf, int size)
930 966
931 skip = syslog_partial; 967 skip = syslog_partial;
932 msg = log_from_idx(syslog_idx); 968 msg = log_from_idx(syslog_idx);
933 n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); 969 n = msg_print_text(msg, syslog_prev, true, text,
970 LOG_LINE_MAX + PREFIX_MAX);
934 if (n - syslog_partial <= size) { 971 if (n - syslog_partial <= size) {
935 /* message fits into buffer, move forward */ 972 /* message fits into buffer, move forward */
936 syslog_idx = log_next(syslog_idx); 973 syslog_idx = log_next(syslog_idx);
@@ -969,7 +1006,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
969 char *text; 1006 char *text;
970 int len = 0; 1007 int len = 0;
971 1008
972 text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); 1009 text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
973 if (!text) 1010 if (!text)
974 return -ENOMEM; 1011 return -ENOMEM;
975 1012
@@ -997,6 +1034,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
997 struct log *msg = log_from_idx(idx); 1034 struct log *msg = log_from_idx(idx);
998 1035
999 len += msg_print_text(msg, prev, true, NULL, 0); 1036 len += msg_print_text(msg, prev, true, NULL, 0);
1037 prev = msg->flags;
1000 idx = log_next(idx); 1038 idx = log_next(idx);
1001 seq++; 1039 seq++;
1002 } 1040 }
@@ -1009,6 +1047,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1009 struct log *msg = log_from_idx(idx); 1047 struct log *msg = log_from_idx(idx);
1010 1048
1011 len -= msg_print_text(msg, prev, true, NULL, 0); 1049 len -= msg_print_text(msg, prev, true, NULL, 0);
1050 prev = msg->flags;
1012 idx = log_next(idx); 1051 idx = log_next(idx);
1013 seq++; 1052 seq++;
1014 } 1053 }
@@ -1022,7 +1061,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
1022 struct log *msg = log_from_idx(idx); 1061 struct log *msg = log_from_idx(idx);
1023 int textlen; 1062 int textlen;
1024 1063
1025 textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); 1064 textlen = msg_print_text(msg, prev, true, text,
1065 LOG_LINE_MAX + PREFIX_MAX);
1026 if (textlen < 0) { 1066 if (textlen < 0) {
1027 len = textlen; 1067 len = textlen;
1028 break; 1068 break;
@@ -1192,21 +1232,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1192 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 1232 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
1193} 1233}
1194 1234
1195#ifdef CONFIG_KGDB_KDB
1196/* kdb dmesg command needs access to the syslog buffer. do_syslog()
1197 * uses locks so it cannot be used during debugging. Just tell kdb
1198 * where the start and end of the physical and logical logs are. This
1199 * is equivalent to do_syslog(3).
1200 */
1201void kdb_syslog_data(char *syslog_data[4])
1202{
1203 syslog_data[0] = log_buf;
1204 syslog_data[1] = log_buf + log_buf_len;
1205 syslog_data[2] = log_buf + log_first_idx;
1206 syslog_data[3] = log_buf + log_next_idx;
1207}
1208#endif /* CONFIG_KGDB_KDB */
1209
1210static bool __read_mostly ignore_loglevel; 1235static bool __read_mostly ignore_loglevel;
1211 1236
1212static int __init ignore_loglevel_setup(char *str) 1237static int __init ignore_loglevel_setup(char *str)
@@ -1364,20 +1389,36 @@ static struct cont {
1364 u64 ts_nsec; /* time of first print */ 1389 u64 ts_nsec; /* time of first print */
1365 u8 level; /* log level of first message */ 1390 u8 level; /* log level of first message */
1366 u8 facility; /* log level of first message */ 1391 u8 facility; /* log level of first message */
1392 enum log_flags flags; /* prefix, newline flags */
1367 bool flushed:1; /* buffer sealed and committed */ 1393 bool flushed:1; /* buffer sealed and committed */
1368} cont; 1394} cont;
1369 1395
1370static void cont_flush(void) 1396static void cont_flush(enum log_flags flags)
1371{ 1397{
1372 if (cont.flushed) 1398 if (cont.flushed)
1373 return; 1399 return;
1374 if (cont.len == 0) 1400 if (cont.len == 0)
1375 return; 1401 return;
1376 1402
1377 log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, 1403 if (cont.cons) {
1378 NULL, 0, cont.buf, cont.len); 1404 /*
1379 1405 * If a fragment of this line was directly flushed to the
 1380 cont.flushed = true; 1406 * console, wait for the console to pick up the rest of the
1407 * line. LOG_NOCONS suppresses a duplicated output.
1408 */
1409 log_store(cont.facility, cont.level, flags | LOG_NOCONS,
1410 cont.ts_nsec, NULL, 0, cont.buf, cont.len);
1411 cont.flags = flags;
1412 cont.flushed = true;
1413 } else {
1414 /*
1415 * If no fragment of this line ever reached the console,
1416 * just submit it to the store and free the buffer.
1417 */
1418 log_store(cont.facility, cont.level, flags, 0,
1419 NULL, 0, cont.buf, cont.len);
1420 cont.len = 0;
1421 }
1381} 1422}
1382 1423
1383static bool cont_add(int facility, int level, const char *text, size_t len) 1424static bool cont_add(int facility, int level, const char *text, size_t len)
@@ -1386,7 +1427,8 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
1386 return false; 1427 return false;
1387 1428
1388 if (cont.len + len > sizeof(cont.buf)) { 1429 if (cont.len + len > sizeof(cont.buf)) {
 1389 cont_flush(); 1430 /* the line gets too long; split it up into separate records */
1431 cont_flush(LOG_CONT);
1390 return false; 1432 return false;
1391 } 1433 }
1392 1434
@@ -1395,12 +1437,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
1395 cont.level = level; 1437 cont.level = level;
1396 cont.owner = current; 1438 cont.owner = current;
1397 cont.ts_nsec = local_clock(); 1439 cont.ts_nsec = local_clock();
1440 cont.flags = 0;
1398 cont.cons = 0; 1441 cont.cons = 0;
1399 cont.flushed = false; 1442 cont.flushed = false;
1400 } 1443 }
1401 1444
1402 memcpy(cont.buf + cont.len, text, len); 1445 memcpy(cont.buf + cont.len, text, len);
1403 cont.len += len; 1446 cont.len += len;
1447
1448 if (cont.len > (sizeof(cont.buf) * 80) / 100)
1449 cont_flush(LOG_CONT);
1450
1404 return true; 1451 return true;
1405} 1452}
1406 1453
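The new cont_add() no longer waits for the buffer to overflow: once the buffered fragment grows past 80% of cont.buf it is flushed early as a LOG_CONT record. A minimal standalone sketch of that heuristic; demo_cont_add() and the 1 KiB buffer size are made up for illustration, only the threshold arithmetic mirrors the patch:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static struct {
	char buf[1024];		/* hypothetical stand-in for cont.buf */
	size_t len;
} cont;

/* Append a fragment; return true when the line should be flushed early,
 * mirroring the "more than 80% full" check added to cont_add(). */
static bool demo_cont_add(const char *text, size_t len)
{
	if (cont.len + len > sizeof(cont.buf))
		return true;			/* would overflow: flush and split */
	memcpy(cont.buf + cont.len, text, len);
	cont.len += len;
	return cont.len > (sizeof(cont.buf) * 80) / 100;
}

int main(void)
{
	char frag[200];

	memset(frag, 'x', sizeof(frag));
	for (int i = 0; i < 6; i++)
		printf("fragment %d -> flush early: %s\n", i,
		       demo_cont_add(frag, sizeof(frag)) ? "yes" : "no");
	return 0;
}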
@@ -1409,7 +1456,7 @@ static size_t cont_print_text(char *text, size_t size)
1409 size_t textlen = 0; 1456 size_t textlen = 0;
1410 size_t len; 1457 size_t len;
1411 1458
1412 if (cont.cons == 0) { 1459 if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
1413 textlen += print_time(cont.ts_nsec, text); 1460 textlen += print_time(cont.ts_nsec, text);
1414 size -= textlen; 1461 size -= textlen;
1415 } 1462 }
@@ -1424,7 +1471,8 @@ static size_t cont_print_text(char *text, size_t size)
1424 } 1471 }
1425 1472
1426 if (cont.flushed) { 1473 if (cont.flushed) {
1427 text[textlen++] = '\n'; 1474 if (cont.flags & LOG_NEWLINE)
1475 text[textlen++] = '\n';
1428 /* got everything, release buffer */ 1476 /* got everything, release buffer */
1429 cont.len = 0; 1477 cont.len = 0;
1430 } 1478 }
@@ -1496,17 +1544,23 @@ asmlinkage int vprintk_emit(int facility, int level,
1496 lflags |= LOG_NEWLINE; 1544 lflags |= LOG_NEWLINE;
1497 } 1545 }
1498 1546
1499 /* strip syslog prefix and extract log level or control flags */ 1547 /* strip kernel syslog prefix and extract log level or control flags */
1500 if (text[0] == '<' && text[1] && text[2] == '>') { 1548 if (facility == 0) {
1501 switch (text[1]) { 1549 int kern_level = printk_get_level(text);
1502 case '0' ... '7': 1550
1503 if (level == -1) 1551 if (kern_level) {
1504 level = text[1] - '0'; 1552 const char *end_of_header = printk_skip_level(text);
1505 case 'd': /* KERN_DEFAULT */ 1553 switch (kern_level) {
1506 lflags |= LOG_PREFIX; 1554 case '0' ... '7':
1507 case 'c': /* KERN_CONT */ 1555 if (level == -1)
1508 text += 3; 1556 level = kern_level - '0';
1509 text_len -= 3; 1557 case 'd': /* KERN_DEFAULT */
1558 lflags |= LOG_PREFIX;
1559 case 'c': /* KERN_CONT */
1560 break;
1561 }
1562 text_len -= end_of_header - text;
1563 text = (char *)end_of_header;
1510 } 1564 }
1511 } 1565 }
1512 1566
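The open-coded "<N>" check is replaced by printk_get_level()/printk_skip_level(), so the same header parsing can be shared with other callers. A self-contained sketch of the idea; demo_get_level() and demo_skip_level() are hypothetical stand-ins, not the kernel helpers themselves:

#include <stdio.h>

/* Return the level char of a "<0>".."<7>", "<d>" or "<c>" header,
 * or 0 if there is none. */
static int demo_get_level(const char *s)
{
	if (s[0] == '<' && s[1] && s[2] == '>') {
		switch (s[1]) {
		case '0' ... '7':
		case 'd':	/* KERN_DEFAULT */
		case 'c':	/* KERN_CONT */
			return s[1];
		}
	}
	return 0;
}

/* Skip past such a header, if present. */
static const char *demo_skip_level(const char *s)
{
	return demo_get_level(s) ? s + 3 : s;
}

int main(void)
{
	const char *msg = "<4>oops: something happened";
	int lvl = demo_get_level(msg);

	if (lvl >= '0' && lvl <= '7')
		printf("level %c, text \"%s\"\n", lvl, demo_skip_level(msg));
	return 0;
}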
@@ -1522,7 +1576,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1522 * or another task also prints continuation lines. 1576 * or another task also prints continuation lines.
1523 */ 1577 */
1524 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) 1578 if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
1525 cont_flush(); 1579 cont_flush(LOG_NEWLINE);
1526 1580
1527 /* buffer line if possible, otherwise store it right away */ 1581 /* buffer line if possible, otherwise store it right away */
1528 if (!cont_add(facility, level, text, text_len)) 1582 if (!cont_add(facility, level, text, text_len))
@@ -1540,7 +1594,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1540 if (cont.len && cont.owner == current) { 1594 if (cont.len && cont.owner == current) {
1541 if (!(lflags & LOG_PREFIX)) 1595 if (!(lflags & LOG_PREFIX))
1542 stored = cont_add(facility, level, text, text_len); 1596 stored = cont_add(facility, level, text, text_len);
1543 cont_flush(); 1597 cont_flush(LOG_NEWLINE);
1544 } 1598 }
1545 1599
1546 if (!stored) 1600 if (!stored)
@@ -1631,9 +1685,20 @@ asmlinkage int printk(const char *fmt, ...)
1631} 1685}
1632EXPORT_SYMBOL(printk); 1686EXPORT_SYMBOL(printk);
1633 1687
1634#else 1688#else /* CONFIG_PRINTK */
1635 1689
1690#define LOG_LINE_MAX 0
1691#define PREFIX_MAX 0
1636#define LOG_LINE_MAX 0 1692#define LOG_LINE_MAX 0
1693static u64 syslog_seq;
1694static u32 syslog_idx;
1695static u64 console_seq;
1696static u32 console_idx;
1697static enum log_flags syslog_prev;
1698static u64 log_first_seq;
1699static u32 log_first_idx;
1700static u64 log_next_seq;
1701static enum log_flags console_prev;
1637static struct cont { 1702static struct cont {
1638 size_t len; 1703 size_t len;
1639 size_t cons; 1704 size_t cons;
@@ -1917,10 +1982,34 @@ void wake_up_klogd(void)
1917 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 1982 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1918} 1983}
1919 1984
1920/* the next printk record to write to the console */ 1985static void console_cont_flush(char *text, size_t size)
1921static u64 console_seq; 1986{
1922static u32 console_idx; 1987 unsigned long flags;
1923static enum log_flags console_prev; 1988 size_t len;
1989
1990 raw_spin_lock_irqsave(&logbuf_lock, flags);
1991
1992 if (!cont.len)
1993 goto out;
1994
1995 /*
 1996 * Earlier records are still queued, likely because the console was
 1997 * busy. They need to be printed before this one; we did not flush
 1998 * any fragment so far, so just let it queue up.
1999 */
2000 if (console_seq < log_next_seq && !cont.cons)
2001 goto out;
2002
2003 len = cont_print_text(text, size);
2004 raw_spin_unlock(&logbuf_lock);
2005 stop_critical_timings();
2006 call_console_drivers(cont.level, text, len);
2007 start_critical_timings();
2008 local_irq_restore(flags);
2009 return;
2010out:
2011 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2012}
1924 2013
1925/** 2014/**
1926 * console_unlock - unlock the console system 2015 * console_unlock - unlock the console system
@@ -1938,7 +2027,7 @@ static enum log_flags console_prev;
1938 */ 2027 */
1939void console_unlock(void) 2028void console_unlock(void)
1940{ 2029{
1941 static char text[LOG_LINE_MAX]; 2030 static char text[LOG_LINE_MAX + PREFIX_MAX];
1942 static u64 seen_seq; 2031 static u64 seen_seq;
1943 unsigned long flags; 2032 unsigned long flags;
1944 bool wake_klogd = false; 2033 bool wake_klogd = false;
@@ -1952,19 +2041,7 @@ void console_unlock(void)
1952 console_may_schedule = 0; 2041 console_may_schedule = 0;
1953 2042
1954 /* flush buffered message fragment immediately to console */ 2043 /* flush buffered message fragment immediately to console */
1955 raw_spin_lock_irqsave(&logbuf_lock, flags); 2044 console_cont_flush(text, sizeof(text));
1956 if (cont.len && (cont.cons < cont.len || cont.flushed)) {
1957 size_t len;
1958
1959 len = cont_print_text(text, sizeof(text));
1960 raw_spin_unlock(&logbuf_lock);
1961 stop_critical_timings();
1962 call_console_drivers(cont.level, text, len);
1963 start_critical_timings();
1964 local_irq_restore(flags);
1965 } else
1966 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1967
1968again: 2045again:
1969 for (;;) { 2046 for (;;) {
1970 struct log *msg; 2047 struct log *msg;
@@ -2001,6 +2078,7 @@ skip:
2001 * will properly dump everything later. 2078 * will properly dump everything later.
2002 */ 2079 */
2003 msg->flags &= ~LOG_NOCONS; 2080 msg->flags &= ~LOG_NOCONS;
2081 console_prev = msg->flags;
2004 goto skip; 2082 goto skip;
2005 } 2083 }
2006 2084
@@ -2525,7 +2603,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
2525} 2603}
2526 2604
2527/** 2605/**
2528 * kmsg_dump_get_line - retrieve one kmsg log line 2606 * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version)
2529 * @dumper: registered kmsg dumper 2607 * @dumper: registered kmsg dumper
2530 * @syslog: include the "<4>" prefixes 2608 * @syslog: include the "<4>" prefixes
2531 * @line: buffer to copy the line to 2609 * @line: buffer to copy the line to
@@ -2540,11 +2618,12 @@ void kmsg_dump(enum kmsg_dump_reason reason)
2540 * 2618 *
2541 * A return value of FALSE indicates that there are no more records to 2619 * A return value of FALSE indicates that there are no more records to
2542 * read. 2620 * read.
2621 *
2622 * The function is similar to kmsg_dump_get_line(), but grabs no locks.
2543 */ 2623 */
2544bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, 2624bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
2545 char *line, size_t size, size_t *len) 2625 char *line, size_t size, size_t *len)
2546{ 2626{
2547 unsigned long flags;
2548 struct log *msg; 2627 struct log *msg;
2549 size_t l = 0; 2628 size_t l = 0;
2550 bool ret = false; 2629 bool ret = false;
@@ -2552,7 +2631,6 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2552 if (!dumper->active) 2631 if (!dumper->active)
2553 goto out; 2632 goto out;
2554 2633
2555 raw_spin_lock_irqsave(&logbuf_lock, flags);
2556 if (dumper->cur_seq < log_first_seq) { 2634 if (dumper->cur_seq < log_first_seq) {
2557 /* messages are gone, move to first available one */ 2635 /* messages are gone, move to first available one */
2558 dumper->cur_seq = log_first_seq; 2636 dumper->cur_seq = log_first_seq;
@@ -2560,10 +2638,8 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2560 } 2638 }
2561 2639
2562 /* last entry */ 2640 /* last entry */
2563 if (dumper->cur_seq >= log_next_seq) { 2641 if (dumper->cur_seq >= log_next_seq)
2564 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2565 goto out; 2642 goto out;
2566 }
2567 2643
2568 msg = log_from_idx(dumper->cur_idx); 2644 msg = log_from_idx(dumper->cur_idx);
2569 l = msg_print_text(msg, 0, syslog, line, size); 2645 l = msg_print_text(msg, 0, syslog, line, size);
@@ -2571,12 +2647,41 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2571 dumper->cur_idx = log_next(dumper->cur_idx); 2647 dumper->cur_idx = log_next(dumper->cur_idx);
2572 dumper->cur_seq++; 2648 dumper->cur_seq++;
2573 ret = true; 2649 ret = true;
2574 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2575out: 2650out:
2576 if (len) 2651 if (len)
2577 *len = l; 2652 *len = l;
2578 return ret; 2653 return ret;
2579} 2654}
2655
2656/**
2657 * kmsg_dump_get_line - retrieve one kmsg log line
2658 * @dumper: registered kmsg dumper
2659 * @syslog: include the "<4>" prefixes
2660 * @line: buffer to copy the line to
2661 * @size: maximum size of the buffer
2662 * @len: length of line placed into buffer
2663 *
2664 * Start at the beginning of the kmsg buffer, with the oldest kmsg
2665 * record, and copy one record into the provided buffer.
2666 *
2667 * Consecutive calls will return the next available record moving
2668 * towards the end of the buffer with the youngest messages.
2669 *
2670 * A return value of FALSE indicates that there are no more records to
2671 * read.
2672 */
2673bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
2674 char *line, size_t size, size_t *len)
2675{
2676 unsigned long flags;
2677 bool ret;
2678
2679 raw_spin_lock_irqsave(&logbuf_lock, flags);
2680 ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
2681 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2682
2683 return ret;
2684}
2580EXPORT_SYMBOL_GPL(kmsg_dump_get_line); 2685EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
2581 2686
2582/** 2687/**
@@ -2679,6 +2784,24 @@ out:
2679EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); 2784EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
2680 2785
2681/** 2786/**
 2787 * kmsg_dump_rewind_nolock - reset the iterator (unlocked version)
2788 * @dumper: registered kmsg dumper
2789 *
2790 * Reset the dumper's iterator so that kmsg_dump_get_line() and
2791 * kmsg_dump_get_buffer() can be called again and used multiple
2792 * times within the same dumper.dump() callback.
2793 *
2794 * The function is similar to kmsg_dump_rewind(), but grabs no locks.
2795 */
2796void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
2797{
2798 dumper->cur_seq = clear_seq;
2799 dumper->cur_idx = clear_idx;
2800 dumper->next_seq = log_next_seq;
2801 dumper->next_idx = log_next_idx;
2802}
2803
2804/**
 2682 * kmsg_dump_rewind - reset the iterator 2805 * kmsg_dump_rewind - reset the iterator
2683 * @dumper: registered kmsg dumper 2806 * @dumper: registered kmsg dumper
2684 * 2807 *
@@ -2691,10 +2814,7 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2691 unsigned long flags; 2814 unsigned long flags;
2692 2815
2693 raw_spin_lock_irqsave(&logbuf_lock, flags); 2816 raw_spin_lock_irqsave(&logbuf_lock, flags);
2694 dumper->cur_seq = clear_seq; 2817 kmsg_dump_rewind_nolock(dumper);
2695 dumper->cur_idx = clear_idx;
2696 dumper->next_seq = log_next_seq;
2697 dumper->next_idx = log_next_idx;
2698 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2818 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2699} 2819}
2700EXPORT_SYMBOL_GPL(kmsg_dump_rewind); 2820EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
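The _nolock variants exist so that a debugger-context consumer such as kdb (which replaces the removed kdb_syslog_data() hook above) can walk the records without taking logbuf_lock. A sketch of a dumper using them from its dump() callback; demo_dump(), demo_dumper() and demo_init() are made up, the kmsg_dump_* calls are the ones introduced or kept by this patch, and a real caller must guarantee that nothing else touches the log buffer while it runs lockless:

#include <linux/kmsg_dump.h>
#include <linux/module.h>

static void demo_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason)
{
	static char line[1024];
	size_t len, total = 0;

	kmsg_dump_rewind_nolock(dumper);
	while (kmsg_dump_get_line_nolock(dumper, false, line, sizeof(line), &len))
		total += len;		/* consume one record per iteration */

	pr_info("demo dumper: %zu bytes, reason %d\n", total, reason);
}

static struct kmsg_dumper demo_dumper = {
	.dump = demo_dump,
};

static int __init demo_init(void)
{
	return kmsg_dump_register(&demo_dumper);
}
module_init(demo_init);
MODULE_LICENSE("GPL");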
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 95cba41ce1e9..4e6a61b15e86 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -54,6 +54,50 @@
54#ifdef CONFIG_PREEMPT_RCU 54#ifdef CONFIG_PREEMPT_RCU
55 55
56/* 56/*
57 * Preemptible RCU implementation for rcu_read_lock().
 58 * Just increment ->rcu_read_lock_nesting; shared state will be updated
59 * if we block.
60 */
61void __rcu_read_lock(void)
62{
63 current->rcu_read_lock_nesting++;
64 barrier(); /* critical section after entry code. */
65}
66EXPORT_SYMBOL_GPL(__rcu_read_lock);
67
68/*
69 * Preemptible RCU implementation for rcu_read_unlock().
70 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
71 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
72 * invoke rcu_read_unlock_special() to clean up after a context switch
73 * in an RCU read-side critical section and other special cases.
74 */
75void __rcu_read_unlock(void)
76{
77 struct task_struct *t = current;
78
79 if (t->rcu_read_lock_nesting != 1) {
80 --t->rcu_read_lock_nesting;
81 } else {
82 barrier(); /* critical section before exit code. */
83 t->rcu_read_lock_nesting = INT_MIN;
84 barrier(); /* assign before ->rcu_read_unlock_special load */
85 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
86 rcu_read_unlock_special(t);
87 barrier(); /* ->rcu_read_unlock_special load before assign */
88 t->rcu_read_lock_nesting = 0;
89 }
90#ifdef CONFIG_PROVE_LOCKING
91 {
92 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
93
94 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
95 }
96#endif /* #ifdef CONFIG_PROVE_LOCKING */
97}
98EXPORT_SYMBOL_GPL(__rcu_read_unlock);
99
100/*
57 * Check for a task exiting while in a preemptible-RCU read-side 101 * Check for a task exiting while in a preemptible-RCU read-side
58 * critical section, clean up if so. No need to issue warnings, 102 * critical section, clean up if so. No need to issue warnings,
59 * as debug_check_no_locks_held() already does this if lockdep 103 * as debug_check_no_locks_held() already does this if lockdep
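With __rcu_read_lock()/__rcu_read_unlock() now shared in rcupdate.c, Tiny and Tree preemptible RCU drive the same nesting counter. A small kernel-style sketch of how the read-side API exercises it; struct foo, gp and demo_reader() are made up, the rcu_* calls are the standard API:

#include <linux/rcupdate.h>

struct foo { int val; };
static struct foo __rcu *gp;

/* Only the outermost rcu_read_unlock() (nesting 1 -> 0) can take the slow
 * path into rcu_read_unlock_special(); inner unlocks stay on the fast path. */
static int demo_reader(void)
{
	struct foo *p;
	int val = -1;

	rcu_read_lock();			/* ->rcu_read_lock_nesting: 0 -> 1 */
	rcu_read_lock();			/*                          1 -> 2 */
	p = rcu_dereference(gp);
	if (p)
		val = p->val;
	rcu_read_unlock();			/* 2 -> 1, fast path */
	rcu_read_unlock();			/* outermost, may run _special() */
	return val;
}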
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 37a5444204d2..547b1fe5b052 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -172,7 +172,7 @@ void rcu_irq_enter(void)
172 local_irq_restore(flags); 172 local_irq_restore(flags);
173} 173}
174 174
175#ifdef CONFIG_PROVE_RCU 175#ifdef CONFIG_DEBUG_LOCK_ALLOC
176 176
177/* 177/*
178 * Test whether RCU thinks that the current CPU is idle. 178 * Test whether RCU thinks that the current CPU is idle.
@@ -183,7 +183,7 @@ int rcu_is_cpu_idle(void)
183} 183}
184EXPORT_SYMBOL(rcu_is_cpu_idle); 184EXPORT_SYMBOL(rcu_is_cpu_idle);
185 185
186#endif /* #ifdef CONFIG_PROVE_RCU */ 186#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
187 187
188/* 188/*
189 * Test whether the current CPU was interrupted from idle. Nested 189 * Test whether the current CPU was interrupted from idle. Nested
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index fc31a2d65100..918fd1e8509c 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -132,7 +132,6 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
132 RCU_TRACE(.rcb.name = "rcu_preempt") 132 RCU_TRACE(.rcb.name = "rcu_preempt")
133}; 133};
134 134
135static void rcu_read_unlock_special(struct task_struct *t);
136static int rcu_preempted_readers_exp(void); 135static int rcu_preempted_readers_exp(void);
137static void rcu_report_exp_done(void); 136static void rcu_report_exp_done(void);
138 137
@@ -351,8 +350,9 @@ static int rcu_initiate_boost(void)
351 rcu_preempt_ctrlblk.boost_tasks = 350 rcu_preempt_ctrlblk.boost_tasks =
352 rcu_preempt_ctrlblk.gp_tasks; 351 rcu_preempt_ctrlblk.gp_tasks;
353 invoke_rcu_callbacks(); 352 invoke_rcu_callbacks();
354 } else 353 } else {
355 RCU_TRACE(rcu_initiate_boost_trace()); 354 RCU_TRACE(rcu_initiate_boost_trace());
355 }
356 return 1; 356 return 1;
357} 357}
358 358
@@ -527,23 +527,11 @@ void rcu_preempt_note_context_switch(void)
527} 527}
528 528
529/* 529/*
530 * Tiny-preemptible RCU implementation for rcu_read_lock().
531 * Just increment ->rcu_read_lock_nesting, shared state will be updated
532 * if we block.
533 */
534void __rcu_read_lock(void)
535{
536 current->rcu_read_lock_nesting++;
537 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
538}
539EXPORT_SYMBOL_GPL(__rcu_read_lock);
540
541/*
542 * Handle special cases during rcu_read_unlock(), such as needing to 530 * Handle special cases during rcu_read_unlock(), such as needing to
543 * notify RCU core processing or task having blocked during the RCU 531 * notify RCU core processing or task having blocked during the RCU
544 * read-side critical section. 532 * read-side critical section.
545 */ 533 */
546static noinline void rcu_read_unlock_special(struct task_struct *t) 534void rcu_read_unlock_special(struct task_struct *t)
547{ 535{
548 int empty; 536 int empty;
549 int empty_exp; 537 int empty_exp;
@@ -627,38 +615,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
627} 615}
628 616
629/* 617/*
630 * Tiny-preemptible RCU implementation for rcu_read_unlock().
631 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
632 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
633 * invoke rcu_read_unlock_special() to clean up after a context switch
634 * in an RCU read-side critical section and other special cases.
635 */
636void __rcu_read_unlock(void)
637{
638 struct task_struct *t = current;
639
640 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
641 if (t->rcu_read_lock_nesting != 1)
642 --t->rcu_read_lock_nesting;
643 else {
644 t->rcu_read_lock_nesting = INT_MIN;
645 barrier(); /* assign before ->rcu_read_unlock_special load */
646 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
647 rcu_read_unlock_special(t);
648 barrier(); /* ->rcu_read_unlock_special load before assign */
649 t->rcu_read_lock_nesting = 0;
650 }
651#ifdef CONFIG_PROVE_LOCKING
652 {
653 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
654
655 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
656 }
657#endif /* #ifdef CONFIG_PROVE_LOCKING */
658}
659EXPORT_SYMBOL_GPL(__rcu_read_unlock);
660
661/*
662 * Check for a quiescent state from the current CPU. When a task blocks, 618 * Check for a quiescent state from the current CPU. When a task blocks,
663 * the task is recorded in the rcu_preempt_ctrlblk structure, which is 619 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
664 * checked elsewhere. This is called from the scheduling-clock interrupt. 620 * checked elsewhere. This is called from the scheduling-clock interrupt.
@@ -823,9 +779,9 @@ void synchronize_rcu_expedited(void)
823 rpcp->exp_tasks = NULL; 779 rpcp->exp_tasks = NULL;
824 780
825 /* Wait for tail of ->blkd_tasks list to drain. */ 781 /* Wait for tail of ->blkd_tasks list to drain. */
826 if (!rcu_preempted_readers_exp()) 782 if (!rcu_preempted_readers_exp()) {
827 local_irq_restore(flags); 783 local_irq_restore(flags);
828 else { 784 } else {
829 rcu_initiate_boost(); 785 rcu_initiate_boost();
830 local_irq_restore(flags); 786 local_irq_restore(flags);
831 wait_event(sync_rcu_preempt_exp_wq, 787 wait_event(sync_rcu_preempt_exp_wq,
@@ -846,8 +802,6 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
846 */ 802 */
847int rcu_preempt_needs_cpu(void) 803int rcu_preempt_needs_cpu(void)
848{ 804{
849 if (!rcu_preempt_running_reader())
850 rcu_preempt_cpu_qs();
851 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; 805 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
852} 806}
853 807
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e66b34ab7555..25b15033c61f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -49,8 +49,7 @@
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50 50
51MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
53 "Josh Triplett <josh@freedesktop.org>");
54 53
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 54static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56static int nfakewriters = 4; /* # fake writer threads */ 55static int nfakewriters = 4; /* # fake writer threads */
@@ -206,6 +205,7 @@ static unsigned long boost_starttime; /* jiffies of next boost test start. */
206DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 205DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
207 /* and boost task create/destroy. */ 206 /* and boost task create/destroy. */
208static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ 207static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
208static bool barrier_phase; /* Test phase. */
209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ 209static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ 210static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); 211static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
@@ -407,8 +407,9 @@ rcu_torture_cb(struct rcu_head *p)
407 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { 407 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
408 rp->rtort_mbtest = 0; 408 rp->rtort_mbtest = 0;
409 rcu_torture_free(rp); 409 rcu_torture_free(rp);
410 } else 410 } else {
411 cur_ops->deferred_free(rp); 411 cur_ops->deferred_free(rp);
412 }
412} 413}
413 414
414static int rcu_no_completed(void) 415static int rcu_no_completed(void)
@@ -635,6 +636,17 @@ static void srcu_torture_synchronize(void)
635 synchronize_srcu(&srcu_ctl); 636 synchronize_srcu(&srcu_ctl);
636} 637}
637 638
639static void srcu_torture_call(struct rcu_head *head,
640 void (*func)(struct rcu_head *head))
641{
642 call_srcu(&srcu_ctl, head, func);
643}
644
645static void srcu_torture_barrier(void)
646{
647 srcu_barrier(&srcu_ctl);
648}
649
638static int srcu_torture_stats(char *page) 650static int srcu_torture_stats(char *page)
639{ 651{
640 int cnt = 0; 652 int cnt = 0;
@@ -661,8 +673,8 @@ static struct rcu_torture_ops srcu_ops = {
661 .completed = srcu_torture_completed, 673 .completed = srcu_torture_completed,
662 .deferred_free = srcu_torture_deferred_free, 674 .deferred_free = srcu_torture_deferred_free,
663 .sync = srcu_torture_synchronize, 675 .sync = srcu_torture_synchronize,
664 .call = NULL, 676 .call = srcu_torture_call,
665 .cb_barrier = NULL, 677 .cb_barrier = srcu_torture_barrier,
666 .stats = srcu_torture_stats, 678 .stats = srcu_torture_stats,
667 .name = "srcu" 679 .name = "srcu"
668}; 680};
@@ -1013,7 +1025,11 @@ rcu_torture_fakewriter(void *arg)
1013 do { 1025 do {
1014 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 1026 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
1015 udelay(rcu_random(&rand) & 0x3ff); 1027 udelay(rcu_random(&rand) & 0x3ff);
1016 cur_ops->sync(); 1028 if (cur_ops->cb_barrier != NULL &&
1029 rcu_random(&rand) % (nfakewriters * 8) == 0)
1030 cur_ops->cb_barrier();
1031 else
1032 cur_ops->sync();
1017 rcu_stutter_wait("rcu_torture_fakewriter"); 1033 rcu_stutter_wait("rcu_torture_fakewriter");
1018 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1034 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1019 1035
@@ -1183,27 +1199,27 @@ rcu_torture_printk(char *page)
1183 } 1199 }
1184 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 1200 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
1185 cnt += sprintf(&page[cnt], 1201 cnt += sprintf(&page[cnt],
1186 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1202 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
1187 "rtmbe: %d rtbke: %ld rtbre: %ld "
1188 "rtbf: %ld rtb: %ld nt: %ld "
1189 "onoff: %ld/%ld:%ld/%ld "
1190 "barrier: %ld/%ld:%ld",
1191 rcu_torture_current, 1203 rcu_torture_current,
1192 rcu_torture_current_version, 1204 rcu_torture_current_version,
1193 list_empty(&rcu_torture_freelist), 1205 list_empty(&rcu_torture_freelist),
1194 atomic_read(&n_rcu_torture_alloc), 1206 atomic_read(&n_rcu_torture_alloc),
1195 atomic_read(&n_rcu_torture_alloc_fail), 1207 atomic_read(&n_rcu_torture_alloc_fail),
1196 atomic_read(&n_rcu_torture_free), 1208 atomic_read(&n_rcu_torture_free));
1209 cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ",
1197 atomic_read(&n_rcu_torture_mberror), 1210 atomic_read(&n_rcu_torture_mberror),
1198 n_rcu_torture_boost_ktrerror, 1211 n_rcu_torture_boost_ktrerror,
1199 n_rcu_torture_boost_rterror, 1212 n_rcu_torture_boost_rterror);
1213 cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ",
1200 n_rcu_torture_boost_failure, 1214 n_rcu_torture_boost_failure,
1201 n_rcu_torture_boosts, 1215 n_rcu_torture_boosts,
1202 n_rcu_torture_timers, 1216 n_rcu_torture_timers);
1217 cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ",
1203 n_online_successes, 1218 n_online_successes,
1204 n_online_attempts, 1219 n_online_attempts,
1205 n_offline_successes, 1220 n_offline_successes,
1206 n_offline_attempts, 1221 n_offline_attempts);
1222 cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld",
1207 n_barrier_successes, 1223 n_barrier_successes,
1208 n_barrier_attempts, 1224 n_barrier_attempts,
1209 n_rcu_torture_barrier_error); 1225 n_rcu_torture_barrier_error);
@@ -1445,8 +1461,7 @@ rcu_torture_shutdown(void *arg)
1445 delta = shutdown_time - jiffies_snap; 1461 delta = shutdown_time - jiffies_snap;
1446 if (verbose) 1462 if (verbose)
1447 printk(KERN_ALERT "%s" TORTURE_FLAG 1463 printk(KERN_ALERT "%s" TORTURE_FLAG
1448 "rcu_torture_shutdown task: %lu " 1464 "rcu_torture_shutdown task: %lu jiffies remaining\n",
1449 "jiffies remaining\n",
1450 torture_type, delta); 1465 torture_type, delta);
1451 schedule_timeout_interruptible(delta); 1466 schedule_timeout_interruptible(delta);
1452 jiffies_snap = ACCESS_ONCE(jiffies); 1467 jiffies_snap = ACCESS_ONCE(jiffies);
@@ -1498,8 +1513,7 @@ rcu_torture_onoff(void *arg)
1498 if (cpu_down(cpu) == 0) { 1513 if (cpu_down(cpu) == 0) {
1499 if (verbose) 1514 if (verbose)
1500 printk(KERN_ALERT "%s" TORTURE_FLAG 1515 printk(KERN_ALERT "%s" TORTURE_FLAG
1501 "rcu_torture_onoff task: " 1516 "rcu_torture_onoff task: offlined %d\n",
1502 "offlined %d\n",
1503 torture_type, cpu); 1517 torture_type, cpu);
1504 n_offline_successes++; 1518 n_offline_successes++;
1505 } 1519 }
@@ -1512,8 +1526,7 @@ rcu_torture_onoff(void *arg)
1512 if (cpu_up(cpu) == 0) { 1526 if (cpu_up(cpu) == 0) {
1513 if (verbose) 1527 if (verbose)
1514 printk(KERN_ALERT "%s" TORTURE_FLAG 1528 printk(KERN_ALERT "%s" TORTURE_FLAG
1515 "rcu_torture_onoff task: " 1529 "rcu_torture_onoff task: onlined %d\n",
1516 "onlined %d\n",
1517 torture_type, cpu); 1530 torture_type, cpu);
1518 n_online_successes++; 1531 n_online_successes++;
1519 } 1532 }
@@ -1631,6 +1644,7 @@ void rcu_torture_barrier_cbf(struct rcu_head *rcu)
1631static int rcu_torture_barrier_cbs(void *arg) 1644static int rcu_torture_barrier_cbs(void *arg)
1632{ 1645{
1633 long myid = (long)arg; 1646 long myid = (long)arg;
1647 bool lastphase = 0;
1634 struct rcu_head rcu; 1648 struct rcu_head rcu;
1635 1649
1636 init_rcu_head_on_stack(&rcu); 1650 init_rcu_head_on_stack(&rcu);
@@ -1638,9 +1652,11 @@ static int rcu_torture_barrier_cbs(void *arg)
1638 set_user_nice(current, 19); 1652 set_user_nice(current, 19);
1639 do { 1653 do {
1640 wait_event(barrier_cbs_wq[myid], 1654 wait_event(barrier_cbs_wq[myid],
1641 atomic_read(&barrier_cbs_count) == n_barrier_cbs || 1655 barrier_phase != lastphase ||
1642 kthread_should_stop() || 1656 kthread_should_stop() ||
1643 fullstop != FULLSTOP_DONTSTOP); 1657 fullstop != FULLSTOP_DONTSTOP);
1658 lastphase = barrier_phase;
1659 smp_mb(); /* ensure barrier_phase load before ->call(). */
1644 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) 1660 if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
1645 break; 1661 break;
1646 cur_ops->call(&rcu, rcu_torture_barrier_cbf); 1662 cur_ops->call(&rcu, rcu_torture_barrier_cbf);
@@ -1665,7 +1681,8 @@ static int rcu_torture_barrier(void *arg)
1665 do { 1681 do {
1666 atomic_set(&barrier_cbs_invoked, 0); 1682 atomic_set(&barrier_cbs_invoked, 0);
1667 atomic_set(&barrier_cbs_count, n_barrier_cbs); 1683 atomic_set(&barrier_cbs_count, n_barrier_cbs);
1668 /* wake_up() path contains the required barriers. */ 1684 smp_mb(); /* Ensure barrier_phase after prior assignments. */
1685 barrier_phase = !barrier_phase;
1669 for (i = 0; i < n_barrier_cbs; i++) 1686 for (i = 0; i < n_barrier_cbs; i++)
1670 wake_up(&barrier_cbs_wq[i]); 1687 wake_up(&barrier_cbs_wq[i]);
1671 wait_event(barrier_wq, 1688 wait_event(barrier_wq,
@@ -1684,7 +1701,7 @@ static int rcu_torture_barrier(void *arg)
1684 schedule_timeout_interruptible(HZ / 10); 1701 schedule_timeout_interruptible(HZ / 10);
1685 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 1702 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
1686 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); 1703 VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
1687 rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); 1704 rcutorture_shutdown_absorb("rcu_torture_barrier");
1688 while (!kthread_should_stop()) 1705 while (!kthread_should_stop())
1689 schedule_timeout_interruptible(1); 1706 schedule_timeout_interruptible(1);
1690 return 0; 1707 return 0;
@@ -1908,8 +1925,8 @@ rcu_torture_init(void)
1908 static struct rcu_torture_ops *torture_ops[] = 1925 static struct rcu_torture_ops *torture_ops[] =
1909 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1926 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1910 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1927 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1911 &srcu_ops, &srcu_sync_ops, &srcu_raw_ops, 1928 &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
1912 &srcu_raw_sync_ops, &srcu_expedited_ops, 1929 &srcu_raw_ops, &srcu_raw_sync_ops,
1913 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1930 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1914 1931
1915 mutex_lock(&fullstop_mutex); 1932 mutex_lock(&fullstop_mutex);
@@ -1931,8 +1948,7 @@ rcu_torture_init(void)
1931 return -EINVAL; 1948 return -EINVAL;
1932 } 1949 }
1933 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1950 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1934 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " 1951 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
1935 "fqs_duration, fqs disabled.\n");
1936 fqs_duration = 0; 1952 fqs_duration = 0;
1937 } 1953 }
1938 if (cur_ops->init) 1954 if (cur_ops->init)
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4b97bba7396e..f280e542e3e9 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -60,36 +60,44 @@
60 60
61/* Data structures. */ 61/* Data structures. */
62 62
63static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 63static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
64 64
65#define RCU_STATE_INITIALIZER(structname) { \ 65#define RCU_STATE_INITIALIZER(sname, cr) { \
66 .level = { &structname##_state.node[0] }, \ 66 .level = { &sname##_state.node[0] }, \
67 .levelcnt = { \ 67 .call = cr, \
68 NUM_RCU_LVL_0, /* root of hierarchy. */ \
69 NUM_RCU_LVL_1, \
70 NUM_RCU_LVL_2, \
71 NUM_RCU_LVL_3, \
72 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
73 }, \
74 .fqs_state = RCU_GP_IDLE, \ 68 .fqs_state = RCU_GP_IDLE, \
75 .gpnum = -300, \ 69 .gpnum = -300, \
76 .completed = -300, \ 70 .completed = -300, \
77 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 71 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \
78 .orphan_nxttail = &structname##_state.orphan_nxtlist, \ 72 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
79 .orphan_donetail = &structname##_state.orphan_donelist, \ 73 .orphan_donetail = &sname##_state.orphan_donelist, \
80 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ 74 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
81 .n_force_qs = 0, \ 75 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \
82 .n_force_qs_ngp = 0, \ 76 .name = #sname, \
83 .name = #structname, \
84} 77}
85 78
86struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); 79struct rcu_state rcu_sched_state =
80 RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
87DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 81DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
88 82
89struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); 83struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
90DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 84DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
91 85
92static struct rcu_state *rcu_state; 86static struct rcu_state *rcu_state;
87LIST_HEAD(rcu_struct_flavors);
88
89/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
90static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
91module_param(rcu_fanout_leaf, int, 0);
92int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
93static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
94 NUM_RCU_LVL_0,
95 NUM_RCU_LVL_1,
96 NUM_RCU_LVL_2,
97 NUM_RCU_LVL_3,
98 NUM_RCU_LVL_4,
99};
100int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
93 101
94/* 102/*
95 * The rcu_scheduler_active variable transitions from zero to one just 103 * The rcu_scheduler_active variable transitions from zero to one just
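rcu_fanout_leaf thereby becomes a boot-time tunable (it may only be raised, per the comment). With rcutree.c built in, module_param() exposes it on the kernel command line; the rcutree. prefix below is assumed from the source file name:

	rcutree.rcu_fanout_leaf=32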
@@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
147unsigned long rcutorture_testseq; 155unsigned long rcutorture_testseq;
148unsigned long rcutorture_vernum; 156unsigned long rcutorture_vernum;
149 157
150/* State information for rcu_barrier() and friends. */
151
152static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
153static atomic_t rcu_barrier_cpu_count;
154static DEFINE_MUTEX(rcu_barrier_mutex);
155static struct completion rcu_barrier_completion;
156
157/* 158/*
158 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 159 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
159 * permit this function to be invoked without holding the root rcu_node 160 * permit this function to be invoked without holding the root rcu_node
@@ -358,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
358 struct task_struct *idle = idle_task(smp_processor_id()); 359 struct task_struct *idle = idle_task(smp_processor_id());
359 360
360 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); 361 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
361 ftrace_dump(DUMP_ALL); 362 ftrace_dump(DUMP_ORIG);
362 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 363 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
363 current->pid, current->comm, 364 current->pid, current->comm,
364 idle->pid, idle->comm); /* must be idle task! */ 365 idle->pid, idle->comm); /* must be idle task! */
@@ -468,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
468 469
469 trace_rcu_dyntick("Error on exit: not idle task", 470 trace_rcu_dyntick("Error on exit: not idle task",
470 oldval, rdtp->dynticks_nesting); 471 oldval, rdtp->dynticks_nesting);
471 ftrace_dump(DUMP_ALL); 472 ftrace_dump(DUMP_ORIG);
472 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 473 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
473 current->pid, current->comm, 474 current->pid, current->comm,
474 idle->pid, idle->comm); /* must be idle task! */ 475 idle->pid, idle->comm); /* must be idle task! */
@@ -585,8 +586,6 @@ void rcu_nmi_exit(void)
585 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 586 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
586} 587}
587 588
588#ifdef CONFIG_PROVE_RCU
589
590/** 589/**
591 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle 590 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
592 * 591 *
@@ -604,7 +603,7 @@ int rcu_is_cpu_idle(void)
604} 603}
605EXPORT_SYMBOL(rcu_is_cpu_idle); 604EXPORT_SYMBOL(rcu_is_cpu_idle);
606 605
607#ifdef CONFIG_HOTPLUG_CPU 606#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
608 607
609/* 608/*
610 * Is the current CPU online? Disable preemption to avoid false positives 609 * Is the current CPU online? Disable preemption to avoid false positives
@@ -645,9 +644,7 @@ bool rcu_lockdep_current_cpu_online(void)
645} 644}
646EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); 645EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
647 646
648#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 647#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
649
650#endif /* #ifdef CONFIG_PROVE_RCU */
651 648
652/** 649/**
653 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle 650 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
@@ -733,7 +730,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
733 int cpu; 730 int cpu;
734 long delta; 731 long delta;
735 unsigned long flags; 732 unsigned long flags;
736 int ndetected; 733 int ndetected = 0;
737 struct rcu_node *rnp = rcu_get_root(rsp); 734 struct rcu_node *rnp = rcu_get_root(rsp);
738 735
739 /* Only let one CPU complain about others per time interval. */ 736 /* Only let one CPU complain about others per time interval. */
@@ -774,7 +771,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
774 */ 771 */
775 rnp = rcu_get_root(rsp); 772 rnp = rcu_get_root(rsp);
776 raw_spin_lock_irqsave(&rnp->lock, flags); 773 raw_spin_lock_irqsave(&rnp->lock, flags);
777 ndetected = rcu_print_task_stall(rnp); 774 ndetected += rcu_print_task_stall(rnp);
778 raw_spin_unlock_irqrestore(&rnp->lock, flags); 775 raw_spin_unlock_irqrestore(&rnp->lock, flags);
779 776
780 print_cpu_stall_info_end(); 777 print_cpu_stall_info_end();
@@ -860,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
860 */ 857 */
861void rcu_cpu_stall_reset(void) 858void rcu_cpu_stall_reset(void)
862{ 859{
863 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; 860 struct rcu_state *rsp;
864 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; 861
865 rcu_preempt_stall_reset(); 862 for_each_rcu_flavor(rsp)
863 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
866} 864}
867 865
868static struct notifier_block rcu_panic_block = { 866static struct notifier_block rcu_panic_block = {
@@ -894,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
894 if (rnp->qsmask & rdp->grpmask) { 892 if (rnp->qsmask & rdp->grpmask) {
895 rdp->qs_pending = 1; 893 rdp->qs_pending = 1;
896 rdp->passed_quiesce = 0; 894 rdp->passed_quiesce = 0;
897 } else 895 } else {
898 rdp->qs_pending = 0; 896 rdp->qs_pending = 0;
897 }
899 zero_cpu_stall_ticks(rdp); 898 zero_cpu_stall_ticks(rdp);
900 } 899 }
901} 900}
@@ -937,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
937} 936}
938 937
939/* 938/*
939 * Initialize the specified rcu_data structure's callback list to empty.
940 */
941static void init_callback_list(struct rcu_data *rdp)
942{
943 int i;
944
945 rdp->nxtlist = NULL;
946 for (i = 0; i < RCU_NEXT_SIZE; i++)
947 rdp->nxttail[i] = &rdp->nxtlist;
948}
949
950/*
940 * Advance this CPU's callbacks, but only if the current grace period 951 * Advance this CPU's callbacks, but only if the current grace period
941 * has ended. This may be called only from the CPU to whom the rdp 952 * has ended. This may be called only from the CPU to whom the rdp
942 * belongs. In addition, the corresponding leaf rcu_node structure's 953 * belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1328,8 +1339,6 @@ static void
1328rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 1339rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1329 struct rcu_node *rnp, struct rcu_data *rdp) 1340 struct rcu_node *rnp, struct rcu_data *rdp)
1330{ 1341{
1331 int i;
1332
1333 /* 1342 /*
1334 * Orphan the callbacks. First adjust the counts. This is safe 1343 * Orphan the callbacks. First adjust the counts. This is safe
1335 * because ->onofflock excludes _rcu_barrier()'s adoption of 1344 * because ->onofflock excludes _rcu_barrier()'s adoption of
@@ -1340,7 +1349,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1340 rsp->qlen += rdp->qlen; 1349 rsp->qlen += rdp->qlen;
1341 rdp->n_cbs_orphaned += rdp->qlen; 1350 rdp->n_cbs_orphaned += rdp->qlen;
1342 rdp->qlen_lazy = 0; 1351 rdp->qlen_lazy = 0;
1343 rdp->qlen = 0; 1352 ACCESS_ONCE(rdp->qlen) = 0;
1344 } 1353 }
1345 1354
1346 /* 1355 /*
@@ -1369,9 +1378,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1369 } 1378 }
1370 1379
1371 /* Finally, initialize the rcu_data structure's list to empty. */ 1380 /* Finally, initialize the rcu_data structure's list to empty. */
1372 rdp->nxtlist = NULL; 1381 init_callback_list(rdp);
1373 for (i = 0; i < RCU_NEXT_SIZE; i++)
1374 rdp->nxttail[i] = &rdp->nxtlist;
1375} 1382}
1376 1383
1377/* 1384/*
@@ -1505,6 +1512,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1505 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1512 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1506 if (need_report & RCU_OFL_TASKS_EXP_GP) 1513 if (need_report & RCU_OFL_TASKS_EXP_GP)
1507 rcu_report_exp_rnp(rsp, rnp, true); 1514 rcu_report_exp_rnp(rsp, rnp, true);
1515 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
1516 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
1517 cpu, rdp->qlen, rdp->nxtlist);
1508} 1518}
1509 1519
1510#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1520#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1592,7 +1602,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1592 } 1602 }
1593 smp_mb(); /* List handling before counting for rcu_barrier(). */ 1603 smp_mb(); /* List handling before counting for rcu_barrier(). */
1594 rdp->qlen_lazy -= count_lazy; 1604 rdp->qlen_lazy -= count_lazy;
1595 rdp->qlen -= count; 1605 ACCESS_ONCE(rdp->qlen) -= count;
1596 rdp->n_cbs_invoked += count; 1606 rdp->n_cbs_invoked += count;
1597 1607
1598 /* Reinstate batch limit if we have worked down the excess. */ 1608 /* Reinstate batch limit if we have worked down the excess. */
@@ -1605,6 +1615,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1605 rdp->n_force_qs_snap = rsp->n_force_qs; 1615 rdp->n_force_qs_snap = rsp->n_force_qs;
1606 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 1616 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
1607 rdp->qlen_last_fqs_check = rdp->qlen; 1617 rdp->qlen_last_fqs_check = rdp->qlen;
1618 WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
1608 1619
1609 local_irq_restore(flags); 1620 local_irq_restore(flags);
1610 1621
@@ -1745,8 +1756,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1745 break; /* grace period idle or initializing, ignore. */ 1756 break; /* grace period idle or initializing, ignore. */
1746 1757
1747 case RCU_SAVE_DYNTICK: 1758 case RCU_SAVE_DYNTICK:
1748 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1749 break; /* So gcc recognizes the dead code. */
1750 1759
1751 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1760 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1752 1761
@@ -1788,9 +1797,10 @@ unlock_fqs_ret:
1788 * whom the rdp belongs. 1797 * whom the rdp belongs.
1789 */ 1798 */
1790static void 1799static void
1791__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1800__rcu_process_callbacks(struct rcu_state *rsp)
1792{ 1801{
1793 unsigned long flags; 1802 unsigned long flags;
1803 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1794 1804
1795 WARN_ON_ONCE(rdp->beenonline == 0); 1805 WARN_ON_ONCE(rdp->beenonline == 0);
1796 1806
@@ -1826,11 +1836,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1826 */ 1836 */
1827static void rcu_process_callbacks(struct softirq_action *unused) 1837static void rcu_process_callbacks(struct softirq_action *unused)
1828{ 1838{
1839 struct rcu_state *rsp;
1840
1829 trace_rcu_utilization("Start RCU core"); 1841 trace_rcu_utilization("Start RCU core");
1830 __rcu_process_callbacks(&rcu_sched_state, 1842 for_each_rcu_flavor(rsp)
1831 &__get_cpu_var(rcu_sched_data)); 1843 __rcu_process_callbacks(rsp);
1832 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1833 rcu_preempt_process_callbacks();
1834 trace_rcu_utilization("End RCU core"); 1844 trace_rcu_utilization("End RCU core");
1835} 1845}
1836 1846
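rcu_process_callbacks() now loops over every registered flavor instead of naming rcu_sched, rcu_bh and rcu_preempt individually. for_each_rcu_flavor() itself is not shown in this hunk; given the rcu_struct_flavors list head added above, it is presumably a thin list_for_each_entry() wrapper along these lines (a reconstruction, not the actual definition):

#define for_each_rcu_flavor(rsp) \
	list_for_each_entry((rsp), &rcu_struct_flavors, flavors)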
@@ -1857,6 +1867,56 @@ static void invoke_rcu_core(void)
1857 raise_softirq(RCU_SOFTIRQ); 1867 raise_softirq(RCU_SOFTIRQ);
1858} 1868}
1859 1869
1870/*
1871 * Handle any core-RCU processing required by a call_rcu() invocation.
1872 */
1873static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
1874 struct rcu_head *head, unsigned long flags)
1875{
1876 /*
1877 * If called from an extended quiescent state, invoke the RCU
1878 * core in order to force a re-evaluation of RCU's idleness.
1879 */
1880 if (rcu_is_cpu_idle() && cpu_online(smp_processor_id()))
1881 invoke_rcu_core();
1882
1883 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
1884 if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
1885 return;
1886
1887 /*
1888 * Force the grace period if too many callbacks or too long waiting.
1889 * Enforce hysteresis, and don't invoke force_quiescent_state()
1890 * if some other CPU has recently done so. Also, don't bother
1891 * invoking force_quiescent_state() if the newly enqueued callback
1892 * is the only one waiting for a grace period to complete.
1893 */
1894 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1895
1896 /* Are we ignoring a completed grace period? */
1897 rcu_process_gp_end(rsp, rdp);
1898 check_for_new_grace_period(rsp, rdp);
1899
1900 /* Start a new grace period if one not already started. */
1901 if (!rcu_gp_in_progress(rsp)) {
1902 unsigned long nestflag;
1903 struct rcu_node *rnp_root = rcu_get_root(rsp);
1904
1905 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
 1906 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock */
1907 } else {
1908 /* Give the grace period a kick. */
1909 rdp->blimit = LONG_MAX;
1910 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1911 *rdp->nxttail[RCU_DONE_TAIL] != head)
1912 force_quiescent_state(rsp, 0);
1913 rdp->n_force_qs_snap = rsp->n_force_qs;
1914 rdp->qlen_last_fqs_check = rdp->qlen;
1915 }
1916 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1917 force_quiescent_state(rsp, 1);
1918}
1919
1860static void 1920static void
1861__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1921__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1862 struct rcu_state *rsp, bool lazy) 1922 struct rcu_state *rsp, bool lazy)
@@ -1881,7 +1941,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1881 rdp = this_cpu_ptr(rsp->rda); 1941 rdp = this_cpu_ptr(rsp->rda);
1882 1942
1883 /* Add the callback to our list. */ 1943 /* Add the callback to our list. */
1884 rdp->qlen++; 1944 ACCESS_ONCE(rdp->qlen)++;
1885 if (lazy) 1945 if (lazy)
1886 rdp->qlen_lazy++; 1946 rdp->qlen_lazy++;
1887 else 1947 else
@@ -1896,43 +1956,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1896 else 1956 else
1897 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); 1957 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
1898 1958
1899 /* If interrupts were disabled, don't dive into RCU core. */ 1959 /* Go handle any RCU core processing required. */
1900 if (irqs_disabled_flags(flags)) { 1960 __call_rcu_core(rsp, rdp, head, flags);
1901 local_irq_restore(flags);
1902 return;
1903 }
1904
1905 /*
1906 * Force the grace period if too many callbacks or too long waiting.
1907 * Enforce hysteresis, and don't invoke force_quiescent_state()
1908 * if some other CPU has recently done so. Also, don't bother
1909 * invoking force_quiescent_state() if the newly enqueued callback
1910 * is the only one waiting for a grace period to complete.
1911 */
1912 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
1913
1914 /* Are we ignoring a completed grace period? */
1915 rcu_process_gp_end(rsp, rdp);
1916 check_for_new_grace_period(rsp, rdp);
1917
1918 /* Start a new grace period if one not already started. */
1919 if (!rcu_gp_in_progress(rsp)) {
1920 unsigned long nestflag;
1921 struct rcu_node *rnp_root = rcu_get_root(rsp);
1922
1923 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1924 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
1925 } else {
1926 /* Give the grace period a kick. */
1927 rdp->blimit = LONG_MAX;
1928 if (rsp->n_force_qs == rdp->n_force_qs_snap &&
1929 *rdp->nxttail[RCU_DONE_TAIL] != head)
1930 force_quiescent_state(rsp, 0);
1931 rdp->n_force_qs_snap = rsp->n_force_qs;
1932 rdp->qlen_last_fqs_check = rdp->qlen;
1933 }
1934 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1935 force_quiescent_state(rsp, 1);
1936 local_irq_restore(flags); 1961 local_irq_restore(flags);
1937} 1962}
1938 1963
@@ -1962,28 +1987,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1962 * occasionally incorrectly indicate that there are multiple CPUs online 1987 * occasionally incorrectly indicate that there are multiple CPUs online
1963 * when there was in fact only one the whole time, as this just adds 1988 * when there was in fact only one the whole time, as this just adds
1964 * some overhead: RCU still operates correctly. 1989 * some overhead: RCU still operates correctly.
1965 *
1966 * Of course, sampling num_online_cpus() with preemption enabled can
1967 * give erroneous results if there are concurrent CPU-hotplug operations.
1968 * For example, given a demonic sequence of preemptions in num_online_cpus()
1969 * and CPU-hotplug operations, there could be two or more CPUs online at
1970 * all times, but num_online_cpus() might well return one (or even zero).
1971 *
1972 * However, all such demonic sequences require at least one CPU-offline
1973 * operation. Furthermore, rcu_blocking_is_gp() giving the wrong answer
1974 * is only a problem if there is an RCU read-side critical section executing
1975 * throughout. But RCU-sched and RCU-bh read-side critical sections
1976 * disable either preemption or bh, which prevents a CPU from going offline.
1977 * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
1978 * that there is only one CPU when in fact there was more than one throughout
1979 * is when there were no RCU readers in the system. If there are no
1980 * RCU readers, the grace period by definition can be of zero length,
1981 * regardless of the number of online CPUs.
1982 */ 1990 */
1983static inline int rcu_blocking_is_gp(void) 1991static inline int rcu_blocking_is_gp(void)
1984{ 1992{
1993 int ret;
1994
1985 might_sleep(); /* Check for RCU read-side critical section. */ 1995 might_sleep(); /* Check for RCU read-side critical section. */
1986 return num_online_cpus() <= 1; 1996 preempt_disable();
1997 ret = num_online_cpus() <= 1;
1998 preempt_enable();
1999 return ret;
1987} 2000}
1988 2001
1989/** 2002/**
@@ -2118,9 +2131,9 @@ void synchronize_sched_expedited(void)
2118 put_online_cpus(); 2131 put_online_cpus();
2119 2132
2120 /* No joy, try again later. Or just synchronize_sched(). */ 2133 /* No joy, try again later. Or just synchronize_sched(). */
2121 if (trycount++ < 10) 2134 if (trycount++ < 10) {
2122 udelay(trycount * num_online_cpus()); 2135 udelay(trycount * num_online_cpus());
2123 else { 2136 } else {
2124 synchronize_sched(); 2137 synchronize_sched();
2125 return; 2138 return;
2126 } 2139 }
@@ -2241,9 +2254,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2241 */ 2254 */
2242static int rcu_pending(int cpu) 2255static int rcu_pending(int cpu)
2243{ 2256{
2244 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || 2257 struct rcu_state *rsp;
2245 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || 2258
2246 rcu_preempt_pending(cpu); 2259 for_each_rcu_flavor(rsp)
2260 if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
2261 return 1;
2262 return 0;
2247} 2263}
2248 2264
2249/* 2265/*
@@ -2253,20 +2269,41 @@ static int rcu_pending(int cpu)
2253 */ 2269 */
2254static int rcu_cpu_has_callbacks(int cpu) 2270static int rcu_cpu_has_callbacks(int cpu)
2255{ 2271{
2272 struct rcu_state *rsp;
2273
2256 /* RCU callbacks either ready or pending? */ 2274 /* RCU callbacks either ready or pending? */
2257 return per_cpu(rcu_sched_data, cpu).nxtlist || 2275 for_each_rcu_flavor(rsp)
2258 per_cpu(rcu_bh_data, cpu).nxtlist || 2276 if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
2259 rcu_preempt_cpu_has_callbacks(cpu); 2277 return 1;
2278 return 0;
2279}
2280
2281/*
2282 * Helper function for _rcu_barrier() tracing. If tracing is disabled,
2283 * the compiler is expected to optimize this away.
2284 */
2285static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
2286 int cpu, unsigned long done)
2287{
2288 trace_rcu_barrier(rsp->name, s, cpu,
2289 atomic_read(&rsp->barrier_cpu_count), done);
2260} 2290}
2261 2291
2262/* 2292/*
2263 * RCU callback function for _rcu_barrier(). If we are last, wake 2293 * RCU callback function for _rcu_barrier(). If we are last, wake
2264 * up the task executing _rcu_barrier(). 2294 * up the task executing _rcu_barrier().
2265 */ 2295 */
2266static void rcu_barrier_callback(struct rcu_head *notused) 2296static void rcu_barrier_callback(struct rcu_head *rhp)
2267{ 2297{
2268 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2298 struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
2269 complete(&rcu_barrier_completion); 2299 struct rcu_state *rsp = rdp->rsp;
2300
2301 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
2302 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
2303 complete(&rsp->barrier_completion);
2304 } else {
2305 _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
2306 }
2270} 2307}
2271 2308
2272/* 2309/*
@@ -2274,35 +2311,63 @@ static void rcu_barrier_callback(struct rcu_head *notused)
2274 */ 2311 */
2275static void rcu_barrier_func(void *type) 2312static void rcu_barrier_func(void *type)
2276{ 2313{
2277 int cpu = smp_processor_id(); 2314 struct rcu_state *rsp = type;
2278 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); 2315 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
2279 void (*call_rcu_func)(struct rcu_head *head,
2280 void (*func)(struct rcu_head *head));
2281 2316
2282 atomic_inc(&rcu_barrier_cpu_count); 2317 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
2283 call_rcu_func = type; 2318 atomic_inc(&rsp->barrier_cpu_count);
2284 call_rcu_func(head, rcu_barrier_callback); 2319 rsp->call(&rdp->barrier_head, rcu_barrier_callback);
2285} 2320}
2286 2321
2287/* 2322/*
2288 * Orchestrate the specified type of RCU barrier, waiting for all 2323 * Orchestrate the specified type of RCU barrier, waiting for all
2289 * RCU callbacks of the specified type to complete. 2324 * RCU callbacks of the specified type to complete.
2290 */ 2325 */
2291static void _rcu_barrier(struct rcu_state *rsp, 2326static void _rcu_barrier(struct rcu_state *rsp)
2292 void (*call_rcu_func)(struct rcu_head *head,
2293 void (*func)(struct rcu_head *head)))
2294{ 2327{
2295 int cpu; 2328 int cpu;
2296 unsigned long flags; 2329 unsigned long flags;
2297 struct rcu_data *rdp; 2330 struct rcu_data *rdp;
2298 struct rcu_head rh; 2331 struct rcu_data rd;
2332 unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
2333 unsigned long snap_done;
2299 2334
2300 init_rcu_head_on_stack(&rh); 2335 init_rcu_head_on_stack(&rd.barrier_head);
2336 _rcu_barrier_trace(rsp, "Begin", -1, snap);
2301 2337
2302 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 2338 /* Take mutex to serialize concurrent rcu_barrier() requests. */
2303 mutex_lock(&rcu_barrier_mutex); 2339 mutex_lock(&rsp->barrier_mutex);
2340
2341 /*
2342 * Ensure that all prior references, including to ->n_barrier_done,
2343 * are ordered before the _rcu_barrier() machinery.
2344 */
2345 smp_mb(); /* See above block comment. */
2346
2347 /*
2348 * Recheck ->n_barrier_done to see if others did our work for us.
2349 * This means checking ->n_barrier_done for an even-to-odd-to-even
2350 * transition. The "if" expression below therefore rounds the old
2351 * value up to the next even number and adds two before comparing.
2352 */
2353 snap_done = ACCESS_ONCE(rsp->n_barrier_done);
2354 _rcu_barrier_trace(rsp, "Check", -1, snap_done);
2355 if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
2356 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
2357 smp_mb(); /* caller's subsequent code after above check. */
2358 mutex_unlock(&rsp->barrier_mutex);
2359 return;
2360 }
2304 2361
2305 smp_mb(); /* Prevent any prior operations from leaking in. */ 2362 /*
2363 * Increment ->n_barrier_done to avoid duplicate work. Use
2364 * ACCESS_ONCE() to prevent the compiler from speculating
2365 * the increment to precede the early-exit check.
2366 */
2367 ACCESS_ONCE(rsp->n_barrier_done)++;
2368 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
2369 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
2370 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
2306 2371
2307 /* 2372 /*
2308 * Initialize the count to one rather than to zero in order to 2373 * Initialize the count to one rather than to zero in order to
@@ -2321,8 +2386,8 @@ static void _rcu_barrier(struct rcu_state *rsp,
2321 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening 2386 * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
2322 * us -- but before CPU 1's orphaned callbacks are invoked!!! 2387 * us -- but before CPU 1's orphaned callbacks are invoked!!!
2323 */ 2388 */
2324 init_completion(&rcu_barrier_completion); 2389 init_completion(&rsp->barrier_completion);
2325 atomic_set(&rcu_barrier_cpu_count, 1); 2390 atomic_set(&rsp->barrier_cpu_count, 1);
2326 raw_spin_lock_irqsave(&rsp->onofflock, flags); 2391 raw_spin_lock_irqsave(&rsp->onofflock, flags);
2327 rsp->rcu_barrier_in_progress = current; 2392 rsp->rcu_barrier_in_progress = current;
2328 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2393 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
@@ -2338,14 +2403,19 @@ static void _rcu_barrier(struct rcu_state *rsp,
2338 preempt_disable(); 2403 preempt_disable();
2339 rdp = per_cpu_ptr(rsp->rda, cpu); 2404 rdp = per_cpu_ptr(rsp->rda, cpu);
2340 if (cpu_is_offline(cpu)) { 2405 if (cpu_is_offline(cpu)) {
2406 _rcu_barrier_trace(rsp, "Offline", cpu,
2407 rsp->n_barrier_done);
2341 preempt_enable(); 2408 preempt_enable();
2342 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) 2409 while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
2343 schedule_timeout_interruptible(1); 2410 schedule_timeout_interruptible(1);
2344 } else if (ACCESS_ONCE(rdp->qlen)) { 2411 } else if (ACCESS_ONCE(rdp->qlen)) {
2345 smp_call_function_single(cpu, rcu_barrier_func, 2412 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2346 (void *)call_rcu_func, 1); 2413 rsp->n_barrier_done);
2414 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
2347 preempt_enable(); 2415 preempt_enable();
2348 } else { 2416 } else {
2417 _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
2418 rsp->n_barrier_done);
2349 preempt_enable(); 2419 preempt_enable();
2350 } 2420 }
2351 } 2421 }
@@ -2362,24 +2432,32 @@ static void _rcu_barrier(struct rcu_state *rsp,
2362 rcu_adopt_orphan_cbs(rsp); 2432 rcu_adopt_orphan_cbs(rsp);
2363 rsp->rcu_barrier_in_progress = NULL; 2433 rsp->rcu_barrier_in_progress = NULL;
2364 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2434 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2365 atomic_inc(&rcu_barrier_cpu_count); 2435 atomic_inc(&rsp->barrier_cpu_count);
2366 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ 2436 smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
2367 call_rcu_func(&rh, rcu_barrier_callback); 2437 rd.rsp = rsp;
2438 rsp->call(&rd.barrier_head, rcu_barrier_callback);
2368 2439
2369 /* 2440 /*
2370 * Now that we have an rcu_barrier_callback() callback on each 2441 * Now that we have an rcu_barrier_callback() callback on each
2371 * CPU, and thus each counted, remove the initial count. 2442 * CPU, and thus each counted, remove the initial count.
2372 */ 2443 */
2373 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 2444 if (atomic_dec_and_test(&rsp->barrier_cpu_count))
2374 complete(&rcu_barrier_completion); 2445 complete(&rsp->barrier_completion);
2446
2447 /* Increment ->n_barrier_done to prevent duplicate work. */
2448 smp_mb(); /* Keep increment after above mechanism. */
2449 ACCESS_ONCE(rsp->n_barrier_done)++;
2450 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
2451 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
2452 smp_mb(); /* Keep increment before caller's subsequent code. */
2375 2453
2376 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ 2454 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
2377 wait_for_completion(&rcu_barrier_completion); 2455 wait_for_completion(&rsp->barrier_completion);
2378 2456
2379 /* Other rcu_barrier() invocations can now safely proceed. */ 2457 /* Other rcu_barrier() invocations can now safely proceed. */
2380 mutex_unlock(&rcu_barrier_mutex); 2458 mutex_unlock(&rsp->barrier_mutex);
2381 2459
2382 destroy_rcu_head_on_stack(&rh); 2460 destroy_rcu_head_on_stack(&rd.barrier_head);
2383} 2461}
2384 2462
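
In the rewritten _rcu_barrier() above, ->n_barrier_done is even while no barrier is in flight and odd while one is running, so a caller whose snapshot has since been overtaken by a complete even-to-odd-to-even cycle can exit early. Here is a standalone sketch of just that check, reusing the rounding expression from the hunk; the helper name, the signed-difference ULONG_CMP_GE stand-in, and the sample values are made up:

#include <stdio.h>

/* Signed-difference comparison so the test still behaves sensibly
 * across counter wraparound (stand-in for the kernel's ULONG_CMP_GE). */
#define ULONG_CMP_GE(a, b) ((long)((a) - (b)) >= 0)

/* A later _rcu_barrier() may skip its own pass if the counter has moved
 * through a full even->odd->even cycle since "snap": round snap up to
 * the next even value and require two further increments. */
static int barrier_already_done(unsigned long snap, unsigned long now)
{
	return ULONG_CMP_GE(now, ((snap + 1) & ~0x1UL) + 2);
}

int main(void)
{
	printf("%d\n", barrier_already_done(4, 5)); /* 0: barrier still running */
	printf("%d\n", barrier_already_done(4, 6)); /* 1: a full barrier finished */
	printf("%d\n", barrier_already_done(5, 6)); /* 0: snapshot taken mid-barrier */
	return 0;
}
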
2385/** 2463/**
@@ -2387,7 +2465,7 @@ static void _rcu_barrier(struct rcu_state *rsp,
2387 */ 2465 */
2388void rcu_barrier_bh(void) 2466void rcu_barrier_bh(void)
2389{ 2467{
2390 _rcu_barrier(&rcu_bh_state, call_rcu_bh); 2468 _rcu_barrier(&rcu_bh_state);
2391} 2469}
2392EXPORT_SYMBOL_GPL(rcu_barrier_bh); 2470EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2393 2471
@@ -2396,7 +2474,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh);
2396 */ 2474 */
2397void rcu_barrier_sched(void) 2475void rcu_barrier_sched(void)
2398{ 2476{
2399 _rcu_barrier(&rcu_sched_state, call_rcu_sched); 2477 _rcu_barrier(&rcu_sched_state);
2400} 2478}
2401EXPORT_SYMBOL_GPL(rcu_barrier_sched); 2479EXPORT_SYMBOL_GPL(rcu_barrier_sched);
2402 2480
@@ -2407,18 +2485,15 @@ static void __init
2407rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) 2485rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2408{ 2486{
2409 unsigned long flags; 2487 unsigned long flags;
2410 int i;
2411 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2488 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2412 struct rcu_node *rnp = rcu_get_root(rsp); 2489 struct rcu_node *rnp = rcu_get_root(rsp);
2413 2490
2414 /* Set up local state, ensuring consistent view of global state. */ 2491 /* Set up local state, ensuring consistent view of global state. */
2415 raw_spin_lock_irqsave(&rnp->lock, flags); 2492 raw_spin_lock_irqsave(&rnp->lock, flags);
2416 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 2493 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
2417 rdp->nxtlist = NULL; 2494 init_callback_list(rdp);
2418 for (i = 0; i < RCU_NEXT_SIZE; i++)
2419 rdp->nxttail[i] = &rdp->nxtlist;
2420 rdp->qlen_lazy = 0; 2495 rdp->qlen_lazy = 0;
2421 rdp->qlen = 0; 2496 ACCESS_ONCE(rdp->qlen) = 0;
2422 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2497 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2423 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 2498 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2424 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2499 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -2492,9 +2567,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2492 2567
2493static void __cpuinit rcu_prepare_cpu(int cpu) 2568static void __cpuinit rcu_prepare_cpu(int cpu)
2494{ 2569{
2495 rcu_init_percpu_data(cpu, &rcu_sched_state, 0); 2570 struct rcu_state *rsp;
2496 rcu_init_percpu_data(cpu, &rcu_bh_state, 0); 2571
2497 rcu_preempt_init_percpu_data(cpu); 2572 for_each_rcu_flavor(rsp)
2573 rcu_init_percpu_data(cpu, rsp,
2574 strcmp(rsp->name, "rcu_preempt") == 0);
2498} 2575}
2499 2576
2500/* 2577/*
@@ -2506,6 +2583,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2506 long cpu = (long)hcpu; 2583 long cpu = (long)hcpu;
2507 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2584 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2508 struct rcu_node *rnp = rdp->mynode; 2585 struct rcu_node *rnp = rdp->mynode;
2586 struct rcu_state *rsp;
2509 2587
2510 trace_rcu_utilization("Start CPU hotplug"); 2588 trace_rcu_utilization("Start CPU hotplug");
2511 switch (action) { 2589 switch (action) {
@@ -2530,18 +2608,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2530 * touch any data without introducing corruption. We send the 2608 * touch any data without introducing corruption. We send the
2531 * dying CPU's callbacks to an arbitrarily chosen online CPU. 2609 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2532 */ 2610 */
2533 rcu_cleanup_dying_cpu(&rcu_bh_state); 2611 for_each_rcu_flavor(rsp)
2534 rcu_cleanup_dying_cpu(&rcu_sched_state); 2612 rcu_cleanup_dying_cpu(rsp);
2535 rcu_preempt_cleanup_dying_cpu();
2536 rcu_cleanup_after_idle(cpu); 2613 rcu_cleanup_after_idle(cpu);
2537 break; 2614 break;
2538 case CPU_DEAD: 2615 case CPU_DEAD:
2539 case CPU_DEAD_FROZEN: 2616 case CPU_DEAD_FROZEN:
2540 case CPU_UP_CANCELED: 2617 case CPU_UP_CANCELED:
2541 case CPU_UP_CANCELED_FROZEN: 2618 case CPU_UP_CANCELED_FROZEN:
2542 rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); 2619 for_each_rcu_flavor(rsp)
2543 rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); 2620 rcu_cleanup_dead_cpu(cpu, rsp);
2544 rcu_preempt_cleanup_dead_cpu(cpu);
2545 break; 2621 break;
2546 default: 2622 default:
2547 break; 2623 break;
@@ -2574,9 +2650,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2574{ 2650{
2575 int i; 2651 int i;
2576 2652
2577 for (i = NUM_RCU_LVLS - 1; i > 0; i--) 2653 for (i = rcu_num_lvls - 1; i > 0; i--)
2578 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 2654 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
2579 rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF; 2655 rsp->levelspread[0] = rcu_fanout_leaf;
2580} 2656}
2581#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 2657#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
2582static void __init rcu_init_levelspread(struct rcu_state *rsp) 2658static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -2586,7 +2662,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
2586 int i; 2662 int i;
2587 2663
2588 cprv = NR_CPUS; 2664 cprv = NR_CPUS;
2589 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { 2665 for (i = rcu_num_lvls - 1; i >= 0; i--) {
2590 ccur = rsp->levelcnt[i]; 2666 ccur = rsp->levelcnt[i];
2591 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 2667 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
2592 cprv = ccur; 2668 cprv = ccur;
@@ -2613,13 +2689,15 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2613 2689
2614 /* Initialize the level-tracking arrays. */ 2690 /* Initialize the level-tracking arrays. */
2615 2691
2616 for (i = 1; i < NUM_RCU_LVLS; i++) 2692 for (i = 0; i < rcu_num_lvls; i++)
2693 rsp->levelcnt[i] = num_rcu_lvl[i];
2694 for (i = 1; i < rcu_num_lvls; i++)
2617 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 2695 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
2618 rcu_init_levelspread(rsp); 2696 rcu_init_levelspread(rsp);
2619 2697
2620 /* Initialize the elements themselves, starting from the leaves. */ 2698 /* Initialize the elements themselves, starting from the leaves. */
2621 2699
2622 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { 2700 for (i = rcu_num_lvls - 1; i >= 0; i--) {
2623 cpustride *= rsp->levelspread[i]; 2701 cpustride *= rsp->levelspread[i];
2624 rnp = rsp->level[i]; 2702 rnp = rsp->level[i];
2625 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 2703 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
@@ -2649,13 +2727,74 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2649 } 2727 }
2650 2728
2651 rsp->rda = rda; 2729 rsp->rda = rda;
2652 rnp = rsp->level[NUM_RCU_LVLS - 1]; 2730 rnp = rsp->level[rcu_num_lvls - 1];
2653 for_each_possible_cpu(i) { 2731 for_each_possible_cpu(i) {
2654 while (i > rnp->grphi) 2732 while (i > rnp->grphi)
2655 rnp++; 2733 rnp++;
2656 per_cpu_ptr(rsp->rda, i)->mynode = rnp; 2734 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
2657 rcu_boot_init_percpu_data(i, rsp); 2735 rcu_boot_init_percpu_data(i, rsp);
2658 } 2736 }
2737 list_add(&rsp->flavors, &rcu_struct_flavors);
2738}
2739
2740/*
2741 * Compute the rcu_node tree geometry from kernel parameters. This cannot
2742 * replace the definitions in rcutree.h because those are needed to size
2743 * the ->node array in the rcu_state structure.
2744 */
2745static void __init rcu_init_geometry(void)
2746{
2747 int i;
2748 int j;
2749 int n = nr_cpu_ids;
2750 int rcu_capacity[MAX_RCU_LVLS + 1];
2751
2752 /* If the compile-time values are accurate, just leave. */
2753 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF)
2754 return;
2755
2756 /*
 2757 * Compute the number of nodes that can be handled by an rcu_node tree
 2757 * Compute the number of nodes that can be handled by an rcu_node tree
2758 * with the given number of levels. Setting rcu_capacity[0] makes
2759 * some of the arithmetic easier.
2760 */
2761 rcu_capacity[0] = 1;
2762 rcu_capacity[1] = rcu_fanout_leaf;
2763 for (i = 2; i <= MAX_RCU_LVLS; i++)
2764 rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
2765
2766 /*
2767 * The boot-time rcu_fanout_leaf parameter is only permitted
2768 * to increase the leaf-level fanout, not decrease it. Of course,
2769 * the leaf-level fanout cannot exceed the number of bits in
2770 * the rcu_node masks. Finally, the tree must be able to accommodate
2771 * the configured number of CPUs. Complain and fall back to the
2772 * compile-time values if these limits are exceeded.
2773 */
2774 if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
2775 rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
2776 n > rcu_capacity[MAX_RCU_LVLS]) {
2777 WARN_ON(1);
2778 return;
2779 }
2780
2781 /* Calculate the number of rcu_nodes at each level of the tree. */
2782 for (i = 1; i <= MAX_RCU_LVLS; i++)
2783 if (n <= rcu_capacity[i]) {
2784 for (j = 0; j <= i; j++)
2785 num_rcu_lvl[j] =
2786 DIV_ROUND_UP(n, rcu_capacity[i - j]);
2787 rcu_num_lvls = i;
2788 for (j = i + 1; j <= MAX_RCU_LVLS; j++)
2789 num_rcu_lvl[j] = 0;
2790 break;
2791 }
2792
2793 /* Calculate the total number of rcu_node structures. */
2794 rcu_num_nodes = 0;
2795 for (i = 0; i <= MAX_RCU_LVLS; i++)
2796 rcu_num_nodes += num_rcu_lvl[i];
2797 rcu_num_nodes -= n;
2659} 2798}
2660 2799
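
rcu_init_geometry() above recomputes the rcu_node tree shape at boot from nr_cpu_ids and the rcu_fanout_leaf parameter rather than trusting only the compile-time constants. The userspace model below walks through the same capacity and per-level arithmetic; CONFIG_RCU_FANOUT, MAX_RCU_LVLS and the CPU counts are assumed example values, not taken from any particular configuration:

#include <stdio.h>

#define MAX_RCU_LVLS   4
#define RCU_FANOUT     64   /* stand-in for CONFIG_RCU_FANOUT */

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Given the CPU count and the leaf fanout, pick the shallowest tree
 * that fits and size each level, as rcu_init_geometry() does. */
static void compute_geometry(int n, int fanout_leaf)
{
	unsigned long cap[MAX_RCU_LVLS + 1];
	int num_lvl[MAX_RCU_LVLS + 1] = { 0 };
	int nodes = 0, lvls = 0, i, j;

	cap[0] = 1;
	cap[1] = fanout_leaf;
	for (i = 2; i <= MAX_RCU_LVLS; i++)
		cap[i] = cap[i - 1] * RCU_FANOUT;

	for (i = 1; i <= MAX_RCU_LVLS; i++)
		if ((unsigned long)n <= cap[i]) {
			for (j = 0; j <= i; j++)
				num_lvl[j] = DIV_ROUND_UP(n, cap[i - j]);
			lvls = i;
			break;
		}

	for (i = 0; i <= MAX_RCU_LVLS; i++)
		nodes += num_lvl[i];
	nodes -= n;	/* the bottom "level" is the CPUs themselves */

	printf("%d CPUs, leaf fanout %d: %d level(s), %d rcu_node(s)\n",
	       n, fanout_leaf, lvls, nodes);
}

int main(void)
{
	compute_geometry(16, 16);	/* fits in a single leaf node */
	compute_geometry(4096, 16);	/* needs a three-level tree */
	return 0;
}
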
2661void __init rcu_init(void) 2800void __init rcu_init(void)
@@ -2663,6 +2802,7 @@ void __init rcu_init(void)
2663 int cpu; 2802 int cpu;
2664 2803
2665 rcu_bootup_announce(); 2804 rcu_bootup_announce();
2805 rcu_init_geometry();
2666 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 2806 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
2667 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 2807 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
2668 __rcu_init_preempt(); 2808 __rcu_init_preempt();
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 19b61ac1079f..4d29169f2124 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -42,28 +42,28 @@
42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) 42#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT)
43 43
44#if NR_CPUS <= RCU_FANOUT_1 44#if NR_CPUS <= RCU_FANOUT_1
45# define NUM_RCU_LVLS 1 45# define RCU_NUM_LVLS 1
46# define NUM_RCU_LVL_0 1 46# define NUM_RCU_LVL_0 1
47# define NUM_RCU_LVL_1 (NR_CPUS) 47# define NUM_RCU_LVL_1 (NR_CPUS)
48# define NUM_RCU_LVL_2 0 48# define NUM_RCU_LVL_2 0
49# define NUM_RCU_LVL_3 0 49# define NUM_RCU_LVL_3 0
50# define NUM_RCU_LVL_4 0 50# define NUM_RCU_LVL_4 0
51#elif NR_CPUS <= RCU_FANOUT_2 51#elif NR_CPUS <= RCU_FANOUT_2
52# define NUM_RCU_LVLS 2 52# define RCU_NUM_LVLS 2
53# define NUM_RCU_LVL_0 1 53# define NUM_RCU_LVL_0 1
54# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 54# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
55# define NUM_RCU_LVL_2 (NR_CPUS) 55# define NUM_RCU_LVL_2 (NR_CPUS)
56# define NUM_RCU_LVL_3 0 56# define NUM_RCU_LVL_3 0
57# define NUM_RCU_LVL_4 0 57# define NUM_RCU_LVL_4 0
58#elif NR_CPUS <= RCU_FANOUT_3 58#elif NR_CPUS <= RCU_FANOUT_3
59# define NUM_RCU_LVLS 3 59# define RCU_NUM_LVLS 3
60# define NUM_RCU_LVL_0 1 60# define NUM_RCU_LVL_0 1
61# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 61# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
62# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 62# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
63# define NUM_RCU_LVL_3 (NR_CPUS) 63# define NUM_RCU_LVL_3 (NR_CPUS)
64# define NUM_RCU_LVL_4 0 64# define NUM_RCU_LVL_4 0
65#elif NR_CPUS <= RCU_FANOUT_4 65#elif NR_CPUS <= RCU_FANOUT_4
66# define NUM_RCU_LVLS 4 66# define RCU_NUM_LVLS 4
67# define NUM_RCU_LVL_0 1 67# define NUM_RCU_LVL_0 1
68# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) 68# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
69# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 69# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
@@ -76,6 +76,9 @@
76#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) 76#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
77#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) 77#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
78 78
79extern int rcu_num_lvls;
80extern int rcu_num_nodes;
81
79/* 82/*
80 * Dynticks per-CPU state. 83 * Dynticks per-CPU state.
81 */ 84 */
@@ -97,6 +100,7 @@ struct rcu_dynticks {
97 /* # times non-lazy CBs posted to CPU. */ 100 /* # times non-lazy CBs posted to CPU. */
98 unsigned long nonlazy_posted_snap; 101 unsigned long nonlazy_posted_snap;
99 /* idle-period nonlazy_posted snapshot. */ 102 /* idle-period nonlazy_posted snapshot. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
100#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
101}; 105};
102 106
@@ -206,7 +210,7 @@ struct rcu_node {
206 */ 210 */
207#define rcu_for_each_node_breadth_first(rsp, rnp) \ 211#define rcu_for_each_node_breadth_first(rsp, rnp) \
208 for ((rnp) = &(rsp)->node[0]; \ 212 for ((rnp) = &(rsp)->node[0]; \
209 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 213 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
210 214
211/* 215/*
212 * Do a breadth-first scan of the non-leaf rcu_node structures for the 216 * Do a breadth-first scan of the non-leaf rcu_node structures for the
@@ -215,7 +219,7 @@ struct rcu_node {
215 */ 219 */
216#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ 220#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \
217 for ((rnp) = &(rsp)->node[0]; \ 221 for ((rnp) = &(rsp)->node[0]; \
218 (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) 222 (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++)
219 223
220/* 224/*
221 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state 225 * Scan the leaves of the rcu_node hierarchy for the specified rcu_state
@@ -224,8 +228,8 @@ struct rcu_node {
224 * It is still a leaf node, even if it is also the root node. 228 * It is still a leaf node, even if it is also the root node.
225 */ 229 */
226#define rcu_for_each_leaf_node(rsp, rnp) \ 230#define rcu_for_each_leaf_node(rsp, rnp) \
227 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ 231 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
228 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) 232 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
229 233
230/* Index values for nxttail array in struct rcu_data. */ 234/* Index values for nxttail array in struct rcu_data. */
231#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 235#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
@@ -311,6 +315,9 @@ struct rcu_data {
311 unsigned long n_rp_need_fqs; 315 unsigned long n_rp_need_fqs;
312 unsigned long n_rp_need_nothing; 316 unsigned long n_rp_need_nothing;
313 317
318 /* 6) _rcu_barrier() callback. */
319 struct rcu_head barrier_head;
320
314 int cpu; 321 int cpu;
315 struct rcu_state *rsp; 322 struct rcu_state *rsp;
316}; 323};
@@ -357,10 +364,12 @@ do { \
357 */ 364 */
358struct rcu_state { 365struct rcu_state {
359 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ 366 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
360 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ 367 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
361 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 368 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
362 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ 369 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
 363 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ 370 struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */
371 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
372 void (*func)(struct rcu_head *head));
364 373
365 /* The following fields are guarded by the root rcu_node's lock. */ 374 /* The following fields are guarded by the root rcu_node's lock. */
366 375
@@ -392,6 +401,11 @@ struct rcu_state {
392 struct task_struct *rcu_barrier_in_progress; 401 struct task_struct *rcu_barrier_in_progress;
393 /* Task doing rcu_barrier(), */ 402 /* Task doing rcu_barrier(), */
394 /* or NULL if no barrier. */ 403 /* or NULL if no barrier. */
404 struct mutex barrier_mutex; /* Guards barrier fields. */
405 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
406 struct completion barrier_completion; /* Wake at barrier end. */
407 unsigned long n_barrier_done; /* ++ at start and end of */
408 /* _rcu_barrier(). */
395 raw_spinlock_t fqslock; /* Only one task forcing */ 409 raw_spinlock_t fqslock; /* Only one task forcing */
396 /* quiescent states. */ 410 /* quiescent states. */
397 unsigned long jiffies_force_qs; /* Time at which to invoke */ 411 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -409,8 +423,13 @@ struct rcu_state {
409 unsigned long gp_max; /* Maximum GP duration in */ 423 unsigned long gp_max; /* Maximum GP duration in */
410 /* jiffies. */ 424 /* jiffies. */
411 char *name; /* Name of structure. */ 425 char *name; /* Name of structure. */
426 struct list_head flavors; /* List of RCU flavors. */
412}; 427};
413 428
429extern struct list_head rcu_struct_flavors;
430#define for_each_rcu_flavor(rsp) \
431 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
432
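
The new for_each_rcu_flavor() iterator lets common code visit every registered rcu_state instead of naming rcu_sched, rcu_bh and rcu_preempt one by one, which is what the rcu_pending(), rcu_cpu_has_callbacks() and CPU-hotplug hunks earlier in this patch rely on. A self-contained model of the idea, with minimal stand-ins for the kernel's intrusive list and container_of():

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

/* Model of struct rcu_state: each flavor carries a name and links
 * itself onto a global list when it is initialised. */
struct rcu_state_model {
	const char *name;
	struct list_head flavors;
};

static struct list_head rcu_struct_flavors = LIST_HEAD_INIT(rcu_struct_flavors);

#define for_each_rcu_flavor(rsp) \
	for ((rsp) = container_of(rcu_struct_flavors.next, \
				  struct rcu_state_model, flavors); \
	     &(rsp)->flavors != &rcu_struct_flavors; \
	     (rsp) = container_of((rsp)->flavors.next, \
				  struct rcu_state_model, flavors))

int main(void)
{
	struct rcu_state_model sched = { "rcu_sched" };
	struct rcu_state_model bh = { "rcu_bh" };
	struct rcu_state_model *rsp;

	list_add(&sched.flavors, &rcu_struct_flavors);
	list_add(&bh.flavors, &rcu_struct_flavors);

	/* One loop now covers every registered flavor, replacing the
	 * per-flavor calls that used to be spelled out by hand. */
	for_each_rcu_flavor(rsp)
		printf("flavor: %s\n", rsp->name);
	return 0;
}
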
414/* Return values for rcu_preempt_offline_tasks(). */ 433/* Return values for rcu_preempt_offline_tasks(). */
415 434
416#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ 435#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */
@@ -453,25 +472,18 @@ static void rcu_stop_cpu_kthread(int cpu);
453#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 472#endif /* #ifdef CONFIG_HOTPLUG_CPU */
454static void rcu_print_detail_task_stall(struct rcu_state *rsp); 473static void rcu_print_detail_task_stall(struct rcu_state *rsp);
455static int rcu_print_task_stall(struct rcu_node *rnp); 474static int rcu_print_task_stall(struct rcu_node *rnp);
456static void rcu_preempt_stall_reset(void);
457static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 475static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
458#ifdef CONFIG_HOTPLUG_CPU 476#ifdef CONFIG_HOTPLUG_CPU
459static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 477static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
460 struct rcu_node *rnp, 478 struct rcu_node *rnp,
461 struct rcu_data *rdp); 479 struct rcu_data *rdp);
462#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 480#endif /* #ifdef CONFIG_HOTPLUG_CPU */
463static void rcu_preempt_cleanup_dead_cpu(int cpu);
464static void rcu_preempt_check_callbacks(int cpu); 481static void rcu_preempt_check_callbacks(int cpu);
465static void rcu_preempt_process_callbacks(void);
466void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 482void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
467#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 483#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
468static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 484static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
469 bool wake); 485 bool wake);
470#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 486#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
471static int rcu_preempt_pending(int cpu);
472static int rcu_preempt_cpu_has_callbacks(int cpu);
473static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
474static void rcu_preempt_cleanup_dying_cpu(void);
475static void __init __rcu_init_preempt(void); 487static void __init __rcu_init_preempt(void);
476static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 488static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
477static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 489static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3e4899459f3d..7f3244c0df01 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -68,17 +68,21 @@ static void __init rcu_bootup_announce_oddness(void)
68 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); 68 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
69#endif 69#endif
70#if NUM_RCU_LVL_4 != 0 70#if NUM_RCU_LVL_4 != 0
71 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); 71 printk(KERN_INFO "\tFour-level hierarchy is enabled.\n");
72#endif 72#endif
73 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
74 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
75 if (nr_cpu_ids != NR_CPUS)
76 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
73} 77}
74 78
75#ifdef CONFIG_TREE_PREEMPT_RCU 79#ifdef CONFIG_TREE_PREEMPT_RCU
76 80
77struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); 81struct rcu_state rcu_preempt_state =
82 RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
78DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
79static struct rcu_state *rcu_state = &rcu_preempt_state; 84static struct rcu_state *rcu_state = &rcu_preempt_state;
80 85
81static void rcu_read_unlock_special(struct task_struct *t);
82static int rcu_preempted_readers_exp(struct rcu_node *rnp); 86static int rcu_preempted_readers_exp(struct rcu_node *rnp);
83 87
84/* 88/*
@@ -233,18 +237,6 @@ static void rcu_preempt_note_context_switch(int cpu)
233} 237}
234 238
235/* 239/*
236 * Tree-preemptible RCU implementation for rcu_read_lock().
237 * Just increment ->rcu_read_lock_nesting, shared state will be updated
238 * if we block.
239 */
240void __rcu_read_lock(void)
241{
242 current->rcu_read_lock_nesting++;
243 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
244}
245EXPORT_SYMBOL_GPL(__rcu_read_lock);
246
247/*
248 * Check for preempted RCU readers blocking the current grace period 240 * Check for preempted RCU readers blocking the current grace period
249 * for the specified rcu_node structure. If the caller needs a reliable 241 * for the specified rcu_node structure. If the caller needs a reliable
250 * answer, it must hold the rcu_node's ->lock. 242 * answer, it must hold the rcu_node's ->lock.
@@ -310,7 +302,7 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t,
310 * notify RCU core processing or task having blocked during the RCU 302 * notify RCU core processing or task having blocked during the RCU
311 * read-side critical section. 303 * read-side critical section.
312 */ 304 */
313static noinline void rcu_read_unlock_special(struct task_struct *t) 305void rcu_read_unlock_special(struct task_struct *t)
314{ 306{
315 int empty; 307 int empty;
316 int empty_exp; 308 int empty_exp;
@@ -398,8 +390,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
398 rnp->grphi, 390 rnp->grphi,
399 !!rnp->gp_tasks); 391 !!rnp->gp_tasks);
400 rcu_report_unblock_qs_rnp(rnp, flags); 392 rcu_report_unblock_qs_rnp(rnp, flags);
401 } else 393 } else {
402 raw_spin_unlock_irqrestore(&rnp->lock, flags); 394 raw_spin_unlock_irqrestore(&rnp->lock, flags);
395 }
403 396
404#ifdef CONFIG_RCU_BOOST 397#ifdef CONFIG_RCU_BOOST
405 /* Unboost if we were boosted. */ 398 /* Unboost if we were boosted. */
@@ -418,38 +411,6 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
418 } 411 }
419} 412}
420 413
421/*
422 * Tree-preemptible RCU implementation for rcu_read_unlock().
423 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
424 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
425 * invoke rcu_read_unlock_special() to clean up after a context switch
426 * in an RCU read-side critical section and other special cases.
427 */
428void __rcu_read_unlock(void)
429{
430 struct task_struct *t = current;
431
432 if (t->rcu_read_lock_nesting != 1)
433 --t->rcu_read_lock_nesting;
434 else {
435 barrier(); /* critical section before exit code. */
436 t->rcu_read_lock_nesting = INT_MIN;
437 barrier(); /* assign before ->rcu_read_unlock_special load */
438 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
439 rcu_read_unlock_special(t);
440 barrier(); /* ->rcu_read_unlock_special load before assign */
441 t->rcu_read_lock_nesting = 0;
442 }
443#ifdef CONFIG_PROVE_LOCKING
444 {
445 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
446
447 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
448 }
449#endif /* #ifdef CONFIG_PROVE_LOCKING */
450}
451EXPORT_SYMBOL_GPL(__rcu_read_unlock);
452
453#ifdef CONFIG_RCU_CPU_STALL_VERBOSE 414#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
454 415
455/* 416/*
@@ -540,16 +501,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
540} 501}
541 502
542/* 503/*
543 * Suppress preemptible RCU's CPU stall warnings by pushing the
544 * time of the next stall-warning message comfortably far into the
545 * future.
546 */
547static void rcu_preempt_stall_reset(void)
548{
549 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
550}
551
552/*
553 * Check that the list of blocked tasks for the newly completed grace 504 * Check that the list of blocked tasks for the newly completed grace
554 * period is in fact empty. It is a serious bug to complete a grace 505 * period is in fact empty. It is a serious bug to complete a grace
555 * period that still has RCU readers blocked! This function must be 506 * period that still has RCU readers blocked! This function must be
@@ -650,14 +601,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
650#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 601#endif /* #ifdef CONFIG_HOTPLUG_CPU */
651 602
652/* 603/*
653 * Do CPU-offline processing for preemptible RCU.
654 */
655static void rcu_preempt_cleanup_dead_cpu(int cpu)
656{
657 rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state);
658}
659
660/*
661 * Check for a quiescent state from the current CPU. When a task blocks, 604 * Check for a quiescent state from the current CPU. When a task blocks,
662 * the task is recorded in the corresponding CPU's rcu_node structure, 605 * the task is recorded in the corresponding CPU's rcu_node structure,
663 * which is checked elsewhere. 606 * which is checked elsewhere.
@@ -677,15 +620,6 @@ static void rcu_preempt_check_callbacks(int cpu)
677 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 620 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
678} 621}
679 622
680/*
681 * Process callbacks for preemptible RCU.
682 */
683static void rcu_preempt_process_callbacks(void)
684{
685 __rcu_process_callbacks(&rcu_preempt_state,
686 &__get_cpu_var(rcu_preempt_data));
687}
688
689#ifdef CONFIG_RCU_BOOST 623#ifdef CONFIG_RCU_BOOST
690 624
691static void rcu_preempt_do_callbacks(void) 625static void rcu_preempt_do_callbacks(void)
@@ -824,9 +758,9 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
824 int must_wait = 0; 758 int must_wait = 0;
825 759
826 raw_spin_lock_irqsave(&rnp->lock, flags); 760 raw_spin_lock_irqsave(&rnp->lock, flags);
827 if (list_empty(&rnp->blkd_tasks)) 761 if (list_empty(&rnp->blkd_tasks)) {
828 raw_spin_unlock_irqrestore(&rnp->lock, flags); 762 raw_spin_unlock_irqrestore(&rnp->lock, flags);
829 else { 763 } else {
830 rnp->exp_tasks = rnp->blkd_tasks.next; 764 rnp->exp_tasks = rnp->blkd_tasks.next;
831 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 765 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
832 must_wait = 1; 766 must_wait = 1;
@@ -870,9 +804,9 @@ void synchronize_rcu_expedited(void)
870 * expedited grace period for us, just leave. 804 * expedited grace period for us, just leave.
871 */ 805 */
872 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { 806 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
873 if (trycount++ < 10) 807 if (trycount++ < 10) {
874 udelay(trycount * num_online_cpus()); 808 udelay(trycount * num_online_cpus());
875 else { 809 } else {
876 synchronize_rcu(); 810 synchronize_rcu();
877 return; 811 return;
878 } 812 }
@@ -917,51 +851,16 @@ mb_ret:
917} 851}
918EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 852EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
919 853
920/*
921 * Check to see if there is any immediate preemptible-RCU-related work
922 * to be done.
923 */
924static int rcu_preempt_pending(int cpu)
925{
926 return __rcu_pending(&rcu_preempt_state,
927 &per_cpu(rcu_preempt_data, cpu));
928}
929
930/*
931 * Does preemptible RCU have callbacks on this CPU?
932 */
933static int rcu_preempt_cpu_has_callbacks(int cpu)
934{
935 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
936}
937
938/** 854/**
939 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 855 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
940 */ 856 */
941void rcu_barrier(void) 857void rcu_barrier(void)
942{ 858{
943 _rcu_barrier(&rcu_preempt_state, call_rcu); 859 _rcu_barrier(&rcu_preempt_state);
944} 860}
945EXPORT_SYMBOL_GPL(rcu_barrier); 861EXPORT_SYMBOL_GPL(rcu_barrier);
946 862
947/* 863/*
948 * Initialize preemptible RCU's per-CPU data.
949 */
950static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
951{
952 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
953}
954
955/*
956 * Move preemptible RCU's callbacks from dying CPU to other online CPU
957 * and record a quiescent state.
958 */
959static void rcu_preempt_cleanup_dying_cpu(void)
960{
961 rcu_cleanup_dying_cpu(&rcu_preempt_state);
962}
963
964/*
965 * Initialize preemptible RCU's state structures. 864 * Initialize preemptible RCU's state structures.
966 */ 865 */
967static void __init __rcu_init_preempt(void) 866static void __init __rcu_init_preempt(void)
@@ -1046,14 +945,6 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
1046} 945}
1047 946
1048/* 947/*
1049 * Because preemptible RCU does not exist, there is no need to suppress
1050 * its CPU stall warnings.
1051 */
1052static void rcu_preempt_stall_reset(void)
1053{
1054}
1055
1056/*
1057 * Because there is no preemptible RCU, there can be no readers blocked, 948 * Because there is no preemptible RCU, there can be no readers blocked,
1058 * so there is no need to check for blocked tasks. So check only for 949 * so there is no need to check for blocked tasks. So check only for
1059 * bogus qsmask values. 950 * bogus qsmask values.
@@ -1081,14 +972,6 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1081#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 972#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1082 973
1083/* 974/*
1084 * Because preemptible RCU does not exist, it never needs CPU-offline
1085 * processing.
1086 */
1087static void rcu_preempt_cleanup_dead_cpu(int cpu)
1088{
1089}
1090
1091/*
1092 * Because preemptible RCU does not exist, it never has any callbacks 975 * Because preemptible RCU does not exist, it never has any callbacks
1093 * to check. 976 * to check.
1094 */ 977 */
@@ -1097,14 +980,6 @@ static void rcu_preempt_check_callbacks(int cpu)
1097} 980}
1098 981
1099/* 982/*
1100 * Because preemptible RCU does not exist, it never has any callbacks
1101 * to process.
1102 */
1103static void rcu_preempt_process_callbacks(void)
1104{
1105}
1106
1107/*
1108 * Queue an RCU callback for lazy invocation after a grace period. 983 * Queue an RCU callback for lazy invocation after a grace period.
1109 * This will likely be later named something like "call_rcu_lazy()", 984 * This will likely be later named something like "call_rcu_lazy()",
1110 * but this change will require some way of tagging the lazy RCU 985 * but this change will require some way of tagging the lazy RCU
@@ -1145,22 +1020,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1145#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1020#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1146 1021
1147/* 1022/*
1148 * Because preemptible RCU does not exist, it never has any work to do.
1149 */
1150static int rcu_preempt_pending(int cpu)
1151{
1152 return 0;
1153}
1154
1155/*
1156 * Because preemptible RCU does not exist, it never has callbacks
1157 */
1158static int rcu_preempt_cpu_has_callbacks(int cpu)
1159{
1160 return 0;
1161}
1162
1163/*
1164 * Because preemptible RCU does not exist, rcu_barrier() is just 1023 * Because preemptible RCU does not exist, rcu_barrier() is just
1165 * another name for rcu_barrier_sched(). 1024 * another name for rcu_barrier_sched().
1166 */ 1025 */
@@ -1171,21 +1030,6 @@ void rcu_barrier(void)
1171EXPORT_SYMBOL_GPL(rcu_barrier); 1030EXPORT_SYMBOL_GPL(rcu_barrier);
1172 1031
1173/* 1032/*
1174 * Because preemptible RCU does not exist, there is no per-CPU
1175 * data to initialize.
1176 */
1177static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1178{
1179}
1180
1181/*
1182 * Because there is no preemptible RCU, there is no cleanup to do.
1183 */
1184static void rcu_preempt_cleanup_dying_cpu(void)
1185{
1186}
1187
1188/*
1189 * Because preemptible RCU does not exist, it need not be initialized. 1033 * Because preemptible RCU does not exist, it need not be initialized.
1190 */ 1034 */
1191static void __init __rcu_init_preempt(void) 1035static void __init __rcu_init_preempt(void)
@@ -1968,9 +1812,11 @@ static void rcu_idle_count_callbacks_posted(void)
1968 */ 1812 */
1969#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ 1813#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
1970#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ 1814#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
1971#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1815#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1972#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1816#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1973 1817
1818extern int tick_nohz_enabled;
1819
1974/* 1820/*
1975 * Does the specified flavor of RCU have non-lazy callbacks pending on 1821 * Does the specified flavor of RCU have non-lazy callbacks pending on
1976 * the specified CPU? Both RCU flavor and CPU are specified by the 1822 * the specified CPU? Both RCU flavor and CPU are specified by the
@@ -2047,10 +1893,13 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
2047 return 1; 1893 return 1;
2048 } 1894 }
2049 /* Set up for the possibility that RCU will post a timer. */ 1895 /* Set up for the possibility that RCU will post a timer. */
2050 if (rcu_cpu_has_nonlazy_callbacks(cpu)) 1896 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
2051 *delta_jiffies = RCU_IDLE_GP_DELAY; 1897 *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
2052 else 1898 RCU_IDLE_GP_DELAY) - jiffies;
2053 *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; 1899 } else {
1900 *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
1901 *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
1902 }
2054 return 0; 1903 return 0;
2055} 1904}
2056 1905
@@ -2109,6 +1958,7 @@ static void rcu_cleanup_after_idle(int cpu)
2109 1958
2110 del_timer(&rdtp->idle_gp_timer); 1959 del_timer(&rdtp->idle_gp_timer);
2111 trace_rcu_prep_idle("Cleanup after idle"); 1960 trace_rcu_prep_idle("Cleanup after idle");
1961 rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
2112} 1962}
2113 1963
2114/* 1964/*
@@ -2134,6 +1984,18 @@ static void rcu_prepare_for_idle(int cpu)
2134{ 1984{
2135 struct timer_list *tp; 1985 struct timer_list *tp;
2136 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1986 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1987 int tne;
1988
1989 /* Handle nohz enablement switches conservatively. */
1990 tne = ACCESS_ONCE(tick_nohz_enabled);
1991 if (tne != rdtp->tick_nohz_enabled_snap) {
1992 if (rcu_cpu_has_callbacks(cpu))
1993 invoke_rcu_core(); /* force nohz to see update. */
1994 rdtp->tick_nohz_enabled_snap = tne;
1995 return;
1996 }
1997 if (!tne)
1998 return;
2137 1999
2138 /* 2000 /*
2139 * If this is an idle re-entry, for example, due to use of 2001 * If this is an idle re-entry, for example, due to use of
@@ -2187,10 +2049,11 @@ static void rcu_prepare_for_idle(int cpu)
2187 if (rcu_cpu_has_nonlazy_callbacks(cpu)) { 2049 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
2188 trace_rcu_prep_idle("Dyntick with callbacks"); 2050 trace_rcu_prep_idle("Dyntick with callbacks");
2189 rdtp->idle_gp_timer_expires = 2051 rdtp->idle_gp_timer_expires =
2190 jiffies + RCU_IDLE_GP_DELAY; 2052 round_up(jiffies + RCU_IDLE_GP_DELAY,
2053 RCU_IDLE_GP_DELAY);
2191 } else { 2054 } else {
2192 rdtp->idle_gp_timer_expires = 2055 rdtp->idle_gp_timer_expires =
2193 jiffies + RCU_IDLE_LAZY_GP_DELAY; 2056 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
2194 trace_rcu_prep_idle("Dyntick with lazy callbacks"); 2057 trace_rcu_prep_idle("Dyntick with lazy callbacks");
2195 } 2058 }
2196 tp = &rdtp->idle_gp_timer; 2059 tp = &rdtp->idle_gp_timer;
@@ -2231,8 +2094,9 @@ static void rcu_prepare_for_idle(int cpu)
2231 if (rcu_cpu_has_callbacks(cpu)) { 2094 if (rcu_cpu_has_callbacks(cpu)) {
2232 trace_rcu_prep_idle("More callbacks"); 2095 trace_rcu_prep_idle("More callbacks");
2233 invoke_rcu_core(); 2096 invoke_rcu_core();
2234 } else 2097 } else {
2235 trace_rcu_prep_idle("Callbacks drained"); 2098 trace_rcu_prep_idle("Callbacks drained");
2099 }
2236} 2100}
2237 2101
2238/* 2102/*
@@ -2269,6 +2133,7 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2269 2133
2270static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 2134static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2271{ 2135{
2136 *cp = '\0';
2272} 2137}
2273 2138
2274#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ 2139#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d4bc16ddd1d4..abffb486e94e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,31 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49static int show_rcubarrier(struct seq_file *m, void *unused)
50{
51 struct rcu_state *rsp;
52
53 for_each_rcu_flavor(rsp)
54 seq_printf(m, "%s: %c bcc: %d nbd: %lu\n",
55 rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.',
56 atomic_read(&rsp->barrier_cpu_count),
57 rsp->n_barrier_done);
58 return 0;
59}
60
61static int rcubarrier_open(struct inode *inode, struct file *file)
62{
63 return single_open(file, show_rcubarrier, NULL);
64}
65
66static const struct file_operations rcubarrier_fops = {
67 .owner = THIS_MODULE,
68 .open = rcubarrier_open,
69 .read = seq_read,
70 .llseek = seq_lseek,
71 .release = single_release,
72};
73
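
Once registered in rcutree_trace_init() further below, reading the new debugfs file (normally /sys/kernel/debug/rcu/rcubarrier) should print one line per flavor in the format used by show_rcubarrier() above, roughly "rcu_sched: . bcc: 0 nbd: 2", with 'B' in place of '.' while a barrier is in progress; the values shown here are illustrative only.
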
49#ifdef CONFIG_RCU_BOOST 74#ifdef CONFIG_RCU_BOOST
50 75
51static char convert_kthread_status(unsigned int kthread_status) 76static char convert_kthread_status(unsigned int kthread_status)
@@ -95,24 +120,16 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
95 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 120 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
96} 121}
97 122
98#define PRINT_RCU_DATA(name, func, m) \
99 do { \
100 int _p_r_d_i; \
101 \
102 for_each_possible_cpu(_p_r_d_i) \
103 func(m, &per_cpu(name, _p_r_d_i)); \
104 } while (0)
105
106static int show_rcudata(struct seq_file *m, void *unused) 123static int show_rcudata(struct seq_file *m, void *unused)
107{ 124{
108#ifdef CONFIG_TREE_PREEMPT_RCU 125 int cpu;
109 seq_puts(m, "rcu_preempt:\n"); 126 struct rcu_state *rsp;
110 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); 127
111#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 128 for_each_rcu_flavor(rsp) {
112 seq_puts(m, "rcu_sched:\n"); 129 seq_printf(m, "%s:\n", rsp->name);
113 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); 130 for_each_possible_cpu(cpu)
114 seq_puts(m, "rcu_bh:\n"); 131 print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu));
115 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); 132 }
116 return 0; 133 return 0;
117} 134}
118 135
@@ -166,6 +183,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
166 183
167static int show_rcudata_csv(struct seq_file *m, void *unused) 184static int show_rcudata_csv(struct seq_file *m, void *unused)
168{ 185{
186 int cpu;
187 struct rcu_state *rsp;
188
169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 189 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 190 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
171 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); 191 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
@@ -173,14 +193,11 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
173 seq_puts(m, "\"kt\",\"ktl\""); 193 seq_puts(m, "\"kt\",\"ktl\"");
174#endif /* #ifdef CONFIG_RCU_BOOST */ 194#endif /* #ifdef CONFIG_RCU_BOOST */
175 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); 195 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n");
176#ifdef CONFIG_TREE_PREEMPT_RCU 196 for_each_rcu_flavor(rsp) {
177 seq_puts(m, "\"rcu_preempt:\"\n"); 197 seq_printf(m, "\"%s:\"\n", rsp->name);
178 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 198 for_each_possible_cpu(cpu)
179#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 199 print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu));
180 seq_puts(m, "\"rcu_sched:\"\n"); 200 }
181 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
182 seq_puts(m, "\"rcu_bh:\"\n");
183 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
184 return 0; 201 return 0;
185} 202}
186 203
@@ -201,8 +218,7 @@ static const struct file_operations rcudata_csv_fops = {
201 218
202static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) 219static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
203{ 220{
204 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " 221 seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ",
205 "j=%04x bt=%04x\n",
206 rnp->grplo, rnp->grphi, 222 rnp->grplo, rnp->grphi,
207 "T."[list_empty(&rnp->blkd_tasks)], 223 "T."[list_empty(&rnp->blkd_tasks)],
208 "N."[!rnp->gp_tasks], 224 "N."[!rnp->gp_tasks],
@@ -210,11 +226,11 @@ static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
210 "B."[!rnp->boost_tasks], 226 "B."[!rnp->boost_tasks],
211 convert_kthread_status(rnp->boost_kthread_status), 227 convert_kthread_status(rnp->boost_kthread_status),
212 rnp->n_tasks_boosted, rnp->n_exp_boosts, 228 rnp->n_tasks_boosted, rnp->n_exp_boosts,
213 rnp->n_normal_boosts, 229 rnp->n_normal_boosts);
230 seq_printf(m, "j=%04x bt=%04x\n",
214 (int)(jiffies & 0xffff), 231 (int)(jiffies & 0xffff),
215 (int)(rnp->boost_time & 0xffff)); 232 (int)(rnp->boost_time & 0xffff));
216 seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", 233 seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
217 " balk",
218 rnp->n_balk_blkd_tasks, 234 rnp->n_balk_blkd_tasks,
219 rnp->n_balk_exp_gp_tasks, 235 rnp->n_balk_exp_gp_tasks,
220 rnp->n_balk_boost_tasks, 236 rnp->n_balk_boost_tasks,
@@ -270,15 +286,15 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
270 struct rcu_node *rnp; 286 struct rcu_node *rnp;
271 287
272 gpnum = rsp->gpnum; 288 gpnum = rsp->gpnum;
273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 289 seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ",
274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 290 rsp->name, rsp->completed, gpnum, rsp->fqs_state,
275 rsp->completed, gpnum, rsp->fqs_state,
276 (long)(rsp->jiffies_force_qs - jiffies), 291 (long)(rsp->jiffies_force_qs - jiffies),
277 (int)(jiffies & 0xffff), 292 (int)(jiffies & 0xffff));
293 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
278 rsp->n_force_qs, rsp->n_force_qs_ngp, 294 rsp->n_force_qs, rsp->n_force_qs_ngp,
279 rsp->n_force_qs - rsp->n_force_qs_ngp, 295 rsp->n_force_qs - rsp->n_force_qs_ngp,
280 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); 296 rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
281 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 297 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
282 if (rnp->level != level) { 298 if (rnp->level != level) {
283 seq_puts(m, "\n"); 299 seq_puts(m, "\n");
284 level = rnp->level; 300 level = rnp->level;
@@ -295,14 +311,10 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
295 311
296static int show_rcuhier(struct seq_file *m, void *unused) 312static int show_rcuhier(struct seq_file *m, void *unused)
297{ 313{
298#ifdef CONFIG_TREE_PREEMPT_RCU 314 struct rcu_state *rsp;
299 seq_puts(m, "rcu_preempt:\n"); 315
300 print_one_rcu_state(m, &rcu_preempt_state); 316 for_each_rcu_flavor(rsp)
301#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 317 print_one_rcu_state(m, rsp);
302 seq_puts(m, "rcu_sched:\n");
303 print_one_rcu_state(m, &rcu_sched_state);
304 seq_puts(m, "rcu_bh:\n");
305 print_one_rcu_state(m, &rcu_bh_state);
306 return 0; 318 return 0;
307} 319}
308 320
@@ -343,11 +355,10 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
343 355
344static int show_rcugp(struct seq_file *m, void *unused) 356static int show_rcugp(struct seq_file *m, void *unused)
345{ 357{
346#ifdef CONFIG_TREE_PREEMPT_RCU 358 struct rcu_state *rsp;
347 show_one_rcugp(m, &rcu_preempt_state); 359
348#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 360 for_each_rcu_flavor(rsp)
349 show_one_rcugp(m, &rcu_sched_state); 361 show_one_rcugp(m, rsp);
350 show_one_rcugp(m, &rcu_bh_state);
351 return 0; 362 return 0;
352} 363}
353 364
@@ -366,44 +377,36 @@ static const struct file_operations rcugp_fops = {
366 377
367static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 378static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
368{ 379{
369 seq_printf(m, "%3d%cnp=%ld " 380 seq_printf(m, "%3d%cnp=%ld ",
370 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
371 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
372 rdp->cpu, 381 rdp->cpu,
373 cpu_is_offline(rdp->cpu) ? '!' : ' ', 382 cpu_is_offline(rdp->cpu) ? '!' : ' ',
374 rdp->n_rcu_pending, 383 rdp->n_rcu_pending);
384 seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
375 rdp->n_rp_qs_pending, 385 rdp->n_rp_qs_pending,
376 rdp->n_rp_report_qs, 386 rdp->n_rp_report_qs,
377 rdp->n_rp_cb_ready, 387 rdp->n_rp_cb_ready,
378 rdp->n_rp_cpu_needs_gp, 388 rdp->n_rp_cpu_needs_gp);
389 seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
379 rdp->n_rp_gp_completed, 390 rdp->n_rp_gp_completed,
380 rdp->n_rp_gp_started, 391 rdp->n_rp_gp_started,
381 rdp->n_rp_need_fqs, 392 rdp->n_rp_need_fqs,
382 rdp->n_rp_need_nothing); 393 rdp->n_rp_need_nothing);
383} 394}
384 395
385static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) 396static int show_rcu_pending(struct seq_file *m, void *unused)
386{ 397{
387 int cpu; 398 int cpu;
388 struct rcu_data *rdp; 399 struct rcu_data *rdp;
389 400 struct rcu_state *rsp;
390 for_each_possible_cpu(cpu) { 401
391 rdp = per_cpu_ptr(rsp->rda, cpu); 402 for_each_rcu_flavor(rsp) {
392 if (rdp->beenonline) 403 seq_printf(m, "%s:\n", rsp->name);
393 print_one_rcu_pending(m, rdp); 404 for_each_possible_cpu(cpu) {
405 rdp = per_cpu_ptr(rsp->rda, cpu);
406 if (rdp->beenonline)
407 print_one_rcu_pending(m, rdp);
408 }
394 } 409 }
395}
396
397static int show_rcu_pending(struct seq_file *m, void *unused)
398{
399#ifdef CONFIG_TREE_PREEMPT_RCU
400 seq_puts(m, "rcu_preempt:\n");
401 print_rcu_pendings(m, &rcu_preempt_state);
402#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
403 seq_puts(m, "rcu_sched:\n");
404 print_rcu_pendings(m, &rcu_sched_state);
405 seq_puts(m, "rcu_bh:\n");
406 print_rcu_pendings(m, &rcu_bh_state);
407 return 0; 410 return 0;
408} 411}
409 412
@@ -453,6 +456,11 @@ static int __init rcutree_trace_init(void)
453 if (!rcudir) 456 if (!rcudir)
454 goto free_out; 457 goto free_out;
455 458
459 retval = debugfs_create_file("rcubarrier", 0444, rcudir,
460 NULL, &rcubarrier_fops);
461 if (!retval)
462 goto free_out;
463
456 retval = debugfs_create_file("rcudata", 0444, rcudir, 464 retval = debugfs_create_file("rcudata", 0444, rcudir,
457 NULL, &rcudata_fops); 465 NULL, &rcudata_fops);
458 if (!retval) 466 if (!retval)
diff --git a/kernel/resource.c b/kernel/resource.c
index e1d2b8ee76d5..34d45886ee84 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,6 +7,8 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/export.h> 12#include <linux/export.h>
11#include <linux/errno.h> 13#include <linux/errno.h>
12#include <linux/ioport.h> 14#include <linux/ioport.h>
@@ -722,14 +724,12 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
722 724
723 write_lock(&resource_lock); 725 write_lock(&resource_lock);
724 726
727 if (!parent)
728 goto skip;
729
725 if ((start < parent->start) || (end > parent->end)) 730 if ((start < parent->start) || (end > parent->end))
726 goto out; 731 goto out;
727 732
728 for (tmp = res->child; tmp; tmp = tmp->sibling) {
729 if ((tmp->start < start) || (tmp->end > end))
730 goto out;
731 }
732
733 if (res->sibling && (res->sibling->start <= end)) 733 if (res->sibling && (res->sibling->start <= end))
734 goto out; 734 goto out;
735 735
@@ -741,6 +741,11 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
741 goto out; 741 goto out;
742 } 742 }
743 743
744skip:
745 for (tmp = res->child; tmp; tmp = tmp->sibling)
746 if ((tmp->start < start) || (tmp->end > end))
747 goto out;
748
744 res->start = start; 749 res->start = start;
745 res->end = end; 750 res->end = end;
746 result = 0; 751 result = 0;
@@ -788,8 +793,28 @@ void __init reserve_region_with_split(struct resource *root,
788 resource_size_t start, resource_size_t end, 793 resource_size_t start, resource_size_t end,
789 const char *name) 794 const char *name)
790{ 795{
796 int abort = 0;
797
791 write_lock(&resource_lock); 798 write_lock(&resource_lock);
792 __reserve_region_with_split(root, start, end, name); 799 if (root->start > start || root->end < end) {
800 pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
801 (unsigned long long)start, (unsigned long long)end,
802 root);
803 if (start > root->end || end < root->start)
804 abort = 1;
805 else {
806 if (end > root->end)
807 end = root->end;
808 if (start < root->start)
809 start = root->start;
810 pr_err("fixing request to [0x%llx-0x%llx]\n",
811 (unsigned long long)start,
812 (unsigned long long)end);
813 }
814 dump_stack();
815 }
816 if (!abort)
817 __reserve_region_with_split(root, start, end, name);
793 write_unlock(&resource_lock); 818 write_unlock(&resource_lock);
794} 819}
795 820
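
The clamping added above can be read in isolation: a request that misses the root resource entirely is aborted, otherwise its edges are trimmed to the root's bounds before __reserve_region_with_split() runs. Below is a minimal userspace sketch of that logic (a hypothetical struct range, not the kernel's struct resource):

#include <stdio.h>

struct range { unsigned long long start, end; };

/* Return 0 and clamp *req into root; return -1 if the ranges are disjoint. */
static int clamp_to_root(const struct range *root, struct range *req)
{
	if (req->start > root->end || req->end < root->start)
		return -1;                /* nothing usable, abort the request */
	if (req->end > root->end)
		req->end = root->end;     /* trim the right edge */
	if (req->start < root->start)
		req->start = root->start; /* trim the left edge */
	return 0;
}

int main(void)
{
	struct range root = { 0x1000, 0x1fff };
	struct range req  = { 0x0800, 0x17ff };

	if (clamp_to_root(&root, &req) == 0)
		printf("reserving [0x%llx-0x%llx]\n", req.start, req.end);
	else
		printf("request outside root, aborted\n");
	return 0;
}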
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 468bdd44c1ba..fbf1fd098dc6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
1097 * 1097 *
1098 * sched_move_task() holds both and thus holding either pins the cgroup, 1098 * sched_move_task() holds both and thus holding either pins the cgroup,
1099 * see set_task_rq(). 1099 * see task_group().
1100 * 1100 *
1101 * Furthermore, all task_rq users should acquire both locks, see 1101 * Furthermore, all task_rq users should acquire both locks, see
1102 * task_rq_lock(). 1102 * task_rq_lock().
@@ -1910,12 +1910,12 @@ static inline void
1910prepare_task_switch(struct rq *rq, struct task_struct *prev, 1910prepare_task_switch(struct rq *rq, struct task_struct *prev,
1911 struct task_struct *next) 1911 struct task_struct *next)
1912{ 1912{
1913 trace_sched_switch(prev, next);
1913 sched_info_switch(prev, next); 1914 sched_info_switch(prev, next);
1914 perf_event_task_sched_out(prev, next); 1915 perf_event_task_sched_out(prev, next);
1915 fire_sched_out_preempt_notifiers(prev, next); 1916 fire_sched_out_preempt_notifiers(prev, next);
1916 prepare_lock_switch(rq, next); 1917 prepare_lock_switch(rq, next);
1917 prepare_arch_switch(next); 1918 prepare_arch_switch(next);
1918 trace_sched_switch(prev, next);
1919} 1919}
1920 1920
1921/** 1921/**
@@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3142# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 3142# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3143#endif 3143#endif
3144 3144
3145static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
3146{
3147 u64 temp = (__force u64) rtime;
3148
3149 temp *= (__force u64) utime;
3150
3151 if (sizeof(cputime_t) == 4)
3152 temp = div_u64(temp, (__force u32) total);
3153 else
3154 temp = div64_u64(temp, (__force u64) total);
3155
3156 return (__force cputime_t) temp;
3157}
3158
3145void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3159void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3146{ 3160{
3147 cputime_t rtime, utime = p->utime, total = utime + p->stime; 3161 cputime_t rtime, utime = p->utime, total = utime + p->stime;
@@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3151 */ 3165 */
3152 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3166 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3153 3167
3154 if (total) { 3168 if (total)
3155 u64 temp = (__force u64) rtime; 3169 utime = scale_utime(utime, rtime, total);
3156 3170 else
3157 temp *= (__force u64) utime;
3158 do_div(temp, (__force u32) total);
3159 utime = (__force cputime_t) temp;
3160 } else
3161 utime = rtime; 3171 utime = rtime;
3162 3172
3163 /* 3173 /*
@@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3184 total = cputime.utime + cputime.stime; 3194 total = cputime.utime + cputime.stime;
3185 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3195 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3186 3196
3187 if (total) { 3197 if (total)
3188 u64 temp = (__force u64) rtime; 3198 utime = scale_utime(cputime.utime, rtime, total);
3189 3199 else
3190 temp *= (__force u64) cputime.utime;
3191 do_div(temp, (__force u32) total);
3192 utime = (__force cputime_t) temp;
3193 } else
3194 utime = rtime; 3200 utime = rtime;
3195 3201
3196 sig->prev_utime = max(sig->prev_utime, utime); 3202 sig->prev_utime = max(sig->prev_utime, utime);
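
The scale_utime() helper factored out above computes utime * rtime / total in a 64-bit intermediate so the product cannot overflow before the divide. A standalone sketch of the same arithmetic with plain integer types (not the kernel's cputime_t):

#include <stdint.h>
#include <stdio.h>

/* Scale utime so that result/rtime == utime/total, i.e. result = rtime * utime / total. */
static uint64_t scale_utime(uint64_t utime, uint64_t rtime, uint64_t total)
{
	return total ? (rtime * utime) / total : rtime;
}

int main(void)
{
	/* e.g. 300 ticks of user time, 100 of system time, 500 ticks of real runtime */
	uint64_t utime = 300, stime = 100, rtime = 500;

	printf("scaled utime = %llu\n",
	       (unsigned long long)scale_utime(utime, rtime, utime + stime));
	/* prints 375: the user share (3/4) of the 500-tick runtime */
	return 0;
}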
@@ -4340,9 +4346,7 @@ recheck:
4340 */ 4346 */
4341 if (unlikely(policy == p->policy && (!rt_policy(policy) || 4347 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
4342 param->sched_priority == p->rt_priority))) { 4348 param->sched_priority == p->rt_priority))) {
4343 4349 task_rq_unlock(rq, p, &flags);
4344 __task_rq_unlock(rq);
4345 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4346 return 0; 4350 return 0;
4347 } 4351 }
4348 4352
@@ -6024,6 +6028,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6024 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 6028 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
6025 * allows us to avoid some pointer chasing select_idle_sibling(). 6029 * allows us to avoid some pointer chasing select_idle_sibling().
6026 * 6030 *
6031 * Iterate domains and sched_groups downward, assigning CPUs to be
6032 * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
6033 * due to random perturbation self canceling, ie sw buddies pull
6034 * their counterpart to their CPU's hw counterpart.
6035 *
6027 * Also keep a unique ID per domain (we use the first cpu number in 6036 * Also keep a unique ID per domain (we use the first cpu number in
6028 * the cpumask of the domain), this allows us to quickly tell if 6037 * the cpumask of the domain), this allows us to quickly tell if
6029 * two cpus are in the same cache domain, see cpus_share_cache(). 6038 * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6046,40 @@ static void update_top_cache_domain(int cpu)
6037 int id = cpu; 6046 int id = cpu;
6038 6047
6039 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 6048 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
6040 if (sd) 6049 if (sd) {
6050 struct sched_domain *tmp = sd;
6051 struct sched_group *sg, *prev;
6052 bool right;
6053
6054 /*
6055 * Traverse to first CPU in group, and count hops
6056 * to cpu from there, switching direction on each
6057 * hop, never ever pointing the last CPU rightward.
6058 */
6059 do {
6060 id = cpumask_first(sched_domain_span(tmp));
6061 prev = sg = tmp->groups;
6062 right = 1;
6063
6064 while (cpumask_first(sched_group_cpus(sg)) != id)
6065 sg = sg->next;
6066
6067 while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
6068 prev = sg;
6069 sg = sg->next;
6070 right = !right;
6071 }
6072
6073 /* A CPU went down, never point back to domain start. */
6074 if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
6075 right = false;
6076
6077 sg = right ? sg->next : prev;
6078 tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
6079 } while ((tmp = tmp->child));
6080
6041 id = cpumask_first(sched_domain_span(sd)); 6081 id = cpumask_first(sched_domain_span(sd));
6082 }
6042 6083
6043 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 6084 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
6044 per_cpu(sd_llc_id, cpu) = id; 6085 per_cpu(sd_llc_id, cpu) = id;
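
The buddy-assignment walk above is intricate; the sketch below is only a loose illustration of the cross-wiring idea on a flat ring of CPUs (a hypothetical layout, no sched_groups): the direction flips on every hop from the domain's first CPU, so neighbouring CPUs tend to point at each other rather than all chasing in one direction.

#include <stdio.h>

static int idle_buddy(int cpu, int ncpus)
{
	int hops = cpu;                 /* hops from CPU 0 to this cpu */
	int right = (hops % 2) == 0;    /* direction flips on each hop */

	if (right && (cpu + 1) % ncpus == 0)
		right = 0;              /* never point the last CPU back to the start */

	return right ? (cpu + 1) % ncpus : (cpu + ncpus - 1) % ncpus;
}

int main(void)
{
	for (int cpu = 0; cpu < 4; cpu++)
		printf("cpu %d -> buddy %d\n", cpu, idle_buddy(cpu, 4));
	/* prints 0->1, 1->0, 2->3, 3->2: buddies pull each other */
	return 0;
}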
@@ -7097,34 +7138,66 @@ match2:
7097 mutex_unlock(&sched_domains_mutex); 7138 mutex_unlock(&sched_domains_mutex);
7098} 7139}
7099 7140
7141static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
7142
7100/* 7143/*
7101 * Update cpusets according to cpu_active mask. If cpusets are 7144 * Update cpusets according to cpu_active mask. If cpusets are
7102 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7145 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7103 * around partition_sched_domains(). 7146 * around partition_sched_domains().
7147 *
7148 * If we come here as part of a suspend/resume, don't touch cpusets because we
7149 * want to restore it back to its original state upon resume anyway.
7104 */ 7150 */
7105static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 7151static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7106 void *hcpu) 7152 void *hcpu)
7107{ 7153{
7108 switch (action & ~CPU_TASKS_FROZEN) { 7154 switch (action) {
7155 case CPU_ONLINE_FROZEN:
7156 case CPU_DOWN_FAILED_FROZEN:
7157
7158 /*
7159 * num_cpus_frozen tracks how many CPUs are involved in suspend
7160 * resume sequence. As long as this is not the last online
7161 * operation in the resume sequence, just build a single sched
7162 * domain, ignoring cpusets.
7163 */
7164 num_cpus_frozen--;
7165 if (likely(num_cpus_frozen)) {
7166 partition_sched_domains(1, NULL, NULL);
7167 break;
7168 }
7169
7170 /*
7171 * This is the last CPU online operation. So fall through and
7172 * restore the original sched domains by considering the
7173 * cpuset configurations.
7174 */
7175
7109 case CPU_ONLINE: 7176 case CPU_ONLINE:
7110 case CPU_DOWN_FAILED: 7177 case CPU_DOWN_FAILED:
7111 cpuset_update_active_cpus(); 7178 cpuset_update_active_cpus(true);
7112 return NOTIFY_OK; 7179 break;
7113 default: 7180 default:
7114 return NOTIFY_DONE; 7181 return NOTIFY_DONE;
7115 } 7182 }
7183 return NOTIFY_OK;
7116} 7184}
7117 7185
7118static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7186static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7119 void *hcpu) 7187 void *hcpu)
7120{ 7188{
7121 switch (action & ~CPU_TASKS_FROZEN) { 7189 switch (action) {
7122 case CPU_DOWN_PREPARE: 7190 case CPU_DOWN_PREPARE:
7123 cpuset_update_active_cpus(); 7191 cpuset_update_active_cpus(false);
7124 return NOTIFY_OK; 7192 break;
7193 case CPU_DOWN_PREPARE_FROZEN:
7194 num_cpus_frozen++;
7195 partition_sched_domains(1, NULL, NULL);
7196 break;
7125 default: 7197 default:
7126 return NOTIFY_DONE; 7198 return NOTIFY_DONE;
7127 } 7199 }
7200 return NOTIFY_OK;
7128} 7201}
7129 7202
7130void __init sched_init_smp(void) 7203void __init sched_init_smp(void)
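
A small model of the num_cpus_frozen bookkeeping described in the comments above (hypothetical event names, not the kernel's notifier API): suspend-time CPU-down events count up, resume-time CPU-up events count down, and only the final resume event rebuilds the cpuset-based domains.

#include <stdio.h>

enum cpu_event { DOWN_FROZEN, UP_FROZEN, UP_NORMAL };

static int num_cpus_frozen;

static void cpu_notifier(enum cpu_event ev)
{
	switch (ev) {
	case DOWN_FROZEN:
		num_cpus_frozen++;
		printf("suspend: single sched domain (%d frozen)\n", num_cpus_frozen);
		break;
	case UP_FROZEN:
		if (--num_cpus_frozen) {
			printf("resume: %d still frozen, keep single domain\n",
			       num_cpus_frozen);
			break;
		}
		/* last CPU of the resume sequence: fall through */
	case UP_NORMAL:
		printf("rebuild sched domains from cpusets\n");
		break;
	}
}

int main(void)
{
	cpu_notifier(DOWN_FROZEN);  /* cpu1 taken down for suspend */
	cpu_notifier(DOWN_FROZEN);  /* cpu2 taken down for suspend */
	cpu_notifier(UP_FROZEN);    /* cpu1 back up during resume */
	cpu_notifier(UP_FROZEN);    /* cpu2 back up: restore cpuset domains */
	return 0;
}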
@@ -7179,6 +7252,7 @@ int in_sched_functions(unsigned long addr)
7179 7252
7180#ifdef CONFIG_CGROUP_SCHED 7253#ifdef CONFIG_CGROUP_SCHED
7181struct task_group root_task_group; 7254struct task_group root_task_group;
7255LIST_HEAD(task_groups);
7182#endif 7256#endif
7183 7257
7184DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 7258DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
@@ -7589,6 +7663,7 @@ void sched_destroy_group(struct task_group *tg)
7589 */ 7663 */
7590void sched_move_task(struct task_struct *tsk) 7664void sched_move_task(struct task_struct *tsk)
7591{ 7665{
7666 struct task_group *tg;
7592 int on_rq, running; 7667 int on_rq, running;
7593 unsigned long flags; 7668 unsigned long flags;
7594 struct rq *rq; 7669 struct rq *rq;
@@ -7603,6 +7678,12 @@ void sched_move_task(struct task_struct *tsk)
7603 if (unlikely(running)) 7678 if (unlikely(running))
7604 tsk->sched_class->put_prev_task(rq, tsk); 7679 tsk->sched_class->put_prev_task(rq, tsk);
7605 7680
7681 tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
7682 lockdep_is_held(&tsk->sighand->siglock)),
7683 struct task_group, css);
7684 tg = autogroup_task_group(tsk, tg);
7685 tsk->sched_task_group = tg;
7686
7606#ifdef CONFIG_FAIR_GROUP_SCHED 7687#ifdef CONFIG_FAIR_GROUP_SCHED
7607 if (tsk->sched_class->task_move_group) 7688 if (tsk->sched_class->task_move_group)
7608 tsk->sched_class->task_move_group(tsk, on_rq); 7689 tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d72586fdf660..23aa789c53ee 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -65,8 +65,8 @@ static int convert_prio(int prio)
65int cpupri_find(struct cpupri *cp, struct task_struct *p, 65int cpupri_find(struct cpupri *cp, struct task_struct *p,
66 struct cpumask *lowest_mask) 66 struct cpumask *lowest_mask)
67{ 67{
68 int idx = 0; 68 int idx = 0;
69 int task_pri = convert_prio(p->prio); 69 int task_pri = convert_prio(p->prio);
70 70
71 if (task_pri >= MAX_RT_PRIO) 71 if (task_pri >= MAX_RT_PRIO)
72 return 0; 72 return 0;
@@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
137 */ 137 */
138void cpupri_set(struct cpupri *cp, int cpu, int newpri) 138void cpupri_set(struct cpupri *cp, int cpu, int newpri)
139{ 139{
140 int *currpri = &cp->cpu_to_pri[cpu]; 140 int *currpri = &cp->cpu_to_pri[cpu];
141 int oldpri = *currpri; 141 int oldpri = *currpri;
142 int do_mb = 0; 142 int do_mb = 0;
143 143
144 newpri = convert_prio(newpri); 144 newpri = convert_prio(newpri);
145 145
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..c219bf8d704c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
2637 int cpu = smp_processor_id(); 2637 int cpu = smp_processor_id();
2638 int prev_cpu = task_cpu(p); 2638 int prev_cpu = task_cpu(p);
2639 struct sched_domain *sd; 2639 struct sched_domain *sd;
2640 struct sched_group *sg;
2641 int i;
2642 2640
2643 /* 2641 /*
2644 * If the task is going to be woken-up on this cpu and if it is 2642 * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
2655 return prev_cpu; 2653 return prev_cpu;
2656 2654
2657 /* 2655 /*
 2658 * Otherwise, iterate the domains and find an eligible idle cpu. 2656 * Otherwise, check assigned siblings to find an eligible idle cpu.
2659 */ 2657 */
2660 sd = rcu_dereference(per_cpu(sd_llc, target)); 2658 sd = rcu_dereference(per_cpu(sd_llc, target));
2661 for_each_lower_domain(sd) {
2662 sg = sd->groups;
2663 do {
2664 if (!cpumask_intersects(sched_group_cpus(sg),
2665 tsk_cpus_allowed(p)))
2666 goto next;
2667
2668 for_each_cpu(i, sched_group_cpus(sg)) {
2669 if (!idle_cpu(i))
2670 goto next;
2671 }
2672 2659
2673 target = cpumask_first_and(sched_group_cpus(sg), 2660 for_each_lower_domain(sd) {
2674 tsk_cpus_allowed(p)); 2661 if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
2675 goto done; 2662 continue;
2676next: 2663 if (idle_cpu(sd->idle_buddy))
2677 sg = sg->next; 2664 return sd->idle_buddy;
2678 } while (sg != sd->groups);
2679 } 2665 }
2680done: 2666
2681 return target; 2667 return target;
2682} 2668}
2683 2669
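
On the consumer side, select_idle_sibling() now simply walks down the domain levels and tests each pre-assigned buddy. A simplified sketch of that walk (a hypothetical struct domain, with predicates standing in for the cpumask and runqueue checks):

#include <stdbool.h>
#include <stdio.h>

struct domain {
	int idle_buddy;          /* pre-assigned sibling for this level */
	struct domain *child;    /* next lower level, NULL at the bottom */
};

static bool cpu_allowed(int cpu) { return cpu != 2; }  /* stand-in for the affinity mask */
static bool cpu_idle(int cpu)    { return cpu == 3; }  /* stand-in for "runqueue empty" */

static int pick_idle_sibling(struct domain *sd, int target)
{
	for (; sd; sd = sd->child) {
		if (!cpu_allowed(sd->idle_buddy))
			continue;
		if (cpu_idle(sd->idle_buddy))
			return sd->idle_buddy;
	}
	return target;           /* no idle buddy found, stay on target */
}

int main(void)
{
	struct domain bottom = { .idle_buddy = 3, .child = NULL };
	struct domain top    = { .idle_buddy = 2, .child = &bottom };

	printf("chosen cpu = %d\n", pick_idle_sibling(&top, 0));  /* prints 3 */
	return 0;
}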
@@ -3068,18 +3054,24 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3068 3054
3069#define LBF_ALL_PINNED 0x01 3055#define LBF_ALL_PINNED 0x01
3070#define LBF_NEED_BREAK 0x02 3056#define LBF_NEED_BREAK 0x02
3057#define LBF_SOME_PINNED 0x04
3071 3058
3072struct lb_env { 3059struct lb_env {
3073 struct sched_domain *sd; 3060 struct sched_domain *sd;
3074 3061
3075 int src_cpu;
3076 struct rq *src_rq; 3062 struct rq *src_rq;
3063 int src_cpu;
3077 3064
3078 int dst_cpu; 3065 int dst_cpu;
3079 struct rq *dst_rq; 3066 struct rq *dst_rq;
3080 3067
3068 struct cpumask *dst_grpmask;
3069 int new_dst_cpu;
3081 enum cpu_idle_type idle; 3070 enum cpu_idle_type idle;
3082 long imbalance; 3071 long imbalance;
3072 /* The set of CPUs under consideration for load-balancing */
3073 struct cpumask *cpus;
3074
3083 unsigned int flags; 3075 unsigned int flags;
3084 3076
3085 unsigned int loop; 3077 unsigned int loop;
@@ -3145,9 +3137,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3145 * 3) are cache-hot on their current CPU. 3137 * 3) are cache-hot on their current CPU.
3146 */ 3138 */
3147 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 3139 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3140 int new_dst_cpu;
3141
3148 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3142 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3143
3144 /*
3145 * Remember if this task can be migrated to any other cpu in
3146 * our sched_group. We may want to revisit it if we couldn't
3147 * meet load balance goals by pulling other tasks on src_cpu.
3148 *
3149 * Also avoid computing new_dst_cpu if we have already computed
3150 * one in current iteration.
3151 */
3152 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3153 return 0;
3154
3155 new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3156 tsk_cpus_allowed(p));
3157 if (new_dst_cpu < nr_cpu_ids) {
3158 env->flags |= LBF_SOME_PINNED;
3159 env->new_dst_cpu = new_dst_cpu;
3160 }
3149 return 0; 3161 return 0;
3150 } 3162 }
3163
 3164 /* Record that we found at least one task that could run on dst_cpu */
3151 env->flags &= ~LBF_ALL_PINNED; 3165 env->flags &= ~LBF_ALL_PINNED;
3152 3166
3153 if (task_running(env->src_rq, p)) { 3167 if (task_running(env->src_rq, p)) {
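
A tiny model of the bookkeeping the comment above describes (plain bitmasks standing in for cpumasks, hypothetical names): when a task is pinned away from the destination CPU, remember the first CPU of the destination group it could run on, so a later pass can retry there.

#include <stdio.h>

#define LBF_SOME_PINNED 0x04

struct lb_env {
	unsigned int flags;
	unsigned long dst_grpmask;   /* CPUs in the destination group */
	int new_dst_cpu;
};

/* Remember one alternate destination for a task pinned away from dst_cpu. */
static void note_pinned_task(struct lb_env *env, unsigned long allowed)
{
	unsigned long both = env->dst_grpmask & allowed;

	if ((env->flags & LBF_SOME_PINNED) || !both)
		return;                          /* already noted, or nowhere to go */
	env->flags |= LBF_SOME_PINNED;
	env->new_dst_cpu = __builtin_ctzl(both); /* first CPU in both masks */
}

int main(void)
{
	struct lb_env env = { .dst_grpmask = 0x0c };     /* group = CPUs 2 and 3 */

	note_pinned_task(&env, 0x0a);                    /* task allowed on CPUs 1 and 3 */
	printf("new_dst_cpu = %d\n", env.new_dst_cpu);   /* prints 3 */
	return 0;
}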
@@ -3373,6 +3387,14 @@ static int tg_load_down(struct task_group *tg, void *data)
3373 3387
3374static void update_h_load(long cpu) 3388static void update_h_load(long cpu)
3375{ 3389{
3390 struct rq *rq = cpu_rq(cpu);
3391 unsigned long now = jiffies;
3392
3393 if (rq->h_load_throttle == now)
3394 return;
3395
3396 rq->h_load_throttle = now;
3397
3376 rcu_read_lock(); 3398 rcu_read_lock();
3377 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 3399 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3378 rcu_read_unlock(); 3400 rcu_read_unlock();
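
The throttle added above is a common "at most once per tick" guard. A standalone sketch of the pattern, with a plain counter standing in for jiffies:

#include <stdio.h>

static unsigned long jiffies = 1;      /* stand-in for the kernel tick counter */
static unsigned long h_load_throttle;  /* tick of the last recomputation */

static void update_h_load(void)
{
	if (h_load_throttle == jiffies)
		return;                /* already refreshed this tick */
	h_load_throttle = jiffies;
	printf("recomputing hierarchical load at tick %lu\n", jiffies);
}

int main(void)
{
	update_h_load();   /* recomputes */
	update_h_load();   /* skipped: same tick */
	jiffies++;
	update_h_load();   /* recomputes again */
	return 0;
}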
@@ -3642,8 +3664,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3642 */ 3664 */
3643static inline void update_sg_lb_stats(struct lb_env *env, 3665static inline void update_sg_lb_stats(struct lb_env *env,
3644 struct sched_group *group, int load_idx, 3666 struct sched_group *group, int load_idx,
3645 int local_group, const struct cpumask *cpus, 3667 int local_group, int *balance, struct sg_lb_stats *sgs)
3646 int *balance, struct sg_lb_stats *sgs)
3647{ 3668{
3648 unsigned long nr_running, max_nr_running, min_nr_running; 3669 unsigned long nr_running, max_nr_running, min_nr_running;
3649 unsigned long load, max_cpu_load, min_cpu_load; 3670 unsigned long load, max_cpu_load, min_cpu_load;
@@ -3660,7 +3681,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3660 max_nr_running = 0; 3681 max_nr_running = 0;
3661 min_nr_running = ~0UL; 3682 min_nr_running = ~0UL;
3662 3683
3663 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3684 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
3664 struct rq *rq = cpu_rq(i); 3685 struct rq *rq = cpu_rq(i);
3665 3686
3666 nr_running = rq->nr_running; 3687 nr_running = rq->nr_running;
@@ -3789,8 +3810,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
3789 * @sds: variable to hold the statistics for this sched_domain. 3810 * @sds: variable to hold the statistics for this sched_domain.
3790 */ 3811 */
3791static inline void update_sd_lb_stats(struct lb_env *env, 3812static inline void update_sd_lb_stats(struct lb_env *env,
3792 const struct cpumask *cpus, 3813 int *balance, struct sd_lb_stats *sds)
3793 int *balance, struct sd_lb_stats *sds)
3794{ 3814{
3795 struct sched_domain *child = env->sd->child; 3815 struct sched_domain *child = env->sd->child;
3796 struct sched_group *sg = env->sd->groups; 3816 struct sched_group *sg = env->sd->groups;
@@ -3807,8 +3827,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
3807 3827
3808 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); 3828 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3809 memset(&sgs, 0, sizeof(sgs)); 3829 memset(&sgs, 0, sizeof(sgs));
3810 update_sg_lb_stats(env, sg, load_idx, local_group, 3830 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
3811 cpus, balance, &sgs);
3812 3831
3813 if (local_group && !(*balance)) 3832 if (local_group && !(*balance))
3814 return; 3833 return;
@@ -4044,7 +4063,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4044 * to restore balance. 4063 * to restore balance.
4045 * 4064 *
4046 * @env: The load balancing environment. 4065 * @env: The load balancing environment.
4047 * @cpus: The set of CPUs under consideration for load-balancing.
4048 * @balance: Pointer to a variable indicating if this_cpu 4066 * @balance: Pointer to a variable indicating if this_cpu
4049 * is the appropriate cpu to perform load balancing at this_level. 4067 * is the appropriate cpu to perform load balancing at this_level.
4050 * 4068 *
@@ -4054,7 +4072,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4054 * put to idle by rebalancing its tasks onto our group. 4072 * put to idle by rebalancing its tasks onto our group.
4055 */ 4073 */
4056static struct sched_group * 4074static struct sched_group *
4057find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) 4075find_busiest_group(struct lb_env *env, int *balance)
4058{ 4076{
4059 struct sd_lb_stats sds; 4077 struct sd_lb_stats sds;
4060 4078
@@ -4064,7 +4082,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
4064 * Compute the various statistics relavent for load balancing at 4082 * Compute the various statistics relavent for load balancing at
4065 * this level. 4083 * this level.
4066 */ 4084 */
4067 update_sd_lb_stats(env, cpus, balance, &sds); 4085 update_sd_lb_stats(env, balance, &sds);
4068 4086
4069 /* 4087 /*
4070 * this_cpu is not the appropriate cpu to perform load balancing at 4088 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4144,8 +4162,7 @@ ret:
4144 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4162 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4145 */ 4163 */
4146static struct rq *find_busiest_queue(struct lb_env *env, 4164static struct rq *find_busiest_queue(struct lb_env *env,
4147 struct sched_group *group, 4165 struct sched_group *group)
4148 const struct cpumask *cpus)
4149{ 4166{
4150 struct rq *busiest = NULL, *rq; 4167 struct rq *busiest = NULL, *rq;
4151 unsigned long max_load = 0; 4168 unsigned long max_load = 0;
@@ -4160,7 +4177,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4160 if (!capacity) 4177 if (!capacity)
4161 capacity = fix_small_capacity(env->sd, group); 4178 capacity = fix_small_capacity(env->sd, group);
4162 4179
4163 if (!cpumask_test_cpu(i, cpus)) 4180 if (!cpumask_test_cpu(i, env->cpus))
4164 continue; 4181 continue;
4165 4182
4166 rq = cpu_rq(i); 4183 rq = cpu_rq(i);
@@ -4227,7 +4244,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4227 struct sched_domain *sd, enum cpu_idle_type idle, 4244 struct sched_domain *sd, enum cpu_idle_type idle,
4228 int *balance) 4245 int *balance)
4229{ 4246{
4230 int ld_moved, active_balance = 0; 4247 int ld_moved, cur_ld_moved, active_balance = 0;
4248 int lb_iterations, max_lb_iterations;
4231 struct sched_group *group; 4249 struct sched_group *group;
4232 struct rq *busiest; 4250 struct rq *busiest;
4233 unsigned long flags; 4251 unsigned long flags;
@@ -4237,16 +4255,19 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4237 .sd = sd, 4255 .sd = sd,
4238 .dst_cpu = this_cpu, 4256 .dst_cpu = this_cpu,
4239 .dst_rq = this_rq, 4257 .dst_rq = this_rq,
4258 .dst_grpmask = sched_group_cpus(sd->groups),
4240 .idle = idle, 4259 .idle = idle,
4241 .loop_break = sched_nr_migrate_break, 4260 .loop_break = sched_nr_migrate_break,
4261 .cpus = cpus,
4242 }; 4262 };
4243 4263
4244 cpumask_copy(cpus, cpu_active_mask); 4264 cpumask_copy(cpus, cpu_active_mask);
4265 max_lb_iterations = cpumask_weight(env.dst_grpmask);
4245 4266
4246 schedstat_inc(sd, lb_count[idle]); 4267 schedstat_inc(sd, lb_count[idle]);
4247 4268
4248redo: 4269redo:
4249 group = find_busiest_group(&env, cpus, balance); 4270 group = find_busiest_group(&env, balance);
4250 4271
4251 if (*balance == 0) 4272 if (*balance == 0)
4252 goto out_balanced; 4273 goto out_balanced;
@@ -4256,7 +4277,7 @@ redo:
4256 goto out_balanced; 4277 goto out_balanced;
4257 } 4278 }
4258 4279
4259 busiest = find_busiest_queue(&env, group, cpus); 4280 busiest = find_busiest_queue(&env, group);
4260 if (!busiest) { 4281 if (!busiest) {
4261 schedstat_inc(sd, lb_nobusyq[idle]); 4282 schedstat_inc(sd, lb_nobusyq[idle]);
4262 goto out_balanced; 4283 goto out_balanced;
@@ -4267,6 +4288,7 @@ redo:
4267 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 4288 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4268 4289
4269 ld_moved = 0; 4290 ld_moved = 0;
4291 lb_iterations = 1;
4270 if (busiest->nr_running > 1) { 4292 if (busiest->nr_running > 1) {
4271 /* 4293 /*
4272 * Attempt to move tasks. If find_busiest_group has found 4294 * Attempt to move tasks. If find_busiest_group has found
@@ -4279,12 +4301,17 @@ redo:
4279 env.src_rq = busiest; 4301 env.src_rq = busiest;
4280 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 4302 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4281 4303
4304 update_h_load(env.src_cpu);
4282more_balance: 4305more_balance:
4283 local_irq_save(flags); 4306 local_irq_save(flags);
4284 double_rq_lock(this_rq, busiest); 4307 double_rq_lock(this_rq, busiest);
4285 if (!env.loop) 4308
4286 update_h_load(env.src_cpu); 4309 /*
4287 ld_moved += move_tasks(&env); 4310 * cur_ld_moved - load moved in current iteration
4311 * ld_moved - cumulative load moved across iterations
4312 */
4313 cur_ld_moved = move_tasks(&env);
4314 ld_moved += cur_ld_moved;
4288 double_rq_unlock(this_rq, busiest); 4315 double_rq_unlock(this_rq, busiest);
4289 local_irq_restore(flags); 4316 local_irq_restore(flags);
4290 4317
@@ -4296,14 +4323,52 @@ more_balance:
4296 /* 4323 /*
4297 * some other cpu did the load balance for us. 4324 * some other cpu did the load balance for us.
4298 */ 4325 */
4299 if (ld_moved && this_cpu != smp_processor_id()) 4326 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
4300 resched_cpu(this_cpu); 4327 resched_cpu(env.dst_cpu);
4328
4329 /*
4330 * Revisit (affine) tasks on src_cpu that couldn't be moved to
4331 * us and move them to an alternate dst_cpu in our sched_group
4332 * where they can run. The upper limit on how many times we
4333 * iterate on same src_cpu is dependent on number of cpus in our
4334 * sched_group.
4335 *
4336 * This changes load balance semantics a bit on who can move
4337 * load to a given_cpu. In addition to the given_cpu itself
 4338 * (or an ilb_cpu acting on its behalf where given_cpu is
4339 * nohz-idle), we now have balance_cpu in a position to move
4340 * load to given_cpu. In rare situations, this may cause
4341 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
 4342 * _independently_ and at the _same_ time to move some load to
 4343 * given_cpu) causing excess load to be moved to given_cpu.
4344 * This however should not happen so much in practice and
4345 * moreover subsequent load balance cycles should correct the
4346 * excess load moved.
4347 */
4348 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4349 lb_iterations++ < max_lb_iterations) {
4350
4351 this_rq = cpu_rq(env.new_dst_cpu);
4352 env.dst_rq = this_rq;
4353 env.dst_cpu = env.new_dst_cpu;
4354 env.flags &= ~LBF_SOME_PINNED;
4355 env.loop = 0;
4356 env.loop_break = sched_nr_migrate_break;
4357 /*
4358 * Go back to "more_balance" rather than "redo" since we
4359 * need to continue with same src_cpu.
4360 */
4361 goto more_balance;
4362 }
4301 4363
4302 /* All tasks on this runqueue were pinned by CPU affinity */ 4364 /* All tasks on this runqueue were pinned by CPU affinity */
4303 if (unlikely(env.flags & LBF_ALL_PINNED)) { 4365 if (unlikely(env.flags & LBF_ALL_PINNED)) {
4304 cpumask_clear_cpu(cpu_of(busiest), cpus); 4366 cpumask_clear_cpu(cpu_of(busiest), cpus);
4305 if (!cpumask_empty(cpus)) 4367 if (!cpumask_empty(cpus)) {
4368 env.loop = 0;
4369 env.loop_break = sched_nr_migrate_break;
4306 goto redo; 4370 goto redo;
4371 }
4307 goto out_balanced; 4372 goto out_balanced;
4308 } 4373 }
4309 } 4374 }
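
A compressed, userspace-only sketch of the retry described in the comment above (a hypothetical move_tasks(), not the scheduler's): if pinned tasks left an imbalance, retry with an alternate destination CPU from the same group, bounded by the group size.

#include <stdbool.h>
#include <stdio.h>

#define GROUP_CPUS 4

/* Hypothetical: try to pull load from src toward dst; report what remains and
 * whether some tasks were pinned away from dst but allowed elsewhere. */
static long move_tasks(int src, int dst, long imbalance, bool *some_pinned)
{
	*some_pinned = (dst == 0);   /* pretend dst 0 cannot take the pinned tasks */
	return *some_pinned ? imbalance : 0;
}

int main(void)
{
	int src = 5, dst = 0;
	long imbalance = 100;
	bool some_pinned;
	int iterations = 1, max_iterations = GROUP_CPUS;

	for (;;) {
		imbalance = move_tasks(src, dst, imbalance, &some_pinned);
		if (!(some_pinned && imbalance > 0) || iterations++ >= max_iterations)
			break;
		dst++;               /* alternate destination in the same group */
		printf("retrying src %d with new dst %d\n", src, dst);
	}
	printf("remaining imbalance %ld after %d iteration(s)\n", imbalance, iterations);
	return 0;
}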
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 573e1ca01102..944cb68420e9 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -788,6 +788,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
788 const struct cpumask *span; 788 const struct cpumask *span;
789 789
790 span = sched_rt_period_mask(); 790 span = sched_rt_period_mask();
791#ifdef CONFIG_RT_GROUP_SCHED
792 /*
793 * FIXME: isolated CPUs should really leave the root task group,
794 * whether they are isolcpus or were isolated via cpusets, lest
795 * the timer run on a CPU which does not service all runqueues,
796 * potentially leaving other CPUs indefinitely throttled. If
797 * isolation is really required, the user will turn the throttle
798 * off to kill the perturbations it causes anyway. Meanwhile,
799 * this maintains functionality for boot and/or troubleshooting.
800 */
801 if (rt_b == &root_task_group.rt_bandwidth)
802 span = cpu_online_mask;
803#endif
791 for_each_cpu(i, span) { 804 for_each_cpu(i, span) {
792 int enqueue = 0; 805 int enqueue = 0;
793 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 806 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f24435a..f6714d009e77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex;
80struct cfs_rq; 80struct cfs_rq;
81struct rt_rq; 81struct rt_rq;
82 82
83static LIST_HEAD(task_groups); 83extern struct list_head task_groups;
84 84
85struct cfs_bandwidth { 85struct cfs_bandwidth {
86#ifdef CONFIG_CFS_BANDWIDTH 86#ifdef CONFIG_CFS_BANDWIDTH
@@ -374,7 +374,11 @@ struct rq {
374#ifdef CONFIG_FAIR_GROUP_SCHED 374#ifdef CONFIG_FAIR_GROUP_SCHED
375 /* list of leaf cfs_rq on this cpu: */ 375 /* list of leaf cfs_rq on this cpu: */
376 struct list_head leaf_cfs_rq_list; 376 struct list_head leaf_cfs_rq_list;
377#endif 377#ifdef CONFIG_SMP
378 unsigned long h_load_throttle;
379#endif /* CONFIG_SMP */
380#endif /* CONFIG_FAIR_GROUP_SCHED */
381
378#ifdef CONFIG_RT_GROUP_SCHED 382#ifdef CONFIG_RT_GROUP_SCHED
379 struct list_head leaf_rt_rq_list; 383 struct list_head leaf_rt_rq_list;
380#endif 384#endif
@@ -538,22 +542,19 @@ extern int group_balance_cpu(struct sched_group *sg);
538/* 542/*
539 * Return the group to which this tasks belongs. 543 * Return the group to which this tasks belongs.
540 * 544 *
541 * We use task_subsys_state_check() and extend the RCU verification with 545 * We cannot use task_subsys_state() and friends because the cgroup
542 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each 546 * subsystem changes that value before the cgroup_subsys::attach() method
543 * task it moves into the cgroup. Therefore by holding either of those locks, 547 * is called, therefore we cannot pin it and might observe the wrong value.
544 * we pin the task to the current cgroup. 548 *
549 * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
550 * core changes this before calling sched_move_task().
551 *
552 * Instead we use a 'copy' which is updated from sched_move_task() while
553 * holding both task_struct::pi_lock and rq::lock.
545 */ 554 */
546static inline struct task_group *task_group(struct task_struct *p) 555static inline struct task_group *task_group(struct task_struct *p)
547{ 556{
548 struct task_group *tg; 557 return p->sched_task_group;
549 struct cgroup_subsys_state *css;
550
551 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
552 lockdep_is_held(&p->pi_lock) ||
553 lockdep_is_held(&task_rq(p)->lock));
554 tg = container_of(css, struct task_group, css);
555
556 return autogroup_task_group(p, tg);
557} 558}
558 559
559/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 560/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e86fd23..da5eb5bed84a 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
27{ 27{
28 struct task_struct *stop = rq->stop; 28 struct task_struct *stop = rq->stop;
29 29
30 if (stop && stop->on_rq) 30 if (stop && stop->on_rq) {
31 stop->se.exec_start = rq->clock_task;
31 return stop; 32 return stop;
33 }
32 34
33 return NULL; 35 return NULL;
34} 36}
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq)
52 54
53static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) 55static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
54{ 56{
57 struct task_struct *curr = rq->curr;
58 u64 delta_exec;
59
60 delta_exec = rq->clock_task - curr->se.exec_start;
61 if (unlikely((s64)delta_exec < 0))
62 delta_exec = 0;
63
64 schedstat_set(curr->se.statistics.exec_max,
65 max(curr->se.statistics.exec_max, delta_exec));
66
67 curr->se.sum_exec_runtime += delta_exec;
68 account_group_exec_runtime(curr, delta_exec);
69
70 curr->se.exec_start = rq->clock_task;
71 cpuacct_charge(curr, delta_exec);
55} 72}
56 73
57static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) 74static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
60 77
61static void set_curr_task_stop(struct rq *rq) 78static void set_curr_task_stop(struct rq *rq)
62{ 79{
80 struct task_struct *stop = rq->stop;
81
82 stop->se.exec_start = rq->clock_task;
63} 83}
64 84
65static void switched_to_stop(struct rq *rq, struct task_struct *p) 85static void switched_to_stop(struct rq *rq, struct task_struct *p)
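
The stop class now accounts runtime much like the fair class does. A standalone sketch of that accounting pattern (a simplified clock and stats struct, no runqueue): take the delta since exec_start, clamp a negative delta to zero, accumulate it, and restart the window.

#include <stdint.h>
#include <stdio.h>

struct se_stats {
	uint64_t exec_start;        /* when the current run window began */
	uint64_t sum_exec_runtime;  /* accumulated runtime */
	uint64_t exec_max;          /* longest single run observed */
};

static void account_runtime(struct se_stats *se, uint64_t clock_now)
{
	int64_t delta = (int64_t)(clock_now - se->exec_start);

	if (delta < 0)
		delta = 0;                       /* clock went backwards: ignore */
	if ((uint64_t)delta > se->exec_max)
		se->exec_max = delta;
	se->sum_exec_runtime += delta;
	se->exec_start = clock_now;              /* start the next window */
}

int main(void)
{
	struct se_stats se = { .exec_start = 1000 };

	account_runtime(&se, 1250);
	printf("sum=%llu max=%llu\n",
	       (unsigned long long)se.sum_exec_runtime,
	       (unsigned long long)se.exec_max);  /* sum=250 max=250 */
	return 0;
}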
diff --git a/kernel/signal.c b/kernel/signal.c
index 677102789cf2..be4f856d52f8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1971,6 +1971,13 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
1971void ptrace_notify(int exit_code) 1971void ptrace_notify(int exit_code)
1972{ 1972{
1973 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); 1973 BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
1974 if (unlikely(current->task_works)) {
1975 if (test_and_clear_ti_thread_flag(current_thread_info(),
1976 TIF_NOTIFY_RESUME)) {
1977 smp_mb__after_clear_bit();
1978 task_work_run();
1979 }
1980 }
1974 1981
1975 spin_lock_irq(&current->sighand->siglock); 1982 spin_lock_irq(&current->sighand->siglock);
1976 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); 1983 ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
@@ -2191,6 +2198,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2191 struct signal_struct *signal = current->signal; 2198 struct signal_struct *signal = current->signal;
2192 int signr; 2199 int signr;
2193 2200
2201 if (unlikely(current->task_works)) {
2202 if (test_and_clear_ti_thread_flag(current_thread_info(),
2203 TIF_NOTIFY_RESUME)) {
2204 smp_mb__after_clear_bit();
2205 task_work_run();
2206 }
2207 }
2208
2194 if (unlikely(uprobe_deny_signal())) 2209 if (unlikely(uprobe_deny_signal()))
2195 return 0; 2210 return 0;
2196 2211
diff --git a/kernel/smp.c b/kernel/smp.c
index d0ae5b24875e..29dd40a9f2f4 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -581,26 +581,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)
581 return 0; 581 return 0;
582} 582}
583EXPORT_SYMBOL(smp_call_function); 583EXPORT_SYMBOL(smp_call_function);
584
585void ipi_call_lock(void)
586{
587 raw_spin_lock(&call_function.lock);
588}
589
590void ipi_call_unlock(void)
591{
592 raw_spin_unlock(&call_function.lock);
593}
594
595void ipi_call_lock_irq(void)
596{
597 raw_spin_lock_irq(&call_function.lock);
598}
599
600void ipi_call_unlock_irq(void)
601{
602 raw_spin_unlock_irq(&call_function.lock);
603}
604#endif /* USE_GENERIC_SMP_HELPERS */ 584#endif /* USE_GENERIC_SMP_HELPERS */
605 585
606/* Setup configured maximum number of CPUs to activate */ 586/* Setup configured maximum number of CPUs to activate */
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 80c0acfb8472..6ef9433e1c70 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -3,8 +3,6 @@
3 3
4struct task_struct; 4struct task_struct;
5 5
6int smpboot_prepare(unsigned int cpu);
7
8#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD 6#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
9struct task_struct *idle_thread_get(unsigned int cpu); 7struct task_struct *idle_thread_get(unsigned int cpu);
10void idle_thread_set_boot_cpu(void); 8void idle_thread_set_boot_cpu(void);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 671f9594e368..b73e681df09e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void)
210 __u32 pending; 210 __u32 pending;
211 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
212 int cpu; 212 int cpu;
213 unsigned long old_flags = current->flags;
214
215 /*
 216 * Mask out PF_MEMALLOC as the current task context is borrowed for the
 217 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
218 * again if the socket is related to swap
219 */
220 current->flags &= ~PF_MEMALLOC;
213 221
214 pending = local_softirq_pending(); 222 pending = local_softirq_pending();
215 account_system_vtime(current); 223 account_system_vtime(current);
@@ -265,6 +273,7 @@ restart:
265 273
266 account_system_vtime(current); 274 account_system_vtime(current);
267 __local_bh_enable(SOFTIRQ_OFFSET); 275 __local_bh_enable(SOFTIRQ_OFFSET);
276 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
268} 277}
269 278
270#ifndef __ARCH_HAS_DO_SOFTIRQ 279#ifndef __ARCH_HAS_DO_SOFTIRQ
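
The PF_MEMALLOC handling added above is a save/clear/restore pattern on the task flags. A minimal sketch with a plain flag word (not task_struct; the restore touches only the masked bit, mirroring what tsk_restore_flags() does):

#include <stdio.h>

#define PF_MEMALLOC 0x0800

static unsigned int current_flags = PF_MEMALLOC | 0x1;

/* Restore only the bits named in mask to their saved values. */
static void restore_flags(unsigned int saved, unsigned int mask)
{
	current_flags = (current_flags & ~mask) | (saved & mask);
}

static void do_softirq(void)
{
	unsigned int old_flags = current_flags;

	current_flags &= ~PF_MEMALLOC;  /* borrowed context must not look like reclaim */
	/* ... run softirq handlers; they may set PF_MEMALLOC transiently ... */
	restore_flags(old_flags, PF_MEMALLOC);
}

int main(void)
{
	do_softirq();
	printf("PF_MEMALLOC preserved: %s\n",
	       (current_flags & PF_MEMALLOC) ? "yes" : "no");
	return 0;
}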
diff --git a/kernel/sys.c b/kernel/sys.c
index 2d39a84cd857..241507f23eca 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2015,7 +2015,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2015 break; 2015 break;
2016 } 2016 }
2017 me->pdeath_signal = arg2; 2017 me->pdeath_signal = arg2;
2018 error = 0;
2019 break; 2018 break;
2020 case PR_GET_PDEATHSIG: 2019 case PR_GET_PDEATHSIG:
2021 error = put_user(me->pdeath_signal, (int __user *)arg2); 2020 error = put_user(me->pdeath_signal, (int __user *)arg2);
@@ -2029,7 +2028,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2029 break; 2028 break;
2030 } 2029 }
2031 set_dumpable(me->mm, arg2); 2030 set_dumpable(me->mm, arg2);
2032 error = 0;
2033 break; 2031 break;
2034 2032
2035 case PR_SET_UNALIGN: 2033 case PR_SET_UNALIGN:
@@ -2056,10 +2054,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2056 case PR_SET_TIMING: 2054 case PR_SET_TIMING:
2057 if (arg2 != PR_TIMING_STATISTICAL) 2055 if (arg2 != PR_TIMING_STATISTICAL)
2058 error = -EINVAL; 2056 error = -EINVAL;
2059 else
2060 error = 0;
2061 break; 2057 break;
2062
2063 case PR_SET_NAME: 2058 case PR_SET_NAME:
2064 comm[sizeof(me->comm)-1] = 0; 2059 comm[sizeof(me->comm)-1] = 0;
2065 if (strncpy_from_user(comm, (char __user *)arg2, 2060 if (strncpy_from_user(comm, (char __user *)arg2,
@@ -2067,20 +2062,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2067 return -EFAULT; 2062 return -EFAULT;
2068 set_task_comm(me, comm); 2063 set_task_comm(me, comm);
2069 proc_comm_connector(me); 2064 proc_comm_connector(me);
2070 return 0; 2065 break;
2071 case PR_GET_NAME: 2066 case PR_GET_NAME:
2072 get_task_comm(comm, me); 2067 get_task_comm(comm, me);
2073 if (copy_to_user((char __user *)arg2, comm, 2068 if (copy_to_user((char __user *)arg2, comm,
2074 sizeof(comm))) 2069 sizeof(comm)))
2075 return -EFAULT; 2070 return -EFAULT;
2076 return 0; 2071 break;
2077 case PR_GET_ENDIAN: 2072 case PR_GET_ENDIAN:
2078 error = GET_ENDIAN(me, arg2); 2073 error = GET_ENDIAN(me, arg2);
2079 break; 2074 break;
2080 case PR_SET_ENDIAN: 2075 case PR_SET_ENDIAN:
2081 error = SET_ENDIAN(me, arg2); 2076 error = SET_ENDIAN(me, arg2);
2082 break; 2077 break;
2083
2084 case PR_GET_SECCOMP: 2078 case PR_GET_SECCOMP:
2085 error = prctl_get_seccomp(); 2079 error = prctl_get_seccomp();
2086 break; 2080 break;
@@ -2108,7 +2102,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2108 current->default_timer_slack_ns; 2102 current->default_timer_slack_ns;
2109 else 2103 else
2110 current->timer_slack_ns = arg2; 2104 current->timer_slack_ns = arg2;
2111 error = 0;
2112 break; 2105 break;
2113 case PR_MCE_KILL: 2106 case PR_MCE_KILL:
2114 if (arg4 | arg5) 2107 if (arg4 | arg5)
@@ -2134,7 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2134 default: 2127 default:
2135 return -EINVAL; 2128 return -EINVAL;
2136 } 2129 }
2137 error = 0;
2138 break; 2130 break;
2139 case PR_MCE_KILL_GET: 2131 case PR_MCE_KILL_GET:
2140 if (arg2 | arg3 | arg4 | arg5) 2132 if (arg2 | arg3 | arg4 | arg5)
@@ -2153,7 +2145,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2153 break; 2145 break;
2154 case PR_SET_CHILD_SUBREAPER: 2146 case PR_SET_CHILD_SUBREAPER:
2155 me->signal->is_child_subreaper = !!arg2; 2147 me->signal->is_child_subreaper = !!arg2;
2156 error = 0;
2157 break; 2148 break;
2158 case PR_GET_CHILD_SUBREAPER: 2149 case PR_GET_CHILD_SUBREAPER:
2159 error = put_user(me->signal->is_child_subreaper, 2150 error = put_user(me->signal->is_child_subreaper,
@@ -2195,46 +2186,52 @@ static void argv_cleanup(struct subprocess_info *info)
2195 argv_free(info->argv); 2186 argv_free(info->argv);
2196} 2187}
2197 2188
2198/** 2189static int __orderly_poweroff(void)
2199 * orderly_poweroff - Trigger an orderly system poweroff
2200 * @force: force poweroff if command execution fails
2201 *
2202 * This may be called from any context to trigger a system shutdown.
2203 * If the orderly shutdown fails, it will force an immediate shutdown.
2204 */
2205int orderly_poweroff(bool force)
2206{ 2190{
2207 int argc; 2191 int argc;
2208 char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); 2192 char **argv;
2209 static char *envp[] = { 2193 static char *envp[] = {
2210 "HOME=/", 2194 "HOME=/",
2211 "PATH=/sbin:/bin:/usr/sbin:/usr/bin", 2195 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2212 NULL 2196 NULL
2213 }; 2197 };
2214 int ret = -ENOMEM; 2198 int ret;
2215 2199
2200 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
2216 if (argv == NULL) { 2201 if (argv == NULL) {
2217 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2202 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2218 __func__, poweroff_cmd); 2203 __func__, poweroff_cmd);
2219 goto out; 2204 return -ENOMEM;
2220 } 2205 }
2221 2206
2222 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, 2207 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
2223 NULL, argv_cleanup, NULL); 2208 NULL, argv_cleanup, NULL);
2224out:
2225 if (likely(!ret))
2226 return 0;
2227
2228 if (ret == -ENOMEM) 2209 if (ret == -ENOMEM)
2229 argv_free(argv); 2210 argv_free(argv);
2230 2211
2231 if (force) { 2212 return ret;
2213}
2214
2215/**
2216 * orderly_poweroff - Trigger an orderly system poweroff
2217 * @force: force poweroff if command execution fails
2218 *
2219 * This may be called from any context to trigger a system shutdown.
2220 * If the orderly shutdown fails, it will force an immediate shutdown.
2221 */
2222int orderly_poweroff(bool force)
2223{
2224 int ret = __orderly_poweroff();
2225
2226 if (ret && force) {
2232 printk(KERN_WARNING "Failed to start orderly shutdown: " 2227 printk(KERN_WARNING "Failed to start orderly shutdown: "
2233 "forcing the issue\n"); 2228 "forcing the issue\n");
2234 2229
2235 /* I guess this should try to kick off some daemon to 2230 /*
2236 sync and poweroff asap. Or not even bother syncing 2231 * I guess this should try to kick off some daemon to sync and
2237 if we're doing an emergency shutdown? */ 2232 * poweroff asap. Or not even bother syncing if we're doing an
2233 * emergency shutdown?
2234 */
2238 emergency_sync(); 2235 emergency_sync();
2239 kernel_power_off(); 2236 kernel_power_off();
2240 } 2237 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ab11879aeb4..87174ef59161 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,6 +30,7 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/ctype.h> 31#include <linux/ctype.h>
32#include <linux/kmemcheck.h> 32#include <linux/kmemcheck.h>
33#include <linux/kmemleak.h>
33#include <linux/fs.h> 34#include <linux/fs.h>
34#include <linux/init.h> 35#include <linux/init.h>
35#include <linux/kernel.h> 36#include <linux/kernel.h>
@@ -174,6 +175,11 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
174 void __user *buffer, size_t *lenp, loff_t *ppos); 175 void __user *buffer, size_t *lenp, loff_t *ppos);
175#endif 176#endif
176 177
178static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
179 void __user *buffer, size_t *lenp, loff_t *ppos);
180static int proc_dostring_coredump(struct ctl_table *table, int write,
181 void __user *buffer, size_t *lenp, loff_t *ppos);
182
177#ifdef CONFIG_MAGIC_SYSRQ 183#ifdef CONFIG_MAGIC_SYSRQ
178/* Note: sysrq code uses it's own private copy */ 184/* Note: sysrq code uses it's own private copy */
179static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 185static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -410,7 +416,7 @@ static struct ctl_table kern_table[] = {
410 .data = core_pattern, 416 .data = core_pattern,
411 .maxlen = CORENAME_MAX_SIZE, 417 .maxlen = CORENAME_MAX_SIZE,
412 .mode = 0644, 418 .mode = 0644,
413 .proc_handler = proc_dostring, 419 .proc_handler = proc_dostring_coredump,
414 }, 420 },
415 { 421 {
416 .procname = "core_pipe_limit", 422 .procname = "core_pipe_limit",
@@ -1095,11 +1101,9 @@ static struct ctl_table vm_table[] = {
1095 .extra1 = &zero, 1101 .extra1 = &zero,
1096 }, 1102 },
1097 { 1103 {
1098 .procname = "nr_pdflush_threads", 1104 .procname = "nr_pdflush_threads",
1099 .data = &nr_pdflush_threads, 1105 .mode = 0444 /* read-only */,
1100 .maxlen = sizeof nr_pdflush_threads, 1106 .proc_handler = pdflush_proc_obsolete,
1101 .mode = 0444 /* read-only*/,
1102 .proc_handler = proc_dointvec,
1103 }, 1107 },
1104 { 1108 {
1105 .procname = "swappiness", 1109 .procname = "swappiness",
@@ -1494,11 +1498,29 @@ static struct ctl_table fs_table[] = {
1494#endif 1498#endif
1495#endif 1499#endif
1496 { 1500 {
1501 .procname = "protected_symlinks",
1502 .data = &sysctl_protected_symlinks,
1503 .maxlen = sizeof(int),
1504 .mode = 0600,
1505 .proc_handler = proc_dointvec_minmax,
1506 .extra1 = &zero,
1507 .extra2 = &one,
1508 },
1509 {
1510 .procname = "protected_hardlinks",
1511 .data = &sysctl_protected_hardlinks,
1512 .maxlen = sizeof(int),
1513 .mode = 0600,
1514 .proc_handler = proc_dointvec_minmax,
1515 .extra1 = &zero,
1516 .extra2 = &one,
1517 },
1518 {
1497 .procname = "suid_dumpable", 1519 .procname = "suid_dumpable",
1498 .data = &suid_dumpable, 1520 .data = &suid_dumpable,
1499 .maxlen = sizeof(int), 1521 .maxlen = sizeof(int),
1500 .mode = 0644, 1522 .mode = 0644,
1501 .proc_handler = proc_dointvec_minmax, 1523 .proc_handler = proc_dointvec_minmax_coredump,
1502 .extra1 = &zero, 1524 .extra1 = &zero,
1503 .extra2 = &two, 1525 .extra2 = &two,
1504 }, 1526 },
@@ -1551,7 +1573,10 @@ static struct ctl_table dev_table[] = {
1551 1573
1552int __init sysctl_init(void) 1574int __init sysctl_init(void)
1553{ 1575{
1554 register_sysctl_table(sysctl_base_table); 1576 struct ctl_table_header *hdr;
1577
1578 hdr = register_sysctl_table(sysctl_base_table);
1579 kmemleak_not_leak(hdr);
1555 return 0; 1580 return 0;
1556} 1581}
1557 1582
@@ -2009,6 +2034,34 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2009 do_proc_dointvec_minmax_conv, &param); 2034 do_proc_dointvec_minmax_conv, &param);
2010} 2035}
2011 2036
2037static void validate_coredump_safety(void)
2038{
2039 if (suid_dumpable == SUID_DUMPABLE_SAFE &&
2040 core_pattern[0] != '/' && core_pattern[0] != '|') {
2041 printk(KERN_WARNING "Unsafe core_pattern used with "\
2042 "suid_dumpable=2. Pipe handler or fully qualified "\
2043 "core dump path required.\n");
2044 }
2045}
2046
2047static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
2048 void __user *buffer, size_t *lenp, loff_t *ppos)
2049{
2050 int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
2051 if (!error)
2052 validate_coredump_safety();
2053 return error;
2054}
2055
2056static int proc_dostring_coredump(struct ctl_table *table, int write,
2057 void __user *buffer, size_t *lenp, loff_t *ppos)
2058{
2059 int error = proc_dostring(table, write, buffer, lenp, ppos);
2060 if (!error)
2061 validate_coredump_safety();
2062 return error;
2063}
2064
2012static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2065static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2013 void __user *buffer, 2066 void __user *buffer,
2014 size_t *lenp, loff_t *ppos, 2067 size_t *lenp, loff_t *ppos,
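
The two new handlers are thin wrappers: run the ordinary parser and, on success, re-check the combined suid_dumpable / core_pattern state. A sketch of that wrapping pattern without procfs (hypothetical setters standing in for proc_dointvec_minmax() and proc_dostring()):

#include <stdio.h>

#define SUID_DUMPABLE_SAFE 2

static int suid_dumpable;
static char core_pattern[128] = "core";

static void validate_coredump_safety(void)
{
	if (suid_dumpable == SUID_DUMPABLE_SAFE &&
	    core_pattern[0] != '/' && core_pattern[0] != '|')
		fprintf(stderr, "warning: suid_dumpable=2 needs a pipe handler "
			"or an absolute core_pattern\n");
}

/* Wrappers: apply the write with the plain handler, then re-validate. */
static int set_suid_dumpable(int value)
{
	suid_dumpable = value;     /* stands in for proc_dointvec_minmax() */
	validate_coredump_safety();
	return 0;
}

static int set_core_pattern(const char *value)
{
	snprintf(core_pattern, sizeof(core_pattern), "%s", value);  /* proc_dostring() */
	validate_coredump_safety();
	return 0;
}

int main(void)
{
	set_core_pattern("core.%p");              /* relative pattern */
	set_suid_dumpable(2);                     /* triggers the warning */
	set_core_pattern("/var/crash/core.%p");   /* absolute path: fine */
	return 0;
}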
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index a650694883a1..65bdcf198d4e 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = {
147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, 147 { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" },
148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ 148 /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ 149 /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
150 { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, 150 /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */
151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, 151 { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" },
152 /* VM_PAGEBUF unused */ 152 /* VM_PAGEBUF unused */
153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ 153 /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 82d1c794066d..d320d44903bd 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -3,82 +3,79 @@
3#include <linux/tracehook.h> 3#include <linux/tracehook.h>
4 4
5int 5int
6task_work_add(struct task_struct *task, struct task_work *twork, bool notify) 6task_work_add(struct task_struct *task, struct callback_head *twork, bool notify)
7{ 7{
8 struct callback_head *last, *first;
8 unsigned long flags; 9 unsigned long flags;
9 int err = -ESRCH;
10 10
11#ifndef TIF_NOTIFY_RESUME
12 if (notify)
13 return -ENOTSUPP;
14#endif
15 /* 11 /*
16 * We must not insert the new work if the task has already passed 12 * Not inserting the new work if the task has already passed
 17 * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait() 13 * exit_task_work() is the responsibility of callers.
18 * and check PF_EXITING under pi_lock.
19 */ 14 */
20 raw_spin_lock_irqsave(&task->pi_lock, flags); 15 raw_spin_lock_irqsave(&task->pi_lock, flags);
21 if (likely(!(task->flags & PF_EXITING))) { 16 last = task->task_works;
22 hlist_add_head(&twork->hlist, &task->task_works); 17 first = last ? last->next : twork;
23 err = 0; 18 twork->next = first;
24 } 19 if (last)
20 last->next = twork;
21 task->task_works = twork;
25 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 22 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
26 23
27 /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ 24 /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */
28 if (likely(!err) && notify) 25 if (notify)
29 set_notify_resume(task); 26 set_notify_resume(task);
30 return err; 27 return 0;
31} 28}
32 29
33struct task_work * 30struct callback_head *
34task_work_cancel(struct task_struct *task, task_work_func_t func) 31task_work_cancel(struct task_struct *task, task_work_func_t func)
35{ 32{
36 unsigned long flags; 33 unsigned long flags;
37 struct task_work *twork; 34 struct callback_head *last, *res = NULL;
38 struct hlist_node *pos;
39 35
40 raw_spin_lock_irqsave(&task->pi_lock, flags); 36 raw_spin_lock_irqsave(&task->pi_lock, flags);
41 hlist_for_each_entry(twork, pos, &task->task_works, hlist) { 37 last = task->task_works;
42 if (twork->func == func) { 38 if (last) {
43 hlist_del(&twork->hlist); 39 struct callback_head *q = last, *p = q->next;
44 goto found; 40 while (1) {
41 if (p->func == func) {
42 q->next = p->next;
43 if (p == last)
44 task->task_works = q == p ? NULL : q;
45 res = p;
46 break;
47 }
48 if (p == last)
49 break;
50 q = p;
51 p = q->next;
45 } 52 }
46 } 53 }
47 twork = NULL;
48 found:
49 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 54 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
50 55 return res;
51 return twork;
52} 56}
53 57
54void task_work_run(void) 58void task_work_run(void)
55{ 59{
56 struct task_struct *task = current; 60 struct task_struct *task = current;
57 struct hlist_head task_works; 61 struct callback_head *p, *q;
58 struct hlist_node *pos;
59 62
60 raw_spin_lock_irq(&task->pi_lock); 63 while (1) {
61 hlist_move_list(&task->task_works, &task_works); 64 raw_spin_lock_irq(&task->pi_lock);
62 raw_spin_unlock_irq(&task->pi_lock); 65 p = task->task_works;
66 task->task_works = NULL;
67 raw_spin_unlock_irq(&task->pi_lock);
63 68
64 if (unlikely(hlist_empty(&task_works))) 69 if (unlikely(!p))
65 return; 70 return;
66 /*
67 * We use hlist to save the space in task_struct, but we want fifo.
68 * Find the last entry, the list should be short, then process them
69 * in reverse order.
70 */
71 for (pos = task_works.first; pos->next; pos = pos->next)
72 ;
73 71
74 for (;;) { 72 q = p->next; /* head */
75 struct hlist_node **pprev = pos->pprev; 73 p->next = NULL; /* cut it */
76 struct task_work *twork = container_of(pos, struct task_work, 74 while (q) {
77 hlist); 75 p = q->next;
78 twork->func(twork); 76 q->func(q);
79 77 q = p;
80 if (pprev == &task_works.first) 78 cond_resched();
81 break; 79 }
82 pos = container_of(pprev, struct hlist_node, next);
83 } 80 }
84} 81}
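
The rewritten task_work above keeps a single pointer to the most recently queued callback, with the last element's next pointing back at the first; task_work_run() cuts that circle and walks it oldest-first. A single-threaded userspace sketch of the same list layout (no pi_lock, no task_struct):

#include <stdio.h>

struct callback_head {
	struct callback_head *next;
	void (*func)(struct callback_head *);
};

static struct callback_head *works;   /* points at the last queued work */

static void work_add(struct callback_head *twork)
{
	struct callback_head *last = works;

	twork->next = last ? last->next : twork;   /* new tail points at the head */
	if (last)
		last->next = twork;
	works = twork;
}

static void work_run(void)
{
	struct callback_head *p = works, *q;

	works = NULL;
	if (!p)
		return;
	q = p->next;          /* head of the list */
	p->next = NULL;       /* cut the circle at the tail */
	while (q) {
		p = q->next;
		q->func(q);   /* FIFO order: oldest first */
		q = p;
	}
}

static void hello(struct callback_head *cb) { printf("work %p\n", (void *)cb); }

int main(void)
{
	struct callback_head a = { .func = hello }, b = { .func = hello };

	work_add(&a);
	work_add(&b);
	work_run();           /* runs a, then b */
	return 0;
}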
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e66046456f4f..d0a32796550f 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -436,6 +436,11 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
436 436
437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 437 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
438 sizeof(struct cgroupstats)); 438 sizeof(struct cgroupstats));
439 if (na == NULL) {
440 rc = -EMSGSIZE;
441 goto err;
442 }
443
439 stats = nla_data(na); 444 stats = nla_data(na);
440 memset(stats, 0, sizeof(*stats)); 445 memset(stats, 0, sizeof(*stats));
441 446
diff --git a/kernel/time.c b/kernel/time.c
index ba744cf80696..d226c6a3fd28 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -30,7 +30,7 @@
30#include <linux/export.h> 30#include <linux/export.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/clocksource.h> 33#include <linux/timekeeper_internal.h>
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fd42bd452b75..8601f0db1261 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -16,6 +16,10 @@ config ARCH_CLOCKSOURCE_DATA
16config GENERIC_TIME_VSYSCALL 16config GENERIC_TIME_VSYSCALL
17 bool 17 bool
18 18
19# Timekeeping vsyscall support
20config GENERIC_TIME_VSYSCALL_OLD
21 bool
22
19# ktime_t scalar 64bit nsec representation 23# ktime_t scalar 64bit nsec representation
20config KTIME_SCALAR 24config KTIME_SCALAR
21 bool 25 bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index aa27d391bfc8..f11d83b12949 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -37,7 +37,6 @@
37static struct alarm_base { 37static struct alarm_base {
38 spinlock_t lock; 38 spinlock_t lock;
39 struct timerqueue_head timerqueue; 39 struct timerqueue_head timerqueue;
40 struct hrtimer timer;
41 ktime_t (*gettime)(void); 40 ktime_t (*gettime)(void);
42 clockid_t base_clockid; 41 clockid_t base_clockid;
43} alarm_bases[ALARM_NUMTYPE]; 42} alarm_bases[ALARM_NUMTYPE];
@@ -46,6 +45,8 @@ static struct alarm_base {
46static ktime_t freezer_delta; 45static ktime_t freezer_delta;
47static DEFINE_SPINLOCK(freezer_delta_lock); 46static DEFINE_SPINLOCK(freezer_delta_lock);
48 47
48static struct wakeup_source *ws;
49
49#ifdef CONFIG_RTC_CLASS 50#ifdef CONFIG_RTC_CLASS
50/* rtc timer and device for setting alarm wakeups at suspend */ 51/* rtc timer and device for setting alarm wakeups at suspend */
51static struct rtc_timer rtctimer; 52static struct rtc_timer rtctimer;
@@ -130,50 +131,35 @@ static inline void alarmtimer_rtc_timer_init(void) { }
130 * @base: pointer to the base where the timer is being run 131 * @base: pointer to the base where the timer is being run
131 * @alarm: pointer to alarm being enqueued. 132 * @alarm: pointer to alarm being enqueued.
132 * 133 *
133 * Adds alarm to an alarm_base timerqueue and if necessary sets 134 * Adds alarm to an alarm_base timerqueue
134 * an hrtimer to run.
135 * 135 *
136 * Must hold base->lock when calling. 136 * Must hold base->lock when calling.
137 */ 137 */
138static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) 138static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
139{ 139{
140 if (alarm->state & ALARMTIMER_STATE_ENQUEUED)
141 timerqueue_del(&base->timerqueue, &alarm->node);
142
140 timerqueue_add(&base->timerqueue, &alarm->node); 143 timerqueue_add(&base->timerqueue, &alarm->node);
141 alarm->state |= ALARMTIMER_STATE_ENQUEUED; 144 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
142
143 if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
144 hrtimer_try_to_cancel(&base->timer);
145 hrtimer_start(&base->timer, alarm->node.expires,
146 HRTIMER_MODE_ABS);
147 }
148} 145}
149 146
150/** 147/**
151 * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue 148 * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue
152 * @base: pointer to the base where the timer is running 149 * @base: pointer to the base where the timer is running
153 * @alarm: pointer to alarm being removed 150 * @alarm: pointer to alarm being removed
154 * 151 *
155 * Removes alarm from an alarm_base timerqueue and if necessary sets 152 * Removes alarm from an alarm_base timerqueue
156 * a new timer to run.
157 * 153 *
158 * Must hold base->lock when calling. 154 * Must hold base->lock when calling.
159 */ 155 */
160static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) 156static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)
161{ 157{
162 struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
163
164 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) 158 if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))
165 return; 159 return;
166 160
167 timerqueue_del(&base->timerqueue, &alarm->node); 161 timerqueue_del(&base->timerqueue, &alarm->node);
168 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; 162 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
169
170 if (next == &alarm->node) {
171 hrtimer_try_to_cancel(&base->timer);
172 next = timerqueue_getnext(&base->timerqueue);
173 if (!next)
174 return;
175 hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
176 }
177} 163}
178 164
179 165
@@ -188,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
188 */ 174 */
189static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) 175static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
190{ 176{
191 struct alarm_base *base = container_of(timer, struct alarm_base, timer); 177 struct alarm *alarm = container_of(timer, struct alarm, timer);
192 struct timerqueue_node *next; 178 struct alarm_base *base = &alarm_bases[alarm->type];
193 unsigned long flags; 179 unsigned long flags;
194 ktime_t now;
195 int ret = HRTIMER_NORESTART; 180 int ret = HRTIMER_NORESTART;
196 int restart = ALARMTIMER_NORESTART; 181 int restart = ALARMTIMER_NORESTART;
197 182
198 spin_lock_irqsave(&base->lock, flags); 183 spin_lock_irqsave(&base->lock, flags);
199 now = base->gettime(); 184 alarmtimer_dequeue(base, alarm);
200 while ((next = timerqueue_getnext(&base->timerqueue))) { 185 spin_unlock_irqrestore(&base->lock, flags);
201 struct alarm *alarm;
202 ktime_t expired = next->expires;
203
204 if (expired.tv64 > now.tv64)
205 break;
206
207 alarm = container_of(next, struct alarm, node);
208
209 timerqueue_del(&base->timerqueue, &alarm->node);
210 alarm->state &= ~ALARMTIMER_STATE_ENQUEUED;
211
212 alarm->state |= ALARMTIMER_STATE_CALLBACK;
213 spin_unlock_irqrestore(&base->lock, flags);
214 if (alarm->function)
215 restart = alarm->function(alarm, now);
216 spin_lock_irqsave(&base->lock, flags);
217 alarm->state &= ~ALARMTIMER_STATE_CALLBACK;
218 186
219 if (restart != ALARMTIMER_NORESTART) { 187 if (alarm->function)
220 timerqueue_add(&base->timerqueue, &alarm->node); 188 restart = alarm->function(alarm, base->gettime());
221 alarm->state |= ALARMTIMER_STATE_ENQUEUED;
222 }
223 }
224 189
225 if (next) { 190 spin_lock_irqsave(&base->lock, flags);
226 hrtimer_set_expires(&base->timer, next->expires); 191 if (restart != ALARMTIMER_NORESTART) {
192 hrtimer_set_expires(&alarm->timer, alarm->node.expires);
193 alarmtimer_enqueue(base, alarm);
227 ret = HRTIMER_RESTART; 194 ret = HRTIMER_RESTART;
228 } 195 }
229 spin_unlock_irqrestore(&base->lock, flags); 196 spin_unlock_irqrestore(&base->lock, flags);
@@ -250,6 +217,7 @@ static int alarmtimer_suspend(struct device *dev)
250 unsigned long flags; 217 unsigned long flags;
251 struct rtc_device *rtc; 218 struct rtc_device *rtc;
252 int i; 219 int i;
220 int ret;
253 221
254 spin_lock_irqsave(&freezer_delta_lock, flags); 222 spin_lock_irqsave(&freezer_delta_lock, flags);
255 min = freezer_delta; 223 min = freezer_delta;
@@ -279,8 +247,10 @@ static int alarmtimer_suspend(struct device *dev)
279 if (min.tv64 == 0) 247 if (min.tv64 == 0)
280 return 0; 248 return 0;
281 249
282 /* XXX - Should we enforce a minimum sleep time? */ 250 if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
283 WARN_ON(min.tv64 < NSEC_PER_SEC); 251 __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
252 return -EBUSY;
253 }
284 254
285 /* Setup an rtc timer to fire that far in the future */ 255 /* Setup an rtc timer to fire that far in the future */
286 rtc_timer_cancel(rtc, &rtctimer); 256 rtc_timer_cancel(rtc, &rtctimer);
@@ -288,9 +258,11 @@ static int alarmtimer_suspend(struct device *dev)
288 now = rtc_tm_to_ktime(tm); 258 now = rtc_tm_to_ktime(tm);
289 now = ktime_add(now, min); 259 now = ktime_add(now, min);
290 260
291 rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); 261 /* Set the alarm; if it is in the past, briefly reject suspend so it can be handled */
292 262 ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
293 return 0; 263 if (ret < 0)
264 __pm_wakeup_event(ws, MSEC_PER_SEC);
265 return ret;
294} 266}
295#else 267#else
296static int alarmtimer_suspend(struct device *dev) 268static int alarmtimer_suspend(struct device *dev)
@@ -324,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
324 enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) 296 enum alarmtimer_restart (*function)(struct alarm *, ktime_t))
325{ 297{
326 timerqueue_init(&alarm->node); 298 timerqueue_init(&alarm->node);
299 hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid,
300 HRTIMER_MODE_ABS);
301 alarm->timer.function = alarmtimer_fired;
327 alarm->function = function; 302 alarm->function = function;
328 alarm->type = type; 303 alarm->type = type;
329 alarm->state = ALARMTIMER_STATE_INACTIVE; 304 alarm->state = ALARMTIMER_STATE_INACTIVE;
@@ -334,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
334 * @alarm: ptr to alarm to set 309 * @alarm: ptr to alarm to set
335 * @start: time to run the alarm 310 * @start: time to run the alarm
336 */ 311 */
337void alarm_start(struct alarm *alarm, ktime_t start) 312int alarm_start(struct alarm *alarm, ktime_t start)
338{ 313{
339 struct alarm_base *base = &alarm_bases[alarm->type]; 314 struct alarm_base *base = &alarm_bases[alarm->type];
340 unsigned long flags; 315 unsigned long flags;
316 int ret;
341 317
342 spin_lock_irqsave(&base->lock, flags); 318 spin_lock_irqsave(&base->lock, flags);
343 if (alarmtimer_active(alarm))
344 alarmtimer_remove(base, alarm);
345 alarm->node.expires = start; 319 alarm->node.expires = start;
346 alarmtimer_enqueue(base, alarm); 320 alarmtimer_enqueue(base, alarm);
321 ret = hrtimer_start(&alarm->timer, alarm->node.expires,
322 HRTIMER_MODE_ABS);
347 spin_unlock_irqrestore(&base->lock, flags); 323 spin_unlock_irqrestore(&base->lock, flags);
324 return ret;
348} 325}
349 326
350/** 327/**
@@ -358,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm)
358{ 335{
359 struct alarm_base *base = &alarm_bases[alarm->type]; 336 struct alarm_base *base = &alarm_bases[alarm->type];
360 unsigned long flags; 337 unsigned long flags;
361 int ret = -1; 338 int ret;
362 spin_lock_irqsave(&base->lock, flags);
363
364 if (alarmtimer_callback_running(alarm))
365 goto out;
366 339
367 if (alarmtimer_is_queued(alarm)) { 340 spin_lock_irqsave(&base->lock, flags);
368 alarmtimer_remove(base, alarm); 341 ret = hrtimer_try_to_cancel(&alarm->timer);
369 ret = 1; 342 if (ret >= 0)
370 } else 343 alarmtimer_dequeue(base, alarm);
371 ret = 0;
372out:
373 spin_unlock_irqrestore(&base->lock, flags); 344 spin_unlock_irqrestore(&base->lock, flags);
374 return ret; 345 return ret;
375} 346}
@@ -802,10 +773,6 @@ static int __init alarmtimer_init(void)
802 for (i = 0; i < ALARM_NUMTYPE; i++) { 773 for (i = 0; i < ALARM_NUMTYPE; i++) {
803 timerqueue_init_head(&alarm_bases[i].timerqueue); 774 timerqueue_init_head(&alarm_bases[i].timerqueue);
804 spin_lock_init(&alarm_bases[i].lock); 775 spin_lock_init(&alarm_bases[i].lock);
805 hrtimer_init(&alarm_bases[i].timer,
806 alarm_bases[i].base_clockid,
807 HRTIMER_MODE_ABS);
808 alarm_bases[i].timer.function = alarmtimer_fired;
809 } 776 }
810 777
811 error = alarmtimer_rtc_interface_setup(); 778 error = alarmtimer_rtc_interface_setup();
@@ -821,6 +788,7 @@ static int __init alarmtimer_init(void)
821 error = PTR_ERR(pdev); 788 error = PTR_ERR(pdev);
822 goto out_drv; 789 goto out_drv;
823 } 790 }
791 ws = wakeup_source_register("alarmtimer");
824 return 0; 792 return 0;
825 793
826out_drv: 794out_drv:
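Taken together, the alarmtimer changes above move the hrtimer from the per-clock alarm_base into each struct alarm: alarmtimer_enqueue()/alarmtimer_dequeue() now only maintain the timerqueue consulted by the suspend path, alarmtimer_fired() handles exactly one alarm, and alarm_start()/alarm_try_to_cancel() become thin wrappers around hrtimer_start()/hrtimer_try_to_cancel(), which is why alarm_start() now returns an int. A hedged caller-side sketch of the API after the patch (the callback, the names and the five-second expiry are made up for illustration):

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/alarmtimer.h>

/* Illustrative callback: alarmtimer_fired() calls this straight from the
 * alarm's own hrtimer and re-enqueues the alarm only if we ask for a
 * restart. */
static enum alarmtimer_restart demo_alarm_fn(struct alarm *a, ktime_t now)
{
	pr_info("alarm fired\n");
	return ALARMTIMER_NORESTART;
}

static struct alarm demo_alarm;

static int __init demo_alarm_setup(void)
{
	alarm_init(&demo_alarm, ALARM_REALTIME, demo_alarm_fn);
	/* alarm_start() now returns the hrtimer_start() result */
	return alarm_start(&demo_alarm,
			   ktime_add(ktime_get_real(), ktime_set(5, 0)));
}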
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a470154e0408..6629bf7b5285 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
37 * requested HZ value. It is also not recommended 37 * requested HZ value. It is also not recommended
38 * for "tick-less" systems. 38 * for "tick-less" systems.
39 */ 39 */
40#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) 40#define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ)
41 41
42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier 42/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
43 * conversion, the .shift value could be zero. However 43 * conversion, the .shift value could be zero. However
@@ -95,3 +95,33 @@ struct clocksource * __init __weak clocksource_default_clock(void)
95{ 95{
96 return &clocksource_jiffies; 96 return &clocksource_jiffies;
97} 97}
98
99struct clocksource refined_jiffies;
100
101int register_refined_jiffies(long cycles_per_second)
102{
103 u64 nsec_per_tick, shift_hz;
104 long cycles_per_tick;
105
106
107
108 refined_jiffies = clocksource_jiffies;
109 refined_jiffies.name = "refined-jiffies";
110 refined_jiffies.rating++;
111
112 /* Calc cycles per tick */
113 cycles_per_tick = (cycles_per_second + HZ/2)/HZ;
114 /* shift_hz stores hz<<8 for extra accuracy */
115 shift_hz = (u64)cycles_per_second << 8;
116 shift_hz += cycles_per_tick/2;
117 do_div(shift_hz, cycles_per_tick);
118 /* Calculate nsec_per_tick using shift_hz */
119 nsec_per_tick = (u64)NSEC_PER_SEC << 8;
120 nsec_per_tick += (u32)shift_hz/2;
121 do_div(nsec_per_tick, (u32)shift_hz);
122
123 refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
124
125 clocksource_register(&refined_jiffies);
126 return 0;
127}
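register_refined_jiffies() clones the jiffies clocksource and recomputes its mult from the real tick rate instead of assuming HZ is exact: cycles_per_tick = (cycles_per_second + HZ/2)/HZ, shift_hz = (cycles_per_second << 8)/cycles_per_tick (hz scaled by 256 for extra precision), nsec_per_tick = (NSEC_PER_SEC << 8)/shift_hz, and finally mult = nsec_per_tick << JIFFIES_SHIFT. A hedged usage sketch follows; the boot-time caller, the <linux/jiffies.h> declaration and the PIT rate are assumptions, not part of this hunk:

#include <linux/init.h>
#include <linux/jiffies.h>

/* Hypothetical arch boot code registering the refined clocksource once the
 * real timer-interrupt rate is known. With the 1193182 Hz PIT rate used
 * here and HZ == 1000, cycles_per_tick works out to 1193, i.e. a true tick
 * of about 1000.15 Hz, so the refined mult encodes roughly 999.85 us per
 * jiffy instead of the nominal 1 ms. */
static int __init demo_refine_jiffies(void)
{
	return register_refined_jiffies(1193182);
}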
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index b7fbadc5c973..24174b4d669b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock);
28/* USER_HZ period (usecs): */ 28/* USER_HZ period (usecs): */
29unsigned long tick_usec = TICK_USEC; 29unsigned long tick_usec = TICK_USEC;
30 30
31/* ACTHZ period (nsecs): */ 31/* SHIFTED_HZ period (nsecs): */
32unsigned long tick_nsec; 32unsigned long tick_nsec;
33 33
34static u64 tick_length; 34static u64 tick_length;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 41be02250e08..024540f97f74 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -105,7 +105,7 @@ static ktime_t tick_init_jiffy_update(void)
105/* 105/*
106 * NO HZ enabled ? 106 * NO HZ enabled ?
107 */ 107 */
108static int tick_nohz_enabled __read_mostly = 1; 108int tick_nohz_enabled __read_mostly = 1;
109 109
110/* 110/*
111 * Enable / Disable tickless mode 111 * Enable / Disable tickless mode
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f045cc50832d..16280ff3cf82 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/timekeeper_internal.h>
11#include <linux/module.h> 12#include <linux/module.h>
12#include <linux/interrupt.h> 13#include <linux/interrupt.h>
13#include <linux/percpu.h> 14#include <linux/percpu.h>
@@ -21,61 +22,6 @@
21#include <linux/tick.h> 22#include <linux/tick.h>
22#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
23 24
24/* Structure holding internal timekeeping values. */
25struct timekeeper {
26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock;
28 /* NTP adjusted clock multiplier */
29 u32 mult;
30 /* The shift value of the current clocksource. */
31 u32 shift;
32 /* Number of clock cycles in one NTP interval. */
33 cycle_t cycle_interval;
34 /* Number of clock shifted nano seconds in one NTP interval. */
35 u64 xtime_interval;
36 /* shifted nano seconds left over when rounding cycle_interval */
37 s64 xtime_remainder;
38 /* Raw nano seconds accumulated per NTP interval. */
39 u32 raw_interval;
40
41 /* Current CLOCK_REALTIME time in seconds */
42 u64 xtime_sec;
43 /* Clock shifted nano seconds */
44 u64 xtime_nsec;
45
46 /* Difference between accumulated time and NTP time in ntp
47 * shifted nano seconds. */
48 s64 ntp_error;
49 /* Shift conversion between clock shifted nano seconds and
50 * ntp shifted nano seconds. */
51 u32 ntp_error_shift;
52
53 /*
54 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
55 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
56 * at zero at system boot time, so wall_to_monotonic will be negative,
57 * however, we will ALWAYS keep the tv_nsec part positive so we can use
58 * the usual normalization.
59 *
60 * wall_to_monotonic is moved after resume from suspend for the
61 * monotonic time not to jump. We need to add total_sleep_time to
62 * wall_to_monotonic to get the real boot based time offset.
63 *
64 * - wall_to_monotonic is no longer the boot time, getboottime must be
65 * used instead.
66 */
67 struct timespec wall_to_monotonic;
68 /* time spent in suspend */
69 struct timespec total_sleep_time;
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time;
72 /* Offset clock monotonic -> clock realtime */
73 ktime_t offs_real;
74 /* Offset clock monotonic -> clock boottime */
75 ktime_t offs_boot;
76 /* Seqlock for all timekeeper values */
77 seqlock_t lock;
78};
79 25
80static struct timekeeper timekeeper; 26static struct timekeeper timekeeper;
81 27
@@ -96,25 +42,42 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)
96 } 42 }
97} 43}
98 44
99static struct timespec tk_xtime(struct timekeeper *tk)
100{
101 struct timespec ts;
102
103 ts.tv_sec = tk->xtime_sec;
104 ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
105 return ts;
106}
107
108static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) 45static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
109{ 46{
110 tk->xtime_sec = ts->tv_sec; 47 tk->xtime_sec = ts->tv_sec;
111 tk->xtime_nsec = ts->tv_nsec << tk->shift; 48 tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift;
112} 49}
113 50
114static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) 51static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
115{ 52{
116 tk->xtime_sec += ts->tv_sec; 53 tk->xtime_sec += ts->tv_sec;
117 tk->xtime_nsec += ts->tv_nsec << tk->shift; 54 tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift;
55 tk_normalize_xtime(tk);
56}
57
58static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
59{
60 struct timespec tmp;
61
62 /*
63 * Verify consistency of: offset_real = -wall_to_monotonic
64 * before modifying anything
65 */
66 set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec,
67 -tk->wall_to_monotonic.tv_nsec);
68 WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64);
69 tk->wall_to_monotonic = wtm;
70 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
71 tk->offs_real = timespec_to_ktime(tmp);
72}
73
74static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
75{
76 /* Verify consistency before modifying */
77 WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
78
79 tk->total_sleep_time = t;
80 tk->offs_boot = timespec_to_ktime(t);
118} 81}
119 82
120/** 83/**
@@ -217,29 +180,16 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
217 return nsec + arch_gettimeoffset(); 180 return nsec + arch_gettimeoffset();
218} 181}
219 182
220static void update_rt_offset(struct timekeeper *tk)
221{
222 struct timespec tmp, *wtm = &tk->wall_to_monotonic;
223
224 set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
225 tk->offs_real = timespec_to_ktime(tmp);
226}
227
228/* must hold write on timekeeper.lock */ 183/* must hold write on timekeeper.lock */
229static void timekeeping_update(struct timekeeper *tk, bool clearntp) 184static void timekeeping_update(struct timekeeper *tk, bool clearntp)
230{ 185{
231 struct timespec xt;
232
233 if (clearntp) { 186 if (clearntp) {
234 tk->ntp_error = 0; 187 tk->ntp_error = 0;
235 ntp_clear(); 188 ntp_clear();
236 } 189 }
237 update_rt_offset(tk); 190 update_vsyscall(tk);
238 xt = tk_xtime(tk);
239 update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
240} 191}
241 192
242
243/** 193/**
244 * timekeeping_forward_now - update clock to the current time 194 * timekeeping_forward_now - update clock to the current time
245 * 195 *
@@ -261,7 +211,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
261 tk->xtime_nsec += cycle_delta * tk->mult; 211 tk->xtime_nsec += cycle_delta * tk->mult;
262 212
263 /* If arch requires, add in gettimeoffset() */ 213 /* If arch requires, add in gettimeoffset() */
264 tk->xtime_nsec += arch_gettimeoffset() << tk->shift; 214 tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift;
265 215
266 tk_normalize_xtime(tk); 216 tk_normalize_xtime(tk);
267 217
@@ -277,38 +227,39 @@ static void timekeeping_forward_now(struct timekeeper *tk)
277 */ 227 */
278void getnstimeofday(struct timespec *ts) 228void getnstimeofday(struct timespec *ts)
279{ 229{
230 struct timekeeper *tk = &timekeeper;
280 unsigned long seq; 231 unsigned long seq;
281 s64 nsecs = 0; 232 s64 nsecs = 0;
282 233
283 WARN_ON(timekeeping_suspended); 234 WARN_ON(timekeeping_suspended);
284 235
285 do { 236 do {
286 seq = read_seqbegin(&timekeeper.lock); 237 seq = read_seqbegin(&tk->lock);
287 238
288 ts->tv_sec = timekeeper.xtime_sec; 239 ts->tv_sec = tk->xtime_sec;
289 ts->tv_nsec = timekeeping_get_ns(&timekeeper); 240 nsecs = timekeeping_get_ns(tk);
290 241
291 } while (read_seqretry(&timekeeper.lock, seq)); 242 } while (read_seqretry(&tk->lock, seq));
292 243
244 ts->tv_nsec = 0;
293 timespec_add_ns(ts, nsecs); 245 timespec_add_ns(ts, nsecs);
294} 246}
295EXPORT_SYMBOL(getnstimeofday); 247EXPORT_SYMBOL(getnstimeofday);
296 248
297ktime_t ktime_get(void) 249ktime_t ktime_get(void)
298{ 250{
251 struct timekeeper *tk = &timekeeper;
299 unsigned int seq; 252 unsigned int seq;
300 s64 secs, nsecs; 253 s64 secs, nsecs;
301 254
302 WARN_ON(timekeeping_suspended); 255 WARN_ON(timekeeping_suspended);
303 256
304 do { 257 do {
305 seq = read_seqbegin(&timekeeper.lock); 258 seq = read_seqbegin(&tk->lock);
306 secs = timekeeper.xtime_sec + 259 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
307 timekeeper.wall_to_monotonic.tv_sec; 260 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
308 nsecs = timekeeping_get_ns(&timekeeper) +
309 timekeeper.wall_to_monotonic.tv_nsec;
310 261
311 } while (read_seqretry(&timekeeper.lock, seq)); 262 } while (read_seqretry(&tk->lock, seq));
312 /* 263 /*
313 * Use ktime_set/ktime_add_ns to create a proper ktime on 264 * Use ktime_set/ktime_add_ns to create a proper ktime on
314 * 32-bit architectures without CONFIG_KTIME_SCALAR. 265 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -327,21 +278,24 @@ EXPORT_SYMBOL_GPL(ktime_get);
327 */ 278 */
328void ktime_get_ts(struct timespec *ts) 279void ktime_get_ts(struct timespec *ts)
329{ 280{
281 struct timekeeper *tk = &timekeeper;
330 struct timespec tomono; 282 struct timespec tomono;
283 s64 nsec;
331 unsigned int seq; 284 unsigned int seq;
332 285
333 WARN_ON(timekeeping_suspended); 286 WARN_ON(timekeeping_suspended);
334 287
335 do { 288 do {
336 seq = read_seqbegin(&timekeeper.lock); 289 seq = read_seqbegin(&tk->lock);
337 ts->tv_sec = timekeeper.xtime_sec; 290 ts->tv_sec = tk->xtime_sec;
338 ts->tv_nsec = timekeeping_get_ns(&timekeeper); 291 nsec = timekeeping_get_ns(tk);
339 tomono = timekeeper.wall_to_monotonic; 292 tomono = tk->wall_to_monotonic;
340 293
341 } while (read_seqretry(&timekeeper.lock, seq)); 294 } while (read_seqretry(&tk->lock, seq));
342 295
343 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, 296 ts->tv_sec += tomono.tv_sec;
344 ts->tv_nsec + tomono.tv_nsec); 297 ts->tv_nsec = 0;
298 timespec_add_ns(ts, nsec + tomono.tv_nsec);
345} 299}
346EXPORT_SYMBOL_GPL(ktime_get_ts); 300EXPORT_SYMBOL_GPL(ktime_get_ts);
347 301
@@ -358,22 +312,23 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
358 */ 312 */
359void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) 313void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
360{ 314{
315 struct timekeeper *tk = &timekeeper;
361 unsigned long seq; 316 unsigned long seq;
362 s64 nsecs_raw, nsecs_real; 317 s64 nsecs_raw, nsecs_real;
363 318
364 WARN_ON_ONCE(timekeeping_suspended); 319 WARN_ON_ONCE(timekeeping_suspended);
365 320
366 do { 321 do {
367 seq = read_seqbegin(&timekeeper.lock); 322 seq = read_seqbegin(&tk->lock);
368 323
369 *ts_raw = timekeeper.raw_time; 324 *ts_raw = tk->raw_time;
370 ts_real->tv_sec = timekeeper.xtime_sec; 325 ts_real->tv_sec = tk->xtime_sec;
371 ts_real->tv_nsec = 0; 326 ts_real->tv_nsec = 0;
372 327
373 nsecs_raw = timekeeping_get_ns_raw(&timekeeper); 328 nsecs_raw = timekeeping_get_ns_raw(tk);
374 nsecs_real = timekeeping_get_ns(&timekeeper); 329 nsecs_real = timekeeping_get_ns(tk);
375 330
376 } while (read_seqretry(&timekeeper.lock, seq)); 331 } while (read_seqretry(&tk->lock, seq));
377 332
378 timespec_add_ns(ts_raw, nsecs_raw); 333 timespec_add_ns(ts_raw, nsecs_raw);
379 timespec_add_ns(ts_real, nsecs_real); 334 timespec_add_ns(ts_real, nsecs_real);
@@ -406,28 +361,28 @@ EXPORT_SYMBOL(do_gettimeofday);
406 */ 361 */
407int do_settimeofday(const struct timespec *tv) 362int do_settimeofday(const struct timespec *tv)
408{ 363{
364 struct timekeeper *tk = &timekeeper;
409 struct timespec ts_delta, xt; 365 struct timespec ts_delta, xt;
410 unsigned long flags; 366 unsigned long flags;
411 367
412 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 368 if (!timespec_valid_strict(tv))
413 return -EINVAL; 369 return -EINVAL;
414 370
415 write_seqlock_irqsave(&timekeeper.lock, flags); 371 write_seqlock_irqsave(&tk->lock, flags);
416 372
417 timekeeping_forward_now(&timekeeper); 373 timekeeping_forward_now(tk);
418 374
419 xt = tk_xtime(&timekeeper); 375 xt = tk_xtime(tk);
420 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; 376 ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
421 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; 377 ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
422 378
423 timekeeper.wall_to_monotonic = 379 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta));
424 timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
425 380
426 tk_set_xtime(&timekeeper, tv); 381 tk_set_xtime(tk, tv);
427 382
428 timekeeping_update(&timekeeper, true); 383 timekeeping_update(tk, true);
429 384
430 write_sequnlock_irqrestore(&timekeeper.lock, flags); 385 write_sequnlock_irqrestore(&tk->lock, flags);
431 386
432 /* signal hrtimers about time change */ 387 /* signal hrtimers about time change */
433 clock_was_set(); 388 clock_was_set();
@@ -436,7 +391,6 @@ int do_settimeofday(const struct timespec *tv)
436} 391}
437EXPORT_SYMBOL(do_settimeofday); 392EXPORT_SYMBOL(do_settimeofday);
438 393
439
440/** 394/**
441 * timekeeping_inject_offset - Adds or subtracts from the current time. 395 * timekeeping_inject_offset - Adds or subtracts from the current time.
442 * @tv: pointer to the timespec variable containing the offset 396 * @tv: pointer to the timespec variable containing the offset
@@ -445,28 +399,37 @@ EXPORT_SYMBOL(do_settimeofday);
445 */ 399 */
446int timekeeping_inject_offset(struct timespec *ts) 400int timekeeping_inject_offset(struct timespec *ts)
447{ 401{
402 struct timekeeper *tk = &timekeeper;
448 unsigned long flags; 403 unsigned long flags;
404 struct timespec tmp;
405 int ret = 0;
449 406
450 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 407 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
451 return -EINVAL; 408 return -EINVAL;
452 409
453 write_seqlock_irqsave(&timekeeper.lock, flags); 410 write_seqlock_irqsave(&tk->lock, flags);
454 411
455 timekeeping_forward_now(&timekeeper); 412 timekeeping_forward_now(tk);
456 413
414 /* Make sure the proposed value is valid */
415 tmp = timespec_add(tk_xtime(tk), *ts);
416 if (!timespec_valid_strict(&tmp)) {
417 ret = -EINVAL;
418 goto error;
419 }
457 420
458 tk_xtime_add(&timekeeper, ts); 421 tk_xtime_add(tk, ts);
459 timekeeper.wall_to_monotonic = 422 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
460 timespec_sub(timekeeper.wall_to_monotonic, *ts);
461 423
462 timekeeping_update(&timekeeper, true); 424error: /* even if we error out, we forwarded the time, so call update */
425 timekeeping_update(tk, true);
463 426
464 write_sequnlock_irqrestore(&timekeeper.lock, flags); 427 write_sequnlock_irqrestore(&tk->lock, flags);
465 428
466 /* signal hrtimers about time change */ 429 /* signal hrtimers about time change */
467 clock_was_set(); 430 clock_was_set();
468 431
469 return 0; 432 return ret;
470} 433}
471EXPORT_SYMBOL(timekeeping_inject_offset); 434EXPORT_SYMBOL(timekeeping_inject_offset);
472 435
@@ -477,23 +440,24 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
477 */ 440 */
478static int change_clocksource(void *data) 441static int change_clocksource(void *data)
479{ 442{
443 struct timekeeper *tk = &timekeeper;
480 struct clocksource *new, *old; 444 struct clocksource *new, *old;
481 unsigned long flags; 445 unsigned long flags;
482 446
483 new = (struct clocksource *) data; 447 new = (struct clocksource *) data;
484 448
485 write_seqlock_irqsave(&timekeeper.lock, flags); 449 write_seqlock_irqsave(&tk->lock, flags);
486 450
487 timekeeping_forward_now(&timekeeper); 451 timekeeping_forward_now(tk);
488 if (!new->enable || new->enable(new) == 0) { 452 if (!new->enable || new->enable(new) == 0) {
489 old = timekeeper.clock; 453 old = tk->clock;
490 tk_setup_internals(&timekeeper, new); 454 tk_setup_internals(tk, new);
491 if (old->disable) 455 if (old->disable)
492 old->disable(old); 456 old->disable(old);
493 } 457 }
494 timekeeping_update(&timekeeper, true); 458 timekeeping_update(tk, true);
495 459
496 write_sequnlock_irqrestore(&timekeeper.lock, flags); 460 write_sequnlock_irqrestore(&tk->lock, flags);
497 461
498 return 0; 462 return 0;
499} 463}
@@ -507,7 +471,9 @@ static int change_clocksource(void *data)
507 */ 471 */
508void timekeeping_notify(struct clocksource *clock) 472void timekeeping_notify(struct clocksource *clock)
509{ 473{
510 if (timekeeper.clock == clock) 474 struct timekeeper *tk = &timekeeper;
475
476 if (tk->clock == clock)
511 return; 477 return;
512 stop_machine(change_clocksource, clock, NULL); 478 stop_machine(change_clocksource, clock, NULL);
513 tick_clock_notify(); 479 tick_clock_notify();
@@ -536,35 +502,36 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
536 */ 502 */
537void getrawmonotonic(struct timespec *ts) 503void getrawmonotonic(struct timespec *ts)
538{ 504{
505 struct timekeeper *tk = &timekeeper;
539 unsigned long seq; 506 unsigned long seq;
540 s64 nsecs; 507 s64 nsecs;
541 508
542 do { 509 do {
543 seq = read_seqbegin(&timekeeper.lock); 510 seq = read_seqbegin(&tk->lock);
544 nsecs = timekeeping_get_ns_raw(&timekeeper); 511 nsecs = timekeeping_get_ns_raw(tk);
545 *ts = timekeeper.raw_time; 512 *ts = tk->raw_time;
546 513
547 } while (read_seqretry(&timekeeper.lock, seq)); 514 } while (read_seqretry(&tk->lock, seq));
548 515
549 timespec_add_ns(ts, nsecs); 516 timespec_add_ns(ts, nsecs);
550} 517}
551EXPORT_SYMBOL(getrawmonotonic); 518EXPORT_SYMBOL(getrawmonotonic);
552 519
553
554/** 520/**
555 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres 521 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
556 */ 522 */
557int timekeeping_valid_for_hres(void) 523int timekeeping_valid_for_hres(void)
558{ 524{
525 struct timekeeper *tk = &timekeeper;
559 unsigned long seq; 526 unsigned long seq;
560 int ret; 527 int ret;
561 528
562 do { 529 do {
563 seq = read_seqbegin(&timekeeper.lock); 530 seq = read_seqbegin(&tk->lock);
564 531
565 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 532 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
566 533
567 } while (read_seqretry(&timekeeper.lock, seq)); 534 } while (read_seqretry(&tk->lock, seq));
568 535
569 return ret; 536 return ret;
570} 537}
@@ -574,15 +541,16 @@ int timekeeping_valid_for_hres(void)
574 */ 541 */
575u64 timekeeping_max_deferment(void) 542u64 timekeeping_max_deferment(void)
576{ 543{
544 struct timekeeper *tk = &timekeeper;
577 unsigned long seq; 545 unsigned long seq;
578 u64 ret; 546 u64 ret;
579 547
580 do { 548 do {
581 seq = read_seqbegin(&timekeeper.lock); 549 seq = read_seqbegin(&tk->lock);
582 550
583 ret = timekeeper.clock->max_idle_ns; 551 ret = tk->clock->max_idle_ns;
584 552
585 } while (read_seqretry(&timekeeper.lock, seq)); 553 } while (read_seqretry(&tk->lock, seq));
586 554
587 return ret; 555 return ret;
588} 556}
@@ -622,46 +590,56 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts)
622 */ 590 */
623void __init timekeeping_init(void) 591void __init timekeeping_init(void)
624{ 592{
593 struct timekeeper *tk = &timekeeper;
625 struct clocksource *clock; 594 struct clocksource *clock;
626 unsigned long flags; 595 unsigned long flags;
627 struct timespec now, boot; 596 struct timespec now, boot, tmp;
628 597
629 read_persistent_clock(&now); 598 read_persistent_clock(&now);
599 if (!timespec_valid_strict(&now)) {
600 pr_warn("WARNING: Persistent clock returned invalid value!\n"
601 " Check your CMOS/BIOS settings.\n");
602 now.tv_sec = 0;
603 now.tv_nsec = 0;
604 }
605
630 read_boot_clock(&boot); 606 read_boot_clock(&boot);
607 if (!timespec_valid_strict(&boot)) {
608 pr_warn("WARNING: Boot clock returned invalid value!\n"
609 " Check your CMOS/BIOS settings.\n");
610 boot.tv_sec = 0;
611 boot.tv_nsec = 0;
612 }
631 613
632 seqlock_init(&timekeeper.lock); 614 seqlock_init(&tk->lock);
633 615
634 ntp_init(); 616 ntp_init();
635 617
636 write_seqlock_irqsave(&timekeeper.lock, flags); 618 write_seqlock_irqsave(&tk->lock, flags);
637 clock = clocksource_default_clock(); 619 clock = clocksource_default_clock();
638 if (clock->enable) 620 if (clock->enable)
639 clock->enable(clock); 621 clock->enable(clock);
640 tk_setup_internals(&timekeeper, clock); 622 tk_setup_internals(tk, clock);
641 623
642 tk_set_xtime(&timekeeper, &now); 624 tk_set_xtime(tk, &now);
643 timekeeper.raw_time.tv_sec = 0; 625 tk->raw_time.tv_sec = 0;
644 timekeeper.raw_time.tv_nsec = 0; 626 tk->raw_time.tv_nsec = 0;
645 if (boot.tv_sec == 0 && boot.tv_nsec == 0) 627 if (boot.tv_sec == 0 && boot.tv_nsec == 0)
646 boot = tk_xtime(&timekeeper); 628 boot = tk_xtime(tk);
647 629
648 set_normalized_timespec(&timekeeper.wall_to_monotonic, 630 set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec);
649 -boot.tv_sec, -boot.tv_nsec); 631 tk_set_wall_to_mono(tk, tmp);
650 update_rt_offset(&timekeeper); 632
651 timekeeper.total_sleep_time.tv_sec = 0; 633 tmp.tv_sec = 0;
652 timekeeper.total_sleep_time.tv_nsec = 0; 634 tmp.tv_nsec = 0;
653 write_sequnlock_irqrestore(&timekeeper.lock, flags); 635 tk_set_sleep_time(tk, tmp);
636
637 write_sequnlock_irqrestore(&tk->lock, flags);
654} 638}
655 639
656/* time in seconds when suspend began */ 640/* time in seconds when suspend began */
657static struct timespec timekeeping_suspend_time; 641static struct timespec timekeeping_suspend_time;
658 642
659static void update_sleep_time(struct timespec t)
660{
661 timekeeper.total_sleep_time = t;
662 timekeeper.offs_boot = timespec_to_ktime(t);
663}
664
665/** 643/**
666 * __timekeeping_inject_sleeptime - Internal function to add sleep interval 644 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
667 * @delta: pointer to a timespec delta value 645 * @delta: pointer to a timespec delta value
@@ -672,18 +650,16 @@ static void update_sleep_time(struct timespec t)
672static void __timekeeping_inject_sleeptime(struct timekeeper *tk, 650static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
673 struct timespec *delta) 651 struct timespec *delta)
674{ 652{
675 if (!timespec_valid(delta)) { 653 if (!timespec_valid_strict(delta)) {
676 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " 654 printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
677 "sleep delta value!\n"); 655 "sleep delta value!\n");
678 return; 656 return;
679 } 657 }
680
681 tk_xtime_add(tk, delta); 658 tk_xtime_add(tk, delta);
682 tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta); 659 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
683 update_sleep_time(timespec_add(tk->total_sleep_time, *delta)); 660 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
684} 661}
685 662
686
687/** 663/**
688 * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values 664 * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values
689 * @delta: pointer to a timespec delta value 665 * @delta: pointer to a timespec delta value
@@ -696,6 +672,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
696 */ 672 */
697void timekeeping_inject_sleeptime(struct timespec *delta) 673void timekeeping_inject_sleeptime(struct timespec *delta)
698{ 674{
675 struct timekeeper *tk = &timekeeper;
699 unsigned long flags; 676 unsigned long flags;
700 struct timespec ts; 677 struct timespec ts;
701 678
@@ -704,21 +681,20 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
704 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 681 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
705 return; 682 return;
706 683
707 write_seqlock_irqsave(&timekeeper.lock, flags); 684 write_seqlock_irqsave(&tk->lock, flags);
708 685
709 timekeeping_forward_now(&timekeeper); 686 timekeeping_forward_now(tk);
710 687
711 __timekeeping_inject_sleeptime(&timekeeper, delta); 688 __timekeeping_inject_sleeptime(tk, delta);
712 689
713 timekeeping_update(&timekeeper, true); 690 timekeeping_update(tk, true);
714 691
715 write_sequnlock_irqrestore(&timekeeper.lock, flags); 692 write_sequnlock_irqrestore(&tk->lock, flags);
716 693
717 /* signal hrtimers about time change */ 694 /* signal hrtimers about time change */
718 clock_was_set(); 695 clock_was_set();
719} 696}
720 697
721
722/** 698/**
723 * timekeeping_resume - Resumes the generic timekeeping subsystem. 699 * timekeeping_resume - Resumes the generic timekeeping subsystem.
724 * 700 *
@@ -728,6 +704,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
728 */ 704 */
729static void timekeeping_resume(void) 705static void timekeeping_resume(void)
730{ 706{
707 struct timekeeper *tk = &timekeeper;
731 unsigned long flags; 708 unsigned long flags;
732 struct timespec ts; 709 struct timespec ts;
733 710
@@ -735,18 +712,18 @@ static void timekeeping_resume(void)
735 712
736 clocksource_resume(); 713 clocksource_resume();
737 714
738 write_seqlock_irqsave(&timekeeper.lock, flags); 715 write_seqlock_irqsave(&tk->lock, flags);
739 716
740 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 717 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
741 ts = timespec_sub(ts, timekeeping_suspend_time); 718 ts = timespec_sub(ts, timekeeping_suspend_time);
742 __timekeeping_inject_sleeptime(&timekeeper, &ts); 719 __timekeeping_inject_sleeptime(tk, &ts);
743 } 720 }
744 /* re-base the last cycle value */ 721 /* re-base the last cycle value */
745 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 722 tk->clock->cycle_last = tk->clock->read(tk->clock);
746 timekeeper.ntp_error = 0; 723 tk->ntp_error = 0;
747 timekeeping_suspended = 0; 724 timekeeping_suspended = 0;
748 timekeeping_update(&timekeeper, false); 725 timekeeping_update(tk, false);
749 write_sequnlock_irqrestore(&timekeeper.lock, flags); 726 write_sequnlock_irqrestore(&tk->lock, flags);
750 727
751 touch_softlockup_watchdog(); 728 touch_softlockup_watchdog();
752 729
@@ -758,14 +735,15 @@ static void timekeeping_resume(void)
758 735
759static int timekeeping_suspend(void) 736static int timekeeping_suspend(void)
760{ 737{
738 struct timekeeper *tk = &timekeeper;
761 unsigned long flags; 739 unsigned long flags;
762 struct timespec delta, delta_delta; 740 struct timespec delta, delta_delta;
763 static struct timespec old_delta; 741 static struct timespec old_delta;
764 742
765 read_persistent_clock(&timekeeping_suspend_time); 743 read_persistent_clock(&timekeeping_suspend_time);
766 744
767 write_seqlock_irqsave(&timekeeper.lock, flags); 745 write_seqlock_irqsave(&tk->lock, flags);
768 timekeeping_forward_now(&timekeeper); 746 timekeeping_forward_now(tk);
769 timekeeping_suspended = 1; 747 timekeeping_suspended = 1;
770 748
771 /* 749 /*
@@ -774,7 +752,7 @@ static int timekeeping_suspend(void)
774 * try to compensate so the difference in system time 752 * try to compensate so the difference in system time
775 * and persistent_clock time stays close to constant. 753 * and persistent_clock time stays close to constant.
776 */ 754 */
777 delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time); 755 delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time);
778 delta_delta = timespec_sub(delta, old_delta); 756 delta_delta = timespec_sub(delta, old_delta);
779 if (abs(delta_delta.tv_sec) >= 2) { 757 if (abs(delta_delta.tv_sec) >= 2) {
780 /* 758 /*
@@ -787,7 +765,7 @@ static int timekeeping_suspend(void)
787 timekeeping_suspend_time = 765 timekeeping_suspend_time =
788 timespec_add(timekeeping_suspend_time, delta_delta); 766 timespec_add(timekeeping_suspend_time, delta_delta);
789 } 767 }
790 write_sequnlock_irqrestore(&timekeeper.lock, flags); 768 write_sequnlock_irqrestore(&tk->lock, flags);
791 769
792 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 770 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
793 clocksource_suspend(); 771 clocksource_suspend();
@@ -898,27 +876,29 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
898 * the error. This causes the likely below to be unlikely. 876 * the error. This causes the likely below to be unlikely.
899 * 877 *
900 * The proper fix is to avoid rounding up by using 878 * The proper fix is to avoid rounding up by using
901 * the high precision timekeeper.xtime_nsec instead of 879 * the high precision tk->xtime_nsec instead of
902 * xtime.tv_nsec everywhere. Fixing this will take some 880 * xtime.tv_nsec everywhere. Fixing this will take some
903 * time. 881 * time.
904 */ 882 */
905 if (likely(error <= interval)) 883 if (likely(error <= interval))
906 adj = 1; 884 adj = 1;
907 else 885 else
908 adj = timekeeping_bigadjust(tk, error, &interval, 886 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
909 &offset); 887 } else {
910 } else if (error < -interval) { 888 if (error < -interval) {
911 /* See comment above, this is just switched for the negative */ 889 /* See comment above, this is just switched for the negative */
912 error >>= 2; 890 error >>= 2;
913 if (likely(error >= -interval)) { 891 if (likely(error >= -interval)) {
914 adj = -1; 892 adj = -1;
915 interval = -interval; 893 interval = -interval;
916 offset = -offset; 894 offset = -offset;
917 } else 895 } else {
918 adj = timekeeping_bigadjust(tk, error, &interval, 896 adj = timekeeping_bigadjust(tk, error, &interval, &offset);
919 &offset); 897 }
920 } else 898 } else {
921 return; 899 goto out_adjust;
900 }
901 }
922 902
923 if (unlikely(tk->clock->maxadj && 903 if (unlikely(tk->clock->maxadj &&
924 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { 904 (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
@@ -981,6 +961,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
981 tk->xtime_nsec -= offset; 961 tk->xtime_nsec -= offset;
982 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; 962 tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
983 963
964out_adjust:
984 /* 965 /*
985 * It may be possible that when we entered this function, xtime_nsec 966 * It may be possible that when we entered this function, xtime_nsec
986 * was very small. Further, if we're slightly speeding the clocksource 967 * was very small. Further, if we're slightly speeding the clocksource
@@ -1003,7 +984,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1003 984
1004} 985}
1005 986
1006
1007/** 987/**
1008 * accumulate_nsecs_to_secs - Accumulates nsecs into secs 988 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
1009 * 989 *
@@ -1024,15 +1004,21 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1024 1004
1025 /* Figure out if it's a leap sec and apply if needed */ 1005 /* Figure out if it's a leap sec and apply if needed */
1026 leap = second_overflow(tk->xtime_sec); 1006 leap = second_overflow(tk->xtime_sec);
1027 tk->xtime_sec += leap; 1007 if (unlikely(leap)) {
1028 tk->wall_to_monotonic.tv_sec -= leap; 1008 struct timespec ts;
1029 if (leap) 1009
1030 clock_was_set_delayed(); 1010 tk->xtime_sec += leap;
1031 1011
1012 ts.tv_sec = leap;
1013 ts.tv_nsec = 0;
1014 tk_set_wall_to_mono(tk,
1015 timespec_sub(tk->wall_to_monotonic, ts));
1016
1017 clock_was_set_delayed();
1018 }
1032 } 1019 }
1033} 1020}
1034 1021
1035
1036/** 1022/**
1037 * logarithmic_accumulation - shifted accumulation of cycles 1023 * logarithmic_accumulation - shifted accumulation of cycles
1038 * 1024 *
@@ -1076,6 +1062,32 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1076 return offset; 1062 return offset;
1077} 1063}
1078 1064
1065#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
1066static inline void old_vsyscall_fixup(struct timekeeper *tk)
1067{
1068 s64 remainder;
1069
1070 /*
1071 * Store only full nanoseconds into xtime_nsec after rounding
1072 * it up and add the remainder to the error difference.
1073 * XXX - This is necessary to avoid small 1ns inconsistencies caused
1074 * by truncating the remainder in vsyscalls. However, it causes
1075 * additional work to be done in timekeeping_adjust(). Once
1076 * the vsyscall implementations are converted to use xtime_nsec
1077 * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
1078 * users are removed, this can be killed.
1079 */
1080 remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
1081 tk->xtime_nsec -= remainder;
1082 tk->xtime_nsec += 1ULL << tk->shift;
1083 tk->ntp_error += remainder << tk->ntp_error_shift;
1084
1085}
1086#else
1087#define old_vsyscall_fixup(tk)
1088#endif
1089
1090
1079 1091
1080/** 1092/**
1081 * update_wall_time - Uses the current clocksource to increment the wall time 1093 * update_wall_time - Uses the current clocksource to increment the wall time
@@ -1084,25 +1096,29 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1084static void update_wall_time(void) 1096static void update_wall_time(void)
1085{ 1097{
1086 struct clocksource *clock; 1098 struct clocksource *clock;
1099 struct timekeeper *tk = &timekeeper;
1087 cycle_t offset; 1100 cycle_t offset;
1088 int shift = 0, maxshift; 1101 int shift = 0, maxshift;
1089 unsigned long flags; 1102 unsigned long flags;
1090 s64 remainder;
1091 1103
1092 write_seqlock_irqsave(&timekeeper.lock, flags); 1104 write_seqlock_irqsave(&tk->lock, flags);
1093 1105
1094 /* Make sure we're fully resumed: */ 1106 /* Make sure we're fully resumed: */
1095 if (unlikely(timekeeping_suspended)) 1107 if (unlikely(timekeeping_suspended))
1096 goto out; 1108 goto out;
1097 1109
1098 clock = timekeeper.clock; 1110 clock = tk->clock;
1099 1111
1100#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1112#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1101 offset = timekeeper.cycle_interval; 1113 offset = tk->cycle_interval;
1102#else 1114#else
1103 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1115 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
1104#endif 1116#endif
1105 1117
1118 /* Check if there's really nothing to do */
1119 if (offset < tk->cycle_interval)
1120 goto out;
1121
1106 /* 1122 /*
1107 * With NO_HZ we may have to accumulate many cycle_intervals 1123 * With NO_HZ we may have to accumulate many cycle_intervals
1108 * (think "ticks") worth of time at once. To do this efficiently, 1124 * (think "ticks") worth of time at once. To do this efficiently,
@@ -1111,45 +1127,36 @@ static void update_wall_time(void)
1111 * chunk in one go, and then try to consume the next smaller 1127 * chunk in one go, and then try to consume the next smaller
1112 * doubled multiple. 1128 * doubled multiple.
1113 */ 1129 */
1114 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); 1130 shift = ilog2(offset) - ilog2(tk->cycle_interval);
1115 shift = max(0, shift); 1131 shift = max(0, shift);
1116 /* Bound shift to one less than what overflows tick_length */ 1132 /* Bound shift to one less than what overflows tick_length */
1117 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; 1133 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1118 shift = min(shift, maxshift); 1134 shift = min(shift, maxshift);
1119 while (offset >= timekeeper.cycle_interval) { 1135 while (offset >= tk->cycle_interval) {
1120 offset = logarithmic_accumulation(&timekeeper, offset, shift); 1136 offset = logarithmic_accumulation(tk, offset, shift);
1121 if(offset < timekeeper.cycle_interval<<shift) 1137 if (offset < tk->cycle_interval<<shift)
1122 shift--; 1138 shift--;
1123 } 1139 }
1124 1140
1125 /* correct the clock when NTP error is too big */ 1141 /* correct the clock when NTP error is too big */
1126 timekeeping_adjust(&timekeeper, offset); 1142 timekeeping_adjust(tk, offset);
1127
1128 1143
1129 /* 1144 /*
1130 * Store only full nanoseconds into xtime_nsec after rounding 1145 * XXX This can be killed once everyone converts
1131 * it up and add the remainder to the error difference. 1146 * to the new update_vsyscall.
1132 * XXX - This is necessary to avoid small 1ns inconsistnecies caused 1147 */
1133 * by truncating the remainder in vsyscalls. However, it causes 1148 old_vsyscall_fixup(tk);
1134 * additional work to be done in timekeeping_adjust(). Once
1135 * the vsyscall implementations are converted to use xtime_nsec
1136 * (shifted nanoseconds), this can be killed.
1137 */
1138 remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1);
1139 timekeeper.xtime_nsec -= remainder;
1140 timekeeper.xtime_nsec += 1 << timekeeper.shift;
1141 timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift;
1142 1149
1143 /* 1150 /*
1144 * Finally, make sure that after the rounding 1151 * Finally, make sure that after the rounding
1145 * xtime_nsec isn't larger than NSEC_PER_SEC 1152 * xtime_nsec isn't larger than NSEC_PER_SEC
1146 */ 1153 */
1147 accumulate_nsecs_to_secs(&timekeeper); 1154 accumulate_nsecs_to_secs(tk);
1148 1155
1149 timekeeping_update(&timekeeper, false); 1156 timekeeping_update(tk, false);
1150 1157
1151out: 1158out:
1152 write_sequnlock_irqrestore(&timekeeper.lock, flags); 1159 write_sequnlock_irqrestore(&tk->lock, flags);
1153 1160
1154} 1161}
1155 1162
@@ -1166,18 +1173,18 @@ out:
1166 */ 1173 */
1167void getboottime(struct timespec *ts) 1174void getboottime(struct timespec *ts)
1168{ 1175{
1176 struct timekeeper *tk = &timekeeper;
1169 struct timespec boottime = { 1177 struct timespec boottime = {
1170 .tv_sec = timekeeper.wall_to_monotonic.tv_sec + 1178 .tv_sec = tk->wall_to_monotonic.tv_sec +
1171 timekeeper.total_sleep_time.tv_sec, 1179 tk->total_sleep_time.tv_sec,
1172 .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + 1180 .tv_nsec = tk->wall_to_monotonic.tv_nsec +
1173 timekeeper.total_sleep_time.tv_nsec 1181 tk->total_sleep_time.tv_nsec
1174 }; 1182 };
1175 1183
1176 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 1184 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
1177} 1185}
1178EXPORT_SYMBOL_GPL(getboottime); 1186EXPORT_SYMBOL_GPL(getboottime);
1179 1187
1180
1181/** 1188/**
1182 * get_monotonic_boottime - Returns monotonic time since boot 1189 * get_monotonic_boottime - Returns monotonic time since boot
1183 * @ts: pointer to the timespec to be set 1190 * @ts: pointer to the timespec to be set
@@ -1189,22 +1196,25 @@ EXPORT_SYMBOL_GPL(getboottime);
1189 */ 1196 */
1190void get_monotonic_boottime(struct timespec *ts) 1197void get_monotonic_boottime(struct timespec *ts)
1191{ 1198{
1199 struct timekeeper *tk = &timekeeper;
1192 struct timespec tomono, sleep; 1200 struct timespec tomono, sleep;
1201 s64 nsec;
1193 unsigned int seq; 1202 unsigned int seq;
1194 1203
1195 WARN_ON(timekeeping_suspended); 1204 WARN_ON(timekeeping_suspended);
1196 1205
1197 do { 1206 do {
1198 seq = read_seqbegin(&timekeeper.lock); 1207 seq = read_seqbegin(&tk->lock);
1199 ts->tv_sec = timekeeper.xtime_sec; 1208 ts->tv_sec = tk->xtime_sec;
1200 ts->tv_nsec = timekeeping_get_ns(&timekeeper); 1209 nsec = timekeeping_get_ns(tk);
1201 tomono = timekeeper.wall_to_monotonic; 1210 tomono = tk->wall_to_monotonic;
1202 sleep = timekeeper.total_sleep_time; 1211 sleep = tk->total_sleep_time;
1203 1212
1204 } while (read_seqretry(&timekeeper.lock, seq)); 1213 } while (read_seqretry(&tk->lock, seq));
1205 1214
1206 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, 1215 ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
1207 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); 1216 ts->tv_nsec = 0;
1217 timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
1208} 1218}
1209EXPORT_SYMBOL_GPL(get_monotonic_boottime); 1219EXPORT_SYMBOL_GPL(get_monotonic_boottime);
1210 1220
@@ -1231,31 +1241,38 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
1231 */ 1241 */
1232void monotonic_to_bootbased(struct timespec *ts) 1242void monotonic_to_bootbased(struct timespec *ts)
1233{ 1243{
1234 *ts = timespec_add(*ts, timekeeper.total_sleep_time); 1244 struct timekeeper *tk = &timekeeper;
1245
1246 *ts = timespec_add(*ts, tk->total_sleep_time);
1235} 1247}
1236EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1248EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
1237 1249
1238unsigned long get_seconds(void) 1250unsigned long get_seconds(void)
1239{ 1251{
1240 return timekeeper.xtime_sec; 1252 struct timekeeper *tk = &timekeeper;
1253
1254 return tk->xtime_sec;
1241} 1255}
1242EXPORT_SYMBOL(get_seconds); 1256EXPORT_SYMBOL(get_seconds);
1243 1257
1244struct timespec __current_kernel_time(void) 1258struct timespec __current_kernel_time(void)
1245{ 1259{
1246 return tk_xtime(&timekeeper); 1260 struct timekeeper *tk = &timekeeper;
1261
1262 return tk_xtime(tk);
1247} 1263}
1248 1264
1249struct timespec current_kernel_time(void) 1265struct timespec current_kernel_time(void)
1250{ 1266{
1267 struct timekeeper *tk = &timekeeper;
1251 struct timespec now; 1268 struct timespec now;
1252 unsigned long seq; 1269 unsigned long seq;
1253 1270
1254 do { 1271 do {
1255 seq = read_seqbegin(&timekeeper.lock); 1272 seq = read_seqbegin(&tk->lock);
1256 1273
1257 now = tk_xtime(&timekeeper); 1274 now = tk_xtime(tk);
1258 } while (read_seqretry(&timekeeper.lock, seq)); 1275 } while (read_seqretry(&tk->lock, seq));
1259 1276
1260 return now; 1277 return now;
1261} 1278}
@@ -1263,15 +1280,16 @@ EXPORT_SYMBOL(current_kernel_time);
1263 1280
1264struct timespec get_monotonic_coarse(void) 1281struct timespec get_monotonic_coarse(void)
1265{ 1282{
1283 struct timekeeper *tk = &timekeeper;
1266 struct timespec now, mono; 1284 struct timespec now, mono;
1267 unsigned long seq; 1285 unsigned long seq;
1268 1286
1269 do { 1287 do {
1270 seq = read_seqbegin(&timekeeper.lock); 1288 seq = read_seqbegin(&tk->lock);
1271 1289
1272 now = tk_xtime(&timekeeper); 1290 now = tk_xtime(tk);
1273 mono = timekeeper.wall_to_monotonic; 1291 mono = tk->wall_to_monotonic;
1274 } while (read_seqretry(&timekeeper.lock, seq)); 1292 } while (read_seqretry(&tk->lock, seq));
1275 1293
1276 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1294 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1277 now.tv_nsec + mono.tv_nsec); 1295 now.tv_nsec + mono.tv_nsec);
@@ -1300,14 +1318,15 @@ void do_timer(unsigned long ticks)
1300void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, 1318void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1301 struct timespec *wtom, struct timespec *sleep) 1319 struct timespec *wtom, struct timespec *sleep)
1302{ 1320{
1321 struct timekeeper *tk = &timekeeper;
1303 unsigned long seq; 1322 unsigned long seq;
1304 1323
1305 do { 1324 do {
1306 seq = read_seqbegin(&timekeeper.lock); 1325 seq = read_seqbegin(&tk->lock);
1307 *xtim = tk_xtime(&timekeeper); 1326 *xtim = tk_xtime(tk);
1308 *wtom = timekeeper.wall_to_monotonic; 1327 *wtom = tk->wall_to_monotonic;
1309 *sleep = timekeeper.total_sleep_time; 1328 *sleep = tk->total_sleep_time;
1310 } while (read_seqretry(&timekeeper.lock, seq)); 1329 } while (read_seqretry(&tk->lock, seq));
1311} 1330}
1312 1331
1313#ifdef CONFIG_HIGH_RES_TIMERS 1332#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1321,19 +1340,20 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1321 */ 1340 */
1322ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) 1341ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1323{ 1342{
1343 struct timekeeper *tk = &timekeeper;
1324 ktime_t now; 1344 ktime_t now;
1325 unsigned int seq; 1345 unsigned int seq;
1326 u64 secs, nsecs; 1346 u64 secs, nsecs;
1327 1347
1328 do { 1348 do {
1329 seq = read_seqbegin(&timekeeper.lock); 1349 seq = read_seqbegin(&tk->lock);
1330 1350
1331 secs = timekeeper.xtime_sec; 1351 secs = tk->xtime_sec;
1332 nsecs = timekeeping_get_ns(&timekeeper); 1352 nsecs = timekeeping_get_ns(tk);
1333 1353
1334 *offs_real = timekeeper.offs_real; 1354 *offs_real = tk->offs_real;
1335 *offs_boot = timekeeper.offs_boot; 1355 *offs_boot = tk->offs_boot;
1336 } while (read_seqretry(&timekeeper.lock, seq)); 1356 } while (read_seqretry(&tk->lock, seq));
1337 1357
1338 now = ktime_add_ns(ktime_set(secs, 0), nsecs); 1358 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1339 now = ktime_sub(now, *offs_real); 1359 now = ktime_sub(now, *offs_real);
@@ -1346,19 +1366,19 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1346 */ 1366 */
1347ktime_t ktime_get_monotonic_offset(void) 1367ktime_t ktime_get_monotonic_offset(void)
1348{ 1368{
1369 struct timekeeper *tk = &timekeeper;
1349 unsigned long seq; 1370 unsigned long seq;
1350 struct timespec wtom; 1371 struct timespec wtom;
1351 1372
1352 do { 1373 do {
1353 seq = read_seqbegin(&timekeeper.lock); 1374 seq = read_seqbegin(&tk->lock);
1354 wtom = timekeeper.wall_to_monotonic; 1375 wtom = tk->wall_to_monotonic;
1355 } while (read_seqretry(&timekeeper.lock, seq)); 1376 } while (read_seqretry(&tk->lock, seq));
1356 1377
1357 return timespec_to_ktime(wtom); 1378 return timespec_to_ktime(wtom);
1358} 1379}
1359EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); 1380EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1360 1381
1361
1362/** 1382/**
1363 * xtime_update() - advances the timekeeping infrastructure 1383 * xtime_update() - advances the timekeeping infrastructure
1364 * @ticks: number of ticks, that have elapsed since the last call. 1384 * @ticks: number of ticks, that have elapsed since the last call.
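Note: the timekeeping.c hunks above all make the same mechanical change — readers take a local struct timekeeper *tk = &timekeeper once and use tk throughout the seqlock read loop instead of naming the global repeatedly. The read-side pattern they share looks like this (a minimal sketch using the field and helper names visible above, not the exact tree code):

static struct timespec timekeeping_read_sketch(void)
{
	struct timekeeper *tk = &timekeeper;	/* local alias, as in the hunks above */
	struct timespec now;
	unsigned long seq;

	do {
		seq = read_seqbegin(&tk->lock);	/* open a read-side critical section */
		now = tk_xtime(tk);		/* snapshot whatever fields are needed */
	} while (read_seqretry(&tk->lock, seq));	/* retry if a writer interleaved */

	return now;
}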
diff --git a/kernel/timer.c b/kernel/timer.c
index 706fe4c53e82..d5de1b2292aa 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1393,13 +1393,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1393 1393
1394#endif 1394#endif
1395 1395
1396#ifndef __alpha__
1397
1398/*
1399 * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
1400 * should be moved into arch/i386 instead?
1401 */
1402
1403/** 1396/**
1404 * sys_getpid - return the thread group id of the current process 1397 * sys_getpid - return the thread group id of the current process
1405 * 1398 *
@@ -1455,8 +1448,6 @@ SYSCALL_DEFINE0(getegid)
1455 return from_kgid_munged(current_user_ns(), current_egid()); 1448 return from_kgid_munged(current_user_ns(), current_egid());
1456} 1449}
1457 1450
1458#endif
1459
1460static void process_timeout(unsigned long __data) 1451static void process_timeout(unsigned long __data)
1461{ 1452{
1462 wake_up_process((struct task_struct *)__data); 1453 wake_up_process((struct task_struct *)__data);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a008663d86c8..b4f20fba09fc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -312,7 +312,7 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
312 312
313static int __register_ftrace_function(struct ftrace_ops *ops) 313static int __register_ftrace_function(struct ftrace_ops *ops)
314{ 314{
315 if (ftrace_disabled) 315 if (unlikely(ftrace_disabled))
316 return -ENODEV; 316 return -ENODEV;
317 317
318 if (FTRACE_WARN_ON(ops == &global_ops)) 318 if (FTRACE_WARN_ON(ops == &global_ops))
@@ -4299,16 +4299,12 @@ int register_ftrace_function(struct ftrace_ops *ops)
4299 4299
4300 mutex_lock(&ftrace_lock); 4300 mutex_lock(&ftrace_lock);
4301 4301
4302 if (unlikely(ftrace_disabled))
4303 goto out_unlock;
4304
4305 ret = __register_ftrace_function(ops); 4302 ret = __register_ftrace_function(ops);
4306 if (!ret) 4303 if (!ret)
4307 ret = ftrace_startup(ops, 0); 4304 ret = ftrace_startup(ops, 0);
4308 4305
4309
4310 out_unlock:
4311 mutex_unlock(&ftrace_lock); 4306 mutex_unlock(&ftrace_lock);
4307
4312 return ret; 4308 return ret;
4313} 4309}
4314EXPORT_SYMBOL_GPL(register_ftrace_function); 4310EXPORT_SYMBOL_GPL(register_ftrace_function);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f765465bffe4..49491fa7daa2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3239,6 +3239,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
3239 if (cpu_buffer->commit_page == cpu_buffer->reader_page) 3239 if (cpu_buffer->commit_page == cpu_buffer->reader_page)
3240 goto out; 3240 goto out;
3241 3241
3242 /* Don't bother swapping if the ring buffer is empty */
3243 if (rb_num_of_entries(cpu_buffer) == 0)
3244 goto out;
3245
3242 /* 3246 /*
3243 * Reset the reader page to size zero. 3247 * Reset the reader page to size zero.
3244 */ 3248 */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a7fa0702be1c..5c38c81496ce 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -830,6 +830,8 @@ int register_tracer(struct tracer *type)
830 current_trace = saved_tracer; 830 current_trace = saved_tracer;
831 if (ret) { 831 if (ret) {
832 printk(KERN_CONT "FAILED!\n"); 832 printk(KERN_CONT "FAILED!\n");
833 /* Add the warning after printing 'FAILED' */
834 WARN_ON(1);
833 goto out; 835 goto out;
834 } 836 }
835 /* Only reset on passing, to avoid touching corrupted buffers */ 837 /* Only reset on passing, to avoid touching corrupted buffers */
@@ -1708,9 +1710,11 @@ EXPORT_SYMBOL_GPL(trace_vprintk);
1708 1710
1709static void trace_iterator_increment(struct trace_iterator *iter) 1711static void trace_iterator_increment(struct trace_iterator *iter)
1710{ 1712{
1713 struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, iter->cpu);
1714
1711 iter->idx++; 1715 iter->idx++;
1712 if (iter->buffer_iter[iter->cpu]) 1716 if (buf_iter)
1713 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); 1717 ring_buffer_read(buf_iter, NULL);
1714} 1718}
1715 1719
1716static struct trace_entry * 1720static struct trace_entry *
@@ -1718,7 +1722,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1718 unsigned long *lost_events) 1722 unsigned long *lost_events)
1719{ 1723{
1720 struct ring_buffer_event *event; 1724 struct ring_buffer_event *event;
1721 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1725 struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);
1722 1726
1723 if (buf_iter) 1727 if (buf_iter)
1724 event = ring_buffer_iter_peek(buf_iter, ts); 1728 event = ring_buffer_iter_peek(buf_iter, ts);
@@ -1856,10 +1860,10 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1856 1860
1857 tr->data[cpu]->skipped_entries = 0; 1861 tr->data[cpu]->skipped_entries = 0;
1858 1862
1859 if (!iter->buffer_iter[cpu]) 1863 buf_iter = trace_buffer_iter(iter, cpu);
1864 if (!buf_iter)
1860 return; 1865 return;
1861 1866
1862 buf_iter = iter->buffer_iter[cpu];
1863 ring_buffer_iter_reset(buf_iter); 1867 ring_buffer_iter_reset(buf_iter);
1864 1868
1865 /* 1869 /*
@@ -2205,13 +2209,15 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2205 2209
2206int trace_empty(struct trace_iterator *iter) 2210int trace_empty(struct trace_iterator *iter)
2207{ 2211{
2212 struct ring_buffer_iter *buf_iter;
2208 int cpu; 2213 int cpu;
2209 2214
2210 /* If we are looking at one CPU buffer, only check that one */ 2215 /* If we are looking at one CPU buffer, only check that one */
2211 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { 2216 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
2212 cpu = iter->cpu_file; 2217 cpu = iter->cpu_file;
2213 if (iter->buffer_iter[cpu]) { 2218 buf_iter = trace_buffer_iter(iter, cpu);
2214 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) 2219 if (buf_iter) {
2220 if (!ring_buffer_iter_empty(buf_iter))
2215 return 0; 2221 return 0;
2216 } else { 2222 } else {
2217 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2223 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2221,8 +2227,9 @@ int trace_empty(struct trace_iterator *iter)
2221 } 2227 }
2222 2228
2223 for_each_tracing_cpu(cpu) { 2229 for_each_tracing_cpu(cpu) {
2224 if (iter->buffer_iter[cpu]) { 2230 buf_iter = trace_buffer_iter(iter, cpu);
2225 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) 2231 if (buf_iter) {
2232 if (!ring_buffer_iter_empty(buf_iter))
2226 return 0; 2233 return 0;
2227 } else { 2234 } else {
2228 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2235 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
@@ -2381,6 +2388,11 @@ __tracing_open(struct inode *inode, struct file *file)
2381 if (!iter) 2388 if (!iter)
2382 return ERR_PTR(-ENOMEM); 2389 return ERR_PTR(-ENOMEM);
2383 2390
2391 iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
2392 GFP_KERNEL);
2393 if (!iter->buffer_iter)
2394 goto release;
2395
2384 /* 2396 /*
2385 * We make a copy of the current tracer to avoid concurrent 2397 * We make a copy of the current tracer to avoid concurrent
2386 * changes on it while we are reading. 2398 * changes on it while we are reading.
@@ -2441,6 +2453,8 @@ __tracing_open(struct inode *inode, struct file *file)
2441 fail: 2453 fail:
2442 mutex_unlock(&trace_types_lock); 2454 mutex_unlock(&trace_types_lock);
2443 kfree(iter->trace); 2455 kfree(iter->trace);
2456 kfree(iter->buffer_iter);
2457release:
2444 seq_release_private(inode, file); 2458 seq_release_private(inode, file);
2445 return ERR_PTR(-ENOMEM); 2459 return ERR_PTR(-ENOMEM);
2446} 2460}
@@ -2481,6 +2495,7 @@ static int tracing_release(struct inode *inode, struct file *file)
2481 mutex_destroy(&iter->mutex); 2495 mutex_destroy(&iter->mutex);
2482 free_cpumask_var(iter->started); 2496 free_cpumask_var(iter->started);
2483 kfree(iter->trace); 2497 kfree(iter->trace);
2498 kfree(iter->buffer_iter);
2484 seq_release_private(inode, file); 2499 seq_release_private(inode, file);
2485 return 0; 2500 return 0;
2486} 2501}
@@ -3172,10 +3187,10 @@ static int tracing_set_tracer(const char *buf)
3172 } 3187 }
3173 destroy_trace_option_files(topts); 3188 destroy_trace_option_files(topts);
3174 3189
3175 current_trace = t; 3190 current_trace = &nop_trace;
3176 3191
3177 topts = create_trace_option_files(current_trace); 3192 topts = create_trace_option_files(t);
3178 if (current_trace->use_max_tr) { 3193 if (t->use_max_tr) {
3179 int cpu; 3194 int cpu;
3180 /* we need to make per cpu buffer sizes equivalent */ 3195 /* we need to make per cpu buffer sizes equivalent */
3181 for_each_tracing_cpu(cpu) { 3196 for_each_tracing_cpu(cpu) {
@@ -3195,6 +3210,7 @@ static int tracing_set_tracer(const char *buf)
3195 goto out; 3210 goto out;
3196 } 3211 }
3197 3212
3213 current_trace = t;
3198 trace_branch_enable(tr); 3214 trace_branch_enable(tr);
3199 out: 3215 out:
3200 mutex_unlock(&trace_types_lock); 3216 mutex_unlock(&trace_types_lock);
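Note: the tracing_set_tracer() hunk above fixes an ordering problem — current_trace is parked on nop_trace while the per-cpu max_tr buffers are resized and the new tracer is initialised, and the requested tracer is published only after those steps succeed, so nothing traces into half-prepared buffers. Condensed to the ordering itself (sketch; resize_and_init() is a hypothetical stand-in for the resize/init steps that follow in the function):

	destroy_trace_option_files(topts);

	current_trace = &nop_trace;		/* trace nothing while we rework buffers */
	topts = create_trace_option_files(t);

	if (resize_and_init(t, tr) < 0)		/* hypothetical: per-cpu resize + tracer init */
		goto out;			/* failure leaves nop_trace active */

	current_trace = t;			/* the new tracer goes live only here */
	trace_branch_enable(tr);
 out:
	mutex_unlock(&trace_types_lock);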
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5aec220d2de0..55e1f7f0db12 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -317,6 +317,14 @@ struct tracer {
317 317
318#define TRACE_PIPE_ALL_CPU -1 318#define TRACE_PIPE_ALL_CPU -1
319 319
320static inline struct ring_buffer_iter *
321trace_buffer_iter(struct trace_iterator *iter, int cpu)
322{
323 if (iter->buffer_iter && iter->buffer_iter[cpu])
324 return iter->buffer_iter[cpu];
325 return NULL;
326}
327
320int tracer_init(struct tracer *t, struct trace_array *tr); 328int tracer_init(struct tracer *t, struct trace_array *tr);
321int tracing_is_enabled(void); 329int tracing_is_enabled(void);
322void trace_wake_up(void); 330void trace_wake_up(void);
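Note: since iter->buffer_iter is now allocated on demand, trace_buffer_iter() above is the one NULL-safe way to reach a per-cpu ring_buffer_iter, and the trace.c hunks convert every direct iter->buffer_iter[cpu] access to it. A typical caller then falls back to the live buffer when no iterator exists, mirroring the trace_empty() hunk (sketch, not the exact tree code):

static bool cpu_has_entries(struct trace_iterator *iter, int cpu)
{
	struct ring_buffer_iter *buf_iter = trace_buffer_iter(iter, cpu);

	if (buf_iter)		/* a static snapshot iterator was opened for this cpu */
		return !ring_buffer_iter_empty(buf_iter);

	/* no iterator (e.g. trace_pipe): ask the live per-cpu buffer instead */
	return !ring_buffer_empty_cpu(iter->tr->buffer, cpu);
}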
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index fee3752ae8f6..8a6d2ee2086c 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -281,7 +281,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip)
281 281
282 head = this_cpu_ptr(event_function.perf_events); 282 head = this_cpu_ptr(event_function.perf_events);
283 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, 283 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
284 1, &regs, head); 284 1, &regs, head, NULL);
285 285
286#undef ENTRY_SIZE 286#undef ENTRY_SIZE
287} 287}
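Note: the same one-argument change recurs in the kprobe, syscall, and uprobe hunks below — perf_trace_buf_submit() has grown an eighth parameter and every existing caller passes NULL for it. From context this is assumed to be an optional target task pointer, with NULL preserving the old "deliver to the events hashed on this CPU" behaviour; treat the prototype below as an assumption, not a quote from the tree:

/* assumed widened prototype - the trailing task pointer is the new part */
void perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
			   u64 count, struct pt_regs *regs,
			   struct hlist_head *head,
			   struct task_struct *task /* NULL = no target task */);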
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c7b0c6a7db09..a426f410c060 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,6 +13,7 @@
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/pstore.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
17 18
18#include "trace.h" 19#include "trace.h"
@@ -74,6 +75,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
74 preempt_enable_notrace(); 75 preempt_enable_notrace();
75} 76}
76 77
78/* Our two options */
79enum {
80 TRACE_FUNC_OPT_STACK = 0x1,
81 TRACE_FUNC_OPT_PSTORE = 0x2,
82};
83
84static struct tracer_flags func_flags;
85
77static void 86static void
78function_trace_call(unsigned long ip, unsigned long parent_ip) 87function_trace_call(unsigned long ip, unsigned long parent_ip)
79{ 88{
@@ -97,6 +106,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
97 disabled = atomic_inc_return(&data->disabled); 106 disabled = atomic_inc_return(&data->disabled);
98 107
99 if (likely(disabled == 1)) { 108 if (likely(disabled == 1)) {
109 /*
110 * So far tracing doesn't support multiple buffers, so
111 * we make an explicit call for now.
112 */
113 if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE))
114 pstore_ftrace_call(ip, parent_ip);
100 pc = preempt_count(); 115 pc = preempt_count();
101 trace_function(tr, ip, parent_ip, flags, pc); 116 trace_function(tr, ip, parent_ip, flags, pc);
102 } 117 }
@@ -158,15 +173,13 @@ static struct ftrace_ops trace_stack_ops __read_mostly =
158 .flags = FTRACE_OPS_FL_GLOBAL, 173 .flags = FTRACE_OPS_FL_GLOBAL,
159}; 174};
160 175
161/* Our two options */
162enum {
163 TRACE_FUNC_OPT_STACK = 0x1,
164};
165
166static struct tracer_opt func_opts[] = { 176static struct tracer_opt func_opts[] = {
167#ifdef CONFIG_STACKTRACE 177#ifdef CONFIG_STACKTRACE
168 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, 178 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
169#endif 179#endif
180#ifdef CONFIG_PSTORE_FTRACE
181 { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) },
182#endif
170 { } /* Always set a last empty entry */ 183 { } /* Always set a last empty entry */
171}; 184};
172 185
@@ -204,10 +217,11 @@ static void tracing_stop_function_trace(void)
204 217
205static int func_set_flag(u32 old_flags, u32 bit, int set) 218static int func_set_flag(u32 old_flags, u32 bit, int set)
206{ 219{
207 if (bit == TRACE_FUNC_OPT_STACK) { 220 switch (bit) {
221 case TRACE_FUNC_OPT_STACK:
208 /* do nothing if already set */ 222 /* do nothing if already set */
209 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) 223 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
210 return 0; 224 break;
211 225
212 if (set) { 226 if (set) {
213 unregister_ftrace_function(&trace_ops); 227 unregister_ftrace_function(&trace_ops);
@@ -217,10 +231,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
217 register_ftrace_function(&trace_ops); 231 register_ftrace_function(&trace_ops);
218 } 232 }
219 233
220 return 0; 234 break;
235 case TRACE_FUNC_OPT_PSTORE:
236 break;
237 default:
238 return -EINVAL;
221 } 239 }
222 240
223 return -EINVAL; 241 return 0;
224} 242}
225 243
226static struct tracer function_trace __read_mostly = 244static struct tracer function_trace __read_mostly =
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a7d2a4c653d8..ce27c8ba8d31 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -538,7 +538,7 @@ get_return_for_leaf(struct trace_iterator *iter,
538 next = &data->ret; 538 next = &data->ret;
539 } else { 539 } else {
540 540
541 ring_iter = iter->buffer_iter[iter->cpu]; 541 ring_iter = trace_buffer_iter(iter, iter->cpu);
542 542
543 /* First peek to compare current entry and the next one */ 543 /* First peek to compare current entry and the next one */
544 if (ring_iter) 544 if (ring_iter)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b31d3d5699fe..1a2117043bb1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1002 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1002 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1003 1003
1004 head = this_cpu_ptr(call->perf_events); 1004 head = this_cpu_ptr(call->perf_events);
1005 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 1005 perf_trace_buf_submit(entry, size, rctx,
1006 entry->ip, 1, regs, head, NULL);
1006} 1007}
1007 1008
1008/* Kretprobe profile handler */ 1009/* Kretprobe profile handler */
@@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1033 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1034 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1034 1035
1035 head = this_cpu_ptr(call->perf_events); 1036 head = this_cpu_ptr(call->perf_events);
1036 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); 1037 perf_trace_buf_submit(entry, size, rctx,
1038 entry->ret_ip, 1, regs, head, NULL);
1037} 1039}
1038#endif /* CONFIG_PERF_EVENTS */ 1040#endif /* CONFIG_PERF_EVENTS */
1039 1041
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index df611a0e76c5..123b189c732c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1325,4 +1325,4 @@ __init static int init_events(void)
1325 1325
1326 return 0; 1326 return 0;
1327} 1327}
1328device_initcall(init_events); 1328early_initcall(init_events);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 96fc73369099..6b245f64c8dd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -506,6 +506,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
506 int size; 506 int size;
507 507
508 syscall_nr = syscall_get_nr(current, regs); 508 syscall_nr = syscall_get_nr(current, regs);
509 if (syscall_nr < 0)
510 return;
509 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 511 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
510 return; 512 return;
511 513
@@ -532,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
532 (unsigned long *)&rec->args); 534 (unsigned long *)&rec->args);
533 535
534 head = this_cpu_ptr(sys_data->enter_event->perf_events); 536 head = this_cpu_ptr(sys_data->enter_event->perf_events);
535 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 537 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
536} 538}
537 539
538int perf_sysenter_enable(struct ftrace_event_call *call) 540int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -580,6 +582,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
580 int size; 582 int size;
581 583
582 syscall_nr = syscall_get_nr(current, regs); 584 syscall_nr = syscall_get_nr(current, regs);
585 if (syscall_nr < 0)
586 return;
583 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 587 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
584 return; 588 return;
585 589
@@ -608,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
608 rec->ret = syscall_get_return_value(current, regs); 612 rec->ret = syscall_get_return_value(current, regs);
609 613
610 head = this_cpu_ptr(sys_data->exit_event->perf_events); 614 head = this_cpu_ptr(sys_data->exit_event->perf_events);
611 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 615 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
612} 616}
613 617
614int perf_sysexit_enable(struct ftrace_event_call *call) 618int perf_sysexit_enable(struct ftrace_event_call *call)
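Note: the two identical guards above exist because syscall_get_nr() can return a negative number when the task is not inside a recognised system call; without the check, that value would be used to index enabled_perf_*_syscalls and the syscall metadata arrays. Abridged shape of the guarded handler (sketch of the hunk above, not new behaviour):

static void perf_syscall_enter_sketch(struct pt_regs *regs, long id)
{
	int syscall_nr = syscall_get_nr(current, regs);

	if (syscall_nr < 0)		/* not a traceable syscall: bail out early */
		return;
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
		return;			/* perf is not interested in this syscall */

	/* ... look up sys_data, fill the record, perf_trace_buf_submit(...) ... */
}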
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2b36ac68549e..03003cd7dd96 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
671 671
672 head = this_cpu_ptr(call->perf_events); 672 head = this_cpu_ptr(call->perf_events);
673 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); 673 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
674 674
675 out: 675 out:
676 preempt_enable(); 676 preempt_enable();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9a3128dc67df..1e1373bcb3e3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -45,32 +45,42 @@
45#include "workqueue_sched.h" 45#include "workqueue_sched.h"
46 46
47enum { 47enum {
48 /* global_cwq flags */ 48 /*
49 GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 49 * global_cwq flags
50 GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ 50 *
51 GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ 51 * A bound gcwq is either associated or disassociated with its CPU.
52 GCWQ_FREEZING = 1 << 3, /* freeze in progress */ 52 * While associated (!DISASSOCIATED), all workers are bound to the
53 GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ 53 * CPU and none has %WORKER_UNBOUND set and concurrency management
54 * is in effect.
55 *
56 * While DISASSOCIATED, the cpu may be offline and all workers have
57 * %WORKER_UNBOUND set and concurrency management disabled, and may
58 * be executing on any CPU. The gcwq behaves as an unbound one.
59 *
60 * Note that DISASSOCIATED can be flipped only while holding
61 * managership of all pools on the gcwq to avoid changing binding
62 * state while create_worker() is in progress.
63 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
65 GCWQ_FREEZING = 1 << 1, /* freeze in progress */
66
67 /* pool flags */
68 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
69 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
54 70
55 /* worker flags */ 71 /* worker flags */
56 WORKER_STARTED = 1 << 0, /* started */ 72 WORKER_STARTED = 1 << 0, /* started */
57 WORKER_DIE = 1 << 1, /* die die die */ 73 WORKER_DIE = 1 << 1, /* die die die */
58 WORKER_IDLE = 1 << 2, /* is idle */ 74 WORKER_IDLE = 1 << 2, /* is idle */
59 WORKER_PREP = 1 << 3, /* preparing to run works */ 75 WORKER_PREP = 1 << 3, /* preparing to run works */
60 WORKER_ROGUE = 1 << 4, /* not bound to any cpu */
61 WORKER_REBIND = 1 << 5, /* mom is home, come back */ 76 WORKER_REBIND = 1 << 5, /* mom is home, come back */
62 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 77 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
63 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 78 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
64 79
65 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | 80 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
66 WORKER_CPU_INTENSIVE | WORKER_UNBOUND, 81 WORKER_CPU_INTENSIVE,
67 82
68 /* gcwq->trustee_state */ 83 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */
69 TRUSTEE_START = 0, /* start */
70 TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */
71 TRUSTEE_BUTCHER = 2, /* butcher workers */
72 TRUSTEE_RELEASE = 3, /* release workers */
73 TRUSTEE_DONE = 4, /* trustee is done */
74 84
75 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 85 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
76 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, 86 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
@@ -84,13 +94,13 @@ enum {
84 (min two ticks) */ 94 (min two ticks) */
85 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ 95 MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
86 CREATE_COOLDOWN = HZ, /* time to breath after fail */ 96 CREATE_COOLDOWN = HZ, /* time to breath after fail */
87 TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */
88 97
89 /* 98 /*
90 * Rescue workers are used only on emergencies and shared by 99 * Rescue workers are used only on emergencies and shared by
91 * all cpus. Give -20. 100 * all cpus. Give -20.
92 */ 101 */
93 RESCUER_NICE_LEVEL = -20, 102 RESCUER_NICE_LEVEL = -20,
103 HIGHPRI_NICE_LEVEL = -20,
94}; 104};
95 105
96/* 106/*
@@ -115,6 +125,8 @@ enum {
115 */ 125 */
116 126
117struct global_cwq; 127struct global_cwq;
128struct worker_pool;
129struct idle_rebind;
118 130
119/* 131/*
120 * The poor guys doing the actual heavy lifting. All on-duty workers 132 * The poor guys doing the actual heavy lifting. All on-duty workers
@@ -131,12 +143,31 @@ struct worker {
131 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ 143 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
132 struct list_head scheduled; /* L: scheduled works */ 144 struct list_head scheduled; /* L: scheduled works */
133 struct task_struct *task; /* I: worker task */ 145 struct task_struct *task; /* I: worker task */
134 struct global_cwq *gcwq; /* I: the associated gcwq */ 146 struct worker_pool *pool; /* I: the associated pool */
135 /* 64 bytes boundary on 64bit, 32 on 32bit */ 147 /* 64 bytes boundary on 64bit, 32 on 32bit */
136 unsigned long last_active; /* L: last active timestamp */ 148 unsigned long last_active; /* L: last active timestamp */
137 unsigned int flags; /* X: flags */ 149 unsigned int flags; /* X: flags */
138 int id; /* I: worker id */ 150 int id; /* I: worker id */
139 struct work_struct rebind_work; /* L: rebind worker to cpu */ 151
152 /* for rebinding worker to CPU */
153 struct idle_rebind *idle_rebind; /* L: for idle worker */
154 struct work_struct rebind_work; /* L: for busy worker */
155};
156
157struct worker_pool {
158 struct global_cwq *gcwq; /* I: the owning gcwq */
159 unsigned int flags; /* X: flags */
160
161 struct list_head worklist; /* L: list of pending works */
162 int nr_workers; /* L: total number of workers */
163 int nr_idle; /* L: currently idle ones */
164
165 struct list_head idle_list; /* X: list of idle workers */
166 struct timer_list idle_timer; /* L: worker idle timeout */
167 struct timer_list mayday_timer; /* L: SOS timer for workers */
168
169 struct mutex manager_mutex; /* mutex manager should hold */
170 struct ida worker_ida; /* L: for worker IDs */
140}; 171};
141 172
142/* 173/*
@@ -146,27 +177,16 @@ struct worker {
146 */ 177 */
147struct global_cwq { 178struct global_cwq {
148 spinlock_t lock; /* the gcwq lock */ 179 spinlock_t lock; /* the gcwq lock */
149 struct list_head worklist; /* L: list of pending works */
150 unsigned int cpu; /* I: the associated cpu */ 180 unsigned int cpu; /* I: the associated cpu */
151 unsigned int flags; /* L: GCWQ_* flags */ 181 unsigned int flags; /* L: GCWQ_* flags */
152 182
153 int nr_workers; /* L: total number of workers */ 183 /* workers are chained either in busy_hash or pool idle_list */
154 int nr_idle; /* L: currently idle ones */
155
156 /* workers are chained either in the idle_list or busy_hash */
157 struct list_head idle_list; /* X: list of idle workers */
158 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; 184 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
159 /* L: hash of busy workers */ 185 /* L: hash of busy workers */
160 186
161 struct timer_list idle_timer; /* L: worker idle timeout */ 187 struct worker_pool pools[2]; /* normal and highpri pools */
162 struct timer_list mayday_timer; /* L: SOS timer for dworkers */
163
164 struct ida worker_ida; /* L: for worker IDs */
165 188
166 struct task_struct *trustee; /* L: for gcwq shutdown */ 189 wait_queue_head_t rebind_hold; /* rebind hold wait */
167 unsigned int trustee_state; /* L: trustee state */
168 wait_queue_head_t trustee_wait; /* trustee wait */
169 struct worker *first_idle; /* L: first idle worker */
170} ____cacheline_aligned_in_smp; 190} ____cacheline_aligned_in_smp;
171 191
172/* 192/*
@@ -175,7 +195,7 @@ struct global_cwq {
175 * aligned at two's power of the number of flag bits. 195 * aligned at two's power of the number of flag bits.
176 */ 196 */
177struct cpu_workqueue_struct { 197struct cpu_workqueue_struct {
178 struct global_cwq *gcwq; /* I: the associated gcwq */ 198 struct worker_pool *pool; /* I: the associated pool */
179 struct workqueue_struct *wq; /* I: the owning workqueue */ 199 struct workqueue_struct *wq; /* I: the owning workqueue */
180 int work_color; /* L: current color */ 200 int work_color; /* L: current color */
181 int flush_color; /* L: flushing color */ 201 int flush_color; /* L: flushing color */
@@ -264,6 +284,10 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
264#define CREATE_TRACE_POINTS 284#define CREATE_TRACE_POINTS
265#include <trace/events/workqueue.h> 285#include <trace/events/workqueue.h>
266 286
287#define for_each_worker_pool(pool, gcwq) \
288 for ((pool) = &(gcwq)->pools[0]; \
289 (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
290
267#define for_each_busy_worker(worker, i, pos, gcwq) \ 291#define for_each_busy_worker(worker, i, pos, gcwq) \
268 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 292 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
269 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) 293 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -444,7 +468,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */
444 * try_to_wake_up(). Put it in a separate cacheline. 468 * try_to_wake_up(). Put it in a separate cacheline.
445 */ 469 */
446static DEFINE_PER_CPU(struct global_cwq, global_cwq); 470static DEFINE_PER_CPU(struct global_cwq, global_cwq);
447static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); 471static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);
448 472
449/* 473/*
450 * Global cpu workqueue and nr_running counter for unbound gcwq. The 474 * Global cpu workqueue and nr_running counter for unbound gcwq. The
@@ -452,10 +476,17 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
452 * workers have WORKER_UNBOUND set. 476 * workers have WORKER_UNBOUND set.
453 */ 477 */
454static struct global_cwq unbound_global_cwq; 478static struct global_cwq unbound_global_cwq;
455static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ 479static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
480 [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
481};
456 482
457static int worker_thread(void *__worker); 483static int worker_thread(void *__worker);
458 484
485static int worker_pool_pri(struct worker_pool *pool)
486{
487 return pool - pool->gcwq->pools;
488}
489
459static struct global_cwq *get_gcwq(unsigned int cpu) 490static struct global_cwq *get_gcwq(unsigned int cpu)
460{ 491{
461 if (cpu != WORK_CPU_UNBOUND) 492 if (cpu != WORK_CPU_UNBOUND)
@@ -464,12 +495,15 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
464 return &unbound_global_cwq; 495 return &unbound_global_cwq;
465} 496}
466 497
467static atomic_t *get_gcwq_nr_running(unsigned int cpu) 498static atomic_t *get_pool_nr_running(struct worker_pool *pool)
468{ 499{
500 int cpu = pool->gcwq->cpu;
501 int idx = worker_pool_pri(pool);
502
469 if (cpu != WORK_CPU_UNBOUND) 503 if (cpu != WORK_CPU_UNBOUND)
470 return &per_cpu(gcwq_nr_running, cpu); 504 return &per_cpu(pool_nr_running, cpu)[idx];
471 else 505 else
472 return &unbound_gcwq_nr_running; 506 return &unbound_pool_nr_running[idx];
473} 507}
474 508
475static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, 509static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
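Note: worker_pool_pri() and get_pool_nr_running() above rely purely on array layout — gcwq->pools[0] is the normal pool and pools[1] the highpri pool, so subtracting the base pointer doubles as the index into the matching per-cpu pool_nr_running[] slot. A self-contained userspace toy showing the pointer-arithmetic indexing (structs stripped down to what the trick needs):

#include <stdio.h>

#define NR_WORKER_POOLS 2

struct global_cwq;
struct worker_pool { struct global_cwq *gcwq; };
struct global_cwq  { struct worker_pool pools[NR_WORKER_POOLS]; };

static int worker_pool_pri(struct worker_pool *pool)
{
	return pool - pool->gcwq->pools;	/* 0 = normal, 1 = highpri */
}

int main(void)
{
	struct global_cwq gcwq;
	int i;

	for (i = 0; i < NR_WORKER_POOLS; i++)
		gcwq.pools[i].gcwq = &gcwq;	/* pools carry a backpointer, as in the patch */

	printf("%d %d\n", worker_pool_pri(&gcwq.pools[0]),
	       worker_pool_pri(&gcwq.pools[1]));	/* prints "0 1" */
	return 0;
}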
@@ -555,7 +589,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
555 589
556 if (data & WORK_STRUCT_CWQ) 590 if (data & WORK_STRUCT_CWQ)
557 return ((struct cpu_workqueue_struct *) 591 return ((struct cpu_workqueue_struct *)
558 (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; 592 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
559 593
560 cpu = data >> WORK_STRUCT_FLAG_BITS; 594 cpu = data >> WORK_STRUCT_FLAG_BITS;
561 if (cpu == WORK_CPU_NONE) 595 if (cpu == WORK_CPU_NONE)
@@ -566,60 +600,62 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
566} 600}
567 601
568/* 602/*
569 * Policy functions. These define the policies on how the global 603 * Policy functions. These define the policies on how the global worker
570 * worker pool is managed. Unless noted otherwise, these functions 604 * pools are managed. Unless noted otherwise, these functions assume that
571 * assume that they're being called with gcwq->lock held. 605 * they're being called with gcwq->lock held.
572 */ 606 */
573 607
574static bool __need_more_worker(struct global_cwq *gcwq) 608static bool __need_more_worker(struct worker_pool *pool)
575{ 609{
576 return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || 610 return !atomic_read(get_pool_nr_running(pool));
577 gcwq->flags & GCWQ_HIGHPRI_PENDING;
578} 611}
579 612
580/* 613/*
581 * Need to wake up a worker? Called from anything but currently 614 * Need to wake up a worker? Called from anything but currently
582 * running workers. 615 * running workers.
616 *
617 * Note that, because unbound workers never contribute to nr_running, this
618 * function will always return %true for unbound gcwq as long as the
619 * worklist isn't empty.
583 */ 620 */
584static bool need_more_worker(struct global_cwq *gcwq) 621static bool need_more_worker(struct worker_pool *pool)
585{ 622{
586 return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); 623 return !list_empty(&pool->worklist) && __need_more_worker(pool);
587} 624}
588 625
589/* Can I start working? Called from busy but !running workers. */ 626/* Can I start working? Called from busy but !running workers. */
590static bool may_start_working(struct global_cwq *gcwq) 627static bool may_start_working(struct worker_pool *pool)
591{ 628{
592 return gcwq->nr_idle; 629 return pool->nr_idle;
593} 630}
594 631
595/* Do I need to keep working? Called from currently running workers. */ 632/* Do I need to keep working? Called from currently running workers. */
596static bool keep_working(struct global_cwq *gcwq) 633static bool keep_working(struct worker_pool *pool)
597{ 634{
598 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 635 atomic_t *nr_running = get_pool_nr_running(pool);
599 636
600 return !list_empty(&gcwq->worklist) && 637 return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
601 (atomic_read(nr_running) <= 1 ||
602 gcwq->flags & GCWQ_HIGHPRI_PENDING);
603} 638}
604 639
605/* Do we need a new worker? Called from manager. */ 640/* Do we need a new worker? Called from manager. */
606static bool need_to_create_worker(struct global_cwq *gcwq) 641static bool need_to_create_worker(struct worker_pool *pool)
607{ 642{
608 return need_more_worker(gcwq) && !may_start_working(gcwq); 643 return need_more_worker(pool) && !may_start_working(pool);
609} 644}
610 645
611/* Do I need to be the manager? */ 646/* Do I need to be the manager? */
612static bool need_to_manage_workers(struct global_cwq *gcwq) 647static bool need_to_manage_workers(struct worker_pool *pool)
613{ 648{
614 return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; 649 return need_to_create_worker(pool) ||
650 (pool->flags & POOL_MANAGE_WORKERS);
615} 651}
616 652
617/* Do we have too many workers and should some go away? */ 653/* Do we have too many workers and should some go away? */
618static bool too_many_workers(struct global_cwq *gcwq) 654static bool too_many_workers(struct worker_pool *pool)
619{ 655{
620 bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; 656 bool managing = pool->flags & POOL_MANAGING_WORKERS;
621 int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ 657 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
622 int nr_busy = gcwq->nr_workers - nr_idle; 658 int nr_busy = pool->nr_workers - nr_idle;
623 659
624 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; 660 return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
625} 661}
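Note: these predicates are the whole of concurrency management at the pool level — per-pool nr_running counts workers that are actually runnable, and work is handed out only while that count stays at or below one. The worker main loop is expected to consult them roughly like this (schematic sketch; manage_workers() and process_one_work() live elsewhere in this file and are not shown in the hunks above):

	/* called with gcwq->lock held, worker already accounted as running */
	if (!need_more_worker(pool))		/* someone runnable is already on it */
		goto sleep;

	if (unlikely(!may_start_working(pool)) && manage_workers(worker))
		goto recheck;			/* created/culled workers; re-evaluate */

	do {
		struct work_struct *work =
			list_first_entry(&pool->worklist, struct work_struct, entry);

		process_one_work(worker, work);	/* drops and retakes gcwq->lock */
	} while (keep_working(pool));		/* stop once another worker is running */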
@@ -629,26 +665,26 @@ static bool too_many_workers(struct global_cwq *gcwq)
629 */ 665 */
630 666
631/* Return the first worker. Safe with preemption disabled */ 667/* Return the first worker. Safe with preemption disabled */
632static struct worker *first_worker(struct global_cwq *gcwq) 668static struct worker *first_worker(struct worker_pool *pool)
633{ 669{
634 if (unlikely(list_empty(&gcwq->idle_list))) 670 if (unlikely(list_empty(&pool->idle_list)))
635 return NULL; 671 return NULL;
636 672
637 return list_first_entry(&gcwq->idle_list, struct worker, entry); 673 return list_first_entry(&pool->idle_list, struct worker, entry);
638} 674}
639 675
640/** 676/**
641 * wake_up_worker - wake up an idle worker 677 * wake_up_worker - wake up an idle worker
642 * @gcwq: gcwq to wake worker for 678 * @pool: worker pool to wake worker from
643 * 679 *
644 * Wake up the first idle worker of @gcwq. 680 * Wake up the first idle worker of @pool.
645 * 681 *
646 * CONTEXT: 682 * CONTEXT:
647 * spin_lock_irq(gcwq->lock). 683 * spin_lock_irq(gcwq->lock).
648 */ 684 */
649static void wake_up_worker(struct global_cwq *gcwq) 685static void wake_up_worker(struct worker_pool *pool)
650{ 686{
651 struct worker *worker = first_worker(gcwq); 687 struct worker *worker = first_worker(pool);
652 688
653 if (likely(worker)) 689 if (likely(worker))
654 wake_up_process(worker->task); 690 wake_up_process(worker->task);
@@ -670,7 +706,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
670 struct worker *worker = kthread_data(task); 706 struct worker *worker = kthread_data(task);
671 707
672 if (!(worker->flags & WORKER_NOT_RUNNING)) 708 if (!(worker->flags & WORKER_NOT_RUNNING))
673 atomic_inc(get_gcwq_nr_running(cpu)); 709 atomic_inc(get_pool_nr_running(worker->pool));
674} 710}
675 711
676/** 712/**
@@ -692,8 +728,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
692 unsigned int cpu) 728 unsigned int cpu)
693{ 729{
694 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 730 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
695 struct global_cwq *gcwq = get_gcwq(cpu); 731 struct worker_pool *pool = worker->pool;
696 atomic_t *nr_running = get_gcwq_nr_running(cpu); 732 atomic_t *nr_running = get_pool_nr_running(pool);
697 733
698 if (worker->flags & WORKER_NOT_RUNNING) 734 if (worker->flags & WORKER_NOT_RUNNING)
699 return NULL; 735 return NULL;
@@ -706,14 +742,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
706 * worklist not empty test sequence is in insert_work(). 742 * worklist not empty test sequence is in insert_work().
707 * Please read comment there. 743 * Please read comment there.
708 * 744 *
709 * NOT_RUNNING is clear. This means that trustee is not in 745 * NOT_RUNNING is clear. This means that we're bound to and
710 * charge and we're running on the local cpu w/ rq lock held 746 * running on the local cpu w/ rq lock held and preemption
711 * and preemption disabled, which in turn means that none else 747 * disabled, which in turn means that none else could be
712 * could be manipulating idle_list, so dereferencing idle_list 748 * manipulating idle_list, so dereferencing idle_list without gcwq
713 * without gcwq lock is safe. 749 * lock is safe.
714 */ 750 */
715 if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) 751 if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
716 to_wakeup = first_worker(gcwq); 752 to_wakeup = first_worker(pool);
717 return to_wakeup ? to_wakeup->task : NULL; 753 return to_wakeup ? to_wakeup->task : NULL;
718} 754}
719 755
@@ -733,7 +769,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
733static inline void worker_set_flags(struct worker *worker, unsigned int flags, 769static inline void worker_set_flags(struct worker *worker, unsigned int flags,
734 bool wakeup) 770 bool wakeup)
735{ 771{
736 struct global_cwq *gcwq = worker->gcwq; 772 struct worker_pool *pool = worker->pool;
737 773
738 WARN_ON_ONCE(worker->task != current); 774 WARN_ON_ONCE(worker->task != current);
739 775
@@ -744,12 +780,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
744 */ 780 */
745 if ((flags & WORKER_NOT_RUNNING) && 781 if ((flags & WORKER_NOT_RUNNING) &&
746 !(worker->flags & WORKER_NOT_RUNNING)) { 782 !(worker->flags & WORKER_NOT_RUNNING)) {
747 atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); 783 atomic_t *nr_running = get_pool_nr_running(pool);
748 784
749 if (wakeup) { 785 if (wakeup) {
750 if (atomic_dec_and_test(nr_running) && 786 if (atomic_dec_and_test(nr_running) &&
751 !list_empty(&gcwq->worklist)) 787 !list_empty(&pool->worklist))
752 wake_up_worker(gcwq); 788 wake_up_worker(pool);
753 } else 789 } else
754 atomic_dec(nr_running); 790 atomic_dec(nr_running);
755 } 791 }
@@ -769,7 +805,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
769 */ 805 */
770static inline void worker_clr_flags(struct worker *worker, unsigned int flags) 806static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
771{ 807{
772 struct global_cwq *gcwq = worker->gcwq; 808 struct worker_pool *pool = worker->pool;
773 unsigned int oflags = worker->flags; 809 unsigned int oflags = worker->flags;
774 810
775 WARN_ON_ONCE(worker->task != current); 811 WARN_ON_ONCE(worker->task != current);
@@ -783,7 +819,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
783 */ 819 */
784 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 820 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
785 if (!(worker->flags & WORKER_NOT_RUNNING)) 821 if (!(worker->flags & WORKER_NOT_RUNNING))
786 atomic_inc(get_gcwq_nr_running(gcwq->cpu)); 822 atomic_inc(get_pool_nr_running(pool));
787} 823}
788 824
789/** 825/**
@@ -867,43 +903,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
867} 903}
868 904
869/** 905/**
870 * gcwq_determine_ins_pos - find insertion position
871 * @gcwq: gcwq of interest
872 * @cwq: cwq a work is being queued for
873 *
874 * A work for @cwq is about to be queued on @gcwq, determine insertion
875 * position for the work. If @cwq is for HIGHPRI wq, the work is
876 * queued at the head of the queue but in FIFO order with respect to
877 * other HIGHPRI works; otherwise, at the end of the queue. This
878 * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
879 * there are HIGHPRI works pending.
880 *
881 * CONTEXT:
882 * spin_lock_irq(gcwq->lock).
883 *
884 * RETURNS:
885 * Pointer to inserstion position.
886 */
887static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
888 struct cpu_workqueue_struct *cwq)
889{
890 struct work_struct *twork;
891
892 if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
893 return &gcwq->worklist;
894
895 list_for_each_entry(twork, &gcwq->worklist, entry) {
896 struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
897
898 if (!(tcwq->wq->flags & WQ_HIGHPRI))
899 break;
900 }
901
902 gcwq->flags |= GCWQ_HIGHPRI_PENDING;
903 return &twork->entry;
904}
905
906/**
907 * insert_work - insert a work into gcwq 906 * insert_work - insert a work into gcwq
908 * @cwq: cwq @work belongs to 907 * @cwq: cwq @work belongs to
909 * @work: work to insert 908 * @work: work to insert
@@ -920,7 +919,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
920 struct work_struct *work, struct list_head *head, 919 struct work_struct *work, struct list_head *head,
921 unsigned int extra_flags) 920 unsigned int extra_flags)
922{ 921{
923 struct global_cwq *gcwq = cwq->gcwq; 922 struct worker_pool *pool = cwq->pool;
924 923
925 /* we own @work, set data and link */ 924 /* we own @work, set data and link */
926 set_work_cwq(work, cwq, extra_flags); 925 set_work_cwq(work, cwq, extra_flags);
@@ -940,8 +939,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
940 */ 939 */
941 smp_mb(); 940 smp_mb();
942 941
943 if (__need_more_worker(gcwq)) 942 if (__need_more_worker(pool))
944 wake_up_worker(gcwq); 943 wake_up_worker(pool);
945} 944}
946 945
947/* 946/*
@@ -1043,7 +1042,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1043 if (likely(cwq->nr_active < cwq->max_active)) { 1042 if (likely(cwq->nr_active < cwq->max_active)) {
1044 trace_workqueue_activate_work(work); 1043 trace_workqueue_activate_work(work);
1045 cwq->nr_active++; 1044 cwq->nr_active++;
1046 worklist = gcwq_determine_ins_pos(gcwq, cwq); 1045 worklist = &cwq->pool->worklist;
1047 } else { 1046 } else {
1048 work_flags |= WORK_STRUCT_DELAYED; 1047 work_flags |= WORK_STRUCT_DELAYED;
1049 worklist = &cwq->delayed_works; 1048 worklist = &cwq->delayed_works;
@@ -1192,7 +1191,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1192 */ 1191 */
1193static void worker_enter_idle(struct worker *worker) 1192static void worker_enter_idle(struct worker *worker)
1194{ 1193{
1195 struct global_cwq *gcwq = worker->gcwq; 1194 struct worker_pool *pool = worker->pool;
1195 struct global_cwq *gcwq = pool->gcwq;
1196 1196
1197 BUG_ON(worker->flags & WORKER_IDLE); 1197 BUG_ON(worker->flags & WORKER_IDLE);
1198 BUG_ON(!list_empty(&worker->entry) && 1198 BUG_ON(!list_empty(&worker->entry) &&
@@ -1200,27 +1200,24 @@ static void worker_enter_idle(struct worker *worker)
1200 1200
1201 /* can't use worker_set_flags(), also called from start_worker() */ 1201 /* can't use worker_set_flags(), also called from start_worker() */
1202 worker->flags |= WORKER_IDLE; 1202 worker->flags |= WORKER_IDLE;
1203 gcwq->nr_idle++; 1203 pool->nr_idle++;
1204 worker->last_active = jiffies; 1204 worker->last_active = jiffies;
1205 1205
1206 /* idle_list is LIFO */ 1206 /* idle_list is LIFO */
1207 list_add(&worker->entry, &gcwq->idle_list); 1207 list_add(&worker->entry, &pool->idle_list);
1208 1208
1209 if (likely(!(worker->flags & WORKER_ROGUE))) { 1209 if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
1210 if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) 1210 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1211 mod_timer(&gcwq->idle_timer,
1212 jiffies + IDLE_WORKER_TIMEOUT);
1213 } else
1214 wake_up_all(&gcwq->trustee_wait);
1215 1211
1216 /* 1212 /*
1217 * Sanity check nr_running. Because trustee releases gcwq->lock 1213 * Sanity check nr_running. Because gcwq_unbind_fn() releases
1218 * between setting %WORKER_ROGUE and zapping nr_running, the 1214 * gcwq->lock between setting %WORKER_UNBOUND and zapping
1219 * warning may trigger spuriously. Check iff trustee is idle. 1215 * nr_running, the warning may trigger spuriously. Check iff
1216 * unbind is not in progress.
1220 */ 1217 */
1221 WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && 1218 WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
1222 gcwq->nr_workers == gcwq->nr_idle && 1219 pool->nr_workers == pool->nr_idle &&
1223 atomic_read(get_gcwq_nr_running(gcwq->cpu))); 1220 atomic_read(get_pool_nr_running(pool)));
1224} 1221}
1225 1222
1226/** 1223/**
@@ -1234,11 +1231,11 @@ static void worker_enter_idle(struct worker *worker)
1234 */ 1231 */
1235static void worker_leave_idle(struct worker *worker) 1232static void worker_leave_idle(struct worker *worker)
1236{ 1233{
1237 struct global_cwq *gcwq = worker->gcwq; 1234 struct worker_pool *pool = worker->pool;
1238 1235
1239 BUG_ON(!(worker->flags & WORKER_IDLE)); 1236 BUG_ON(!(worker->flags & WORKER_IDLE));
1240 worker_clr_flags(worker, WORKER_IDLE); 1237 worker_clr_flags(worker, WORKER_IDLE);
1241 gcwq->nr_idle--; 1238 pool->nr_idle--;
1242 list_del_init(&worker->entry); 1239 list_del_init(&worker->entry);
1243} 1240}
1244 1241
@@ -1258,11 +1255,11 @@ static void worker_leave_idle(struct worker *worker)
1258 * verbatim as it's best effort and blocking and gcwq may be 1255 * verbatim as it's best effort and blocking and gcwq may be
1259 * [dis]associated in the meantime. 1256 * [dis]associated in the meantime.
1260 * 1257 *
1261 * This function tries set_cpus_allowed() and locks gcwq and verifies 1258 * This function tries set_cpus_allowed() and locks gcwq and verifies the
1262 * the binding against GCWQ_DISASSOCIATED which is set during 1259 * binding against %GCWQ_DISASSOCIATED which is set during
1263 * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters 1260 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1264 * idle state or fetches works without dropping lock, it can guarantee 1261 * enters idle state or fetches works without dropping lock, it can
1265 * the scheduling requirement described in the first paragraph. 1262 * guarantee the scheduling requirement described in the first paragraph.
1266 * 1263 *
1267 * CONTEXT: 1264 * CONTEXT:
1268 * Might sleep. Called without any lock but returns with gcwq->lock 1265 * Might sleep. Called without any lock but returns with gcwq->lock
@@ -1275,7 +1272,7 @@ static void worker_leave_idle(struct worker *worker)
1275static bool worker_maybe_bind_and_lock(struct worker *worker) 1272static bool worker_maybe_bind_and_lock(struct worker *worker)
1276__acquires(&gcwq->lock) 1273__acquires(&gcwq->lock)
1277{ 1274{
1278 struct global_cwq *gcwq = worker->gcwq; 1275 struct global_cwq *gcwq = worker->pool->gcwq;
1279 struct task_struct *task = worker->task; 1276 struct task_struct *task = worker->task;
1280 1277
1281 while (true) { 1278 while (true) {
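Note: the reworded comment above still describes the same two-step dance — ask the scheduler to migrate the kthread to gcwq->cpu, then take gcwq->lock and re-check %GCWQ_DISASSOCIATED so the result is trusted only if the CPU is still associated. The loop that follows is, in condensed form (sketch abbreviated from the function the comment documents):

	while (true) {
		/* may fail or be raced by hotplug; only a hint until verified below */
		if (!(gcwq->flags & GCWQ_DISASSOCIATED))
			set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));

		spin_lock_irq(&gcwq->lock);
		if (gcwq->flags & GCWQ_DISASSOCIATED)
			return false;			/* CPU went away: stay unbound */
		if (task_cpu(task) == gcwq->cpu &&
		    cpumask_equal(&current->cpus_allowed, get_cpu_mask(gcwq->cpu)))
			return true;			/* bound, verified under the lock */
		spin_unlock_irq(&gcwq->lock);

		cpu_relax();				/* raced with hotplug: retry */
		cond_resched();
	}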
@@ -1308,16 +1305,49 @@ __acquires(&gcwq->lock)
1308 } 1305 }
1309} 1306}
1310 1307
1308struct idle_rebind {
1309 int cnt; /* # workers to be rebound */
1310 struct completion done; /* all workers rebound */
1311};
1312
1311/* 1313/*
1312 * Function for worker->rebind_work used to rebind rogue busy workers 1314 * Rebind an idle @worker to its CPU. During CPU onlining, this has to
1313 * to the associated cpu which is coming back online. This is 1315 * happen synchronously for idle workers. worker_thread() will test
1314 * scheduled by cpu up but can race with other cpu hotplug operations 1316 * %WORKER_REBIND before leaving idle and call this function.
1315 * and may be executed twice without intervening cpu down.
1316 */ 1317 */
1317static void worker_rebind_fn(struct work_struct *work) 1318static void idle_worker_rebind(struct worker *worker)
1319{
1320 struct global_cwq *gcwq = worker->pool->gcwq;
1321
1322 /* CPU must be online at this point */
1323 WARN_ON(!worker_maybe_bind_and_lock(worker));
1324 if (!--worker->idle_rebind->cnt)
1325 complete(&worker->idle_rebind->done);
1326 spin_unlock_irq(&worker->pool->gcwq->lock);
1327
1328 /* we did our part, wait for rebind_workers() to finish up */
1329 wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
1330
1331 /*
1332 * rebind_workers() shouldn't finish until all workers passed the
1333 * above WORKER_REBIND wait. Tell it when done.
1334 */
1335 spin_lock_irq(&worker->pool->gcwq->lock);
1336 if (!--worker->idle_rebind->cnt)
1337 complete(&worker->idle_rebind->done);
1338 spin_unlock_irq(&worker->pool->gcwq->lock);
1339}
1340
1341/*
1342 * Function for @worker->rebind.work used to rebind unbound busy workers to
1343 * the associated cpu which is coming back online. This is scheduled by
1344 * cpu up but can race with other cpu hotplug operations and may be
1345 * executed twice without intervening cpu down.
1346 */
1347static void busy_worker_rebind_fn(struct work_struct *work)
1318{ 1348{
1319 struct worker *worker = container_of(work, struct worker, rebind_work); 1349 struct worker *worker = container_of(work, struct worker, rebind_work);
1320 struct global_cwq *gcwq = worker->gcwq; 1350 struct global_cwq *gcwq = worker->pool->gcwq;
1321 1351
1322 if (worker_maybe_bind_and_lock(worker)) 1352 if (worker_maybe_bind_and_lock(worker))
1323 worker_clr_flags(worker, WORKER_REBIND); 1353 worker_clr_flags(worker, WORKER_REBIND);
@@ -1325,6 +1355,133 @@ static void worker_rebind_fn(struct work_struct *work)
1325 spin_unlock_irq(&gcwq->lock); 1355 spin_unlock_irq(&gcwq->lock);
1326} 1356}
1327 1357
1358/**
1359 * rebind_workers - rebind all workers of a gcwq to the associated CPU
1360 * @gcwq: gcwq of interest
1361 *
1362 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding
1363 * is different for idle and busy ones.
1364 *
1365 * The idle ones should be rebound synchronously and idle rebinding should
1366 * be complete before any worker starts executing work items with
1367 * concurrency management enabled; otherwise, scheduler may oops trying to
1368 * wake up non-local idle worker from wq_worker_sleeping().
1369 *
1370 * This is achieved by repeatedly requesting rebinding until all idle
1371 * workers are known to have been rebound under @gcwq->lock and holding all
1372 * idle workers from becoming busy until idle rebinding is complete.
1373 *
1374 * Once idle workers are rebound, busy workers can be rebound as they
1375 * finish executing their current work items. Queueing the rebind work at
1376 * the head of their scheduled lists is enough. Note that nr_running will
 1377 * be properly bumped as busy workers rebind.
1378 *
1379 * On return, all workers are guaranteed to either be bound or have rebind
1380 * work item scheduled.
1381 */
1382static void rebind_workers(struct global_cwq *gcwq)
1383 __releases(&gcwq->lock) __acquires(&gcwq->lock)
1384{
1385 struct idle_rebind idle_rebind;
1386 struct worker_pool *pool;
1387 struct worker *worker;
1388 struct hlist_node *pos;
1389 int i;
1390
1391 lockdep_assert_held(&gcwq->lock);
1392
1393 for_each_worker_pool(pool, gcwq)
1394 lockdep_assert_held(&pool->manager_mutex);
1395
1396 /*
1397 * Rebind idle workers. Interlocked both ways. We wait for
1398 * workers to rebind via @idle_rebind.done. Workers will wait for
1399 * us to finish up by watching %WORKER_REBIND.
1400 */
1401 init_completion(&idle_rebind.done);
1402retry:
1403 idle_rebind.cnt = 1;
1404 INIT_COMPLETION(idle_rebind.done);
1405
1406 /* set REBIND and kick idle ones, we'll wait for these later */
1407 for_each_worker_pool(pool, gcwq) {
1408 list_for_each_entry(worker, &pool->idle_list, entry) {
1409 unsigned long worker_flags = worker->flags;
1410
1411 if (worker->flags & WORKER_REBIND)
1412 continue;
1413
1414 /* morph UNBOUND to REBIND atomically */
1415 worker_flags &= ~WORKER_UNBOUND;
1416 worker_flags |= WORKER_REBIND;
1417 ACCESS_ONCE(worker->flags) = worker_flags;
1418
1419 idle_rebind.cnt++;
1420 worker->idle_rebind = &idle_rebind;
1421
1422 /* worker_thread() will call idle_worker_rebind() */
1423 wake_up_process(worker->task);
1424 }
1425 }
1426
1427 if (--idle_rebind.cnt) {
1428 spin_unlock_irq(&gcwq->lock);
1429 wait_for_completion(&idle_rebind.done);
1430 spin_lock_irq(&gcwq->lock);
1431 /* busy ones might have become idle while waiting, retry */
1432 goto retry;
1433 }
1434
1435 /* all idle workers are rebound, rebind busy workers */
1436 for_each_busy_worker(worker, i, pos, gcwq) {
1437 struct work_struct *rebind_work = &worker->rebind_work;
1438 unsigned long worker_flags = worker->flags;
1439
1440 /* morph UNBOUND to REBIND atomically */
1441 worker_flags &= ~WORKER_UNBOUND;
1442 worker_flags |= WORKER_REBIND;
1443 ACCESS_ONCE(worker->flags) = worker_flags;
1444
1445 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1446 work_data_bits(rebind_work)))
1447 continue;
1448
1449 /* wq doesn't matter, use the default one */
1450 debug_work_activate(rebind_work);
1451 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
1452 worker->scheduled.next,
1453 work_color_to_flags(WORK_NO_COLOR));
1454 }
1455
1456 /*
1457 * All idle workers are rebound and waiting for %WORKER_REBIND to
1458 * be cleared inside idle_worker_rebind(). Clear and release.
1459 * Clearing %WORKER_REBIND from this foreign context is safe
1460 * because these workers are still guaranteed to be idle.
1461 *
1462 * We need to make sure all idle workers passed WORKER_REBIND wait
1463 * in idle_worker_rebind() before returning; otherwise, workers can
1464 * get stuck at the wait if hotplug cycle repeats.
1465 */
1466 idle_rebind.cnt = 1;
1467 INIT_COMPLETION(idle_rebind.done);
1468
1469 for_each_worker_pool(pool, gcwq) {
1470 list_for_each_entry(worker, &pool->idle_list, entry) {
1471 worker->flags &= ~WORKER_REBIND;
1472 idle_rebind.cnt++;
1473 }
1474 }
1475
1476 wake_up_all(&gcwq->rebind_hold);
1477
1478 if (--idle_rebind.cnt) {
1479 spin_unlock_irq(&gcwq->lock);
1480 wait_for_completion(&idle_rebind.done);
1481 spin_lock_irq(&gcwq->lock);
1482 }
1483}
1484
1328static struct worker *alloc_worker(void) 1485static struct worker *alloc_worker(void)
1329{ 1486{
1330 struct worker *worker; 1487 struct worker *worker;
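Note: stripped of most detail and of the busy-worker half, the idle-rebind interlock introduced above is a counted completion used in a retry loop — rebind_workers() holds one reference itself, hands one to every idle worker it kicks, and waits until the count drains; workers in turn park in idle_worker_rebind() until %WORKER_REBIND is cleared, and a second round of the same counting makes sure none of them is still parked when rebind_workers() returns. The core of the first handshake (sketch; the flag update and idle-list walk are abbreviated):

	struct idle_rebind idle_rebind;

	init_completion(&idle_rebind.done);
retry:
	idle_rebind.cnt = 1;			/* one reference held by us */
	INIT_COMPLETION(idle_rebind.done);

	for_each_worker_pool(pool, gcwq)
		list_for_each_entry(worker, &pool->idle_list, entry) {
			if (worker->flags & WORKER_REBIND)
				continue;	/* already kicked on a previous pass */
			worker->flags = (worker->flags & ~WORKER_UNBOUND) | WORKER_REBIND;
			worker->idle_rebind = &idle_rebind;
			idle_rebind.cnt++;
			wake_up_process(worker->task);	/* runs idle_worker_rebind() */
		}

	if (--idle_rebind.cnt) {		/* drop our reference; anyone left? */
		spin_unlock_irq(&gcwq->lock);
		wait_for_completion(&idle_rebind.done);
		spin_lock_irq(&gcwq->lock);
		goto retry;			/* busy workers may have gone idle */
	}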
@@ -1333,7 +1490,7 @@ static struct worker *alloc_worker(void)
1333 if (worker) { 1490 if (worker) {
1334 INIT_LIST_HEAD(&worker->entry); 1491 INIT_LIST_HEAD(&worker->entry);
1335 INIT_LIST_HEAD(&worker->scheduled); 1492 INIT_LIST_HEAD(&worker->scheduled);
1336 INIT_WORK(&worker->rebind_work, worker_rebind_fn); 1493 INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
1337 /* on creation a worker is in !idle && prep state */ 1494 /* on creation a worker is in !idle && prep state */
1338 worker->flags = WORKER_PREP; 1495 worker->flags = WORKER_PREP;
1339 } 1496 }
@@ -1342,10 +1499,9 @@ static struct worker *alloc_worker(void)
1342 1499
1343/** 1500/**
1344 * create_worker - create a new workqueue worker 1501 * create_worker - create a new workqueue worker
1345 * @gcwq: gcwq the new worker will belong to 1502 * @pool: pool the new worker will belong to
1346 * @bind: whether to set affinity to @cpu or not
1347 * 1503 *
1348 * Create a new worker which is bound to @gcwq. The returned worker 1504 * Create a new worker which is bound to @pool. The returned worker
1349 * can be started by calling start_worker() or destroyed using 1505 * can be started by calling start_worker() or destroyed using
1350 * destroy_worker(). 1506 * destroy_worker().
1351 * 1507 *
@@ -1355,16 +1511,17 @@ static struct worker *alloc_worker(void)
1355 * RETURNS: 1511 * RETURNS:
1356 * Pointer to the newly created worker. 1512 * Pointer to the newly created worker.
1357 */ 1513 */
1358static struct worker *create_worker(struct global_cwq *gcwq, bool bind) 1514static struct worker *create_worker(struct worker_pool *pool)
1359{ 1515{
1360 bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; 1516 struct global_cwq *gcwq = pool->gcwq;
1517 const char *pri = worker_pool_pri(pool) ? "H" : "";
1361 struct worker *worker = NULL; 1518 struct worker *worker = NULL;
1362 int id = -1; 1519 int id = -1;
1363 1520
1364 spin_lock_irq(&gcwq->lock); 1521 spin_lock_irq(&gcwq->lock);
1365 while (ida_get_new(&gcwq->worker_ida, &id)) { 1522 while (ida_get_new(&pool->worker_ida, &id)) {
1366 spin_unlock_irq(&gcwq->lock); 1523 spin_unlock_irq(&gcwq->lock);
1367 if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) 1524 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
1368 goto fail; 1525 goto fail;
1369 spin_lock_irq(&gcwq->lock); 1526 spin_lock_irq(&gcwq->lock);
1370 } 1527 }
@@ -1374,38 +1531,43 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
1374 if (!worker) 1531 if (!worker)
1375 goto fail; 1532 goto fail;
1376 1533
1377 worker->gcwq = gcwq; 1534 worker->pool = pool;
1378 worker->id = id; 1535 worker->id = id;
1379 1536
1380 if (!on_unbound_cpu) 1537 if (gcwq->cpu != WORK_CPU_UNBOUND)
1381 worker->task = kthread_create_on_node(worker_thread, 1538 worker->task = kthread_create_on_node(worker_thread,
1382 worker, 1539 worker, cpu_to_node(gcwq->cpu),
1383 cpu_to_node(gcwq->cpu), 1540 "kworker/%u:%d%s", gcwq->cpu, id, pri);
1384 "kworker/%u:%d", gcwq->cpu, id);
1385 else 1541 else
1386 worker->task = kthread_create(worker_thread, worker, 1542 worker->task = kthread_create(worker_thread, worker,
1387 "kworker/u:%d", id); 1543 "kworker/u:%d%s", id, pri);
1388 if (IS_ERR(worker->task)) 1544 if (IS_ERR(worker->task))
1389 goto fail; 1545 goto fail;
1390 1546
1547 if (worker_pool_pri(pool))
1548 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1549
1391 /* 1550 /*
1392 * A rogue worker will become a regular one if CPU comes 1551 * Determine CPU binding of the new worker depending on
1393 * online later on. Make sure every worker has 1552 * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the
1394 * PF_THREAD_BOUND set. 1553 * flag remains stable across this function. See the comments
1554 * above the flag definition for details.
1555 *
1556 * As an unbound worker may later become a regular one if CPU comes
1557 * online, make sure every worker has %PF_THREAD_BOUND set.
1395 */ 1558 */
1396 if (bind && !on_unbound_cpu) 1559 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) {
1397 kthread_bind(worker->task, gcwq->cpu); 1560 kthread_bind(worker->task, gcwq->cpu);
1398 else { 1561 } else {
1399 worker->task->flags |= PF_THREAD_BOUND; 1562 worker->task->flags |= PF_THREAD_BOUND;
1400 if (on_unbound_cpu) 1563 worker->flags |= WORKER_UNBOUND;
1401 worker->flags |= WORKER_UNBOUND;
1402 } 1564 }
1403 1565
1404 return worker; 1566 return worker;
1405fail: 1567fail:
1406 if (id >= 0) { 1568 if (id >= 0) {
1407 spin_lock_irq(&gcwq->lock); 1569 spin_lock_irq(&gcwq->lock);
1408 ida_remove(&gcwq->worker_ida, id); 1570 ida_remove(&pool->worker_ida, id);
1409 spin_unlock_irq(&gcwq->lock); 1571 spin_unlock_irq(&gcwq->lock);
1410 } 1572 }
1411 kfree(worker); 1573 kfree(worker);
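With the pool-based signature there is no longer a bind flag; binding is decided from %GCWQ_DISASSOCIATED inside create_worker(). A typical caller, mirroring the CPU_UP_PREPARE path later in this patch (the NOTIFY_BAD error path is illustrative):

	struct worker *worker;

	worker = create_worker(pool);		/* may sleep; creates the kthread */
	if (!worker)
		return NOTIFY_BAD;		/* illustrative error handling */

	spin_lock_irq(&pool->gcwq->lock);
	start_worker(worker);			/* marks STARTED, enters idle, wakes it */
	spin_unlock_irq(&pool->gcwq->lock);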
@@ -1424,7 +1586,7 @@ fail:
1424static void start_worker(struct worker *worker) 1586static void start_worker(struct worker *worker)
1425{ 1587{
1426 worker->flags |= WORKER_STARTED; 1588 worker->flags |= WORKER_STARTED;
1427 worker->gcwq->nr_workers++; 1589 worker->pool->nr_workers++;
1428 worker_enter_idle(worker); 1590 worker_enter_idle(worker);
1429 wake_up_process(worker->task); 1591 wake_up_process(worker->task);
1430} 1592}
@@ -1440,7 +1602,8 @@ static void start_worker(struct worker *worker)
1440 */ 1602 */
1441static void destroy_worker(struct worker *worker) 1603static void destroy_worker(struct worker *worker)
1442{ 1604{
1443 struct global_cwq *gcwq = worker->gcwq; 1605 struct worker_pool *pool = worker->pool;
1606 struct global_cwq *gcwq = pool->gcwq;
1444 int id = worker->id; 1607 int id = worker->id;
1445 1608
1446 /* sanity check frenzy */ 1609 /* sanity check frenzy */
@@ -1448,9 +1611,9 @@ static void destroy_worker(struct worker *worker)
1448 BUG_ON(!list_empty(&worker->scheduled)); 1611 BUG_ON(!list_empty(&worker->scheduled));
1449 1612
1450 if (worker->flags & WORKER_STARTED) 1613 if (worker->flags & WORKER_STARTED)
1451 gcwq->nr_workers--; 1614 pool->nr_workers--;
1452 if (worker->flags & WORKER_IDLE) 1615 if (worker->flags & WORKER_IDLE)
1453 gcwq->nr_idle--; 1616 pool->nr_idle--;
1454 1617
1455 list_del_init(&worker->entry); 1618 list_del_init(&worker->entry);
1456 worker->flags |= WORKER_DIE; 1619 worker->flags |= WORKER_DIE;
@@ -1461,29 +1624,30 @@ static void destroy_worker(struct worker *worker)
1461 kfree(worker); 1624 kfree(worker);
1462 1625
1463 spin_lock_irq(&gcwq->lock); 1626 spin_lock_irq(&gcwq->lock);
1464 ida_remove(&gcwq->worker_ida, id); 1627 ida_remove(&pool->worker_ida, id);
1465} 1628}
1466 1629
1467static void idle_worker_timeout(unsigned long __gcwq) 1630static void idle_worker_timeout(unsigned long __pool)
1468{ 1631{
1469 struct global_cwq *gcwq = (void *)__gcwq; 1632 struct worker_pool *pool = (void *)__pool;
1633 struct global_cwq *gcwq = pool->gcwq;
1470 1634
1471 spin_lock_irq(&gcwq->lock); 1635 spin_lock_irq(&gcwq->lock);
1472 1636
1473 if (too_many_workers(gcwq)) { 1637 if (too_many_workers(pool)) {
1474 struct worker *worker; 1638 struct worker *worker;
1475 unsigned long expires; 1639 unsigned long expires;
1476 1640
1477 /* idle_list is kept in LIFO order, check the last one */ 1641 /* idle_list is kept in LIFO order, check the last one */
1478 worker = list_entry(gcwq->idle_list.prev, struct worker, entry); 1642 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1479 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1643 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1480 1644
1481 if (time_before(jiffies, expires)) 1645 if (time_before(jiffies, expires))
1482 mod_timer(&gcwq->idle_timer, expires); 1646 mod_timer(&pool->idle_timer, expires);
1483 else { 1647 else {
1484 /* it's been idle for too long, wake up manager */ 1648 /* it's been idle for too long, wake up manager */
1485 gcwq->flags |= GCWQ_MANAGE_WORKERS; 1649 pool->flags |= POOL_MANAGE_WORKERS;
1486 wake_up_worker(gcwq); 1650 wake_up_worker(pool);
1487 } 1651 }
1488 } 1652 }
1489 1653
@@ -1500,7 +1664,7 @@ static bool send_mayday(struct work_struct *work)
1500 return false; 1664 return false;
1501 1665
1502 /* mayday mayday mayday */ 1666 /* mayday mayday mayday */
1503 cpu = cwq->gcwq->cpu; 1667 cpu = cwq->pool->gcwq->cpu;
1504 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1668 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1505 if (cpu == WORK_CPU_UNBOUND) 1669 if (cpu == WORK_CPU_UNBOUND)
1506 cpu = 0; 1670 cpu = 0;
@@ -1509,37 +1673,38 @@ static bool send_mayday(struct work_struct *work)
1509 return true; 1673 return true;
1510} 1674}
1511 1675
1512static void gcwq_mayday_timeout(unsigned long __gcwq) 1676static void gcwq_mayday_timeout(unsigned long __pool)
1513{ 1677{
1514 struct global_cwq *gcwq = (void *)__gcwq; 1678 struct worker_pool *pool = (void *)__pool;
1679 struct global_cwq *gcwq = pool->gcwq;
1515 struct work_struct *work; 1680 struct work_struct *work;
1516 1681
1517 spin_lock_irq(&gcwq->lock); 1682 spin_lock_irq(&gcwq->lock);
1518 1683
1519 if (need_to_create_worker(gcwq)) { 1684 if (need_to_create_worker(pool)) {
1520 /* 1685 /*
1521 * We've been trying to create a new worker but 1686 * We've been trying to create a new worker but
1522 * haven't been successful. We might be hitting an 1687 * haven't been successful. We might be hitting an
1523 * allocation deadlock. Send distress signals to 1688 * allocation deadlock. Send distress signals to
1524 * rescuers. 1689 * rescuers.
1525 */ 1690 */
1526 list_for_each_entry(work, &gcwq->worklist, entry) 1691 list_for_each_entry(work, &pool->worklist, entry)
1527 send_mayday(work); 1692 send_mayday(work);
1528 } 1693 }
1529 1694
1530 spin_unlock_irq(&gcwq->lock); 1695 spin_unlock_irq(&gcwq->lock);
1531 1696
1532 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); 1697 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1533} 1698}
1534 1699
1535/** 1700/**
1536 * maybe_create_worker - create a new worker if necessary 1701 * maybe_create_worker - create a new worker if necessary
1537 * @gcwq: gcwq to create a new worker for 1702 * @pool: pool to create a new worker for
1538 * 1703 *
1539 * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to 1704 * Create a new worker for @pool if necessary. @pool is guaranteed to
1540 * have at least one idle worker on return from this function. If 1705 * have at least one idle worker on return from this function. If
1541 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is 1706 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
1542 * sent to all rescuers with works scheduled on @gcwq to resolve 1707 * sent to all rescuers with works scheduled on @pool to resolve
1543 * possible allocation deadlock. 1708 * possible allocation deadlock.
1544 * 1709 *
1545 * On return, need_to_create_worker() is guaranteed to be false and 1710 * On return, need_to_create_worker() is guaranteed to be false and
@@ -1554,52 +1719,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq)
1554 * false if no action was taken and gcwq->lock stayed locked, true 1719 * false if no action was taken and gcwq->lock stayed locked, true
1555 * otherwise. 1720 * otherwise.
1556 */ 1721 */
1557static bool maybe_create_worker(struct global_cwq *gcwq) 1722static bool maybe_create_worker(struct worker_pool *pool)
1558__releases(&gcwq->lock) 1723__releases(&gcwq->lock)
1559__acquires(&gcwq->lock) 1724__acquires(&gcwq->lock)
1560{ 1725{
1561 if (!need_to_create_worker(gcwq)) 1726 struct global_cwq *gcwq = pool->gcwq;
1727
1728 if (!need_to_create_worker(pool))
1562 return false; 1729 return false;
1563restart: 1730restart:
1564 spin_unlock_irq(&gcwq->lock); 1731 spin_unlock_irq(&gcwq->lock);
1565 1732
1566 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 1733 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1567 mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1734 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
1568 1735
1569 while (true) { 1736 while (true) {
1570 struct worker *worker; 1737 struct worker *worker;
1571 1738
1572 worker = create_worker(gcwq, true); 1739 worker = create_worker(pool);
1573 if (worker) { 1740 if (worker) {
1574 del_timer_sync(&gcwq->mayday_timer); 1741 del_timer_sync(&pool->mayday_timer);
1575 spin_lock_irq(&gcwq->lock); 1742 spin_lock_irq(&gcwq->lock);
1576 start_worker(worker); 1743 start_worker(worker);
1577 BUG_ON(need_to_create_worker(gcwq)); 1744 BUG_ON(need_to_create_worker(pool));
1578 return true; 1745 return true;
1579 } 1746 }
1580 1747
1581 if (!need_to_create_worker(gcwq)) 1748 if (!need_to_create_worker(pool))
1582 break; 1749 break;
1583 1750
1584 __set_current_state(TASK_INTERRUPTIBLE); 1751 __set_current_state(TASK_INTERRUPTIBLE);
1585 schedule_timeout(CREATE_COOLDOWN); 1752 schedule_timeout(CREATE_COOLDOWN);
1586 1753
1587 if (!need_to_create_worker(gcwq)) 1754 if (!need_to_create_worker(pool))
1588 break; 1755 break;
1589 } 1756 }
1590 1757
1591 del_timer_sync(&gcwq->mayday_timer); 1758 del_timer_sync(&pool->mayday_timer);
1592 spin_lock_irq(&gcwq->lock); 1759 spin_lock_irq(&gcwq->lock);
1593 if (need_to_create_worker(gcwq)) 1760 if (need_to_create_worker(pool))
1594 goto restart; 1761 goto restart;
1595 return true; 1762 return true;
1596} 1763}
1597 1764
1598/** 1765/**
1599 * maybe_destroy_worker - destroy workers which have been idle for a while 1766 * maybe_destroy_worker - destroy workers which have been idle for a while
1600 * @gcwq: gcwq to destroy workers for 1767 * @pool: pool to destroy workers for
1601 * 1768 *
1602 * Destroy @gcwq workers which have been idle for longer than 1769 * Destroy @pool workers which have been idle for longer than
1603 * IDLE_WORKER_TIMEOUT. 1770 * IDLE_WORKER_TIMEOUT.
1604 * 1771 *
1605 * LOCKING: 1772 * LOCKING:
@@ -1610,19 +1777,19 @@ restart:
1610 * false if no action was taken and gcwq->lock stayed locked, true 1777 * false if no action was taken and gcwq->lock stayed locked, true
1611 * otherwise. 1778 * otherwise.
1612 */ 1779 */
1613static bool maybe_destroy_workers(struct global_cwq *gcwq) 1780static bool maybe_destroy_workers(struct worker_pool *pool)
1614{ 1781{
1615 bool ret = false; 1782 bool ret = false;
1616 1783
1617 while (too_many_workers(gcwq)) { 1784 while (too_many_workers(pool)) {
1618 struct worker *worker; 1785 struct worker *worker;
1619 unsigned long expires; 1786 unsigned long expires;
1620 1787
1621 worker = list_entry(gcwq->idle_list.prev, struct worker, entry); 1788 worker = list_entry(pool->idle_list.prev, struct worker, entry);
1622 expires = worker->last_active + IDLE_WORKER_TIMEOUT; 1789 expires = worker->last_active + IDLE_WORKER_TIMEOUT;
1623 1790
1624 if (time_before(jiffies, expires)) { 1791 if (time_before(jiffies, expires)) {
1625 mod_timer(&gcwq->idle_timer, expires); 1792 mod_timer(&pool->idle_timer, expires);
1626 break; 1793 break;
1627 } 1794 }
1628 1795
@@ -1655,31 +1822,59 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq)
1655 */ 1822 */
1656static bool manage_workers(struct worker *worker) 1823static bool manage_workers(struct worker *worker)
1657{ 1824{
1658 struct global_cwq *gcwq = worker->gcwq; 1825 struct worker_pool *pool = worker->pool;
1659 bool ret = false; 1826 bool ret = false;
1660 1827
1661 if (gcwq->flags & GCWQ_MANAGING_WORKERS) 1828 if (pool->flags & POOL_MANAGING_WORKERS)
1662 return ret; 1829 return ret;
1663 1830
1664 gcwq->flags &= ~GCWQ_MANAGE_WORKERS; 1831 pool->flags |= POOL_MANAGING_WORKERS;
1665 gcwq->flags |= GCWQ_MANAGING_WORKERS;
1666 1832
1667 /* 1833 /*
1668 * Destroy and then create so that may_start_working() is true 1834 * To simplify both worker management and CPU hotplug, hold off
1669 * on return. 1835 * management while hotplug is in progress. CPU hotplug path can't
1836 * grab %POOL_MANAGING_WORKERS to achieve this because that can
1837 * lead to idle worker depletion (all become busy thinking someone
1838 * else is managing) which in turn can result in deadlock under
1839 * extreme circumstances. Use @pool->manager_mutex to synchronize
1840 * manager against CPU hotplug.
1841 *
1842 * manager_mutex would always be free unless CPU hotplug is in
1843 * progress. trylock first without dropping @gcwq->lock.
1670 */ 1844 */
1671 ret |= maybe_destroy_workers(gcwq); 1845 if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
1672 ret |= maybe_create_worker(gcwq); 1846 spin_unlock_irq(&pool->gcwq->lock);
1847 mutex_lock(&pool->manager_mutex);
1848 /*
1849 * CPU hotplug could have happened while we were waiting
1850 * for manager_mutex. Hotplug itself can't handle us
 1851 * because the manager is on neither the idle nor the busy list, and
1852 * @gcwq's state and ours could have deviated.
1853 *
1854 * As hotplug is now excluded via manager_mutex, we can
1855 * simply try to bind. It will succeed or fail depending
1856 * on @gcwq's current state. Try it and adjust
1857 * %WORKER_UNBOUND accordingly.
1858 */
1859 if (worker_maybe_bind_and_lock(worker))
1860 worker->flags &= ~WORKER_UNBOUND;
1861 else
1862 worker->flags |= WORKER_UNBOUND;
1673 1863
1674 gcwq->flags &= ~GCWQ_MANAGING_WORKERS; 1864 ret = true;
1865 }
1866
1867 pool->flags &= ~POOL_MANAGE_WORKERS;
1675 1868
1676 /* 1869 /*
1677 * The trustee might be waiting to take over the manager 1870 * Destroy and then create so that may_start_working() is true
1678 * position, tell it we're done. 1871 * on return.
1679 */ 1872 */
1680 if (unlikely(gcwq->trustee)) 1873 ret |= maybe_destroy_workers(pool);
1681 wake_up_all(&gcwq->trustee_wait); 1874 ret |= maybe_create_worker(pool);
1682 1875
1876 pool->flags &= ~POOL_MANAGING_WORKERS;
1877 mutex_unlock(&pool->manager_mutex);
1683 return ret; 1878 return ret;
1684} 1879}
1685 1880
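The trylock-then-relock dance above is the standard way to take a sleeping mutex while a spinlock is held: never block on the mutex under the spinlock; if the trylock fails, drop the spinlock, sleep, then re-validate. In sketch form, using the names from this hunk:

	if (!mutex_trylock(&pool->manager_mutex)) {
		/* can't sleep while holding gcwq->lock -- drop it first */
		spin_unlock_irq(&pool->gcwq->lock);
		mutex_lock(&pool->manager_mutex);	/* may sleep */
		/*
		 * State may have changed while sleeping (e.g. CPU hotplug);
		 * manage_workers() re-takes gcwq->lock via
		 * worker_maybe_bind_and_lock() and adjusts %WORKER_UNBOUND
		 * before continuing.
		 */
	}
	/* ... manage workers with gcwq->lock held ... */
	mutex_unlock(&pool->manager_mutex);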
@@ -1728,10 +1923,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
1728{ 1923{
1729 struct work_struct *work = list_first_entry(&cwq->delayed_works, 1924 struct work_struct *work = list_first_entry(&cwq->delayed_works,
1730 struct work_struct, entry); 1925 struct work_struct, entry);
1731 struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
1732 1926
1733 trace_workqueue_activate_work(work); 1927 trace_workqueue_activate_work(work);
1734 move_linked_works(work, pos, NULL); 1928 move_linked_works(work, &cwq->pool->worklist, NULL);
1735 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 1929 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
1736 cwq->nr_active++; 1930 cwq->nr_active++;
1737} 1931}
@@ -1804,7 +1998,8 @@ __releases(&gcwq->lock)
1804__acquires(&gcwq->lock) 1998__acquires(&gcwq->lock)
1805{ 1999{
1806 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 2000 struct cpu_workqueue_struct *cwq = get_work_cwq(work);
1807 struct global_cwq *gcwq = cwq->gcwq; 2001 struct worker_pool *pool = worker->pool;
2002 struct global_cwq *gcwq = pool->gcwq;
1808 struct hlist_head *bwh = busy_worker_head(gcwq, work); 2003 struct hlist_head *bwh = busy_worker_head(gcwq, work);
1809 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; 2004 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
1810 work_func_t f = work->func; 2005 work_func_t f = work->func;
@@ -1823,6 +2018,15 @@ __acquires(&gcwq->lock)
1823 lockdep_copy_map(&lockdep_map, &work->lockdep_map); 2018 lockdep_copy_map(&lockdep_map, &work->lockdep_map);
1824#endif 2019#endif
1825 /* 2020 /*
2021 * Ensure we're on the correct CPU. DISASSOCIATED test is
2022 * necessary to avoid spurious warnings from rescuers servicing the
2023 * unbound or a disassociated gcwq.
2024 */
2025 WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) &&
2026 !(gcwq->flags & GCWQ_DISASSOCIATED) &&
2027 raw_smp_processor_id() != gcwq->cpu);
2028
2029 /*
1826 * A single work shouldn't be executed concurrently by 2030 * A single work shouldn't be executed concurrently by
1827 * multiple workers on a single cpu. Check whether anyone is 2031 * multiple workers on a single cpu. Check whether anyone is
1828 * already processing the work. If so, defer the work to the 2032 * already processing the work. If so, defer the work to the
@@ -1846,27 +2050,19 @@ __acquires(&gcwq->lock)
1846 list_del_init(&work->entry); 2050 list_del_init(&work->entry);
1847 2051
1848 /* 2052 /*
1849 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
1850 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
1851 */
1852 if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
1853 struct work_struct *nwork = list_first_entry(&gcwq->worklist,
1854 struct work_struct, entry);
1855
1856 if (!list_empty(&gcwq->worklist) &&
1857 get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
1858 wake_up_worker(gcwq);
1859 else
1860 gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
1861 }
1862
1863 /*
1864 * CPU intensive works don't participate in concurrency 2053 * CPU intensive works don't participate in concurrency
1865 * management. They're the scheduler's responsibility. 2054 * management. They're the scheduler's responsibility.
1866 */ 2055 */
1867 if (unlikely(cpu_intensive)) 2056 if (unlikely(cpu_intensive))
1868 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 2057 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
1869 2058
2059 /*
2060 * Unbound gcwq isn't concurrency managed and work items should be
2061 * executed ASAP. Wake up another worker if necessary.
2062 */
2063 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
2064 wake_up_worker(pool);
2065
1870 spin_unlock_irq(&gcwq->lock); 2066 spin_unlock_irq(&gcwq->lock);
1871 2067
1872 work_clear_pending(work); 2068 work_clear_pending(work);
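Since unbound gcwqs skip concurrency management, anything queued on a WQ_UNBOUND workqueue is dispatched as soon as a worker can take it, which is why the hunk above wakes another worker whenever an unbound worker picks up an item and more work is pending. A self-contained, hypothetical user of that behaviour:

#include <linux/workqueue.h>
#include <linux/printk.h>
#include <linux/smp.h>

static void demo_fn(struct work_struct *work)
{
	pr_info("demo work ran on CPU %d\n", raw_smp_processor_id());
}

static DECLARE_WORK(demo_work, demo_fn);

static int demo_run(void)
{
	/* unbound: not tied to the submitting CPU, executed ASAP */
	struct workqueue_struct *wq = alloc_workqueue("demo", WQ_UNBOUND, 0);

	if (!wq)
		return -ENOMEM;
	queue_work(wq, &demo_work);
	flush_workqueue(wq);	/* wait for demo_fn() to finish */
	destroy_workqueue(wq);
	return 0;
}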
@@ -1939,28 +2135,38 @@ static void process_scheduled_works(struct worker *worker)
1939static int worker_thread(void *__worker) 2135static int worker_thread(void *__worker)
1940{ 2136{
1941 struct worker *worker = __worker; 2137 struct worker *worker = __worker;
1942 struct global_cwq *gcwq = worker->gcwq; 2138 struct worker_pool *pool = worker->pool;
2139 struct global_cwq *gcwq = pool->gcwq;
1943 2140
1944 /* tell the scheduler that this is a workqueue worker */ 2141 /* tell the scheduler that this is a workqueue worker */
1945 worker->task->flags |= PF_WQ_WORKER; 2142 worker->task->flags |= PF_WQ_WORKER;
1946woke_up: 2143woke_up:
1947 spin_lock_irq(&gcwq->lock); 2144 spin_lock_irq(&gcwq->lock);
1948 2145
1949 /* DIE can be set only while we're idle, checking here is enough */ 2146 /*
1950 if (worker->flags & WORKER_DIE) { 2147 * DIE can be set only while idle and REBIND set while busy has
2148 * @worker->rebind_work scheduled. Checking here is enough.
2149 */
2150 if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
1951 spin_unlock_irq(&gcwq->lock); 2151 spin_unlock_irq(&gcwq->lock);
1952 worker->task->flags &= ~PF_WQ_WORKER; 2152
1953 return 0; 2153 if (worker->flags & WORKER_DIE) {
2154 worker->task->flags &= ~PF_WQ_WORKER;
2155 return 0;
2156 }
2157
2158 idle_worker_rebind(worker);
2159 goto woke_up;
1954 } 2160 }
1955 2161
1956 worker_leave_idle(worker); 2162 worker_leave_idle(worker);
1957recheck: 2163recheck:
1958 /* no more worker necessary? */ 2164 /* no more worker necessary? */
1959 if (!need_more_worker(gcwq)) 2165 if (!need_more_worker(pool))
1960 goto sleep; 2166 goto sleep;
1961 2167
1962 /* do we need to manage? */ 2168 /* do we need to manage? */
1963 if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) 2169 if (unlikely(!may_start_working(pool)) && manage_workers(worker))
1964 goto recheck; 2170 goto recheck;
1965 2171
1966 /* 2172 /*
@@ -1979,7 +2185,7 @@ recheck:
1979 2185
1980 do { 2186 do {
1981 struct work_struct *work = 2187 struct work_struct *work =
1982 list_first_entry(&gcwq->worklist, 2188 list_first_entry(&pool->worklist,
1983 struct work_struct, entry); 2189 struct work_struct, entry);
1984 2190
1985 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { 2191 if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
@@ -1991,11 +2197,11 @@ recheck:
1991 move_linked_works(work, &worker->scheduled, NULL); 2197 move_linked_works(work, &worker->scheduled, NULL);
1992 process_scheduled_works(worker); 2198 process_scheduled_works(worker);
1993 } 2199 }
1994 } while (keep_working(gcwq)); 2200 } while (keep_working(pool));
1995 2201
1996 worker_set_flags(worker, WORKER_PREP, false); 2202 worker_set_flags(worker, WORKER_PREP, false);
1997sleep: 2203sleep:
1998 if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) 2204 if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
1999 goto recheck; 2205 goto recheck;
2000 2206
2001 /* 2207 /*
@@ -2053,14 +2259,15 @@ repeat:
2053 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2259 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2054 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2260 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2055 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); 2261 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
2056 struct global_cwq *gcwq = cwq->gcwq; 2262 struct worker_pool *pool = cwq->pool;
2263 struct global_cwq *gcwq = pool->gcwq;
2057 struct work_struct *work, *n; 2264 struct work_struct *work, *n;
2058 2265
2059 __set_current_state(TASK_RUNNING); 2266 __set_current_state(TASK_RUNNING);
2060 mayday_clear_cpu(cpu, wq->mayday_mask); 2267 mayday_clear_cpu(cpu, wq->mayday_mask);
2061 2268
2062 /* migrate to the target cpu if possible */ 2269 /* migrate to the target cpu if possible */
2063 rescuer->gcwq = gcwq; 2270 rescuer->pool = pool;
2064 worker_maybe_bind_and_lock(rescuer); 2271 worker_maybe_bind_and_lock(rescuer);
2065 2272
2066 /* 2273 /*
@@ -2068,7 +2275,7 @@ repeat:
2068 * process'em. 2275 * process'em.
2069 */ 2276 */
2070 BUG_ON(!list_empty(&rescuer->scheduled)); 2277 BUG_ON(!list_empty(&rescuer->scheduled));
2071 list_for_each_entry_safe(work, n, &gcwq->worklist, entry) 2278 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2072 if (get_work_cwq(work) == cwq) 2279 if (get_work_cwq(work) == cwq)
2073 move_linked_works(work, scheduled, &n); 2280 move_linked_works(work, scheduled, &n);
2074 2281
@@ -2079,8 +2286,8 @@ repeat:
2079 * regular worker; otherwise, we end up with 0 concurrency 2286 * regular worker; otherwise, we end up with 0 concurrency
2080 * and stalling the execution. 2287 * and stalling the execution.
2081 */ 2288 */
2082 if (keep_working(gcwq)) 2289 if (keep_working(pool))
2083 wake_up_worker(gcwq); 2290 wake_up_worker(pool);
2084 2291
2085 spin_unlock_irq(&gcwq->lock); 2292 spin_unlock_irq(&gcwq->lock);
2086 } 2293 }
@@ -2205,7 +2412,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
2205 2412
2206 for_each_cwq_cpu(cpu, wq) { 2413 for_each_cwq_cpu(cpu, wq) {
2207 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2208 struct global_cwq *gcwq = cwq->gcwq; 2415 struct global_cwq *gcwq = cwq->pool->gcwq;
2209 2416
2210 spin_lock_irq(&gcwq->lock); 2417 spin_lock_irq(&gcwq->lock);
2211 2418
@@ -2421,9 +2628,9 @@ reflush:
2421 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2628 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2422 bool drained; 2629 bool drained;
2423 2630
2424 spin_lock_irq(&cwq->gcwq->lock); 2631 spin_lock_irq(&cwq->pool->gcwq->lock);
2425 drained = !cwq->nr_active && list_empty(&cwq->delayed_works); 2632 drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2426 spin_unlock_irq(&cwq->gcwq->lock); 2633 spin_unlock_irq(&cwq->pool->gcwq->lock);
2427 2634
2428 if (drained) 2635 if (drained)
2429 continue; 2636 continue;
@@ -2463,7 +2670,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
2463 */ 2670 */
2464 smp_rmb(); 2671 smp_rmb();
2465 cwq = get_work_cwq(work); 2672 cwq = get_work_cwq(work);
2466 if (unlikely(!cwq || gcwq != cwq->gcwq)) 2673 if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2467 goto already_gone; 2674 goto already_gone;
2468 } else if (wait_executing) { 2675 } else if (wait_executing) {
2469 worker = find_worker_executing_work(gcwq, work); 2676 worker = find_worker_executing_work(gcwq, work);
@@ -2984,13 +3191,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
2984 if (flags & WQ_MEM_RECLAIM) 3191 if (flags & WQ_MEM_RECLAIM)
2985 flags |= WQ_RESCUER; 3192 flags |= WQ_RESCUER;
2986 3193
2987 /*
2988 * Unbound workqueues aren't concurrency managed and should be
2989 * dispatched to workers immediately.
2990 */
2991 if (flags & WQ_UNBOUND)
2992 flags |= WQ_HIGHPRI;
2993
2994 max_active = max_active ?: WQ_DFL_ACTIVE; 3194 max_active = max_active ?: WQ_DFL_ACTIVE;
2995 max_active = wq_clamp_max_active(max_active, flags, wq->name); 3195 max_active = wq_clamp_max_active(max_active, flags, wq->name);
2996 3196
@@ -3011,9 +3211,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3011 for_each_cwq_cpu(cpu, wq) { 3211 for_each_cwq_cpu(cpu, wq) {
3012 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3212 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
3013 struct global_cwq *gcwq = get_gcwq(cpu); 3213 struct global_cwq *gcwq = get_gcwq(cpu);
3214 int pool_idx = (bool)(flags & WQ_HIGHPRI);
3014 3215
3015 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); 3216 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
3016 cwq->gcwq = gcwq; 3217 cwq->pool = &gcwq->pools[pool_idx];
3017 cwq->wq = wq; 3218 cwq->wq = wq;
3018 cwq->flush_color = -1; 3219 cwq->flush_color = -1;
3019 cwq->max_active = max_active; 3220 cwq->max_active = max_active;
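After this change %WQ_HIGHPRI no longer reorders a shared worklist; it simply routes the workqueue's cwqs to the second pool of each gcwq, whose workers carry an "H" name suffix and an elevated nice level. Callers are unaffected; a hypothetical high-priority workqueue is still created the usual way:

	/* backed by the gcwq's highpri worker pool after this patch */
	struct workqueue_struct *hi_wq;

	hi_wq = alloc_workqueue("demo_hi", WQ_HIGHPRI, 1);
	if (!hi_wq)
		return -ENOMEM;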
@@ -3225,369 +3426,143 @@ EXPORT_SYMBOL_GPL(work_busy);
3225 * gcwqs serve mix of short, long and very long running works making 3426 * gcwqs serve mix of short, long and very long running works making
3226 * blocked draining impractical. 3427 * blocked draining impractical.
3227 * 3428 *
3228 * This is solved by allowing a gcwq to be detached from CPU, running 3429 * This is solved by allowing a gcwq to be disassociated from the CPU
3229 * it with unbound (rogue) workers and allowing it to be reattached 3430 * running as an unbound one and allowing it to be reattached later if the
3230 * later if the cpu comes back online. A separate thread is created 3431 * cpu comes back online.
3231 * to govern a gcwq in such state and is called the trustee of the
3232 * gcwq.
3233 *
3234 * Trustee states and their descriptions.
3235 *
3236 * START Command state used on startup. On CPU_DOWN_PREPARE, a
3237 * new trustee is started with this state.
3238 *
3239 * IN_CHARGE Once started, trustee will enter this state after
3240 * assuming the manager role and making all existing
3241 * workers rogue. DOWN_PREPARE waits for trustee to
3242 * enter this state. After reaching IN_CHARGE, trustee
3243 * tries to execute the pending worklist until it's empty
3244 * and the state is set to BUTCHER, or the state is set
3245 * to RELEASE.
3246 *
3247 * BUTCHER Command state which is set by the cpu callback after
 3248 * the cpu has gone down. Once this state is set trustee
3249 * knows that there will be no new works on the worklist
3250 * and once the worklist is empty it can proceed to
3251 * killing idle workers.
3252 *
3253 * RELEASE Command state which is set by the cpu callback if the
3254 * cpu down has been canceled or it has come online
3255 * again. After recognizing this state, trustee stops
3256 * trying to drain or butcher and clears ROGUE, rebinds
3257 * all remaining workers back to the cpu and releases
3258 * manager role.
3259 *
3260 * DONE Trustee will enter this state after BUTCHER or RELEASE
3261 * is complete.
3262 *
3263 * trustee CPU draining
3264 * took over down complete
3265 * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
3266 * | | ^
3267 * | CPU is back online v return workers |
3268 * ----------------> RELEASE --------------
3269 */ 3432 */
3270 3433
3271/** 3434/* claim manager positions of all pools */
3272 * trustee_wait_event_timeout - timed event wait for trustee 3435static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
3273 * @cond: condition to wait for
3274 * @timeout: timeout in jiffies
3275 *
3276 * wait_event_timeout() for trustee to use. Handles locking and
3277 * checks for RELEASE request.
3278 *
3279 * CONTEXT:
3280 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3281 * multiple times. To be used by trustee.
3282 *
3283 * RETURNS:
3284 * Positive indicating left time if @cond is satisfied, 0 if timed
3285 * out, -1 if canceled.
3286 */
3287#define trustee_wait_event_timeout(cond, timeout) ({ \
3288 long __ret = (timeout); \
3289 while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
3290 __ret) { \
3291 spin_unlock_irq(&gcwq->lock); \
3292 __wait_event_timeout(gcwq->trustee_wait, (cond) || \
3293 (gcwq->trustee_state == TRUSTEE_RELEASE), \
3294 __ret); \
3295 spin_lock_irq(&gcwq->lock); \
3296 } \
3297 gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \
3298})
3299
3300/**
3301 * trustee_wait_event - event wait for trustee
3302 * @cond: condition to wait for
3303 *
3304 * wait_event() for trustee to use. Automatically handles locking and
3305 * checks for CANCEL request.
3306 *
3307 * CONTEXT:
3308 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3309 * multiple times. To be used by trustee.
3310 *
3311 * RETURNS:
3312 * 0 if @cond is satisfied, -1 if canceled.
3313 */
3314#define trustee_wait_event(cond) ({ \
3315 long __ret1; \
3316 __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
3317 __ret1 < 0 ? -1 : 0; \
3318})
3319
3320static int __cpuinit trustee_thread(void *__gcwq)
3321{ 3436{
3322 struct global_cwq *gcwq = __gcwq; 3437 struct worker_pool *pool;
3323 struct worker *worker;
3324 struct work_struct *work;
3325 struct hlist_node *pos;
3326 long rc;
3327 int i;
3328
3329 BUG_ON(gcwq->cpu != smp_processor_id());
3330 3438
3439 for_each_worker_pool(pool, gcwq)
3440 mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
3331 spin_lock_irq(&gcwq->lock); 3441 spin_lock_irq(&gcwq->lock);
3332 /* 3442}
3333 * Claim the manager position and make all workers rogue.
3334 * Trustee must be bound to the target cpu and can't be
3335 * cancelled.
3336 */
3337 BUG_ON(gcwq->cpu != smp_processor_id());
3338 rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
3339 BUG_ON(rc < 0);
3340
3341 gcwq->flags |= GCWQ_MANAGING_WORKERS;
3342
3343 list_for_each_entry(worker, &gcwq->idle_list, entry)
3344 worker->flags |= WORKER_ROGUE;
3345 3443
3346 for_each_busy_worker(worker, i, pos, gcwq) 3444/* release manager positions */
3347 worker->flags |= WORKER_ROGUE; 3445static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
3446{
3447 struct worker_pool *pool;
3348 3448
3349 /*
3350 * Call schedule() so that we cross rq->lock and thus can
3351 * guarantee sched callbacks see the rogue flag. This is
3352 * necessary as scheduler callbacks may be invoked from other
3353 * cpus.
3354 */
3355 spin_unlock_irq(&gcwq->lock); 3449 spin_unlock_irq(&gcwq->lock);
3356 schedule(); 3450 for_each_worker_pool(pool, gcwq)
3357 spin_lock_irq(&gcwq->lock); 3451 mutex_unlock(&pool->manager_mutex);
3452}
3358 3453
3359 /* 3454static void gcwq_unbind_fn(struct work_struct *work)
3360 * Sched callbacks are disabled now. Zap nr_running. After 3455{
3361 * this, nr_running stays zero and need_more_worker() and 3456 struct global_cwq *gcwq = get_gcwq(smp_processor_id());
3362 * keep_working() are always true as long as the worklist is 3457 struct worker_pool *pool;
3363 * not empty. 3458 struct worker *worker;
3364 */ 3459 struct hlist_node *pos;
3365 atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); 3460 int i;
3366 3461
3367 spin_unlock_irq(&gcwq->lock); 3462 BUG_ON(gcwq->cpu != smp_processor_id());
3368 del_timer_sync(&gcwq->idle_timer);
3369 spin_lock_irq(&gcwq->lock);
3370 3463
3371 /* 3464 gcwq_claim_management_and_lock(gcwq);
3372 * We're now in charge. Notify and proceed to drain. We need
3373 * to keep the gcwq running during the whole CPU down
3374 * procedure as other cpu hotunplug callbacks may need to
3375 * flush currently running tasks.
3376 */
3377 gcwq->trustee_state = TRUSTEE_IN_CHARGE;
3378 wake_up_all(&gcwq->trustee_wait);
3379 3465
3380 /* 3466 /*
3381 * The original cpu is in the process of dying and may go away 3467 * We've claimed all manager positions. Make all workers unbound
3382 * anytime now. When that happens, we and all workers would 3468 * and set DISASSOCIATED. Before this, all workers except for the
3383 * be migrated to other cpus. Try draining any left work. We 3469 * ones which are still executing works from before the last CPU
3384 * want to get it over with ASAP - spam rescuers, wake up as 3470 * down must be on the cpu. After this, they may become diasporas.
3385 * many idlers as necessary and create new ones till the
3386 * worklist is empty. Note that if the gcwq is frozen, there
3387 * may be frozen works in freezable cwqs. Don't declare
3388 * completion while frozen.
3389 */ 3471 */
3390 while (gcwq->nr_workers != gcwq->nr_idle || 3472 for_each_worker_pool(pool, gcwq)
3391 gcwq->flags & GCWQ_FREEZING || 3473 list_for_each_entry(worker, &pool->idle_list, entry)
3392 gcwq->trustee_state == TRUSTEE_IN_CHARGE) { 3474 worker->flags |= WORKER_UNBOUND;
3393 int nr_works = 0;
3394
3395 list_for_each_entry(work, &gcwq->worklist, entry) {
3396 send_mayday(work);
3397 nr_works++;
3398 }
3399 3475
3400 list_for_each_entry(worker, &gcwq->idle_list, entry) { 3476 for_each_busy_worker(worker, i, pos, gcwq)
3401 if (!nr_works--) 3477 worker->flags |= WORKER_UNBOUND;
3402 break;
3403 wake_up_process(worker->task);
3404 }
3405 3478
3406 if (need_to_create_worker(gcwq)) { 3479 gcwq->flags |= GCWQ_DISASSOCIATED;
3407 spin_unlock_irq(&gcwq->lock);
3408 worker = create_worker(gcwq, false);
3409 spin_lock_irq(&gcwq->lock);
3410 if (worker) {
3411 worker->flags |= WORKER_ROGUE;
3412 start_worker(worker);
3413 }
3414 }
3415 3480
3416 /* give a breather */ 3481 gcwq_release_management_and_unlock(gcwq);
3417 if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
3418 break;
3419 }
3420 3482
3421 /* 3483 /*
3422 * Either all works have been scheduled and cpu is down, or 3484 * Call schedule() so that we cross rq->lock and thus can guarantee
3423 * cpu down has already been canceled. Wait for and butcher 3485 * sched callbacks see the %WORKER_UNBOUND flag. This is necessary
3424 * all workers till we're canceled. 3486 * as scheduler callbacks may be invoked from other cpus.
3425 */ 3487 */
3426 do { 3488 schedule();
3427 rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
3428 while (!list_empty(&gcwq->idle_list))
3429 destroy_worker(list_first_entry(&gcwq->idle_list,
3430 struct worker, entry));
3431 } while (gcwq->nr_workers && rc >= 0);
3432 3489
3433 /* 3490 /*
3434 * At this point, either draining has completed and no worker 3491 * Sched callbacks are disabled now. Zap nr_running. After this,
3435 * is left, or cpu down has been canceled or the cpu is being 3492 * nr_running stays zero and need_more_worker() and keep_working()
3436 * brought back up. There shouldn't be any idle one left. 3493 * are always true as long as the worklist is not empty. @gcwq now
3437 * Tell the remaining busy ones to rebind once it finishes the 3494 * behaves as unbound (in terms of concurrency management) gcwq
3438 * currently scheduled works by scheduling the rebind_work. 3495 * which is served by workers tied to the CPU.
3496 *
3497 * On return from this function, the current worker would trigger
3498 * unbound chain execution of pending work items if other workers
3499 * didn't already.
3439 */ 3500 */
3440 WARN_ON(!list_empty(&gcwq->idle_list)); 3501 for_each_worker_pool(pool, gcwq)
3441 3502 atomic_set(get_pool_nr_running(pool), 0);
3442 for_each_busy_worker(worker, i, pos, gcwq) {
3443 struct work_struct *rebind_work = &worker->rebind_work;
3444
3445 /*
3446 * Rebind_work may race with future cpu hotplug
3447 * operations. Use a separate flag to mark that
3448 * rebinding is scheduled.
3449 */
3450 worker->flags |= WORKER_REBIND;
3451 worker->flags &= ~WORKER_ROGUE;
3452
3453 /* queue rebind_work, wq doesn't matter, use the default one */
3454 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
3455 work_data_bits(rebind_work)))
3456 continue;
3457
3458 debug_work_activate(rebind_work);
3459 insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
3460 worker->scheduled.next,
3461 work_color_to_flags(WORK_NO_COLOR));
3462 }
3463
3464 /* relinquish manager role */
3465 gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
3466
3467 /* notify completion */
3468 gcwq->trustee = NULL;
3469 gcwq->trustee_state = TRUSTEE_DONE;
3470 wake_up_all(&gcwq->trustee_wait);
3471 spin_unlock_irq(&gcwq->lock);
3472 return 0;
3473} 3503}
3474 3504
3475/** 3505/*
3476 * wait_trustee_state - wait for trustee to enter the specified state 3506 * Workqueues should be brought up before normal priority CPU notifiers.
3477 * @gcwq: gcwq the trustee of interest belongs to 3507 * This will be registered high priority CPU notifier.
3478 * @state: target state to wait for
3479 *
3480 * Wait for the trustee to reach @state. DONE is already matched.
3481 *
3482 * CONTEXT:
3483 * spin_lock_irq(gcwq->lock) which may be released and regrabbed
3484 * multiple times. To be used by cpu_callback.
3485 */ 3508 */
3486static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) 3509static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3487__releases(&gcwq->lock) 3510 unsigned long action,
3488__acquires(&gcwq->lock) 3511 void *hcpu)
3489{
3490 if (!(gcwq->trustee_state == state ||
3491 gcwq->trustee_state == TRUSTEE_DONE)) {
3492 spin_unlock_irq(&gcwq->lock);
3493 __wait_event(gcwq->trustee_wait,
3494 gcwq->trustee_state == state ||
3495 gcwq->trustee_state == TRUSTEE_DONE);
3496 spin_lock_irq(&gcwq->lock);
3497 }
3498}
3499
3500static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
3501 unsigned long action,
3502 void *hcpu)
3503{ 3512{
3504 unsigned int cpu = (unsigned long)hcpu; 3513 unsigned int cpu = (unsigned long)hcpu;
3505 struct global_cwq *gcwq = get_gcwq(cpu); 3514 struct global_cwq *gcwq = get_gcwq(cpu);
3506 struct task_struct *new_trustee = NULL; 3515 struct worker_pool *pool;
3507 struct worker *uninitialized_var(new_worker);
3508 unsigned long flags;
3509
3510 action &= ~CPU_TASKS_FROZEN;
3511 3516
3512 switch (action) { 3517 switch (action & ~CPU_TASKS_FROZEN) {
3513 case CPU_DOWN_PREPARE:
3514 new_trustee = kthread_create(trustee_thread, gcwq,
3515 "workqueue_trustee/%d\n", cpu);
3516 if (IS_ERR(new_trustee))
3517 return notifier_from_errno(PTR_ERR(new_trustee));
3518 kthread_bind(new_trustee, cpu);
3519 /* fall through */
3520 case CPU_UP_PREPARE: 3518 case CPU_UP_PREPARE:
3521 BUG_ON(gcwq->first_idle); 3519 for_each_worker_pool(pool, gcwq) {
3522 new_worker = create_worker(gcwq, false); 3520 struct worker *worker;
3523 if (!new_worker) {
3524 if (new_trustee)
3525 kthread_stop(new_trustee);
3526 return NOTIFY_BAD;
3527 }
3528 }
3529
3530 /* some are called w/ irq disabled, don't disturb irq status */
3531 spin_lock_irqsave(&gcwq->lock, flags);
3532 3521
3533 switch (action) { 3522 if (pool->nr_workers)
3534 case CPU_DOWN_PREPARE: 3523 continue;
3535 /* initialize trustee and tell it to acquire the gcwq */
3536 BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
3537 gcwq->trustee = new_trustee;
3538 gcwq->trustee_state = TRUSTEE_START;
3539 wake_up_process(gcwq->trustee);
3540 wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
3541 /* fall through */
3542 case CPU_UP_PREPARE:
3543 BUG_ON(gcwq->first_idle);
3544 gcwq->first_idle = new_worker;
3545 break;
3546 3524
3547 case CPU_DYING: 3525 worker = create_worker(pool);
3548 /* 3526 if (!worker)
3549 * Before this, the trustee and all workers except for 3527 return NOTIFY_BAD;
3550 * the ones which are still executing works from
3551 * before the last CPU down must be on the cpu. After
3552 * this, they'll all be diasporas.
3553 */
3554 gcwq->flags |= GCWQ_DISASSOCIATED;
3555 break;
3556 3528
3557 case CPU_POST_DEAD: 3529 spin_lock_irq(&gcwq->lock);
3558 gcwq->trustee_state = TRUSTEE_BUTCHER; 3530 start_worker(worker);
3559 /* fall through */ 3531 spin_unlock_irq(&gcwq->lock);
3560 case CPU_UP_CANCELED: 3532 }
3561 destroy_worker(gcwq->first_idle);
3562 gcwq->first_idle = NULL;
3563 break; 3533 break;
3564 3534
3565 case CPU_DOWN_FAILED: 3535 case CPU_DOWN_FAILED:
3566 case CPU_ONLINE: 3536 case CPU_ONLINE:
3537 gcwq_claim_management_and_lock(gcwq);
3567 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3538 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3568 if (gcwq->trustee_state != TRUSTEE_DONE) { 3539 rebind_workers(gcwq);
3569 gcwq->trustee_state = TRUSTEE_RELEASE; 3540 gcwq_release_management_and_unlock(gcwq);
3570 wake_up_process(gcwq->trustee);
3571 wait_trustee_state(gcwq, TRUSTEE_DONE);
3572 }
3573
3574 /*
3575 * Trustee is done and there might be no worker left.
3576 * Put the first_idle in and request a real manager to
3577 * take a look.
3578 */
3579 spin_unlock_irq(&gcwq->lock);
3580 kthread_bind(gcwq->first_idle->task, cpu);
3581 spin_lock_irq(&gcwq->lock);
3582 gcwq->flags |= GCWQ_MANAGE_WORKERS;
3583 start_worker(gcwq->first_idle);
3584 gcwq->first_idle = NULL;
3585 break; 3541 break;
3586 } 3542 }
3543 return NOTIFY_OK;
3544}
3587 3545
3588 spin_unlock_irqrestore(&gcwq->lock, flags); 3546/*
3547 * Workqueues should be brought down after normal priority CPU notifiers.
3548 * This will be registered as low priority CPU notifier.
3549 */
3550static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3551 unsigned long action,
3552 void *hcpu)
3553{
3554 unsigned int cpu = (unsigned long)hcpu;
3555 struct work_struct unbind_work;
3589 3556
3590 return notifier_from_errno(0); 3557 switch (action & ~CPU_TASKS_FROZEN) {
3558 case CPU_DOWN_PREPARE:
3559 /* unbinding should happen on the local CPU */
3560 INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
3561 schedule_work_on(cpu, &unbind_work);
3562 flush_work(&unbind_work);
3563 break;
3564 }
3565 return NOTIFY_OK;
3591} 3566}
3592 3567
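The DOWN_PREPARE branch reuses the workqueue itself to run gcwq_unbind_fn() on the outgoing CPU: an on-stack work item is queued with schedule_work_on() and then flushed. The same pattern in isolation, with a hypothetical callback name:

	struct work_struct w;

	INIT_WORK_ONSTACK(&w, my_on_cpu_fn);	/* my_on_cpu_fn is hypothetical */
	schedule_work_on(cpu, &w);		/* runs on @cpu's normal pool */
	flush_work(&w);				/* wait until it has executed */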
3593#ifdef CONFIG_SMP 3568#ifdef CONFIG_SMP
@@ -3746,6 +3721,7 @@ void thaw_workqueues(void)
3746 3721
3747 for_each_gcwq_cpu(cpu) { 3722 for_each_gcwq_cpu(cpu) {
3748 struct global_cwq *gcwq = get_gcwq(cpu); 3723 struct global_cwq *gcwq = get_gcwq(cpu);
3724 struct worker_pool *pool;
3749 struct workqueue_struct *wq; 3725 struct workqueue_struct *wq;
3750 3726
3751 spin_lock_irq(&gcwq->lock); 3727 spin_lock_irq(&gcwq->lock);
@@ -3767,7 +3743,8 @@ void thaw_workqueues(void)
3767 cwq_activate_first_delayed(cwq); 3743 cwq_activate_first_delayed(cwq);
3768 } 3744 }
3769 3745
3770 wake_up_worker(gcwq); 3746 for_each_worker_pool(pool, gcwq)
3747 wake_up_worker(pool);
3771 3748
3772 spin_unlock_irq(&gcwq->lock); 3749 spin_unlock_irq(&gcwq->lock);
3773 } 3750 }
@@ -3783,46 +3760,57 @@ static int __init init_workqueues(void)
3783 unsigned int cpu; 3760 unsigned int cpu;
3784 int i; 3761 int i;
3785 3762
3786 cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); 3763 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3764 cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3787 3765
3788 /* initialize gcwqs */ 3766 /* initialize gcwqs */
3789 for_each_gcwq_cpu(cpu) { 3767 for_each_gcwq_cpu(cpu) {
3790 struct global_cwq *gcwq = get_gcwq(cpu); 3768 struct global_cwq *gcwq = get_gcwq(cpu);
3769 struct worker_pool *pool;
3791 3770
3792 spin_lock_init(&gcwq->lock); 3771 spin_lock_init(&gcwq->lock);
3793 INIT_LIST_HEAD(&gcwq->worklist);
3794 gcwq->cpu = cpu; 3772 gcwq->cpu = cpu;
3795 gcwq->flags |= GCWQ_DISASSOCIATED; 3773 gcwq->flags |= GCWQ_DISASSOCIATED;
3796 3774
3797 INIT_LIST_HEAD(&gcwq->idle_list);
3798 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) 3775 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3799 INIT_HLIST_HEAD(&gcwq->busy_hash[i]); 3776 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3800 3777
3801 init_timer_deferrable(&gcwq->idle_timer); 3778 for_each_worker_pool(pool, gcwq) {
3802 gcwq->idle_timer.function = idle_worker_timeout; 3779 pool->gcwq = gcwq;
3803 gcwq->idle_timer.data = (unsigned long)gcwq; 3780 INIT_LIST_HEAD(&pool->worklist);
3781 INIT_LIST_HEAD(&pool->idle_list);
3804 3782
3805 setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, 3783 init_timer_deferrable(&pool->idle_timer);
3806 (unsigned long)gcwq); 3784 pool->idle_timer.function = idle_worker_timeout;
3785 pool->idle_timer.data = (unsigned long)pool;
3807 3786
3808 ida_init(&gcwq->worker_ida); 3787 setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
3788 (unsigned long)pool);
3809 3789
3810 gcwq->trustee_state = TRUSTEE_DONE; 3790 mutex_init(&pool->manager_mutex);
3811 init_waitqueue_head(&gcwq->trustee_wait); 3791 ida_init(&pool->worker_ida);
3792 }
3793
3794 init_waitqueue_head(&gcwq->rebind_hold);
3812 } 3795 }
3813 3796
3814 /* create the initial worker */ 3797 /* create the initial worker */
3815 for_each_online_gcwq_cpu(cpu) { 3798 for_each_online_gcwq_cpu(cpu) {
3816 struct global_cwq *gcwq = get_gcwq(cpu); 3799 struct global_cwq *gcwq = get_gcwq(cpu);
3817 struct worker *worker; 3800 struct worker_pool *pool;
3818 3801
3819 if (cpu != WORK_CPU_UNBOUND) 3802 if (cpu != WORK_CPU_UNBOUND)
3820 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3803 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3821 worker = create_worker(gcwq, true); 3804
3822 BUG_ON(!worker); 3805 for_each_worker_pool(pool, gcwq) {
3823 spin_lock_irq(&gcwq->lock); 3806 struct worker *worker;
3824 start_worker(worker); 3807
3825 spin_unlock_irq(&gcwq->lock); 3808 worker = create_worker(pool);
3809 BUG_ON(!worker);
3810 spin_lock_irq(&gcwq->lock);
3811 start_worker(worker);
3812 spin_unlock_irq(&gcwq->lock);
3813 }
3826 } 3814 }
3827 3815
3828 system_wq = alloc_workqueue("events", 0, 0); 3816 system_wq = alloc_workqueue("events", 0, 0);