Merge tag 'perf-core-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core

Pull perf/core improvements and fixes from Arnaldo Carvalho de Melo:

 * Fix include order for bison/flex-generated C files, from Ben Hutchings.
 * Build fixes and documentation corrections, from David Ahern.
 * Group parsing support, from Jiri Olsa.
 * UI/gtk refactorings and improvements, from Namhyung Kim.
 * NULL deref fix for perf script, from Namhyung Kim.
 * Assorted cleanups, from Robert Richter.
 * Let O= make invocations handle relative paths, from Steven Rostedt.
 * perf script python fixes, from Feng Tang.
 * Improve 'perf lock' error message when the needed tracepoints are not present, from David Ahern.
 * Initial bash completion support, from Frederic Weisbecker.
 * Allow building without libelf, from Namhyung Kim.
 * Support DWARF CFI based unwind to have callchains when %bp based unwinding is not possible, from Jiri Olsa.
 * Symbol resolution fixes: while fixing support for PPC64 files with an .opt ELF section was the end goal, several fixes for code that handles all architectures, as well as cleanups, are included, from Cody Schafer.
 * Add a description for the JIT interface, from Andi Kleen.
 * Assorted fixes for Documentation and for the build on 32-bit, from Robert Richter.
 * Add support for non-tracepoint events in perf script python, from Feng Tang.
 * Cache the libtraceevent event_format associated to each evsel early, so that we avoid relookups, i.e. calling pevent_find_event repeatedly when processing tracepoint events.

   [ This is to reduce the surface contact with libtraceevent and make clear what it is that the perf tools need from that lib: so far, parsing the common and per-event fields. ]

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
author: Ingo Molnar <mingo@kernel.org> 2012-08-21 05:27:00 -0400
committer: Ingo Molnar <mingo@kernel.org> 2012-08-21 05:27:00 -0400
commit: bcada3d4b8c96b8792c2306f363992ca5ab9da42 (patch)
tree: e420679a5db6ea4e1694eef57f9abb6acac8d4d3 /kernel
parent: 26198c21d1b286a084fe5d514a30bc7e6c712a34 (diff)
parent: 000078bc3ee69efb1124b8478c7527389a826074 (diff)
55 files changed, 2399 insertions, 1516 deletions
diff --git a/kernel/async.c b/kernel/async.c
index bd0c168a3bbe..9d3118384858 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -62,8 +62,10 @@ static async_cookie_t next_cookie = 1;
 #define MAX_WORK        32768
 static LIST_HEAD(async_pending);
-static LIST_HEAD(async_running);
+static ASYNC_DOMAIN(async_running);
+static LIST_HEAD(async_domains);
 static DEFINE_SPINLOCK(async_lock);
+static DEFINE_MUTEX(async_register_mutex);
 struct async_entry {
        struct list_head        list;
@@ -71,7 +73,7 @@ struct async_entry {
        async_cookie_t          cookie;
        async_func_ptr          *func;
        void                    *data;
-        struct list_head        *running;
+        struct async_domain     *running;
 };
 static DECLARE_WAIT_QUEUE_HEAD(async_done);
@@ -82,13 +84,12 @@ static atomic_t entry_count;
 /*
 * MUST be called with the lock held!
 */
-static async_cookie_t  __lowest_in_progress(struct list_head *running)
+static async_cookie_t  __lowest_in_progress(struct async_domain *running)
 {
        struct async_entry *entry;
-        if (!list_empty(running)) {
+        if (!list_empty(&running->domain)) {
-                entry = list_first_entry(running,
+                entry = list_first_entry(&running->domain, typeof(*entry), list);
-                        struct async_entry, list);
                return entry->cookie;
        }
@@ -99,7 +100,7 @@ static async_cookie_t  __lowest_in_progress(struct list_head *running)
        return next_cookie;     /* "infinity" value */
 }
-static async_cookie_t  lowest_in_progress(struct list_head *running)
+static async_cookie_t  lowest_in_progress(struct async_domain *running)
 {
        unsigned long flags;
        async_cookie_t ret;
@@ -119,10 +120,11 @@ static void async_run_entry_fn(struct work_struct *work)
                container_of(work, struct async_entry, work);
        unsigned long flags;
        ktime_t uninitialized_var(calltime), delta, rettime;
+        struct async_domain *running = entry->running;
        /* 1) move self to the running queue */
        spin_lock_irqsave(&async_lock, flags);
-        list_move_tail(&entry->list, entry->running);
+        list_move_tail(&entry->list, &running->domain);
        spin_unlock_irqrestore(&async_lock, flags);
        /* 2) run (and print duration) */
@@ -145,6 +147,8 @@ static void async_run_entry_fn(struct work_struct *work)
        /* 3) remove self from the running queue */
        spin_lock_irqsave(&async_lock, flags);
        list_del(&entry->list);
+        if (running->registered && --running->count == 0)
+                list_del_init(&running->node);
        /* 4) free the entry */
        kfree(entry);
@@ -156,7 +160,7 @@ static void async_run_entry_fn(struct work_struct *work)
        wake_up(&async_done);
 }
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running)
 {
        struct async_entry *entry;
        unsigned long flags;
@@ -187,6 +191,8 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
        spin_lock_irqsave(&async_lock, flags);
        newcookie = entry->cookie = next_cookie++;
        list_add_tail(&entry->list, &async_pending);
+        if (running->registered && running->count++ == 0)
+                list_add_tail(&running->node, &async_domains);
        atomic_inc(&entry_count);
        spin_unlock_irqrestore(&async_lock, flags);
@@ -223,7 +229,7 @@ EXPORT_SYMBOL_GPL(async_schedule);
 * Note: This function may be called from atomic or non-atomic contexts.
 */
 async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
-                                     struct list_head *running)
+                                     struct async_domain *running)
 {
        return __async_schedule(ptr, data, running);
 }
@@ -236,22 +242,52 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
 */
 void async_synchronize_full(void)
 {
+        mutex_lock(&async_register_mutex);
        do {
-                async_synchronize_cookie(next_cookie);
+                struct async_domain *domain = NULL;
-        } while (!list_empty(&async_running) || !list_empty(&async_pending));
+                spin_lock_irq(&async_lock);
+                if (!list_empty(&async_domains))
+                        domain = list_first_entry(&async_domains, typeof(*domain), node);
+                spin_unlock_irq(&async_lock);
+                async_synchronize_cookie_domain(next_cookie, domain);
+        } while (!list_empty(&async_domains));
+        mutex_unlock(&async_register_mutex);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full);
 /**
+ * async_unregister_domain - ensure no more anonymous waiters on this domain
+ * @domain: idle domain to flush out of any async_synchronize_full instances
+ *
+ * async_synchronize_{cookie|full}_domain() are not flushed since callers
+ * of these routines should know the lifetime of @domain
+ *
+ * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
+ */
+void async_unregister_domain(struct async_domain *domain)
+{
+        mutex_lock(&async_register_mutex);
+        spin_lock_irq(&async_lock);
+        WARN_ON(!domain->registered || !list_empty(&domain->node) ||
+                !list_empty(&domain->domain));
+        domain->registered = 0;
+        spin_unlock_irq(&async_lock);
+        mutex_unlock(&async_register_mutex);
+}
+EXPORT_SYMBOL_GPL(async_unregister_domain);
+/**
 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @list: running list to synchronize on
+ * @domain: running list to synchronize on
 *
 * This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list have been done.
+ * synchronization domain specified by the running list @domain have been done.
 */
-void async_synchronize_full_domain(struct list_head *list)
+void async_synchronize_full_domain(struct async_domain *domain)
 {
-        async_synchronize_cookie_domain(next_cookie, list);
+        async_synchronize_cookie_domain(next_cookie, domain);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
@@ -261,14 +297,16 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
 * @running: running list to synchronize on
 *
 * This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list submitted
+ * synchronization domain specified by running list @running submitted
 * prior to @cookie have been done.
 */
-void async_synchronize_cookie_domain(async_cookie_t cookie,
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running)
-                                     struct list_head *running)
 {
        ktime_t uninitialized_var(starttime), delta, endtime;
+        if (!running)
+                return;
        if (initcall_debug && system_state == SYSTEM_BOOTING) {
                printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
                starttime = ktime_get();
diff --git a/kernel/audit.c b/kernel/audit.c
index 1c7f2c61416b..ea3b7b6191c7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -384,7 +384,7 @@ static void audit_hold_skb(struct sk_buff *skb)
 static void audit_printk_skb(struct sk_buff *skb)
 {
        struct nlmsghdr *nlh = nlmsg_hdr(skb);
-        char *data = NLMSG_DATA(nlh);
+        char *data = nlmsg_data(nlh);
        if (nlh->nlmsg_type != AUDIT_EOE) {
                if (printk_ratelimit())
@@ -516,14 +516,15 @@ struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
        if (!skb)
                return NULL;
-        nlh     = NLMSG_NEW(skb, pid, seq, t, size, flags);
+        nlh     = nlmsg_put(skb, pid, seq, t, size, flags);
-        data    = NLMSG_DATA(nlh);
+        if (!nlh)
+                goto out_kfree_skb;
+        data = nlmsg_data(nlh);
        memcpy(data, payload, size);
        return skb;
-nlmsg_failure:                  /* Used by NLMSG_NEW */
+out_kfree_skb:
-        if (skb)
+        kfree_skb(skb);
-                kfree_skb(skb);
        return NULL;
 }
@@ -680,7 +681,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
        sessionid = audit_get_sessionid(current);
        security_task_getsecid(current, &sid);
        seq  = nlh->nlmsg_seq;
-        data = NLMSG_DATA(nlh);
+        data = nlmsg_data(nlh);
        switch (msg_type) {
        case AUDIT_GET:
@@ -961,14 +962,17 @@ static void audit_receive(struct sk_buff  *skb)
 static int __init audit_init(void)
 {
        int i;
+        struct netlink_kernel_cfg cfg = {
+                .input  = audit_receive,
+        };
        if (audit_initialized == AUDIT_DISABLED)
                return 0;
        printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
               audit_default ? "enabled" : "disabled");
-        audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
+        audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT,
-                                           audit_receive, NULL, THIS_MODULE);
+                                           THIS_MODULE, &cfg);
        if (!audit_sock)
                audit_panic("cannot initialize netlink socket");
        else
@@ -1060,13 +1064,15 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
        ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
        if (!ab->skb)
-                goto nlmsg_failure;
+                goto err;
-        nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
+        nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0);
+        if (!nlh)
+                goto out_kfree_skb;
        return ab;
-nlmsg_failure:                  /* Used by NLMSG_NEW */
+out_kfree_skb:
        kfree_skb(ab->skb);
        ab->skb = NULL;
 err:
@@ -1450,6 +1456,27 @@ void audit_log_key(struct audit_buffer *ab, char *key)
 }
 /**
+ * audit_log_link_denied - report a link restriction denial
+ * @operation: specific link operation
+ * @link: the path that triggered the restriction
+ */
+void audit_log_link_denied(const char *operation, struct path *link)
+{
+        struct audit_buffer *ab;
+        ab = audit_log_start(current->audit_context, GFP_KERNEL,
+                             AUDIT_ANOM_LINK);
+        audit_log_format(ab, "op=%s action=denied", operation);
+        audit_log_format(ab, " pid=%d comm=", current->pid);
+        audit_log_untrustedstring(ab, current->comm);
+        audit_log_d_path(ab, " path=", link);
+        audit_log_format(ab, " dev=");
+        audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id);
+        audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino);
+        audit_log_end(ab);
+}
+/**
 * audit_log_end - end one audit record
 * @ab: the audit_buffer
 *
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 5bf0790497e7..3a5ca582ba1e 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -595,7 +595,7 @@ void audit_trim_trees(void)
                root_mnt = collect_mounts(&path);
                path_put(&path);
-                if (!root_mnt)
+                if (IS_ERR(root_mnt))
                        goto skip_it;
                spin_lock(&hash_lock);
@@ -669,8 +669,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
                goto Err;
        mnt = collect_mounts(&path);
        path_put(&path);
-        if (!mnt) {
+        if (IS_ERR(mnt)) {
-                err = -ENOMEM;
+                err = PTR_ERR(mnt);
                goto Err;
        }
@@ -719,8 +719,8 @@ int audit_tag_tree(char *old, char *new)
                return err;
        tagged = collect_mounts(&path2);
        path_put(&path2);
-        if (!tagged)
+        if (IS_ERR(tagged))
-                return -ENOMEM;
+                return PTR_ERR(tagged);
        err = kern_path(old, 0, &path1);
        if (err) {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e683869365d9..3823281401b5 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -355,34 +355,15 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 /* Get path information necessary for adding watches. */
 static int audit_get_nd(struct audit_watch *watch, struct path *parent)
 {
-        struct nameidata nd;
+        struct dentry *d = kern_path_locked(watch->path, parent);
-        struct dentry *d;
+        if (IS_ERR(d))
-        int err;
-        err = kern_path_parent(watch->path, &nd);
-        if (err)
-                return err;
-        if (nd.last_type != LAST_NORM) {
-                path_put(&nd.path);
-                return -EINVAL;
-        }
-        mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
-        d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
-        if (IS_ERR(d)) {
-                mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-                path_put(&nd.path);
                return PTR_ERR(d);
-        }
+        mutex_unlock(&parent->dentry->d_inode->i_mutex);
        if (d->d_inode) {
                /* update watch filter fields */
                watch->dev = d->d_inode->i_sb->s_dev;
                watch->ino = d->d_inode->i_ino;
        }
-        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
-        *parent = nd.path;
        dput(d);
        return 0;
 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b303dfc7dce0..79818507e444 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -822,7 +822,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
 */
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
-static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
 static const struct inode_operations cgroup_dir_inode_operations;
@@ -954,7 +954,7 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
                dget(d);
                d_delete(d);
-                simple_unlink(d->d_inode, d);
+                simple_unlink(cgrp->dentry->d_inode, d);
                list_del_init(&cfe->node);
                dput(d);
@@ -1068,28 +1068,24 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                        BUG_ON(cgrp->subsys[i]);
                        BUG_ON(!dummytop->subsys[i]);
                        BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
-                        mutex_lock(&ss->hierarchy_mutex);
                        cgrp->subsys[i] = dummytop->subsys[i];
                        cgrp->subsys[i]->cgroup = cgrp;
                        list_move(&ss->sibling, &root->subsys_list);
                        ss->root = root;
                        if (ss->bind)
                                ss->bind(cgrp);
-                        mutex_unlock(&ss->hierarchy_mutex);
                        /* refcount was already taken, and we're keeping it */
                } else if (bit & removed_bits) {
                        /* We're removing this subsystem */
                        BUG_ON(ss == NULL);
                        BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
                        BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
-                        mutex_lock(&ss->hierarchy_mutex);
                        if (ss->bind)
                                ss->bind(dummytop);
                        dummytop->subsys[i]->cgroup = dummytop;
                        cgrp->subsys[i] = NULL;
                        subsys[i]->root = &rootnode;
                        list_move(&ss->sibling, &rootnode.subsys_list);
-                        mutex_unlock(&ss->hierarchy_mutex);
                        /* subsystem is now free - drop reference on module */
                        module_put(ss->module);
                } else if (bit & final_bits) {
@@ -1587,7 +1583,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
        opts.new_root = new_root;
        /* Locate an existing or new sb for this hierarchy */
-        sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
+        sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
        if (IS_ERR(sb)) {
                ret = PTR_ERR(sb);
                cgroup_drop_root(opts.new_root);
@@ -2570,7 +2566,7 @@ static const struct inode_operations cgroup_dir_inode_operations = {
        .rename = cgroup_rename,
 };
-static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 {
        if (dentry->d_name.len > NAME_MAX)
                return ERR_PTR(-ENAMETOOLONG);
@@ -3915,37 +3911,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
                set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
 }
-static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
-{
-        /* We need to take each hierarchy_mutex in a consistent order */
-        int i;
-        /*
-         * No worry about a race with rebind_subsystems that might mess up the
-         * locking order, since both parties are under cgroup_mutex.
-         */
-        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-                struct cgroup_subsys *ss = subsys[i];
-                if (ss == NULL)
-                        continue;
-                if (ss->root == root)
-                        mutex_lock(&ss->hierarchy_mutex);
-        }
-}
-static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
-{
-        int i;
-        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-                struct cgroup_subsys *ss = subsys[i];
-                if (ss == NULL)
-                        continue;
-                if (ss->root == root)
-                        mutex_unlock(&ss->hierarchy_mutex);
-        }
-}
 /*
 * cgroup_create - create a cgroup
 * @parent: cgroup that will be parent of the new cgroup
@@ -4006,9 +3971,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                        ss->post_clone(cgrp);
        }
-        cgroup_lock_hierarchy(root);
        list_add(&cgrp->sibling, &cgrp->parent->children);
-        cgroup_unlock_hierarchy(root);
        root->number_of_cgroups++;
        err = cgroup_create_dir(cgrp, dentry, mode);
@@ -4035,9 +3998,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 err_remove:
-        cgroup_lock_hierarchy(root);
        list_del(&cgrp->sibling);
-        cgroup_unlock_hierarchy(root);
        root->number_of_cgroups--;
 err_destroy:
@@ -4245,10 +4206,8 @@ again:
                list_del_init(&cgrp->release_list);
        raw_spin_unlock(&release_list_lock);
-        cgroup_lock_hierarchy(cgrp->root);
        /* delete this cgroup from parent->children */
        list_del_init(&cgrp->sibling);
-        cgroup_unlock_hierarchy(cgrp->root);
        list_del_init(&cgrp->allcg_node);
@@ -4322,8 +4281,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
         * need to invoke fork callbacks here. */
        BUG_ON(!list_empty(&init_task.tasks));
-        mutex_init(&ss->hierarchy_mutex);
-        lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
        ss->active = 1;
        /* this function shouldn't be used with modular subsystems, since they
@@ -4450,8 +4407,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
        }
        write_unlock(&css_set_lock);
-        mutex_init(&ss->hierarchy_mutex);
-        lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
        ss->active = 1;
        /* success! */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a4eb5227a19e..14d32588cccd 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu)
        if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
                mutex_lock(&zonelists_mutex);
-                build_all_zonelists(NULL);
+                build_all_zonelists(NULL, NULL);
                mutex_unlock(&zonelists_mutex);
        }
 #endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8c8bd652dd12..f33c7153b6d7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -147,6 +147,12 @@ typedef enum {
        CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
+/* the type of hotplug event */
+enum hotplug_event {
+        CPUSET_CPU_OFFLINE,
+        CPUSET_MEM_OFFLINE,
+};
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 }
 /*
- * Walk the specified cpuset subtree and look for empty cpusets.
+ * Helper function to traverse cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
+ * It can be used to walk the cpuset tree from top to bottom, completing
+ * one layer before dropping down to the next (thus always processing a
+ * node before any of its children).
+ */
+static struct cpuset *cpuset_next(struct list_head *queue)
+{
+        struct cpuset *cp;
+        struct cpuset *child;   /* scans child cpusets of cp */
+        struct cgroup *cont;
+        if (list_empty(queue))
+                return NULL;
+        cp = list_first_entry(queue, struct cpuset, stack_list);
+        list_del(queue->next);
+        list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+                child = cgroup_cs(cont);
+                list_add_tail(&child->stack_list, queue);
+        }
+        return cp;
+}
+/*
+ * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
+ * online/offline) and update the cpusets accordingly.
+ * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
+ * cpuset must be moved to a parent cpuset.
 *
 * Called with cgroup_mutex held.  We take callback_mutex to modify
 * cpus_allowed and mems_allowed.
@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 * before dropping down to the next.  It always processes a node before
 * any of its children.
 *
- * For now, since we lack memory hot unplug, we'll never see a cpuset
+ * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
- * that has tasks along with an empty 'mems'.  But if we did see such
+ * if all present pages from a node are offlined.
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
 */
-static void scan_for_empty_cpusets(struct cpuset *root)
+static void
+scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
 {
        LIST_HEAD(queue);
-        struct cpuset *cp;      /* scans cpusets being updated */
+        struct cpuset *cp;              /* scans cpusets being updated */
-        struct cpuset *child;   /* scans child cpusets of cp */
-        struct cgroup *cont;
        static nodemask_t oldmems;      /* protected by cgroup_mutex */
        list_add_tail((struct list_head *)&root->stack_list, &queue);
-        while (!list_empty(&queue)) {
+        switch (event) {
-                cp = list_first_entry(&queue, struct cpuset, stack_list);
+        case CPUSET_CPU_OFFLINE:
-                list_del(queue.next);
+                while ((cp = cpuset_next(&queue)) != NULL) {
-                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
-                        child = cgroup_cs(cont);
+                        /* Continue past cpusets with all cpus online */
-                        list_add_tail(&child->stack_list, &queue);
+                        if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
+                                continue;
+                        /* Remove offline cpus from this cpuset. */
+                        mutex_lock(&callback_mutex);
+                        cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+                                                        cpu_active_mask);
+                        mutex_unlock(&callback_mutex);
+                        /* Move tasks from the empty cpuset to a parent */
+                        if (cpumask_empty(cp->cpus_allowed))
+                                remove_tasks_in_empty_cpuset(cp);
+                        else
+                                update_tasks_cpumask(cp, NULL);
                }
+                break;
-                /* Continue past cpusets with all cpus, mems online */
+        case CPUSET_MEM_OFFLINE:
-                if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
+                while ((cp = cpuset_next(&queue)) != NULL) {
-                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
-                        continue;
-                oldmems = cp->mems_allowed;
+                        /* Continue past cpusets with all mems online */
+                        if (nodes_subset(cp->mems_allowed,
+                                        node_states[N_HIGH_MEMORY]))
+                                continue;
-                /* Remove offline cpus and mems from this cpuset. */
+                        oldmems = cp->mems_allowed;
-                mutex_lock(&callback_mutex);
-                cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+                        /* Remove offline mems from this cpuset. */
-                            cpu_active_mask);
+                        mutex_lock(&callback_mutex);
-                nodes_and(cp->mems_allowed, cp->mems_allowed,
+                        nodes_and(cp->mems_allowed, cp->mems_allowed,
                                                node_states[N_HIGH_MEMORY]);
-                mutex_unlock(&callback_mutex);
+                        mutex_unlock(&callback_mutex);
-                /* Move tasks from the empty cpuset to a parent */
+                        /* Move tasks from the empty cpuset to a parent */
-                if (cpumask_empty(cp->cpus_allowed) ||
+                        if (nodes_empty(cp->mems_allowed))
-                     nodes_empty(cp->mems_allowed))
+                                remove_tasks_in_empty_cpuset(cp);
-                        remove_tasks_in_empty_cpuset(cp);
+                        else
-                else {
+                                update_tasks_nodemask(cp, &oldmems, NULL);
-                        update_tasks_cpumask(cp, NULL);
-                        update_tasks_nodemask(cp, &oldmems, NULL);
                }
        }
 }
@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 * (of no effect) on systems that are actively using CPU hotplug
 * but making no active use of cpusets.
 *
+ * The only exception to this is suspend/resume, where we don't
+ * modify cpusets at all.
+ *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_active_mask on each CPU hotplug (cpuhp) event.
 *
 * Called within get_online_cpus().  Needs to call cgroup_lock()
 * before calling generate_sched_domains().
+ *
+ * @cpu_online: Indicates whether this is a CPU online event (true) or
+ * a CPU offline event (false).
 */
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
 {
        struct sched_domain_attr *attr;
        cpumask_var_t *doms;
@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
        mutex_lock(&callback_mutex);
        cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
        mutex_unlock(&callback_mutex);
-        scan_for_empty_cpusets(&top_cpuset);
+        if (!cpu_online)
+                scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
        ndoms = generate_sched_domains(&doms, &attr);
        cgroup_unlock();
@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
 /*
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
 */
 static int cpuset_track_online_nodes(struct notifier_block *self,
                                unsigned long action, void *arg)
@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
        case MEM_OFFLINE:
                /*
                 * needn't update top_cpuset.mems_allowed explicitly because
-                 * scan_for_empty_cpusets() will update it.
+                 * scan_cpusets_upon_hotplug() will update it.
                 */
-                scan_for_empty_cpusets(&top_cpuset);
+                scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
                break;
        default:
                break;
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8b68ce78ff17..be7b33b73d30 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -12,6 +12,7 @@
 #include <linux/kdb.h>
 #include <linux/kdebug.h>
 #include <linux/export.h>
+#include <linux/hardirq.h>
 #include "kdb_private.h"
 #include "../debug_core.h"
@@ -52,6 +53,9 @@ int kdb_stub(struct kgdb_state *ks)
        if (atomic_read(&kgdb_setting_breakpoint))
                reason = KDB_REASON_KEYBOARD;
+        if (in_nmi())
+                reason = KDB_REASON_NMI;
        for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
                if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
                        reason = KDB_REASON_BREAK;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index bb9520f0f6ff..0a69d2adc4f3 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -715,9 +715,6 @@ kdb_printit:
        /* check for having reached the LINES number of printed lines */
        if (kdb_nextline == linecount) {
                char buf1[16] = "";
-#if defined(CONFIG_SMP)
-                char buf2[32];
-#endif
                /* Watch out for recursion here.  Any routine that calls
                 * kdb_printf will come back through here.  And kdb_read
@@ -732,14 +729,6 @@ kdb_printit:
                if (moreprompt == NULL)
                        moreprompt = "more> ";
-#if defined(CONFIG_SMP)
-                if (strchr(moreprompt, '%')) {
-                        sprintf(buf2, moreprompt, get_cpu());
-                        put_cpu();
-                        moreprompt = buf2;
-                }
-#endif
                kdb_input_flush();
                c = console_drivers;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 1f91413edb87..31df1706b9a9 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -139,11 +139,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
 static char *__env[] = {
 #if defined(CONFIG_SMP)
 "PROMPT=[%d]kdb> ",
- "MOREPROMPT=[%d]more> ",
 #else
 "PROMPT=kdb> ",
- "MOREPROMPT=more> ",
 #endif
+ "MOREPROMPT=more> ",
 "RADIX=16",
 "MDCOUNT=8",                   /* lines of md output */
 KDB_PLATFORM_ENV,
@@ -1236,18 +1235,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
                *cmdbuf = '\0';
                *(cmd_hist[cmd_head]) = '\0';
-                if (KDB_FLAG(ONLY_DO_DUMP)) {
-                        /* kdb is off but a catastrophic error requires a dump.
-                         * Take the dump and reboot.
-                         * Turn on logging so the kdb output appears in the log
-                         * buffer in the dump.
-                         */
-                        const char *setargs[] = { "set", "LOGGING", "1" };
-                        kdb_set(2, setargs);
-                        kdb_reboot(0, NULL);
-                        /*NOTREACHED*/
-                }
 do_full_getstr:
 #if defined(CONFIG_SMP)
                snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6581a040f399..c77206184b8b 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -153,11 +153,17 @@ put_callchain_entry(int rctx)
        put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
 }
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
 {
        int rctx;
        struct perf_callchain_entry *entry;
+        int kernel = !event->attr.exclude_callchain_kernel;
+        int user   = !event->attr.exclude_callchain_user;
+        if (!kernel && !user)
+                return NULL;
        entry = get_callchain_entry(&rctx);
        if (rctx == -1)
@@ -168,18 +174,29 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
        entry->nr = 0;
-        if (!user_mode(regs)) {
+        if (kernel && !user_mode(regs)) {
                perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
                perf_callchain_kernel(entry, regs);
-                if (current->mm)
-                        regs = task_pt_regs(current);
-                else
-                        regs = NULL;
        }
-        if (regs) {
+        if (user) {
-                perf_callchain_store(entry, PERF_CONTEXT_USER);
+                if (!user_mode(regs)) {
-                perf_callchain_user(entry, regs);
+                        if  (current->mm)
+                                regs = task_pt_regs(current);
+                        else
+                                regs = NULL;
+                }
+                if (regs) {
+                        /*
+                         * Disallow cross-task user callchains.
+                         */
+                        if (event->ctx->task && event->ctx->task != current)
+                                goto exit_put;
+                        perf_callchain_store(entry, PERF_CONTEXT_USER);
+                        perf_callchain_user(entry, regs);
+                }
        }
 exit_put:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f1cf0edeb39a..2ba890450d15 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,7 @@
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/mm_types.h>
 #include "internal.h"
@@ -3756,6 +3757,132 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 }
 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+static void
+perf_output_sample_regs(struct perf_output_handle *handle,
+                        struct pt_regs *regs, u64 mask)
+{
+        int bit;
+        for_each_set_bit(bit, (const unsigned long *) &mask,
+                         sizeof(mask) * BITS_PER_BYTE) {
+                u64 val;
+                val = perf_reg_value(regs, bit);
+                perf_output_put(handle, val);
+        }
+}
+static void perf_sample_regs_user(struct perf_regs_user *regs_user,
+                                  struct pt_regs *regs)
+{
+        if (!user_mode(regs)) {
+                if (current->mm)
+                        regs = task_pt_regs(current);
+                else
+                        regs = NULL;
+        }
+        if (regs) {
+                regs_user->regs = regs;
+                regs_user->abi  = perf_reg_abi(current);
+        }
+}
+/*
+ * Get remaining task size from user stack pointer.
+ *
+ * It'd be better to take stack vma map and limit this more
+ * precisely, but there's no way to get it safely under interrupt,
+ * so using TASK_SIZE as limit.
+ */
+static u64 perf_ustack_task_size(struct pt_regs *regs)
+{
+        unsigned long addr = perf_user_stack_pointer(regs);
+        if (!addr || addr >= TASK_SIZE)
+                return 0;
+        return TASK_SIZE - addr;
+}
+static u16
+perf_sample_ustack_size(u16 stack_size, u16 header_size,
+                        struct pt_regs *regs)
+{
+        u64 task_size;
+        /* No regs, no stack pointer, no dump. */
+        if (!regs)
+                return 0;
+        /*
+         * Check if we fit in with the requested stack size into the:
+         * - TASK_SIZE
+         *   If we don't, we limit the size to the TASK_SIZE.
+         *
+         * - remaining sample size
+         *   If we don't, we customize the stack size to
+         *   fit in to the remaining sample size.
+         */
+        task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
+        stack_size = min(stack_size, (u16) task_size);
+        /* Current header size plus static size and dynamic size. */
+        header_size += 2 * sizeof(u64);
+        /* Do we fit in with the current stack dump size? */
+        if ((u16) (header_size + stack_size) < header_size) {
+                /*
+                 * If we overflow the maximum size for the sample,
+                 * we customize the stack dump size to fit in.
+                 */
+                stack_size = USHRT_MAX - header_size - sizeof(u64);
+                stack_size = round_up(stack_size, sizeof(u64));
+        }
+        return stack_size;
+}
+static void
+perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
+                          struct pt_regs *regs)
+{
+        /* Case of a kernel thread, nothing to dump */
+        if (!regs) {
+                u64 size = 0;
+                perf_output_put(handle, size);
+        } else {
+                unsigned long sp;
+                unsigned int rem;
+                u64 dyn_size;
+                /*
+                 * We dump:
+                 * static size
+                 *   - the size requested by user or the best one we can fit
+                 *     in to the sample max size
+                 * data
+                 *   - user stack dump data
+                 * dynamic size
+                 *   - the actual dumped size
+                 */
+                /* Static size. */
+                perf_output_put(handle, dump_size);
+                /* Data. */
+                sp = perf_user_stack_pointer(regs);
+                rem = __output_copy_user(handle, (void *) sp, dump_size);
+                dyn_size = dump_size - rem;
+                perf_output_skip(handle, rem);
+                /* Dynamic size. */
+                perf_output_put(handle, dyn_size);
+        }
+}
 static void __perf_event_header__init_id(struct perf_event_header *header,
                                         struct perf_sample_data *data,
                                         struct perf_event *event)
@@ -4016,6 +4143,28 @@ void perf_output_sample(struct perf_output_handle *handle,
                        perf_output_put(handle, nr);
                }
        }
+        if (sample_type & PERF_SAMPLE_REGS_USER) {
+                u64 abi = data->regs_user.abi;
+                /*
+                 * If there are no regs to dump, signal it through the
+                 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
+                 */
+                perf_output_put(handle, abi);
+                if (abi) {
+                        u64 mask = event->attr.sample_regs_user;
+                        perf_output_sample_regs(handle,
+                                                data->regs_user.regs,
+                                                mask);
+                }
+        }
+        if (sample_type & PERF_SAMPLE_STACK_USER)
+                perf_output_sample_ustack(handle,
+                                          data->stack_user_size,
+                                          data->regs_user.regs);
 }
 void perf_prepare_sample(struct perf_event_header *header,
@@ -4039,7 +4188,7 @@ void perf_prepare_sample(struct perf_event_header *header,
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                int size = 1;
-                data->callchain = perf_callchain(regs);
+                data->callchain = perf_callchain(event, regs);
                if (data->callchain)
                        size += data->callchain->nr;
@@ -4067,6 +4216,49 @@ void perf_prepare_sample(struct perf_event_header *header,
                }
                header->size += size;
        }
+        if (sample_type & PERF_SAMPLE_REGS_USER) {
+                /* regs dump ABI info */
+                int size = sizeof(u64);
+                perf_sample_regs_user(&data->regs_user, regs);
+                if (data->regs_user.regs) {
+                        u64 mask = event->attr.sample_regs_user;
+                        size += hweight64(mask) * sizeof(u64);
+                }
+                header->size += size;
+        }
+        if (sample_type & PERF_SAMPLE_STACK_USER) {
+                /*
+                 * Either we need PERF_SAMPLE_STACK_USER bit to be always
+                 * processed as the last one or have additional check added
+                 * in case new sample type is added, because we could eat
+                 * up the rest of the sample size.
+                 */
+                struct perf_regs_user *uregs = &data->regs_user;
+                u16 stack_size = event->attr.sample_stack_user;
+                u16 size = sizeof(u64);
+                if (!uregs->abi)
+                        perf_sample_regs_user(uregs, regs);
+                stack_size = perf_sample_ustack_size(stack_size, header->size,
+                                                     uregs->regs);
+                /*
+                 * If there is something to dump, add space for the dump
+                 * itself and for the field that tells the dynamic size,
+                 * which is how many have been actually dumped.
+                 */
+                if (stack_size)
+                        size += sizeof(u64) + stack_size;
+                data->stack_user_size = stack_size;
+                header->size += size;
+        }
 }
 static void perf_event_output(struct perf_event *event,
@@ -5209,7 +5401,8 @@ static int perf_tp_event_match(struct perf_event *event,
 }
 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
-                   struct pt_regs *regs, struct hlist_head *head, int rctx)
+                   struct pt_regs *regs, struct hlist_head *head, int rctx,
+                   struct task_struct *task)
 {
        struct perf_sample_data data;
        struct perf_event *event;
@@ -5228,6 +5421,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
                        perf_swevent_event(event, count, &data, regs);
        }
+        /*
+         * If a target task was specified, also iterate its context and
+         * deliver this event there too.
+         */
+        if (task && task != current) {
+                struct perf_event_context *ctx;
+                struct trace_entry *entry = record;
+                rcu_read_lock();
+                ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+                if (!ctx)
+                        goto unlock;
+                list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+                        if (event->attr.type != PERF_TYPE_TRACEPOINT)
+                                continue;
+                        if (event->attr.config != entry->type)
+                                continue;
+                        if (perf_tp_event_match(event, &data, regs))
+                                perf_swevent_event(event, count, &data, regs);
+                }
+unlock:
+                rcu_read_unlock();
+        }
        perf_swevent_put_recursion_context(rctx);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
@@ -6116,6 +6334,28 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
                        attr->branch_sample_type = mask;
                }
        }
+        if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
+                ret = perf_reg_validate(attr->sample_regs_user);
+                if (ret)
+                        return ret;
+        }
+        if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
+                if (!arch_perf_have_user_stack_dump())
+                        return -ENOSYS;
+                /*
+                 * We have __u32 type for the size, but so far
+                 * we can only use __u16 as maximum due to the
+                 * __u16 sample size limit.
+                 */
+                if (attr->sample_stack_user >= USHRT_MAX)
+                        ret = -EINVAL;
+                else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
+                        ret = -EINVAL;
+        }
 out:
        return ret;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index b0b107f90afc..d56a64c99a8b 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -2,6 +2,7 @@
 #define _KERNEL_EVENTS_INTERNAL_H
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>
 /* Buffer handling */
@@ -76,32 +77,56 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
        return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
 }
-static inline void
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func)                      \
-__output_copy(struct perf_output_handle *handle,
+static inline unsigned int                                              \
-                   const void *buf, unsigned int len)
+func_name(struct perf_output_handle *handle,                            \
+          const void *buf, unsigned int len)                            \
+{                                                                       \
+        unsigned long size, written;                                    \
+                                                                        \
+        do {                                                            \
+                size = min_t(unsigned long, handle->size, len);         \
+                                                                        \
+                written = memcpy_func(handle->addr, buf, size);         \
+                                                                        \
+                len -= written;                                         \
+                handle->addr += written;                                \
+                buf += written;                                         \
+                handle->size -= written;                                \
+                if (!handle->size) {                                    \
+                        struct ring_buffer *rb = handle->rb;            \
+                                                                        \
+                        handle->page++;                                 \
+                        handle->page &= rb->nr_pages - 1;               \
+                        handle->addr = rb->data_pages[handle->page];    \
+                        handle->size = PAGE_SIZE << page_order(rb);     \
+                }                                                       \
+        } while (len && written == size);                               \
+                                                                        \
+        return len;                                                     \
+}
+static inline int memcpy_common(void *dst, const void *src, size_t n)
 {
-        do {
+        memcpy(dst, src, n);
-                unsigned long size = min_t(unsigned long, handle->size, len);
+        return n;
-                memcpy(handle->addr, buf, size);
-                len -= size;
-                handle->addr += size;
-                buf += size;
-                handle->size -= size;
-                if (!handle->size) {
-                        struct ring_buffer *rb = handle->rb;
-                        handle->page++;
-                        handle->page &= rb->nr_pages - 1;
-                        handle->addr = rb->data_pages[handle->page];
-                        handle->size = PAGE_SIZE << page_order(rb);
-                }
-        } while (len);
 }
+DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
+#define MEMCPY_SKIP(dst, src, n) (n)
+DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
+#ifndef arch_perf_out_copy_user
+#define arch_perf_out_copy_user __copy_from_user_inatomic
+#endif
+DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
 /* Callchain handling */
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+extern struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs);
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
@@ -133,4 +158,20 @@ static inline void put_recursion_context(int *recursion, int rctx)
        recursion[rctx]--;
 }
+#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+        return true;
+}
+#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
+#else
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+        return false;
+}
+#define perf_user_stack_pointer(regs) 0
+#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
 #endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 6ddaba43fb7a..23cb34ff3973 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -182,10 +182,16 @@ out:
        return -ENOSPC;
 }
-void perf_output_copy(struct perf_output_handle *handle,
+unsigned int perf_output_copy(struct perf_output_handle *handle,
                      const void *buf, unsigned int len)
 {
-        __output_copy(handle, buf, len);
+        return __output_copy(handle, buf, len);
+}
+unsigned int perf_output_skip(struct perf_output_handle *handle,
+                              unsigned int len)
+{
+        return __output_skip(handle, NULL, len);
 }
 void perf_output_end(struct perf_output_handle *handle)
diff --git a/kernel/exit.c b/kernel/exit.c
index 2f59cc334516..f65345f9e5bb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -483,7 +483,7 @@ static void close_files(struct files_struct * files)
        rcu_read_unlock();
        for (;;) {
                unsigned long set;
-                i = j * __NFDBITS;
+                i = j * BITS_PER_LONG;
                if (i >= fdt->max_fds)
                        break;
                set = fdt->open_fds[j++];
@@ -953,14 +953,11 @@ void do_exit(long code)
        exit_signals(tsk);  /* sets PF_EXITING */
        /*
         * tsk->flags are checked in the futex code to protect against
-         * an exiting task cleaning up the robust pi futexes, and in
+         * an exiting task cleaning up the robust pi futexes.
-         * task_work_add() to avoid the race with exit_task_work().
         */
        smp_mb();
        raw_spin_unlock_wait(&tsk->pi_lock);
-        exit_task_work(tsk);
        if (unlikely(in_atomic()))
                printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
                                current->comm, task_pid_nr(current),
@@ -995,6 +992,7 @@ void do_exit(long code)
        exit_shm(tsk);
        exit_files(tsk);
        exit_fs(tsk);
+        exit_task_work(tsk);
        check_stack_usage();
        exit_thread();
diff --git a/kernel/fork.c b/kernel/fork.c
index f00e319d8376..3bd2280d79f6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -114,6 +114,10 @@ int nr_processes(void)
        return total;
 }
+void __weak arch_release_task_struct(struct task_struct *tsk)
+{
+}
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 static struct kmem_cache *task_struct_cachep;
@@ -122,17 +126,17 @@ static inline struct task_struct *alloc_task_struct_node(int node)
        return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
 }
-void __weak arch_release_task_struct(struct task_struct *tsk) { }
 static inline void free_task_struct(struct task_struct *tsk)
 {
-        arch_release_task_struct(tsk);
        kmem_cache_free(task_struct_cachep, tsk);
 }
 #endif
+void __weak arch_release_thread_info(struct thread_info *ti)
+{
+}
 #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
-void __weak arch_release_thread_info(struct thread_info *ti) { }
 /*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
@@ -150,7 +154,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 static inline void free_thread_info(struct thread_info *ti)
 {
-        arch_release_thread_info(ti);
        free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
 }
 # else
@@ -164,7 +167,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 static void free_thread_info(struct thread_info *ti)
 {
-        arch_release_thread_info(ti);
        kmem_cache_free(thread_info_cache, ti);
 }
@@ -205,10 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account)
 void free_task(struct task_struct *tsk)
 {
        account_kernel_stack(tsk->stack, -1);
+        arch_release_thread_info(tsk->stack);
        free_thread_info(tsk->stack);
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        put_seccomp_filter(tsk);
+        arch_release_task_struct(tsk);
        free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -298,23 +302,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
                return NULL;
        ti = alloc_thread_info_node(tsk, node);
-        if (!ti) {
+        if (!ti)
-                free_task_struct(tsk);
+                goto free_tsk;
-                return NULL;
-        }
        err = arch_dup_task_struct(tsk, orig);
+        if (err)
+                goto free_ti;
-        /*
-         * We defer looking at err, because we will need this setup
-         * for the clean up path to work correctly.
-         */
        tsk->stack = ti;
-        setup_thread_stack(tsk, orig);
-        if (err)
-                goto out;
+        setup_thread_stack(tsk, orig);
        clear_user_return_notifier(tsk);
        clear_tsk_need_resched(tsk);
        stackend = end_of_stack(tsk);
@@ -338,8 +335,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
        return tsk;
-out:
+free_ti:
        free_thread_info(ti);
+free_tsk:
        free_task_struct(tsk);
        return NULL;
 }
@@ -383,16 +381,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                struct file *file;
                if (mpnt->vm_flags & VM_DONTCOPY) {
-                        long pages = vma_pages(mpnt);
-                        mm->total_vm -= pages;
                        vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
-                                                                -pages);
+                                                        -vma_pages(mpnt));
                        continue;
                }
                charge = 0;
                if (mpnt->vm_flags & VM_ACCOUNT) {
-                        unsigned long len;
+                        unsigned long len = vma_pages(mpnt);
-                        len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
                        if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                                goto fail_nomem;
                        charge = len;
@@ -1310,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
 #endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
        p->memcg_batch.do_batch = 0;
        p->memcg_batch.memcg = NULL;
 #endif
@@ -1420,7 +1416,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         */
        p->group_leader = p;
        INIT_LIST_HEAD(&p->thread_group);
-        INIT_HLIST_HEAD(&p->task_works);
+        p->task_works = NULL;
        /* Now that the task is set up, run cgroup callbacks if
         * necessary. We need to run them before the task is visible
diff --git a/kernel/futex.c b/kernel/futex.c
index e2b0fb9a0b3b..3717e7b306e0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 * @uaddr2:     the pi futex we will take prior to returning to user-space
 *
 * The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware.  Normal wakeup will wake on uaddr2 and
+ * uaddr2 which must be PI aware and unique from uaddr.  Normal wakeup will wake
- * complete the acquisition of the rt_mutex prior to returning to userspace.
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
- * This ensures the rt_mutex maintains an owner when it has waiters; without
+ * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
- * one, the pi logic wouldn't know which task to boost/deboost, if there was a
+ * without one, the pi logic would not know which task to boost/deboost, if
- * need to.
+ * there was a need to.
 *
 * We call schedule in futex_wait_queue_me() when we enqueue and return there
 * via the following:
@@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
        struct futex_q q = futex_q_init;
        int res, ret;
+        if (uaddr == uaddr2)
+                return -EINVAL;
        if (!bitset)
                return -EINVAL;
@@ -2343,7 +2346,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
                 * the pi_state.
                 */
-                WARN_ON(!&q.pi_state);
+                WARN_ON(!q.pi_state);
                pi_mutex = &q.pi_state->pi_mutex;
                ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
                debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2370,7 +2373,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
         * fault, unlock the rt_mutex and return the fault to userspace.
         */
        if (ret == -EFAULT) {
-                if (rt_mutex_owner(pi_mutex) == current)
+                if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
                        rt_mutex_unlock(pi_mutex);
        } else if (ret == -EINTR) {
                /*
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index bdb180325551..131ca176b497 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,7 +133,7 @@ irqreturn_t
 handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 {
        irqreturn_t retval = IRQ_NONE;
-        unsigned int random = 0, irq = desc->irq_data.irq;
+        unsigned int flags = 0, irq = desc->irq_data.irq;
        do {
                irqreturn_t res;
@@ -161,7 +161,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
                        /* Fall through to add to randomness */
                case IRQ_HANDLED:
-                        random |= action->flags;
+                        flags |= action->flags;
                        break;
                default:
@@ -172,8 +172,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
                action = action->next;
        } while (action);
-        if (random & IRQF_SAMPLE_RANDOM)
+        add_interrupt_randomness(irq, flags);
-                add_interrupt_randomness(irq);
        if (!noirqdebug)
                note_interrupt(irq, desc, retval);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 41c1564103f1..49a77727db42 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -10,6 +10,7 @@
 #include <linux/mutex.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/topology.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
@@ -45,7 +46,8 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node,
 {
        struct irq_domain *domain;
-        domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+        domain = kzalloc_node(sizeof(*domain), GFP_KERNEL,
+                              of_node_to_nid(of_node));
        if (WARN_ON(!domain))
                return NULL;
@@ -138,6 +140,36 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
 }
 /**
+ * irq_domain_add_simple() - Allocate and register a simple irq_domain.
+ * @of_node: pointer to interrupt controller's device tree node.
+ * @size: total number of irqs in mapping
+ * @first_irq: first number of irq block assigned to the domain
+ * @ops: map/unmap domain callbacks
+ * @host_data: Controller private data pointer
+ *
+ * Allocates a legacy irq_domain if first_irq is positive or a linear
+ * domain otherwise.
+ *
+ * This is intended to implement the expected behaviour for most
+ * interrupt controllers which is that a linear mapping should
+ * normally be used unless the system requires a legacy mapping in
+ * order to support supplying interrupt numbers during non-DT
+ * registration of devices.
+ */
+struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
+                                         unsigned int size,
+                                         unsigned int first_irq,
+                                         const struct irq_domain_ops *ops,
+                                         void *host_data)
+{
+        if (first_irq > 0)
+                return irq_domain_add_legacy(of_node, size, first_irq, 0,
+                                             ops, host_data);
+        else
+                return irq_domain_add_linear(of_node, size, ops, host_data);
+}
+/**
 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
 * @of_node: pointer to interrupt controller's device tree node.
 * @size: total number of irqs in legacy mapping
@@ -203,7 +235,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
                 * one can then use irq_create_mapping() to
                 * explicitly change them
                 */
-                ops->map(domain, irq, hwirq);
+                if (ops->map)
+                        ops->map(domain, irq, hwirq);
                /* Clear norequest flags */
                irq_clear_status_flags(irq, IRQ_NOREQUEST);
@@ -215,7 +248,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
 EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
 /**
- * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain.
+ * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
 * @of_node: pointer to interrupt controller's device tree node.
 * @size: Number of interrupts in the domain.
 * @ops: map/unmap domain callbacks
@@ -229,7 +262,8 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
        struct irq_domain *domain;
        unsigned int *revmap;
-        revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL);
+        revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL,
+                              of_node_to_nid(of_node));
        if (WARN_ON(!revmap))
                return NULL;
@@ -330,24 +364,112 @@ void irq_set_default_host(struct irq_domain *domain)
 }
 EXPORT_SYMBOL_GPL(irq_set_default_host);
+/*
+ * Undo the association of @count linux irqs starting at @irq_base with
+ * @domain. An irq that has no irq_data, or whose irq_data belongs to a
+ * different domain, triggers a warning and is skipped.
+ */
-static int irq_setup_virq(struct irq_domain *domain, unsigned int virq,
+static void irq_domain_disassociate_many(struct irq_domain *domain,
-                            irq_hw_number_t hwirq)
+                                         unsigned int irq_base, int count)
 {
-        struct irq_data *irq_data = irq_get_irq_data(virq);
+        /*
+         * disassociate in reverse order;
+         * not strictly necessary, but nice for unwinding
+         */
+        while (count--) {
+                int irq = irq_base + count;
+                struct irq_data *irq_data = irq_get_irq_data(irq);
+                irq_hw_number_t hwirq;
+                if (WARN_ON(!irq_data || irq_data->domain != domain))
+                        continue;
+                /* only dereference irq_data once it is known to be valid */
+                hwirq = irq_data->hwirq;
+                irq_set_status_flags(irq, IRQ_NOREQUEST);
+                /* remove chip and handler */
+                irq_set_chip_and_handler(irq, NULL, NULL);
+                /* Make sure it's completed */
+                synchronize_irq(irq);
+                /* Tell the PIC about it */
+                if (domain->ops->unmap)
+                        domain->ops->unmap(domain, irq);
+                smp_mb();
-        irq_data->hwirq = hwirq;
-        irq_data->domain = domain;
-        if (domain->ops->map(domain, virq, hwirq)) {
-                pr_debug("irq-%i==>hwirq-0x%lx mapping failed\n", virq, hwirq);
                 irq_data->domain = NULL;
                 irq_data->hwirq = 0;
-                return -1;
+                /* Clear reverse map */
+                switch(domain->revmap_type) {
+                case IRQ_DOMAIN_MAP_LINEAR:
+                        if (hwirq < domain->revmap_data.linear.size)
+                                domain->revmap_data.linear.revmap[hwirq] = 0;
+                        break;
+                case IRQ_DOMAIN_MAP_TREE:
+                        mutex_lock(&revmap_trees_mutex);
+                        radix_tree_delete(&domain->revmap_data.tree, hwirq);
+                        mutex_unlock(&revmap_trees_mutex);
+                        break;
+                }
         }
+}
+/**
+ * irq_domain_associate_many() - Associate a range of linux irqs with hw irqs
+ * @domain: domain the interrupts are attached to
+ * @irq_base: first linux irq number of the range
+ * @hwirq_base: first hardware irq number of the range
+ * @count: number of consecutive interrupts to associate
+ *
+ * Every irq in the range must already have irq_data and must not be
+ * associated with a domain yet; this is verified for the whole range
+ * before anything is modified. If the domain's map() callback fails for
+ * an entry, the entries associated so far are disassociated again.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
+                              irq_hw_number_t hwirq_base, int count)
+{
+        unsigned int virq = irq_base;
+        irq_hw_number_t hwirq = hwirq_base;
+        int i, ret;
+        pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
+                of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
+        /* validate the whole range first so a bad entry changes nothing */
+        for (i = 0; i < count; i++) {
+                struct irq_data *irq_data = irq_get_irq_data(virq + i);
+                if (WARN(!irq_data, "error: irq_desc not allocated; "
+                         "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
+                        return -EINVAL;
+                if (WARN(irq_data->domain, "error: irq_desc already associated; "
+                         "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i))
+                        return -EINVAL;
+        }
+        for (i = 0; i < count; i++, virq++, hwirq++) {
+                struct irq_data *irq_data = irq_get_irq_data(virq);
+                irq_data->hwirq = hwirq;
+                irq_data->domain = domain;
+                if (domain->ops->map) {
+                        ret = domain->ops->map(domain, virq, hwirq);
+                        if (ret != 0) {
+                                pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
+                                       virq, hwirq, ret);
+                                WARN_ON(1);
+                                irq_data->domain = NULL;
+                                irq_data->hwirq = 0;
+                                goto err_unmap;
+                        }
+                }
-        irq_clear_status_flags(virq, IRQ_NOREQUEST);
+                switch (domain->revmap_type) {
+                case IRQ_DOMAIN_MAP_LINEAR:
+                        if (hwirq < domain->revmap_data.linear.size)
+                                domain->revmap_data.linear.revmap[hwirq] = virq;
+                        break;
+                case IRQ_DOMAIN_MAP_TREE:
+                        mutex_lock(&revmap_trees_mutex);
+                        radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
+                        mutex_unlock(&revmap_trees_mutex);
+                        break;
+                }
+                irq_clear_status_flags(virq, IRQ_NOREQUEST);
+        }
         return 0;
+ err_unmap:
+        /* the failing entry was reset above; unwind the i entries before it */
+        irq_domain_disassociate_many(domain, irq_base, i);
+        return -EINVAL;
 }
+EXPORT_SYMBOL_GPL(irq_domain_associate_many);
 /**
 * irq_create_direct_mapping() - Allocate an irq for direct mapping
@@ -364,10 +486,10 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
        if (domain == NULL)
                domain = irq_default_domain;
-        BUG_ON(domain == NULL);
+        if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP))
-        WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP);
+                return 0;
-        virq = irq_alloc_desc_from(1, 0);
+        virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
        if (!virq) {
                pr_debug("create_direct virq allocation failed\n");
                return 0;
@@ -380,7 +502,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
        }
        pr_debug("create_direct obtained virq %d\n", virq);
-        if (irq_setup_virq(domain, virq, virq)) {
+        if (irq_domain_associate(domain, virq, virq)) {
                irq_free_desc(virq);
                return 0;
        }
@@ -433,27 +555,64 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
        hint = hwirq % nr_irqs;
        if (hint == 0)
                hint++;
-        virq = irq_alloc_desc_from(hint, 0);
+        virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node));
        if (virq <= 0)
-                virq = irq_alloc_desc_from(1, 0);
+                virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
        if (virq <= 0) {
                pr_debug("-> virq allocation failed\n");
                return 0;
        }
-        if (irq_setup_virq(domain, virq, hwirq)) {
+        if (irq_domain_associate(domain, virq, hwirq)) {
-                if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY)
+                irq_free_desc(virq);
-                        irq_free_desc(virq);
                return 0;
        }
        pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
-                hwirq, domain->of_node ? domain->of_node->full_name : "null", virq);
+                hwirq, of_node_full_name(domain->of_node), virq);
        return virq;
 }
 EXPORT_SYMBOL_GPL(irq_create_mapping);
+/**
+ * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
+ * @domain: domain owning the interrupt range
+ * @irq_base: beginning of linux IRQ range
+ * @hwirq_base: beginning of hardware IRQ range
+ * @count: Number of interrupts to map
+ *
+ * This routine is used for allocating and mapping a range of hardware
+ * irqs to linux irqs where the linux irq numbers are at pre-defined
+ * locations. For use by controllers that already have static mappings
+ * to insert into the domain.
+ *
+ * Non-linear users can use irq_create_identity_mapping() for IRQ-at-a-time
+ * domain insertion.
+ *
+ * 0 is returned upon success, while any failure to establish a static
+ * mapping is treated as an error.
+ */
+int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
+                               irq_hw_number_t hwirq_base, int count)
+{
+        int ret;
+        ret = irq_alloc_descs(irq_base, irq_base, count,
+                              of_node_to_nid(domain->of_node));
+        if (unlikely(ret < 0))
+                return ret;
+        ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count);
+        if (unlikely(ret < 0)) {
+                irq_free_descs(irq_base, count);
+                return ret;
+        }
+        return 0;
+}
+EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
 unsigned int irq_create_of_mapping(struct device_node *controller,
                                   const u32 *intspec, unsigned int intsize)
 {
@@ -477,7 +636,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
                        return intspec[0];
 #endif
                pr_warning("no irq domain found for %s !\n",
-                           controller->full_name);
+                           of_node_full_name(controller));
                return 0;
        }
@@ -511,7 +670,6 @@ void irq_dispose_mapping(unsigned int virq)
 {
        struct irq_data *irq_data = irq_get_irq_data(virq);
        struct irq_domain *domain;
-        irq_hw_number_t hwirq;
        if (!virq || !irq_data)
                return;
@@ -524,33 +682,7 @@ void irq_dispose_mapping(unsigned int virq)
        if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
                return;
-        irq_set_status_flags(virq, IRQ_NOREQUEST);
+        irq_domain_disassociate_many(domain, virq, 1);
-        /* remove chip and handler */
-        irq_set_chip_and_handler(virq, NULL, NULL);
-        /* Make sure it's completed */
-        synchronize_irq(virq);
-        /* Tell the PIC about it */
-        if (domain->ops->unmap)
-                domain->ops->unmap(domain, virq);
-        smp_mb();
-        /* Clear reverse map */
-        hwirq = irq_data->hwirq;
-        switch(domain->revmap_type) {
-        case IRQ_DOMAIN_MAP_LINEAR:
-                if (hwirq < domain->revmap_data.linear.size)
-                        domain->revmap_data.linear.revmap[hwirq] = 0;
-                break;
-        case IRQ_DOMAIN_MAP_TREE:
-                mutex_lock(&revmap_trees_mutex);
-                radix_tree_delete(&domain->revmap_data.tree, hwirq);
-                mutex_unlock(&revmap_trees_mutex);
-                break;
-        }
        irq_free_desc(virq);
 }
 EXPORT_SYMBOL_GPL(irq_dispose_mapping);
@@ -559,16 +691,11 @@ EXPORT_SYMBOL_GPL(irq_dispose_mapping);
 * irq_find_mapping() - Find a linux irq from an hw irq number.
 * @domain: domain owning this hardware interrupt
 * @hwirq: hardware irq number in that domain space
- *
- * This is a slow path, for use by generic code. It's expected that an
- * irq controller implementation directly calls the appropriate low level
- * mapping function.
 */
 unsigned int irq_find_mapping(struct irq_domain *domain,
                              irq_hw_number_t hwirq)
 {
-        unsigned int i;
+        struct irq_data *data;
-        unsigned int hint = hwirq % nr_irqs;
        /* Look for default domain if nececssary */
        if (domain == NULL)
@@ -576,115 +703,47 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
        if (domain == NULL)
                return 0;
-        /* legacy -> bail early */
+        switch (domain->revmap_type) {
-        if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
+        case IRQ_DOMAIN_MAP_LEGACY:
                return irq_domain_legacy_revmap(domain, hwirq);
+        case IRQ_DOMAIN_MAP_LINEAR:
-        /* Slow path does a linear search of the map */
+                return irq_linear_revmap(domain, hwirq);
-        if (hint == 0)
+        case IRQ_DOMAIN_MAP_TREE:
-                hint = 1;
+                rcu_read_lock();
-        i = hint;
+                data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
-        do {
+                rcu_read_unlock();
-                struct irq_data *data = irq_get_irq_data(i);
+                if (data)
+                        return data->irq;
+                break;
+        case IRQ_DOMAIN_MAP_NOMAP:
+                data = irq_get_irq_data(hwirq);
                if (data && (data->domain == domain) && (data->hwirq == hwirq))
-                        return i;
+                        return hwirq;
-                i++;
+                break;
-                if (i >= nr_irqs)
+        }
-                        i = 1;
-        } while(i != hint);
        return 0;
 }
 EXPORT_SYMBOL_GPL(irq_find_mapping);
 /**
- * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number.
- * @domain: domain owning this hardware interrupt
- * @hwirq: hardware irq number in that domain space
- *
- * This is a fast path, for use by irq controller code that uses radix tree
- * revmaps
- */
-unsigned int irq_radix_revmap_lookup(struct irq_domain *domain,
-                                     irq_hw_number_t hwirq)
-{
-        struct irq_data *irq_data;
-        if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
-                return irq_find_mapping(domain, hwirq);
-        /*
-         * Freeing an irq can delete nodes along the path to
-         * do the lookup via call_rcu.
-         */
-        rcu_read_lock();
-        irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
-        rcu_read_unlock();
-        /*
-         * If found in radix tree, then fine.
-         * Else fallback to linear lookup - this should not happen in practice
-         * as it means that we failed to insert the node in the radix tree.
-         */
-        return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);
-}
-EXPORT_SYMBOL_GPL(irq_radix_revmap_lookup);
-/**
- * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping.
- * @domain: domain owning this hardware interrupt
- * @virq: linux irq number
- * @hwirq: hardware irq number in that domain space
- *
- * This is for use by irq controllers that use a radix tree reverse
- * mapping for fast lookup.
- */
-void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq,
-                             irq_hw_number_t hwirq)
-{
-        struct irq_data *irq_data = irq_get_irq_data(virq);
-        if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
-                return;
-        if (virq) {
-                mutex_lock(&revmap_trees_mutex);
-                radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
-                mutex_unlock(&revmap_trees_mutex);
-        }
-}
-EXPORT_SYMBOL_GPL(irq_radix_revmap_insert);
-/**
 * irq_linear_revmap() - Find a linux irq from a hw irq number.
 * @domain: domain owning this hardware interrupt
 * @hwirq: hardware irq number in that domain space
 *
- * This is a fast path, for use by irq controller code that uses linear
+ * This is a fast path that can be called directly by irq controller code to
- * revmaps. It does fallback to the slow path if the revmap doesn't exist
+ * save a handful of instructions.
- * yet and will create the revmap entry with appropriate locking
 */
 unsigned int irq_linear_revmap(struct irq_domain *domain,
                               irq_hw_number_t hwirq)
 {
-        unsigned int *revmap;
+        BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR);
-        if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR))
-                return irq_find_mapping(domain, hwirq);
-        /* Check revmap bounds */
+        /* Check revmap bounds; complain if exceeded */
-        if (unlikely(hwirq >= domain->revmap_data.linear.size))
+        if (WARN_ON(hwirq >= domain->revmap_data.linear.size))
-                return irq_find_mapping(domain, hwirq);
+                return 0;
-        /* Check if revmap was allocated */
-        revmap = domain->revmap_data.linear.revmap;
-        if (unlikely(revmap == NULL))
-                return irq_find_mapping(domain, hwirq);
-        /* Fill up revmap with slow path if no mapping found */
-        if (unlikely(!revmap[hwirq]))
-                revmap[hwirq] = irq_find_mapping(domain, hwirq);
-        return revmap[hwirq];
+        return domain->revmap_data.linear.revmap[hwirq];
 }
 EXPORT_SYMBOL_GPL(irq_linear_revmap);
@@ -725,8 +784,8 @@ static int virq_debug_show(struct seq_file *m, void *private)
                        data = irq_desc_get_chip_data(desc);
                        seq_printf(m, data ? "0x%p  " : "  %p  ", data);
-                        if (desc->irq_data.domain && desc->irq_data.domain->of_node)
+                        if (desc->irq_data.domain)
-                                p = desc->irq_data.domain->of_node->full_name;
+                                p = of_node_full_name(desc->irq_data.domain->of_node);
                        else
                                p = none;
                        seq_printf(m, "%s\n", p);
@@ -761,12 +820,6 @@ static int __init irq_debugfs_init(void)
 __initcall(irq_debugfs_init);
 #endif /* CONFIG_IRQ_DOMAIN_DEBUG */
-static int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
-                                 irq_hw_number_t hwirq)
-{
-        return 0;
-}
 /**
 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
 *
@@ -829,7 +882,6 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d,
 EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
 const struct irq_domain_ops irq_domain_simple_ops = {
-        .map = irq_domain_simple_map,
        .xlate = irq_domain_xlate_onetwocell,
 };
 EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8c548232ba39..4c69326aa773 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -781,7 +781,7 @@ static void wake_threads_waitq(struct irq_desc *desc)
                wake_up(&desc->wait_for_threads);
 }
-static void irq_thread_dtor(struct task_work *unused)
+static void irq_thread_dtor(struct callback_head *unused)
 {
        struct task_struct *tsk = current;
        struct irq_desc *desc;
@@ -813,7 +813,7 @@ static void irq_thread_dtor(struct task_work *unused)
 */
 static int irq_thread(void *data)
 {
-        struct task_work on_exit_work;
+        struct callback_head on_exit_work;
        static const struct sched_param param = {
                .sched_priority = MAX_USER_RT_PRIO/2,
        };
@@ -830,7 +830,7 @@ static int irq_thread(void *data)
        sched_setscheduler(current, SCHED_FIFO, &param);
-        init_task_work(&on_exit_work, irq_thread_dtor, NULL);
+        init_task_work(&on_exit_work, irq_thread_dtor);
        task_work_add(current, &on_exit_work, false);
        while (!irq_wait_for_interrupt(action)) {
@@ -893,22 +893,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
                return -ENOSYS;
        if (!try_module_get(desc->owner))
                return -ENODEV;
-        /*
-         * Some drivers like serial.c use request_irq() heavily,
-         * so we have to be careful not to interfere with a
-         * running system.
-         */
-        if (new->flags & IRQF_SAMPLE_RANDOM) {
-                /*
-                 * This function might sleep, we want to call it first,
-                 * outside of the atomic block.
-                 * Yes, this might clear the entropy pool if the wrong
-                 * driver is attempted to be loaded, without actually
-                 * installing a new handler, but is this really a problem,
-                 * only the sysadmin is able to do this.
-                 */
-                rand_initialize_irq(irq);
-        }
        /*
         * Check whether the interrupt nests into another interrupt
@@ -960,6 +944,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
        }
        /*
+         * Drivers are often written to work w/o knowledge about the
+         * underlying irq chip implementation, so a request for a
+         * threaded irq without a primary hard irq context handler
+         * requires the ONESHOT flag to be set. Some irq chips like
+         * MSI based interrupts are per se one shot safe. Check the
+         * chip flags, so we can avoid the unmask dance at the end of
+         * the threaded handler for those.
+         */
+        if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
+                new->flags &= ~IRQF_ONESHOT;
+        /*
         * The following block of code has to be executed atomically
         */
        raw_spin_lock_irqsave(&desc->lock, flags);
@@ -1033,7 +1029,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
                 */
                new->thread_mask = 1 << ffz(thread_mask);
-        } else if (new->handler == irq_default_primary_handler) {
+        } else if (new->handler == irq_default_primary_handler &&
+                   !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
                /*
                 * The interrupt was requested with handler = NULL, so
                 * we use the default primary handler for it. But it
@@ -1354,7 +1351,6 @@ EXPORT_SYMBOL(free_irq);
 *      Flags:
 *
 *      IRQF_SHARED             Interrupt is shared
- *      IRQF_SAMPLE_RANDOM      The interrupt can be used for entropy
 *      IRQF_TRIGGER_*          Specify active edge(s) or level
 *
 */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4e2e472f6aeb..0668d58d6413 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1424,7 +1424,7 @@ static void update_vmcoreinfo_note(void)
 void crash_save_vmcoreinfo(void)
 {
-        vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
+        vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
        update_vmcoreinfo_note();
 }
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ff2c7cb86d77..6f99aead66c6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,6 +45,13 @@ extern int max_threads;
 static struct workqueue_struct *khelper_wq;
+/*
+ * kmod_thread_locker is used for deadlock avoidance.  There is no explicit
+ * locking to protect this global - it is private to the singleton khelper
+ * thread and should only ever be modified by that thread.
+ */
+static const struct task_struct *kmod_thread_locker;
 #define CAP_BSET        (void *)1
 #define CAP_PI          (void *)2
@@ -221,6 +228,13 @@ fail:
        return 0;
 }
+static int call_helper(void *data)
+{
+        /* Worker thread started blocking khelper thread. */
+        kmod_thread_locker = current;
+        return ____call_usermodehelper(data);
+}
 static void call_usermodehelper_freeinfo(struct subprocess_info *info)
 {
        if (info->cleanup)
@@ -295,9 +309,12 @@ static void __call_usermodehelper(struct work_struct *work)
        if (wait == UMH_WAIT_PROC)
                pid = kernel_thread(wait_for_helper, sub_info,
                                    CLONE_FS | CLONE_FILES | SIGCHLD);
-        else
+        else {
-                pid = kernel_thread(____call_usermodehelper, sub_info,
+                pid = kernel_thread(call_helper, sub_info,
                                    CLONE_VFORK | SIGCHLD);
+                /* Worker thread stopped blocking khelper thread. */
+                kmod_thread_locker = NULL;
+        }
        switch (wait) {
        case UMH_NO_WAIT:
@@ -548,6 +565,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
                retval = -EBUSY;
                goto out;
        }
+        /*
+         * Worker thread must not wait for khelper thread at the
+         * wait_for_completion() below if the thread was created with the
+         * CLONE_VFORK flag, because the khelper thread is already waiting
+         * for the thread at wait_for_completion() in do_fork().
+         */
+        if (wait != UMH_NO_WAIT && current == kmod_thread_locker) {
+                retval = -EBUSY;
+                goto out;
+        }
        sub_info->complete = &done;
        sub_info->wait = wait;
@@ -577,6 +604,12 @@ unlock:
        return retval;
 }
+/*
+ * call_usermodehelper_fns() will not run the caller-provided cleanup function
+ * if a memory allocation failure is experienced.  So the caller might need to
+ * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform
+ * the necessary cleanup within the caller.
+ */
 int call_usermodehelper_fns(
        char *path, char **argv, char **envp, int wait,
        int (*init)(struct subprocess_info *info, struct cred *new),
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de633702e..b579af57ea10 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -360,16 +360,12 @@ repeat:
                                        struct kthread_work, node);
                list_del_init(&work->node);
        }
+        worker->current_work = work;
        spin_unlock_irq(&worker->lock);
        if (work) {
                __set_current_state(TASK_RUNNING);
                work->func(work);
-                smp_wmb();      /* wmb worker-b0 paired with flush-b1 */
-                work->done_seq = work->queue_seq;
-                smp_mb();       /* mb worker-b1 paired with flush-b0 */
-                if (atomic_read(&work->flushing))
-                        wake_up_all(&work->done);
        } else if (!freezing(current))
                schedule();
@@ -378,6 +374,19 @@ repeat:
 }
 EXPORT_SYMBOL_GPL(kthread_worker_fn);
+/* insert @work before @pos in @worker */
+static void insert_kthread_work(struct kthread_worker *worker,
+                               struct kthread_work *work,
+                               struct list_head *pos)
+{
+        lockdep_assert_held(&worker->lock);
+        list_add_tail(&work->node, pos);
+        work->worker = worker;
+        if (likely(worker->task))
+                wake_up_process(worker->task);
+}
 /**
 * queue_kthread_work - queue a kthread_work
 * @worker: target kthread_worker
@@ -395,10 +404,7 @@ bool queue_kthread_work(struct kthread_worker *worker,
        spin_lock_irqsave(&worker->lock, flags);
        if (list_empty(&work->node)) {
-                list_add_tail(&work->node, &worker->work_list);
+                insert_kthread_work(worker, work, &worker->work_list);
-                work->queue_seq++;
-                if (likely(worker->task))
-                        wake_up_process(worker->task);
                ret = true;
        }
        spin_unlock_irqrestore(&worker->lock, flags);
@@ -406,6 +412,18 @@ bool queue_kthread_work(struct kthread_worker *worker,
 }
 EXPORT_SYMBOL_GPL(queue_kthread_work);
+struct kthread_flush_work {
+        struct kthread_work     work;
+        struct completion       done;
+};
+static void kthread_flush_work_fn(struct kthread_work *work)
+{
+        struct kthread_flush_work *fwork =
+                container_of(work, struct kthread_flush_work, work);
+        complete(&fwork->done);
+}
 /**
 * flush_kthread_work - flush a kthread_work
 * @work: work to flush
@@ -414,39 +432,37 @@ EXPORT_SYMBOL_GPL(queue_kthread_work);
 */
 void flush_kthread_work(struct kthread_work *work)
 {
-        int seq = work->queue_seq;
+        struct kthread_flush_work fwork = {
+                KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
-        atomic_inc(&work->flushing);
+                COMPLETION_INITIALIZER_ONSTACK(fwork.done),
+        };
+        struct kthread_worker *worker;
+        bool noop = false;
-        /*
+retry:
-         * mb flush-b0 paired with worker-b1, to make sure either
+        worker = work->worker;
-         * worker sees the above increment or we see done_seq update.
+        if (!worker)
-         */
+                return;
-        smp_mb__after_atomic_inc();
-        /* A - B <= 0 tests whether B is in front of A regardless of overflow */
+        spin_lock_irq(&worker->lock);
-        wait_event(work->done, seq - work->done_seq <= 0);
+        if (work->worker != worker) {
-        atomic_dec(&work->flushing);
+                spin_unlock_irq(&worker->lock);
+                goto retry;
+        }
-        /*
+        if (!list_empty(&work->node))
-         * rmb flush-b1 paired with worker-b0, to make sure our caller
+                insert_kthread_work(worker, &fwork.work, work->node.next);
-         * sees every change made by work->func().
+        else if (worker->current_work == work)
-         */
+                insert_kthread_work(worker, &fwork.work, worker->work_list.next);
-        smp_mb__after_atomic_dec();
+        else
-}
+                noop = true;
-EXPORT_SYMBOL_GPL(flush_kthread_work);
-struct kthread_flush_work {
+        spin_unlock_irq(&worker->lock);
-        struct kthread_work     work;
-        struct completion       done;
-};
-static void kthread_flush_work_fn(struct kthread_work *work)
+        if (!noop)
-{
+                wait_for_completion(&fwork.done);
-        struct kthread_flush_work *fwork =
-                container_of(work, struct kthread_flush_work, work);
-        complete(&fwork->done);
 }
+EXPORT_SYMBOL_GPL(flush_kthread_work);
 /**
 * flush_kthread_worker - flush all current works on a kthread_worker
diff --git a/kernel/panic.c b/kernel/panic.c
index d2a5f4ecc6dd..e1b2822fff97 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -75,6 +75,14 @@ void panic(const char *fmt, ...)
        int state = 0;
        /*
+         * Disable local interrupts. This will prevent panic_smp_self_stop
+         * from deadlocking the first cpu that invokes the panic, since
+         * there is nothing to prevent an interrupt handler (that runs
+         * after the panic_lock is acquired) from invoking panic again.
+         */
+        local_irq_disable();
+        /*
         * It's possible to come here directly from a panic-assertion and
         * not have preempt disabled. Some functions called from here want
         * preempt to be disabled. No point enabling it later though...
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 8f9b4eb974e0..a70518c9d82f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -175,7 +175,7 @@ config PM_TEST_SUSPEND
        You probably want to have your system's RTC driver statically
        linked, ensuring that it's available when this test runs.
-config CAN_PM_TRACE
+config PM_SLEEP_DEBUG
        def_bool y
        depends on PM_DEBUG && PM_SLEEP
@@ -196,7 +196,7 @@ config PM_TRACE
 config PM_TRACE_RTC
        bool "Suspend/resume event tracing"
-        depends on CAN_PM_TRACE
+        depends on PM_SLEEP_DEBUG
        depends on X86
        select PM_TRACE
        ---help---
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 238025f5472e..b26f5f1e773e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -5,6 +5,7 @@
 * Copyright (c) 2003 Open Source Development Lab
 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
+ * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
 *
 * This file is released under the GPLv2.
 */
@@ -45,6 +46,9 @@ enum {
        HIBERNATION_PLATFORM,
        HIBERNATION_SHUTDOWN,
        HIBERNATION_REBOOT,
+#ifdef CONFIG_SUSPEND
+        HIBERNATION_SUSPEND,
+#endif
        /* keep last */
        __HIBERNATION_AFTER_LAST
 };
@@ -353,6 +357,7 @@ int hibernation_snapshot(int platform_mode)
        }
        suspend_console();
+        ftrace_stop();
        pm_restrict_gfp_mask();
        error = dpm_suspend(PMSG_FREEZE);
@@ -378,6 +383,7 @@ int hibernation_snapshot(int platform_mode)
        if (error || !in_suspend)
                pm_restore_gfp_mask();
+        ftrace_start();
        resume_console();
        dpm_complete(msg);
@@ -480,6 +486,7 @@ int hibernation_restore(int platform_mode)
        pm_prepare_console();
        suspend_console();
+        ftrace_stop();
        pm_restrict_gfp_mask();
        error = dpm_suspend_start(PMSG_QUIESCE);
        if (!error) {
@@ -487,6 +494,7 @@ int hibernation_restore(int platform_mode)
                dpm_resume_end(PMSG_RECOVER);
        }
        pm_restore_gfp_mask();
+        ftrace_start();
        resume_console();
        pm_restore_console();
        return error;
@@ -513,6 +521,7 @@ int hibernation_platform_enter(void)
        entering_platform_hibernation = true;
        suspend_console();
+        ftrace_stop();
        error = dpm_suspend_start(PMSG_HIBERNATE);
        if (error) {
                if (hibernation_ops->recover)
@@ -556,6 +565,7 @@ int hibernation_platform_enter(void)
 Resume_devices:
        entering_platform_hibernation = false;
        dpm_resume_end(PMSG_RESTORE);
+        ftrace_start();
        resume_console();
 Close:
@@ -573,6 +583,10 @@ int hibernation_platform_enter(void)
 */
 static void power_down(void)
 {
+#ifdef CONFIG_SUSPEND
+        int error;
+#endif
        switch (hibernation_mode) {
        case HIBERNATION_REBOOT:
                kernel_restart(NULL);
@@ -582,6 +596,25 @@ static void power_down(void)
        case HIBERNATION_SHUTDOWN:
                kernel_power_off();
                break;
+#ifdef CONFIG_SUSPEND
+        case HIBERNATION_SUSPEND:
+                error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+                if (error) {
+                        if (hibernation_ops)
+                                hibernation_mode = HIBERNATION_PLATFORM;
+                        else
+                                hibernation_mode = HIBERNATION_SHUTDOWN;
+                        power_down();
+                }
+                /*
+                 * Restore swap signature.
+                 */
+                error = swsusp_unmark();
+                if (error)
+                        printk(KERN_ERR "PM: Swap will be unusable! "
+                                        "Try swapon -a.\n");
+                return;
+#endif
        }
        kernel_halt();
        /*
@@ -819,6 +852,9 @@ static const char * const hibernation_modes[] = {
        [HIBERNATION_PLATFORM]  = "platform",
        [HIBERNATION_SHUTDOWN]  = "shutdown",
        [HIBERNATION_REBOOT]    = "reboot",
+#ifdef CONFIG_SUSPEND
+        [HIBERNATION_SUSPEND]   = "suspend",
+#endif
 };
 /*
@@ -859,6 +895,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
                switch (i) {
                case HIBERNATION_SHUTDOWN:
                case HIBERNATION_REBOOT:
+#ifdef CONFIG_SUSPEND
+                case HIBERNATION_SUSPEND:
+#endif
                        break;
                case HIBERNATION_PLATFORM:
                        if (hibernation_ops)
@@ -899,6 +938,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
                switch (mode) {
                case HIBERNATION_SHUTDOWN:
                case HIBERNATION_REBOOT:
+#ifdef CONFIG_SUSPEND
+                case HIBERNATION_SUSPEND:
+#endif
                        hibernation_mode = mode;
                        break;
                case HIBERNATION_PLATFORM:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 428f8a034e96..f458238109cc 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -235,6 +235,47 @@ late_initcall(pm_debugfs_init);
 #endif /* CONFIG_PM_SLEEP */
+#ifdef CONFIG_PM_SLEEP_DEBUG
+/*
+ * pm_print_times: print time taken by devices to suspend and resume.
+ *
+ * show() returns whether printing of suspend and resume times is enabled.
+ * store() accepts 0 or 1.  0 disables printing and 1 enables it.
+ */
+bool pm_print_times_enabled;
+static ssize_t pm_print_times_show(struct kobject *kobj,
+                                   struct kobj_attribute *attr, char *buf)
+{
+        return sprintf(buf, "%d\n", pm_print_times_enabled);
+}
+static ssize_t pm_print_times_store(struct kobject *kobj,
+                                    struct kobj_attribute *attr,
+                                    const char *buf, size_t n)
+{
+        unsigned long val;
+        if (kstrtoul(buf, 10, &val))
+                return -EINVAL;
+        if (val > 1)
+                return -EINVAL;
+        pm_print_times_enabled = !!val;
+        return n;
+}
+power_attr(pm_print_times);
+static inline void pm_print_times_init(void)
+{
+        pm_print_times_enabled = !!initcall_debug;
+}
+#else /* !CONFIG_PM_SLEEP_DEBUG */
+static inline void pm_print_times_init(void) {}
+#endif /* CONFIG_PM_SLEEP_DEBUG */
 struct kobject *power_kobj;
 /**
@@ -531,6 +572,9 @@ static struct attribute * g[] = {
 #ifdef CONFIG_PM_DEBUG
        &pm_test_attr.attr,
 #endif
+#ifdef CONFIG_PM_SLEEP_DEBUG
+        &pm_print_times_attr.attr,
+#endif
 #endif
        NULL,
 };
@@ -566,6 +610,7 @@ static int __init pm_init(void)
        error = sysfs_create_group(power_kobj, &attr_group);
        if (error)
                return error;
+        pm_print_times_init();
        return pm_autosleep_init();
 }
diff --git a/kernel/power/power.h b/kernel/power/power.h
index b0bd4beaebfe..7d4b7ffb3c1d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -156,6 +156,9 @@ extern void swsusp_free(void);
 extern int swsusp_read(unsigned int *flags_p);
 extern int swsusp_write(unsigned int flags);
 extern void swsusp_close(fmode_t);
+#ifdef CONFIG_SUSPEND
+extern int swsusp_unmark(void);
+#endif
 /* kernel/power/block_io.c */
 extern struct block_device *hib_resume_bdev;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 396d262b8fd0..1da39ea248fd 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/suspend.h>
 #include <linux/syscore_ops.h>
+#include <linux/ftrace.h>
 #include <trace/events/power.h>
 #include "power.h"
@@ -177,6 +178,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
        arch_suspend_enable_irqs();
        BUG_ON(irqs_disabled());
+        /* Kick the lockup detector */
+        lockup_detector_bootcpu_resume();
 Enable_cpus:
        enable_nonboot_cpus();
@@ -212,6 +216,7 @@ int suspend_devices_and_enter(suspend_state_t state)
                        goto Close;
        }
        suspend_console();
+        ftrace_stop();
        suspend_test_start();
        error = dpm_suspend_start(PMSG_SUSPEND);
        if (error) {
@@ -231,6 +236,7 @@ int suspend_devices_and_enter(suspend_state_t state)
        suspend_test_start();
        dpm_resume_end(PMSG_RESUME);
        suspend_test_finish("resume devices");
+        ftrace_start();
        resume_console();
 Close:
        if (suspend_ops->end)
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11e22c068e8b..3c9d764eb0d8 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -448,9 +448,9 @@ static int save_image(struct swap_map_handle *handle,
        struct timeval start;
        struct timeval stop;
-        printk(KERN_INFO "PM: Saving image data pages (%u pages) ...     ",
+        printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
                nr_to_write);
-        m = nr_to_write / 100;
+        m = nr_to_write / 10;
        if (!m)
                m = 1;
        nr_pages = 0;
@@ -464,7 +464,8 @@ static int save_image(struct swap_map_handle *handle,
                if (ret)
                        break;
                if (!(nr_pages % m))
-                        printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
+                        printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
+                               nr_pages / m * 10);
                nr_pages++;
        }
        err2 = hib_wait_on_bio_chain(&bio);
@@ -472,9 +473,7 @@ static int save_image(struct swap_map_handle *handle,
        if (!ret)
                ret = err2;
        if (!ret)
-                printk(KERN_CONT "\b\b\b\bdone\n");
+                printk(KERN_INFO "PM: Image saving done.\n");
-        else
-                printk(KERN_CONT "\n");
        swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
        return ret;
 }
@@ -668,9 +667,9 @@ static int save_image_lzo(struct swap_map_handle *handle,
        printk(KERN_INFO
                "PM: Using %u thread(s) for compression.\n"
-                "PM: Compressing and saving image data (%u pages) ...     ",
+                "PM: Compressing and saving image data (%u pages)...\n",
                nr_threads, nr_to_write);
-        m = nr_to_write / 100;
+        m = nr_to_write / 10;
        if (!m)
                m = 1;
        nr_pages = 0;
@@ -690,8 +689,10 @@ static int save_image_lzo(struct swap_map_handle *handle,
                                       data_of(*snapshot), PAGE_SIZE);
                                if (!(nr_pages % m))
-                                        printk(KERN_CONT "\b\b\b\b%3d%%",
+                                        printk(KERN_INFO
-                                               nr_pages / m);
+                                               "PM: Image saving progress: "
+                                               "%3d%%\n",
+                                               nr_pages / m * 10);
                                nr_pages++;
                        }
                        if (!off)
@@ -761,11 +762,8 @@ out_finish:
        do_gettimeofday(&stop);
        if (!ret)
                ret = err2;
-        if (!ret) {
+        if (!ret)
-                printk(KERN_CONT "\b\b\b\bdone\n");
+                printk(KERN_INFO "PM: Image saving done.\n");
-        } else {
-                printk(KERN_CONT "\n");
-        }
        swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
 out_clean:
        if (crc) {
@@ -973,9 +971,9 @@ static int load_image(struct swap_map_handle *handle,
        int err2;
        unsigned nr_pages;
-        printk(KERN_INFO "PM: Loading image data pages (%u pages) ...     ",
+        printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
                nr_to_read);
-        m = nr_to_read / 100;
+        m = nr_to_read / 10;
        if (!m)
                m = 1;
        nr_pages = 0;
@@ -993,7 +991,8 @@ static int load_image(struct swap_map_handle *handle,
                if (ret)
                        break;
                if (!(nr_pages % m))
-                        printk("\b\b\b\b%3d%%", nr_pages / m);
+                        printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
+                               nr_pages / m * 10);
                nr_pages++;
        }
        err2 = hib_wait_on_bio_chain(&bio);
@@ -1001,12 +1000,11 @@ static int load_image(struct swap_map_handle *handle,
        if (!ret)
                ret = err2;
        if (!ret) {
-                printk("\b\b\b\bdone\n");
+                printk(KERN_INFO "PM: Image loading done.\n");
                snapshot_write_finalize(snapshot);
                if (!snapshot_image_loaded(snapshot))
                        ret = -ENODATA;
-        } else
+        }
-                printk("\n");
        swsusp_show_speed(&start, &stop, nr_to_read, "Read");
        return ret;
 }
@@ -1185,9 +1183,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
        printk(KERN_INFO
                "PM: Using %u thread(s) for decompression.\n"
-                "PM: Loading and decompressing image data (%u pages) ...     ",
+                "PM: Loading and decompressing image data (%u pages)...\n",
                nr_threads, nr_to_read);
-        m = nr_to_read / 100;
+        m = nr_to_read / 10;
        if (!m)
                m = 1;
        nr_pages = 0;
@@ -1319,7 +1317,10 @@ static int load_image_lzo(struct swap_map_handle *handle,
                                       data[thr].unc + off, PAGE_SIZE);
                                if (!(nr_pages % m))
-                                        printk("\b\b\b\b%3d%%", nr_pages / m);
+                                        printk(KERN_INFO
+                                               "PM: Image loading progress: "
+                                               "%3d%%\n",
+                                               nr_pages / m * 10);
                                nr_pages++;
                                ret = snapshot_write_next(snapshot);
@@ -1344,7 +1345,7 @@ out_finish:
        }
        do_gettimeofday(&stop);
        if (!ret) {
-                printk("\b\b\b\bdone\n");
+                printk(KERN_INFO "PM: Image loading done.\n");
                snapshot_write_finalize(snapshot);
                if (!snapshot_image_loaded(snapshot))
                        ret = -ENODATA;
@@ -1357,8 +1358,7 @@ out_finish:
                                }
                        }
                }
-        } else
+        }
-                printk("\n");
        swsusp_show_speed(&start, &stop, nr_to_read, "Read");
 out_clean:
        for (i = 0; i < ring_size; i++)
@@ -1472,6 +1472,34 @@ void swsusp_close(fmode_t mode)
        blkdev_put(hib_resume_bdev, mode);
 }
+/**
+ *      swsusp_unmark - Unmark swsusp signature in the resume device
+ */
+#ifdef CONFIG_SUSPEND
+int swsusp_unmark(void)
+{
+        int error;
+        hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+        if (!memcmp(HIBERNATE_SIG,swsusp_header->sig, 10)) {
+                memcpy(swsusp_header->sig,swsusp_header->orig_sig, 10);
+                error = hib_bio_write_page(swsusp_resume_block,
+                                        swsusp_header, NULL);
+        } else {
+                printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
+                error = -ENODEV;
+        }
+        /*
+         * We just returned from suspend, we don't need the image any more.
+         */
+        free_all_swap_pages(root_swap);
+        return error;
+}
+#endif
 static int swsusp_header_init(void)
 {
        swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index c8fba3380076..8f50de394d22 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -9,6 +9,7 @@
 * manipulate wakelocks on Android.
 */
+#include <linux/capability.h>
 #include <linux/ctype.h>
 #include <linux/device.h>
 #include <linux/err.h>
@@ -188,6 +189,9 @@ int pm_wake_lock(const char *buf)
        size_t len;
        int ret = 0;
+        if (!capable(CAP_BLOCK_SUSPEND))
+                return -EPERM;
        while (*str && !isspace(*str))
                str++;
@@ -231,6 +235,9 @@ int pm_wake_unlock(const char *buf)
        size_t len;
        int ret = 0;
+        if (!capable(CAP_BLOCK_SUSPEND))
+                return -EPERM;
        len = strlen(buf);
        if (!len)
                return -EINVAL;
diff --git a/kernel/printk.c b/kernel/printk.c
index ac4bc9e79465..6a76ab9d4476 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -216,6 +216,7 @@ struct log {
 */
 static DEFINE_RAW_SPINLOCK(logbuf_lock);
+#ifdef CONFIG_PRINTK
 /* the next printk record to read by syslog(READ) or /proc/kmsg */
 static u64 syslog_seq;
 static u32 syslog_idx;
@@ -228,14 +229,19 @@ static u32 log_first_idx;
 /* index and sequence number of the next record to store in the buffer */
 static u64 log_next_seq;
-#ifdef CONFIG_PRINTK
 static u32 log_next_idx;
+/* the next printk record to write to the console */
+static u64 console_seq;
+static u32 console_idx;
+static enum log_flags console_prev;
 /* the next printk record to read after the last 'clear' command */
 static u64 clear_seq;
 static u32 clear_idx;
-#define LOG_LINE_MAX 1024
+#define PREFIX_MAX              32
+#define LOG_LINE_MAX            (1024 - PREFIX_MAX)     /* parenthesized: safe inside larger expressions */
 /* record buffer */
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -360,6 +366,7 @@ static void log_store(int facility, int level,
 struct devkmsg_user {
        u64 seq;
        u32 idx;
+        enum log_flags prev;
        struct mutex lock;
        char buf[8192];
 };
@@ -382,8 +389,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
        line = buf;
        for (i = 0; i < count; i++) {
-                if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len))
+                if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
+                        ret = -EFAULT;
                        goto out;
+                }
                line += iv[i].iov_len;
        }
@@ -425,6 +434,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
        struct log *msg;
        u64 ts_usec;
        size_t i;
+        char cont = '-';
        size_t len;
        ssize_t ret;
@@ -462,8 +472,25 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
        msg = log_from_idx(user->idx);
        ts_usec = msg->ts_nsec;
        do_div(ts_usec, 1000);
-        len = sprintf(user->buf, "%u,%llu,%llu;",
-                      (msg->facility << 3) | msg->level, user->seq, ts_usec);
+        /*
+         * If we couldn't merge continuation line fragments during the print,
+         * export the stored flags to allow an optional external merge of the
+         * records. Merging the records isn't always necessarily correct, like
+         * when we hit a race during printing. In most cases though, it produces
+         * better readable output. 'c' in the record flags mark the first
+         * fragment of a line, '+' the following.
+         */
+        if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT))
+                cont = 'c';
+        else if ((msg->flags & LOG_CONT) ||
+                 ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)))
+                cont = '+';
+        len = sprintf(user->buf, "%u,%llu,%llu,%c;",
+                      (msg->facility << 3) | msg->level,
+                      user->seq, ts_usec, cont);
+        user->prev = msg->flags;
        /* escape non-printable characters */
        for (i = 0; i < msg->text_len; i++) {
@@ -646,6 +673,15 @@ void log_buf_kexec_setup(void)
        VMCOREINFO_SYMBOL(log_buf_len);
        VMCOREINFO_SYMBOL(log_first_idx);
        VMCOREINFO_SYMBOL(log_next_idx);
+        /*
+         * Export struct log size and field offsets. User space tools can
+         * parse it and detect any changes to structure down the line.
+         */
+        VMCOREINFO_STRUCT_SIZE(log);
+        VMCOREINFO_OFFSET(log, ts_nsec);
+        VMCOREINFO_OFFSET(log, len);
+        VMCOREINFO_OFFSET(log, text_len);
+        VMCOREINFO_OFFSET(log, dict_len);
 }
 #endif
@@ -876,7 +912,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,
                if (buf) {
                        if (print_prefix(msg, syslog, NULL) +
-                            text_len + 1>= size - len)
+                            text_len + 1 >= size - len)
                                break;
                        if (prefix)
@@ -907,7 +943,7 @@ static int syslog_print(char __user *buf, int size)
        struct log *msg;
        int len = 0;
-        text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
+        text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
        if (!text)
                return -ENOMEM;
@@ -930,7 +966,8 @@ static int syslog_print(char __user *buf, int size)
                skip = syslog_partial;
                msg = log_from_idx(syslog_idx);
-                n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX);
+                n = msg_print_text(msg, syslog_prev, true, text,
+                                   LOG_LINE_MAX + PREFIX_MAX);
                if (n - syslog_partial <= size) {
                        /* message fits into buffer, move forward */
                        syslog_idx = log_next(syslog_idx);
@@ -969,7 +1006,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
        char *text;
        int len = 0;
-        text = kmalloc(LOG_LINE_MAX, GFP_KERNEL);
+        text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
        if (!text)
                return -ENOMEM;
@@ -1022,7 +1059,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
                        struct log *msg = log_from_idx(idx);
                        int textlen;
-                        textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX);
+                        textlen = msg_print_text(msg, prev, true, text,
+                                                 LOG_LINE_MAX + PREFIX_MAX);
                        if (textlen < 0) {
                                len = textlen;
                                break;
@@ -1349,20 +1387,36 @@ static struct cont {
        u64 ts_nsec;                    /* time of first print */
        u8 level;                       /* log level of first message */
        u8 facility;                    /* log facility of first message */
+        enum log_flags flags;           /* prefix, newline flags */
        bool flushed:1;                 /* buffer sealed and committed */
 } cont;
-static void cont_flush(void)
+static void cont_flush(enum log_flags flags)
 {
        if (cont.flushed)
                return;
        if (cont.len == 0)
                return;
-        log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec,
+        if (cont.cons) {
-                  NULL, 0, cont.buf, cont.len);
+                /*
+                 * If a fragment of this line was directly flushed to the
-        cont.flushed = true;
+                 * console; wait for the console to pick up the rest of the
+                 * line. LOG_NOCONS suppresses a duplicated output.
+                 */
+                log_store(cont.facility, cont.level, flags | LOG_NOCONS,
+                          cont.ts_nsec, NULL, 0, cont.buf, cont.len);
+                cont.flags = flags;
+                cont.flushed = true;
+        } else {
+                /*
+                 * If no fragment of this line ever reached the console,
+                 * just submit it to the store and free the buffer.
+                 */
+                log_store(cont.facility, cont.level, flags, 0,
+                          NULL, 0, cont.buf, cont.len);
+                cont.len = 0;
+        }
 }
 static bool cont_add(int facility, int level, const char *text, size_t len)
@@ -1371,7 +1425,8 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
                return false;
        if (cont.len + len > sizeof(cont.buf)) {
-                cont_flush();
+                /* the line gets too long, split it up in separate records */
+                cont_flush(LOG_CONT);
                return false;
        }
@@ -1380,12 +1435,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len)
                cont.level = level;
                cont.owner = current;
                cont.ts_nsec = local_clock();
+                cont.flags = 0;
                cont.cons = 0;
                cont.flushed = false;
        }
        memcpy(cont.buf + cont.len, text, len);
        cont.len += len;
+        if (cont.len > (sizeof(cont.buf) * 80) / 100)
+                cont_flush(LOG_CONT);
        return true;
 }
@@ -1394,7 +1454,7 @@ static size_t cont_print_text(char *text, size_t size)
        size_t textlen = 0;
        size_t len;
-        if (cont.cons == 0) {
+        if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
                textlen += print_time(cont.ts_nsec, text);
                size -= textlen;
        }
@@ -1409,7 +1469,8 @@ static size_t cont_print_text(char *text, size_t size)
        }
        if (cont.flushed) {
-                text[textlen++] = '\n';
+                if (cont.flags & LOG_NEWLINE)
+                        text[textlen++] = '\n';
                /* got everything, release buffer */
                cont.len = 0;
        }
@@ -1481,17 +1542,23 @@ asmlinkage int vprintk_emit(int facility, int level,
                lflags |= LOG_NEWLINE;
        }
-        /* strip syslog prefix and extract log level or control flags */
+        /* strip kernel syslog prefix and extract log level or control flags */
-        if (text[0] == '<' && text[1] && text[2] == '>') {
+        if (facility == 0) {
-                switch (text[1]) {
+                int kern_level = printk_get_level(text);
-                case '0' ... '7':
-                        if (level == -1)
+                if (kern_level) {
-                                level = text[1] - '0';
+                        const char *end_of_header = printk_skip_level(text);
-                case 'd':       /* KERN_DEFAULT */
+                        switch (kern_level) {
-                        lflags |= LOG_PREFIX;
+                        case '0' ... '7':
-                case 'c':       /* KERN_CONT */
+                                if (level == -1)
-                        text += 3;
+                                        level = kern_level - '0';
-                        text_len -= 3;
+                        case 'd':       /* KERN_DEFAULT */
+                                lflags |= LOG_PREFIX;
+                        case 'c':       /* KERN_CONT */
+                                break;
+                        }
+                        text_len -= end_of_header - text;
+                        text = (char *)end_of_header;
                }
        }
@@ -1507,7 +1574,7 @@ asmlinkage int vprintk_emit(int facility, int level,
                 * or another task also prints continuation lines.
                 */
                if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
-                        cont_flush();
+                        cont_flush(LOG_NEWLINE);
                /* buffer line if possible, otherwise store it right away */
                if (!cont_add(facility, level, text, text_len))
@@ -1525,7 +1592,7 @@ asmlinkage int vprintk_emit(int facility, int level,
                if (cont.len && cont.owner == current) {
                        if (!(lflags & LOG_PREFIX))
                                stored = cont_add(facility, level, text, text_len);
-                        cont_flush();
+                        cont_flush(LOG_NEWLINE);
                }
                if (!stored)
@@ -1616,9 +1683,20 @@ asmlinkage int printk(const char *fmt, ...)
 }
 EXPORT_SYMBOL(printk);
-#else
+#else /* CONFIG_PRINTK */
+#define LOG_LINE_MAX            0
+#define PREFIX_MAX              0
 #define LOG_LINE_MAX 0
+static u64 syslog_seq;
+static u32 syslog_idx;
+static u64 console_seq;
+static u32 console_idx;
+static enum log_flags syslog_prev;
+static u64 log_first_seq;
+static u32 log_first_idx;
+static u64 log_next_seq;
+static enum log_flags console_prev;
 static struct cont {
        size_t len;
        size_t cons;
@@ -1902,10 +1980,34 @@ void wake_up_klogd(void)
                this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
 }
-/* the next printk record to write to the console */
+static void console_cont_flush(char *text, size_t size)
-static u64 console_seq;
+{
-static u32 console_idx;
+        unsigned long flags;
-static enum log_flags console_prev;
+        size_t len;
+        raw_spin_lock_irqsave(&logbuf_lock, flags);
+        if (!cont.len)
+                goto out;
+        /*
+         * We still queue earlier records, likely because the console was
+         * busy. The earlier ones need to be printed before this one, we
+         * did not flush any fragment so far, so just let it queue up.
+         */
+        if (console_seq < log_next_seq && !cont.cons)
+                goto out;
+        len = cont_print_text(text, size);
+        raw_spin_unlock(&logbuf_lock);
+        stop_critical_timings();
+        call_console_drivers(cont.level, text, len);
+        start_critical_timings();
+        local_irq_restore(flags);
+        return;
+out:
+        raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+}
 /**
 * console_unlock - unlock the console system
@@ -1923,7 +2025,7 @@ static enum log_flags console_prev;
 */
 void console_unlock(void)
 {
-        static char text[LOG_LINE_MAX];
+        static char text[LOG_LINE_MAX + PREFIX_MAX];
        static u64 seen_seq;
        unsigned long flags;
        bool wake_klogd = false;
@@ -1937,19 +2039,7 @@ void console_unlock(void)
        console_may_schedule = 0;
        /* flush buffered message fragment immediately to console */
-        raw_spin_lock_irqsave(&logbuf_lock, flags);
+        console_cont_flush(text, sizeof(text));
-        if (cont.len && (cont.cons < cont.len || cont.flushed)) {
-                size_t len;
-                len = cont_print_text(text, sizeof(text));
-                raw_spin_unlock(&logbuf_lock);
-                stop_critical_timings();
-                call_console_drivers(cont.level, text, len);
-                start_critical_timings();
-                local_irq_restore(flags);
-        } else
-                raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 again:
        for (;;) {
                struct log *msg;
@@ -1986,6 +2076,7 @@ skip:
                         * will properly dump everything later.
                         */
                        msg->flags &= ~LOG_NOCONS;
+                        console_prev = msg->flags;
                        goto skip;
                }
diff --git a/kernel/resource.c b/kernel/resource.c
index e1d2b8ee76d5..34d45886ee84 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,6 +7,8 @@
 * Arbitrary resource management.
 */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 #include <linux/export.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>
@@ -722,14 +724,12 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
        write_lock(&resource_lock);
+        if (!parent)
+                goto skip;
        if ((start < parent->start) || (end > parent->end))
                goto out;
-        for (tmp = res->child; tmp; tmp = tmp->sibling) {
-                if ((tmp->start < start) || (tmp->end > end))
-                        goto out;
-        }
        if (res->sibling && (res->sibling->start <= end))
                goto out;
@@ -741,6 +741,11 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
                        goto out;
        }
+skip:
+        for (tmp = res->child; tmp; tmp = tmp->sibling)
+                if ((tmp->start < start) || (tmp->end > end))
+                        goto out;
        res->start = start;
        res->end = end;
        result = 0;
@@ -788,8 +793,28 @@ void __init reserve_region_with_split(struct resource *root,
                resource_size_t start, resource_size_t end,
                const char *name)
 {
+        int abort = 0;
        write_lock(&resource_lock);
-        __reserve_region_with_split(root, start, end, name);
+        if (root->start > start || root->end < end) {
+                pr_err("requested range [0x%llx-0x%llx] not in root %pr\n",
+                       (unsigned long long)start, (unsigned long long)end,
+                       root);
+                if (start > root->end || end < root->start)
+                        abort = 1;
+                else {
+                        if (end > root->end)
+                                end = root->end;
+                        if (start < root->start)
+                                start = root->start;
+                        pr_err("fixing request to [0x%llx-0x%llx]\n",
+                               (unsigned long long)start,
+                               (unsigned long long)end);
+                }
+                dump_stack();
+        }
+        if (!abort)
+                __reserve_region_with_split(root, start, end, name);
        write_unlock(&resource_lock);
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ad732b56ba70..82ad284f823b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
         *
         * sched_move_task() holds both and thus holding either pins the cgroup,
-         * see set_task_rq().
+         * see task_group().
         *
         * Furthermore, all task_rq users should acquire both locks, see
         * task_rq_lock().
@@ -4340,9 +4340,7 @@ recheck:
         */
        if (unlikely(policy == p->policy && (!rt_policy(policy) ||
                        param->sched_priority == p->rt_priority))) {
+                task_rq_unlock(rq, p, &flags);
-                __task_rq_unlock(rq);
-                raw_spin_unlock_irqrestore(&p->pi_lock, flags);
                return 0;
        }
@@ -6024,6 +6022,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
 * allows us to avoid some pointer chasing select_idle_sibling().
 *
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy.  Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
 * Also keep a unique ID per domain (we use the first cpu number in
 * the cpumask of the domain), this allows us to quickly tell if
 * two cpus are in the same cache domain, see cpus_share_cache().
@@ -6037,8 +6040,40 @@ static void update_top_cache_domain(int cpu)
        int id = cpu;
        sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-        if (sd)
+        if (sd) {
+                struct sched_domain *tmp = sd;
+                struct sched_group *sg, *prev;
+                bool right;
+                /*
+                 * Traverse to first CPU in group, and count hops
+                 * to cpu from there, switching direction on each
+                 * hop, never ever pointing the last CPU rightward.
+                 */
+                do {
+                        id = cpumask_first(sched_domain_span(tmp));
+                        prev = sg = tmp->groups;
+                        right = 1;
+                        while (cpumask_first(sched_group_cpus(sg)) != id)
+                                sg = sg->next;
+                        while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+                                prev = sg;
+                                sg = sg->next;
+                                right = !right;
+                        }
+                        /* A CPU went down, never point back to domain start. */
+                        if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+                                right = false;
+                        sg = right ? sg->next : prev;
+                        tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+                } while ((tmp = tmp->child));
                id = cpumask_first(sched_domain_span(sd));
+        }
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_id, cpu) = id;
@@ -7097,34 +7132,66 @@ match2:
        mutex_unlock(&sched_domains_mutex);
 }
+static int num_cpus_frozen;     /* used to mark begin/end of suspend/resume */
 /*
 * Update cpusets according to cpu_active mask.  If cpusets are
 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
 * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
 */
 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
                             void *hcpu)
 {
-        switch (action & ~CPU_TASKS_FROZEN) {
+        switch (action) {
+        case CPU_ONLINE_FROZEN:
+        case CPU_DOWN_FAILED_FROZEN:
+                /*
+                 * num_cpus_frozen tracks how many CPUs are involved in suspend
+                 * resume sequence. As long as this is not the last online
+                 * operation in the resume sequence, just build a single sched
+                 * domain, ignoring cpusets.
+                 */
+                num_cpus_frozen--;
+                if (likely(num_cpus_frozen)) {
+                        partition_sched_domains(1, NULL, NULL);
+                        break;
+                }
+                /*
+                 * This is the last CPU online operation. So fall through and
+                 * restore the original sched domains by considering the
+                 * cpuset configurations.
+                 */
        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
-                cpuset_update_active_cpus();
+                cpuset_update_active_cpus(true);
-                return NOTIFY_OK;
+                break;
        default:
                return NOTIFY_DONE;
        }
+        return NOTIFY_OK;
 }
 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
                               void *hcpu)
 {
-        switch (action & ~CPU_TASKS_FROZEN) {
+        switch (action) {
        case CPU_DOWN_PREPARE:
-                cpuset_update_active_cpus();
+                cpuset_update_active_cpus(false);
-                return NOTIFY_OK;
+                break;
+        case CPU_DOWN_PREPARE_FROZEN:
+                num_cpus_frozen++;
+                partition_sched_domains(1, NULL, NULL);
+                break;
        default:
                return NOTIFY_DONE;
        }
+        return NOTIFY_OK;
 }
 void __init sched_init_smp(void)
@@ -7589,6 +7656,7 @@ void sched_destroy_group(struct task_group *tg)
 */
 void sched_move_task(struct task_struct *tsk)
 {
+        struct task_group *tg;
        int on_rq, running;
        unsigned long flags;
        struct rq *rq;
@@ -7603,6 +7671,12 @@ void sched_move_task(struct task_struct *tsk)
        if (unlikely(running))
                tsk->sched_class->put_prev_task(rq, tsk);
+        tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+                                lockdep_is_held(&tsk->sighand->siglock)),
+                          struct task_group, css);
+        tg = autogroup_task_group(tsk, tg);
+        tsk->sched_task_group = tg;
 #ifdef CONFIG_FAIR_GROUP_SCHED
        if (tsk->sched_class->task_move_group)
                tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d72586fdf660..23aa789c53ee 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -65,8 +65,8 @@ static int convert_prio(int prio)
 int cpupri_find(struct cpupri *cp, struct task_struct *p,
                struct cpumask *lowest_mask)
 {
-        int                  idx      = 0;
+        int idx = 0;
-        int                  task_pri = convert_prio(p->prio);
+        int task_pri = convert_prio(p->prio);
        if (task_pri >= MAX_RT_PRIO)
                return 0;
@@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
 */
 void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 {
-        int                 *currpri = &cp->cpu_to_pri[cpu];
+        int *currpri = &cp->cpu_to_pri[cpu];
-        int                  oldpri  = *currpri;
+        int oldpri = *currpri;
-        int                  do_mb = 0;
+        int do_mb = 0;
        newpri = convert_prio(newpri);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c099cc6eebe3..d0cc03b3e70b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
        int cpu = smp_processor_id();
        int prev_cpu = task_cpu(p);
        struct sched_domain *sd;
-        struct sched_group *sg;
-        int i;
        /*
         * If the task is going to be woken-up on this cpu and if it is
@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
                return prev_cpu;
        /*
-         * Otherwise, iterate the domains and find an elegible idle cpu.
+         * Otherwise, check assigned siblings to find an eligible idle cpu.
         */
        sd = rcu_dereference(per_cpu(sd_llc, target));
-        for_each_lower_domain(sd) {
-                sg = sd->groups;
-                do {
-                        if (!cpumask_intersects(sched_group_cpus(sg),
-                                                tsk_cpus_allowed(p)))
-                                goto next;
-                        for_each_cpu(i, sched_group_cpus(sg)) {
-                                if (!idle_cpu(i))
-                                        goto next;
-                        }
-                        target = cpumask_first_and(sched_group_cpus(sg),
+        for_each_lower_domain(sd) {
-                                        tsk_cpus_allowed(p));
+                if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
-                        goto done;
+                        continue;
-next:
+                if (idle_cpu(sd->idle_buddy))
-                        sg = sg->next;
+                        return sd->idle_buddy;
-                } while (sg != sd->groups);
        }
-done:
        return target;
 }
@@ -3068,18 +3054,24 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 #define LBF_ALL_PINNED  0x01
 #define LBF_NEED_BREAK  0x02
+#define LBF_SOME_PINNED 0x04
 struct lb_env {
        struct sched_domain     *sd;
-        int                     src_cpu;
        struct rq               *src_rq;
+        int                     src_cpu;
        int                     dst_cpu;
        struct rq               *dst_rq;
+        struct cpumask          *dst_grpmask;
+        int                     new_dst_cpu;
        enum cpu_idle_type      idle;
        long                    imbalance;
+        /* The set of CPUs under consideration for load-balancing */
+        struct cpumask          *cpus;
        unsigned int            flags;
        unsigned int            loop;
@@ -3145,9 +3137,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
         * 3) are cache-hot on their current CPU.
         */
        if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+                int new_dst_cpu;
                schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+                /*
+                 * Remember if this task can be migrated to any other cpu in
+                 * our sched_group. We may want to revisit it if we couldn't
+                 * meet load balance goals by pulling other tasks on src_cpu.
+                 *
+                 * Also avoid computing new_dst_cpu if we have already computed
+                 * one in current iteration.
+                 */
+                if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+                        return 0;
+                new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+                                                tsk_cpus_allowed(p));
+                if (new_dst_cpu < nr_cpu_ids) {
+                        env->flags |= LBF_SOME_PINNED;
+                        env->new_dst_cpu = new_dst_cpu;
+                }
                return 0;
        }
+        /* Record that we found at least one task that could run on dst_cpu */
        env->flags &= ~LBF_ALL_PINNED;
        if (task_running(env->src_rq, p)) {
@@ -3642,8 +3656,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 */
 static inline void update_sg_lb_stats(struct lb_env *env,
                        struct sched_group *group, int load_idx,
-                        int local_group, const struct cpumask *cpus,
+                        int local_group, int *balance, struct sg_lb_stats *sgs)
-                        int *balance, struct sg_lb_stats *sgs)
 {
        unsigned long nr_running, max_nr_running, min_nr_running;
        unsigned long load, max_cpu_load, min_cpu_load;
@@ -3660,7 +3673,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
        max_nr_running = 0;
        min_nr_running = ~0UL;
-        for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+        for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
                struct rq *rq = cpu_rq(i);
                nr_running = rq->nr_running;
@@ -3789,8 +3802,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 * @sds: variable to hold the statistics for this sched_domain.
 */
 static inline void update_sd_lb_stats(struct lb_env *env,
-                                      const struct cpumask *cpus,
+                                        int *balance, struct sd_lb_stats *sds)
-                                      int *balance, struct sd_lb_stats *sds)
 {
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
@@ -3807,8 +3819,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
                local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
                memset(&sgs, 0, sizeof(sgs));
-                update_sg_lb_stats(env, sg, load_idx, local_group,
+                update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
-                                   cpus, balance, &sgs);
                if (local_group && !(*balance))
                        return;
@@ -4044,7 +4055,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 * to restore balance.
 *
 * @env: The load balancing environment.
- * @cpus: The set of CPUs under consideration for load-balancing.
 * @balance: Pointer to a variable indicating if this_cpu
 *      is the appropriate cpu to perform load balancing at this_level.
 *
@@ -4054,7 +4064,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 *                 put to idle by rebalancing its tasks onto our group.
 */
 static struct sched_group *
-find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
+find_busiest_group(struct lb_env *env, int *balance)
 {
        struct sd_lb_stats sds;
@@ -4064,7 +4074,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
         * Compute the various statistics relavent for load balancing at
         * this level.
         */
-        update_sd_lb_stats(env, cpus, balance, &sds);
+        update_sd_lb_stats(env, balance, &sds);
        /*
         * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4144,8 +4154,7 @@ ret:
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
 static struct rq *find_busiest_queue(struct lb_env *env,
-                                     struct sched_group *group,
+                                     struct sched_group *group)
-                                     const struct cpumask *cpus)
 {
        struct rq *busiest = NULL, *rq;
        unsigned long max_load = 0;
@@ -4160,7 +4169,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
                if (!capacity)
                        capacity = fix_small_capacity(env->sd, group);
-                if (!cpumask_test_cpu(i, cpus))
+                if (!cpumask_test_cpu(i, env->cpus))
                        continue;
                rq = cpu_rq(i);
@@ -4227,7 +4236,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
                        int *balance)
 {
-        int ld_moved, active_balance = 0;
+        int ld_moved, cur_ld_moved, active_balance = 0;
+        int lb_iterations, max_lb_iterations;
        struct sched_group *group;
        struct rq *busiest;
        unsigned long flags;
@@ -4237,16 +4247,19 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                .sd             = sd,
                .dst_cpu        = this_cpu,
                .dst_rq         = this_rq,
+                .dst_grpmask    = sched_group_cpus(sd->groups),
                .idle           = idle,
                .loop_break     = sched_nr_migrate_break,
+                .cpus           = cpus,
        };
        cpumask_copy(cpus, cpu_active_mask);
+        max_lb_iterations = cpumask_weight(env.dst_grpmask);
        schedstat_inc(sd, lb_count[idle]);
 redo:
-        group = find_busiest_group(&env, cpus, balance);
+        group = find_busiest_group(&env, balance);
        if (*balance == 0)
                goto out_balanced;
@@ -4256,7 +4269,7 @@ redo:
                goto out_balanced;
        }
-        busiest = find_busiest_queue(&env, group, cpus);
+        busiest = find_busiest_queue(&env, group);
        if (!busiest) {
                schedstat_inc(sd, lb_nobusyq[idle]);
                goto out_balanced;
@@ -4267,6 +4280,7 @@ redo:
        schedstat_add(sd, lb_imbalance[idle], env.imbalance);
        ld_moved = 0;
+        lb_iterations = 1;
        if (busiest->nr_running > 1) {
                /*
                 * Attempt to move tasks. If find_busiest_group has found
@@ -4284,7 +4298,13 @@ more_balance:
                double_rq_lock(this_rq, busiest);
                if (!env.loop)
                        update_h_load(env.src_cpu);
-                ld_moved += move_tasks(&env);
+                /*
+                 * cur_ld_moved - load moved in current iteration
+                 * ld_moved     - cumulative load moved across iterations
+                 */
+                cur_ld_moved = move_tasks(&env);
+                ld_moved += cur_ld_moved;
                double_rq_unlock(this_rq, busiest);
                local_irq_restore(flags);
@@ -4296,14 +4316,52 @@ more_balance:
                /*
                 * some other cpu did the load balance for us.
                 */
-                if (ld_moved && this_cpu != smp_processor_id())
+                if (cur_ld_moved && env.dst_cpu != smp_processor_id())
-                        resched_cpu(this_cpu);
+                        resched_cpu(env.dst_cpu);
+                /*
+                 * Revisit (affine) tasks on src_cpu that couldn't be moved to
+                 * us and move them to an alternate dst_cpu in our sched_group
+                 * where they can run. The upper limit on how many times we
+                 * iterate on same src_cpu is dependent on number of cpus in our
+                 * sched_group.
+                 *
+                 * This changes load balance semantics a bit on who can move
+                 * load to a given_cpu. In addition to the given_cpu itself
+                 * (or a ilb_cpu acting on its behalf where given_cpu is
+                 * nohz-idle), we now have balance_cpu in a position to move
+                 * load to given_cpu. In rare situations, this may cause
+                 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+                 * _independently_ and at _same_ time to move some load to
+                 * given_cpu) causing excess load to be moved to given_cpu.
+                 * This however should not happen so much in practice and
+                 * moreover subsequent load balance cycles should correct the
+                 * excess load moved.
+                 */
+                if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+                                lb_iterations++ < max_lb_iterations) {
+                        this_rq          = cpu_rq(env.new_dst_cpu);
+                        env.dst_rq       = this_rq;
+                        env.dst_cpu      = env.new_dst_cpu;
+                        env.flags       &= ~LBF_SOME_PINNED;
+                        env.loop         = 0;
+                        env.loop_break   = sched_nr_migrate_break;
+                        /*
+                         * Go back to "more_balance" rather than "redo" since we
+                         * need to continue with same src_cpu.
+                         */
+                        goto more_balance;
+                }
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(env.flags & LBF_ALL_PINNED)) {
                        cpumask_clear_cpu(cpu_of(busiest), cpus);
-                        if (!cpumask_empty(cpus))
+                        if (!cpumask_empty(cpus)) {
+                                env.loop = 0;
+                                env.loop_break = sched_nr_migrate_break;
                                goto redo;
+                        }
                        goto out_balanced;
                }
        }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55844f24435a..c35a1a7dd4d6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg);
 /*
 * Return the group to which this tasks belongs.
 *
- * We use task_subsys_state_check() and extend the RCU verification with
+ * We cannot use task_subsys_state() and friends because the cgroup
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
+ * subsystem changes that value before the cgroup_subsys::attach() method
- * task it moves into the cgroup. Therefore by holding either of those locks,
+ * is called, therefore we cannot pin it and might observe the wrong value.
- * we pin the task to the current cgroup.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
 */
 static inline struct task_group *task_group(struct task_struct *p)
 {
-        struct task_group *tg;
+        return p->sched_task_group;
-        struct cgroup_subsys_state *css;
-        css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-                        lockdep_is_held(&p->pi_lock) ||
-                        lockdep_is_held(&task_rq(p)->lock));
-        tg = container_of(css, struct task_group, css);
-        return autogroup_task_group(p, tg);
 }
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
diff --git a/kernel/signal.c b/kernel/signal.c
index 677102789cf2..be4f856d52f8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1971,6 +1971,13 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
 void ptrace_notify(int exit_code)
 {
        BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
+        if (unlikely(current->task_works)) {
+                if (test_and_clear_ti_thread_flag(current_thread_info(),
+                                                   TIF_NOTIFY_RESUME)) {
+                        smp_mb__after_clear_bit();
+                        task_work_run();
+                }
+        }
        spin_lock_irq(&current->sighand->siglock);
        ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
@@ -2191,6 +2198,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
        struct signal_struct *signal = current->signal;
        int signr;
+        if (unlikely(current->task_works)) {
+                if (test_and_clear_ti_thread_flag(current_thread_info(),
+                                                   TIF_NOTIFY_RESUME)) {
+                        smp_mb__after_clear_bit();
+                        task_work_run();
+                }
+        }
        if (unlikely(uprobe_deny_signal()))
                return 0;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 671f9594e368..b73e681df09e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void)
        __u32 pending;
        int max_restart = MAX_SOFTIRQ_RESTART;
        int cpu;
+        unsigned long old_flags = current->flags;
+        /*
+         * Mask out PF_MEMALLOC as current task context is borrowed for the
+         * softirq. A softirq handled such as network RX might set PF_MEMALLOC
+         * again if the socket is related to swap
+         */
+        current->flags &= ~PF_MEMALLOC;
        pending = local_softirq_pending();
        account_system_vtime(current);
@@ -265,6 +273,7 @@ restart:
        account_system_vtime(current);
        __local_bh_enable(SOFTIRQ_OFFSET);
+        tsk_restore_flags(current, old_flags, PF_MEMALLOC);
 }
 #ifndef __ARCH_HAS_DO_SOFTIRQ
diff --git a/kernel/sys.c b/kernel/sys.c
index 2d39a84cd857..241507f23eca 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2015,7 +2015,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                                break;
                        }
                        me->pdeath_signal = arg2;
-                        error = 0;
                        break;
                case PR_GET_PDEATHSIG:
                        error = put_user(me->pdeath_signal, (int __user *)arg2);
@@ -2029,7 +2028,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                                break;
                        }
                        set_dumpable(me->mm, arg2);
-                        error = 0;
                        break;
                case PR_SET_UNALIGN:
@@ -2056,10 +2054,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                case PR_SET_TIMING:
                        if (arg2 != PR_TIMING_STATISTICAL)
                                error = -EINVAL;
-                        else
-                                error = 0;
                        break;
                case PR_SET_NAME:
                        comm[sizeof(me->comm)-1] = 0;
                        if (strncpy_from_user(comm, (char __user *)arg2,
@@ -2067,20 +2062,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                                return -EFAULT;
                        set_task_comm(me, comm);
                        proc_comm_connector(me);
-                        return 0;
+                        break;
                case PR_GET_NAME:
                        get_task_comm(comm, me);
                        if (copy_to_user((char __user *)arg2, comm,
                                         sizeof(comm)))
                                return -EFAULT;
-                        return 0;
+                        break;
                case PR_GET_ENDIAN:
                        error = GET_ENDIAN(me, arg2);
                        break;
                case PR_SET_ENDIAN:
                        error = SET_ENDIAN(me, arg2);
                        break;
                case PR_GET_SECCOMP:
                        error = prctl_get_seccomp();
                        break;
@@ -2108,7 +2102,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                                        current->default_timer_slack_ns;
                        else
                                current->timer_slack_ns = arg2;
-                        error = 0;
                        break;
                case PR_MCE_KILL:
                        if (arg4 | arg5)
@@ -2134,7 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                        default:
                                return -EINVAL;
                        }
-                        error = 0;
                        break;
                case PR_MCE_KILL_GET:
                        if (arg2 | arg3 | arg4 | arg5)
@@ -2153,7 +2145,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                        break;
                case PR_SET_CHILD_SUBREAPER:
                        me->signal->is_child_subreaper = !!arg2;
-                        error = 0;
                        break;
                case PR_GET_CHILD_SUBREAPER:
                        error = put_user(me->signal->is_child_subreaper,
@@ -2195,46 +2186,52 @@ static void argv_cleanup(struct subprocess_info *info)
        argv_free(info->argv);
 }
-/**
+static int __orderly_poweroff(void)
- * orderly_poweroff - Trigger an orderly system poweroff
- * @force: force poweroff if command execution fails
- *
- * This may be called from any context to trigger a system shutdown.
- * If the orderly shutdown fails, it will force an immediate shutdown.
- */
-int orderly_poweroff(bool force)
 {
        int argc;
-        char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
+        char **argv;
        static char *envp[] = {
                "HOME=/",
                "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
                NULL
        };
-        int ret = -ENOMEM;
+        int ret;
+        argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
        if (argv == NULL) {
                printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
                       __func__, poweroff_cmd);
-                goto out;
+                return -ENOMEM;
        }
        ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT,
                                      NULL, argv_cleanup, NULL);
-out:
-        if (likely(!ret))
-                return 0;
        if (ret == -ENOMEM)
                argv_free(argv);
-        if (force) {
+        return ret;
+}
+/**
+ * orderly_poweroff - Trigger an orderly system poweroff
+ * @force: force poweroff if command execution fails
+ *
+ * This may be called from any context to trigger a system shutdown.
+ * If the orderly shutdown fails, it will force an immediate shutdown.
+ */
+int orderly_poweroff(bool force)
+{
+        int ret = __orderly_poweroff();
+        if (ret && force) {
                printk(KERN_WARNING "Failed to start orderly shutdown: "
                       "forcing the issue\n");
-                /* I guess this should try to kick off some daemon to
+                /*
-                   sync and poweroff asap.  Or not even bother syncing
+                 * I guess this should try to kick off some daemon to sync and
-                   if we're doing an emergency shutdown? */
+                 * poweroff asap.  Or not even bother syncing if we're doing an
+                 * emergency shutdown?
+                 */
                emergency_sync();
                kernel_power_off();
        }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ab11879aeb4..87174ef59161 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/ctype.h>
 #include <linux/kmemcheck.h>
+#include <linux/kmemleak.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -174,6 +175,11 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
                                void __user *buffer, size_t *lenp, loff_t *ppos);
 #endif
+static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
+                void __user *buffer, size_t *lenp, loff_t *ppos);
+static int proc_dostring_coredump(struct ctl_table *table, int write,
+                void __user *buffer, size_t *lenp, loff_t *ppos);
 #ifdef CONFIG_MAGIC_SYSRQ
 /* Note: sysrq code uses it's own private copy */
 static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -410,7 +416,7 @@ static struct ctl_table kern_table[] = {
                .data           = core_pattern,
                .maxlen         = CORENAME_MAX_SIZE,
                .mode           = 0644,
-                .proc_handler   = proc_dostring,
+                .proc_handler   = proc_dostring_coredump,
        },
        {
                .procname       = "core_pipe_limit",
@@ -1095,11 +1101,9 @@ static struct ctl_table vm_table[] = {
                .extra1         = &zero,
        },
        {
-                .procname       = "nr_pdflush_threads",
+                .procname       = "nr_pdflush_threads",
-                .data           = &nr_pdflush_threads,
+                .mode           = 0444 /* read-only */,
-                .maxlen         = sizeof nr_pdflush_threads,
+                .proc_handler   = pdflush_proc_obsolete,
-                .mode           = 0444 /* read-only*/,
-                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "swappiness",
@@ -1494,11 +1498,29 @@ static struct ctl_table fs_table[] = {
 #endif
 #endif
        {
+                .procname       = "protected_symlinks",
+                .data           = &sysctl_protected_symlinks,
+                .maxlen         = sizeof(int),
+                .mode           = 0600,
+                .proc_handler   = proc_dointvec_minmax,
+                .extra1         = &zero,
+                .extra2         = &one,
+        },
+        {
+                .procname       = "protected_hardlinks",
+                .data           = &sysctl_protected_hardlinks,
+                .maxlen         = sizeof(int),
+                .mode           = 0600,
+                .proc_handler   = proc_dointvec_minmax,
+                .extra1         = &zero,
+                .extra2         = &one,
+        },
+        {
                .procname       = "suid_dumpable",
                .data           = &suid_dumpable,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = proc_dointvec_minmax,
+                .proc_handler   = proc_dointvec_minmax_coredump,
                .extra1         = &zero,
                .extra2         = &two,
        },
@@ -1551,7 +1573,10 @@ static struct ctl_table dev_table[] = {
 int __init sysctl_init(void)
 {
-        register_sysctl_table(sysctl_base_table);
+        struct ctl_table_header *hdr;
+        hdr = register_sysctl_table(sysctl_base_table);
+        kmemleak_not_leak(hdr);
        return 0;
 }
@@ -2009,6 +2034,34 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
                                do_proc_dointvec_minmax_conv, &param);
 }
+static void validate_coredump_safety(void)
+{
+        if (suid_dumpable == SUID_DUMPABLE_SAFE &&
+            core_pattern[0] != '/' && core_pattern[0] != '|') {
+                printk(KERN_WARNING "Unsafe core_pattern used with "\
+                        "suid_dumpable=2. Pipe handler or fully qualified "\
+                        "core dump path required.\n");
+        }
+}
+static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,
+                void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+        int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+        if (!error)
+                validate_coredump_safety();
+        return error;
+}
+static int proc_dostring_coredump(struct ctl_table *table, int write,
+                  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+        int error = proc_dostring(table, write, buffer, lenp, ppos);
+        if (!error)
+                validate_coredump_safety();
+        return error;
+}
 static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
                                     void __user *buffer,
                                     size_t *lenp, loff_t *ppos,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index a650694883a1..65bdcf198d4e 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = {
        { CTL_INT,      VM_DIRTY_RATIO,                 "dirty_ratio" },
        /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */
        /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */
-        { CTL_INT,      VM_NR_PDFLUSH_THREADS,          "nr_pdflush_threads" },
+        /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */
        { CTL_INT,      VM_OVERCOMMIT_RATIO,            "overcommit_ratio" },
        /* VM_PAGEBUF unused */
        /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 82d1c794066d..91d4e1742a0c 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -3,82 +3,78 @@
 #include <linux/tracehook.h>
 int
-task_work_add(struct task_struct *task, struct task_work *twork, bool notify)
+task_work_add(struct task_struct *task, struct callback_head *twork, bool notify)
 {
+        struct callback_head *last, *first;
        unsigned long flags;
-        int err = -ESRCH;
-#ifndef TIF_NOTIFY_RESUME
-        if (notify)
-                return -ENOTSUPP;
-#endif
        /*
-         * We must not insert the new work if the task has already passed
+         * Not inserting the new work if the task has already passed
-         * exit_task_work(). We rely on do_exit()->raw_spin_unlock_wait()
+         * exit_task_work() is the responsibility of callers.
-         * and check PF_EXITING under pi_lock.
         */
        raw_spin_lock_irqsave(&task->pi_lock, flags);
-        if (likely(!(task->flags & PF_EXITING))) {
+        last = task->task_works;
-                hlist_add_head(&twork->hlist, &task->task_works);
+        first = last ? last->next : twork;
-                err = 0;
+        twork->next = first;
-        }
+        if (last)
+                last->next = twork;
+        task->task_works = twork;
        raw_spin_unlock_irqrestore(&task->pi_lock, flags);
        /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */
-        if (likely(!err) && notify)
+        if (notify)
                set_notify_resume(task);
-        return err;
+        return 0;
 }
-struct task_work *
+struct callback_head *
 task_work_cancel(struct task_struct *task, task_work_func_t func)
 {
        unsigned long flags;
-        struct task_work *twork;
+        struct callback_head *last, *res = NULL;
-        struct hlist_node *pos;
        raw_spin_lock_irqsave(&task->pi_lock, flags);
-        hlist_for_each_entry(twork, pos, &task->task_works, hlist) {
+        last = task->task_works;
-                if (twork->func == func) {
+        if (last) {
-                        hlist_del(&twork->hlist);
+                struct callback_head *q = last, *p = q->next;
-                        goto found;
+                while (1) {
+                        if (p->func == func) {
+                                q->next = p->next;
+                                if (p == last)
+                                        task->task_works = q == p ? NULL : q;
+                                res = p;
+                                break;
+                        }
+                        if (p == last)
+                                break;
+                        q = p;
+                        p = q->next;
                }
        }
-        twork = NULL;
- found:
        raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+        return res;
-        return twork;
 }
 void task_work_run(void)
 {
        struct task_struct *task = current;
-        struct hlist_head task_works;
+        struct callback_head *p, *q;
-        struct hlist_node *pos;
-        raw_spin_lock_irq(&task->pi_lock);
+        while (1) {
-        hlist_move_list(&task->task_works, &task_works);
+                raw_spin_lock_irq(&task->pi_lock);
-        raw_spin_unlock_irq(&task->pi_lock);
+                p = task->task_works;
+                task->task_works = NULL;
+                raw_spin_unlock_irq(&task->pi_lock);
-        if (unlikely(hlist_empty(&task_works)))
+                if (unlikely(!p))
-                return;
+                        return;
-        /*
-         * We use hlist to save the space in task_struct, but we want fifo.
-         * Find the last entry, the list should be short, then process them
-         * in reverse order.
-         */
-        for (pos = task_works.first; pos->next; pos = pos->next)
-                ;
-        for (;;) {
+                q = p->next; /* head */
-                struct hlist_node **pprev = pos->pprev;
+                p->next = NULL; /* cut it */
-                struct task_work *twork = container_of(pos, struct task_work,
+                while (q) {
-                                                        hlist);
+                        p = q->next;
-                twork->func(twork);
+                        q->func(q);
+                        q = p;
-                if (pprev == &task_works.first)
+                }
-                        break;
-                pos = container_of(pprev, struct hlist_node, next);
        }
 }
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e66046456f4f..d0a32796550f 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -436,6 +436,11 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
        na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
                                sizeof(struct cgroupstats));
+        if (na == NULL) {
+                rc = -EMSGSIZE;
+                goto err;
+        }
        stats = nla_data(na);
        memset(stats, 0, sizeof(*stats));
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a470154e0408..46da0537c10b 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
 * requested HZ value. It is also not recommended
 * for "tick-less" systems.
 */
-#define NSEC_PER_JIFFY  ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
+#define NSEC_PER_JIFFY  ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ))
 /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
 * conversion, the .shift value could be zero. However
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index b7fbadc5c973..24174b4d669b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock);
 /* USER_HZ period (usecs): */
 unsigned long                   tick_usec = TICK_USEC;
-/* ACTHZ period (nsecs): */
+/* SHIFTED_HZ period (nsecs): */
 unsigned long                   tick_nsec;
 static u64                      tick_length;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f045cc50832d..2988bc819187 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -65,14 +65,14 @@ struct timekeeper {
         * used instead.
         */
        struct timespec         wall_to_monotonic;
-        /* time spent in suspend */
-        struct timespec         total_sleep_time;
-        /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
-        struct timespec         raw_time;
        /* Offset clock monotonic -> clock realtime */
        ktime_t                 offs_real;
+        /* time spent in suspend */
+        struct timespec         total_sleep_time;
        /* Offset clock monotonic -> clock boottime */
        ktime_t                 offs_boot;
+        /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
+        struct timespec         raw_time;
        /* Seqlock for all timekeeper values */
        seqlock_t               lock;
 };
@@ -108,13 +108,38 @@ static struct timespec tk_xtime(struct timekeeper *tk)
 static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
 {
        tk->xtime_sec = ts->tv_sec;
-        tk->xtime_nsec = ts->tv_nsec << tk->shift;
+        tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift;
 }
 static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
 {
        tk->xtime_sec += ts->tv_sec;
-        tk->xtime_nsec += ts->tv_nsec << tk->shift;
+        tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift;
+}
+static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
+{
+        struct timespec tmp;
+        /*
+         * Verify consistency of: offset_real = -wall_to_monotonic
+         * before modifying anything
+         */
+        set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec,
+                                        -tk->wall_to_monotonic.tv_nsec);
+        WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64);
+        tk->wall_to_monotonic = wtm;
+        set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
+        tk->offs_real = timespec_to_ktime(tmp);
+}
+static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
+{
+        /* Verify consistency before modifying */
+        WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
+        tk->total_sleep_time    = t;
+        tk->offs_boot           = timespec_to_ktime(t);
 }
 /**
@@ -217,14 +242,6 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
        return nsec + arch_gettimeoffset();
 }
-static void update_rt_offset(struct timekeeper *tk)
-{
-        struct timespec tmp, *wtm = &tk->wall_to_monotonic;
-        set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
-        tk->offs_real = timespec_to_ktime(tmp);
-}
 /* must hold write on timekeeper.lock */
 static void timekeeping_update(struct timekeeper *tk, bool clearntp)
 {
@@ -234,12 +251,10 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
                tk->ntp_error = 0;
                ntp_clear();
        }
-        update_rt_offset(tk);
        xt = tk_xtime(tk);
        update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
 }
 /**
 * timekeeping_forward_now - update clock to the current time
 *
@@ -277,18 +292,19 @@ static void timekeeping_forward_now(struct timekeeper *tk)
 */
 void getnstimeofday(struct timespec *ts)
 {
+        struct timekeeper *tk = &timekeeper;
        unsigned long seq;
        s64 nsecs = 0;
        WARN_ON(timekeeping_suspended);
        do {
-                seq = read_seqbegin(&timekeeper.lock);
+                seq = read_seqbegin(&tk->lock);
-                ts->tv_sec = timekeeper.xtime_sec;
+                ts->tv_sec = tk->xtime_sec;
-                ts->tv_nsec = timekeeping_get_ns(&timekeeper);
+                ts->tv_nsec = timekeeping_get_ns(tk);
-        } while (read_seqretry(&timekeeper.lock, seq));
+        } while (read_seqretry(&tk->lock, seq));
        timespec_add_ns(ts, nsecs);
 }
@@ -296,19 +312,18 @@ EXPORT_SYMBOL(getnstimeofday);
 ktime_t ktime_get(void)
 {
+        struct timekeeper *tk = &timekeeper;
        unsigned int seq;
        s64 secs, nsecs;
        WARN_ON(timekeeping_suspended);
        do {
-                seq = read_seqbegin(&timekeeper.lock);
+                seq = read_seqbegin(&tk->lock);
-                secs = timekeeper.xtime_sec +
+                secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
-                                timekeeper.wall_to_monotonic.tv_sec;
+                nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
-                nsecs = timekeeping_get_ns(&timekeeper) +
-                                timekeeper.wall_to_monotonic.tv_nsec;
-        } while (read_seqretry(&timekeeper.lock, seq));
+        } while (read_seqretry(&tk->lock, seq));
        /*
         * Use ktime_set/ktime_add_ns to create a proper ktime on
         * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -327,18 +342,19 @@ EXPORT_SYMBOL_GPL(ktime_get);
 */
 void ktime_get_ts(struct timespec *ts)
 {
+        struct timekeeper *tk = &timekeeper;
        struct timespec tomono;
        unsigned int seq;
        WARN_ON(timekeeping_suspended);
        do {
-                seq = read_seqbegin(&timekeeper.lock);
+                seq = read_seqbegin(&tk->lock);
-                ts->tv_sec = timekeeper.xtime_sec;
+                ts->tv_sec = tk->xtime_sec;
-                ts->tv_nsec = timekeeping_get_ns(&timekeeper);
+                ts->tv_nsec = timekeeping_get_ns(tk);
-                tomono = timekeeper.wall_to_monotonic;
+                tomono = tk->wall_to_monotonic;
-        } while (read_seqretry(&timekeeper.lock, seq));
+        } while (read_seqretry(&tk->lock, seq));
        set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
                                ts->tv_nsec + tomono.tv_nsec);
@@ -358,22 +374,23 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
 */
 void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
 {
+        struct timekeeper *tk = &timekeeper;
        unsigned long seq;
        s64 nsecs_raw, nsecs_real;
        WARN_ON_ONCE(timekeeping_suspended);
        do {
-                seq = read_seqbegin(&timekeeper.lock);
+                seq = read_seqbegin(&tk->lock);
-                *ts_raw = timekeeper.raw_time;
+                *ts_raw = tk->raw_time;
-                ts_real->tv_sec = timekeeper.xtime_sec;
+                ts_real->tv_sec = tk->xtime_sec;
                ts_real->tv_nsec = 0;
-                nsecs_raw = timekeeping_get_ns_raw(&timekeeper);
+                nsecs_raw = timekeeping_get_ns_raw(tk);
-                nsecs_real = timekeeping_get_ns(&timekeeper);
+                nsecs_real = timekeeping_get_ns(tk);
-        } while (read_seqretry(&timekeeper.lock, seq));
+        } while (read_seqretry(&tk->lock, seq));
        timespec_add_ns(ts_raw, nsecs_raw);
        timespec_add_ns(ts_real, nsecs_real);
@@ -406,28 +423,28 @@ EXPORT_SYMBOL(do_gettimeofday);
 */
 int do_settimeofday(const struct timespec *tv)
 {
+        struct timekeeper *tk = &timekeeper;
        struct timespec ts_delta, xt;
        unsigned long flags;
        if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
                return -EINVAL;
-        write_seqlock_irqsave(&timekeeper.lock, flags);
+        write_seqlock_irqsave(&tk->lock, flags);
-        timekeeping_forward_now(&timekeeper);
+        timekeeping_forward_now(tk);
-        xt = tk_xtime(&timekeeper);
+        xt = tk_xtime(tk);
        ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
        ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
-        timekeeper.wall_to_monotonic =
+        tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta));
-                        timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
-        tk_set_xtime(&timekeeper, tv);
+        tk_set_xtime(tk, tv);
-        timekeeping_update(&timekeeper, true);
+        timekeeping_update(tk, true);
-        write_sequnlock_irqrestore(&timekeeper.lock, flags);
+        write_sequnlock_irqrestore(&tk->lock, flags);
        /* signal hrtimers about time change */
        clock_was_set();
@@ -436,7 +453,6 @@ int do_settimeofday(const struct timespec *tv)
 }
 EXPORT_SYMBOL(do_settimeofday);
 /**
 * timekeeping_inject_offset - Adds or subtracts from the current time.
 * @tv:         pointer to the timespec variable containing the offset
@@ -445,23 +461,23 @@ EXPORT_SYMBOL(do_settimeofday);
 */
 int timekeeping_inject_offset(struct timespec *ts)
 {
+        struct timekeeper *tk = &timekeeper;
        unsigned long flags;
        if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
                return -EINVAL;
-        write_seqlock_irqsave(&timekeeper.lock, flags);
+        write_seqlock_irqsave(&tk->lock, flags);
-        timekeeping_forward_now(&timekeeper);
+        timekeeping_forward_now(tk);
-        tk_xtime_add(&timekeeper, ts);
+        tk_xtime_add(tk, ts);
-        timekeeper.wall_to_monotonic =
+        tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
-                                timespec_sub(timekeeper.wall_to_monotonic, *ts);
-        timekeeping_update(&timekeeper, true);
+        timekeeping_update(tk, true);
-        write_sequnlock_irqrestore(&timekeeper.lock, flags);
+        write_sequnlock_irqrestore(&tk->lock, flags);
        /* signal hrtimers about time change */
        clock_was_set();
@@ -477,23 +493,24 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
 */
 static int change_clocksource(void *data)
 {
+        struct timekeeper *tk = &timekeeper;
        struct clocksource *new, *old;
        unsigned long flags;
        new = (struct clocksource *) data;
-        write_seqlock_irqsave(&timekeeper.lock, flags);
+        write_seqlock_irqsave(&tk->lock, flags);
-        timekeeping_forward_now(&timekeeper);
+        timekeeping_forward_now(tk);
        if (!new->enable || new->enable(new) == 0) {
-                old = timekeeper.clock;
+                old = tk->clock;
-                tk_setup_internals(&timekeeper, new);
+                tk_setup_internals(tk, new);
                if (old->disable)
                        old->disable(old);
        }
-        timekeeping_update(&timekeeper, true);
+        timekeeping_update(tk, true);
-        write_sequnlock_irqrestore(&timekeeper.lock, flags);
+        write_sequnlock_irqrestore(&tk->lock, flags);
        return 0;
 }
@@ -507,7 +524,9 @@ static int change_clocksource(void *data)
 */
 void timekeeping_notify(struct clocksource *clock)
 {
-        if (timekeeper.clock == clock)
+        struct timekeeper *tk = &timekeeper;
+        if (tk->clock == clock)
                return;
        stop_machine(change_clocksource, clock, NULL);
        tick_clock_notify();
@@ -536,35 +555,36 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
 */
 void getrawmonotonic(struct timespec *ts)
 {
+        struct timekeeper *tk = &timekeeper;
        unsigned long seq;
        s64 nsecs;
        do {
-                seq = read_seqbegin(&timekeeper.lock);
+                seq = read_seqbegin(&tk->lock);
-                nsecs = timekeeping_get_ns_raw(&timekeeper);
+                nsecs = timekeeping_get_ns_raw(tk);
-                *ts = timekeeper.raw_time;
+                *ts = tk->raw_time;
-        } while (read_seqretry(&timekeeper.lock, seq));
+        } while (read_seqretry(&tk->lock, seq));
        timespec_add_ns(ts, nsecs);
 }
 EXPORT_SYMBOL(getrawmonotonic);
 /**
 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
 */
 int timekeeping_valid_for_hres(void)
 {
+        struct timekeeper *tk = &timekeeper;
        unsigned long seq;
        int ret;
        do {
-                seq = read_seqbegin(&timekeeper.lock);
+                seq = read_seqbegin(&tk->lock);
-                ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+                ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
-        } while (read_seqretry(&timekeeper.lock, seq));
+        } while (read_seqretry(&tk->lock, seq));
        return ret;
 }
@@ -574,15 +594,16 @@ int timekeeping_valid_for_hres(void)
 */
 u64 timekeeping_max_deferment(void)
 {
+        struct timekeeper *tk = &timekeeper;
        unsigned long seq;
        u64 ret;
        do {
-                seq = read_seqbegin(&timekeeper.lock);
+                seq = read_seqbegin(&tk->lock);
-                ret = timekeeper.clock->max_idle_ns;
+                ret = tk->clock->max_idle_ns;
-        } while (read_seqretry(&timekeeper.lock, seq));
+        } while (read_seqretry(&tk->lock, seq));
        return ret;
 }
@@ -622,46 +643,43 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts)
 */
 void __init timekeeping_init(void)
 {
+        struct timekeeper *tk = &timekeeper;
        struct clocksource *clock;
        unsigned long flags;
-        struct timespec now, boot;
+        struct timespec now, boot, tmp;
        read_persistent_clock(&now);
        read_boot_clock(&boot);
-        seqlock_init(&timekeeper.lock);
+        seqlock_init(&tk->lock);
        ntp_init();
-        write_seqlock_irqsave(&timekeeper.lock, flags);
+        write_seqlock_irqsave(&tk->lock, flags);
        clock = clocksource_default_clock();
        if (clock->enable)
                clock->enable(clock);
-        tk_setup_internals(&timekeeper, clock);
+        tk_setup_internals(tk, clock);
-        tk_set_xtime(&timekeeper, &now);
+        tk_set_xtime(tk, &now);
-        timekeeper.raw_time.tv_sec = 0;
+        tk->raw_time.tv_sec = 0;
-        timekeeper.raw_time.tv_nsec = 0;
+        tk->raw_time.tv_nsec = 0;
        if (boot.tv_sec == 0 && boot.tv_nsec == 0)
-                boot = tk_xtime(&timekeeper);
+                boot = tk_xtime(tk);
-        set_normalized_timespec(&timekeeper.wall_to_monotonic,
+        set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec);
-                                -boot.tv_sec, -boot.tv_nsec);
+        tk_set_wall_to_mono(tk, tmp);
-        update_rt_offset(&timekeeper);
-        timekeeper.total_sleep_time.tv_sec = 0;
+        tmp.tv_sec = 0;
-        timekeeper.total_sleep_time.tv_nsec = 0;
+        tmp.tv_nsec = 0;
-        write_sequnlock_irqrestore(&timekeeper.lock, flags);
+        tk_set_sleep_time(tk, tmp);
+        write_sequnlock_irqrestore(&tk->lock, flags);
 }
 /* time in seconds when suspend began */
 static struct timespec timekeeping_suspend_time;
-static void update_sleep_time(struct timespec t)
-{
-        timekeeper.total_sleep_time = t;
-        timekeeper.offs_boot = timespec_to_ktime(t);
-}
 /**
 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
 * @delta: pointer to a timespec delta value
@@ -677,13 +695,11 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
                                        "sleep delta value!\n");
                return;
        }
        tk_xtime_add(tk, delta);
-        tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta);
+        tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
-        update_sleep_time(timespec_add(tk->total_sleep_time, *delta));
+        tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
 }
 /**
 * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values
 * @delta: pointer to a timespec delta value
@@ -696,6 +712,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
 */
 void timekeeping_inject_sleeptime(struct timespec *delta)
 {
+        struct timekeeper *tk = &timekeeper;
        unsigned long flags;
        struct timespec ts;
@@ -704,21 +721,20 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
        if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
                return;
-        write_seqlock_irqsave(&timekeeper.lock, flags);
+        write_seqlock_irqsave(&tk->lock, flags);
-        timekeeping_forward_now(&timekeeper);
+        timekeeping_forward_now(tk);
-        __timekeeping_inject_sleeptime(&timekeeper, delta);
+        __timekeeping_inject_sleeptime(tk, delta);
-        timekeeping_update(&timekeeper, true);
+        timekeeping_update(tk, true);
-        write_sequnlock_irqrestore(&timekeeper.lock, flags);
+        write_sequnlock_irqrestore(&tk->lock, flags);
        /* signal hrtimers about time change */
        clock_was_set();
 }
 /**
 * timekeeping_resume - Resumes the generic timekeeping subsystem.
 *
@@ -728,6 +744,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
 */
 static void timekeeping_resume(void)
 {
+        struct timekeeper *tk = &timekeeper;
        unsigned long flags;
        struct timespec ts;
@@ -735,18 +752,18 @@ static void timekeeping_resume(void)
        clocksource_resume();
-        write_seqlock_irqsave(&timekeeper.lock, flags);
+        write_seqlock_irqsave(&tk->lock, flags);
        if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
                ts = timespec_sub(ts, timekeeping_suspend_time);
-                __timekeeping_inject_sleeptime(&timekeeper, &ts);
+                __timekeeping_inject_sleeptime(tk, &ts);
        }
        /* re-base the last cycle value */
-        timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
+        tk->clock->cycle_last = tk->clock->read(tk->clock);
-        timekeeper.ntp_error = 0;
+        tk->ntp_error = 0;
        timekeeping_suspended = 0;
-        timekeeping_update(&timekeeper, false);
+        timekeeping_update(tk, false);
-        write_sequnlock_irqrestore(&timekeeper.lock, flags);
+        write_sequnlock_irqrestore(&tk->lock, flags);
        touch_softlockup_watchdog();
@@ -758,14 +775,15 @@ static void timekeeping_resume(void)
 static int timekeeping_suspend(void)
 {
+        struct timekeeper *tk = &timekeeper;
        unsigned long flags;
        struct timespec         delta, delta_delta;
        static struct timespec  old_delta;
        read_persistent_clock(&timekeeping_suspend_time);
-        write_seqlock_irqsave(&timekeeper.lock, flags);
+        write_seqlock_irqsave(&tk->lock, flags);
-        timekeeping_forward_now(&timekeeper);
+        timekeeping_forward_now(tk);
        timekeeping_suspended = 1;
        /*
@@ -774,7 +792,7 @@ static int timekeeping_suspend(void)
         * try to compensate so the difference in system time
         * and persistent_clock time stays close to constant.
         */
-        delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time);
+        delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time);
        delta_delta = timespec_sub(delta, old_delta);
        if (abs(delta_delta.tv_sec)  >= 2) {
                /*
@@ -787,7 +805,7 @@ static int timekeeping_suspend(void)
                timekeeping_suspend_time =
                        timespec_add(timekeeping_suspend_time, delta_delta);
        }
-        write_sequnlock_irqrestore(&timekeeper.lock, flags);
+        write_sequnlock_irqrestore(&tk->lock, flags);
        clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
        clocksource_suspend();
@@ -898,7 +916,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
                 * the error. This causes the likely below to be unlikely.
                 *
                 * The proper fix is to avoid rounding up by using
-                 * the high precision timekeeper.xtime_nsec instead of
+                 * the high precision tk->xtime_nsec instead of
                 * xtime.tv_nsec everywhere. Fixing this will take some
                 * time.
                 */
@@ -1003,7 +1021,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
 }
 /**
 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
 *
@@ -1024,15 +1041,21 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
                /* Figure out if its a leap sec and apply if needed */
                leap = second_overflow(tk->xtime_sec);
-                tk->xtime_sec += leap;
+                if (unlikely(leap)) {
-                tk->wall_to_monotonic.tv_sec -= leap;
+                        struct timespec ts;
-                if (leap)
-                        clock_was_set_delayed();
+                        tk->xtime_sec += leap;
+                        ts.tv_sec = leap;
+                        ts.tv_nsec = 0;
+                        tk_set_wall_to_mono(tk,
+                                timespec_sub(tk->wall_to_monotonic, ts));
+                        clock_was_set_delayed();
+                }
        }
 }
 /**
 * logarithmic_accumulation - shifted accumulation of cycles
 *
@@ -1076,7 +1099,6 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
        return offset;
 }
 /**
 * update_wall_time - Uses the current clocksource to increment the wall time
 *
@@ -1084,21 +1106,22 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
 static void update_wall_time(void)
 {
        struct clocksource *clock;
+        struct timekeeper *tk = &timekeeper;
        cycle_t offset;
        int shift = 0, maxshift;
        unsigned long flags;
        s64 remainder;
-        write_seqlock_irqsave(&timekeeper.lock, flags);
+        write_seqlock_irqsave(&tk->lock, flags);
        /* Make sure we're fully resumed: */
        if (unlikely(timekeeping_suspended))
                goto out;
-        clock = timekeeper.clock;
+        clock = tk->clock;
 #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
-        offset = timekeeper.cycle_interval;
+        offset = tk->cycle_interval;
 #else
        offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
 #endif
@@ -1111,19 +1134,19 @@ static void update_wall_time(void)
         * chunk in one go, and then try to consume the next smaller
         * doubled multiple.
         */
-        shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
+        shift = ilog2(offset) - ilog2(tk->cycle_interval);
        shift = max(0, shift);
        /* Bound shift to one less than what overflows tick_length */
        maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
        shift = min(shift, maxshift);
-        while (offset >= timekeeper.cycle_interval) {
+        while (offset >= tk->cycle_interval) {
-                offset = logarithmic_accumulation(&timekeeper, offset, shift);
+                offset = logarithmic_accumulation(tk, offset, shift);
-                if(offset < timekeeper.cycle_interval<<shift)
+                if (offset < tk->cycle_interval<<shift)
                        shift--;
        }
        /* correct the clock when NTP error is too big */
-        timekeeping_adjust(&timekeeper, offset);
+        timekeeping_adjust(tk, offset);
        /*
@@ -1135,21 +1158,21 @@ static void update_wall_time(void)
        * the vsyscall implementations are converted to use xtime_nsec
        * (shifted nanoseconds), this can be killed.
        */
-        remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1);
+        remainder = tk->xtime_nsec & ((1 << tk->shift) - 1);
-        timekeeper.xtime_nsec -= remainder;
+        tk->xtime_nsec -= remainder;
-        timekeeper.xtime_nsec += 1 << timekeeper.shift;
+        tk->xtime_nsec += 1 << tk->shift;
-        timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift;
+        tk->ntp_error += remainder << tk->ntp_error_shift;
        /*
         * Finally, make sure that after the rounding
         * xtime_nsec isn't larger than NSEC_PER_SEC
         */
-        accumulate_nsecs_to_secs(&timekeeper);
+        accumulate_nsecs_to_secs(tk);
-        timekeeping_update(&timekeeper, false);
+        timekeeping_update(tk, false);
 out:
-        write_sequnlock_irqrestore(&timekeeper.lock, flags);
+        write_sequnlock_irqrestore(&tk->lock, flags);
 }
@@ -1166,18 +1189,18 @@ out:
 */
 void getboottime(struct timespec *ts)
 {
+        struct timekeeper *tk = &timekeeper;
        struct timespec boottime = {
-                .tv_sec = timekeeper.wall_to_monotonic.tv_sec +
+                .tv_sec = tk->wall_to_monotonic.tv_sec +
-                                timekeeper.total_sleep_time.tv_sec,
+                                tk->total_sleep_time.tv_sec,
-                .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec +
+                .tv_nsec = tk->wall_to_monotonic.tv_nsec +
-                                timekeeper.total_sleep_time.tv_nsec
+                                tk->total_sleep_time.tv_nsec
        };
        set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
 }
 EXPORT_SYMBOL_GPL(getboottime);
 /**
 * get_monotonic_boottime - Returns monotonic time since boot
 * @ts:         pointer to the timespec to be set
@@ -1189,19 +1212,20 @@ EXPORT_SYMBOL_GPL(getboottime);
 */
 void get_monotonic_boottime(struct timespec *ts)
 {
+        struct timekeeper *tk = &timekeeper;
        struct timespec tomono, sleep;
        unsigned int seq;
        WARN_ON(timekeeping_suspended);
        do {
-                seq = read_seqbegin(&timekeeper.lock);
+                seq = read_seqbegin(&tk->lock);
-                ts->tv_sec = timekeeper.xtime_sec;
+                ts->tv_sec = tk->xtime_sec;
-                ts->tv_nsec = timekeeping_get_ns(&timekeeper);
+                ts->tv_nsec = timekeeping_get_ns(tk);
-                tomono = timekeeper.wall_to_monotonic;
+                tomono = tk->wall_to_monotonic;
-                sleep = timekeeper.total_sleep_time;
+                sleep = tk->total_sleep_time;
-        } while (read_seqretry(&timekeeper.lock, seq));
+        } while (read_seqretry(&tk->lock, seq));
        set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
                        ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec);
@@ -1231,31 +1255,38 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
 */
 void monotonic_to_bootbased(struct timespec *ts)
 {
-        *ts = timespec_add(*ts, timekeeper.total_sleep_time);
+        struct timekeeper *tk = &timekeeper;
+        *ts = timespec_add(*ts, tk->total_sleep_time);
 }
 EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
 unsigned long get_seconds(void)
 {
-        return timekeeper.xtime_sec;
+        struct timekeeper *tk = &timekeeper;
+        return tk->xtime_sec;
 }
 EXPORT_SYMBOL(get_seconds);
 struct timespec __current_kernel_time(void)
 {
-        return tk_xtime(&timekeeper);
+        struct timekeeper *tk = &timekeeper;
+        return tk_xtime(tk);
 }
 struct timespec current_kernel_time(void)
 {
+        struct timekeeper *tk = &timekeeper;
        struct timespec now;
        unsigned long seq;
        do {
-                seq = read_seqbegin(&timekeeper.lock);
+                seq = read_seqbegin(&tk->lock);
-                now = tk_xtime(&timekeeper);
+                now = tk_xtime(tk);
-        } while (read_seqretry(&timekeeper.lock, seq));
+        } while (read_seqretry(&tk->lock, seq));
        return now;
 }
@@ -1263,15 +1294,16 @@ EXPORT_SYMBOL(current_kernel_time);
 struct timespec get_monotonic_coarse(void)
 {
+        struct timekeeper *tk = &timekeeper;
        struct timespec now, mono;
        unsigned long seq;
        do {
-                seq = read_seqbegin(&timekeeper.lock);
+                seq = read_seqbegin(&tk->lock);
-                now = tk_xtime(&timekeeper);
+                now = tk_xtime(tk);
-                mono = timekeeper.wall_to_monotonic;
+                mono = tk->wall_to_monotonic;
-        } while (read_seqretry(&timekeeper.lock, seq));
+        } while (read_seqretry(&tk->lock, seq));
        set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
                                now.tv_nsec + mono.tv_nsec);
@@ -1300,14 +1332,15 @@ void do_timer(unsigned long ticks)
 void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
                                struct timespec *wtom, struct timespec *sleep)
 {
+        struct timekeeper *tk = &timekeeper;
        unsigned long seq;
        do {
-                seq = read_seqbegin(&timekeeper.lock);
+                seq = read_seqbegin(&tk->lock);
-                *xtim = tk_xtime(&timekeeper);
+                *xtim = tk_xtime(tk);
-                *wtom = timekeeper.wall_to_monotonic;
+                *wtom = tk->wall_to_monotonic;
-                *sleep = timekeeper.total_sleep_time;
+                *sleep = tk->total_sleep_time;
-        } while (read_seqretry(&timekeeper.lock, seq));
+        } while (read_seqretry(&tk->lock, seq));
 }
 #ifdef CONFIG_HIGH_RES_TIMERS
@@ -1321,19 +1354,20 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
 */
 ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
 {
+        struct timekeeper *tk = &timekeeper;
        ktime_t now;
        unsigned int seq;
        u64 secs, nsecs;
        do {
-                seq = read_seqbegin(&timekeeper.lock);
+                seq = read_seqbegin(&tk->lock);
-                secs = timekeeper.xtime_sec;
+                secs = tk->xtime_sec;
-                nsecs = timekeeping_get_ns(&timekeeper);
+                nsecs = timekeeping_get_ns(tk);
-                *offs_real = timekeeper.offs_real;
+                *offs_real = tk->offs_real;
-                *offs_boot = timekeeper.offs_boot;
+                *offs_boot = tk->offs_boot;
-        } while (read_seqretry(&timekeeper.lock, seq));
+        } while (read_seqretry(&tk->lock, seq));
        now = ktime_add_ns(ktime_set(secs, 0), nsecs);
        now = ktime_sub(now, *offs_real);
@@ -1346,19 +1380,19 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
 */
 ktime_t ktime_get_monotonic_offset(void)
 {
+        struct timekeeper *tk = &timekeeper;
        unsigned long seq;
        struct timespec wtom;
        do {
-                seq = read_seqbegin(&timekeeper.lock);
+                seq = read_seqbegin(&tk->lock);
-                wtom = timekeeper.wall_to_monotonic;
+                wtom = tk->wall_to_monotonic;
-        } while (read_seqretry(&timekeeper.lock, seq));
+        } while (read_seqretry(&tk->lock, seq));
        return timespec_to_ktime(wtom);
 }
 EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
 /**
 * xtime_update() - advances the timekeeping infrastructure
 * @ticks:      number of ticks, that have elapsed since the last call.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a120f98c4112..5c38c81496ce 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3187,10 +3187,10 @@ static int tracing_set_tracer(const char *buf)
        }
        destroy_trace_option_files(topts);
-        current_trace = t;
+        current_trace = &nop_trace;
-        topts = create_trace_option_files(current_trace);
+        topts = create_trace_option_files(t);
-        if (current_trace->use_max_tr) {
+        if (t->use_max_tr) {
                int cpu;
                /* we need to make per cpu buffer sizes equivalent */
                for_each_tracing_cpu(cpu) {
@@ -3210,6 +3210,7 @@ static int tracing_set_tracer(const char *buf)
                        goto out;
        }
+        current_trace = t;
        trace_branch_enable(tr);
 out:
        mutex_unlock(&trace_types_lock);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 9824419c8404..84b1e045faba 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -282,7 +282,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
        head = this_cpu_ptr(event_function.perf_events);
        perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
-                              1, &regs, head);
+                              1, &regs, head, NULL);
 #undef ENTRY_SIZE
 }
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index fdff65dff1bb..483162a9f908 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -13,6 +13,7 @@
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
+#include <linux/pstore.h>
 #include <linux/fs.h>
 #include "trace.h"
@@ -75,6 +76,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
        preempt_enable_notrace();
 }
+/* Our two options */
+enum {
+        TRACE_FUNC_OPT_STACK    = 0x1,
+        TRACE_FUNC_OPT_PSTORE   = 0x2,
+};
+static struct tracer_flags func_flags;
 static void
 function_trace_call(unsigned long ip, unsigned long parent_ip,
                    struct ftrace_ops *op, struct pt_regs *pt_regs)
@@ -100,6 +109,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
        disabled = atomic_inc_return(&data->disabled);
        if (likely(disabled == 1)) {
+                /*
+                 * So far tracing doesn't support multiple buffers, so
+                 * we make an explicit call for now.
+                 */
+                if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE))
+                        pstore_ftrace_call(ip, parent_ip);
                pc = preempt_count();
                trace_function(tr, ip, parent_ip, flags, pc);
        }
@@ -162,15 +177,13 @@ static struct ftrace_ops trace_stack_ops __read_mostly =
        .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
 };
-/* Our two options */
-enum {
-        TRACE_FUNC_OPT_STACK = 0x1,
-};
 static struct tracer_opt func_opts[] = {
 #ifdef CONFIG_STACKTRACE
        { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
 #endif
+#ifdef CONFIG_PSTORE_FTRACE
+        { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) },
+#endif
        { } /* Always set a last empty entry */
 };
@@ -208,10 +221,11 @@ static void tracing_stop_function_trace(void)
 static int func_set_flag(u32 old_flags, u32 bit, int set)
 {
-        if (bit == TRACE_FUNC_OPT_STACK) {
+        switch (bit) {
+        case TRACE_FUNC_OPT_STACK:
                /* do nothing if already set */
                if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
-                        return 0;
+                        break;
                if (set) {
                        unregister_ftrace_function(&trace_ops);
@@ -221,10 +235,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
                        register_ftrace_function(&trace_ops);
                }
-                return 0;
+                break;
+        case TRACE_FUNC_OPT_PSTORE:
+                break;
+        default:
+                return -EINVAL;
        }
-        return -EINVAL;
+        return 0;
 }
 static struct tracer function_trace __read_mostly =
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b31d3d5699fe..1a2117043bb1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
        store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
        head = this_cpu_ptr(call->perf_events);
-        perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+        perf_trace_buf_submit(entry, size, rctx,
+                                        entry->ip, 1, regs, head, NULL);
 }
 /* Kretprobe profile handler */
@@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
        store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
        head = this_cpu_ptr(call->perf_events);
-        perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
+        perf_trace_buf_submit(entry, size, rctx,
+                                        entry->ret_ip, 1, regs, head, NULL);
 }
 #endif  /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 96fc73369099..60e4d7875672 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -532,7 +532,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
                               (unsigned long *)&rec->args);
        head = this_cpu_ptr(sys_data->enter_event->perf_events);
-        perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+        perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
 }
 int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -608,7 +608,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
        rec->ret = syscall_get_return_value(current, regs);
        head = this_cpu_ptr(sys_data->exit_event->perf_events);
-        perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+        perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
 }
 int perf_sysexit_enable(struct ftrace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2b36ac68549e..03003cd7dd96 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
                call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
        head = this_cpu_ptr(call->perf_events);
-        perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+        perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
 out:
        preempt_enable();
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4b1dfba70f7c..69add8a9da68 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -575,7 +575,7 @@ out:
 /*
 * Create/destroy watchdog threads as CPUs come and go:
 */
-static int __cpuinit
+static int
 cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
        int hotcpu = (unsigned long)hcpu;
@@ -610,10 +610,27 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
        return NOTIFY_OK;
 }
-static struct notifier_block __cpuinitdata cpu_nfb = {
+static struct notifier_block cpu_nfb = {
        .notifier_call = cpu_callback
 };
+#ifdef CONFIG_SUSPEND
+/*
+ * On exit from suspend we force an offline->online transition on the boot CPU
+ * so that the PMU state that was lost while in suspended state gets set up
+ * properly for the boot CPU.  This information is required for restarting the
+ * NMI watchdog.
+ */
+void lockup_detector_bootcpu_resume(void)
+{
+        void *cpu = (void *)(long)smp_processor_id();
+        cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu);
+        cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu);
+        cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu);
+}
+#endif
 void __init lockup_detector_init(void)
 {
        void *cpu = (void *)(long)smp_processor_id();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9a3128dc67df..692d97628a10 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -45,32 +45,41 @@
 #include "workqueue_sched.h"
 enum {
-        /* global_cwq flags */
+        /*
-        GCWQ_MANAGE_WORKERS     = 1 << 0,       /* need to manage workers */
+         * global_cwq flags
-        GCWQ_MANAGING_WORKERS   = 1 << 1,       /* managing workers */
+         *
-        GCWQ_DISASSOCIATED      = 1 << 2,       /* cpu can't serve workers */
+         * A bound gcwq is either associated or disassociated with its CPU.
-        GCWQ_FREEZING           = 1 << 3,       /* freeze in progress */
+         * While associated (!DISASSOCIATED), all workers are bound to the
-        GCWQ_HIGHPRI_PENDING    = 1 << 4,       /* highpri works on queue */
+         * CPU and none has %WORKER_UNBOUND set and concurrency management
+         * is in effect.
+         *
+         * While DISASSOCIATED, the cpu may be offline and all workers have
+         * %WORKER_UNBOUND set and concurrency management disabled, and may
+         * be executing on any CPU.  The gcwq behaves as an unbound one.
+         *
+         * Note that DISASSOCIATED can be flipped only while holding
+         * managership of all pools on the gcwq to avoid changing binding
+         * state while create_worker() is in progress.
+         */
+        GCWQ_DISASSOCIATED      = 1 << 0,       /* cpu can't serve workers */
+        GCWQ_FREEZING           = 1 << 1,       /* freeze in progress */
+        /* pool flags */
+        POOL_MANAGE_WORKERS     = 1 << 0,       /* need to manage workers */
        /* worker flags */
        WORKER_STARTED          = 1 << 0,       /* started */
        WORKER_DIE              = 1 << 1,       /* die die die */
        WORKER_IDLE             = 1 << 2,       /* is idle */
        WORKER_PREP             = 1 << 3,       /* preparing to run works */
-        WORKER_ROGUE            = 1 << 4,       /* not bound to any cpu */
        WORKER_REBIND           = 1 << 5,       /* mom is home, come back */
        WORKER_CPU_INTENSIVE    = 1 << 6,       /* cpu intensive */
        WORKER_UNBOUND          = 1 << 7,       /* worker is unbound */
-        WORKER_NOT_RUNNING      = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND |
+        WORKER_NOT_RUNNING      = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND |
-                                  WORKER_CPU_INTENSIVE | WORKER_UNBOUND,
+                                  WORKER_CPU_INTENSIVE,
-        /* gcwq->trustee_state */
+        NR_WORKER_POOLS         = 2,            /* # worker pools per gcwq */
-        TRUSTEE_START           = 0,            /* start */
-        TRUSTEE_IN_CHARGE       = 1,            /* trustee in charge of gcwq */
-        TRUSTEE_BUTCHER         = 2,            /* butcher workers */
-        TRUSTEE_RELEASE         = 3,            /* release workers */
-        TRUSTEE_DONE            = 4,            /* trustee is done */
        BUSY_WORKER_HASH_ORDER  = 6,            /* 64 pointers */
        BUSY_WORKER_HASH_SIZE   = 1 << BUSY_WORKER_HASH_ORDER,
@@ -84,13 +93,13 @@ enum {
                                                   (min two ticks) */
        MAYDAY_INTERVAL         = HZ / 10,      /* and then every 100ms */
        CREATE_COOLDOWN         = HZ,           /* time to breathe after fail */
-        TRUSTEE_COOLDOWN        = HZ / 10,      /* for trustee draining */
        /*
         * Rescue workers are used only on emergencies and shared by
         * all cpus.  Give -20.
         */
        RESCUER_NICE_LEVEL      = -20,
+        HIGHPRI_NICE_LEVEL      = -20,
 };
 /*
@@ -115,6 +124,8 @@ enum {
 */
 struct global_cwq;
+struct worker_pool;
+struct idle_rebind;
 /*
 * The poor guys doing the actual heavy lifting.  All on-duty workers
@@ -131,12 +142,31 @@ struct worker {
        struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
        struct list_head        scheduled;      /* L: scheduled works */
        struct task_struct      *task;          /* I: worker task */
-        struct global_cwq       *gcwq;          /* I: the associated gcwq */
+        struct worker_pool      *pool;          /* I: the associated pool */
        /* 64 bytes boundary on 64bit, 32 on 32bit */
        unsigned long           last_active;    /* L: last active timestamp */
        unsigned int            flags;          /* X: flags */
        int                     id;             /* I: worker id */
-        struct work_struct      rebind_work;    /* L: rebind worker to cpu */
+        /* for rebinding worker to CPU */
+        struct idle_rebind      *idle_rebind;   /* L: for idle worker */
+        struct work_struct      rebind_work;    /* L: for busy worker */
+};
+struct worker_pool {
+        struct global_cwq       *gcwq;          /* I: the owning gcwq */
+        unsigned int            flags;          /* X: flags */
+        struct list_head        worklist;       /* L: list of pending works */
+        int                     nr_workers;     /* L: total number of workers */
+        int                     nr_idle;        /* L: currently idle ones */
+        struct list_head        idle_list;      /* X: list of idle workers */
+        struct timer_list       idle_timer;     /* L: worker idle timeout */
+        struct timer_list       mayday_timer;   /* L: SOS timer for workers */
+        struct mutex            manager_mutex;  /* mutex manager should hold */
+        struct ida              worker_ida;     /* L: for worker IDs */
 };
 /*
@@ -146,27 +176,16 @@ struct worker {
 */
 struct global_cwq {
        spinlock_t              lock;           /* the gcwq lock */
-        struct list_head        worklist;       /* L: list of pending works */
        unsigned int            cpu;            /* I: the associated cpu */
        unsigned int            flags;          /* L: GCWQ_* flags */
-        int                     nr_workers;     /* L: total number of workers */
+        /* workers are chained either in busy_hash or pool idle_list */
-        int                     nr_idle;        /* L: currently idle ones */
-        /* workers are chained either in the idle_list or busy_hash */
-        struct list_head        idle_list;      /* X: list of idle workers */
        struct hlist_head       busy_hash[BUSY_WORKER_HASH_SIZE];
                                                /* L: hash of busy workers */
-        struct timer_list       idle_timer;     /* L: worker idle timeout */
+        struct worker_pool      pools[2];       /* normal and highpri pools */
-        struct timer_list       mayday_timer;   /* L: SOS timer for dworkers */
-        struct ida              worker_ida;     /* L: for worker IDs */
-        struct task_struct      *trustee;       /* L: for gcwq shutdown */
+        wait_queue_head_t       rebind_hold;    /* rebind hold wait */
-        unsigned int            trustee_state;  /* L: trustee state */
-        wait_queue_head_t       trustee_wait;   /* trustee wait */
-        struct worker           *first_idle;    /* L: first idle worker */
 } ____cacheline_aligned_in_smp;
 /*
@@ -175,7 +194,7 @@ struct global_cwq {
 * aligned at two's power of the number of flag bits.
 */
 struct cpu_workqueue_struct {
-        struct global_cwq       *gcwq;          /* I: the associated gcwq */
+        struct worker_pool      *pool;          /* I: the associated pool */
        struct workqueue_struct *wq;            /* I: the owning workqueue */
        int                     work_color;     /* L: current color */
        int                     flush_color;    /* L: flushing color */
@@ -264,6 +283,10 @@ EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
 #define CREATE_TRACE_POINTS
 #include <trace/events/workqueue.h>
+#define for_each_worker_pool(pool, gcwq)                                \
+        for ((pool) = &(gcwq)->pools[0];                                \
+             (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
 #define for_each_busy_worker(worker, i, pos, gcwq)                      \
        for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)                     \
                hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -444,7 +467,7 @@ static bool workqueue_freezing;		/* W: have wqs started freezing? */
 * try_to_wake_up().  Put it in a separate cacheline.
 */
 static DEFINE_PER_CPU(struct global_cwq, global_cwq);
-static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
+static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);
 /*
 * Global cpu workqueue and nr_running counter for unbound gcwq.  The
@@ -452,10 +475,17 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running);
 * workers have WORKER_UNBOUND set.
 */
 static struct global_cwq unbound_global_cwq;
-static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0);       /* always 0 */
+static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
+        [0 ... NR_WORKER_POOLS - 1]     = ATOMIC_INIT(0),       /* always 0 */
+};
 static int worker_thread(void *__worker);
+static int worker_pool_pri(struct worker_pool *pool)
+{
+        return pool - pool->gcwq->pools;
+}
 static struct global_cwq *get_gcwq(unsigned int cpu)
 {
        if (cpu != WORK_CPU_UNBOUND)
@@ -464,12 +494,15 @@ static struct global_cwq *get_gcwq(unsigned int cpu)
                return &unbound_global_cwq;
 }
-static atomic_t *get_gcwq_nr_running(unsigned int cpu)
+static atomic_t *get_pool_nr_running(struct worker_pool *pool)
 {
+        int cpu = pool->gcwq->cpu;
+        int idx = worker_pool_pri(pool);
        if (cpu != WORK_CPU_UNBOUND)
-                return &per_cpu(gcwq_nr_running, cpu);
+                return &per_cpu(pool_nr_running, cpu)[idx];
        else
-                return &unbound_gcwq_nr_running;
+                return &unbound_pool_nr_running[idx];
 }
 static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
@@ -555,7 +588,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
        if (data & WORK_STRUCT_CWQ)
                return ((struct cpu_workqueue_struct *)
-                        (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq;
+                        (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
        cpu = data >> WORK_STRUCT_FLAG_BITS;
        if (cpu == WORK_CPU_NONE)
@@ -566,60 +599,62 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)
 }
 /*
- * Policy functions.  These define the policies on how the global
+ * Policy functions.  These define the policies on how the global worker
- * worker pool is managed.  Unless noted otherwise, these functions
+ * pools are managed.  Unless noted otherwise, these functions assume that
- * assume that they're being called with gcwq->lock held.
+ * they're being called with gcwq->lock held.
 */
-static bool __need_more_worker(struct global_cwq *gcwq)
+static bool __need_more_worker(struct worker_pool *pool)
 {
-        return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) ||
+        return !atomic_read(get_pool_nr_running(pool));
-                gcwq->flags & GCWQ_HIGHPRI_PENDING;
 }
 /*
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
+ *
+ * Note that, because unbound workers never contribute to nr_running, this
+ * function will always return %true for unbound gcwq as long as the
+ * worklist isn't empty.
 */
-static bool need_more_worker(struct global_cwq *gcwq)
+static bool need_more_worker(struct worker_pool *pool)
 {
-        return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq);
+        return !list_empty(&pool->worklist) && __need_more_worker(pool);
 }
 /* Can I start working?  Called from busy but !running workers. */
-static bool may_start_working(struct global_cwq *gcwq)
+static bool may_start_working(struct worker_pool *pool)
 {
-        return gcwq->nr_idle;
+        return pool->nr_idle;
 }
 /* Do I need to keep working?  Called from currently running workers. */
-static bool keep_working(struct global_cwq *gcwq)
+static bool keep_working(struct worker_pool *pool)
 {
-        atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+        atomic_t *nr_running = get_pool_nr_running(pool);
-        return !list_empty(&gcwq->worklist) &&
+        return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
-                (atomic_read(nr_running) <= 1 ||
-                 gcwq->flags & GCWQ_HIGHPRI_PENDING);
 }
 /* Do we need a new worker?  Called from manager. */
-static bool need_to_create_worker(struct global_cwq *gcwq)
+static bool need_to_create_worker(struct worker_pool *pool)
 {
-        return need_more_worker(gcwq) && !may_start_working(gcwq);
+        return need_more_worker(pool) && !may_start_working(pool);
 }
 /* Do I need to be the manager? */
-static bool need_to_manage_workers(struct global_cwq *gcwq)
+static bool need_to_manage_workers(struct worker_pool *pool)
 {
-        return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS;
+        return need_to_create_worker(pool) ||
+                (pool->flags & POOL_MANAGE_WORKERS);
 }
 /* Do we have too many workers and should some go away? */
-static bool too_many_workers(struct global_cwq *gcwq)
+static bool too_many_workers(struct worker_pool *pool)
 {
-        bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS;
+        bool managing = mutex_is_locked(&pool->manager_mutex);
-        int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */
+        int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
-        int nr_busy = gcwq->nr_workers - nr_idle;
+        int nr_busy = pool->nr_workers - nr_idle;
        return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
 }
@@ -629,26 +664,26 @@ static bool too_many_workers(struct global_cwq *gcwq)
 */
 /* Return the first worker.  Safe with preemption disabled */
-static struct worker *first_worker(struct global_cwq *gcwq)
+static struct worker *first_worker(struct worker_pool *pool)
 {
-        if (unlikely(list_empty(&gcwq->idle_list)))
+        if (unlikely(list_empty(&pool->idle_list)))
                return NULL;
-        return list_first_entry(&gcwq->idle_list, struct worker, entry);
+        return list_first_entry(&pool->idle_list, struct worker, entry);
 }
 /**
 * wake_up_worker - wake up an idle worker
- * @gcwq: gcwq to wake worker for
+ * @pool: worker pool to wake worker from
 *
- * Wake up the first idle worker of @gcwq.
+ * Wake up the first idle worker of @pool.
 *
 * CONTEXT:
 * spin_lock_irq(gcwq->lock).
 */
-static void wake_up_worker(struct global_cwq *gcwq)
+static void wake_up_worker(struct worker_pool *pool)
 {
-        struct worker *worker = first_worker(gcwq);
+        struct worker *worker = first_worker(pool);
        if (likely(worker))
                wake_up_process(worker->task);
@@ -670,7 +705,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
        struct worker *worker = kthread_data(task);
        if (!(worker->flags & WORKER_NOT_RUNNING))
-                atomic_inc(get_gcwq_nr_running(cpu));
+                atomic_inc(get_pool_nr_running(worker->pool));
 }
 /**
@@ -692,8 +727,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
                                       unsigned int cpu)
 {
        struct worker *worker = kthread_data(task), *to_wakeup = NULL;
-        struct global_cwq *gcwq = get_gcwq(cpu);
+        struct worker_pool *pool = worker->pool;
-        atomic_t *nr_running = get_gcwq_nr_running(cpu);
+        atomic_t *nr_running = get_pool_nr_running(pool);
        if (worker->flags & WORKER_NOT_RUNNING)
                return NULL;
@@ -706,14 +741,14 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
         * worklist not empty test sequence is in insert_work().
         * Please read comment there.
         *
-         * NOT_RUNNING is clear.  This means that trustee is not in
+         * NOT_RUNNING is clear.  This means that we're bound to and
-         * charge and we're running on the local cpu w/ rq lock held
+         * running on the local cpu w/ rq lock held and preemption
-         * and preemption disabled, which in turn means that none else
+         * disabled, which in turn means that none else could be
-         * could be manipulating idle_list, so dereferencing idle_list
+         * manipulating idle_list, so dereferencing idle_list without gcwq
-         * without gcwq lock is safe.
+         * lock is safe.
         */
-        if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist))
+        if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
-                to_wakeup = first_worker(gcwq);
+                to_wakeup = first_worker(pool);
        return to_wakeup ? to_wakeup->task : NULL;
 }
@@ -733,7 +768,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
 static inline void worker_set_flags(struct worker *worker, unsigned int flags,
                                    bool wakeup)
 {
-        struct global_cwq *gcwq = worker->gcwq;
+        struct worker_pool *pool = worker->pool;
        WARN_ON_ONCE(worker->task != current);
@@ -744,12 +779,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
         */
        if ((flags & WORKER_NOT_RUNNING) &&
            !(worker->flags & WORKER_NOT_RUNNING)) {
-                atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
+                atomic_t *nr_running = get_pool_nr_running(pool);
                if (wakeup) {
                        if (atomic_dec_and_test(nr_running) &&
-                            !list_empty(&gcwq->worklist))
+                            !list_empty(&pool->worklist))
-                                wake_up_worker(gcwq);
+                                wake_up_worker(pool);
                } else
                        atomic_dec(nr_running);
        }
@@ -769,7 +804,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
 */
 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 {
-        struct global_cwq *gcwq = worker->gcwq;
+        struct worker_pool *pool = worker->pool;
        unsigned int oflags = worker->flags;
        WARN_ON_ONCE(worker->task != current);
@@ -783,7 +818,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
         */
        if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
                if (!(worker->flags & WORKER_NOT_RUNNING))
-                        atomic_inc(get_gcwq_nr_running(gcwq->cpu));
+                        atomic_inc(get_pool_nr_running(pool));
 }
 /**
@@ -867,43 +902,6 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
 }
 /**
- * gcwq_determine_ins_pos - find insertion position
- * @gcwq: gcwq of interest
- * @cwq: cwq a work is being queued for
- *
- * A work for @cwq is about to be queued on @gcwq, determine insertion
- * position for the work.  If @cwq is for HIGHPRI wq, the work is
- * queued at the head of the queue but in FIFO order with respect to
- * other HIGHPRI works; otherwise, at the end of the queue.  This
- * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that
- * there are HIGHPRI works pending.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- *
- * RETURNS:
- * Pointer to inserstion position.
- */
-static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq,
-                                               struct cpu_workqueue_struct *cwq)
-{
-        struct work_struct *twork;
-        if (likely(!(cwq->wq->flags & WQ_HIGHPRI)))
-                return &gcwq->worklist;
-        list_for_each_entry(twork, &gcwq->worklist, entry) {
-                struct cpu_workqueue_struct *tcwq = get_work_cwq(twork);
-                if (!(tcwq->wq->flags & WQ_HIGHPRI))
-                        break;
-        }
-        gcwq->flags |= GCWQ_HIGHPRI_PENDING;
-        return &twork->entry;
-}
-/**
 * insert_work - insert a work into gcwq
 * @cwq: cwq @work belongs to
 * @work: work to insert
@@ -920,7 +918,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
                        struct work_struct *work, struct list_head *head,
                        unsigned int extra_flags)
 {
-        struct global_cwq *gcwq = cwq->gcwq;
+        struct worker_pool *pool = cwq->pool;
        /* we own @work, set data and link */
        set_work_cwq(work, cwq, extra_flags);
@@ -940,8 +938,8 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
         */
        smp_mb();
-        if (__need_more_worker(gcwq))
+        if (__need_more_worker(pool))
-                wake_up_worker(gcwq);
+                wake_up_worker(pool);
 }
 /*
@@ -1043,7 +1041,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
        if (likely(cwq->nr_active < cwq->max_active)) {
                trace_workqueue_activate_work(work);
                cwq->nr_active++;
-                worklist = gcwq_determine_ins_pos(gcwq, cwq);
+                worklist = &cwq->pool->worklist;
        } else {
                work_flags |= WORK_STRUCT_DELAYED;
                worklist = &cwq->delayed_works;
@@ -1192,7 +1190,8 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 */
 static void worker_enter_idle(struct worker *worker)
 {
-        struct global_cwq *gcwq = worker->gcwq;
+        struct worker_pool *pool = worker->pool;
+        struct global_cwq *gcwq = pool->gcwq;
        BUG_ON(worker->flags & WORKER_IDLE);
        BUG_ON(!list_empty(&worker->entry) &&
@@ -1200,27 +1199,24 @@ static void worker_enter_idle(struct worker *worker)
        /* can't use worker_set_flags(), also called from start_worker() */
        worker->flags |= WORKER_IDLE;
-        gcwq->nr_idle++;
+        pool->nr_idle++;
        worker->last_active = jiffies;
        /* idle_list is LIFO */
-        list_add(&worker->entry, &gcwq->idle_list);
+        list_add(&worker->entry, &pool->idle_list);
-        if (likely(!(worker->flags & WORKER_ROGUE))) {
+        if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
-                if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer))
+                mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
-                        mod_timer(&gcwq->idle_timer,
-                                  jiffies + IDLE_WORKER_TIMEOUT);
-        } else
-                wake_up_all(&gcwq->trustee_wait);
        /*
-         * Sanity check nr_running.  Because trustee releases gcwq->lock
+         * Sanity check nr_running.  Because gcwq_unbind_fn() releases
-         * between setting %WORKER_ROGUE and zapping nr_running, the
+         * gcwq->lock between setting %WORKER_UNBOUND and zapping
-         * warning may trigger spuriously.  Check iff trustee is idle.
+         * nr_running, the warning may trigger spuriously.  Check iff
+         * unbind is not in progress.
         */
-        WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
+        WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
-                     gcwq->nr_workers == gcwq->nr_idle &&
+                     pool->nr_workers == pool->nr_idle &&
-                     atomic_read(get_gcwq_nr_running(gcwq->cpu)));
+                     atomic_read(get_pool_nr_running(pool)));
 }
 /**
@@ -1234,11 +1230,11 @@ static void worker_enter_idle(struct worker *worker)
 */
 static void worker_leave_idle(struct worker *worker)
 {
-        struct global_cwq *gcwq = worker->gcwq;
+        struct worker_pool *pool = worker->pool;
        BUG_ON(!(worker->flags & WORKER_IDLE));
        worker_clr_flags(worker, WORKER_IDLE);
-        gcwq->nr_idle--;
+        pool->nr_idle--;
        list_del_init(&worker->entry);
 }
@@ -1258,11 +1254,11 @@ static void worker_leave_idle(struct worker *worker)
 * verbatim as it's best effort and blocking and gcwq may be
 * [dis]associated in the meantime.
 *
- * This function tries set_cpus_allowed() and locks gcwq and verifies
+ * This function tries set_cpus_allowed() and locks gcwq and verifies the
- * the binding against GCWQ_DISASSOCIATED which is set during
+ * binding against %GCWQ_DISASSOCIATED which is set during
- * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters
+ * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
- * idle state or fetches works without dropping lock, it can guarantee
+ * enters idle state or fetches works without dropping lock, it can
- * the scheduling requirement described in the first paragraph.
+ * guarantee the scheduling requirement described in the first paragraph.
 *
 * CONTEXT:
 * Might sleep.  Called without any lock but returns with gcwq->lock
@@ -1275,7 +1271,7 @@ static void worker_leave_idle(struct worker *worker)
 static bool worker_maybe_bind_and_lock(struct worker *worker)
 __acquires(&gcwq->lock)
 {
-        struct global_cwq *gcwq = worker->gcwq;
+        struct global_cwq *gcwq = worker->pool->gcwq;
        struct task_struct *task = worker->task;
        while (true) {
@@ -1308,16 +1304,40 @@ __acquires(&gcwq->lock)
        }
 }
+struct idle_rebind {
+        int                     cnt;            /* # workers to be rebound */
+        struct completion       done;           /* all workers rebound */
+};
+/*
+ * Rebind an idle @worker to its CPU.  During CPU onlining, this has to
+ * happen synchronously for idle workers.  worker_thread() will test
+ * %WORKER_REBIND before leaving idle and call this function.
+ */
+static void idle_worker_rebind(struct worker *worker)
+{
+        struct global_cwq *gcwq = worker->pool->gcwq;
+        /* CPU must be online at this point */
+        WARN_ON(!worker_maybe_bind_and_lock(worker));
+        if (!--worker->idle_rebind->cnt)
+                complete(&worker->idle_rebind->done);
+        spin_unlock_irq(&worker->pool->gcwq->lock);
+        /* we did our part, wait for rebind_workers() to finish up */
+        wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
+}
 /*
- * Function for worker->rebind_work used to rebind rogue busy workers
+ * Function for @worker->rebind_work used to rebind unbound busy workers to
- * to the associated cpu which is coming back online.  This is
+ * the associated cpu which is coming back online.  This is scheduled by
- * scheduled by cpu up but can race with other cpu hotplug operations
+ * cpu up but can race with other cpu hotplug operations and may be
- * and may be executed twice without intervening cpu down.
+ * executed twice without intervening cpu down.
 */
-static void worker_rebind_fn(struct work_struct *work)
+static void busy_worker_rebind_fn(struct work_struct *work)
 {
        struct worker *worker = container_of(work, struct worker, rebind_work);
-        struct global_cwq *gcwq = worker->gcwq;
+        struct global_cwq *gcwq = worker->pool->gcwq;
        if (worker_maybe_bind_and_lock(worker))
                worker_clr_flags(worker, WORKER_REBIND);
@@ -1325,6 +1345,112 @@ static void worker_rebind_fn(struct work_struct *work)
        spin_unlock_irq(&gcwq->lock);
 }
+/**
+ * rebind_workers - rebind all workers of a gcwq to the associated CPU
+ * @gcwq: gcwq of interest
+ *
+ * @gcwq->cpu is coming online.  Rebind all workers to the CPU.  Rebinding
+ * is different for idle and busy ones.
+ *
+ * The idle ones should be rebound synchronously and idle rebinding should
+ * be complete before any worker starts executing work items with
+ * concurrency management enabled; otherwise, scheduler may oops trying to
+ * wake up non-local idle worker from wq_worker_sleeping().
+ *
+ * This is achieved by repeatedly requesting rebinding until all idle
+ * workers are known to have been rebound under @gcwq->lock and holding all
+ * idle workers from becoming busy until idle rebinding is complete.
+ *
+ * Once idle workers are rebound, busy workers can be rebound as they
+ * finish executing their current work items.  Queueing the rebind work at
+ * the head of their scheduled lists is enough.  Note that nr_running will
+ * be properly bumped as busy workers rebind.
+ *
+ * On return, all workers are guaranteed to either be bound or have rebind
+ * work item scheduled.
+ */
+static void rebind_workers(struct global_cwq *gcwq)
+        __releases(&gcwq->lock) __acquires(&gcwq->lock)
+{
+        struct idle_rebind idle_rebind;
+        struct worker_pool *pool;
+        struct worker *worker;
+        struct hlist_node *pos;
+        int i;
+        lockdep_assert_held(&gcwq->lock);
+        for_each_worker_pool(pool, gcwq)
+                lockdep_assert_held(&pool->manager_mutex);
+        /*
+         * Rebind idle workers.  Interlocked both ways.  We wait for
+         * workers to rebind via @idle_rebind.done.  Workers will wait for
+         * us to finish up by watching %WORKER_REBIND.
+         */
+        init_completion(&idle_rebind.done);
+retry:
+        idle_rebind.cnt = 1;
+        INIT_COMPLETION(idle_rebind.done);
+        /* set REBIND and kick idle ones, we'll wait for these later */
+        for_each_worker_pool(pool, gcwq) {
+                list_for_each_entry(worker, &pool->idle_list, entry) {
+                        if (worker->flags & WORKER_REBIND)
+                                continue;
+                        /* morph UNBOUND to REBIND */
+                        worker->flags &= ~WORKER_UNBOUND;
+                        worker->flags |= WORKER_REBIND;
+                        idle_rebind.cnt++;
+                        worker->idle_rebind = &idle_rebind;
+                        /* worker_thread() will call idle_worker_rebind() */
+                        wake_up_process(worker->task);
+                }
+        }
+        if (--idle_rebind.cnt) {
+                spin_unlock_irq(&gcwq->lock);
+                wait_for_completion(&idle_rebind.done);
+                spin_lock_irq(&gcwq->lock);
+                /* busy ones might have become idle while waiting, retry */
+                goto retry;
+        }
+        /*
+         * All idle workers are rebound and waiting for %WORKER_REBIND to
+         * be cleared inside idle_worker_rebind().  Clear and release.
+         * Clearing %WORKER_REBIND from this foreign context is safe
+         * because these workers are still guaranteed to be idle.
+         */
+        for_each_worker_pool(pool, gcwq)
+                list_for_each_entry(worker, &pool->idle_list, entry)
+                        worker->flags &= ~WORKER_REBIND;
+        wake_up_all(&gcwq->rebind_hold);
+        /* rebind busy workers */
+        for_each_busy_worker(worker, i, pos, gcwq) {
+                struct work_struct *rebind_work = &worker->rebind_work;
+                /* morph UNBOUND to REBIND */
+                worker->flags &= ~WORKER_UNBOUND;
+                worker->flags |= WORKER_REBIND;
+                if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
+                                     work_data_bits(rebind_work)))
+                        continue;
+                /* wq doesn't matter, use the default one */
+                debug_work_activate(rebind_work);
+                insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
+                            worker->scheduled.next,
+                            work_color_to_flags(WORK_NO_COLOR));
+        }
+}
 static struct worker *alloc_worker(void)
 {
        struct worker *worker;
@@ -1333,7 +1459,7 @@ static struct worker *alloc_worker(void)
        if (worker) {
                INIT_LIST_HEAD(&worker->entry);
                INIT_LIST_HEAD(&worker->scheduled);
-                INIT_WORK(&worker->rebind_work, worker_rebind_fn);
+                INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
                /* on creation a worker is in !idle && prep state */
                worker->flags = WORKER_PREP;
        }
@@ -1342,10 +1468,9 @@ static struct worker *alloc_worker(void)
 /**
 * create_worker - create a new workqueue worker
- * @gcwq: gcwq the new worker will belong to
+ * @pool: pool the new worker will belong to
- * @bind: whether to set affinity to @cpu or not
 *
- * Create a new worker which is bound to @gcwq.  The returned worker
+ * Create a new worker which is bound to @pool.  The returned worker
 * can be started by calling start_worker() or destroyed using
 * destroy_worker().
 *
@@ -1355,16 +1480,17 @@ static struct worker *alloc_worker(void)
 * RETURNS:
 * Pointer to the newly created worker.
 */
-static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
+static struct worker *create_worker(struct worker_pool *pool)
 {
-        bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND;
+        struct global_cwq *gcwq = pool->gcwq;
+        const char *pri = worker_pool_pri(pool) ? "H" : "";
        struct worker *worker = NULL;
        int id = -1;
        spin_lock_irq(&gcwq->lock);
-        while (ida_get_new(&gcwq->worker_ida, &id)) {
+        while (ida_get_new(&pool->worker_ida, &id)) {
                spin_unlock_irq(&gcwq->lock);
-                if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL))
+                if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
                        goto fail;
                spin_lock_irq(&gcwq->lock);
        }
@@ -1374,38 +1500,43 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
        if (!worker)
                goto fail;
-        worker->gcwq = gcwq;
+        worker->pool = pool;
        worker->id = id;
-        if (!on_unbound_cpu)
+        if (gcwq->cpu != WORK_CPU_UNBOUND)
                worker->task = kthread_create_on_node(worker_thread,
-                                                      worker,
+                                        worker, cpu_to_node(gcwq->cpu),
-                                                      cpu_to_node(gcwq->cpu),
+                                        "kworker/%u:%d%s", gcwq->cpu, id, pri);
-                                                      "kworker/%u:%d", gcwq->cpu, id);
        else
                worker->task = kthread_create(worker_thread, worker,
-                                              "kworker/u:%d", id);
+                                              "kworker/u:%d%s", id, pri);
        if (IS_ERR(worker->task))
                goto fail;
+        if (worker_pool_pri(pool))
+                set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
        /*
-         * A rogue worker will become a regular one if CPU comes
+         * Determine CPU binding of the new worker depending on
-         * online later on.  Make sure every worker has
+         * %GCWQ_DISASSOCIATED.  The caller is responsible for ensuring the
-         * PF_THREAD_BOUND set.
+         * flag remains stable across this function.  See the comments
+         * above the flag definition for details.
+         *
+         * As an unbound worker may later become a regular one if CPU comes
+         * online, make sure every worker has %PF_THREAD_BOUND set.
         */
-        if (bind && !on_unbound_cpu)
+        if (!(gcwq->flags & GCWQ_DISASSOCIATED)) {
                kthread_bind(worker->task, gcwq->cpu);
-        else {
+        } else {
                worker->task->flags |= PF_THREAD_BOUND;
-                if (on_unbound_cpu)
+                worker->flags |= WORKER_UNBOUND;
-                        worker->flags |= WORKER_UNBOUND;
        }
        return worker;
 fail:
        if (id >= 0) {
                spin_lock_irq(&gcwq->lock);
-                ida_remove(&gcwq->worker_ida, id);
+                ida_remove(&pool->worker_ida, id);
                spin_unlock_irq(&gcwq->lock);
        }
        kfree(worker);
@@ -1424,7 +1555,7 @@ fail:
 static void start_worker(struct worker *worker)
 {
        worker->flags |= WORKER_STARTED;
-        worker->gcwq->nr_workers++;
+        worker->pool->nr_workers++;
        worker_enter_idle(worker);
        wake_up_process(worker->task);
 }
@@ -1440,7 +1571,8 @@ static void start_worker(struct worker *worker)
 */
 static void destroy_worker(struct worker *worker)
 {
-        struct global_cwq *gcwq = worker->gcwq;
+        struct worker_pool *pool = worker->pool;
+        struct global_cwq *gcwq = pool->gcwq;
        int id = worker->id;
        /* sanity check frenzy */
@@ -1448,9 +1580,9 @@ static void destroy_worker(struct worker *worker)
        BUG_ON(!list_empty(&worker->scheduled));
        if (worker->flags & WORKER_STARTED)
-                gcwq->nr_workers--;
+                pool->nr_workers--;
        if (worker->flags & WORKER_IDLE)
-                gcwq->nr_idle--;
+                pool->nr_idle--;
        list_del_init(&worker->entry);
        worker->flags |= WORKER_DIE;
@@ -1461,29 +1593,30 @@ static void destroy_worker(struct worker *worker)
        kfree(worker);
        spin_lock_irq(&gcwq->lock);
-        ida_remove(&gcwq->worker_ida, id);
+        ida_remove(&pool->worker_ida, id);
 }
-static void idle_worker_timeout(unsigned long __gcwq)
+static void idle_worker_timeout(unsigned long __pool)
 {
-        struct global_cwq *gcwq = (void *)__gcwq;
+        struct worker_pool *pool = (void *)__pool;
+        struct global_cwq *gcwq = pool->gcwq;
        spin_lock_irq(&gcwq->lock);
-        if (too_many_workers(gcwq)) {
+        if (too_many_workers(pool)) {
                struct worker *worker;
                unsigned long expires;
                /* idle_list is kept in LIFO order, check the last one */
-                worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+                worker = list_entry(pool->idle_list.prev, struct worker, entry);
                expires = worker->last_active + IDLE_WORKER_TIMEOUT;
                if (time_before(jiffies, expires))
-                        mod_timer(&gcwq->idle_timer, expires);
+                        mod_timer(&pool->idle_timer, expires);
                else {
                        /* it's been idle for too long, wake up manager */
-                        gcwq->flags |= GCWQ_MANAGE_WORKERS;
+                        pool->flags |= POOL_MANAGE_WORKERS;
-                        wake_up_worker(gcwq);
+                        wake_up_worker(pool);
                }
        }
@@ -1500,7 +1633,7 @@ static bool send_mayday(struct work_struct *work)
                return false;
        /* mayday mayday mayday */
-        cpu = cwq->gcwq->cpu;
+        cpu = cwq->pool->gcwq->cpu;
        /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
        if (cpu == WORK_CPU_UNBOUND)
                cpu = 0;
@@ -1509,37 +1642,38 @@ static bool send_mayday(struct work_struct *work)
        return true;
 }
-static void gcwq_mayday_timeout(unsigned long __gcwq)
+static void gcwq_mayday_timeout(unsigned long __pool)
 {
-        struct global_cwq *gcwq = (void *)__gcwq;
+        struct worker_pool *pool = (void *)__pool;
+        struct global_cwq *gcwq = pool->gcwq;
        struct work_struct *work;
        spin_lock_irq(&gcwq->lock);
-        if (need_to_create_worker(gcwq)) {
+        if (need_to_create_worker(pool)) {
                /*
                 * We've been trying to create a new worker but
                 * haven't been successful.  We might be hitting an
                 * allocation deadlock.  Send distress signals to
                 * rescuers.
                 */
-                list_for_each_entry(work, &gcwq->worklist, entry)
+                list_for_each_entry(work, &pool->worklist, entry)
                        send_mayday(work);
        }
        spin_unlock_irq(&gcwq->lock);
-        mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL);
+        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
 }
 /**
 * maybe_create_worker - create a new worker if necessary
- * @gcwq: gcwq to create a new worker for
+ * @pool: pool to create a new worker for
 *
- * Create a new worker for @gcwq if necessary.  @gcwq is guaranteed to
+ * Create a new worker for @pool if necessary.  @pool is guaranteed to
 * have at least one idle worker on return from this function.  If
 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
- * sent to all rescuers with works scheduled on @gcwq to resolve
+ * sent to all rescuers with works scheduled on @pool to resolve
 * possible allocation deadlock.
 *
 * On return, need_to_create_worker() is guaranteed to be false and
@@ -1554,52 +1688,54 @@ static void gcwq_mayday_timeout(unsigned long __gcwq)
 * false if no action was taken and gcwq->lock stayed locked, true
 * otherwise.
 */
-static bool maybe_create_worker(struct global_cwq *gcwq)
+static bool maybe_create_worker(struct worker_pool *pool)
 __releases(&gcwq->lock)
 __acquires(&gcwq->lock)
 {
-        if (!need_to_create_worker(gcwq))
+        struct global_cwq *gcwq = pool->gcwq;
+        if (!need_to_create_worker(pool))
                return false;
 restart:
        spin_unlock_irq(&gcwq->lock);
        /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
-        mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
+        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
        while (true) {
                struct worker *worker;
-                worker = create_worker(gcwq, true);
+                worker = create_worker(pool);
                if (worker) {
-                        del_timer_sync(&gcwq->mayday_timer);
+                        del_timer_sync(&pool->mayday_timer);
                        spin_lock_irq(&gcwq->lock);
                        start_worker(worker);
-                        BUG_ON(need_to_create_worker(gcwq));
+                        BUG_ON(need_to_create_worker(pool));
                        return true;
                }
-                if (!need_to_create_worker(gcwq))
+                if (!need_to_create_worker(pool))
                        break;
                __set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(CREATE_COOLDOWN);
-                if (!need_to_create_worker(gcwq))
+                if (!need_to_create_worker(pool))
                        break;
        }
-        del_timer_sync(&gcwq->mayday_timer);
+        del_timer_sync(&pool->mayday_timer);
        spin_lock_irq(&gcwq->lock);
-        if (need_to_create_worker(gcwq))
+        if (need_to_create_worker(pool))
                goto restart;
        return true;
 }
 /**
 * maybe_destroy_worker - destroy workers which have been idle for a while
- * @gcwq: gcwq to destroy workers for
+ * @pool: pool to destroy workers for
 *
- * Destroy @gcwq workers which have been idle for longer than
+ * Destroy @pool workers which have been idle for longer than
 * IDLE_WORKER_TIMEOUT.
 *
 * LOCKING:
@@ -1610,19 +1746,19 @@ restart:
 * false if no action was taken and gcwq->lock stayed locked, true
 * otherwise.
 */
-static bool maybe_destroy_workers(struct global_cwq *gcwq)
+static bool maybe_destroy_workers(struct worker_pool *pool)
 {
        bool ret = false;
-        while (too_many_workers(gcwq)) {
+        while (too_many_workers(pool)) {
                struct worker *worker;
                unsigned long expires;
-                worker = list_entry(gcwq->idle_list.prev, struct worker, entry);
+                worker = list_entry(pool->idle_list.prev, struct worker, entry);
                expires = worker->last_active + IDLE_WORKER_TIMEOUT;
                if (time_before(jiffies, expires)) {
-                        mod_timer(&gcwq->idle_timer, expires);
+                        mod_timer(&pool->idle_timer, expires);
                        break;
                }
@@ -1655,31 +1791,22 @@ static bool maybe_destroy_workers(struct global_cwq *gcwq)
 */
 static bool manage_workers(struct worker *worker)
 {
-        struct global_cwq *gcwq = worker->gcwq;
+        struct worker_pool *pool = worker->pool;
        bool ret = false;
-        if (gcwq->flags & GCWQ_MANAGING_WORKERS)
+        if (!mutex_trylock(&pool->manager_mutex))
                return ret;
-        gcwq->flags &= ~GCWQ_MANAGE_WORKERS;
+        pool->flags &= ~POOL_MANAGE_WORKERS;
-        gcwq->flags |= GCWQ_MANAGING_WORKERS;
        /*
         * Destroy and then create so that may_start_working() is true
         * on return.
         */
-        ret |= maybe_destroy_workers(gcwq);
+        ret |= maybe_destroy_workers(pool);
-        ret |= maybe_create_worker(gcwq);
+        ret |= maybe_create_worker(pool);
-        gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
-        /*
-         * The trustee might be waiting to take over the manager
-         * position, tell it we're done.
-         */
-        if (unlikely(gcwq->trustee))
-                wake_up_all(&gcwq->trustee_wait);
+        mutex_unlock(&pool->manager_mutex);
        return ret;
 }
@@ -1728,10 +1855,9 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
 {
        struct work_struct *work = list_first_entry(&cwq->delayed_works,
                                                    struct work_struct, entry);
-        struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
        trace_workqueue_activate_work(work);
-        move_linked_works(work, pos, NULL);
+        move_linked_works(work, &cwq->pool->worklist, NULL);
        __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
        cwq->nr_active++;
 }
@@ -1804,7 +1930,8 @@ __releases(&gcwq->lock)
 __acquires(&gcwq->lock)
 {
        struct cpu_workqueue_struct *cwq = get_work_cwq(work);
-        struct global_cwq *gcwq = cwq->gcwq;
+        struct worker_pool *pool = worker->pool;
+        struct global_cwq *gcwq = pool->gcwq;
        struct hlist_head *bwh = busy_worker_head(gcwq, work);
        bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
        work_func_t f = work->func;
@@ -1823,6 +1950,15 @@ __acquires(&gcwq->lock)
        lockdep_copy_map(&lockdep_map, &work->lockdep_map);
 #endif
        /*
+         * Ensure we're on the correct CPU.  DISASSOCIATED test is
+         * necessary to avoid spurious warnings from rescuers servicing the
+         * unbound or a disassociated gcwq.
+         */
+        WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) &&
+                     !(gcwq->flags & GCWQ_DISASSOCIATED) &&
+                     raw_smp_processor_id() != gcwq->cpu);
+        /*
         * A single work shouldn't be executed concurrently by
         * multiple workers on a single cpu.  Check whether anyone is
         * already processing the work.  If so, defer the work to the
@@ -1846,27 +1982,19 @@ __acquires(&gcwq->lock)
        list_del_init(&work->entry);
        /*
-         * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
-         * wake up another worker; otherwise, clear HIGHPRI_PENDING.
-         */
-        if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
-                struct work_struct *nwork = list_first_entry(&gcwq->worklist,
-                                                struct work_struct, entry);
-                if (!list_empty(&gcwq->worklist) &&
-                    get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
-                        wake_up_worker(gcwq);
-                else
-                        gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
-        }
-        /*
         * CPU intensive works don't participate in concurrency
         * management.  They're the scheduler's responsibility.
         */
        if (unlikely(cpu_intensive))
                worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
+        /*
+         * Unbound gcwq isn't concurrency managed and work items should be
+         * executed ASAP.  Wake up another worker if necessary.
+         */
+        if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
+                wake_up_worker(pool);
        spin_unlock_irq(&gcwq->lock);
        work_clear_pending(work);
@@ -1939,28 +2067,38 @@ static void process_scheduled_works(struct worker *worker)
 static int worker_thread(void *__worker)
 {
        struct worker *worker = __worker;
-        struct global_cwq *gcwq = worker->gcwq;
+        struct worker_pool *pool = worker->pool;
+        struct global_cwq *gcwq = pool->gcwq;
        /* tell the scheduler that this is a workqueue worker */
        worker->task->flags |= PF_WQ_WORKER;
 woke_up:
        spin_lock_irq(&gcwq->lock);
-        /* DIE can be set only while we're idle, checking here is enough */
+        /*
-        if (worker->flags & WORKER_DIE) {
+         * DIE can be set only while idle and REBIND set while busy has
+         * @worker->rebind_work scheduled.  Checking here is enough.
+         */
+        if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) {
                spin_unlock_irq(&gcwq->lock);
-                worker->task->flags &= ~PF_WQ_WORKER;
-                return 0;
+                if (worker->flags & WORKER_DIE) {
+                        worker->task->flags &= ~PF_WQ_WORKER;
+                        return 0;
+                }
+                idle_worker_rebind(worker);
+                goto woke_up;
        }
        worker_leave_idle(worker);
 recheck:
        /* no more worker necessary? */
-        if (!need_more_worker(gcwq))
+        if (!need_more_worker(pool))
                goto sleep;
        /* do we need to manage? */
-        if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
+        if (unlikely(!may_start_working(pool)) && manage_workers(worker))
                goto recheck;
        /*
@@ -1979,7 +2117,7 @@ recheck:
        do {
                struct work_struct *work =
-                        list_first_entry(&gcwq->worklist,
+                        list_first_entry(&pool->worklist,
                                         struct work_struct, entry);
                if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
@@ -1991,11 +2129,11 @@ recheck:
                        move_linked_works(work, &worker->scheduled, NULL);
                        process_scheduled_works(worker);
                }
-        } while (keep_working(gcwq));
+        } while (keep_working(pool));
        worker_set_flags(worker, WORKER_PREP, false);
 sleep:
-        if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
+        if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
                goto recheck;
        /*
@@ -2053,14 +2191,15 @@ repeat:
        for_each_mayday_cpu(cpu, wq->mayday_mask) {
                unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
                struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
-                struct global_cwq *gcwq = cwq->gcwq;
+                struct worker_pool *pool = cwq->pool;
+                struct global_cwq *gcwq = pool->gcwq;
                struct work_struct *work, *n;
                __set_current_state(TASK_RUNNING);
                mayday_clear_cpu(cpu, wq->mayday_mask);
                /* migrate to the target cpu if possible */
-                rescuer->gcwq = gcwq;
+                rescuer->pool = pool;
                worker_maybe_bind_and_lock(rescuer);
                /*
@@ -2068,7 +2207,7 @@ repeat:
                 * process'em.
                 */
                BUG_ON(!list_empty(&rescuer->scheduled));
-                list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
+                list_for_each_entry_safe(work, n, &pool->worklist, entry)
                        if (get_work_cwq(work) == cwq)
                                move_linked_works(work, scheduled, &n);
@@ -2079,8 +2218,8 @@ repeat:
                 * regular worker; otherwise, we end up with 0 concurrency
                 * and stalling the execution.
                 */
-                if (keep_working(gcwq))
+                if (keep_working(pool))
-                        wake_up_worker(gcwq);
+                        wake_up_worker(pool);
                spin_unlock_irq(&gcwq->lock);
        }
@@ -2205,7 +2344,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
        for_each_cwq_cpu(cpu, wq) {
                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
-                struct global_cwq *gcwq = cwq->gcwq;
+                struct global_cwq *gcwq = cwq->pool->gcwq;
                spin_lock_irq(&gcwq->lock);
@@ -2421,9 +2560,9 @@ reflush:
                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
                bool drained;
-                spin_lock_irq(&cwq->gcwq->lock);
+                spin_lock_irq(&cwq->pool->gcwq->lock);
                drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
-                spin_unlock_irq(&cwq->gcwq->lock);
+                spin_unlock_irq(&cwq->pool->gcwq->lock);
                if (drained)
                        continue;
@@ -2463,7 +2602,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
                 */
                smp_rmb();
                cwq = get_work_cwq(work);
-                if (unlikely(!cwq || gcwq != cwq->gcwq))
+                if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
                        goto already_gone;
        } else if (wait_executing) {
                worker = find_worker_executing_work(gcwq, work);
@@ -2984,13 +3123,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
        if (flags & WQ_MEM_RECLAIM)
                flags |= WQ_RESCUER;
-        /*
-         * Unbound workqueues aren't concurrency managed and should be
-         * dispatched to workers immediately.
-         */
-        if (flags & WQ_UNBOUND)
-                flags |= WQ_HIGHPRI;
        max_active = max_active ?: WQ_DFL_ACTIVE;
        max_active = wq_clamp_max_active(max_active, flags, wq->name);
@@ -3011,9 +3143,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
        for_each_cwq_cpu(cpu, wq) {
                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
                struct global_cwq *gcwq = get_gcwq(cpu);
+                int pool_idx = (bool)(flags & WQ_HIGHPRI);
                BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
-                cwq->gcwq = gcwq;
+                cwq->pool = &gcwq->pools[pool_idx];
                cwq->wq = wq;
                cwq->flush_color = -1;
                cwq->max_active = max_active;
@@ -3225,369 +3358,143 @@ EXPORT_SYMBOL_GPL(work_busy);
 * gcwqs serve mix of short, long and very long running works making
 * blocked draining impractical.
 *
- * This is solved by allowing a gcwq to be detached from CPU, running
+ * This is solved by allowing a gcwq to be disassociated from the CPU
- * it with unbound (rogue) workers and allowing it to be reattached
+ * running as an unbound one and allowing it to be reattached later if the
- * later if the cpu comes back online.  A separate thread is created
+ * cpu comes back online.
- * to govern a gcwq in such state and is called the trustee of the
- * gcwq.
- *
- * Trustee states and their descriptions.
- *
- * START        Command state used on startup.  On CPU_DOWN_PREPARE, a
- *              new trustee is started with this state.
- *
- * IN_CHARGE    Once started, trustee will enter this state after
- *              assuming the manager role and making all existing
- *              workers rogue.  DOWN_PREPARE waits for trustee to
- *              enter this state.  After reaching IN_CHARGE, trustee
- *              tries to execute the pending worklist until it's empty
- *              and the state is set to BUTCHER, or the state is set
- *              to RELEASE.
- *
- * BUTCHER      Command state which is set by the cpu callback after
- *              the cpu has went down.  Once this state is set trustee
- *              knows that there will be no new works on the worklist
- *              and once the worklist is empty it can proceed to
- *              killing idle workers.
- *
- * RELEASE      Command state which is set by the cpu callback if the
- *              cpu down has been canceled or it has come online
- *              again.  After recognizing this state, trustee stops
- *              trying to drain or butcher and clears ROGUE, rebinds
- *              all remaining workers back to the cpu and releases
- *              manager role.
- *
- * DONE         Trustee will enter this state after BUTCHER or RELEASE
- *              is complete.
- *
- *          trustee                 CPU                draining
- *         took over                down               complete
- * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE
- *                        |                     |                  ^
- *                        | CPU is back online  v   return workers |
- *                         ----------------> RELEASE --------------
 */
-/**
+/* claim manager positions of all pools */
- * trustee_wait_event_timeout - timed event wait for trustee
+static void gcwq_claim_management_and_lock(struct global_cwq *gcwq)
- * @cond: condition to wait for
- * @timeout: timeout in jiffies
- *
- * wait_event_timeout() for trustee to use.  Handles locking and
- * checks for RELEASE request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * Positive indicating left time if @cond is satisfied, 0 if timed
- * out, -1 if canceled.
- */
-#define trustee_wait_event_timeout(cond, timeout) ({                    \
-        long __ret = (timeout);                                         \
-        while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \
-               __ret) {                                                 \
-                spin_unlock_irq(&gcwq->lock);                           \
-                __wait_event_timeout(gcwq->trustee_wait, (cond) ||      \
-                        (gcwq->trustee_state == TRUSTEE_RELEASE),       \
-                        __ret);                                         \
-                spin_lock_irq(&gcwq->lock);                             \
-        }                                                               \
-        gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret);          \
-})
-/**
- * trustee_wait_event - event wait for trustee
- * @cond: condition to wait for
- *
- * wait_event() for trustee to use.  Automatically handles locking and
- * checks for CANCEL request.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by trustee.
- *
- * RETURNS:
- * 0 if @cond is satisfied, -1 if canceled.
- */
-#define trustee_wait_event(cond) ({                                     \
-        long __ret1;                                                    \
-        __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\
-        __ret1 < 0 ? -1 : 0;                                            \
-})
-static int __cpuinit trustee_thread(void *__gcwq)
 {
-        struct global_cwq *gcwq = __gcwq;
+        struct worker_pool *pool;
-        struct worker *worker;
-        struct work_struct *work;
-        struct hlist_node *pos;
-        long rc;
-        int i;
-        BUG_ON(gcwq->cpu != smp_processor_id());
+        for_each_worker_pool(pool, gcwq)
+                mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools);
        spin_lock_irq(&gcwq->lock);
-        /*
+}
-         * Claim the manager position and make all workers rogue.
-         * Trustee must be bound to the target cpu and can't be
-         * cancelled.
-         */
-        BUG_ON(gcwq->cpu != smp_processor_id());
-        rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS));
-        BUG_ON(rc < 0);
-        gcwq->flags |= GCWQ_MANAGING_WORKERS;
-        list_for_each_entry(worker, &gcwq->idle_list, entry)
-                worker->flags |= WORKER_ROGUE;
-        for_each_busy_worker(worker, i, pos, gcwq)
+/* release manager positions */
-                worker->flags |= WORKER_ROGUE;
+static void gcwq_release_management_and_unlock(struct global_cwq *gcwq)
+{
+        struct worker_pool *pool;
-        /*
-         * Call schedule() so that we cross rq->lock and thus can
-         * guarantee sched callbacks see the rogue flag.  This is
-         * necessary as scheduler callbacks may be invoked from other
-         * cpus.
-         */
        spin_unlock_irq(&gcwq->lock);
-        schedule();
+        for_each_worker_pool(pool, gcwq)
-        spin_lock_irq(&gcwq->lock);
+                mutex_unlock(&pool->manager_mutex);
+}
-        /*
+static void gcwq_unbind_fn(struct work_struct *work)
-         * Sched callbacks are disabled now.  Zap nr_running.  After
+{
-         * this, nr_running stays zero and need_more_worker() and
+        struct global_cwq *gcwq = get_gcwq(smp_processor_id());
-         * keep_working() are always true as long as the worklist is
+        struct worker_pool *pool;
-         * not empty.
+        struct worker *worker;
-         */
+        struct hlist_node *pos;
-        atomic_set(get_gcwq_nr_running(gcwq->cpu), 0);
+        int i;
-        spin_unlock_irq(&gcwq->lock);
+        BUG_ON(gcwq->cpu != smp_processor_id());
-        del_timer_sync(&gcwq->idle_timer);
-        spin_lock_irq(&gcwq->lock);
-        /*
+        gcwq_claim_management_and_lock(gcwq);
-         * We're now in charge.  Notify and proceed to drain.  We need
-         * to keep the gcwq running during the whole CPU down
-         * procedure as other cpu hotunplug callbacks may need to
-         * flush currently running tasks.
-         */
-        gcwq->trustee_state = TRUSTEE_IN_CHARGE;
-        wake_up_all(&gcwq->trustee_wait);
        /*
-         * The original cpu is in the process of dying and may go away
+         * We've claimed all manager positions.  Make all workers unbound
-         * anytime now.  When that happens, we and all workers would
+         * and set DISASSOCIATED.  Before this, all workers except for the
-         * be migrated to other cpus.  Try draining any left work.  We
+         * ones which are still executing works from before the last CPU
-         * want to get it over with ASAP - spam rescuers, wake up as
+         * down must be on the cpu.  After this, they may become diasporas.
-         * many idlers as necessary and create new ones till the
-         * worklist is empty.  Note that if the gcwq is frozen, there
-         * may be frozen works in freezable cwqs.  Don't declare
-         * completion while frozen.
         */
-        while (gcwq->nr_workers != gcwq->nr_idle ||
+        for_each_worker_pool(pool, gcwq)
-               gcwq->flags & GCWQ_FREEZING ||
+                list_for_each_entry(worker, &pool->idle_list, entry)
-               gcwq->trustee_state == TRUSTEE_IN_CHARGE) {
+                        worker->flags |= WORKER_UNBOUND;
-                int nr_works = 0;
-                list_for_each_entry(work, &gcwq->worklist, entry) {
-                        send_mayday(work);
-                        nr_works++;
-                }
-                list_for_each_entry(worker, &gcwq->idle_list, entry) {
+        for_each_busy_worker(worker, i, pos, gcwq)
-                        if (!nr_works--)
+                worker->flags |= WORKER_UNBOUND;
-                                break;
-                        wake_up_process(worker->task);
-                }
-                if (need_to_create_worker(gcwq)) {
+        gcwq->flags |= GCWQ_DISASSOCIATED;
-                        spin_unlock_irq(&gcwq->lock);
-                        worker = create_worker(gcwq, false);
-                        spin_lock_irq(&gcwq->lock);
-                        if (worker) {
-                                worker->flags |= WORKER_ROGUE;
-                                start_worker(worker);
-                        }
-                }
-                /* give a breather */
+        gcwq_release_management_and_unlock(gcwq);
-                if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0)
-                        break;
-        }
        /*
-         * Either all works have been scheduled and cpu is down, or
+         * Call schedule() so that we cross rq->lock and thus can guarantee
-         * cpu down has already been canceled.  Wait for and butcher
+         * sched callbacks see the %WORKER_UNBOUND flag.  This is necessary
-         * all workers till we're canceled.
+         * as scheduler callbacks may be invoked from other cpus.
         */
-        do {
+        schedule();
-                rc = trustee_wait_event(!list_empty(&gcwq->idle_list));
-                while (!list_empty(&gcwq->idle_list))
-                        destroy_worker(list_first_entry(&gcwq->idle_list,
-                                                        struct worker, entry));
-        } while (gcwq->nr_workers && rc >= 0);
        /*
-         * At this point, either draining has completed and no worker
+         * Sched callbacks are disabled now.  Zap nr_running.  After this,
-         * is left, or cpu down has been canceled or the cpu is being
+         * nr_running stays zero and need_more_worker() and keep_working()
-         * brought back up.  There shouldn't be any idle one left.
+         * are always true as long as the worklist is not empty.  @gcwq now
-         * Tell the remaining busy ones to rebind once it finishes the
+         * behaves as unbound (in terms of concurrency management) gcwq
-         * currently scheduled works by scheduling the rebind_work.
+         * which is served by workers tied to the CPU.
+         *
+         * On return from this function, the current worker would trigger
+         * unbound chain execution of pending work items if other workers
+         * didn't already.
         */
-        WARN_ON(!list_empty(&gcwq->idle_list));
+        for_each_worker_pool(pool, gcwq)
+                atomic_set(get_pool_nr_running(pool), 0);
-        for_each_busy_worker(worker, i, pos, gcwq) {
-                struct work_struct *rebind_work = &worker->rebind_work;
-                /*
-                 * Rebind_work may race with future cpu hotplug
-                 * operations.  Use a separate flag to mark that
-                 * rebinding is scheduled.
-                 */
-                worker->flags |= WORKER_REBIND;
-                worker->flags &= ~WORKER_ROGUE;
-                /* queue rebind_work, wq doesn't matter, use the default one */
-                if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
-                                     work_data_bits(rebind_work)))
-                        continue;
-                debug_work_activate(rebind_work);
-                insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work,
-                            worker->scheduled.next,
-                            work_color_to_flags(WORK_NO_COLOR));
-        }
-        /* relinquish manager role */
-        gcwq->flags &= ~GCWQ_MANAGING_WORKERS;
-        /* notify completion */
-        gcwq->trustee = NULL;
-        gcwq->trustee_state = TRUSTEE_DONE;
-        wake_up_all(&gcwq->trustee_wait);
-        spin_unlock_irq(&gcwq->lock);
-        return 0;
 }
-/**
+/*
- * wait_trustee_state - wait for trustee to enter the specified state
+ * Workqueues should be brought up before normal priority CPU notifiers.
- * @gcwq: gcwq the trustee of interest belongs to
+ * This will be registered as a high priority CPU notifier.
- * @state: target state to wait for
- *
- * Wait for the trustee to reach @state.  DONE is already matched.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
- * multiple times.  To be used by cpu_callback.
 */
-static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state)
+static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
-__releases(&gcwq->lock)
+                                               unsigned long action,
-__acquires(&gcwq->lock)
+                                               void *hcpu)
-{
-        if (!(gcwq->trustee_state == state ||
-              gcwq->trustee_state == TRUSTEE_DONE)) {
-                spin_unlock_irq(&gcwq->lock);
-                __wait_event(gcwq->trustee_wait,
-                             gcwq->trustee_state == state ||
-                             gcwq->trustee_state == TRUSTEE_DONE);
-                spin_lock_irq(&gcwq->lock);
-        }
-}
-static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
-                                                unsigned long action,
-                                                void *hcpu)
 {
        unsigned int cpu = (unsigned long)hcpu;
        struct global_cwq *gcwq = get_gcwq(cpu);
-        struct task_struct *new_trustee = NULL;
+        struct worker_pool *pool;
-        struct worker *uninitialized_var(new_worker);
-        unsigned long flags;
-        action &= ~CPU_TASKS_FROZEN;
-        switch (action) {
+        switch (action & ~CPU_TASKS_FROZEN) {
-        case CPU_DOWN_PREPARE:
-                new_trustee = kthread_create(trustee_thread, gcwq,
-                                             "workqueue_trustee/%d\n", cpu);
-                if (IS_ERR(new_trustee))
-                        return notifier_from_errno(PTR_ERR(new_trustee));
-                kthread_bind(new_trustee, cpu);
-                /* fall through */
        case CPU_UP_PREPARE:
-                BUG_ON(gcwq->first_idle);
+                for_each_worker_pool(pool, gcwq) {
-                new_worker = create_worker(gcwq, false);
+                        struct worker *worker;
-                if (!new_worker) {
-                        if (new_trustee)
-                                kthread_stop(new_trustee);
-                        return NOTIFY_BAD;
-                }
-        }
-        /* some are called w/ irq disabled, don't disturb irq status */
-        spin_lock_irqsave(&gcwq->lock, flags);
-        switch (action) {
+                        if (pool->nr_workers)
-        case CPU_DOWN_PREPARE:
+                                continue;
-                /* initialize trustee and tell it to acquire the gcwq */
-                BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE);
-                gcwq->trustee = new_trustee;
-                gcwq->trustee_state = TRUSTEE_START;
-                wake_up_process(gcwq->trustee);
-                wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE);
-                /* fall through */
-        case CPU_UP_PREPARE:
-                BUG_ON(gcwq->first_idle);
-                gcwq->first_idle = new_worker;
-                break;
-        case CPU_DYING:
+                        worker = create_worker(pool);
-                /*
+                        if (!worker)
-                 * Before this, the trustee and all workers except for
+                                return NOTIFY_BAD;
-                 * the ones which are still executing works from
-                 * before the last CPU down must be on the cpu.  After
-                 * this, they'll all be diasporas.
-                 */
-                gcwq->flags |= GCWQ_DISASSOCIATED;
-                break;
-        case CPU_POST_DEAD:
+                        spin_lock_irq(&gcwq->lock);
-                gcwq->trustee_state = TRUSTEE_BUTCHER;
+                        start_worker(worker);
-                /* fall through */
+                        spin_unlock_irq(&gcwq->lock);
-        case CPU_UP_CANCELED:
+                }
-                destroy_worker(gcwq->first_idle);
-                gcwq->first_idle = NULL;
                break;
        case CPU_DOWN_FAILED:
        case CPU_ONLINE:
+                gcwq_claim_management_and_lock(gcwq);
                gcwq->flags &= ~GCWQ_DISASSOCIATED;
-                if (gcwq->trustee_state != TRUSTEE_DONE) {
+                rebind_workers(gcwq);
-                        gcwq->trustee_state = TRUSTEE_RELEASE;
+                gcwq_release_management_and_unlock(gcwq);
-                        wake_up_process(gcwq->trustee);
-                        wait_trustee_state(gcwq, TRUSTEE_DONE);
-                }
-                /*
-                 * Trustee is done and there might be no worker left.
-                 * Put the first_idle in and request a real manager to
-                 * take a look.
-                 */
-                spin_unlock_irq(&gcwq->lock);
-                kthread_bind(gcwq->first_idle->task, cpu);
-                spin_lock_irq(&gcwq->lock);
-                gcwq->flags |= GCWQ_MANAGE_WORKERS;
-                start_worker(gcwq->first_idle);
-                gcwq->first_idle = NULL;
                break;
        }
+        return NOTIFY_OK;
+}
-        spin_unlock_irqrestore(&gcwq->lock, flags);
+/*
+ * Workqueues should be brought down after normal priority CPU notifiers.
+ * This will be registered as a low priority CPU notifier.
+ */
+static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+                                                 unsigned long action,
+                                                 void *hcpu)
+{
+        unsigned int cpu = (unsigned long)hcpu;
+        struct work_struct unbind_work;
-        return notifier_from_errno(0);
+        switch (action & ~CPU_TASKS_FROZEN) {
+        case CPU_DOWN_PREPARE:
+                /* unbinding should happen on the local CPU */
+                INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
+                schedule_work_on(cpu, &unbind_work);
+                flush_work(&unbind_work);
+                break;
+        }
+        return NOTIFY_OK;
 }
 #ifdef CONFIG_SMP
@@ -3746,6 +3653,7 @@ void thaw_workqueues(void)
        for_each_gcwq_cpu(cpu) {
                struct global_cwq *gcwq = get_gcwq(cpu);
+                struct worker_pool *pool;
                struct workqueue_struct *wq;
                spin_lock_irq(&gcwq->lock);
@@ -3767,7 +3675,8 @@ void thaw_workqueues(void)
                                cwq_activate_first_delayed(cwq);
                }
-                wake_up_worker(gcwq);
+                for_each_worker_pool(pool, gcwq)
+                        wake_up_worker(pool);
                spin_unlock_irq(&gcwq->lock);
        }
@@ -3783,46 +3692,57 @@ static int __init init_workqueues(void)
        unsigned int cpu;
        int i;
-        cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
+        cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
+        cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
        /* initialize gcwqs */
        for_each_gcwq_cpu(cpu) {
                struct global_cwq *gcwq = get_gcwq(cpu);
+                struct worker_pool *pool;
                spin_lock_init(&gcwq->lock);
-                INIT_LIST_HEAD(&gcwq->worklist);
                gcwq->cpu = cpu;
                gcwq->flags |= GCWQ_DISASSOCIATED;
-                INIT_LIST_HEAD(&gcwq->idle_list);
                for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
                        INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
-                init_timer_deferrable(&gcwq->idle_timer);
+                for_each_worker_pool(pool, gcwq) {
-                gcwq->idle_timer.function = idle_worker_timeout;
+                        pool->gcwq = gcwq;
-                gcwq->idle_timer.data = (unsigned long)gcwq;
+                        INIT_LIST_HEAD(&pool->worklist);
+                        INIT_LIST_HEAD(&pool->idle_list);
+                        init_timer_deferrable(&pool->idle_timer);
+                        pool->idle_timer.function = idle_worker_timeout;
+                        pool->idle_timer.data = (unsigned long)pool;
-                setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout,
+                        setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
-                            (unsigned long)gcwq);
+                                    (unsigned long)pool);
-                ida_init(&gcwq->worker_ida);
+                        mutex_init(&pool->manager_mutex);
+                        ida_init(&pool->worker_ida);
+                }
-                gcwq->trustee_state = TRUSTEE_DONE;
+                init_waitqueue_head(&gcwq->rebind_hold);
-                init_waitqueue_head(&gcwq->trustee_wait);
        }
        /* create the initial worker */
        for_each_online_gcwq_cpu(cpu) {
                struct global_cwq *gcwq = get_gcwq(cpu);
-                struct worker *worker;
+                struct worker_pool *pool;
                if (cpu != WORK_CPU_UNBOUND)
                        gcwq->flags &= ~GCWQ_DISASSOCIATED;
-                worker = create_worker(gcwq, true);
-                BUG_ON(!worker);
+                for_each_worker_pool(pool, gcwq) {
-                spin_lock_irq(&gcwq->lock);
+                        struct worker *worker;
-                start_worker(worker);
-                spin_unlock_irq(&gcwq->lock);
+                        worker = create_worker(pool);
+                        BUG_ON(!worker);
+                        spin_lock_irq(&gcwq->lock);
+                        start_worker(worker);
+                        spin_unlock_irq(&gcwq->lock);
+                }
        }
        system_wq = alloc_workqueue("events", 0, 0);
author	Ingo Molnar <mingo@kernel.org>	2012-08-21 05:27:00 -0400
committer	Ingo Molnar <mingo@kernel.org>	2012-08-21 05:27:00 -0400
commit	bcada3d4b8c96b8792c2306f363992ca5ab9da42 (patch)
tree	e420679a5db6ea4e1694eef57f9abb6acac8d4d3 /kernel
parent	26198c21d1b286a084fe5d514a30bc7e6c712a34 (diff)
parent	000078bc3ee69efb1124b8478c7527389a826074 (diff)