Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux into mips-for-linux-next

Conflicts: include/linux/ssb/ssb_driver_gige.h. Also resolves a logical merge conflict in drivers/net/ethernet/broadcom/bgmac.c due to change of an API.
author: Ralf Baechle <ralf@linux-mips.org> 2013-02-21 10:16:55 -0500
committer: Ralf Baechle <ralf@linux-mips.org> 2013-02-22 04:07:30 -0500
commit: edb15d83a875a1f4b1576188844db5c330c3267d (patch)
tree: 74d54eab401b6ccf2a6ad4821227108a8d160f03 /kernel
parent: 8bfc245f9ad7bd4e461179e4e7852ef99b8b6144 (diff)
parent: a0b1c42951dd06ec83cc1bc2c9788131d9fefcd8 (diff)
86 files changed, 3941 insertions, 2497 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 051e071a06e7..e8b1627ab9c7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -566,6 +566,7 @@ out:
 void acct_collect(long exitcode, int group_dead)
 {
        struct pacct_struct *pacct = &current->signal->pacct;
+        cputime_t utime, stime;
        unsigned long vsize = 0;
        if (group_dead && current->mm) {
@@ -593,8 +594,9 @@ void acct_collect(long exitcode, int group_dead)
                pacct->ac_flag |= ACORE;
        if (current->flags & PF_SIGNALED)
                pacct->ac_flag |= AXSIG;
-        pacct->ac_utime += current->utime;
+        task_cputime(current, &utime, &stime);
-        pacct->ac_stime += current->stime;
+        pacct->ac_utime += utime;
+        pacct->ac_stime += stime;
        pacct->ac_minflt += current->min_flt;
        pacct->ac_majflt += current->maj_flt;
        spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index 6f34904a0b53..8ddee2c3e5b0 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -57,65 +57,52 @@ asynchronous and synchronous parts of the kernel.
 #include <linux/slab.h>
 #include <linux/workqueue.h>
+#include "workqueue_internal.h"
 static async_cookie_t next_cookie = 1;
-#define MAX_WORK        32768
+#define MAX_WORK                32768
+#define ASYNC_COOKIE_MAX        ULLONG_MAX      /* infinity cookie */
-static LIST_HEAD(async_pending);
+static LIST_HEAD(async_global_pending); /* pending from all registered doms */
-static ASYNC_DOMAIN(async_running);
+static ASYNC_DOMAIN(async_dfl_domain);
-static LIST_HEAD(async_domains);
 static DEFINE_SPINLOCK(async_lock);
-static DEFINE_MUTEX(async_register_mutex);
 struct async_entry {
-        struct list_head        list;
+        struct list_head        domain_list;
+        struct list_head        global_list;
        struct work_struct      work;
        async_cookie_t          cookie;
        async_func_ptr          *func;
        void                    *data;
-        struct async_domain     *running;
+        struct async_domain     *domain;
 };
 static DECLARE_WAIT_QUEUE_HEAD(async_done);
 static atomic_t entry_count;
+static async_cookie_t lowest_in_progress(struct async_domain *domain)
-/*
- * MUST be called with the lock held!
- */
-static async_cookie_t  __lowest_in_progress(struct async_domain *running)
 {
-        async_cookie_t first_running = next_cookie;     /* infinity value */
+        struct async_entry *first = NULL;
-        async_cookie_t first_pending = next_cookie;     /* ditto */
+        async_cookie_t ret = ASYNC_COOKIE_MAX;
-        struct async_entry *entry;
+        unsigned long flags;
-        /*
+        spin_lock_irqsave(&async_lock, flags);
-         * Both running and pending lists are sorted but not disjoint.
-         * Take the first cookies from both and return the min.
-         */
-        if (!list_empty(&running->domain)) {
-                entry = list_first_entry(&running->domain, typeof(*entry), list);
-                first_running = entry->cookie;
-        }
-        list_for_each_entry(entry, &async_pending, list) {
+        if (domain) {
-                if (entry->running == running) {
+                if (!list_empty(&domain->pending))
-                        first_pending = entry->cookie;
+                        first = list_first_entry(&domain->pending,
-                        break;
+                                        struct async_entry, domain_list);
-                }
+        } else {
+                if (!list_empty(&async_global_pending))
+                        first = list_first_entry(&async_global_pending,
+                                        struct async_entry, global_list);
        }
-        return min(first_running, first_pending);
+        if (first)
-}
+                ret = first->cookie;
-static async_cookie_t  lowest_in_progress(struct async_domain *running)
-{
-        unsigned long flags;
-        async_cookie_t ret;
-        spin_lock_irqsave(&async_lock, flags);
-        ret = __lowest_in_progress(running);
        spin_unlock_irqrestore(&async_lock, flags);
        return ret;
 }
@@ -127,20 +114,10 @@ static void async_run_entry_fn(struct work_struct *work)
 {
        struct async_entry *entry =
                container_of(work, struct async_entry, work);
-        struct async_entry *pos;
        unsigned long flags;
        ktime_t uninitialized_var(calltime), delta, rettime;
-        struct async_domain *running = entry->running;
-        /* 1) move self to the running queue, make sure it stays sorted */
+        /* 1) run (and print duration) */
-        spin_lock_irqsave(&async_lock, flags);
-        list_for_each_entry_reverse(pos, &running->domain, list)
-                if (entry->cookie < pos->cookie)
-                        break;
-        list_move_tail(&entry->list, &pos->list);
-        spin_unlock_irqrestore(&async_lock, flags);
-        /* 2) run (and print duration) */
        if (initcall_debug && system_state == SYSTEM_BOOTING) {
                printk(KERN_DEBUG "calling  %lli_%pF @ %i\n",
                        (long long)entry->cookie,
@@ -157,23 +134,22 @@ static void async_run_entry_fn(struct work_struct *work)
                        (long long)ktime_to_ns(delta) >> 10);
        }
-        /* 3) remove self from the running queue */
+        /* 2) remove self from the pending queues */
        spin_lock_irqsave(&async_lock, flags);
-        list_del(&entry->list);
+        list_del_init(&entry->domain_list);
-        if (running->registered && --running->count == 0)
+        list_del_init(&entry->global_list);
-                list_del_init(&running->node);
-        /* 4) free the entry */
+        /* 3) free the entry */
        kfree(entry);
        atomic_dec(&entry_count);
        spin_unlock_irqrestore(&async_lock, flags);
-        /* 5) wake up any waiters */
+        /* 4) wake up any waiters */
        wake_up(&async_done);
 }
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running)
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
 {
        struct async_entry *entry;
        unsigned long flags;
@@ -196,16 +172,22 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
                ptr(data, newcookie);
                return newcookie;
        }
+        INIT_LIST_HEAD(&entry->domain_list);
+        INIT_LIST_HEAD(&entry->global_list);
        INIT_WORK(&entry->work, async_run_entry_fn);
        entry->func = ptr;
        entry->data = data;
-        entry->running = running;
+        entry->domain = domain;
        spin_lock_irqsave(&async_lock, flags);
+        /* allocate cookie and queue */
        newcookie = entry->cookie = next_cookie++;
-        list_add_tail(&entry->list, &async_pending);
-        if (running->registered && running->count++ == 0)
+        list_add_tail(&entry->domain_list, &domain->pending);
-                list_add_tail(&running->node, &async_domains);
+        if (domain->registered)
+                list_add_tail(&entry->global_list, &async_global_pending);
        atomic_inc(&entry_count);
        spin_unlock_irqrestore(&async_lock, flags);
@@ -228,7 +210,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
 */
 async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
 {
-        return __async_schedule(ptr, data, &async_running);
+        return __async_schedule(ptr, data, &async_dfl_domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule);
@@ -236,18 +218,18 @@ EXPORT_SYMBOL_GPL(async_schedule);
 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
 * @ptr: function to execute asynchronously
 * @data: data pointer to pass to the function
- * @running: running list for the domain
+ * @domain: the domain
 *
 * Returns an async_cookie_t that may be used for checkpointing later.
- * @running may be used in the async_synchronize_*_domain() functions
+ * @domain may be used in the async_synchronize_*_domain() functions to
- * to wait within a certain synchronization domain rather than globally.
+ * wait within a certain synchronization domain rather than globally.  A
- * A synchronization domain is specified via the running queue @running to use.
+ * synchronization domain is specified via @domain.  Note: This function
- * Note: This function may be called from atomic or non-atomic contexts.
+ * may be called from atomic or non-atomic contexts.
 */
 async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
-                                     struct async_domain *running)
+                                     struct async_domain *domain)
 {
-        return __async_schedule(ptr, data, running);
+        return __async_schedule(ptr, data, domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule_domain);
@@ -258,18 +240,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
 */
 void async_synchronize_full(void)
 {
-        mutex_lock(&async_register_mutex);
+        async_synchronize_full_domain(NULL);
-        do {
-                struct async_domain *domain = NULL;
-                spin_lock_irq(&async_lock);
-                if (!list_empty(&async_domains))
-                        domain = list_first_entry(&async_domains, typeof(*domain), node);
-                spin_unlock_irq(&async_lock);
-                async_synchronize_cookie_domain(next_cookie, domain);
-        } while (!list_empty(&async_domains));
-        mutex_unlock(&async_register_mutex);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full);
@@ -284,51 +255,45 @@ EXPORT_SYMBOL_GPL(async_synchronize_full);
 */
 void async_unregister_domain(struct async_domain *domain)
 {
-        mutex_lock(&async_register_mutex);
        spin_lock_irq(&async_lock);
-        WARN_ON(!domain->registered || !list_empty(&domain->node) ||
+        WARN_ON(!domain->registered || !list_empty(&domain->pending));
-                !list_empty(&domain->domain));
        domain->registered = 0;
        spin_unlock_irq(&async_lock);
-        mutex_unlock(&async_register_mutex);
 }
 EXPORT_SYMBOL_GPL(async_unregister_domain);
 /**
 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @domain: running list to synchronize on
+ * @domain: the domain to synchronize
 *
 * This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @domain have been done.
+ * synchronization domain specified by @domain have been done.
 */
 void async_synchronize_full_domain(struct async_domain *domain)
 {
-        async_synchronize_cookie_domain(next_cookie, domain);
+        async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
 /**
 * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
 * @cookie: async_cookie_t to use as checkpoint
- * @running: running list to synchronize on
+ * @domain: the domain to synchronize (%NULL for all registered domains)
 *
 * This function waits until all asynchronous function calls for the
- * synchronization domain specified by running list @running submitted
+ * synchronization domain specified by @domain submitted prior to @cookie
- * prior to @cookie have been done.
+ * have been done.
 */
-void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
 {
        ktime_t uninitialized_var(starttime), delta, endtime;
-        if (!running)
-                return;
        if (initcall_debug && system_state == SYSTEM_BOOTING) {
                printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
                starttime = ktime_get();
        }
-        wait_event(async_done, lowest_in_progress(running) >= cookie);
+        wait_event(async_done, lowest_in_progress(domain) >= cookie);
        if (initcall_debug && system_state == SYSTEM_BOOTING) {
                endtime = ktime_get();
@@ -350,6 +315,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
 */
 void async_synchronize_cookie(async_cookie_t cookie)
 {
-        async_synchronize_cookie_domain(cookie, &async_running);
+        async_synchronize_cookie_domain(cookie, &async_dfl_domain);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_cookie);
+/**
+ * current_is_async - is %current an async worker task?
+ *
+ * Returns %true if %current is an async worker task.
+ */
+bool current_is_async(void)
+{
+        struct worker *worker = current_wq_worker();
+        return worker && worker->current_func == async_run_entry_fn;
+}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4855892798fd..b5c64327e712 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
 #include <linux/module.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
-#include <linux/hash.h>
+#include <linux/hashtable.h>
 #include <linux/namei.h>
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
@@ -376,22 +376,18 @@ static int css_set_count;
 * account cgroups in empty hierarchies.
 */
 #define CSS_SET_HASH_BITS       7
-#define CSS_SET_TABLE_SIZE      (1 << CSS_SET_HASH_BITS)
+static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
-static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
-static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 {
        int i;
-        int index;
+        unsigned long key = 0UL;
-        unsigned long tmp = 0UL;
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
-                tmp += (unsigned long)css[i];
+                key += (unsigned long)css[i];
-        tmp = (tmp >> 16) ^ tmp;
+        key = (key >> 16) ^ key;
-        index = hash_long(tmp, CSS_SET_HASH_BITS);
+        return key;
-        return &css_set_table[index];
 }
 /* We don't maintain the lists running through each css_set to its
@@ -418,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
        }
        /* This css_set is dead. unlink it and release cgroup refcounts */
-        hlist_del(&cg->hlist);
+        hash_del(&cg->hlist);
        css_set_count--;
        list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -426,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
                struct cgroup *cgrp = link->cgrp;
                list_del(&link->cg_link_list);
                list_del(&link->cgrp_link_list);
+                /*
+                 * We may not be holding cgroup_mutex, and if cgrp->count is
+                 * dropped to 0 the cgroup can be destroyed at any time, hence
+                 * rcu_read_lock is used to keep it alive.
+                 */
+                rcu_read_lock();
                if (atomic_dec_and_test(&cgrp->count) &&
                    notify_on_release(cgrp)) {
                        if (taskexit)
                                set_bit(CGRP_RELEASABLE, &cgrp->flags);
                        check_for_release(cgrp);
                }
+                rcu_read_unlock();
                kfree(link);
        }
@@ -550,9 +554,9 @@ static struct css_set *find_existing_css_set(
 {
        int i;
        struct cgroupfs_root *root = cgrp->root;
-        struct hlist_head *hhead;
        struct hlist_node *node;
        struct css_set *cg;
+        unsigned long key;
        /*
         * Build the set of subsystem state objects that we want to see in the
@@ -572,8 +576,8 @@ static struct css_set *find_existing_css_set(
                }
        }
-        hhead = css_set_hash(template);
+        key = css_set_hash(template);
-        hlist_for_each_entry(cg, node, hhead, hlist) {
+        hash_for_each_possible(css_set_table, cg, node, hlist, key) {
                if (!compare_css_sets(cg, oldcg, cgrp, template))
                        continue;
@@ -657,8 +661,8 @@ static struct css_set *find_css_set(
        struct list_head tmp_cg_links;
-        struct hlist_head *hhead;
        struct cg_cgroup_link *link;
+        unsigned long key;
        /* First see if we already have a cgroup group that matches
         * the desired set */
@@ -704,8 +708,8 @@ static struct css_set *find_css_set(
        css_set_count++;
        /* Add this cgroup group to the hash table */
-        hhead = css_set_hash(res->subsys);
+        key = css_set_hash(res->subsys);
-        hlist_add_head(&res->hlist, hhead);
+        hash_add(css_set_table, &res->hlist, key);
        write_unlock(&css_set_lock);
@@ -856,47 +860,54 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
        return inode;
 }
-static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+static void cgroup_free_fn(struct work_struct *work)
 {
-        /* is dentry a directory ? if so, kfree() associated cgroup */
+        struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
-        if (S_ISDIR(inode->i_mode)) {
+        struct cgroup_subsys *ss;
-                struct cgroup *cgrp = dentry->d_fsdata;
-                struct cgroup_subsys *ss;
-                BUG_ON(!(cgroup_is_removed(cgrp)));
-                /* It's possible for external users to be holding css
-                 * reference counts on a cgroup; css_put() needs to
-                 * be able to access the cgroup after decrementing
-                 * the reference count in order to know if it needs to
-                 * queue the cgroup to be handled by the release
-                 * agent */
-                synchronize_rcu();
-                mutex_lock(&cgroup_mutex);
+        mutex_lock(&cgroup_mutex);
-                /*
+        /*
-                 * Release the subsystem state objects.
+         * Release the subsystem state objects.
-                 */
+         */
-                for_each_subsys(cgrp->root, ss)
+        for_each_subsys(cgrp->root, ss)
-                        ss->css_free(cgrp);
+                ss->css_free(cgrp);
-                cgrp->root->number_of_cgroups--;
+        cgrp->root->number_of_cgroups--;
-                mutex_unlock(&cgroup_mutex);
+        mutex_unlock(&cgroup_mutex);
-                /*
+        /*
-                 * Drop the active superblock reference that we took when we
+         * Drop the active superblock reference that we took when we
-                 * created the cgroup
+         * created the cgroup
-                 */
+         */
-                deactivate_super(cgrp->root->sb);
+        deactivate_super(cgrp->root->sb);
-                /*
+        /*
-                 * if we're getting rid of the cgroup, refcount should ensure
+         * if we're getting rid of the cgroup, refcount should ensure
-                 * that there are no pidlists left.
+         * that there are no pidlists left.
-                 */
+         */
-                BUG_ON(!list_empty(&cgrp->pidlists));
+        BUG_ON(!list_empty(&cgrp->pidlists));
-                simple_xattrs_free(&cgrp->xattrs);
+        simple_xattrs_free(&cgrp->xattrs);
-                ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
+        ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
-                kfree_rcu(cgrp, rcu_head);
+        kfree(cgrp);
+}
+static void cgroup_free_rcu(struct rcu_head *head)
+{
+        struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
+        schedule_work(&cgrp->free_work);
+}
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+        /* is dentry a directory ? if so, kfree() associated cgroup */
+        if (S_ISDIR(inode->i_mode)) {
+                struct cgroup *cgrp = dentry->d_fsdata;
+                BUG_ON(!(cgroup_is_removed(cgrp)));
+                call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
        } else {
                struct cfent *cfe = __d_cfe(dentry);
                struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -925,13 +936,17 @@ static void remove_dir(struct dentry *d)
        dput(parent);
 }
-static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 {
        struct cfent *cfe;
        lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
        lockdep_assert_held(&cgroup_mutex);
+        /*
+         * If we're doing cleanup due to failure of cgroup_create(),
+         * the corresponding @cfe may not exist.
+         */
        list_for_each_entry(cfe, &cgrp->files, node) {
                struct dentry *d = cfe->dentry;
@@ -944,9 +959,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
                list_del_init(&cfe->node);
                dput(d);
-                return 0;
+                break;
        }
-        return -ENOENT;
 }
 /**
@@ -1083,7 +1097,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                }
        }
        root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
-        synchronize_rcu();
        return 0;
 }
@@ -1393,6 +1406,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
        INIT_LIST_HEAD(&cgrp->allcg_node);
        INIT_LIST_HEAD(&cgrp->release_list);
        INIT_LIST_HEAD(&cgrp->pidlists);
+        INIT_WORK(&cgrp->free_work, cgroup_free_fn);
        mutex_init(&cgrp->pidlist_mutex);
        INIT_LIST_HEAD(&cgrp->event_list);
        spin_lock_init(&cgrp->event_list_lock);
@@ -1597,6 +1611,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                struct cgroupfs_root *existing_root;
                const struct cred *cred;
                int i;
+                struct hlist_node *node;
+                struct css_set *cg;
                BUG_ON(sb->s_root != NULL);
@@ -1650,14 +1666,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
                /* Link the top cgroup in this hierarchy into all
                 * the css_set objects */
                write_lock(&css_set_lock);
-                for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
+                hash_for_each(css_set_table, i, node, cg, hlist)
-                        struct hlist_head *hhead = &css_set_table[i];
+                        link_css_set(&tmp_cg_links, cg, root_cgrp);
-                        struct hlist_node *node;
-                        struct css_set *cg;
-                        hlist_for_each_entry(cg, node, hhead, hlist)
-                                link_css_set(&tmp_cg_links, cg, root_cgrp);
-                }
                write_unlock(&css_set_lock);
                free_cg_links(&tmp_cg_links);
@@ -1773,7 +1783,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
        rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
                           "cgroup_path() called without proper locking");
-        if (!dentry || cgrp == dummytop) {
+        if (cgrp == dummytop) {
                /*
                 * Inactive subsystems have no dentry for their root
                 * cgroup
@@ -1982,7 +1992,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
                        ss->attach(cgrp, &tset);
        }
-        synchronize_rcu();
 out:
        if (retval) {
                for_each_subsys(root, ss) {
@@ -2151,7 +2160,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
        /*
         * step 5: success! and cleanup
         */
-        synchronize_rcu();
        retval = 0;
 out_put_css_set_refs:
        if (retval) {
@@ -2769,14 +2777,14 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
                if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
                        continue;
-                if (is_add)
+                if (is_add) {
                        err = cgroup_add_file(cgrp, subsys, cft);
-                else
+                        if (err)
-                        err = cgroup_rm_file(cgrp, cft);
+                                pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
-                if (err) {
+                                        cft->name, err);
-                        pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
-                                   is_add ? "add" : "remove", cft->name, err);
                        ret = err;
+                } else {
+                        cgroup_rm_file(cgrp, cft);
                }
        }
        return ret;
@@ -3017,6 +3025,32 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 }
 EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
+/**
+ * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
+ * @pos: cgroup of interest
+ *
+ * Return the rightmost descendant of @pos.  If there's no descendant,
+ * @pos is returned.  This can be used during pre-order traversal to skip
+ * subtree of @pos.
+ */
+struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
+{
+        struct cgroup *last, *tmp;
+        WARN_ON_ONCE(!rcu_read_lock_held());
+        do {
+                last = pos;
+                /* ->prev isn't RCU safe, walk ->next till the end */
+                pos = NULL;
+                list_for_each_entry_rcu(tmp, &last->children, sibling)
+                        pos = tmp;
+        } while (pos);
+        return last;
+}
+EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
 static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
 {
        struct cgroup *last;
@@ -3752,8 +3786,13 @@ static void cgroup_event_remove(struct work_struct *work)
                        remove);
        struct cgroup *cgrp = event->cgrp;
+        remove_wait_queue(event->wqh, &event->wait);
        event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+        /* Notify userspace the event is going away. */
+        eventfd_signal(event->eventfd, 1);
        eventfd_ctx_put(event->eventfd);
        kfree(event);
        dput(cgrp->dentry);
@@ -3773,15 +3812,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
        unsigned long flags = (unsigned long)key;
        if (flags & POLLHUP) {
-                __remove_wait_queue(event->wqh, &event->wait);
-                spin_lock(&cgrp->event_list_lock);
-                list_del_init(&event->list);
-                spin_unlock(&cgrp->event_list_lock);
                /*
-                 * We are in atomic context, but cgroup_event_remove() may
+                 * If the event has been detached at cgroup removal, we
-                 * sleep, so we have to call it in workqueue.
+                 * can simply return knowing the other side will cleanup
+                 * for us.
+                 *
+                 * We can't race against event freeing since the other
+                 * side will require wqh->lock via remove_wait_queue(),
+                 * which we hold.
                 */
-                schedule_work(&event->remove);
+                spin_lock(&cgrp->event_list_lock);
+                if (!list_empty(&event->list)) {
+                        list_del_init(&event->list);
+                        /*
+                         * We are in atomic context, but cgroup_event_remove()
+                         * may sleep, so we have to call it in workqueue.
+                         */
+                        schedule_work(&event->remove);
+                }
+                spin_unlock(&cgrp->event_list_lock);
        }
        return 0;
@@ -3807,6 +3856,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
                                      const char *buffer)
 {
        struct cgroup_event *event = NULL;
+        struct cgroup *cgrp_cfile;
        unsigned int efd, cfd;
        struct file *efile = NULL;
        struct file *cfile = NULL;
@@ -3862,6 +3912,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
                goto fail;
        }
+        /*
+         * The file to be monitored must be in the same cgroup as
+         * cgroup.event_control is.
+         */
+        cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
+        if (cgrp_cfile != cgrp) {
+                ret = -EINVAL;
+                goto fail;
+        }
        if (!event->cft->register_event || !event->cft->unregister_event) {
                ret = -EINVAL;
                goto fail;
@@ -4135,6 +4195,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
        init_cgroup_housekeeping(cgrp);
+        dentry->d_fsdata = cgrp;
+        cgrp->dentry = dentry;
        cgrp->parent = parent;
        cgrp->root = parent->root;
        cgrp->top_cgroup = parent->top_cgroup;
@@ -4172,8 +4235,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
        lockdep_assert_held(&dentry->d_inode->i_mutex);
        /* allocation complete, commit to creation */
-        dentry->d_fsdata = cgrp;
-        cgrp->dentry = dentry;
        list_add_tail(&cgrp->allcg_node, &root->allcg_list);
        list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
        root->number_of_cgroups++;
@@ -4340,20 +4401,14 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
        /*
         * Unregister events and notify userspace.
         * Notify userspace about cgroup removing only after rmdir of cgroup
-         * directory to avoid race between userspace and kernelspace. Use
+         * directory to avoid race between userspace and kernelspace.
-         * a temporary list to avoid a deadlock with cgroup_event_wake(). Since
-         * cgroup_event_wake() is called with the wait queue head locked,
-         * remove_wait_queue() cannot be called while holding event_list_lock.
         */
        spin_lock(&cgrp->event_list_lock);
-        list_splice_init(&cgrp->event_list, &tmp_list);
+        list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
-        spin_unlock(&cgrp->event_list_lock);
-        list_for_each_entry_safe(event, tmp, &tmp_list, list) {
                list_del_init(&event->list);
-                remove_wait_queue(event->wqh, &event->wait);
-                eventfd_signal(event->eventfd, 1);
                schedule_work(&event->remove);
        }
+        spin_unlock(&cgrp->event_list_lock);
        return 0;
 }
@@ -4438,6 +4493,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 {
        struct cgroup_subsys_state *css;
        int i, ret;
+        struct hlist_node *node, *tmp;
+        struct css_set *cg;
+        unsigned long key;
        /* check name and function validity */
        if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
@@ -4503,23 +4561,17 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
         * this is all done under the css_set_lock.
         */
        write_lock(&css_set_lock);
-        for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
+        hash_for_each_safe(css_set_table, i, node, tmp, cg, hlist) {
-                struct css_set *cg;
+                /* skip entries that we already rehashed */
-                struct hlist_node *node, *tmp;
+                if (cg->subsys[ss->subsys_id])
-                struct hlist_head *bucket = &css_set_table[i], *new_bucket;
+                        continue;
+                /* remove existing entry */
-                hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
+                hash_del(&cg->hlist);
-                        /* skip entries that we already rehashed */
+                /* set new value */
-                        if (cg->subsys[ss->subsys_id])
+                cg->subsys[ss->subsys_id] = css;
-                                continue;
+                /* recompute hash and restore entry */
-                        /* remove existing entry */
+                key = css_set_hash(cg->subsys);
-                        hlist_del(&cg->hlist);
+                hash_add(css_set_table, node, key);
-                        /* set new value */
-                        cg->subsys[ss->subsys_id] = css;
-                        /* recompute hash and restore entry */
-                        new_bucket = css_set_hash(cg->subsys);
-                        hlist_add_head(&cg->hlist, new_bucket);
-                }
        }
        write_unlock(&css_set_lock);
@@ -4551,7 +4603,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
 void cgroup_unload_subsys(struct cgroup_subsys *ss)
 {
        struct cg_cgroup_link *link;
-        struct hlist_head *hhead;
        BUG_ON(ss->module == NULL);
@@ -4585,11 +4636,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
        write_lock(&css_set_lock);
        list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
                struct css_set *cg = link->cg;
+                unsigned long key;
-                hlist_del(&cg->hlist);
+                hash_del(&cg->hlist);
                cg->subsys[ss->subsys_id] = NULL;
-                hhead = css_set_hash(cg->subsys);
+                key = css_set_hash(cg->subsys);
-                hlist_add_head(&cg->hlist, hhead);
+                hash_add(css_set_table, &cg->hlist, key);
        }
        write_unlock(&css_set_lock);
@@ -4631,9 +4683,6 @@ int __init cgroup_init_early(void)
        list_add(&init_css_set_link.cg_link_list,
                 &init_css_set.cg_links);
-        for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
-                INIT_HLIST_HEAD(&css_set_table[i]);
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup_subsys *ss = subsys[i];
@@ -4667,7 +4716,7 @@ int __init cgroup_init(void)
 {
        int err;
        int i;
-        struct hlist_head *hhead;
+        unsigned long key;
        err = bdi_init(&cgroup_backing_dev_info);
        if (err)
@@ -4686,8 +4735,8 @@ int __init cgroup_init(void)
        }
        /* Add init_css_set to the hash table */
-        hhead = css_set_hash(init_css_set.subsys);
+        key = css_set_hash(init_css_set.subsys);
-        hlist_add_head(&init_css_set.hlist, hhead);
+        hash_add(css_set_table, &init_css_set.hlist, key);
        BUG_ON(!init_root_id(&rootnode));
        cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4982,8 +5031,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
        }
        task_unlock(tsk);
-        if (cg)
+        put_css_set_taskexit(cg);
-                put_css_set_taskexit(cg);
 }
 /**
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index e0e07fd55508..65349f07b878 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,29 +1,41 @@
+/*
+ * Context tracking: Probe on high level context boundaries such as kernel
+ * and userspace. This includes syscalls and exceptions entry/exit.
+ *
+ * This is used by RCU to remove its dependency on the timer tick while a CPU
+ * runs in userspace.
+ *
+ *  Started by Frederic Weisbecker:
+ *
+ * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
+ * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
+ *
+ */
 #include <linux/context_tracking.h>
+#include <linux/kvm_host.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
-#include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/export.h>
-struct context_tracking {
+DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
-        /*
-         * When active is false, hooks are not set to
-         * minimize overhead: TIF flags are cleared
-         * and calls to user_enter/exit are ignored. This
-         * may be further optimized using static keys.
-         */
-        bool active;
-        enum {
-                IN_KERNEL = 0,
-                IN_USER,
-        } state;
-};
-static DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
 #ifdef CONFIG_CONTEXT_TRACKING_FORCE
        .active = true,
 #endif
 };
+/**
+ * user_enter - Inform the context tracking that the CPU is going to
+ *              enter userspace mode.
+ *
+ * This function must be called right before we switch from the kernel
+ * to userspace, when it's guaranteed the remaining kernel instructions
+ * to execute won't use any RCU read side critical section because this
+ * function sets RCU in extended quiescent state.
+ */
 void user_enter(void)
 {
        unsigned long flags;
@@ -39,40 +51,90 @@ void user_enter(void)
        if (in_interrupt())
                return;
+        /* Kernel threads aren't supposed to go to userspace */
        WARN_ON_ONCE(!current->mm);
        local_irq_save(flags);
        if (__this_cpu_read(context_tracking.active) &&
            __this_cpu_read(context_tracking.state) != IN_USER) {
-                __this_cpu_write(context_tracking.state, IN_USER);
+                /*
+                 * At this stage, only low level arch entry code remains and
+                 * then we'll run in userspace. We can assume there won't be
+                 * any RCU read-side critical section until the next call to
+                 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
+                 * on the tick.
+                 */
+                vtime_user_enter(current);
                rcu_user_enter();
+                __this_cpu_write(context_tracking.state, IN_USER);
        }
        local_irq_restore(flags);
 }
+/**
+ * user_exit - Inform the context tracking that the CPU is
+ *             exiting userspace mode and entering the kernel.
+ *
+ * This function must be called after we entered the kernel from userspace
+ * before any use of RCU read side critical section. This potentially includes
+ * any high level kernel code like syscalls, exceptions, signal handling, etc...
+ *
+ * This call supports re-entrancy. This way it can be called from any exception
+ * handler without needing to know if we came from userspace or not.
+ */
 void user_exit(void)
 {
        unsigned long flags;
-        /*
-         * Some contexts may involve an exception occuring in an irq,
-         * leading to that nesting:
-         * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
-         * This would mess up the dyntick_nesting count though. And rcu_irq_*()
-         * helpers are enough to protect RCU uses inside the exception. So
-         * just return immediately if we detect we are in an IRQ.
-         */
        if (in_interrupt())
                return;
        local_irq_save(flags);
        if (__this_cpu_read(context_tracking.state) == IN_USER) {
-                __this_cpu_write(context_tracking.state, IN_KERNEL);
+                /*
+                 * We are going to run code that may use RCU. Inform
+                 * RCU core about that (ie: we may need the tick again).
+                 */
                rcu_user_exit();
+                vtime_user_exit(current);
+                __this_cpu_write(context_tracking.state, IN_KERNEL);
        }
        local_irq_restore(flags);
 }
+void guest_enter(void)
+{
+        if (vtime_accounting_enabled())
+                vtime_guest_enter(current);
+        else
+                __guest_enter();
+}
+EXPORT_SYMBOL_GPL(guest_enter);
+void guest_exit(void)
+{
+        if (vtime_accounting_enabled())
+                vtime_guest_exit(current);
+        else
+                __guest_exit();
+}
+EXPORT_SYMBOL_GPL(guest_exit);
+/**
+ * context_tracking_task_switch - context switch the syscall callbacks
+ * @prev: the task that is being switched out
+ * @next: the task that is being switched in
+ *
+ * The context tracking uses the syscall slow path to implement its user-kernel
+ * boundaries probes on syscalls. This way it doesn't impact the syscall fast
+ * path on CPUs that don't do context tracking.
+ *
+ * But we need to clear the flag on the previous task because it may later
+ * migrate to some CPU that doesn't do the context tracking. As such the TIF
+ * flag may not be desired there.
+ */
 void context_tracking_task_switch(struct task_struct *prev,
                             struct task_struct *next)
 {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3046a503242c..b5e4ab2d427e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu)
 static inline void check_for_tasks(int cpu)
 {
        struct task_struct *p;
+        cputime_t utime, stime;
        write_lock_irq(&tasklist_lock);
        for_each_process(p) {
+                task_cputime(p, &utime, &stime);
                if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
-                    (p->utime || p->stime))
+                    (utime || stime))
                        printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
                                "(state = %ld, flags = %x)\n",
                                p->comm, task_pid_nr(p), cpu,
@@ -254,6 +256,8 @@ static int __ref take_cpu_down(void *_param)
                return err;
        cpu_notify(CPU_DYING | param->mod, param->hcpu);
+        /* Park the stopper thread */
+        kthread_park(current);
        return 0;
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7bb63eea6eb8..4f9dfe43ecbd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@
 #include <linux/cgroup.h>
 /*
- * Workqueue for cpuset related tasks.
- *
- * Using kevent workqueue may cause deadlock when memory_migrate
- * is set. So we create a separate workqueue thread for cpuset.
- */
-static struct workqueue_struct *cpuset_wq;
-/*
 * Tracks how many cpusets are currently defined in system.
 * When there is only one cpuset (the root cpuset) we can
 * short circuit some hooks.
@@ -95,18 +87,21 @@ struct cpuset {
        cpumask_var_t cpus_allowed;     /* CPUs allowed to tasks in cpuset */
        nodemask_t mems_allowed;        /* Memory Nodes allowed to tasks */
-        struct cpuset *parent;          /* my parent */
        struct fmeter fmeter;           /* memory_pressure filter */
+        /*
+         * Tasks are being attached to this cpuset.  Used to prevent
+         * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+         */
+        int attach_in_progress;
        /* partition number for rebuild_sched_domains() */
        int pn;
        /* for custom sched domain */
        int relax_domain_level;
-        /* used for walking a cpuset hierarchy */
+        struct work_struct hotplug_work;
-        struct list_head stack_list;
 };
 /* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
                            struct cpuset, css);
 }
+static inline struct cpuset *parent_cs(const struct cpuset *cs)
+{
+        struct cgroup *pcgrp = cs->css.cgroup->parent;
+        if (pcgrp)
+                return cgroup_cs(pcgrp);
+        return NULL;
+}
 #ifdef CONFIG_NUMA
 static inline bool task_has_mempolicy(struct task_struct *task)
 {
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
 /* bits in struct cpuset flags field */
 typedef enum {
+        CS_ONLINE,
        CS_CPU_EXCLUSIVE,
        CS_MEM_EXCLUSIVE,
        CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@ typedef enum {
        CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
-/* the type of hotplug event */
-enum hotplug_event {
-        CPUSET_CPU_OFFLINE,
-        CPUSET_MEM_OFFLINE,
-};
 /* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+        return test_bit(CS_ONLINE, &cs->flags);
+}
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
        return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)
 }
 static struct cpuset top_cpuset = {
-        .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+        .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+                  (1 << CS_MEM_EXCLUSIVE)),
 };
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_cgrp: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs.  Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs)            \
+        cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup)      \
+                if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @root_cs: target cpuset to walk descendants of
+ *
+ * Walk @des_cs through the online descendants of @root_cs.  Must be used
+ * with RCU read locked.  The caller may modify @pos_cgrp by calling
+ * cgroup_rightmost_descendant() to skip subtree.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs)       \
+        cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
+                if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
 /*
- * There are two global mutexes guarding cpuset structures.  The first
+ * There are two global mutexes guarding cpuset structures - cpuset_mutex
- * is the main control groups cgroup_mutex, accessed via
+ * and callback_mutex.  The latter may nest inside the former.  We also
- * cgroup_lock()/cgroup_unlock().  The second is the cpuset-specific
+ * require taking task_lock() when dereferencing a task's cpuset pointer.
- * callback_mutex, below. They can nest.  It is ok to first take
+ * See "The task_lock() exception", at the end of this comment.
- * cgroup_mutex, then nest callback_mutex.  We also require taking
+ *
- * task_lock() when dereferencing a task's cpuset pointer.  See "The
+ * A task must hold both mutexes to modify cpusets.  If a task holds
- * task_lock() exception", at the end of this comment.
+ * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
- *
+ * is the only task able to also acquire callback_mutex and be able to
- * A task must hold both mutexes to modify cpusets.  If a task
+ * modify cpusets.  It can perform various checks on the cpuset structure
- * holds cgroup_mutex, then it blocks others wanting that mutex,
+ * first, knowing nothing will change.  It can also allocate memory while
- * ensuring that it is the only task able to also acquire callback_mutex
+ * just holding cpuset_mutex.  While it is performing these checks, various
- * and be able to modify cpusets.  It can perform various checks on
+ * callback routines can briefly acquire callback_mutex to query cpusets.
- * the cpuset structure first, knowing nothing will change.  It can
+ * Once it is ready to make the changes, it takes callback_mutex, blocking
- * also allocate memory while just holding cgroup_mutex.  While it is
+ * everyone else.
- * performing these checks, various callback routines can briefly
- * acquire callback_mutex to query cpusets.  Once it is ready to make
- * the changes, it takes callback_mutex, blocking everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_mutex, as that would risk double tripping on callback_mutex
@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = {
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */
+static DEFINE_MUTEX(cpuset_mutex);
 static DEFINE_MUTEX(callback_mutex);
 /*
@@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
 static DEFINE_SPINLOCK(cpuset_buffer_lock);
 /*
+ * CPU / memory hotplug is handled asynchronously.
+ */
+static struct workqueue_struct *cpuset_propagate_hotplug_wq;
+static void cpuset_hotplug_workfn(struct work_struct *work);
+static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
+static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
+static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
+/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
                                  struct cpumask *pmask)
 {
        while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
-                cs = cs->parent;
+                cs = parent_cs(cs);
        if (cs)
                cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
        else
@@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 {
        while (cs && !nodes_intersects(cs->mems_allowed,
                                        node_states[N_MEMORY]))
-                cs = cs->parent;
+                cs = parent_cs(cs);
        if (cs)
                nodes_and(*pmask, cs->mems_allowed,
                                        node_states[N_MEMORY]);
@@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 /*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
- * Called with callback_mutex/cgroup_mutex held
+ * Called with callback_mutex/cpuset_mutex held
 */
 static void cpuset_update_task_spread_flag(struct cpuset *cs,
                                        struct task_struct *tsk)
@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set.  Call holding cgroup_mutex.
+ * are only set if the other's are set.  Call holding cpuset_mutex.
 */
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial)
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
- * cgroup_mutex held.
+ * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
@@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 {
        struct cgroup *cont;
        struct cpuset *c, *par;
+        int ret;
+        rcu_read_lock();
        /* Each of our child cpusets must be a subset of us */
-        list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
+        ret = -EBUSY;
-                if (!is_cpuset_subset(cgroup_cs(cont), trial))
+        cpuset_for_each_child(c, cont, cur)
-                        return -EBUSY;
+                if (!is_cpuset_subset(c, trial))
-        }
+                        goto out;
        /* Remaining checks don't apply to root cpuset */
+        ret = 0;
        if (cur == &top_cpuset)
-                return 0;
+                goto out;
-        par = cur->parent;
+        par = parent_cs(cur);
        /* We must be a subset of our parent cpuset */
+        ret = -EACCES;
        if (!is_cpuset_subset(trial, par))
-                return -EACCES;
+                goto out;
        /*
         * If either I or some sibling (!= me) is exclusive, we can't
         * overlap
         */
-        list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
+        ret = -EINVAL;
-                c = cgroup_cs(cont);
+        cpuset_for_each_child(c, cont, par) {
                if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
                    c != cur &&
                    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
-                        return -EINVAL;
+                        goto out;
                if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
                    c != cur &&
                    nodes_intersects(trial->mems_allowed, c->mems_allowed))
-                        return -EINVAL;
+                        goto out;
        }
-        /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
+        /*
-        if (cgroup_task_count(cur->css.cgroup)) {
+         * Cpusets with tasks - existing or newly being attached - can't
-                if (cpumask_empty(trial->cpus_allowed) ||
+         * have empty cpus_allowed or mems_allowed.
-                    nodes_empty(trial->mems_allowed)) {
+         */
-                        return -ENOSPC;
+        ret = -ENOSPC;
-                }
+        if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
-        }
+            (cpumask_empty(trial->cpus_allowed) ||
+             nodes_empty(trial->mems_allowed)))
+                goto out;
-        return 0;
+        ret = 0;
+out:
+        rcu_read_unlock();
+        return ret;
 }
 #ifdef CONFIG_SMP
@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
        return;
 }
-static void
+static void update_domain_attr_tree(struct sched_domain_attr *dattr,
-update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+                                    struct cpuset *root_cs)
 {
-        LIST_HEAD(q);
+        struct cpuset *cp;
+        struct cgroup *pos_cgrp;
-        list_add(&c->stack_list, &q);
-        while (!list_empty(&q)) {
-                struct cpuset *cp;
-                struct cgroup *cont;
-                struct cpuset *child;
-                cp = list_first_entry(&q, struct cpuset, stack_list);
-                list_del(q.next);
-                if (cpumask_empty(cp->cpus_allowed))
+        rcu_read_lock();
+        cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+                /* skip the whole subtree if @cp doesn't have any CPU */
+                if (cpumask_empty(cp->cpus_allowed)) {
+                        pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
                        continue;
+                }
                if (is_sched_load_balance(cp))
                        update_domain_attr(dattr, cp);
-                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
-                        child = cgroup_cs(cont);
-                        list_add_tail(&child->stack_list, &q);
-                }
        }
+        rcu_read_unlock();
 }
 /*
@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
- * Must be called with cgroup_lock held.
+ * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *    q  - a linked-list queue of cpuset pointers, used to implement a
@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 static int generate_sched_domains(cpumask_var_t **domains,
                        struct sched_domain_attr **attributes)
 {
-        LIST_HEAD(q);           /* queue of cpusets to be scanned */
        struct cpuset *cp;      /* scans q */
        struct cpuset **csa;    /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
        struct sched_domain_attr *dattr;  /* attributes for custom domains */
        int ndoms = 0;          /* number of sched domains in result */
        int nslot;              /* next empty doms[] struct cpumask slot */
+        struct cgroup *pos_cgrp;
        doms = NULL;
        dattr = NULL;
@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
                goto done;
        csn = 0;
-        list_add(&top_cpuset.stack_list, &q);
+        rcu_read_lock();
-        while (!list_empty(&q)) {
+        cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
-                struct cgroup *cont;
-                struct cpuset *child;   /* scans child cpusets of cp */
-                cp = list_first_entry(&q, struct cpuset, stack_list);
-                list_del(q.next);
-                if (cpumask_empty(cp->cpus_allowed))
-                        continue;
                /*
-                 * All child cpusets contain a subset of the parent's cpus, so
+                 * Continue traversing beyond @cp iff @cp has some CPUs and
-                 * just skip them, and then we call update_domain_attr_tree()
+                 * isn't load balancing.  The former is obvious.  The
-                 * to calc relax_domain_level of the corresponding sched
+                 * latter: All child cpusets contain a subset of the
-                 * domain.
+                 * parent's cpus, so just skip them, and then we call
+                 * update_domain_attr_tree() to calc relax_domain_level of
+                 * the corresponding sched domain.
                 */
-                if (is_sched_load_balance(cp)) {
+                if (!cpumask_empty(cp->cpus_allowed) &&
-                        csa[csn++] = cp;
+                    !is_sched_load_balance(cp))
                        continue;
-                }
-                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+                if (is_sched_load_balance(cp))
-                        child = cgroup_cs(cont);
+                        csa[csn++] = cp;
-                        list_add_tail(&child->stack_list, &q);
-                }
+                /* skip @cp's subtree */
-        }
+                pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+        }
+        rcu_read_unlock();
        for (i = 0; i < csn; i++)
                csa[i]->pn = i;
@@ -725,25 +763,25 @@ done:
 /*
 * Rebuild scheduler domains.
 *
- * Call with neither cgroup_mutex held nor within get_online_cpus().
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
- * Takes both cgroup_mutex and get_online_cpus().
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
 *
- * Cannot be directly called from cpuset code handling changes
+ * Call with cpuset_mutex held.  Takes get_online_cpus().
- * to the cpuset pseudo-filesystem, because it cannot be called
- * from code that already holds cgroup_mutex.
 */
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
 {
        struct sched_domain_attr *attr;
        cpumask_var_t *doms;
        int ndoms;
+        lockdep_assert_held(&cpuset_mutex);
        get_online_cpus();
        /* Generate domain masks and attrs */
-        cgroup_lock();
        ndoms = generate_sched_domains(&doms, &attr);
-        cgroup_unlock();
        /* Have scheduler rebuild the domains */
        partition_sched_domains(ndoms, doms, attr);
@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
        put_online_cpus();
 }
 #else /* !CONFIG_SMP */
-static void do_rebuild_sched_domains(struct work_struct *unused)
+static void rebuild_sched_domains_locked(void)
 {
 }
@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains,
 }
 #endif /* CONFIG_SMP */
-static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
-/*
- * Rebuild scheduler domains, asynchronously via workqueue.
- *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
- *
- * The rebuild_sched_domains() and partition_sched_domains()
- * routines must nest cgroup_lock() inside get_online_cpus(),
- * but such cpuset changes as these must nest that locking the
- * other way, holding cgroup_lock() for much of the code.
- *
- * So in order to avoid an ABBA deadlock, the cpuset code handling
- * these user changes delegates the actual sched domain rebuilding
- * to a separate workqueue thread, which ends up processing the
- * above do_rebuild_sched_domains() function.
- */
-static void async_rebuild_sched_domains(void)
-{
-        queue_work(cpuset_wq, &rebuild_sched_domains_work);
-}
-/*
- * Accomplishes the same scheduler domain rebuild as the above
- * async_rebuild_sched_domains(), however it directly calls the
- * rebuild routine synchronously rather than calling it via an
- * asynchronous work thread.
- *
- * This can only be called from code that is not holding
- * cgroup_mutex (not nested in a cgroup_lock() call.)
- */
 void rebuild_sched_domains(void)
 {
-        do_rebuild_sched_domains(NULL);
+        mutex_lock(&cpuset_mutex);
+        rebuild_sched_domains_locked();
+        mutex_unlock(&cpuset_mutex);
 }
 /**
@@ -808,7 +813,7 @@ void rebuild_sched_domains(void)
 * @tsk: task to test
 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
 *
- * Call with cgroup_mutex held.  May take callback_mutex during call.
+ * Call with cpuset_mutex held.  May take callback_mutex during call.
 * Called for each task in a cgroup by cgroup_scan_tasks().
 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
 * words, if its mask is not equal to its cpuset's mask).
@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
 * cpus_allowed mask needs to be changed.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
 */
 static void cpuset_change_cpumask(struct task_struct *tsk,
                                  struct cgroup_scanner *scan)
@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
        heap_free(&heap);
        if (is_load_balanced)
-                async_rebuild_sched_domains();
+                rebuild_sched_domains_locked();
        return 0;
 }
@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 *    Temporarilly set tasks mems_allowed to target nodes of migration,
 *    so that the migration code can allocate pages on these nodes.
 *
- *    Call holding cgroup_mutex, so current's cpuset won't change
+ *    Call holding cpuset_mutex, so current's cpuset won't change
 *    during this call, as manage_mutex holds off any cpuset_attach()
 *    calls.  Therefore we don't need to take task_lock around the
 *    call to guarantee_online_mems(), as we know no one is changing
@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 /*
 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
- * memory_migrate flag is set. Called with cgroup_mutex held.
+ * memory_migrate flag is set. Called with cpuset_mutex held.
 */
 static void cpuset_change_nodemask(struct task_struct *p,
                                   struct cgroup_scanner *scan)
@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
        struct cpuset *cs;
        int migrate;
        const nodemask_t *oldmem = scan->data;
-        static nodemask_t newmems;      /* protected by cgroup_mutex */
+        static nodemask_t newmems;      /* protected by cpuset_mutex */
        cs = cgroup_cs(scan->cg);
        guarantee_online_mems(cs, &newmems);
@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound;
 * @oldmem: old mems_allowed of cpuset cs
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 */
@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
         * take while holding tasklist_lock.  Forks can happen - the
         * mpol_dup() cpuset_being_rebound check will catch such forks,
         * and rebind their vma mempolicies too.  Because we still hold
-         * the global cgroup_mutex, we know that no other rebind effort
+         * the global cpuset_mutex, we know that no other rebind effort
         * will be contending for the global variable cpuset_being_rebound.
         * It's ok if we rebind the same mm twice; mpol_rebind_mm()
         * is idempotent.  Also migrate pages in each mm to new nodes.
@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the tasks pages to the new memory.
 *
- * Call with cgroup_mutex held.  May take callback_mutex during call.
+ * Call with cpuset_mutex held.  May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
                cs->relax_domain_level = val;
                if (!cpumask_empty(cs->cpus_allowed) &&
                    is_sched_load_balance(cs))
-                        async_rebuild_sched_domains();
+                        rebuild_sched_domains_locked();
        }
        return 0;
@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
 */
 static void cpuset_change_flag(struct task_struct *tsk,
                                struct cgroup_scanner *scan)
@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
 * @cs: the cpuset in which each task's spread flags needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
 * cs:          the cpuset to update
 * turning_on:  whether the flag is being set or cleared
 *
- * Call with cgroup_mutex held.
+ * Call with cpuset_mutex held.
 */
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
        mutex_unlock(&callback_mutex);
        if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-                async_rebuild_sched_domains();
+                rebuild_sched_domains_locked();
        if (spread_flag_changed)
                update_tasks_flags(cs, &heap);
@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp)
        return val;
 }
-/*
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
- * Protected by cgroup_lock. The nodemasks must be stored globally because
- * dynamically allocating them is not allowed in can_attach, and they must
- * persist until attach.
- */
-static cpumask_var_t cpus_attach;
-static nodemask_t cpuset_attach_nodemask_from;
-static nodemask_t cpuset_attach_nodemask_to;
-/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
 static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
        struct cpuset *cs = cgroup_cs(cgrp);
        struct task_struct *task;
        int ret;
+        mutex_lock(&cpuset_mutex);
+        ret = -ENOSPC;
        if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
-                return -ENOSPC;
+                goto out_unlock;
        cgroup_taskset_for_each(task, cgrp, tset) {
                /*
@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
                 * set_cpus_allowed_ptr() on all attached tasks before
                 * cpus_allowed may be changed.
                 */
+                ret = -EINVAL;
                if (task->flags & PF_THREAD_BOUND)
-                        return -EINVAL;
+                        goto out_unlock;
-                if ((ret = security_task_setscheduler(task)))
+                ret = security_task_setscheduler(task);
-                        return ret;
+                if (ret)
+                        goto out_unlock;
        }
-        /* prepare for attach */
+        /*
-        if (cs == &top_cpuset)
+         * Mark attach as in progress.  This makes validate_change() fail
-                cpumask_copy(cpus_attach, cpu_possible_mask);
+         * changes which zero cpus/mems_allowed.
-        else
+         */
-                guarantee_online_cpus(cs, cpus_attach);
+        cs->attach_in_progress++;
+        ret = 0;
-        guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+out_unlock:
+        mutex_unlock(&cpuset_mutex);
+        return ret;
+}
-        return 0;
+static void cpuset_cancel_attach(struct cgroup *cgrp,
+                                 struct cgroup_taskset *tset)
+{
+        mutex_lock(&cpuset_mutex);
+        cgroup_cs(cgrp)->attach_in_progress--;
+        mutex_unlock(&cpuset_mutex);
 }
+/*
+ * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach()
+ * but we can't allocate it dynamically there.  Define it global and
+ * allocate from cpuset_init().
+ */
+static cpumask_var_t cpus_attach;
 static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
+        /* static bufs protected by cpuset_mutex */
+        static nodemask_t cpuset_attach_nodemask_from;
+        static nodemask_t cpuset_attach_nodemask_to;
        struct mm_struct *mm;
        struct task_struct *task;
        struct task_struct *leader = cgroup_taskset_first(tset);
@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
        struct cpuset *cs = cgroup_cs(cgrp);
        struct cpuset *oldcs = cgroup_cs(oldcgrp);
+        mutex_lock(&cpuset_mutex);
+        /* prepare for attach */
+        if (cs == &top_cpuset)
+                cpumask_copy(cpus_attach, cpu_possible_mask);
+        else
+                guarantee_online_cpus(cs, cpus_attach);
+        guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
        cgroup_taskset_for_each(task, cgrp, tset) {
                /*
                 * can_attach beforehand should guarantee that this doesn't
@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
                                          &cpuset_attach_nodemask_to);
                mmput(mm);
        }
+        cs->attach_in_progress--;
+        /*
+         * We may have raced with CPU/memory hotunplug.  Trigger hotplug
+         * propagation if @cs doesn't have any CPU or memory.  It will move
+         * the newly added tasks to the nearest parent which can execute.
+         */
+        if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+                schedule_cpuset_propagate_hotplug(cs);
+        mutex_unlock(&cpuset_mutex);
 }
 /* The various types of files and directories in a cpuset file system */
@@ -1469,12 +1510,13 @@ typedef enum {
 static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
-        int retval = 0;
        struct cpuset *cs = cgroup_cs(cgrp);
        cpuset_filetype_t type = cft->private;
+        int retval = -ENODEV;
-        if (!cgroup_lock_live_group(cgrp))
+        mutex_lock(&cpuset_mutex);
-                return -ENODEV;
+        if (!is_cpuset_online(cs))
+                goto out_unlock;
        switch (type) {
        case FILE_CPU_EXCLUSIVE:
@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
                retval = -EINVAL;
                break;
        }
-        cgroup_unlock();
+out_unlock:
+        mutex_unlock(&cpuset_mutex);
        return retval;
 }
 static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 {
-        int retval = 0;
        struct cpuset *cs = cgroup_cs(cgrp);
        cpuset_filetype_t type = cft->private;
+        int retval = -ENODEV;
-        if (!cgroup_lock_live_group(cgrp))
+        mutex_lock(&cpuset_mutex);
-                return -ENODEV;
+        if (!is_cpuset_online(cs))
+                goto out_unlock;
        switch (type) {
        case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
                retval = -EINVAL;
                break;
        }
-        cgroup_unlock();
+out_unlock:
+        mutex_unlock(&cpuset_mutex);
        return retval;
 }
@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
                                const char *buf)
 {
-        int retval = 0;
        struct cpuset *cs = cgroup_cs(cgrp);
        struct cpuset *trialcs;
+        int retval = -ENODEV;
+        /*
+         * CPU or memory hotunplug may leave @cs w/o any execution
+         * resources, in which case the hotplug code asynchronously updates
+         * configuration and transfers all tasks to the nearest ancestor
+         * which can execute.
+         *
+         * As writes to "cpus" or "mems" may restore @cs's execution
+         * resources, wait for the previously scheduled operations before
+         * proceeding, so that we don't keep removing tasks added
+         * after execution capability is restored.
+         *
+         * Flushing cpuset_hotplug_work is enough to synchronize against
+         * hotplug handling; however, cpuset_attach() may schedule
+         * propagation work directly.  Flush the workqueue too.
+         */
+        flush_work(&cpuset_hotplug_work);
+        flush_workqueue(cpuset_propagate_hotplug_wq);
-        if (!cgroup_lock_live_group(cgrp))
+        mutex_lock(&cpuset_mutex);
-                return -ENODEV;
+        if (!is_cpuset_online(cs))
+                goto out_unlock;
        trialcs = alloc_trial_cpuset(cs);
        if (!trialcs) {
                retval = -ENOMEM;
-                goto out;
+                goto out_unlock;
        }
        switch (cft->private) {
@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
        }
        free_trial_cpuset(trialcs);
-out:
+out_unlock:
-        cgroup_unlock();
+        mutex_unlock(&cpuset_mutex);
        return retval;
 }
@@ -1790,15 +1854,12 @@ static struct cftype files[] = {
 static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
 {
-        struct cgroup *parent_cg = cont->parent;
+        struct cpuset *cs;
-        struct cgroup *tmp_cg;
-        struct cpuset *parent, *cs;
-        if (!parent_cg)
+        if (!cont->parent)
                return &top_cpuset.css;
-        parent = cgroup_cs(parent_cg);
-        cs = kmalloc(sizeof(*cs), GFP_KERNEL);
+        cs = kzalloc(sizeof(*cs), GFP_KERNEL);
        if (!cs)
                return ERR_PTR(-ENOMEM);
        if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
                return ERR_PTR(-ENOMEM);
        }
-        cs->flags = 0;
-        if (is_spread_page(parent))
-                set_bit(CS_SPREAD_PAGE, &cs->flags);
-        if (is_spread_slab(parent))
-                set_bit(CS_SPREAD_SLAB, &cs->flags);
        set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
        cpumask_clear(cs->cpus_allowed);
        nodes_clear(cs->mems_allowed);
        fmeter_init(&cs->fmeter);
+        INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
        cs->relax_domain_level = -1;
-        cs->parent = parent;
+        return &cs->css;
+}
+static int cpuset_css_online(struct cgroup *cgrp)
+{
+        struct cpuset *cs = cgroup_cs(cgrp);
+        struct cpuset *parent = parent_cs(cs);
+        struct cpuset *tmp_cs;
+        struct cgroup *pos_cg;
+        if (!parent)
+                return 0;
+        mutex_lock(&cpuset_mutex);
+        set_bit(CS_ONLINE, &cs->flags);
+        if (is_spread_page(parent))
+                set_bit(CS_SPREAD_PAGE, &cs->flags);
+        if (is_spread_slab(parent))
+                set_bit(CS_SPREAD_SLAB, &cs->flags);
        number_of_cpusets++;
-        if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags))
+        if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
-                goto skip_clone;
+                goto out_unlock;
        /*
         * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
@@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
         * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
         * (and likewise for mems) to the new cgroup.
         */
-        list_for_each_entry(tmp_cg, &parent_cg->children, sibling) {
+        rcu_read_lock();
-                struct cpuset *tmp_cs = cgroup_cs(tmp_cg);
+        cpuset_for_each_child(tmp_cs, pos_cg, parent) {
+                if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
-                if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs))
+                        rcu_read_unlock();
-                        goto skip_clone;
+                        goto out_unlock;
+                }
        }
+        rcu_read_unlock();
        mutex_lock(&callback_mutex);
        cs->mems_allowed = parent->mems_allowed;
        cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
        mutex_unlock(&callback_mutex);
-skip_clone:
+out_unlock:
-        return &cs->css;
+        mutex_unlock(&cpuset_mutex);
+        return 0;
+}
+static void cpuset_css_offline(struct cgroup *cgrp)
+{
+        struct cpuset *cs = cgroup_cs(cgrp);
+        mutex_lock(&cpuset_mutex);
+        if (is_sched_load_balance(cs))
+                update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+        number_of_cpusets--;
+        clear_bit(CS_ONLINE, &cs->flags);
+        mutex_unlock(&cpuset_mutex);
 }
 /*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
- * will call async_rebuild_sched_domains().
+ * will call rebuild_sched_domains_locked().
 */
 static void cpuset_css_free(struct cgroup *cont)
 {
        struct cpuset *cs = cgroup_cs(cont);
-        if (is_sched_load_balance(cs))
-                update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
-        number_of_cpusets--;
        free_cpumask_var(cs->cpus_allowed);
        kfree(cs);
 }
@@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont)
 struct cgroup_subsys cpuset_subsys = {
        .name = "cpuset",
        .css_alloc = cpuset_css_alloc,
+        .css_online = cpuset_css_online,
+        .css_offline = cpuset_css_offline,
        .css_free = cpuset_css_free,
        .can_attach = cpuset_can_attach,
+        .cancel_attach = cpuset_cancel_attach,
        .attach = cpuset_attach,
        .subsys_id = cpuset_subsys_id,
        .base_cftypes = files,
@@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,
 {
        struct cgroup *new_cgroup = scan->data;
+        cgroup_lock();
        cgroup_attach_task(new_cgroup, tsk);
+        cgroup_unlock();
 }
 /**
@@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
 * @from: cpuset in which the tasks currently reside
 * @to: cpuset to which the tasks will be moved
 *
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
@@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
 */
 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 {
        struct cpuset *parent;
        /*
-         * The cgroup's css_sets list is in use if there are tasks
-         * in the cpuset; the list is empty if there are none;
-         * the cs->css.refcnt seems always 0.
-         */
-        if (list_empty(&cs->css.cgroup->css_sets))
-                return;
-        /*
         * Find its next-highest non-empty parent, (top cpuset
         * has online cpus, so can't be empty).
         */
-        parent = cs->parent;
+        parent = parent_cs(cs);
        while (cpumask_empty(parent->cpus_allowed) ||
                        nodes_empty(parent->mems_allowed))
-                parent = parent->parent;
+                parent = parent_cs(parent);
        move_member_tasks_to_cpuset(cs, parent);
 }
-/*
+/**
- * Helper function to traverse cpusets.
+ * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
- * It can be used to walk the cpuset tree from top to bottom, completing
+ * @work: hotplug_work of the cpuset of interest (referred to as @cs below)
- * one layer before dropping down to the next (thus always processing a
+ *
- * node before any of its children).
+ * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
+ * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
+ * all its tasks are moved to the nearest ancestor with both resources.
 */
-static struct cpuset *cpuset_next(struct list_head *queue)
+static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
 {
-        struct cpuset *cp;
+        static cpumask_t off_cpus;
-        struct cpuset *child;   /* scans child cpusets of cp */
+        static nodemask_t off_mems, tmp_mems;
-        struct cgroup *cont;
+        struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
+        bool is_empty;
-        if (list_empty(queue))
+        mutex_lock(&cpuset_mutex);
-                return NULL;
+        cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
+        nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
-        cp = list_first_entry(queue, struct cpuset, stack_list);
+        /* remove offline cpus from @cs */
-        list_del(queue->next);
+        if (!cpumask_empty(&off_cpus)) {
-        list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+                mutex_lock(&callback_mutex);
-                child = cgroup_cs(cont);
+                cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
-                list_add_tail(&child->stack_list, queue);
+                mutex_unlock(&callback_mutex);
+                update_tasks_cpumask(cs, NULL);
+        }
+        /* remove offline mems from @cs */
+        if (!nodes_empty(off_mems)) {
+                tmp_mems = cs->mems_allowed;
+                mutex_lock(&callback_mutex);
+                nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+                mutex_unlock(&callback_mutex);
+                update_tasks_nodemask(cs, &tmp_mems, NULL);
        }
-        return cp;
+        is_empty = cpumask_empty(cs->cpus_allowed) ||
+                nodes_empty(cs->mems_allowed);
+        mutex_unlock(&cpuset_mutex);
+        /*
+         * If @cs became empty, move tasks to the nearest ancestor with
+         * execution resources.  This is a full cgroup operation which will
+         * also call back into cpuset.  Should be done outside any lock.
+         */
+        if (is_empty)
+                remove_tasks_in_empty_cpuset(cs);
+        /* the following may free @cs, should be the last operation */
+        css_put(&cs->css);
 }
+/**
+ * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
+ * @cs: cpuset of interest
+ *
+ * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
+ * memory masks according to top_cpuset.
+ */
+static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
+{
+        /*
+         * Pin @cs.  The refcnt will be released when the work item
+         * finishes executing.
+         */
+        if (!css_tryget(&cs->css))
+                return;
-/*
+        /*
- * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
+         * Queue @cs->hotplug_work.  If already pending, lose the css ref.
- * online/offline) and update the cpusets accordingly.
+         * cpuset_propagate_hotplug_wq is ordered and propagation will
- * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
+         * happen in the order this function is called.
- * cpuset must be moved to a parent cpuset.
+         */
+        if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
+                css_put(&cs->css);
+}
+/**
+ * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 *
- * Called with cgroup_mutex held.  We take callback_mutex to modify
+ * This function is called after either CPU or memory configuration has
- * cpus_allowed and mems_allowed.
+ * changed and updates cpuset accordingly.  The top_cpuset is always
+ * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
+ * order to make cpusets transparent (of no effect) on systems that are
+ * actively using CPU hotplug but making no active use of cpusets.
 *
- * This walk processes the tree from top to bottom, completing one layer
+ * Non-root cpusets are only affected by offlining.  If any CPUs or memory
- * before dropping down to the next.  It always processes a node before
+ * nodes went down, schedule_cpuset_propagate_hotplug() is invoked on all
- * any of its children.
+ * descendants.
 *
- * In the case of memory hot-unplug, it will remove nodes from N_MEMORY
+ * Note that CPU offlining during suspend is ignored.  We don't modify
- * if all present pages from a node are offlined.
+ * cpusets across suspend/resume cycles at all.
 */
-static void
+static void cpuset_hotplug_workfn(struct work_struct *work)
-scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
 {
-        LIST_HEAD(queue);
+        static cpumask_t new_cpus, tmp_cpus;
-        struct cpuset *cp;              /* scans cpusets being updated */
+        static nodemask_t new_mems, tmp_mems;
-        static nodemask_t oldmems;      /* protected by cgroup_mutex */
+        bool cpus_updated, mems_updated;
+        bool cpus_offlined, mems_offlined;
-        list_add_tail((struct list_head *)&root->stack_list, &queue);
+        mutex_lock(&cpuset_mutex);
-        switch (event) {
+        /* fetch the available cpus/mems and find out which changed how */
-        case CPUSET_CPU_OFFLINE:
+        cpumask_copy(&new_cpus, cpu_active_mask);
-                while ((cp = cpuset_next(&queue)) != NULL) {
+        new_mems = node_states[N_MEMORY];
-                        /* Continue past cpusets with all cpus online */
+        cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
-                        if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
+        cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
-                                continue;
+                                       &new_cpus);
-                        /* Remove offline cpus from this cpuset. */
+        mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
-                        mutex_lock(&callback_mutex);
+        nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
-                        cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+        mems_offlined = !nodes_empty(tmp_mems);
-                                                        cpu_active_mask);
-                        mutex_unlock(&callback_mutex);
-                        /* Move tasks from the empty cpuset to a parent */
+        /* synchronize cpus_allowed to cpu_active_mask */
-                        if (cpumask_empty(cp->cpus_allowed))
+        if (cpus_updated) {
-                                remove_tasks_in_empty_cpuset(cp);
+                mutex_lock(&callback_mutex);
-                        else
+                cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
-                                update_tasks_cpumask(cp, NULL);
+                mutex_unlock(&callback_mutex);
-                }
+                /* we don't mess with cpumasks of tasks in top_cpuset */
-                break;
+        }
-        case CPUSET_MEM_OFFLINE:
+        /* synchronize mems_allowed to N_MEMORY */
-                while ((cp = cpuset_next(&queue)) != NULL) {
+        if (mems_updated) {
+                tmp_mems = top_cpuset.mems_allowed;
+                mutex_lock(&callback_mutex);
+                top_cpuset.mems_allowed = new_mems;
+                mutex_unlock(&callback_mutex);
+                update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
+        }
-                        /* Continue past cpusets with all mems online */
+        /* if cpus or mems went down, we need to propagate to descendants */
-                        if (nodes_subset(cp->mems_allowed,
+        if (cpus_offlined || mems_offlined) {
-                                        node_states[N_MEMORY]))
+                struct cpuset *cs;
-                                continue;
+                struct cgroup *pos_cgrp;
-                        oldmems = cp->mems_allowed;
+                rcu_read_lock();
+                cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
+                        schedule_cpuset_propagate_hotplug(cs);
+                rcu_read_unlock();
+        }
-                        /* Remove offline mems from this cpuset. */
+        mutex_unlock(&cpuset_mutex);
-                        mutex_lock(&callback_mutex);
-                        nodes_and(cp->mems_allowed, cp->mems_allowed,
-                                                node_states[N_MEMORY]);
-                        mutex_unlock(&callback_mutex);
-                        /* Move tasks from the empty cpuset to a parent */
+        /* wait for propagations to finish */
-                        if (nodes_empty(cp->mems_allowed))
+        flush_workqueue(cpuset_propagate_hotplug_wq);
-                                remove_tasks_in_empty_cpuset(cp);
-                        else
+        /* rebuild sched domains if cpus_allowed has changed */
-                                update_tasks_nodemask(cp, &oldmems, NULL);
+        if (cpus_updated) {
-                }
+                struct sched_domain_attr *attr;
+                cpumask_var_t *doms;
+                int ndoms;
+                mutex_lock(&cpuset_mutex);
+                ndoms = generate_sched_domains(&doms, &attr);
+                mutex_unlock(&cpuset_mutex);
+                partition_sched_domains(ndoms, doms, attr);
        }
 }
-/*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period.  This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
- *
- * The only exception to this is suspend/resume, where we don't
- * modify cpusets at all.
- *
- * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_active_mask on each CPU hotplug (cpuhp) event.
- *
- * Called within get_online_cpus().  Needs to call cgroup_lock()
- * before calling generate_sched_domains().
- *
- * @cpu_online: Indicates whether this is a CPU online event (true) or
- * a CPU offline event (false).
- */
 void cpuset_update_active_cpus(bool cpu_online)
 {
-        struct sched_domain_attr *attr;
+        /*
-        cpumask_var_t *doms;
+         * We're inside cpu hotplug critical region which usually nests
-        int ndoms;
+         * inside cgroup synchronization.  Bounce actual hotplug processing
+         * to a work item to avoid reverse locking order.
-        cgroup_lock();
+         *
-        mutex_lock(&callback_mutex);
+         * We still need to do partition_sched_domains() synchronously;
-        cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
+         * otherwise, the scheduler will get confused and put tasks to the
-        mutex_unlock(&callback_mutex);
+         * dead CPU.  Fall back to the default single domain.
+         * cpuset_hotplug_workfn() will rebuild it as necessary.
-        if (!cpu_online)
+         */
-                scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
+        partition_sched_domains(1, NULL, NULL);
+        schedule_work(&cpuset_hotplug_work);
-        ndoms = generate_sched_domains(&doms, &attr);
-        cgroup_unlock();
-        /* Have scheduler rebuild the domains */
-        partition_sched_domains(ndoms, doms, attr);
 }
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online)
 static int cpuset_track_online_nodes(struct notifier_block *self,
                                unsigned long action, void *arg)
 {
-        static nodemask_t oldmems;      /* protected by cgroup_mutex */
+        schedule_work(&cpuset_hotplug_work);
-        cgroup_lock();
-        switch (action) {
-        case MEM_ONLINE:
-                oldmems = top_cpuset.mems_allowed;
-                mutex_lock(&callback_mutex);
-                top_cpuset.mems_allowed = node_states[N_MEMORY];
-                mutex_unlock(&callback_mutex);
-                update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
-                break;
-        case MEM_OFFLINE:
-                /*
-                 * needn't update top_cpuset.mems_allowed explicitly because
-                 * scan_cpusets_upon_hotplug() will update it.
-                 */
-                scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
-                break;
-        default:
-                break;
-        }
-        cgroup_unlock();
        return NOTIFY_OK;
 }
 #endif
@@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void)
        hotplug_memory_notifier(cpuset_track_online_nodes, 10);
-        cpuset_wq = create_singlethread_workqueue("cpuset");
+        cpuset_propagate_hotplug_wq =
-        BUG_ON(!cpuset_wq);
+                alloc_ordered_workqueue("cpuset_hotplug", 0);
+        BUG_ON(!cpuset_propagate_hotplug_wq);
 }
 /**
@@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 */
 static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
 {
-        while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
+        while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
-                cs = cs->parent;
+                cs = parent_cs(cs);
        return cs;
 }
@@ -2412,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 }
 /**
- * cpuset_unlock - release lock on cpuset changes
- *
- * Undo the lock taken in a previous cpuset_lock() call.
- */
-void cpuset_unlock(void)
-{
-        mutex_unlock(&callback_mutex);
-}
-/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 *
@@ -2511,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
        dentry = task_cs(tsk)->css.cgroup->dentry;
        spin_lock(&cpuset_buffer_lock);
-        snprintf(cpuset_name, CPUSET_NAME_LEN,
-                 dentry ? (const char *)dentry->d_name.name : "/");
+        if (!dentry) {
+                strcpy(cpuset_name, "/");
+        } else {
+                spin_lock(&dentry->d_lock);
+                strlcpy(cpuset_name, (const char *)dentry->d_name.name,
+                        CPUSET_NAME_LEN);
+                spin_unlock(&dentry->d_lock);
+        }
        nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
                           tsk->mems_allowed);
        printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
@@ -2560,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void)
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
- *    and we take cgroup_mutex, keeping cpuset_attach() from changing it
+ *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
 *    anyway.
 */
 static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2582,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
        if (!tsk)
                goto out_free;
-        retval = -EINVAL;
+        rcu_read_lock();
-        cgroup_lock();
        css = task_subsys_state(tsk, cpuset_subsys_id);
        retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
+        rcu_read_unlock();
        if (retval < 0)
-                goto out_unlock;
+                goto out_put_task;
        seq_puts(m, buf);
        seq_putc(m, '\n');
-out_unlock:
+out_put_task:
-        cgroup_unlock();
        put_task_struct(tsk);
 out_free:
        kfree(buf);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 418b3f7053aa..d473988c1d0b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
        unsigned long long t2, t3;
        unsigned long flags;
        struct timespec ts;
+        cputime_t utime, stime, stimescaled, utimescaled;
        /* Though tsk->delays accessed later, early exit avoids
         * unnecessary returning of other data
@@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
                goto done;
        tmp = (s64)d->cpu_run_real_total;
-        cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+        task_cputime(tsk, &utime, &stime);
+        cputime_to_timespec(utime + stime, &ts);
        tmp += timespec_to_ns(&ts);
        d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
        tmp = (s64)d->cpu_scaled_run_real_total;
-        cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts);
+        task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+        cputime_to_timespec(utimescaled + stimescaled, &ts);
        tmp += timespec_to_ns(&ts);
        d->cpu_scaled_run_real_total =
                (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7b6646a8c067..5c75791d7269 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6171,11 +6171,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        if (task) {
                event->attach_state = PERF_ATTACH_TASK;
+                if (attr->type == PERF_TYPE_TRACEPOINT)
+                        event->hw.tp_target = task;
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
                /*
                 * hw_breakpoint is a bit difficult here..
                 */
-                if (attr->type == PERF_TYPE_BREAKPOINT)
+                else if (attr->type == PERF_TYPE_BREAKPOINT)
                        event->hw.bp_target = task;
 #endif
        }
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index fe8a916507ed..a64f8aeb5c1f 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void)
 err_alloc:
        for_each_possible_cpu(err_cpu) {
                for (i = 0; i < TYPE_MAX; i++)
-                        kfree(per_cpu(nr_task_bp_pinned[i], cpu));
+                        kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
                if (err_cpu == cpu)
                        break;
        }
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index dea7acfbb071..a567c8c7ef31 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@
 #include <linux/pagemap.h>      /* read_mapping_page */
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/export.h>
 #include <linux/rmap.h>         /* anon_vma_prepare */
 #include <linux/mmu_notifier.h> /* set_pte_at_notify */
 #include <linux/swap.h>         /* try_to_free_swap */
@@ -41,58 +42,31 @@
 #define MAX_UPROBE_XOL_SLOTS            UINSNS_PER_PAGE
 static struct rb_root uprobes_tree = RB_ROOT;
-static DEFINE_SPINLOCK(uprobes_treelock);       /* serialize rbtree access */
-#define UPROBES_HASH_SZ 13
 /*
- * We need separate register/unregister and mmap/munmap lock hashes because
+ * allows us to skip the uprobe_mmap if there are no uprobe events active
- * of mmap_sem nesting.
+ * at this time.  Probably a fine grained per inode count is better?
- *
- * uprobe_register() needs to install probes on (potentially) all processes
- * and thus needs to acquire multiple mmap_sems (consequtively, not
- * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
- * for the particular process doing the mmap.
- *
- * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
- * because of lock order against i_mmap_mutex. This means there's a hole in
- * the register vma iteration where a mmap() can happen.
- *
- * Thus uprobe_register() can race with uprobe_mmap() and we can try and
- * install a probe where one is already installed.
 */
+#define no_uprobe_events()      RB_EMPTY_ROOT(&uprobes_tree)
-/* serialize (un)register */
+static DEFINE_SPINLOCK(uprobes_treelock);       /* serialize rbtree access */
-static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
-#define uprobes_hash(v)         (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
+#define UPROBES_HASH_SZ 13
 /* serialize uprobe->pending_list */
 static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
 #define uprobes_mmap_hash(v)    (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
 static struct percpu_rw_semaphore dup_mmap_sem;
-/*
- * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
- * events active at this time.  Probably a fine grained per inode count is
- * better?
- */
-static atomic_t uprobe_events = ATOMIC_INIT(0);
 /* Have a copy of original instruction */
 #define UPROBE_COPY_INSN        0
-/* Dont run handlers when first register/ last unregister in progress*/
-#define UPROBE_RUN_HANDLER      1
 /* Can skip singlestep */
-#define UPROBE_SKIP_SSTEP       2
+#define UPROBE_SKIP_SSTEP       1
 struct uprobe {
        struct rb_node          rb_node;        /* node in the rb tree */
        atomic_t                ref;
+        struct rw_semaphore     register_rwsem;
        struct rw_semaphore     consumer_rwsem;
-        struct mutex            copy_mutex;     /* TODO: kill me and UPROBE_COPY_INSN */
        struct list_head        pending_list;
        struct uprobe_consumer  *consumers;
        struct inode            *inode;         /* Also hold a ref to inode */
@@ -430,9 +404,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
        u = __insert_uprobe(uprobe);
        spin_unlock(&uprobes_treelock);
-        /* For now assume that the instruction need not be single-stepped */
-        __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
        return u;
 }
@@ -452,8 +423,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
        uprobe->inode = igrab(inode);
        uprobe->offset = offset;
+        init_rwsem(&uprobe->register_rwsem);
        init_rwsem(&uprobe->consumer_rwsem);
-        mutex_init(&uprobe->copy_mutex);
+        /* For now assume that the instruction need not be single-stepped */
+        __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
        /* add to uprobes_tree, sorted on inode:offset */
        cur_uprobe = insert_uprobe(uprobe);
@@ -463,38 +436,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
                kfree(uprobe);
                uprobe = cur_uprobe;
                iput(inode);
-        } else {
-                atomic_inc(&uprobe_events);
        }
        return uprobe;
 }
-static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
-{
-        struct uprobe_consumer *uc;
-        if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
-                return;
-        down_read(&uprobe->consumer_rwsem);
-        for (uc = uprobe->consumers; uc; uc = uc->next) {
-                if (!uc->filter || uc->filter(uc, current))
-                        uc->handler(uc, regs);
-        }
-        up_read(&uprobe->consumer_rwsem);
-}
-/* Returns the previous consumer */
-static struct uprobe_consumer *
-consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
        down_write(&uprobe->consumer_rwsem);
        uc->next = uprobe->consumers;
        uprobe->consumers = uc;
        up_write(&uprobe->consumer_rwsem);
-        return uc->next;
 }
 /*
@@ -588,7 +540,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
                return ret;
-        mutex_lock(&uprobe->copy_mutex);
+        /* TODO: move this into _register, until then we abuse this sem. */
+        down_write(&uprobe->consumer_rwsem);
        if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
                goto out;
@@ -612,7 +565,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
        set_bit(UPROBE_COPY_INSN, &uprobe->flags);
 out:
-        mutex_unlock(&uprobe->copy_mutex);
+        up_write(&uprobe->consumer_rwsem);
+        return ret;
+}
+static inline bool consumer_filter(struct uprobe_consumer *uc,
+                                   enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+        return !uc->filter || uc->filter(uc, ctx, mm);
+}
+static bool filter_chain(struct uprobe *uprobe,
+                         enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+        struct uprobe_consumer *uc;
+        bool ret = false;
+        down_read(&uprobe->consumer_rwsem);
+        for (uc = uprobe->consumers; uc; uc = uc->next) {
+                ret = consumer_filter(uc, ctx, mm);
+                if (ret)
+                        break;
+        }
+        up_read(&uprobe->consumer_rwsem);
        return ret;
 }
@@ -624,16 +600,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
        bool first_uprobe;
        int ret;
-        /*
-         * If probe is being deleted, unregister thread could be done with
-         * the vma-rmap-walk through. Adding a probe now can be fatal since
-         * nobody will be able to cleanup. Also we could be from fork or
-         * mremap path, where the probe might have already been inserted.
-         * Hence behave as if probe already existed.
-         */
-        if (!uprobe->consumers)
-                return 0;
        ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
        if (ret)
                return ret;
@@ -658,14 +624,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
 static int
 remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
 {
-        /* can happen if uprobe_register() fails */
-        if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
-                return 0;
        set_bit(MMF_RECALC_UPROBES, &mm->flags);
        return set_orig_insn(&uprobe->arch, mm, vaddr);
 }
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+        return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
 /*
 * There could be threads that have already hit the breakpoint. They
 * will recheck the current insn and restart if find_uprobe() fails.
@@ -673,12 +639,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
 */
 static void delete_uprobe(struct uprobe *uprobe)
 {
+        if (WARN_ON(!uprobe_is_active(uprobe)))
+                return;
        spin_lock(&uprobes_treelock);
        rb_erase(&uprobe->rb_node, &uprobes_tree);
        spin_unlock(&uprobes_treelock);
+        RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
        iput(uprobe->inode);
        put_uprobe(uprobe);
-        atomic_dec(&uprobe_events);
 }
 struct map_info {
@@ -764,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
        return curr;
 }
-static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
+static int
+register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
 {
+        bool is_register = !!new;
        struct map_info *info;
        int err = 0;
@@ -794,10 +765,16 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
                    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
                        goto unlock;
-                if (is_register)
+                if (is_register) {
-                        err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+                        /* consult only the "caller", new consumer. */
-                else
+                        if (consumer_filter(new,
-                        err |= remove_breakpoint(uprobe, mm, info->vaddr);
+                                        UPROBE_FILTER_REGISTER, mm))
+                                err = install_breakpoint(uprobe, mm, vma, info->vaddr);
+                } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
+                        if (!filter_chain(uprobe,
+                                        UPROBE_FILTER_UNREGISTER, mm))
+                                err |= remove_breakpoint(uprobe, mm, info->vaddr);
+                }
 unlock:
                up_write(&mm->mmap_sem);
@@ -810,17 +787,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
        return err;
 }
-static int __uprobe_register(struct uprobe *uprobe)
+static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
-        return register_for_each_vma(uprobe, true);
+        consumer_add(uprobe, uc);
+        return register_for_each_vma(uprobe, uc);
 }
-static void __uprobe_unregister(struct uprobe *uprobe)
+static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
 {
-        if (!register_for_each_vma(uprobe, false))
+        int err;
-                delete_uprobe(uprobe);
+        if (!consumer_del(uprobe, uc))  /* WARN? */
+                return;
+        err = register_for_each_vma(uprobe, NULL);
        /* TODO : cant unregister? schedule a worker thread */
+        if (!uprobe->consumers && !err)
+                delete_uprobe(uprobe);
 }
 /*
@@ -845,31 +828,59 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
        struct uprobe *uprobe;
        int ret;
-        if (!inode || !uc || uc->next)
+        /* Racy, just to catch the obvious mistakes */
-                return -EINVAL;
        if (offset > i_size_read(inode))
                return -EINVAL;
-        ret = 0;
+ retry:
-        mutex_lock(uprobes_hash(inode));
        uprobe = alloc_uprobe(inode, offset);
+        if (!uprobe)
-        if (!uprobe) {
+                return -ENOMEM;
-                ret = -ENOMEM;
+        /*
-        } else if (!consumer_add(uprobe, uc)) {
+         * We can race with uprobe_unregister()->delete_uprobe().
-                ret = __uprobe_register(uprobe);
+         * Check uprobe_is_active() and retry if it is false.
-                if (ret) {
+         */
-                        uprobe->consumers = NULL;
+        down_write(&uprobe->register_rwsem);
-                        __uprobe_unregister(uprobe);
+        ret = -EAGAIN;
-                } else {
+        if (likely(uprobe_is_active(uprobe))) {
-                        set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
+                ret = __uprobe_register(uprobe, uc);
-                }
+                if (ret)
+                        __uprobe_unregister(uprobe, uc);
        }
+        up_write(&uprobe->register_rwsem);
+        put_uprobe(uprobe);
-        mutex_unlock(uprobes_hash(inode));
+        if (unlikely(ret == -EAGAIN))
-        if (uprobe)
+                goto retry;
-                put_uprobe(uprobe);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(uprobe_register);
+/*
+ * uprobe_apply - add or remove the breakpoints of an already registered probe.
+ * @inode: the file in which the probe is registered.
+ * @offset: offset from the start of the file.
+ * @uc: consumer which wants to add more or remove some breakpoints
+ * @add: add or remove the breakpoints
+ */
+int uprobe_apply(struct inode *inode, loff_t offset,
+                        struct uprobe_consumer *uc, bool add)
+{
+        struct uprobe *uprobe;
+        struct uprobe_consumer *con;
+        int ret = -ENOENT;
+        uprobe = find_uprobe(inode, offset);
+        if (!uprobe)
+                return ret;
+        down_write(&uprobe->register_rwsem);
+        for (con = uprobe->consumers; con && con != uc ; con = con->next)
+                ;
+        if (con)
+                ret = register_for_each_vma(uprobe, add ? uc : NULL);
+        up_write(&uprobe->register_rwsem);
+        put_uprobe(uprobe);
        return ret;
 }
@@ -884,25 +895,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
 {
        struct uprobe *uprobe;
-        if (!inode || !uc)
-                return;
        uprobe = find_uprobe(inode, offset);
        if (!uprobe)
                return;
-        mutex_lock(uprobes_hash(inode));
+        down_write(&uprobe->register_rwsem);
+        __uprobe_unregister(uprobe, uc);
+        up_write(&uprobe->register_rwsem);
+        put_uprobe(uprobe);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister);
-        if (consumer_del(uprobe, uc)) {
+static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
-                if (!uprobe->consumers) {
+{
-                        __uprobe_unregister(uprobe);
+        struct vm_area_struct *vma;
-                        clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
+        int err = 0;
-                }
+        down_read(&mm->mmap_sem);
+        for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                unsigned long vaddr;
+                loff_t offset;
+                if (!valid_vma(vma, false) ||
+                    vma->vm_file->f_mapping->host != uprobe->inode)
+                        continue;
+                offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
+                if (uprobe->offset <  offset ||
+                    uprobe->offset >= offset + vma->vm_end - vma->vm_start)
+                        continue;
+                vaddr = offset_to_vaddr(vma, uprobe->offset);
+                err |= remove_breakpoint(uprobe, mm, vaddr);
        }
+        up_read(&mm->mmap_sem);
-        mutex_unlock(uprobes_hash(inode));
+        return err;
-        if (uprobe)
-                put_uprobe(uprobe);
 }
 static struct rb_node *
@@ -979,7 +1007,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
        struct uprobe *uprobe, *u;
        struct inode *inode;
-        if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
+        if (no_uprobe_events() || !valid_vma(vma, true))
                return 0;
        inode = vma->vm_file->f_mapping->host;
@@ -988,9 +1016,14 @@ int uprobe_mmap(struct vm_area_struct *vma)
        mutex_lock(uprobes_mmap_hash(inode));
        build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
+        /*
+         * We can race with uprobe_unregister(), this uprobe can be already
+         * removed. But in this case filter_chain() must return false, all
+         * consumers have gone away.
+         */
        list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
-                if (!fatal_signal_pending(current)) {
+                if (!fatal_signal_pending(current) &&
+                    filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
                        unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
                        install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
                }
@@ -1025,7 +1058,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
 */
 void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
 {
-        if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
+        if (no_uprobe_events() || !valid_vma(vma, false))
                return;
        if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
@@ -1042,22 +1075,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
 /* Slot allocation for XOL */
 static int xol_add_vma(struct xol_area *area)
 {
-        struct mm_struct *mm;
+        struct mm_struct *mm = current->mm;
-        int ret;
+        int ret = -EALREADY;
-        area->page = alloc_page(GFP_HIGHUSER);
-        if (!area->page)
-                return -ENOMEM;
-        ret = -EALREADY;
-        mm = current->mm;
        down_write(&mm->mmap_sem);
        if (mm->uprobes_state.xol_area)
                goto fail;
        ret = -ENOMEM;
        /* Try to map as high as possible, this is only a hint. */
        area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
        if (area->vaddr & ~PAGE_MASK) {
@@ -1073,54 +1098,53 @@ static int xol_add_vma(struct xol_area *area)
        smp_wmb();      /* pairs with get_xol_area() */
        mm->uprobes_state.xol_area = area;
        ret = 0;
+ fail:
-fail:
        up_write(&mm->mmap_sem);
-        if (ret)
-                __free_page(area->page);
        return ret;
 }
-static struct xol_area *get_xol_area(struct mm_struct *mm)
-{
-        struct xol_area *area;
-        area = mm->uprobes_state.xol_area;
-        smp_read_barrier_depends();     /* pairs with wmb in xol_add_vma() */
-        return area;
-}
 /*
- * xol_alloc_area - Allocate process's xol_area.
+ * get_xol_area - Allocate process's xol_area if necessary.
- * This area will be used for storing instructions for execution out of
+ * This area will be used for storing instructions for execution out of line.
- * line.
 *
 * Returns the allocated area or NULL.
 */
-static struct xol_area *xol_alloc_area(void)
+static struct xol_area *get_xol_area(void)
 {
+        struct mm_struct *mm = current->mm;
        struct xol_area *area;
+        area = mm->uprobes_state.xol_area;
+        if (area)
+                goto ret;
        area = kzalloc(sizeof(*area), GFP_KERNEL);
        if (unlikely(!area))
-                return NULL;
+                goto out;
        area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
        if (!area->bitmap)
-                goto fail;
+                goto free_area;
+        area->page = alloc_page(GFP_HIGHUSER);
+        if (!area->page)
+                goto free_bitmap;
        init_waitqueue_head(&area->wq);
        if (!xol_add_vma(area))
                return area;
-fail:
+        __free_page(area->page);
+ free_bitmap:
        kfree(area->bitmap);
+ free_area:
        kfree(area);
+ out:
-        return get_xol_area(current->mm);
+        area = mm->uprobes_state.xol_area;
+ ret:
+        smp_read_barrier_depends();     /* pairs with wmb in xol_add_vma() */
+        return area;
 }
 /*
@@ -1186,33 +1210,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
 }
 /*
- * xol_get_insn_slot - If was not allocated a slot, then
+ * xol_get_insn_slot - allocate a slot for xol.
- * allocate a slot.
 * Returns the allocated slot address or 0.
 */
-static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr)
+static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
 {
        struct xol_area *area;
        unsigned long offset;
+        unsigned long xol_vaddr;
        void *vaddr;
-        area = get_xol_area(current->mm);
+        area = get_xol_area();
-        if (!area) {
+        if (!area)
-                area = xol_alloc_area();
+                return 0;
-                if (!area)
-                        return 0;
-        }
-        current->utask->xol_vaddr = xol_take_insn_slot(area);
-        /*
+        xol_vaddr = xol_take_insn_slot(area);
-         * Initialize the slot if xol_vaddr points to valid
+        if (unlikely(!xol_vaddr))
-         * instruction slot.
-         */
-        if (unlikely(!current->utask->xol_vaddr))
                return 0;
-        current->utask->vaddr = slot_addr;
+        /* Initialize the slot */
-        offset = current->utask->xol_vaddr & ~PAGE_MASK;
+        offset = xol_vaddr & ~PAGE_MASK;
        vaddr = kmap_atomic(area->page);
        memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
        kunmap_atomic(vaddr);
@@ -1222,7 +1239,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot
         */
        flush_dcache_page(area->page);
-        return current->utask->xol_vaddr;
+        return xol_vaddr;
 }
 /*
@@ -1240,8 +1257,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
                return;
        slot_addr = tsk->utask->xol_vaddr;
+        if (unlikely(!slot_addr))
-        if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
                return;
        area = tsk->mm->uprobes_state.xol_area;
@@ -1303,33 +1319,48 @@ void uprobe_copy_process(struct task_struct *t)
 }
 /*
- * Allocate a uprobe_task object for the task.
+ * Allocate a uprobe_task object for the task if necessary.
- * Called when the thread hits a breakpoint for the first time.
+ * Called when the thread hits a breakpoint.
 *
 * Returns:
 * - pointer to new uprobe_task on success
 * - NULL otherwise
 */
-static struct uprobe_task *add_utask(void)
+static struct uprobe_task *get_utask(void)
 {
-        struct uprobe_task *utask;
+        if (!current->utask)
+                current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
-        utask = kzalloc(sizeof *utask, GFP_KERNEL);
+        return current->utask;
-        if (unlikely(!utask))
-                return NULL;
-        current->utask = utask;
-        return utask;
 }
 /* Prepare to single-step probed instruction out of line. */
 static int
-pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
+pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
 {
-        if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs))
+        struct uprobe_task *utask;
-                return 0;
+        unsigned long xol_vaddr;
+        int err;
+        utask = get_utask();
+        if (!utask)
+                return -ENOMEM;
+        xol_vaddr = xol_get_insn_slot(uprobe);
+        if (!xol_vaddr)
+                return -ENOMEM;
+        utask->xol_vaddr = xol_vaddr;
+        utask->vaddr = bp_vaddr;
+        err = arch_uprobe_pre_xol(&uprobe->arch, regs);
+        if (unlikely(err)) {
+                xol_free_insn_slot(current);
+                return err;
+        }
-        return -EFAULT;
+        utask->active_uprobe = uprobe;
+        utask->state = UTASK_SSTEP;
+        return 0;
 }
 /*
@@ -1391,6 +1422,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
                 * This is not strictly accurate, we can race with
                 * uprobe_unregister() and see the already removed
                 * uprobe if delete_uprobe() was not yet called.
+                 * Or this uprobe can be filtered out.
                 */
                if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
                        return;
@@ -1452,13 +1484,33 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
        return uprobe;
 }
+static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
+{
+        struct uprobe_consumer *uc;
+        int remove = UPROBE_HANDLER_REMOVE;
+        down_read(&uprobe->register_rwsem);
+        for (uc = uprobe->consumers; uc; uc = uc->next) {
+                int rc = uc->handler(uc, regs);
+                WARN(rc & ~UPROBE_HANDLER_MASK,
+                        "bad rc=0x%x from %pf()\n", rc, uc->handler);
+                remove &= rc;
+        }
+        if (remove && uprobe->consumers) {
+                WARN_ON(!uprobe_is_active(uprobe));
+                unapply_uprobe(uprobe, current->mm);
+        }
+        up_read(&uprobe->register_rwsem);
+}
 /*
 * Run handler and ask thread to singlestep.
 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
 */
 static void handle_swbp(struct pt_regs *regs)
 {
-        struct uprobe_task *utask;
        struct uprobe *uprobe;
        unsigned long bp_vaddr;
        int uninitialized_var(is_swbp);
@@ -1483,6 +1535,10 @@ static void handle_swbp(struct pt_regs *regs)
                }
                return;
        }
+        /* change it in advance for ->handler() and restart */
+        instruction_pointer_set(regs, bp_vaddr);
        /*
         * TODO: move copy_insn/etc into _register and remove this hack.
         * After we hit the bp, _unregister + _register can install the
@@ -1490,32 +1546,16 @@ static void handle_swbp(struct pt_regs *regs)
         */
        smp_rmb(); /* pairs with wmb() in install_breakpoint() */
        if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
-                goto restart;
+                goto out;
-        utask = current->utask;
-        if (!utask) {
-                utask = add_utask();
-                /* Cannot allocate; re-execute the instruction. */
-                if (!utask)
-                        goto restart;
-        }
        handler_chain(uprobe, regs);
        if (can_skip_sstep(uprobe, regs))
                goto out;
-        if (!pre_ssout(uprobe, regs, bp_vaddr)) {
+        if (!pre_ssout(uprobe, regs, bp_vaddr))
-                utask->active_uprobe = uprobe;
-                utask->state = UTASK_SSTEP;
                return;
-        }
-restart:
+        /* can_skip_sstep() succeeded, or restart if can't singlestep */
-        /*
-         * cannot singlestep; cannot skip instruction;
-         * re-execute the instruction.
-         */
-        instruction_pointer_set(regs, bp_vaddr);
 out:
        put_uprobe(uprobe);
 }
@@ -1609,10 +1649,8 @@ static int __init init_uprobes(void)
 {
        int i;
-        for (i = 0; i < UPROBES_HASH_SZ; i++) {
+        for (i = 0; i < UPROBES_HASH_SZ; i++)
-                mutex_init(&uprobes_mutex[i]);
                mutex_init(&uprobes_mmap_mutex[i]);
-        }
        if (percpu_init_rwsem(&dup_mmap_sem))
                return -ENOMEM;
diff --git a/kernel/exit.c b/kernel/exit.c
index b4df21937216..7dd20408707c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk)
        bool group_dead = thread_group_leader(tsk);
        struct sighand_struct *sighand;
        struct tty_struct *uninitialized_var(tty);
+        cputime_t utime, stime;
        sighand = rcu_dereference_check(tsk->sighand,
                                        lockdep_tasklist_lock_is_held());
@@ -123,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk)
                 * We won't ever get here for the group leader, since it
                 * will have been the last reference on the signal_struct.
                 */
-                sig->utime += tsk->utime;
+                task_cputime(tsk, &utime, &stime);
-                sig->stime += tsk->stime;
+                sig->utime += utime;
-                sig->gtime += tsk->gtime;
+                sig->stime += stime;
+                sig->gtime += task_gtime(tsk);
                sig->min_flt += tsk->min_flt;
                sig->maj_flt += tsk->maj_flt;
                sig->nvcsw += tsk->nvcsw;
@@ -1092,7 +1094,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
                sig = p->signal;
                psig->cutime += tgutime + sig->cutime;
                psig->cstime += tgstime + sig->cstime;
-                psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
+                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
                psig->cmin_flt +=
                        p->min_flt + sig->min_flt + sig->cmin_flt;
                psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index c535f33bbb9c..4133876d8cd2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1233,6 +1233,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
        p->prev_cputime.utime = p->prev_cputime.stime = 0;
 #endif
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+        seqlock_init(&p->vtime_seqlock);
+        p->vtime_snap = 0;
+        p->vtime_snap_whence = VTIME_SLEEPING;
+#endif
 #if defined(SPLIT_RSS_COUNTING)
        memset(&p->rss_stat, 0, sizeof(p->rss_stat));
 #endif
diff --git a/kernel/futex.c b/kernel/futex.c
index 19eb089ca003..9618b6e9fb36 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -60,6 +60,7 @@
 #include <linux/pid.h>
 #include <linux/nsproxy.h>
 #include <linux/ptrace.h>
+#include <linux/sched/rt.h>
 #include <asm/futex.h>
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6db7a5ed52b5..cc47812d3feb 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -44,6 +44,8 @@
 #include <linux/err.h>
 #include <linux/debugobjects.h>
 #include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
 #include <linux/timer.h>
 #include <asm/uaccess.h>
@@ -640,21 +642,9 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
 * and expiry check is done in the hrtimer_interrupt or in the softirq.
 */
 static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
-                                            struct hrtimer_clock_base *base,
+                                            struct hrtimer_clock_base *base)
-                                            int wakeup)
 {
-        if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
+        return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
-                if (wakeup) {
-                        raw_spin_unlock(&base->cpu_base->lock);
-                        raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-                        raw_spin_lock(&base->cpu_base->lock);
-                } else
-                        __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-                return 1;
-        }
-        return 0;
 }
 static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
@@ -735,8 +725,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; }
 static inline void
 hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
 static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
-                                            struct hrtimer_clock_base *base,
+                                            struct hrtimer_clock_base *base)
-                                            int wakeup)
 {
        return 0;
 }
@@ -995,8 +984,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
         *
         * XXX send_remote_softirq() ?
         */
-        if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
+        if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
-                hrtimer_enqueue_reprogram(timer, new_base, wakeup);
+                && hrtimer_enqueue_reprogram(timer, new_base)) {
+                if (wakeup) {
+                        /*
+                         * We need to drop cpu_base->lock to avoid a
+                         * lock ordering issue vs. rq->lock.
+                         */
+                        raw_spin_unlock(&new_base->cpu_base->lock);
+                        raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+                        local_irq_restore(flags);
+                        return ret;
+                } else {
+                        __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+                }
+        }
        unlock_hrtimer_base(timer, &flags);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 3aca9f29d30e..cbd97ce0b000 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data)
 EXPORT_SYMBOL(irq_set_handler_data);
 /**
- *      irq_set_msi_desc - set MSI descriptor data for an irq
+ *      irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
- *      @irq:   Interrupt number
+ *      @irq_base:      Interrupt number base
- *      @entry: Pointer to MSI descriptor data
+ *      @irq_offset:    Interrupt number offset
+ *      @entry:         Pointer to MSI descriptor data
 *
- *      Set the MSI descriptor entry for an irq
+ *      Set the MSI descriptor entry for an irq at offset
 */
-int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
+int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
+                         struct msi_desc *entry)
 {
        unsigned long flags;
-        struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+        struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
        if (!desc)
                return -EINVAL;
        desc->irq_data.msi_desc = entry;
-        if (entry)
+        if (entry && !irq_offset)
-                entry->irq = irq;
+                entry->irq = irq_base;
        irq_put_desc_unlock(desc, flags);
        return 0;
 }
 /**
+ *      irq_set_msi_desc - set MSI descriptor data for an irq
+ *      @irq:   Interrupt number
+ *      @entry: Pointer to MSI descriptor data
+ *
+ *      Set the MSI descriptor entry for an irq
+ */
+int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
+{
+        return irq_set_msi_desc_off(irq, 0, entry);
+}
+/**
 *      irq_set_chip_data - set irq chip data for an irq
 *      @irq:   Interrupt number
 *      @data:  Pointer to chip specific data
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e49a288fa479..fa17855ca65a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,6 +16,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/sched/rt.h>
 #include <linux/task_work.h>
 #include "internals.h"
@@ -1524,6 +1525,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
 out:
        irq_put_desc_unlock(desc, flags);
 }
+EXPORT_SYMBOL_GPL(enable_percpu_irq);
 void disable_percpu_irq(unsigned int irq)
 {
@@ -1537,6 +1539,7 @@ void disable_percpu_irq(unsigned int irq)
        irq_percpu_disable(desc, cpu);
        irq_put_desc_unlock(desc, flags);
 }
+EXPORT_SYMBOL_GPL(disable_percpu_irq);
 /*
 * Internal function to unregister a percpu irqaction.
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 611cd6003c45..7b5f012bde9d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
        /*
         * All handlers must agree on IRQF_SHARED, so we test just the
-         * first. Check for action->next as well.
+         * first.
         */
        action = desc->action;
        if (!action || !(action->flags & IRQF_SHARED) ||
-            (action->flags & __IRQF_TIMER) ||
+            (action->flags & __IRQF_TIMER))
-            (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
-            !action->next)
                goto out;
        /* Already running on another processor */
@@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
        do {
                if (handle_irq_event(desc) == IRQ_HANDLED)
                        ret = IRQ_HANDLED;
+                /* Make sure that there is still a valid action */
                action = desc->action;
        } while ((desc->istate & IRQS_PENDING) && action);
        desc->istate &= ~IRQS_POLL_INPROGRESS;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 1588e3b2871b..55fcce6065cf 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -12,37 +12,36 @@
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <linux/irqflags.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
 #include <asm/processor.h>
-/*
- * An entry can be in one of four states:
- *
- * free      NULL, 0 -> {claimed}       : free to be used
- * claimed   NULL, 3 -> {pending}       : claimed to be enqueued
- * pending   next, 3 -> {busy}          : queued, pending callback
- * busy      NULL, 2 -> {free, claimed} : callback in progress, can be claimed
- */
-#define IRQ_WORK_PENDING        1UL
-#define IRQ_WORK_BUSY           2UL
-#define IRQ_WORK_FLAGS          3UL
 static DEFINE_PER_CPU(struct llist_head, irq_work_list);
+static DEFINE_PER_CPU(int, irq_work_raised);
 /*
 * Claim the entry so that no one else will poke at it.
 */
 static bool irq_work_claim(struct irq_work *work)
 {
-        unsigned long flags, nflags;
+        unsigned long flags, oflags, nflags;
+        /*
+         * Start with our best wish as a premise but only trust any
+         * flag value after cmpxchg() result.
+         */
+        flags = work->flags & ~IRQ_WORK_PENDING;
        for (;;) {
-                flags = work->flags;
-                if (flags & IRQ_WORK_PENDING)
-                        return false;
                nflags = flags | IRQ_WORK_FLAGS;
-                if (cmpxchg(&work->flags, flags, nflags) == flags)
+                oflags = cmpxchg(&work->flags, flags, nflags);
+                if (oflags == flags)
                        break;
+                if (oflags & IRQ_WORK_PENDING)
+                        return false;
+                flags = oflags;
                cpu_relax();
        }
@@ -57,57 +56,69 @@ void __weak arch_irq_work_raise(void)
 }
 /*
- * Queue the entry and raise the IPI if needed.
+ * Enqueue the irq_work @entry unless it's already pending
+ * somewhere.
+ *
+ * Can be re-enqueued while the callback is still in progress.
 */
-static void __irq_work_queue(struct irq_work *work)
+void irq_work_queue(struct irq_work *work)
 {
-        bool empty;
+        /* Only queue if not already pending */
+        if (!irq_work_claim(work))
+                return;
+        /* Queue the entry and raise the IPI if needed. */
        preempt_disable();
-        empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
+        llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
-        /* The list was empty, raise self-interrupt to start processing. */
-        if (empty)
+        /*
-                arch_irq_work_raise();
+         * If the work is not "lazy" or the tick is stopped, raise the irq
+         * work interrupt (if supported by the arch), otherwise, just wait
+         * for the next tick.
+         */
+        if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
+                if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
+                        arch_irq_work_raise();
+        }
        preempt_enable();
 }
+EXPORT_SYMBOL_GPL(irq_work_queue);
-/*
+bool irq_work_needs_cpu(void)
- * Enqueue the irq_work @entry, returns true on success, failure when the
- * @entry was already enqueued by someone else.
- *
- * Can be re-enqueued while the callback is still in progress.
- */
-bool irq_work_queue(struct irq_work *work)
 {
-        if (!irq_work_claim(work)) {
+        struct llist_head *this_list;
-                /*
-                 * Already enqueued, can't do!
+        this_list = &__get_cpu_var(irq_work_list);
-                 */
+        if (llist_empty(this_list))
                return false;
-        }
-        __irq_work_queue(work);
+        /* All work should have been flushed before going offline */
+        WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
        return true;
 }
-EXPORT_SYMBOL_GPL(irq_work_queue);
-/*
+static void __irq_work_run(void)
- * Run the irq_work entries on this cpu. Requires to be ran from hardirq
- * context with local IRQs disabled.
- */
-void irq_work_run(void)
 {
+        unsigned long flags;
        struct irq_work *work;
        struct llist_head *this_list;
        struct llist_node *llnode;
+        /*
+         * Reset the "raised" state right before we check the list because
+         * an NMI may enqueue after we find the list empty from the runner.
+         */
+        __this_cpu_write(irq_work_raised, 0);
+        barrier();
        this_list = &__get_cpu_var(irq_work_list);
        if (llist_empty(this_list))
                return;
-        BUG_ON(!in_irq());
        BUG_ON(!irqs_disabled());
        llnode = llist_del_all(this_list);
@@ -119,16 +130,31 @@ void irq_work_run(void)
                /*
                 * Clear the PENDING bit, after this point the @work
                 * can be re-used.
+                 * Make it immediately visible so that other CPUs trying
+                 * to claim that work don't rely on us to handle their data
+                 * while we are in the middle of the func.
                 */
-                work->flags = IRQ_WORK_BUSY;
+                flags = work->flags & ~IRQ_WORK_PENDING;
+                xchg(&work->flags, flags);
                work->func(work);
                /*
                 * Clear the BUSY bit and return to the free state if
                 * no-one else claimed it meanwhile.
                 */
-                (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
+                (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
        }
 }
+/*
+ * Run the irq_work entries on this cpu. Must be run from hardirq
+ * context with local IRQs disabled.
+ */
+void irq_work_run(void)
+{
+        BUG_ON(!in_irq());
+        __irq_work_run();
+}
 EXPORT_SYMBOL_GPL(irq_work_run);
 /*
@@ -143,3 +169,35 @@ void irq_work_sync(struct irq_work *work)
                cpu_relax();
 }
 EXPORT_SYMBOL_GPL(irq_work_sync);
+#ifdef CONFIG_HOTPLUG_CPU
+static int irq_work_cpu_notify(struct notifier_block *self,
+                               unsigned long action, void *hcpu)
+{
+        long cpu = (long)hcpu;
+        switch (action) {
+        case CPU_DYING:
+                /* Called from stop_machine */
+                if (WARN_ON_ONCE(cpu != smp_processor_id()))
+                        break;
+                __irq_work_run();
+                break;
+        default:
+                break;
+        }
+        return NOTIFY_OK;
+}
+static struct notifier_block cpu_notify;
+static __init int irq_work_init_cpu_notifier(void)
+{
+        cpu_notify.notifier_call = irq_work_cpu_notify;
+        cpu_notify.priority = 0;
+        register_cpu_notifier(&cpu_notify);
+        return 0;
+}
+device_initcall(irq_work_init_cpu_notifier);
+#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 0023a87e8de6..56dd34976d7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -38,6 +38,7 @@
 #include <linux/suspend.h>
 #include <linux/rwsem.h>
 #include <linux/ptrace.h>
+#include <linux/async.h>
 #include <asm/uaccess.h>
 #include <trace/events/module.h>
@@ -130,6 +131,14 @@ int __request_module(bool wait, const char *fmt, ...)
 #define MAX_KMOD_CONCURRENT 50  /* Completely arbitrary value - KAO */
        static int kmod_loop_msg;
+        /*
+         * We don't allow synchronous module loading from async.  Module
+         * init may invoke async_synchronize_full() which will end up
+         * waiting for this task which already is waiting for the module
+         * loading to complete, leading to a deadlock.
+         */
+        WARN_ON_ONCE(wait && current_is_async());
        va_start(args, fmt);
        ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
        va_end(args);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 098f396aa409..550294d58a02 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -471,7 +471,6 @@ static LIST_HEAD(unoptimizing_list);
 static void kprobe_optimizer(struct work_struct *work);
 static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
-static DECLARE_COMPLETION(optimizer_comp);
 #define OPTIMIZE_DELAY 5
 /*
@@ -552,8 +551,7 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
 /* Start optimizer after OPTIMIZE_DELAY passed */
 static __kprobes void kick_kprobe_optimizer(void)
 {
-        if (!delayed_work_pending(&optimizing_work))
+        schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
-                schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
 }
 /* Kprobe jump optimizer */
@@ -592,16 +590,25 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
        /* Step 5: Kick optimizer again if needed */
        if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
                kick_kprobe_optimizer();
-        else
-                /* Wake up all waiters */
-                complete_all(&optimizer_comp);
 }
 /* Wait for completing optimization and unoptimization */
 static __kprobes void wait_for_kprobe_optimizer(void)
 {
-        if (delayed_work_pending(&optimizing_work))
+        mutex_lock(&kprobe_mutex);
-                wait_for_completion(&optimizer_comp);
+        while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
+                mutex_unlock(&kprobe_mutex);
+                /* this will also make optimizing_work execute immediately */
+                flush_delayed_work(&optimizing_work);
+                /* @optimizing_work might not have been queued yet, relax */
+                cpu_relax();
+                mutex_lock(&kprobe_mutex);
+        }
+        mutex_unlock(&kprobe_mutex);
 }
 /* Optimize kprobe if p is ready to be optimized */
@@ -919,7 +926,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
 }
 #endif /* CONFIG_OPTPROBES */
-#ifdef KPROBES_CAN_USE_FTRACE
+#ifdef CONFIG_KPROBES_ON_FTRACE
 static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
        .func = kprobe_ftrace_handler,
        .flags = FTRACE_OPS_FL_SAVE_REGS,
@@ -964,7 +971,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
                           (unsigned long)p->addr, 1, 0);
        WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
 }
-#else   /* !KPROBES_CAN_USE_FTRACE */
+#else   /* !CONFIG_KPROBES_ON_FTRACE */
 #define prepare_kprobe(p)       arch_prepare_kprobe(p)
 #define arm_kprobe_ftrace(p)    do {} while (0)
 #define disarm_kprobe_ftrace(p) do {} while (0)
@@ -1414,12 +1421,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
         */
        ftrace_addr = ftrace_location((unsigned long)p->addr);
        if (ftrace_addr) {
-#ifdef KPROBES_CAN_USE_FTRACE
+#ifdef CONFIG_KPROBES_ON_FTRACE
                /* Given address is not on the instruction boundary */
                if ((unsigned long)p->addr != ftrace_addr)
                        return -EILSEQ;
                p->flags |= KPROBE_FLAG_FTRACE;
-#else   /* !KPROBES_CAN_USE_FTRACE */
+#else   /* !CONFIG_KPROBES_ON_FTRACE */
                return -EINVAL;
 #endif
        }
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a307cc9c9526..52f23011b6e0 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,6 +19,7 @@
 */
 #include <linux/mutex.h>
 #include <linux/sched.h>
+#include <linux/sched/rt.h>
 #include <linux/export.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
diff --git a/kernel/pid.c b/kernel/pid.c
index de9af600006f..f2c6a6825098 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -331,7 +331,7 @@ out:
        return pid;
 out_unlock:
-        spin_unlock(&pidmap_lock);
+        spin_unlock_irq(&pidmap_lock);
 out_free:
        while (++i <= ns->level)
                free_pidmap(pid->numbers + i);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index a278cad1d5d6..8fd709c9bb58 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -155,11 +155,19 @@ static void bump_cpu_timer(struct k_itimer *timer,
 static inline cputime_t prof_ticks(struct task_struct *p)
 {
-        return p->utime + p->stime;
+        cputime_t utime, stime;
+        task_cputime(p, &utime, &stime);
+        return utime + stime;
 }
 static inline cputime_t virt_ticks(struct task_struct *p)
 {
-        return p->utime;
+        cputime_t utime;
+        task_cputime(p, &utime, NULL);
+        return utime;
 }
 static int
@@ -471,18 +479,23 @@ static void cleanup_timers(struct list_head *head,
 */
 void posix_cpu_timers_exit(struct task_struct *tsk)
 {
+        cputime_t utime, stime;
        add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
                                                sizeof(unsigned long long));
+        task_cputime(tsk, &utime, &stime);
        cleanup_timers(tsk->cpu_timers,
-                       tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
+                       utime, stime, tsk->se.sum_exec_runtime);
 }
 void posix_cpu_timers_exit_group(struct task_struct *tsk)
 {
        struct signal_struct *const sig = tsk->signal;
+        cputime_t utime, stime;
+        task_cputime(tsk, &utime, &stime);
        cleanup_timers(tsk->signal->cpu_timers,
-                       tsk->utime + sig->utime, tsk->stime + sig->stime,
+                       utime + sig->utime, stime + sig->stime,
                       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
 }
@@ -1226,11 +1239,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
 static inline int fastpath_timer_check(struct task_struct *tsk)
 {
        struct signal_struct *sig;
+        cputime_t utime, stime;
+        task_cputime(tsk, &utime, &stime);
        if (!task_cputime_zero(&tsk->cputime_expires)) {
                struct task_cputime task_sample = {
-                        .utime = tsk->utime,
+                        .utime = utime,
-                        .stime = tsk->stime,
+                        .stime = stime,
                        .sum_exec_runtime = tsk->se.sum_exec_runtime
                };
@@ -1401,8 +1417,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
                while (!signal_pending(current)) {
                        if (timer.it.cpu.expires.sched == 0) {
                                /*
-                                 * Our timer fired and was reset.
+                                 * Our timer fired and was reset; the
+                                 * deletion below cannot fail.
                                 */
+                                posix_cpu_timer_del(&timer);
                                spin_unlock_irq(&timer.it_lock);
                                return 0;
                        }
@@ -1420,9 +1438,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
                 * We were interrupted by a signal.
                 */
                sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
-                posix_cpu_timer_set(&timer, 0, &zero_it, it);
+                error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
+                if (!error) {
+                        /*
+                         * Timer is now unarmed, deletion can not fail.
+                         */
+                        posix_cpu_timer_del(&timer);
+                }
                spin_unlock_irq(&timer.it_lock);
+                while (error == TIMER_RETRY) {
+                        /*
+                         * We need to handle the case when the timer was or is
+                         * in the middle of firing. In other cases we have
+                         * already freed the resources.
+                         */
+                        spin_lock_irq(&timer.it_lock);
+                        error = posix_cpu_timer_del(&timer);
+                        spin_unlock_irq(&timer.it_lock);
+                }
                if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
                        /*
                         * It actually did fire already.
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 69185ae6b701..10349d5f2ec3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -997,7 +997,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
        err = kc->clock_adj(which_clock, &ktx);
-        if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
+        if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
                return -EFAULT;
        return err;
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index ca304046d9e2..c6422ffeda9a 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend);
 void queue_up_suspend_work(void)
 {
-        if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON)
+        if (autosleep_state > PM_SUSPEND_ON)
                queue_work(autosleep_wq, &suspend_work);
 }
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c16f9167de1..d77663bfedeb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
 static suspend_state_t decode_state(const char *buf, size_t n)
 {
 #ifdef CONFIG_SUSPEND
-        suspend_state_t state = PM_SUSPEND_STANDBY;
+        suspend_state_t state = PM_SUSPEND_MIN;
        const char * const *s;
 #endif
        char *p;
@@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match);
 #endif /* CONFIG_PM_TRACE */
+#ifdef CONFIG_FREEZER
+static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
+                                      struct kobj_attribute *attr, char *buf)
+{
+        return sprintf(buf, "%u\n", freeze_timeout_msecs);
+}
+static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
+                                       struct kobj_attribute *attr,
+                                       const char *buf, size_t n)
+{
+        unsigned long val;
+        if (kstrtoul(buf, 10, &val))
+                return -EINVAL;
+        freeze_timeout_msecs = val;
+        return n;
+}
+power_attr(pm_freeze_timeout);
+#endif  /* CONFIG_FREEZER */
 static struct attribute * g[] = {
        &state_attr.attr,
 #ifdef CONFIG_PM_TRACE
@@ -576,6 +600,9 @@ static struct attribute * g[] = {
        &pm_print_times_attr.attr,
 #endif
 #endif
+#ifdef CONFIG_FREEZER
+        &pm_freeze_timeout_attr.attr,
+#endif
        NULL,
 };
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d5a258b60c6f..98088e0e71e8 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -21,7 +21,7 @@
 /* 
 * Timeout for stopping processes
 */
-#define TIMEOUT (20 * HZ)
+unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
 static int try_to_freeze_tasks(bool user_only)
 {
@@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only)
        do_gettimeofday(&start);
-        end_time = jiffies + TIMEOUT;
+        end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
        if (!user_only)
                freeze_workqueues_begin();
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9322ff7eaad6..587dddeebf15 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -359,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
                return;
        }
-        if (delayed_work_pending(&req->work))
+        cancel_delayed_work_sync(&req->work);
-                cancel_delayed_work_sync(&req->work);
        if (new_value != req->node.prio)
                pm_qos_update_target(
@@ -386,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
                 "%s called for unknown object.", __func__))
                return;
-        if (delayed_work_pending(&req->work))
+        cancel_delayed_work_sync(&req->work);
-                cancel_delayed_work_sync(&req->work);
        if (new_value != req->node.prio)
                pm_qos_update_target(
@@ -416,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
                return;
        }
-        if (delayed_work_pending(&req->work))
+        cancel_delayed_work_sync(&req->work);
-                cancel_delayed_work_sync(&req->work);
        pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
                             &req->node, PM_QOS_REMOVE_REQ,
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27df..d4feda084a3a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,12 +30,38 @@
 #include "power.h"
 const char *const pm_states[PM_SUSPEND_MAX] = {
+        [PM_SUSPEND_FREEZE]     = "freeze",
        [PM_SUSPEND_STANDBY]    = "standby",
        [PM_SUSPEND_MEM]        = "mem",
 };
 static const struct platform_suspend_ops *suspend_ops;
+static bool need_suspend_ops(suspend_state_t state)
+{
+        return !!(state > PM_SUSPEND_FREEZE);
+}
+static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
+static bool suspend_freeze_wake;
+static void freeze_begin(void)
+{
+        suspend_freeze_wake = false;
+}
+static void freeze_enter(void)
+{
+        wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
+}
+void freeze_wake(void)
+{
+        suspend_freeze_wake = true;
+        wake_up(&suspend_freeze_wait_head);
+}
+EXPORT_SYMBOL_GPL(freeze_wake);
 /**
 * suspend_set_ops - Set the global suspend method table.
 * @ops: Suspend operations to use.
@@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
 bool valid_state(suspend_state_t state)
 {
+        if (state == PM_SUSPEND_FREEZE)
+                return true;
        /*
-         * All states need lowlevel support and need to be valid to the lowlevel
+         * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need lowlevel
+         * support and need to be valid to the lowlevel
         * implementation, no valid callback implies that none are valid.
         */
        return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
@@ -89,11 +118,11 @@ static int suspend_test(int level)
 * hibernation).  Run suspend notifiers, allocate the "suspend" console and
 * freeze processes.
 */
-static int suspend_prepare(void)
+static int suspend_prepare(suspend_state_t state)
 {
        int error;
-        if (!suspend_ops || !suspend_ops->enter)
+        if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
                return -EPERM;
        pm_prepare_console();
@@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 {
        int error;
-        if (suspend_ops->prepare) {
+        if (need_suspend_ops(state) && suspend_ops->prepare) {
                error = suspend_ops->prepare();
                if (error)
                        goto Platform_finish;
@@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
                goto Platform_finish;
        }
-        if (suspend_ops->prepare_late) {
+        if (need_suspend_ops(state) && suspend_ops->prepare_late) {
                error = suspend_ops->prepare_late();
                if (error)
                        goto Platform_wake;
        }
+        /*
+         * PM_SUSPEND_FREEZE equals
+         * frozen processes + suspended devices + idle processors.
+         * Thus we should invoke freeze_enter() soon after
+         * all the devices are suspended.
+         */
+        if (state == PM_SUSPEND_FREEZE) {
+                freeze_enter();
+                goto Platform_wake;
+        }
        if (suspend_test(TEST_PLATFORM))
                goto Platform_wake;
@@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
        enable_nonboot_cpus();
 Platform_wake:
-        if (suspend_ops->wake)
+        if (need_suspend_ops(state) && suspend_ops->wake)
                suspend_ops->wake();
        dpm_resume_start(PMSG_RESUME);
 Platform_finish:
-        if (suspend_ops->finish)
+        if (need_suspend_ops(state) && suspend_ops->finish)
                suspend_ops->finish();
        return error;
@@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state)
        int error;
        bool wakeup = false;
-        if (!suspend_ops)
+        if (need_suspend_ops(state) && !suspend_ops)
                return -ENOSYS;
        trace_machine_suspend(state);
-        if (suspend_ops->begin) {
+        if (need_suspend_ops(state) && suspend_ops->begin) {
                error = suspend_ops->begin(state);
                if (error)
                        goto Close;
@@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state)
        do {
                error = suspend_enter(state, &wakeup);
-        } while (!error && !wakeup
+        } while (!error && !wakeup && need_suspend_ops(state)
                && suspend_ops->suspend_again && suspend_ops->suspend_again());
 Resume_devices:
@@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state)
        ftrace_start();
        resume_console();
 Close:
-        if (suspend_ops->end)
+        if (need_suspend_ops(state) && suspend_ops->end)
                suspend_ops->end();
        trace_machine_suspend(PWR_EVENT_EXIT);
        return error;
 Recover_platform:
-        if (suspend_ops->recover)
+        if (need_suspend_ops(state) && suspend_ops->recover)
                suspend_ops->recover();
        goto Resume_devices;
 }
@@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state)
        if (!mutex_trylock(&pm_mutex))
                return -EBUSY;
+        if (state == PM_SUSPEND_FREEZE)
+                freeze_begin();
        printk(KERN_INFO "PM: Syncing filesystems ... ");
        sys_sync();
        printk("done.\n");
        pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
-        error = suspend_prepare();
+        error = suspend_prepare(state);
        if (error)
                goto Unlock;
diff --git a/kernel/printk.c b/kernel/printk.c
index 267ce780abe8..f24633afa46a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -42,6 +42,7 @@
 #include <linux/notifier.h>
 #include <linux/rculist.h>
 #include <linux/poll.h>
+#include <linux/irq_work.h>
 #include <asm/uaccess.h>
@@ -1959,30 +1960,32 @@ int is_console_locked(void)
 static DEFINE_PER_CPU(int, printk_pending);
 static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
-void printk_tick(void)
+static void wake_up_klogd_work_func(struct irq_work *irq_work)
 {
-        if (__this_cpu_read(printk_pending)) {
+        int pending = __this_cpu_xchg(printk_pending, 0);
-                int pending = __this_cpu_xchg(printk_pending, 0);
-                if (pending & PRINTK_PENDING_SCHED) {
+        if (pending & PRINTK_PENDING_SCHED) {
-                        char *buf = __get_cpu_var(printk_sched_buf);
+                char *buf = __get_cpu_var(printk_sched_buf);
-                        printk(KERN_WARNING "[sched_delayed] %s", buf);
+                printk(KERN_WARNING "[sched_delayed] %s", buf);
-                }
-                if (pending & PRINTK_PENDING_WAKEUP)
-                        wake_up_interruptible(&log_wait);
        }
-}
-int printk_needs_cpu(int cpu)
+        if (pending & PRINTK_PENDING_WAKEUP)
-{
+                wake_up_interruptible(&log_wait);
-        if (cpu_is_offline(cpu))
-                printk_tick();
-        return __this_cpu_read(printk_pending);
 }
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
+        .func = wake_up_klogd_work_func,
+        .flags = IRQ_WORK_LAZY,
+};
 void wake_up_klogd(void)
 {
-        if (waitqueue_active(&log_wait))
+        preempt_disable();
+        if (waitqueue_active(&log_wait)) {
                this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+                irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+        }
+        preempt_enable();
 }
 static void console_cont_flush(char *text, size_t size)
@@ -2462,6 +2465,7 @@ int printk_sched(const char *fmt, ...)
        va_end(args);
        __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
+        irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
        local_irq_restore(flags);
        return r;
diff --git a/kernel/profile.c b/kernel/profile.c
index 1f391819c42f..dc3384ee874e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -37,9 +37,6 @@ struct profile_hit {
 #define NR_PROFILE_HIT          (PAGE_SIZE/sizeof(struct profile_hit))
 #define NR_PROFILE_GRP          (NR_PROFILE_HIT/PROFILE_GRPSZ)
-/* Oprofile timer tick hook */
-static int (*timer_hook)(struct pt_regs *) __read_mostly;
 static atomic_t *prof_buffer;
 static unsigned long prof_len, prof_shift;
@@ -208,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
 }
 EXPORT_SYMBOL_GPL(profile_event_unregister);
-int register_timer_hook(int (*hook)(struct pt_regs *))
-{
-        if (timer_hook)
-                return -EBUSY;
-        timer_hook = hook;
-        return 0;
-}
-EXPORT_SYMBOL_GPL(register_timer_hook);
-void unregister_timer_hook(int (*hook)(struct pt_regs *))
-{
-        WARN_ON(hook != timer_hook);
-        timer_hook = NULL;
-        /* make sure all CPUs see the NULL hook */
-        synchronize_sched();  /* Allow ongoing interrupts to complete. */
-}
-EXPORT_SYMBOL_GPL(unregister_timer_hook);
 #ifdef CONFIG_SMP
 /*
 * Each cpu has a pair of open-addressed hashtables for pending
@@ -436,8 +414,6 @@ void profile_tick(int type)
 {
        struct pt_regs *regs = get_irq_regs();
-        if (type == CPU_PROFILING && timer_hook)
-                timer_hook(regs);
        if (!user_mode(regs) && prof_cpu_mask != NULL &&
            cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
                profile_hit(type, (void *)profile_pc(regs));
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6cbeaae4406d..acbd28424d81 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -712,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
                                             kiov->iov_len, kiov->iov_base);
 }
+/*
+ * This is declared in linux/regset.h and defined in machine-dependent
+ * code.  We put the export here, near the primary machine-neutral use,
+ * to ensure no machine forgets it.
+ */
+EXPORT_SYMBOL_GPL(task_user_regset_view);
 #endif
 int ptrace_request(struct task_struct *child, long request,
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 20dfba576c2b..7f8e7590e3e5 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -111,4 +111,11 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
 extern int rcu_expedited;
+#ifdef CONFIG_RCU_STALL_COMMON
+extern int rcu_cpu_stall_suppress;
+int rcu_jiffies_till_stall_check(void);
+#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
 #endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a2cf76177b44..48ab70384a4c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -404,11 +404,65 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
 #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
-void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
+void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp,
+                               unsigned long secs,
+                               unsigned long c_old, unsigned long c)
 {
-        trace_rcu_torture_read(rcutorturename, rhp);
+        trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
 }
 EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
 #else
-#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
+#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+        do { } while (0)
 #endif
+#ifdef CONFIG_RCU_STALL_COMMON
+#ifdef CONFIG_PROVE_RCU
+#define RCU_STALL_DELAY_DELTA          (5 * HZ)
+#else
+#define RCU_STALL_DELAY_DELTA          0
+#endif
+int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
+module_param(rcu_cpu_stall_suppress, int, 0644);
+module_param(rcu_cpu_stall_timeout, int, 0644);
+int rcu_jiffies_till_stall_check(void)
+{
+        int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
+        /*
+         * Limit check must be consistent with the Kconfig limits
+         * for CONFIG_RCU_CPU_STALL_TIMEOUT.
+         */
+        if (till_stall_check < 3) {
+                ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
+                till_stall_check = 3;
+        } else if (till_stall_check > 300) {
+                ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
+                till_stall_check = 300;
+        }
+        return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
+}
+static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
+{
+        rcu_cpu_stall_suppress = 1;
+        return NOTIFY_DONE;
+}
+static struct notifier_block rcu_panic_block = {
+        .notifier_call = rcu_panic,
+};
+static int __init check_cpu_stall_init(void)
+{
+        atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
+        return 0;
+}
+early_initcall(check_cpu_stall_init);
+#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e7dce58f9c2a..a0714a51b6d7 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head,
                       void (*func)(struct rcu_head *rcu),
                       struct rcu_ctrlblk *rcp);
-#include "rcutiny_plugin.h"
 static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+#include "rcutiny_plugin.h"
 /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
 static void rcu_idle_enter_common(long long newval)
 {
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
 * interrupts don't count, we must be running at the first interrupt
 * level.
 */
-int rcu_is_cpu_rrupt_from_idle(void)
+static int rcu_is_cpu_rrupt_from_idle(void)
 {
        return rcu_dynticks_nesting <= 1;
 }
@@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void)
 */
 static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
 {
+        reset_cpu_stall_ticks(rcp);
        if (rcp->rcucblist != NULL &&
            rcp->donetail != rcp->curtail) {
                rcp->donetail = rcp->curtail;
@@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu)
 */
 void rcu_check_callbacks(int cpu, int user)
 {
+        check_cpu_stalls();
        if (user || rcu_is_cpu_rrupt_from_idle())
                rcu_sched_qs(cpu);
        else if (!in_softirq())
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f85016a2309b..8a233002faeb 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -33,6 +33,9 @@ struct rcu_ctrlblk {
        struct rcu_head **donetail;     /* ->next pointer of last "done" CB. */
        struct rcu_head **curtail;      /* ->next pointer of last CB. */
        RCU_TRACE(long qlen);           /* Number of pending CBs. */
+        RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
+        RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
+        RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
        RCU_TRACE(char *name);          /* Name of RCU type. */
 };
@@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly;
 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+#ifdef CONFIG_RCU_TRACE
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+        unsigned long j;
+        unsigned long js;
+        if (rcu_cpu_stall_suppress)
+                return;
+        rcp->ticks_this_gp++;
+        j = jiffies;
+        js = rcp->jiffies_stall;
+        if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
+                pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
+                       rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
+                       jiffies - rcp->gp_start, rcp->qlen);
+                dump_stack();
+        }
+        if (*rcp->curtail && ULONG_CMP_GE(j, js))
+                rcp->jiffies_stall = jiffies +
+                        3 * rcu_jiffies_till_stall_check() + 3;
+        else if (ULONG_CMP_GE(j, js))
+                rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+}
+static void check_cpu_stall_preempt(void);
+#endif /* #ifdef CONFIG_RCU_TRACE */
+static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
+{
+#ifdef CONFIG_RCU_TRACE
+        rcp->ticks_this_gp = 0;
+        rcp->gp_start = jiffies;
+        rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
+#endif /* #ifdef CONFIG_RCU_TRACE */
+}
+static void check_cpu_stalls(void)
+{
+        RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
+        RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
+        RCU_TRACE(check_cpu_stall_preempt());
+}
 #ifdef CONFIG_TINY_PREEMPT_RCU
 #include <linux/delay.h>
@@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void)
                /* Official start of GP. */
                rcu_preempt_ctrlblk.gpnum++;
                RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
+                reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
                /* Any blocked RCU readers block new GP. */
                if (rcu_preempt_blocked_readers_any())
@@ -1054,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney");
 MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
 MODULE_LICENSE("GPL");
+static void check_cpu_stall_preempt(void)
+{
+#ifdef CONFIG_TINY_PREEMPT_RCU
+        check_cpu_stall(&rcu_preempt_ctrlblk.rcb);
+#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */
+}
 #endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 31dea01c85fd..e1f3a8c96724 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -46,6 +46,7 @@
 #include <linux/stat.h>
 #include <linux/srcu.h>
 #include <linux/slab.h>
+#include <linux/trace_clock.h>
 #include <asm/byteorder.h>
 MODULE_LICENSE("GPL");
@@ -207,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
 #define rcu_can_boost() 0
 #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
+#ifdef CONFIG_RCU_TRACE
+static u64 notrace rcu_trace_clock_local(void)
+{
+        u64 ts = trace_clock_local();
+        unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+        return ts;
+}
+#else /* #ifdef CONFIG_RCU_TRACE */
+static u64 notrace rcu_trace_clock_local(void)
+{
+        return 0ULL;
+}
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
 static unsigned long shutdown_time;     /* jiffies to system shutdown. */
 static unsigned long boost_starttime;   /* jiffies of next boost test start. */
 DEFINE_MUTEX(boost_mutex);              /* protect setting boost_starttime */
@@ -845,7 +860,7 @@ static int rcu_torture_boost(void *arg)
                /* Wait for the next test interval. */
                oldstarttime = boost_starttime;
                while (ULONG_CMP_LT(jiffies, oldstarttime)) {
-                        schedule_timeout_uninterruptible(1);
+                        schedule_timeout_interruptible(oldstarttime - jiffies);
                        rcu_stutter_wait("rcu_torture_boost");
                        if (kthread_should_stop() ||
                            fullstop != FULLSTOP_DONTSTOP)
@@ -1028,7 +1043,6 @@ void rcutorture_trace_dump(void)
                return;
        if (atomic_xchg(&beenhere, 1) != 0)
                return;
-        do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
        ftrace_dump(DUMP_ALL);
 }
@@ -1042,13 +1056,16 @@ static void rcu_torture_timer(unsigned long unused)
 {
        int idx;
        int completed;
+        int completed_end;
        static DEFINE_RCU_RANDOM(rand);
        static DEFINE_SPINLOCK(rand_lock);
        struct rcu_torture *p;
        int pipe_count;
+        unsigned long long ts;
        idx = cur_ops->readlock();
        completed = cur_ops->completed();
+        ts = rcu_trace_clock_local();
        p = rcu_dereference_check(rcu_torture_current,
                                  rcu_read_lock_bh_held() ||
                                  rcu_read_lock_sched_held() ||
@@ -1058,7 +1075,6 @@ static void rcu_torture_timer(unsigned long unused)
                cur_ops->readunlock(idx);
                return;
        }
-        do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
        if (p->rtort_mbtest == 0)
                atomic_inc(&n_rcu_torture_mberror);
        spin_lock(&rand_lock);
@@ -1071,10 +1087,14 @@ static void rcu_torture_timer(unsigned long unused)
                /* Should not happen, but... */
                pipe_count = RCU_TORTURE_PIPE_LEN;
        }
-        if (pipe_count > 1)
+        completed_end = cur_ops->completed();
+        if (pipe_count > 1) {
+                do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
+                                          completed, completed_end);
                rcutorture_trace_dump();
+        }
        __this_cpu_inc(rcu_torture_count[pipe_count]);
-        completed = cur_ops->completed() - completed;
+        completed = completed_end - completed;
        if (completed > RCU_TORTURE_PIPE_LEN) {
                /* Should not happen, but... */
                completed = RCU_TORTURE_PIPE_LEN;
@@ -1094,11 +1114,13 @@ static int
 rcu_torture_reader(void *arg)
 {
        int completed;
+        int completed_end;
        int idx;
        DEFINE_RCU_RANDOM(rand);
        struct rcu_torture *p;
        int pipe_count;
        struct timer_list t;
+        unsigned long long ts;
        VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
        set_user_nice(current, 19);
@@ -1112,6 +1134,7 @@ rcu_torture_reader(void *arg)
                }
                idx = cur_ops->readlock();
                completed = cur_ops->completed();
+                ts = rcu_trace_clock_local();
                p = rcu_dereference_check(rcu_torture_current,
                                          rcu_read_lock_bh_held() ||
                                          rcu_read_lock_sched_held() ||
@@ -1122,7 +1145,6 @@ rcu_torture_reader(void *arg)
                        schedule_timeout_interruptible(HZ);
                        continue;
                }
-                do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
                if (p->rtort_mbtest == 0)
                        atomic_inc(&n_rcu_torture_mberror);
                cur_ops->read_delay(&rand);
@@ -1132,10 +1154,14 @@ rcu_torture_reader(void *arg)
                        /* Should not happen, but... */
                        pipe_count = RCU_TORTURE_PIPE_LEN;
                }
-                if (pipe_count > 1)
+                completed_end = cur_ops->completed();
+                if (pipe_count > 1) {
+                        do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
+                                                  ts, completed, completed_end);
                        rcutorture_trace_dump();
+                }
                __this_cpu_inc(rcu_torture_count[pipe_count]);
-                completed = cur_ops->completed() - completed;
+                completed = completed_end - completed;
                if (completed > RCU_TORTURE_PIPE_LEN) {
                        /* Should not happen, but... */
                        completed = RCU_TORTURE_PIPE_LEN;
@@ -1301,19 +1327,35 @@ static void rcu_torture_shuffle_tasks(void)
                                set_cpus_allowed_ptr(reader_tasks[i],
                                                     shuffle_tmp_mask);
        }
        if (fakewriter_tasks) {
                for (i = 0; i < nfakewriters; i++)
                        if (fakewriter_tasks[i])
                                set_cpus_allowed_ptr(fakewriter_tasks[i],
                                                     shuffle_tmp_mask);
        }
        if (writer_task)
                set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
        if (stats_task)
                set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
+        if (stutter_task)
+                set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
+        if (fqs_task)
+                set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
+        if (shutdown_task)
+                set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
+#ifdef CONFIG_HOTPLUG_CPU
+        if (onoff_task)
+                set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+        if (stall_task)
+                set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
+        if (barrier_cbs_tasks)
+                for (i = 0; i < n_barrier_cbs; i++)
+                        if (barrier_cbs_tasks[i])
+                                set_cpus_allowed_ptr(barrier_cbs_tasks[i],
+                                                     shuffle_tmp_mask);
+        if (barrier_task)
+                set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
        if (rcu_idle_cpu == -1)
                rcu_idle_cpu = num_online_cpus() - 1;
@@ -1749,7 +1791,7 @@ static int rcu_torture_barrier_init(void)
        barrier_cbs_wq =
                kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
                        GFP_KERNEL);
-        if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0)
+        if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
                return -ENOMEM;
        for (i = 0; i < n_barrier_cbs; i++) {
                init_waitqueue_head(&barrier_cbs_wq[i]);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e441b77b614e..5b8ad827fd86 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -105,7 +105,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
 * The rcu_scheduler_active variable transitions from zero to one just
 * before the first task is spawned.  So when this variable is zero, RCU
 * can assume that there is but one task, allowing RCU to (for example)
- * optimized synchronize_sched() to a simple barrier().  When this variable
+ * optimize synchronize_sched() to a simple barrier().  When this variable
 * is one, RCU must actually do all the hard work required to detect real
 * grace periods.  This variable is also used to suppress boot-time false
 * positives from lockdep-RCU error checking.
@@ -217,12 +217,6 @@ module_param(blimit, long, 0444);
 module_param(qhimark, long, 0444);
 module_param(qlowmark, long, 0444);
-int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
-int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
-module_param(rcu_cpu_stall_suppress, int, 0644);
-module_param(rcu_cpu_stall_timeout, int, 0644);
 static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
 static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
@@ -305,17 +299,27 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
 }
 /*
- * Does the current CPU require a yet-as-unscheduled grace period?
+ * Does the current CPU require a not-yet-started grace period?
+ * The caller must have disabled interrupts to prevent races with
+ * normal callback registry.
 */
 static int
 cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-        struct rcu_head **ntp;
+        int i;
-        ntp = rdp->nxttail[RCU_DONE_TAIL +
+        if (rcu_gp_in_progress(rsp))
-                           (ACCESS_ONCE(rsp->completed) != rdp->completed)];
+                return 0;  /* No, a grace period is already in progress. */
-        return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp &&
+        if (!rdp->nxttail[RCU_NEXT_TAIL])
-               !rcu_gp_in_progress(rsp);
+                return 0;  /* No, this is a no-CBs (or offline) CPU. */
+        if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
+                return 1;  /* Yes, this CPU has newly registered callbacks. */
+        for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
+                if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
+                    ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+                                 rdp->nxtcompleted[i]))
+                        return 1;  /* Yes, CBs for future grace period. */
+        return 0; /* No grace period needed. */
 }
 /*
@@ -336,7 +340,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
 static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
                                bool user)
 {
-        trace_rcu_dyntick("Start", oldval, 0);
+        trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
        if (!user && !is_idle_task(current)) {
                struct task_struct *idle = idle_task(smp_processor_id());
@@ -727,7 +731,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
 * interrupt from idle, return true.  The caller must have at least
 * disabled preemption.
 */
-int rcu_is_cpu_rrupt_from_idle(void)
+static int rcu_is_cpu_rrupt_from_idle(void)
 {
        return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
 }
@@ -793,28 +797,10 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
        return 0;
 }
-static int jiffies_till_stall_check(void)
-{
-        int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
-        /*
-         * Limit check must be consistent with the Kconfig limits
-         * for CONFIG_RCU_CPU_STALL_TIMEOUT.
-         */
-        if (till_stall_check < 3) {
-                ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
-                till_stall_check = 3;
-        } else if (till_stall_check > 300) {
-                ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
-                till_stall_check = 300;
-        }
-        return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
-}
 static void record_gp_stall_check_time(struct rcu_state *rsp)
 {
        rsp->gp_start = jiffies;
-        rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
+        rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
 }
 /*
@@ -857,7 +843,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;
        }
-        rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;
+        rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
        /*
@@ -935,7 +921,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
        raw_spin_lock_irqsave(&rnp->lock, flags);
        if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
                rsp->jiffies_stall = jiffies +
-                                     3 * jiffies_till_stall_check() + 3;
+                                     3 * rcu_jiffies_till_stall_check() + 3;
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
        set_need_resched();  /* kick ourselves to get things going. */
@@ -966,12 +952,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
        }
 }
-static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
-{
-        rcu_cpu_stall_suppress = 1;
-        return NOTIFY_DONE;
-}
 /**
 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
 *
@@ -989,15 +969,6 @@ void rcu_cpu_stall_reset(void)
                rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
 }
-static struct notifier_block rcu_panic_block = {
-        .notifier_call = rcu_panic,
-};
-static void __init check_cpu_stall_init(void)
-{
-        atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
-}
 /*
 * Update CPU-local rcu_data state to record the newly noticed grace period.
 * This is used both when we started the grace period and when we notice
@@ -1071,6 +1042,145 @@ static void init_callback_list(struct rcu_data *rdp)
 }
 /*
+ * Determine the value that ->completed will have at the end of the
+ * next subsequent grace period.  This is used to tag callbacks so that
+ * a CPU can invoke callbacks in a timely fashion even if that CPU has
+ * been dyntick-idle for an extended period with callbacks under the
+ * influence of RCU_FAST_NO_HZ.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
+                                       struct rcu_node *rnp)
+{
+        /*
+         * Only the root rcu_node structure can vouch for RCU being idle:
+         * on any other node, a new grace period might already have
+         * started without yet having initialized that node.  When the
+         * root says idle, callbacks need just the next grace period;
+         * otherwise they must wait for a possible partial grace period
+         * and then the subsequent full grace period.
+         */
+        bool root_idle = rnp == rcu_get_root(rsp) &&
+                         rnp->completed == rnp->gpnum;
+        return rnp->completed + (root_idle ? 1 : 2);
+}
+/*
+ * If there is room, assign a ->completed number to any callbacks on
+ * this CPU that have not already been assigned.  Also accelerate any
+ * callbacks that were previously assigned a ->completed number that has
+ * since proven to be too conservative, which can happen if callbacks get
+ * assigned a ->completed number while RCU is idle, but with reference to
+ * a non-root rcu_node structure.  This function is idempotent, so it does
+ * not hurt to call it repeatedly.
+ *
+ * @rsp: RCU state; used to compute the target ->completed number and to
+ *       name the trace event.
+ * @rnp: rcu_node structure from whose ->completed value the target
+ *       number is derived.
+ * @rdp: per-CPU data whose ->nxttail[] and ->nxtcompleted[] entries are
+ *       updated.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+                               struct rcu_data *rdp)
+{
+        unsigned long c;
+        int i;
+        /*
+         * Nothing to do if ->nxttail[RCU_NEXT_TAIL] is NULL or if no
+         * callbacks follow the RCU_DONE_TAIL sublist.
+         */
+        if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+                return;
+        /*
+         * Starting from the sublist containing the callbacks most
+         * recently assigned a ->completed number and working down, find the
+         * first sublist that is not assignable to an upcoming grace period.
+         * Such a sublist has something in it (first test) and has
+         * a ->completed number assigned that will complete sooner than
+         * the ->completed number for newly arrived callbacks (second test).
+         *
+         * The key point is that any later sublist can be assigned the
+         * same ->completed number as the newly arrived callbacks, which
+         * means that the callbacks in any of these later sublists can be
+         * grouped into a single sublist, whether or not they have already
+         * been assigned a ->completed number.
+         */
+        c = rcu_cbs_completed(rsp, rnp);
+        for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
+                if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
+                    !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
+                        break;
+        /*
+         * If there are no sublists for unassigned callbacks, leave.
+         * At the same time, advance "i" one sublist, so that "i" will
+         * index into the sublist where all the remaining callbacks should
+         * be grouped into.
+         */
+        if (++i >= RCU_NEXT_TAIL)
+                return;
+        /*
+         * Assign all subsequent callbacks' ->completed number to the next
+         * full grace period and group them all in the sublist initially
+         * indexed by "i".  Every tail from "i" through RCU_NEXT_TAIL is
+         * pointed at the end of the list and tagged with "c".
+         */
+        for (; i <= RCU_NEXT_TAIL; i++) {
+                rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
+                rdp->nxtcompleted[i] = c;
+        }
+        /*
+         * Trace depending on how much we were able to accelerate:
+         * "AccWaitCB" if nothing follows the RCU_WAIT_TAIL sublist,
+         * "AccReadyCB" otherwise.
+         */
+        if (!*rdp->nxttail[RCU_WAIT_TAIL])
+                trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB");
+        else
+                trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB");
+}
+/*
+ * Move any callbacks whose grace period has completed to the
+ * RCU_DONE_TAIL sublist, then compact the remaining sublists and
+ * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
+ * sublist.  This function is idempotent, so it does not hurt to
+ * invoke it repeatedly.  As long as it is not invoked -too- often...
+ *
+ * @rsp and @rnp are passed through to rcu_accelerate_cbs(); @rnp also
+ * supplies the ->completed value that sublists are compared against.
+ * @rdp is the per-CPU data whose callback sublists are rearranged.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+                            struct rcu_data *rdp)
+{
+        int i, j;
+        /*
+         * Nothing to do if ->nxttail[RCU_NEXT_TAIL] is NULL or if no
+         * callbacks follow the RCU_DONE_TAIL sublist.
+         */
+        if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+                return;
+        /*
+         * Find all callbacks whose ->completed numbers indicate that they
+         * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
+         * On exit, "i" indexes the first sublist whose ->completed number
+         * is still ahead of rnp->completed, or RCU_NEXT_TAIL if none is.
+         */
+        for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
+                if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
+                        break;
+                rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
+        }
+        /*
+         * Clean up any sublist tail pointers that were misordered above:
+         * the sublists below "i" were absorbed into RCU_DONE_TAIL, so
+         * their tails are made equal to the RCU_DONE_TAIL tail.
+         */
+        for (j = RCU_WAIT_TAIL; j < i; j++)
+                rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
+        /*
+         * Copy down callbacks to fill in empty sublists.  "i" continues
+         * from the first not-yet-completed sublist while "j" restarts at
+         * RCU_WAIT_TAIL; stop once slot "j" already ends at the list tail.
+         */
+        for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
+                if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
+                        break;
+                rdp->nxttail[j] = rdp->nxttail[i];
+                rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
+        }
+        /* Classify any remaining callbacks. */
+        rcu_accelerate_cbs(rsp, rnp, rdp);
+}
+/*
 * Advance this CPU's callbacks, but only if the current grace period
 * has ended.  This may be called only from the CPU to whom the rdp
 * belongs.  In addition, the corresponding leaf rcu_node structure's
@@ -1080,12 +1190,15 @@ static void
 __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
 {
        /* Did another grace period end? */
-        if (rdp->completed != rnp->completed) {
+        if (rdp->completed == rnp->completed) {
-                /* Advance callbacks.  No harm if list empty. */
+                /* No, so just accelerate recent callbacks. */
-                rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+                rcu_accelerate_cbs(rsp, rnp, rdp);
-                rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
-                rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+        } else {
+                /* Advance callbacks. */
+                rcu_advance_cbs(rsp, rnp, rdp);
                /* Remember that we saw this grace-period completion. */
                rdp->completed = rnp->completed;
@@ -1392,17 +1505,10 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
        /*
         * Because there is no grace period in progress right now,
         * any callbacks we have up to this point will be satisfied
-         * by the next grace period.  So promote all callbacks to be
+         * by the next grace period.  So this is a good place to
-         * handled after the end of the next grace period.  If the
+         * assign a grace period number to recently posted callbacks.
-         * CPU is not yet aware of the end of the previous grace period,
-         * we need to allow for the callback advancement that will
-         * occur when it does become aware.  Deadlock prevents us from
-         * making it aware at this point: We cannot acquire a leaf
-         * rcu_node ->lock while holding the root rcu_node ->lock.
         */
-        rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+        rcu_accelerate_cbs(rsp, rnp, rdp);
-        if (rdp->completed == rsp->completed)
-                rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
        rsp->gp_flags = RCU_GP_FLAG_INIT;
        raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
@@ -1527,7 +1633,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
                 * This GP can't end until cpu checks in, so all of our
                 * callbacks can be processed during the next GP.
                 */
-                rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+                rcu_accelerate_cbs(rsp, rnp, rdp);
                rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
        }
@@ -1779,7 +1885,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
        long bl, count, count_lazy;
        int i;
-        /* If no callbacks are ready, just return.*/
+        /* If no callbacks are ready, just return. */
        if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
                trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
                trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
@@ -2008,19 +2114,19 @@ __rcu_process_callbacks(struct rcu_state *rsp)
        WARN_ON_ONCE(rdp->beenonline == 0);
-        /*
+        /* Handle the end of a grace period that some other CPU ended.  */
-         * Advance callbacks in response to end of earlier grace
-         * period that some other CPU ended.
-         */
        rcu_process_gp_end(rsp, rdp);
        /* Update RCU state based on any recent quiescent states. */
        rcu_check_quiescent_state(rsp, rdp);
        /* Does this CPU require a not-yet-started grace period? */
+        local_irq_save(flags);
        if (cpu_needs_another_gp(rsp, rdp)) {
-                raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
+                raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
                rcu_start_gp(rsp, flags);  /* releases above lock */
+        } else {
+                local_irq_restore(flags);
        }
        /* If there are callbacks ready, invoke them. */
@@ -2719,9 +2825,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
        rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
        WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
        WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
-#ifdef CONFIG_RCU_USER_QS
-        WARN_ON_ONCE(rdp->dynticks->in_user);
-#endif
        rdp->cpu = cpu;
        rdp->rsp = rsp;
        rcu_boot_init_nocb_percpu_data(rdp);
@@ -2938,6 +3041,10 @@ static void __init rcu_init_one(struct rcu_state *rsp,
        BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */
+        /* Silence gcc 4.8 warning about array index out of range. */
+        if (rcu_num_lvls > RCU_NUM_LVLS)
+                panic("rcu_init_one: rcu_num_lvls overflow");
        /* Initialize the level-tracking arrays. */
        for (i = 0; i < rcu_num_lvls; i++)
@@ -3074,7 +3181,6 @@ void __init rcu_init(void)
        cpu_notifier(rcu_cpu_notify, 0);
        for_each_online_cpu(cpu)
                rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
-        check_cpu_stall_init();
 }
 #include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4b69291b093d..c896b5045d9d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -102,10 +102,6 @@ struct rcu_dynticks {
                                    /* idle-period nonlazy_posted snapshot. */
        int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-#ifdef CONFIG_RCU_USER_QS
-        bool ignore_user_qs;        /* Treat userspace as extended QS or not */
-        bool in_user;               /* Is the CPU in userland from RCU POV? */
-#endif
 };
 /* RCU's kthread states for tracing. */
@@ -282,6 +278,8 @@ struct rcu_data {
         */
        struct rcu_head *nxtlist;
        struct rcu_head **nxttail[RCU_NEXT_SIZE];
+        unsigned long   nxtcompleted[RCU_NEXT_SIZE];
+                                        /* grace periods for sublists. */
        long            qlen_lazy;      /* # of lazy queued callbacks */
        long            qlen;           /* # of queued callbacks, incl lazy */
        long            qlen_last_fqs_check;
@@ -343,11 +341,6 @@ struct rcu_data {
 #define RCU_JIFFIES_TILL_FORCE_QS        3      /* for rsp->jiffies_force_qs */
-#ifdef CONFIG_PROVE_RCU
-#define RCU_STALL_DELAY_DELTA          (5 * HZ)
-#else
-#define RCU_STALL_DELAY_DELTA          0
-#endif
 #define RCU_STALL_RAT_DELAY             2       /* Allow other CPUs time */
                                                /*  to take at least one */
                                                /*  scheduling clock irq */
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 16502d3a71c8..13b243a323fa 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -17,6 +17,7 @@
 * See rt.c in preempt-rt for proper credits and further information
 */
 #include <linux/sched.h>
+#include <linux/sched/rt.h>
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/spinlock.h>
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 98ec49475460..7890b10084a7 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -10,6 +10,7 @@
 #include <linux/kthread.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/rt.h>
 #include <linux/spinlock.h>
 #include <linux/timer.h>
 #include <linux/freezer.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a242e691c993..1e09308bf2a1 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -13,6 +13,7 @@
 #include <linux/spinlock.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/rt.h>
 #include <linux/timer.h>
 #include "rtmutex_common.h"
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21076a3..64de5f8b0c9e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
        ag->tg->rt_se = NULL;
        ag->tg->rt_rq = NULL;
 #endif
+        sched_offline_group(ag->tg);
        sched_destroy_group(ag->tg);
 }
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
        if (IS_ERR(tg))
                goto out_free;
+        sched_online_group(tg, &root_task_group);
        kref_init(&ag->kref);
        init_rwsem(&ag->lock);
        ag->id = atomic_inc_return(&autogroup_seq_nr);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 26058d0bebba..3a673a3b0c6b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,7 +83,7 @@
 #endif
 #include "sched.h"
-#include "../workqueue_sched.h"
+#include "../workqueue_internal.h"
 #include "../smpboot.h"
 #define CREATE_TRACE_POINTS
@@ -4371,7 +4371,7 @@ bool __sched yield_to(struct task_struct *p, bool preempt)
        struct task_struct *curr = current;
        struct rq *rq, *p_rq;
        unsigned long flags;
-        bool yielded = 0;
+        int yielded = 0;
        local_irq_save(flags);
        rq = this_rq();
@@ -4667,6 +4667,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         */
        idle->sched_class = &idle_sched_class;
        ftrace_graph_init_idle_task(idle, cpu);
+        vtime_init_idle(idle);
 #if defined(CONFIG_SMP)
        sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
@@ -7160,7 +7161,6 @@ static void free_sched_group(struct task_group *tg)
 struct task_group *sched_create_group(struct task_group *parent)
 {
        struct task_group *tg;
-        unsigned long flags;
        tg = kzalloc(sizeof(*tg), GFP_KERNEL);
        if (!tg)
@@ -7172,6 +7172,17 @@ struct task_group *sched_create_group(struct task_group *parent)
        if (!alloc_rt_sched_group(tg, parent))
                goto err;
+        return tg;
+err:
+        free_sched_group(tg);
+        return ERR_PTR(-ENOMEM);
+}
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+        unsigned long flags;
        spin_lock_irqsave(&task_group_lock, flags);
        list_add_rcu(&tg->list, &task_groups);
@@ -7181,12 +7192,6 @@ struct task_group *sched_create_group(struct task_group *parent)
        INIT_LIST_HEAD(&tg->children);
        list_add_rcu(&tg->siblings, &parent->children);
        spin_unlock_irqrestore(&task_group_lock, flags);
-        return tg;
-err:
-        free_sched_group(tg);
-        return ERR_PTR(-ENOMEM);
 }
 /* rcu callback to free various structures associated with a task group */
@@ -7199,6 +7204,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
 /* Destroy runqueue etc associated with a task group */
 void sched_destroy_group(struct task_group *tg)
 {
+        /*
+         * Free via an RCU callback so that possible concurrent
+         * references to cfs_rqs can complete first.
+         */
+        call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+void sched_offline_group(struct task_group *tg)
+{
        unsigned long flags;
        int i;
@@ -7210,9 +7221,6 @@ void sched_destroy_group(struct task_group *tg)
        list_del_rcu(&tg->list);
        list_del_rcu(&tg->siblings);
        spin_unlock_irqrestore(&task_group_lock, flags);
-        /* wait for possible concurrent references to cfs_rqs complete */
-        call_rcu(&tg->rcu, free_sched_group_rcu);
 }
 /* change task's runqueue when it moves between groups.
@@ -7508,6 +7516,25 @@ static int sched_rt_global_constraints(void)
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
+/*
+ * sysctl handler for the round-robin timeslice.  Callers are serialized
+ * by a function-local mutex, proc_dointvec() performs the actual read or
+ * write, and after a successful write sched_rr_timeslice is rewritten
+ * for internal use.  Returns whatever proc_dointvec() returned.
+ */
+int sched_rr_handler(struct ctl_table *table, int write,
+                void __user *buffer, size_t *lenp,
+                loff_t *ppos)
+{
+        static DEFINE_MUTEX(mutex);
+        int ret;
+        mutex_lock(&mutex);
+        ret = proc_dointvec(table, write, buffer, lenp, ppos);
+        if (!ret && write) {
+                /*
+                 * Internally the timeslice is kept in jiffies; a written
+                 * value of zero (or less) resets it to the default.
+                 */
+                if (sched_rr_timeslice <= 0)
+                        sched_rr_timeslice = RR_TIMESLICE;
+                else
+                        sched_rr_timeslice =
+                                msecs_to_jiffies(sched_rr_timeslice);
+        }
+        mutex_unlock(&mutex);
+        return ret;
+}
 int sched_rt_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
@@ -7564,6 +7591,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
        return &tg->css;
 }
+/*
+ * css_online callback: bring the cgroup's task group online under its
+ * parent's task group.  A cgroup without a parent is left untouched.
+ * Always returns 0.
+ */
+static int cpu_cgroup_css_online(struct cgroup *cgrp)
+{
+        struct task_group *tg = cgroup_tg(cgrp);
+        if (cgrp->parent)
+                sched_online_group(tg, cgroup_tg(cgrp->parent));
+        return 0;
+}
 static void cpu_cgroup_css_free(struct cgroup *cgrp)
 {
        struct task_group *tg = cgroup_tg(cgrp);
@@ -7571,6 +7611,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
        sched_destroy_group(tg);
 }
+/* css_offline callback: take the cgroup's task group offline. */
+static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+{
+        sched_offline_group(cgroup_tg(cgrp));
+}
 static int cpu_cgroup_can_attach(struct cgroup *cgrp,
                                 struct cgroup_taskset *tset)
 {
@@ -7926,6 +7973,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
        .name           = "cpu",
        .css_alloc      = cpu_cgroup_css_alloc,
        .css_free       = cpu_cgroup_css_free,
+        .css_online     = cpu_cgroup_css_online,
+        .css_offline    = cpu_cgroup_css_offline,
        .can_attach     = cpu_cgroup_can_attach,
        .attach         = cpu_cgroup_attach,
        .exit           = cpu_cgroup_exit,
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 23aa789c53ee..1095e878a46f 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -28,6 +28,8 @@
 */
 #include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
 #include "cpupri.h"
 /* Convert between a 140 based task->prio, and our 102 based cpupri */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 293b202fcf79..9857329ed280 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/kernel_stat.h>
 #include <linux/static_key.h>
+#include <linux/context_tracking.h>
 #include "sched.h"
@@ -163,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
        task_group_account_field(p, index, (__force u64) cputime);
        /* Account for user time used */
-        acct_update_integrals(p);
+        acct_account_cputime(p);
 }
 /*
@@ -213,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
        task_group_account_field(p, index, (__force u64) cputime);
        /* Account for system time used */
-        acct_update_integrals(p);
+        acct_account_cputime(p);
 }
 /*
@@ -295,6 +296,7 @@ static __always_inline bool steal_account_process_tick(void)
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 {
        struct signal_struct *sig = tsk->signal;
+        cputime_t utime, stime;
        struct task_struct *t;
        times->utime = sig->utime;
@@ -308,16 +310,15 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
        t = tsk;
        do {
-                times->utime += t->utime;
+                /*
+                 * Sample the thread being visited (t), not the entry
+                 * thread (tsk); otherwise tsk's times would be added
+                 * once per thread in the group.
+                 */
+                task_cputime(t, &utime, &stime);
-                times->stime += t->stime;
+                times->utime += utime;
+                times->stime += stime;
                times->sum_exec_runtime += task_sched_runtime(t);
        } while_each_thread(tsk, t);
 out:
        rcu_read_unlock();
 }
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 /*
 * Account a tick to a process and cpustat
@@ -382,11 +383,12 @@ static void irqtime_account_idle_ticks(int ticks)
                irqtime_account_process_tick(current, 0, rq);
 }
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static void irqtime_account_idle_ticks(int ticks) {}
+static inline void irqtime_account_idle_ticks(int ticks) {}
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
                                                struct rq *rq) {}
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 /*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
@@ -397,6 +399,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
        cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
        struct rq *rq = this_rq();
+        if (vtime_accounting_enabled())
+                return;
        if (sched_clock_irqtime) {
                irqtime_account_process_tick(p, user_tick, rq);
                return;
@@ -438,8 +443,7 @@ void account_idle_ticks(unsigned long ticks)
        account_idle_time(jiffies_to_cputime(ticks));
 }
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-#endif
 /*
 * Use precise platform statistics if available:
@@ -461,25 +465,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
        *st = cputime.stime;
 }
-void vtime_account_system_irqsafe(struct task_struct *tsk)
-{
-        unsigned long flags;
-        local_irq_save(flags);
-        vtime_account_system(tsk);
-        local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
 #ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+/*
+ * Flush the time @prev has accumulated when it is switched out, then
+ * call arch_vtime_task_switch().
+ */
 void vtime_task_switch(struct task_struct *prev)
 {
+        /* Nothing to account when vtime accounting is not enabled. */
+        if (!vtime_accounting_enabled())
+                return;
+        /* The idle task's pending time is idle time; otherwise system. */
        if (is_idle_task(prev))
                vtime_account_idle(prev);
        else
                vtime_account_system(prev);
+        /* The user-time flush is only done for native vtime accounting. */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
        vtime_account_user(prev);
+#endif
        arch_vtime_task_switch(prev);
 }
 #endif
@@ -493,27 +492,40 @@ void vtime_task_switch(struct task_struct *prev)
 * vtime_account().
 */
 #ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account(struct task_struct *tsk)
+void vtime_account_irq_enter(struct task_struct *tsk)
 {
-        if (in_interrupt() || !is_idle_task(tsk))
+        if (!vtime_accounting_enabled())
-                vtime_account_system(tsk);
+                return;
-        else
-                vtime_account_idle(tsk);
+        if (!in_interrupt()) {
+                /*
+                 * If we interrupted user, context_tracking_in_user()
+                 * is 1 because context tracking doesn't hook
+                 * on irq entry/exit. This way we know if
+                 * we need to flush user time on kernel entry.
+                 */
+                if (context_tracking_in_user()) {
+                        vtime_account_user(tsk);
+                        return;
+                }
+                if (is_idle_task(tsk)) {
+                        vtime_account_idle(tsk);
+                        return;
+                }
+        }
+        vtime_account_system(tsk);
 }
-EXPORT_SYMBOL_GPL(vtime_account);
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
-#else
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs)      nsecs_to_jiffies(__nsecs)
-#endif
-static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
 {
        u64 temp = (__force u64) rtime;
-        temp *= (__force u64) utime;
+        temp *= (__force u64) stime;
        if (sizeof(cputime_t) == 4)
                temp = div_u64(temp, (__force u32) total);
@@ -531,10 +543,10 @@ static void cputime_adjust(struct task_cputime *curr,
                           struct cputime *prev,
                           cputime_t *ut, cputime_t *st)
 {
-        cputime_t rtime, utime, total;
+        cputime_t rtime, stime, total;
-        utime = curr->utime;
+        stime = curr->stime;
-        total = utime + curr->stime;
+        total = stime + curr->utime;
        /*
         * Tick based cputime accounting depend on random scheduling
@@ -549,17 +561,17 @@ static void cputime_adjust(struct task_cputime *curr,
        rtime = nsecs_to_cputime(curr->sum_exec_runtime);
        if (total)
-                utime = scale_utime(utime, rtime, total);
+                stime = scale_stime(stime, rtime, total);
        else
-                utime = rtime;
+                stime = rtime;
        /*
         * If the tick based count grows faster than the scheduler one,
         * the result of the scaling may go backward.
         * Let's enforce monotonicity.
         */
-        prev->utime = max(prev->utime, utime);
+        prev->stime = max(prev->stime, stime);
-        prev->stime = max(prev->stime, rtime - prev->utime);
+        prev->utime = max(prev->utime, rtime - prev->stime);
        *ut = prev->utime;
        *st = prev->stime;
@@ -568,11 +580,10 @@ static void cputime_adjust(struct task_cputime *curr,
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
        struct task_cputime cputime = {
-                .utime = p->utime,
-                .stime = p->stime,
                .sum_exec_runtime = p->se.sum_exec_runtime,
        };
+        task_cputime(p, &cputime.utime, &cputime.stime);
        cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
@@ -586,4 +597,221 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
        thread_group_cputime(p, &cputime);
        cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
 }
-#endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+/*
+ * Return how far sched_clock() has advanced past @tsk's vtime snapshot,
+ * or 0 if the clock reads earlier than the snapshot.
+ */
+static unsigned long long vtime_delta(struct task_struct *tsk)
+{
+        unsigned long long now = sched_clock();
+        return now < tsk->vtime_snap ? 0 : now - tsk->vtime_snap;
+}
+/*
+ * Return, as cputime, the time elapsed since @tsk's vtime snapshot and
+ * advance the snapshot by that amount.  Warns once if the task is
+ * marked VTIME_SLEEPING.
+ */
+static cputime_t get_vtime_delta(struct task_struct *tsk)
+{
+        unsigned long long elapsed;
+        elapsed = vtime_delta(tsk);
+        WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+        /* Consume the interval so that it is not handed out twice. */
+        tsk->vtime_snap += elapsed;
+        /* CHECKME: always safe to convert nsecs to cputime? */
+        return nsecs_to_cputime(elapsed);
+}
+/*
+ * Account the time elapsed since @tsk's vtime snapshot as system time.
+ * Does no locking of its own; the callers in this file that take
+ * tsk->vtime_seqlock do so around this call.
+ */
+static void __vtime_account_system(struct task_struct *tsk)
+{
+        cputime_t elapsed = get_vtime_delta(tsk);
+        cputime_t elapsed_scaled = cputime_to_scaled(elapsed);
+        account_system_time(tsk, irq_count(), elapsed, elapsed_scaled);
+}
+/*
+ * Account @tsk's pending time as system time under the write side of
+ * tsk->vtime_seqlock.  Does nothing when vtime accounting is disabled.
+ */
+void vtime_account_system(struct task_struct *tsk)
+{
+        if (vtime_accounting_enabled()) {
+                write_seqlock(&tsk->vtime_seqlock);
+                __vtime_account_system(tsk);
+                write_sequnlock(&tsk->vtime_seqlock);
+        }
+}
+/*
+ * Account @tsk's pending time as system time on interrupt exit.
+ * Does nothing when vtime accounting is disabled.
+ */
+void vtime_account_irq_exit(struct task_struct *tsk)
+{
+        if (!vtime_accounting_enabled())
+                return;
+        write_seqlock(&tsk->vtime_seqlock);
+        /*
+         * If context tracking reports user context, mark the task as
+         * running in user from here on -- presumably because the
+         * interrupt hit userspace and we are about to return there.
+         */
+        if (context_tracking_in_user())
+                tsk->vtime_snap_whence = VTIME_USER;
+        __vtime_account_system(tsk);
+        write_sequnlock(&tsk->vtime_seqlock);
+}
+/*
+ * Account the time elapsed since @tsk's vtime snapshot as user time and
+ * mark the task as now running in the kernel (VTIME_SYS).  Does nothing
+ * when vtime accounting is disabled.
+ *
+ * get_vtime_delta() advances tsk->vtime_snap, which readers sample
+ * under tsk->vtime_seqlock, so it must be called inside the write-side
+ * critical section, as the other seqlock-taking writers in this file do.
+ */
+void vtime_account_user(struct task_struct *tsk)
+{
+        cputime_t delta_cpu;
+        if (!vtime_accounting_enabled())
+                return;
+        write_seqlock(&tsk->vtime_seqlock);
+        delta_cpu = get_vtime_delta(tsk);
+        tsk->vtime_snap_whence = VTIME_SYS;
+        account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+        write_sequnlock(&tsk->vtime_seqlock);
+}
+/*
+ * Called on entry to user context: account @tsk's pending time as
+ * system time and mark the task as running in user (VTIME_USER).
+ * Does nothing when vtime accounting is disabled.
+ */
+void vtime_user_enter(struct task_struct *tsk)
+{
+        if (!vtime_accounting_enabled())
+                return;
+        write_seqlock(&tsk->vtime_seqlock);
+        tsk->vtime_snap_whence = VTIME_USER;
+        __vtime_account_system(tsk);
+        write_sequnlock(&tsk->vtime_seqlock);
+}
+/*
+ * Account @tsk's pending time as system time, then set PF_VCPU.
+ * Unlike vtime_account_system(), this does not check
+ * vtime_accounting_enabled().
+ */
+void vtime_guest_enter(struct task_struct *tsk)
+{
+        write_seqlock(&tsk->vtime_seqlock);
+        __vtime_account_system(tsk);
+        /*
+         * NOTE(review): the flag is set on current rather than on tsk;
+         * presumably callers always pass current -- confirm.
+         */
+        current->flags |= PF_VCPU;
+        write_sequnlock(&tsk->vtime_seqlock);
+}
+/*
+ * Account @tsk's pending time via __vtime_account_system() while
+ * PF_VCPU is still set, then clear PF_VCPU.  Unlike
+ * vtime_account_system(), this does not check vtime_accounting_enabled().
+ */
+void vtime_guest_exit(struct task_struct *tsk)
+{
+        write_seqlock(&tsk->vtime_seqlock);
+        __vtime_account_system(tsk);
+        /*
+         * NOTE(review): the flag is cleared on current rather than on
+         * tsk; presumably callers always pass current -- confirm.
+         */
+        current->flags &= ~PF_VCPU;
+        write_sequnlock(&tsk->vtime_seqlock);
+}
+/*
+ * Account the time elapsed since @tsk's vtime snapshot as idle time.
+ * Note that tsk->vtime_seqlock is not taken here.
+ */
+void vtime_account_idle(struct task_struct *tsk)
+{
+        account_idle_time(get_vtime_delta(tsk));
+}
+/*
+ * Report whether vtime accounting is enabled, which here means
+ * exactly that context_tracking_active() returns true.
+ */
+bool vtime_accounting_enabled(void)
+{
+        return context_tracking_active();
+}
+/*
+ * Update the vtime state of both tasks involved in a context switch:
+ * @prev is marked as sleeping, and current gets a fresh snapshot.
+ */
+void arch_vtime_task_switch(struct task_struct *prev)
+{
+        /* Readers add no pending time to a VTIME_SLEEPING task. */
+        write_seqlock(&prev->vtime_seqlock);
+        prev->vtime_snap_whence = VTIME_SLEEPING;
+        write_sequnlock(&prev->vtime_seqlock);
+        /* current restarts its snapshot at "now", in VTIME_SYS state. */
+        write_seqlock(&current->vtime_seqlock);
+        current->vtime_snap_whence = VTIME_SYS;
+        current->vtime_snap = sched_clock();
+        write_sequnlock(&current->vtime_seqlock);
+}
+/*
+ * Initialize the vtime snapshot of task @t: state VTIME_SYS, timestamp
+ * taken from sched_clock().  The update is done under the write side of
+ * t->vtime_seqlock with interrupts disabled.
+ */
+void vtime_init_idle(struct task_struct *t)
+{
+        unsigned long flags;
+        write_seqlock_irqsave(&t->vtime_seqlock, flags);
+        t->vtime_snap_whence = VTIME_SYS;
+        t->vtime_snap = sched_clock();
+        write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+}
+/*
+ * Return @t's guest time: t->gtime plus, when PF_VCPU is set, the time
+ * elapsed since the task's vtime snapshot.  The read is retried until
+ * it did not race with a writer of t->vtime_seqlock.
+ */
+cputime_t task_gtime(struct task_struct *t)
+{
+        cputime_t total;
+        unsigned int seq;
+        for (;;) {
+                seq = read_seqbegin(&t->vtime_seqlock);
+                total = t->gtime;
+                if (t->flags & PF_VCPU)
+                        total += vtime_delta(t);
+                if (!read_seqretry(&t->vtime_seqlock, seq))
+                        break;
+        }
+        return total;
+}
+/*
+ * Fetch cputime raw values from fields of task_struct and
+ * report the pending nohz execution time since the last
+ * cputime snapshot.
+ *
+ * @u_dst/@s_dst, when non-NULL, receive copies of *@u_src/*@s_src.
+ * The pending time is not added to them here: it is returned through
+ * @udelta (user) or @sdelta (system), the other one being left at 0,
+ * and the caller does the addition.  Everything is read under the read
+ * side of t->vtime_seqlock and retried if a writer interfered.
+ */
+static void
+fetch_task_cputime(struct task_struct *t,
+                   cputime_t *u_dst, cputime_t *s_dst,
+                   cputime_t *u_src, cputime_t *s_src,
+                   cputime_t *udelta, cputime_t *sdelta)
+{
+        unsigned int seq;
+        unsigned long long delta;
+        do {
+                *udelta = 0;
+                *sdelta = 0;
+                seq = read_seqbegin(&t->vtime_seqlock);
+                if (u_dst)
+                        *u_dst = *u_src;
+                if (s_dst)
+                        *s_dst = *s_src;
+                /*
+                 * Task is sleeping (or is the idle task), nothing to add.
+                 * Note that "continue" in a do/while jumps to the loop
+                 * condition, so the seqlock retry check still runs.
+                 */
+                if (t->vtime_snap_whence == VTIME_SLEEPING ||
+                    is_idle_task(t))
+                        continue;
+                delta = vtime_delta(t);
+                /*
+                 * Task runs either in user or kernel space, add pending nohz time to
+                 * the right place.  A task with PF_VCPU set is counted as user.
+                 */
+                if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
+                        *udelta = delta;
+                } else {
+                        if (t->vtime_snap_whence == VTIME_SYS)
+                                *sdelta = delta;
+                }
+        } while (read_seqretry(&t->vtime_seqlock, seq));
+}
+/*
+ * Report @t's user and system cputime, including the time that is
+ * pending since the last vtime snapshot.  Either of @utime and @stime
+ * may be NULL, in which case that value is not reported.
+ */
+void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
+{
+        cputime_t pending_user, pending_sys;
+        fetch_task_cputime(t, utime, stime, &t->utime, &t->stime,
+                           &pending_user, &pending_sys);
+        if (utime)
+                *utime += pending_user;
+        if (stime)
+                *stime += pending_sys;
+}
+/*
+ * Report @t's scaled user and system cputime, including the pending
+ * time since the last vtime snapshot converted with cputime_to_scaled().
+ * Either of @utimescaled and @stimescaled may be NULL.
+ */
+void task_cputime_scaled(struct task_struct *t,
+                         cputime_t *utimescaled, cputime_t *stimescaled)
+{
+        cputime_t pending_user, pending_sys;
+        fetch_task_cputime(t, utimescaled, stimescaled,
+                           &t->utimescaled, &t->stimescaled,
+                           &pending_user, &pending_sys);
+        if (utimescaled)
+                *utimescaled += cputime_to_scaled(pending_user);
+        if (stimescaled)
+                *stimescaled += cputime_to_scaled(pending_sys);
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7ae4c4c5420e..557e7b53b323 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
        if (autogroup_path(tg, group_path, PATH_MAX))
                return group_path;
-        /*
-         * May be NULL if the underlying cgroup isn't fully-created yet
-         */
-        if (!tg->css.cgroup) {
-                group_path[0] = '\0';
-                return group_path;
-        }
        cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
        return group_path;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 81fa53643409..7a33e5986fc5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1680,9 +1680,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
        }
        /* ensure we never gain time by being placed backwards. */
-        vruntime = max_vruntime(se->vruntime, vruntime);
+        se->vruntime = max_vruntime(se->vruntime, vruntime);
-        se->vruntime = vruntime;
 }
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -3254,25 +3252,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 */
 static int select_idle_sibling(struct task_struct *p, int target)
 {
-        int cpu = smp_processor_id();
-        int prev_cpu = task_cpu(p);
        struct sched_domain *sd;
        struct sched_group *sg;
-        int i;
+        int i = task_cpu(p);
-        /*
+        if (idle_cpu(target))
-         * If the task is going to be woken-up on this cpu and if it is
+                return target;
-         * already idle, then it is the right target.
-         */
-        if (target == cpu && idle_cpu(cpu))
-                return cpu;
        /*
-         * If the task is going to be woken-up on the cpu where it previously
+         * If the previous cpu is cache affine and idle, don't be stupid.
-         * ran and if it is currently idle, then it the right target.
         */
-        if (target == prev_cpu && idle_cpu(prev_cpu))
+        if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
-                return prev_cpu;
+                return i;
        /*
         * Otherwise, iterate the domains and find an elegible idle cpu.
@@ -3286,7 +3277,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
                                goto next;
                        for_each_cpu(i, sched_group_cpus(sg)) {
-                                if (!idle_cpu(i))
+                                if (i == target || !idle_cpu(i))
                                        goto next;
                        }
@@ -6101,7 +6092,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
         * idle runqueue:
         */
        if (rq->cfs.load.weight)
-                rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+                rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
        return rr_interval;
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4f02b2847357..127a2c4cf4ab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
 #include <linux/slab.h>
+int sched_rr_timeslice = RR_TIMESLICE;
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 struct rt_bandwidth def_rt_bandwidth;
@@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq)
                return;
        delta_exec = rq->clock_task - curr->se.exec_start;
-        if (unlikely((s64)delta_exec < 0))
+        if (unlikely((s64)delta_exec <= 0))
-                delta_exec = 0;
+                return;
        schedstat_set(curr->se.statistics.exec_max,
                      max(curr->se.statistics.exec_max, delta_exec));
@@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
        if (!task_running(rq, p) &&
-            (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
+            cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
-            (p->nr_cpus_allowed > 1))
                return 1;
        return 0;
 }
@@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
         * we may need to handle the pulling of RT tasks
         * now.
         */
-        if (p->on_rq && !rq->rt.rt_nr_running)
+        if (!p->on_rq || rq->rt.rt_nr_running)
-                pull_rt_task(rq);
+                return;
+        if (pull_rt_task(rq))
+                resched_task(rq->curr);
 }
 void init_sched_rt_class(void)
@@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)
        if (soft != RLIM_INFINITY) {
                unsigned long next;
-                p->rt.timeout++;
+                if (p->rt.watchdog_stamp != jiffies) {
+                        p->rt.timeout++;
+                        p->rt.watchdog_stamp = jiffies;
+                }
                next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
                if (p->rt.timeout > next)
                        p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
@@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
        if (--p->rt.time_slice)
                return;
-        p->rt.time_slice = RR_TIMESLICE;
+        p->rt.time_slice = sched_rr_timeslice;
        /*
         * Requeue to the end of queue if we (and all of our ancestors) are the
@@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
         * Time slice is 0 for SCHED_FIFO tasks
         */
        if (task->policy == SCHED_RR)
-                return RR_TIMESLICE;
+                return sched_rr_timeslice;
        else
                return 0;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fc886441436a..cc03cfdf469f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,7 @@
 #include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/stop_machine.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 3d09cf6cde75..7f82adbad480 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1632,6 +1632,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
        unsigned long flags;
        struct sighand_struct *psig;
        bool autoreap = false;
+        cputime_t utime, stime;
        BUG_ON(sig == -1);
@@ -1669,8 +1670,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
                                       task_uid(tsk));
        rcu_read_unlock();
-        info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
+        task_cputime(tsk, &utime, &stime);
-        info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
+        info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
+        info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
        info.si_status = tsk->exit_code & 0x7f;
        if (tsk->exit_code & 0x80)
@@ -1734,6 +1736,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
        unsigned long flags;
        struct task_struct *parent;
        struct sighand_struct *sighand;
+        cputime_t utime, stime;
        if (for_ptracer) {
                parent = tsk->parent;
@@ -1752,8 +1755,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
        info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
        rcu_read_unlock();
-        info.si_utime = cputime_to_clock_t(tsk->utime);
+        task_cputime(tsk, &utime, &stime);
-        info.si_stime = cputime_to_clock_t(tsk->stime);
+        info.si_utime = cputime_to_clock_t(utime);
+        info.si_stime = cputime_to_clock_t(stime);
        info.si_code = why;
        switch (why) {
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d6c5fc054242..d4abac261779 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -183,9 +183,10 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
                kfree(td);
                return PTR_ERR(tsk);
        }
        get_task_struct(tsk);
        *per_cpu_ptr(ht->store, cpu) = tsk;
+        if (ht->create)
+                ht->create(cpu);
        return 0;
 }
@@ -225,7 +226,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
 {
        struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
-        if (tsk)
+        if (tsk && !ht->selfparking)
                kthread_park(tsk);
 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ed567babe789..b4d252fd195b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip)
 EXPORT_SYMBOL(local_bh_enable_ip);
 /*
- * We restart softirq processing MAX_SOFTIRQ_RESTART times,
+ * We restart softirq processing for at most 2 ms,
- * and we fall back to softirqd after that.
+ * and if need_resched() is not set.
 *
- * This number has been established via experimentation.
+ * These limits have been established via experimentation.
 * The two things to balance is latency against fairness -
 * we want to handle softirqs as soon as possible, but they
 * should not be able to lock up the box.
 */
-#define MAX_SOFTIRQ_RESTART 10
+#define MAX_SOFTIRQ_TIME  msecs_to_jiffies(2)
 asmlinkage void __do_softirq(void)
 {
        struct softirq_action *h;
        __u32 pending;
-        int max_restart = MAX_SOFTIRQ_RESTART;
+        unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
        int cpu;
        unsigned long old_flags = current->flags;
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
        current->flags &= ~PF_MEMALLOC;
        pending = local_softirq_pending();
-        vtime_account_irq_enter(current);
+        account_irq_enter_time(current);
        __local_bh_disable((unsigned long)__builtin_return_address(0),
                                SOFTIRQ_OFFSET);
@@ -264,15 +264,16 @@ restart:
        local_irq_disable();
        pending = local_softirq_pending();
-        if (pending && --max_restart)
+        if (pending) {
-                goto restart;
+                if (time_before(jiffies, end) && !need_resched())
+                        goto restart;
-        if (pending)
                wakeup_softirqd();
+        }
        lockdep_softirq_exit();
-        vtime_account_irq_exit(current);
+        account_irq_exit_time(current);
        __local_bh_enable(SOFTIRQ_OFFSET);
        tsk_restore_flags(current, old_flags, PF_MEMALLOC);
 }
@@ -341,7 +342,7 @@ static inline void invoke_softirq(void)
 */
 void irq_exit(void)
 {
-        vtime_account_irq_exit(current);
+        account_irq_exit_time(current);
        trace_hardirq_exit();
        sub_preempt_count(IRQ_EXIT_OFFSET);
        if (!in_interrupt() && local_softirq_pending())
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2b859828cdc3..01d5ccb8bfe3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -282,12 +282,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
 */
 void cleanup_srcu_struct(struct srcu_struct *sp)
 {
-        int sum;
+        if (WARN_ON(srcu_readers_active(sp)))
+                return; /* Leakage unless caller handles error. */
-        sum = srcu_readers_active(sp);
-        WARN_ON(sum);  /* Leakage unless caller handles error. */
-        if (sum != 0)
-                return;
        free_percpu(sp->per_cpu_ref);
        sp->per_cpu_ref = NULL;
 }
@@ -302,9 +298,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
 {
        int idx;
+        idx = ACCESS_ONCE(sp->completed) & 0x1;
        preempt_disable();
-        idx = rcu_dereference_index_check(sp->completed,
-                                          rcu_read_lock_sched_held()) & 0x1;
        ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
        smp_mb(); /* B */  /* Avoid leaking the critical section. */
        ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
@@ -321,10 +316,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
 */
 void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 {
-        preempt_disable();
        smp_mb(); /* C */  /* Avoid leaking the critical section. */
-        ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
+        this_cpu_dec(sp->per_cpu_ref->c[idx]);
-        preempt_enable();
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -423,6 +416,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
                           !lock_is_held(&rcu_sched_lock_map),
                           "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
+        might_sleep();
        init_completion(&rcu.completion);
        head->next = NULL;
@@ -455,10 +449,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
 * @sp: srcu_struct with which to synchronize.
 *
- * Flip the completed counter, and wait for the old count to drain to zero.
+ * Wait for the count to drain to zero of both indexes. To avoid the
- * As with classic RCU, the updater must use some separate means of
+ * possible starvation of synchronize_srcu(), it waits for the count of
- * synchronizing concurrent updates.  Can block; must be called from
+ * the index=((->completed & 1) ^ 1) to drain to zero at first,
- * process context.
+ * and then flip the completed and wait for the count of the other index.
+ *
+ * Can block; must be called from process context.
 *
 * Note that it is illegal to call synchronize_srcu() from the corresponding
 * SRCU read-side critical section; doing so will result in deadlock.
@@ -480,12 +476,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
 * Wait for an SRCU grace period to elapse, but be more aggressive about
 * spinning rather than blocking when waiting.
 *
- * Note that it is illegal to call this function while holding any lock
+ * Note that it is also illegal to call synchronize_srcu_expedited()
- * that is acquired by a CPU-hotplug notifier.  It is also illegal to call
+ * from the corresponding SRCU read-side critical section;
- * synchronize_srcu_expedited() from the corresponding SRCU read-side
+ * doing so will result in deadlock.  However, it is perfectly legal
- * critical section; doing so will result in deadlock.  However, it is
+ * to call synchronize_srcu_expedited() on one srcu_struct from some
- * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
+ * other srcu_struct's read-side critical section, as long as
- * from some other srcu_struct's read-side critical section, as long as
 * the resulting graph of srcu_structs is acyclic.
 */
 void synchronize_srcu_expedited(struct srcu_struct *sp)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2f194e965715..95d178c62d5a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -18,7 +18,7 @@
 #include <linux/stop_machine.h>
 #include <linux/interrupt.h>
 #include <linux/kallsyms.h>
+#include <linux/smpboot.h>
 #include <linux/atomic.h>
 /*
@@ -37,10 +37,10 @@ struct cpu_stopper {
        spinlock_t              lock;
        bool                    enabled;        /* is this stopper enabled? */
        struct list_head        works;          /* list of pending works */
-        struct task_struct      *thread;        /* stopper thread */
 };
 static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
+static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
 static bool stop_machine_initialized = false;
 static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
@@ -62,16 +62,18 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
 }
 /* queue @work to @stopper.  if offline, @work is completed immediately */
-static void cpu_stop_queue_work(struct cpu_stopper *stopper,
+static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
-                                struct cpu_stop_work *work)
 {
+        struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+        struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
        unsigned long flags;
        spin_lock_irqsave(&stopper->lock, flags);
        if (stopper->enabled) {
                list_add_tail(&work->list, &stopper->works);
-                wake_up_process(stopper->thread);
+                wake_up_process(p);
        } else
                cpu_stop_signal_done(work->done, false);
@@ -108,7 +110,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
        struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
        cpu_stop_init_done(&done, 1);
-        cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
+        cpu_stop_queue_work(cpu, &work);
        wait_for_completion(&done.completion);
        return done.executed ? done.ret : -ENOENT;
 }
@@ -130,7 +132,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
                        struct cpu_stop_work *work_buf)
 {
        *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
-        cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
+        cpu_stop_queue_work(cpu, work_buf);
 }
 /* static data for stop_cpus */
@@ -159,8 +161,7 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
         */
        preempt_disable();
        for_each_cpu(cpu, cpumask)
-                cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
+                cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
-                                    &per_cpu(stop_cpus_work, cpu));
        preempt_enable();
 }
@@ -244,20 +245,25 @@ int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
        return ret;
 }
-static int cpu_stopper_thread(void *data)
+static int cpu_stop_should_run(unsigned int cpu)
+{
+        struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+        unsigned long flags;
+        int run;
+        spin_lock_irqsave(&stopper->lock, flags);
+        run = !list_empty(&stopper->works);
+        spin_unlock_irqrestore(&stopper->lock, flags);
+        return run;
+}
+static void cpu_stopper_thread(unsigned int cpu)
 {
-        struct cpu_stopper *stopper = data;
+        struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
        struct cpu_stop_work *work;
        int ret;
 repeat:
-        set_current_state(TASK_INTERRUPTIBLE);  /* mb paired w/ kthread_stop */
-        if (kthread_should_stop()) {
-                __set_current_state(TASK_RUNNING);
-                return 0;
-        }
        work = NULL;
        spin_lock_irq(&stopper->lock);
        if (!list_empty(&stopper->works)) {
@@ -273,8 +279,6 @@ repeat:
                struct cpu_stop_done *done = work->done;
                char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
-                __set_current_state(TASK_RUNNING);
                /* cpu stop callbacks are not allowed to sleep */
                preempt_disable();
@@ -290,88 +294,55 @@ repeat:
                                          ksym_buf), arg);
                cpu_stop_signal_done(done, true);
-        } else
+                goto repeat;
-                schedule();
+        }
-        goto repeat;
 }
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
-/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
+static void cpu_stop_create(unsigned int cpu)
-static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
+{
-                                           unsigned long action, void *hcpu)
+        sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
+}
+static void cpu_stop_park(unsigned int cpu)
 {
-        unsigned int cpu = (unsigned long)hcpu;
        struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
-        struct task_struct *p;
+        struct cpu_stop_work *work;
+        unsigned long flags;
-        switch (action & ~CPU_TASKS_FROZEN) {
-        case CPU_UP_PREPARE:
-                BUG_ON(stopper->thread || stopper->enabled ||
-                       !list_empty(&stopper->works));
-                p = kthread_create_on_node(cpu_stopper_thread,
-                                           stopper,
-                                           cpu_to_node(cpu),
-                                           "migration/%d", cpu);
-                if (IS_ERR(p))
-                        return notifier_from_errno(PTR_ERR(p));
-                get_task_struct(p);
-                kthread_bind(p, cpu);
-                sched_set_stop_task(cpu, p);
-                stopper->thread = p;
-                break;
-        case CPU_ONLINE:
-                /* strictly unnecessary, as first user will wake it */
-                wake_up_process(stopper->thread);
-                /* mark enabled */
-                spin_lock_irq(&stopper->lock);
-                stopper->enabled = true;
-                spin_unlock_irq(&stopper->lock);
-                break;
-#ifdef CONFIG_HOTPLUG_CPU
-        case CPU_UP_CANCELED:
-        case CPU_POST_DEAD:
-        {
-                struct cpu_stop_work *work;
-                sched_set_stop_task(cpu, NULL);
-                /* kill the stopper */
-                kthread_stop(stopper->thread);
-                /* drain remaining works */
-                spin_lock_irq(&stopper->lock);
-                list_for_each_entry(work, &stopper->works, list)
-                        cpu_stop_signal_done(work->done, false);
-                stopper->enabled = false;
-                spin_unlock_irq(&stopper->lock);
-                /* release the stopper */
-                put_task_struct(stopper->thread);
-                stopper->thread = NULL;
-                break;
-        }
-#endif
-        }
-        return NOTIFY_OK;
+        /* drain remaining works */
+        spin_lock_irqsave(&stopper->lock, flags);
+        list_for_each_entry(work, &stopper->works, list)
+                cpu_stop_signal_done(work->done, false);
+        stopper->enabled = false;
+        spin_unlock_irqrestore(&stopper->lock, flags);
 }
-/*
+static void cpu_stop_unpark(unsigned int cpu)
- * Give it a higher priority so that cpu stopper is available to other
+{
- * cpu notifiers.  It currently shares the same priority as sched
+        struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- * migration_notifier.
- */
+        spin_lock_irq(&stopper->lock);
-static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
+        stopper->enabled = true;
-        .notifier_call  = cpu_stop_cpu_callback,
+        spin_unlock_irq(&stopper->lock);
-        .priority       = 10,
+}
+static struct smp_hotplug_thread cpu_stop_threads = {
+        .store                  = &cpu_stopper_task,
+        .thread_should_run      = cpu_stop_should_run,
+        .thread_fn              = cpu_stopper_thread,
+        .thread_comm            = "migration/%u",
+        .create                 = cpu_stop_create,
+        .setup                  = cpu_stop_unpark,
+        .park                   = cpu_stop_park,
+        .unpark                 = cpu_stop_unpark,
+        .selfparking            = true,
 };
 static int __init cpu_stop_init(void)
 {
-        void *bcpu = (void *)(long)smp_processor_id();
        unsigned int cpu;
-        int err;
        for_each_possible_cpu(cpu) {
                struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
@@ -380,15 +351,8 @@ static int __init cpu_stop_init(void)
                INIT_LIST_HEAD(&stopper->works);
        }
-        /* start one for the boot cpu */
+        BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
-        err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
-                                    bcpu);
-        BUG_ON(err != NOTIFY_OK);
-        cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
-        register_cpu_notifier(&cpu_stop_cpu_notifier);
        stop_machine_initialized = true;
        return 0;
 }
 early_initcall(cpu_stop_init);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c88878db491e..4fc9be955c71 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,6 +61,7 @@
 #include <linux/kmod.h>
 #include <linux/capability.h>
 #include <linux/binfmts.h>
+#include <linux/sched/sysctl.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -403,6 +404,13 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = sched_rt_handler,
        },
+        {
+                .procname       = "sched_rr_timeslice_ms",
+                .data           = &sched_rr_timeslice,
+                .maxlen         = sizeof(int),
+                .mode           = 0644,
+                .proc_handler   = sched_rr_handler,
+        },
 #ifdef CONFIG_SCHED_AUTOGROUP
        {
                .procname       = "sched_autogroup_enabled",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 5a6384450501..b669ca1fa103 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
        { CTL_INT,      NET_TCP_MODERATE_RCVBUF,                "tcp_moderate_rcvbuf" },
        { CTL_INT,      NET_TCP_TSO_WIN_DIVISOR,                "tcp_tso_win_divisor" },
        { CTL_STR,      NET_TCP_CONG_CONTROL,                   "tcp_congestion_control" },
-        { CTL_INT,      NET_TCP_ABC,                            "tcp_abc" },
        { CTL_INT,      NET_TCP_MTU_PROBING,                    "tcp_mtu_probing" },
        { CTL_INT,      NET_TCP_BASE_MSS,                       "tcp_base_mss" },
        { CTL_INT,      NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
diff --git a/kernel/time.c b/kernel/time.c
index d226c6a3fd28..c2a27dd93142 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -115,6 +115,12 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
 }
 /*
+ * Indicates if there is an offset between the system clock and the hardware
+ * clock/persistent clock/rtc.
+ */
+int persistent_clock_is_local;
+/*
 * Adjust the time obtained from the CMOS to be UTC time instead of
 * local time.
 *
@@ -135,6 +141,8 @@ static inline void warp_clock(void)
        struct timespec adjust;
        adjust = current_kernel_time();
+        if (sys_tz.tz_minuteswest != 0)
+                persistent_clock_is_local = 1;
        adjust.tv_sec += sys_tz.tz_minuteswest * 60;
        do_settimeofday(&adjust);
 }
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8601f0db1261..24510d84efd7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
 config ARCH_CLOCKSOURCE_DATA
        bool
+# Platform has a persistent clock
+config ALWAYS_USE_PERSISTENT_CLOCK
+        bool
+        default n
 # Timekeeping vsyscall support
 config GENERIC_TIME_VSYSCALL
        bool
@@ -38,6 +43,10 @@ config GENERIC_CLOCKEVENTS_BUILD
        default y
        depends on GENERIC_CLOCKEVENTS
+# Architecture can handle broadcast in a driver-agnostic way
+config ARCH_HAS_TICK_BROADCAST
+        bool
 # Clockevents broadcasting infrastructure
 config GENERIC_CLOCKEVENTS_BROADCAST
        bool
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 24174b4d669b..b10a42bb0165 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,6 +15,7 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/module.h>
+#include <linux/rtc.h>
 #include "tick-internal.h"
@@ -483,8 +484,7 @@ out:
        return leap;
 }
-#ifdef CONFIG_GENERIC_CMOS_UPDATE
+#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
 static void sync_cmos_clock(struct work_struct *work);
 static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -510,14 +510,26 @@ static void sync_cmos_clock(struct work_struct *work)
        }
        getnstimeofday(&now);
-        if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
+        if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
-                fail = update_persistent_clock(now);
+                struct timespec adjust = now;
+                fail = -ENODEV;
+                if (persistent_clock_is_local)
+                        adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+                fail = update_persistent_clock(adjust);
+#endif
+#ifdef CONFIG_RTC_SYSTOHC
+                if (fail == -ENODEV)
+                        fail = rtc_set_ntp_time(adjust);
+#endif
+        }
        next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
        if (next.tv_nsec <= 0)
                next.tv_nsec += NSEC_PER_SEC;
-        if (!fail)
+        if (!fail || fail == -ENODEV)
                next.tv_sec = 659;
        else
                next.tv_sec = 0;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f113755695e2..2fb8cb88df8d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,6 +18,7 @@
 #include <linux/percpu.h>
 #include <linux/profile.h>
 #include <linux/sched.h>
+#include <linux/smp.h>
 #include "tick-internal.h"
@@ -86,6 +87,22 @@ int tick_is_broadcast_device(struct clock_event_device *dev)
        return (dev && tick_broadcast_device.evtdev == dev);
 }
+static void err_broadcast(const struct cpumask *mask)
+{
+        pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
+}
+static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
+{
+        if (!dev->broadcast)
+                dev->broadcast = tick_broadcast;
+        if (!dev->broadcast) {
+                pr_warn_once("%s depends on broadcast, but no broadcast function available\n",
+                             dev->name);
+                dev->broadcast = err_broadcast;
+        }
+}
 /*
 * Check, if the device is disfunctional and a place holder, which
 * needs to be handled by the broadcast device.
@@ -105,6 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
         */
        if (!tick_device_is_functional(dev)) {
                dev->event_handler = tick_handle_periodic;
+                tick_device_setup_broadcast_func(dev);
                cpumask_set_cpu(cpu, tick_get_broadcast_mask());
                tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
                ret = 1;
@@ -116,15 +134,33 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
                 */
                if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
                        int cpu = smp_processor_id();
                        cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
                        tick_broadcast_clear_oneshot(cpu);
+                } else {
+                        tick_device_setup_broadcast_func(dev);
                }
        }
        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
        return ret;
 }
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+int tick_receive_broadcast(void)
+{
+        struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+        struct clock_event_device *evt = td->evtdev;
+        if (!evt)
+                return -ENODEV;
+        if (!evt->event_handler)
+                return -EINVAL;
+        evt->event_handler(evt);
+        return 0;
+}
+#endif
 /*
 * Broadcast the event to the cpus, which are set in the mask (mangled).
 */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d58e552d9fd1..314b9ee07edf 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
 #include <linux/profile.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/irq_work.h>
 #include <asm/irq_regs.h>
@@ -28,7 +29,7 @@
 /*
 * Per cpu nohz control structure
 */
-static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
+DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
 /*
 * The time, when the last jiffy update happened. Protected by jiffies_lock.
@@ -331,8 +332,8 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
                time_delta = timekeeping_max_deferment();
        } while (read_seqretry(&jiffies_lock, seq));
-        if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) ||
+        if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
-            arch_needs_cpu(cpu)) {
+            arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
                next_jiffies = last_jiffies + 1;
                delta_jiffies = 1;
        } else {
@@ -631,8 +632,11 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
 {
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
        unsigned long ticks;
+        if (vtime_accounting_enabled())
+                return;
        /*
         * We stopped the tick in idle. Update process times would miss the
         * time we slept as update_process_times does only a 1 tick
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cbc6acb0db3f..1e35515a875e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -29,6 +29,9 @@ static struct timekeeper timekeeper;
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
+/* Flag for if there is a persistent clock on this platform */
+bool __read_mostly persistent_clock_exist = false;
 static inline void tk_normalize_xtime(struct timekeeper *tk)
 {
        while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
@@ -264,19 +267,18 @@ static void timekeeping_forward_now(struct timekeeper *tk)
 }
 /**
- * getnstimeofday - Returns the time of day in a timespec
+ * __getnstimeofday - Returns the time of day in a timespec.
 * @ts:         pointer to the timespec to be set
 *
- * Returns the time of day in a timespec.
+ * Updates the time of day in the timespec.
+ * Returns 0 on success, or -ve when suspended (timespec will be undefined).
 */
-void getnstimeofday(struct timespec *ts)
+int __getnstimeofday(struct timespec *ts)
 {
        struct timekeeper *tk = &timekeeper;
        unsigned long seq;
        s64 nsecs = 0;
-        WARN_ON(timekeeping_suspended);
        do {
                seq = read_seqbegin(&tk->lock);
@@ -287,6 +289,26 @@ void getnstimeofday(struct timespec *ts)
        ts->tv_nsec = 0;
        timespec_add_ns(ts, nsecs);
+        /*
+         * Do not bail out early, in case there were callers still using
+         * the value, even in the face of the WARN_ON.
+         */
+        if (unlikely(timekeeping_suspended))
+                return -EAGAIN;
+        return 0;
+}
+EXPORT_SYMBOL(__getnstimeofday);
+/**
+ * getnstimeofday - Returns the time of day in a timespec.
+ * @ts:         pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec (WARN if suspended).
+ *
+ * This is a wrapper around __getnstimeofday(): @ts is filled in by that
+ * call, and WARN_ON() fires when it returns a non-zero value. Nothing is
+ * returned to the caller.
+ */
+void getnstimeofday(struct timespec *ts)
+{
+        WARN_ON(__getnstimeofday(ts));
 }
 EXPORT_SYMBOL(getnstimeofday);
@@ -640,12 +662,14 @@ void __init timekeeping_init(void)
        struct timespec now, boot, tmp;
        read_persistent_clock(&now);
        if (!timespec_valid_strict(&now)) {
                pr_warn("WARNING: Persistent clock returned invalid value!\n"
                        "         Check your CMOS/BIOS settings.\n");
                now.tv_sec = 0;
                now.tv_nsec = 0;
-        }
+        } else if (now.tv_sec || now.tv_nsec)
+                persistent_clock_exist = true;
        read_boot_clock(&boot);
        if (!timespec_valid_strict(&boot)) {
@@ -718,11 +742,12 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
 {
        struct timekeeper *tk = &timekeeper;
        unsigned long flags;
-        struct timespec ts;
-        /* Make sure we don't set the clock twice */
+        /*
-        read_persistent_clock(&ts);
+         * Make sure we don't set the clock twice, as timekeeping_resume()
-        if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
+         * already did it
+         */
+        if (has_persistent_clock())
                return;
        write_seqlock_irqsave(&tk->lock, flags);
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
index eb51d76e058a..3f42652a6a37 100644
--- a/kernel/timeconst.pl
+++ b/kernel/timeconst.pl
@@ -369,10 +369,8 @@ if ($hz eq '--can') {
                die "Usage: $0 HZ\n";
        }
-        @val = @{$canned_values{$hz}};
+        $cv = $canned_values{$hz};
-        if (!defined(@val)) {
+        @val = defined($cv) ? @$cv : compute_values($hz);
-                @val = compute_values($hz);
-        }
        output($hz, @val);
 }
 exit 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index 367d00858482..dbf7a78a1ef1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
 #include <linux/kallsyms.h>
 #include <linux/irq_work.h>
 #include <linux/sched.h>
+#include <linux/sched/sysctl.h>
 #include <linux/slab.h>
 #include <asm/uaccess.h>
@@ -1351,7 +1352,6 @@ void update_process_times(int user_tick)
        account_process_tick(p, user_tick);
        run_local_timers();
        rcu_check_callbacks(cpu, user_tick);
-        printk_tick();
 #ifdef CONFIG_IRQ_WORK
        if (in_irq())
                irq_work_run();
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d89335a485f..192473b22799 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE
        help
          See Documentation/trace/ftrace-design.txt
+config HAVE_DYNAMIC_FTRACE_WITH_REGS
+        bool
 config HAVE_FTRACE_MCOUNT_RECORD
        bool
        help
@@ -78,21 +81,6 @@ config EVENT_TRACING
        select CONTEXT_SWITCH_TRACER
        bool
-config EVENT_POWER_TRACING_DEPRECATED
-        depends on EVENT_TRACING
-        bool "Deprecated power event trace API, to be removed"
-        default y
-        help
-          Provides old power event types:
-          C-state/idle accounting events:
-          power:power_start
-          power:power_end
-          and old cpufreq accounting event:
-          power:power_frequency
-          This is for userspace compatibility
-          and will vanish after 5 kernel iterations,
-          namely 3.1.
 config CONTEXT_SWITCH_TRACER
        bool
@@ -250,6 +238,16 @@ config FTRACE_SYSCALLS
        help
          Basic tracer to catch the syscall entry and exit events.
+config TRACER_SNAPSHOT
+        bool "Create a snapshot trace buffer"
+        select TRACER_MAX_TRACE
+        help
+          Allow tracing users to take snapshot of the current buffer using the
+          ftrace interface, e.g.:
+              echo 1 > /sys/kernel/debug/tracing/snapshot
+              cat snapshot
 config TRACE_BRANCH_PROFILING
        bool
        select GENERIC_TRACER
@@ -434,6 +432,11 @@ config DYNAMIC_FTRACE
          were made. If so, it runs stop_machine (stops all CPUS)
          and modifies the code to jump over the call to ftrace.
+config DYNAMIC_FTRACE_WITH_REGS
+        def_bool y
+        depends on DYNAMIC_FTRACE
+        depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
 config FUNCTION_PROFILER
        bool "Kernel function profiler"
        depends on FUNCTION_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c0bd0308741c..71259e2b6b61 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
                return;
        local_irq_save(flags);
-        buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
+        buf = this_cpu_ptr(bt->msg_data);
        va_start(args, fmt);
        n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
        va_end(args);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 41473b4ad7a4..ce8c3d68292f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
 #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
 #endif
+/*
+ * Traverse the ftrace_global_list, invoking all entries.  The reason that we
+ * can use rcu_dereference_raw() is that elements removed from this list
+ * are simply leaked, so there is no need to interact with a grace-period
+ * mechanism.  The rcu_dereference_raw() calls are needed to handle
+ * concurrent insertions into the ftrace_global_list.
+ *
+ * Silly Alpha and silly pointer-speculation compiler optimizations!
+ *
+ * do_for_each_ftrace_op() loads the head of @list into @op and opens a
+ * do-loop; it must be paired with while_for_each_ftrace_op(), which
+ * supplies the closing while condition:
+ *
+ *      do_for_each_ftrace_op(op, list) {
+ *              ...
+ *      } while_for_each_ftrace_op(op);
+ *
+ * Because this expands to a do-while, the body runs once for the head
+ * entry before the termination test is evaluated.
+ */
+#define do_for_each_ftrace_op(op, list)                 \
+        op = rcu_dereference_raw(list);                 \
+        do
+/*
+ * Optimized for just a single item in the list (as that is the normal case).
+ * Advances @op to (op)->next and keeps looping while the new @op is
+ * non-NULL and is not &ftrace_list_end.
+ */
+#define while_for_each_ftrace_op(op)                            \
+        while (likely(op = rcu_dereference_raw((op)->next)) &&  \
+               unlikely((op) != &ftrace_list_end))
 /**
 * ftrace_nr_registered_ops - return number of ops registered
 *
@@ -132,29 +152,21 @@ int ftrace_nr_registered_ops(void)
        return cnt;
 }
-/*
- * Traverse the ftrace_global_list, invoking all entries.  The reason that we
- * can use rcu_dereference_raw() is that elements removed from this list
- * are simply leaked, so there is no need to interact with a grace-period
- * mechanism.  The rcu_dereference_raw() calls are needed to handle
- * concurrent insertions into the ftrace_global_list.
- *
- * Silly Alpha and silly pointer-speculation compiler optimizations!
- */
 static void
 ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
                        struct ftrace_ops *op, struct pt_regs *regs)
 {
-        if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
+        int bit;
+        bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
+        if (bit < 0)
                return;
-        trace_recursion_set(TRACE_GLOBAL_BIT);
+        do_for_each_ftrace_op(op, ftrace_global_list) {
-        op = rcu_dereference_raw(ftrace_global_list); /*see above*/
-        while (op != &ftrace_list_end) {
                op->func(ip, parent_ip, op, regs);
-                op = rcu_dereference_raw(op->next); /*see above*/
+        } while_for_each_ftrace_op(op);
-        };
-        trace_recursion_clear(TRACE_GLOBAL_BIT);
+        trace_clear_recursion(bit);
 }
 static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
@@ -221,10 +233,24 @@ static void update_global_ops(void)
         * registered callers.
         */
        if (ftrace_global_list == &ftrace_list_end ||
-            ftrace_global_list->next == &ftrace_list_end)
+            ftrace_global_list->next == &ftrace_list_end) {
                func = ftrace_global_list->func;
-        else
+                /*
+                 * As we are calling the function directly,
+                 * if it does not have recursion protection,
+                 * the function_trace_op needs to be updated
+                 * accordingly.
+                 */
+                if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)
+                        global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
+                else
+                        global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
+        } else {
                func = ftrace_global_list_func;
+                /* The list has its own recursion protection. */
+                global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
+        }
        /* If we filter on pids, update to use the pid function */
        if (!list_empty(&ftrace_pids)) {
@@ -337,7 +363,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
        if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
                return -EINVAL;
-#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
        /*
         * If the ftrace_ops specifies SAVE_REGS, then it only can be used
         * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
@@ -4090,14 +4116,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
         */
        preempt_disable_notrace();
        trace_recursion_set(TRACE_CONTROL_BIT);
-        op = rcu_dereference_raw(ftrace_control_list);
+        do_for_each_ftrace_op(op, ftrace_control_list) {
-        while (op != &ftrace_list_end) {
                if (!ftrace_function_local_disabled(op) &&
                    ftrace_ops_test(op, ip))
                        op->func(ip, parent_ip, op, regs);
+        } while_for_each_ftrace_op(op);
-                op = rcu_dereference_raw(op->next);
-        };
        trace_recursion_clear(TRACE_CONTROL_BIT);
        preempt_enable_notrace();
 }
@@ -4112,27 +4135,26 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
                       struct ftrace_ops *ignored, struct pt_regs *regs)
 {
        struct ftrace_ops *op;
+        int bit;
        if (function_trace_stop)
                return;
-        if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
+        bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+        if (bit < 0)
                return;
-        trace_recursion_set(TRACE_INTERNAL_BIT);
        /*
         * Some of the ops may be dynamically allocated,
         * they must be freed after a synchronize_sched().
         */
        preempt_disable_notrace();
-        op = rcu_dereference_raw(ftrace_ops_list);
+        do_for_each_ftrace_op(op, ftrace_ops_list) {
-        while (op != &ftrace_list_end) {
                if (ftrace_ops_test(op, ip))
                        op->func(ip, parent_ip, op, regs);
-                op = rcu_dereference_raw(op->next);
+        } while_for_each_ftrace_op(op);
-        };
        preempt_enable_notrace();
-        trace_recursion_clear(TRACE_INTERNAL_BIT);
+        trace_clear_recursion(bit);
 }
 /*
@@ -4143,8 +4165,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
 * Archs are to support both the regs and ftrace_ops at the same time.
 * If they support ftrace_ops, it is assumed they support regs.
 * If call backs want to use regs, they must either check for regs
- * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS.
+ * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
- * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved.
+ * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
 * An architecture can pass partial regs with ftrace_ops and still
 * set the ARCH_SUPPORT_FTARCE_OPS.
 */
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index f55fcf61b223..1c71382b283d 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,8 +13,5 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/power.h>
-#ifdef EVENT_POWER_TRACING_DEPRECATED
-EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
-#endif
 EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ce8514feedcd..7244acde77b0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,8 +3,10 @@
 *
 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
 */
+#include <linux/ftrace_event.h>
 #include <linux/ring_buffer.h>
 #include <linux/trace_clock.h>
+#include <linux/trace_seq.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
@@ -21,7 +23,6 @@
 #include <linux/fs.h>
 #include <asm/local.h>
-#include "trace.h"
 static void update_pages_handler(struct work_struct *work);
@@ -2432,41 +2433,76 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 #ifdef CONFIG_TRACING
-#define TRACE_RECURSIVE_DEPTH 16
+/*
+ * The lock and unlock are done within a preempt disable section.
+ * The current_context per_cpu variable can only be modified
+ * by the current task between lock and unlock. But it can
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing),
+ * we use a bitmask trick.
+ *
+ *  bit 0 =  NMI context
+ *  bit 1 =  IRQ context
+ *  bit 2 =  SoftIRQ context
+ *  bit 3 =  normal context.
+ *
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
+ *
+ * (binary)
+ *  101 - 1 = 100
+ *  101 & 100 = 100 (clearing bit zero)
+ *
+ *  1010 - 1 = 1001
+ *  1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant set bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
+ */
+static DEFINE_PER_CPU(unsigned int, current_context);
-/* Keep this code out of the fast path cache */
+static __always_inline int trace_recursive_lock(void)
-static noinline void trace_recursive_fail(void)
 {
-        /* Disable all tracing before we do anything else */
+        unsigned int val = this_cpu_read(current_context);
-        tracing_off_permanent();
+        int bit;
-        printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
-                    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
-                    trace_recursion_buffer(),
-                    hardirq_count() >> HARDIRQ_SHIFT,
-                    softirq_count() >> SOFTIRQ_SHIFT,
-                    in_nmi());
-        WARN_ON_ONCE(1);
-}
-static inline int trace_recursive_lock(void)
+        if (in_interrupt()) {
-{
+                if (in_nmi())
-        trace_recursion_inc();
+                        bit = 0;
+                else if (in_irq())
+                        bit = 1;
+                else
+                        bit = 2;
+        } else
+                bit = 3;
-        if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
+        if (unlikely(val & (1 << bit)))
-                return 0;
+                return 1;
-        trace_recursive_fail();
+        val |= (1 << bit);
+        this_cpu_write(current_context, val);
-        return -1;
+        return 0;
 }
-static inline void trace_recursive_unlock(void)
+static __always_inline void trace_recursive_unlock(void)
 {
-        WARN_ON_ONCE(!trace_recursion_buffer());
+        unsigned int val = this_cpu_read(current_context);
-        trace_recursion_dec();
+        val--;
+        val &= this_cpu_read(current_context);
+        this_cpu_write(current_context, val);
 }
 #else
@@ -3067,6 +3103,24 @@ ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
 EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
 /**
+ * ring_buffer_read_events_cpu - get the number of events successfully read
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of events read
+ *
+ * Returns the read count of the per CPU buffer for @cpu, or 0 when @cpu
+ * is not set in the ring buffer's cpumask.
+ */
+unsigned long
+ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
+{
+        struct ring_buffer_per_cpu *cpu_buffer;
+        if (!cpumask_test_cpu(cpu, buffer->cpumask))
+                return 0;
+        cpu_buffer = buffer->buffers[cpu];
+        return cpu_buffer->read;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
+/**
 * ring_buffer_entries - get the number of entries in a buffer
 * @buffer: The ring buffer
 *
@@ -3425,7 +3479,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
        /* check for end of page padding */
        if ((iter->head >= rb_page_size(iter->head_page)) &&
            (iter->head_page != cpu_buffer->commit_page))
-                rb_advance_iter(iter);
+                rb_inc_iter(iter);
 }
 static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3c13e46d7d24..c2e2c2310374 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -39,6 +39,7 @@
 #include <linux/poll.h>
 #include <linux/nmi.h>
 #include <linux/fs.h>
+#include <linux/sched/rt.h>
 #include "trace.h"
 #include "trace_output.h"
@@ -249,7 +250,7 @@ static unsigned long		trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
 static struct tracer            *trace_types __read_mostly;
 /* current_trace points to the tracer that is currently active */
-static struct tracer            *current_trace __read_mostly;
+static struct tracer            *current_trace __read_mostly = &nop_trace;
 /*
 * trace_types_lock is used to protect the trace_types list.
@@ -709,10 +710,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
                return;
        WARN_ON_ONCE(!irqs_disabled());
-        if (!current_trace->use_max_tr) {
-                WARN_ON_ONCE(1);
+        if (!current_trace->allocated_snapshot) {
+                /* Only the nop tracer should hit this when disabling */
+                WARN_ON_ONCE(current_trace != &nop_trace);
                return;
        }
        arch_spin_lock(&ftrace_max_lock);
        tr->buffer = max_tr.buffer;
@@ -739,10 +743,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
                return;
        WARN_ON_ONCE(!irqs_disabled());
-        if (!current_trace->use_max_tr) {
+        if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
-                WARN_ON_ONCE(1);
                return;
-        }
        arch_spin_lock(&ftrace_max_lock);
@@ -862,10 +864,13 @@ int register_tracer(struct tracer *type)
                current_trace = type;
-                /* If we expanded the buffers, make sure the max is expanded too */
+                if (type->use_max_tr) {
-                if (ring_buffer_expanded && type->use_max_tr)
+                        /* If we expanded the buffers, make sure the max is expanded too */
-                        ring_buffer_resize(max_tr.buffer, trace_buf_size,
+                        if (ring_buffer_expanded)
-                                                RING_BUFFER_ALL_CPUS);
+                                ring_buffer_resize(max_tr.buffer, trace_buf_size,
+                                                   RING_BUFFER_ALL_CPUS);
+                        type->allocated_snapshot = true;
+                }
                /* the test is responsible for initializing and enabling */
                pr_info("Testing tracer %s: ", type->name);
@@ -881,10 +886,14 @@ int register_tracer(struct tracer *type)
                /* Only reset on passing, to avoid touching corrupted buffers */
                tracing_reset_online_cpus(tr);
-                /* Shrink the max buffer again */
+                if (type->use_max_tr) {
-                if (ring_buffer_expanded && type->use_max_tr)
+                        type->allocated_snapshot = false;
-                        ring_buffer_resize(max_tr.buffer, 1,
-                                                RING_BUFFER_ALL_CPUS);
+                        /* Shrink the max buffer again */
+                        if (ring_buffer_expanded)
+                                ring_buffer_resize(max_tr.buffer, 1,
+                                                   RING_BUFFER_ALL_CPUS);
+                }
                printk(KERN_CONT "PASSED\n");
        }
@@ -922,6 +931,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
 {
        struct ring_buffer *buffer = tr->buffer;
+        if (!buffer)
+                return;
        ring_buffer_record_disable(buffer);
        /* Make sure all commits have finished */
@@ -936,6 +948,9 @@ void tracing_reset_online_cpus(struct trace_array *tr)
        struct ring_buffer *buffer = tr->buffer;
        int cpu;
+        if (!buffer)
+                return;
        ring_buffer_record_disable(buffer);
        /* Make sure all commits have finished */
@@ -1167,7 +1182,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
        entry->preempt_count            = pc & 0xff;
        entry->pid                      = (tsk) ? tsk->pid : 0;
-        entry->padding                  = 0;
        entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
                (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1335,7 +1349,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
         */
        preempt_disable_notrace();
-        use_stack = ++__get_cpu_var(ftrace_stack_reserve);
+        use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
        /*
         * We don't need any atomic variables, just a barrier.
         * If an interrupt comes in, we don't care, because it would
@@ -1389,7 +1403,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 out:
        /* Again, don't let gcc optimize things here */
        barrier();
-        __get_cpu_var(ftrace_stack_reserve)--;
+        __this_cpu_dec(ftrace_stack_reserve);
        preempt_enable_notrace();
 }
@@ -1517,7 +1531,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer;
 static char *get_trace_buf(void)
 {
        struct trace_buffer_struct *percpu_buffer;
-        struct trace_buffer_struct *buffer;
        /*
         * If we have allocated per cpu buffers, then we do not
@@ -1535,9 +1548,7 @@ static char *get_trace_buf(void)
        if (!percpu_buffer)
                return NULL;
-        buffer = per_cpu_ptr(percpu_buffer, smp_processor_id());
+        return this_cpu_ptr(&percpu_buffer->buffer[0]);
-        return buffer->buffer;
 }
 static int alloc_percpu_trace_buffer(void)
@@ -1942,21 +1953,27 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
        struct trace_iterator *iter = m->private;
-        static struct tracer *old_tracer;
        int cpu_file = iter->cpu_file;
        void *p = NULL;
        loff_t l = 0;
        int cpu;
-        /* copy the tracer to avoid using a global lock all around */
+        /*
+         * copy the tracer to avoid using a global lock all around.
+         * iter->trace is a copy of current_trace, the pointer to the
+         * name may be used instead of a strcmp(), as iter->trace->name
+         * will point to the same string as current_trace->name.
+         */
        mutex_lock(&trace_types_lock);
-        if (unlikely(old_tracer != current_trace && current_trace)) {
+        if (unlikely(current_trace && iter->trace->name != current_trace->name))
-                old_tracer = current_trace;
                *iter->trace = *current_trace;
-        }
        mutex_unlock(&trace_types_lock);
-        atomic_inc(&trace_record_cmdline_disabled);
+        if (iter->snapshot && iter->trace->use_max_tr)
+                return ERR_PTR(-EBUSY);
+        if (!iter->snapshot)
+                atomic_inc(&trace_record_cmdline_disabled);
        if (*pos != iter->pos) {
                iter->ent = NULL;
@@ -1995,7 +2012,11 @@ static void s_stop(struct seq_file *m, void *p)
 {
        struct trace_iterator *iter = m->private;
-        atomic_dec(&trace_record_cmdline_disabled);
+        if (iter->snapshot && iter->trace->use_max_tr)
+                return;
+        if (!iter->snapshot)
+                atomic_dec(&trace_record_cmdline_disabled);
        trace_access_unlock(iter->cpu_file);
        trace_event_read_unlock();
 }
@@ -2080,8 +2101,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
        unsigned long total;
        const char *name = "preemption";
-        if (type)
+        name = type->name;
-                name = type->name;
        get_total_entries(tr, &total, &entries);
@@ -2430,7 +2450,7 @@ static const struct seq_operations tracer_seq_ops = {
 };
 static struct trace_iterator *
-__tracing_open(struct inode *inode, struct file *file)
+__tracing_open(struct inode *inode, struct file *file, bool snapshot)
 {
        long cpu_file = (long) inode->i_private;
        struct trace_iterator *iter;
@@ -2457,16 +2477,16 @@ __tracing_open(struct inode *inode, struct file *file)
        if (!iter->trace)
                goto fail;
-        if (current_trace)
+        *iter->trace = *current_trace;
-                *iter->trace = *current_trace;
        if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
                goto fail;
-        if (current_trace && current_trace->print_max)
+        if (current_trace->print_max || snapshot)
                iter->tr = &max_tr;
        else
                iter->tr = &global_trace;
+        iter->snapshot = snapshot;
        iter->pos = -1;
        mutex_init(&iter->mutex);
        iter->cpu_file = cpu_file;
@@ -2483,8 +2503,9 @@ __tracing_open(struct inode *inode, struct file *file)
        if (trace_clocks[trace_clock_id].in_ns)
                iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
-        /* stop the trace while dumping */
+        /* stop the trace while dumping if we are not opening "snapshot" */
-        tracing_stop();
+        if (!iter->snapshot)
+                tracing_stop();
        if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
                for_each_tracing_cpu(cpu) {
@@ -2547,8 +2568,9 @@ static int tracing_release(struct inode *inode, struct file *file)
        if (iter->trace && iter->trace->close)
                iter->trace->close(iter);
-        /* reenable tracing if it was previously enabled */
+        if (!iter->snapshot)
-        tracing_start();
+                /* reenable tracing if it was previously enabled */
+                tracing_start();
        mutex_unlock(&trace_types_lock);
        mutex_destroy(&iter->mutex);
@@ -2576,7 +2598,7 @@ static int tracing_open(struct inode *inode, struct file *file)
        }
        if (file->f_mode & FMODE_READ) {
-                iter = __tracing_open(inode, file);
+                iter = __tracing_open(inode, file, false);
                if (IS_ERR(iter))
                        ret = PTR_ERR(iter);
                else if (trace_flags & TRACE_ITER_LATENCY_FMT)
@@ -3014,10 +3036,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
        int r;
        mutex_lock(&trace_types_lock);
-        if (current_trace)
+        r = sprintf(buf, "%s\n", current_trace->name);
-                r = sprintf(buf, "%s\n", current_trace->name);
-        else
-                r = sprintf(buf, "\n");
        mutex_unlock(&trace_types_lock);
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3183,6 +3202,7 @@ static int tracing_set_tracer(const char *buf)
        static struct trace_option_dentry *topts;
        struct trace_array *tr = &global_trace;
        struct tracer *t;
+        bool had_max_tr;
        int ret = 0;
        mutex_lock(&trace_types_lock);
@@ -3207,9 +3227,21 @@ static int tracing_set_tracer(const char *buf)
                goto out;
        trace_branch_disable();
-        if (current_trace && current_trace->reset)
+        if (current_trace->reset)
                current_trace->reset(tr);
-        if (current_trace && current_trace->use_max_tr) {
+        had_max_tr = current_trace->allocated_snapshot;
+        current_trace = &nop_trace;
+        if (had_max_tr && !t->use_max_tr) {
+                /*
+                 * We need to make sure that the update_max_tr sees that
+                 * current_trace changed to nop_trace to keep it from
+                 * swapping the buffers after we resize it.
+                 * The update_max_tr is called with interrupts disabled
+                 * so a synchronize_sched() is sufficient.
+                 */
+                synchronize_sched();
                /*
                 * We don't free the ring buffer. instead, resize it because
                 * The max_tr ring buffer has some state (e.g. ring->clock) and
@@ -3217,18 +3249,19 @@ static int tracing_set_tracer(const char *buf)
                 */
                ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
                set_buffer_entries(&max_tr, 1);
+                tracing_reset_online_cpus(&max_tr);
+                current_trace->allocated_snapshot = false;
        }
        destroy_trace_option_files(topts);
-        current_trace = &nop_trace;
        topts = create_trace_option_files(t);
-        if (t->use_max_tr) {
+        if (t->use_max_tr && !had_max_tr) {
                /* we need to make per cpu buffer sizes equivalent */
                ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
                                                   RING_BUFFER_ALL_CPUS);
                if (ret < 0)
                        goto out;
+                t->allocated_snapshot = true;
        }
        if (t->init) {
@@ -3336,8 +3369,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
                ret = -ENOMEM;
                goto fail;
        }
-        if (current_trace)
+        *iter->trace = *current_trace;
-                *iter->trace = *current_trace;
        if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
                ret = -ENOMEM;
@@ -3477,7 +3509,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
 {
        struct trace_iterator *iter = filp->private_data;
-        static struct tracer *old_tracer;
        ssize_t sret;
        /* return any leftover data */
@@ -3489,10 +3520,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
        /* copy the tracer to avoid using a global lock all around */
        mutex_lock(&trace_types_lock);
-        if (unlikely(old_tracer != current_trace && current_trace)) {
+        if (unlikely(iter->trace->name != current_trace->name))
-                old_tracer = current_trace;
                *iter->trace = *current_trace;
-        }
        mutex_unlock(&trace_types_lock);
        /*
@@ -3648,7 +3677,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
                .ops            = &tracing_pipe_buf_ops,
                .spd_release    = tracing_spd_release_pipe,
        };
-        static struct tracer *old_tracer;
        ssize_t ret;
        size_t rem;
        unsigned int i;
@@ -3658,10 +3686,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
        /* copy the tracer to avoid using a global lock all around */
        mutex_lock(&trace_types_lock);
-        if (unlikely(old_tracer != current_trace && current_trace)) {
+        if (unlikely(iter->trace->name != current_trace->name))
-                old_tracer = current_trace;
                *iter->trace = *current_trace;
-        }
        mutex_unlock(&trace_types_lock);
        mutex_lock(&iter->mutex);
@@ -4037,8 +4063,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
         * Reset the buffer so that it doesn't have incomparable timestamps.
         */
        tracing_reset_online_cpus(&global_trace);
-        if (max_tr.buffer)
+        tracing_reset_online_cpus(&max_tr);
-                tracing_reset_online_cpus(&max_tr);
        mutex_unlock(&trace_types_lock);
@@ -4054,6 +4079,87 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
        return single_open(file, tracing_clock_show, NULL);
 }
+#ifdef CONFIG_TRACER_SNAPSHOT
+static int tracing_snapshot_open(struct inode *inode, struct file *file)
+{
+        struct trace_iterator *iter;
+        int ret = 0;
+        if (file->f_mode & FMODE_READ) {
+                iter = __tracing_open(inode, file, true);
+                if (IS_ERR(iter))
+                        ret = PTR_ERR(iter);
+        }
+        return ret;
+}
+static ssize_t
+tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
+                       loff_t *ppos)
+{
+        unsigned long val;
+        int ret;
+        ret = tracing_update_buffers();
+        if (ret < 0)
+                return ret;
+        ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+        if (ret)
+                return ret;
+        mutex_lock(&trace_types_lock);
+        if (current_trace->use_max_tr) {
+                ret = -EBUSY;
+                goto out;
+        }
+        switch (val) {
+        case 0:
+                if (current_trace->allocated_snapshot) {
+                        /* free spare buffer */
+                        ring_buffer_resize(max_tr.buffer, 1,
+                                           RING_BUFFER_ALL_CPUS);
+                        set_buffer_entries(&max_tr, 1);
+                        tracing_reset_online_cpus(&max_tr);
+                        current_trace->allocated_snapshot = false;
+                }
+                break;
+        case 1:
+                if (!current_trace->allocated_snapshot) {
+                        /* allocate spare buffer */
+                        ret = resize_buffer_duplicate_size(&max_tr,
+                                        &global_trace, RING_BUFFER_ALL_CPUS);
+                        if (ret < 0)
+                                break;
+                        current_trace->allocated_snapshot = true;
+                }
+                local_irq_disable();
+                /* Now, we're going to swap */
+                update_max_tr(&global_trace, current, smp_processor_id());
+                local_irq_enable();
+                break;
+        default:
+                if (current_trace->allocated_snapshot)
+                        tracing_reset_online_cpus(&max_tr);
+                else
+                        ret = -EINVAL;
+                break;
+        }
+        if (ret >= 0) {
+                *ppos += cnt;
+                ret = cnt;
+        }
+out:
+        mutex_unlock(&trace_types_lock);
+        return ret;
+}
+#endif /* CONFIG_TRACER_SNAPSHOT */
 static const struct file_operations tracing_max_lat_fops = {
        .open           = tracing_open_generic,
        .read           = tracing_max_lat_read,
@@ -4110,6 +4216,16 @@ static const struct file_operations trace_clock_fops = {
        .write          = tracing_clock_write,
 };
+#ifdef CONFIG_TRACER_SNAPSHOT
+static const struct file_operations snapshot_fops = {
+        .open           = tracing_snapshot_open,
+        .read           = seq_read,
+        .write          = tracing_snapshot_write,
+        .llseek         = tracing_seek,
+        .release        = tracing_release,
+};
+#endif /* CONFIG_TRACER_SNAPSHOT */
 struct ftrace_buffer_info {
        struct trace_array      *tr;
        void                    *spare;
@@ -4414,6 +4530,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
        cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
        trace_seq_printf(s, "dropped events: %ld\n", cnt);
+        cnt = ring_buffer_read_events_cpu(tr->buffer, cpu);
+        trace_seq_printf(s, "read events: %ld\n", cnt);
        count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
        kfree(s);
@@ -4490,7 +4609,7 @@ struct dentry *tracing_init_dentry(void)
 static struct dentry *d_percpu;
-struct dentry *tracing_dentry_percpu(void)
+static struct dentry *tracing_dentry_percpu(void)
 {
        static int once;
        struct dentry *d_tracer;
@@ -4906,6 +5025,11 @@ static __init int tracer_init_debugfs(void)
                        &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
 #endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+        trace_create_file("snapshot", 0644, d_tracer,
+                          (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
+#endif
        create_trace_options_dir();
        for_each_tracing_cpu(cpu)
@@ -5014,6 +5138,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
        if (disable_tracing)
                ftrace_kill();
+        /* Simulate the iterator */
        trace_init_global_iter(&iter);
        for_each_tracing_cpu(cpu) {
@@ -5025,10 +5150,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
        /* don't look at user memory in panic mode */
        trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
-        /* Simulate the iterator */
-        iter.tr = &global_trace;
-        iter.trace = current_trace;
        switch (oops_dump_mode) {
        case DUMP_ALL:
                iter.cpu_file = TRACE_PIPE_ALL_CPU;
@@ -5173,7 +5294,7 @@ __init static int tracer_alloc_buffers(void)
        init_irq_work(&trace_work_wakeup, trace_wake_up);
        register_tracer(&nop_trace);
-        current_trace = &nop_trace;
        /* All seems OK, enable tracing */
        tracing_disabled = 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c75d7988902c..57d7e5397d56 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -287,20 +287,62 @@ struct tracer {
        struct tracer_flags     *flags;
        bool                    print_max;
        bool                    use_max_tr;
+        bool                    allocated_snapshot;
 };
 /* Only current can touch trace_recursion */
-#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
-#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
-/* Ring buffer has the 10 LSB bits to count */
+/*
-#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
+ * For function tracing recursion:
+ *  The order of these bits is important.
-/* for function tracing recursion */
+ *
-#define TRACE_INTERNAL_BIT              (1<<11)
+ *  When function tracing occurs, the following steps are made:
-#define TRACE_GLOBAL_BIT                (1<<12)
+ *   If arch does not support a ftrace feature:
-#define TRACE_CONTROL_BIT               (1<<13)
+ *    call internal function (uses INTERNAL bits) which calls...
+ *   If callback is registered to the "global" list, the list
+ *    function is called and recursion checks the GLOBAL bits.
+ *    then this function calls...
+ *   The function callback, which can use the FTRACE bits to
+ *    check for recursion.
+ *
+ * Now if the arch does not support a feature, and it calls
+ * the global list function which calls the ftrace callback,
+ * all three of these steps will do a recursion protection.
+ * There's no reason to do one if the previous caller already
+ * did. The recursion that we are protecting against will
+ * go through the same steps again.
+ *
+ * To prevent the multiple recursion checks, if a recursion
+ * bit is set that is higher than the MAX bit of the current
+ * check, then we know that the check was made by the previous
+ * caller, and we can skip the current check.
+ */
+enum {
+        TRACE_BUFFER_BIT,
+        TRACE_BUFFER_NMI_BIT,
+        TRACE_BUFFER_IRQ_BIT,
+        TRACE_BUFFER_SIRQ_BIT,
+        /* Start of function recursion bits */
+        TRACE_FTRACE_BIT,
+        TRACE_FTRACE_NMI_BIT,
+        TRACE_FTRACE_IRQ_BIT,
+        TRACE_FTRACE_SIRQ_BIT,
+        /* GLOBAL_BITs must be greater than FTRACE_BITs */
+        TRACE_GLOBAL_BIT,
+        TRACE_GLOBAL_NMI_BIT,
+        TRACE_GLOBAL_IRQ_BIT,
+        TRACE_GLOBAL_SIRQ_BIT,
+        /* INTERNAL_BITs must be greater than GLOBAL_BITs */
+        TRACE_INTERNAL_BIT,
+        TRACE_INTERNAL_NMI_BIT,
+        TRACE_INTERNAL_IRQ_BIT,
+        TRACE_INTERNAL_SIRQ_BIT,
+        TRACE_CONTROL_BIT,
 /*
 * Abuse of the trace_recursion.
@@ -309,11 +351,77 @@ struct tracer {
 * was called in irq context but we have irq tracing off. Since this
 * can only be modified by current, we can reuse trace_recursion.
 */
-#define TRACE_IRQ_BIT                   (1<<13)
+        TRACE_IRQ_BIT,
+};
+#define trace_recursion_set(bit)        do { (current)->trace_recursion |= (1<<(bit)); } while (0)
+#define trace_recursion_clear(bit)      do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
+#define trace_recursion_test(bit)       ((current)->trace_recursion & (1<<(bit)))
+#define TRACE_CONTEXT_BITS      4
+#define TRACE_FTRACE_START      TRACE_FTRACE_BIT
+#define TRACE_FTRACE_MAX        ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
+#define TRACE_GLOBAL_START      TRACE_GLOBAL_BIT
+#define TRACE_GLOBAL_MAX        ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
+#define TRACE_LIST_START        TRACE_INTERNAL_BIT
+#define TRACE_LIST_MAX          ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
+#define TRACE_CONTEXT_MASK      TRACE_LIST_MAX
+static __always_inline int trace_get_context_bit(void)
+{
+        int bit;
-#define trace_recursion_set(bit)        do { (current)->trace_recursion |= (bit); } while (0)
+        if (in_interrupt()) {
-#define trace_recursion_clear(bit)      do { (current)->trace_recursion &= ~(bit); } while (0)
+                if (in_nmi())
-#define trace_recursion_test(bit)       ((current)->trace_recursion & (bit))
+                        bit = 0;
+                else if (in_irq())
+                        bit = 1;
+                else
+                        bit = 2;
+        } else
+                bit = 3;
+        return bit;
+}
+static __always_inline int trace_test_and_set_recursion(int start, int max)
+{
+        unsigned int val = current->trace_recursion;
+        int bit;
+        /* A previous recursion check was made */
+        if ((val & TRACE_CONTEXT_MASK) > max)
+                return 0;
+        bit = trace_get_context_bit() + start;
+        if (unlikely(val & (1 << bit)))
+                return -1;
+        val |= 1 << bit;
+        current->trace_recursion = val;
+        barrier();
+        return bit;
+}
+static __always_inline void trace_clear_recursion(int bit)
+{
+        unsigned int val = current->trace_recursion;
+        if (!bit)
+                return;
+        bit = 1 << bit;
+        val &= ~bit;
+        barrier();
+        current->trace_recursion = val;
+}
 #define TRACE_PIPE_ALL_CPU      -1
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 394783531cbb..aa8f5f48dae6 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -21,8 +21,6 @@
 #include <linux/ktime.h>
 #include <linux/trace_clock.h>
-#include "trace.h"
 /*
 * trace_clock_local(): the simplest and least coherent tracing clock.
 *
@@ -44,6 +42,7 @@ u64 notrace trace_clock_local(void)
        return clock;
 }
+EXPORT_SYMBOL_GPL(trace_clock_local);
 /*
 * trace_clock(): 'between' trace clock. Not completely serialized,
@@ -86,7 +85,7 @@ u64 notrace trace_clock_global(void)
        local_irq_save(flags);
        this_cpu = raw_smp_processor_id();
-        now = cpu_clock(this_cpu);
+        now = sched_clock_cpu(this_cpu);
        /*
         * If in an NMI context then dont risk lockups and return the
         * cpu_clock() time:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 880073d0b946..57e9b284250c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
        __common_field(unsigned char, flags);
        __common_field(unsigned char, preempt_count);
        __common_field(int, pid);
-        __common_field(int, padding);
        return ret;
 }
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8e3ad8082ab7..601152523326 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr)
        tracing_reset_online_cpus(tr);
 }
-static void
-function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
-                                 struct ftrace_ops *op, struct pt_regs *pt_regs)
-{
-        struct trace_array *tr = func_trace;
-        struct trace_array_cpu *data;
-        unsigned long flags;
-        long disabled;
-        int cpu;
-        int pc;
-        if (unlikely(!ftrace_function_enabled))
-                return;
-        pc = preempt_count();
-        preempt_disable_notrace();
-        local_save_flags(flags);
-        cpu = raw_smp_processor_id();
-        data = tr->data[cpu];
-        disabled = atomic_inc_return(&data->disabled);
-        if (likely(disabled == 1))
-                trace_function(tr, ip, parent_ip, flags, pc);
-        atomic_dec(&data->disabled);
-        preempt_enable_notrace();
-}
 /* Our option */
 enum {
        TRACE_FUNC_OPT_STACK    = 0x1,
@@ -85,34 +57,34 @@ static struct tracer_flags func_flags;
 static void
 function_trace_call(unsigned long ip, unsigned long parent_ip,
                    struct ftrace_ops *op, struct pt_regs *pt_regs)
 {
        struct trace_array *tr = func_trace;
        struct trace_array_cpu *data;
        unsigned long flags;
-        long disabled;
+        int bit;
        int cpu;
        int pc;
        if (unlikely(!ftrace_function_enabled))
                return;
-        /*
+        pc = preempt_count();
-         * Need to use raw, since this must be called before the
+        preempt_disable_notrace();
-         * recursive protection is performed.
-         */
-        local_irq_save(flags);
-        cpu = raw_smp_processor_id();
-        data = tr->data[cpu];
-        disabled = atomic_inc_return(&data->disabled);
-        if (likely(disabled == 1)) {
+        bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
-                pc = preempt_count();
+        if (bit < 0)
+                goto out;
+        cpu = smp_processor_id();
+        data = tr->data[cpu];
+        if (!atomic_read(&data->disabled)) {
+                local_save_flags(flags);
                trace_function(tr, ip, parent_ip, flags, pc);
        }
+        trace_clear_recursion(bit);
-        atomic_dec(&data->disabled);
+ out:
-        local_irq_restore(flags);
+        preempt_enable_notrace();
 }
 static void
@@ -185,11 +157,6 @@ static void tracing_start_function_trace(void)
 {
        ftrace_function_enabled = 0;
-        if (trace_flags & TRACE_ITER_PREEMPTONLY)
-                trace_ops.func = function_trace_call_preempt_only;
-        else
-                trace_ops.func = function_trace_call;
        if (func_flags.val & TRACE_FUNC_OPT_STACK)
                register_ftrace_function(&trace_stack_ops);
        else
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4edb4b74eb7e..39ada66389cc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -47,6 +47,8 @@ struct fgraph_data {
 #define TRACE_GRAPH_PRINT_ABS_TIME      0x20
 #define TRACE_GRAPH_PRINT_IRQS          0x40
+static unsigned int max_depth;
 static struct tracer_opt trace_opts[] = {
        /* Display overruns? (for self-debug purpose) */
        { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
@@ -189,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
        ftrace_pop_return_trace(&trace, &ret, frame_pointer);
        trace.rettime = trace_clock_local();
-        ftrace_graph_return(&trace);
        barrier();
        current->curr_ret_stack--;
+        /*
+         * The trace should run after decrementing the ret counter
+         * in case an interrupt were to come in. We don't want to
+         * lose the interrupt if max_depth is set.
+         */
+        ftrace_graph_return(&trace);
        if (unlikely(!ret)) {
                ftrace_graph_stop();
                WARN_ON(1);
@@ -250,8 +258,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
                return 0;
        /* trace it when it is-nested-in or is a function enabled. */
-        if (!(trace->depth || ftrace_graph_addr(trace->func)) ||
+        if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
-              ftrace_graph_ignore_irqs())
+             ftrace_graph_ignore_irqs()) ||
+            (max_depth && trace->depth >= max_depth))
                return 0;
        local_irq_save(flags);
@@ -1457,6 +1466,59 @@ static struct tracer graph_trace __read_mostly = {
 #endif
 };
+static ssize_t
+graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
+                  loff_t *ppos)
+{
+        unsigned long val;
+        int ret;
+        ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+        if (ret)
+                return ret;
+        max_depth = val;
+        *ppos += cnt;
+        return cnt;
+}
+static ssize_t
+graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
+                 loff_t *ppos)
+{
+        char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/
+        int n;
+        n = sprintf(buf, "%d\n", max_depth);
+        return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+static const struct file_operations graph_depth_fops = {
+        .open           = tracing_open_generic,
+        .write          = graph_depth_write,
+        .read           = graph_depth_read,
+        .llseek         = generic_file_llseek,
+};
+static __init int init_graph_debugfs(void)
+{
+        struct dentry *d_tracer;
+        d_tracer = tracing_init_dentry();
+        if (!d_tracer)
+                return 0;
+        trace_create_file("max_graph_depth", 0644, d_tracer,
+                          NULL, &graph_depth_fops);
+        return 0;
+}
+fs_initcall(init_graph_debugfs);
 static __init int init_graph_trace(void)
 {
        max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 933708677814..5c7e09d10d74 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -66,7 +66,6 @@
 #define TP_FLAG_TRACE           1
 #define TP_FLAG_PROFILE         2
 #define TP_FLAG_REGISTERED      4
-#define TP_FLAG_UPROBE          8
 /* data_rloc: data relative location, compatible with u32 */
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9fe45fcefca0..75aa97fbe1a1 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,8 +15,8 @@
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
+#include <linux/sched/rt.h>
 #include <trace/events/sched.h>
 #include "trace.h"
 static struct trace_array       *wakeup_trace;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 47623169a815..51c819c12c29 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip,
         * The ftrace infrastructure should provide the recursion
         * protection. If not, this will crash the kernel!
         */
-        trace_selftest_recursion_cnt++;
+        if (trace_selftest_recursion_cnt++ > 10)
+                return;
        DYN_FTRACE_TEST_NAME();
 }
@@ -452,7 +453,6 @@ trace_selftest_function_recursion(void)
        char *func_name;
        int len;
        int ret;
-        int cnt;
        /* The previous test PASSED */
        pr_cont("PASSED\n");
@@ -510,19 +510,10 @@ trace_selftest_function_recursion(void)
        unregister_ftrace_function(&test_recsafe_probe);
-        /*
-         * If arch supports all ftrace features, and no other task
-         * was on the list, we should be fine.
-         */
-        if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
-                cnt = 2; /* Should have recursed */
-        else
-                cnt = 1;
        ret = -1;
-        if (trace_selftest_recursion_cnt != cnt) {
+        if (trace_selftest_recursion_cnt != 2) {
-                pr_cont("*callback not called expected %d times (%d)* ",
+                pr_cont("*callback not called expected 2 times (%d)* ",
-                        cnt, trace_selftest_recursion_cnt);
+                        trace_selftest_recursion_cnt);
                goto out;
        }
@@ -568,7 +559,7 @@ trace_selftest_function_regs(void)
        int ret;
        int supported = 0;
-#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
        supported = 1;
 #endif
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7609dd6714c2..5329e13e74a1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -77,7 +77,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
        return syscalls_metadata[nr];
 }
-enum print_line_t
+static enum print_line_t
 print_syscall_enter(struct trace_iterator *iter, int flags,
                    struct trace_event *event)
 {
@@ -130,7 +130,7 @@ end:
        return TRACE_TYPE_HANDLED;
 }
-enum print_line_t
+static enum print_line_t
 print_syscall_exit(struct trace_iterator *iter, int flags,
                   struct trace_event *event)
 {
@@ -270,7 +270,7 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
        return ret;
 }
-void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
+static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 {
        struct syscall_trace_enter *entry;
        struct syscall_metadata *sys_data;
@@ -305,7 +305,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
                trace_current_buffer_unlock_commit(buffer, event, 0, 0);
 }
-void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 {
        struct syscall_trace_exit *entry;
        struct syscall_metadata *sys_data;
@@ -337,7 +337,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
                trace_current_buffer_unlock_commit(buffer, event, 0, 0);
 }
-int reg_event_syscall_enter(struct ftrace_event_call *call)
+static int reg_event_syscall_enter(struct ftrace_event_call *call)
 {
        int ret = 0;
        int num;
@@ -356,7 +356,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
        return ret;
 }
-void unreg_event_syscall_enter(struct ftrace_event_call *call)
+static void unreg_event_syscall_enter(struct ftrace_event_call *call)
 {
        int num;
@@ -371,7 +371,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
        mutex_unlock(&syscall_trace_lock);
 }
-int reg_event_syscall_exit(struct ftrace_event_call *call)
+static int reg_event_syscall_exit(struct ftrace_event_call *call)
 {
        int ret = 0;
        int num;
@@ -390,7 +390,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
        return ret;
 }
-void unreg_event_syscall_exit(struct ftrace_event_call *call)
+static void unreg_event_syscall_exit(struct ftrace_event_call *call)
 {
        int num;
@@ -459,7 +459,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
        return (unsigned long)sys_call_table[nr];
 }
-int __init init_ftrace_syscalls(void)
+static int __init init_ftrace_syscalls(void)
 {
        struct syscall_metadata *meta;
        unsigned long addr;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c86e6d4f67fb..8dad2a92dee9 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -28,20 +28,21 @@
 #define UPROBE_EVENT_SYSTEM     "uprobes"
+struct trace_uprobe_filter {
+        rwlock_t                rwlock;
+        int                     nr_systemwide;
+        struct list_head        perf_events;
+};
 /*
 * uprobe event core functions
 */
-struct trace_uprobe;
-struct uprobe_trace_consumer {
-        struct uprobe_consumer          cons;
-        struct trace_uprobe             *tu;
-};
 struct trace_uprobe {
        struct list_head                list;
        struct ftrace_event_class       class;
        struct ftrace_event_call        call;
-        struct uprobe_trace_consumer    *consumer;
+        struct trace_uprobe_filter      filter;
+        struct uprobe_consumer          consumer;
        struct inode                    *inode;
        char                            *filename;
        unsigned long                   offset;
@@ -64,6 +65,18 @@ static LIST_HEAD(uprobe_list);
 static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
+{
+        rwlock_init(&filter->rwlock);
+        filter->nr_systemwide = 0;
+        INIT_LIST_HEAD(&filter->perf_events);
+}
+static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
+{
+        return !filter->nr_systemwide && list_empty(&filter->perf_events);
+}
 /*
 * Allocate new trace_uprobe and initialize it (including uprobes).
 */
@@ -92,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
                goto error;
        INIT_LIST_HEAD(&tu->list);
+        tu->consumer.handler = uprobe_dispatcher;
+        init_trace_uprobe_filter(&tu->filter);
        return tu;
 error:
@@ -253,12 +268,18 @@ static int create_trace_uprobe(int argc, char **argv)
        if (ret)
                goto fail_address_parse;
+        inode = igrab(path.dentry->d_inode);
+        path_put(&path);
+        if (!inode || !S_ISREG(inode->i_mode)) {
+                ret = -EINVAL;
+                goto fail_address_parse;
+        }
        ret = kstrtoul(arg, 0, &offset);
        if (ret)
                goto fail_address_parse;
-        inode = igrab(path.dentry->d_inode);
        argc -= 2;
        argv += 2;
@@ -356,7 +377,7 @@ fail_address_parse:
        if (inode)
                iput(inode);
-        pr_info("Failed to parse address.\n");
+        pr_info("Failed to parse address or file.\n");
        return ret;
 }
@@ -465,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = {
 };
 /* uprobe handler */
-static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
 {
        struct uprobe_trace_entry_head *entry;
        struct ring_buffer_event *event;
@@ -475,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
        unsigned long irq_flags;
        struct ftrace_event_call *call = &tu->call;
-        tu->nhit++;
        local_save_flags(irq_flags);
        pc = preempt_count();
@@ -485,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
        event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
                                                  size, irq_flags, pc);
        if (!event)
-                return;
+                return 0;
        entry = ring_buffer_event_data(event);
-        entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
+        entry->ip = instruction_pointer(task_pt_regs(current));
        data = (u8 *)&entry[1];
        for (i = 0; i < tu->nr_args; i++)
                call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
        if (!filter_current_check_discard(buffer, call, entry, event))
                trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
+        return 0;
 }
 /* Event entry printers */
@@ -533,42 +554,43 @@ partial:
        return TRACE_TYPE_PARTIAL_LINE;
 }
-static int probe_event_enable(struct trace_uprobe *tu, int flag)
+static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
 {
-        struct uprobe_trace_consumer *utc;
+        /* enabled == at least one of the trace/profile flags is set */
+        return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
-        int ret = 0;
+}
-        if (!tu->inode || tu->consumer)
+typedef bool (*filter_func_t)(struct uprobe_consumer *self,
-                return -EINTR;
+                                enum uprobe_filter_ctx ctx,
+                                struct mm_struct *mm);
-        utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL);
+static int
-        if (!utc)
+probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
+{
+        /*
+         * Set @flag on @tu, install @filter on the embedded consumer and
+         * register that consumer with uprobe_register().  Returns -EINTR
+         * if @tu is already enabled, otherwise uprobe_register()'s result.
+         */
+        int ret = 0;
+        if (is_trace_uprobe_enabled(tu))
                 return -EINTR;
-        utc->cons.handler = uprobe_dispatcher;
+        WARN_ON(!uprobe_filter_is_empty(&tu->filter));
-        utc->cons.filter = NULL;
-        ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
-        if (ret) {
-                kfree(utc);
-                return ret;
-        }
+        /* @flag is set before registering and rolled back on failure */
         tu->flags |= flag;
-        utc->tu = tu;
+        tu->consumer.filter = filter;
-        tu->consumer = utc;
+        ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
+        if (ret)
+                tu->flags &= ~flag;
-        return 0;
+        return ret;
 }
 static void probe_event_disable(struct trace_uprobe *tu, int flag)
 {
+        /*
+         * Unregister @tu's embedded consumer and clear @flag.  Does nothing
+         * when neither TP_FLAG_TRACE nor TP_FLAG_PROFILE is set on @tu.
+         */
-        if (!tu->inode || !tu->consumer)
+        if (!is_trace_uprobe_enabled(tu))
                 return;
-        uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons);
+        WARN_ON(!uprobe_filter_is_empty(&tu->filter));
+        uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
         tu->flags &= ~flag;
-        kfree(tu->consumer);
-        tu->consumer = NULL;
 }
 static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -642,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu)
 }
 #ifdef CONFIG_PERF_EVENTS
+/*
+ * Report whether @mm is covered by @filter: true if the filter has a
+ * non-zero nr_systemwide count, or if any perf event linked on
+ * filter->perf_events has a tp_target whose mm equals @mm.
+ *
+ * Takes no lock itself; the callers in this file hold filter->rwlock.
+ */
+static bool
+__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
+{
+        struct perf_event *event;
+        if (filter->nr_systemwide)
+                return true;
+        list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
+                if (event->hw.tp_target->mm == mm)
+                        return true;
+        }
+        return false;
+}
+/*
+ * Check @tu's filter against the mm of @event's target task.
+ * Dereferences event->hw.tp_target, so it must be non-NULL here.
+ */
+static inline bool
+uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
+{
+        return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
+}
+/*
+ * TRACE_REG_PERF_OPEN handler: account @event in tu->filter.
+ *
+ * An event with hw.tp_target set is linked onto filter.perf_events; one
+ * without a target bumps filter.nr_systemwide.  "done" is evaluated under
+ * the filter's write lock, before the filter is updated; when it is false
+ * uprobe_apply(..., true) is called after the lock has been dropped.
+ *
+ * Always returns 0; the result of uprobe_apply() is not checked.
+ */
+static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
+{
+        bool done;
+        write_lock(&tu->filter.rwlock);
+        if (event->hw.tp_target) {
+                /*
+                 * event->parent != NULL means copy_process(), we can avoid
+                 * uprobe_apply(). current->mm must be probed and we can rely
+                 * on dup_mmap() which preserves the already installed bp's.
+                 *
+                 * attr.enable_on_exec means that exec/mmap will install the
+                 * breakpoints we need.
+                 */
+                done = tu->filter.nr_systemwide ||
+                        event->parent || event->attr.enable_on_exec ||
+                        uprobe_filter_event(tu, event);
+                list_add(&event->hw.tp_list, &tu->filter.perf_events);
+        } else {
+                done = tu->filter.nr_systemwide;
+                tu->filter.nr_systemwide++;
+        }
+        write_unlock(&tu->filter.rwlock);
+        if (!done)
+                uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+        return 0;
+}
+/*
+ * TRACE_REG_PERF_CLOSE handler: drop @event from tu->filter.
+ *
+ * A targeted event is unlinked from filter.perf_events; a system-wide one
+ * decrements filter.nr_systemwide.  "done" is evaluated under the write
+ * lock after the filter has been updated, so it reflects the remaining
+ * events; when it is false uprobe_apply(..., false) is called once the
+ * lock has been dropped.
+ *
+ * Always returns 0; the result of uprobe_apply() is not checked.
+ */
+static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
+{
+        bool done;
+        write_lock(&tu->filter.rwlock);
+        if (event->hw.tp_target) {
+                list_del(&event->hw.tp_list);
+                done = tu->filter.nr_systemwide ||
+                        (event->hw.tp_target->flags & PF_EXITING) ||
+                        uprobe_filter_event(tu, event);
+        } else {
+                tu->filter.nr_systemwide--;
+                done = tu->filter.nr_systemwide;
+        }
+        write_unlock(&tu->filter.rwlock);
+        if (!done)
+                uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
+        return 0;
+}
+/*
+ * Consumer filter callback used for perf: map @uc back to its enclosing
+ * trace_uprobe and test @mm against that probe's filter under the
+ * filter's read lock.  @ctx is not used.
+ */
+static bool uprobe_perf_filter(struct uprobe_consumer *uc,
+                                enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+{
+        struct trace_uprobe *tu;
+        int ret;
+        tu = container_of(uc, struct trace_uprobe, consumer);
+        read_lock(&tu->filter.rwlock);
+        ret = __uprobe_perf_filter(&tu->filter, mm);
+        read_unlock(&tu->filter.rwlock);
+        return ret;
+}
 /* uprobe profile handler */
-static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
 {
        struct ftrace_event_call *call = &tu->call;
        struct uprobe_trace_entry_head *entry;
@@ -652,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
        int size, __size, i;
        int rctx;
+        if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+                return UPROBE_HANDLER_REMOVE;
        __size = sizeof(*entry) + tu->size;
        size = ALIGN(__size + sizeof(u32), sizeof(u64));
        size -= sizeof(u32);
        if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
-                return;
+                return 0;
        preempt_disable();
@@ -664,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
        if (!entry)
                goto out;
-        entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
+        entry->ip = instruction_pointer(task_pt_regs(current));
        data = (u8 *)&entry[1];
        for (i = 0; i < tu->nr_args; i++)
                call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
@@ -674,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
 out:
        preempt_enable();
+        return 0;
 }
 #endif  /* CONFIG_PERF_EVENTS */
@@ -684,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
        switch (type) {
        case TRACE_REG_REGISTER:
-                return probe_event_enable(tu, TP_FLAG_TRACE);
+                return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
        case TRACE_REG_UNREGISTER:
                probe_event_disable(tu, TP_FLAG_TRACE);
@@ -692,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
 #ifdef CONFIG_PERF_EVENTS
        case TRACE_REG_PERF_REGISTER:
-                return probe_event_enable(tu, TP_FLAG_PROFILE);
+                return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
        case TRACE_REG_PERF_UNREGISTER:
                probe_event_disable(tu, TP_FLAG_PROFILE);
                return 0;
+        case TRACE_REG_PERF_OPEN:
+                return uprobe_perf_open(tu, data);
+        case TRACE_REG_PERF_CLOSE:
+                return uprobe_perf_close(tu, data);
 #endif
        default:
                return 0;
@@ -706,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
 static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
 {
-        struct uprobe_trace_consumer *utc;
         struct trace_uprobe *tu;
+        int ret = 0;
-        utc = container_of(con, struct uprobe_trace_consumer, cons);
+        /* the consumer is embedded in struct trace_uprobe; recover the owner */
+        tu = container_of(con, struct trace_uprobe, consumer);
-        tu = utc->tu;
+        /* every dispatch counts as a hit, whichever handlers are enabled */
+        tu->nhit++;
-        if (!tu || tu->consumer != utc)
-                return 0;
+        /* handler results are OR-ed together and returned to the caller */
         if (tu->flags & TP_FLAG_TRACE)
-                uprobe_trace_func(tu, regs);
+                ret |= uprobe_trace_func(tu, regs);
 #ifdef CONFIG_PERF_EVENTS
         if (tu->flags & TP_FLAG_PROFILE)
-                uprobe_perf_func(tu, regs);
+                ret |= uprobe_perf_func(tu, regs);
 #endif
-        return 0;
+        return ret;
 }
 static struct trace_event_functions uprobe_funcs = {
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 625df0b44690..a1dd9a1b1327 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -32,6 +32,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
 {
        const struct cred *tcred;
        struct timespec uptime, ts;
+        cputime_t utime, stime, utimescaled, stimescaled;
        u64 ac_etime;
        BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -65,10 +66,15 @@ void bacct_add_tsk(struct user_namespace *user_ns,
        stats->ac_ppid   = pid_alive(tsk) ?
                task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
        rcu_read_unlock();
-        stats->ac_utime = cputime_to_usecs(tsk->utime);
-        stats->ac_stime = cputime_to_usecs(tsk->stime);
+        task_cputime(tsk, &utime, &stime);
-        stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
+        stats->ac_utime = cputime_to_usecs(utime);
-        stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
+        stats->ac_stime = cputime_to_usecs(stime);
+        task_cputime_scaled(tsk, &utimescaled, &stimescaled);
+        stats->ac_utimescaled = cputime_to_usecs(utimescaled);
+        stats->ac_stimescaled = cputime_to_usecs(stimescaled);
        stats->ac_minflt = tsk->min_flt;
        stats->ac_majflt = tsk->maj_flt;
@@ -115,11 +121,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 #undef KB
 #undef MB
-/**
+static void __acct_update_integrals(struct task_struct *tsk,
- * acct_update_integrals - update mm integral fields in task_struct
+                                    cputime_t utime, cputime_t stime)
- * @tsk: task_struct for accounting
- */
-void acct_update_integrals(struct task_struct *tsk)
 {
        if (likely(tsk->mm)) {
                cputime_t time, dtime;
@@ -128,7 +131,7 @@ void acct_update_integrals(struct task_struct *tsk)
                u64 delta;
                local_irq_save(flags);
-                time = tsk->stime + tsk->utime;
+                time = stime + utime;
                dtime = time - tsk->acct_timexpd;
                jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
                delta = value.tv_sec;
@@ -145,6 +148,27 @@ void acct_update_integrals(struct task_struct *tsk)
 }
 /**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
+ *
+ * Obtains @tsk's user and system time through task_cputime() and hands
+ * them to __acct_update_integrals().
+ */
+void acct_update_integrals(struct task_struct *tsk)
+{
+        cputime_t utime, stime;
+        task_cputime(tsk, &utime, &stime);
+        __acct_update_integrals(tsk, utime, stime);
+}
+/**
+ * acct_account_cputime - update mm integral after cputime update
+ * @tsk: task_struct for accounting
+ *
+ * Passes tsk->utime and tsk->stime to __acct_update_integrals() directly
+ * instead of reading them through task_cputime().
+ */
+void acct_account_cputime(struct task_struct *tsk)
+{
+        __acct_update_integrals(tsk, tsk->utime, tsk->stime);
+}
+/**
 * acct_clear_integrals - clear the mm integral fields in task_struct
 * @tsk: task_struct whose accounting fields are cleared
 */
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 75a2ab3d0b02..27689422aa92 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/sysctl.h>
 #include <linux/smpboot.h>
+#include <linux/sched/rt.h>
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fbc6576a83c3..f4feacad3812 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,32 +41,31 @@
 #include <linux/debug_locks.h>
 #include <linux/lockdep.h>
 #include <linux/idr.h>
+#include <linux/hashtable.h>
-#include "workqueue_sched.h"
+#include "workqueue_internal.h"
 enum {
        /*
-         * global_cwq flags
+         * worker_pool flags
         *
-         * A bound gcwq is either associated or disassociated with its CPU.
+         * A bound pool is either associated or disassociated with its CPU.
         * While associated (!DISASSOCIATED), all workers are bound to the
         * CPU and none has %WORKER_UNBOUND set and concurrency management
         * is in effect.
         *
         * While DISASSOCIATED, the cpu may be offline and all workers have
         * %WORKER_UNBOUND set and concurrency management disabled, and may
-         * be executing on any CPU.  The gcwq behaves as an unbound one.
+         * be executing on any CPU.  The pool behaves as an unbound one.
         *
         * Note that DISASSOCIATED can be flipped only while holding
-         * assoc_mutex of all pools on the gcwq to avoid changing binding
+         * assoc_mutex to avoid changing binding state while
-         * state while create_worker() is in progress.
+         * create_worker() is in progress.
         */
-        GCWQ_DISASSOCIATED      = 1 << 0,       /* cpu can't serve workers */
-        GCWQ_FREEZING           = 1 << 1,       /* freeze in progress */
-        /* pool flags */
        POOL_MANAGE_WORKERS     = 1 << 0,       /* need to manage workers */
        POOL_MANAGING_WORKERS   = 1 << 1,       /* managing workers */
+        POOL_DISASSOCIATED      = 1 << 2,       /* cpu can't serve workers */
+        POOL_FREEZING           = 1 << 3,       /* freeze in progress */
        /* worker flags */
        WORKER_STARTED          = 1 << 0,       /* started */
@@ -79,11 +78,9 @@ enum {
        WORKER_NOT_RUNNING      = WORKER_PREP | WORKER_UNBOUND |
                                  WORKER_CPU_INTENSIVE,
-        NR_WORKER_POOLS         = 2,            /* # worker pools per gcwq */
+        NR_STD_WORKER_POOLS     = 2,            /* # standard pools per cpu */
        BUSY_WORKER_HASH_ORDER  = 6,            /* 64 pointers */
-        BUSY_WORKER_HASH_SIZE   = 1 << BUSY_WORKER_HASH_ORDER,
-        BUSY_WORKER_HASH_MASK   = BUSY_WORKER_HASH_SIZE - 1,
        MAX_IDLE_WORKERS_RATIO  = 4,            /* 1/4 of busy can be idle */
        IDLE_WORKER_TIMEOUT     = 300 * HZ,     /* keep idle ones for 5 mins */
@@ -111,48 +108,24 @@ enum {
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
- * L: gcwq->lock protected.  Access with gcwq->lock held.
+ * L: pool->lock protected.  Access with pool->lock held.
 *
- * X: During normal operation, modification requires gcwq->lock and
+ * X: During normal operation, modification requires pool->lock and should
- *    should be done only from local cpu.  Either disabling preemption
+ *    be done only from local cpu.  Either disabling preemption on local
- *    on local cpu or grabbing gcwq->lock is enough for read access.
+ *    cpu or grabbing pool->lock is enough for read access.  If
- *    If GCWQ_DISASSOCIATED is set, it's identical to L.
+ *    POOL_DISASSOCIATED is set, it's identical to L.
 *
 * F: wq->flush_mutex protected.
 *
 * W: workqueue_lock protected.
 */
-struct global_cwq;
+/* struct worker is defined in workqueue_internal.h */
-struct worker_pool;
-/*
- * The poor guys doing the actual heavy lifting.  All on-duty workers
- * are either serving the manager role, on idle list or on busy hash.
- */
-struct worker {
-        /* on idle list while idle, on busy hash table while busy */
-        union {
-                struct list_head        entry;  /* L: while idle */
-                struct hlist_node       hentry; /* L: while busy */
-        };
-        struct work_struct      *current_work;  /* L: work being processed */
-        struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
-        struct list_head        scheduled;      /* L: scheduled works */
-        struct task_struct      *task;          /* I: worker task */
-        struct worker_pool      *pool;          /* I: the associated pool */
-        /* 64 bytes boundary on 64bit, 32 on 32bit */
-        unsigned long           last_active;    /* L: last active timestamp */
-        unsigned int            flags;          /* X: flags */
-        int                     id;             /* I: worker id */
-        /* for rebinding worker to CPU */
-        struct work_struct      rebind_work;    /* L: for busy worker */
-};
 struct worker_pool {
-        struct global_cwq       *gcwq;          /* I: the owning gcwq */
+        spinlock_t              lock;           /* the pool lock */
+        unsigned int            cpu;            /* I: the associated cpu */
+        int                     id;             /* I: pool ID */
        unsigned int            flags;          /* X: flags */
        struct list_head        worklist;       /* L: list of pending works */
@@ -165,34 +138,28 @@ struct worker_pool {
        struct timer_list       idle_timer;     /* L: worker idle timeout */
        struct timer_list       mayday_timer;   /* L: SOS timer for workers */
-        struct mutex            assoc_mutex;    /* protect GCWQ_DISASSOCIATED */
+        /* workers are chained either in busy_hash or idle_list */
-        struct ida              worker_ida;     /* L: for worker IDs */
+        DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
-};
-/*
- * Global per-cpu workqueue.  There's one and only one for each cpu
- * and all works are queued and processed here regardless of their
- * target workqueues.
- */
-struct global_cwq {
-        spinlock_t              lock;           /* the gcwq lock */
-        unsigned int            cpu;            /* I: the associated cpu */
-        unsigned int            flags;          /* L: GCWQ_* flags */
-        /* workers are chained either in busy_hash or pool idle_list */
-        struct hlist_head       busy_hash[BUSY_WORKER_HASH_SIZE];
                                                /* L: hash of busy workers */
-        struct worker_pool      pools[NR_WORKER_POOLS];
+        struct mutex            assoc_mutex;    /* protect POOL_DISASSOCIATED */
-                                                /* normal and highpri pools */
+        struct ida              worker_ida;     /* L: for worker IDs */
+        /*
+         * The current concurrency level.  As it's likely to be accessed
+         * from other CPUs during try_to_wake_up(), put it in a separate
+         * cacheline.
+         */
+        atomic_t                nr_running ____cacheline_aligned_in_smp;
 } ____cacheline_aligned_in_smp;
 /*
- * The per-CPU workqueue.  The lower WORK_STRUCT_FLAG_BITS of
+ * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
- * work_struct->data are used for flags and thus cwqs need to be
+ * of work_struct->data are used for flags and the remaining high bits
- * aligned at two's power of the number of flag bits.
+ * point to the pwq; thus, pwqs need to be aligned at two's power of the
+ * number of flag bits.
 */
-struct cpu_workqueue_struct {
+struct pool_workqueue {
        struct worker_pool      *pool;          /* I: the associated pool */
        struct workqueue_struct *wq;            /* I: the owning workqueue */
        int                     work_color;     /* L: current color */
@@ -241,16 +208,16 @@ typedef unsigned long mayday_mask_t;
 struct workqueue_struct {
        unsigned int            flags;          /* W: WQ_* flags */
        union {
-                struct cpu_workqueue_struct __percpu    *pcpu;
+                struct pool_workqueue __percpu          *pcpu;
-                struct cpu_workqueue_struct             *single;
+                struct pool_workqueue                   *single;
                unsigned long                           v;
-        } cpu_wq;                               /* I: cwq's */
+        } pool_wq;                              /* I: pwq's */
        struct list_head        list;           /* W: list of all workqueues */
        struct mutex            flush_mutex;    /* protects wq flushing */
        int                     work_color;     /* F: current work color */
        int                     flush_color;    /* F: current flush color */
-        atomic_t                nr_cwqs_to_flush; /* flush in progress */
+        atomic_t                nr_pwqs_to_flush; /* flush in progress */
        struct wq_flusher       *first_flusher; /* F: first flusher */
        struct list_head        flusher_queue;  /* F: flush waiters */
        struct list_head        flusher_overflow; /* F: flush overflow list */
@@ -259,7 +226,7 @@ struct workqueue_struct {
        struct worker           *rescuer;       /* I: rescue worker */
        int                     nr_drainers;    /* W: drain in progress */
-        int                     saved_max_active; /* W: saved cwq max_active */
+        int                     saved_max_active; /* W: saved pwq max_active */
 #ifdef CONFIG_LOCKDEP
        struct lockdep_map      lockdep_map;
 #endif
@@ -280,16 +247,15 @@ EXPORT_SYMBOL_GPL(system_freezable_wq);
 #define CREATE_TRACE_POINTS
 #include <trace/events/workqueue.h>
-#define for_each_worker_pool(pool, gcwq)                                \
+#define for_each_std_worker_pool(pool, cpu)                             \
-        for ((pool) = &(gcwq)->pools[0];                                \
+        for ((pool) = &std_worker_pools(cpu)[0];                        \
-             (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++)
+             (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
-#define for_each_busy_worker(worker, i, pos, gcwq)                      \
+#define for_each_busy_worker(worker, i, pos, pool)                      \
-        for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)                     \
+        hash_for_each(pool->busy_hash, i, pos, worker, hentry)
-                hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
-static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
+static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
-                                  unsigned int sw)
+                                unsigned int sw)
 {
        if (cpu < nr_cpu_ids) {
                if (sw & 1) {
@@ -300,42 +266,42 @@ static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
                if (sw & 2)
                        return WORK_CPU_UNBOUND;
        }
-        return WORK_CPU_NONE;
+        return WORK_CPU_END;
 }
-static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
+static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask,
-                                struct workqueue_struct *wq)
+                                 struct workqueue_struct *wq)
 {
-        return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
+        /* pass sw == 1 for a bound @wq and sw == 2 for a WQ_UNBOUND one */
+        return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
 }
 /*
 * CPU iterators
 *
- * An extra gcwq is defined for an invalid cpu number
+ * An extra cpu number is defined using an invalid cpu number
 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
- * specific CPU.  The following iterators are similar to
+ * specific CPU.  The following iterators are similar to for_each_*_cpu()
- * for_each_*_cpu() iterators but also considers the unbound gcwq.
+ * iterators but also consider the unbound CPU.
 *
- * for_each_gcwq_cpu()          : possible CPUs + WORK_CPU_UNBOUND
+ * for_each_wq_cpu()            : possible CPUs + WORK_CPU_UNBOUND
- * for_each_online_gcwq_cpu()   : online CPUs + WORK_CPU_UNBOUND
+ * for_each_online_wq_cpu()     : online CPUs + WORK_CPU_UNBOUND
- * for_each_cwq_cpu()           : possible CPUs for bound workqueues,
+ * for_each_pwq_cpu()           : possible CPUs for bound workqueues,
 *                                WORK_CPU_UNBOUND for unbound workqueues
 */
-#define for_each_gcwq_cpu(cpu)                                          \
+#define for_each_wq_cpu(cpu)                                            \
-        for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3);         \
+        for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3);           \
-             (cpu) < WORK_CPU_NONE;                                     \
+             (cpu) < WORK_CPU_END;                                      \
-             (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3))
+             (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3))
-#define for_each_online_gcwq_cpu(cpu)                                   \
+#define for_each_online_wq_cpu(cpu)                                     \
-        for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3);           \
+        for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3);             \
-             (cpu) < WORK_CPU_NONE;                                     \
+             (cpu) < WORK_CPU_END;                                      \
-             (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3))
+             (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3))
-#define for_each_cwq_cpu(cpu, wq)                                       \
+#define for_each_pwq_cpu(cpu, wq)                                       \
-        for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq));        \
+        for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq));       \
-             (cpu) < WORK_CPU_NONE;                                     \
+             (cpu) < WORK_CPU_END;                                      \
-             (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
+             (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq)))
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
@@ -459,57 +425,69 @@ static LIST_HEAD(workqueues);
 static bool workqueue_freezing;         /* W: have wqs started freezing? */
 /*
- * The almighty global cpu workqueues.  nr_running is the only field
+ * The CPU and unbound standard worker pools.  The unbound ones have
- * which is expected to be used frequently by other cpus via
+ * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
- * try_to_wake_up().  Put it in a separate cacheline.
 */
-static DEFINE_PER_CPU(struct global_cwq, global_cwq);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
-static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]);
+                                     cpu_std_worker_pools);
+static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
-/*
+/* idr of all pools */
- * Global cpu workqueue and nr_running counter for unbound gcwq.  The
+static DEFINE_MUTEX(worker_pool_idr_mutex);
- * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its
+static DEFINE_IDR(worker_pool_idr);
- * workers have WORKER_UNBOUND set.
- */
-static struct global_cwq unbound_global_cwq;
-static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
-        [0 ... NR_WORKER_POOLS - 1]     = ATOMIC_INIT(0),       /* always 0 */
-};
 static int worker_thread(void *__worker);
-static int worker_pool_pri(struct worker_pool *pool)
+static struct worker_pool *std_worker_pools(int cpu)
 {
-        return pool - pool->gcwq->pools;
+        /*
+         * Return the array of standard pools for @cpu: the per-cpu
+         * cpu_std_worker_pools for a real CPU number, or
+         * unbound_std_worker_pools for WORK_CPU_UNBOUND.
+         */
+        if (cpu != WORK_CPU_UNBOUND)
+                return per_cpu(cpu_std_worker_pools, cpu);
+        else
+                return unbound_std_worker_pools;
 }
-static struct global_cwq *get_gcwq(unsigned int cpu)
+static int std_worker_pool_pri(struct worker_pool *pool)
 {
-        if (cpu != WORK_CPU_UNBOUND)
+        /* index of @pool within the standard pool array of pool->cpu */
+        return pool - std_worker_pools(pool->cpu);
-                return &per_cpu(global_cwq, cpu);
-        else
-                return &unbound_global_cwq;
 }
-static atomic_t *get_pool_nr_running(struct worker_pool *pool)
+/* allocate ID and assign it to @pool */
+static int worker_pool_assign_id(struct worker_pool *pool)
 {
-        int cpu = pool->gcwq->cpu;
+        int ret;
-        int idx = worker_pool_pri(pool);
-        if (cpu != WORK_CPU_UNBOUND)
+        /* worker_pool_idr_mutex is held across both idr calls below */
+        mutex_lock(&worker_pool_idr_mutex);
-                return &per_cpu(pool_nr_running, cpu)[idx];
+        /*
+         * NOTE(review): the return value of idr_pre_get() is not checked;
+         * only idr_get_new()'s result is reported to the caller.
+         */
+        idr_pre_get(&worker_pool_idr, GFP_KERNEL);
-        else
+        ret = idr_get_new(&worker_pool_idr, pool, &pool->id);
-                return &unbound_pool_nr_running[idx];
+        mutex_unlock(&worker_pool_idr_mutex);
+        return ret;
 }
-static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
+/*
-                                            struct workqueue_struct *wq)
+ * Lookup worker_pool by id.  The idr currently is built during boot and
+ * never modified.  Don't worry about locking for now.
+ *
+ * Returns whatever idr_find() yields for @pool_id in worker_pool_idr;
+ * worker_pool_idr_mutex is not taken here.
+ */
+static struct worker_pool *worker_pool_by_id(int pool_id)
+{
+        return idr_find(&worker_pool_idr, pool_id);
+}
+/*
+ * Return one of the standard pools of @cpu (WORK_CPU_UNBOUND included,
+ * via std_worker_pools()).  @highpri is used directly as the array index:
+ * false selects element 0, true selects element 1.
+ */
+static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
+{
+        struct worker_pool *pools = std_worker_pools(cpu);
+        return &pools[highpri];
+}
+/*
+ * Return the pool_workqueue of @wq for @cpu.  A bound workqueue yields its
+ * per-cpu pwq when @cpu is below nr_cpu_ids; a WQ_UNBOUND workqueue yields
+ * its single pwq when @cpu is WORK_CPU_UNBOUND.  Any other combination
+ * returns NULL.
+ */
+static struct pool_workqueue *get_pwq(unsigned int cpu,
+                                      struct workqueue_struct *wq)
 {
         if (!(wq->flags & WQ_UNBOUND)) {
                 if (likely(cpu < nr_cpu_ids))
-                        return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
+                        return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
         } else if (likely(cpu == WORK_CPU_UNBOUND))
-                return wq->cpu_wq.single;
+                return wq->pool_wq.single;
         return NULL;
 }
@@ -530,19 +508,19 @@ static int work_next_color(int color)
 }
 /*
- * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data
+ * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
- * contain the pointer to the queued cwq.  Once execution starts, the flag
+ * contain the pointer to the queued pwq.  Once execution starts, the flag
- * is cleared and the high bits contain OFFQ flags and CPU number.
+ * is cleared and the high bits contain OFFQ flags and pool ID.
 *
- * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling()
+ * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
- * and clear_work_data() can be used to set the cwq, cpu or clear
+ * and clear_work_data() can be used to set the pwq, pool or clear
 * work->data.  These functions should only be called while the work is
 * owned - ie. while the PENDING bit is set.
 *
- * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to
+ * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
- * a work.  gcwq is available once the work has been queued anywhere after
+ * corresponding to a work.  Pool is available once the work has been
- * initialization until it is sync canceled.  cwq is available only while
+ * queued anywhere after initialization until it is sync canceled.  pwq is
- * the work item is queued.
+ * available only while the work item is queued.
 *
 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
 * canceled.  While being canceled, a work item may have its PENDING set
@@ -556,16 +534,22 @@ static inline void set_work_data(struct work_struct *work, unsigned long data,
        atomic_long_set(&work->data, data | flags | work_static(work));
 }
-static void set_work_cwq(struct work_struct *work,
+static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
-                         struct cpu_workqueue_struct *cwq,
                          unsigned long extra_flags)
 {
-        set_work_data(work, (unsigned long)cwq,
+        /*
+         * Store the @pwq pointer in work->data together with PENDING, the
+         * PWQ marker flag and any caller-supplied @extra_flags.
+         */
+        set_work_data(work, (unsigned long)pwq,
-                      WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);
+                      WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
 }
-static void set_work_cpu_and_clear_pending(struct work_struct *work,
+static void set_work_pool_and_keep_pending(struct work_struct *work,
-                                           unsigned int cpu)
+                                           int pool_id)
+{
+        /*
+         * Record @pool_id in the off-queue bits of work->data while leaving
+         * WORK_STRUCT_PENDING set; WORK_STRUCT_PWQ is not included.
+         */
+        set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
+                      WORK_STRUCT_PENDING);
+}
+static void set_work_pool_and_clear_pending(struct work_struct *work,
+                                            int pool_id)
 {
        /*
         * The following wmb is paired with the implied mb in
@@ -574,67 +558,92 @@ static void set_work_cpu_and_clear_pending(struct work_struct *work,
         * owner.
         */
        smp_wmb();
-        set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);
+        set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
 }
 static void clear_work_data(struct work_struct *work)
 {
-        smp_wmb();      /* see set_work_cpu_and_clear_pending() */
+        smp_wmb();      /* see set_work_pool_and_clear_pending() */
-        set_work_data(work, WORK_STRUCT_NO_CPU, 0);
+        set_work_data(work, WORK_STRUCT_NO_POOL, 0);
 }
-static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work)
+static struct pool_workqueue *get_work_pwq(struct work_struct *work)
 {
        unsigned long data = atomic_long_read(&work->data);
-        if (data & WORK_STRUCT_CWQ)
+        if (data & WORK_STRUCT_PWQ)
                return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
        else
                return NULL;
 }
-static struct global_cwq *get_work_gcwq(struct work_struct *work)
+/**
+ * get_work_pool - return the worker_pool a given work was associated with
+ * @work: the work item of interest
+ *
+ * Return the worker_pool @work was last associated with.  %NULL if none.
+ */
+static struct worker_pool *get_work_pool(struct work_struct *work)
 {
        unsigned long data = atomic_long_read(&work->data);
-        unsigned int cpu;
+        struct worker_pool *pool;
+        int pool_id;
-        if (data & WORK_STRUCT_CWQ)
+        if (data & WORK_STRUCT_PWQ)
-                return ((struct cpu_workqueue_struct *)
+                return ((struct pool_workqueue *)
-                        (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq;
+                        (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
-        cpu = data >> WORK_OFFQ_CPU_SHIFT;
+        pool_id = data >> WORK_OFFQ_POOL_SHIFT;
-        if (cpu == WORK_CPU_NONE)
+        if (pool_id == WORK_OFFQ_POOL_NONE)
                return NULL;
-        BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND);
+        pool = worker_pool_by_id(pool_id);
-        return get_gcwq(cpu);
+        WARN_ON_ONCE(!pool);
+        return pool;
+}
+/**
+ * get_work_pool_id - return the worker pool ID a given work is associated with
+ * @work: the work item of interest
+ *
+ * Return the worker_pool ID @work was last associated with.
+ * %WORK_OFFQ_POOL_NONE if none.
+ */
+static int get_work_pool_id(struct work_struct *work)
+{
+        unsigned long data = atomic_long_read(&work->data);
+        if (data & WORK_STRUCT_PWQ)
+                return ((struct pool_workqueue *)
+                        (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
+        return data >> WORK_OFFQ_POOL_SHIFT;
 }
 static void mark_work_canceling(struct work_struct *work)
 {
-        struct global_cwq *gcwq = get_work_gcwq(work);
+        unsigned long pool_id = get_work_pool_id(work);
-        unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
-        set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING,
+        pool_id <<= WORK_OFFQ_POOL_SHIFT;
-                      WORK_STRUCT_PENDING);
+        set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
 }
 static bool work_is_canceling(struct work_struct *work)
 {
        unsigned long data = atomic_long_read(&work->data);
-        return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING);
+        return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
 }
 /*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
- * they're being called with gcwq->lock held.
+ * they're being called with pool->lock held.
 */
 static bool __need_more_worker(struct worker_pool *pool)
 {
-        return !atomic_read(get_pool_nr_running(pool));
+        return !atomic_read(&pool->nr_running);
 }
 /*
@@ -642,7 +651,7 @@ static bool __need_more_worker(struct worker_pool *pool)
 * running workers.
 *
 * Note that, because unbound workers never contribute to nr_running, this
- * function will always return %true for unbound gcwq as long as the
+ * function will always return %true for unbound pools as long as the
 * worklist isn't empty.
 */
 static bool need_more_worker(struct worker_pool *pool)
@@ -659,9 +668,8 @@ static bool may_start_working(struct worker_pool *pool)
 /* Do I need to keep working?  Called from currently running workers. */
 static bool keep_working(struct worker_pool *pool)
 {
-        atomic_t *nr_running = get_pool_nr_running(pool);
+        return !list_empty(&pool->worklist) &&
+                atomic_read(&pool->nr_running) <= 1;
-        return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
 }
 /* Do we need a new worker?  Called from manager. */
@@ -714,7 +722,7 @@ static struct worker *first_worker(struct worker_pool *pool)
 * Wake up the first idle worker of @pool.
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
 */
 static void wake_up_worker(struct worker_pool *pool)
 {
@@ -740,8 +748,8 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
        struct worker *worker = kthread_data(task);
        if (!(worker->flags & WORKER_NOT_RUNNING)) {
-                WARN_ON_ONCE(worker->pool->gcwq->cpu != cpu);
+                WARN_ON_ONCE(worker->pool->cpu != cpu);
-                atomic_inc(get_pool_nr_running(worker->pool));
+                atomic_inc(&worker->pool->nr_running);
        }
 }
@@ -764,12 +772,18 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
                                       unsigned int cpu)
 {
        struct worker *worker = kthread_data(task), *to_wakeup = NULL;
-        struct worker_pool *pool = worker->pool;
+        struct worker_pool *pool;
-        atomic_t *nr_running = get_pool_nr_running(pool);
+        /*
+         * Rescuers, which may not have all the fields set up like normal
+         * workers, also reach here.  Don't access anything before
+         * checking NOT_RUNNING.
+         */
        if (worker->flags & WORKER_NOT_RUNNING)
                return NULL;
+        pool = worker->pool;
        /* this can only happen on the local cpu */
        BUG_ON(cpu != raw_smp_processor_id());
@@ -781,10 +795,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
         * NOT_RUNNING is clear.  This means that we're bound to and
         * running on the local cpu w/ rq lock held and preemption
         * disabled, which in turn means that none else could be
-         * manipulating idle_list, so dereferencing idle_list without gcwq
+         * manipulating idle_list, so dereferencing idle_list without pool
         * lock is safe.
         */
-        if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist))
+        if (atomic_dec_and_test(&pool->nr_running) &&
+            !list_empty(&pool->worklist))
                to_wakeup = first_worker(pool);
        return to_wakeup ? to_wakeup->task : NULL;
 }
@@ -800,7 +815,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
 * woken up.
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock)
+ * spin_lock_irq(pool->lock)
 */
 static inline void worker_set_flags(struct worker *worker, unsigned int flags,
                                    bool wakeup)
@@ -816,14 +831,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
         */
        if ((flags & WORKER_NOT_RUNNING) &&
            !(worker->flags & WORKER_NOT_RUNNING)) {
-                atomic_t *nr_running = get_pool_nr_running(pool);
                if (wakeup) {
-                        if (atomic_dec_and_test(nr_running) &&
+                        if (atomic_dec_and_test(&pool->nr_running) &&
                            !list_empty(&pool->worklist))
                                wake_up_worker(pool);
                } else
-                        atomic_dec(nr_running);
+                        atomic_dec(&pool->nr_running);
        }
        worker->flags |= flags;
@@ -837,7 +850,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
 * Clear @flags in @worker->flags and adjust nr_running accordingly.
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock)
+ * spin_lock_irq(pool->lock)
 */
 static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 {
@@ -855,87 +868,56 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
         */
        if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
                if (!(worker->flags & WORKER_NOT_RUNNING))
-                        atomic_inc(get_pool_nr_running(pool));
+                        atomic_inc(&pool->nr_running);
 }
 /**
- * busy_worker_head - return the busy hash head for a work
+ * find_worker_executing_work - find worker which is executing a work
- * @gcwq: gcwq of interest
+ * @pool: pool of interest
- * @work: work to be hashed
- *
- * Return hash head of @gcwq for @work.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- *
- * RETURNS:
- * Pointer to the hash head.
- */
-static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
-                                           struct work_struct *work)
-{
-        const int base_shift = ilog2(sizeof(struct work_struct));
-        unsigned long v = (unsigned long)work;
-        /* simple shift and fold hash, do we need something better? */
-        v >>= base_shift;
-        v += v >> BUSY_WORKER_HASH_ORDER;
-        v &= BUSY_WORKER_HASH_MASK;
-        return &gcwq->busy_hash[v];
-}
-/**
- * __find_worker_executing_work - find worker which is executing a work
- * @gcwq: gcwq of interest
- * @bwh: hash head as returned by busy_worker_head()
 * @work: work to find worker for
 *
- * Find a worker which is executing @work on @gcwq.  @bwh should be
+ * Find a worker which is executing @work on @pool by searching
- * the hash head obtained by calling busy_worker_head() with the same
+ * @pool->busy_hash which is keyed by the address of @work.  For a worker
- * work.
+ * to match, its current execution should match the address of @work and
+ * its work function.  This is to avoid unwanted dependency between
+ * unrelated work executions through a work item being recycled while still
+ * being executed.
+ *
+ * This is a bit tricky.  A work item may be freed once its execution
+ * starts and nothing prevents the freed area from being recycled for
+ * another work item.  If the same work item address ends up being reused
+ * before the original execution finishes, workqueue will identify the
+ * recycled work item as currently executing and make it wait until the
+ * current execution finishes, introducing an unwanted dependency.
+ *
+ * This function checks the work item address and work function
+ * to avoid false positives.  Note that this isn't complete as one may
+ * construct a work function which can introduce dependency onto itself
+ * through a recycled work item.  Well, if somebody wants to shoot oneself
+ * in the foot that badly, there's only so much we can do, and if such
+ * deadlock actually occurs, it should be easy to locate the culprit work
+ * function.
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
 *
 * RETURNS:
 * Pointer to worker which is executing @work if found, NULL
 * otherwise.
 */
-static struct worker *__find_worker_executing_work(struct global_cwq *gcwq,
+static struct worker *find_worker_executing_work(struct worker_pool *pool,
-                                                   struct hlist_head *bwh,
+                                                 struct work_struct *work)
-                                                   struct work_struct *work)
 {
        struct worker *worker;
        struct hlist_node *tmp;
-        hlist_for_each_entry(worker, tmp, bwh, hentry)
+        hash_for_each_possible(pool->busy_hash, worker, tmp, hentry,
-                if (worker->current_work == work)
+                               (unsigned long)work)
+                if (worker->current_work == work &&
+                    worker->current_func == work->func)
                        return worker;
-        return NULL;
-}
-/**
+        return NULL;
- * find_worker_executing_work - find worker which is executing a work
- * @gcwq: gcwq of interest
- * @work: work to find worker for
- *
- * Find a worker which is executing @work on @gcwq.  This function is
- * identical to __find_worker_executing_work() except that this
- * function calculates @bwh itself.
- *
- * CONTEXT:
- * spin_lock_irq(gcwq->lock).
- *
- * RETURNS:
- * Pointer to worker which is executing @work if found, NULL
- * otherwise.
- */
-static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
-                                                 struct work_struct *work)
-{
-        return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
-                                            work);
 }
 /**
@@ -953,7 +935,7 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
 * nested inside outer list_for_each_entry_safe().
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
 */
 static void move_linked_works(struct work_struct *work, struct list_head *head,
                              struct work_struct **nextp)
@@ -979,67 +961,67 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
                *nextp = n;
 }
-static void cwq_activate_delayed_work(struct work_struct *work)
+static void pwq_activate_delayed_work(struct work_struct *work)
 {
-        struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+        struct pool_workqueue *pwq = get_work_pwq(work);
        trace_workqueue_activate_work(work);
-        move_linked_works(work, &cwq->pool->worklist, NULL);
+        move_linked_works(work, &pwq->pool->worklist, NULL);
        __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
-        cwq->nr_active++;
+        pwq->nr_active++;
 }
-static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
 {
-        struct work_struct *work = list_first_entry(&cwq->delayed_works,
+        struct work_struct *work = list_first_entry(&pwq->delayed_works,
                                                    struct work_struct, entry);
-        cwq_activate_delayed_work(work);
+        pwq_activate_delayed_work(work);
 }
 /**
- * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
+ * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
- * @cwq: cwq of interest
+ * @pwq: pwq of interest
 * @color: color of work which left the queue
 *
 * A work either has completed or is removed from pending queue,
- * decrement nr_in_flight of its cwq and handle workqueue flushing.
+ * decrement nr_in_flight of its pwq and handle workqueue flushing.
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
 */
-static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
+static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
 {
        /* ignore uncolored works */
        if (color == WORK_NO_COLOR)
                return;
-        cwq->nr_in_flight[color]--;
+        pwq->nr_in_flight[color]--;
-        cwq->nr_active--;
+        pwq->nr_active--;
-        if (!list_empty(&cwq->delayed_works)) {
+        if (!list_empty(&pwq->delayed_works)) {
                /* one down, submit a delayed one */
-                if (cwq->nr_active < cwq->max_active)
+                if (pwq->nr_active < pwq->max_active)
-                        cwq_activate_first_delayed(cwq);
+                        pwq_activate_first_delayed(pwq);
        }
        /* is flush in progress and are we at the flushing tip? */
-        if (likely(cwq->flush_color != color))
+        if (likely(pwq->flush_color != color))
                return;
        /* are there still in-flight works? */
-        if (cwq->nr_in_flight[color])
+        if (pwq->nr_in_flight[color])
                return;
-        /* this cwq is done, clear flush_color */
+        /* this pwq is done, clear flush_color */
-        cwq->flush_color = -1;
+        pwq->flush_color = -1;
        /*
-         * If this was the last cwq, wake up the first flusher.  It
+         * If this was the last pwq, wake up the first flusher.  It
         * will handle the rest.
         */
-        if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
+        if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
-                complete(&cwq->wq->first_flusher->done);
+                complete(&pwq->wq->first_flusher->done);
 }
 /**
@@ -1070,7 +1052,8 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
 static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
                               unsigned long *flags)
 {
-        struct global_cwq *gcwq;
+        struct worker_pool *pool;
+        struct pool_workqueue *pwq;
        local_irq_save(*flags);
@@ -1095,41 +1078,43 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
         * The queueing is in progress, or it is already queued. Try to
         * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
         */
-        gcwq = get_work_gcwq(work);
+        pool = get_work_pool(work);
-        if (!gcwq)
+        if (!pool)
                goto fail;
-        spin_lock(&gcwq->lock);
+        spin_lock(&pool->lock);
-        if (!list_empty(&work->entry)) {
+        /*
+         * work->data is guaranteed to point to pwq only while the work
+         * item is queued on pwq->wq, and both updating work->data to point
+         * to pwq on queueing and to pool on dequeueing are done under
+         * pwq->pool->lock.  This in turn guarantees that, if work->data
+         * points to pwq which is associated with a locked pool, the work
+         * item is currently queued on that pool.
+         */
+        pwq = get_work_pwq(work);
+        if (pwq && pwq->pool == pool) {
+                debug_work_deactivate(work);
                /*
-                 * This work is queued, but perhaps we locked the wrong gcwq.
+                 * A delayed work item cannot be grabbed directly because
-                 * In that case we must see the new value after rmb(), see
+                 * it might have linked NO_COLOR work items which, if left
-                 * insert_work()->wmb().
+                 * on the delayed_list, will confuse pwq->nr_active
+                 * management later on and cause stall.  Make sure the work
+                 * item is activated before grabbing.
                 */
-                smp_rmb();
+                if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
-                if (gcwq == get_work_gcwq(work)) {
+                        pwq_activate_delayed_work(work);
-                        debug_work_deactivate(work);
-                        /*
+                list_del_init(&work->entry);
-                         * A delayed work item cannot be grabbed directly
+                pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
-                         * because it might have linked NO_COLOR work items
-                         * which, if left on the delayed_list, will confuse
-                         * cwq->nr_active management later on and cause
-                         * stall.  Make sure the work item is activated
-                         * before grabbing.
-                         */
-                        if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
-                                cwq_activate_delayed_work(work);
-                        list_del_init(&work->entry);
+                /* work->data points to pwq iff queued, point to pool */
-                        cwq_dec_nr_in_flight(get_work_cwq(work),
+                set_work_pool_and_keep_pending(work, pool->id);
-                                get_work_color(work));
-                        spin_unlock(&gcwq->lock);
+                spin_unlock(&pool->lock);
-                        return 1;
+                return 1;
-                }
        }
-        spin_unlock(&gcwq->lock);
+        spin_unlock(&pool->lock);
 fail:
        local_irq_restore(*flags);
        if (work_is_canceling(work))
@@ -1139,33 +1124,25 @@ fail:
 }
 /**
- * insert_work - insert a work into gcwq
+ * insert_work - insert a work into a pool
- * @cwq: cwq @work belongs to
+ * @pwq: pwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
- * Insert @work which belongs to @cwq into @gcwq after @head.
+ * Insert @work which belongs to @pwq after @head.  @extra_flags is or'd to
- * @extra_flags is or'd to work_struct flags.
+ * work_struct flags.
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
 */
-static void insert_work(struct cpu_workqueue_struct *cwq,
+static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
-                        struct work_struct *work, struct list_head *head,
+                        struct list_head *head, unsigned int extra_flags)
-                        unsigned int extra_flags)
 {
-        struct worker_pool *pool = cwq->pool;
+        struct worker_pool *pool = pwq->pool;
        /* we own @work, set data and link */
-        set_work_cwq(work, cwq, extra_flags);
+        set_work_pwq(work, pwq, extra_flags);
-        /*
-         * Ensure that we get the right work->data if we see the
-         * result of list_add() below, see try_to_grab_pending().
-         */
-        smp_wmb();
        list_add_tail(&work->entry, head);
        /*
@@ -1181,41 +1158,24 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 /*
 * Test whether @work is being queued from another work executing on the
- * same workqueue.  This is rather expensive and should only be used from
+ * same workqueue.
- * cold paths.
 */
 static bool is_chained_work(struct workqueue_struct *wq)
 {
-        unsigned long flags;
+        struct worker *worker;
-        unsigned int cpu;
-        for_each_gcwq_cpu(cpu) {
-                struct global_cwq *gcwq = get_gcwq(cpu);
-                struct worker *worker;
-                struct hlist_node *pos;
-                int i;
-                spin_lock_irqsave(&gcwq->lock, flags);
+        worker = current_wq_worker();
-                for_each_busy_worker(worker, i, pos, gcwq) {
+        /*
-                        if (worker->task != current)
+         * Return %true iff I'm a worker executing a work item on @wq.  If
-                                continue;
+         * I'm @worker, it's safe to dereference it without locking.
-                        spin_unlock_irqrestore(&gcwq->lock, flags);
+         */
-                        /*
+        return worker && worker->current_pwq->wq == wq;
-                         * I'm @worker, no locking necessary.  See if @work
-                         * is headed to the same workqueue.
-                         */
-                        return worker->current_cwq->wq == wq;
-                }
-                spin_unlock_irqrestore(&gcwq->lock, flags);
-        }
-        return false;
 }
 static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
 {
-        struct global_cwq *gcwq;
+        struct pool_workqueue *pwq;
-        struct cpu_workqueue_struct *cwq;
        struct list_head *worklist;
        unsigned int work_flags;
        unsigned int req_cpu = cpu;
@@ -1235,9 +1195,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
            WARN_ON_ONCE(!is_chained_work(wq)))
                return;
-        /* determine gcwq to use */
+        /* determine the pwq to use */
        if (!(wq->flags & WQ_UNBOUND)) {
-                struct global_cwq *last_gcwq;
+                struct worker_pool *last_pool;
                if (cpu == WORK_CPU_UNBOUND)
                        cpu = raw_smp_processor_id();
@@ -1248,55 +1208,54 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
                 * work needs to be queued on that cpu to guarantee
                 * non-reentrancy.
                 */
-                gcwq = get_gcwq(cpu);
+                pwq = get_pwq(cpu, wq);
-                last_gcwq = get_work_gcwq(work);
+                last_pool = get_work_pool(work);
-                if (last_gcwq && last_gcwq != gcwq) {
+                if (last_pool && last_pool != pwq->pool) {
                        struct worker *worker;
-                        spin_lock(&last_gcwq->lock);
+                        spin_lock(&last_pool->lock);
-                        worker = find_worker_executing_work(last_gcwq, work);
+                        worker = find_worker_executing_work(last_pool, work);
-                        if (worker && worker->current_cwq->wq == wq)
+                        if (worker && worker->current_pwq->wq == wq) {
-                                gcwq = last_gcwq;
+                                pwq = get_pwq(last_pool->cpu, wq);
-                        else {
+                        } else {
                                /* meh... not running there, queue here */
-                                spin_unlock(&last_gcwq->lock);
+                                spin_unlock(&last_pool->lock);
-                                spin_lock(&gcwq->lock);
+                                spin_lock(&pwq->pool->lock);
                        }
                } else {
-                        spin_lock(&gcwq->lock);
+                        spin_lock(&pwq->pool->lock);
                }
        } else {
-                gcwq = get_gcwq(WORK_CPU_UNBOUND);
+                pwq = get_pwq(WORK_CPU_UNBOUND, wq);
-                spin_lock(&gcwq->lock);
+                spin_lock(&pwq->pool->lock);
        }
-        /* gcwq determined, get cwq and queue */
+        /* pwq determined, queue */
-        cwq = get_cwq(gcwq->cpu, wq);
+        trace_workqueue_queue_work(req_cpu, pwq, work);
-        trace_workqueue_queue_work(req_cpu, cwq, work);
        if (WARN_ON(!list_empty(&work->entry))) {
-                spin_unlock(&gcwq->lock);
+                spin_unlock(&pwq->pool->lock);
                return;
        }
-        cwq->nr_in_flight[cwq->work_color]++;
+        pwq->nr_in_flight[pwq->work_color]++;
-        work_flags = work_color_to_flags(cwq->work_color);
+        work_flags = work_color_to_flags(pwq->work_color);
-        if (likely(cwq->nr_active < cwq->max_active)) {
+        if (likely(pwq->nr_active < pwq->max_active)) {
                trace_workqueue_activate_work(work);
-                cwq->nr_active++;
+                pwq->nr_active++;
-                worklist = &cwq->pool->worklist;
+                worklist = &pwq->pool->worklist;
        } else {
                work_flags |= WORK_STRUCT_DELAYED;
-                worklist = &cwq->delayed_works;
+                worklist = &pwq->delayed_works;
        }
-        insert_work(cwq, work, worklist, work_flags);
+        insert_work(pwq, work, worklist, work_flags);
-        spin_unlock(&gcwq->lock);
+        spin_unlock(&pwq->pool->lock);
 }
 /**
@@ -1347,19 +1306,17 @@ EXPORT_SYMBOL_GPL(queue_work);
 void delayed_work_timer_fn(unsigned long __data)
 {
        struct delayed_work *dwork = (struct delayed_work *)__data;
-        struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
        /* should have been called from irqsafe timer with irq already off */
-        __queue_work(dwork->cpu, cwq->wq, &dwork->work);
+        __queue_work(dwork->cpu, dwork->wq, &dwork->work);
 }
-EXPORT_SYMBOL_GPL(delayed_work_timer_fn);
+EXPORT_SYMBOL(delayed_work_timer_fn);
 static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
                                struct delayed_work *dwork, unsigned long delay)
 {
        struct timer_list *timer = &dwork->timer;
        struct work_struct *work = &dwork->work;
-        unsigned int lcpu;
        WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
                     timer->data != (unsigned long)dwork);
@@ -1379,30 +1336,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
        timer_stats_timer_set_start_info(&dwork->timer);
-        /*
+        dwork->wq = wq;
-         * This stores cwq for the moment, for the timer_fn.  Note that the
-         * work's gcwq is preserved to allow reentrance detection for
-         * delayed works.
-         */
-        if (!(wq->flags & WQ_UNBOUND)) {
-                struct global_cwq *gcwq = get_work_gcwq(work);
-                /*
-                 * If we cannot get the last gcwq from @work directly,
-                 * select the last CPU such that it avoids unnecessarily
-                 * triggering non-reentrancy check in __queue_work().
-                 */
-                lcpu = cpu;
-                if (gcwq)
-                        lcpu = gcwq->cpu;
-                if (lcpu == WORK_CPU_UNBOUND)
-                        lcpu = raw_smp_processor_id();
-        } else {
-                lcpu = WORK_CPU_UNBOUND;
-        }
-        set_work_cwq(work, get_cwq(lcpu, wq), 0);
        dwork->cpu = cpu;
        timer->expires = jiffies + delay;
@@ -1519,12 +1453,11 @@ EXPORT_SYMBOL_GPL(mod_delayed_work);
 * necessary.
 *
 * LOCKING:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
 */
 static void worker_enter_idle(struct worker *worker)
 {
        struct worker_pool *pool = worker->pool;
-        struct global_cwq *gcwq = pool->gcwq;
        BUG_ON(worker->flags & WORKER_IDLE);
        BUG_ON(!list_empty(&worker->entry) &&
@@ -1542,14 +1475,14 @@ static void worker_enter_idle(struct worker *worker)
                mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
        /*
-         * Sanity check nr_running.  Because gcwq_unbind_fn() releases
+         * Sanity check nr_running.  Because wq_unbind_fn() releases
-         * gcwq->lock between setting %WORKER_UNBOUND and zapping
+         * pool->lock between setting %WORKER_UNBOUND and zapping
         * nr_running, the warning may trigger spuriously.  Check iff
         * unbind is not in progress.
         */
-        WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) &&
+        WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
                     pool->nr_workers == pool->nr_idle &&
-                     atomic_read(get_pool_nr_running(pool)));
+                     atomic_read(&pool->nr_running));
 }
 /**
@@ -1559,7 +1492,7 @@ static void worker_enter_idle(struct worker *worker)
 * @worker is leaving idle state.  Update stats.
 *
 * LOCKING:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
 */
 static void worker_leave_idle(struct worker *worker)
 {
@@ -1572,7 +1505,7 @@ static void worker_leave_idle(struct worker *worker)
 }
 /**
- * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq
+ * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool
 * @worker: self
 *
 * Works which are scheduled while the cpu is online must at least be
@@ -1584,27 +1517,27 @@ static void worker_leave_idle(struct worker *worker)
 * themselves to the target cpu and may race with cpu going down or
 * coming online.  kthread_bind() can't be used because it may put the
 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
- * verbatim as it's best effort and blocking and gcwq may be
+ * verbatim as it's best effort and blocking and pool may be
 * [dis]associated in the meantime.
 *
- * This function tries set_cpus_allowed() and locks gcwq and verifies the
+ * This function tries set_cpus_allowed() and locks pool and verifies the
- * binding against %GCWQ_DISASSOCIATED which is set during
+ * binding against %POOL_DISASSOCIATED which is set during
 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
 * enters idle state or fetches works without dropping lock, it can
 * guarantee the scheduling requirement described in the first paragraph.
 *
 * CONTEXT:
- * Might sleep.  Called without any lock but returns with gcwq->lock
+ * Might sleep.  Called without any lock but returns with pool->lock
 * held.
 *
 * RETURNS:
- * %true if the associated gcwq is online (@worker is successfully
+ * %true if the associated pool is online (@worker is successfully
 * bound), %false if offline.
 */
 static bool worker_maybe_bind_and_lock(struct worker *worker)
-__acquires(&gcwq->lock)
+__acquires(&pool->lock)
 {
-        struct global_cwq *gcwq = worker->pool->gcwq;
+        struct worker_pool *pool = worker->pool;
        struct task_struct *task = worker->task;
        while (true) {
@@ -1612,19 +1545,19 @@ __acquires(&gcwq->lock)
                 * The following call may fail, succeed or succeed
                 * without actually migrating the task to the cpu if
                 * it races with cpu hotunplug operation.  Verify
-                 * against GCWQ_DISASSOCIATED.
+                 * against POOL_DISASSOCIATED.
                 */
-                if (!(gcwq->flags & GCWQ_DISASSOCIATED))
+                if (!(pool->flags & POOL_DISASSOCIATED))
-                        set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu));
+                        set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu));
-                spin_lock_irq(&gcwq->lock);
+                spin_lock_irq(&pool->lock);
-                if (gcwq->flags & GCWQ_DISASSOCIATED)
+                if (pool->flags & POOL_DISASSOCIATED)
                        return false;
-                if (task_cpu(task) == gcwq->cpu &&
+                if (task_cpu(task) == pool->cpu &&
                    cpumask_equal(&current->cpus_allowed,
-                                  get_cpu_mask(gcwq->cpu)))
+                                  get_cpu_mask(pool->cpu)))
                        return true;
-                spin_unlock_irq(&gcwq->lock);
+                spin_unlock_irq(&pool->lock);
                /*
                 * We've raced with CPU hot[un]plug.  Give it a breather
@@ -1643,15 +1576,13 @@ __acquires(&gcwq->lock)
 */
 static void idle_worker_rebind(struct worker *worker)
 {
-        struct global_cwq *gcwq = worker->pool->gcwq;
        /* CPU may go down again inbetween, clear UNBOUND only on success */
        if (worker_maybe_bind_and_lock(worker))
                worker_clr_flags(worker, WORKER_UNBOUND);
        /* rebind complete, become available again */
        list_add(&worker->entry, &worker->pool->idle_list);
-        spin_unlock_irq(&gcwq->lock);
+        spin_unlock_irq(&worker->pool->lock);
 }
 /*
@@ -1663,19 +1594,18 @@ static void idle_worker_rebind(struct worker *worker)
 static void busy_worker_rebind_fn(struct work_struct *work)
 {
        struct worker *worker = container_of(work, struct worker, rebind_work);
-        struct global_cwq *gcwq = worker->pool->gcwq;
        if (worker_maybe_bind_and_lock(worker))
                worker_clr_flags(worker, WORKER_UNBOUND);
-        spin_unlock_irq(&gcwq->lock);
+        spin_unlock_irq(&worker->pool->lock);
 }
 /**
- * rebind_workers - rebind all workers of a gcwq to the associated CPU
+ * rebind_workers - rebind all workers of a pool to the associated CPU
- * @gcwq: gcwq of interest
+ * @pool: pool of interest
 *
- * @gcwq->cpu is coming online.  Rebind all workers to the CPU.  Rebinding
+ * @pool->cpu is coming online.  Rebind all workers to the CPU.  Rebinding
 * is different for idle and busy ones.
 *
 * Idle ones will be removed from the idle_list and woken up.  They will
@@ -1693,38 +1623,32 @@ static void busy_worker_rebind_fn(struct work_struct *work)
 * including the manager will not appear on @idle_list until rebind is
 * complete, making local wake-ups safe.
 */
-static void rebind_workers(struct global_cwq *gcwq)
+static void rebind_workers(struct worker_pool *pool)
 {
-        struct worker_pool *pool;
        struct worker *worker, *n;
        struct hlist_node *pos;
        int i;
-        lockdep_assert_held(&gcwq->lock);
+        lockdep_assert_held(&pool->assoc_mutex);
+        lockdep_assert_held(&pool->lock);
-        for_each_worker_pool(pool, gcwq)
-                lockdep_assert_held(&pool->assoc_mutex);
        /* dequeue and kick idle ones */
-        for_each_worker_pool(pool, gcwq) {
+        list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
-                list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
+                /*
-                        /*
+                 * idle workers should be off @pool->idle_list until rebind
-                         * idle workers should be off @pool->idle_list
+                 * is complete to avoid receiving premature local wake-ups.
-                         * until rebind is complete to avoid receiving
+                 */
-                         * premature local wake-ups.
+                list_del_init(&worker->entry);
-                         */
-                        list_del_init(&worker->entry);
-                        /*
+                /*
-                         * worker_thread() will see the above dequeuing
+                 * worker_thread() will see the above dequeuing and call
-                         * and call idle_worker_rebind().
+                 * idle_worker_rebind().
-                         */
+                 */
-                        wake_up_process(worker->task);
+                wake_up_process(worker->task);
-                }
        }
        /* rebind busy workers */
-        for_each_busy_worker(worker, i, pos, gcwq) {
+        for_each_busy_worker(worker, i, pos, pool) {
                struct work_struct *rebind_work = &worker->rebind_work;
                struct workqueue_struct *wq;
@@ -1736,16 +1660,16 @@ static void rebind_workers(struct global_cwq *gcwq)
                /*
                 * wq doesn't really matter but let's keep @worker->pool
-                 * and @cwq->pool consistent for sanity.
+                 * and @pwq->pool consistent for sanity.
                 */
-                if (worker_pool_pri(worker->pool))
+                if (std_worker_pool_pri(worker->pool))
                        wq = system_highpri_wq;
                else
                        wq = system_wq;
-                insert_work(get_cwq(gcwq->cpu, wq), rebind_work,
+                insert_work(get_pwq(pool->cpu, wq), rebind_work,
-                        worker->scheduled.next,
+                            worker->scheduled.next,
-                        work_color_to_flags(WORK_NO_COLOR));
+                            work_color_to_flags(WORK_NO_COLOR));
        }
 }
@@ -1780,19 +1704,18 @@ static struct worker *alloc_worker(void)
 */
 static struct worker *create_worker(struct worker_pool *pool)
 {
-        struct global_cwq *gcwq = pool->gcwq;
+        const char *pri = std_worker_pool_pri(pool) ? "H" : "";
-        const char *pri = worker_pool_pri(pool) ? "H" : "";
        struct worker *worker = NULL;
        int id = -1;
-        spin_lock_irq(&gcwq->lock);
+        spin_lock_irq(&pool->lock);
        while (ida_get_new(&pool->worker_ida, &id)) {
-                spin_unlock_irq(&gcwq->lock);
+                spin_unlock_irq(&pool->lock);
                if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
                        goto fail;
-                spin_lock_irq(&gcwq->lock);
+                spin_lock_irq(&pool->lock);
        }
-        spin_unlock_irq(&gcwq->lock);
+        spin_unlock_irq(&pool->lock);
        worker = alloc_worker();
        if (!worker)
@@ -1801,30 +1724,30 @@ static struct worker *create_worker(struct worker_pool *pool)
        worker->pool = pool;
        worker->id = id;
-        if (gcwq->cpu != WORK_CPU_UNBOUND)
+        if (pool->cpu != WORK_CPU_UNBOUND)
                worker->task = kthread_create_on_node(worker_thread,
-                                        worker, cpu_to_node(gcwq->cpu),
+                                        worker, cpu_to_node(pool->cpu),
-                                        "kworker/%u:%d%s", gcwq->cpu, id, pri);
+                                        "kworker/%u:%d%s", pool->cpu, id, pri);
        else
                worker->task = kthread_create(worker_thread, worker,
                                              "kworker/u:%d%s", id, pri);
        if (IS_ERR(worker->task))
                goto fail;
-        if (worker_pool_pri(pool))
+        if (std_worker_pool_pri(pool))
                set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
        /*
         * Determine CPU binding of the new worker depending on
-         * %GCWQ_DISASSOCIATED.  The caller is responsible for ensuring the
+         * %POOL_DISASSOCIATED.  The caller is responsible for ensuring the
         * flag remains stable across this function.  See the comments
         * above the flag definition for details.
         *
         * As an unbound worker may later become a regular one if CPU comes
         * online, make sure every worker has %PF_THREAD_BOUND set.
         */
-        if (!(gcwq->flags & GCWQ_DISASSOCIATED)) {
+        if (!(pool->flags & POOL_DISASSOCIATED)) {
-                kthread_bind(worker->task, gcwq->cpu);
+                kthread_bind(worker->task, pool->cpu);
        } else {
                worker->task->flags |= PF_THREAD_BOUND;
                worker->flags |= WORKER_UNBOUND;
@@ -1833,9 +1756,9 @@ static struct worker *create_worker(struct worker_pool *pool)
        return worker;
 fail:
        if (id >= 0) {
-                spin_lock_irq(&gcwq->lock);
+                spin_lock_irq(&pool->lock);
                ida_remove(&pool->worker_ida, id);
-                spin_unlock_irq(&gcwq->lock);
+                spin_unlock_irq(&pool->lock);
        }
        kfree(worker);
        return NULL;
@@ -1845,10 +1768,10 @@ fail:
 * start_worker - start a newly created worker
 * @worker: worker to start
 *
- * Make the gcwq aware of @worker and start it.
+ * Make the pool aware of @worker and start it.
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
 */
 static void start_worker(struct worker *worker)
 {
@@ -1862,15 +1785,14 @@ static void start_worker(struct worker *worker)
 * destroy_worker - destroy a workqueue worker
 * @worker: worker to be destroyed
 *
- * Destroy @worker and adjust @gcwq stats accordingly.
+ * Destroy @worker and adjust @pool stats accordingly.
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock) which is released and regrabbed.
+ * spin_lock_irq(pool->lock) which is released and regrabbed.
 */
 static void destroy_worker(struct worker *worker)
 {
        struct worker_pool *pool = worker->pool;
-        struct global_cwq *gcwq = pool->gcwq;
        int id = worker->id;
        /* sanity check frenzy */
@@ -1885,21 +1807,20 @@ static void destroy_worker(struct worker *worker)
        list_del_init(&worker->entry);
        worker->flags |= WORKER_DIE;
-        spin_unlock_irq(&gcwq->lock);
+        spin_unlock_irq(&pool->lock);
        kthread_stop(worker->task);
        kfree(worker);
-        spin_lock_irq(&gcwq->lock);
+        spin_lock_irq(&pool->lock);
        ida_remove(&pool->worker_ida, id);
 }
 static void idle_worker_timeout(unsigned long __pool)
 {
        struct worker_pool *pool = (void *)__pool;
-        struct global_cwq *gcwq = pool->gcwq;
-        spin_lock_irq(&gcwq->lock);
+        spin_lock_irq(&pool->lock);
        if (too_many_workers(pool)) {
                struct worker *worker;
@@ -1918,20 +1839,20 @@ static void idle_worker_timeout(unsigned long __pool)
                }
        }
-        spin_unlock_irq(&gcwq->lock);
+        spin_unlock_irq(&pool->lock);
 }
 static bool send_mayday(struct work_struct *work)
 {
-        struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+        struct pool_workqueue *pwq = get_work_pwq(work);
-        struct workqueue_struct *wq = cwq->wq;
+        struct workqueue_struct *wq = pwq->wq;
        unsigned int cpu;
        if (!(wq->flags & WQ_RESCUER))
                return false;
        /* mayday mayday mayday */
-        cpu = cwq->pool->gcwq->cpu;
+        cpu = pwq->pool->cpu;
        /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
        if (cpu == WORK_CPU_UNBOUND)
                cpu = 0;
@@ -1940,13 +1861,12 @@ static bool send_mayday(struct work_struct *work)
        return true;
 }
-static void gcwq_mayday_timeout(unsigned long __pool)
+static void pool_mayday_timeout(unsigned long __pool)
 {
        struct worker_pool *pool = (void *)__pool;
-        struct global_cwq *gcwq = pool->gcwq;
        struct work_struct *work;
-        spin_lock_irq(&gcwq->lock);
+        spin_lock_irq(&pool->lock);
        if (need_to_create_worker(pool)) {
                /*
@@ -1959,7 +1879,7 @@ static void gcwq_mayday_timeout(unsigned long __pool)
                        send_mayday(work);
        }
-        spin_unlock_irq(&gcwq->lock);
+        spin_unlock_irq(&pool->lock);
        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
 }
@@ -1978,24 +1898,22 @@ static void gcwq_mayday_timeout(unsigned long __pool)
 * may_start_working() true.
 *
 * LOCKING:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.  Called only from
 * manager.
 *
 * RETURNS:
- * false if no action was taken and gcwq->lock stayed locked, true
+ * false if no action was taken and pool->lock stayed locked, true
 * otherwise.
 */
 static bool maybe_create_worker(struct worker_pool *pool)
-__releases(&gcwq->lock)
+__releases(&pool->lock)
-__acquires(&gcwq->lock)
+__acquires(&pool->lock)
 {
-        struct global_cwq *gcwq = pool->gcwq;
        if (!need_to_create_worker(pool))
                return false;
 restart:
-        spin_unlock_irq(&gcwq->lock);
+        spin_unlock_irq(&pool->lock);
        /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
@@ -2006,7 +1924,7 @@ restart:
                worker = create_worker(pool);
                if (worker) {
                        del_timer_sync(&pool->mayday_timer);
-                        spin_lock_irq(&gcwq->lock);
+                        spin_lock_irq(&pool->lock);
                        start_worker(worker);
                        BUG_ON(need_to_create_worker(pool));
                        return true;
@@ -2023,7 +1941,7 @@ restart:
        }
        del_timer_sync(&pool->mayday_timer);
-        spin_lock_irq(&gcwq->lock);
+        spin_lock_irq(&pool->lock);
        if (need_to_create_worker(pool))
                goto restart;
        return true;
@@ -2037,11 +1955,11 @@ restart:
 * IDLE_WORKER_TIMEOUT.
 *
 * LOCKING:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Called only from manager.
 *
 * RETURNS:
- * false if no action was taken and gcwq->lock stayed locked, true
+ * false if no action was taken and pool->lock stayed locked, true
 * otherwise.
 */
 static bool maybe_destroy_workers(struct worker_pool *pool)
@@ -2071,21 +1989,21 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
 * manage_workers - manage worker pool
 * @worker: self
 *
- * Assume the manager role and manage gcwq worker pool @worker belongs
+ * Assume the manager role and manage the worker pool @worker belongs
 * to.  At any given time, there can be only zero or one manager per
- * gcwq.  The exclusion is handled automatically by this function.
+ * pool.  The exclusion is handled automatically by this function.
 *
 * The caller can safely start processing works on false return.  On
 * true return, it's guaranteed that need_to_create_worker() is false
 * and may_start_working() is true.
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.
 *
 * RETURNS:
- * false if no action was taken and gcwq->lock stayed locked, true if
+ * false if no action was taken and pool->lock stayed locked, true if
 * some action was taken.
 */
 static bool manage_workers(struct worker *worker)
 {
@@ -2107,20 +2025,20 @@ static bool manage_workers(struct worker *worker)
         * manager against CPU hotplug.
         *
         * assoc_mutex would always be free unless CPU hotplug is in
-         * progress.  trylock first without dropping @gcwq->lock.
+         * progress.  trylock first without dropping @pool->lock.
         */
        if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
-                spin_unlock_irq(&pool->gcwq->lock);
+                spin_unlock_irq(&pool->lock);
                mutex_lock(&pool->assoc_mutex);
                /*
                 * CPU hotplug could have happened while we were waiting
                 * for assoc_mutex.  Hotplug itself can't handle us
                 * because manager isn't either on idle or busy list, and
-                 * @gcwq's state and ours could have deviated.
+                 * @pool's state and ours could have deviated.
                 *
                 * As hotplug is now excluded via assoc_mutex, we can
                 * simply try to bind.  It will succeed or fail depending
-                 * on @gcwq's current state.  Try it and adjust
+                 * on @pool's current state.  Try it and adjust
                 * %WORKER_UNBOUND accordingly.
                 */
                if (worker_maybe_bind_and_lock(worker))
@@ -2157,18 +2075,15 @@ static bool manage_workers(struct worker *worker)
 * call this function to process a work.
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock) which is released and regrabbed.
+ * spin_lock_irq(pool->lock) which is released and regrabbed.
 */
 static void process_one_work(struct worker *worker, struct work_struct *work)
-__releases(&gcwq->lock)
+__releases(&pool->lock)
-__acquires(&gcwq->lock)
+__acquires(&pool->lock)
 {
-        struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+        struct pool_workqueue *pwq = get_work_pwq(work);
        struct worker_pool *pool = worker->pool;
-        struct global_cwq *gcwq = pool->gcwq;
+        bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
-        struct hlist_head *bwh = busy_worker_head(gcwq, work);
-        bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
-        work_func_t f = work->func;
        int work_color;
        struct worker *collision;
 #ifdef CONFIG_LOCKDEP
@@ -2186,11 +2101,11 @@ __acquires(&gcwq->lock)
        /*
         * Ensure we're on the correct CPU.  DISASSOCIATED test is
         * necessary to avoid spurious warnings from rescuers servicing the
-         * unbound or a disassociated gcwq.
+         * unbound or a disassociated pool.
         */
        WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
-                     !(gcwq->flags & GCWQ_DISASSOCIATED) &&
+                     !(pool->flags & POOL_DISASSOCIATED) &&
-                     raw_smp_processor_id() != gcwq->cpu);
+                     raw_smp_processor_id() != pool->cpu);
        /*
         * A single work shouldn't be executed concurrently by
@@ -2198,7 +2113,7 @@ __acquires(&gcwq->lock)
         * already processing the work.  If so, defer the work to the
         * currently executing one.
         */
-        collision = __find_worker_executing_work(gcwq, bwh, work);
+        collision = find_worker_executing_work(pool, work);
        if (unlikely(collision)) {
                move_linked_works(work, &collision->scheduled, NULL);
                return;
@@ -2206,9 +2121,10 @@ __acquires(&gcwq->lock)
        /* claim and dequeue */
        debug_work_deactivate(work);
-        hlist_add_head(&worker->hentry, bwh);
+        hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
        worker->current_work = work;
-        worker->current_cwq = cwq;
+        worker->current_func = work->func;
+        worker->current_pwq = pwq;
        work_color = get_work_color(work);
        list_del_init(&work->entry);
@@ -2221,53 +2137,55 @@ __acquires(&gcwq->lock)
                worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
        /*
-         * Unbound gcwq isn't concurrency managed and work items should be
+         * Unbound pool isn't concurrency managed and work items should be
         * executed ASAP.  Wake up another worker if necessary.
         */
        if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
                wake_up_worker(pool);
        /*
-         * Record the last CPU and clear PENDING which should be the last
+         * Record the last pool and clear PENDING which should be the last
-         * update to @work.  Also, do this inside @gcwq->lock so that
+         * update to @work.  Also, do this inside @pool->lock so that
         * PENDING and queued state changes happen together while IRQ is
         * disabled.
         */
-        set_work_cpu_and_clear_pending(work, gcwq->cpu);
+        set_work_pool_and_clear_pending(work, pool->id);
-        spin_unlock_irq(&gcwq->lock);
+        spin_unlock_irq(&pool->lock);
-        lock_map_acquire_read(&cwq->wq->lockdep_map);
+        lock_map_acquire_read(&pwq->wq->lockdep_map);
        lock_map_acquire(&lockdep_map);
        trace_workqueue_execute_start(work);
-        f(work);
+        worker->current_func(work);
        /*
         * While we must be careful to not use "work" after this, the trace
         * point will only record its address.
         */
        trace_workqueue_execute_end(work);
        lock_map_release(&lockdep_map);
-        lock_map_release(&cwq->wq->lockdep_map);
+        lock_map_release(&pwq->wq->lockdep_map);
        if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
                pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
                       "     last function: %pf\n",
-                       current->comm, preempt_count(), task_pid_nr(current), f);
+                       current->comm, preempt_count(), task_pid_nr(current),
+                       worker->current_func);
                debug_show_held_locks(current);
                dump_stack();
        }
-        spin_lock_irq(&gcwq->lock);
+        spin_lock_irq(&pool->lock);
        /* clear cpu intensive status */
        if (unlikely(cpu_intensive))
                worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
        /* we're done with it, release */
-        hlist_del_init(&worker->hentry);
+        hash_del(&worker->hentry);
        worker->current_work = NULL;
-        worker->current_cwq = NULL;
+        worker->current_func = NULL;
-        cwq_dec_nr_in_flight(cwq, work_color);
+        worker->current_pwq = NULL;
+        pwq_dec_nr_in_flight(pwq, work_color);
 }
 /**
@@ -2279,7 +2197,7 @@ __acquires(&gcwq->lock)
 * fetches a work from the top and executes it.
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.
 */
 static void process_scheduled_works(struct worker *worker)
@@ -2295,8 +2213,8 @@ static void process_scheduled_works(struct worker *worker)
 * worker_thread - the worker thread function
 * @__worker: self
 *
- * The gcwq worker thread function.  There's a single dynamic pool of
+ * The worker thread function.  There are NR_CPU_WORKER_POOLS dynamic pools
- * these per each cpu.  These workers process all works regardless of
+ * of these per each cpu.  These workers process all works regardless of
 * their specific target workqueue.  The only exception is works which
 * belong to workqueues with a rescuer which will be explained in
 * rescuer_thread().
@@ -2305,16 +2223,15 @@ static int worker_thread(void *__worker)
 {
        struct worker *worker = __worker;
        struct worker_pool *pool = worker->pool;
-        struct global_cwq *gcwq = pool->gcwq;
        /* tell the scheduler that this is a workqueue worker */
        worker->task->flags |= PF_WQ_WORKER;
 woke_up:
-        spin_lock_irq(&gcwq->lock);
+        spin_lock_irq(&pool->lock);
        /* we are off idle list if destruction or rebind is requested */
        if (unlikely(list_empty(&worker->entry))) {
-                spin_unlock_irq(&gcwq->lock);
+                spin_unlock_irq(&pool->lock);
                /* if DIE is set, destruction is requested */
                if (worker->flags & WORKER_DIE) {
@@ -2373,52 +2290,59 @@ sleep:
                goto recheck;
        /*
-         * gcwq->lock is held and there's no work to process and no
+         * pool->lock is held and there's no work to process and no need to
-         * need to manage, sleep.  Workers are woken up only while
+         * manage, sleep.  Workers are woken up only while holding
-         * holding gcwq->lock or from local cpu, so setting the
+         * pool->lock or from local cpu, so setting the current state
-         * current state before releasing gcwq->lock is enough to
+         * before releasing pool->lock is enough to prevent losing any
-         * prevent losing any event.
+         * event.
         */
        worker_enter_idle(worker);
        __set_current_state(TASK_INTERRUPTIBLE);
-        spin_unlock_irq(&gcwq->lock);
+        spin_unlock_irq(&pool->lock);
        schedule();
        goto woke_up;
 }
 /**
 * rescuer_thread - the rescuer thread function
- * @__wq: the associated workqueue
+ * @__rescuer: self
 *
 * Workqueue rescuer thread function.  There's one rescuer for each
 * workqueue which has WQ_RESCUER set.
 *
- * Regular work processing on a gcwq may block trying to create a new
+ * Regular work processing on a pool may block trying to create a new
 * worker which uses GFP_KERNEL allocation which has slight chance of
 * developing into deadlock if some works currently on the same queue
 * need to be processed to satisfy the GFP_KERNEL allocation.  This is
 * the problem rescuer solves.
 *
- * When such condition is possible, the gcwq summons rescuers of all
+ * When such condition is possible, the pool summons rescuers of all
- * workqueues which have works queued on the gcwq and let them process
+ * workqueues which have works queued on the pool and let them process
 * those works so that forward progress can be guaranteed.
 *
 * This should happen rarely.
 */
-static int rescuer_thread(void *__wq)
+static int rescuer_thread(void *__rescuer)
 {
-        struct workqueue_struct *wq = __wq;
+        struct worker *rescuer = __rescuer;
-        struct worker *rescuer = wq->rescuer;
+        struct workqueue_struct *wq = rescuer->rescue_wq;
        struct list_head *scheduled = &rescuer->scheduled;
        bool is_unbound = wq->flags & WQ_UNBOUND;
        unsigned int cpu;
        set_user_nice(current, RESCUER_NICE_LEVEL);
+        /*
+         * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it
+         * doesn't participate in concurrency management.
+         */
+        rescuer->task->flags |= PF_WQ_WORKER;
 repeat:
        set_current_state(TASK_INTERRUPTIBLE);
        if (kthread_should_stop()) {
                __set_current_state(TASK_RUNNING);
+                rescuer->task->flags &= ~PF_WQ_WORKER;
                return 0;
        }
@@ -2428,9 +2352,8 @@ repeat:
         */
        for_each_mayday_cpu(cpu, wq->mayday_mask) {
                unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
-                struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
+                struct pool_workqueue *pwq = get_pwq(tcpu, wq);
-                struct worker_pool *pool = cwq->pool;
+                struct worker_pool *pool = pwq->pool;
-                struct global_cwq *gcwq = pool->gcwq;
                struct work_struct *work, *n;
                __set_current_state(TASK_RUNNING);
@@ -2446,22 +2369,24 @@ repeat:
                 */
                BUG_ON(!list_empty(&rescuer->scheduled));
                list_for_each_entry_safe(work, n, &pool->worklist, entry)
-                        if (get_work_cwq(work) == cwq)
+                        if (get_work_pwq(work) == pwq)
                                move_linked_works(work, scheduled, &n);
                process_scheduled_works(rescuer);
                /*
-                 * Leave this gcwq.  If keep_working() is %true, notify a
+                 * Leave this pool.  If keep_working() is %true, notify a
                 * regular worker; otherwise, we end up with 0 concurrency
                 * and stalling the execution.
                 */
                if (keep_working(pool))
                        wake_up_worker(pool);
-                spin_unlock_irq(&gcwq->lock);
+                spin_unlock_irq(&pool->lock);
        }
+        /* rescuers should never participate in concurrency management */
+        WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
        schedule();
        goto repeat;
 }
@@ -2479,7 +2404,7 @@ static void wq_barrier_func(struct work_struct *work)
 /**
 * insert_wq_barrier - insert a barrier work
- * @cwq: cwq to insert barrier into
+ * @pwq: pwq to insert barrier into
 * @barr: wq_barrier to insert
 * @target: target work to attach @barr to
 * @worker: worker currently executing @target, NULL if @target is not executing
@@ -2496,12 +2421,12 @@ static void wq_barrier_func(struct work_struct *work)
 * after a work with LINKED flag set.
 *
 * Note that when @worker is non-NULL, @target may be modified
- * underneath us, so we can't reliably determine cwq from @target.
+ * underneath us, so we can't reliably determine pwq from @target.
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
 */
-static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
+static void insert_wq_barrier(struct pool_workqueue *pwq,
                              struct wq_barrier *barr,
                              struct work_struct *target, struct worker *worker)
 {
@@ -2509,7 +2434,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
        unsigned int linked = 0;
        /*
-         * debugobject calls are safe here even with gcwq->lock locked
+         * debugobject calls are safe here even with pool->lock locked
         * as we know for sure that this will not trigger any of the
         * checks and call back into the fixup functions where we
         * might deadlock.
@@ -2534,23 +2459,23 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
        }
        debug_work_activate(&barr->work);
-        insert_work(cwq, &barr->work, head,
+        insert_work(pwq, &barr->work, head,
                    work_color_to_flags(WORK_NO_COLOR) | linked);
 }
 /**
- * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
+ * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
 * @wq: workqueue being flushed
 * @flush_color: new flush color, < 0 for no-op
 * @work_color: new work color, < 0 for no-op
 *
- * Prepare cwqs for workqueue flushing.
+ * Prepare pwqs for workqueue flushing.
 *
- * If @flush_color is non-negative, flush_color on all cwqs should be
+ * If @flush_color is non-negative, flush_color on all pwqs should be
- * -1.  If no cwq has in-flight commands at the specified color, all
+ * -1.  If no pwq has in-flight commands at the specified color, all
- * cwq->flush_color's stay at -1 and %false is returned.  If any cwq
+ * pwq->flush_color's stay at -1 and %false is returned.  If any pwq
- * has in flight commands, its cwq->flush_color is set to
+ * has in flight commands, its pwq->flush_color is set to
- * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
+ * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
 * wakeup logic is armed and %true is returned.
 *
 * The caller should have initialized @wq->first_flusher prior to
@@ -2558,7 +2483,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
 * @flush_color is negative, no flush color update is done and %false
 * is returned.
 *
- * If @work_color is non-negative, all cwqs should have the same
+ * If @work_color is non-negative, all pwqs should have the same
 * work_color which is previous to @work_color and all will be
 * advanced to @work_color.
 *
@@ -2569,42 +2494,42 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
 * %true if @flush_color >= 0 and there's something to flush.  %false
 * otherwise.
 */
-static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
+static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
                                      int flush_color, int work_color)
 {
        bool wait = false;
        unsigned int cpu;
        if (flush_color >= 0) {
-                BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
+                BUG_ON(atomic_read(&wq->nr_pwqs_to_flush));
-                atomic_set(&wq->nr_cwqs_to_flush, 1);
+                atomic_set(&wq->nr_pwqs_to_flush, 1);
        }
-        for_each_cwq_cpu(cpu, wq) {
+        for_each_pwq_cpu(cpu, wq) {
-                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+                struct pool_workqueue *pwq = get_pwq(cpu, wq);
-                struct global_cwq *gcwq = cwq->pool->gcwq;
+                struct worker_pool *pool = pwq->pool;
-                spin_lock_irq(&gcwq->lock);
+                spin_lock_irq(&pool->lock);
                if (flush_color >= 0) {
-                        BUG_ON(cwq->flush_color != -1);
+                        BUG_ON(pwq->flush_color != -1);
-                        if (cwq->nr_in_flight[flush_color]) {
+                        if (pwq->nr_in_flight[flush_color]) {
-                                cwq->flush_color = flush_color;
+                                pwq->flush_color = flush_color;
-                                atomic_inc(&wq->nr_cwqs_to_flush);
+                                atomic_inc(&wq->nr_pwqs_to_flush);
                                wait = true;
                        }
                }
                if (work_color >= 0) {
-                        BUG_ON(work_color != work_next_color(cwq->work_color));
+                        BUG_ON(work_color != work_next_color(pwq->work_color));
-                        cwq->work_color = work_color;
+                        pwq->work_color = work_color;
                }
-                spin_unlock_irq(&gcwq->lock);
+                spin_unlock_irq(&pool->lock);
        }
-        if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
+        if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
                complete(&wq->first_flusher->done);
        return wait;
@@ -2655,7 +2580,7 @@ void flush_workqueue(struct workqueue_struct *wq)
                        wq->first_flusher = &this_flusher;
-                        if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
+                        if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
                                                       wq->work_color)) {
                                /* nothing to flush, done */
                                wq->flush_color = next_color;
@@ -2666,7 +2591,7 @@ void flush_workqueue(struct workqueue_struct *wq)
                        /* wait in queue */
                        BUG_ON(wq->flush_color == this_flusher.flush_color);
                        list_add_tail(&this_flusher.list, &wq->flusher_queue);
-                        flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                }
        } else {
                /*
@@ -2733,7 +2658,7 @@ void flush_workqueue(struct workqueue_struct *wq)
                        list_splice_tail_init(&wq->flusher_overflow,
                                              &wq->flusher_queue);
-                        flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                }
                if (list_empty(&wq->flusher_queue)) {
@@ -2743,7 +2668,7 @@ void flush_workqueue(struct workqueue_struct *wq)
                /*
                 * Need to flush more colors.  Make the next flusher
-                 * the new first flusher and arm cwqs.
+                 * the new first flusher and arm pwqs.
                 */
                BUG_ON(wq->flush_color == wq->work_color);
                BUG_ON(wq->flush_color != next->flush_color);
@@ -2751,7 +2676,7 @@ void flush_workqueue(struct workqueue_struct *wq)
                list_del_init(&next->list);
                wq->first_flusher = next;
-                if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
+                if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
                        break;
                /*
@@ -2794,13 +2719,13 @@ void drain_workqueue(struct workqueue_struct *wq)
 reflush:
        flush_workqueue(wq);
-        for_each_cwq_cpu(cpu, wq) {
+        for_each_pwq_cpu(cpu, wq) {
-                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+                struct pool_workqueue *pwq = get_pwq(cpu, wq);
                bool drained;
-                spin_lock_irq(&cwq->pool->gcwq->lock);
+                spin_lock_irq(&pwq->pool->lock);
-                drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
+                drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
-                spin_unlock_irq(&cwq->pool->gcwq->lock);
+                spin_unlock_irq(&pwq->pool->lock);
                if (drained)
                        continue;
@@ -2822,34 +2747,29 @@ EXPORT_SYMBOL_GPL(drain_workqueue);
 static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
 {
        struct worker *worker = NULL;
-        struct global_cwq *gcwq;
+        struct worker_pool *pool;
-        struct cpu_workqueue_struct *cwq;
+        struct pool_workqueue *pwq;
        might_sleep();
-        gcwq = get_work_gcwq(work);
+        pool = get_work_pool(work);
-        if (!gcwq)
+        if (!pool)
                return false;
-        spin_lock_irq(&gcwq->lock);
+        spin_lock_irq(&pool->lock);
-        if (!list_empty(&work->entry)) {
+        /* see the comment in try_to_grab_pending() with the same code */
-                /*
+        pwq = get_work_pwq(work);
-                 * See the comment near try_to_grab_pending()->smp_rmb().
+        if (pwq) {
-                 * If it was re-queued to a different gcwq under us, we
+                if (unlikely(pwq->pool != pool))
-                 * are not going to wait.
-                 */
-                smp_rmb();
-                cwq = get_work_cwq(work);
-                if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
                        goto already_gone;
        } else {
-                worker = find_worker_executing_work(gcwq, work);
+                worker = find_worker_executing_work(pool, work);
                if (!worker)
                        goto already_gone;
-                cwq = worker->current_cwq;
+                pwq = worker->current_pwq;
        }
-        insert_wq_barrier(cwq, barr, work, worker);
+        insert_wq_barrier(pwq, barr, work, worker);
-        spin_unlock_irq(&gcwq->lock);
+        spin_unlock_irq(&pool->lock);
        /*
         * If @max_active is 1 or rescuer is in use, flushing another work
@@ -2857,15 +2777,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
         * flusher is not running on the same workqueue by verifying write
         * access.
         */
-        if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
+        if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER)
-                lock_map_acquire(&cwq->wq->lockdep_map);
+                lock_map_acquire(&pwq->wq->lockdep_map);
        else
-                lock_map_acquire_read(&cwq->wq->lockdep_map);
+                lock_map_acquire_read(&pwq->wq->lockdep_map);
-        lock_map_release(&cwq->wq->lockdep_map);
+        lock_map_release(&pwq->wq->lockdep_map);
        return true;
 already_gone:
-        spin_unlock_irq(&gcwq->lock);
+        spin_unlock_irq(&pool->lock);
        return false;
 }
@@ -2961,8 +2881,7 @@ bool flush_delayed_work(struct delayed_work *dwork)
 {
        local_irq_disable();
        if (del_timer_sync(&dwork->timer))
-                __queue_work(dwork->cpu,
+                __queue_work(dwork->cpu, dwork->wq, &dwork->work);
-                             get_work_cwq(&dwork->work)->wq, &dwork->work);
        local_irq_enable();
        return flush_work(&dwork->work);
 }
@@ -2992,7 +2911,8 @@ bool cancel_delayed_work(struct delayed_work *dwork)
        if (unlikely(ret < 0))
                return false;
-        set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work));
+        set_work_pool_and_clear_pending(&dwork->work,
+                                        get_work_pool_id(&dwork->work));
        local_irq_restore(flags);
        return ret;
 }
@@ -3171,46 +3091,46 @@ int keventd_up(void)
        return system_wq != NULL;
 }
-static int alloc_cwqs(struct workqueue_struct *wq)
+static int alloc_pwqs(struct workqueue_struct *wq)
 {
        /*
-         * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
+         * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
         * Make sure that the alignment isn't lower than that of
         * unsigned long long.
         */
-        const size_t size = sizeof(struct cpu_workqueue_struct);
+        const size_t size = sizeof(struct pool_workqueue);
        const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
                                   __alignof__(unsigned long long));
        if (!(wq->flags & WQ_UNBOUND))
-                wq->cpu_wq.pcpu = __alloc_percpu(size, align);
+                wq->pool_wq.pcpu = __alloc_percpu(size, align);
        else {
                void *ptr;
                /*
-                 * Allocate enough room to align cwq and put an extra
+                 * Allocate enough room to align pwq and put an extra
                 * pointer at the end pointing back to the originally
                 * allocated pointer which will be used for free.
                 */
                ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
                if (ptr) {
-                        wq->cpu_wq.single = PTR_ALIGN(ptr, align);
+                        wq->pool_wq.single = PTR_ALIGN(ptr, align);
-                        *(void **)(wq->cpu_wq.single + 1) = ptr;
+                        *(void **)(wq->pool_wq.single + 1) = ptr;
                }
        }
        /* just in case, make sure it's actually aligned */
-        BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
+        BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align));
-        return wq->cpu_wq.v ? 0 : -ENOMEM;
+        return wq->pool_wq.v ? 0 : -ENOMEM;
 }
-static void free_cwqs(struct workqueue_struct *wq)
+static void free_pwqs(struct workqueue_struct *wq)
 {
        if (!(wq->flags & WQ_UNBOUND))
-                free_percpu(wq->cpu_wq.pcpu);
+                free_percpu(wq->pool_wq.pcpu);
-        else if (wq->cpu_wq.single) {
+        else if (wq->pool_wq.single) {
-                /* the pointer to free is stored right after the cwq */
+                /* the pointer to free is stored right after the pwq */
-                kfree(*(void **)(wq->cpu_wq.single + 1));
+                kfree(*(void **)(wq->pool_wq.single + 1));
        }
 }
@@ -3264,27 +3184,25 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
        wq->flags = flags;
        wq->saved_max_active = max_active;
        mutex_init(&wq->flush_mutex);
-        atomic_set(&wq->nr_cwqs_to_flush, 0);
+        atomic_set(&wq->nr_pwqs_to_flush, 0);
        INIT_LIST_HEAD(&wq->flusher_queue);
        INIT_LIST_HEAD(&wq->flusher_overflow);
        lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
        INIT_LIST_HEAD(&wq->list);
-        if (alloc_cwqs(wq) < 0)
+        if (alloc_pwqs(wq) < 0)
                goto err;
-        for_each_cwq_cpu(cpu, wq) {
+        for_each_pwq_cpu(cpu, wq) {
-                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+                struct pool_workqueue *pwq = get_pwq(cpu, wq);
-                struct global_cwq *gcwq = get_gcwq(cpu);
-                int pool_idx = (bool)(flags & WQ_HIGHPRI);
+                BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
+                pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI);
-                BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK);
+                pwq->wq = wq;
-                cwq->pool = &gcwq->pools[pool_idx];
+                pwq->flush_color = -1;
-                cwq->wq = wq;
+                pwq->max_active = max_active;
-                cwq->flush_color = -1;
+                INIT_LIST_HEAD(&pwq->delayed_works);
-                cwq->max_active = max_active;
-                INIT_LIST_HEAD(&cwq->delayed_works);
        }
        if (flags & WQ_RESCUER) {
@@ -3297,7 +3215,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
                if (!rescuer)
                        goto err;
-                rescuer->task = kthread_create(rescuer_thread, wq, "%s",
+                rescuer->rescue_wq = wq;
+                rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
                                               wq->name);
                if (IS_ERR(rescuer->task))
                        goto err;
@@ -3314,8 +3233,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
        spin_lock(&workqueue_lock);
        if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
-                for_each_cwq_cpu(cpu, wq)
+                for_each_pwq_cpu(cpu, wq)
-                        get_cwq(cpu, wq)->max_active = 0;
+                        get_pwq(cpu, wq)->max_active = 0;
        list_add(&wq->list, &workqueues);
@@ -3324,7 +3243,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
        return wq;
 err:
        if (wq) {
-                free_cwqs(wq);
+                free_pwqs(wq);
                free_mayday_mask(wq->mayday_mask);
                kfree(wq->rescuer);
                kfree(wq);
@@ -3355,14 +3274,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
        spin_unlock(&workqueue_lock);
        /* sanity check */
-        for_each_cwq_cpu(cpu, wq) {
+        for_each_pwq_cpu(cpu, wq) {
-                struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+                struct pool_workqueue *pwq = get_pwq(cpu, wq);
                int i;
                for (i = 0; i < WORK_NR_COLORS; i++)
-                        BUG_ON(cwq->nr_in_flight[i]);
+                        BUG_ON(pwq->nr_in_flight[i]);
-                BUG_ON(cwq->nr_active);
+                BUG_ON(pwq->nr_active);
-                BUG_ON(!list_empty(&cwq->delayed_works));
+                BUG_ON(!list_empty(&pwq->delayed_works));
        }
        if (wq->flags & WQ_RESCUER) {
@@ -3371,29 +3290,29 @@ void destroy_workqueue(struct workqueue_struct *wq)
                kfree(wq->rescuer);
        }
-        free_cwqs(wq);
+        free_pwqs(wq);
        kfree(wq);
 }
 EXPORT_SYMBOL_GPL(destroy_workqueue);
 /**
- * cwq_set_max_active - adjust max_active of a cwq
+ * pwq_set_max_active - adjust max_active of a pwq
- * @cwq: target cpu_workqueue_struct
+ * @pwq: target pool_workqueue
 * @max_active: new max_active value.
 *
- * Set @cwq->max_active to @max_active and activate delayed works if
+ * Set @pwq->max_active to @max_active and activate delayed works if
 * increased.
 *
 * CONTEXT:
- * spin_lock_irq(gcwq->lock).
+ * spin_lock_irq(pool->lock).
 */
-static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active)
+static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
 {
-        cwq->max_active = max_active;
+        pwq->max_active = max_active;
-        while (!list_empty(&cwq->delayed_works) &&
+        while (!list_empty(&pwq->delayed_works) &&
-               cwq->nr_active < cwq->max_active)
+               pwq->nr_active < pwq->max_active)
-                cwq_activate_first_delayed(cwq);
+                pwq_activate_first_delayed(pwq);
 }
 /**
@@ -3416,16 +3335,17 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
        wq->saved_max_active = max_active;
-        for_each_cwq_cpu(cpu, wq) {
+        for_each_pwq_cpu(cpu, wq) {
-                struct global_cwq *gcwq = get_gcwq(cpu);
+                struct pool_workqueue *pwq = get_pwq(cpu, wq);
+                struct worker_pool *pool = pwq->pool;
-                spin_lock_irq(&gcwq->lock);
+                spin_lock_irq(&pool->lock);
                if (!(wq->flags & WQ_FREEZABLE) ||
-                    !(gcwq->flags & GCWQ_FREEZING))
+                    !(pool->flags & POOL_FREEZING))
-                        cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active);
+                        pwq_set_max_active(pwq, max_active);
-                spin_unlock_irq(&gcwq->lock);
+                spin_unlock_irq(&pool->lock);
        }
        spin_unlock(&workqueue_lock);
@@ -3446,57 +3366,38 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
 */
 bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
 {
-        struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+        struct pool_workqueue *pwq = get_pwq(cpu, wq);
-        return !list_empty(&cwq->delayed_works);
+        return !list_empty(&pwq->delayed_works);
 }
 EXPORT_SYMBOL_GPL(workqueue_congested);
 /**
- * work_cpu - return the last known associated cpu for @work
- * @work: the work of interest
- *
- * RETURNS:
- * CPU number if @work was ever queued.  WORK_CPU_NONE otherwise.
- */
-unsigned int work_cpu(struct work_struct *work)
-{
-        struct global_cwq *gcwq = get_work_gcwq(work);
-        return gcwq ? gcwq->cpu : WORK_CPU_NONE;
-}
-EXPORT_SYMBOL_GPL(work_cpu);
-/**
 * work_busy - test whether a work is currently pending or running
 * @work: the work to be tested
 *
 * Test whether @work is currently pending or running.  There is no
 * synchronization around this function and the test result is
 * unreliable and only useful as advisory hints or for debugging.
- * Especially for reentrant wqs, the pending state might hide the
- * running state.
 *
 * RETURNS:
 * OR'd bitmask of WORK_BUSY_* bits.
 */
 unsigned int work_busy(struct work_struct *work)
 {
-        struct global_cwq *gcwq = get_work_gcwq(work);
+        struct worker_pool *pool = get_work_pool(work);
        unsigned long flags;
        unsigned int ret = 0;
-        if (!gcwq)
-                return 0;
-        spin_lock_irqsave(&gcwq->lock, flags);
        if (work_pending(work))
                ret |= WORK_BUSY_PENDING;
-        if (find_worker_executing_work(gcwq, work))
-                ret |= WORK_BUSY_RUNNING;
-        spin_unlock_irqrestore(&gcwq->lock, flags);
+        if (pool) {
+                spin_lock_irqsave(&pool->lock, flags);
+                if (find_worker_executing_work(pool, work))
+                        ret |= WORK_BUSY_RUNNING;
+                spin_unlock_irqrestore(&pool->lock, flags);
+        }
        return ret;
 }
@@ -3506,65 +3407,49 @@ EXPORT_SYMBOL_GPL(work_busy);
 * CPU hotplug.
 *
 * There are two challenges in supporting CPU hotplug.  Firstly, there
- * are a lot of assumptions on strong associations among work, cwq and
+ * are a lot of assumptions on strong associations among work, pwq and
- * gcwq which make migrating pending and scheduled works very
+ * pool which make migrating pending and scheduled works very
 * difficult to implement without impacting hot paths.  Secondly,
- * gcwqs serve mix of short, long and very long running works making
+ * worker pools serve mix of short, long and very long running works making
 * blocked draining impractical.
 *
- * This is solved by allowing a gcwq to be disassociated from the CPU
+ * This is solved by allowing the pools to be disassociated from the CPU
 * running as an unbound one and allowing it to be reattached later if the
 * cpu comes back online.
 */
-/* claim manager positions of all pools */
+static void wq_unbind_fn(struct work_struct *work)
-static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
 {
-        struct worker_pool *pool;
+        int cpu = smp_processor_id();
-        for_each_worker_pool(pool, gcwq)
-                mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
-        spin_lock_irq(&gcwq->lock);
-}
-/* release manager positions */
-static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
-{
-        struct worker_pool *pool;
-        spin_unlock_irq(&gcwq->lock);
-        for_each_worker_pool(pool, gcwq)
-                mutex_unlock(&pool->assoc_mutex);
-}
-static void gcwq_unbind_fn(struct work_struct *work)
-{
-        struct global_cwq *gcwq = get_gcwq(smp_processor_id());
        struct worker_pool *pool;
        struct worker *worker;
        struct hlist_node *pos;
        int i;
-        BUG_ON(gcwq->cpu != smp_processor_id());
+        for_each_std_worker_pool(pool, cpu) {
+                BUG_ON(cpu != smp_processor_id());
-        gcwq_claim_assoc_and_lock(gcwq);
+                mutex_lock(&pool->assoc_mutex);
+                spin_lock_irq(&pool->lock);
-        /*
+                /*
-         * We've claimed all manager positions.  Make all workers unbound
+                 * We've claimed all manager positions.  Make all workers
-         * and set DISASSOCIATED.  Before this, all workers except for the
+                 * unbound and set DISASSOCIATED.  Before this, all workers
-         * ones which are still executing works from before the last CPU
+                 * except for the ones which are still executing works from
-         * down must be on the cpu.  After this, they may become diasporas.
+                 * before the last CPU down must be on the cpu.  After
-         */
+                 * this, they may become diasporas.
-        for_each_worker_pool(pool, gcwq)
+                 */
                list_for_each_entry(worker, &pool->idle_list, entry)
                        worker->flags |= WORKER_UNBOUND;
-        for_each_busy_worker(worker, i, pos, gcwq)
+                for_each_busy_worker(worker, i, pos, pool)
-                worker->flags |= WORKER_UNBOUND;
+                        worker->flags |= WORKER_UNBOUND;
-        gcwq->flags |= GCWQ_DISASSOCIATED;
+                pool->flags |= POOL_DISASSOCIATED;
-        gcwq_release_assoc_and_unlock(gcwq);
+                spin_unlock_irq(&pool->lock);
+                mutex_unlock(&pool->assoc_mutex);
+        }
        /*
         * Call schedule() so that we cross rq->lock and thus can guarantee
@@ -3576,16 +3461,16 @@ static void gcwq_unbind_fn(struct work_struct *work)
        /*
         * Sched callbacks are disabled now.  Zap nr_running.  After this,
         * nr_running stays zero and need_more_worker() and keep_working()
-         * are always true as long as the worklist is not empty.  @gcwq now
+         * are always true as long as the worklist is not empty.  Pools on
-         * behaves as unbound (in terms of concurrency management) gcwq
+         * @cpu now behave as unbound (in terms of concurrency management)
-         * which is served by workers tied to the CPU.
+         * pools which are served by workers tied to the CPU.
         *
         * On return from this function, the current worker would trigger
         * unbound chain execution of pending work items if other workers
         * didn't already.
         */
-        for_each_worker_pool(pool, gcwq)
+        for_each_std_worker_pool(pool, cpu)
-                atomic_set(get_pool_nr_running(pool), 0);
+                atomic_set(&pool->nr_running, 0);
 }
 /*
@@ -3597,12 +3482,11 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
                                               void *hcpu)
 {
        unsigned int cpu = (unsigned long)hcpu;
-        struct global_cwq *gcwq = get_gcwq(cpu);
        struct worker_pool *pool;
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_UP_PREPARE:
-                for_each_worker_pool(pool, gcwq) {
+                for_each_std_worker_pool(pool, cpu) {
                        struct worker *worker;
                        if (pool->nr_workers)
@@ -3612,18 +3496,24 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
                        if (!worker)
                                return NOTIFY_BAD;
-                        spin_lock_irq(&gcwq->lock);
+                        spin_lock_irq(&pool->lock);
                        start_worker(worker);
-                        spin_unlock_irq(&gcwq->lock);
+                        spin_unlock_irq(&pool->lock);
                }
                break;
        case CPU_DOWN_FAILED:
        case CPU_ONLINE:
-                gcwq_claim_assoc_and_lock(gcwq);
+                for_each_std_worker_pool(pool, cpu) {
-                gcwq->flags &= ~GCWQ_DISASSOCIATED;
+                        mutex_lock(&pool->assoc_mutex);
-                rebind_workers(gcwq);
+                        spin_lock_irq(&pool->lock);
-                gcwq_release_assoc_and_unlock(gcwq);
+                        pool->flags &= ~POOL_DISASSOCIATED;
+                        rebind_workers(pool);
+                        spin_unlock_irq(&pool->lock);
+                        mutex_unlock(&pool->assoc_mutex);
+                }
                break;
        }
        return NOTIFY_OK;
@@ -3643,7 +3533,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_DOWN_PREPARE:
                /* unbinding should happen on the local CPU */
-                INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
+                INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
                queue_work_on(cpu, system_highpri_wq, &unbind_work);
                flush_work(&unbind_work);
                break;
@@ -3696,10 +3586,10 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
 *
 * Start freezing workqueues.  After this function returns, all freezable
 * workqueues will queue new works to their frozen_works list instead of
- * gcwq->worklist.
+ * pool->worklist.
 *
 * CONTEXT:
- * Grabs and releases workqueue_lock and gcwq->lock's.
+ * Grabs and releases workqueue_lock and pool->lock's.
 */
 void freeze_workqueues_begin(void)
 {
@@ -3710,23 +3600,26 @@ void freeze_workqueues_begin(void)
        BUG_ON(workqueue_freezing);
        workqueue_freezing = true;
-        for_each_gcwq_cpu(cpu) {
+        for_each_wq_cpu(cpu) {
-                struct global_cwq *gcwq = get_gcwq(cpu);
+                struct worker_pool *pool;
                struct workqueue_struct *wq;
-                spin_lock_irq(&gcwq->lock);
+                for_each_std_worker_pool(pool, cpu) {
+                        spin_lock_irq(&pool->lock);
-                BUG_ON(gcwq->flags & GCWQ_FREEZING);
+                        WARN_ON_ONCE(pool->flags & POOL_FREEZING);
-                gcwq->flags |= GCWQ_FREEZING;
+                        pool->flags |= POOL_FREEZING;
-                list_for_each_entry(wq, &workqueues, list) {
+                        list_for_each_entry(wq, &workqueues, list) {
-                        struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+                                struct pool_workqueue *pwq = get_pwq(cpu, wq);
-                        if (cwq && wq->flags & WQ_FREEZABLE)
+                                if (pwq && pwq->pool == pool &&
-                                cwq->max_active = 0;
+                                    (wq->flags & WQ_FREEZABLE))
-                }
+                                        pwq->max_active = 0;
+                        }
-                spin_unlock_irq(&gcwq->lock);
+                        spin_unlock_irq(&pool->lock);
+                }
        }
        spin_unlock(&workqueue_lock);
@@ -3754,20 +3647,20 @@ bool freeze_workqueues_busy(void)
        BUG_ON(!workqueue_freezing);
-        for_each_gcwq_cpu(cpu) {
+        for_each_wq_cpu(cpu) {
                struct workqueue_struct *wq;
                /*
                 * nr_active is monotonically decreasing.  It's safe
                 * to peek without lock.
                 */
                list_for_each_entry(wq, &workqueues, list) {
-                        struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+                        struct pool_workqueue *pwq = get_pwq(cpu, wq);
-                        if (!cwq || !(wq->flags & WQ_FREEZABLE))
+                        if (!pwq || !(wq->flags & WQ_FREEZABLE))
                                continue;
-                        BUG_ON(cwq->nr_active < 0);
+                        BUG_ON(pwq->nr_active < 0);
-                        if (cwq->nr_active) {
+                        if (pwq->nr_active) {
                                busy = true;
                                goto out_unlock;
                        }
@@ -3782,10 +3675,10 @@ out_unlock:
 * thaw_workqueues - thaw workqueues
 *
 * Thaw workqueues.  Normal queueing is restored and all collected
- * frozen works are transferred to their respective gcwq worklists.
+ * frozen works are transferred to their respective pool worklists.
 *
 * CONTEXT:
- * Grabs and releases workqueue_lock and gcwq->lock's.
+ * Grabs and releases workqueue_lock and pool->lock's.
 */
 void thaw_workqueues(void)
 {
@@ -3796,30 +3689,31 @@ void thaw_workqueues(void)
        if (!workqueue_freezing)
                goto out_unlock;
-        for_each_gcwq_cpu(cpu) {
+        for_each_wq_cpu(cpu) {
-                struct global_cwq *gcwq = get_gcwq(cpu);
                struct worker_pool *pool;
                struct workqueue_struct *wq;
-                spin_lock_irq(&gcwq->lock);
+                for_each_std_worker_pool(pool, cpu) {
+                        spin_lock_irq(&pool->lock);
-                BUG_ON(!(gcwq->flags & GCWQ_FREEZING));
+                        WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
-                gcwq->flags &= ~GCWQ_FREEZING;
+                        pool->flags &= ~POOL_FREEZING;
-                list_for_each_entry(wq, &workqueues, list) {
+                        list_for_each_entry(wq, &workqueues, list) {
-                        struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+                                struct pool_workqueue *pwq = get_pwq(cpu, wq);
-                        if (!cwq || !(wq->flags & WQ_FREEZABLE))
+                                if (!pwq || pwq->pool != pool ||
-                                continue;
+                                    !(wq->flags & WQ_FREEZABLE))
+                                        continue;
-                        /* restore max_active and repopulate worklist */
+                                /* restore max_active and repopulate worklist */
-                        cwq_set_max_active(cwq, wq->saved_max_active);
+                                pwq_set_max_active(pwq, wq->saved_max_active);
-                }
+                        }
-                for_each_worker_pool(pool, gcwq)
                        wake_up_worker(pool);
-                spin_unlock_irq(&gcwq->lock);
+                        spin_unlock_irq(&pool->lock);
+                }
        }
        workqueue_freezing = false;
@@ -3831,60 +3725,56 @@ out_unlock:
 static int __init init_workqueues(void)
 {
        unsigned int cpu;
-        int i;
-        /* make sure we have enough bits for OFFQ CPU number */
+        /* make sure we have enough bits for OFFQ pool ID */
-        BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) <
+        BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
-                     WORK_CPU_LAST);
+                     WORK_CPU_END * NR_STD_WORKER_POOLS);
        cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
        hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
-        /* initialize gcwqs */
+        /* initialize CPU pools */
-        for_each_gcwq_cpu(cpu) {
+        for_each_wq_cpu(cpu) {
-                struct global_cwq *gcwq = get_gcwq(cpu);
                struct worker_pool *pool;
-                spin_lock_init(&gcwq->lock);
+                for_each_std_worker_pool(pool, cpu) {
-                gcwq->cpu = cpu;
+                        spin_lock_init(&pool->lock);
-                gcwq->flags |= GCWQ_DISASSOCIATED;
+                        pool->cpu = cpu;
+                        pool->flags |= POOL_DISASSOCIATED;
-                for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
-                        INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
-                for_each_worker_pool(pool, gcwq) {
-                        pool->gcwq = gcwq;
                        INIT_LIST_HEAD(&pool->worklist);
                        INIT_LIST_HEAD(&pool->idle_list);
+                        hash_init(pool->busy_hash);
                        init_timer_deferrable(&pool->idle_timer);
                        pool->idle_timer.function = idle_worker_timeout;
                        pool->idle_timer.data = (unsigned long)pool;
-                        setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,
+                        setup_timer(&pool->mayday_timer, pool_mayday_timeout,
                                    (unsigned long)pool);
                        mutex_init(&pool->assoc_mutex);
                        ida_init(&pool->worker_ida);
+                        /* alloc pool ID */
+                        BUG_ON(worker_pool_assign_id(pool));
                }
        }
        /* create the initial worker */
-        for_each_online_gcwq_cpu(cpu) {
+        for_each_online_wq_cpu(cpu) {
-                struct global_cwq *gcwq = get_gcwq(cpu);
                struct worker_pool *pool;
-                if (cpu != WORK_CPU_UNBOUND)
+                for_each_std_worker_pool(pool, cpu) {
-                        gcwq->flags &= ~GCWQ_DISASSOCIATED;
-                for_each_worker_pool(pool, gcwq) {
                        struct worker *worker;
+                        if (cpu != WORK_CPU_UNBOUND)
+                                pool->flags &= ~POOL_DISASSOCIATED;
                        worker = create_worker(pool);
                        BUG_ON(!worker);
-                        spin_lock_irq(&gcwq->lock);
+                        spin_lock_irq(&pool->lock);
                        start_worker(worker);
-                        spin_unlock_irq(&gcwq->lock);
+                        spin_unlock_irq(&pool->lock);
                }
        }
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
new file mode 100644
index 000000000000..07650264ec15
--- /dev/null
+++ b/kernel/workqueue_internal.h
@@ -0,0 +1,65 @@
+/*
+ * kernel/workqueue_internal.h
+ *
+ * Workqueue internal header file.  Only to be included by workqueue and
+ * core kernel subsystems.
+ */
+#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
+#define _KERNEL_WORKQUEUE_INTERNAL_H
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+struct worker_pool;
+/*
+ * The poor guys doing the actual heavy lifting.  All on-duty workers are
+ * either serving the manager role, on the idle list or on the busy hash.
+ * For details on the locking annotation (L, I, X...), refer to workqueue.c.
+ *
+ * The list linkage is an anonymous union: @entry and @hentry share storage,
+ * as a worker is linked through only one of them at a time (idle list while
+ * idle, busy hash while busy).
+ *
+ * Only to be used in workqueue and async.
+ */
+struct worker {
+        /* on the idle list while idle, on the busy hash table while busy */
+        union {
+                struct list_head        entry;  /* L: idle list linkage, while idle */
+                struct hlist_node       hentry; /* L: busy hash linkage, while busy */
+        };
+        struct work_struct      *current_work;  /* L: work being processed */
+        work_func_t             current_func;   /* L: current_work's function */
+        struct pool_workqueue   *current_pwq; /* L: current_work's pool_workqueue */
+        struct list_head        scheduled;      /* L: scheduled works */
+        struct task_struct      *task;          /* I: worker task */
+        struct worker_pool      *pool;          /* I: the associated pool */
+        /* 64-byte boundary on 64-bit, 32-byte boundary on 32-bit */
+        unsigned long           last_active;    /* L: last active timestamp */
+        unsigned int            flags;          /* X: flags */
+        int                     id;             /* I: worker id */
+        /* for rebinding the worker to its CPU */
+        struct work_struct      rebind_work;    /* L: for busy worker */
+        /* used only by rescuers to point to the target workqueue */
+        struct workqueue_struct *rescue_wq;     /* I: the workqueue to rescue */
+};
+/**
+ * current_wq_worker - return struct worker if %current is a workqueue worker
+ *
+ * Tests the PF_WQ_WORKER bit in %current->flags to decide whether the
+ * calling task is a workqueue worker.
+ *
+ * Return: kthread_data() of %current when PF_WQ_WORKER is set, %NULL
+ * otherwise.
+ */
+static inline struct worker *current_wq_worker(void)
+{
+        if (current->flags & PF_WQ_WORKER)
+                return kthread_data(current);
+        return NULL;
+}
+/*
+ * Scheduler hooks for concurrency managed workqueue.  Only to be used from
+ * sched.c and workqueue.c.
+ */
+void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
+struct task_struct *wq_worker_sleeping(struct task_struct *task,
+                                       unsigned int cpu);
+#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
deleted file mode 100644
index 2d10fc98dc79..000000000000
--- a/kernel/workqueue_sched.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * kernel/workqueue_sched.h
- *
- * Scheduler hooks for concurrency managed workqueue.  Only to be
- * included from sched.c and workqueue.c.
- */
-void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
-struct task_struct *wq_worker_sleeping(struct task_struct *task,
-                                       unsigned int cpu);
author	Ralf Baechle <ralf@linux-mips.org>	2013-02-21 10:16:55 -0500
committer	Ralf Baechle <ralf@linux-mips.org>	2013-02-22 04:07:30 -0500
commit	edb15d83a875a1f4b1576188844db5c330c3267d (patch)
tree	74d54eab401b6ccf2a6ad4821227108a8d160f03 /kernel
parent	8bfc245f9ad7bd4e461179e4e7852ef99b8b6144 (diff)
parent	a0b1c42951dd06ec83cc1bc2c9788131d9fefcd8 (diff)