Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 55
-rw-r--r--  kernel/acct.c | 8
-rw-r--r--  kernel/async.c | 159
-rw-r--r--  kernel/audit.c | 40
-rw-r--r--  kernel/audit_tree.c | 36
-rw-r--r--  kernel/audit_watch.c | 6
-rw-r--r--  kernel/auditfilter.c | 1
-rw-r--r--  kernel/auditsc.c | 124
-rw-r--r--  kernel/capability.c | 24
-rw-r--r--  kernel/cgroup.c | 1036
-rw-r--r--  kernel/cgroup_freezer.c | 514
-rw-r--r--  kernel/compat.c | 106
-rw-r--r--  kernel/context_tracking.c | 145
-rw-r--r--  kernel/cpu.c | 19
-rw-r--r--  kernel/cpuset.c | 968
-rw-r--r--  kernel/cred.c | 154
-rw-r--r--  kernel/debug/debug_core.c | 1
-rw-r--r--  kernel/debug/debug_core.h | 2
-rw-r--r--  kernel/debug/gdbstub.c | 4
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 20
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 25
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 137
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 4
-rw-r--r--  kernel/delayacct.c | 7
-rw-r--r--  kernel/events/core.c | 65
-rw-r--r--  kernel/events/hw_breakpoint.c | 14
-rw-r--r--  kernel/events/internal.h | 2
-rw-r--r--  kernel/events/ring_buffer.c | 22
-rw-r--r--  kernel/events/uprobes.c | 499
-rw-r--r--  kernel/exit.c | 122
-rw-r--r--  kernel/fork.c | 181
-rw-r--r--  kernel/freezer.c | 11
-rw-r--r--  kernel/futex.c | 110
-rw-r--r--  kernel/futex_compat.c | 21
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/hrtimer.c | 41
-rw-r--r--  kernel/irq/chip.c | 31
-rw-r--r--  kernel/irq/irqdomain.c | 4
-rw-r--r--  kernel/irq/manage.c | 46
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/irq/resend.c | 8
-rw-r--r--  kernel/irq/spurious.c | 7
-rw-r--r--  kernel/irq_work.c | 150
-rw-r--r--  kernel/kcmp.c | 1
-rw-r--r--  kernel/kexec.c | 78
-rw-r--r--  kernel/kfifo.c | 609
-rw-r--r--  kernel/kmod.c | 15
-rw-r--r--  kernel/kprobes.c | 66
-rw-r--r--  kernel/ksysfs.c | 23
-rw-r--r--  kernel/kthread.c | 54
-rw-r--r--  kernel/lockdep.c | 15
-rw-r--r--  kernel/lockdep_proc.c | 2
-rw-r--r--  kernel/modsign_certificate.S | 19
-rw-r--r--  kernel/modsign_pubkey.c | 21
-rw-r--r--  kernel/module.c | 710
-rw-r--r--  kernel/module_signing.c | 14
-rw-r--r--  kernel/mutex.c | 1
-rw-r--r--  kernel/nsproxy.c | 37
-rw-r--r--  kernel/padata.c | 5
-rw-r--r--  kernel/panic.c | 34
-rw-r--r--  kernel/pid.c | 78
-rw-r--r--  kernel/pid_namespace.c | 118
-rw-r--r--  kernel/posix-cpu-timers.c | 78
-rw-r--r--  kernel/posix-timers.c | 27
-rw-r--r--  kernel/power/autosleep.c | 2
-rw-r--r--  kernel/power/main.c | 31
-rw-r--r--  kernel/power/process.c | 17
-rw-r--r--  kernel/power/qos.c | 74
-rw-r--r--  kernel/power/suspend.c | 69
-rw-r--r--  kernel/power/suspend_test.c | 11
-rw-r--r--  kernel/power/swap.c | 2
-rw-r--r--  kernel/printk.c | 137
-rw-r--r--  kernel/profile.c | 31
-rw-r--r--  kernel/ptrace.c | 93
-rw-r--r--  kernel/rcu.h | 9
-rw-r--r--  kernel/rcupdate.c | 63
-rw-r--r--  kernel/rcutiny.c | 10
-rw-r--r--  kernel/rcutiny_plugin.h | 61
-rw-r--r--  kernel/rcutorture.c | 120
-rw-r--r--  kernel/rcutree.c | 577
-rw-r--r--  kernel/rcutree.h | 78
-rw-r--r--  kernel/rcutree_plugin.h | 422
-rw-r--r--  kernel/rcutree_trace.c | 330
-rw-r--r--  kernel/relay.c | 4
-rw-r--r--  kernel/res_counter.c | 42
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 1
-rw-r--r--  kernel/rtmutex.c | 1
-rw-r--r--  kernel/rwsem.c | 10
-rw-r--r--  kernel/sched/auto_group.c | 3
-rw-r--r--  kernel/sched/clock.c | 26
-rw-r--r--  kernel/sched/core.c | 370
-rw-r--r--  kernel/sched/cpupri.c | 2
-rw-r--r--  kernel/sched/cputime.c | 395
-rw-r--r--  kernel/sched/debug.c | 133
-rw-r--r--  kernel/sched/fair.c | 1159
-rw-r--r--  kernel/sched/features.h | 16
-rw-r--r--  kernel/sched/rt.c | 28
-rw-r--r--  kernel/sched/sched.h | 74
-rw-r--r--  kernel/sched/stats.c | 79
-rw-r--r--  kernel/seccomp.c | 13
-rw-r--r--  kernel/signal.c | 493
-rw-r--r--  kernel/smp.c | 192
-rw-r--r--  kernel/smpboot.c | 17
-rw-r--r--  kernel/softirq.c | 44
-rw-r--r--  kernel/srcu.c | 53
-rw-r--r--  kernel/stop_machine.c | 156
-rw-r--r--  kernel/sys.c | 373
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 76
-rw-r--r--  kernel/sysctl_binary.c | 45
-rw-r--r--  kernel/time.c | 12
-rw-r--r--  kernel/time/Kconfig | 9
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/clockevents.c | 1
-rw-r--r--  kernel/time/jiffies.c | 8
-rw-r--r--  kernel/time/ntp.c | 48
-rw-r--r--  kernel/time/tick-broadcast.c | 41
-rw-r--r--  kernel/time/tick-common.c | 8
-rw-r--r--  kernel/time/tick-internal.h | 1
-rw-r--r--  kernel/time/tick-sched.c | 151
-rw-r--r--  kernel/time/timecompare.c | 193
-rw-r--r--  kernel/time/timekeeping.c | 135
-rw-r--r--  kernel/timeconst.bc | 108
-rw-r--r--  kernel/timeconst.pl | 378
-rw-r--r--  kernel/timer.c | 2
-rw-r--r--  kernel/trace/Kconfig | 56
-rw-r--r--  kernel/trace/blktrace.c | 30
-rw-r--r--  kernel/trace/ftrace.c | 228
-rw-r--r--  kernel/trace/power-traces.c | 3
-rw-r--r--  kernel/trace/ring_buffer.c | 179
-rw-r--r--  kernel/trace/trace.c | 814
-rw-r--r--  kernel/trace/trace.h | 158
-rw-r--r--  kernel/trace/trace_branch.c | 4
-rw-r--r--  kernel/trace/trace_clock.c | 5
-rw-r--r--  kernel/trace/trace_events.c | 52
-rw-r--r--  kernel/trace/trace_events_filter.c | 4
-rw-r--r--  kernel/trace/trace_functions.c | 68
-rw-r--r--  kernel/trace/trace_functions_graph.c | 74
-rw-r--r--  kernel/trace/trace_irqsoff.c | 35
-rw-r--r--  kernel/trace/trace_kprobe.c | 10
-rw-r--r--  kernel/trace/trace_output.c | 81
-rw-r--r--  kernel/trace/trace_probe.c | 14
-rw-r--r--  kernel/trace/trace_probe.h | 1
-rw-r--r--  kernel/trace/trace_sched_switch.c | 4
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 32
-rw-r--r--  kernel/trace/trace_selftest.c | 34
-rw-r--r--  kernel/trace/trace_stack.c | 6
-rw-r--r--  kernel/trace/trace_syscalls.c | 122
-rw-r--r--  kernel/trace/trace_uprobe.c | 229
-rw-r--r--  kernel/tracepoint.c | 6
-rw-r--r--  kernel/tsacct.c | 44
-rw-r--r--  kernel/user-return-notifier.c | 4
-rw-r--r--  kernel/user.c | 11
-rw-r--r--  kernel/user_namespace.c | 220
-rw-r--r--  kernel/utsname.c | 36
-rw-r--r--  kernel/utsname_sysctl.c | 3
-rw-r--r--  kernel/wait.c | 2
-rw-r--r--  kernel/watchdog.c | 32
-rw-r--r--  kernel/workqueue.c | 1586
-rw-r--r--  kernel/workqueue_internal.h | 65
-rw-r--r--  kernel/workqueue_sched.h | 9
162 files changed, 10846 insertions, 7394 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 86e3285ae7e5..bbde5f1a4486 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o groups.o lglock.o smpboot.o 13 async.o range.o groups.o lglock.o smpboot.o
@@ -25,9 +25,7 @@ endif
25obj-y += sched/ 25obj-y += sched/
26obj-y += power/ 26obj-y += power/
27 27
28ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) 28obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
29obj-$(CONFIG_X86) += kcmp.o
30endif
31obj-$(CONFIG_FREEZER) += freezer.o 29obj-$(CONFIG_FREEZER) += freezer.o
32obj-$(CONFIG_PROFILING) += profile.o 30obj-$(CONFIG_PROFILING) += profile.o
33obj-$(CONFIG_STACKTRACE) += stacktrace.o 31obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -54,7 +52,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
54obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 52obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
55obj-$(CONFIG_UID16) += uid16.o 53obj-$(CONFIG_UID16) += uid16.o
56obj-$(CONFIG_MODULES) += module.o 54obj-$(CONFIG_MODULES) += module.o
57obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o 55obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
58obj-$(CONFIG_KALLSYMS) += kallsyms.o 56obj-$(CONFIG_KALLSYMS) += kallsyms.o
59obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 57obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
60obj-$(CONFIG_KEXEC) += kexec.o 58obj-$(CONFIG_KEXEC) += kexec.o
@@ -110,6 +108,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
110obj-$(CONFIG_PADATA) += padata.o 108obj-$(CONFIG_PADATA) += padata.o
111obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
112obj-$(CONFIG_JUMP_LABEL) += jump_label.o 110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
111obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
113 112
114$(obj)/configs.o: $(obj)/config_data.h 113$(obj)/configs.o: $(obj)/config_data.h
115 114
@@ -126,20 +125,32 @@ $(obj)/config_data.h: $(obj)/config_data.gz FORCE
126 125
127$(obj)/time.o: $(obj)/timeconst.h 126$(obj)/time.o: $(obj)/timeconst.h
128 127
129quiet_cmd_timeconst = TIMEC $@ 128quiet_cmd_hzfile = HZFILE $@
130 cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@ 129 cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
130
131targets += hz.bc
132$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
133 $(call if_changed,hzfile)
134
135quiet_cmd_bc = BC $@
136 cmd_bc = bc -q $(filter-out FORCE,$^) > $@
137
131targets += timeconst.h 138targets += timeconst.h
132$(obj)/timeconst.h: $(src)/timeconst.pl FORCE 139$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
133 $(call if_changed,timeconst) 140 $(call if_changed,bc)
134 141
135ifeq ($(CONFIG_MODULE_SIG),y) 142ifeq ($(CONFIG_MODULE_SIG),y)
136# 143#
137# Pull the signing certificate and any extra certificates into the kernel 144# Pull the signing certificate and any extra certificates into the kernel
138# 145#
146
147quiet_cmd_touch = TOUCH $@
148 cmd_touch = touch $@
149
139extra_certificates: 150extra_certificates:
140 touch $@ 151 $(call cmd,touch)
141 152
142kernel/modsign_pubkey.o: signing_key.x509 extra_certificates 153kernel/modsign_certificate.o: signing_key.x509 extra_certificates
143 154
144############################################################################### 155###############################################################################
145# 156#
@@ -148,23 +159,7 @@ kernel/modsign_pubkey.o: signing_key.x509 extra_certificates
148# fail and that the kernel may be used afterwards. 159# fail and that the kernel may be used afterwards.
149# 160#
150############################################################################### 161###############################################################################
151sign_key_with_hash := 162ifndef CONFIG_MODULE_SIG_HASH
152ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
153sign_key_with_hash := -sha1
154endif
155ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
156sign_key_with_hash := -sha224
157endif
158ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
159sign_key_with_hash := -sha256
160endif
161ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
162sign_key_with_hash := -sha384
163endif
164ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
165sign_key_with_hash := -sha512
166endif
167ifeq ($(sign_key_with_hash),)
168$(error Could not determine digest type to use from kernel config) 163$(error Could not determine digest type to use from kernel config)
169endif 164endif
170 165
@@ -177,8 +172,8 @@ signing_key.priv signing_key.x509: x509.genkey
177 @echo "### needs to be run as root, and uses a hardware random" 172 @echo "### needs to be run as root, and uses a hardware random"
178 @echo "### number generator if one is available." 173 @echo "### number generator if one is available."
179 @echo "###" 174 @echo "###"
180 openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ 175 openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
181 -x509 -config x509.genkey \ 176 -batch -x509 -config x509.genkey \
182 -outform DER -out signing_key.x509 \ 177 -outform DER -out signing_key.x509 \
183 -keyout signing_key.priv 178 -keyout signing_key.priv
184 @echo "###" 179 @echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index 051e071a06e7..b9bd7f098ee5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -205,7 +205,7 @@ static int acct_on(struct filename *pathname)
205 if (IS_ERR(file)) 205 if (IS_ERR(file))
206 return PTR_ERR(file); 206 return PTR_ERR(file);
207 207
208 if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { 208 if (!S_ISREG(file_inode(file)->i_mode)) {
209 filp_close(file, NULL); 209 filp_close(file, NULL);
210 return -EACCES; 210 return -EACCES;
211 } 211 }
@@ -566,6 +566,7 @@ out:
566void acct_collect(long exitcode, int group_dead) 566void acct_collect(long exitcode, int group_dead)
567{ 567{
568 struct pacct_struct *pacct = &current->signal->pacct; 568 struct pacct_struct *pacct = &current->signal->pacct;
569 cputime_t utime, stime;
569 unsigned long vsize = 0; 570 unsigned long vsize = 0;
570 571
571 if (group_dead && current->mm) { 572 if (group_dead && current->mm) {
@@ -593,8 +594,9 @@ void acct_collect(long exitcode, int group_dead)
593 pacct->ac_flag |= ACORE; 594 pacct->ac_flag |= ACORE;
594 if (current->flags & PF_SIGNALED) 595 if (current->flags & PF_SIGNALED)
595 pacct->ac_flag |= AXSIG; 596 pacct->ac_flag |= AXSIG;
596 pacct->ac_utime += current->utime; 597 task_cputime(current, &utime, &stime);
597 pacct->ac_stime += current->stime; 598 pacct->ac_utime += utime;
599 pacct->ac_stime += stime;
598 pacct->ac_minflt += current->min_flt; 600 pacct->ac_minflt += current->min_flt;
599 pacct->ac_majflt += current->maj_flt; 601 pacct->ac_majflt += current->maj_flt;
600 spin_unlock_irq(&current->sighand->siglock); 602 spin_unlock_irq(&current->sighand->siglock);
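
The acct.c hunk above switches acct_collect() from reading current->utime/current->stime directly to the task_cputime() accessor, which hides how the raw cputime fields are stored. A minimal caller sketch under that assumption; the helper name below is hypothetical, only task_cputime() comes from this tree:

#include <linux/sched.h>	/* struct task_struct, cputime_t, task_cputime() */

/* Hypothetical helper: total CPU time charged to @tsk, read through the
 * accessor instead of touching tsk->utime/tsk->stime directly. */
static cputime_t example_total_cputime(struct task_struct *tsk)
{
	cputime_t utime, stime;

	task_cputime(tsk, &utime, &stime);	/* fills both values */
	return utime + stime;
}
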
diff --git a/kernel/async.c b/kernel/async.c
index 9d3118384858..8ddee2c3e5b0 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -57,56 +57,52 @@ asynchronous and synchronous parts of the kernel.
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/workqueue.h> 58#include <linux/workqueue.h>
59 59
60#include "workqueue_internal.h"
61
60static async_cookie_t next_cookie = 1; 62static async_cookie_t next_cookie = 1;
61 63
62#define MAX_WORK 32768 64#define MAX_WORK 32768
65#define ASYNC_COOKIE_MAX ULLONG_MAX /* infinity cookie */
63 66
64static LIST_HEAD(async_pending); 67static LIST_HEAD(async_global_pending); /* pending from all registered doms */
65static ASYNC_DOMAIN(async_running); 68static ASYNC_DOMAIN(async_dfl_domain);
66static LIST_HEAD(async_domains);
67static DEFINE_SPINLOCK(async_lock); 69static DEFINE_SPINLOCK(async_lock);
68static DEFINE_MUTEX(async_register_mutex);
69 70
70struct async_entry { 71struct async_entry {
71 struct list_head list; 72 struct list_head domain_list;
73 struct list_head global_list;
72 struct work_struct work; 74 struct work_struct work;
73 async_cookie_t cookie; 75 async_cookie_t cookie;
74 async_func_ptr *func; 76 async_func_ptr *func;
75 void *data; 77 void *data;
76 struct async_domain *running; 78 struct async_domain *domain;
77}; 79};
78 80
79static DECLARE_WAIT_QUEUE_HEAD(async_done); 81static DECLARE_WAIT_QUEUE_HEAD(async_done);
80 82
81static atomic_t entry_count; 83static atomic_t entry_count;
82 84
83 85static async_cookie_t lowest_in_progress(struct async_domain *domain)
84/*
85 * MUST be called with the lock held!
86 */
87static async_cookie_t __lowest_in_progress(struct async_domain *running)
88{ 86{
89 struct async_entry *entry; 87 struct async_entry *first = NULL;
90 88 async_cookie_t ret = ASYNC_COOKIE_MAX;
91 if (!list_empty(&running->domain)) { 89 unsigned long flags;
92 entry = list_first_entry(&running->domain, typeof(*entry), list);
93 return entry->cookie;
94 }
95 90
96 list_for_each_entry(entry, &async_pending, list) 91 spin_lock_irqsave(&async_lock, flags);
97 if (entry->running == running)
98 return entry->cookie;
99 92
100 return next_cookie; /* "infinity" value */ 93 if (domain) {
101} 94 if (!list_empty(&domain->pending))
95 first = list_first_entry(&domain->pending,
96 struct async_entry, domain_list);
97 } else {
98 if (!list_empty(&async_global_pending))
99 first = list_first_entry(&async_global_pending,
100 struct async_entry, global_list);
101 }
102 102
103static async_cookie_t lowest_in_progress(struct async_domain *running) 103 if (first)
104{ 104 ret = first->cookie;
105 unsigned long flags;
106 async_cookie_t ret;
107 105
108 spin_lock_irqsave(&async_lock, flags);
109 ret = __lowest_in_progress(running);
110 spin_unlock_irqrestore(&async_lock, flags); 106 spin_unlock_irqrestore(&async_lock, flags);
111 return ret; 107 return ret;
112} 108}
@@ -120,14 +116,8 @@ static void async_run_entry_fn(struct work_struct *work)
120 container_of(work, struct async_entry, work); 116 container_of(work, struct async_entry, work);
121 unsigned long flags; 117 unsigned long flags;
122 ktime_t uninitialized_var(calltime), delta, rettime; 118 ktime_t uninitialized_var(calltime), delta, rettime;
123 struct async_domain *running = entry->running;
124 119
125 /* 1) move self to the running queue */ 120 /* 1) run (and print duration) */
126 spin_lock_irqsave(&async_lock, flags);
127 list_move_tail(&entry->list, &running->domain);
128 spin_unlock_irqrestore(&async_lock, flags);
129
130 /* 2) run (and print duration) */
131 if (initcall_debug && system_state == SYSTEM_BOOTING) { 121 if (initcall_debug && system_state == SYSTEM_BOOTING) {
132 printk(KERN_DEBUG "calling %lli_%pF @ %i\n", 122 printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
133 (long long)entry->cookie, 123 (long long)entry->cookie,
@@ -144,23 +134,22 @@ static void async_run_entry_fn(struct work_struct *work)
144 (long long)ktime_to_ns(delta) >> 10); 134 (long long)ktime_to_ns(delta) >> 10);
145 } 135 }
146 136
147 /* 3) remove self from the running queue */ 137 /* 2) remove self from the pending queues */
148 spin_lock_irqsave(&async_lock, flags); 138 spin_lock_irqsave(&async_lock, flags);
149 list_del(&entry->list); 139 list_del_init(&entry->domain_list);
150 if (running->registered && --running->count == 0) 140 list_del_init(&entry->global_list);
151 list_del_init(&running->node);
152 141
153 /* 4) free the entry */ 142 /* 3) free the entry */
154 kfree(entry); 143 kfree(entry);
155 atomic_dec(&entry_count); 144 atomic_dec(&entry_count);
156 145
157 spin_unlock_irqrestore(&async_lock, flags); 146 spin_unlock_irqrestore(&async_lock, flags);
158 147
159 /* 5) wake up any waiters */ 148 /* 4) wake up any waiters */
160 wake_up(&async_done); 149 wake_up(&async_done);
161} 150}
162 151
163static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) 152static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
164{ 153{
165 struct async_entry *entry; 154 struct async_entry *entry;
166 unsigned long flags; 155 unsigned long flags;
@@ -183,19 +172,28 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
183 ptr(data, newcookie); 172 ptr(data, newcookie);
184 return newcookie; 173 return newcookie;
185 } 174 }
175 INIT_LIST_HEAD(&entry->domain_list);
176 INIT_LIST_HEAD(&entry->global_list);
186 INIT_WORK(&entry->work, async_run_entry_fn); 177 INIT_WORK(&entry->work, async_run_entry_fn);
187 entry->func = ptr; 178 entry->func = ptr;
188 entry->data = data; 179 entry->data = data;
189 entry->running = running; 180 entry->domain = domain;
190 181
191 spin_lock_irqsave(&async_lock, flags); 182 spin_lock_irqsave(&async_lock, flags);
183
184 /* allocate cookie and queue */
192 newcookie = entry->cookie = next_cookie++; 185 newcookie = entry->cookie = next_cookie++;
193 list_add_tail(&entry->list, &async_pending); 186
194 if (running->registered && running->count++ == 0) 187 list_add_tail(&entry->domain_list, &domain->pending);
195 list_add_tail(&running->node, &async_domains); 188 if (domain->registered)
189 list_add_tail(&entry->global_list, &async_global_pending);
190
196 atomic_inc(&entry_count); 191 atomic_inc(&entry_count);
197 spin_unlock_irqrestore(&async_lock, flags); 192 spin_unlock_irqrestore(&async_lock, flags);
198 193
194 /* mark that this task has queued an async job, used by module init */
195 current->flags |= PF_USED_ASYNC;
196
199 /* schedule for execution */ 197 /* schedule for execution */
200 queue_work(system_unbound_wq, &entry->work); 198 queue_work(system_unbound_wq, &entry->work);
201 199
@@ -212,7 +210,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
212 */ 210 */
213async_cookie_t async_schedule(async_func_ptr *ptr, void *data) 211async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
214{ 212{
215 return __async_schedule(ptr, data, &async_running); 213 return __async_schedule(ptr, data, &async_dfl_domain);
216} 214}
217EXPORT_SYMBOL_GPL(async_schedule); 215EXPORT_SYMBOL_GPL(async_schedule);
218 216
@@ -220,18 +218,18 @@ EXPORT_SYMBOL_GPL(async_schedule);
220 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain 218 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
221 * @ptr: function to execute asynchronously 219 * @ptr: function to execute asynchronously
222 * @data: data pointer to pass to the function 220 * @data: data pointer to pass to the function
223 * @running: running list for the domain 221 * @domain: the domain
224 * 222 *
225 * Returns an async_cookie_t that may be used for checkpointing later. 223 * Returns an async_cookie_t that may be used for checkpointing later.
226 * @running may be used in the async_synchronize_*_domain() functions 224 * @domain may be used in the async_synchronize_*_domain() functions to
227 * to wait within a certain synchronization domain rather than globally. 225 * wait within a certain synchronization domain rather than globally. A
228 * A synchronization domain is specified via the running queue @running to use. 226 * synchronization domain is specified via @domain. Note: This function
229 * Note: This function may be called from atomic or non-atomic contexts. 227 * may be called from atomic or non-atomic contexts.
230 */ 228 */
231async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, 229async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
232 struct async_domain *running) 230 struct async_domain *domain)
233{ 231{
234 return __async_schedule(ptr, data, running); 232 return __async_schedule(ptr, data, domain);
235} 233}
236EXPORT_SYMBOL_GPL(async_schedule_domain); 234EXPORT_SYMBOL_GPL(async_schedule_domain);
237 235
@@ -242,18 +240,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
242 */ 240 */
243void async_synchronize_full(void) 241void async_synchronize_full(void)
244{ 242{
245 mutex_lock(&async_register_mutex); 243 async_synchronize_full_domain(NULL);
246 do {
247 struct async_domain *domain = NULL;
248
249 spin_lock_irq(&async_lock);
250 if (!list_empty(&async_domains))
251 domain = list_first_entry(&async_domains, typeof(*domain), node);
252 spin_unlock_irq(&async_lock);
253
254 async_synchronize_cookie_domain(next_cookie, domain);
255 } while (!list_empty(&async_domains));
256 mutex_unlock(&async_register_mutex);
257} 244}
258EXPORT_SYMBOL_GPL(async_synchronize_full); 245EXPORT_SYMBOL_GPL(async_synchronize_full);
259 246
@@ -268,51 +255,45 @@ EXPORT_SYMBOL_GPL(async_synchronize_full);
268 */ 255 */
269void async_unregister_domain(struct async_domain *domain) 256void async_unregister_domain(struct async_domain *domain)
270{ 257{
271 mutex_lock(&async_register_mutex);
272 spin_lock_irq(&async_lock); 258 spin_lock_irq(&async_lock);
273 WARN_ON(!domain->registered || !list_empty(&domain->node) || 259 WARN_ON(!domain->registered || !list_empty(&domain->pending));
274 !list_empty(&domain->domain));
275 domain->registered = 0; 260 domain->registered = 0;
276 spin_unlock_irq(&async_lock); 261 spin_unlock_irq(&async_lock);
277 mutex_unlock(&async_register_mutex);
278} 262}
279EXPORT_SYMBOL_GPL(async_unregister_domain); 263EXPORT_SYMBOL_GPL(async_unregister_domain);
280 264
281/** 265/**
282 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain 266 * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
283 * @domain: running list to synchronize on 267 * @domain: the domain to synchronize
284 * 268 *
285 * This function waits until all asynchronous function calls for the 269 * This function waits until all asynchronous function calls for the
286 * synchronization domain specified by the running list @domain have been done. 270 * synchronization domain specified by @domain have been done.
287 */ 271 */
288void async_synchronize_full_domain(struct async_domain *domain) 272void async_synchronize_full_domain(struct async_domain *domain)
289{ 273{
290 async_synchronize_cookie_domain(next_cookie, domain); 274 async_synchronize_cookie_domain(ASYNC_COOKIE_MAX, domain);
291} 275}
292EXPORT_SYMBOL_GPL(async_synchronize_full_domain); 276EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
293 277
294/** 278/**
295 * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing 279 * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
296 * @cookie: async_cookie_t to use as checkpoint 280 * @cookie: async_cookie_t to use as checkpoint
297 * @running: running list to synchronize on 281 * @domain: the domain to synchronize (%NULL for all registered domains)
298 * 282 *
299 * This function waits until all asynchronous function calls for the 283 * This function waits until all asynchronous function calls for the
300 * synchronization domain specified by running list @running submitted 284 * synchronization domain specified by @domain submitted prior to @cookie
301 * prior to @cookie have been done. 285 * have been done.
302 */ 286 */
303void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) 287void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
304{ 288{
305 ktime_t uninitialized_var(starttime), delta, endtime; 289 ktime_t uninitialized_var(starttime), delta, endtime;
306 290
307 if (!running)
308 return;
309
310 if (initcall_debug && system_state == SYSTEM_BOOTING) { 291 if (initcall_debug && system_state == SYSTEM_BOOTING) {
311 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 292 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
312 starttime = ktime_get(); 293 starttime = ktime_get();
313 } 294 }
314 295
315 wait_event(async_done, lowest_in_progress(running) >= cookie); 296 wait_event(async_done, lowest_in_progress(domain) >= cookie);
316 297
317 if (initcall_debug && system_state == SYSTEM_BOOTING) { 298 if (initcall_debug && system_state == SYSTEM_BOOTING) {
318 endtime = ktime_get(); 299 endtime = ktime_get();
@@ -334,6 +315,18 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
334 */ 315 */
335void async_synchronize_cookie(async_cookie_t cookie) 316void async_synchronize_cookie(async_cookie_t cookie)
336{ 317{
337 async_synchronize_cookie_domain(cookie, &async_running); 318 async_synchronize_cookie_domain(cookie, &async_dfl_domain);
338} 319}
339EXPORT_SYMBOL_GPL(async_synchronize_cookie); 320EXPORT_SYMBOL_GPL(async_synchronize_cookie);
321
322/**
323 * current_is_async - is %current an async worker task?
324 *
325 * Returns %true if %current is an async worker task.
326 */
327bool current_is_async(void)
328{
329 struct worker *worker = current_wq_worker();
330
331 return worker && worker->current_func == async_run_entry_fn;
332}
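
The async.c rework above keeps entries on per-domain pending lists until they complete, synchronizes against ASYNC_COOKIE_MAX rather than next_cookie, and adds current_is_async(). The public API is unchanged; a hedged usage sketch, assuming <linux/async.h>, with the probe function and domain name made up for illustration:

#include <linux/async.h>	/* async_schedule_domain(), ASYNC_DOMAIN(), ... */
#include <linux/printk.h>

/* Hypothetical private synchronization domain for one driver. */
static ASYNC_DOMAIN(example_domain);

static void example_probe_one(void *data, async_cookie_t cookie)
{
	pr_debug("async probe %lli for %p\n", (long long)cookie, data);
}

static void example_probe_all(void *devs[], int n)
{
	int i;

	for (i = 0; i < n; i++)
		async_schedule_domain(example_probe_one, devs[i],
				      &example_domain);

	/* Wait only for work queued in this domain, not globally. */
	async_synchronize_full_domain(&example_domain);
}
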
diff --git a/kernel/audit.c b/kernel/audit.c
index 40414e9143db..d596e5355f15 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -272,6 +272,8 @@ static int audit_log_config_change(char *function_name, int new, int old,
272 int rc = 0; 272 int rc = 0;
273 273
274 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 274 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
275 if (unlikely(!ab))
276 return rc;
275 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, 277 audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
276 old, from_kuid(&init_user_ns, loginuid), sessionid); 278 old, from_kuid(&init_user_ns, loginuid), sessionid);
277 if (sid) { 279 if (sid) {
@@ -619,6 +621,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
619 } 621 }
620 622
621 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 623 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
624 if (unlikely(!*ab))
625 return rc;
622 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", 626 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
623 task_tgid_vnr(current), 627 task_tgid_vnr(current),
624 from_kuid(&init_user_ns, current_uid()), 628 from_kuid(&init_user_ns, current_uid()),
@@ -1097,6 +1101,23 @@ static inline void audit_get_stamp(struct audit_context *ctx,
1097 } 1101 }
1098} 1102}
1099 1103
1104/*
1105 * Wait for auditd to drain the queue a little
1106 */
1107static void wait_for_auditd(unsigned long sleep_time)
1108{
1109 DECLARE_WAITQUEUE(wait, current);
1110 set_current_state(TASK_INTERRUPTIBLE);
1111 add_wait_queue(&audit_backlog_wait, &wait);
1112
1113 if (audit_backlog_limit &&
1114 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
1115 schedule_timeout(sleep_time);
1116
1117 __set_current_state(TASK_RUNNING);
1118 remove_wait_queue(&audit_backlog_wait, &wait);
1119}
1120
1100/* Obtain an audit buffer. This routine does locking to obtain the 1121/* Obtain an audit buffer. This routine does locking to obtain the
1101 * audit buffer, but then no locking is required for calls to 1122 * audit buffer, but then no locking is required for calls to
1102 * audit_log_*format. If the tsk is a task that is currently in a 1123 * audit_log_*format. If the tsk is a task that is currently in a
@@ -1142,20 +1163,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1142 1163
1143 while (audit_backlog_limit 1164 while (audit_backlog_limit
1144 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { 1165 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
1145 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time 1166 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
1146 && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { 1167 unsigned long sleep_time;
1147 1168
1148 /* Wait for auditd to drain the queue a little */ 1169 sleep_time = timeout_start + audit_backlog_wait_time -
1149 DECLARE_WAITQUEUE(wait, current); 1170 jiffies;
1150 set_current_state(TASK_INTERRUPTIBLE); 1171 if ((long)sleep_time > 0)
1151 add_wait_queue(&audit_backlog_wait, &wait); 1172 wait_for_auditd(sleep_time);
1152
1153 if (audit_backlog_limit &&
1154 skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
1155 schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
1156
1157 __set_current_state(TASK_RUNNING);
1158 remove_wait_queue(&audit_backlog_wait, &wait);
1159 continue; 1173 continue;
1160 } 1174 }
1161 if (audit_rate_check() && printk_ratelimit()) 1175 if (audit_rate_check() && printk_ratelimit())
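
Several hunks in this series add the same guard: audit_log_start() can return NULL (audit disabled, or no buffer available under backlog pressure), so callers must bail out before calling audit_log_format(). A minimal sketch of that pattern, assuming <linux/audit.h>; the helper and record contents are illustrative only:

#include <linux/audit.h>	/* audit_log_start(), audit_log_format(), audit_log_end() */
#include <linux/gfp.h>

/* Hypothetical helper emitting a CONFIG_CHANGE record for one integer knob. */
static void example_log_change(const char *name, int new, int old)
{
	struct audit_buffer *ab;

	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
	if (unlikely(!ab))	/* nothing to log into; do not dereference */
		return;
	audit_log_format(ab, "%s=%d old=%d", name, new, old);
	audit_log_end(ab);
}
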
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index ed206fd88cca..642a89c4f3d6 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -249,7 +249,7 @@ static void untag_chunk(struct node *p)
249 list_del_rcu(&chunk->hash); 249 list_del_rcu(&chunk->hash);
250 spin_unlock(&hash_lock); 250 spin_unlock(&hash_lock);
251 spin_unlock(&entry->lock); 251 spin_unlock(&entry->lock);
252 fsnotify_destroy_mark(entry); 252 fsnotify_destroy_mark(entry, audit_tree_group);
253 goto out; 253 goto out;
254 } 254 }
255 255
@@ -291,7 +291,7 @@ static void untag_chunk(struct node *p)
291 owner->root = new; 291 owner->root = new;
292 spin_unlock(&hash_lock); 292 spin_unlock(&hash_lock);
293 spin_unlock(&entry->lock); 293 spin_unlock(&entry->lock);
294 fsnotify_destroy_mark(entry); 294 fsnotify_destroy_mark(entry, audit_tree_group);
295 fsnotify_put_mark(&new->mark); /* drop initial reference */ 295 fsnotify_put_mark(&new->mark); /* drop initial reference */
296 goto out; 296 goto out;
297 297
@@ -331,7 +331,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
331 spin_unlock(&hash_lock); 331 spin_unlock(&hash_lock);
332 chunk->dead = 1; 332 chunk->dead = 1;
333 spin_unlock(&entry->lock); 333 spin_unlock(&entry->lock);
334 fsnotify_destroy_mark(entry); 334 fsnotify_destroy_mark(entry, audit_tree_group);
335 fsnotify_put_mark(entry); 335 fsnotify_put_mark(entry);
336 return 0; 336 return 0;
337 } 337 }
@@ -412,7 +412,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
412 spin_unlock(&chunk_entry->lock); 412 spin_unlock(&chunk_entry->lock);
413 spin_unlock(&old_entry->lock); 413 spin_unlock(&old_entry->lock);
414 414
415 fsnotify_destroy_mark(chunk_entry); 415 fsnotify_destroy_mark(chunk_entry, audit_tree_group);
416 416
417 fsnotify_put_mark(chunk_entry); 417 fsnotify_put_mark(chunk_entry);
418 fsnotify_put_mark(old_entry); 418 fsnotify_put_mark(old_entry);
@@ -443,17 +443,32 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
443 spin_unlock(&hash_lock); 443 spin_unlock(&hash_lock);
444 spin_unlock(&chunk_entry->lock); 444 spin_unlock(&chunk_entry->lock);
445 spin_unlock(&old_entry->lock); 445 spin_unlock(&old_entry->lock);
446 fsnotify_destroy_mark(old_entry); 446 fsnotify_destroy_mark(old_entry, audit_tree_group);
447 fsnotify_put_mark(chunk_entry); /* drop initial reference */ 447 fsnotify_put_mark(chunk_entry); /* drop initial reference */
448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ 448 fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
449 return 0; 449 return 0;
450} 450}
451 451
452static void audit_log_remove_rule(struct audit_krule *rule)
453{
454 struct audit_buffer *ab;
455
456 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
457 if (unlikely(!ab))
458 return;
459 audit_log_format(ab, "op=");
460 audit_log_string(ab, "remove rule");
461 audit_log_format(ab, " dir=");
462 audit_log_untrustedstring(ab, rule->tree->pathname);
463 audit_log_key(ab, rule->filterkey);
464 audit_log_format(ab, " list=%d res=1", rule->listnr);
465 audit_log_end(ab);
466}
467
452static void kill_rules(struct audit_tree *tree) 468static void kill_rules(struct audit_tree *tree)
453{ 469{
454 struct audit_krule *rule, *next; 470 struct audit_krule *rule, *next;
455 struct audit_entry *entry; 471 struct audit_entry *entry;
456 struct audit_buffer *ab;
457 472
458 list_for_each_entry_safe(rule, next, &tree->rules, rlist) { 473 list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
459 entry = container_of(rule, struct audit_entry, rule); 474 entry = container_of(rule, struct audit_entry, rule);
@@ -461,14 +476,7 @@ static void kill_rules(struct audit_tree *tree)
461 list_del_init(&rule->rlist); 476 list_del_init(&rule->rlist);
462 if (rule->tree) { 477 if (rule->tree) {
463 /* not a half-baked one */ 478 /* not a half-baked one */
464 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 479 audit_log_remove_rule(rule);
465 audit_log_format(ab, "op=");
466 audit_log_string(ab, "remove rule");
467 audit_log_format(ab, " dir=");
468 audit_log_untrustedstring(ab, rule->tree->pathname);
469 audit_log_key(ab, rule->filterkey);
470 audit_log_format(ab, " list=%d res=1", rule->listnr);
471 audit_log_end(ab);
472 rule->tree = NULL; 480 rule->tree = NULL;
473 list_del_rcu(&entry->list); 481 list_del_rcu(&entry->list);
474 list_del(&entry->rule.list); 482 list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 9a9ae6e3d290..22831c4d369c 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -240,6 +240,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
240 if (audit_enabled) { 240 if (audit_enabled) {
241 struct audit_buffer *ab; 241 struct audit_buffer *ab;
242 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); 242 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
243 if (unlikely(!ab))
244 return;
243 audit_log_format(ab, "auid=%u ses=%u op=", 245 audit_log_format(ab, "auid=%u ses=%u op=",
244 from_kuid(&init_user_ns, audit_get_loginuid(current)), 246 from_kuid(&init_user_ns, audit_get_loginuid(current)),
245 audit_get_sessionid(current)); 247 audit_get_sessionid(current));
@@ -350,7 +352,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
350 } 352 }
351 mutex_unlock(&audit_filter_mutex); 353 mutex_unlock(&audit_filter_mutex);
352 354
353 fsnotify_destroy_mark(&parent->mark); 355 fsnotify_destroy_mark(&parent->mark, audit_watch_group);
354} 356}
355 357
356/* Get path information necessary for adding watches. */ 358/* Get path information necessary for adding watches. */
@@ -457,7 +459,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
457 459
458 if (list_empty(&parent->watches)) { 460 if (list_empty(&parent->watches)) {
459 audit_get_parent(parent); 461 audit_get_parent(parent);
460 fsnotify_destroy_mark(&parent->mark); 462 fsnotify_destroy_mark(&parent->mark, audit_watch_group);
461 audit_put_parent(parent); 463 audit_put_parent(parent);
462 } 464 }
463 } 465 }
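
The audit_tree.c and audit_watch.c hunks above track an fsnotify API change: fsnotify_destroy_mark() now takes the fsnotify_group the mark is attached to, instead of deriving it from the mark. A hedged caller sketch, assuming <linux/fsnotify_backend.h>; the mark and group parameters stand in for whatever references the caller already holds:

#include <linux/fsnotify_backend.h>	/* struct fsnotify_mark, fsnotify_destroy_mark() */

/* Hypothetical teardown: detach @mark from @group, then drop our reference. */
static void example_remove_mark(struct fsnotify_mark *mark,
				struct fsnotify_group *group)
{
	/* The group is passed explicitly rather than looked up via mark->group. */
	fsnotify_destroy_mark(mark, group);
	fsnotify_put_mark(mark);	/* pairs with the caller's reference */
}
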
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7f19f23d38a3..f9fc54bbe06f 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1144,7 +1144,6 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
1144 * audit_receive_filter - apply all rules to the specified message type 1144 * audit_receive_filter - apply all rules to the specified message type
1145 * @type: audit message type 1145 * @type: audit message type
1146 * @pid: target pid for netlink audit messages 1146 * @pid: target pid for netlink audit messages
1147 * @uid: target uid for netlink audit messages
1148 * @seq: netlink audit message sequence (serial) number 1147 * @seq: netlink audit message sequence (serial) number
1149 * @data: payload data 1148 * @data: payload data
1150 * @datasz: size of payload data 1149 * @datasz: size of payload data
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2f186ed80c40..a371f857a0a9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -200,7 +200,6 @@ struct audit_context {
200 struct list_head names_list; /* anchor for struct audit_names->list */ 200 struct list_head names_list; /* anchor for struct audit_names->list */
201 char * filterkey; /* key for rule that triggered record */ 201 char * filterkey; /* key for rule that triggered record */
202 struct path pwd; 202 struct path pwd;
203 struct audit_context *previous; /* For nested syscalls */
204 struct audit_aux_data *aux; 203 struct audit_aux_data *aux;
205 struct audit_aux_data *aux_pids; 204 struct audit_aux_data *aux_pids;
206 struct sockaddr_storage *sockaddr; 205 struct sockaddr_storage *sockaddr;
@@ -1091,29 +1090,13 @@ int audit_alloc(struct task_struct *tsk)
1091 1090
1092static inline void audit_free_context(struct audit_context *context) 1091static inline void audit_free_context(struct audit_context *context)
1093{ 1092{
1094 struct audit_context *previous; 1093 audit_free_names(context);
1095 int count = 0; 1094 unroll_tree_refs(context, NULL, 0);
1096 1095 free_tree_refs(context);
1097 do { 1096 audit_free_aux(context);
1098 previous = context->previous; 1097 kfree(context->filterkey);
1099 if (previous || (count && count < 10)) { 1098 kfree(context->sockaddr);
1100 ++count; 1099 kfree(context);
1101 printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
1102 " freeing multiple contexts (%d)\n",
1103 context->serial, context->major,
1104 context->name_count, count);
1105 }
1106 audit_free_names(context);
1107 unroll_tree_refs(context, NULL, 0);
1108 free_tree_refs(context);
1109 audit_free_aux(context);
1110 kfree(context->filterkey);
1111 kfree(context->sockaddr);
1112 kfree(context);
1113 context = previous;
1114 } while (context);
1115 if (count >= 10)
1116 printk(KERN_ERR "audit: freed %d contexts\n", count);
1117} 1100}
1118 1101
1119void audit_log_task_context(struct audit_buffer *ab) 1102void audit_log_task_context(struct audit_buffer *ab)
@@ -1159,7 +1142,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1159 cred = current_cred(); 1142 cred = current_cred();
1160 1143
1161 spin_lock_irq(&tsk->sighand->siglock); 1144 spin_lock_irq(&tsk->sighand->siglock);
1162 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) 1145 if (tsk->signal && tsk->signal->tty)
1163 tty = tsk->signal->tty->name; 1146 tty = tsk->signal->tty->name;
1164 else 1147 else
1165 tty = "(none)"; 1148 tty = "(none)";
@@ -1481,14 +1464,14 @@ static void show_special(struct audit_context *context, int *call_panic)
1481 audit_log_end(ab); 1464 audit_log_end(ab);
1482 ab = audit_log_start(context, GFP_KERNEL, 1465 ab = audit_log_start(context, GFP_KERNEL,
1483 AUDIT_IPC_SET_PERM); 1466 AUDIT_IPC_SET_PERM);
1467 if (unlikely(!ab))
1468 return;
1484 audit_log_format(ab, 1469 audit_log_format(ab,
1485 "qbytes=%lx ouid=%u ogid=%u mode=%#ho", 1470 "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
1486 context->ipc.qbytes, 1471 context->ipc.qbytes,
1487 context->ipc.perm_uid, 1472 context->ipc.perm_uid,
1488 context->ipc.perm_gid, 1473 context->ipc.perm_gid,
1489 context->ipc.perm_mode); 1474 context->ipc.perm_mode);
1490 if (!ab)
1491 return;
1492 } 1475 }
1493 break; } 1476 break; }
1494 case AUDIT_MQ_OPEN: { 1477 case AUDIT_MQ_OPEN: {
@@ -1783,42 +1766,6 @@ void __audit_syscall_entry(int arch, int major,
1783 if (!context) 1766 if (!context)
1784 return; 1767 return;
1785 1768
1786 /*
1787 * This happens only on certain architectures that make system
1788 * calls in kernel_thread via the entry.S interface, instead of
1789 * with direct calls. (If you are porting to a new
1790 * architecture, hitting this condition can indicate that you
1791 * got the _exit/_leave calls backward in entry.S.)
1792 *
1793 * i386 no
1794 * x86_64 no
1795 * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S)
1796 *
1797 * This also happens with vm86 emulation in a non-nested manner
1798 * (entries without exits), so this case must be caught.
1799 */
1800 if (context->in_syscall) {
1801 struct audit_context *newctx;
1802
1803#if AUDIT_DEBUG
1804 printk(KERN_ERR
1805 "audit(:%d) pid=%d in syscall=%d;"
1806 " entering syscall=%d\n",
1807 context->serial, tsk->pid, context->major, major);
1808#endif
1809 newctx = audit_alloc_context(context->state);
1810 if (newctx) {
1811 newctx->previous = context;
1812 context = newctx;
1813 tsk->audit_context = newctx;
1814 } else {
1815 /* If we can't alloc a new context, the best we
1816 * can do is to leak memory (any pending putname
1817 * will be lost). The only other alternative is
1818 * to abandon auditing. */
1819 audit_zero_context(context, context->state);
1820 }
1821 }
1822 BUG_ON(context->in_syscall || context->name_count); 1769 BUG_ON(context->in_syscall || context->name_count);
1823 1770
1824 if (!audit_enabled) 1771 if (!audit_enabled)
@@ -1881,28 +1828,21 @@ void __audit_syscall_exit(int success, long return_code)
1881 if (!list_empty(&context->killed_trees)) 1828 if (!list_empty(&context->killed_trees))
1882 audit_kill_trees(&context->killed_trees); 1829 audit_kill_trees(&context->killed_trees);
1883 1830
1884 if (context->previous) { 1831 audit_free_names(context);
1885 struct audit_context *new_context = context->previous; 1832 unroll_tree_refs(context, NULL, 0);
1886 context->previous = NULL; 1833 audit_free_aux(context);
1887 audit_free_context(context); 1834 context->aux = NULL;
1888 tsk->audit_context = new_context; 1835 context->aux_pids = NULL;
1889 } else { 1836 context->target_pid = 0;
1890 audit_free_names(context); 1837 context->target_sid = 0;
1891 unroll_tree_refs(context, NULL, 0); 1838 context->sockaddr_len = 0;
1892 audit_free_aux(context); 1839 context->type = 0;
1893 context->aux = NULL; 1840 context->fds[0] = -1;
1894 context->aux_pids = NULL; 1841 if (context->state != AUDIT_RECORD_CONTEXT) {
1895 context->target_pid = 0; 1842 kfree(context->filterkey);
1896 context->target_sid = 0; 1843 context->filterkey = NULL;
1897 context->sockaddr_len = 0;
1898 context->type = 0;
1899 context->fds[0] = -1;
1900 if (context->state != AUDIT_RECORD_CONTEXT) {
1901 kfree(context->filterkey);
1902 context->filterkey = NULL;
1903 }
1904 tsk->audit_context = context;
1905 } 1844 }
1845 tsk->audit_context = context;
1906} 1846}
1907 1847
1908static inline void handle_one(const struct inode *inode) 1848static inline void handle_one(const struct inode *inode)
@@ -2735,7 +2675,7 @@ void __audit_mmap_fd(int fd, int flags)
2735 context->type = AUDIT_MMAP; 2675 context->type = AUDIT_MMAP;
2736} 2676}
2737 2677
2738static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) 2678static void audit_log_task(struct audit_buffer *ab)
2739{ 2679{
2740 kuid_t auid, uid; 2680 kuid_t auid, uid;
2741 kgid_t gid; 2681 kgid_t gid;
@@ -2753,6 +2693,11 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2753 audit_log_task_context(ab); 2693 audit_log_task_context(ab);
2754 audit_log_format(ab, " pid=%d comm=", current->pid); 2694 audit_log_format(ab, " pid=%d comm=", current->pid);
2755 audit_log_untrustedstring(ab, current->comm); 2695 audit_log_untrustedstring(ab, current->comm);
2696}
2697
2698static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2699{
2700 audit_log_task(ab);
2756 audit_log_format(ab, " reason="); 2701 audit_log_format(ab, " reason=");
2757 audit_log_string(ab, reason); 2702 audit_log_string(ab, reason);
2758 audit_log_format(ab, " sig=%ld", signr); 2703 audit_log_format(ab, " sig=%ld", signr);
@@ -2775,6 +2720,8 @@ void audit_core_dumps(long signr)
2775 return; 2720 return;
2776 2721
2777 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2722 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2723 if (unlikely(!ab))
2724 return;
2778 audit_log_abend(ab, "memory violation", signr); 2725 audit_log_abend(ab, "memory violation", signr);
2779 audit_log_end(ab); 2726 audit_log_end(ab);
2780} 2727}
@@ -2783,8 +2730,11 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
2783{ 2730{
2784 struct audit_buffer *ab; 2731 struct audit_buffer *ab;
2785 2732
2786 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2733 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
2787 audit_log_abend(ab, "seccomp", signr); 2734 if (unlikely(!ab))
2735 return;
2736 audit_log_task(ab);
2737 audit_log_format(ab, " sig=%ld", signr);
2788 audit_log_format(ab, " syscall=%ld", syscall); 2738 audit_log_format(ab, " syscall=%ld", syscall);
2789 audit_log_format(ab, " compat=%d", is_compat_task()); 2739 audit_log_format(ab, " compat=%d", is_compat_task());
2790 audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current)); 2740 audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
diff --git a/kernel/capability.c b/kernel/capability.c
index 493d97259484..f6c2ce5701e1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap)
393EXPORT_SYMBOL(ns_capable); 393EXPORT_SYMBOL(ns_capable);
394 394
395/** 395/**
396 * file_ns_capable - Determine if the file's opener had a capability in effect
397 * @file: The file we want to check
398 * @ns: The usernamespace we want the capability in
399 * @cap: The capability to be tested for
400 *
401 * Return true if task that opened the file had a capability in effect
402 * when the file was opened.
403 *
404 * This does not set PF_SUPERPRIV because the caller may not
405 * actually be privileged.
406 */
407bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap)
408{
409 if (WARN_ON_ONCE(!cap_valid(cap)))
410 return false;
411
412 if (security_capable(file->f_cred, ns, cap) == 0)
413 return true;
414
415 return false;
416}
417EXPORT_SYMBOL(file_ns_capable);
418
419/**
396 * capable - Determine if the current task has a superior capability in effect 420 * capable - Determine if the current task has a superior capability in effect
397 * @cap: The capability to be tested for 421 * @cap: The capability to be tested for
398 * 422 *
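
file_ns_capable(), added above, tests the credentials the file was opened with (file->f_cred) rather than the caller's current credentials. A hedged sketch of a caller, assuming <linux/capability.h>; the policy shown (CAP_SYS_ADMIN in the initial user namespace) is illustrative, not taken from this series:

#include <linux/capability.h>	/* file_ns_capable(), CAP_SYS_ADMIN */
#include <linux/fs.h>		/* struct file */
#include <linux/user_namespace.h>	/* init_user_ns */

/* Hypothetical check: was the opener of @file privileged in &init_user_ns?
 * Useful when the fd may have been passed to a less privileged task, so
 * checking current's credentials at use time would be the wrong test. */
static bool example_opener_was_admin(const struct file *file)
{
	return file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
}
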
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24f724620dd..a32f9432666c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,7 @@
52#include <linux/module.h> 52#include <linux/module.h>
53#include <linux/delayacct.h> 53#include <linux/delayacct.h>
54#include <linux/cgroupstats.h> 54#include <linux/cgroupstats.h>
55#include <linux/hash.h> 55#include <linux/hashtable.h>
56#include <linux/namei.h> 56#include <linux/namei.h>
57#include <linux/pid_namespace.h> 57#include <linux/pid_namespace.h>
58#include <linux/idr.h> 58#include <linux/idr.h>
@@ -138,6 +138,9 @@ struct cgroupfs_root {
138 /* Hierarchy-specific flags */ 138 /* Hierarchy-specific flags */
139 unsigned long flags; 139 unsigned long flags;
140 140
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
141 /* The path to use for release notifications. */ 144 /* The path to use for release notifications. */
142 char release_agent_path[PATH_MAX]; 145 char release_agent_path[PATH_MAX];
143 146
@@ -171,8 +174,8 @@ struct css_id {
171 * The css to which this ID points. This pointer is set to valid value 174 * The css to which this ID points. This pointer is set to valid value
172 * after cgroup is populated. If cgroup is removed, this will be NULL. 175 * after cgroup is populated. If cgroup is removed, this will be NULL.
173 * This pointer is expected to be RCU-safe because destroy() 176 * This pointer is expected to be RCU-safe because destroy()
174 * is called after synchronize_rcu(). But for safe use, css_is_removed() 177 * is called after synchronize_rcu(). But for safe use, css_tryget()
175 * css_tryget() should be used for avoiding race. 178 * should be used for avoiding race.
176 */ 179 */
177 struct cgroup_subsys_state __rcu *css; 180 struct cgroup_subsys_state __rcu *css;
178 /* 181 /*
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
242 */ 245 */
243static int need_forkexit_callback __read_mostly; 246static int need_forkexit_callback __read_mostly;
244 247
248static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add);
251
245#ifdef CONFIG_PROVE_LOCKING 252#ifdef CONFIG_PROVE_LOCKING
246int cgroup_lock_is_held(void) 253int cgroup_lock_is_held(void)
247{ 254{
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
294 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 301 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
295} 302}
296 303
297static int clone_children(const struct cgroup *cgrp)
298{
299 return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
300}
301
302/* 304/*
303 * for_each_subsys() allows you to iterate on each subsystem attached to 305 * for_each_subsys() allows you to iterate on each subsystem attached to
304 * an active hierarchy 306 * an active hierarchy
@@ -374,22 +376,18 @@ static int css_set_count;
374 * account cgroups in empty hierarchies. 376 * account cgroups in empty hierarchies.
375 */ 377 */
376#define CSS_SET_HASH_BITS 7 378#define CSS_SET_HASH_BITS 7
377#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 379static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
378static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
379 380
380static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) 381static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
381{ 382{
382 int i; 383 int i;
383 int index; 384 unsigned long key = 0UL;
384 unsigned long tmp = 0UL;
385 385
386 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) 386 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
387 tmp += (unsigned long)css[i]; 387 key += (unsigned long)css[i];
388 tmp = (tmp >> 16) ^ tmp; 388 key = (key >> 16) ^ key;
389 389
390 index = hash_long(tmp, CSS_SET_HASH_BITS); 390 return key;
391
392 return &css_set_table[index];
393} 391}
394 392
395/* We don't maintain the lists running through each css_set to its 393/* We don't maintain the lists running through each css_set to its
@@ -416,7 +414,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
416 } 414 }
417 415
418 /* This css_set is dead. unlink it and release cgroup refcounts */ 416 /* This css_set is dead. unlink it and release cgroup refcounts */
419 hlist_del(&cg->hlist); 417 hash_del(&cg->hlist);
420 css_set_count--; 418 css_set_count--;
421 419
422 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 420 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
@@ -424,12 +422,20 @@ static void __put_css_set(struct css_set *cg, int taskexit)
424 struct cgroup *cgrp = link->cgrp; 422 struct cgroup *cgrp = link->cgrp;
425 list_del(&link->cg_link_list); 423 list_del(&link->cg_link_list);
426 list_del(&link->cgrp_link_list); 424 list_del(&link->cgrp_link_list);
425
426 /*
427 * We may not be holding cgroup_mutex, and if cgrp->count is
428 * dropped to 0 the cgroup can be destroyed at any time, hence
429 * rcu_read_lock is used to keep it alive.
430 */
431 rcu_read_lock();
427 if (atomic_dec_and_test(&cgrp->count) && 432 if (atomic_dec_and_test(&cgrp->count) &&
428 notify_on_release(cgrp)) { 433 notify_on_release(cgrp)) {
429 if (taskexit) 434 if (taskexit)
430 set_bit(CGRP_RELEASABLE, &cgrp->flags); 435 set_bit(CGRP_RELEASABLE, &cgrp->flags);
431 check_for_release(cgrp); 436 check_for_release(cgrp);
432 } 437 }
438 rcu_read_unlock();
433 439
434 kfree(link); 440 kfree(link);
435 } 441 }
@@ -548,9 +554,8 @@ static struct css_set *find_existing_css_set(
548{ 554{
549 int i; 555 int i;
550 struct cgroupfs_root *root = cgrp->root; 556 struct cgroupfs_root *root = cgrp->root;
551 struct hlist_head *hhead;
552 struct hlist_node *node;
553 struct css_set *cg; 557 struct css_set *cg;
558 unsigned long key;
554 559
555 /* 560 /*
556 * Build the set of subsystem state objects that we want to see in the 561 * Build the set of subsystem state objects that we want to see in the
@@ -570,8 +575,8 @@ static struct css_set *find_existing_css_set(
570 } 575 }
571 } 576 }
572 577
573 hhead = css_set_hash(template); 578 key = css_set_hash(template);
574 hlist_for_each_entry(cg, node, hhead, hlist) { 579 hash_for_each_possible(css_set_table, cg, hlist, key) {
575 if (!compare_css_sets(cg, oldcg, cgrp, template)) 580 if (!compare_css_sets(cg, oldcg, cgrp, template))
576 continue; 581 continue;
577 582
@@ -655,8 +660,8 @@ static struct css_set *find_css_set(
655 660
656 struct list_head tmp_cg_links; 661 struct list_head tmp_cg_links;
657 662
658 struct hlist_head *hhead;
659 struct cg_cgroup_link *link; 663 struct cg_cgroup_link *link;
664 unsigned long key;
660 665
661 /* First see if we already have a cgroup group that matches 666 /* First see if we already have a cgroup group that matches
662 * the desired set */ 667 * the desired set */
@@ -702,8 +707,8 @@ static struct css_set *find_css_set(
702 css_set_count++; 707 css_set_count++;
703 708
704 /* Add this cgroup group to the hash table */ 709 /* Add this cgroup group to the hash table */
705 hhead = css_set_hash(res->subsys); 710 key = css_set_hash(res->subsys);
706 hlist_add_head(&res->hlist, hhead); 711 hash_add(css_set_table, &res->hlist, key);
707 712
708 write_unlock(&css_set_lock); 713 write_unlock(&css_set_lock);
709 714
@@ -782,12 +787,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
782 * The task_lock() exception 787 * The task_lock() exception
783 * 788 *
784 * The need for this exception arises from the action of 789 * The need for this exception arises from the action of
785 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with 790 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
786 * another. It does so using cgroup_mutex, however there are 791 * another. It does so using cgroup_mutex, however there are
787 * several performance critical places that need to reference 792 * several performance critical places that need to reference
788 * task->cgroup without the expense of grabbing a system global 793 * task->cgroup without the expense of grabbing a system global
789 * mutex. Therefore except as noted below, when dereferencing or, as 794 * mutex. Therefore except as noted below, when dereferencing or, as
790 * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use 795 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
791 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 796 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
792 * the task_struct routinely used for such matters. 797 * the task_struct routinely used for such matters.
793 * 798 *
@@ -854,28 +859,44 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
854 return inode; 859 return inode;
855} 860}
856 861
857/* 862static void cgroup_free_fn(struct work_struct *work)
858 * Call subsys's pre_destroy handler.
859 * This is called before css refcnt check.
860 */
861static int cgroup_call_pre_destroy(struct cgroup *cgrp)
862{ 863{
864 struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
863 struct cgroup_subsys *ss; 865 struct cgroup_subsys *ss;
864 int ret = 0;
865 866
866 for_each_subsys(cgrp->root, ss) { 867 mutex_lock(&cgroup_mutex);
867 if (!ss->pre_destroy) 868 /*
868 continue; 869 * Release the subsystem state objects.
870 */
871 for_each_subsys(cgrp->root, ss)
872 ss->css_free(cgrp);
869 873
870 ret = ss->pre_destroy(cgrp); 874 cgrp->root->number_of_cgroups--;
871 if (ret) { 875 mutex_unlock(&cgroup_mutex);
872 /* ->pre_destroy() failure is being deprecated */
873 WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
874 break;
875 }
876 }
877 876
878 return ret; 877 /*
878 * Drop the active superblock reference that we took when we
879 * created the cgroup
880 */
881 deactivate_super(cgrp->root->sb);
882
883 /*
884 * if we're getting rid of the cgroup, refcount should ensure
885 * that there are no pidlists left.
886 */
887 BUG_ON(!list_empty(&cgrp->pidlists));
888
889 simple_xattrs_free(&cgrp->xattrs);
890
891 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
892 kfree(cgrp);
893}
894
895static void cgroup_free_rcu(struct rcu_head *head)
896{
897 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
898
899 schedule_work(&cgrp->free_work);
879} 900}
880 901
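
[Editor's note] cgroup_free_fn() and cgroup_free_rcu() split teardown into two deferred stages: call_rcu() waits out readers that may still dereference the cgroup, and the RCU callback merely bounces to a workqueue because the real cleanup (taking cgroup_mutex, deactivate_super()) needs process context. A stripped-down sketch of the same two-stage pattern with a made-up struct obj, assuming the work item was initialized at allocation time just as init_cgroup_housekeeping() does for cgroups:

	#include <linux/rcupdate.h>
	#include <linux/workqueue.h>
	#include <linux/slab.h>

	struct obj {
		struct rcu_head rcu_head;
		struct work_struct free_work;	/* INIT_WORK(..., obj_free_fn) at alloc time */
	};

	static void obj_free_fn(struct work_struct *work)
	{
		struct obj *o = container_of(work, struct obj, free_work);

		/* process context: sleeping, mutexes, dput() and friends are fine here */
		kfree(o);
	}

	static void obj_free_rcu(struct rcu_head *head)
	{
		struct obj *o = container_of(head, struct obj, rcu_head);

		/* RCU callbacks run in softirq context, so defer the heavy lifting */
		schedule_work(&o->free_work);
	}

	static void obj_release(struct obj *o)
	{
		/* a grace period first, then the sleepable teardown */
		call_rcu(&o->rcu_head, obj_free_rcu);
	}
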
881static void cgroup_diput(struct dentry *dentry, struct inode *inode) 902static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -883,41 +904,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
883 /* is dentry a directory ? if so, kfree() associated cgroup */ 904 /* is dentry a directory ? if so, kfree() associated cgroup */
884 if (S_ISDIR(inode->i_mode)) { 905 if (S_ISDIR(inode->i_mode)) {
885 struct cgroup *cgrp = dentry->d_fsdata; 906 struct cgroup *cgrp = dentry->d_fsdata;
886 struct cgroup_subsys *ss;
887 BUG_ON(!(cgroup_is_removed(cgrp)));
888 /* It's possible for external users to be holding css
889 * reference counts on a cgroup; css_put() needs to
890 * be able to access the cgroup after decrementing
891 * the reference count in order to know if it needs to
892 * queue the cgroup to be handled by the release
893 * agent */
894 synchronize_rcu();
895
896 mutex_lock(&cgroup_mutex);
897 /*
898 * Release the subsystem state objects.
899 */
900 for_each_subsys(cgrp->root, ss)
901 ss->destroy(cgrp);
902
903 cgrp->root->number_of_cgroups--;
904 mutex_unlock(&cgroup_mutex);
905
906 /*
907 * Drop the active superblock reference that we took when we
908 * created the cgroup
909 */
910 deactivate_super(cgrp->root->sb);
911 907
912 /* 908 BUG_ON(!(cgroup_is_removed(cgrp)));
913 * if we're getting rid of the cgroup, refcount should ensure 909 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
914 * that there are no pidlists left.
915 */
916 BUG_ON(!list_empty(&cgrp->pidlists));
917
918 simple_xattrs_free(&cgrp->xattrs);
919
920 kfree_rcu(cgrp, rcu_head);
921 } else { 910 } else {
922 struct cfent *cfe = __d_cfe(dentry); 911 struct cfent *cfe = __d_cfe(dentry);
923 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 912 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
@@ -946,13 +935,17 @@ static void remove_dir(struct dentry *d)
946 dput(parent); 935 dput(parent);
947} 936}
948 937
949static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 938static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
950{ 939{
951 struct cfent *cfe; 940 struct cfent *cfe;
952 941
953 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 942 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
954 lockdep_assert_held(&cgroup_mutex); 943 lockdep_assert_held(&cgroup_mutex);
955 944
945 /*
946 * If we're doing cleanup due to failure of cgroup_create(),
947 * the corresponding @cfe may not exist.
948 */
956 list_for_each_entry(cfe, &cgrp->files, node) { 949 list_for_each_entry(cfe, &cgrp->files, node) {
957 struct dentry *d = cfe->dentry; 950 struct dentry *d = cfe->dentry;
958 951
@@ -965,9 +958,8 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
965 list_del_init(&cfe->node); 958 list_del_init(&cfe->node);
966 dput(d); 959 dput(d);
967 960
968 return 0; 961 break;
969 } 962 }
970 return -ENOENT;
971} 963}
972 964
973/** 965/**
@@ -987,7 +979,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
987 if (!test_bit(ss->subsys_id, &subsys_mask)) 979 if (!test_bit(ss->subsys_id, &subsys_mask))
988 continue; 980 continue;
989 list_for_each_entry(set, &ss->cftsets, node) 981 list_for_each_entry(set, &ss->cftsets, node)
990 cgroup_rm_file(cgrp, set->cfts); 982 cgroup_addrm_files(cgrp, NULL, set->cfts, false);
991 } 983 }
992 if (base_files) { 984 if (base_files) {
993 while (!list_empty(&cgrp->files)) 985 while (!list_empty(&cgrp->files))
@@ -1015,33 +1007,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
1015} 1007}
1016 1008
1017/* 1009/*
1018 * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
1019 * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
1020 * reference to css->refcnt. In general, this refcnt is expected to goes down
1021 * to zero, soon.
1022 *
1023 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
1024 */
1025static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
1026
1027static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
1028{
1029 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
1030 wake_up_all(&cgroup_rmdir_waitq);
1031}
1032
1033void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
1034{
1035 css_get(css);
1036}
1037
1038void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
1039{
1040 cgroup_wakeup_rmdir_waiter(css->cgroup);
1041 css_put(css);
1042}
1043
1044/*
1045 * Call with cgroup_mutex held. Drops reference counts on modules, including 1010 * Call with cgroup_mutex held. Drops reference counts on modules, including
1046 * any duplicate ones that parse_cgroupfs_options took. If this function 1011 * any duplicate ones that parse_cgroupfs_options took. If this function
1047 * returns an error, no reference counts are touched. 1012 * returns an error, no reference counts are touched.
@@ -1131,7 +1096,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1131 } 1096 }
1132 } 1097 }
1133 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; 1098 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
1134 synchronize_rcu();
1135 1099
1136 return 0; 1100 return 0;
1137} 1101}
@@ -1150,7 +1114,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1150 seq_puts(seq, ",xattr"); 1114 seq_puts(seq, ",xattr");
1151 if (strlen(root->release_agent_path)) 1115 if (strlen(root->release_agent_path))
1152 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1116 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1153 if (clone_children(&root->top_cgroup)) 1117 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
1154 seq_puts(seq, ",clone_children"); 1118 seq_puts(seq, ",clone_children");
1155 if (strlen(root->name)) 1119 if (strlen(root->name))
1156 seq_printf(seq, ",name=%s", root->name); 1120 seq_printf(seq, ",name=%s", root->name);
@@ -1162,7 +1126,7 @@ struct cgroup_sb_opts {
1162 unsigned long subsys_mask; 1126 unsigned long subsys_mask;
1163 unsigned long flags; 1127 unsigned long flags;
1164 char *release_agent; 1128 char *release_agent;
1165 bool clone_children; 1129 bool cpuset_clone_children;
1166 char *name; 1130 char *name;
1167 /* User explicitly requested empty subsystem */ 1131 /* User explicitly requested empty subsystem */
1168 bool none; 1132 bool none;
@@ -1213,7 +1177,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1213 continue; 1177 continue;
1214 } 1178 }
1215 if (!strcmp(token, "clone_children")) { 1179 if (!strcmp(token, "clone_children")) {
1216 opts->clone_children = true; 1180 opts->cpuset_clone_children = true;
1217 continue; 1181 continue;
1218 } 1182 }
1219 if (!strcmp(token, "xattr")) { 1183 if (!strcmp(token, "xattr")) {
@@ -1381,7 +1345,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1381 if (ret) 1345 if (ret)
1382 goto out_unlock; 1346 goto out_unlock;
1383 1347
1384 /* See feature-removal-schedule.txt */
1385 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) 1348 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
1386 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1349 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1387 task_tgid_nr(current), current->comm); 1350 task_tgid_nr(current), current->comm);
@@ -1397,14 +1360,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1397 goto out_unlock; 1360 goto out_unlock;
1398 } 1361 }
1399 1362
1363 /*
1364 * Clear out the files of subsystems that should be removed, do
1365 * this before rebind_subsystems, since rebind_subsystems may
1366 * change this hierarchy's subsys_list.
1367 */
1368 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1369
1400 ret = rebind_subsystems(root, opts.subsys_mask); 1370 ret = rebind_subsystems(root, opts.subsys_mask);
1401 if (ret) { 1371 if (ret) {
1372 /* rebind_subsystems failed, re-populate the removed files */
1373 cgroup_populate_dir(cgrp, false, removed_mask);
1402 drop_parsed_module_refcounts(opts.subsys_mask); 1374 drop_parsed_module_refcounts(opts.subsys_mask);
1403 goto out_unlock; 1375 goto out_unlock;
1404 } 1376 }
1405 1377
1406 /* clear out any existing files and repopulate subsystem files */
1407 cgroup_clear_directory(cgrp->dentry, false, removed_mask);
1408 /* re-populate subsystem files */ 1378 /* re-populate subsystem files */
1409 cgroup_populate_dir(cgrp, false, added_mask); 1379 cgroup_populate_dir(cgrp, false, added_mask);
1410 1380
@@ -1432,8 +1402,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1432 INIT_LIST_HEAD(&cgrp->children); 1402 INIT_LIST_HEAD(&cgrp->children);
1433 INIT_LIST_HEAD(&cgrp->files); 1403 INIT_LIST_HEAD(&cgrp->files);
1434 INIT_LIST_HEAD(&cgrp->css_sets); 1404 INIT_LIST_HEAD(&cgrp->css_sets);
1405 INIT_LIST_HEAD(&cgrp->allcg_node);
1435 INIT_LIST_HEAD(&cgrp->release_list); 1406 INIT_LIST_HEAD(&cgrp->release_list);
1436 INIT_LIST_HEAD(&cgrp->pidlists); 1407 INIT_LIST_HEAD(&cgrp->pidlists);
1408 INIT_WORK(&cgrp->free_work, cgroup_free_fn);
1437 mutex_init(&cgrp->pidlist_mutex); 1409 mutex_init(&cgrp->pidlist_mutex);
1438 INIT_LIST_HEAD(&cgrp->event_list); 1410 INIT_LIST_HEAD(&cgrp->event_list);
1439 spin_lock_init(&cgrp->event_list_lock); 1411 spin_lock_init(&cgrp->event_list_lock);
@@ -1450,8 +1422,8 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1450 root->number_of_cgroups = 1; 1422 root->number_of_cgroups = 1;
1451 cgrp->root = root; 1423 cgrp->root = root;
1452 cgrp->top_cgroup = cgrp; 1424 cgrp->top_cgroup = cgrp;
1453 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1454 init_cgroup_housekeeping(cgrp); 1425 init_cgroup_housekeeping(cgrp);
1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1455} 1427}
1456 1428
1457static bool init_root_id(struct cgroupfs_root *root) 1429static bool init_root_id(struct cgroupfs_root *root)
@@ -1518,12 +1490,13 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1518 1490
1519 root->subsys_mask = opts->subsys_mask; 1491 root->subsys_mask = opts->subsys_mask;
1520 root->flags = opts->flags; 1492 root->flags = opts->flags;
1493 ida_init(&root->cgroup_ida);
1521 if (opts->release_agent) 1494 if (opts->release_agent)
1522 strcpy(root->release_agent_path, opts->release_agent); 1495 strcpy(root->release_agent_path, opts->release_agent);
1523 if (opts->name) 1496 if (opts->name)
1524 strcpy(root->name, opts->name); 1497 strcpy(root->name, opts->name);
1525 if (opts->clone_children) 1498 if (opts->cpuset_clone_children)
1526 set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); 1499 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
1527 return root; 1500 return root;
1528} 1501}
1529 1502
@@ -1536,6 +1509,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root)
1536 spin_lock(&hierarchy_id_lock); 1509 spin_lock(&hierarchy_id_lock);
1537 ida_remove(&hierarchy_ida, root->hierarchy_id); 1510 ida_remove(&hierarchy_ida, root->hierarchy_id);
1538 spin_unlock(&hierarchy_id_lock); 1511 spin_unlock(&hierarchy_id_lock);
1512 ida_destroy(&root->cgroup_ida);
1539 kfree(root); 1513 kfree(root);
1540} 1514}
1541 1515
@@ -1636,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1636 struct cgroupfs_root *existing_root; 1610 struct cgroupfs_root *existing_root;
1637 const struct cred *cred; 1611 const struct cred *cred;
1638 int i; 1612 int i;
1613 struct css_set *cg;
1639 1614
1640 BUG_ON(sb->s_root != NULL); 1615 BUG_ON(sb->s_root != NULL);
1641 1616
@@ -1689,19 +1664,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1689 /* Link the top cgroup in this hierarchy into all 1664 /* Link the top cgroup in this hierarchy into all
1690 * the css_set objects */ 1665 * the css_set objects */
1691 write_lock(&css_set_lock); 1666 write_lock(&css_set_lock);
1692 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 1667 hash_for_each(css_set_table, i, cg, hlist)
1693 struct hlist_head *hhead = &css_set_table[i]; 1668 link_css_set(&tmp_cg_links, cg, root_cgrp);
1694 struct hlist_node *node;
1695 struct css_set *cg;
1696
1697 hlist_for_each_entry(cg, node, hhead, hlist)
1698 link_css_set(&tmp_cg_links, cg, root_cgrp);
1699 }
1700 write_unlock(&css_set_lock); 1669 write_unlock(&css_set_lock);
1701 1670
1702 free_cg_links(&tmp_cg_links); 1671 free_cg_links(&tmp_cg_links);
1703 1672
1704 BUG_ON(!list_empty(&root_cgrp->sibling));
1705 BUG_ON(!list_empty(&root_cgrp->children)); 1673 BUG_ON(!list_empty(&root_cgrp->children));
1706 BUG_ON(root->number_of_cgroups != 1); 1674 BUG_ON(root->number_of_cgroups != 1);
1707 1675
@@ -1750,7 +1718,6 @@ static void cgroup_kill_sb(struct super_block *sb) {
1750 1718
1751 BUG_ON(root->number_of_cgroups != 1); 1719 BUG_ON(root->number_of_cgroups != 1);
1752 BUG_ON(!list_empty(&cgrp->children)); 1720 BUG_ON(!list_empty(&cgrp->children));
1753 BUG_ON(!list_empty(&cgrp->sibling));
1754 1721
1755 mutex_lock(&cgroup_mutex); 1722 mutex_lock(&cgroup_mutex);
1756 mutex_lock(&cgroup_root_mutex); 1723 mutex_lock(&cgroup_root_mutex);
@@ -1808,11 +1775,13 @@ static struct kobject *cgroup_kobj;
1808 */ 1775 */
1809int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1776int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1810{ 1777{
1778 struct dentry *dentry = cgrp->dentry;
1811 char *start; 1779 char *start;
1812 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1813 cgroup_lock_is_held());
1814 1780
1815 if (!dentry || cgrp == dummytop) { 1781 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(),
1782 "cgroup_path() called without proper locking");
1783
1784 if (cgrp == dummytop) {
1816 /* 1785 /*
1817 * Inactive subsystems have no dentry for their root 1786 * Inactive subsystems have no dentry for their root
1818 * cgroup 1787 * cgroup
@@ -1821,9 +1790,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1821 return 0; 1790 return 0;
1822 } 1791 }
1823 1792
1824 start = buf + buflen; 1793 start = buf + buflen - 1;
1825 1794
1826 *--start = '\0'; 1795 *start = '\0';
1827 for (;;) { 1796 for (;;) {
1828 int len = dentry->d_name.len; 1797 int len = dentry->d_name.len;
1829 1798
@@ -1834,8 +1803,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1834 if (!cgrp) 1803 if (!cgrp)
1835 break; 1804 break;
1836 1805
1837 dentry = rcu_dereference_check(cgrp->dentry, 1806 dentry = cgrp->dentry;
1838 cgroup_lock_is_held());
1839 if (!cgrp->parent) 1807 if (!cgrp->parent)
1840 continue; 1808 continue;
1841 if (--start < buf) 1809 if (--start < buf)
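
[Editor's note] The cgroup_path() hunks above drop the RCU dereference dance on cgrp->dentry and keep the existing right-to-left fill: the buffer is terminated at its last byte and each name component is prepended in front of what has already been written. A self-contained userspace sketch of that backwards-fill technique (function and variable names are invented, not the kernel's):

	#include <stdio.h>
	#include <string.h>

	/* Walk the component array from the leaf end, filling buf right to left. */
	static int build_path(const char *const *comp, int n, char *buf, int buflen)
	{
		char *start = buf + buflen - 1;

		*start = '\0';
		while (n-- > 0) {
			int len = strlen(comp[n]);

			if (start - buf < len + 1)
				return -1;	/* the kernel returns -ENAMETOOLONG here */
			start -= len;
			memcpy(start, comp[n], len);
			*--start = '/';
		}
		memmove(buf, start, buf + buflen - start);	/* slide the result to the front */
		return 0;
	}

	int main(void)
	{
		const char *comp[] = { "sys", "fs", "cgroup" };
		char buf[64];

		if (!build_path(comp, 3, buf, sizeof(buf)))
			printf("%s\n", buf);	/* prints /sys/fs/cgroup */
		return 0;
	}
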
@@ -1930,9 +1898,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1930/* 1898/*
1931 * cgroup_task_migrate - move a task from one cgroup to another. 1899 * cgroup_task_migrate - move a task from one cgroup to another.
1932 * 1900 *
1933 * 'guarantee' is set if the caller promises that a new css_set for the task 1901 * Must be called with cgroup_mutex and threadgroup locked.
1934 * will already exist. If not set, this function might sleep, and can fail with
1935 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1936 */ 1902 */
1937static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1903static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1938 struct task_struct *tsk, struct css_set *newcg) 1904 struct task_struct *tsk, struct css_set *newcg)
@@ -2024,13 +1990,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
2024 ss->attach(cgrp, &tset); 1990 ss->attach(cgrp, &tset);
2025 } 1991 }
2026 1992
2027 synchronize_rcu();
2028
2029 /*
2030 * wake up rmdir() waiter. the rmdir should fail since the cgroup
2031 * is no longer empty.
2032 */
2033 cgroup_wakeup_rmdir_waiter(cgrp);
2034out: 1993out:
2035 if (retval) { 1994 if (retval) {
2036 for_each_subsys(root, ss) { 1995 for_each_subsys(root, ss) {
@@ -2199,8 +2158,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2199 /* 2158 /*
2200 * step 5: success! and cleanup 2159 * step 5: success! and cleanup
2201 */ 2160 */
2202 synchronize_rcu();
2203 cgroup_wakeup_rmdir_waiter(cgrp);
2204 retval = 0; 2161 retval = 0;
2205out_put_css_set_refs: 2162out_put_css_set_refs:
2206 if (retval) { 2163 if (retval) {
@@ -2686,7 +2643,7 @@ static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, un
2686 */ 2643 */
2687static inline struct cftype *__file_cft(struct file *file) 2644static inline struct cftype *__file_cft(struct file *file)
2688{ 2645{
2689 if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) 2646 if (file_inode(file)->i_fop != &cgroup_file_operations)
2690 return ERR_PTR(-EINVAL); 2647 return ERR_PTR(-EINVAL);
2691 return __d_cft(file->f_dentry); 2648 return __d_cft(file->f_dentry);
2692} 2649}
@@ -2711,10 +2668,17 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2711 2668
2712 /* start off with i_nlink == 2 (for "." entry) */ 2669 /* start off with i_nlink == 2 (for "." entry) */
2713 inc_nlink(inode); 2670 inc_nlink(inode);
2671 inc_nlink(dentry->d_parent->d_inode);
2714 2672
2715 /* start with the directory inode held, so that we can 2673 /*
2716 * populate it without racing with another mkdir */ 2674 * Control reaches here with cgroup_mutex held.
2717 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 2675 * @inode->i_mutex should nest outside cgroup_mutex but we
2676 * want to populate it immediately without releasing
2677 * cgroup_mutex. As @inode isn't visible to anyone else
2678 * yet, trylock will always succeed without affecting
2679 * lockdep checks.
2680 */
2681 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
2718 } else if (S_ISREG(mode)) { 2682 } else if (S_ISREG(mode)) {
2719 inode->i_size = 0; 2683 inode->i_size = 0;
2720 inode->i_fop = &cgroup_file_operations; 2684 inode->i_fop = &cgroup_file_operations;
@@ -2725,32 +2689,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2725 return 0; 2689 return 0;
2726} 2690}
2727 2691
2728/*
2729 * cgroup_create_dir - create a directory for an object.
2730 * @cgrp: the cgroup we create the directory for. It must have a valid
2731 * ->parent field. And we are going to fill its ->dentry field.
2732 * @dentry: dentry of the new cgroup
2733 * @mode: mode to set on new directory.
2734 */
2735static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2736 umode_t mode)
2737{
2738 struct dentry *parent;
2739 int error = 0;
2740
2741 parent = cgrp->parent->dentry;
2742 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
2743 if (!error) {
2744 dentry->d_fsdata = cgrp;
2745 inc_nlink(parent->d_inode);
2746 rcu_assign_pointer(cgrp->dentry, dentry);
2747 dget(dentry);
2748 }
2749 dput(dentry);
2750
2751 return error;
2752}
2753
2754/** 2692/**
2755 * cgroup_file_mode - deduce file mode of a control file 2693 * cgroup_file_mode - deduce file mode of a control file
2756 * @cft: the control file in question 2694 * @cft: the control file in question
@@ -2791,12 +2729,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2791 2729
2792 simple_xattrs_init(&cft->xattrs); 2730 simple_xattrs_init(&cft->xattrs);
2793 2731
2794 /* does @cft->flags tell us to skip creation on @cgrp? */
2795 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2796 return 0;
2797 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2798 return 0;
2799
2800 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2732 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2801 strcpy(name, subsys->name); 2733 strcpy(name, subsys->name);
2802 strcat(name, "."); 2734 strcat(name, ".");
@@ -2837,14 +2769,20 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2837 int err, ret = 0; 2769 int err, ret = 0;
2838 2770
2839 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2771 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2840 if (is_add) 2772 /* does cft->flags tell us to skip this file on @cgrp? */
2773 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2774 continue;
2775 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2776 continue;
2777
2778 if (is_add) {
2841 err = cgroup_add_file(cgrp, subsys, cft); 2779 err = cgroup_add_file(cgrp, subsys, cft);
2842 else 2780 if (err)
2843 err = cgroup_rm_file(cgrp, cft); 2781 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2844 if (err) { 2782 cft->name, err);
2845 pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
2846 is_add ? "add" : "remove", cft->name, err);
2847 ret = err; 2783 ret = err;
2784 } else {
2785 cgroup_rm_file(cgrp, cft);
2848 } 2786 }
2849 } 2787 }
2850 return ret; 2788 return ret;
@@ -3044,6 +2982,118 @@ static void cgroup_enable_task_cg_lists(void)
3044 write_unlock(&css_set_lock); 2982 write_unlock(&css_set_lock);
3045} 2983}
3046 2984
2985/**
2986 * cgroup_next_descendant_pre - find the next descendant for pre-order walk
2987 * @pos: the current position (%NULL to initiate traversal)
2988 * @cgroup: cgroup whose descendants to walk
2989 *
2990 * To be used by cgroup_for_each_descendant_pre(). Find the next
2991 * descendant to visit for pre-order traversal of @cgroup's descendants.
2992 */
2993struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
2994 struct cgroup *cgroup)
2995{
2996 struct cgroup *next;
2997
2998 WARN_ON_ONCE(!rcu_read_lock_held());
2999
3000 /* if first iteration, pretend we just visited @cgroup */
3001 if (!pos) {
3002 if (list_empty(&cgroup->children))
3003 return NULL;
3004 pos = cgroup;
3005 }
3006
3007 /* visit the first child if exists */
3008 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
3009 if (next)
3010 return next;
3011
3012 /* no child, visit my or the closest ancestor's next sibling */
3013 do {
3014 next = list_entry_rcu(pos->sibling.next, struct cgroup,
3015 sibling);
3016 if (&next->sibling != &pos->parent->children)
3017 return next;
3018
3019 pos = pos->parent;
3020 } while (pos != cgroup);
3021
3022 return NULL;
3023}
3024EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
3025
3026/**
3027 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
3028 * @pos: cgroup of interest
3029 *
3030 * Return the rightmost descendant of @pos. If there's no descendant,
3031 * @pos is returned. This can be used during pre-order traversal to skip
3032 * subtree of @pos.
3033 */
3034struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
3035{
3036 struct cgroup *last, *tmp;
3037
3038 WARN_ON_ONCE(!rcu_read_lock_held());
3039
3040 do {
3041 last = pos;
3042 /* ->prev isn't RCU safe, walk ->next till the end */
3043 pos = NULL;
3044 list_for_each_entry_rcu(tmp, &last->children, sibling)
3045 pos = tmp;
3046 } while (pos);
3047
3048 return last;
3049}
3050EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
3051
3052static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
3053{
3054 struct cgroup *last;
3055
3056 do {
3057 last = pos;
3058 pos = list_first_or_null_rcu(&pos->children, struct cgroup,
3059 sibling);
3060 } while (pos);
3061
3062 return last;
3063}
3064
3065/**
3066 * cgroup_next_descendant_post - find the next descendant for post-order walk
3067 * @pos: the current position (%NULL to initiate traversal)
3068 * @cgroup: cgroup whose descendants to walk
3069 *
3070 * To be used by cgroup_for_each_descendant_post(). Find the next
3071 * descendant to visit for post-order traversal of @cgroup's descendants.
3072 */
3073struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
3074 struct cgroup *cgroup)
3075{
3076 struct cgroup *next;
3077
3078 WARN_ON_ONCE(!rcu_read_lock_held());
3079
3080 /* if first iteration, visit the leftmost descendant */
3081 if (!pos) {
3082 next = cgroup_leftmost_descendant(cgroup);
3083 return next != cgroup ? next : NULL;
3084 }
3085
3086 /* if there's an unvisited sibling, visit its leftmost descendant */
3087 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
3088 if (&next->sibling != &pos->parent->children)
3089 return cgroup_leftmost_descendant(next);
3090
3091 /* no sibling left, visit parent */
3092 next = pos->parent;
3093 return next != cgroup ? next : NULL;
3094}
3095EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
3096
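
[Editor's note] cgroup_next_descendant_pre() and cgroup_next_descendant_post() added above walk the RCU-protected ->children/->sibling lists, so callers only need rcu_read_lock() around the loop. A rough sketch of how a controller might drive the pre-order variant directly, assuming the declarations these exports imply in <linux/cgroup.h> (the function name and pr_info() body are illustrative only):

	#include <linux/cgroup.h>
	#include <linux/rcupdate.h>
	#include <linux/printk.h>

	static void demo_walk_subtree(struct cgroup *root)
	{
		struct cgroup *pos = NULL;

		rcu_read_lock();
		/* each cgroup is visited before any of its own descendants */
		while ((pos = cgroup_next_descendant_pre(pos, root)))
			pr_info("visiting cgroup id %d\n", pos->id);
		rcu_read_unlock();
	}
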
3047void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3097void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
3048 __acquires(css_set_lock) 3098 __acquires(css_set_lock)
3049{ 3099{
@@ -3390,7 +3440,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3390{ 3440{
3391 struct cgroup_pidlist *l; 3441 struct cgroup_pidlist *l;
3392 /* don't need task_nsproxy() if we're looking at ourself */ 3442 /* don't need task_nsproxy() if we're looking at ourself */
3393 struct pid_namespace *ns = current->nsproxy->pid_ns; 3443 struct pid_namespace *ns = task_active_pid_ns(current);
3394 3444
3395 /* 3445 /*
3396 * We can't drop the pidlist_mutex before taking the l->mutex in case 3446 * We can't drop the pidlist_mutex before taking the l->mutex in case
@@ -3734,8 +3784,13 @@ static void cgroup_event_remove(struct work_struct *work)
3734 remove); 3784 remove);
3735 struct cgroup *cgrp = event->cgrp; 3785 struct cgroup *cgrp = event->cgrp;
3736 3786
3787 remove_wait_queue(event->wqh, &event->wait);
3788
3737 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3789 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3738 3790
3791 /* Notify userspace the event is going away. */
3792 eventfd_signal(event->eventfd, 1);
3793
3739 eventfd_ctx_put(event->eventfd); 3794 eventfd_ctx_put(event->eventfd);
3740 kfree(event); 3795 kfree(event);
3741 dput(cgrp->dentry); 3796 dput(cgrp->dentry);
@@ -3755,15 +3810,25 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3755 unsigned long flags = (unsigned long)key; 3810 unsigned long flags = (unsigned long)key;
3756 3811
3757 if (flags & POLLHUP) { 3812 if (flags & POLLHUP) {
3758 __remove_wait_queue(event->wqh, &event->wait);
3759 spin_lock(&cgrp->event_list_lock);
3760 list_del(&event->list);
3761 spin_unlock(&cgrp->event_list_lock);
3762 /* 3813 /*
3763 * We are in atomic context, but cgroup_event_remove() may 3814 * If the event has been detached at cgroup removal, we
3764 * sleep, so we have to call it in workqueue. 3815 * can simply return knowing the other side will cleanup
3816 * for us.
3817 *
3818 * We can't race against event freeing since the other
3819 * side will require wqh->lock via remove_wait_queue(),
3820 * which we hold.
3765 */ 3821 */
3766 schedule_work(&event->remove); 3822 spin_lock(&cgrp->event_list_lock);
3823 if (!list_empty(&event->list)) {
3824 list_del_init(&event->list);
3825 /*
3826 * We are in atomic context, but cgroup_event_remove()
3827 * may sleep, so we have to call it in workqueue.
3828 */
3829 schedule_work(&event->remove);
3830 }
3831 spin_unlock(&cgrp->event_list_lock);
3767 } 3832 }
3768 3833
3769 return 0; 3834 return 0;
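
[Editor's note] With the change above, cgroup_event_remove() detaches from the wait queue and calls eventfd_signal() itself, so userspace is woken even when the notification dies because the cgroup is removed rather than because the monitored file raised POLLHUP. For reference, the userspace side of this interface is an eventfd registered through cgroup.event_control; a minimal, illustrative listener follows (the memory controller path and the threshold value are assumptions for the example, not taken from this patch):

	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		const char *cgdir = "/sys/fs/cgroup/memory/demo";	/* hypothetical cgroup */
		char path[256], cmd[64];
		uint64_t ticks;
		int efd, cfd, ecfd;

		efd = eventfd(0, 0);
		snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cgdir);
		cfd = open(path, O_RDONLY);
		snprintf(path, sizeof(path), "%s/cgroup.event_control", cgdir);
		ecfd = open(path, O_WRONLY);
		if (efd < 0 || cfd < 0 || ecfd < 0)
			return 1;

		/* "<event_fd> <control_fd> <args>" arms the notification */
		snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, cfd, 4096ULL * 1024);
		if (write(ecfd, cmd, strlen(cmd)) < 0)
			return 1;

		/* unblocks on a threshold crossing or, after this patch, on rmdir */
		if (read(efd, &ticks, sizeof(ticks)) == (ssize_t)sizeof(ticks))
			printf("notified (count %llu)\n", (unsigned long long)ticks);
		return 0;
	}
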
@@ -3789,6 +3854,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3789 const char *buffer) 3854 const char *buffer)
3790{ 3855{
3791 struct cgroup_event *event = NULL; 3856 struct cgroup_event *event = NULL;
3857 struct cgroup *cgrp_cfile;
3792 unsigned int efd, cfd; 3858 unsigned int efd, cfd;
3793 struct file *efile = NULL; 3859 struct file *efile = NULL;
3794 struct file *cfile = NULL; 3860 struct file *cfile = NULL;
@@ -3834,7 +3900,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3834 3900
3835 /* the process need read permission on control file */ 3901 /* the process need read permission on control file */
3836 /* AV: shouldn't we check that it's been opened for read instead? */ 3902 /* AV: shouldn't we check that it's been opened for read instead? */
3837 ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); 3903 ret = inode_permission(file_inode(cfile), MAY_READ);
3838 if (ret < 0) 3904 if (ret < 0)
3839 goto fail; 3905 goto fail;
3840 3906
@@ -3844,6 +3910,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3844 goto fail; 3910 goto fail;
3845 } 3911 }
3846 3912
3913 /*
3914 * The file to be monitored must be in the same cgroup as
3915 * cgroup.event_control is.
3916 */
3917 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
3918 if (cgrp_cfile != cgrp) {
3919 ret = -EINVAL;
3920 goto fail;
3921 }
3922
3847 if (!event->cft->register_event || !event->cft->unregister_event) { 3923 if (!event->cft->register_event || !event->cft->unregister_event) {
3848 ret = -EINVAL; 3924 ret = -EINVAL;
3849 goto fail; 3925 goto fail;
@@ -3894,7 +3970,7 @@ fail:
3894static u64 cgroup_clone_children_read(struct cgroup *cgrp, 3970static u64 cgroup_clone_children_read(struct cgroup *cgrp,
3895 struct cftype *cft) 3971 struct cftype *cft)
3896{ 3972{
3897 return clone_children(cgrp); 3973 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3898} 3974}
3899 3975
3900static int cgroup_clone_children_write(struct cgroup *cgrp, 3976static int cgroup_clone_children_write(struct cgroup *cgrp,
@@ -3902,9 +3978,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
3902 u64 val) 3978 u64 val)
3903{ 3979{
3904 if (val) 3980 if (val)
3905 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3981 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3906 else 3982 else
3907 clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3983 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3908 return 0; 3984 return 0;
3909} 3985}
3910 3986
@@ -4017,19 +4093,57 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
4017 css->flags = 0; 4093 css->flags = 0;
4018 css->id = NULL; 4094 css->id = NULL;
4019 if (cgrp == dummytop) 4095 if (cgrp == dummytop)
4020 set_bit(CSS_ROOT, &css->flags); 4096 css->flags |= CSS_ROOT;
4021 BUG_ON(cgrp->subsys[ss->subsys_id]); 4097 BUG_ON(cgrp->subsys[ss->subsys_id]);
4022 cgrp->subsys[ss->subsys_id] = css; 4098 cgrp->subsys[ss->subsys_id] = css;
4023 4099
4024 /* 4100 /*
4025 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry 4101 * css holds an extra ref to @cgrp->dentry which is put on the last
4026 * which is put on the last css_put(). dput() requires process 4102 * css_put(). dput() requires process context, which css_put() may
4027 * context, which css_put() may be called without. @css->dput_work 4103 * be called without. @css->dput_work will be used to invoke
4028 * will be used to invoke dput() asynchronously from css_put(). 4104 * dput() asynchronously from css_put().
4029 */ 4105 */
4030 INIT_WORK(&css->dput_work, css_dput_fn); 4106 INIT_WORK(&css->dput_work, css_dput_fn);
4031 if (ss->__DEPRECATED_clear_css_refs) 4107}
4032 set_bit(CSS_CLEAR_CSS_REFS, &css->flags); 4108
 4109/* invoke ->css_online() on a new CSS and mark it online if successful */
4110static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4111{
4112 int ret = 0;
4113
4114 lockdep_assert_held(&cgroup_mutex);
4115
4116 if (ss->css_online)
4117 ret = ss->css_online(cgrp);
4118 if (!ret)
4119 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE;
4120 return ret;
4121}
4122
 4123/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
4124static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4125 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4126{
4127 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4128
4129 lockdep_assert_held(&cgroup_mutex);
4130
4131 if (!(css->flags & CSS_ONLINE))
4132 return;
4133
4134 /*
4135 * css_offline() should be called with cgroup_mutex unlocked. See
4136 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
4137 * details. This temporary unlocking should go away once
4138 * cgroup_mutex is unexported from controllers.
4139 */
4140 if (ss->css_offline) {
4141 mutex_unlock(&cgroup_mutex);
4142 ss->css_offline(cgrp);
4143 mutex_lock(&cgroup_mutex);
4144 }
4145
4146 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
4033} 4147}
4034 4148
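
[Editor's note] online_css() and offline_css() give every controller a pair of life-cycle hooks: ->css_online() runs under cgroup_mutex once the new cgroup is fully constructed and may veto creation, while ->css_offline() runs exactly once at removal, before the base css references are put. A hypothetical controller would hook in roughly as follows (the "demo" name is invented; .css_alloc/.css_free and the generated subsys_id are omitted for brevity):

	#include <linux/cgroup.h>
	#include <linux/printk.h>

	/* Runs after the cgroup is set up; a non-zero return makes
	 * cgroup_create() unwind through cgroup_destroy_locked(). */
	static int demo_css_online(struct cgroup *cgrp)
	{
		pr_info("demo: cgroup %d brought online\n", cgrp->id);
		return 0;
	}

	/* Runs once per css while the cgroup is being removed. */
	static void demo_css_offline(struct cgroup *cgrp)
	{
		pr_info("demo: cgroup %d going offline\n", cgrp->id);
	}

	struct cgroup_subsys demo_subsys = {
		.name		= "demo",
		.css_online	= demo_css_online,
		.css_offline	= demo_css_offline,
		/* .css_alloc, .css_free and .subsys_id left out of this sketch */
	};
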
4035/* 4149/*
@@ -4049,10 +4163,27 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4049 struct cgroup_subsys *ss; 4163 struct cgroup_subsys *ss;
4050 struct super_block *sb = root->sb; 4164 struct super_block *sb = root->sb;
4051 4165
4166 /* allocate the cgroup and its ID, 0 is reserved for the root */
4052 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4167 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
4053 if (!cgrp) 4168 if (!cgrp)
4054 return -ENOMEM; 4169 return -ENOMEM;
4055 4170
4171 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4172 if (cgrp->id < 0)
4173 goto err_free_cgrp;
4174
4175 /*
4176 * Only live parents can have children. Note that the liveliness
4177 * check isn't strictly necessary because cgroup_mkdir() and
4178 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
4179 * anyway so that locking is contained inside cgroup proper and we
4180 * don't get nasty surprises if we ever grow another caller.
4181 */
4182 if (!cgroup_lock_live_group(parent)) {
4183 err = -ENODEV;
4184 goto err_free_id;
4185 }
4186
4056 /* Grab a reference on the superblock so the hierarchy doesn't 4187 /* Grab a reference on the superblock so the hierarchy doesn't
4057 * get deleted on unmount if there are child cgroups. This 4188 * get deleted on unmount if there are child cgroups. This
4058 * can be done outside cgroup_mutex, since the sb can't 4189 * can be done outside cgroup_mutex, since the sb can't
@@ -4060,10 +4191,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4060 * fs */ 4191 * fs */
4061 atomic_inc(&sb->s_active); 4192 atomic_inc(&sb->s_active);
4062 4193
4063 mutex_lock(&cgroup_mutex);
4064
4065 init_cgroup_housekeeping(cgrp); 4194 init_cgroup_housekeeping(cgrp);
4066 4195
4196 dentry->d_fsdata = cgrp;
4197 cgrp->dentry = dentry;
4198
4067 cgrp->parent = parent; 4199 cgrp->parent = parent;
4068 cgrp->root = parent->root; 4200 cgrp->root = parent->root;
4069 cgrp->top_cgroup = parent->top_cgroup; 4201 cgrp->top_cgroup = parent->top_cgroup;
@@ -4071,26 +4203,49 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4071 if (notify_on_release(parent)) 4203 if (notify_on_release(parent))
4072 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4204 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
4073 4205
4074 if (clone_children(parent)) 4206 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
4075 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 4207 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
4076 4208
4077 for_each_subsys(root, ss) { 4209 for_each_subsys(root, ss) {
4078 struct cgroup_subsys_state *css; 4210 struct cgroup_subsys_state *css;
4079 4211
4080 css = ss->create(cgrp); 4212 css = ss->css_alloc(cgrp);
4081 if (IS_ERR(css)) { 4213 if (IS_ERR(css)) {
4082 err = PTR_ERR(css); 4214 err = PTR_ERR(css);
4083 goto err_destroy; 4215 goto err_free_all;
4084 } 4216 }
4085 init_cgroup_css(css, ss, cgrp); 4217 init_cgroup_css(css, ss, cgrp);
4086 if (ss->use_id) { 4218 if (ss->use_id) {
4087 err = alloc_css_id(ss, parent, cgrp); 4219 err = alloc_css_id(ss, parent, cgrp);
4088 if (err) 4220 if (err)
4089 goto err_destroy; 4221 goto err_free_all;
4090 } 4222 }
4091 /* At error, ->destroy() callback has to free assigned ID. */ 4223 }
4092 if (clone_children(parent) && ss->post_clone) 4224
4093 ss->post_clone(cgrp); 4225 /*
4226 * Create directory. cgroup_create_file() returns with the new
4227 * directory locked on success so that it can be populated without
4228 * dropping cgroup_mutex.
4229 */
4230 err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
4231 if (err < 0)
4232 goto err_free_all;
4233 lockdep_assert_held(&dentry->d_inode->i_mutex);
4234
4235 /* allocation complete, commit to creation */
4236 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4237 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
4238 root->number_of_cgroups++;
4239
4240 /* each css holds a ref to the cgroup's dentry */
4241 for_each_subsys(root, ss)
4242 dget(dentry);
4243
4244 /* creation succeeded, notify subsystems */
4245 for_each_subsys(root, ss) {
4246 err = online_css(ss, cgrp);
4247 if (err)
4248 goto err_destroy;
4094 4249
4095 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4250 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
4096 parent->parent) { 4251 parent->parent) {
@@ -4102,50 +4257,34 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4102 } 4257 }
4103 } 4258 }
4104 4259
4105 list_add(&cgrp->sibling, &cgrp->parent->children);
4106 root->number_of_cgroups++;
4107
4108 err = cgroup_create_dir(cgrp, dentry, mode);
4109 if (err < 0)
4110 goto err_remove;
4111
4112 /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
4113 for_each_subsys(root, ss)
4114 if (!ss->__DEPRECATED_clear_css_refs)
4115 dget(dentry);
4116
4117 /* The cgroup directory was pre-locked for us */
4118 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
4119
4120 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
4121
4122 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4260 err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
4123 /* If err < 0, we have a half-filled directory - oh well ;) */ 4261 if (err)
4262 goto err_destroy;
4124 4263
4125 mutex_unlock(&cgroup_mutex); 4264 mutex_unlock(&cgroup_mutex);
4126 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4265 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
4127 4266
4128 return 0; 4267 return 0;
4129 4268
4130 err_remove: 4269err_free_all:
4131
4132 list_del(&cgrp->sibling);
4133 root->number_of_cgroups--;
4134
4135 err_destroy:
4136
4137 for_each_subsys(root, ss) { 4270 for_each_subsys(root, ss) {
4138 if (cgrp->subsys[ss->subsys_id]) 4271 if (cgrp->subsys[ss->subsys_id])
4139 ss->destroy(cgrp); 4272 ss->css_free(cgrp);
4140 } 4273 }
4141
4142 mutex_unlock(&cgroup_mutex); 4274 mutex_unlock(&cgroup_mutex);
4143
4144 /* Release the reference count that we took on the superblock */ 4275 /* Release the reference count that we took on the superblock */
4145 deactivate_super(sb); 4276 deactivate_super(sb);
4146 4277err_free_id:
4278 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4279err_free_cgrp:
4147 kfree(cgrp); 4280 kfree(cgrp);
4148 return err; 4281 return err;
4282
4283err_destroy:
4284 cgroup_destroy_locked(cgrp);
4285 mutex_unlock(&cgroup_mutex);
4286 mutex_unlock(&dentry->d_inode->i_mutex);
4287 return err;
4149} 4288}
4150 4289
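
[Editor's note] cgroup_create() now reserves a per-hierarchy id with ida_simple_get() up front (0 stays reserved for the root cgroup) and unwinds through the new err_free_id/err_free_cgrp labels on failure; cgroup_free_fn() hands the id back with ida_simple_remove(). Taken in isolation, the IDA pattern looks like this (struct and function names are made up):

	#include <linux/idr.h>
	#include <linux/slab.h>
	#include <linux/err.h>

	static DEFINE_IDA(demo_ida);

	struct demo_obj {
		int id;
	};

	static struct demo_obj *demo_obj_create(void)
	{
		struct demo_obj *obj;
		int id;

		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
		if (!obj)
			return ERR_PTR(-ENOMEM);

		/* start at 1 so that 0 stays reserved; end=0 means "no upper limit" */
		id = ida_simple_get(&demo_ida, 1, 0, GFP_KERNEL);
		if (id < 0) {
			kfree(obj);
			return ERR_PTR(id);
		}
		obj->id = id;
		return obj;
	}

	static void demo_obj_destroy(struct demo_obj *obj)
	{
		ida_simple_remove(&demo_ida, obj->id);	/* give the id back */
		kfree(obj);
	}
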
4151static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 4290static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -4197,153 +4336,60 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
4197 return 0; 4336 return 0;
4198} 4337}
4199 4338
4200/* 4339static int cgroup_destroy_locked(struct cgroup *cgrp)
4201 * Atomically mark all (or else none) of the cgroup's CSS objects as 4340 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4202 * CSS_REMOVED. Return true on success, or false if the cgroup has
4203 * busy subsystems. Call with cgroup_mutex held
4204 *
4205 * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
4206 * not, cgroup removal behaves differently.
4207 *
4208 * If clear is set, css refcnt for the subsystem should be zero before
4209 * cgroup removal can be committed. This is implemented by
4210 * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
4211 * called multiple times until all css refcnts reach zero and is allowed to
4212 * veto removal on any invocation. This behavior is deprecated and will be
4213 * removed as soon as the existing user (memcg) is updated.
4214 *
4215 * If clear is not set, each css holds an extra reference to the cgroup's
4216 * dentry and cgroup removal proceeds regardless of css refs.
4217 * ->pre_destroy() will be called at least once and is not allowed to fail.
4218 * On the last put of each css, whenever that may be, the extra dentry ref
4219 * is put so that dentry destruction happens only after all css's are
4220 * released.
4221 */
4222static int cgroup_clear_css_refs(struct cgroup *cgrp)
4223{ 4341{
4342 struct dentry *d = cgrp->dentry;
4343 struct cgroup *parent = cgrp->parent;
4344 DEFINE_WAIT(wait);
4345 struct cgroup_event *event, *tmp;
4224 struct cgroup_subsys *ss; 4346 struct cgroup_subsys *ss;
4225 unsigned long flags; 4347 LIST_HEAD(tmp_list);
4226 bool failed = false; 4348
4349 lockdep_assert_held(&d->d_inode->i_mutex);
4350 lockdep_assert_held(&cgroup_mutex);
4227 4351
4228 local_irq_save(flags); 4352 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
4353 return -EBUSY;
4229 4354
4230 /* 4355 /*
4231 * Block new css_tryget() by deactivating refcnt. If all refcnts 4356 * Block new css_tryget() by deactivating refcnt and mark @cgrp
4232 * for subsystems w/ clear_css_refs set were 1 at the moment of 4357 * removed. This makes future css_tryget() and child creation
4233 * deactivation, we succeeded. 4358 * attempts fail thus maintaining the removal conditions verified
4359 * above.
4234 */ 4360 */
4235 for_each_subsys(cgrp->root, ss) { 4361 for_each_subsys(cgrp->root, ss) {
4236 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4362 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4237 4363
4238 WARN_ON(atomic_read(&css->refcnt) < 0); 4364 WARN_ON(atomic_read(&css->refcnt) < 0);
4239 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4365 atomic_add(CSS_DEACT_BIAS, &css->refcnt);
4240
4241 if (ss->__DEPRECATED_clear_css_refs)
4242 failed |= css_refcnt(css) != 1;
4243 } 4366 }
4367 set_bit(CGRP_REMOVED, &cgrp->flags);
4244 4368
 4245 /* 4369 /* tell subsystems to initiate destruction */
4246 * If succeeded, set REMOVED and put all the base refs; otherwise, 4370 for_each_subsys(cgrp->root, ss)
4247 * restore refcnts to positive values. Either way, all in-progress 4371 offline_css(ss, cgrp);
4248 * css_tryget() will be released.
4249 */
4250 for_each_subsys(cgrp->root, ss) {
4251 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
4252
4253 if (!failed) {
4254 set_bit(CSS_REMOVED, &css->flags);
4255 css_put(css);
4256 } else {
4257 atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
4258 }
4259 }
4260
4261 local_irq_restore(flags);
4262 return !failed;
4263}
4264
4265static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4266{
4267 struct cgroup *cgrp = dentry->d_fsdata;
4268 struct dentry *d;
4269 struct cgroup *parent;
4270 DEFINE_WAIT(wait);
4271 struct cgroup_event *event, *tmp;
4272 int ret;
4273
4274 /* the vfs holds both inode->i_mutex already */
4275again:
4276 mutex_lock(&cgroup_mutex);
4277 if (atomic_read(&cgrp->count) != 0) {
4278 mutex_unlock(&cgroup_mutex);
4279 return -EBUSY;
4280 }
4281 if (!list_empty(&cgrp->children)) {
4282 mutex_unlock(&cgroup_mutex);
4283 return -EBUSY;
4284 }
4285 mutex_unlock(&cgroup_mutex);
4286
4287 /*
4288 * In general, subsystem has no css->refcnt after pre_destroy(). But
4289 * in racy cases, subsystem may have to get css->refcnt after
4290 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
4291 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
4292 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
4293 * and subsystem's reference count handling. Please see css_get/put
4294 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
4295 */
4296 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4297 4372
4298 /* 4373 /*
4299 * Call pre_destroy handlers of subsys. Notify subsystems 4374 * Put all the base refs. Each css holds an extra reference to the
4300 * that rmdir() request comes. 4375 * cgroup's dentry and cgroup removal proceeds regardless of css
4376 * refs. On the last put of each css, whenever that may be, the
4377 * extra dentry ref is put so that dentry destruction happens only
4378 * after all css's are released.
4301 */ 4379 */
4302 ret = cgroup_call_pre_destroy(cgrp); 4380 for_each_subsys(cgrp->root, ss)
4303 if (ret) { 4381 css_put(cgrp->subsys[ss->subsys_id]);
4304 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4305 return ret;
4306 }
4307
4308 mutex_lock(&cgroup_mutex);
4309 parent = cgrp->parent;
4310 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
4311 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4312 mutex_unlock(&cgroup_mutex);
4313 return -EBUSY;
4314 }
4315 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
4316 if (!cgroup_clear_css_refs(cgrp)) {
4317 mutex_unlock(&cgroup_mutex);
4318 /*
4319 * Because someone may call cgroup_wakeup_rmdir_waiter() before
4320 * prepare_to_wait(), we need to check this flag.
4321 */
4322 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
4323 schedule();
4324 finish_wait(&cgroup_rmdir_waitq, &wait);
4325 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4326 if (signal_pending(current))
4327 return -EINTR;
4328 goto again;
4329 }
4330 /* NO css_tryget() can success after here. */
4331 finish_wait(&cgroup_rmdir_waitq, &wait);
4332 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4333 4382
4334 raw_spin_lock(&release_list_lock); 4383 raw_spin_lock(&release_list_lock);
4335 set_bit(CGRP_REMOVED, &cgrp->flags);
4336 if (!list_empty(&cgrp->release_list)) 4384 if (!list_empty(&cgrp->release_list))
4337 list_del_init(&cgrp->release_list); 4385 list_del_init(&cgrp->release_list);
4338 raw_spin_unlock(&release_list_lock); 4386 raw_spin_unlock(&release_list_lock);
4339 4387
4340 /* delete this cgroup from parent->children */ 4388 /* delete this cgroup from parent->children */
4341 list_del_init(&cgrp->sibling); 4389 list_del_rcu(&cgrp->sibling);
4342
4343 list_del_init(&cgrp->allcg_node); 4390 list_del_init(&cgrp->allcg_node);
4344 4391
4345 d = dget(cgrp->dentry); 4392 dget(d);
4346
4347 cgroup_d_remove_dir(d); 4393 cgroup_d_remove_dir(d);
4348 dput(d); 4394 dput(d);
4349 4395
@@ -4353,21 +4399,29 @@ again:
4353 /* 4399 /*
4354 * Unregister events and notify userspace. 4400 * Unregister events and notify userspace.
4355 * Notify userspace about cgroup removing only after rmdir of cgroup 4401 * Notify userspace about cgroup removing only after rmdir of cgroup
4356 * directory to avoid race between userspace and kernelspace 4402 * directory to avoid race between userspace and kernelspace.
4357 */ 4403 */
4358 spin_lock(&cgrp->event_list_lock); 4404 spin_lock(&cgrp->event_list_lock);
4359 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { 4405 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
4360 list_del(&event->list); 4406 list_del_init(&event->list);
4361 remove_wait_queue(event->wqh, &event->wait);
4362 eventfd_signal(event->eventfd, 1);
4363 schedule_work(&event->remove); 4407 schedule_work(&event->remove);
4364 } 4408 }
4365 spin_unlock(&cgrp->event_list_lock); 4409 spin_unlock(&cgrp->event_list_lock);
4366 4410
4367 mutex_unlock(&cgroup_mutex);
4368 return 0; 4411 return 0;
4369} 4412}
4370 4413
4414static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
4415{
4416 int ret;
4417
4418 mutex_lock(&cgroup_mutex);
4419 ret = cgroup_destroy_locked(dentry->d_fsdata);
4420 mutex_unlock(&cgroup_mutex);
4421
4422 return ret;
4423}
4424
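
[Editor's note] cgroup_destroy_locked() above replaces the old retry-and-wait rmdir logic with a refcount bias: adding CSS_DEACT_BIAS drives every css refcount negative, so a css_tryget() racing with removal fails, while references already held stay valid until their final css_put(). A compact sketch of that deactivation-bias idea, outside cgroup proper (the names and the cmpxchg loop are illustrative, not the exact css_tryget() implementation):

	#include <linux/atomic.h>
	#include <linux/kernel.h>
	#include <linux/types.h>

	#define DEMO_DEACT_BIAS		INT_MIN		/* same idea as CSS_DEACT_BIAS */

	struct demo_ref {
		atomic_t refcnt;	/* starts at 1: the owner's base reference */
	};

	/* Fails as soon as the owner has started tearing the object down. */
	static bool demo_tryget(struct demo_ref *r)
	{
		int v;

		do {
			v = atomic_read(&r->refcnt);
			if (v <= 0)	/* biased negative: destruction under way */
				return false;
		} while (atomic_cmpxchg(&r->refcnt, v, v + 1) != v);

		return true;
	}

	/* Owner side: after this, no new tryget can succeed. */
	static void demo_deactivate(struct demo_ref *r)
	{
		atomic_add(DEMO_DEACT_BIAS, &r->refcnt);
	}
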
4371static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) 4425static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
4372{ 4426{
4373 INIT_LIST_HEAD(&ss->cftsets); 4427 INIT_LIST_HEAD(&ss->cftsets);
@@ -4388,13 +4442,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4388 4442
4389 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4443 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4390 4444
4445 mutex_lock(&cgroup_mutex);
4446
4391 /* init base cftset */ 4447 /* init base cftset */
4392 cgroup_init_cftsets(ss); 4448 cgroup_init_cftsets(ss);
4393 4449
4394 /* Create the top cgroup state for this subsystem */ 4450 /* Create the top cgroup state for this subsystem */
4395 list_add(&ss->sibling, &rootnode.subsys_list); 4451 list_add(&ss->sibling, &rootnode.subsys_list);
4396 ss->root = &rootnode; 4452 ss->root = &rootnode;
4397 css = ss->create(dummytop); 4453 css = ss->css_alloc(dummytop);
4398 /* We don't handle early failures gracefully */ 4454 /* We don't handle early failures gracefully */
4399 BUG_ON(IS_ERR(css)); 4455 BUG_ON(IS_ERR(css));
4400 init_cgroup_css(css, ss, dummytop); 4456 init_cgroup_css(css, ss, dummytop);
@@ -4403,7 +4459,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4403 * pointer to this state - since the subsystem is 4459 * pointer to this state - since the subsystem is
4404 * newly registered, all tasks and hence the 4460 * newly registered, all tasks and hence the
4405 * init_css_set is in the subsystem's top cgroup. */ 4461 * init_css_set is in the subsystem's top cgroup. */
4406 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 4462 init_css_set.subsys[ss->subsys_id] = css;
4407 4463
4408 need_forkexit_callback |= ss->fork || ss->exit; 4464 need_forkexit_callback |= ss->fork || ss->exit;
4409 4465
@@ -4413,6 +4469,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4413 BUG_ON(!list_empty(&init_task.tasks)); 4469 BUG_ON(!list_empty(&init_task.tasks));
4414 4470
4415 ss->active = 1; 4471 ss->active = 1;
4472 BUG_ON(online_css(ss, dummytop));
4473
4474 mutex_unlock(&cgroup_mutex);
4416 4475
4417 /* this function shouldn't be used with modular subsystems, since they 4476 /* this function shouldn't be used with modular subsystems, since they
4418 * need to register a subsys_id, among other things */ 4477 * need to register a subsys_id, among other things */
@@ -4430,12 +4489,15 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4430 */ 4489 */
4431int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) 4490int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4432{ 4491{
4433 int i;
4434 struct cgroup_subsys_state *css; 4492 struct cgroup_subsys_state *css;
4493 int i, ret;
4494 struct hlist_node *tmp;
4495 struct css_set *cg;
4496 unsigned long key;
4435 4497
4436 /* check name and function validity */ 4498 /* check name and function validity */
4437 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4499 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
4438 ss->create == NULL || ss->destroy == NULL) 4500 ss->css_alloc == NULL || ss->css_free == NULL)
4439 return -EINVAL; 4501 return -EINVAL;
4440 4502
4441 /* 4503 /*
@@ -4464,10 +4526,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4464 subsys[ss->subsys_id] = ss; 4526 subsys[ss->subsys_id] = ss;
4465 4527
4466 /* 4528 /*
4467 * no ss->create seems to need anything important in the ss struct, so 4529 * no ss->css_alloc seems to need anything important in the ss
4468 * this can happen first (i.e. before the rootnode attachment). 4530 * struct, so this can happen first (i.e. before the rootnode
4531 * attachment).
4469 */ 4532 */
4470 css = ss->create(dummytop); 4533 css = ss->css_alloc(dummytop);
4471 if (IS_ERR(css)) { 4534 if (IS_ERR(css)) {
4472 /* failure case - need to deassign the subsys[] slot. */ 4535 /* failure case - need to deassign the subsys[] slot. */
4473 subsys[ss->subsys_id] = NULL; 4536 subsys[ss->subsys_id] = NULL;
@@ -4482,14 +4545,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4482 init_cgroup_css(css, ss, dummytop); 4545 init_cgroup_css(css, ss, dummytop);
4483 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4546 /* init_idr must be after init_cgroup_css because it sets css->id. */
4484 if (ss->use_id) { 4547 if (ss->use_id) {
4485 int ret = cgroup_init_idr(ss, css); 4548 ret = cgroup_init_idr(ss, css);
4486 if (ret) { 4549 if (ret)
4487 dummytop->subsys[ss->subsys_id] = NULL; 4550 goto err_unload;
4488 ss->destroy(dummytop);
4489 subsys[ss->subsys_id] = NULL;
4490 mutex_unlock(&cgroup_mutex);
4491 return ret;
4492 }
4493 } 4551 }
4494 4552
4495 /* 4553 /*
@@ -4501,31 +4559,34 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4501 * this is all done under the css_set_lock. 4559 * this is all done under the css_set_lock.
4502 */ 4560 */
4503 write_lock(&css_set_lock); 4561 write_lock(&css_set_lock);
4504 for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { 4562 hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
4505 struct css_set *cg; 4563 /* skip entries that we already rehashed */
4506 struct hlist_node *node, *tmp; 4564 if (cg->subsys[ss->subsys_id])
4507 struct hlist_head *bucket = &css_set_table[i], *new_bucket; 4565 continue;
4508 4566 /* remove existing entry */
4509 hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { 4567 hash_del(&cg->hlist);
4510 /* skip entries that we already rehashed */ 4568 /* set new value */
4511 if (cg->subsys[ss->subsys_id]) 4569 cg->subsys[ss->subsys_id] = css;
4512 continue; 4570 /* recompute hash and restore entry */
4513 /* remove existing entry */ 4571 key = css_set_hash(cg->subsys);
4514 hlist_del(&cg->hlist); 4572 hash_add(css_set_table, &cg->hlist, key);
4515 /* set new value */
4516 cg->subsys[ss->subsys_id] = css;
4517 /* recompute hash and restore entry */
4518 new_bucket = css_set_hash(cg->subsys);
4519 hlist_add_head(&cg->hlist, new_bucket);
4520 }
4521 } 4573 }
4522 write_unlock(&css_set_lock); 4574 write_unlock(&css_set_lock);
4523 4575
4524 ss->active = 1; 4576 ss->active = 1;
4577 ret = online_css(ss, dummytop);
4578 if (ret)
4579 goto err_unload;
4525 4580
4526 /* success! */ 4581 /* success! */
4527 mutex_unlock(&cgroup_mutex); 4582 mutex_unlock(&cgroup_mutex);
4528 return 0; 4583 return 0;
4584
4585err_unload:
4586 mutex_unlock(&cgroup_mutex);
4587 /* @ss can't be mounted here as try_module_get() would fail */
4588 cgroup_unload_subsys(ss);
4589 return ret;
4529} 4590}
4530EXPORT_SYMBOL_GPL(cgroup_load_subsys); 4591EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4531 4592
@@ -4540,7 +4601,6 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4540void cgroup_unload_subsys(struct cgroup_subsys *ss) 4601void cgroup_unload_subsys(struct cgroup_subsys *ss)
4541{ 4602{
4542 struct cg_cgroup_link *link; 4603 struct cg_cgroup_link *link;
4543 struct hlist_head *hhead;
4544 4604
4545 BUG_ON(ss->module == NULL); 4605 BUG_ON(ss->module == NULL);
4546 4606
@@ -4552,6 +4612,13 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4552 BUG_ON(ss->root != &rootnode); 4612 BUG_ON(ss->root != &rootnode);
4553 4613
4554 mutex_lock(&cgroup_mutex); 4614 mutex_lock(&cgroup_mutex);
4615
4616 offline_css(ss, dummytop);
4617 ss->active = 0;
4618
4619 if (ss->use_id)
4620 idr_destroy(&ss->idr);
4621
4555 /* deassign the subsys_id */ 4622 /* deassign the subsys_id */
4556 subsys[ss->subsys_id] = NULL; 4623 subsys[ss->subsys_id] = NULL;
4557 4624
@@ -4565,22 +4632,22 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4565 write_lock(&css_set_lock); 4632 write_lock(&css_set_lock);
4566 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4633 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
4567 struct css_set *cg = link->cg; 4634 struct css_set *cg = link->cg;
4635 unsigned long key;
4568 4636
4569 hlist_del(&cg->hlist); 4637 hash_del(&cg->hlist);
4570 BUG_ON(!cg->subsys[ss->subsys_id]);
4571 cg->subsys[ss->subsys_id] = NULL; 4638 cg->subsys[ss->subsys_id] = NULL;
4572 hhead = css_set_hash(cg->subsys); 4639 key = css_set_hash(cg->subsys);
4573 hlist_add_head(&cg->hlist, hhead); 4640 hash_add(css_set_table, &cg->hlist, key);
4574 } 4641 }
4575 write_unlock(&css_set_lock); 4642 write_unlock(&css_set_lock);
4576 4643
4577 /* 4644 /*
4578 * remove subsystem's css from the dummytop and free it - need to free 4645 * remove subsystem's css from the dummytop and free it - need to
4579 * before marking as null because ss->destroy needs the cgrp->subsys 4646 * free before marking as null because ss->css_free needs the
4580 * pointer to find their state. note that this also takes care of 4647 * cgrp->subsys pointer to find their state. note that this also
4581 * freeing the css_id. 4648 * takes care of freeing the css_id.
4582 */ 4649 */
4583 ss->destroy(dummytop); 4650 ss->css_free(dummytop);
4584 dummytop->subsys[ss->subsys_id] = NULL; 4651 dummytop->subsys[ss->subsys_id] = NULL;
4585 4652
4586 mutex_unlock(&cgroup_mutex); 4653 mutex_unlock(&cgroup_mutex);
@@ -4612,9 +4679,6 @@ int __init cgroup_init_early(void)
4612 list_add(&init_css_set_link.cg_link_list, 4679 list_add(&init_css_set_link.cg_link_list,
4613 &init_css_set.cg_links); 4680 &init_css_set.cg_links);
4614 4681
4615 for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
4616 INIT_HLIST_HEAD(&css_set_table[i]);
4617
4618 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4682 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4619 struct cgroup_subsys *ss = subsys[i]; 4683 struct cgroup_subsys *ss = subsys[i];
4620 4684
@@ -4624,8 +4688,8 @@ int __init cgroup_init_early(void)
4624 4688
4625 BUG_ON(!ss->name); 4689 BUG_ON(!ss->name);
4626 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4690 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
4627 BUG_ON(!ss->create); 4691 BUG_ON(!ss->css_alloc);
4628 BUG_ON(!ss->destroy); 4692 BUG_ON(!ss->css_free);
4629 if (ss->subsys_id != i) { 4693 if (ss->subsys_id != i) {
4630 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 4694 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
4631 ss->name, ss->subsys_id); 4695 ss->name, ss->subsys_id);
@@ -4648,7 +4712,7 @@ int __init cgroup_init(void)
4648{ 4712{
4649 int err; 4713 int err;
4650 int i; 4714 int i;
4651 struct hlist_head *hhead; 4715 unsigned long key;
4652 4716
4653 err = bdi_init(&cgroup_backing_dev_info); 4717 err = bdi_init(&cgroup_backing_dev_info);
4654 if (err) 4718 if (err)
@@ -4667,8 +4731,8 @@ int __init cgroup_init(void)
4667 } 4731 }
4668 4732
4669 /* Add init_css_set to the hash table */ 4733 /* Add init_css_set to the hash table */
4670 hhead = css_set_hash(init_css_set.subsys); 4734 key = css_set_hash(init_css_set.subsys);
4671 hlist_add_head(&init_css_set.hlist, hhead); 4735 hash_add(css_set_table, &init_css_set.hlist, key);
4672 BUG_ON(!init_root_id(&rootnode)); 4736 BUG_ON(!init_root_id(&rootnode));
4673 4737
4674 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4738 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
@@ -4832,44 +4896,19 @@ void cgroup_fork(struct task_struct *child)
4832} 4896}
4833 4897
4834/** 4898/**
4835 * cgroup_fork_callbacks - run fork callbacks
4836 * @child: the new task
4837 *
4838 * Called on a new task very soon before adding it to the
4839 * tasklist. No need to take any locks since no-one can
4840 * be operating on this task.
4841 */
4842void cgroup_fork_callbacks(struct task_struct *child)
4843{
4844 if (need_forkexit_callback) {
4845 int i;
4846 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4847 struct cgroup_subsys *ss = subsys[i];
4848
4849 /*
4850 * forkexit callbacks are only supported for
4851 * builtin subsystems.
4852 */
4853 if (!ss || ss->module)
4854 continue;
4855
4856 if (ss->fork)
4857 ss->fork(child);
4858 }
4859 }
4860}
4861
4862/**
4863 * cgroup_post_fork - called on a new task after adding it to the task list 4899 * cgroup_post_fork - called on a new task after adding it to the task list
4864 * @child: the task in question 4900 * @child: the task in question
4865 * 4901 *
4866 * Adds the task to the list running through its css_set if necessary. 4902 * Adds the task to the list running through its css_set if necessary and
4867 * Has to be after the task is visible on the task list in case we race 4903 * call the subsystem fork() callbacks. Has to be after the task is
4868 * with the first call to cgroup_iter_start() - to guarantee that the 4904 * visible on the task list in case we race with the first call to
4869 * new task ends up on its list. 4905 * cgroup_iter_start() - to guarantee that the new task ends up on its
4906 * list.
4870 */ 4907 */
4871void cgroup_post_fork(struct task_struct *child) 4908void cgroup_post_fork(struct task_struct *child)
4872{ 4909{
4910 int i;
4911
4873 /* 4912 /*
4874 * use_task_css_set_links is set to 1 before we walk the tasklist 4913 * use_task_css_set_links is set to 1 before we walk the tasklist
4875 * under the tasklist_lock and we read it here after we added the child 4914 * under the tasklist_lock and we read it here after we added the child
@@ -4889,7 +4928,30 @@ void cgroup_post_fork(struct task_struct *child)
4889 task_unlock(child); 4928 task_unlock(child);
4890 write_unlock(&css_set_lock); 4929 write_unlock(&css_set_lock);
4891 } 4930 }
4931
4932 /*
4933 * Call ss->fork(). This must happen after @child is linked on
4934 * css_set; otherwise, @child might change state between ->fork()
4935 * and addition to css_set.
4936 */
4937 if (need_forkexit_callback) {
4938 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4939 struct cgroup_subsys *ss = subsys[i];
4940
4941 /*
4942 * fork/exit callbacks are supported only for
4943 * builtin subsystems and we don't need further
4944 * synchronization as they never go away.
4945 */
4946 if (!ss || ss->module)
4947 continue;
4948
4949 if (ss->fork)
4950 ss->fork(child);
4951 }
4952 }
4892} 4953}
4954
4893/** 4955/**
4894 * cgroup_exit - detach cgroup from exiting task 4956 * cgroup_exit - detach cgroup from exiting task
4895 * @tsk: pointer to task_struct of exiting process 4957 * @tsk: pointer to task_struct of exiting process
@@ -4965,8 +5027,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4965 } 5027 }
4966 task_unlock(tsk); 5028 task_unlock(tsk);
4967 5029
4968 if (cg) 5030 put_css_set_taskexit(cg);
4969 put_css_set_taskexit(cg);
4970} 5031}
4971 5032
4972/** 5033/**
@@ -5022,15 +5083,17 @@ static void check_for_release(struct cgroup *cgrp)
5022/* Caller must verify that the css is not for root cgroup */ 5083/* Caller must verify that the css is not for root cgroup */
5023bool __css_tryget(struct cgroup_subsys_state *css) 5084bool __css_tryget(struct cgroup_subsys_state *css)
5024{ 5085{
5025 do { 5086 while (true) {
5026 int v = css_refcnt(css); 5087 int t, v;
5027 5088
5028 if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) 5089 v = css_refcnt(css);
5090 t = atomic_cmpxchg(&css->refcnt, v, v + 1);
5091 if (likely(t == v))
5029 return true; 5092 return true;
5093 else if (t < 0)
5094 return false;
5030 cpu_relax(); 5095 cpu_relax();
5031 } while (!test_bit(CSS_REMOVED, &css->flags)); 5096 }
5032
5033 return false;
5034} 5097}
5035EXPORT_SYMBOL_GPL(__css_tryget); 5098EXPORT_SYMBOL_GPL(__css_tryget);
5036 5099
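The rewritten __css_tryget() above is a classic compare-and-swap retry loop, with one refinement: instead of testing a separate CSS_REMOVED bit, it gives up as soon as the observed refcount goes negative, which is how a dying css is now marked. A stand-alone C11 sketch of the same shape (simplified: the kernel goes through css_refcnt(), which also masks a deactivation bias; try_get and refcnt here are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

/* refcount >= 0: object is live; refcount < 0: destruction has begun */
static bool try_get(atomic_int *refcnt)
{
	int v = atomic_load(refcnt);

	for (;;) {
		if (v < 0)
			return false;	/* already marked dead, give up */
		/* on failure, v is refreshed with the current value */
		if (atomic_compare_exchange_weak(refcnt, &v, v + 1))
			return true;
	}
}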
@@ -5049,11 +5112,9 @@ void __css_put(struct cgroup_subsys_state *css)
5049 set_bit(CGRP_RELEASABLE, &cgrp->flags); 5112 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5050 check_for_release(cgrp); 5113 check_for_release(cgrp);
5051 } 5114 }
5052 cgroup_wakeup_rmdir_waiter(cgrp);
5053 break; 5115 break;
5054 case 0: 5116 case 0:
5055 if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) 5117 schedule_work(&css->dput_work);
5056 schedule_work(&css->dput_work);
5057 break; 5118 break;
5058 } 5119 }
5059 rcu_read_unlock(); 5120 rcu_read_unlock();
@@ -5257,7 +5318,7 @@ EXPORT_SYMBOL_GPL(free_css_id);
5257static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) 5318static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5258{ 5319{
5259 struct css_id *newid; 5320 struct css_id *newid;
5260 int myid, error, size; 5321 int ret, size;
5261 5322
5262 BUG_ON(!ss->use_id); 5323 BUG_ON(!ss->use_id);
5263 5324
@@ -5265,35 +5326,24 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
5265 newid = kzalloc(size, GFP_KERNEL); 5326 newid = kzalloc(size, GFP_KERNEL);
5266 if (!newid) 5327 if (!newid)
5267 return ERR_PTR(-ENOMEM); 5328 return ERR_PTR(-ENOMEM);
5268 /* get id */ 5329
5269 if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { 5330 idr_preload(GFP_KERNEL);
5270 error = -ENOMEM;
5271 goto err_out;
5272 }
5273 spin_lock(&ss->id_lock); 5331 spin_lock(&ss->id_lock);
5274 /* Don't use 0. allocates an ID of 1-65535 */ 5332 /* Don't use 0. allocates an ID of 1-65535 */
5275 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 5333 ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
5276 spin_unlock(&ss->id_lock); 5334 spin_unlock(&ss->id_lock);
5335 idr_preload_end();
5277 5336
5278 /* Returns error when there are no free spaces for new ID.*/ 5337 /* Returns error when there are no free spaces for new ID.*/
5279 if (error) { 5338 if (ret < 0)
5280 error = -ENOSPC;
5281 goto err_out; 5339 goto err_out;
5282 }
5283 if (myid > CSS_ID_MAX)
5284 goto remove_idr;
5285 5340
5286 newid->id = myid; 5341 newid->id = ret;
5287 newid->depth = depth; 5342 newid->depth = depth;
5288 return newid; 5343 return newid;
5289remove_idr:
5290 error = -ENOSPC;
5291 spin_lock(&ss->id_lock);
5292 idr_remove(&ss->idr, myid);
5293 spin_unlock(&ss->id_lock);
5294err_out: 5344err_out:
5295 kfree(newid); 5345 kfree(newid);
5296 return ERR_PTR(error); 5346 return ERR_PTR(ret);
5297 5347
5298} 5348}
5299 5349
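get_new_cssid() above drops the old idr_pre_get() + idr_get_new_above() two-step in favour of the idr_preload()/idr_alloc() pattern: preallocation happens while sleeping is still allowed, the allocation itself runs under the spinlock with GFP_NOWAIT, and the upper bound is enforced by idr_alloc() directly (its end argument is exclusive, hence CSS_ID_MAX + 1). Condensed to its essentials, the allocation path now has this shape:

	int id;

	idr_preload(GFP_KERNEL);		/* may sleep: preallocate here */
	spin_lock(&ss->id_lock);		/* then allocate under the lock */
	id = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
	spin_unlock(&ss->id_lock);
	idr_preload_end();

	if (id < 0)				/* -ENOMEM or -ENOSPC */
		return ERR_PTR(id);
	newid->id = id;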
@@ -5424,7 +5474,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5424 struct inode *inode; 5474 struct inode *inode;
5425 struct cgroup_subsys_state *css; 5475 struct cgroup_subsys_state *css;
5426 5476
5427 inode = f->f_dentry->d_inode; 5477 inode = file_inode(f);
5428 /* check in cgroup filesystem dir */ 5478 /* check in cgroup filesystem dir */
5429 if (inode->i_op != &cgroup_dir_inode_operations) 5479 if (inode->i_op != &cgroup_dir_inode_operations)
5430 return ERR_PTR(-EBADF); 5480 return ERR_PTR(-EBADF);
@@ -5439,7 +5489,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5439} 5489}
5440 5490
5441#ifdef CONFIG_CGROUP_DEBUG 5491#ifdef CONFIG_CGROUP_DEBUG
5442static struct cgroup_subsys_state *debug_create(struct cgroup *cont) 5492static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
5443{ 5493{
5444 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5494 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5445 5495
@@ -5449,7 +5499,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
5449 return css; 5499 return css;
5450} 5500}
5451 5501
5452static void debug_destroy(struct cgroup *cont) 5502static void debug_css_free(struct cgroup *cont)
5453{ 5503{
5454 kfree(cont->subsys[debug_subsys_id]); 5504 kfree(cont->subsys[debug_subsys_id]);
5455} 5505}
@@ -5578,8 +5628,8 @@ static struct cftype debug_files[] = {
5578 5628
5579struct cgroup_subsys debug_subsys = { 5629struct cgroup_subsys debug_subsys = {
5580 .name = "debug", 5630 .name = "debug",
5581 .create = debug_create, 5631 .css_alloc = debug_css_alloc,
5582 .destroy = debug_destroy, 5632 .css_free = debug_css_free,
5583 .subsys_id = debug_subsys_id, 5633 .subsys_id = debug_subsys_id,
5584 .base_cftypes = debug_files, 5634 .base_cftypes = debug_files,
5585}; 5635};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index b1724ce98981..75dda1ea5026 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -22,24 +22,33 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/seq_file.h> 23#include <linux/seq_file.h>
24 24
25enum freezer_state { 25/*
26 CGROUP_THAWED = 0, 26 * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
27 CGROUP_FREEZING, 27 * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
28 CGROUP_FROZEN, 28 * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
29 * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
30 * its ancestors has FREEZING_SELF set.
31 */
32enum freezer_state_flags {
33 CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
34 CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
35 CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
36 CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
37
38 /* mask for all FREEZING flags */
39 CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
29}; 40};
30 41
31struct freezer { 42struct freezer {
32 struct cgroup_subsys_state css; 43 struct cgroup_subsys_state css;
33 enum freezer_state state; 44 unsigned int state;
34 spinlock_t lock; /* protects _writes_ to state */ 45 spinlock_t lock;
35}; 46};
36 47
37static inline struct freezer *cgroup_freezer( 48static inline struct freezer *cgroup_freezer(struct cgroup *cgroup)
38 struct cgroup *cgroup)
39{ 49{
40 return container_of( 50 return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id),
41 cgroup_subsys_state(cgroup, freezer_subsys_id), 51 struct freezer, css);
42 struct freezer, css);
43} 52}
44 53
45static inline struct freezer *task_freezer(struct task_struct *task) 54static inline struct freezer *task_freezer(struct task_struct *task)
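The enum above replaces the old three-value state machine with independent flag bits: a freezer can be FREEZING on its own behalf (SELF), because an ancestor is freezing (PARENT), or both, while FROZEN is a separate completion bit. A small stand-alone model of how those bits decode back into the three user-visible strings, using the same precedence the patch's freezer_state_strs() applies later in this diff (names shortened for the sketch):

#include <stdio.h>

enum {
	ONLINE		= 1 << 0,
	FREEZING_SELF	= 1 << 1,
	FREEZING_PARENT	= 1 << 2,
	FROZEN		= 1 << 3,
	FREEZING	= FREEZING_SELF | FREEZING_PARENT,
};

static const char *state_str(unsigned int state)
{
	if (state & FROZEN)
		return "FROZEN";	/* completion wins over in-progress */
	if (state & FREEZING)
		return "FREEZING";
	return "THAWED";
}

int main(void)
{
	/* freezing only because of an ancestor, not yet fully frozen */
	printf("%s\n", state_str(ONLINE | FREEZING_PARENT));		/* FREEZING */
	/* self-initiated freeze that has completed */
	printf("%s\n", state_str(ONLINE | FREEZING_SELF | FROZEN));	/* FROZEN */
	return 0;
}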
@@ -48,14 +57,21 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 57 struct freezer, css);
49} 58}
50 59
60static struct freezer *parent_freezer(struct freezer *freezer)
61{
62 struct cgroup *pcg = freezer->css.cgroup->parent;
63
64 if (pcg)
65 return cgroup_freezer(pcg);
66 return NULL;
67}
68
51bool cgroup_freezing(struct task_struct *task) 69bool cgroup_freezing(struct task_struct *task)
52{ 70{
53 enum freezer_state state;
54 bool ret; 71 bool ret;
55 72
56 rcu_read_lock(); 73 rcu_read_lock();
57 state = task_freezer(task)->state; 74 ret = task_freezer(task)->state & CGROUP_FREEZING;
58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
59 rcu_read_unlock(); 75 rcu_read_unlock();
60 76
61 return ret; 77 return ret;
@@ -65,70 +81,18 @@ bool cgroup_freezing(struct task_struct *task)
65 * cgroups_write_string() limits the size of freezer state strings to 81 * cgroups_write_string() limits the size of freezer state strings to
66 * CGROUP_LOCAL_BUFFER_SIZE 82 * CGROUP_LOCAL_BUFFER_SIZE
67 */ 83 */
68static const char *freezer_state_strs[] = { 84static const char *freezer_state_strs(unsigned int state)
69 "THAWED", 85{
70 "FREEZING", 86 if (state & CGROUP_FROZEN)
71 "FROZEN", 87 return "FROZEN";
88 if (state & CGROUP_FREEZING)
89 return "FREEZING";
90 return "THAWED";
72}; 91};
73 92
74/*
75 * State diagram
76 * Transitions are caused by userspace writes to the freezer.state file.
77 * The values in parenthesis are state labels. The rest are edge labels.
78 *
79 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
80 * ^ ^ | |
81 * | \_______THAWED_______/ |
82 * \__________________________THAWED____________/
83 */
84
85struct cgroup_subsys freezer_subsys; 93struct cgroup_subsys freezer_subsys;
86 94
87/* Locks taken and their ordering 95static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup)
88 * ------------------------------
89 * cgroup_mutex (AKA cgroup_lock)
90 * freezer->lock
91 * css_set_lock
92 * task->alloc_lock (AKA task_lock)
93 * task->sighand->siglock
94 *
95 * cgroup code forces css_set_lock to be taken before task->alloc_lock
96 *
97 * freezer_create(), freezer_destroy():
98 * cgroup_mutex [ by cgroup core ]
99 *
100 * freezer_can_attach():
101 * cgroup_mutex (held by caller of can_attach)
102 *
103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
104 * freezer->lock
105 * sighand->siglock (if the cgroup is freezing)
106 *
107 * freezer_read():
108 * cgroup_mutex
109 * freezer->lock
110 * write_lock css_set_lock (cgroup iterator start)
111 * task->alloc_lock
112 * read_lock css_set_lock (cgroup iterator start)
113 *
114 * freezer_write() (freeze):
115 * cgroup_mutex
116 * freezer->lock
117 * write_lock css_set_lock (cgroup iterator start)
118 * task->alloc_lock
119 * read_lock css_set_lock (cgroup iterator start)
120 * sighand->siglock (fake signal delivery inside freeze_task())
121 *
122 * freezer_write() (unfreeze):
123 * cgroup_mutex
124 * freezer->lock
125 * write_lock css_set_lock (cgroup iterator start)
126 * task->alloc_lock
127 * read_lock css_set_lock (cgroup iterator start)
128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
129 * sighand->siglock
130 */
131static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
132{ 96{
133 struct freezer *freezer; 97 struct freezer *freezer;
134 98
@@ -137,160 +101,244 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
137 return ERR_PTR(-ENOMEM); 101 return ERR_PTR(-ENOMEM);
138 102
139 spin_lock_init(&freezer->lock); 103 spin_lock_init(&freezer->lock);
140 freezer->state = CGROUP_THAWED;
141 return &freezer->css; 104 return &freezer->css;
142} 105}
143 106
144static void freezer_destroy(struct cgroup *cgroup) 107/**
108 * freezer_css_online - commit creation of a freezer cgroup
109 * @cgroup: cgroup being created
110 *
111 * We're committing to creation of @cgroup. Mark it online and inherit
112 * parent's freezing state while holding both parent's and our
113 * freezer->lock.
114 */
115static int freezer_css_online(struct cgroup *cgroup)
116{
117 struct freezer *freezer = cgroup_freezer(cgroup);
118 struct freezer *parent = parent_freezer(freezer);
119
120 /*
121 * The following double locking and freezing state inheritance
122 * guarantee that @cgroup can never escape ancestors' freezing
123 * states. See cgroup_for_each_descendant_pre() for details.
124 */
125 if (parent)
126 spin_lock_irq(&parent->lock);
127 spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
128
129 freezer->state |= CGROUP_FREEZER_ONLINE;
130
131 if (parent && (parent->state & CGROUP_FREEZING)) {
132 freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
133 atomic_inc(&system_freezing_cnt);
134 }
135
136 spin_unlock(&freezer->lock);
137 if (parent)
138 spin_unlock_irq(&parent->lock);
139
140 return 0;
141}
142
143/**
144 * freezer_css_offline - initiate destruction of @cgroup
145 * @cgroup: cgroup being destroyed
146 *
147 * @cgroup is going away. Mark it dead and decrement system_freezing_count
148 * if it was holding one.
149 */
150static void freezer_css_offline(struct cgroup *cgroup)
145{ 151{
146 struct freezer *freezer = cgroup_freezer(cgroup); 152 struct freezer *freezer = cgroup_freezer(cgroup);
147 153
148 if (freezer->state != CGROUP_THAWED) 154 spin_lock_irq(&freezer->lock);
155
156 if (freezer->state & CGROUP_FREEZING)
149 atomic_dec(&system_freezing_cnt); 157 atomic_dec(&system_freezing_cnt);
150 kfree(freezer); 158
159 freezer->state = 0;
160
161 spin_unlock_irq(&freezer->lock);
151} 162}
152 163
153/* task is frozen or will freeze immediately when next it gets woken */ 164static void freezer_css_free(struct cgroup *cgroup)
154static bool is_task_frozen_enough(struct task_struct *task)
155{ 165{
156 return frozen(task) || 166 kfree(cgroup_freezer(cgroup));
157 (task_is_stopped_or_traced(task) && freezing(task));
158} 167}
159 168
160/* 169/*
161 * The call to cgroup_lock() in the freezer.state write method prevents 170 * Tasks can be migrated into a different freezer anytime regardless of its
162 * a write to that file racing against an attach, and hence the 171 * current state. freezer_attach() is responsible for making new tasks
163 * can_attach() result will remain valid until the attach completes. 172 * conform to the current state.
173 *
174 * Freezer state changes and task migration are synchronized via
175 * @freezer->lock. freezer_attach() makes the new tasks conform to the
176 * current state and all following state changes can see the new tasks.
164 */ 177 */
165static int freezer_can_attach(struct cgroup *new_cgroup, 178static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset)
166 struct cgroup_taskset *tset)
167{ 179{
168 struct freezer *freezer; 180 struct freezer *freezer = cgroup_freezer(new_cgrp);
169 struct task_struct *task; 181 struct task_struct *task;
182 bool clear_frozen = false;
183
184 spin_lock_irq(&freezer->lock);
170 185
171 /* 186 /*
172 * Anything frozen can't move or be moved to/from. 187 * Make the new tasks conform to the current state of @new_cgrp.
188 * For simplicity, when migrating any task to a FROZEN cgroup, we
189 * revert it to FREEZING and let update_if_frozen() determine the
190 * correct state later.
191 *
192 * Tasks in @tset are on @new_cgrp but may not conform to its
193 * current state before executing the following - !frozen tasks may
194 * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
173 */ 195 */
174 cgroup_taskset_for_each(task, new_cgroup, tset) 196 cgroup_taskset_for_each(task, new_cgrp, tset) {
175 if (cgroup_freezing(task)) 197 if (!(freezer->state & CGROUP_FREEZING)) {
176 return -EBUSY; 198 __thaw_task(task);
199 } else {
200 freeze_task(task);
201 freezer->state &= ~CGROUP_FROZEN;
202 clear_frozen = true;
203 }
204 }
177 205
178 freezer = cgroup_freezer(new_cgroup); 206 spin_unlock_irq(&freezer->lock);
179 if (freezer->state != CGROUP_THAWED)
180 return -EBUSY;
181 207
182 return 0; 208 /*
209 * Propagate FROZEN clearing upwards. We may race with
210 * update_if_frozen(), but as long as both work bottom-up, either
211 * update_if_frozen() sees child's FROZEN cleared or we clear the
212 * parent's FROZEN later. No parent w/ !FROZEN children can be
213 * left FROZEN.
214 */
215 while (clear_frozen && (freezer = parent_freezer(freezer))) {
216 spin_lock_irq(&freezer->lock);
217 freezer->state &= ~CGROUP_FROZEN;
218 clear_frozen = freezer->state & CGROUP_FREEZING;
219 spin_unlock_irq(&freezer->lock);
220 }
183} 221}
184 222
185static void freezer_fork(struct task_struct *task) 223static void freezer_fork(struct task_struct *task)
186{ 224{
187 struct freezer *freezer; 225 struct freezer *freezer;
188 226
189 /*
190 * No lock is needed, since the task isn't on tasklist yet,
191 * so it can't be moved to another cgroup, which means the
192 * freezer won't be removed and will be valid during this
193 * function call. Nevertheless, apply RCU read-side critical
194 * section to suppress RCU lockdep false positives.
195 */
196 rcu_read_lock(); 227 rcu_read_lock();
197 freezer = task_freezer(task); 228 freezer = task_freezer(task);
198 rcu_read_unlock();
199 229
200 /* 230 /*
201 * The root cgroup is non-freezable, so we can skip the 231 * The root cgroup is non-freezable, so we can skip the
202 * following check. 232 * following check.
203 */ 233 */
204 if (!freezer->css.cgroup->parent) 234 if (!freezer->css.cgroup->parent)
205 return; 235 goto out;
206 236
207 spin_lock_irq(&freezer->lock); 237 spin_lock_irq(&freezer->lock);
208 BUG_ON(freezer->state == CGROUP_FROZEN); 238 if (freezer->state & CGROUP_FREEZING)
209
210 /* Locking avoids race with FREEZING -> THAWED transitions. */
211 if (freezer->state == CGROUP_FREEZING)
212 freeze_task(task); 239 freeze_task(task);
213 spin_unlock_irq(&freezer->lock); 240 spin_unlock_irq(&freezer->lock);
241out:
242 rcu_read_unlock();
214} 243}
215 244
216/* 245/**
217 * caller must hold freezer->lock 246 * update_if_frozen - update whether a cgroup finished freezing
247 * @cgroup: cgroup of interest
248 *
249 * Once FREEZING is initiated, transition to FROZEN is lazily updated by
250 * calling this function. If the current state is FREEZING but not FROZEN,
251 * this function checks whether all tasks of this cgroup and the descendant
252 * cgroups finished freezing and, if so, sets FROZEN.
253 *
254 * The caller is responsible for grabbing RCU read lock and calling
255 * update_if_frozen() on all descendants prior to invoking this function.
256 *
257 * Task states and freezer state might disagree while tasks are being
258 * migrated into or out of @cgroup, so we can't verify task states against
259 * @freezer state here. See freezer_attach() for details.
218 */ 260 */
219static void update_if_frozen(struct cgroup *cgroup, 261static void update_if_frozen(struct cgroup *cgroup)
220 struct freezer *freezer)
221{ 262{
263 struct freezer *freezer = cgroup_freezer(cgroup);
264 struct cgroup *pos;
222 struct cgroup_iter it; 265 struct cgroup_iter it;
223 struct task_struct *task; 266 struct task_struct *task;
224 unsigned int nfrozen = 0, ntotal = 0;
225 enum freezer_state old_state = freezer->state;
226 267
227 cgroup_iter_start(cgroup, &it); 268 WARN_ON_ONCE(!rcu_read_lock_held());
228 while ((task = cgroup_iter_next(cgroup, &it))) { 269
229 ntotal++; 270 spin_lock_irq(&freezer->lock);
230 if (freezing(task) && is_task_frozen_enough(task)) 271
231 nfrozen++; 272 if (!(freezer->state & CGROUP_FREEZING) ||
273 (freezer->state & CGROUP_FROZEN))
274 goto out_unlock;
275
276 /* are all (live) children frozen? */
277 cgroup_for_each_child(pos, cgroup) {
278 struct freezer *child = cgroup_freezer(pos);
279
280 if ((child->state & CGROUP_FREEZER_ONLINE) &&
281 !(child->state & CGROUP_FROZEN))
282 goto out_unlock;
232 } 283 }
233 284
234 if (old_state == CGROUP_THAWED) { 285 /* are all tasks frozen? */
235 BUG_ON(nfrozen > 0); 286 cgroup_iter_start(cgroup, &it);
236 } else if (old_state == CGROUP_FREEZING) { 287
237 if (nfrozen == ntotal) 288 while ((task = cgroup_iter_next(cgroup, &it))) {
238 freezer->state = CGROUP_FROZEN; 289 if (freezing(task)) {
239 } else { /* old_state == CGROUP_FROZEN */ 290 /*
240 BUG_ON(nfrozen != ntotal); 291 * freezer_should_skip() indicates that the task
292 * should be skipped when determining freezing
293 * completion. Consider it frozen in addition to
294 * the usual frozen condition.
295 */
296 if (!frozen(task) && !freezer_should_skip(task))
297 goto out_iter_end;
298 }
241 } 299 }
242 300
301 freezer->state |= CGROUP_FROZEN;
302out_iter_end:
243 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
304out_unlock:
305 spin_unlock_irq(&freezer->lock);
244} 306}
245 307
246static int freezer_read(struct cgroup *cgroup, struct cftype *cft, 308static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
247 struct seq_file *m) 309 struct seq_file *m)
248{ 310{
249 struct freezer *freezer; 311 struct cgroup *pos;
250 enum freezer_state state;
251 312
252 if (!cgroup_lock_live_group(cgroup)) 313 rcu_read_lock();
253 return -ENODEV;
254 314
255 freezer = cgroup_freezer(cgroup); 315 /* update states bottom-up */
256 spin_lock_irq(&freezer->lock); 316 cgroup_for_each_descendant_post(pos, cgroup)
257 state = freezer->state; 317 update_if_frozen(pos);
258 if (state == CGROUP_FREEZING) { 318 update_if_frozen(cgroup);
259 /* We change from FREEZING to FROZEN lazily if the cgroup was 319
260 * only partially frozen when we exitted write. */ 320 rcu_read_unlock();
261 update_if_frozen(cgroup, freezer);
262 state = freezer->state;
263 }
264 spin_unlock_irq(&freezer->lock);
265 cgroup_unlock();
266 321
267 seq_puts(m, freezer_state_strs[state]); 322 seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state));
268 seq_putc(m, '\n'); 323 seq_putc(m, '\n');
269 return 0; 324 return 0;
270} 325}
271 326
272static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 327static void freeze_cgroup(struct freezer *freezer)
273{ 328{
329 struct cgroup *cgroup = freezer->css.cgroup;
274 struct cgroup_iter it; 330 struct cgroup_iter it;
275 struct task_struct *task; 331 struct task_struct *task;
276 unsigned int num_cant_freeze_now = 0;
277 332
278 cgroup_iter_start(cgroup, &it); 333 cgroup_iter_start(cgroup, &it);
279 while ((task = cgroup_iter_next(cgroup, &it))) { 334 while ((task = cgroup_iter_next(cgroup, &it)))
280 if (!freeze_task(task)) 335 freeze_task(task);
281 continue;
282 if (is_task_frozen_enough(task))
283 continue;
284 if (!freezing(task) && !freezer_should_skip(task))
285 num_cant_freeze_now++;
286 }
287 cgroup_iter_end(cgroup, &it); 336 cgroup_iter_end(cgroup, &it);
288
289 return num_cant_freeze_now ? -EBUSY : 0;
290} 337}
291 338
292static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) 339static void unfreeze_cgroup(struct freezer *freezer)
293{ 340{
341 struct cgroup *cgroup = freezer->css.cgroup;
294 struct cgroup_iter it; 342 struct cgroup_iter it;
295 struct task_struct *task; 343 struct task_struct *task;
296 344
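freezer_attach() above ends with an upward walk: when migration knocks the target cgroup from FROZEN back to FREEZING, every ancestor must drop its FROZEN bit too, because no cgroup may report FROZEN while a descendant holds an unfrozen task. A stand-alone sketch of that walk over a toy parent-linked structure (struct node and propagate_frozen_clear are illustrative, not from the patch):

#include <stdbool.h>

enum {
	FREEZING_SELF	= 1 << 1,
	FREEZING_PARENT	= 1 << 2,
	FROZEN		= 1 << 3,
	FREEZING	= FREEZING_SELF | FREEZING_PARENT,
};

struct node {
	unsigned int state;
	struct node *parent;
};

/*
 * A not-yet-frozen task was attached to @n, so @n just lost its FROZEN
 * bit. Clear FROZEN upwards as well, stopping once an ancestor is no
 * longer freezing (mirrors the clear_frozen loop above).
 */
static void propagate_frozen_clear(struct node *n)
{
	bool clear = n->state & FREEZING;

	while (clear && (n = n->parent)) {
		n->state &= ~FROZEN;
		/* keep walking only while this ancestor is itself freezing */
		clear = n->state & FREEZING;
	}
}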
@@ -300,59 +348,111 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
300 cgroup_iter_end(cgroup, &it); 348 cgroup_iter_end(cgroup, &it);
301} 349}
302 350
303static int freezer_change_state(struct cgroup *cgroup, 351/**
304 enum freezer_state goal_state) 352 * freezer_apply_state - apply state change to a single cgroup_freezer
353 * @freezer: freezer to apply state change to
354 * @freeze: whether to freeze or unfreeze
355 * @state: CGROUP_FREEZING_* flag to set or clear
356 *
357 * Set or clear @state on @cgroup according to @freeze, and perform
358 * freezing or thawing as necessary.
359 */
360static void freezer_apply_state(struct freezer *freezer, bool freeze,
361 unsigned int state)
305{ 362{
306 struct freezer *freezer; 363 /* also synchronizes against task migration, see freezer_attach() */
307 int retval = 0; 364 lockdep_assert_held(&freezer->lock);
308
309 freezer = cgroup_freezer(cgroup);
310 365
311 spin_lock_irq(&freezer->lock); 366 if (!(freezer->state & CGROUP_FREEZER_ONLINE))
367 return;
312 368
313 update_if_frozen(cgroup, freezer); 369 if (freeze) {
314 370 if (!(freezer->state & CGROUP_FREEZING))
315 switch (goal_state) {
316 case CGROUP_THAWED:
317 if (freezer->state != CGROUP_THAWED)
318 atomic_dec(&system_freezing_cnt);
319 freezer->state = CGROUP_THAWED;
320 unfreeze_cgroup(cgroup, freezer);
321 break;
322 case CGROUP_FROZEN:
323 if (freezer->state == CGROUP_THAWED)
324 atomic_inc(&system_freezing_cnt); 371 atomic_inc(&system_freezing_cnt);
325 freezer->state = CGROUP_FREEZING; 372 freezer->state |= state;
326 retval = try_to_freeze_cgroup(cgroup, freezer); 373 freeze_cgroup(freezer);
327 break; 374 } else {
328 default: 375 bool was_freezing = freezer->state & CGROUP_FREEZING;
329 BUG(); 376
377 freezer->state &= ~state;
378
379 if (!(freezer->state & CGROUP_FREEZING)) {
380 if (was_freezing)
381 atomic_dec(&system_freezing_cnt);
382 freezer->state &= ~CGROUP_FROZEN;
383 unfreeze_cgroup(freezer);
384 }
330 } 385 }
386}
331 387
388/**
389 * freezer_change_state - change the freezing state of a cgroup_freezer
390 * @freezer: freezer of interest
391 * @freeze: whether to freeze or thaw
392 *
393 * Freeze or thaw @freezer according to @freeze. The operations are
394 * recursive - all descendants of @freezer will be affected.
395 */
396static void freezer_change_state(struct freezer *freezer, bool freeze)
397{
398 struct cgroup *pos;
399
400 /* update @freezer */
401 spin_lock_irq(&freezer->lock);
402 freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF);
332 spin_unlock_irq(&freezer->lock); 403 spin_unlock_irq(&freezer->lock);
333 404
334 return retval; 405 /*
406 * Update all its descendants in pre-order traversal. Each
407 * descendant will try to inherit its parent's FREEZING state as
408 * CGROUP_FREEZING_PARENT.
409 */
410 rcu_read_lock();
411 cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) {
412 struct freezer *pos_f = cgroup_freezer(pos);
413 struct freezer *parent = parent_freezer(pos_f);
414
415 /*
416 * Our update to @parent->state is already visible which is
417 * all we need. No need to lock @parent. For more info on
 418 * synchronization, see freezer_css_online().
419 */
420 spin_lock_irq(&pos_f->lock);
421 freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING,
422 CGROUP_FREEZING_PARENT);
423 spin_unlock_irq(&pos_f->lock);
424 }
425 rcu_read_unlock();
335} 426}
336 427
337static int freezer_write(struct cgroup *cgroup, 428static int freezer_write(struct cgroup *cgroup, struct cftype *cft,
338 struct cftype *cft,
339 const char *buffer) 429 const char *buffer)
340{ 430{
341 int retval; 431 bool freeze;
342 enum freezer_state goal_state;
343 432
344 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) 433 if (strcmp(buffer, freezer_state_strs(0)) == 0)
345 goal_state = CGROUP_THAWED; 434 freeze = false;
346 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) 435 else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0)
347 goal_state = CGROUP_FROZEN; 436 freeze = true;
348 else 437 else
349 return -EINVAL; 438 return -EINVAL;
350 439
351 if (!cgroup_lock_live_group(cgroup)) 440 freezer_change_state(cgroup_freezer(cgroup), freeze);
352 return -ENODEV; 441 return 0;
353 retval = freezer_change_state(cgroup, goal_state); 442}
354 cgroup_unlock(); 443
355 return retval; 444static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft)
445{
446 struct freezer *freezer = cgroup_freezer(cgroup);
447
448 return (bool)(freezer->state & CGROUP_FREEZING_SELF);
449}
450
451static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft)
452{
453 struct freezer *freezer = cgroup_freezer(cgroup);
454
455 return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
356} 456}
357 457
358static struct cftype files[] = { 458static struct cftype files[] = {
@@ -362,23 +462,27 @@ static struct cftype files[] = {
362 .read_seq_string = freezer_read, 462 .read_seq_string = freezer_read,
363 .write_string = freezer_write, 463 .write_string = freezer_write,
364 }, 464 },
465 {
466 .name = "self_freezing",
467 .flags = CFTYPE_NOT_ON_ROOT,
468 .read_u64 = freezer_self_freezing_read,
469 },
470 {
471 .name = "parent_freezing",
472 .flags = CFTYPE_NOT_ON_ROOT,
473 .read_u64 = freezer_parent_freezing_read,
474 },
365 { } /* terminate */ 475 { } /* terminate */
366}; 476};
367 477
368struct cgroup_subsys freezer_subsys = { 478struct cgroup_subsys freezer_subsys = {
369 .name = "freezer", 479 .name = "freezer",
370 .create = freezer_create, 480 .css_alloc = freezer_css_alloc,
371 .destroy = freezer_destroy, 481 .css_online = freezer_css_online,
482 .css_offline = freezer_css_offline,
483 .css_free = freezer_css_free,
372 .subsys_id = freezer_subsys_id, 484 .subsys_id = freezer_subsys_id,
373 .can_attach = freezer_can_attach, 485 .attach = freezer_attach,
374 .fork = freezer_fork, 486 .fork = freezer_fork,
375 .base_cftypes = files, 487 .base_cftypes = files,
376
377 /*
378 * freezer subsys doesn't handle hierarchy at all. Frozen state
379 * should be inherited through the hierarchy - if a parent is
380 * frozen, all its children should be frozen. Fix it and remove
381 * the following.
382 */
383 .broken_hierarchy = true,
384}; 488};
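With the cftype additions above, every non-root freezer cgroup now exposes freezer.self_freezing and freezer.parent_freezing next to freezer.state. A small userspace sketch of how the three files fit together, assuming a v1 freezer hierarchy mounted at the conventional /sys/fs/cgroup/freezer path and an existing child group named demo (both are assumptions for the example, not part of the patch):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define GRP "/sys/fs/cgroup/freezer/demo"	/* assumed mount point + group */

static void show(const char *path, const char *label)
{
	char buf[64];
	ssize_t n;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return;
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n > 0) {
		buf[n] = '\0';
		printf("%s: %s", label, buf);	/* the files end with '\n' */
	}
}

int main(void)
{
	int fd = open(GRP "/freezer.state", O_WRONLY);

	/* ask the group (and, recursively, its descendants) to freeze */
	if (fd < 0 || write(fd, "FROZEN", 6) != 6)
		perror("write freezer.state");
	if (fd >= 0)
		close(fd);

	show(GRP "/freezer.state", "state");			/* FREEZING or FROZEN */
	show(GRP "/freezer.self_freezing", "self_freezing");	/* 1: frozen by this write */
	show(GRP "/freezer.parent_freezing", "parent_freezing"); /* 1 only if an ancestor froze it */
	return 0;
}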
diff --git a/kernel/compat.c b/kernel/compat.c
index c28a306ae05c..19971d8c7299 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -290,8 +290,8 @@ static inline long put_compat_itimerval(struct compat_itimerval __user *o,
290 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec))); 290 __put_user(i->it_value.tv_usec, &o->it_value.tv_usec)));
291} 291}
292 292
293asmlinkage long compat_sys_getitimer(int which, 293COMPAT_SYSCALL_DEFINE2(getitimer, int, which,
294 struct compat_itimerval __user *it) 294 struct compat_itimerval __user *, it)
295{ 295{
296 struct itimerval kit; 296 struct itimerval kit;
297 int error; 297 int error;
@@ -302,9 +302,9 @@ asmlinkage long compat_sys_getitimer(int which,
302 return error; 302 return error;
303} 303}
304 304
305asmlinkage long compat_sys_setitimer(int which, 305COMPAT_SYSCALL_DEFINE3(setitimer, int, which,
306 struct compat_itimerval __user *in, 306 struct compat_itimerval __user *, in,
307 struct compat_itimerval __user *out) 307 struct compat_itimerval __user *, out)
308{ 308{
309 struct itimerval kin, kout; 309 struct itimerval kin, kout;
310 int error; 310 int error;
@@ -381,9 +381,9 @@ static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
381 memcpy(blocked->sig, &set, sizeof(set)); 381 memcpy(blocked->sig, &set, sizeof(set));
382} 382}
383 383
384asmlinkage long compat_sys_sigprocmask(int how, 384COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
385 compat_old_sigset_t __user *nset, 385 compat_old_sigset_t __user *, nset,
386 compat_old_sigset_t __user *oset) 386 compat_old_sigset_t __user *, oset)
387{ 387{
388 old_sigset_t old_set, new_set; 388 old_sigset_t old_set, new_set;
389 sigset_t new_blocked; 389 sigset_t new_blocked;
@@ -535,9 +535,11 @@ asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
535 return 0; 535 return 0;
536} 536}
537 537
538asmlinkage long 538COMPAT_SYSCALL_DEFINE4(wait4,
539compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, 539 compat_pid_t, pid,
540 struct compat_rusage __user *ru) 540 compat_uint_t __user *, stat_addr,
541 int, options,
542 struct compat_rusage __user *, ru)
541{ 543{
542 if (!ru) { 544 if (!ru) {
543 return sys_wait4(pid, stat_addr, options, NULL); 545 return sys_wait4(pid, stat_addr, options, NULL);
@@ -564,9 +566,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
564 } 566 }
565} 567}
566 568
567asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, 569COMPAT_SYSCALL_DEFINE5(waitid,
568 struct compat_siginfo __user *uinfo, int options, 570 int, which, compat_pid_t, pid,
569 struct compat_rusage __user *uru) 571 struct compat_siginfo __user *, uinfo, int, options,
572 struct compat_rusage __user *, uru)
570{ 573{
571 siginfo_t info; 574 siginfo_t info;
572 struct rusage ru; 575 struct rusage ru;
@@ -584,9 +587,13 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
584 return ret; 587 return ret;
585 588
586 if (uru) { 589 if (uru) {
587 ret = put_compat_rusage(&ru, uru); 590 /* sys_waitid() overwrites everything in ru */
591 if (COMPAT_USE_64BIT_TIME)
592 ret = copy_to_user(uru, &ru, sizeof(ru));
593 else
594 ret = put_compat_rusage(&ru, uru);
588 if (ret) 595 if (ret)
589 return ret; 596 return -EFAULT;
590 } 597 }
591 598
592 BUG_ON(info.si_code & __SI_MASK); 599 BUG_ON(info.si_code & __SI_MASK);
@@ -964,7 +971,7 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
964} 971}
965 972
966void 973void
967sigset_from_compat (sigset_t *set, compat_sigset_t *compat) 974sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
968{ 975{
969 switch (_NSIG_WORDS) { 976 switch (_NSIG_WORDS) {
970 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); 977 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
@@ -975,10 +982,20 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
975} 982}
976EXPORT_SYMBOL_GPL(sigset_from_compat); 983EXPORT_SYMBOL_GPL(sigset_from_compat);
977 984
978asmlinkage long 985void
979compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, 986sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
980 struct compat_siginfo __user *uinfo, 987{
981 struct compat_timespec __user *uts, compat_size_t sigsetsize) 988 switch (_NSIG_WORDS) {
989 case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
990 case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
991 case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
992 case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
993 }
994}
995
996COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
997 struct compat_siginfo __user *, uinfo,
998 struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
982{ 999{
983 compat_sigset_t s32; 1000 compat_sigset_t s32;
984 sigset_t s; 1001 sigset_t s;
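sigset_to_compat(), added above, is the mirror image of the existing sigset_from_compat(): each 64-bit kernel signal word is split into a low and a high 32-bit compat word, and the switch falls through from the highest _NSIG_WORDS case down to 1. A stand-alone round trip of that packing for the common single-64-bit-word layout (fixed-width stand-ins for the kernel types; all names here are illustrative):

#include <assert.h>
#include <stdint.h>

#define NSIG_WORDS 1			/* e.g. x86-64: one 64-bit signal word */

struct ksigset { uint64_t sig[NSIG_WORDS]; };
struct csigset { uint32_t sig[2 * NSIG_WORDS]; };

static void to_compat(struct csigset *c, const struct ksigset *k)
{
	c->sig[1] = (uint32_t)(k->sig[0] >> 32);	/* high half second */
	c->sig[0] = (uint32_t)k->sig[0];		/* low half first */
}

static void from_compat(struct ksigset *k, const struct csigset *c)
{
	k->sig[0] = c->sig[0] | ((uint64_t)c->sig[1] << 32);
}

int main(void)
{
	struct ksigset k = { { (1ULL << 63) | 0xdeadbeefULL } };
	struct csigset c;
	struct ksigset back;

	to_compat(&c, &k);
	from_compat(&back, &c);
	assert(back.sig[0] == k.sig[0]);	/* the packing is lossless */
	return 0;
}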
@@ -994,7 +1011,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
994 sigset_from_compat(&s, &s32); 1011 sigset_from_compat(&s, &s32);
995 1012
996 if (uts) { 1013 if (uts) {
997 if (get_compat_timespec(&t, uts)) 1014 if (compat_get_timespec(&t, uts))
998 return -EFAULT; 1015 return -EFAULT;
999 } 1016 }
1000 1017
@@ -1006,18 +1023,6 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
1006 } 1023 }
1007 1024
1008 return ret; 1025 return ret;
1009
1010}
1011
1012asmlinkage long
1013compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
1014 struct compat_siginfo __user *uinfo)
1015{
1016 siginfo_t info;
1017
1018 if (copy_siginfo_from_user32(&info, uinfo))
1019 return -EFAULT;
1020 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
1021} 1026}
1022 1027
1023#ifdef __ARCH_WANT_COMPAT_SYS_TIME 1028#ifdef __ARCH_WANT_COMPAT_SYS_TIME
@@ -1060,23 +1065,6 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
1060 1065
1061#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ 1066#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
1062 1067
1063#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
1064asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
1065{
1066 sigset_t newset;
1067 compat_sigset_t newset32;
1068
1069 /* XXX: Don't preclude handling different sized sigset_t's. */
1070 if (sigsetsize != sizeof(sigset_t))
1071 return -EINVAL;
1072
1073 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
1074 return -EFAULT;
1075 sigset_from_compat(&newset, &newset32);
1076 return sigsuspend(&newset);
1077}
1078#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
1079
1080asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1068asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
1081{ 1069{
1082 struct timex txc; 1070 struct timex txc;
@@ -1215,6 +1203,22 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
1215 return 0; 1203 return 0;
1216} 1204}
1217 1205
1206COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
1207 compat_pid_t, pid,
1208 struct compat_timespec __user *, interval)
1209{
1210 struct timespec t;
1211 int ret;
1212 mm_segment_t old_fs = get_fs();
1213
1214 set_fs(KERNEL_DS);
1215 ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
1216 set_fs(old_fs);
1217 if (put_compat_timespec(&t, interval))
1218 return -EFAULT;
1219 return ret;
1220}
1221
1218/* 1222/*
1219 * Allocate user-space memory for the duration of a single system call, 1223 * Allocate user-space memory for the duration of a single system call,
1220 * in order to marshall parameters inside a compat thunk. 1224 * in order to marshall parameters inside a compat thunk.
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
new file mode 100644
index 000000000000..65349f07b878
--- /dev/null
+++ b/kernel/context_tracking.c
@@ -0,0 +1,145 @@
1/*
2 * Context tracking: Probe on high level context boundaries such as kernel
3 * and userspace. This includes syscalls and exceptions entry/exit.
4 *
5 * This is used by RCU to remove its dependency on the timer tick while a CPU
6 * runs in userspace.
7 *
8 * Started by Frederic Weisbecker:
9 *
10 * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
11 *
12 * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton,
13 * Steven Rostedt, Peter Zijlstra for suggestions and improvements.
14 *
15 */
16
17#include <linux/context_tracking.h>
18#include <linux/kvm_host.h>
19#include <linux/rcupdate.h>
20#include <linux/sched.h>
21#include <linux/hardirq.h>
22#include <linux/export.h>
23
24DEFINE_PER_CPU(struct context_tracking, context_tracking) = {
25#ifdef CONFIG_CONTEXT_TRACKING_FORCE
26 .active = true,
27#endif
28};
29
30/**
31 * user_enter - Inform the context tracking that the CPU is going to
32 * enter userspace mode.
33 *
34 * This function must be called right before we switch from the kernel
35 * to userspace, when it's guaranteed the remaining kernel instructions
36 * to execute won't use any RCU read side critical section because this
37 * function sets RCU in extended quiescent state.
38 */
39void user_enter(void)
40{
41 unsigned long flags;
42
43 /*
 44 * Some contexts may involve an exception occurring in an irq,
45 * leading to that nesting:
46 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
47 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
48 * helpers are enough to protect RCU uses inside the exception. So
49 * just return immediately if we detect we are in an IRQ.
50 */
51 if (in_interrupt())
52 return;
53
54 /* Kernel threads aren't supposed to go to userspace */
55 WARN_ON_ONCE(!current->mm);
56
57 local_irq_save(flags);
58 if (__this_cpu_read(context_tracking.active) &&
59 __this_cpu_read(context_tracking.state) != IN_USER) {
60 /*
61 * At this stage, only low level arch entry code remains and
62 * then we'll run in userspace. We can assume there won't be
63 * any RCU read-side critical section until the next call to
64 * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency
65 * on the tick.
66 */
67 vtime_user_enter(current);
68 rcu_user_enter();
69 __this_cpu_write(context_tracking.state, IN_USER);
70 }
71 local_irq_restore(flags);
72}
73
74
75/**
76 * user_exit - Inform the context tracking that the CPU is
77 * exiting userspace mode and entering the kernel.
78 *
79 * This function must be called after we entered the kernel from userspace
 80 * before any use of RCU read side critical section. This potentially includes
81 * any high level kernel code like syscalls, exceptions, signal handling, etc...
82 *
83 * This call supports re-entrancy. This way it can be called from any exception
84 * handler without needing to know if we came from userspace or not.
85 */
86void user_exit(void)
87{
88 unsigned long flags;
89
90 if (in_interrupt())
91 return;
92
93 local_irq_save(flags);
94 if (__this_cpu_read(context_tracking.state) == IN_USER) {
95 /*
96 * We are going to run code that may use RCU. Inform
97 * RCU core about that (ie: we may need the tick again).
98 */
99 rcu_user_exit();
100 vtime_user_exit(current);
101 __this_cpu_write(context_tracking.state, IN_KERNEL);
102 }
103 local_irq_restore(flags);
104}
105
106void guest_enter(void)
107{
108 if (vtime_accounting_enabled())
109 vtime_guest_enter(current);
110 else
111 __guest_enter();
112}
113EXPORT_SYMBOL_GPL(guest_enter);
114
115void guest_exit(void)
116{
117 if (vtime_accounting_enabled())
118 vtime_guest_exit(current);
119 else
120 __guest_exit();
121}
122EXPORT_SYMBOL_GPL(guest_exit);
123
124
125/**
126 * context_tracking_task_switch - context switch the syscall callbacks
127 * @prev: the task that is being switched out
128 * @next: the task that is being switched in
129 *
130 * The context tracking uses the syscall slow path to implement its user-kernel
131 * boundaries probes on syscalls. This way it doesn't impact the syscall fast
132 * path on CPUs that don't do context tracking.
133 *
134 * But we need to clear the flag on the previous task because it may later
135 * migrate to some CPU that doesn't do the context tracking. As such the TIF
136 * flag may not be desired there.
137 */
138void context_tracking_task_switch(struct task_struct *prev,
139 struct task_struct *next)
140{
141 if (__this_cpu_read(context_tracking.active)) {
142 clear_tsk_thread_flag(prev, TIF_NOHZ);
143 set_tsk_thread_flag(next, TIF_NOHZ);
144 }
145}
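The comments in this new file spell out the contract: user_exit() must run once the kernel is entered from userspace and before any RCU read-side use, user_enter() right before returning to userspace, and context_tracking_task_switch() shuffles TIF_NOHZ so only the syscall slow path pays for the probes. A purely schematic sketch of where an architecture's slow path would place the two calls (demo_syscall_slowpath and do_syscall are invented names; real architectures wire this up in their own entry code):

/* Illustrative only: not actual arch entry code. */
static void demo_syscall_slowpath(unsigned long nr)
{
	user_exit();		/* back in the kernel: RCU may be used again */

	do_syscall(nr);		/* hypothetical dispatch into kernel proper */

	user_enter();		/* about to return: RCU drops the tick dependency */
}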
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 42bd331ee0ab..b5e4ab2d427e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -224,11 +224,13 @@ void clear_tasks_mm_cpumask(int cpu)
224static inline void check_for_tasks(int cpu) 224static inline void check_for_tasks(int cpu)
225{ 225{
226 struct task_struct *p; 226 struct task_struct *p;
227 cputime_t utime, stime;
227 228
228 write_lock_irq(&tasklist_lock); 229 write_lock_irq(&tasklist_lock);
229 for_each_process(p) { 230 for_each_process(p) {
231 task_cputime(p, &utime, &stime);
230 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 232 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
231 (p->utime || p->stime)) 233 (utime || stime))
232 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 234 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
233 "(state = %ld, flags = %x)\n", 235 "(state = %ld, flags = %x)\n",
234 p->comm, task_pid_nr(p), cpu, 236 p->comm, task_pid_nr(p), cpu,
@@ -254,6 +256,8 @@ static int __ref take_cpu_down(void *_param)
254 return err; 256 return err;
255 257
256 cpu_notify(CPU_DYING | param->mod, param->hcpu); 258 cpu_notify(CPU_DYING | param->mod, param->hcpu);
259 /* Park the stopper thread */
260 kthread_park(current);
257 return 0; 261 return 0;
258} 262}
259 263
@@ -348,11 +352,13 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
348 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 352 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
349 struct task_struct *idle; 353 struct task_struct *idle;
350 354
351 if (cpu_online(cpu) || !cpu_present(cpu))
352 return -EINVAL;
353
354 cpu_hotplug_begin(); 355 cpu_hotplug_begin();
355 356
357 if (cpu_online(cpu) || !cpu_present(cpu)) {
358 ret = -EINVAL;
359 goto out;
360 }
361
356 idle = idle_thread_get(cpu); 362 idle = idle_thread_get(cpu);
357 if (IS_ERR(idle)) { 363 if (IS_ERR(idle)) {
358 ret = PTR_ERR(idle); 364 ret = PTR_ERR(idle);
@@ -601,6 +607,11 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
601 607
602static int __init cpu_hotplug_pm_sync_init(void) 608static int __init cpu_hotplug_pm_sync_init(void)
603{ 609{
610 /*
611 * cpu_hotplug_pm_callback has higher priority than x86
612 * bsp_pm_callback which depends on cpu_hotplug_pm_callback
613 * to disable cpu hotplug to avoid cpu hotplug race.
614 */
604 pm_notifier(cpu_hotplug_pm_callback, 0); 615 pm_notifier(cpu_hotplug_pm_callback, 0);
605 return 0; 616 return 0;
606} 617}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f33c7153b6d7..4f9dfe43ecbd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,14 +61,6 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62 62
63/* 63/*
64 * Workqueue for cpuset related tasks.
65 *
66 * Using kevent workqueue may cause deadlock when memory_migrate
67 * is set. So we create a separate workqueue thread for cpuset.
68 */
69static struct workqueue_struct *cpuset_wq;
70
71/*
72 * Tracks how many cpusets are currently defined in system. 64 * Tracks how many cpusets are currently defined in system.
73 * When there is only one cpuset (the root cpuset) we can 65 * When there is only one cpuset (the root cpuset) we can
74 * short circuit some hooks. 66 * short circuit some hooks.
@@ -95,18 +87,21 @@ struct cpuset {
95 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
96 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
97 89
98 struct cpuset *parent; /* my parent */
99
100 struct fmeter fmeter; /* memory_pressure filter */ 90 struct fmeter fmeter; /* memory_pressure filter */
101 91
92 /*
93 * Tasks are being attached to this cpuset. Used to prevent
94 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
95 */
96 int attach_in_progress;
97
102 /* partition number for rebuild_sched_domains() */ 98 /* partition number for rebuild_sched_domains() */
103 int pn; 99 int pn;
104 100
105 /* for custom sched domain */ 101 /* for custom sched domain */
106 int relax_domain_level; 102 int relax_domain_level;
107 103
108 /* used for walking a cpuset hierarchy */ 104 struct work_struct hotplug_work;
109 struct list_head stack_list;
110}; 105};
111 106
112/* Retrieve the cpuset for a cgroup */ 107/* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
123 struct cpuset, css); 118 struct cpuset, css);
124} 119}
125 120
121static inline struct cpuset *parent_cs(const struct cpuset *cs)
122{
123 struct cgroup *pcgrp = cs->css.cgroup->parent;
124
125 if (pcgrp)
126 return cgroup_cs(pcgrp);
127 return NULL;
128}
129
126#ifdef CONFIG_NUMA 130#ifdef CONFIG_NUMA
127static inline bool task_has_mempolicy(struct task_struct *task) 131static inline bool task_has_mempolicy(struct task_struct *task)
128{ 132{
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
138 142
139/* bits in struct cpuset flags field */ 143/* bits in struct cpuset flags field */
140typedef enum { 144typedef enum {
145 CS_ONLINE,
141 CS_CPU_EXCLUSIVE, 146 CS_CPU_EXCLUSIVE,
142 CS_MEM_EXCLUSIVE, 147 CS_MEM_EXCLUSIVE,
143 CS_MEM_HARDWALL, 148 CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@ typedef enum {
147 CS_SPREAD_SLAB, 152 CS_SPREAD_SLAB,
148} cpuset_flagbits_t; 153} cpuset_flagbits_t;
149 154
150/* the type of hotplug event */
151enum hotplug_event {
152 CPUSET_CPU_OFFLINE,
153 CPUSET_MEM_OFFLINE,
154};
155
156/* convenient tests for these bits */ 155/* convenient tests for these bits */
156static inline bool is_cpuset_online(const struct cpuset *cs)
157{
158 return test_bit(CS_ONLINE, &cs->flags);
159}
160
157static inline int is_cpu_exclusive(const struct cpuset *cs) 161static inline int is_cpu_exclusive(const struct cpuset *cs)
158{ 162{
159 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); 163 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)
190} 194}
191 195
192static struct cpuset top_cpuset = { 196static struct cpuset top_cpuset = {
193 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 197 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
198 (1 << CS_MEM_EXCLUSIVE)),
194}; 199};
195 200
201/**
202 * cpuset_for_each_child - traverse online children of a cpuset
203 * @child_cs: loop cursor pointing to the current child
204 * @pos_cgrp: used for iteration
205 * @parent_cs: target cpuset to walk children of
206 *
207 * Walk @child_cs through the online children of @parent_cs. Must be used
208 * with RCU read locked.
209 */
210#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \
211 cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \
212 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
213
214/**
215 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
216 * @des_cs: loop cursor pointing to the current descendant
217 * @pos_cgrp: used for iteration
 218 * @root_cs: target cpuset to walk descendants of
219 *
220 * Walk @des_cs through the online descendants of @root_cs. Must be used
221 * with RCU read locked. The caller may modify @pos_cgrp by calling
222 * cgroup_rightmost_descendant() to skip subtree.
223 */
224#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \
225 cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
226 if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
227
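Both macros above get their online-only behaviour from the same trick: the generic cgroup iterator is followed by an if () that simultaneously assigns the cpuset cursor and filters out offline entries, so the caller's loop body runs only for online children or descendants. The construction in stand-alone form, over a toy array instead of a cgroup hierarchy (all names illustrative):

#include <stdbool.h>
#include <stdio.h>

struct child {
	const char *name;
	bool online;
};

static struct child children[] = {
	{ "a", true }, { "b", false }, { "c", true },
};

#define NCHILDREN (sizeof(children) / sizeof(children[0]))

/* base iterator: visits every slot */
#define for_each_child(i) \
	for ((i) = 0; (i) < NCHILDREN; (i)++)

/*
 * Filtered iterator: the trailing if() assigns the cursor and skips
 * offline entries, just as cpuset_for_each_child() wraps
 * cgroup_for_each_child() above.
 */
#define for_each_online_child(c, i) \
	for_each_child(i) \
		if (((c) = &children[i])->online)

int main(void)
{
	struct child *c;
	unsigned int i;

	for_each_online_child(c, i)
		printf("%s\n", c->name);	/* prints "a" and "c" */
	return 0;
}

As with the kernel macros, the trailing if() means a bare else after the loop body would bind to the filter, so callers normally brace the body.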
196/* 228/*
197 * There are two global mutexes guarding cpuset structures. The first 229 * There are two global mutexes guarding cpuset structures - cpuset_mutex
198 * is the main control groups cgroup_mutex, accessed via 230 * and callback_mutex. The latter may nest inside the former. We also
199 * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific 231 * require taking task_lock() when dereferencing a task's cpuset pointer.
200 * callback_mutex, below. They can nest. It is ok to first take 232 * See "The task_lock() exception", at the end of this comment.
201 * cgroup_mutex, then nest callback_mutex. We also require taking 233 *
202 * task_lock() when dereferencing a task's cpuset pointer. See "The 234 * A task must hold both mutexes to modify cpusets. If a task holds
203 * task_lock() exception", at the end of this comment. 235 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
204 * 236 * is the only task able to also acquire callback_mutex and be able to
205 * A task must hold both mutexes to modify cpusets. If a task 237 * modify cpusets. It can perform various checks on the cpuset structure
206 * holds cgroup_mutex, then it blocks others wanting that mutex, 238 * first, knowing nothing will change. It can also allocate memory while
207 * ensuring that it is the only task able to also acquire callback_mutex 239 * just holding cpuset_mutex. While it is performing these checks, various
208 * and be able to modify cpusets. It can perform various checks on 240 * callback routines can briefly acquire callback_mutex to query cpusets.
209 * the cpuset structure first, knowing nothing will change. It can 241 * Once it is ready to make the changes, it takes callback_mutex, blocking
210 * also allocate memory while just holding cgroup_mutex. While it is 242 * everyone else.
211 * performing these checks, various callback routines can briefly
212 * acquire callback_mutex to query cpusets. Once it is ready to make
213 * the changes, it takes callback_mutex, blocking everyone else.
214 * 243 *
215 * Calls to the kernel memory allocator can not be made while holding 244 * Calls to the kernel memory allocator can not be made while holding
216 * callback_mutex, as that would risk double tripping on callback_mutex 245 * callback_mutex, as that would risk double tripping on callback_mutex
@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = {
232 * guidelines for accessing subsystem state in kernel/cgroup.c 261 * guidelines for accessing subsystem state in kernel/cgroup.c
233 */ 262 */
234 263
264static DEFINE_MUTEX(cpuset_mutex);
235static DEFINE_MUTEX(callback_mutex); 265static DEFINE_MUTEX(callback_mutex);
236 266
237/* 267/*
@@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
246static DEFINE_SPINLOCK(cpuset_buffer_lock); 276static DEFINE_SPINLOCK(cpuset_buffer_lock);
247 277
248/* 278/*
279 * CPU / memory hotplug is handled asynchronously.
280 */
281static struct workqueue_struct *cpuset_propagate_hotplug_wq;
282
283static void cpuset_hotplug_workfn(struct work_struct *work);
284static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
285static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
286
287static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
288
289/*
249 * This is ugly, but preserves the userspace API for existing cpuset 290 * This is ugly, but preserves the userspace API for existing cpuset
250 * users. If someone tries to mount the "cpuset" filesystem, we 291 * users. If someone tries to mount the "cpuset" filesystem, we
251 * silently switch it to mount "cgroup" instead 292 * silently switch it to mount "cgroup" instead
@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
289 struct cpumask *pmask) 330 struct cpumask *pmask)
290{ 331{
291 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 332 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
292 cs = cs->parent; 333 cs = parent_cs(cs);
293 if (cs) 334 if (cs)
294 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); 335 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
295 else 336 else
@@ -302,10 +343,10 @@ static void guarantee_online_cpus(const struct cpuset *cs,
302 * are online, with memory. If none are online with memory, walk 343 * are online, with memory. If none are online with memory, walk
303 * up the cpuset hierarchy until we find one that does have some 344 * up the cpuset hierarchy until we find one that does have some
304 * online mems. If we get all the way to the top and still haven't 345 * online mems. If we get all the way to the top and still haven't
305 * found any online mems, return node_states[N_HIGH_MEMORY]. 346 * found any online mems, return node_states[N_MEMORY].
306 * 347 *
307 * One way or another, we guarantee to return some non-empty subset 348 * One way or another, we guarantee to return some non-empty subset
308 * of node_states[N_HIGH_MEMORY]. 349 * of node_states[N_MEMORY].
309 * 350 *
310 * Call with callback_mutex held. 351 * Call with callback_mutex held.
311 */ 352 */
@@ -313,20 +354,20 @@ static void guarantee_online_cpus(const struct cpuset *cs,
313static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 354static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
314{ 355{
315 while (cs && !nodes_intersects(cs->mems_allowed, 356 while (cs && !nodes_intersects(cs->mems_allowed,
316 node_states[N_HIGH_MEMORY])) 357 node_states[N_MEMORY]))
317 cs = cs->parent; 358 cs = parent_cs(cs);
318 if (cs) 359 if (cs)
319 nodes_and(*pmask, cs->mems_allowed, 360 nodes_and(*pmask, cs->mems_allowed,
320 node_states[N_HIGH_MEMORY]); 361 node_states[N_MEMORY]);
321 else 362 else
322 *pmask = node_states[N_HIGH_MEMORY]; 363 *pmask = node_states[N_MEMORY];
323 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); 364 BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
324} 365}
325 366
326/* 367/*
327 * update task's spread flag if cpuset's page/slab spread flag is set 368 * update task's spread flag if cpuset's page/slab spread flag is set
328 * 369 *
329 * Called with callback_mutex/cgroup_mutex held 370 * Called with callback_mutex/cpuset_mutex held
330 */ 371 */
331static void cpuset_update_task_spread_flag(struct cpuset *cs, 372static void cpuset_update_task_spread_flag(struct cpuset *cs,
332 struct task_struct *tsk) 373 struct task_struct *tsk)
@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
346 * 387 *
347 * One cpuset is a subset of another if all its allowed CPUs and 388 * One cpuset is a subset of another if all its allowed CPUs and
348 * Memory Nodes are a subset of the other, and its exclusive flags 389 * Memory Nodes are a subset of the other, and its exclusive flags
349 * are only set if the other's are set. Call holding cgroup_mutex. 390 * are only set if the other's are set. Call holding cpuset_mutex.
350 */ 391 */
351 392
352static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 393static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial)
395 * If we replaced the flag and mask values of the current cpuset 436 * If we replaced the flag and mask values of the current cpuset
396 * (cur) with those values in the trial cpuset (trial), would 437 * (cur) with those values in the trial cpuset (trial), would
397 * our various subset and exclusive rules still be valid? Presumes 438 * our various subset and exclusive rules still be valid? Presumes
398 * cgroup_mutex held. 439 * cpuset_mutex held.
399 * 440 *
400 * 'cur' is the address of an actual, in-use cpuset. Operations 441 * 'cur' is the address of an actual, in-use cpuset. Operations
401 * such as list traversal that depend on the actual address of the 442 * such as list traversal that depend on the actual address of the
@@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
412{ 453{
413 struct cgroup *cont; 454 struct cgroup *cont;
414 struct cpuset *c, *par; 455 struct cpuset *c, *par;
456 int ret;
457
458 rcu_read_lock();
415 459
416 /* Each of our child cpusets must be a subset of us */ 460 /* Each of our child cpusets must be a subset of us */
417 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { 461 ret = -EBUSY;
418 if (!is_cpuset_subset(cgroup_cs(cont), trial)) 462 cpuset_for_each_child(c, cont, cur)
419 return -EBUSY; 463 if (!is_cpuset_subset(c, trial))
420 } 464 goto out;
421 465
422 /* Remaining checks don't apply to root cpuset */ 466 /* Remaining checks don't apply to root cpuset */
467 ret = 0;
423 if (cur == &top_cpuset) 468 if (cur == &top_cpuset)
424 return 0; 469 goto out;
425 470
426 par = cur->parent; 471 par = parent_cs(cur);
427 472
428 /* We must be a subset of our parent cpuset */ 473 /* We must be a subset of our parent cpuset */
474 ret = -EACCES;
429 if (!is_cpuset_subset(trial, par)) 475 if (!is_cpuset_subset(trial, par))
430 return -EACCES; 476 goto out;
431 477
432 /* 478 /*
433 * If either I or some sibling (!= me) is exclusive, we can't 479 * If either I or some sibling (!= me) is exclusive, we can't
434 * overlap 480 * overlap
435 */ 481 */
436 list_for_each_entry(cont, &par->css.cgroup->children, sibling) { 482 ret = -EINVAL;
437 c = cgroup_cs(cont); 483 cpuset_for_each_child(c, cont, par) {
438 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 484 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
439 c != cur && 485 c != cur &&
440 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 486 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
441 return -EINVAL; 487 goto out;
442 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 488 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
443 c != cur && 489 c != cur &&
444 nodes_intersects(trial->mems_allowed, c->mems_allowed)) 490 nodes_intersects(trial->mems_allowed, c->mems_allowed))
445 return -EINVAL; 491 goto out;
446 } 492 }
447 493
448 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ 494 /*
449 if (cgroup_task_count(cur->css.cgroup)) { 495 * Cpusets with tasks - existing or newly being attached - can't
450 if (cpumask_empty(trial->cpus_allowed) || 496 * have empty cpus_allowed or mems_allowed.
451 nodes_empty(trial->mems_allowed)) { 497 */
452 return -ENOSPC; 498 ret = -ENOSPC;
453 } 499 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
454 } 500 (cpumask_empty(trial->cpus_allowed) ||
501 nodes_empty(trial->mems_allowed)))
502 goto out;
455 503
456 return 0; 504 ret = 0;
505out:
506 rcu_read_unlock();
507 return ret;
457} 508}
458 509
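
To put validate_change() in context: the mask update paths build a trial copy of the cpuset, validate it against the hierarchy rules checked above, and only then publish the result under callback_mutex. A simplified sketch that omits the task and sched-domain updates done by the real paths (example_update_cpus() is illustrative only):

	static int example_update_cpus(struct cpuset *cs, const struct cpumask *new)
	{
		struct cpuset *trialcs;
		int err;

		lockdep_assert_held(&cpuset_mutex);

		trialcs = alloc_trial_cpuset(cs);
		if (!trialcs)
			return -ENOMEM;

		cpumask_copy(trialcs->cpus_allowed, new);
		err = validate_change(cs, trialcs);
		if (!err) {
			mutex_lock(&callback_mutex);
			cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
			mutex_unlock(&callback_mutex);
		}
		free_trial_cpuset(trialcs);
		return err;
	}
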
459#ifdef CONFIG_SMP 510#ifdef CONFIG_SMP
@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
474 return; 525 return;
475} 526}
476 527
477static void 528static void update_domain_attr_tree(struct sched_domain_attr *dattr,
478update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) 529 struct cpuset *root_cs)
479{ 530{
480 LIST_HEAD(q); 531 struct cpuset *cp;
481 532 struct cgroup *pos_cgrp;
482 list_add(&c->stack_list, &q);
483 while (!list_empty(&q)) {
484 struct cpuset *cp;
485 struct cgroup *cont;
486 struct cpuset *child;
487
488 cp = list_first_entry(&q, struct cpuset, stack_list);
489 list_del(q.next);
490 533
491 if (cpumask_empty(cp->cpus_allowed)) 534 rcu_read_lock();
535 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
536 /* skip the whole subtree if @cp doesn't have any CPU */
537 if (cpumask_empty(cp->cpus_allowed)) {
538 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
492 continue; 539 continue;
540 }
493 541
494 if (is_sched_load_balance(cp)) 542 if (is_sched_load_balance(cp))
495 update_domain_attr(dattr, cp); 543 update_domain_attr(dattr, cp);
496
497 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
498 child = cgroup_cs(cont);
499 list_add_tail(&child->stack_list, &q);
500 }
501 } 544 }
545 rcu_read_unlock();
502} 546}
503 547
504/* 548/*
@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
520 * domains when operating in the severe memory shortage situations 564 * domains when operating in the severe memory shortage situations
521 * that could cause allocation failures below. 565 * that could cause allocation failures below.
522 * 566 *
523 * Must be called with cgroup_lock held. 567 * Must be called with cpuset_mutex held.
524 * 568 *
525 * The three key local variables below are: 569 * The three key local variables below are:
526 * q - a linked-list queue of cpuset pointers, used to implement a 570 * q - a linked-list queue of cpuset pointers, used to implement a
@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
558static int generate_sched_domains(cpumask_var_t **domains, 602static int generate_sched_domains(cpumask_var_t **domains,
559 struct sched_domain_attr **attributes) 603 struct sched_domain_attr **attributes)
560{ 604{
561 LIST_HEAD(q); /* queue of cpusets to be scanned */
562 struct cpuset *cp; /* scans q */ 605 struct cpuset *cp; /* scans q */
563 struct cpuset **csa; /* array of all cpuset ptrs */ 606 struct cpuset **csa; /* array of all cpuset ptrs */
564 int csn; /* how many cpuset ptrs in csa so far */ 607 int csn; /* how many cpuset ptrs in csa so far */
@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
567 struct sched_domain_attr *dattr; /* attributes for custom domains */ 610 struct sched_domain_attr *dattr; /* attributes for custom domains */
568 int ndoms = 0; /* number of sched domains in result */ 611 int ndoms = 0; /* number of sched domains in result */
569 int nslot; /* next empty doms[] struct cpumask slot */ 612 int nslot; /* next empty doms[] struct cpumask slot */
613 struct cgroup *pos_cgrp;
570 614
571 doms = NULL; 615 doms = NULL;
572 dattr = NULL; 616 dattr = NULL;
@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
594 goto done; 638 goto done;
595 csn = 0; 639 csn = 0;
596 640
597 list_add(&top_cpuset.stack_list, &q); 641 rcu_read_lock();
598 while (!list_empty(&q)) { 642 cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
599 struct cgroup *cont;
600 struct cpuset *child; /* scans child cpusets of cp */
601
602 cp = list_first_entry(&q, struct cpuset, stack_list);
603 list_del(q.next);
604
605 if (cpumask_empty(cp->cpus_allowed))
606 continue;
607
608 /* 643 /*
609 * All child cpusets contain a subset of the parent's cpus, so 644 * Continue traversing beyond @cp iff @cp has some CPUs and
610 * just skip them, and then we call update_domain_attr_tree() 645 * isn't load balancing. The former is obvious. The
611 * to calc relax_domain_level of the corresponding sched 646 * latter: All child cpusets contain a subset of the
612 * domain. 647 * parent's cpus, so just skip them, and then we call
648 * update_domain_attr_tree() to calc relax_domain_level of
649 * the corresponding sched domain.
613 */ 650 */
614 if (is_sched_load_balance(cp)) { 651 if (!cpumask_empty(cp->cpus_allowed) &&
615 csa[csn++] = cp; 652 !is_sched_load_balance(cp))
616 continue; 653 continue;
617 }
618 654
619 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 655 if (is_sched_load_balance(cp))
620 child = cgroup_cs(cont); 656 csa[csn++] = cp;
621 list_add_tail(&child->stack_list, &q); 657
622 } 658 /* skip @cp's subtree */
623 } 659 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
660 }
661 rcu_read_unlock();
624 662
625 for (i = 0; i < csn; i++) 663 for (i = 0; i < csn; i++)
626 csa[i]->pn = i; 664 csa[i]->pn = i;
@@ -725,25 +763,25 @@ done:
725/* 763/*
726 * Rebuild scheduler domains. 764 * Rebuild scheduler domains.
727 * 765 *
728 * Call with neither cgroup_mutex held nor within get_online_cpus(). 766 * If the flag 'sched_load_balance' of any cpuset with non-empty
729 * Takes both cgroup_mutex and get_online_cpus(). 767 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
768 * which has that flag enabled, or if any cpuset with a non-empty
769 * 'cpus' is removed, then call this routine to rebuild the
770 * scheduler's dynamic sched domains.
730 * 771 *
731 * Cannot be directly called from cpuset code handling changes 772 * Call with cpuset_mutex held. Takes get_online_cpus().
732 * to the cpuset pseudo-filesystem, because it cannot be called
733 * from code that already holds cgroup_mutex.
734 */ 773 */
735static void do_rebuild_sched_domains(struct work_struct *unused) 774static void rebuild_sched_domains_locked(void)
736{ 775{
737 struct sched_domain_attr *attr; 776 struct sched_domain_attr *attr;
738 cpumask_var_t *doms; 777 cpumask_var_t *doms;
739 int ndoms; 778 int ndoms;
740 779
780 lockdep_assert_held(&cpuset_mutex);
741 get_online_cpus(); 781 get_online_cpus();
742 782
743 /* Generate domain masks and attrs */ 783 /* Generate domain masks and attrs */
744 cgroup_lock();
745 ndoms = generate_sched_domains(&doms, &attr); 784 ndoms = generate_sched_domains(&doms, &attr);
746 cgroup_unlock();
747 785
748 /* Have scheduler rebuild the domains */ 786 /* Have scheduler rebuild the domains */
749 partition_sched_domains(ndoms, doms, attr); 787 partition_sched_domains(ndoms, doms, attr);
@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
751 put_online_cpus(); 789 put_online_cpus();
752} 790}
753#else /* !CONFIG_SMP */ 791#else /* !CONFIG_SMP */
754static void do_rebuild_sched_domains(struct work_struct *unused) 792static void rebuild_sched_domains_locked(void)
755{ 793{
756} 794}
757 795
@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains,
763} 801}
764#endif /* CONFIG_SMP */ 802#endif /* CONFIG_SMP */
765 803
766static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
767
768/*
769 * Rebuild scheduler domains, asynchronously via workqueue.
770 *
771 * If the flag 'sched_load_balance' of any cpuset with non-empty
772 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
773 * which has that flag enabled, or if any cpuset with a non-empty
774 * 'cpus' is removed, then call this routine to rebuild the
775 * scheduler's dynamic sched domains.
776 *
777 * The rebuild_sched_domains() and partition_sched_domains()
778 * routines must nest cgroup_lock() inside get_online_cpus(),
779 * but such cpuset changes as these must nest that locking the
780 * other way, holding cgroup_lock() for much of the code.
781 *
782 * So in order to avoid an ABBA deadlock, the cpuset code handling
783 * these user changes delegates the actual sched domain rebuilding
784 * to a separate workqueue thread, which ends up processing the
785 * above do_rebuild_sched_domains() function.
786 */
787static void async_rebuild_sched_domains(void)
788{
789 queue_work(cpuset_wq, &rebuild_sched_domains_work);
790}
791
792/*
793 * Accomplishes the same scheduler domain rebuild as the above
794 * async_rebuild_sched_domains(), however it directly calls the
795 * rebuild routine synchronously rather than calling it via an
796 * asynchronous work thread.
797 *
798 * This can only be called from code that is not holding
799 * cgroup_mutex (not nested in a cgroup_lock() call.)
800 */
801void rebuild_sched_domains(void) 804void rebuild_sched_domains(void)
802{ 805{
803 do_rebuild_sched_domains(NULL); 806 mutex_lock(&cpuset_mutex);
807 rebuild_sched_domains_locked();
808 mutex_unlock(&cpuset_mutex);
804} 809}
805 810
806/** 811/**
@@ -808,7 +813,7 @@ void rebuild_sched_domains(void)
808 * @tsk: task to test 813 * @tsk: task to test
809 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner 814 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
810 * 815 *
811 * Call with cgroup_mutex held. May take callback_mutex during call. 816 * Call with cpuset_mutex held. May take callback_mutex during call.
812 * Called for each task in a cgroup by cgroup_scan_tasks(). 817 * Called for each task in a cgroup by cgroup_scan_tasks().
813 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 818 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
814 * words, if its mask is not equal to its cpuset's mask). 819 * words, if its mask is not equal to its cpuset's mask).
@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
829 * cpus_allowed mask needs to be changed. 834 * cpus_allowed mask needs to be changed.
830 * 835 *
831 * We don't need to re-check for the cgroup/cpuset membership, since we're 836 * We don't need to re-check for the cgroup/cpuset membership, since we're
832 * holding cgroup_lock() at this point. 837 * holding cpuset_mutex at this point.
833 */ 838 */
834static void cpuset_change_cpumask(struct task_struct *tsk, 839static void cpuset_change_cpumask(struct task_struct *tsk,
835 struct cgroup_scanner *scan) 840 struct cgroup_scanner *scan)
@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
842 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 847 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
843 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 848 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
844 * 849 *
845 * Called with cgroup_mutex held 850 * Called with cpuset_mutex held
846 * 851 *
847 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 852 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
848 * calling callback functions for each. 853 * calling callback functions for each.
@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 heap_free(&heap); 925 heap_free(&heap);
921 926
922 if (is_load_balanced) 927 if (is_load_balanced)
923 async_rebuild_sched_domains(); 928 rebuild_sched_domains_locked();
924 return 0; 929 return 0;
925} 930}
926 931
@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
932 * Temporarilly set tasks mems_allowed to target nodes of migration, 937 * Temporarilly set tasks mems_allowed to target nodes of migration,
933 * so that the migration code can allocate pages on these nodes. 938 * so that the migration code can allocate pages on these nodes.
934 * 939 *
935 * Call holding cgroup_mutex, so current's cpuset won't change 940 * Call holding cpuset_mutex, so current's cpuset won't change
936 * during this call, as manage_mutex holds off any cpuset_attach() 941 * during this call, as manage_mutex holds off any cpuset_attach()
937 * calls. Therefore we don't need to take task_lock around the 942 * calls. Therefore we don't need to take task_lock around the
938 * call to guarantee_online_mems(), as we know no one is changing 943 * call to guarantee_online_mems(), as we know no one is changing
@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1007/* 1012/*
1008 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1013 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1009 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1014 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1010 * memory_migrate flag is set. Called with cgroup_mutex held. 1015 * memory_migrate flag is set. Called with cpuset_mutex held.
1011 */ 1016 */
1012static void cpuset_change_nodemask(struct task_struct *p, 1017static void cpuset_change_nodemask(struct task_struct *p,
1013 struct cgroup_scanner *scan) 1018 struct cgroup_scanner *scan)
@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1016 struct cpuset *cs; 1021 struct cpuset *cs;
1017 int migrate; 1022 int migrate;
1018 const nodemask_t *oldmem = scan->data; 1023 const nodemask_t *oldmem = scan->data;
1019 static nodemask_t newmems; /* protected by cgroup_mutex */ 1024 static nodemask_t newmems; /* protected by cpuset_mutex */
1020 1025
1021 cs = cgroup_cs(scan->cg); 1026 cs = cgroup_cs(scan->cg);
1022 guarantee_online_mems(cs, &newmems); 1027 guarantee_online_mems(cs, &newmems);
@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound;
1043 * @oldmem: old mems_allowed of cpuset cs 1048 * @oldmem: old mems_allowed of cpuset cs
1044 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1049 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1045 * 1050 *
1046 * Called with cgroup_mutex held 1051 * Called with cpuset_mutex held
1047 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1052 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1048 * if @heap != NULL. 1053 * if @heap != NULL.
1049 */ 1054 */
@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1065 * take while holding tasklist_lock. Forks can happen - the 1070 * take while holding tasklist_lock. Forks can happen - the
1066 * mpol_dup() cpuset_being_rebound check will catch such forks, 1071 * mpol_dup() cpuset_being_rebound check will catch such forks,
1067 * and rebind their vma mempolicies too. Because we still hold 1072 * and rebind their vma mempolicies too. Because we still hold
1068 * the global cgroup_mutex, we know that no other rebind effort 1073 * the global cpuset_mutex, we know that no other rebind effort
1069 * will be contending for the global variable cpuset_being_rebound. 1074 * will be contending for the global variable cpuset_being_rebound.
1070 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1075 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1071 * is idempotent. Also migrate pages in each mm to new nodes. 1076 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1084 * mempolicies and if the cpuset is marked 'memory_migrate', 1089 * mempolicies and if the cpuset is marked 'memory_migrate',
1085 * migrate the tasks pages to the new memory. 1090 * migrate the tasks pages to the new memory.
1086 * 1091 *
1087 * Call with cgroup_mutex held. May take callback_mutex during call. 1092 * Call with cpuset_mutex held. May take callback_mutex during call.
1088 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1093 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1089 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1094 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1090 * their mempolicies to the cpusets new mems_allowed. 1095 * their mempolicies to the cpusets new mems_allowed.
@@ -1100,7 +1105,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1100 return -ENOMEM; 1105 return -ENOMEM;
1101 1106
1102 /* 1107 /*
1103 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1108 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1104 * it's read-only 1109 * it's read-only
1105 */ 1110 */
1106 if (cs == &top_cpuset) { 1111 if (cs == &top_cpuset) {
@@ -1122,7 +1127,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1122 goto done; 1127 goto done;
1123 1128
1124 if (!nodes_subset(trialcs->mems_allowed, 1129 if (!nodes_subset(trialcs->mems_allowed,
1125 node_states[N_HIGH_MEMORY])) { 1130 node_states[N_MEMORY])) {
1126 retval = -EINVAL; 1131 retval = -EINVAL;
1127 goto done; 1132 goto done;
1128 } 1133 }
@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1168 cs->relax_domain_level = val; 1173 cs->relax_domain_level = val;
1169 if (!cpumask_empty(cs->cpus_allowed) && 1174 if (!cpumask_empty(cs->cpus_allowed) &&
1170 is_sched_load_balance(cs)) 1175 is_sched_load_balance(cs))
1171 async_rebuild_sched_domains(); 1176 rebuild_sched_domains_locked();
1172 } 1177 }
1173 1178
1174 return 0; 1179 return 0;
@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1182 * Called by cgroup_scan_tasks() for each task in a cgroup. 1187 * Called by cgroup_scan_tasks() for each task in a cgroup.
1183 * 1188 *
1184 * We don't need to re-check for the cgroup/cpuset membership, since we're 1189 * We don't need to re-check for the cgroup/cpuset membership, since we're
1185 * holding cgroup_lock() at this point. 1190 * holding cpuset_mutex at this point.
1186 */ 1191 */
1187static void cpuset_change_flag(struct task_struct *tsk, 1192static void cpuset_change_flag(struct task_struct *tsk,
1188 struct cgroup_scanner *scan) 1193 struct cgroup_scanner *scan)
@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
1195 * @cs: the cpuset in which each task's spread flags needs to be changed 1200 * @cs: the cpuset in which each task's spread flags needs to be changed
1196 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1201 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1197 * 1202 *
1198 * Called with cgroup_mutex held 1203 * Called with cpuset_mutex held
1199 * 1204 *
1200 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1205 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1201 * calling callback functions for each. 1206 * calling callback functions for each.
@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1220 * cs: the cpuset to update 1225 * cs: the cpuset to update
1221 * turning_on: whether the flag is being set or cleared 1226 * turning_on: whether the flag is being set or cleared
1222 * 1227 *
1223 * Call with cgroup_mutex held. 1228 * Call with cpuset_mutex held.
1224 */ 1229 */
1225 1230
1226static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1231static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1260 mutex_unlock(&callback_mutex); 1265 mutex_unlock(&callback_mutex);
1261 1266
1262 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1267 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1263 async_rebuild_sched_domains(); 1268 rebuild_sched_domains_locked();
1264 1269
1265 if (spread_flag_changed) 1270 if (spread_flag_changed)
1266 update_tasks_flags(cs, &heap); 1271 update_tasks_flags(cs, &heap);
@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp)
1368 return val; 1373 return val;
1369} 1374}
1370 1375
1371/* 1376/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1372 * Protected by cgroup_lock. The nodemasks must be stored globally because
1373 * dynamically allocating them is not allowed in can_attach, and they must
1374 * persist until attach.
1375 */
1376static cpumask_var_t cpus_attach;
1377static nodemask_t cpuset_attach_nodemask_from;
1378static nodemask_t cpuset_attach_nodemask_to;
1379
1380/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1381static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1377static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1382{ 1378{
1383 struct cpuset *cs = cgroup_cs(cgrp); 1379 struct cpuset *cs = cgroup_cs(cgrp);
1384 struct task_struct *task; 1380 struct task_struct *task;
1385 int ret; 1381 int ret;
1386 1382
1383 mutex_lock(&cpuset_mutex);
1384
1385 ret = -ENOSPC;
1387 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1386 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1388 return -ENOSPC; 1387 goto out_unlock;
1389 1388
1390 cgroup_taskset_for_each(task, cgrp, tset) { 1389 cgroup_taskset_for_each(task, cgrp, tset) {
1391 /* 1390 /*
@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1397 * set_cpus_allowed_ptr() on all attached tasks before 1396 * set_cpus_allowed_ptr() on all attached tasks before
1398 * cpus_allowed may be changed. 1397 * cpus_allowed may be changed.
1399 */ 1398 */
1399 ret = -EINVAL;
1400 if (task->flags & PF_THREAD_BOUND) 1400 if (task->flags & PF_THREAD_BOUND)
1401 return -EINVAL; 1401 goto out_unlock;
1402 if ((ret = security_task_setscheduler(task))) 1402 ret = security_task_setscheduler(task);
1403 return ret; 1403 if (ret)
1404 goto out_unlock;
1404 } 1405 }
1405 1406
1406 /* prepare for attach */ 1407 /*
1407 if (cs == &top_cpuset) 1408 * Mark attach is in progress. This makes validate_change() fail
1408 cpumask_copy(cpus_attach, cpu_possible_mask); 1409 * changes which zero cpus/mems_allowed.
1409 else 1410 */
1410 guarantee_online_cpus(cs, cpus_attach); 1411 cs->attach_in_progress++;
1411 1412 ret = 0;
1412 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1413out_unlock:
1414 mutex_unlock(&cpuset_mutex);
1415 return ret;
1416}
1413 1417
1414 return 0; 1418static void cpuset_cancel_attach(struct cgroup *cgrp,
1419 struct cgroup_taskset *tset)
1420{
1421 mutex_lock(&cpuset_mutex);
1422 cgroup_cs(cgrp)->attach_in_progress--;
1423 mutex_unlock(&cpuset_mutex);
1415} 1424}
1416 1425
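
The attach_in_progress counter introduced above follows the usual three-step cgroup attach protocol: ->can_attach() increments it, ->cancel_attach() decrements it when an attach is aborted, and ->attach() decrements it once the tasks have actually been moved. While the counter is non-zero, validate_change() refuses to empty cpus_allowed or mems_allowed, just as if the cpuset already had tasks. A hypothetical helper makes the combined condition explicit (not part of this patch):

	static bool cpuset_masks_may_be_emptied(struct cpuset *cs)
	{
		lockdep_assert_held(&cpuset_mutex);
		return !cgroup_task_count(cs->css.cgroup) &&
		       !cs->attach_in_progress;
	}
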
1426/*
1427 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
1428 * but we can't allocate it dynamically there. Define it global and
1429 * allocate from cpuset_init().
1430 */
1431static cpumask_var_t cpus_attach;
1432
1417static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1433static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1418{ 1434{
1435 /* static bufs protected by cpuset_mutex */
1436 static nodemask_t cpuset_attach_nodemask_from;
1437 static nodemask_t cpuset_attach_nodemask_to;
1419 struct mm_struct *mm; 1438 struct mm_struct *mm;
1420 struct task_struct *task; 1439 struct task_struct *task;
1421 struct task_struct *leader = cgroup_taskset_first(tset); 1440 struct task_struct *leader = cgroup_taskset_first(tset);
@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1423 struct cpuset *cs = cgroup_cs(cgrp); 1442 struct cpuset *cs = cgroup_cs(cgrp);
1424 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1443 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1425 1444
1445 mutex_lock(&cpuset_mutex);
1446
1447 /* prepare for attach */
1448 if (cs == &top_cpuset)
1449 cpumask_copy(cpus_attach, cpu_possible_mask);
1450 else
1451 guarantee_online_cpus(cs, cpus_attach);
1452
1453 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1454
1426 cgroup_taskset_for_each(task, cgrp, tset) { 1455 cgroup_taskset_for_each(task, cgrp, tset) {
1427 /* 1456 /*
1428 * can_attach beforehand should guarantee that this doesn't 1457 * can_attach beforehand should guarantee that this doesn't
@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1448 &cpuset_attach_nodemask_to); 1477 &cpuset_attach_nodemask_to);
1449 mmput(mm); 1478 mmput(mm);
1450 } 1479 }
1480
1481 cs->attach_in_progress--;
1482
1483 /*
1484 * We may have raced with CPU/memory hotunplug. Trigger hotplug
1485 * propagation if @cs doesn't have any CPU or memory. It will move
1486 * the newly added tasks to the nearest parent which can execute.
1487 */
1488 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1489 schedule_cpuset_propagate_hotplug(cs);
1490
1491 mutex_unlock(&cpuset_mutex);
1451} 1492}
1452 1493
1453/* The various types of files and directories in a cpuset file system */ 1494/* The various types of files and directories in a cpuset file system */
@@ -1469,12 +1510,13 @@ typedef enum {
1469 1510
1470static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1511static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1471{ 1512{
1472 int retval = 0;
1473 struct cpuset *cs = cgroup_cs(cgrp); 1513 struct cpuset *cs = cgroup_cs(cgrp);
1474 cpuset_filetype_t type = cft->private; 1514 cpuset_filetype_t type = cft->private;
1515 int retval = -ENODEV;
1475 1516
1476 if (!cgroup_lock_live_group(cgrp)) 1517 mutex_lock(&cpuset_mutex);
1477 return -ENODEV; 1518 if (!is_cpuset_online(cs))
1519 goto out_unlock;
1478 1520
1479 switch (type) { 1521 switch (type) {
1480 case FILE_CPU_EXCLUSIVE: 1522 case FILE_CPU_EXCLUSIVE:
@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1508 retval = -EINVAL; 1550 retval = -EINVAL;
1509 break; 1551 break;
1510 } 1552 }
1511 cgroup_unlock(); 1553out_unlock:
1554 mutex_unlock(&cpuset_mutex);
1512 return retval; 1555 return retval;
1513} 1556}
1514 1557
1515static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1558static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1516{ 1559{
1517 int retval = 0;
1518 struct cpuset *cs = cgroup_cs(cgrp); 1560 struct cpuset *cs = cgroup_cs(cgrp);
1519 cpuset_filetype_t type = cft->private; 1561 cpuset_filetype_t type = cft->private;
1562 int retval = -ENODEV;
1520 1563
1521 if (!cgroup_lock_live_group(cgrp)) 1564 mutex_lock(&cpuset_mutex);
1522 return -ENODEV; 1565 if (!is_cpuset_online(cs))
1566 goto out_unlock;
1523 1567
1524 switch (type) { 1568 switch (type) {
1525 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1569 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1529 retval = -EINVAL; 1573 retval = -EINVAL;
1530 break; 1574 break;
1531 } 1575 }
1532 cgroup_unlock(); 1576out_unlock:
1577 mutex_unlock(&cpuset_mutex);
1533 return retval; 1578 return retval;
1534} 1579}
1535 1580
@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1539static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1584static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1540 const char *buf) 1585 const char *buf)
1541{ 1586{
1542 int retval = 0;
1543 struct cpuset *cs = cgroup_cs(cgrp); 1587 struct cpuset *cs = cgroup_cs(cgrp);
1544 struct cpuset *trialcs; 1588 struct cpuset *trialcs;
1589 int retval = -ENODEV;
1545 1590
1546 if (!cgroup_lock_live_group(cgrp)) 1591 /*
1547 return -ENODEV; 1592 * CPU or memory hotunplug may leave @cs w/o any execution
1593 * resources, in which case the hotplug code asynchronously updates
1594 * configuration and transfers all tasks to the nearest ancestor
1595 * which can execute.
1596 *
1597 * As writes to "cpus" or "mems" may restore @cs's execution
1598 * resources, wait for the previously scheduled operations before
1599 * proceeding, so that we don't end up repeatedly removing tasks added
1600 * after execution capability is restored.
1601 *
1602 * Flushing cpuset_hotplug_work is enough to synchronize against
1603 * hotplug handling; however, cpuset_attach() may schedule
1604 * propagation work directly. Flush the workqueue too.
1605 */
1606 flush_work(&cpuset_hotplug_work);
1607 flush_workqueue(cpuset_propagate_hotplug_wq);
1608
1609 mutex_lock(&cpuset_mutex);
1610 if (!is_cpuset_online(cs))
1611 goto out_unlock;
1548 1612
1549 trialcs = alloc_trial_cpuset(cs); 1613 trialcs = alloc_trial_cpuset(cs);
1550 if (!trialcs) { 1614 if (!trialcs) {
1551 retval = -ENOMEM; 1615 retval = -ENOMEM;
1552 goto out; 1616 goto out_unlock;
1553 } 1617 }
1554 1618
1555 switch (cft->private) { 1619 switch (cft->private) {
@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1565 } 1629 }
1566 1630
1567 free_trial_cpuset(trialcs); 1631 free_trial_cpuset(trialcs);
1568out: 1632out_unlock:
1569 cgroup_unlock(); 1633 mutex_unlock(&cpuset_mutex);
1570 return retval; 1634 return retval;
1571} 1635}
1572 1636
@@ -1784,57 +1848,18 @@ static struct cftype files[] = {
1784}; 1848};
1785 1849
1786/* 1850/*
1787 * post_clone() is called during cgroup_create() when the 1851 * cpuset_css_alloc - allocate a cpuset css
1788 * clone_children mount argument was specified. The cgroup
1789 * can not yet have any tasks.
1790 *
1791 * Currently we refuse to set up the cgroup - thereby
1792 * refusing the task to be entered, and as a result refusing
1793 * the sys_unshare() or clone() which initiated it - if any
1794 * sibling cpusets have exclusive cpus or mem.
1795 *
1796 * If this becomes a problem for some users who wish to
1797 * allow that scenario, then cpuset_post_clone() could be
1798 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1799 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1800 * held.
1801 */
1802static void cpuset_post_clone(struct cgroup *cgroup)
1803{
1804 struct cgroup *parent, *child;
1805 struct cpuset *cs, *parent_cs;
1806
1807 parent = cgroup->parent;
1808 list_for_each_entry(child, &parent->children, sibling) {
1809 cs = cgroup_cs(child);
1810 if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
1811 return;
1812 }
1813 cs = cgroup_cs(cgroup);
1814 parent_cs = cgroup_cs(parent);
1815
1816 mutex_lock(&callback_mutex);
1817 cs->mems_allowed = parent_cs->mems_allowed;
1818 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1819 mutex_unlock(&callback_mutex);
1820 return;
1821}
1822
1823/*
1824 * cpuset_create - create a cpuset
1825 * cont: control group that the new cpuset will be part of 1852 * cont: control group that the new cpuset will be part of
1826 */ 1853 */
1827 1854
1828static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) 1855static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1829{ 1856{
1830 struct cpuset *cs; 1857 struct cpuset *cs;
1831 struct cpuset *parent;
1832 1858
1833 if (!cont->parent) { 1859 if (!cont->parent)
1834 return &top_cpuset.css; 1860 return &top_cpuset.css;
1835 } 1861
1836 parent = cgroup_cs(cont->parent); 1862 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1837 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1838 if (!cs) 1863 if (!cs)
1839 return ERR_PTR(-ENOMEM); 1864 return ERR_PTR(-ENOMEM);
1840 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { 1865 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
@@ -1842,47 +1867,108 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1842 return ERR_PTR(-ENOMEM); 1867 return ERR_PTR(-ENOMEM);
1843 } 1868 }
1844 1869
1845 cs->flags = 0;
1846 if (is_spread_page(parent))
1847 set_bit(CS_SPREAD_PAGE, &cs->flags);
1848 if (is_spread_slab(parent))
1849 set_bit(CS_SPREAD_SLAB, &cs->flags);
1850 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1870 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1851 cpumask_clear(cs->cpus_allowed); 1871 cpumask_clear(cs->cpus_allowed);
1852 nodes_clear(cs->mems_allowed); 1872 nodes_clear(cs->mems_allowed);
1853 fmeter_init(&cs->fmeter); 1873 fmeter_init(&cs->fmeter);
1874 INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
1854 cs->relax_domain_level = -1; 1875 cs->relax_domain_level = -1;
1855 1876
1856 cs->parent = parent; 1877 return &cs->css;
1878}
1879
1880static int cpuset_css_online(struct cgroup *cgrp)
1881{
1882 struct cpuset *cs = cgroup_cs(cgrp);
1883 struct cpuset *parent = parent_cs(cs);
1884 struct cpuset *tmp_cs;
1885 struct cgroup *pos_cg;
1886
1887 if (!parent)
1888 return 0;
1889
1890 mutex_lock(&cpuset_mutex);
1891
1892 set_bit(CS_ONLINE, &cs->flags);
1893 if (is_spread_page(parent))
1894 set_bit(CS_SPREAD_PAGE, &cs->flags);
1895 if (is_spread_slab(parent))
1896 set_bit(CS_SPREAD_SLAB, &cs->flags);
1897
1857 number_of_cpusets++; 1898 number_of_cpusets++;
1858 return &cs->css ; 1899
1900 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
1901 goto out_unlock;
1902
1903 /*
1904 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
1905 * set. This flag handling is implemented in cgroup core for
1906 * historical reasons - the flag may be specified during mount.
1907 *
1908 * Currently, if any sibling cpusets have exclusive cpus or mem, we
1909 * refuse to clone the configuration - thereby refusing the task to
1910 * be entered, and as a result refusing the sys_unshare() or
1911 * clone() which initiated it. If this becomes a problem for some
1912 * users who wish to allow that scenario, then this could be
1913 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1914 * (and likewise for mems) to the new cgroup.
1915 */
1916 rcu_read_lock();
1917 cpuset_for_each_child(tmp_cs, pos_cg, parent) {
1918 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
1919 rcu_read_unlock();
1920 goto out_unlock;
1921 }
1922 }
1923 rcu_read_unlock();
1924
1925 mutex_lock(&callback_mutex);
1926 cs->mems_allowed = parent->mems_allowed;
1927 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1928 mutex_unlock(&callback_mutex);
1929out_unlock:
1930 mutex_unlock(&cpuset_mutex);
1931 return 0;
1932}
1933
1934static void cpuset_css_offline(struct cgroup *cgrp)
1935{
1936 struct cpuset *cs = cgroup_cs(cgrp);
1937
1938 mutex_lock(&cpuset_mutex);
1939
1940 if (is_sched_load_balance(cs))
1941 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1942
1943 number_of_cpusets--;
1944 clear_bit(CS_ONLINE, &cs->flags);
1945
1946 mutex_unlock(&cpuset_mutex);
1859} 1947}
1860 1948
1861/* 1949/*
1862 * If the cpuset being removed has its flag 'sched_load_balance' 1950 * If the cpuset being removed has its flag 'sched_load_balance'
1863 * enabled, then simulate turning sched_load_balance off, which 1951 * enabled, then simulate turning sched_load_balance off, which
1864 * will call async_rebuild_sched_domains(). 1952 * will call rebuild_sched_domains_locked().
1865 */ 1953 */
1866 1954
1867static void cpuset_destroy(struct cgroup *cont) 1955static void cpuset_css_free(struct cgroup *cont)
1868{ 1956{
1869 struct cpuset *cs = cgroup_cs(cont); 1957 struct cpuset *cs = cgroup_cs(cont);
1870 1958
1871 if (is_sched_load_balance(cs))
1872 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1873
1874 number_of_cpusets--;
1875 free_cpumask_var(cs->cpus_allowed); 1959 free_cpumask_var(cs->cpus_allowed);
1876 kfree(cs); 1960 kfree(cs);
1877} 1961}
1878 1962
1879struct cgroup_subsys cpuset_subsys = { 1963struct cgroup_subsys cpuset_subsys = {
1880 .name = "cpuset", 1964 .name = "cpuset",
1881 .create = cpuset_create, 1965 .css_alloc = cpuset_css_alloc,
1882 .destroy = cpuset_destroy, 1966 .css_online = cpuset_css_online,
1967 .css_offline = cpuset_css_offline,
1968 .css_free = cpuset_css_free,
1883 .can_attach = cpuset_can_attach, 1969 .can_attach = cpuset_can_attach,
1970 .cancel_attach = cpuset_cancel_attach,
1884 .attach = cpuset_attach, 1971 .attach = cpuset_attach,
1885 .post_clone = cpuset_post_clone,
1886 .subsys_id = cpuset_subsys_id, 1972 .subsys_id = cpuset_subsys_id,
1887 .base_cftypes = files, 1973 .base_cftypes = files,
1888 .early_init = 1, 1974 .early_init = 1,
@@ -1932,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1932{ 2018{
1933 struct cgroup *new_cgroup = scan->data; 2019 struct cgroup *new_cgroup = scan->data;
1934 2020
2021 cgroup_lock();
1935 cgroup_attach_task(new_cgroup, tsk); 2022 cgroup_attach_task(new_cgroup, tsk);
2023 cgroup_unlock();
1936} 2024}
1937 2025
1938/** 2026/**
@@ -1940,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1940 * @from: cpuset in which the tasks currently reside 2028 * @from: cpuset in which the tasks currently reside
1941 * @to: cpuset to which the tasks will be moved 2029 * @to: cpuset to which the tasks will be moved
1942 * 2030 *
1943 * Called with cgroup_mutex held 2031 * Called with cpuset_mutex held
1944 * callback_mutex must not be held, as cpuset_attach() will take it. 2032 * callback_mutex must not be held, as cpuset_attach() will take it.
1945 * 2033 *
1946 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 2034 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
@@ -1967,203 +2055,212 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1967 * removing that CPU or node from all cpusets. If this removes the 2055 * removing that CPU or node from all cpusets. If this removes the
1968 * last CPU or node from a cpuset, then move the tasks in the empty 2056 * last CPU or node from a cpuset, then move the tasks in the empty
1969 * cpuset to its next-highest non-empty parent. 2057 * cpuset to its next-highest non-empty parent.
1970 *
1971 * Called with cgroup_mutex held
1972 * callback_mutex must not be held, as cpuset_attach() will take it.
1973 */ 2058 */
1974static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 2059static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1975{ 2060{
1976 struct cpuset *parent; 2061 struct cpuset *parent;
1977 2062
1978 /* 2063 /*
1979 * The cgroup's css_sets list is in use if there are tasks
1980 * in the cpuset; the list is empty if there are none;
1981 * the cs->css.refcnt seems always 0.
1982 */
1983 if (list_empty(&cs->css.cgroup->css_sets))
1984 return;
1985
1986 /*
1987 * Find its next-highest non-empty parent, (top cpuset 2064 * Find its next-highest non-empty parent, (top cpuset
1988 * has online cpus, so can't be empty). 2065 * has online cpus, so can't be empty).
1989 */ 2066 */
1990 parent = cs->parent; 2067 parent = parent_cs(cs);
1991 while (cpumask_empty(parent->cpus_allowed) || 2068 while (cpumask_empty(parent->cpus_allowed) ||
1992 nodes_empty(parent->mems_allowed)) 2069 nodes_empty(parent->mems_allowed))
1993 parent = parent->parent; 2070 parent = parent_cs(parent);
1994 2071
1995 move_member_tasks_to_cpuset(cs, parent); 2072 move_member_tasks_to_cpuset(cs, parent);
1996} 2073}
1997 2074
1998/* 2075/**
1999 * Helper function to traverse cpusets. 2076 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
2000 * It can be used to walk the cpuset tree from top to bottom, completing 2077 * @cs: cpuset of interest
2001 * one layer before dropping down to the next (thus always processing a 2078 *
2002 * node before any of its children). 2079 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2080 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
2081 * all its tasks are moved to the nearest ancestor with both resources.
2003 */ 2082 */
2004static struct cpuset *cpuset_next(struct list_head *queue) 2083static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
2005{ 2084{
2006 struct cpuset *cp; 2085 static cpumask_t off_cpus;
2007 struct cpuset *child; /* scans child cpusets of cp */ 2086 static nodemask_t off_mems, tmp_mems;
2008 struct cgroup *cont; 2087 struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
2088 bool is_empty;
2009 2089
2010 if (list_empty(queue)) 2090 mutex_lock(&cpuset_mutex);
2011 return NULL; 2091
2092 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
2093 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
2094
2095 /* remove offline cpus from @cs */
2096 if (!cpumask_empty(&off_cpus)) {
2097 mutex_lock(&callback_mutex);
2098 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2099 mutex_unlock(&callback_mutex);
2100 update_tasks_cpumask(cs, NULL);
2101 }
2012 2102
2013 cp = list_first_entry(queue, struct cpuset, stack_list); 2103 /* remove offline mems from @cs */
2014 list_del(queue->next); 2104 if (!nodes_empty(off_mems)) {
2015 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 2105 tmp_mems = cs->mems_allowed;
2016 child = cgroup_cs(cont); 2106 mutex_lock(&callback_mutex);
2017 list_add_tail(&child->stack_list, queue); 2107 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
2108 mutex_unlock(&callback_mutex);
2109 update_tasks_nodemask(cs, &tmp_mems, NULL);
2018 } 2110 }
2019 2111
2020 return cp; 2112 is_empty = cpumask_empty(cs->cpus_allowed) ||
2113 nodes_empty(cs->mems_allowed);
2114
2115 mutex_unlock(&cpuset_mutex);
2116
2117 /*
2118 * If @cs became empty, move tasks to the nearest ancestor with
2119 * execution resources. This is full cgroup operation which will
2120 * also call back into cpuset. Should be done outside any lock.
2121 */
2122 if (is_empty)
2123 remove_tasks_in_empty_cpuset(cs);
2124
2125 /* the following may free @cs, should be the last operation */
2126 css_put(&cs->css);
2021} 2127}
2022 2128
2129/**
2130 * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
2131 * @cs: cpuset of interest
2132 *
2133 * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
2134 * memory masks according to top_cpuset.
2135 */
2136static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2137{
2138 /*
2139 * Pin @cs. The refcnt will be released when the work item
2140 * finishes executing.
2141 */
2142 if (!css_tryget(&cs->css))
2143 return;
2144
2145 /*
2146 * Queue @cs->hotplug_work. If already pending, lose the css ref.
2147 * cpuset_propagate_hotplug_wq is ordered and propagation will
2148 * happen in the order this function is called.
2149 */
2150 if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
2151 css_put(&cs->css);
2152}
2023 2153
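
schedule_cpuset_propagate_hotplug() above is the standard pin-then-queue idiom: grab a css reference before queueing so the cpuset cannot be freed while the work item is pending, drop it again if queue_work() reports the item was already queued, and let the work function release the reference as its final step. Condensed (same calls as in the patch, shown only for illustration):

	if (css_tryget(&cs->css)) {
		if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
			css_put(&cs->css);	/* already pending, drop the extra ref */
	}
	/* cpuset_propagate_hotplug_workfn() ends with css_put(&cs->css) */
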
2024/* 2154/**
2025 * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory 2155 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
2026 * online/offline) and update the cpusets accordingly.
2027 * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
2028 * cpuset must be moved to a parent cpuset.
2029 * 2156 *
2030 * Called with cgroup_mutex held. We take callback_mutex to modify 2157 * This function is called after either CPU or memory configuration has
2031 * cpus_allowed and mems_allowed. 2158 * changed and updates cpuset accordingly. The top_cpuset is always
2159 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
2160 * order to make cpusets transparent (of no effect) on systems that are
2161 * actively using CPU hotplug but making no active use of cpusets.
2032 * 2162 *
2033 * This walk processes the tree from top to bottom, completing one layer 2163 * Non-root cpusets are only affected by offlining. If any CPUs or memory
2034 * before dropping down to the next. It always processes a node before 2164 * nodes have been taken down, schedule_cpuset_propagate_hotplug() is invoked on all
2035 * any of its children. 2165 * descendants.
2036 * 2166 *
2037 * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY 2167 * Note that CPU offlining during suspend is ignored. We don't modify
2038 * if all present pages from a node are offlined. 2168 * cpusets across suspend/resume cycles at all.
2039 */ 2169 */
2040static void 2170static void cpuset_hotplug_workfn(struct work_struct *work)
2041scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
2042{ 2171{
2043 LIST_HEAD(queue); 2172 static cpumask_t new_cpus, tmp_cpus;
2044 struct cpuset *cp; /* scans cpusets being updated */ 2173 static nodemask_t new_mems, tmp_mems;
2045 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2174 bool cpus_updated, mems_updated;
2175 bool cpus_offlined, mems_offlined;
2046 2176
2047 list_add_tail((struct list_head *)&root->stack_list, &queue); 2177 mutex_lock(&cpuset_mutex);
2048 2178
2049 switch (event) { 2179 /* fetch the available cpus/mems and find out which changed how */
2050 case CPUSET_CPU_OFFLINE: 2180 cpumask_copy(&new_cpus, cpu_active_mask);
2051 while ((cp = cpuset_next(&queue)) != NULL) { 2181 new_mems = node_states[N_MEMORY];
2052 2182
2053 /* Continue past cpusets with all cpus online */ 2183 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
2054 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) 2184 cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
2055 continue; 2185 &new_cpus);
2056 2186
2057 /* Remove offline cpus from this cpuset. */ 2187 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
2058 mutex_lock(&callback_mutex); 2188 nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
2059 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2189 mems_offlined = !nodes_empty(tmp_mems);
2060 cpu_active_mask);
2061 mutex_unlock(&callback_mutex);
2062 2190
2063 /* Move tasks from the empty cpuset to a parent */ 2191 /* synchronize cpus_allowed to cpu_active_mask */
2064 if (cpumask_empty(cp->cpus_allowed)) 2192 if (cpus_updated) {
2065 remove_tasks_in_empty_cpuset(cp); 2193 mutex_lock(&callback_mutex);
2066 else 2194 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
2067 update_tasks_cpumask(cp, NULL); 2195 mutex_unlock(&callback_mutex);
2068 } 2196 /* we don't mess with cpumasks of tasks in top_cpuset */
2069 break; 2197 }
2198
2199 /* synchronize mems_allowed to N_MEMORY */
2200 if (mems_updated) {
2201 tmp_mems = top_cpuset.mems_allowed;
2202 mutex_lock(&callback_mutex);
2203 top_cpuset.mems_allowed = new_mems;
2204 mutex_unlock(&callback_mutex);
2205 update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
2206 }
2070 2207
2071 case CPUSET_MEM_OFFLINE: 2208 /* if cpus or mems went down, we need to propagate to descendants */
2072 while ((cp = cpuset_next(&queue)) != NULL) { 2209 if (cpus_offlined || mems_offlined) {
2210 struct cpuset *cs;
2211 struct cgroup *pos_cgrp;
2073 2212
2074 /* Continue past cpusets with all mems online */ 2213 rcu_read_lock();
2075 if (nodes_subset(cp->mems_allowed, 2214 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
2076 node_states[N_HIGH_MEMORY])) 2215 schedule_cpuset_propagate_hotplug(cs);
2077 continue; 2216 rcu_read_unlock();
2217 }
2078 2218
2079 oldmems = cp->mems_allowed; 2219 mutex_unlock(&cpuset_mutex);
2080 2220
2081 /* Remove offline mems from this cpuset. */ 2221 /* wait for propagations to finish */
2082 mutex_lock(&callback_mutex); 2222 flush_workqueue(cpuset_propagate_hotplug_wq);
2083 nodes_and(cp->mems_allowed, cp->mems_allowed,
2084 node_states[N_HIGH_MEMORY]);
2085 mutex_unlock(&callback_mutex);
2086 2223
2087 /* Move tasks from the empty cpuset to a parent */ 2224 /* rebuild sched domains if cpus_allowed has changed */
2088 if (nodes_empty(cp->mems_allowed)) 2225 if (cpus_updated) {
2089 remove_tasks_in_empty_cpuset(cp); 2226 struct sched_domain_attr *attr;
2090 else 2227 cpumask_var_t *doms;
2091 update_tasks_nodemask(cp, &oldmems, NULL); 2228 int ndoms;
2092 } 2229
2230 mutex_lock(&cpuset_mutex);
2231 ndoms = generate_sched_domains(&doms, &attr);
2232 mutex_unlock(&cpuset_mutex);
2233
2234 partition_sched_domains(ndoms, doms, attr);
2093 } 2235 }
2094} 2236}
2095 2237
2096/*
2097 * The top_cpuset tracks what CPUs and Memory Nodes are online,
2098 * period. This is necessary in order to make cpusets transparent
2099 * (of no affect) on systems that are actively using CPU hotplug
2100 * but making no active use of cpusets.
2101 *
2102 * The only exception to this is suspend/resume, where we don't
2103 * modify cpusets at all.
2104 *
2105 * This routine ensures that top_cpuset.cpus_allowed tracks
2106 * cpu_active_mask on each CPU hotplug (cpuhp) event.
2107 *
2108 * Called within get_online_cpus(). Needs to call cgroup_lock()
2109 * before calling generate_sched_domains().
2110 *
2111 * @cpu_online: Indicates whether this is a CPU online event (true) or
2112 * a CPU offline event (false).
2113 */
2114void cpuset_update_active_cpus(bool cpu_online) 2238void cpuset_update_active_cpus(bool cpu_online)
2115{ 2239{
2116 struct sched_domain_attr *attr; 2240 /*
2117 cpumask_var_t *doms; 2241 * We're inside cpu hotplug critical region which usually nests
2118 int ndoms; 2242 * inside cgroup synchronization. Bounce actual hotplug processing
2119 2243 * to a work item to avoid reverse locking order.
2120 cgroup_lock(); 2244 *
2121 mutex_lock(&callback_mutex); 2245 * We still need to do partition_sched_domains() synchronously;
2122 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2246 * otherwise, the scheduler will get confused and put tasks on the
2123 mutex_unlock(&callback_mutex); 2247 * dead CPU. Fall back to the default single domain.
2124 2248 * cpuset_hotplug_workfn() will rebuild it as necessary.
2125 if (!cpu_online) 2249 */
2126 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); 2250 partition_sched_domains(1, NULL, NULL);
2127 2251 schedule_work(&cpuset_hotplug_work);
2128 ndoms = generate_sched_domains(&doms, &attr);
2129 cgroup_unlock();
2130
2131 /* Have scheduler rebuild the domains */
2132 partition_sched_domains(ndoms, doms, attr);
2133} 2252}
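
The right-hand column above defers the expensive cpuset hotplug handling to a work item and keeps only a cheap, synchronous partition_sched_domains(1, NULL, NULL) fallback in the hotplug callback itself, so the cgroup/hotplug lock order is never inverted. A minimal userspace sketch of that split, using plain pthreads rather than any kernel API:

#include <pthread.h>
#include <stdio.h>

static pthread_t worker;

static void *hotplug_workfn(void *arg)
{
        /* the heavy part: may take locks the hotplug path cannot take */
        printf("worker: rebuilding the full sched domain set\n");
        return NULL;
}

static void on_cpu_hotplug(void)
{
        /* cheap synchronous fallback so the system stays usable right now */
        printf("hotplug: falling back to a single default domain\n");

        /* bounce the real work to a worker to avoid reverse lock order */
        pthread_create(&worker, NULL, hotplug_workfn, NULL);
}

int main(void)
{
        on_cpu_hotplug();
        pthread_join(worker, NULL);
        return 0;
}
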
2134 2253
2135#ifdef CONFIG_MEMORY_HOTPLUG 2254#ifdef CONFIG_MEMORY_HOTPLUG
2136/* 2255/*
2137 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2256 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
2138 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. 2257 * Call this routine anytime after node_states[N_MEMORY] changes.
2139 * See cpuset_update_active_cpus() for CPU hotplug handling. 2258 * See cpuset_update_active_cpus() for CPU hotplug handling.
2140 */ 2259 */
2141static int cpuset_track_online_nodes(struct notifier_block *self, 2260static int cpuset_track_online_nodes(struct notifier_block *self,
2142 unsigned long action, void *arg) 2261 unsigned long action, void *arg)
2143{ 2262{
2144 static nodemask_t oldmems; /* protected by cgroup_mutex */ 2263 schedule_work(&cpuset_hotplug_work);
2145
2146 cgroup_lock();
2147 switch (action) {
2148 case MEM_ONLINE:
2149 oldmems = top_cpuset.mems_allowed;
2150 mutex_lock(&callback_mutex);
2151 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2152 mutex_unlock(&callback_mutex);
2153 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
2154 break;
2155 case MEM_OFFLINE:
2156 /*
2157 * needn't update top_cpuset.mems_allowed explicitly because
2158 * scan_cpusets_upon_hotplug() will update it.
2159 */
2160 scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
2161 break;
2162 default:
2163 break;
2164 }
2165 cgroup_unlock();
2166
2167 return NOTIFY_OK; 2264 return NOTIFY_OK;
2168} 2265}
2169#endif 2266#endif
@@ -2177,12 +2274,13 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2177void __init cpuset_init_smp(void) 2274void __init cpuset_init_smp(void)
2178{ 2275{
2179 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2276 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2180 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2277 top_cpuset.mems_allowed = node_states[N_MEMORY];
2181 2278
2182 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2279 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2183 2280
2184 cpuset_wq = create_singlethread_workqueue("cpuset"); 2281 cpuset_propagate_hotplug_wq =
2185 BUG_ON(!cpuset_wq); 2282 alloc_ordered_workqueue("cpuset_hotplug", 0);
2283 BUG_ON(!cpuset_propagate_hotplug_wq);
2186} 2284}
2187 2285
2188/** 2286/**
@@ -2245,7 +2343,7 @@ void cpuset_init_current_mems_allowed(void)
2245 * 2343 *
2246 * Description: Returns the nodemask_t mems_allowed of the cpuset 2344 * Description: Returns the nodemask_t mems_allowed of the cpuset
2247 * attached to the specified @tsk. Guaranteed to return some non-empty 2345 * attached to the specified @tsk. Guaranteed to return some non-empty
2248 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the 2346 * subset of node_states[N_MEMORY], even if this means going outside the
2249 * tasks cpuset. 2347 * tasks cpuset.
2250 **/ 2348 **/
2251 2349
@@ -2281,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
2281 */ 2379 */
2282static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) 2380static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2283{ 2381{
2284 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) 2382 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
2285 cs = cs->parent; 2383 cs = parent_cs(cs);
2286 return cs; 2384 return cs;
2287} 2385}
2288 2386
@@ -2420,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2420} 2518}
2421 2519
2422/** 2520/**
2423 * cpuset_unlock - release lock on cpuset changes
2424 *
2425 * Undo the lock taken in a previous cpuset_lock() call.
2426 */
2427
2428void cpuset_unlock(void)
2429{
2430 mutex_unlock(&callback_mutex);
2431}
2432
2433/**
2434 * cpuset_mem_spread_node() - On which node to begin search for a file page 2521 * cpuset_mem_spread_node() - On which node to begin search for a file page
2435 * cpuset_slab_spread_node() - On which node to begin search for a slab page 2522 * cpuset_slab_spread_node() - On which node to begin search for a slab page
2436 * 2523 *
@@ -2519,8 +2606,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2519 2606
2520 dentry = task_cs(tsk)->css.cgroup->dentry; 2607 dentry = task_cs(tsk)->css.cgroup->dentry;
2521 spin_lock(&cpuset_buffer_lock); 2608 spin_lock(&cpuset_buffer_lock);
2522 snprintf(cpuset_name, CPUSET_NAME_LEN, 2609
2523 dentry ? (const char *)dentry->d_name.name : "/"); 2610 if (!dentry) {
2611 strcpy(cpuset_name, "/");
2612 } else {
2613 spin_lock(&dentry->d_lock);
2614 strlcpy(cpuset_name, (const char *)dentry->d_name.name,
2615 CPUSET_NAME_LEN);
2616 spin_unlock(&dentry->d_lock);
2617 }
2618
2524 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2619 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2525 tsk->mems_allowed); 2620 tsk->mems_allowed);
2526 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2621 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
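
The old snprintf() above handed dentry->d_name.name to snprintf() as the format string and read the name without d_lock; the replacement takes d_lock and copies the bytes with strlcpy(). A small userspace illustration of the format-string half of that change, with snprintf "%s" standing in for the kernel's strlcpy():

#include <stdio.h>

int main(void)
{
        const char *dname = "grp-%s-%n";        /* hostile directory name */
        char buf[64];

        /*
         * Old pattern (unsafe): the name itself becomes the format string,
         * so any '%' conversions in it are interpreted:
         *
         *      snprintf(buf, sizeof(buf), dname);
         */

        /* Fixed pattern: treat the name purely as data, bounded copy. */
        snprintf(buf, sizeof(buf), "%s", dname);
        printf("cpuset=%s\n", buf);
        return 0;
}
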
@@ -2568,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void)
2568 * - Used for /proc/<pid>/cpuset. 2663 * - Used for /proc/<pid>/cpuset.
2569 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2664 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2570 * doesn't really matter if tsk->cpuset changes after we read it, 2665 * doesn't really matter if tsk->cpuset changes after we read it,
2571 * and we take cgroup_mutex, keeping cpuset_attach() from changing it 2666 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
2572 * anyway. 2667 * anyway.
2573 */ 2668 */
2574static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2669static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2590,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2590 if (!tsk) 2685 if (!tsk)
2591 goto out_free; 2686 goto out_free;
2592 2687
2593 retval = -EINVAL; 2688 rcu_read_lock();
2594 cgroup_lock();
2595 css = task_subsys_state(tsk, cpuset_subsys_id); 2689 css = task_subsys_state(tsk, cpuset_subsys_id);
2596 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2690 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2691 rcu_read_unlock();
2597 if (retval < 0) 2692 if (retval < 0)
2598 goto out_unlock; 2693 goto out_put_task;
2599 seq_puts(m, buf); 2694 seq_puts(m, buf);
2600 seq_putc(m, '\n'); 2695 seq_putc(m, '\n');
2601out_unlock: 2696out_put_task:
2602 cgroup_unlock();
2603 put_task_struct(tsk); 2697 put_task_struct(tsk);
2604out_free: 2698out_free:
2605 kfree(buf); 2699 kfree(buf);
diff --git a/kernel/cred.c b/kernel/cred.c
index 48cea3da6d05..e0573a43c7df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -30,17 +30,6 @@
30static struct kmem_cache *cred_jar; 30static struct kmem_cache *cred_jar;
31 31
32/* 32/*
33 * The common credentials for the initial task's thread group
34 */
35#ifdef CONFIG_KEYS
36static struct thread_group_cred init_tgcred = {
37 .usage = ATOMIC_INIT(2),
38 .tgid = 0,
39 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
40};
41#endif
42
43/*
44 * The initial credentials for the initial task 33 * The initial credentials for the initial task
45 */ 34 */
46struct cred init_cred = { 35struct cred init_cred = {
@@ -65,9 +54,6 @@ struct cred init_cred = {
65 .user = INIT_USER, 54 .user = INIT_USER,
66 .user_ns = &init_user_ns, 55 .user_ns = &init_user_ns,
67 .group_info = &init_groups, 56 .group_info = &init_groups,
68#ifdef CONFIG_KEYS
69 .tgcred = &init_tgcred,
70#endif
71}; 57};
72 58
73static inline void set_cred_subscribers(struct cred *cred, int n) 59static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -96,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n)
96} 82}
97 83
98/* 84/*
99 * Dispose of the shared task group credentials
100 */
101#ifdef CONFIG_KEYS
102static void release_tgcred_rcu(struct rcu_head *rcu)
103{
104 struct thread_group_cred *tgcred =
105 container_of(rcu, struct thread_group_cred, rcu);
106
107 BUG_ON(atomic_read(&tgcred->usage) != 0);
108
109 key_put(tgcred->session_keyring);
110 key_put(tgcred->process_keyring);
111 kfree(tgcred);
112}
113#endif
114
115/*
116 * Release a set of thread group credentials.
117 */
118static void release_tgcred(struct cred *cred)
119{
120#ifdef CONFIG_KEYS
121 struct thread_group_cred *tgcred = cred->tgcred;
122
123 if (atomic_dec_and_test(&tgcred->usage))
124 call_rcu(&tgcred->rcu, release_tgcred_rcu);
125#endif
126}
127
128/*
129 * The RCU callback to actually dispose of a set of credentials 85 * The RCU callback to actually dispose of a set of credentials
130 */ 86 */
131static void put_cred_rcu(struct rcu_head *rcu) 87static void put_cred_rcu(struct rcu_head *rcu)
@@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu)
150#endif 106#endif
151 107
152 security_cred_free(cred); 108 security_cred_free(cred);
109 key_put(cred->session_keyring);
110 key_put(cred->process_keyring);
153 key_put(cred->thread_keyring); 111 key_put(cred->thread_keyring);
154 key_put(cred->request_key_auth); 112 key_put(cred->request_key_auth);
155 release_tgcred(cred);
156 if (cred->group_info) 113 if (cred->group_info)
157 put_group_info(cred->group_info); 114 put_group_info(cred->group_info);
158 free_uid(cred->user); 115 free_uid(cred->user);
@@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void)
246 if (!new) 203 if (!new)
247 return NULL; 204 return NULL;
248 205
249#ifdef CONFIG_KEYS
250 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
251 if (!new->tgcred) {
252 kmem_cache_free(cred_jar, new);
253 return NULL;
254 }
255 atomic_set(&new->tgcred->usage, 1);
256#endif
257
258 atomic_set(&new->usage, 1); 206 atomic_set(&new->usage, 1);
259#ifdef CONFIG_DEBUG_CREDENTIALS 207#ifdef CONFIG_DEBUG_CREDENTIALS
260 new->magic = CRED_MAGIC; 208 new->magic = CRED_MAGIC;
@@ -308,9 +256,10 @@ struct cred *prepare_creds(void)
308 get_user_ns(new->user_ns); 256 get_user_ns(new->user_ns);
309 257
310#ifdef CONFIG_KEYS 258#ifdef CONFIG_KEYS
259 key_get(new->session_keyring);
260 key_get(new->process_keyring);
311 key_get(new->thread_keyring); 261 key_get(new->thread_keyring);
312 key_get(new->request_key_auth); 262 key_get(new->request_key_auth);
313 atomic_inc(&new->tgcred->usage);
314#endif 263#endif
315 264
316#ifdef CONFIG_SECURITY 265#ifdef CONFIG_SECURITY
@@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds);
334 */ 283 */
335struct cred *prepare_exec_creds(void) 284struct cred *prepare_exec_creds(void)
336{ 285{
337 struct thread_group_cred *tgcred = NULL;
338 struct cred *new; 286 struct cred *new;
339 287
340#ifdef CONFIG_KEYS
341 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
342 if (!tgcred)
343 return NULL;
344#endif
345
346 new = prepare_creds(); 288 new = prepare_creds();
347 if (!new) { 289 if (!new)
348 kfree(tgcred);
349 return new; 290 return new;
350 }
351 291
352#ifdef CONFIG_KEYS 292#ifdef CONFIG_KEYS
353 /* newly exec'd tasks don't get a thread keyring */ 293 /* newly exec'd tasks don't get a thread keyring */
354 key_put(new->thread_keyring); 294 key_put(new->thread_keyring);
355 new->thread_keyring = NULL; 295 new->thread_keyring = NULL;
356 296
357 /* create a new per-thread-group creds for all this set of threads to
358 * share */
359 memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
360
361 atomic_set(&tgcred->usage, 1);
362 spin_lock_init(&tgcred->lock);
363
364 /* inherit the session keyring; new process keyring */ 297 /* inherit the session keyring; new process keyring */
365 key_get(tgcred->session_keyring); 298 key_put(new->process_keyring);
366 tgcred->process_keyring = NULL; 299 new->process_keyring = NULL;
367
368 release_tgcred(new);
369 new->tgcred = tgcred;
370#endif 300#endif
371 301
372 return new; 302 return new;
@@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void)
383 */ 313 */
384int copy_creds(struct task_struct *p, unsigned long clone_flags) 314int copy_creds(struct task_struct *p, unsigned long clone_flags)
385{ 315{
386#ifdef CONFIG_KEYS
387 struct thread_group_cred *tgcred;
388#endif
389 struct cred *new; 316 struct cred *new;
390 int ret; 317 int ret;
391 318
@@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
425 install_thread_keyring_to_cred(new); 352 install_thread_keyring_to_cred(new);
426 } 353 }
427 354
428 /* we share the process and session keyrings between all the threads in 355 /* The process keyring is only shared between the threads in a process;
429 * a process - this is slightly icky as we violate COW credentials a 356 * anything outside of those threads doesn't inherit.
430 * bit */ 357 */
431 if (!(clone_flags & CLONE_THREAD)) { 358 if (!(clone_flags & CLONE_THREAD)) {
432 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); 359 key_put(new->process_keyring);
433 if (!tgcred) { 360 new->process_keyring = NULL;
434 ret = -ENOMEM;
435 goto error_put;
436 }
437 atomic_set(&tgcred->usage, 1);
438 spin_lock_init(&tgcred->lock);
439 tgcred->process_keyring = NULL;
440 tgcred->session_keyring = key_get(new->tgcred->session_keyring);
441
442 release_tgcred(new);
443 new->tgcred = tgcred;
444 } 361 }
445#endif 362#endif
446 363
@@ -455,6 +372,31 @@ error_put:
455 return ret; 372 return ret;
456} 373}
457 374
375static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
376{
377 const struct user_namespace *set_ns = set->user_ns;
378 const struct user_namespace *subset_ns = subset->user_ns;
379
380 /* If the two credentials are in the same user namespace, see if
381 * the capabilities of subset are a subset of set.
382 */
383 if (set_ns == subset_ns)
384 return cap_issubset(subset->cap_permitted, set->cap_permitted);
385
386 /* The credentials are in different user namespaces;
387 * therefore one is a subset of the other only if set's namespace
388 * is an ancestor of subset's and set->euid owns subset's
389 * namespace or one of its ancestors.
390 */
391 for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
392 if ((set_ns == subset_ns->parent) &&
393 uid_eq(subset_ns->owner, set->euid))
394 return true;
395 }
396
397 return false;
398}
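
cred_cap_issubset() compares capability masks directly only when both credentials live in the same user namespace; across namespaces, set dominates subset only if set's namespace is the parent of one of subset's ancestor namespaces and set->euid owns that namespace. A toy userspace model of the same rule; every type and name below is illustrative, not kernel API:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct user_ns {
        struct user_ns *parent;
        unsigned int owner_uid;
};

struct toy_cred {
        struct user_ns *user_ns;
        uint64_t cap_permitted;
        unsigned int euid;
};

static struct user_ns init_ns = { .parent = NULL, .owner_uid = 0 };

static bool toy_cap_issubset(uint64_t a, uint64_t b)
{
        return (a & ~b) == 0;           /* every bit of a is also in b */
}

static bool toy_cred_cap_issubset(const struct toy_cred *set,
                                  const struct toy_cred *subset)
{
        const struct user_ns *sns = subset->user_ns;

        if (set->user_ns == subset->user_ns)
                return toy_cap_issubset(subset->cap_permitted,
                                        set->cap_permitted);

        for (; sns != &init_ns; sns = sns->parent)
                if (set->user_ns == sns->parent &&
                    sns->owner_uid == set->euid)
                        return true;

        return false;
}

int main(void)
{
        struct user_ns child = { .parent = &init_ns, .owner_uid = 1000 };
        struct toy_cred parent_cred = { &init_ns, 0x3, 1000 };
        struct toy_cred child_cred  = { &child,  0xff, 1000 };

        /* uid 1000 owns the child namespace, so its creds dominate it even
         * though the child holds more capability bits inside its own ns */
        printf("%d\n", toy_cred_cap_issubset(&parent_cred, &child_cred));
        return 0;
}
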
399
458/** 400/**
459 * commit_creds - Install new credentials upon the current task 401 * commit_creds - Install new credentials upon the current task
460 * @new: The credentials to be assigned 402 * @new: The credentials to be assigned
@@ -493,7 +435,7 @@ int commit_creds(struct cred *new)
493 !gid_eq(old->egid, new->egid) || 435 !gid_eq(old->egid, new->egid) ||
494 !uid_eq(old->fsuid, new->fsuid) || 436 !uid_eq(old->fsuid, new->fsuid) ||
495 !gid_eq(old->fsgid, new->fsgid) || 437 !gid_eq(old->fsgid, new->fsgid) ||
496 !cap_issubset(new->cap_permitted, old->cap_permitted)) { 438 !cred_cap_issubset(old, new)) {
497 if (task->mm) 439 if (task->mm)
498 set_dumpable(task->mm, suid_dumpable); 440 set_dumpable(task->mm, suid_dumpable);
499 task->pdeath_signal = 0; 441 task->pdeath_signal = 0;
@@ -643,9 +585,6 @@ void __init cred_init(void)
643 */ 585 */
644struct cred *prepare_kernel_cred(struct task_struct *daemon) 586struct cred *prepare_kernel_cred(struct task_struct *daemon)
645{ 587{
646#ifdef CONFIG_KEYS
647 struct thread_group_cred *tgcred;
648#endif
649 const struct cred *old; 588 const struct cred *old;
650 struct cred *new; 589 struct cred *new;
651 590
@@ -653,14 +592,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
653 if (!new) 592 if (!new)
654 return NULL; 593 return NULL;
655 594
656#ifdef CONFIG_KEYS
657 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
658 if (!tgcred) {
659 kmem_cache_free(cred_jar, new);
660 return NULL;
661 }
662#endif
663
664 kdebug("prepare_kernel_cred() alloc %p", new); 595 kdebug("prepare_kernel_cred() alloc %p", new);
665 596
666 if (daemon) 597 if (daemon)
@@ -678,13 +609,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
678 get_group_info(new->group_info); 609 get_group_info(new->group_info);
679 610
680#ifdef CONFIG_KEYS 611#ifdef CONFIG_KEYS
681 atomic_set(&tgcred->usage, 1); 612 new->session_keyring = NULL;
682 spin_lock_init(&tgcred->lock); 613 new->process_keyring = NULL;
683 tgcred->process_keyring = NULL;
684 tgcred->session_keyring = NULL;
685 new->tgcred = tgcred;
686 new->request_key_auth = NULL;
687 new->thread_keyring = NULL; 614 new->thread_keyring = NULL;
615 new->request_key_auth = NULL;
688 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 616 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
689#endif 617#endif
690 618
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 9a61738cefc8..c26278fd4851 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -29,6 +29,7 @@
29 */ 29 */
30#include <linux/pid_namespace.h> 30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h> 31#include <linux/clocksource.h>
32#include <linux/serial_core.h>
32#include <linux/interrupt.h> 33#include <linux/interrupt.h>
33#include <linux/spinlock.h> 34#include <linux/spinlock.h>
34#include <linux/console.h> 35#include <linux/console.h>
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index 3494c28a7e7a..2235967e78b0 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -72,6 +72,8 @@ extern int dbg_kdb_mode;
72#ifdef CONFIG_KGDB_KDB 72#ifdef CONFIG_KGDB_KDB
73extern int kdb_stub(struct kgdb_state *ks); 73extern int kdb_stub(struct kgdb_state *ks);
74extern int kdb_parse(const char *cmdstr); 74extern int kdb_parse(const char *cmdstr);
75extern int kdb_common_init_state(struct kgdb_state *ks);
76extern int kdb_common_deinit_state(void);
75#else /* ! CONFIG_KGDB_KDB */ 77#else /* ! CONFIG_KGDB_KDB */
76static inline int kdb_stub(struct kgdb_state *ks) 78static inline int kdb_stub(struct kgdb_state *ks)
77{ 79{
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index ce615e064482..19d9a578c753 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -31,6 +31,7 @@
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/kgdb.h> 32#include <linux/kgdb.h>
33#include <linux/kdb.h> 33#include <linux/kdb.h>
34#include <linux/serial_core.h>
34#include <linux/reboot.h> 35#include <linux/reboot.h>
35#include <linux/uaccess.h> 36#include <linux/uaccess.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
@@ -782,7 +783,10 @@ static void gdb_cmd_query(struct kgdb_state *ks)
782 len = len / 2; 783 len = len / 2;
783 remcom_out_buffer[len++] = 0; 784 remcom_out_buffer[len++] = 0;
784 785
786 kdb_common_init_state(ks);
785 kdb_parse(remcom_out_buffer); 787 kdb_parse(remcom_out_buffer);
788 kdb_common_deinit_state();
789
786 strcpy(remcom_out_buffer, "OK"); 790 strcpy(remcom_out_buffer, "OK");
787 } 791 }
788 break; 792 break;
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 8418c2f8ec5d..70a504601dc3 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -486,11 +486,9 @@ static int kdb_bc(int argc, const char **argv)
486/* 486/*
487 * kdb_ss 487 * kdb_ss
488 * 488 *
489 * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) 489 * Process the 'ss' (Single Step) command.
490 * commands.
491 * 490 *
492 * ss 491 * ss
493 * ssb
494 * 492 *
495 * Parameters: 493 * Parameters:
496 * argc Argument count 494 * argc Argument count
@@ -498,35 +496,23 @@ static int kdb_bc(int argc, const char **argv)
498 * Outputs: 496 * Outputs:
499 * None. 497 * None.
500 * Returns: 498 * Returns:
501 * KDB_CMD_SS[B] for success, a kdb error if failure. 499 * KDB_CMD_SS for success, a kdb error if failure.
502 * Locking: 500 * Locking:
503 * None. 501 * None.
504 * Remarks: 502 * Remarks:
505 * 503 *
506 * Set the arch specific option to trigger a debug trap after the next 504 * Set the arch specific option to trigger a debug trap after the next
507 * instruction. 505 * instruction.
508 *
509 * For 'ssb', set the trace flag in the debug trap handler
510 * after printing the current insn and return directly without
511 * invoking the kdb command processor, until a branch instruction
512 * is encountered.
513 */ 506 */
514 507
515static int kdb_ss(int argc, const char **argv) 508static int kdb_ss(int argc, const char **argv)
516{ 509{
517 int ssb = 0;
518
519 ssb = (strcmp(argv[0], "ssb") == 0);
520 if (argc != 0) 510 if (argc != 0)
521 return KDB_ARGCOUNT; 511 return KDB_ARGCOUNT;
522 /* 512 /*
523 * Set trace flag and go. 513 * Set trace flag and go.
524 */ 514 */
525 KDB_STATE_SET(DOING_SS); 515 KDB_STATE_SET(DOING_SS);
526 if (ssb) {
527 KDB_STATE_SET(DOING_SSB);
528 return KDB_CMD_SSB;
529 }
530 return KDB_CMD_SS; 516 return KDB_CMD_SS;
531} 517}
532 518
@@ -561,8 +547,6 @@ void __init kdb_initbptab(void)
561 547
562 kdb_register_repeat("ss", kdb_ss, "", 548 kdb_register_repeat("ss", kdb_ss, "",
563 "Single Step", 1, KDB_REPEAT_NO_ARGS); 549 "Single Step", 1, KDB_REPEAT_NO_ARGS);
564 kdb_register_repeat("ssb", kdb_ss, "",
565 "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
566 /* 550 /*
567 * Architecture dependent initialization. 551 * Architecture dependent initialization.
568 */ 552 */
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index be7b33b73d30..328d18ef31e4 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -34,6 +34,22 @@ EXPORT_SYMBOL_GPL(kdb_poll_idx);
34 34
35static struct kgdb_state *kdb_ks; 35static struct kgdb_state *kdb_ks;
36 36
37int kdb_common_init_state(struct kgdb_state *ks)
38{
39 kdb_initial_cpu = atomic_read(&kgdb_active);
40 kdb_current_task = kgdb_info[ks->cpu].task;
41 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
42 return 0;
43}
44
45int kdb_common_deinit_state(void)
46{
47 kdb_initial_cpu = -1;
48 kdb_current_task = NULL;
49 kdb_current_regs = NULL;
50 return 0;
51}
52
37int kdb_stub(struct kgdb_state *ks) 53int kdb_stub(struct kgdb_state *ks)
38{ 54{
39 int error = 0; 55 int error = 0;
@@ -94,13 +110,10 @@ int kdb_stub(struct kgdb_state *ks)
94 } 110 }
95 /* Set initial kdb state variables */ 111 /* Set initial kdb state variables */
96 KDB_STATE_CLEAR(KGDB_TRANS); 112 KDB_STATE_CLEAR(KGDB_TRANS);
97 kdb_initial_cpu = atomic_read(&kgdb_active); 113 kdb_common_init_state(ks);
98 kdb_current_task = kgdb_info[ks->cpu].task;
99 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
100 /* Remove any breakpoints as needed by kdb and clear single step */ 114 /* Remove any breakpoints as needed by kdb and clear single step */
101 kdb_bp_remove(); 115 kdb_bp_remove();
102 KDB_STATE_CLEAR(DOING_SS); 116 KDB_STATE_CLEAR(DOING_SS);
103 KDB_STATE_CLEAR(DOING_SSB);
104 KDB_STATE_SET(PAGER); 117 KDB_STATE_SET(PAGER);
105 /* zero out any offline cpu data */ 118 /* zero out any offline cpu data */
106 for_each_present_cpu(i) { 119 for_each_present_cpu(i) {
@@ -125,9 +138,7 @@ int kdb_stub(struct kgdb_state *ks)
125 * Upon exit from the kdb main loop setup break points and restart 138 * Upon exit from the kdb main loop setup break points and restart
126 * the system based on the requested continue state 139 * the system based on the requested continue state
127 */ 140 */
128 kdb_initial_cpu = -1; 141 kdb_common_deinit_state();
129 kdb_current_task = NULL;
130 kdb_current_regs = NULL;
131 KDB_STATE_CLEAR(PAGER); 142 KDB_STATE_CLEAR(PAGER);
132 kdbnearsym_cleanup(); 143 kdbnearsym_cleanup();
133 if (error == KDB_CMD_KGDB) { 144 if (error == KDB_CMD_KGDB) {
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 4d5f8d5612f3..00eb8f7fbf41 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -124,7 +124,7 @@ static kdbmsg_t kdbmsgs[] = {
124}; 124};
125#undef KDBMSG 125#undef KDBMSG
126 126
127static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); 127static const int __nkdb_err = ARRAY_SIZE(kdbmsgs);
128 128
129 129
130/* 130/*
@@ -175,7 +175,7 @@ static char *__env[] = {
175 (char *)0, 175 (char *)0,
176}; 176};
177 177
178static const int __nenv = (sizeof(__env) / sizeof(char *)); 178static const int __nenv = ARRAY_SIZE(__env);
179 179
180struct task_struct *kdb_curr_task(int cpu) 180struct task_struct *kdb_curr_task(int cpu)
181{ 181{
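
The two ARRAY_SIZE() conversions above replace hand-written sizeof divisions. The idiom in standalone C (the kernel macro additionally refuses non-array arguments, which this plain version does not):

#include <stdio.h>

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

static const char *msgs[] = { "NOTFOUND", "ARGCOUNT", "BADWIDTH" };

int main(void)
{
        /* same value as the old sizeof(msgs) / sizeof(char *) spelling,
         * but it keeps working if the element type ever changes */
        printf("%zu messages\n", ARRAY_SIZE(msgs));
        return 0;
}
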
@@ -681,34 +681,50 @@ static int kdb_defcmd(int argc, const char **argv)
681 } 681 }
682 if (argc != 3) 682 if (argc != 3)
683 return KDB_ARGCOUNT; 683 return KDB_ARGCOUNT;
684 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), 684 if (in_dbg_master()) {
685 GFP_KDB); 685 kdb_printf("Command only available during kdb_init()\n");
686 if (!defcmd_set) {
687 kdb_printf("Could not allocate new defcmd_set entry for %s\n",
688 argv[1]);
689 defcmd_set = save_defcmd_set;
690 return KDB_NOTIMP; 686 return KDB_NOTIMP;
691 } 687 }
688 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
689 GFP_KDB);
690 if (!defcmd_set)
691 goto fail_defcmd;
692 memcpy(defcmd_set, save_defcmd_set, 692 memcpy(defcmd_set, save_defcmd_set,
693 defcmd_set_count * sizeof(*defcmd_set)); 693 defcmd_set_count * sizeof(*defcmd_set));
694 kfree(save_defcmd_set);
695 s = defcmd_set + defcmd_set_count; 694 s = defcmd_set + defcmd_set_count;
696 memset(s, 0, sizeof(*s)); 695 memset(s, 0, sizeof(*s));
697 s->usable = 1; 696 s->usable = 1;
698 s->name = kdb_strdup(argv[1], GFP_KDB); 697 s->name = kdb_strdup(argv[1], GFP_KDB);
698 if (!s->name)
699 goto fail_name;
699 s->usage = kdb_strdup(argv[2], GFP_KDB); 700 s->usage = kdb_strdup(argv[2], GFP_KDB);
701 if (!s->usage)
702 goto fail_usage;
700 s->help = kdb_strdup(argv[3], GFP_KDB); 703 s->help = kdb_strdup(argv[3], GFP_KDB);
704 if (!s->help)
705 goto fail_help;
701 if (s->usage[0] == '"') { 706 if (s->usage[0] == '"') {
702 strcpy(s->usage, s->usage+1); 707 strcpy(s->usage, argv[2]+1);
703 s->usage[strlen(s->usage)-1] = '\0'; 708 s->usage[strlen(s->usage)-1] = '\0';
704 } 709 }
705 if (s->help[0] == '"') { 710 if (s->help[0] == '"') {
706 strcpy(s->help, s->help+1); 711 strcpy(s->help, argv[3]+1);
707 s->help[strlen(s->help)-1] = '\0'; 712 s->help[strlen(s->help)-1] = '\0';
708 } 713 }
709 ++defcmd_set_count; 714 ++defcmd_set_count;
710 defcmd_in_progress = 1; 715 defcmd_in_progress = 1;
716 kfree(save_defcmd_set);
711 return 0; 717 return 0;
718fail_help:
719 kfree(s->usage);
720fail_usage:
721 kfree(s->name);
722fail_name:
723 kfree(defcmd_set);
724fail_defcmd:
725 kdb_printf("Could not allocate new defcmd_set entry for %s\n", argv[1]);
726 defcmd_set = save_defcmd_set;
727 return KDB_NOTIMP;
712} 728}
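
The rewritten kdb_defcmd() unwinds allocation failures through a chain of fail_* labels so that each failure frees exactly what was already allocated, in reverse order. The same pattern in a self-contained userspace sketch with hypothetical names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct cmd {
        char *name;
        char *usage;
        char *help;
};

static struct cmd *cmd_create(const char *name, const char *usage,
                              const char *help)
{
        struct cmd *c = calloc(1, sizeof(*c));

        if (!c)
                goto fail_cmd;
        c->name = strdup(name);
        if (!c->name)
                goto fail_name;
        c->usage = strdup(usage);
        if (!c->usage)
                goto fail_usage;
        c->help = strdup(help);
        if (!c->help)
                goto fail_help;
        return c;

fail_help:
        free(c->usage);
fail_usage:
        free(c->name);
fail_name:
        free(c);
fail_cmd:
        fprintf(stderr, "could not allocate command %s\n", name);
        return NULL;
}

int main(void)
{
        struct cmd *c = cmd_create("dumpall", "\"usage\"", "dump everything");

        if (c) {
                printf("registered %s\n", c->name);
                free(c->help);
                free(c->usage);
                free(c->name);
                free(c);
        }
        return 0;
}
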
713 729
714/* 730/*
@@ -1112,7 +1128,6 @@ void kdb_set_current_task(struct task_struct *p)
1112 * KDB_CMD_GO User typed 'go'. 1128 * KDB_CMD_GO User typed 'go'.
1113 * KDB_CMD_CPU User switched to another cpu. 1129 * KDB_CMD_CPU User switched to another cpu.
1114 * KDB_CMD_SS Single step. 1130 * KDB_CMD_SS Single step.
1115 * KDB_CMD_SSB Single step until branch.
1116 */ 1131 */
1117static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, 1132static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1118 kdb_dbtrap_t db_result) 1133 kdb_dbtrap_t db_result)
@@ -1151,14 +1166,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1151 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", 1166 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
1152 instruction_pointer(regs)); 1167 instruction_pointer(regs));
1153 break; 1168 break;
1154 case KDB_DB_SSB:
1155 /*
1156 * In the midst of ssb command. Just return.
1157 */
1158 KDB_DEBUG_STATE("kdb_local 3", reason);
1159 return KDB_CMD_SSB; /* Continue with SSB command */
1160
1161 break;
1162 case KDB_DB_SS: 1169 case KDB_DB_SS:
1163 break; 1170 break;
1164 case KDB_DB_SSBPT: 1171 case KDB_DB_SSBPT:
@@ -1281,7 +1288,6 @@ do_full_getstr:
1281 if (diag == KDB_CMD_GO 1288 if (diag == KDB_CMD_GO
1282 || diag == KDB_CMD_CPU 1289 || diag == KDB_CMD_CPU
1283 || diag == KDB_CMD_SS 1290 || diag == KDB_CMD_SS
1284 || diag == KDB_CMD_SSB
1285 || diag == KDB_CMD_KGDB) 1291 || diag == KDB_CMD_KGDB)
1286 break; 1292 break;
1287 1293
@@ -1368,12 +1374,6 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1368 break; 1374 break;
1369 } 1375 }
1370 1376
1371 if (result == KDB_CMD_SSB) {
1372 KDB_STATE_SET(DOING_SS);
1373 KDB_STATE_SET(DOING_SSB);
1374 break;
1375 }
1376
1377 if (result == KDB_CMD_KGDB) { 1377 if (result == KDB_CMD_KGDB) {
1378 if (!KDB_STATE(DOING_KGDB)) 1378 if (!KDB_STATE(DOING_KGDB))
1379 kdb_printf("Entering please attach debugger " 1379 kdb_printf("Entering please attach debugger "
@@ -1970,6 +1970,8 @@ static int kdb_lsmod(int argc, const char **argv)
1970 1970
1971 kdb_printf("Module Size modstruct Used by\n"); 1971 kdb_printf("Module Size modstruct Used by\n");
1972 list_for_each_entry(mod, kdb_modules, list) { 1972 list_for_each_entry(mod, kdb_modules, list) {
1973 if (mod->state == MODULE_STATE_UNFORMED)
1974 continue;
1973 1975
1974 kdb_printf("%-20s%8u 0x%p ", mod->name, 1976 kdb_printf("%-20s%8u 0x%p ", mod->name,
1975 mod->core_size, (void *)mod); 1977 mod->core_size, (void *)mod);
@@ -2348,69 +2350,6 @@ static int kdb_pid(int argc, const char **argv)
2348 return 0; 2350 return 0;
2349} 2351}
2350 2352
2351/*
2352 * kdb_ll - This function implements the 'll' command which follows a
2353 * linked list and executes an arbitrary command for each
2354 * element.
2355 */
2356static int kdb_ll(int argc, const char **argv)
2357{
2358 int diag = 0;
2359 unsigned long addr;
2360 long offset = 0;
2361 unsigned long va;
2362 unsigned long linkoffset;
2363 int nextarg;
2364 const char *command;
2365
2366 if (argc != 3)
2367 return KDB_ARGCOUNT;
2368
2369 nextarg = 1;
2370 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
2371 if (diag)
2372 return diag;
2373
2374 diag = kdbgetularg(argv[2], &linkoffset);
2375 if (diag)
2376 return diag;
2377
2378 /*
2379 * Using the starting address as
2380 * the first element in the list, and assuming that
2381 * the list ends with a null pointer.
2382 */
2383
2384 va = addr;
2385 command = kdb_strdup(argv[3], GFP_KDB);
2386 if (!command) {
2387 kdb_printf("%s: cannot duplicate command\n", __func__);
2388 return 0;
2389 }
2390 /* Recursive use of kdb_parse, do not use argv after this point */
2391 argv = NULL;
2392
2393 while (va) {
2394 char buf[80];
2395
2396 if (KDB_FLAG(CMD_INTERRUPT))
2397 goto out;
2398
2399 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2400 diag = kdb_parse(buf);
2401 if (diag)
2402 goto out;
2403
2404 addr = va + linkoffset;
2405 if (kdb_getword(&va, addr, sizeof(va)))
2406 goto out;
2407 }
2408
2409out:
2410 kfree(command);
2411 return diag;
2412}
2413
2414static int kdb_kgdb(int argc, const char **argv) 2353static int kdb_kgdb(int argc, const char **argv)
2415{ 2354{
2416 return KDB_CMD_KGDB; 2355 return KDB_CMD_KGDB;
@@ -2428,11 +2367,15 @@ static int kdb_help(int argc, const char **argv)
2428 kdb_printf("-----------------------------" 2367 kdb_printf("-----------------------------"
2429 "-----------------------------\n"); 2368 "-----------------------------\n");
2430 for_each_kdbcmd(kt, i) { 2369 for_each_kdbcmd(kt, i) {
2431 if (kt->cmd_name) 2370 char *space = "";
2432 kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
2433 kt->cmd_usage, kt->cmd_help);
2434 if (KDB_FLAG(CMD_INTERRUPT)) 2371 if (KDB_FLAG(CMD_INTERRUPT))
2435 return 0; 2372 return 0;
2373 if (!kt->cmd_name)
2374 continue;
2375 if (strlen(kt->cmd_usage) > 20)
2376 space = "\n ";
2377 kdb_printf("%-15.15s %-20s%s%s\n", kt->cmd_name,
2378 kt->cmd_usage, space, kt->cmd_help);
2436 } 2379 }
2437 return 0; 2380 return 0;
2438} 2381}
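
The reworked kdb_help() loop above pushes any usage string wider than its 20-character column onto an indented continuation line so the help text still lines up. A small userspace demo of that formatting decision, with illustrative command data:

#include <stdio.h>
#include <string.h>

static void print_row(const char *name, const char *usage, const char *help)
{
        const char *space = "";

        if (strlen(usage) > 20)
                space = "\n                                    ";
        printf("%-15.15s %-20s%s%s\n", name, usage, space, help);
}

int main(void)
{
        print_row("ss", "", "Single Step");
        print_row("bta", "[D|R|S|T|C|Z|E|U|I|M|A]",
                  "Backtrace all processes matching state flag");
        return 0;
}
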
@@ -2737,7 +2680,7 @@ int kdb_register_repeat(char *cmd,
2737 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); 2680 (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new));
2738 kfree(kdb_commands); 2681 kfree(kdb_commands);
2739 } 2682 }
2740 memset(new + kdb_max_commands, 0, 2683 memset(new + kdb_max_commands - KDB_BASE_CMD_MAX, 0,
2741 kdb_command_extend * sizeof(*new)); 2684 kdb_command_extend * sizeof(*new));
2742 kdb_commands = new; 2685 kdb_commands = new;
2743 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; 2686 kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX;
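
The memset() fix above zeroes only the freshly added tail of the grown command table, starting right after the copied entries instead of KDB_BASE_CMD_MAX slots beyond them. A simplified userspace version of the grow/copy/zero pattern, without the KDB index offset:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct entry {
        const char *name;
};

int main(void)
{
        size_t old_n = 4, extend = 8;
        struct entry *old = calloc(old_n, sizeof(*old));
        struct entry *new = malloc((old_n + extend) * sizeof(*new));

        if (!old || !new) {
                free(old);
                free(new);
                return 1;
        }

        memcpy(new, old, old_n * sizeof(*new));
        /*
         * Zero exactly the new slots: start at new + old_n; starting any
         * further out would run past what was just allocated.
         */
        memset(new + old_n, 0, extend * sizeof(*new));
        free(old);

        printf("table grown to %zu slots\n", old_n + extend);
        free(new);
        return 0;
}
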
@@ -2841,15 +2784,13 @@ static void __init kdb_inittab(void)
2841 "Stack traceback", 1, KDB_REPEAT_NONE); 2784 "Stack traceback", 1, KDB_REPEAT_NONE);
2842 kdb_register_repeat("btp", kdb_bt, "<pid>", 2785 kdb_register_repeat("btp", kdb_bt, "<pid>",
2843 "Display stack for process <pid>", 0, KDB_REPEAT_NONE); 2786 "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
2844 kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]", 2787 kdb_register_repeat("bta", kdb_bt, "[D|R|S|T|C|Z|E|U|I|M|A]",
2845 "Display stack all processes", 0, KDB_REPEAT_NONE); 2788 "Backtrace all processes matching state flag", 0, KDB_REPEAT_NONE);
2846 kdb_register_repeat("btc", kdb_bt, "", 2789 kdb_register_repeat("btc", kdb_bt, "",
2847 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); 2790 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
2848 kdb_register_repeat("btt", kdb_bt, "<vaddr>", 2791 kdb_register_repeat("btt", kdb_bt, "<vaddr>",
2849 "Backtrace process given its struct task address", 0, 2792 "Backtrace process given its struct task address", 0,
2850 KDB_REPEAT_NONE); 2793 KDB_REPEAT_NONE);
2851 kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
2852 "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
2853 kdb_register_repeat("env", kdb_env, "", 2794 kdb_register_repeat("env", kdb_env, "",
2854 "Show environment variables", 0, KDB_REPEAT_NONE); 2795 "Show environment variables", 0, KDB_REPEAT_NONE);
2855 kdb_register_repeat("set", kdb_set, "", 2796 kdb_register_repeat("set", kdb_set, "",
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 392ec6a25844..7afd3c8c41d5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -19,7 +19,6 @@
19#define KDB_CMD_GO (-1001) 19#define KDB_CMD_GO (-1001)
20#define KDB_CMD_CPU (-1002) 20#define KDB_CMD_CPU (-1002)
21#define KDB_CMD_SS (-1003) 21#define KDB_CMD_SS (-1003)
22#define KDB_CMD_SSB (-1004)
23#define KDB_CMD_KGDB (-1005) 22#define KDB_CMD_KGDB (-1005)
24 23
25/* Internal debug flags */ 24/* Internal debug flags */
@@ -125,8 +124,6 @@ extern int kdb_state;
125 * kdb control */ 124 * kdb control */
126#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ 125#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
127#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ 126#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
128#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
129 * DOING_SS is also set */
130#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint 127#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
131 * after one ss, independent of 128 * after one ss, independent of
132 * DOING_SS */ 129 * DOING_SS */
@@ -191,7 +188,6 @@ extern void kdb_bp_remove(void);
191typedef enum { 188typedef enum {
192 KDB_DB_BPT, /* Breakpoint */ 189 KDB_DB_BPT, /* Breakpoint */
193 KDB_DB_SS, /* Single-step trap */ 190 KDB_DB_SS, /* Single-step trap */
194 KDB_DB_SSB, /* Single step to branch */
195 KDB_DB_SSBPT, /* Single step over breakpoint */ 191 KDB_DB_SSBPT, /* Single step over breakpoint */
196 KDB_DB_NOBPT /* Spurious breakpoint */ 192 KDB_DB_NOBPT /* Spurious breakpoint */
197} kdb_dbtrap_t; 193} kdb_dbtrap_t;
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 418b3f7053aa..d473988c1d0b 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -106,6 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
106 unsigned long long t2, t3; 106 unsigned long long t2, t3;
107 unsigned long flags; 107 unsigned long flags;
108 struct timespec ts; 108 struct timespec ts;
109 cputime_t utime, stime, stimescaled, utimescaled;
109 110
110 /* Though tsk->delays accessed later, early exit avoids 111 /* Though tsk->delays accessed later, early exit avoids
111 * unnecessary returning of other data 112 * unnecessary returning of other data
@@ -114,12 +115,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
114 goto done; 115 goto done;
115 116
116 tmp = (s64)d->cpu_run_real_total; 117 tmp = (s64)d->cpu_run_real_total;
117 cputime_to_timespec(tsk->utime + tsk->stime, &ts); 118 task_cputime(tsk, &utime, &stime);
119 cputime_to_timespec(utime + stime, &ts);
118 tmp += timespec_to_ns(&ts); 120 tmp += timespec_to_ns(&ts);
119 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; 121 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
120 122
121 tmp = (s64)d->cpu_scaled_run_real_total; 123 tmp = (s64)d->cpu_scaled_run_real_total;
122 cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); 124 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
125 cputime_to_timespec(utimescaled + stimescaled, &ts);
123 tmp += timespec_to_ns(&ts); 126 tmp += timespec_to_ns(&ts);
124 d->cpu_scaled_run_real_total = 127 d->cpu_scaled_run_real_total =
125 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; 128 (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
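
The delayacct hunk reads CPU times through the task_cputime() / task_cputime_scaled() accessors instead of dereferencing tsk->utime and friends, so the underlying representation can change (for example, folding in time that has not been accounted into the raw fields yet) without touching every reader. A generic sketch of why such accessors help; this is a simplification, not the kernel helpers:

#include <stdio.h>

struct task_times {
        unsigned long long utime;       /* already accounted */
        unsigned long long stime;
        unsigned long long pending;     /* not folded into the fields yet */
};

static void task_cputime_get(const struct task_times *t,
                             unsigned long long *utime,
                             unsigned long long *stime)
{
        /* the accessor can include pending time; raw field reads cannot */
        *utime = t->utime + t->pending;
        *stime = t->stime;
}

int main(void)
{
        struct task_times t = { .utime = 100, .stime = 40, .pending = 7 };
        unsigned long long u, s;

        task_cputime_get(&t, &u, &s);
        printf("utime=%llu stime=%llu total=%llu\n", u, s, u + s);
        return 0;
}
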
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dbccf83c134d..7e0962ed7f8a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -908,6 +908,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
908} 908}
909 909
910/* 910/*
911 * Initialize event state based on the perf_event_attr::disabled.
912 */
913static inline void perf_event__state_init(struct perf_event *event)
914{
915 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
916 PERF_EVENT_STATE_INACTIVE;
917}
918
919/*
911 * Called at perf_event creation and when events are attached/detached from a 920 * Called at perf_event creation and when events are attached/detached from a
912 * group. 921 * group.
913 */ 922 */
@@ -3682,7 +3691,7 @@ unlock:
3682 3691
3683static int perf_fasync(int fd, struct file *filp, int on) 3692static int perf_fasync(int fd, struct file *filp, int on)
3684{ 3693{
3685 struct inode *inode = filp->f_path.dentry->d_inode; 3694 struct inode *inode = file_inode(filp);
3686 struct perf_event *event = filp->private_data; 3695 struct perf_event *event = filp->private_data;
3687 int retval; 3696 int retval;
3688 3697
@@ -4425,12 +4434,15 @@ static void perf_event_task_event(struct perf_task_event *task_event)
4425 if (ctxn < 0) 4434 if (ctxn < 0)
4426 goto next; 4435 goto next;
4427 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 4436 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4437 if (ctx)
4438 perf_event_task_ctx(ctx, task_event);
4428 } 4439 }
4429 if (ctx)
4430 perf_event_task_ctx(ctx, task_event);
4431next: 4440next:
4432 put_cpu_ptr(pmu->pmu_cpu_context); 4441 put_cpu_ptr(pmu->pmu_cpu_context);
4433 } 4442 }
4443 if (task_event->task_ctx)
4444 perf_event_task_ctx(task_event->task_ctx, task_event);
4445
4434 rcu_read_unlock(); 4446 rcu_read_unlock();
4435} 4447}
4436 4448
@@ -4725,7 +4737,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4725 } else { 4737 } else {
4726 if (arch_vma_name(mmap_event->vma)) { 4738 if (arch_vma_name(mmap_event->vma)) {
4727 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 4739 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4728 sizeof(tmp)); 4740 sizeof(tmp) - 1);
4741 tmp[sizeof(tmp) - 1] = '\0';
4729 goto got_name; 4742 goto got_name;
4730 } 4743 }
4731 4744
@@ -5117,7 +5130,6 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5117{ 5130{
5118 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5131 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5119 struct perf_event *event; 5132 struct perf_event *event;
5120 struct hlist_node *node;
5121 struct hlist_head *head; 5133 struct hlist_head *head;
5122 5134
5123 rcu_read_lock(); 5135 rcu_read_lock();
@@ -5125,7 +5137,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5125 if (!head) 5137 if (!head)
5126 goto end; 5138 goto end;
5127 5139
5128 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5140 hlist_for_each_entry_rcu(event, head, hlist_entry) {
5129 if (perf_swevent_match(event, type, event_id, data, regs)) 5141 if (perf_swevent_match(event, type, event_id, data, regs))
5130 perf_swevent_event(event, nr, data, regs); 5142 perf_swevent_event(event, nr, data, regs);
5131 } 5143 }
@@ -5410,7 +5422,6 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5410{ 5422{
5411 struct perf_sample_data data; 5423 struct perf_sample_data data;
5412 struct perf_event *event; 5424 struct perf_event *event;
5413 struct hlist_node *node;
5414 5425
5415 struct perf_raw_record raw = { 5426 struct perf_raw_record raw = {
5416 .size = entry_size, 5427 .size = entry_size,
@@ -5420,7 +5431,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5420 perf_sample_data_init(&data, addr, 0); 5431 perf_sample_data_init(&data, addr, 0);
5421 data.raw = &raw; 5432 data.raw = &raw;
5422 5433
5423 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5434 hlist_for_each_entry_rcu(event, head, hlist_entry) {
5424 if (perf_tp_event_match(event, &data, regs)) 5435 if (perf_tp_event_match(event, &data, regs))
5425 perf_swevent_event(event, count, &data, regs); 5436 perf_swevent_event(event, count, &data, regs);
5426 } 5437 }
@@ -5640,6 +5651,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
5640 event->attr.sample_period = NSEC_PER_SEC / freq; 5651 event->attr.sample_period = NSEC_PER_SEC / freq;
5641 hwc->sample_period = event->attr.sample_period; 5652 hwc->sample_period = event->attr.sample_period;
5642 local64_set(&hwc->period_left, hwc->sample_period); 5653 local64_set(&hwc->period_left, hwc->sample_period);
5654 hwc->last_period = hwc->sample_period;
5643 event->attr.freq = 0; 5655 event->attr.freq = 0;
5644 } 5656 }
5645} 5657}
@@ -5956,13 +5968,9 @@ int perf_pmu_register(struct pmu *pmu, char *name, int type)
5956 pmu->name = name; 5968 pmu->name = name;
5957 5969
5958 if (type < 0) { 5970 if (type < 0) {
5959 int err = idr_pre_get(&pmu_idr, GFP_KERNEL); 5971 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
5960 if (!err) 5972 if (type < 0) {
5961 goto free_pdc; 5973 ret = type;
5962
5963 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5964 if (err) {
5965 ret = err;
5966 goto free_pdc; 5974 goto free_pdc;
5967 } 5975 }
5968 } 5976 }
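
perf_pmu_register() now obtains a dynamic PMU type from a single idr_alloc() call in place of the old idr_pre_get()/idr_get_new_above() two-step. A userspace analogue of the semantics it leans on, handing out the lowest free integer id at or above a starting value (toy code, not the kernel IDR):

#include <stdio.h>

#define MAX_IDS 64

static int used[MAX_IDS];

static int toy_id_alloc(int start)
{
        for (int id = start; id < MAX_IDS; id++) {
                if (!used[id]) {
                        used[id] = 1;
                        return id;
                }
        }
        return -1;      /* stands in for a negative errno */
}

int main(void)
{
        /* dynamic ids begin above the fixed built-in values; 6 is only
         * an example starting point */
        int first = toy_id_alloc(6);
        int second = toy_id_alloc(6);

        printf("allocated %d and %d\n", first, second);
        return 0;
}
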
@@ -5979,6 +5987,7 @@ skip_type:
5979 if (pmu->pmu_cpu_context) 5987 if (pmu->pmu_cpu_context)
5980 goto got_cpu_context; 5988 goto got_cpu_context;
5981 5989
5990 ret = -ENOMEM;
5982 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 5991 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5983 if (!pmu->pmu_cpu_context) 5992 if (!pmu->pmu_cpu_context)
5984 goto free_dev; 5993 goto free_dev;
@@ -6155,18 +6164,21 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6155 6164
6156 event->parent = parent_event; 6165 event->parent = parent_event;
6157 6166
6158 event->ns = get_pid_ns(current->nsproxy->pid_ns); 6167 event->ns = get_pid_ns(task_active_pid_ns(current));
6159 event->id = atomic64_inc_return(&perf_event_id); 6168 event->id = atomic64_inc_return(&perf_event_id);
6160 6169
6161 event->state = PERF_EVENT_STATE_INACTIVE; 6170 event->state = PERF_EVENT_STATE_INACTIVE;
6162 6171
6163 if (task) { 6172 if (task) {
6164 event->attach_state = PERF_ATTACH_TASK; 6173 event->attach_state = PERF_ATTACH_TASK;
6174
6175 if (attr->type == PERF_TYPE_TRACEPOINT)
6176 event->hw.tp_target = task;
6165#ifdef CONFIG_HAVE_HW_BREAKPOINT 6177#ifdef CONFIG_HAVE_HW_BREAKPOINT
6166 /* 6178 /*
6167 * hw_breakpoint is a bit difficult here.. 6179 * hw_breakpoint is a bit difficult here..
6168 */ 6180 */
6169 if (attr->type == PERF_TYPE_BREAKPOINT) 6181 else if (attr->type == PERF_TYPE_BREAKPOINT)
6170 event->hw.bp_target = task; 6182 event->hw.bp_target = task;
6171#endif 6183#endif
6172 } 6184 }
@@ -6179,8 +6191,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6179 event->overflow_handler = overflow_handler; 6191 event->overflow_handler = overflow_handler;
6180 event->overflow_handler_context = context; 6192 event->overflow_handler_context = context;
6181 6193
6182 if (attr->disabled) 6194 perf_event__state_init(event);
6183 event->state = PERF_EVENT_STATE_OFF;
6184 6195
6185 pmu = NULL; 6196 pmu = NULL;
6186 6197
@@ -6609,9 +6620,17 @@ SYSCALL_DEFINE5(perf_event_open,
6609 6620
6610 mutex_lock(&gctx->mutex); 6621 mutex_lock(&gctx->mutex);
6611 perf_remove_from_context(group_leader); 6622 perf_remove_from_context(group_leader);
6623
6624 /*
6625 * Removing it from the context leaves the event disabled. What
6626 * we want here is the event in its initial startup state, ready
6627 * to be added into the new context.
6628 */
6629 perf_event__state_init(group_leader);
6612 list_for_each_entry(sibling, &group_leader->sibling_list, 6630 list_for_each_entry(sibling, &group_leader->sibling_list,
6613 group_entry) { 6631 group_entry) {
6614 perf_remove_from_context(sibling); 6632 perf_remove_from_context(sibling);
6633 perf_event__state_init(sibling);
6615 put_ctx(gctx); 6634 put_ctx(gctx);
6616 } 6635 }
6617 mutex_unlock(&gctx->mutex); 6636 mutex_unlock(&gctx->mutex);
@@ -7434,7 +7453,7 @@ unlock:
7434device_initcall(perf_event_sysfs_init); 7453device_initcall(perf_event_sysfs_init);
7435 7454
7436#ifdef CONFIG_CGROUP_PERF 7455#ifdef CONFIG_CGROUP_PERF
7437static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) 7456static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7438{ 7457{
7439 struct perf_cgroup *jc; 7458 struct perf_cgroup *jc;
7440 7459
@@ -7451,7 +7470,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
7451 return &jc->css; 7470 return &jc->css;
7452} 7471}
7453 7472
7454static void perf_cgroup_destroy(struct cgroup *cont) 7473static void perf_cgroup_css_free(struct cgroup *cont)
7455{ 7474{
7456 struct perf_cgroup *jc; 7475 struct perf_cgroup *jc;
7457 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7476 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7492,8 +7511,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7492struct cgroup_subsys perf_subsys = { 7511struct cgroup_subsys perf_subsys = {
7493 .name = "perf_event", 7512 .name = "perf_event",
7494 .subsys_id = perf_subsys_id, 7513 .subsys_id = perf_subsys_id,
7495 .create = perf_cgroup_create, 7514 .css_alloc = perf_cgroup_css_alloc,
7496 .destroy = perf_cgroup_destroy, 7515 .css_free = perf_cgroup_css_free,
7497 .exit = perf_cgroup_exit, 7516 .exit = perf_cgroup_exit,
7498 .attach = perf_cgroup_attach, 7517 .attach = perf_cgroup_attach,
7499 7518
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 9a7b487c6fe2..a64f8aeb5c1f 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -111,14 +111,16 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
111 * Count the number of breakpoints of the same type and same task. 111 * Count the number of breakpoints of the same type and same task.
112 * The given event must be not on the list. 112 * The given event must be not on the list.
113 */ 113 */
114static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) 114static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
115{ 115{
116 struct task_struct *tsk = bp->hw.bp_target; 116 struct task_struct *tsk = bp->hw.bp_target;
117 struct perf_event *iter; 117 struct perf_event *iter;
118 int count = 0; 118 int count = 0;
119 119
120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) { 120 list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
121 if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) 121 if (iter->hw.bp_target == tsk &&
122 find_slot_idx(iter) == type &&
123 cpu == iter->cpu)
122 count += hw_breakpoint_weight(iter); 124 count += hw_breakpoint_weight(iter);
123 } 125 }
124 126
@@ -141,7 +143,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
141 if (!tsk) 143 if (!tsk)
142 slots->pinned += max_task_bp_pinned(cpu, type); 144 slots->pinned += max_task_bp_pinned(cpu, type);
143 else 145 else
144 slots->pinned += task_bp_pinned(bp, type); 146 slots->pinned += task_bp_pinned(cpu, bp, type);
145 slots->flexible = per_cpu(nr_bp_flexible[type], cpu); 147 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
146 148
147 return; 149 return;
@@ -154,7 +156,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
154 if (!tsk) 156 if (!tsk)
155 nr += max_task_bp_pinned(cpu, type); 157 nr += max_task_bp_pinned(cpu, type);
156 else 158 else
157 nr += task_bp_pinned(bp, type); 159 nr += task_bp_pinned(cpu, bp, type);
158 160
159 if (nr > slots->pinned) 161 if (nr > slots->pinned)
160 slots->pinned = nr; 162 slots->pinned = nr;
@@ -188,7 +190,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
188 int old_idx = 0; 190 int old_idx = 0;
189 int idx = 0; 191 int idx = 0;
190 192
191 old_count = task_bp_pinned(bp, type); 193 old_count = task_bp_pinned(cpu, bp, type);
192 old_idx = old_count - 1; 194 old_idx = old_count - 1;
193 idx = old_idx + weight; 195 idx = old_idx + weight;
194 196
@@ -674,7 +676,7 @@ int __init init_hw_breakpoint(void)
674 err_alloc: 676 err_alloc:
675 for_each_possible_cpu(err_cpu) { 677 for_each_possible_cpu(err_cpu) {
676 for (i = 0; i < TYPE_MAX; i++) 678 for (i = 0; i < TYPE_MAX; i++)
677 kfree(per_cpu(nr_task_bp_pinned[i], cpu)); 679 kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
678 if (err_cpu == cpu) 680 if (err_cpu == cpu)
679 break; 681 break;
680 } 682 }
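
The err_alloc fix above frees per-cpu buffers using the cleanup loop's own variable (err_cpu); indexing with the outer cpu freed the same CPU's buffers over and over and leaked the rest. The same shape of partial-failure cleanup in plain C, with hypothetical buffers:

#include <stdio.h>
#include <stdlib.h>

#define NSLOTS 8

int main(void)
{
        void *buf[NSLOTS] = { 0 };
        int i;

        for (i = 0; i < NSLOTS; i++) {
                buf[i] = malloc(128);
                if (!buf[i])
                        break;
        }

        if (i < NSLOTS) {
                int err_i;

                /*
                 * Cleanup must walk its own index; freeing buf[i] here in
                 * a loop would release one slot repeatedly and leak the
                 * others.
                 */
                for (err_i = 0; err_i < i; err_i++)
                        free(buf[err_i]);
                return 1;
        }

        printf("all %d slots allocated\n", NSLOTS);
        for (i = 0; i < NSLOTS; i++)
                free(buf[i]);
        return 0;
}
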
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c99a8b..eb675c4d59df 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -16,7 +16,7 @@ struct ring_buffer {
16 int page_order; /* allocation order */ 16 int page_order; /* allocation order */
17#endif 17#endif
18 int nr_pages; /* nr of data pages */ 18 int nr_pages; /* nr of data pages */
19 int writable; /* are we writable */ 19 int overwrite; /* can overwrite itself */
20 20
21 atomic_t poll; /* POLL_ for wakeups */ 21 atomic_t poll; /* POLL_ for wakeups */
22 22
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34ff3973..97fddb09762b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -18,12 +18,24 @@
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, 18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head) 19 unsigned long offset, unsigned long head)
20{ 20{
21 unsigned long mask; 21 unsigned long sz = perf_data_size(rb);
22 unsigned long mask = sz - 1;
22 23
23 if (!rb->writable) 24 /*
25 * check if user-writable
26 * overwrite : over-write its own tail
27 * !overwrite: buffer possibly drops events.
28 */
29 if (rb->overwrite)
24 return true; 30 return true;
25 31
26 mask = perf_data_size(rb) - 1; 32 /*
33 * verify that payload is not bigger than buffer
34 * otherwise masking logic may fail to detect
35 * the "not enough space" condition
36 */
37 if ((head - offset) > sz)
38 return false;
27 39
28 offset = (offset - tail) & mask; 40 offset = (offset - tail) & mask;
29 head = (head - tail) & mask; 41 head = (head - tail) & mask;
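
The added (head - offset) > sz test matters because the masking right below computes distances modulo the buffer size: once a single request is larger than the whole buffer, the masked distance wraps around and can look harmless. A quick arithmetic illustration with made-up numbers:

#include <stdio.h>

int main(void)
{
        unsigned long size = 4096, mask = size - 1;
        unsigned long offset = 100;
        unsigned long head = offset + size + 50;  /* bigger than the buffer */

        /* after masking, the distance "looks" like only 50 bytes... */
        printf("masked distance: %lu\n", (head - offset) & mask);
        /* ...so the size check has to happen before the masking */
        printf("actual distance: %lu\n", head - offset);
        return 0;
}
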
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
212 rb->watermark = max_size / 2; 224 rb->watermark = max_size / 2;
213 225
214 if (flags & RING_BUFFER_WRITABLE) 226 if (flags & RING_BUFFER_WRITABLE)
215 rb->writable = 1; 227 rb->overwrite = 0;
228 else
229 rb->overwrite = 1;
216 230
217 atomic_set(&rb->refcount, 1); 231 atomic_set(&rb->refcount, 1);
218 232
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 5cc4e7e42e68..a567c8c7ef31 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,12 +27,14 @@
27#include <linux/pagemap.h> /* read_mapping_page */ 27#include <linux/pagemap.h> /* read_mapping_page */
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/export.h>
30#include <linux/rmap.h> /* anon_vma_prepare */ 31#include <linux/rmap.h> /* anon_vma_prepare */
31#include <linux/mmu_notifier.h> /* set_pte_at_notify */ 32#include <linux/mmu_notifier.h> /* set_pte_at_notify */
32#include <linux/swap.h> /* try_to_free_swap */ 33#include <linux/swap.h> /* try_to_free_swap */
33#include <linux/ptrace.h> /* user_enable_single_step */ 34#include <linux/ptrace.h> /* user_enable_single_step */
34#include <linux/kdebug.h> /* notifier mechanism */ 35#include <linux/kdebug.h> /* notifier mechanism */
35#include "../../mm/internal.h" /* munlock_vma_page */ 36#include "../../mm/internal.h" /* munlock_vma_page */
37#include <linux/percpu-rwsem.h>
36 38
37#include <linux/uprobes.h> 39#include <linux/uprobes.h>
38 40
@@ -40,56 +42,31 @@
40#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE 42#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
41 43
42static struct rb_root uprobes_tree = RB_ROOT; 44static struct rb_root uprobes_tree = RB_ROOT;
43
44static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
45
46#define UPROBES_HASH_SZ 13
47
48/* 45/*
49 * We need separate register/unregister and mmap/munmap lock hashes because 46 * allows us to skip the uprobe_mmap if there are no uprobe events active
50 * of mmap_sem nesting. 47 * at this time. Probably a fine grained per inode count is better?
51 *
52 * uprobe_register() needs to install probes on (potentially) all processes
53 * and thus needs to acquire multiple mmap_sems (consequtively, not
54 * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
55 * for the particular process doing the mmap.
56 *
57 * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
58 * because of lock order against i_mmap_mutex. This means there's a hole in
59 * the register vma iteration where a mmap() can happen.
60 *
61 * Thus uprobe_register() can race with uprobe_mmap() and we can try and
62 * install a probe where one is already installed.
63 */ 48 */
49#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
64 50
65/* serialize (un)register */ 51static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
66static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
67
68#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
69 52
53#define UPROBES_HASH_SZ 13
70/* serialize uprobe->pending_list */ 54/* serialize uprobe->pending_list */
71static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; 55static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
72#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) 56#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
73 57
74/* 58static struct percpu_rw_semaphore dup_mmap_sem;
75 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
76 * events active at this time. Probably a fine grained per inode count is
77 * better?
78 */
79static atomic_t uprobe_events = ATOMIC_INIT(0);
80 59
81/* Have a copy of original instruction */ 60/* Have a copy of original instruction */
82#define UPROBE_COPY_INSN 0 61#define UPROBE_COPY_INSN 0
83/* Dont run handlers when first register/ last unregister in progress*/
84#define UPROBE_RUN_HANDLER 1
85/* Can skip singlestep */ 62/* Can skip singlestep */
86#define UPROBE_SKIP_SSTEP 2 63#define UPROBE_SKIP_SSTEP 1
87 64
88struct uprobe { 65struct uprobe {
89 struct rb_node rb_node; /* node in the rb tree */ 66 struct rb_node rb_node; /* node in the rb tree */
90 atomic_t ref; 67 atomic_t ref;
68 struct rw_semaphore register_rwsem;
91 struct rw_semaphore consumer_rwsem; 69 struct rw_semaphore consumer_rwsem;
92 struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
93 struct list_head pending_list; 70 struct list_head pending_list;
94 struct uprobe_consumer *consumers; 71 struct uprobe_consumer *consumers;
95 struct inode *inode; /* Also hold a ref to inode */ 72 struct inode *inode; /* Also hold a ref to inode */
@@ -427,9 +404,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
427 u = __insert_uprobe(uprobe); 404 u = __insert_uprobe(uprobe);
428 spin_unlock(&uprobes_treelock); 405 spin_unlock(&uprobes_treelock);
429 406
430 /* For now assume that the instruction need not be single-stepped */
431 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
432
433 return u; 407 return u;
434} 408}
435 409
@@ -449,8 +423,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
449 423
450 uprobe->inode = igrab(inode); 424 uprobe->inode = igrab(inode);
451 uprobe->offset = offset; 425 uprobe->offset = offset;
426 init_rwsem(&uprobe->register_rwsem);
452 init_rwsem(&uprobe->consumer_rwsem); 427 init_rwsem(&uprobe->consumer_rwsem);
453 mutex_init(&uprobe->copy_mutex); 428 /* For now assume that the instruction need not be single-stepped */
429 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
454 430
455 /* add to uprobes_tree, sorted on inode:offset */ 431 /* add to uprobes_tree, sorted on inode:offset */
456 cur_uprobe = insert_uprobe(uprobe); 432 cur_uprobe = insert_uprobe(uprobe);
@@ -460,38 +436,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
460 kfree(uprobe); 436 kfree(uprobe);
461 uprobe = cur_uprobe; 437 uprobe = cur_uprobe;
462 iput(inode); 438 iput(inode);
463 } else {
464 atomic_inc(&uprobe_events);
465 } 439 }
466 440
467 return uprobe; 441 return uprobe;
468} 442}
469 443
470static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) 444static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
471{
472 struct uprobe_consumer *uc;
473
474 if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
475 return;
476
477 down_read(&uprobe->consumer_rwsem);
478 for (uc = uprobe->consumers; uc; uc = uc->next) {
479 if (!uc->filter || uc->filter(uc, current))
480 uc->handler(uc, regs);
481 }
482 up_read(&uprobe->consumer_rwsem);
483}
484
485/* Returns the previous consumer */
486static struct uprobe_consumer *
487consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
488{ 445{
489 down_write(&uprobe->consumer_rwsem); 446 down_write(&uprobe->consumer_rwsem);
490 uc->next = uprobe->consumers; 447 uc->next = uprobe->consumers;
491 uprobe->consumers = uc; 448 uprobe->consumers = uc;
492 up_write(&uprobe->consumer_rwsem); 449 up_write(&uprobe->consumer_rwsem);
493
494 return uc->next;
495} 450}
496 451
497/* 452/*
@@ -585,7 +540,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
585 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) 540 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
586 return ret; 541 return ret;
587 542
588 mutex_lock(&uprobe->copy_mutex); 543 /* TODO: move this into _register, until then we abuse this sem. */
544 down_write(&uprobe->consumer_rwsem);
589 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) 545 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
590 goto out; 546 goto out;
591 547
@@ -609,7 +565,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
609 set_bit(UPROBE_COPY_INSN, &uprobe->flags); 565 set_bit(UPROBE_COPY_INSN, &uprobe->flags);
610 566
611 out: 567 out:
612 mutex_unlock(&uprobe->copy_mutex); 568 up_write(&uprobe->consumer_rwsem);
569
570 return ret;
571}
572
573static inline bool consumer_filter(struct uprobe_consumer *uc,
574 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
575{
576 return !uc->filter || uc->filter(uc, ctx, mm);
577}
578
579static bool filter_chain(struct uprobe *uprobe,
580 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
581{
582 struct uprobe_consumer *uc;
583 bool ret = false;
584
585 down_read(&uprobe->consumer_rwsem);
586 for (uc = uprobe->consumers; uc; uc = uc->next) {
587 ret = consumer_filter(uc, ctx, mm);
588 if (ret)
589 break;
590 }
591 up_read(&uprobe->consumer_rwsem);
613 592
614 return ret; 593 return ret;
615} 594}
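As an illustration of the new filter contract used by consumer_filter()/filter_chain() above, here is a minimal consumer sketch written against the signatures visible in this hunk; the demo_* names and the demo_watched_mm variable are invented for the example and are not part of this patch.

static struct mm_struct *demo_watched_mm;	/* assumed to be set up elsewhere */

/* New-style filter: consulted per mm; returning false lets
 * consumer_filter()/filter_chain() skip or strip the breakpoint. */
static bool demo_filter(struct uprobe_consumer *self,
			enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
	return mm == demo_watched_mm;
}

static int demo_handler(struct uprobe_consumer *self, struct pt_regs *regs)
{
	pr_info("uprobe hit at %lx\n", instruction_pointer(regs));
	return 0;		/* 0: keep the breakpoint installed */
}

static struct uprobe_consumer demo_consumer = {
	.handler = demo_handler,
	.filter  = demo_filter,
};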
@@ -621,16 +600,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
621 bool first_uprobe; 600 bool first_uprobe;
622 int ret; 601 int ret;
623 602
624 /*
625 * If probe is being deleted, unregister thread could be done with
626 * the vma-rmap-walk through. Adding a probe now can be fatal since
627 * nobody will be able to cleanup. Also we could be from fork or
628 * mremap path, where the probe might have already been inserted.
629 * Hence behave as if probe already existed.
630 */
631 if (!uprobe->consumers)
632 return 0;
633
634 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); 603 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
635 if (ret) 604 if (ret)
636 return ret; 605 return ret;
@@ -655,14 +624,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
655static int 624static int
656remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) 625remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
657{ 626{
658 /* can happen if uprobe_register() fails */
659 if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
660 return 0;
661
662 set_bit(MMF_RECALC_UPROBES, &mm->flags); 627 set_bit(MMF_RECALC_UPROBES, &mm->flags);
663 return set_orig_insn(&uprobe->arch, mm, vaddr); 628 return set_orig_insn(&uprobe->arch, mm, vaddr);
664} 629}
665 630
631static inline bool uprobe_is_active(struct uprobe *uprobe)
632{
633 return !RB_EMPTY_NODE(&uprobe->rb_node);
634}
666/* 635/*
667 * There could be threads that have already hit the breakpoint. They 636 * There could be threads that have already hit the breakpoint. They
668 * will recheck the current insn and restart if find_uprobe() fails. 637 * will recheck the current insn and restart if find_uprobe() fails.
@@ -670,12 +639,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
670 */ 639 */
671static void delete_uprobe(struct uprobe *uprobe) 640static void delete_uprobe(struct uprobe *uprobe)
672{ 641{
642 if (WARN_ON(!uprobe_is_active(uprobe)))
643 return;
644
673 spin_lock(&uprobes_treelock); 645 spin_lock(&uprobes_treelock);
674 rb_erase(&uprobe->rb_node, &uprobes_tree); 646 rb_erase(&uprobe->rb_node, &uprobes_tree);
675 spin_unlock(&uprobes_treelock); 647 spin_unlock(&uprobes_treelock);
648 RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
676 iput(uprobe->inode); 649 iput(uprobe->inode);
677 put_uprobe(uprobe); 650 put_uprobe(uprobe);
678 atomic_dec(&uprobe_events);
679} 651}
680 652
681struct map_info { 653struct map_info {
@@ -761,15 +733,20 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
761 return curr; 733 return curr;
762} 734}
763 735
764static int register_for_each_vma(struct uprobe *uprobe, bool is_register) 736static int
737register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
765{ 738{
739 bool is_register = !!new;
766 struct map_info *info; 740 struct map_info *info;
767 int err = 0; 741 int err = 0;
768 742
743 percpu_down_write(&dup_mmap_sem);
769 info = build_map_info(uprobe->inode->i_mapping, 744 info = build_map_info(uprobe->inode->i_mapping,
770 uprobe->offset, is_register); 745 uprobe->offset, is_register);
771 if (IS_ERR(info)) 746 if (IS_ERR(info)) {
772 return PTR_ERR(info); 747 err = PTR_ERR(info);
748 goto out;
749 }
773 750
774 while (info) { 751 while (info) {
775 struct mm_struct *mm = info->mm; 752 struct mm_struct *mm = info->mm;
@@ -788,10 +765,16 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
788 vaddr_to_offset(vma, info->vaddr) != uprobe->offset) 765 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
789 goto unlock; 766 goto unlock;
790 767
791 if (is_register) 768 if (is_register) {
792 err = install_breakpoint(uprobe, mm, vma, info->vaddr); 769 /* consult only the "caller", new consumer. */
793 else 770 if (consumer_filter(new,
794 err |= remove_breakpoint(uprobe, mm, info->vaddr); 771 UPROBE_FILTER_REGISTER, mm))
772 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
773 } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
774 if (!filter_chain(uprobe,
775 UPROBE_FILTER_UNREGISTER, mm))
776 err |= remove_breakpoint(uprobe, mm, info->vaddr);
777 }
795 778
796 unlock: 779 unlock:
797 up_write(&mm->mmap_sem); 780 up_write(&mm->mmap_sem);
@@ -799,21 +782,28 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
799 mmput(mm); 782 mmput(mm);
800 info = free_map_info(info); 783 info = free_map_info(info);
801 } 784 }
802 785 out:
786 percpu_up_write(&dup_mmap_sem);
803 return err; 787 return err;
804} 788}
805 789
806static int __uprobe_register(struct uprobe *uprobe) 790static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
807{ 791{
808 return register_for_each_vma(uprobe, true); 792 consumer_add(uprobe, uc);
793 return register_for_each_vma(uprobe, uc);
809} 794}
810 795
811static void __uprobe_unregister(struct uprobe *uprobe) 796static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
812{ 797{
813 if (!register_for_each_vma(uprobe, false)) 798 int err;
814 delete_uprobe(uprobe); 799
800 if (!consumer_del(uprobe, uc)) /* WARN? */
801 return;
815 802
803 err = register_for_each_vma(uprobe, NULL);
816 /* TODO: can't unregister? schedule a worker thread */ 804
805 if (!uprobe->consumers && !err)
806 delete_uprobe(uprobe);
817} 807}
818 808
819/* 809/*
@@ -838,31 +828,59 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
838 struct uprobe *uprobe; 828 struct uprobe *uprobe;
839 int ret; 829 int ret;
840 830
841 if (!inode || !uc || uc->next) 831 /* Racy, just to catch the obvious mistakes */
842 return -EINVAL;
843
844 if (offset > i_size_read(inode)) 832 if (offset > i_size_read(inode))
845 return -EINVAL; 833 return -EINVAL;
846 834
847 ret = 0; 835 retry:
848 mutex_lock(uprobes_hash(inode));
849 uprobe = alloc_uprobe(inode, offset); 836 uprobe = alloc_uprobe(inode, offset);
850 837 if (!uprobe)
851 if (!uprobe) { 838 return -ENOMEM;
852 ret = -ENOMEM; 839 /*
853 } else if (!consumer_add(uprobe, uc)) { 840 * We can race with uprobe_unregister()->delete_uprobe().
854 ret = __uprobe_register(uprobe); 841 * Check uprobe_is_active() and retry if it is false.
855 if (ret) { 842 */
856 uprobe->consumers = NULL; 843 down_write(&uprobe->register_rwsem);
857 __uprobe_unregister(uprobe); 844 ret = -EAGAIN;
858 } else { 845 if (likely(uprobe_is_active(uprobe))) {
859 set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); 846 ret = __uprobe_register(uprobe, uc);
860 } 847 if (ret)
848 __uprobe_unregister(uprobe, uc);
861 } 849 }
850 up_write(&uprobe->register_rwsem);
851 put_uprobe(uprobe);
862 852
863 mutex_unlock(uprobes_hash(inode)); 853 if (unlikely(ret == -EAGAIN))
864 if (uprobe) 854 goto retry;
865 put_uprobe(uprobe); 855 return ret;
856}
857EXPORT_SYMBOL_GPL(uprobe_register);
858
859/*
860 * uprobe_apply - add or remove the breakpoints for an already registered probe.
861 * @inode: the file in which the probe resides.
862 * @offset: offset from the start of the file.
863 * @uc: consumer which wants to add more or remove some breakpoints
864 * @add: add or remove the breakpoints
865 */
866int uprobe_apply(struct inode *inode, loff_t offset,
867 struct uprobe_consumer *uc, bool add)
868{
869 struct uprobe *uprobe;
870 struct uprobe_consumer *con;
871 int ret = -ENOENT;
872
873 uprobe = find_uprobe(inode, offset);
874 if (!uprobe)
875 return ret;
876
877 down_write(&uprobe->register_rwsem);
878 for (con = uprobe->consumers; con && con != uc ; con = con->next)
879 ;
880 if (con)
881 ret = register_for_each_vma(uprobe, add ? uc : NULL);
882 up_write(&uprobe->register_rwsem);
883 put_uprobe(uprobe);
866 884
867 return ret; 885 return ret;
868} 886}
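A rough usage sketch for the new uprobe_apply() (assuming a consumer such as the hypothetical demo_consumer above has already been registered with uprobe_register()): it toggles that consumer's breakpoints without a full unregister/register cycle.

static int demo_toggle(struct inode *inode, loff_t offset, bool on)
{
	/* on == true consults only demo_consumer's filter;
	 * on == false consults the whole remaining filter chain. */
	return uprobe_apply(inode, offset, &demo_consumer, on);
}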
@@ -877,25 +895,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
877{ 895{
878 struct uprobe *uprobe; 896 struct uprobe *uprobe;
879 897
880 if (!inode || !uc)
881 return;
882
883 uprobe = find_uprobe(inode, offset); 898 uprobe = find_uprobe(inode, offset);
884 if (!uprobe) 899 if (!uprobe)
885 return; 900 return;
886 901
887 mutex_lock(uprobes_hash(inode)); 902 down_write(&uprobe->register_rwsem);
903 __uprobe_unregister(uprobe, uc);
904 up_write(&uprobe->register_rwsem);
905 put_uprobe(uprobe);
906}
907EXPORT_SYMBOL_GPL(uprobe_unregister);
888 908
889 if (consumer_del(uprobe, uc)) { 909static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
890 if (!uprobe->consumers) { 910{
891 __uprobe_unregister(uprobe); 911 struct vm_area_struct *vma;
892 clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); 912 int err = 0;
893 } 913
914 down_read(&mm->mmap_sem);
915 for (vma = mm->mmap; vma; vma = vma->vm_next) {
916 unsigned long vaddr;
917 loff_t offset;
918
919 if (!valid_vma(vma, false) ||
920 vma->vm_file->f_mapping->host != uprobe->inode)
921 continue;
922
923 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
924 if (uprobe->offset < offset ||
925 uprobe->offset >= offset + vma->vm_end - vma->vm_start)
926 continue;
927
928 vaddr = offset_to_vaddr(vma, uprobe->offset);
929 err |= remove_breakpoint(uprobe, mm, vaddr);
894 } 930 }
931 up_read(&mm->mmap_sem);
895 932
896 mutex_unlock(uprobes_hash(inode)); 933 return err;
897 if (uprobe)
898 put_uprobe(uprobe);
899} 934}
900 935
901static struct rb_node * 936static struct rb_node *
@@ -972,7 +1007,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
972 struct uprobe *uprobe, *u; 1007 struct uprobe *uprobe, *u;
973 struct inode *inode; 1008 struct inode *inode;
974 1009
975 if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) 1010 if (no_uprobe_events() || !valid_vma(vma, true))
976 return 0; 1011 return 0;
977 1012
978 inode = vma->vm_file->f_mapping->host; 1013 inode = vma->vm_file->f_mapping->host;
@@ -981,9 +1016,14 @@ int uprobe_mmap(struct vm_area_struct *vma)
981 1016
982 mutex_lock(uprobes_mmap_hash(inode)); 1017 mutex_lock(uprobes_mmap_hash(inode));
983 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); 1018 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
984 1019 /*
1020 * We can race with uprobe_unregister(), this uprobe can be already
1021 * removed. But in this case filter_chain() must return false, all
1022 * consumers have gone away.
1023 */
985 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1024 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
986 if (!fatal_signal_pending(current)) { 1025 if (!fatal_signal_pending(current) &&
1026 filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
987 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); 1027 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
988 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1028 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
989 } 1029 }
@@ -1018,7 +1058,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
1018 */ 1058 */
1019void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) 1059void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1020{ 1060{
1021 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1061 if (no_uprobe_events() || !valid_vma(vma, false))
1022 return; 1062 return;
1023 1063
1024 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ 1064 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
@@ -1035,22 +1075,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1035/* Slot allocation for XOL */ 1075/* Slot allocation for XOL */
1036static int xol_add_vma(struct xol_area *area) 1076static int xol_add_vma(struct xol_area *area)
1037{ 1077{
1038 struct mm_struct *mm; 1078 struct mm_struct *mm = current->mm;
1039 int ret; 1079 int ret = -EALREADY;
1040
1041 area->page = alloc_page(GFP_HIGHUSER);
1042 if (!area->page)
1043 return -ENOMEM;
1044
1045 ret = -EALREADY;
1046 mm = current->mm;
1047 1080
1048 down_write(&mm->mmap_sem); 1081 down_write(&mm->mmap_sem);
1049 if (mm->uprobes_state.xol_area) 1082 if (mm->uprobes_state.xol_area)
1050 goto fail; 1083 goto fail;
1051 1084
1052 ret = -ENOMEM; 1085 ret = -ENOMEM;
1053
1054 /* Try to map as high as possible, this is only a hint. */ 1086 /* Try to map as high as possible, this is only a hint. */
1055 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); 1087 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
1056 if (area->vaddr & ~PAGE_MASK) { 1088 if (area->vaddr & ~PAGE_MASK) {
@@ -1066,54 +1098,53 @@ static int xol_add_vma(struct xol_area *area)
1066 smp_wmb(); /* pairs with get_xol_area() */ 1098 smp_wmb(); /* pairs with get_xol_area() */
1067 mm->uprobes_state.xol_area = area; 1099 mm->uprobes_state.xol_area = area;
1068 ret = 0; 1100 ret = 0;
1069 1101 fail:
1070fail:
1071 up_write(&mm->mmap_sem); 1102 up_write(&mm->mmap_sem);
1072 if (ret)
1073 __free_page(area->page);
1074 1103
1075 return ret; 1104 return ret;
1076} 1105}
1077 1106
1078static struct xol_area *get_xol_area(struct mm_struct *mm)
1079{
1080 struct xol_area *area;
1081
1082 area = mm->uprobes_state.xol_area;
1083 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1084
1085 return area;
1086}
1087
1088/* 1107/*
1089 * xol_alloc_area - Allocate process's xol_area. 1108 * get_xol_area - Allocate process's xol_area if necessary.
1090 * This area will be used for storing instructions for execution out of 1109 * This area will be used for storing instructions for execution out of line.
1091 * line.
1092 * 1110 *
1093 * Returns the allocated area or NULL. 1111 * Returns the allocated area or NULL.
1094 */ 1112 */
1095static struct xol_area *xol_alloc_area(void) 1113static struct xol_area *get_xol_area(void)
1096{ 1114{
1115 struct mm_struct *mm = current->mm;
1097 struct xol_area *area; 1116 struct xol_area *area;
1098 1117
1118 area = mm->uprobes_state.xol_area;
1119 if (area)
1120 goto ret;
1121
1099 area = kzalloc(sizeof(*area), GFP_KERNEL); 1122 area = kzalloc(sizeof(*area), GFP_KERNEL);
1100 if (unlikely(!area)) 1123 if (unlikely(!area))
1101 return NULL; 1124 goto out;
1102 1125
1103 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); 1126 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
1104
1105 if (!area->bitmap) 1127 if (!area->bitmap)
1106 goto fail; 1128 goto free_area;
1129
1130 area->page = alloc_page(GFP_HIGHUSER);
1131 if (!area->page)
1132 goto free_bitmap;
1107 1133
1108 init_waitqueue_head(&area->wq); 1134 init_waitqueue_head(&area->wq);
1109 if (!xol_add_vma(area)) 1135 if (!xol_add_vma(area))
1110 return area; 1136 return area;
1111 1137
1112fail: 1138 __free_page(area->page);
1139 free_bitmap:
1113 kfree(area->bitmap); 1140 kfree(area->bitmap);
1141 free_area:
1114 kfree(area); 1142 kfree(area);
1115 1143 out:
1116 return get_xol_area(current->mm); 1144 area = mm->uprobes_state.xol_area;
1145 ret:
1146 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1147 return area;
1117} 1148}
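The smp_wmb()/smp_read_barrier_depends() pairing in get_xol_area()/xol_add_vma() follows the usual initialize-then-publish pattern; a stripped-down sketch of that pattern (the demo_* names stand in for mm->uprobes_state.xol_area and are not part of this patch):

struct demo_area {
	unsigned long *bitmap;
	struct page *page;
};

static struct demo_area *demo_area_ptr;

static void demo_publish(struct demo_area *area)
{
	/* make the fully initialized object visible before the pointer */
	smp_wmb();
	demo_area_ptr = area;
}

static struct demo_area *demo_lookup(void)
{
	struct demo_area *area = demo_area_ptr;

	smp_read_barrier_depends();	/* pairs with smp_wmb() in demo_publish() */
	return area;
}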
1118 1149
1119/* 1150/*
@@ -1131,6 +1162,16 @@ void uprobe_clear_state(struct mm_struct *mm)
1131 kfree(area); 1162 kfree(area);
1132} 1163}
1133 1164
1165void uprobe_start_dup_mmap(void)
1166{
1167 percpu_down_read(&dup_mmap_sem);
1168}
1169
1170void uprobe_end_dup_mmap(void)
1171{
1172 percpu_up_read(&dup_mmap_sem);
1173}
1174
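The intended pairing of the new dup_mmap_sem, sketched with invented demo_* helpers: fork's dup_mmap() brackets its copy with the read side while register_for_each_vma() takes the write side, so breakpoint updates cannot race with an mm being duplicated.

static void demo_fork_side(void)
{
	uprobe_start_dup_mmap();	/* percpu_down_read(&dup_mmap_sem) */
	/* ... duplicate the vmas, uprobe_dup_mmap(), ... */
	uprobe_end_dup_mmap();		/* percpu_up_read(&dup_mmap_sem) */
}

static void demo_register_side(void)
{
	percpu_down_write(&dup_mmap_sem);
	/* ... build_map_info() and install/remove breakpoints ... */
	percpu_up_write(&dup_mmap_sem);
}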
1134void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) 1175void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
1135{ 1176{
1136 newmm->uprobes_state.xol_area = NULL; 1177 newmm->uprobes_state.xol_area = NULL;
@@ -1169,38 +1210,36 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
1169} 1210}
1170 1211
1171/* 1212/*
1172 * xol_get_insn_slot - If was not allocated a slot, then 1213 * xol_get_insn_slot - allocate a slot for xol.
1173 * allocate a slot.
1174 * Returns the allocated slot address or 0. 1214 * Returns the allocated slot address or 0.
1175 */ 1215 */
1176static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) 1216static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1177{ 1217{
1178 struct xol_area *area; 1218 struct xol_area *area;
1179 unsigned long offset; 1219 unsigned long offset;
1220 unsigned long xol_vaddr;
1180 void *vaddr; 1221 void *vaddr;
1181 1222
1182 area = get_xol_area(current->mm); 1223 area = get_xol_area();
1183 if (!area) { 1224 if (!area)
1184 area = xol_alloc_area(); 1225 return 0;
1185 if (!area)
1186 return 0;
1187 }
1188 current->utask->xol_vaddr = xol_take_insn_slot(area);
1189 1226
1190 /* 1227 xol_vaddr = xol_take_insn_slot(area);
1191 * Initialize the slot if xol_vaddr points to valid 1228 if (unlikely(!xol_vaddr))
1192 * instruction slot.
1193 */
1194 if (unlikely(!current->utask->xol_vaddr))
1195 return 0; 1229 return 0;
1196 1230
1197 current->utask->vaddr = slot_addr; 1231 /* Initialize the slot */
1198 offset = current->utask->xol_vaddr & ~PAGE_MASK; 1232 offset = xol_vaddr & ~PAGE_MASK;
1199 vaddr = kmap_atomic(area->page); 1233 vaddr = kmap_atomic(area->page);
1200 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); 1234 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1201 kunmap_atomic(vaddr); 1235 kunmap_atomic(vaddr);
1236 /*
1237 * We probably need flush_icache_user_range() but it needs vma.
1238 * This should work on supported architectures too.
1239 */
1240 flush_dcache_page(area->page);
1202 1241
1203 return current->utask->xol_vaddr; 1242 return xol_vaddr;
1204} 1243}
1205 1244
1206/* 1245/*
@@ -1218,8 +1257,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
1218 return; 1257 return;
1219 1258
1220 slot_addr = tsk->utask->xol_vaddr; 1259 slot_addr = tsk->utask->xol_vaddr;
1221 1260 if (unlikely(!slot_addr))
1222 if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
1223 return; 1261 return;
1224 1262
1225 area = tsk->mm->uprobes_state.xol_area; 1263 area = tsk->mm->uprobes_state.xol_area;
@@ -1281,33 +1319,48 @@ void uprobe_copy_process(struct task_struct *t)
1281} 1319}
1282 1320
1283/* 1321/*
1284 * Allocate a uprobe_task object for the task. 1322 * Allocate a uprobe_task object for the task if necessary.
1285 * Called when the thread hits a breakpoint for the first time. 1323 * Called when the thread hits a breakpoint.
1286 * 1324 *
1287 * Returns: 1325 * Returns:
1288 * - pointer to new uprobe_task on success 1326 * - pointer to new uprobe_task on success
1289 * - NULL otherwise 1327 * - NULL otherwise
1290 */ 1328 */
1291static struct uprobe_task *add_utask(void) 1329static struct uprobe_task *get_utask(void)
1292{ 1330{
1293 struct uprobe_task *utask; 1331 if (!current->utask)
1294 1332 current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1295 utask = kzalloc(sizeof *utask, GFP_KERNEL); 1333 return current->utask;
1296 if (unlikely(!utask))
1297 return NULL;
1298
1299 current->utask = utask;
1300 return utask;
1301} 1334}
1302 1335
1303/* Prepare to single-step probed instruction out of line. */ 1336/* Prepare to single-step probed instruction out of line. */
1304static int 1337static int
1305pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) 1338pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
1306{ 1339{
1307 if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) 1340 struct uprobe_task *utask;
1308 return 0; 1341 unsigned long xol_vaddr;
1342 int err;
1309 1343
1310 return -EFAULT; 1344 utask = get_utask();
1345 if (!utask)
1346 return -ENOMEM;
1347
1348 xol_vaddr = xol_get_insn_slot(uprobe);
1349 if (!xol_vaddr)
1350 return -ENOMEM;
1351
1352 utask->xol_vaddr = xol_vaddr;
1353 utask->vaddr = bp_vaddr;
1354
1355 err = arch_uprobe_pre_xol(&uprobe->arch, regs);
1356 if (unlikely(err)) {
1357 xol_free_insn_slot(current);
1358 return err;
1359 }
1360
1361 utask->active_uprobe = uprobe;
1362 utask->state = UTASK_SSTEP;
1363 return 0;
1311} 1364}
1312 1365
1313/* 1366/*
@@ -1369,6 +1422,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
1369 * This is not strictly accurate, we can race with 1422 * This is not strictly accurate, we can race with
1370 * uprobe_unregister() and see the already removed 1423 * uprobe_unregister() and see the already removed
1371 * uprobe if delete_uprobe() was not yet called. 1424 * uprobe if delete_uprobe() was not yet called.
1425 * Or this uprobe can be filtered out.
1372 */ 1426 */
1373 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) 1427 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
1374 return; 1428 return;
@@ -1430,14 +1484,25 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1430 return uprobe; 1484 return uprobe;
1431} 1485}
1432 1486
1433void __weak arch_uprobe_enable_step(struct arch_uprobe *arch) 1487static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1434{ 1488{
1435 user_enable_single_step(current); 1489 struct uprobe_consumer *uc;
1436} 1490 int remove = UPROBE_HANDLER_REMOVE;
1437 1491
1438void __weak arch_uprobe_disable_step(struct arch_uprobe *arch) 1492 down_read(&uprobe->register_rwsem);
1439{ 1493 for (uc = uprobe->consumers; uc; uc = uc->next) {
1440 user_disable_single_step(current); 1494 int rc = uc->handler(uc, regs);
1495
1496 WARN(rc & ~UPROBE_HANDLER_MASK,
1497 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1498 remove &= rc;
1499 }
1500
1501 if (remove && uprobe->consumers) {
1502 WARN_ON(!uprobe_is_active(uprobe));
1503 unapply_uprobe(uprobe, current->mm);
1504 }
1505 up_read(&uprobe->register_rwsem);
1441} 1506}
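When every consumer returns UPROBE_HANDLER_REMOVE, the reworked handler_chain() above calls unapply_uprobe() and strips the breakpoints from the current mm; a hypothetical one-shot handler built on that behaviour (demo_* name invented for the sketch):

static int demo_oneshot_handler(struct uprobe_consumer *self,
				struct pt_regs *regs)
{
	pr_info("one-shot hit at %lx\n", instruction_pointer(regs));
	/* ask handler_chain() to remove the breakpoints from this mm */
	return UPROBE_HANDLER_REMOVE;
}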
1442 1507
1443/* 1508/*
@@ -1446,7 +1511,6 @@ void __weak arch_uprobe_disable_step(struct arch_uprobe *arch)
1446 */ 1511 */
1447static void handle_swbp(struct pt_regs *regs) 1512static void handle_swbp(struct pt_regs *regs)
1448{ 1513{
1449 struct uprobe_task *utask;
1450 struct uprobe *uprobe; 1514 struct uprobe *uprobe;
1451 unsigned long bp_vaddr; 1515 unsigned long bp_vaddr;
1452 int uninitialized_var(is_swbp); 1516 int uninitialized_var(is_swbp);
@@ -1471,6 +1535,10 @@ static void handle_swbp(struct pt_regs *regs)
1471 } 1535 }
1472 return; 1536 return;
1473 } 1537 }
1538
1539 /* change it in advance for ->handler() and restart */
1540 instruction_pointer_set(regs, bp_vaddr);
1541
1474 /* 1542 /*
1475 * TODO: move copy_insn/etc into _register and remove this hack. 1543 * TODO: move copy_insn/etc into _register and remove this hack.
1476 * After we hit the bp, _unregister + _register can install the 1544 * After we hit the bp, _unregister + _register can install the
@@ -1478,33 +1546,16 @@ static void handle_swbp(struct pt_regs *regs)
1478 */ 1546 */
1479 smp_rmb(); /* pairs with wmb() in install_breakpoint() */ 1547 smp_rmb(); /* pairs with wmb() in install_breakpoint() */
1480 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) 1548 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1481 goto restart; 1549 goto out;
1482
1483 utask = current->utask;
1484 if (!utask) {
1485 utask = add_utask();
1486 /* Cannot allocate; re-execute the instruction. */
1487 if (!utask)
1488 goto restart;
1489 }
1490 1550
1491 handler_chain(uprobe, regs); 1551 handler_chain(uprobe, regs);
1492 if (can_skip_sstep(uprobe, regs)) 1552 if (can_skip_sstep(uprobe, regs))
1493 goto out; 1553 goto out;
1494 1554
1495 if (!pre_ssout(uprobe, regs, bp_vaddr)) { 1555 if (!pre_ssout(uprobe, regs, bp_vaddr))
1496 arch_uprobe_enable_step(&uprobe->arch);
1497 utask->active_uprobe = uprobe;
1498 utask->state = UTASK_SSTEP;
1499 return; 1556 return;
1500 }
1501 1557
1502restart: 1558 /* can_skip_sstep() succeeded, or restart if can't singlestep */
1503 /*
1504 * cannot singlestep; cannot skip instruction;
1505 * re-execute the instruction.
1506 */
1507 instruction_pointer_set(regs, bp_vaddr);
1508out: 1559out:
1509 put_uprobe(uprobe); 1560 put_uprobe(uprobe);
1510} 1561}
@@ -1525,7 +1576,6 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
1525 else 1576 else
1526 WARN_ON_ONCE(1); 1577 WARN_ON_ONCE(1);
1527 1578
1528 arch_uprobe_disable_step(&uprobe->arch);
1529 put_uprobe(uprobe); 1579 put_uprobe(uprobe);
1530 utask->active_uprobe = NULL; 1580 utask->active_uprobe = NULL;
1531 utask->state = UTASK_RUNNING; 1581 utask->state = UTASK_RUNNING;
@@ -1599,10 +1649,11 @@ static int __init init_uprobes(void)
1599{ 1649{
1600 int i; 1650 int i;
1601 1651
1602 for (i = 0; i < UPROBES_HASH_SZ; i++) { 1652 for (i = 0; i < UPROBES_HASH_SZ; i++)
1603 mutex_init(&uprobes_mutex[i]);
1604 mutex_init(&uprobes_mmap_mutex[i]); 1653 mutex_init(&uprobes_mmap_mutex[i]);
1605 } 1654
1655 if (percpu_init_rwsem(&dup_mmap_sem))
1656 return -ENOMEM;
1606 1657
1607 return register_die_notifier(&uprobe_exception_nb); 1658 return register_die_notifier(&uprobe_exception_nb);
1608} 1659}
diff --git a/kernel/exit.c b/kernel/exit.c
index 346616c0092c..60bc027c61c3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -20,6 +20,7 @@
20#include <linux/tsacct_kern.h> 20#include <linux/tsacct_kern.h>
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/fdtable.h> 22#include <linux/fdtable.h>
23#include <linux/freezer.h>
23#include <linux/binfmts.h> 24#include <linux/binfmts.h>
24#include <linux/nsproxy.h> 25#include <linux/nsproxy.h>
25#include <linux/pid_namespace.h> 26#include <linux/pid_namespace.h>
@@ -31,7 +32,6 @@
31#include <linux/mempolicy.h> 32#include <linux/mempolicy.h>
32#include <linux/taskstats_kern.h> 33#include <linux/taskstats_kern.h>
33#include <linux/delayacct.h> 34#include <linux/delayacct.h>
34#include <linux/freezer.h>
35#include <linux/cgroup.h> 35#include <linux/cgroup.h>
36#include <linux/syscalls.h> 36#include <linux/syscalls.h>
37#include <linux/signal.h> 37#include <linux/signal.h>
@@ -72,18 +72,6 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
72 list_del_rcu(&p->tasks); 72 list_del_rcu(&p->tasks);
73 list_del_init(&p->sibling); 73 list_del_init(&p->sibling);
74 __this_cpu_dec(process_counts); 74 __this_cpu_dec(process_counts);
75 /*
76 * If we are the last child process in a pid namespace to be
77 * reaped, notify the reaper sleeping zap_pid_ns_processes().
78 */
79 if (IS_ENABLED(CONFIG_PID_NS)) {
80 struct task_struct *parent = p->real_parent;
81
82 if ((task_active_pid_ns(parent)->child_reaper == parent) &&
83 list_empty(&parent->children) &&
84 (parent->flags & PF_EXITING))
85 wake_up_process(parent);
86 }
87 } 75 }
88 list_del_rcu(&p->thread_group); 76 list_del_rcu(&p->thread_group);
89} 77}
@@ -97,6 +85,7 @@ static void __exit_signal(struct task_struct *tsk)
97 bool group_dead = thread_group_leader(tsk); 85 bool group_dead = thread_group_leader(tsk);
98 struct sighand_struct *sighand; 86 struct sighand_struct *sighand;
99 struct tty_struct *uninitialized_var(tty); 87 struct tty_struct *uninitialized_var(tty);
88 cputime_t utime, stime;
100 89
101 sighand = rcu_dereference_check(tsk->sighand, 90 sighand = rcu_dereference_check(tsk->sighand,
102 lockdep_tasklist_lock_is_held()); 91 lockdep_tasklist_lock_is_held());
@@ -135,9 +124,10 @@ static void __exit_signal(struct task_struct *tsk)
135 * We won't ever get here for the group leader, since it 124 * We won't ever get here for the group leader, since it
136 * will have been the last reference on the signal_struct. 125 * will have been the last reference on the signal_struct.
137 */ 126 */
138 sig->utime += tsk->utime; 127 task_cputime(tsk, &utime, &stime);
139 sig->stime += tsk->stime; 128 sig->utime += utime;
140 sig->gtime += tsk->gtime; 129 sig->stime += stime;
130 sig->gtime += task_gtime(tsk);
141 sig->min_flt += tsk->min_flt; 131 sig->min_flt += tsk->min_flt;
142 sig->maj_flt += tsk->maj_flt; 132 sig->maj_flt += tsk->maj_flt;
143 sig->nvcsw += tsk->nvcsw; 133 sig->nvcsw += tsk->nvcsw;
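The accessor-based pattern above generalizes; a hedged sketch of reading a task's times without touching the raw utime/stime fields (demo_report_times is an invented helper, not part of this patch):

static void demo_report_times(struct task_struct *tsk)
{
	cputime_t utime, stime;

	/* correct whether the fields are updated eagerly or, with
	 * CONFIG_VIRT_CPU_ACCOUNTING_GEN, computed on demand */
	task_cputime(tsk, &utime, &stime);
	pr_info("%s[%d]: utime=%llu stime=%llu gtime=%llu\n",
		tsk->comm, task_pid_nr(tsk),
		(unsigned long long)utime, (unsigned long long)stime,
		(unsigned long long)task_gtime(tsk));
}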
@@ -322,43 +312,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
322 } 312 }
323} 313}
324 314
325/**
326 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
327 *
328 * If a kernel thread is launched as a result of a system call, or if
329 * it ever exits, it should generally reparent itself to kthreadd so it
330 * isn't in the way of other processes and is correctly cleaned up on exit.
331 *
332 * The various task state such as scheduling policy and priority may have
333 * been inherited from a user process, so we reset them to sane values here.
334 *
335 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
336 */
337static void reparent_to_kthreadd(void)
338{
339 write_lock_irq(&tasklist_lock);
340
341 ptrace_unlink(current);
342 /* Reparent to init */
343 current->real_parent = current->parent = kthreadd_task;
344 list_move_tail(&current->sibling, &current->real_parent->children);
345
346 /* Set the exit signal to SIGCHLD so we signal init on exit */
347 current->exit_signal = SIGCHLD;
348
349 if (task_nice(current) < 0)
350 set_user_nice(current, 0);
351 /* cpus_allowed? */
352 /* rt_priority? */
353 /* signals? */
354 memcpy(current->signal->rlim, init_task.signal->rlim,
355 sizeof(current->signal->rlim));
356
357 atomic_inc(&init_cred.usage);
358 commit_creds(&init_cred);
359 write_unlock_irq(&tasklist_lock);
360}
361
362void __set_special_pids(struct pid *pid) 315void __set_special_pids(struct pid *pid)
363{ 316{
364 struct task_struct *curr = current->group_leader; 317 struct task_struct *curr = current->group_leader;
@@ -370,13 +323,6 @@ void __set_special_pids(struct pid *pid)
370 change_pid(curr, PIDTYPE_PGID, pid); 323 change_pid(curr, PIDTYPE_PGID, pid);
371} 324}
372 325
373static void set_special_pids(struct pid *pid)
374{
375 write_lock_irq(&tasklist_lock);
376 __set_special_pids(pid);
377 write_unlock_irq(&tasklist_lock);
378}
379
380/* 326/*
381 * Let kernel threads use this to say that they allow a certain signal. 327 * Let kernel threads use this to say that they allow a certain signal.
382 * Must not be used if kthread was cloned with CLONE_SIGHAND. 328 * Must not be used if kthread was cloned with CLONE_SIGHAND.
@@ -416,54 +362,6 @@ int disallow_signal(int sig)
416 362
417EXPORT_SYMBOL(disallow_signal); 363EXPORT_SYMBOL(disallow_signal);
418 364
419/*
420 * Put all the gunge required to become a kernel thread without
421 * attached user resources in one place where it belongs.
422 */
423
424void daemonize(const char *name, ...)
425{
426 va_list args;
427 sigset_t blocked;
428
429 va_start(args, name);
430 vsnprintf(current->comm, sizeof(current->comm), name, args);
431 va_end(args);
432
433 /*
434 * If we were started as result of loading a module, close all of the
435 * user space pages. We don't need them, and if we didn't close them
436 * they would be locked into memory.
437 */
438 exit_mm(current);
439 /*
440 * We don't want to get frozen, in case system-wide hibernation
441 * or suspend transition begins right now.
442 */
443 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
444
445 if (current->nsproxy != &init_nsproxy) {
446 get_nsproxy(&init_nsproxy);
447 switch_task_namespaces(current, &init_nsproxy);
448 }
449 set_special_pids(&init_struct_pid);
450 proc_clear_tty(current);
451
452 /* Block and flush all signals */
453 sigfillset(&blocked);
454 sigprocmask(SIG_BLOCK, &blocked, NULL);
455 flush_signals(current);
456
457 /* Become as one with the init task */
458
459 daemonize_fs_struct();
460 daemonize_descriptors();
461
462 reparent_to_kthreadd();
463}
464
465EXPORT_SYMBOL(daemonize);
466
467#ifdef CONFIG_MM_OWNER 365#ifdef CONFIG_MM_OWNER
468/* 366/*
469 * A task is exiting. If it owned this mm, find a new owner for the mm. 367 * A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -587,7 +485,7 @@ static void exit_mm(struct task_struct * tsk)
587 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 485 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
588 if (!self.task) /* see coredump_finish() */ 486 if (!self.task) /* see coredump_finish() */
589 break; 487 break;
590 schedule(); 488 freezable_schedule();
591 } 489 }
592 __set_task_state(tsk, TASK_RUNNING); 490 __set_task_state(tsk, TASK_RUNNING);
593 down_read(&mm->mmap_sem); 491 down_read(&mm->mmap_sem);
@@ -1186,17 +1084,17 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1186 * as other threads in the parent group can be right 1084 * as other threads in the parent group can be right
1187 * here reaping other children at the same time. 1085 * here reaping other children at the same time.
1188 * 1086 *
1189 * We use thread_group_times() to get times for the thread 1087 * We use thread_group_cputime_adjusted() to get times for the thread
1190 * group, which consolidates times for all threads in the 1088 * group, which consolidates times for all threads in the
1191 * group including the group leader. 1089 * group including the group leader.
1192 */ 1090 */
1193 thread_group_times(p, &tgutime, &tgstime); 1091 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1194 spin_lock_irq(&p->real_parent->sighand->siglock); 1092 spin_lock_irq(&p->real_parent->sighand->siglock);
1195 psig = p->real_parent->signal; 1093 psig = p->real_parent->signal;
1196 sig = p->signal; 1094 sig = p->signal;
1197 psig->cutime += tgutime + sig->cutime; 1095 psig->cutime += tgutime + sig->cutime;
1198 psig->cstime += tgstime + sig->cstime; 1096 psig->cstime += tgstime + sig->cstime;
1199 psig->cgtime += p->gtime + sig->gtime + sig->cgtime; 1097 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
1200 psig->cmin_flt += 1098 psig->cmin_flt +=
1201 p->min_flt + sig->min_flt + sig->cmin_flt; 1099 p->min_flt + sig->min_flt + sig->cmin_flt;
1202 psig->cmaj_flt += 1100 psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7d3aa2..1766d324d5e3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti)
146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, 146static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
147 int node) 147 int node)
148{ 148{
149 struct page *page = alloc_pages_node(node, THREADINFO_GFP, 149 struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
150 THREAD_SIZE_ORDER); 150 THREAD_SIZE_ORDER);
151 151
152 return page ? page_address(page) : NULL; 152 return page ? page_address(page) : NULL;
@@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
154 154
155static inline void free_thread_info(struct thread_info *ti) 155static inline void free_thread_info(struct thread_info *ti)
156{ 156{
157 free_pages((unsigned long)ti, THREAD_SIZE_ORDER); 157 free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
158} 158}
159# else 159# else
160static struct kmem_cache *thread_info_cache; 160static struct kmem_cache *thread_info_cache;
@@ -352,6 +352,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
352 unsigned long charge; 352 unsigned long charge;
353 struct mempolicy *pol; 353 struct mempolicy *pol;
354 354
355 uprobe_start_dup_mmap();
355 down_write(&oldmm->mmap_sem); 356 down_write(&oldmm->mmap_sem);
356 flush_cache_dup_mm(oldmm); 357 flush_cache_dup_mm(oldmm);
357 uprobe_dup_mmap(oldmm, mm); 358 uprobe_dup_mmap(oldmm, mm);
@@ -412,7 +413,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
412 tmp->vm_next = tmp->vm_prev = NULL; 413 tmp->vm_next = tmp->vm_prev = NULL;
413 file = tmp->vm_file; 414 file = tmp->vm_file;
414 if (file) { 415 if (file) {
415 struct inode *inode = file->f_path.dentry->d_inode; 416 struct inode *inode = file_inode(file);
416 struct address_space *mapping = file->f_mapping; 417 struct address_space *mapping = file->f_mapping;
417 418
418 get_file(file); 419 get_file(file);
@@ -469,6 +470,7 @@ out:
469 up_write(&mm->mmap_sem); 470 up_write(&mm->mmap_sem);
470 flush_tlb_mm(oldmm); 471 flush_tlb_mm(oldmm);
471 up_write(&oldmm->mmap_sem); 472 up_write(&oldmm->mmap_sem);
473 uprobe_end_dup_mmap();
472 return retval; 474 return retval;
473fail_nomem_anon_vma_fork: 475fail_nomem_anon_vma_fork:
474 mpol_put(pol); 476 mpol_put(pol);
@@ -821,6 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
821#ifdef CONFIG_TRANSPARENT_HUGEPAGE 823#ifdef CONFIG_TRANSPARENT_HUGEPAGE
822 mm->pmd_huge_pte = NULL; 824 mm->pmd_huge_pte = NULL;
823#endif 825#endif
826#ifdef CONFIG_NUMA_BALANCING
827 mm->first_nid = NUMA_PTE_SCAN_INIT;
828#endif
824 if (!mm_init(mm, tsk)) 829 if (!mm_init(mm, tsk))
825 goto fail_nomem; 830 goto fail_nomem;
826 831
@@ -1039,8 +1044,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1039 atomic_set(&sig->live, 1); 1044 atomic_set(&sig->live, 1);
1040 atomic_set(&sig->sigcnt, 1); 1045 atomic_set(&sig->sigcnt, 1);
1041 init_waitqueue_head(&sig->wait_chldexit); 1046 init_waitqueue_head(&sig->wait_chldexit);
1042 if (clone_flags & CLONE_NEWPID)
1043 sig->flags |= SIGNAL_UNKILLABLE;
1044 sig->curr_target = tsk; 1047 sig->curr_target = tsk;
1045 init_sigpending(&sig->shared_pending); 1048 init_sigpending(&sig->shared_pending);
1046 INIT_LIST_HEAD(&sig->posix_timers); 1049 INIT_LIST_HEAD(&sig->posix_timers);
@@ -1127,7 +1130,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
1127 */ 1130 */
1128static struct task_struct *copy_process(unsigned long clone_flags, 1131static struct task_struct *copy_process(unsigned long clone_flags,
1129 unsigned long stack_start, 1132 unsigned long stack_start,
1130 struct pt_regs *regs,
1131 unsigned long stack_size, 1133 unsigned long stack_size,
1132 int __user *child_tidptr, 1134 int __user *child_tidptr,
1133 struct pid *pid, 1135 struct pid *pid,
@@ -1135,11 +1137,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1135{ 1137{
1136 int retval; 1138 int retval;
1137 struct task_struct *p; 1139 struct task_struct *p;
1138 int cgroup_callbacks_done = 0;
1139 1140
1140 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1141 return ERR_PTR(-EINVAL); 1142 return ERR_PTR(-EINVAL);
1142 1143
1144 if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
1145 return ERR_PTR(-EINVAL);
1146
1143 /* 1147 /*
1144 * Thread groups must share signals as well, and detached threads 1148 * Thread groups must share signals as well, and detached threads
1145 * can only be started up within the thread group. 1149 * can only be started up within the thread group.
@@ -1165,6 +1169,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1165 current->signal->flags & SIGNAL_UNKILLABLE) 1169 current->signal->flags & SIGNAL_UNKILLABLE)
1166 return ERR_PTR(-EINVAL); 1170 return ERR_PTR(-EINVAL);
1167 1171
1172 /*
1173 * If the new process will be in a different pid namespace
1174 * don't allow the creation of threads.
1175 */
1176 if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
1177 (task_active_pid_ns(current) != current->nsproxy->pid_ns))
1178 return ERR_PTR(-EINVAL);
1179
1168 retval = security_task_create(clone_flags); 1180 retval = security_task_create(clone_flags);
1169 if (retval) 1181 if (retval)
1170 goto fork_out; 1182 goto fork_out;
@@ -1222,8 +1234,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1222 p->utime = p->stime = p->gtime = 0; 1234 p->utime = p->stime = p->gtime = 0;
1223 p->utimescaled = p->stimescaled = 0; 1235 p->utimescaled = p->stimescaled = 0;
1224#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1236#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1225 p->prev_utime = p->prev_stime = 0; 1237 p->prev_cputime.utime = p->prev_cputime.stime = 0;
1226#endif 1238#endif
1239#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1240 seqlock_init(&p->vtime_seqlock);
1241 p->vtime_snap = 0;
1242 p->vtime_snap_whence = VTIME_SLEEPING;
1243#endif
1244
1227#if defined(SPLIT_RSS_COUNTING) 1245#if defined(SPLIT_RSS_COUNTING)
1228 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1246 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
1229#endif 1247#endif
@@ -1320,7 +1338,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1320 retval = copy_io(clone_flags, p); 1338 retval = copy_io(clone_flags, p);
1321 if (retval) 1339 if (retval)
1322 goto bad_fork_cleanup_namespaces; 1340 goto bad_fork_cleanup_namespaces;
1323 retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); 1341 retval = copy_thread(clone_flags, stack_start, stack_size, p);
1324 if (retval) 1342 if (retval)
1325 goto bad_fork_cleanup_io; 1343 goto bad_fork_cleanup_io;
1326 1344
@@ -1393,12 +1411,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1393 INIT_LIST_HEAD(&p->thread_group); 1411 INIT_LIST_HEAD(&p->thread_group);
1394 p->task_works = NULL; 1412 p->task_works = NULL;
1395 1413
1396 /* Now that the task is set up, run cgroup callbacks if
1397 * necessary. We need to run them before the task is visible
1398 * on the tasklist. */
1399 cgroup_fork_callbacks(p);
1400 cgroup_callbacks_done = 1;
1401
1402 /* Need tasklist lock for parent etc handling! */ 1414 /* Need tasklist lock for parent etc handling! */
1403 write_lock_irq(&tasklist_lock); 1415 write_lock_irq(&tasklist_lock);
1404 1416
@@ -1441,8 +1453,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1441 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); 1453 ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
1442 1454
1443 if (thread_group_leader(p)) { 1455 if (thread_group_leader(p)) {
1444 if (is_child_reaper(pid)) 1456 if (is_child_reaper(pid)) {
1445 p->nsproxy->pid_ns->child_reaper = p; 1457 ns_of_pid(pid)->child_reaper = p;
1458 p->signal->flags |= SIGNAL_UNKILLABLE;
1459 }
1446 1460
1447 p->signal->leader_pid = pid; 1461 p->signal->leader_pid = pid;
1448 p->signal->tty = tty_kref_get(current->signal->tty); 1462 p->signal->tty = tty_kref_get(current->signal->tty);
@@ -1476,8 +1490,6 @@ bad_fork_cleanup_io:
1476 if (p->io_context) 1490 if (p->io_context)
1477 exit_io_context(p); 1491 exit_io_context(p);
1478bad_fork_cleanup_namespaces: 1492bad_fork_cleanup_namespaces:
1479 if (unlikely(clone_flags & CLONE_NEWPID))
1480 pid_ns_release_proc(p->nsproxy->pid_ns);
1481 exit_task_namespaces(p); 1493 exit_task_namespaces(p);
1482bad_fork_cleanup_mm: 1494bad_fork_cleanup_mm:
1483 if (p->mm) 1495 if (p->mm)
@@ -1503,7 +1515,7 @@ bad_fork_cleanup_cgroup:
1503#endif 1515#endif
1504 if (clone_flags & CLONE_THREAD) 1516 if (clone_flags & CLONE_THREAD)
1505 threadgroup_change_end(current); 1517 threadgroup_change_end(current);
1506 cgroup_exit(p, cgroup_callbacks_done); 1518 cgroup_exit(p, 0);
1507 delayacct_tsk_free(p); 1519 delayacct_tsk_free(p);
1508 module_put(task_thread_info(p)->exec_domain->module); 1520 module_put(task_thread_info(p)->exec_domain->module);
1509bad_fork_cleanup_count: 1521bad_fork_cleanup_count:
@@ -1515,12 +1527,6 @@ fork_out:
1515 return ERR_PTR(retval); 1527 return ERR_PTR(retval);
1516} 1528}
1517 1529
1518noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1519{
1520 memset(regs, 0, sizeof(struct pt_regs));
1521 return regs;
1522}
1523
1524static inline void init_idle_pids(struct pid_link *links) 1530static inline void init_idle_pids(struct pid_link *links)
1525{ 1531{
1526 enum pid_type type; 1532 enum pid_type type;
@@ -1534,10 +1540,7 @@ static inline void init_idle_pids(struct pid_link *links)
1534struct task_struct * __cpuinit fork_idle(int cpu) 1540struct task_struct * __cpuinit fork_idle(int cpu)
1535{ 1541{
1536 struct task_struct *task; 1542 struct task_struct *task;
1537 struct pt_regs regs; 1543 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
1538
1539 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1540 &init_struct_pid, 0);
1541 if (!IS_ERR(task)) { 1544 if (!IS_ERR(task)) {
1542 init_idle_pids(task->pids); 1545 init_idle_pids(task->pids);
1543 init_idle(task, cpu); 1546 init_idle(task, cpu);
@@ -1554,7 +1557,6 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1554 */ 1557 */
1555long do_fork(unsigned long clone_flags, 1558long do_fork(unsigned long clone_flags,
1556 unsigned long stack_start, 1559 unsigned long stack_start,
1557 struct pt_regs *regs,
1558 unsigned long stack_size, 1560 unsigned long stack_size,
1559 int __user *parent_tidptr, 1561 int __user *parent_tidptr,
1560 int __user *child_tidptr) 1562 int __user *child_tidptr)
@@ -1567,15 +1569,9 @@ long do_fork(unsigned long clone_flags,
1567 * Do some preliminary argument and permissions checking before we 1569 * Do some preliminary argument and permissions checking before we
1568 * actually start allocating stuff 1570 * actually start allocating stuff
1569 */ 1571 */
1570 if (clone_flags & CLONE_NEWUSER) { 1572 if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
1571 if (clone_flags & CLONE_THREAD) 1573 if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
1572 return -EINVAL; 1574 return -EINVAL;
1573 /* hopefully this check will go away when userns support is
1574 * complete
1575 */
1576 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1577 !capable(CAP_SETGID))
1578 return -EPERM;
1579 } 1575 }
1580 1576
1581 /* 1577 /*
@@ -1584,7 +1580,7 @@ long do_fork(unsigned long clone_flags,
1584 * requested, no event is reported; otherwise, report if the event 1580 * requested, no event is reported; otherwise, report if the event
1585 * for the type of forking is enabled. 1581 * for the type of forking is enabled.
1586 */ 1582 */
1587 if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) { 1583 if (!(clone_flags & CLONE_UNTRACED)) {
1588 if (clone_flags & CLONE_VFORK) 1584 if (clone_flags & CLONE_VFORK)
1589 trace = PTRACE_EVENT_VFORK; 1585 trace = PTRACE_EVENT_VFORK;
1590 else if ((clone_flags & CSIGNAL) != SIGCHLD) 1586 else if ((clone_flags & CSIGNAL) != SIGCHLD)
@@ -1596,7 +1592,7 @@ long do_fork(unsigned long clone_flags,
1596 trace = 0; 1592 trace = 0;
1597 } 1593 }
1598 1594
1599 p = copy_process(clone_flags, stack_start, regs, stack_size, 1595 p = copy_process(clone_flags, stack_start, stack_size,
1600 child_tidptr, NULL, trace); 1596 child_tidptr, NULL, trace);
1601 /* 1597 /*
1602 * Do this prior waking up the new thread - the thread pointer 1598 * Do this prior waking up the new thread - the thread pointer
@@ -1634,15 +1630,58 @@ long do_fork(unsigned long clone_flags,
1634 return nr; 1630 return nr;
1635} 1631}
1636 1632
1637#ifdef CONFIG_GENERIC_KERNEL_THREAD
1638/* 1633/*
1639 * Create a kernel thread. 1634 * Create a kernel thread.
1640 */ 1635 */
1641pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) 1636pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
1642{ 1637{
1643 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, 1638 return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
1644 (unsigned long)arg, NULL, NULL); 1639 (unsigned long)arg, NULL, NULL);
1645} 1640}
1641
1642#ifdef __ARCH_WANT_SYS_FORK
1643SYSCALL_DEFINE0(fork)
1644{
1645#ifdef CONFIG_MMU
1646 return do_fork(SIGCHLD, 0, 0, NULL, NULL);
1647#else
1648 /* can not support in nommu mode */
1649 return(-EINVAL);
1650#endif
1651}
1652#endif
1653
1654#ifdef __ARCH_WANT_SYS_VFORK
1655SYSCALL_DEFINE0(vfork)
1656{
1657 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
1658 0, NULL, NULL);
1659}
1660#endif
1661
1662#ifdef __ARCH_WANT_SYS_CLONE
1663#ifdef CONFIG_CLONE_BACKWARDS
1664SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1665 int __user *, parent_tidptr,
1666 int, tls_val,
1667 int __user *, child_tidptr)
1668#elif defined(CONFIG_CLONE_BACKWARDS2)
1669SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
1670 int __user *, parent_tidptr,
1671 int __user *, child_tidptr,
1672 int, tls_val)
1673#else
1674SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1675 int __user *, parent_tidptr,
1676 int __user *, child_tidptr,
1677 int, tls_val)
1678#endif
1679{
1680 long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
1681 asmlinkage_protect(5, ret, clone_flags, newsp,
1682 parent_tidptr, child_tidptr, tls_val);
1683 return ret;
1684}
1646#endif 1685#endif
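With the pt_regs argument gone from do_fork(), kernel_thread() is an ordinary call again; a minimal sketch under that assumption (demo_* names invented, and new code would normally go through kthread_run() instead):

static int demo_thread_fn(void *data)
{
	pr_info("demo kernel thread, data=%p\n", data);
	return 0;	/* if the function returns, the thread exits */
}

static pid_t demo_spawn_thread(void)
{
	return kernel_thread(demo_thread_fn, NULL, CLONE_FS | CLONE_FILES);
}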
1647 1686
1648#ifndef ARCH_MIN_MMSTRUCT_ALIGN 1687#ifndef ARCH_MIN_MMSTRUCT_ALIGN
@@ -1694,7 +1733,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
1694{ 1733{
1695 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1734 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1696 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1735 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1697 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) 1736 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
1737 CLONE_NEWUSER|CLONE_NEWPID))
1698 return -EINVAL; 1738 return -EINVAL;
1699 /* 1739 /*
1700 * Not implemented, but pretend it works if there is nothing to 1740 * Not implemented, but pretend it works if there is nothing to
@@ -1761,19 +1801,40 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1761{ 1801{
1762 struct fs_struct *fs, *new_fs = NULL; 1802 struct fs_struct *fs, *new_fs = NULL;
1763 struct files_struct *fd, *new_fd = NULL; 1803 struct files_struct *fd, *new_fd = NULL;
1804 struct cred *new_cred = NULL;
1764 struct nsproxy *new_nsproxy = NULL; 1805 struct nsproxy *new_nsproxy = NULL;
1765 int do_sysvsem = 0; 1806 int do_sysvsem = 0;
1766 int err; 1807 int err;
1767 1808
1768 err = check_unshare_flags(unshare_flags); 1809 /*
1769 if (err) 1810 * If unsharing a user namespace, must also unshare the thread.
1770 goto bad_unshare_out; 1811 */
1771 1812 if (unshare_flags & CLONE_NEWUSER)
1813 unshare_flags |= CLONE_THREAD | CLONE_FS;
1814 /*
1815 * If unsharing a pid namespace, must also unshare the thread.
1816 */
1817 if (unshare_flags & CLONE_NEWPID)
1818 unshare_flags |= CLONE_THREAD;
1819 /*
1820 * If unsharing a thread from a thread group, must also unshare vm.
1821 */
1822 if (unshare_flags & CLONE_THREAD)
1823 unshare_flags |= CLONE_VM;
1824 /*
1825 * If unsharing vm, must also unshare signal handlers.
1826 */
1827 if (unshare_flags & CLONE_VM)
1828 unshare_flags |= CLONE_SIGHAND;
1772 /* 1829 /*
1773 * If unsharing namespace, must also unshare filesystem information. 1830 * If unsharing namespace, must also unshare filesystem information.
1774 */ 1831 */
1775 if (unshare_flags & CLONE_NEWNS) 1832 if (unshare_flags & CLONE_NEWNS)
1776 unshare_flags |= CLONE_FS; 1833 unshare_flags |= CLONE_FS;
1834
1835 err = check_unshare_flags(unshare_flags);
1836 if (err)
1837 goto bad_unshare_out;
1777 /* 1838 /*
1778 * CLONE_NEWIPC must also detach from the undolist: after switching 1839 * CLONE_NEWIPC must also detach from the undolist: after switching
1779 * to a new ipc namespace, the semaphore arrays from the old 1840 * to a new ipc namespace, the semaphore arrays from the old
@@ -1787,11 +1848,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1787 err = unshare_fd(unshare_flags, &new_fd); 1848 err = unshare_fd(unshare_flags, &new_fd);
1788 if (err) 1849 if (err)
1789 goto bad_unshare_cleanup_fs; 1850 goto bad_unshare_cleanup_fs;
1790 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); 1851 err = unshare_userns(unshare_flags, &new_cred);
1791 if (err) 1852 if (err)
1792 goto bad_unshare_cleanup_fd; 1853 goto bad_unshare_cleanup_fd;
1854 err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
1855 new_cred, new_fs);
1856 if (err)
1857 goto bad_unshare_cleanup_cred;
1793 1858
1794 if (new_fs || new_fd || do_sysvsem || new_nsproxy) { 1859 if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
1795 if (do_sysvsem) { 1860 if (do_sysvsem) {
1796 /* 1861 /*
1797 * CLONE_SYSVSEM is equivalent to sys_exit(). 1862 * CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1799,10 +1864,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1799 exit_sem(current); 1864 exit_sem(current);
1800 } 1865 }
1801 1866
1802 if (new_nsproxy) { 1867 if (new_nsproxy)
1803 switch_task_namespaces(current, new_nsproxy); 1868 switch_task_namespaces(current, new_nsproxy);
1804 new_nsproxy = NULL;
1805 }
1806 1869
1807 task_lock(current); 1870 task_lock(current);
1808 1871
@@ -1824,11 +1887,17 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1824 } 1887 }
1825 1888
1826 task_unlock(current); 1889 task_unlock(current);
1827 }
1828 1890
1829 if (new_nsproxy) 1891 if (new_cred) {
1830 put_nsproxy(new_nsproxy); 1892 /* Install the new user namespace */
1893 commit_creds(new_cred);
1894 new_cred = NULL;
1895 }
1896 }
1831 1897
1898bad_unshare_cleanup_cred:
1899 if (new_cred)
1900 put_cred(new_cred);
1832bad_unshare_cleanup_fd: 1901bad_unshare_cleanup_fd:
1833 if (new_fd) 1902 if (new_fd)
1834 put_files_struct(new_fd); 1903 put_files_struct(new_fd);
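A userspace sketch of the unshare() flag implications added in the hunk above: CLONE_NEWUSER now pulls in CLONE_THREAD, CLONE_FS, CLONE_VM and CLONE_SIGHAND automatically, so a single-threaded caller only names the namespace it wants. This is illustrative only and assumes the caller is single-threaded.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	if (unshare(CLONE_NEWUSER) != 0) {
		fprintf(stderr, "unshare(CLONE_NEWUSER): %s\n", strerror(errno));
		return 1;
	}
	printf("now running in a new user namespace\n");
	return 0;
}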
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 11f82a4d4eae..c38893b0efba 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p)
116 return false; 116 return false;
117 } 117 }
118 118
119 if (!(p->flags & PF_KTHREAD)) { 119 if (!(p->flags & PF_KTHREAD))
120 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
121 /* 121 else
122 * fake_signal_wake_up() goes through p's scheduler
123 * lock and guarantees that TASK_STOPPED/TRACED ->
124 * TASK_RUNNING transition can't race with task state
125 * testing in try_to_freeze_tasks().
126 */
127 } else {
128 wake_up_state(p, TASK_INTERRUPTIBLE); 122 wake_up_state(p, TASK_INTERRUPTIBLE);
129 }
130 123
131 spin_unlock_irqrestore(&freezer_lock, flags); 124 spin_unlock_irqrestore(&freezer_lock, flags);
132 return true; 125 return true;
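
For context, a minimal sketch of the kthread side that the simplified freeze_task() above wakes with wake_up_state(TASK_INTERRUPTIBLE); the thread name and loop are illustrative, the only requirement being a try_to_freeze() call in the sleep loop of a thread marked freezable.

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/jiffies.h>

static int demo_thread(void *data)
{
	set_freezable();

	while (!kthread_should_stop()) {
		/* Parks here when the freezer marks this thread frozen. */
		try_to_freeze();

		schedule_timeout_interruptible(HZ);
	}
	return 0;
}
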
diff --git a/kernel/futex.c b/kernel/futex.c
index 3717e7b306e0..b26dcfc02c94 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -60,6 +60,7 @@
60#include <linux/pid.h> 60#include <linux/pid.h>
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
62#include <linux/ptrace.h> 62#include <linux/ptrace.h>
63#include <linux/sched/rt.h>
63 64
64#include <asm/futex.h> 65#include <asm/futex.h>
65 66
@@ -222,10 +223,11 @@ static void drop_futex_key_refs(union futex_key *key)
222 * @rw: mapping needs to be read/write (values: VERIFY_READ, 223 * @rw: mapping needs to be read/write (values: VERIFY_READ,
223 * VERIFY_WRITE) 224 * VERIFY_WRITE)
224 * 225 *
225 * Returns a negative error code or 0 226 * Return: a negative error code or 0
227 *
226 * The key words are stored in *key on success. 228 * The key words are stored in *key on success.
227 * 229 *
228 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, 230 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
229 * offset_within_page). For private mappings, it's (uaddr, current->mm). 231 * offset_within_page). For private mappings, it's (uaddr, current->mm).
230 * We can usually work out the index without swapping in the page. 232 * We can usually work out the index without swapping in the page.
231 * 233 *
@@ -704,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
704 * be "current" except in the case of requeue pi. 706 * be "current" except in the case of requeue pi.
705 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 707 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
706 * 708 *
707 * Returns: 709 * Return:
708 * 0 - ready to wait 710 * 0 - ready to wait;
709 * 1 - acquired the lock 711 * 1 - acquired the lock;
710 * <0 - error 712 * <0 - error
711 * 713 *
712 * The hb->lock and futex_key refs shall be held by the caller. 714 * The hb->lock and futex_key refs shall be held by the caller.
@@ -716,7 +718,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
716 struct futex_pi_state **ps, 718 struct futex_pi_state **ps,
717 struct task_struct *task, int set_waiters) 719 struct task_struct *task, int set_waiters)
718{ 720{
719 int lock_taken, ret, ownerdied = 0; 721 int lock_taken, ret, force_take = 0;
720 u32 uval, newval, curval, vpid = task_pid_vnr(task); 722 u32 uval, newval, curval, vpid = task_pid_vnr(task);
721 723
722retry: 724retry:
@@ -755,17 +757,15 @@ retry:
755 newval = curval | FUTEX_WAITERS; 757 newval = curval | FUTEX_WAITERS;
756 758
757 /* 759 /*
758 * There are two cases, where a futex might have no owner (the 760 * Should we force take the futex? See below.
759 * owner TID is 0): OWNER_DIED. We take over the futex in this
760 * case. We also do an unconditional take over, when the owner
761 * of the futex died.
762 *
763 * This is safe as we are protected by the hash bucket lock !
764 */ 761 */
765 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 762 if (unlikely(force_take)) {
766 /* Keep the OWNER_DIED bit */ 763 /*
764 * Keep the OWNER_DIED and the WAITERS bit and set the
765 * new TID value.
766 */
767 newval = (curval & ~FUTEX_TID_MASK) | vpid; 767 newval = (curval & ~FUTEX_TID_MASK) | vpid;
768 ownerdied = 0; 768 force_take = 0;
769 lock_taken = 1; 769 lock_taken = 1;
770 } 770 }
771 771
@@ -775,7 +775,7 @@ retry:
775 goto retry; 775 goto retry;
776 776
777 /* 777 /*
778 * We took the lock due to owner died take over. 778 * We took the lock due to forced take over.
779 */ 779 */
780 if (unlikely(lock_taken)) 780 if (unlikely(lock_taken))
781 return 1; 781 return 1;
@@ -790,20 +790,25 @@ retry:
790 switch (ret) { 790 switch (ret) {
791 case -ESRCH: 791 case -ESRCH:
792 /* 792 /*
793 * No owner found for this futex. Check if the 793 * We failed to find an owner for this
794 * OWNER_DIED bit is set to figure out whether 794 * futex. So we have no pi_state to block
795 * this is a robust futex or not. 795 * on. This can happen in two cases:
796 *
797 * 1) The owner died
798 * 2) A stale FUTEX_WAITERS bit
799 *
800 * Re-read the futex value.
796 */ 801 */
797 if (get_futex_value_locked(&curval, uaddr)) 802 if (get_futex_value_locked(&curval, uaddr))
798 return -EFAULT; 803 return -EFAULT;
799 804
800 /* 805 /*
801 * We simply start over in case of a robust 806 * If the owner died or we have a stale
802 * futex. The code above will take the futex 807 * WAITERS bit the owner TID in the user space
803 * and return happy. 808 * futex is 0.
804 */ 809 */
805 if (curval & FUTEX_OWNER_DIED) { 810 if (!(curval & FUTEX_TID_MASK)) {
806 ownerdied = 1; 811 force_take = 1;
807 goto retry; 812 goto retry;
808 } 813 }
809 default: 814 default:
@@ -840,6 +845,9 @@ static void wake_futex(struct futex_q *q)
840{ 845{
841 struct task_struct *p = q->task; 846 struct task_struct *p = q->task;
842 847
848 if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
849 return;
850
843 /* 851 /*
844 * We set q->lock_ptr = NULL _before_ we wake up the task. If 852 * We set q->lock_ptr = NULL _before_ we wake up the task. If
845 * a non-futex wake up happens on another CPU then the task 853 * a non-futex wake up happens on another CPU then the task
@@ -1075,6 +1083,10 @@ retry_private:
1075 1083
1076 plist_for_each_entry_safe(this, next, head, list) { 1084 plist_for_each_entry_safe(this, next, head, list) {
1077 if (match_futex (&this->key, &key1)) { 1085 if (match_futex (&this->key, &key1)) {
1086 if (this->pi_state || this->rt_waiter) {
1087 ret = -EINVAL;
1088 goto out_unlock;
1089 }
1078 wake_futex(this); 1090 wake_futex(this);
1079 if (++ret >= nr_wake) 1091 if (++ret >= nr_wake)
1080 break; 1092 break;
@@ -1087,6 +1099,10 @@ retry_private:
1087 op_ret = 0; 1099 op_ret = 0;
1088 plist_for_each_entry_safe(this, next, head, list) { 1100 plist_for_each_entry_safe(this, next, head, list) {
1089 if (match_futex (&this->key, &key2)) { 1101 if (match_futex (&this->key, &key2)) {
1102 if (this->pi_state || this->rt_waiter) {
1103 ret = -EINVAL;
1104 goto out_unlock;
1105 }
1090 wake_futex(this); 1106 wake_futex(this);
1091 if (++op_ret >= nr_wake2) 1107 if (++op_ret >= nr_wake2)
1092 break; 1108 break;
@@ -1095,6 +1111,7 @@ retry_private:
1095 ret += op_ret; 1111 ret += op_ret;
1096 } 1112 }
1097 1113
1114out_unlock:
1098 double_unlock_hb(hb1, hb2); 1115 double_unlock_hb(hb1, hb2);
1099out_put_keys: 1116out_put_keys:
1100 put_futex_key(&key2); 1117 put_futex_key(&key2);
@@ -1175,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1175 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. 1192 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1176 * hb1 and hb2 must be held by the caller. 1193 * hb1 and hb2 must be held by the caller.
1177 * 1194 *
1178 * Returns: 1195 * Return:
1179 * 0 - failed to acquire the lock atomicly 1196 * 0 - failed to acquire the lock atomically;
1180 * 1 - acquired the lock 1197 * 1 - acquired the lock;
1181 * <0 - error 1198 * <0 - error
1182 */ 1199 */
1183static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1200static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1238,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1238 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1255 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1239 * uaddr2 atomically on behalf of the top waiter. 1256 * uaddr2 atomically on behalf of the top waiter.
1240 * 1257 *
1241 * Returns: 1258 * Return:
1242 * >=0 - on success, the number of tasks requeued or woken 1259 * >=0 - on success, the number of tasks requeued or woken;
1243 * <0 - on error 1260 * <0 - on error
1244 */ 1261 */
1245static int futex_requeue(u32 __user *uaddr1, unsigned int flags, 1262static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1384,9 +1401,13 @@ retry_private:
1384 /* 1401 /*
1385 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always 1402 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1386 * be paired with each other and no other futex ops. 1403 * be paired with each other and no other futex ops.
1404 *
1405 * We should never be requeueing a futex_q with a pi_state,
1406 * which is awaiting a futex_unlock_pi().
1387 */ 1407 */
1388 if ((requeue_pi && !this->rt_waiter) || 1408 if ((requeue_pi && !this->rt_waiter) ||
1389 (!requeue_pi && this->rt_waiter)) { 1409 (!requeue_pi && this->rt_waiter) ||
1410 this->pi_state) {
1390 ret = -EINVAL; 1411 ret = -EINVAL;
1391 break; 1412 break;
1392 } 1413 }
@@ -1516,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1516 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must 1537 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1517 * be paired with exactly one earlier call to queue_me(). 1538 * be paired with exactly one earlier call to queue_me().
1518 * 1539 *
1519 * Returns: 1540 * Return:
1520 * 1 - if the futex_q was still queued (and we removed unqueued it) 1541 * 1 - if the futex_q was still queued (and we removed unqueued it);
1521 * 0 - if the futex_q was already removed by the waking thread 1542 * 0 - if the futex_q was already removed by the waking thread
1522 */ 1543 */
1523static int unqueue_me(struct futex_q *q) 1544static int unqueue_me(struct futex_q *q)
@@ -1687,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart);
1687 * the pi_state owner as well as handle race conditions that may allow us to 1708 * the pi_state owner as well as handle race conditions that may allow us to
1688 * acquire the lock. Must be called with the hb lock held. 1709 * acquire the lock. Must be called with the hb lock held.
1689 * 1710 *
1690 * Returns: 1711 * Return:
1691 * 1 - success, lock taken 1712 * 1 - success, lock taken;
1692 * 0 - success, lock not taken 1713 * 0 - success, lock not taken;
1693 * <0 - on error (-EFAULT) 1714 * <0 - on error (-EFAULT)
1694 */ 1715 */
1695static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 1716static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1804,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1804 * Return with the hb lock held and a q.key reference on success, and unlocked 1825 * Return with the hb lock held and a q.key reference on success, and unlocked
1805 * with no q.key reference on failure. 1826 * with no q.key reference on failure.
1806 * 1827 *
1807 * Returns: 1828 * Return:
1808 * 0 - uaddr contains val and hb has been locked 1829 * 0 - uaddr contains val and hb has been locked;
1809 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 1830 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1810 */ 1831 */
1811static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 1832static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -2183,9 +2204,9 @@ pi_faulted:
2183 * the wakeup and return the appropriate error code to the caller. Must be 2204 * the wakeup and return the appropriate error code to the caller. Must be
2184 * called with the hb lock held. 2205 * called with the hb lock held.
2185 * 2206 *
2186 * Returns 2207 * Return:
2187 * 0 - no early wakeup detected 2208 * 0 = no early wakeup detected;
2188 * <0 - -ETIMEDOUT or -ERESTARTNOINTR 2209 * <0 = -ETIMEDOUT or -ERESTARTNOINTR
2189 */ 2210 */
2190static inline 2211static inline
2191int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, 2212int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2227,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2227 * @val: the expected value of uaddr 2248 * @val: the expected value of uaddr
2228 * @abs_time: absolute timeout 2249 * @abs_time: absolute timeout
2229 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all 2250 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2230 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2231 * @uaddr2: the pi futex we will take prior to returning to user-space 2251 * @uaddr2: the pi futex we will take prior to returning to user-space
2232 * 2252 *
2233 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2253 * The caller will wait on uaddr and will be requeued by futex_requeue() to
@@ -2238,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2238 * there was a need to. 2258 * there was a need to.
2239 * 2259 *
2240 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2260 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2241 * via the following: 2261 * via the following--
2242 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2262 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2243 * 2) wakeup on uaddr2 after a requeue 2263 * 2) wakeup on uaddr2 after a requeue
2244 * 3) signal 2264 * 3) signal
@@ -2256,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2256 * 2276 *
2257 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2277 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2258 * 2278 *
2259 * Returns: 2279 * Return:
2260 * 0 - On success 2280 * 0 - On success;
2261 * <0 - On error 2281 * <0 - On error
2262 */ 2282 */
2263static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, 2283static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
@@ -2452,8 +2472,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
2452 if (!futex_cmpxchg_enabled) 2472 if (!futex_cmpxchg_enabled)
2453 return -ENOSYS; 2473 return -ENOSYS;
2454 2474
2455 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
2456
2457 rcu_read_lock(); 2475 rcu_read_lock();
2458 2476
2459 ret = -ESRCH; 2477 ret = -ESRCH;
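
To illustrate the "force take" rule documented above, here is a hedged userspace-flavoured sketch of the same value transformation the kernel applies when the owner TID field of a PI futex is 0 (owner died, or a stale FUTEX_WAITERS bit was left behind): install our TID while preserving the OWNER_DIED and WAITERS bits. The helper name is made up; in practice the kernel performs this atomically under the hash-bucket lock.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdbool.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static bool try_take_orphaned_futex(uint32_t *uaddr)
{
	uint32_t tid = (uint32_t)syscall(SYS_gettid);
	uint32_t cur = __atomic_load_n(uaddr, __ATOMIC_RELAXED);

	/* Only eligible when no live owner TID is recorded. */
	if (cur & FUTEX_TID_MASK)
		return false;

	/* Keep OWNER_DIED/WAITERS and set the new TID -- the same
	 * newval = (curval & ~FUTEX_TID_MASK) | vpid step as above. */
	uint32_t newval = (cur & ~FUTEX_TID_MASK) | tid;

	return __atomic_compare_exchange_n(uaddr, &cur, newval, false,
					   __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}
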
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 83e368b005fc..f9f44fd4d34d 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -11,6 +11,7 @@
11#include <linux/nsproxy.h> 11#include <linux/nsproxy.h>
12#include <linux/futex.h> 12#include <linux/futex.h>
13#include <linux/ptrace.h> 13#include <linux/ptrace.h>
14#include <linux/syscalls.h>
14 15
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16 17
@@ -116,9 +117,9 @@ void compat_exit_robust_list(struct task_struct *curr)
116 } 117 }
117} 118}
118 119
119asmlinkage long 120COMPAT_SYSCALL_DEFINE2(set_robust_list,
120compat_sys_set_robust_list(struct compat_robust_list_head __user *head, 121 struct compat_robust_list_head __user *, head,
121 compat_size_t len) 122 compat_size_t, len)
122{ 123{
123 if (!futex_cmpxchg_enabled) 124 if (!futex_cmpxchg_enabled)
124 return -ENOSYS; 125 return -ENOSYS;
@@ -131,9 +132,9 @@ compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
131 return 0; 132 return 0;
132} 133}
133 134
134asmlinkage long 135COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
135compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, 136 compat_uptr_t __user *, head_ptr,
136 compat_size_t __user *len_ptr) 137 compat_size_t __user *, len_ptr)
137{ 138{
138 struct compat_robust_list_head __user *head; 139 struct compat_robust_list_head __user *head;
139 unsigned long ret; 140 unsigned long ret;
@@ -142,8 +143,6 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
142 if (!futex_cmpxchg_enabled) 143 if (!futex_cmpxchg_enabled)
143 return -ENOSYS; 144 return -ENOSYS;
144 145
145 WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
146
147 rcu_read_lock(); 146 rcu_read_lock();
148 147
149 ret = -ESRCH; 148 ret = -ESRCH;
@@ -172,9 +171,9 @@ err_unlock:
172 return ret; 171 return ret;
173} 172}
174 173
175asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, 174COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
176 struct compat_timespec __user *utime, u32 __user *uaddr2, 175 struct compat_timespec __user *, utime, u32 __user *, uaddr2,
177 u32 val3) 176 u32, val3)
178{ 177{
179 struct timespec ts; 178 struct timespec ts;
180 ktime_t t, *tp = NULL; 179 ktime_t t, *tp = NULL;
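
A short sketch of the conversion pattern applied above, using a made-up syscall name ("frobnicate") purely for illustration: COMPAT_SYSCALL_DEFINEn takes alternating parameter types and names and generates the compat entry point, replacing the hand-written "asmlinkage long compat_sys_*()" form.

#include <linux/compat.h>
#include <linux/syscalls.h>

/* Expands to compat_sys_frobnicate() with the proper argument handling,
 * instead of an open-coded asmlinkage prototype. */
COMPAT_SYSCALL_DEFINE2(frobnicate, compat_uptr_t __user *, ptr,
		       compat_size_t, len)
{
	if (!ptr)
		return -EINVAL;
	return 0;
}
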
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index a92028196cc1..d4da55d1fb65 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
35config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE 38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE
39 default n 39 default n
40 ---help--- 40 ---help---
41 This options activates profiling for the entire kernel. 41 This options activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6db7a5ed52b5..14be27feda49 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -44,6 +44,8 @@
44#include <linux/err.h> 44#include <linux/err.h>
45#include <linux/debugobjects.h> 45#include <linux/debugobjects.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/sched/sysctl.h>
48#include <linux/sched/rt.h>
47#include <linux/timer.h> 49#include <linux/timer.h>
48 50
49#include <asm/uaccess.h> 51#include <asm/uaccess.h>
@@ -61,6 +63,7 @@
61DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 63DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
62{ 64{
63 65
66 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
64 .clock_base = 67 .clock_base =
65 { 68 {
66 { 69 {
@@ -640,21 +643,9 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
640 * and expiry check is done in the hrtimer_interrupt or in the softirq. 643 * and expiry check is done in the hrtimer_interrupt or in the softirq.
641 */ 644 */
642static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 645static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
643 struct hrtimer_clock_base *base, 646 struct hrtimer_clock_base *base)
644 int wakeup)
645{ 647{
646 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 648 return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
647 if (wakeup) {
648 raw_spin_unlock(&base->cpu_base->lock);
649 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
650 raw_spin_lock(&base->cpu_base->lock);
651 } else
652 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
653
654 return 1;
655 }
656
657 return 0;
658} 649}
659 650
660static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) 651static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
@@ -735,8 +726,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; }
735static inline void 726static inline void
736hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } 727hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
737static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 728static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
738 struct hrtimer_clock_base *base, 729 struct hrtimer_clock_base *base)
739 int wakeup)
740{ 730{
741 return 0; 731 return 0;
742} 732}
@@ -995,8 +985,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
995 * 985 *
996 * XXX send_remote_softirq() ? 986 * XXX send_remote_softirq() ?
997 */ 987 */
998 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) 988 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
999 hrtimer_enqueue_reprogram(timer, new_base, wakeup); 989 && hrtimer_enqueue_reprogram(timer, new_base)) {
990 if (wakeup) {
991 /*
992 * We need to drop cpu_base->lock to avoid a
993 * lock ordering issue vs. rq->lock.
994 */
995 raw_spin_unlock(&new_base->cpu_base->lock);
996 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
997 local_irq_restore(flags);
998 return ret;
999 } else {
1000 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
1001 }
1002 }
1000 1003
1001 unlock_hrtimer_base(timer, &flags); 1004 unlock_hrtimer_base(timer, &flags);
1002 1005
@@ -1640,8 +1643,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1640 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1643 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1641 int i; 1644 int i;
1642 1645
1643 raw_spin_lock_init(&cpu_base->lock);
1644
1645 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1646 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1646 cpu_base->clock_base[i].cpu_base = cpu_base; 1647 cpu_base->clock_base[i].cpu_base = cpu_base;
1647 timerqueue_init_head(&cpu_base->clock_base[i].active); 1648 timerqueue_init_head(&cpu_base->clock_base[i].active);
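
The hrtimer change above replaces a runtime raw_spin_lock_init() in the CPU-up path with a static initializer, so the per-CPU lock is valid before any hotplug code runs. A hedged sketch of the same pattern with an illustrative structure name:

#include <linux/percpu.h>
#include <linux/spinlock.h>

struct my_cpu_base {
	raw_spinlock_t lock;
	unsigned long  nr_events;
};

static DEFINE_PER_CPU(struct my_cpu_base, my_cpu_base) = {
	/* Statically initialized, so no init_hrtimers_cpu()-style
	 * raw_spin_lock_init() call is needed at CPU bring-up. */
	.lock = __RAW_SPIN_LOCK_UNLOCKED(my_cpu_base.lock),
};
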
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 57d86d07221e..cbd97ce0b000 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data)
90EXPORT_SYMBOL(irq_set_handler_data); 90EXPORT_SYMBOL(irq_set_handler_data);
91 91
92/** 92/**
93 * irq_set_msi_desc - set MSI descriptor data for an irq 93 * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
94 * @irq: Interrupt number 94 * @irq_base: Interrupt number base
95 * @entry: Pointer to MSI descriptor data 95 * @irq_offset: Interrupt number offset
96 * @entry: Pointer to MSI descriptor data
96 * 97 *
97 * Set the MSI descriptor entry for an irq 98 * Set the MSI descriptor entry for an irq at offset
98 */ 99 */
99int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) 100int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
101 struct msi_desc *entry)
100{ 102{
101 unsigned long flags; 103 unsigned long flags;
102 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); 104 struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
103 105
104 if (!desc) 106 if (!desc)
105 return -EINVAL; 107 return -EINVAL;
106 desc->irq_data.msi_desc = entry; 108 desc->irq_data.msi_desc = entry;
107 if (entry) 109 if (entry && !irq_offset)
108 entry->irq = irq; 110 entry->irq = irq_base;
109 irq_put_desc_unlock(desc, flags); 111 irq_put_desc_unlock(desc, flags);
110 return 0; 112 return 0;
111} 113}
112 114
113/** 115/**
116 * irq_set_msi_desc - set MSI descriptor data for an irq
117 * @irq: Interrupt number
118 * @entry: Pointer to MSI descriptor data
119 *
120 * Set the MSI descriptor entry for an irq
121 */
122int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
123{
124 return irq_set_msi_desc_off(irq, 0, entry);
125}
126
127/**
114 * irq_set_chip_data - set irq chip data for an irq 128 * irq_set_chip_data - set irq chip data for an irq
115 * @irq: Interrupt number 129 * @irq: Interrupt number
116 * @data: Pointer to chip specific data 130 * @data: Pointer to chip specific data
@@ -272,6 +286,7 @@ void handle_nested_irq(unsigned int irq)
272 286
273 raw_spin_lock_irq(&desc->lock); 287 raw_spin_lock_irq(&desc->lock);
274 288
289 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
275 kstat_incr_irqs_this_cpu(irq, desc); 290 kstat_incr_irqs_this_cpu(irq, desc);
276 291
277 action = desc->action; 292 action = desc->action;
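
A hedged driver-side sketch of the new irq_set_msi_desc_off() helper above: every vector of a multi-MSI block gets the descriptor attached, but only offset 0 records desc->irq = irq_base. The names base_irq, nvec and desc stand in for whatever the driver actually allocated.

#include <linux/irq.h>
#include <linux/msi.h>

static int attach_msi_block(unsigned int base_irq, unsigned int nvec,
			    struct msi_desc *desc)
{
	unsigned int i;
	int ret;

	for (i = 0; i < nvec; i++) {
		/* Only irq_offset == 0 sets desc->irq (see above). */
		ret = irq_set_msi_desc_off(base_irq, i, desc);
		if (ret)
			return ret;
	}
	return 0;
}
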
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4e69e24d3d7d..96f3a1d9c379 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -177,8 +177,8 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
177 irq_base = irq_alloc_descs(first_irq, first_irq, size, 177 irq_base = irq_alloc_descs(first_irq, first_irq, size,
178 of_node_to_nid(of_node)); 178 of_node_to_nid(of_node));
179 if (irq_base < 0) { 179 if (irq_base < 0) {
180 WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", 180 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
181 first_irq); 181 first_irq);
182 irq_base = first_irq; 182 irq_base = first_irq;
183 } 183 }
184 } else 184 } else
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4c69326aa773..fa17855ca65a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/sched/rt.h>
19#include <linux/task_work.h> 20#include <linux/task_work.h>
20 21
21#include "internals.h" 22#include "internals.h"
@@ -616,6 +617,22 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
616 return ret; 617 return ret;
617} 618}
618 619
620#ifdef CONFIG_HARDIRQS_SW_RESEND
621int irq_set_parent(int irq, int parent_irq)
622{
623 unsigned long flags;
624 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
625
626 if (!desc)
627 return -EINVAL;
628
629 desc->parent_irq = parent_irq;
630
631 irq_put_desc_unlock(desc, flags);
632 return 0;
633}
634#endif
635
619/* 636/*
620 * Default primary interrupt handler for threaded interrupts. Is 637 * Default primary interrupt handler for threaded interrupts. Is
621 * assigned as primary handler when request_threaded_irq is called 638 * assigned as primary handler when request_threaded_irq is called
@@ -716,6 +733,7 @@ static void
716irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 733irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
717{ 734{
718 cpumask_var_t mask; 735 cpumask_var_t mask;
736 bool valid = true;
719 737
720 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) 738 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
721 return; 739 return;
@@ -730,10 +748,18 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
730 } 748 }
731 749
732 raw_spin_lock_irq(&desc->lock); 750 raw_spin_lock_irq(&desc->lock);
733 cpumask_copy(mask, desc->irq_data.affinity); 751 /*
752 * This code is triggered unconditionally. Check the affinity
753 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
754 */
755 if (desc->irq_data.affinity)
756 cpumask_copy(mask, desc->irq_data.affinity);
757 else
758 valid = false;
734 raw_spin_unlock_irq(&desc->lock); 759 raw_spin_unlock_irq(&desc->lock);
735 760
736 set_cpus_allowed_ptr(current, mask); 761 if (valid)
762 set_cpus_allowed_ptr(current, mask);
737 free_cpumask_var(mask); 763 free_cpumask_var(mask);
738} 764}
739#else 765#else
@@ -793,7 +819,7 @@ static void irq_thread_dtor(struct callback_head *unused)
793 action = kthread_data(tsk); 819 action = kthread_data(tsk);
794 820
795 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 821 pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
796 tsk->comm ? tsk->comm : "", tsk->pid, action->irq); 822 tsk->comm, tsk->pid, action->irq);
797 823
798 824
799 desc = irq_to_desc(action->irq); 825 desc = irq_to_desc(action->irq);
@@ -833,6 +859,8 @@ static int irq_thread(void *data)
833 init_task_work(&on_exit_work, irq_thread_dtor); 859 init_task_work(&on_exit_work, irq_thread_dtor);
834 task_work_add(current, &on_exit_work, false); 860 task_work_add(current, &on_exit_work, false);
835 861
862 irq_thread_check_affinity(desc, action);
863
836 while (!irq_wait_for_interrupt(action)) { 864 while (!irq_wait_for_interrupt(action)) {
837 irqreturn_t action_ret; 865 irqreturn_t action_ret;
838 866
@@ -936,6 +964,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
936 */ 964 */
937 get_task_struct(t); 965 get_task_struct(t);
938 new->thread = t; 966 new->thread = t;
967 /*
968 * Tell the thread to set its affinity. This is
969 * important for shared interrupt handlers as we do
970 * not invoke setup_affinity() for the secondary
971 * handlers as everything is already set up. Even for
972 * interrupts marked with IRQF_NO_BALANCE this is
973 * correct as we want the thread to move to the cpu(s)
974 * on which the requesting code placed the interrupt.
975 */
976 set_bit(IRQTF_AFFINITY, &new->thread_flags);
939 } 977 }
940 978
941 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { 979 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -1487,6 +1525,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
1487out: 1525out:
1488 irq_put_desc_unlock(desc, flags); 1526 irq_put_desc_unlock(desc, flags);
1489} 1527}
1528EXPORT_SYMBOL_GPL(enable_percpu_irq);
1490 1529
1491void disable_percpu_irq(unsigned int irq) 1530void disable_percpu_irq(unsigned int irq)
1492{ 1531{
@@ -1500,6 +1539,7 @@ void disable_percpu_irq(unsigned int irq)
1500 irq_percpu_disable(desc, cpu); 1539 irq_percpu_disable(desc, cpu);
1501 irq_put_desc_unlock(desc, flags); 1540 irq_put_desc_unlock(desc, flags);
1502} 1541}
1542EXPORT_SYMBOL_GPL(disable_percpu_irq);
1503 1543
1504/* 1544/*
1505 * Internal function to unregister a percpu irqaction. 1545 * Internal function to unregister a percpu irqaction.
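
A hedged sketch of how a driver might wire up the new irq_set_parent() hook for a nested-threaded child interrupt, so that the software resend path (see kernel/irq/resend.c below) retriggers the parent line. The IRQ numbers and handler are placeholders, and this assumes CONFIG_HARDIRQS_SW_RESEND is enabled.

#include <linux/interrupt.h>
#include <linux/irq.h>

static irqreturn_t demo_child_thread_fn(int irq, void *dev_id)
{
	/* Handle the nested child interrupt in thread context. */
	return IRQ_HANDLED;
}

static int demo_wire_child_irq(int child_irq, int parent_irq, void *dev)
{
	int ret;

	irq_set_nested_thread(child_irq, true);

	ret = request_threaded_irq(child_irq, NULL, demo_child_thread_fn,
				   IRQF_ONESHOT, "demo-child", dev);
	if (ret)
		return ret;

	/* Ask the core to retrigger the parent on a SW resend. */
	return irq_set_parent(child_irq, parent_irq);
}
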
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4bd4faa6323a..397db02209ed 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
76static ssize_t write_irq_affinity(int type, struct file *file, 76static ssize_t write_irq_affinity(int type, struct file *file,
77 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
78{ 78{
79 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 79 unsigned int irq = (int)(long)PDE(file_inode(file))->data;
80 cpumask_var_t new_value; 80 cpumask_var_t new_value;
81 int err; 81 int err;
82 82
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 6454db7b6a4d..9065107f083e 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -74,6 +74,14 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
74 if (!desc->irq_data.chip->irq_retrigger || 74 if (!desc->irq_data.chip->irq_retrigger ||
75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
76#ifdef CONFIG_HARDIRQS_SW_RESEND 76#ifdef CONFIG_HARDIRQS_SW_RESEND
77 /*
78 * If the interrupt has a parent irq and runs
79 * in the thread context of the parent irq,
80 * retrigger the parent.
81 */
82 if (desc->parent_irq &&
83 irq_settings_is_nested_thread(desc))
84 irq = desc->parent_irq;
77 /* Set it pending and activate the softirq: */ 85 /* Set it pending and activate the softirq: */
78 set_bit(irq, irqs_resend); 86 set_bit(irq, irqs_resend);
79 tasklet_schedule(&resend_tasklet); 87 tasklet_schedule(&resend_tasklet);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 611cd6003c45..7b5f012bde9d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
80 80
81 /* 81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the 82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well. 83 * first.
84 */ 84 */
85 action = desc->action; 85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) || 86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || 87 (action->flags & __IRQF_TIMER))
88 (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
89 !action->next)
90 goto out; 88 goto out;
91 89
92 /* Already running on another processor */ 90 /* Already running on another processor */
@@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
104 do { 102 do {
105 if (handle_irq_event(desc) == IRQ_HANDLED) 103 if (handle_irq_event(desc) == IRQ_HANDLED)
106 ret = IRQ_HANDLED; 104 ret = IRQ_HANDLED;
105 /* Make sure that there is still a valid action */
107 action = desc->action; 106 action = desc->action;
108 } while ((desc->istate & IRQS_PENDING) && action); 107 } while ((desc->istate & IRQS_PENDING) && action);
109 desc->istate &= ~IRQS_POLL_INPROGRESS; 108 desc->istate &= ~IRQS_POLL_INPROGRESS;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 1588e3b2871b..55fcce6065cf 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -12,37 +12,36 @@
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/hardirq.h> 13#include <linux/hardirq.h>
14#include <linux/irqflags.h> 14#include <linux/irqflags.h>
15#include <linux/sched.h>
16#include <linux/tick.h>
17#include <linux/cpu.h>
18#include <linux/notifier.h>
15#include <asm/processor.h> 19#include <asm/processor.h>
16 20
17/*
18 * An entry can be in one of four states:
19 *
20 * free NULL, 0 -> {claimed} : free to be used
21 * claimed NULL, 3 -> {pending} : claimed to be enqueued
22 * pending next, 3 -> {busy} : queued, pending callback
23 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
24 */
25
26#define IRQ_WORK_PENDING 1UL
27#define IRQ_WORK_BUSY 2UL
28#define IRQ_WORK_FLAGS 3UL
29 21
30static DEFINE_PER_CPU(struct llist_head, irq_work_list); 22static DEFINE_PER_CPU(struct llist_head, irq_work_list);
23static DEFINE_PER_CPU(int, irq_work_raised);
31 24
32/* 25/*
33 * Claim the entry so that no one else will poke at it. 26 * Claim the entry so that no one else will poke at it.
34 */ 27 */
35static bool irq_work_claim(struct irq_work *work) 28static bool irq_work_claim(struct irq_work *work)
36{ 29{
37 unsigned long flags, nflags; 30 unsigned long flags, oflags, nflags;
38 31
32 /*
33 * Start with our best wish as a premise but only trust any
34 * flag value after cmpxchg() result.
35 */
36 flags = work->flags & ~IRQ_WORK_PENDING;
39 for (;;) { 37 for (;;) {
40 flags = work->flags;
41 if (flags & IRQ_WORK_PENDING)
42 return false;
43 nflags = flags | IRQ_WORK_FLAGS; 38 nflags = flags | IRQ_WORK_FLAGS;
44 if (cmpxchg(&work->flags, flags, nflags) == flags) 39 oflags = cmpxchg(&work->flags, flags, nflags);
40 if (oflags == flags)
45 break; 41 break;
42 if (oflags & IRQ_WORK_PENDING)
43 return false;
44 flags = oflags;
46 cpu_relax(); 45 cpu_relax();
47 } 46 }
48 47
@@ -57,57 +56,69 @@ void __weak arch_irq_work_raise(void)
57} 56}
58 57
59/* 58/*
60 * Queue the entry and raise the IPI if needed. 59 * Enqueue the irq_work @entry unless it's already pending
60 * somewhere.
61 *
62 * Can be re-enqueued while the callback is still in progress.
61 */ 63 */
62static void __irq_work_queue(struct irq_work *work) 64void irq_work_queue(struct irq_work *work)
63{ 65{
64 bool empty; 66 /* Only queue if not already pending */
67 if (!irq_work_claim(work))
68 return;
65 69
70 /* Queue the entry and raise the IPI if needed. */
66 preempt_disable(); 71 preempt_disable();
67 72
68 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); 73 llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
69 /* The list was empty, raise self-interrupt to start processing. */ 74
70 if (empty) 75 /*
71 arch_irq_work_raise(); 76 * If the work is not "lazy" or the tick is stopped, raise the irq
77 * work interrupt (if supported by the arch), otherwise, just wait
78 * for the next tick.
79 */
80 if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
81 if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
82 arch_irq_work_raise();
83 }
72 84
73 preempt_enable(); 85 preempt_enable();
74} 86}
87EXPORT_SYMBOL_GPL(irq_work_queue);
75 88
76/* 89bool irq_work_needs_cpu(void)
77 * Enqueue the irq_work @entry, returns true on success, failure when the
78 * @entry was already enqueued by someone else.
79 *
80 * Can be re-enqueued while the callback is still in progress.
81 */
82bool irq_work_queue(struct irq_work *work)
83{ 90{
84 if (!irq_work_claim(work)) { 91 struct llist_head *this_list;
85 /* 92
86 * Already enqueued, can't do! 93 this_list = &__get_cpu_var(irq_work_list);
87 */ 94 if (llist_empty(this_list))
88 return false; 95 return false;
89 }
90 96
91 __irq_work_queue(work); 97 /* All work should have been flushed before going offline */
98 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
99
92 return true; 100 return true;
93} 101}
94EXPORT_SYMBOL_GPL(irq_work_queue);
95 102
96/* 103static void __irq_work_run(void)
97 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
98 * context with local IRQs disabled.
99 */
100void irq_work_run(void)
101{ 104{
105 unsigned long flags;
102 struct irq_work *work; 106 struct irq_work *work;
103 struct llist_head *this_list; 107 struct llist_head *this_list;
104 struct llist_node *llnode; 108 struct llist_node *llnode;
105 109
110
111 /*
112 * Reset the "raised" state right before we check the list because
113 * an NMI may enqueue after we find the list empty from the runner.
114 */
115 __this_cpu_write(irq_work_raised, 0);
116 barrier();
117
106 this_list = &__get_cpu_var(irq_work_list); 118 this_list = &__get_cpu_var(irq_work_list);
107 if (llist_empty(this_list)) 119 if (llist_empty(this_list))
108 return; 120 return;
109 121
110 BUG_ON(!in_irq());
111 BUG_ON(!irqs_disabled()); 122 BUG_ON(!irqs_disabled());
112 123
113 llnode = llist_del_all(this_list); 124 llnode = llist_del_all(this_list);
@@ -119,16 +130,31 @@ void irq_work_run(void)
119 /* 130 /*
120 * Clear the PENDING bit, after this point the @work 131 * Clear the PENDING bit, after this point the @work
121 * can be re-used. 132 * can be re-used.
133 * Make it immediately visible so that other CPUs trying
134 * to claim that work don't rely on us to handle their data
135 * while we are in the middle of the func.
122 */ 136 */
123 work->flags = IRQ_WORK_BUSY; 137 flags = work->flags & ~IRQ_WORK_PENDING;
138 xchg(&work->flags, flags);
139
124 work->func(work); 140 work->func(work);
125 /* 141 /*
126 * Clear the BUSY bit and return to the free state if 142 * Clear the BUSY bit and return to the free state if
127 * no-one else claimed it meanwhile. 143 * no-one else claimed it meanwhile.
128 */ 144 */
129 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); 145 (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
130 } 146 }
131} 147}
148
149/*
150 * Run the irq_work entries on this cpu. Requires to be ran from hardirq
151 * context with local IRQs disabled.
152 */
153void irq_work_run(void)
154{
155 BUG_ON(!in_irq());
156 __irq_work_run();
157}
132EXPORT_SYMBOL_GPL(irq_work_run); 158EXPORT_SYMBOL_GPL(irq_work_run);
133 159
134/* 160/*
@@ -143,3 +169,35 @@ void irq_work_sync(struct irq_work *work)
143 cpu_relax(); 169 cpu_relax();
144} 170}
145EXPORT_SYMBOL_GPL(irq_work_sync); 171EXPORT_SYMBOL_GPL(irq_work_sync);
172
173#ifdef CONFIG_HOTPLUG_CPU
174static int irq_work_cpu_notify(struct notifier_block *self,
175 unsigned long action, void *hcpu)
176{
177 long cpu = (long)hcpu;
178
179 switch (action) {
180 case CPU_DYING:
181 /* Called from stop_machine */
182 if (WARN_ON_ONCE(cpu != smp_processor_id()))
183 break;
184 __irq_work_run();
185 break;
186 default:
187 break;
188 }
189 return NOTIFY_OK;
190}
191
192static struct notifier_block cpu_notify;
193
194static __init int irq_work_init_cpu_notifier(void)
195{
196 cpu_notify.notifier_call = irq_work_cpu_notify;
197 cpu_notify.priority = 0;
198 register_cpu_notifier(&cpu_notify);
199 return 0;
200}
201device_initcall(irq_work_init_cpu_notifier);
202
203#endif /* CONFIG_HOTPLUG_CPU */
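
A brief usage sketch for the reworked irq_work API above: irq_work_queue() no longer reports whether the entry was newly enqueued, and an entry flagged IRQ_WORK_LAZY may wait for the next tick instead of raising the self-IPI. The callback and the statically initialized work item are illustrative only.

#include <linux/irq_work.h>
#include <linux/printk.h>

static void demo_irq_work_fn(struct irq_work *work)
{
	/* Runs from the irq_work interrupt, or from the tick if lazy. */
	pr_info("irq_work fired\n");
}

static struct irq_work demo_work = {
	.flags = IRQ_WORK_LAZY,
	.func  = demo_irq_work_fn,
};

/* Safe from NMI/irq context; a still-pending entry cannot be claimed
 * again, so repeated calls are coalesced. */
static void demo_poke(void)
{
	irq_work_queue(&demo_work);
}
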
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 30b7b225306c..e30ac0fe61c3 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -4,6 +4,7 @@
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/random.h> 5#include <linux/random.h>
6#include <linux/module.h> 6#include <linux/module.h>
7#include <linux/ptrace.h>
7#include <linux/init.h> 8#include <linux/init.h>
8#include <linux/errno.h> 9#include <linux/errno.h>
9#include <linux/cache.h> 10#include <linux/cache.h>
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5e4bd7864c5d..bddd3d7a74b6 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -54,6 +54,12 @@ struct resource crashk_res = {
54 .end = 0, 54 .end = 0,
55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
56}; 56};
57struct resource crashk_low_res = {
58 .name = "Crash kernel low",
59 .start = 0,
60 .end = 0,
61 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
62};
57 63
58int kexec_should_crash(struct task_struct *p) 64int kexec_should_crash(struct task_struct *p)
59{ 65{
@@ -223,6 +229,8 @@ out:
223 229
224} 230}
225 231
232static void kimage_free_page_list(struct list_head *list);
233
226static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 234static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
227 unsigned long nr_segments, 235 unsigned long nr_segments,
228 struct kexec_segment __user *segments) 236 struct kexec_segment __user *segments)
@@ -236,8 +244,6 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
236 if (result) 244 if (result)
237 goto out; 245 goto out;
238 246
239 *rimage = image;
240
241 /* 247 /*
242 * Find a location for the control code buffer, and add it 248 * Find a location for the control code buffer, and add it
243 * the vector of segments so that it's pages will also be 249 * the vector of segments so that it's pages will also be
@@ -248,22 +254,22 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
248 get_order(KEXEC_CONTROL_PAGE_SIZE)); 254 get_order(KEXEC_CONTROL_PAGE_SIZE));
249 if (!image->control_code_page) { 255 if (!image->control_code_page) {
250 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 256 printk(KERN_ERR "Could not allocate control_code_buffer\n");
251 goto out; 257 goto out_free;
252 } 258 }
253 259
254 image->swap_page = kimage_alloc_control_pages(image, 0); 260 image->swap_page = kimage_alloc_control_pages(image, 0);
255 if (!image->swap_page) { 261 if (!image->swap_page) {
256 printk(KERN_ERR "Could not allocate swap buffer\n"); 262 printk(KERN_ERR "Could not allocate swap buffer\n");
257 goto out; 263 goto out_free;
258 } 264 }
259 265
260 result = 0; 266 *rimage = image;
261 out: 267 return 0;
262 if (result == 0)
263 *rimage = image;
264 else
265 kfree(image);
266 268
269out_free:
270 kimage_free_page_list(&image->control_pages);
271 kfree(image);
272out:
267 return result; 273 return result;
268} 274}
269 275
@@ -310,7 +316,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
310 mend = mstart + image->segment[i].memsz - 1; 316 mend = mstart + image->segment[i].memsz - 1;
311 /* Ensure we are within the crash kernel limits */ 317 /* Ensure we are within the crash kernel limits */
312 if ((mstart < crashk_res.start) || (mend > crashk_res.end)) 318 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
313 goto out; 319 goto out_free;
314 } 320 }
315 321
316 /* 322 /*
@@ -323,16 +329,15 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
323 get_order(KEXEC_CONTROL_PAGE_SIZE)); 329 get_order(KEXEC_CONTROL_PAGE_SIZE));
324 if (!image->control_code_page) { 330 if (!image->control_code_page) {
325 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 331 printk(KERN_ERR "Could not allocate control_code_buffer\n");
326 goto out; 332 goto out_free;
327 } 333 }
328 334
329 result = 0; 335 *rimage = image;
330out: 336 return 0;
331 if (result == 0)
332 *rimage = image;
333 else
334 kfree(image);
335 337
338out_free:
339 kfree(image);
340out:
336 return result; 341 return result;
337} 342}
338 343
@@ -497,8 +502,6 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
497 502
498 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) 503 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
499 break; 504 break;
500 if (hole_end > crashk_res.end)
501 break;
502 /* See if I overlap any of the segments */ 505 /* See if I overlap any of the segments */
503 for (i = 0; i < image->nr_segments; i++) { 506 for (i = 0; i < image->nr_segments; i++) {
504 unsigned long mstart, mend; 507 unsigned long mstart, mend;
@@ -1369,10 +1372,11 @@ static int __init parse_crashkernel_simple(char *cmdline,
1369 * That function is the entry point for command line parsing and should be 1372 * That function is the entry point for command line parsing and should be
1370 * called from the arch-specific code. 1373 * called from the arch-specific code.
1371 */ 1374 */
1372int __init parse_crashkernel(char *cmdline, 1375static int __init __parse_crashkernel(char *cmdline,
1373 unsigned long long system_ram, 1376 unsigned long long system_ram,
1374 unsigned long long *crash_size, 1377 unsigned long long *crash_size,
1375 unsigned long long *crash_base) 1378 unsigned long long *crash_base,
1379 const char *name)
1376{ 1380{
1377 char *p = cmdline, *ck_cmdline = NULL; 1381 char *p = cmdline, *ck_cmdline = NULL;
1378 char *first_colon, *first_space; 1382 char *first_colon, *first_space;
@@ -1382,16 +1386,16 @@ int __init parse_crashkernel(char *cmdline,
1382 *crash_base = 0; 1386 *crash_base = 0;
1383 1387
1384 /* find crashkernel and use the last one if there are more */ 1388 /* find crashkernel and use the last one if there are more */
1385 p = strstr(p, "crashkernel="); 1389 p = strstr(p, name);
1386 while (p) { 1390 while (p) {
1387 ck_cmdline = p; 1391 ck_cmdline = p;
1388 p = strstr(p+1, "crashkernel="); 1392 p = strstr(p+1, name);
1389 } 1393 }
1390 1394
1391 if (!ck_cmdline) 1395 if (!ck_cmdline)
1392 return -EINVAL; 1396 return -EINVAL;
1393 1397
1394 ck_cmdline += 12; /* strlen("crashkernel=") */ 1398 ck_cmdline += strlen(name);
1395 1399
1396 /* 1400 /*
1397 * if the commandline contains a ':', then that's the extended 1401 * if the commandline contains a ':', then that's the extended
@@ -1409,6 +1413,23 @@ int __init parse_crashkernel(char *cmdline,
1409 return 0; 1413 return 0;
1410} 1414}
1411 1415
1416int __init parse_crashkernel(char *cmdline,
1417 unsigned long long system_ram,
1418 unsigned long long *crash_size,
1419 unsigned long long *crash_base)
1420{
1421 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1422 "crashkernel=");
1423}
1424
1425int __init parse_crashkernel_low(char *cmdline,
1426 unsigned long long system_ram,
1427 unsigned long long *crash_size,
1428 unsigned long long *crash_base)
1429{
1430 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1431 "crashkernel_low=");
1432}
1412 1433
1413static void update_vmcoreinfo_note(void) 1434static void update_vmcoreinfo_note(void)
1414{ 1435{
@@ -1490,6 +1511,8 @@ static int __init crash_save_vmcoreinfo_init(void)
1490 VMCOREINFO_OFFSET(page, _count); 1511 VMCOREINFO_OFFSET(page, _count);
1491 VMCOREINFO_OFFSET(page, mapping); 1512 VMCOREINFO_OFFSET(page, mapping);
1492 VMCOREINFO_OFFSET(page, lru); 1513 VMCOREINFO_OFFSET(page, lru);
1514 VMCOREINFO_OFFSET(page, _mapcount);
1515 VMCOREINFO_OFFSET(page, private);
1493 VMCOREINFO_OFFSET(pglist_data, node_zones); 1516 VMCOREINFO_OFFSET(pglist_data, node_zones);
1494 VMCOREINFO_OFFSET(pglist_data, nr_zones); 1517 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1495#ifdef CONFIG_FLAT_NODE_MEM_MAP 1518#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -1512,6 +1535,11 @@ static int __init crash_save_vmcoreinfo_init(void)
1512 VMCOREINFO_NUMBER(PG_lru); 1535 VMCOREINFO_NUMBER(PG_lru);
1513 VMCOREINFO_NUMBER(PG_private); 1536 VMCOREINFO_NUMBER(PG_private);
1514 VMCOREINFO_NUMBER(PG_swapcache); 1537 VMCOREINFO_NUMBER(PG_swapcache);
1538 VMCOREINFO_NUMBER(PG_slab);
1539#ifdef CONFIG_MEMORY_FAILURE
1540 VMCOREINFO_NUMBER(PG_hwpoison);
1541#endif
1542 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1515 1543
1516 arch_crash_save_vmcoreinfo(); 1544 arch_crash_save_vmcoreinfo();
1517 update_vmcoreinfo_note(); 1545 update_vmcoreinfo_note();
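
A self-contained sketch of the command-line scan factored out above into __parse_crashkernel(): locate the last occurrence of a "crashkernel="-style token and return a pointer just past it. This is a plain C illustration with a made-up helper name, not the kernel function itself.

#include <stddef.h>
#include <string.h>

static const char *find_last_token(const char *cmdline, const char *name)
{
	const char *p = cmdline, *last = NULL;

	/* Same idea as the strstr() loop in __parse_crashkernel():
	 * keep the last match if the token appears more than once. */
	p = strstr(p, name);
	while (p) {
		last = p;
		p = strstr(p + 1, name);
	}

	return last ? last + strlen(name) : NULL;
}

/* Example: with "root=/dev/sda1 crashkernel=256M crashkernel_low=72M",
 * find_last_token(cmdline, "crashkernel_low=") points at "72M". */
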
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
deleted file mode 100644
index 59dcf5b81d24..000000000000
--- a/kernel/kfifo.c
+++ /dev/null
@@ -1,609 +0,0 @@
1/*
2 * A generic kernel FIFO implementation
3 *
4 * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 */
21
22#include <linux/kernel.h>
23#include <linux/export.h>
24#include <linux/slab.h>
25#include <linux/err.h>
26#include <linux/log2.h>
27#include <linux/uaccess.h>
28#include <linux/kfifo.h>
29
30/*
31 * internal helper to calculate the unused elements in a fifo
32 */
33static inline unsigned int kfifo_unused(struct __kfifo *fifo)
34{
35 return (fifo->mask + 1) - (fifo->in - fifo->out);
36}
37
38int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
39 size_t esize, gfp_t gfp_mask)
40{
41 /*
42 * round down to the next power of 2, since our 'let the indices
43 * wrap' technique works only in this case.
44 */
45 if (!is_power_of_2(size))
46 size = rounddown_pow_of_two(size);
47
48 fifo->in = 0;
49 fifo->out = 0;
50 fifo->esize = esize;
51
52 if (size < 2) {
53 fifo->data = NULL;
54 fifo->mask = 0;
55 return -EINVAL;
56 }
57
58 fifo->data = kmalloc(size * esize, gfp_mask);
59
60 if (!fifo->data) {
61 fifo->mask = 0;
62 return -ENOMEM;
63 }
64 fifo->mask = size - 1;
65
66 return 0;
67}
68EXPORT_SYMBOL(__kfifo_alloc);
69
70void __kfifo_free(struct __kfifo *fifo)
71{
72 kfree(fifo->data);
73 fifo->in = 0;
74 fifo->out = 0;
75 fifo->esize = 0;
76 fifo->data = NULL;
77 fifo->mask = 0;
78}
79EXPORT_SYMBOL(__kfifo_free);
80
81int __kfifo_init(struct __kfifo *fifo, void *buffer,
82 unsigned int size, size_t esize)
83{
84 size /= esize;
85
86 if (!is_power_of_2(size))
87 size = rounddown_pow_of_two(size);
88
89 fifo->in = 0;
90 fifo->out = 0;
91 fifo->esize = esize;
92 fifo->data = buffer;
93
94 if (size < 2) {
95 fifo->mask = 0;
96 return -EINVAL;
97 }
98 fifo->mask = size - 1;
99
100 return 0;
101}
102EXPORT_SYMBOL(__kfifo_init);
103
104static void kfifo_copy_in(struct __kfifo *fifo, const void *src,
105 unsigned int len, unsigned int off)
106{
107 unsigned int size = fifo->mask + 1;
108 unsigned int esize = fifo->esize;
109 unsigned int l;
110
111 off &= fifo->mask;
112 if (esize != 1) {
113 off *= esize;
114 size *= esize;
115 len *= esize;
116 }
117 l = min(len, size - off);
118
119 memcpy(fifo->data + off, src, l);
120 memcpy(fifo->data, src + l, len - l);
121 /*
122 * make sure that the data in the fifo is up to date before
123 * incrementing the fifo->in index counter
124 */
125 smp_wmb();
126}
127
128unsigned int __kfifo_in(struct __kfifo *fifo,
129 const void *buf, unsigned int len)
130{
131 unsigned int l;
132
133 l = kfifo_unused(fifo);
134 if (len > l)
135 len = l;
136
137 kfifo_copy_in(fifo, buf, len, fifo->in);
138 fifo->in += len;
139 return len;
140}
141EXPORT_SYMBOL(__kfifo_in);
142
143static void kfifo_copy_out(struct __kfifo *fifo, void *dst,
144 unsigned int len, unsigned int off)
145{
146 unsigned int size = fifo->mask + 1;
147 unsigned int esize = fifo->esize;
148 unsigned int l;
149
150 off &= fifo->mask;
151 if (esize != 1) {
152 off *= esize;
153 size *= esize;
154 len *= esize;
155 }
156 l = min(len, size - off);
157
158 memcpy(dst, fifo->data + off, l);
159 memcpy(dst + l, fifo->data, len - l);
160 /*
161 * make sure that the data is copied before
162 * incrementing the fifo->out index counter
163 */
164 smp_wmb();
165}
166
167unsigned int __kfifo_out_peek(struct __kfifo *fifo,
168 void *buf, unsigned int len)
169{
170 unsigned int l;
171
172 l = fifo->in - fifo->out;
173 if (len > l)
174 len = l;
175
176 kfifo_copy_out(fifo, buf, len, fifo->out);
177 return len;
178}
179EXPORT_SYMBOL(__kfifo_out_peek);
180
181unsigned int __kfifo_out(struct __kfifo *fifo,
182 void *buf, unsigned int len)
183{
184 len = __kfifo_out_peek(fifo, buf, len);
185 fifo->out += len;
186 return len;
187}
188EXPORT_SYMBOL(__kfifo_out);
189
190static unsigned long kfifo_copy_from_user(struct __kfifo *fifo,
191 const void __user *from, unsigned int len, unsigned int off,
192 unsigned int *copied)
193{
194 unsigned int size = fifo->mask + 1;
195 unsigned int esize = fifo->esize;
196 unsigned int l;
197 unsigned long ret;
198
199 off &= fifo->mask;
200 if (esize != 1) {
201 off *= esize;
202 size *= esize;
203 len *= esize;
204 }
205 l = min(len, size - off);
206
207 ret = copy_from_user(fifo->data + off, from, l);
208 if (unlikely(ret))
209 ret = DIV_ROUND_UP(ret + len - l, esize);
210 else {
211 ret = copy_from_user(fifo->data, from + l, len - l);
212 if (unlikely(ret))
213 ret = DIV_ROUND_UP(ret, esize);
214 }
215 /*
216 * make sure that the data in the fifo is up to date before
217 * incrementing the fifo->in index counter
218 */
219 smp_wmb();
220 *copied = len - ret;
221 /* return the number of elements which are not copied */
222 return ret;
223}
224
225int __kfifo_from_user(struct __kfifo *fifo, const void __user *from,
226 unsigned long len, unsigned int *copied)
227{
228 unsigned int l;
229 unsigned long ret;
230 unsigned int esize = fifo->esize;
231 int err;
232
233 if (esize != 1)
234 len /= esize;
235
236 l = kfifo_unused(fifo);
237 if (len > l)
238 len = l;
239
240 ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied);
241 if (unlikely(ret)) {
242 len -= ret;
243 err = -EFAULT;
244 } else
245 err = 0;
246 fifo->in += len;
247 return err;
248}
249EXPORT_SYMBOL(__kfifo_from_user);
250
251static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to,
252 unsigned int len, unsigned int off, unsigned int *copied)
253{
254 unsigned int l;
255 unsigned long ret;
256 unsigned int size = fifo->mask + 1;
257 unsigned int esize = fifo->esize;
258
259 off &= fifo->mask;
260 if (esize != 1) {
261 off *= esize;
262 size *= esize;
263 len *= esize;
264 }
265 l = min(len, size - off);
266
267 ret = copy_to_user(to, fifo->data + off, l);
268 if (unlikely(ret))
269 ret = DIV_ROUND_UP(ret + len - l, esize);
270 else {
271 ret = copy_to_user(to + l, fifo->data, len - l);
272 if (unlikely(ret))
273 ret = DIV_ROUND_UP(ret, esize);
274 }
275 /*
276 * make sure that the data is copied before
277 * incrementing the fifo->out index counter
278 */
279 smp_wmb();
280 *copied = len - ret;
281 /* return the number of elements which are not copied */
282 return ret;
283}
284
285int __kfifo_to_user(struct __kfifo *fifo, void __user *to,
286 unsigned long len, unsigned int *copied)
287{
288 unsigned int l;
289 unsigned long ret;
290 unsigned int esize = fifo->esize;
291 int err;
292
293 if (esize != 1)
294 len /= esize;
295
296 l = fifo->in - fifo->out;
297 if (len > l)
298 len = l;
299 ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied);
300 if (unlikely(ret)) {
301 len -= ret;
302 err = -EFAULT;
303 } else
304 err = 0;
305 fifo->out += len;
306 return err;
307}
308EXPORT_SYMBOL(__kfifo_to_user);
309
310static int setup_sgl_buf(struct scatterlist *sgl, void *buf,
311 int nents, unsigned int len)
312{
313 int n;
314 unsigned int l;
315 unsigned int off;
316 struct page *page;
317
318 if (!nents)
319 return 0;
320
321 if (!len)
322 return 0;
323
324 n = 0;
325 page = virt_to_page(buf);
326 off = offset_in_page(buf);
327 l = 0;
328
329 while (len >= l + PAGE_SIZE - off) {
330 struct page *npage;
331
332 l += PAGE_SIZE;
333 buf += PAGE_SIZE;
334 npage = virt_to_page(buf);
335 if (page_to_phys(page) != page_to_phys(npage) - l) {
336 sg_set_page(sgl, page, l - off, off);
337 sgl = sg_next(sgl);
338 if (++n == nents || sgl == NULL)
339 return n;
340 page = npage;
341 len -= l - off;
342 l = off = 0;
343 }
344 }
345 sg_set_page(sgl, page, len, off);
346 return n + 1;
347}
348
349static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
350 int nents, unsigned int len, unsigned int off)
351{
352 unsigned int size = fifo->mask + 1;
353 unsigned int esize = fifo->esize;
354 unsigned int l;
355 unsigned int n;
356
357 off &= fifo->mask;
358 if (esize != 1) {
359 off *= esize;
360 size *= esize;
361 len *= esize;
362 }
363 l = min(len, size - off);
364
365 n = setup_sgl_buf(sgl, fifo->data + off, nents, l);
366 n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l);
367
368 return n;
369}
370
371unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
372 struct scatterlist *sgl, int nents, unsigned int len)
373{
374 unsigned int l;
375
376 l = kfifo_unused(fifo);
377 if (len > l)
378 len = l;
379
380 return setup_sgl(fifo, sgl, nents, len, fifo->in);
381}
382EXPORT_SYMBOL(__kfifo_dma_in_prepare);
383
384unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
385 struct scatterlist *sgl, int nents, unsigned int len)
386{
387 unsigned int l;
388
389 l = fifo->in - fifo->out;
390 if (len > l)
391 len = l;
392
393 return setup_sgl(fifo, sgl, nents, len, fifo->out);
394}
395EXPORT_SYMBOL(__kfifo_dma_out_prepare);
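
The two DMA prepare helpers above only describe fifo space as a scatterlist; nothing is copied and neither index moves until the matching *_finish call. A minimal sketch of the in-direction sequence, assuming a device "dev", a made-up run_dma_and_wait() helper and trimmed error handling:

#include <linux/kfifo.h>
#include <linux/dma-mapping.h>

static DEFINE_KFIFO(dma_fifo, unsigned char, 4096);	/* hypothetical fifo */

static int fill_fifo_by_dma(struct device *dev, unsigned int len)
{
	struct scatterlist sg[8];
	unsigned int nents;

	/* build an sgl over the free space; no data is moved yet */
	nents = kfifo_dma_in_prepare(&dma_fifo, sg, ARRAY_SIZE(sg), len);
	if (!nents)
		return -ENOSPC;

	dma_map_sg(dev, sg, nents, DMA_FROM_DEVICE);
	run_dma_and_wait(dev, sg, nents);	/* hypothetical transfer helper */
	dma_unmap_sg(dev, sg, nents, DMA_FROM_DEVICE);

	/* only now does the new data become visible to readers */
	kfifo_dma_in_finish(&dma_fifo, len);
	return 0;
}

The out direction is symmetric: kfifo_dma_out_prepare() maps the used region and kfifo_dma_out_finish() advances fifo->out once the transfer is done.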
396
397unsigned int __kfifo_max_r(unsigned int len, size_t recsize)
398{
399 unsigned int max = (1 << (recsize << 3)) - 1;
400
401 if (len > max)
402 return max;
403 return len;
404}
405EXPORT_SYMBOL(__kfifo_max_r);
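
Since recsize is the byte width of the per-record length header, the cap computed above is (1 << (recsize << 3)) - 1: a record fifo with a one-byte header holds records of at most 255 bytes, and a two-byte header raises that to 65535.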
406
407#define __KFIFO_PEEK(data, out, mask) \
408 ((data)[(out) & (mask)])
409/*
 410 * __kfifo_peek_n internal helper function for determining the length of
411 * the next record in the fifo
412 */
413static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize)
414{
415 unsigned int l;
416 unsigned int mask = fifo->mask;
417 unsigned char *data = fifo->data;
418
419 l = __KFIFO_PEEK(data, fifo->out, mask);
420
421 if (--recsize)
422 l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8;
423
424 return l;
425}
426
427#define __KFIFO_POKE(data, in, mask, val) \
428 ( \
429 (data)[(in) & (mask)] = (unsigned char)(val) \
430 )
431
432/*
 433 * __kfifo_poke_n internal helper function for storing the length of
434 * the record into the fifo
435 */
436static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize)
437{
438 unsigned int mask = fifo->mask;
439 unsigned char *data = fifo->data;
440
441 __KFIFO_POKE(data, fifo->in, mask, n);
442
443 if (recsize > 1)
444 __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8);
445}
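
For example, storing a 300-byte record in a fifo with recsize == 2 writes 300 & 0xff == 44 at index fifo->in and 300 >> 8 == 1 at fifo->in + 1; __kfifo_peek_n() later reassembles 44 | (1 << 8) == 300 before the payload is read.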
446
447unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize)
448{
449 return __kfifo_peek_n(fifo, recsize);
450}
451EXPORT_SYMBOL(__kfifo_len_r);
452
453unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf,
454 unsigned int len, size_t recsize)
455{
456 if (len + recsize > kfifo_unused(fifo))
457 return 0;
458
459 __kfifo_poke_n(fifo, len, recsize);
460
461 kfifo_copy_in(fifo, buf, len, fifo->in + recsize);
462 fifo->in += len + recsize;
463 return len;
464}
465EXPORT_SYMBOL(__kfifo_in_r);
466
467static unsigned int kfifo_out_copy_r(struct __kfifo *fifo,
468 void *buf, unsigned int len, size_t recsize, unsigned int *n)
469{
470 *n = __kfifo_peek_n(fifo, recsize);
471
472 if (len > *n)
473 len = *n;
474
475 kfifo_copy_out(fifo, buf, len, fifo->out + recsize);
476 return len;
477}
478
479unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
480 unsigned int len, size_t recsize)
481{
482 unsigned int n;
483
484 if (fifo->in == fifo->out)
485 return 0;
486
487 return kfifo_out_copy_r(fifo, buf, len, recsize, &n);
488}
489EXPORT_SYMBOL(__kfifo_out_peek_r);
490
491unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
492 unsigned int len, size_t recsize)
493{
494 unsigned int n;
495
496 if (fifo->in == fifo->out)
497 return 0;
498
499 len = kfifo_out_copy_r(fifo, buf, len, recsize, &n);
500 fifo->out += n + recsize;
501 return len;
502}
503EXPORT_SYMBOL(__kfifo_out_r);
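
These record variants are reached through the same kfifo_in()/kfifo_out() macros once the fifo is declared with a one- or two-byte record header. A minimal sketch in the style of samples/kfifo/record-example.c; the names and sizes are illustrative:

#include <linux/kfifo.h>
#include <linux/printk.h>

typedef STRUCT_KFIFO_REC_1(256) rec_fifo_t;	/* 1-byte length header per record */
static rec_fifo_t rec_fifo;

static void record_demo(void)
{
	char buf[64];
	unsigned int len;

	INIT_KFIFO(rec_fifo);

	/* each call stores one variable-length record (at most 255 bytes here) */
	kfifo_in(&rec_fifo, "hello", 5);
	kfifo_in(&rec_fifo, "record fifo", 11);

	while (!kfifo_is_empty(&rec_fifo)) {
		/* kfifo_out() returns the length of the record it dequeued */
		len = kfifo_out(&rec_fifo, buf, sizeof(buf));
		pr_info("got %u bytes: %.*s\n", len, (int)len, buf);
	}
}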
504
505void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize)
506{
507 unsigned int n;
508
509 n = __kfifo_peek_n(fifo, recsize);
510 fifo->out += n + recsize;
511}
512EXPORT_SYMBOL(__kfifo_skip_r);
513
514int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from,
515 unsigned long len, unsigned int *copied, size_t recsize)
516{
517 unsigned long ret;
518
519 len = __kfifo_max_r(len, recsize);
520
521 if (len + recsize > kfifo_unused(fifo)) {
522 *copied = 0;
523 return 0;
524 }
525
526 __kfifo_poke_n(fifo, len, recsize);
527
528 ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied);
529 if (unlikely(ret)) {
530 *copied = 0;
531 return -EFAULT;
532 }
533 fifo->in += len + recsize;
534 return 0;
535}
536EXPORT_SYMBOL(__kfifo_from_user_r);
537
538int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
539 unsigned long len, unsigned int *copied, size_t recsize)
540{
541 unsigned long ret;
542 unsigned int n;
543
544 if (fifo->in == fifo->out) {
545 *copied = 0;
546 return 0;
547 }
548
549 n = __kfifo_peek_n(fifo, recsize);
550 if (len > n)
551 len = n;
552
553 ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied);
554 if (unlikely(ret)) {
555 *copied = 0;
556 return -EFAULT;
557 }
558 fifo->out += n + recsize;
559 return 0;
560}
561EXPORT_SYMBOL(__kfifo_to_user_r);
562
563unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
564 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
565{
566 if (!nents)
567 BUG();
568
569 len = __kfifo_max_r(len, recsize);
570
571 if (len + recsize > kfifo_unused(fifo))
572 return 0;
573
574 return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
575}
576EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
577
578void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
579 unsigned int len, size_t recsize)
580{
581 len = __kfifo_max_r(len, recsize);
582 __kfifo_poke_n(fifo, len, recsize);
583 fifo->in += len + recsize;
584}
585EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
586
587unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
588 struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
589{
590 if (!nents)
591 BUG();
592
593 len = __kfifo_max_r(len, recsize);
594
595 if (len + recsize > fifo->in - fifo->out)
596 return 0;
597
598 return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
599}
600EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
601
602void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
603{
604 unsigned int len;
605
606 len = __kfifo_peek_n(fifo, recsize);
607 fifo->out += len + recsize;
608}
609EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 1c317e386831..56dd34976d7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -38,6 +38,7 @@
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h> 39#include <linux/rwsem.h>
40#include <linux/ptrace.h> 40#include <linux/ptrace.h>
41#include <linux/async.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42 43
43#include <trace/events/module.h> 44#include <trace/events/module.h>
@@ -130,6 +131,14 @@ int __request_module(bool wait, const char *fmt, ...)
130#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 131#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
131 static int kmod_loop_msg; 132 static int kmod_loop_msg;
132 133
134 /*
135 * We don't allow synchronous module loading from async. Module
136 * init may invoke async_synchronize_full() which will end up
137 * waiting for this task which already is waiting for the module
138 * loading to complete, leading to a deadlock.
139 */
140 WARN_ON_ONCE(wait && current_is_async());
141
133 va_start(args, fmt); 142 va_start(args, fmt);
134 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 143 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
135 va_end(args); 144 va_end(args);
@@ -219,9 +228,9 @@ static int ____call_usermodehelper(void *data)
219 228
220 commit_creds(new); 229 commit_creds(new);
221 230
222 retval = kernel_execve(sub_info->path, 231 retval = do_execve(sub_info->path,
223 (const char *const *)sub_info->argv, 232 (const char __user *const __user *)sub_info->argv,
224 (const char *const *)sub_info->envp); 233 (const char __user *const __user *)sub_info->envp);
225 if (!retval) 234 if (!retval)
226 return 0; 235 return 0;
227 236
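
The WARN_ON_ONCE(wait && current_is_async()) added above flags a genuine deadlock pattern: code running from the async machinery must not load a module synchronously, because the module's init can call async_synchronize_full() and end up waiting on its own caller. A hedged sketch of the safe alternative; the callback and module name are made up:

#include <linux/async.h>
#include <linux/kmod.h>

/* Illustrative only: "foo-helper" is a made-up module name. */
static void foo_late_setup(void *data, async_cookie_t cookie)
{
	/*
	 * Safe from async context: queues the usermode helper and returns.
	 * A plain request_module("foo-helper") here would imply wait=true
	 * and trip the new WARN_ON_ONCE() above.
	 */
	request_module_nowait("foo-helper");
}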
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 098f396aa409..e35be53f6613 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -334,11 +334,10 @@ static inline void reset_kprobe_instance(void)
334struct kprobe __kprobes *get_kprobe(void *addr) 334struct kprobe __kprobes *get_kprobe(void *addr)
335{ 335{
336 struct hlist_head *head; 336 struct hlist_head *head;
337 struct hlist_node *node;
338 struct kprobe *p; 337 struct kprobe *p;
339 338
340 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; 339 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
341 hlist_for_each_entry_rcu(p, node, head, hlist) { 340 hlist_for_each_entry_rcu(p, head, hlist) {
342 if (p->addr == addr) 341 if (p->addr == addr)
343 return p; 342 return p;
344 } 343 }
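
Most of the churn in this file is mechanical: the hlist iterators no longer take a separate struct hlist_node cursor, so each caller drops one argument, as the hunks throughout this file show. A small sketch of the new form; struct item and use_item() are stand-ins, not kernel types:

#include <linux/list.h>
#include <linux/rculist.h>

struct item {
	struct hlist_node hlist;
};

static void use_item(struct item *p);	/* illustrative consumer */

static void walk(struct hlist_head *head)
{
	struct item *p;

	/*
	 * The old form needed "struct hlist_node *node" and read
	 * hlist_for_each_entry_rcu(p, node, head, hlist).
	 */
	rcu_read_lock();
	hlist_for_each_entry_rcu(p, head, hlist)
		use_item(p);
	rcu_read_unlock();
}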
@@ -471,7 +470,6 @@ static LIST_HEAD(unoptimizing_list);
471 470
472static void kprobe_optimizer(struct work_struct *work); 471static void kprobe_optimizer(struct work_struct *work);
473static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); 472static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
474static DECLARE_COMPLETION(optimizer_comp);
475#define OPTIMIZE_DELAY 5 473#define OPTIMIZE_DELAY 5
476 474
477/* 475/*
@@ -552,8 +550,7 @@ static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
552/* Start optimizer after OPTIMIZE_DELAY passed */ 550/* Start optimizer after OPTIMIZE_DELAY passed */
553static __kprobes void kick_kprobe_optimizer(void) 551static __kprobes void kick_kprobe_optimizer(void)
554{ 552{
555 if (!delayed_work_pending(&optimizing_work)) 553 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
556 schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
557} 554}
558 555
559/* Kprobe jump optimizer */ 556/* Kprobe jump optimizer */
@@ -592,16 +589,25 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
592 /* Step 5: Kick optimizer again if needed */ 589 /* Step 5: Kick optimizer again if needed */
593 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) 590 if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
594 kick_kprobe_optimizer(); 591 kick_kprobe_optimizer();
595 else
596 /* Wake up all waiters */
597 complete_all(&optimizer_comp);
598} 592}
599 593
600/* Wait for completing optimization and unoptimization */ 594/* Wait for completing optimization and unoptimization */
601static __kprobes void wait_for_kprobe_optimizer(void) 595static __kprobes void wait_for_kprobe_optimizer(void)
602{ 596{
603 if (delayed_work_pending(&optimizing_work)) 597 mutex_lock(&kprobe_mutex);
604 wait_for_completion(&optimizer_comp); 598
599 while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
600 mutex_unlock(&kprobe_mutex);
601
 602 /* this will also make optimizing_work execute immediately */
603 flush_delayed_work(&optimizing_work);
604 /* @optimizing_work might not have been queued yet, relax */
605 cpu_relax();
606
607 mutex_lock(&kprobe_mutex);
608 }
609
610 mutex_unlock(&kprobe_mutex);
605} 611}
606 612
607/* Optimize kprobe if p is ready to be optimized */ 613/* Optimize kprobe if p is ready to be optimized */
@@ -792,7 +798,6 @@ out:
792static void __kprobes optimize_all_kprobes(void) 798static void __kprobes optimize_all_kprobes(void)
793{ 799{
794 struct hlist_head *head; 800 struct hlist_head *head;
795 struct hlist_node *node;
796 struct kprobe *p; 801 struct kprobe *p;
797 unsigned int i; 802 unsigned int i;
798 803
@@ -803,7 +808,7 @@ static void __kprobes optimize_all_kprobes(void)
803 kprobes_allow_optimization = true; 808 kprobes_allow_optimization = true;
804 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 809 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
805 head = &kprobe_table[i]; 810 head = &kprobe_table[i];
806 hlist_for_each_entry_rcu(p, node, head, hlist) 811 hlist_for_each_entry_rcu(p, head, hlist)
807 if (!kprobe_disabled(p)) 812 if (!kprobe_disabled(p))
808 optimize_kprobe(p); 813 optimize_kprobe(p);
809 } 814 }
@@ -814,7 +819,6 @@ static void __kprobes optimize_all_kprobes(void)
814static void __kprobes unoptimize_all_kprobes(void) 819static void __kprobes unoptimize_all_kprobes(void)
815{ 820{
816 struct hlist_head *head; 821 struct hlist_head *head;
817 struct hlist_node *node;
818 struct kprobe *p; 822 struct kprobe *p;
819 unsigned int i; 823 unsigned int i;
820 824
@@ -825,7 +829,7 @@ static void __kprobes unoptimize_all_kprobes(void)
825 kprobes_allow_optimization = false; 829 kprobes_allow_optimization = false;
826 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 830 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
827 head = &kprobe_table[i]; 831 head = &kprobe_table[i];
828 hlist_for_each_entry_rcu(p, node, head, hlist) { 832 hlist_for_each_entry_rcu(p, head, hlist) {
829 if (!kprobe_disabled(p)) 833 if (!kprobe_disabled(p))
830 unoptimize_kprobe(p, false); 834 unoptimize_kprobe(p, false);
831 } 835 }
@@ -919,7 +923,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
919} 923}
920#endif /* CONFIG_OPTPROBES */ 924#endif /* CONFIG_OPTPROBES */
921 925
922#ifdef KPROBES_CAN_USE_FTRACE 926#ifdef CONFIG_KPROBES_ON_FTRACE
923static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { 927static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
924 .func = kprobe_ftrace_handler, 928 .func = kprobe_ftrace_handler,
925 .flags = FTRACE_OPS_FL_SAVE_REGS, 929 .flags = FTRACE_OPS_FL_SAVE_REGS,
@@ -964,7 +968,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
964 (unsigned long)p->addr, 1, 0); 968 (unsigned long)p->addr, 1, 0);
965 WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); 969 WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
966} 970}
967#else /* !KPROBES_CAN_USE_FTRACE */ 971#else /* !CONFIG_KPROBES_ON_FTRACE */
968#define prepare_kprobe(p) arch_prepare_kprobe(p) 972#define prepare_kprobe(p) arch_prepare_kprobe(p)
969#define arm_kprobe_ftrace(p) do {} while (0) 973#define arm_kprobe_ftrace(p) do {} while (0)
970#define disarm_kprobe_ftrace(p) do {} while (0) 974#define disarm_kprobe_ftrace(p) do {} while (0)
@@ -1141,7 +1145,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1141{ 1145{
1142 struct kretprobe_instance *ri; 1146 struct kretprobe_instance *ri;
1143 struct hlist_head *head, empty_rp; 1147 struct hlist_head *head, empty_rp;
1144 struct hlist_node *node, *tmp; 1148 struct hlist_node *tmp;
1145 unsigned long hash, flags = 0; 1149 unsigned long hash, flags = 0;
1146 1150
1147 if (unlikely(!kprobes_initialized)) 1151 if (unlikely(!kprobes_initialized))
@@ -1152,12 +1156,12 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1152 hash = hash_ptr(tk, KPROBE_HASH_BITS); 1156 hash = hash_ptr(tk, KPROBE_HASH_BITS);
1153 head = &kretprobe_inst_table[hash]; 1157 head = &kretprobe_inst_table[hash];
1154 kretprobe_table_lock(hash, &flags); 1158 kretprobe_table_lock(hash, &flags);
1155 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 1159 hlist_for_each_entry_safe(ri, tmp, head, hlist) {
1156 if (ri->task == tk) 1160 if (ri->task == tk)
1157 recycle_rp_inst(ri, &empty_rp); 1161 recycle_rp_inst(ri, &empty_rp);
1158 } 1162 }
1159 kretprobe_table_unlock(hash, &flags); 1163 kretprobe_table_unlock(hash, &flags);
1160 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 1164 hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
1161 hlist_del(&ri->hlist); 1165 hlist_del(&ri->hlist);
1162 kfree(ri); 1166 kfree(ri);
1163 } 1167 }
@@ -1166,9 +1170,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1166static inline void free_rp_inst(struct kretprobe *rp) 1170static inline void free_rp_inst(struct kretprobe *rp)
1167{ 1171{
1168 struct kretprobe_instance *ri; 1172 struct kretprobe_instance *ri;
1169 struct hlist_node *pos, *next; 1173 struct hlist_node *next;
1170 1174
1171 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) { 1175 hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
1172 hlist_del(&ri->hlist); 1176 hlist_del(&ri->hlist);
1173 kfree(ri); 1177 kfree(ri);
1174 } 1178 }
@@ -1178,14 +1182,14 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
1178{ 1182{
1179 unsigned long flags, hash; 1183 unsigned long flags, hash;
1180 struct kretprobe_instance *ri; 1184 struct kretprobe_instance *ri;
1181 struct hlist_node *pos, *next; 1185 struct hlist_node *next;
1182 struct hlist_head *head; 1186 struct hlist_head *head;
1183 1187
1184 /* No race here */ 1188 /* No race here */
1185 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { 1189 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
1186 kretprobe_table_lock(hash, &flags); 1190 kretprobe_table_lock(hash, &flags);
1187 head = &kretprobe_inst_table[hash]; 1191 head = &kretprobe_inst_table[hash];
1188 hlist_for_each_entry_safe(ri, pos, next, head, hlist) { 1192 hlist_for_each_entry_safe(ri, next, head, hlist) {
1189 if (ri->rp == rp) 1193 if (ri->rp == rp)
1190 ri->rp = NULL; 1194 ri->rp = NULL;
1191 } 1195 }
@@ -1414,12 +1418,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
1414 */ 1418 */
1415 ftrace_addr = ftrace_location((unsigned long)p->addr); 1419 ftrace_addr = ftrace_location((unsigned long)p->addr);
1416 if (ftrace_addr) { 1420 if (ftrace_addr) {
1417#ifdef KPROBES_CAN_USE_FTRACE 1421#ifdef CONFIG_KPROBES_ON_FTRACE
1418 /* Given address is not on the instruction boundary */ 1422 /* Given address is not on the instruction boundary */
1419 if ((unsigned long)p->addr != ftrace_addr) 1423 if ((unsigned long)p->addr != ftrace_addr)
1420 return -EILSEQ; 1424 return -EILSEQ;
1421 p->flags |= KPROBE_FLAG_FTRACE; 1425 p->flags |= KPROBE_FLAG_FTRACE;
1422#else /* !KPROBES_CAN_USE_FTRACE */ 1426#else /* !CONFIG_KPROBES_ON_FTRACE */
1423 return -EINVAL; 1427 return -EINVAL;
1424#endif 1428#endif
1425 } 1429 }
@@ -2021,7 +2025,6 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
2021{ 2025{
2022 struct module *mod = data; 2026 struct module *mod = data;
2023 struct hlist_head *head; 2027 struct hlist_head *head;
2024 struct hlist_node *node;
2025 struct kprobe *p; 2028 struct kprobe *p;
2026 unsigned int i; 2029 unsigned int i;
2027 int checkcore = (val == MODULE_STATE_GOING); 2030 int checkcore = (val == MODULE_STATE_GOING);
@@ -2038,7 +2041,7 @@ static int __kprobes kprobes_module_callback(struct notifier_block *nb,
2038 mutex_lock(&kprobe_mutex); 2041 mutex_lock(&kprobe_mutex);
2039 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2042 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2040 head = &kprobe_table[i]; 2043 head = &kprobe_table[i];
2041 hlist_for_each_entry_rcu(p, node, head, hlist) 2044 hlist_for_each_entry_rcu(p, head, hlist)
2042 if (within_module_init((unsigned long)p->addr, mod) || 2045 if (within_module_init((unsigned long)p->addr, mod) ||
2043 (checkcore && 2046 (checkcore &&
2044 within_module_core((unsigned long)p->addr, mod))) { 2047 within_module_core((unsigned long)p->addr, mod))) {
@@ -2185,7 +2188,6 @@ static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
2185static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) 2188static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
2186{ 2189{
2187 struct hlist_head *head; 2190 struct hlist_head *head;
2188 struct hlist_node *node;
2189 struct kprobe *p, *kp; 2191 struct kprobe *p, *kp;
2190 const char *sym = NULL; 2192 const char *sym = NULL;
2191 unsigned int i = *(loff_t *) v; 2193 unsigned int i = *(loff_t *) v;
@@ -2194,7 +2196,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
2194 2196
2195 head = &kprobe_table[i]; 2197 head = &kprobe_table[i];
2196 preempt_disable(); 2198 preempt_disable();
2197 hlist_for_each_entry_rcu(p, node, head, hlist) { 2199 hlist_for_each_entry_rcu(p, head, hlist) {
2198 sym = kallsyms_lookup((unsigned long)p->addr, NULL, 2200 sym = kallsyms_lookup((unsigned long)p->addr, NULL,
2199 &offset, &modname, namebuf); 2201 &offset, &modname, namebuf);
2200 if (kprobe_aggrprobe(p)) { 2202 if (kprobe_aggrprobe(p)) {
@@ -2229,7 +2231,6 @@ static const struct file_operations debugfs_kprobes_operations = {
2229static void __kprobes arm_all_kprobes(void) 2231static void __kprobes arm_all_kprobes(void)
2230{ 2232{
2231 struct hlist_head *head; 2233 struct hlist_head *head;
2232 struct hlist_node *node;
2233 struct kprobe *p; 2234 struct kprobe *p;
2234 unsigned int i; 2235 unsigned int i;
2235 2236
@@ -2242,7 +2243,7 @@ static void __kprobes arm_all_kprobes(void)
2242 /* Arming kprobes doesn't optimize kprobe itself */ 2243 /* Arming kprobes doesn't optimize kprobe itself */
2243 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2244 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2244 head = &kprobe_table[i]; 2245 head = &kprobe_table[i];
2245 hlist_for_each_entry_rcu(p, node, head, hlist) 2246 hlist_for_each_entry_rcu(p, head, hlist)
2246 if (!kprobe_disabled(p)) 2247 if (!kprobe_disabled(p))
2247 arm_kprobe(p); 2248 arm_kprobe(p);
2248 } 2249 }
@@ -2258,7 +2259,6 @@ already_enabled:
2258static void __kprobes disarm_all_kprobes(void) 2259static void __kprobes disarm_all_kprobes(void)
2259{ 2260{
2260 struct hlist_head *head; 2261 struct hlist_head *head;
2261 struct hlist_node *node;
2262 struct kprobe *p; 2262 struct kprobe *p;
2263 unsigned int i; 2263 unsigned int i;
2264 2264
@@ -2275,7 +2275,7 @@ static void __kprobes disarm_all_kprobes(void)
2275 2275
2276 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 2276 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
2277 head = &kprobe_table[i]; 2277 head = &kprobe_table[i];
2278 hlist_for_each_entry_rcu(p, node, head, hlist) { 2278 hlist_for_each_entry_rcu(p, head, hlist) {
2279 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) 2279 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
2280 disarm_kprobe(p, false); 2280 disarm_kprobe(p, false);
2281 } 2281 }
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 4e316e1acf58..6ada93c23a9a 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -26,7 +26,6 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
26static struct kobj_attribute _name##_attr = \ 26static struct kobj_attribute _name##_attr = \
27 __ATTR(_name, 0644, _name##_show, _name##_store) 27 __ATTR(_name, 0644, _name##_show, _name##_store)
28 28
29#if defined(CONFIG_HOTPLUG)
30/* current uevent sequence number */ 29/* current uevent sequence number */
31static ssize_t uevent_seqnum_show(struct kobject *kobj, 30static ssize_t uevent_seqnum_show(struct kobject *kobj,
32 struct kobj_attribute *attr, char *buf) 31 struct kobj_attribute *attr, char *buf)
@@ -54,7 +53,7 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
54 return count; 53 return count;
55} 54}
56KERNEL_ATTR_RW(uevent_helper); 55KERNEL_ATTR_RW(uevent_helper);
57#endif 56
58 57
59#ifdef CONFIG_PROFILING 58#ifdef CONFIG_PROFILING
60static ssize_t profiling_show(struct kobject *kobj, 59static ssize_t profiling_show(struct kobject *kobj,
@@ -141,6 +140,23 @@ static ssize_t fscaps_show(struct kobject *kobj,
141} 140}
142KERNEL_ATTR_RO(fscaps); 141KERNEL_ATTR_RO(fscaps);
143 142
143int rcu_expedited;
144static ssize_t rcu_expedited_show(struct kobject *kobj,
145 struct kobj_attribute *attr, char *buf)
146{
147 return sprintf(buf, "%d\n", rcu_expedited);
148}
149static ssize_t rcu_expedited_store(struct kobject *kobj,
150 struct kobj_attribute *attr,
151 const char *buf, size_t count)
152{
153 if (kstrtoint(buf, 0, &rcu_expedited))
154 return -EINVAL;
155
156 return count;
157}
158KERNEL_ATTR_RW(rcu_expedited);
159
144/* 160/*
145 * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 161 * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
146 */ 162 */
@@ -169,10 +185,8 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
169 185
170static struct attribute * kernel_attrs[] = { 186static struct attribute * kernel_attrs[] = {
171 &fscaps_attr.attr, 187 &fscaps_attr.attr,
172#if defined(CONFIG_HOTPLUG)
173 &uevent_seqnum_attr.attr, 188 &uevent_seqnum_attr.attr,
174 &uevent_helper_attr.attr, 189 &uevent_helper_attr.attr,
175#endif
176#ifdef CONFIG_PROFILING 190#ifdef CONFIG_PROFILING
177 &profiling_attr.attr, 191 &profiling_attr.attr,
178#endif 192#endif
@@ -182,6 +196,7 @@ static struct attribute * kernel_attrs[] = {
182 &kexec_crash_size_attr.attr, 196 &kexec_crash_size_attr.attr,
183 &vmcoreinfo_attr.attr, 197 &vmcoreinfo_attr.attr,
184#endif 198#endif
199 &rcu_expedited_attr.attr,
185 NULL 200 NULL
186}; 201};
187 202
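
The new rcu_expedited flag surfaces as /sys/kernel/rcu_expedited (write 1 to enable it) and is read by the RCU code updated elsewhere in this series to pick between a normal and an expedited grace period. A hedged sketch of how such a knob is typically consumed, not a copy of the rcutree changes:

extern int rcu_expedited;	/* the toggle defined above */

static void wait_for_grace_period(void)
{
	if (rcu_expedited)
		synchronize_sched_expedited();	/* faster, IPIs the other CPUs */
	else
		synchronize_sched();		/* normal, cheaper grace period */
}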
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 29fb60caecb5..9eb7fed0bbaa 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -124,12 +124,12 @@ void *kthread_data(struct task_struct *task)
124 124
125static void __kthread_parkme(struct kthread *self) 125static void __kthread_parkme(struct kthread *self)
126{ 126{
127 __set_current_state(TASK_INTERRUPTIBLE); 127 __set_current_state(TASK_PARKED);
128 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { 128 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
129 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) 129 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
130 complete(&self->parked); 130 complete(&self->parked);
131 schedule(); 131 schedule();
132 __set_current_state(TASK_INTERRUPTIBLE); 132 __set_current_state(TASK_PARKED);
133 } 133 }
134 clear_bit(KTHREAD_IS_PARKED, &self->flags); 134 clear_bit(KTHREAD_IS_PARKED, &self->flags);
135 __set_current_state(TASK_RUNNING); 135 __set_current_state(TASK_RUNNING);
@@ -256,8 +256,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
256} 256}
257EXPORT_SYMBOL(kthread_create_on_node); 257EXPORT_SYMBOL(kthread_create_on_node);
258 258
259static void __kthread_bind(struct task_struct *p, unsigned int cpu) 259static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
260{ 260{
261 /* Must have done schedule() in kthread() before we set_task_cpu */
262 if (!wait_task_inactive(p, state)) {
263 WARN_ON(1);
264 return;
265 }
261 /* It's safe because the task is inactive. */ 266 /* It's safe because the task is inactive. */
262 do_set_cpus_allowed(p, cpumask_of(cpu)); 267 do_set_cpus_allowed(p, cpumask_of(cpu));
263 p->flags |= PF_THREAD_BOUND; 268 p->flags |= PF_THREAD_BOUND;
@@ -274,12 +279,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)
274 */ 279 */
275void kthread_bind(struct task_struct *p, unsigned int cpu) 280void kthread_bind(struct task_struct *p, unsigned int cpu)
276{ 281{
277 /* Must have done schedule() in kthread() before we set_task_cpu */ 282 __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
278 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
279 WARN_ON(1);
280 return;
281 }
282 __kthread_bind(p, cpu);
283} 283}
284EXPORT_SYMBOL(kthread_bind); 284EXPORT_SYMBOL(kthread_bind);
285 285
@@ -324,6 +324,22 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)
324 return NULL; 324 return NULL;
325} 325}
326 326
327static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
328{
329 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
330 /*
331 * We clear the IS_PARKED bit here as we don't wait
332 * until the task has left the park code. So if we'd
333 * park before that happens we'd see the IS_PARKED bit
334 * which might be about to be cleared.
335 */
336 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
337 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
338 __kthread_bind(k, kthread->cpu, TASK_PARKED);
339 wake_up_state(k, TASK_PARKED);
340 }
341}
342
327/** 343/**
328 * kthread_unpark - unpark a thread created by kthread_create(). 344 * kthread_unpark - unpark a thread created by kthread_create().
329 * @k: thread created by kthread_create(). 345 * @k: thread created by kthread_create().
@@ -336,20 +352,8 @@ void kthread_unpark(struct task_struct *k)
336{ 352{
337 struct kthread *kthread = task_get_live_kthread(k); 353 struct kthread *kthread = task_get_live_kthread(k);
338 354
339 if (kthread) { 355 if (kthread)
340 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 356 __kthread_unpark(k, kthread);
341 /*
342 * We clear the IS_PARKED bit here as we don't wait
343 * until the task has left the park code. So if we'd
344 * park before that happens we'd see the IS_PARKED bit
345 * which might be about to be cleared.
346 */
347 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
348 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
349 __kthread_bind(k, kthread->cpu);
350 wake_up_process(k);
351 }
352 }
353 put_task_struct(k); 357 put_task_struct(k);
354} 358}
355 359
@@ -407,7 +411,7 @@ int kthread_stop(struct task_struct *k)
407 trace_sched_kthread_stop(k); 411 trace_sched_kthread_stop(k);
408 if (kthread) { 412 if (kthread) {
409 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); 413 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
410 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 414 __kthread_unpark(k, kthread);
411 wake_up_process(k); 415 wake_up_process(k);
412 wait_for_completion(&kthread->exited); 416 wait_for_completion(&kthread->exited);
413 } 417 }
@@ -428,7 +432,7 @@ int kthreadd(void *unused)
428 set_task_comm(tsk, "kthreadd"); 432 set_task_comm(tsk, "kthreadd");
429 ignore_signals(tsk); 433 ignore_signals(tsk);
430 set_cpus_allowed_ptr(tsk, cpu_all_mask); 434 set_cpus_allowed_ptr(tsk, cpu_all_mask);
431 set_mems_allowed(node_states[N_HIGH_MEMORY]); 435 set_mems_allowed(node_states[N_MEMORY]);
432 436
433 current->flags |= PF_NOFREEZE; 437 current->flags |= PF_NOFREEZE;
434 438
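
The switch to TASK_PARKED and the shared __kthread_unpark() only matter for threads that actually park themselves. A minimal sketch of the thread side of that protocol, roughly the shape the smpboot threads use; do_work() and the sleep interval are assumptions:

#include <linux/kthread.h>
#include <linux/sched.h>

static int parkable_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		if (kthread_should_park()) {
			/* sleeps in TASK_PARKED until kthread_unpark() */
			kthread_parkme();
			continue;
		}
		do_work(data);			/* hypothetical payload */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

On the other side, kthread_park() waits for the parked completion, and kthread_unpark() rebinds a per-cpu thread and wakes it with wake_up_state(k, TASK_PARKED), which is why __kthread_bind() above now takes the expected task state.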
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 7981e5b2350d..8a0efac4f99d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3190,9 +3190,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3190#endif 3190#endif
3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { 3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
3192 debug_locks_off(); 3192 debug_locks_off();
3193 printk("BUG: MAX_LOCK_DEPTH too low!\n"); 3193 printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n",
3194 curr->lockdep_depth, MAX_LOCK_DEPTH);
3194 printk("turning off the locking correctness validator.\n"); 3195 printk("turning off the locking correctness validator.\n");
3196
3197 lockdep_print_held_locks(current);
3198 debug_show_all_locks();
3195 dump_stack(); 3199 dump_stack();
3200
3196 return 0; 3201 return 0;
3197 } 3202 }
3198 3203
@@ -3203,7 +3208,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3203} 3208}
3204 3209
3205static int 3210static int
3206print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, 3211print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3207 unsigned long ip) 3212 unsigned long ip)
3208{ 3213{
3209 if (!debug_locks_off()) 3214 if (!debug_locks_off())
@@ -3246,7 +3251,7 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
3246 return 0; 3251 return 0;
3247 3252
3248 if (curr->lockdep_depth <= 0) 3253 if (curr->lockdep_depth <= 0)
3249 return print_unlock_inbalance_bug(curr, lock, ip); 3254 return print_unlock_imbalance_bug(curr, lock, ip);
3250 3255
3251 return 1; 3256 return 1;
3252} 3257}
@@ -3317,7 +3322,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3317 goto found_it; 3322 goto found_it;
3318 prev_hlock = hlock; 3323 prev_hlock = hlock;
3319 } 3324 }
3320 return print_unlock_inbalance_bug(curr, lock, ip); 3325 return print_unlock_imbalance_bug(curr, lock, ip);
3321 3326
3322found_it: 3327found_it:
3323 lockdep_init_map(lock, name, key, 0); 3328 lockdep_init_map(lock, name, key, 0);
@@ -3384,7 +3389,7 @@ lock_release_non_nested(struct task_struct *curr,
3384 goto found_it; 3389 goto found_it;
3385 prev_hlock = hlock; 3390 prev_hlock = hlock;
3386 } 3391 }
3387 return print_unlock_inbalance_bug(curr, lock, ip); 3392 return print_unlock_imbalance_bug(curr, lock, ip);
3388 3393
3389found_it: 3394found_it:
3390 if (hlock->instance == lock) 3395 if (hlock->instance == lock)
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 91c32a0b612c..b2c71c5873e4 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -39,7 +39,7 @@ static void l_stop(struct seq_file *m, void *v)
39 39
40static void print_name(struct seq_file *m, struct lock_class *class) 40static void print_name(struct seq_file *m, struct lock_class *class)
41{ 41{
42 char str[128]; 42 char str[KSYM_NAME_LEN];
43 const char *name = class->name; 43 const char *name = class->name;
44 44
45 if (!name) { 45 if (!name) {
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
new file mode 100644
index 000000000000..246b4c6e6135
--- /dev/null
+++ b/kernel/modsign_certificate.S
@@ -0,0 +1,19 @@
1/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */
2#ifndef SYMBOL_PREFIX
3#define ASM_SYMBOL(sym) sym
4#else
5#define PASTE2(x,y) x##y
6#define PASTE(x,y) PASTE2(x,y)
7#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
8#endif
9
10#define GLOBAL(name) \
11 .globl ASM_SYMBOL(name); \
12 ASM_SYMBOL(name):
13
14 .section ".init.data","aw"
15
16GLOBAL(modsign_certificate_list)
17 .incbin "signing_key.x509"
18 .incbin "extra_certificates"
19GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
index 4646eb2c3820..2b6e69909c39 100644
--- a/kernel/modsign_pubkey.c
+++ b/kernel/modsign_pubkey.c
@@ -20,12 +20,6 @@ struct key *modsign_keyring;
20 20
21extern __initdata const u8 modsign_certificate_list[]; 21extern __initdata const u8 modsign_certificate_list[];
22extern __initdata const u8 modsign_certificate_list_end[]; 22extern __initdata const u8 modsign_certificate_list_end[];
23asm(".section .init.data,\"aw\"\n"
24 "modsign_certificate_list:\n"
25 ".incbin \"signing_key.x509\"\n"
26 ".incbin \"extra_certificates\"\n"
27 "modsign_certificate_list_end:"
28 );
29 23
30/* 24/*
31 * We need to make sure ccache doesn't cache the .o file as it doesn't notice 25 * We need to make sure ccache doesn't cache the .o file as it doesn't notice
@@ -40,18 +34,15 @@ static __init int module_verify_init(void)
40{ 34{
41 pr_notice("Initialise module verification\n"); 35 pr_notice("Initialise module verification\n");
42 36
43 modsign_keyring = key_alloc(&key_type_keyring, ".module_sign", 37 modsign_keyring = keyring_alloc(".module_sign",
44 KUIDT_INIT(0), KGIDT_INIT(0), 38 KUIDT_INIT(0), KGIDT_INIT(0),
45 current_cred(), 39 current_cred(),
46 (KEY_POS_ALL & ~KEY_POS_SETATTR) | 40 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
47 KEY_USR_VIEW | KEY_USR_READ, 41 KEY_USR_VIEW | KEY_USR_READ),
48 KEY_ALLOC_NOT_IN_QUOTA); 42 KEY_ALLOC_NOT_IN_QUOTA, NULL);
49 if (IS_ERR(modsign_keyring)) 43 if (IS_ERR(modsign_keyring))
50 panic("Can't allocate module signing keyring\n"); 44 panic("Can't allocate module signing keyring\n");
51 45
52 if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0)
53 panic("Can't instantiate module signing keyring\n");
54
55 return 0; 46 return 0;
56} 47}
57 48
diff --git a/kernel/module.c b/kernel/module.c
index 6085f5ef88ea..0925c9a71975 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -21,6 +21,7 @@
21#include <linux/ftrace_event.h> 21#include <linux/ftrace_event.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/file.h>
24#include <linux/fs.h> 25#include <linux/fs.h>
25#include <linux/sysfs.h> 26#include <linux/sysfs.h>
26#include <linux/kernel.h> 27#include <linux/kernel.h>
@@ -28,6 +29,7 @@
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
29#include <linux/elf.h> 30#include <linux/elf.h>
30#include <linux/proc_fs.h> 31#include <linux/proc_fs.h>
32#include <linux/security.h>
31#include <linux/seq_file.h> 33#include <linux/seq_file.h>
32#include <linux/syscalls.h> 34#include <linux/syscalls.h>
33#include <linux/fcntl.h> 35#include <linux/fcntl.h>
@@ -59,6 +61,7 @@
59#include <linux/pfn.h> 61#include <linux/pfn.h>
60#include <linux/bsearch.h> 62#include <linux/bsearch.h>
61#include <linux/fips.h> 63#include <linux/fips.h>
64#include <uapi/linux/module.h>
62#include "module-internal.h" 65#include "module-internal.h"
63 66
64#define CREATE_TRACE_POINTS 67#define CREATE_TRACE_POINTS
@@ -185,6 +188,7 @@ struct load_info {
185 ongoing or failed initialization etc. */ 188 ongoing or failed initialization etc. */
186static inline int strong_try_module_get(struct module *mod) 189static inline int strong_try_module_get(struct module *mod)
187{ 190{
191 BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
188 if (mod && mod->state == MODULE_STATE_COMING) 192 if (mod && mod->state == MODULE_STATE_COMING)
189 return -EBUSY; 193 return -EBUSY;
190 if (try_module_get(mod)) 194 if (try_module_get(mod))
@@ -193,9 +197,10 @@ static inline int strong_try_module_get(struct module *mod)
193 return -ENOENT; 197 return -ENOENT;
194} 198}
195 199
196static inline void add_taint_module(struct module *mod, unsigned flag) 200static inline void add_taint_module(struct module *mod, unsigned flag,
201 enum lockdep_ok lockdep_ok)
197{ 202{
198 add_taint(flag); 203 add_taint(flag, lockdep_ok);
199 mod->taints |= (1U << flag); 204 mod->taints |= (1U << flag);
200} 205}
201 206
@@ -340,6 +345,9 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
340#endif 345#endif
341 }; 346 };
342 347
348 if (mod->state == MODULE_STATE_UNFORMED)
349 continue;
350
343 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) 351 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
344 return true; 352 return true;
345 } 353 }
@@ -372,9 +380,6 @@ static bool check_symbol(const struct symsearch *syms,
372 printk(KERN_WARNING "Symbol %s is being used " 380 printk(KERN_WARNING "Symbol %s is being used "
373 "by a non-GPL module, which will not " 381 "by a non-GPL module, which will not "
374 "be allowed in the future\n", fsa->name); 382 "be allowed in the future\n", fsa->name);
375 printk(KERN_WARNING "Please see the file "
376 "Documentation/feature-removal-schedule.txt "
377 "in the kernel source tree for more details.\n");
378 } 383 }
379 } 384 }
380 385
@@ -450,16 +455,24 @@ const struct kernel_symbol *find_symbol(const char *name,
450EXPORT_SYMBOL_GPL(find_symbol); 455EXPORT_SYMBOL_GPL(find_symbol);
451 456
452/* Search for module by name: must hold module_mutex. */ 457/* Search for module by name: must hold module_mutex. */
453struct module *find_module(const char *name) 458static struct module *find_module_all(const char *name,
459 bool even_unformed)
454{ 460{
455 struct module *mod; 461 struct module *mod;
456 462
457 list_for_each_entry(mod, &modules, list) { 463 list_for_each_entry(mod, &modules, list) {
464 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
465 continue;
458 if (strcmp(mod->name, name) == 0) 466 if (strcmp(mod->name, name) == 0)
459 return mod; 467 return mod;
460 } 468 }
461 return NULL; 469 return NULL;
462} 470}
471
472struct module *find_module(const char *name)
473{
474 return find_module_all(name, false);
475}
463EXPORT_SYMBOL_GPL(find_module); 476EXPORT_SYMBOL_GPL(find_module);
464 477
465#ifdef CONFIG_SMP 478#ifdef CONFIG_SMP
@@ -525,6 +538,8 @@ bool is_module_percpu_address(unsigned long addr)
525 preempt_disable(); 538 preempt_disable();
526 539
527 list_for_each_entry_rcu(mod, &modules, list) { 540 list_for_each_entry_rcu(mod, &modules, list) {
541 if (mod->state == MODULE_STATE_UNFORMED)
542 continue;
528 if (!mod->percpu_size) 543 if (!mod->percpu_size)
529 continue; 544 continue;
530 for_each_possible_cpu(cpu) { 545 for_each_possible_cpu(cpu) {
@@ -713,7 +728,7 @@ static inline int try_force_unload(unsigned int flags)
713{ 728{
714 int ret = (flags & O_TRUNC); 729 int ret = (flags & O_TRUNC);
715 if (ret) 730 if (ret)
716 add_taint(TAINT_FORCED_RMMOD); 731 add_taint(TAINT_FORCED_RMMOD, LOCKDEP_NOW_UNRELIABLE);
717 return ret; 732 return ret;
718} 733}
719#else 734#else
@@ -1048,6 +1063,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
1048 case MODULE_STATE_GOING: 1063 case MODULE_STATE_GOING:
1049 state = "going"; 1064 state = "going";
1050 break; 1065 break;
1066 default:
1067 BUG();
1051 } 1068 }
1052 return sprintf(buffer, "%s\n", state); 1069 return sprintf(buffer, "%s\n", state);
1053} 1070}
@@ -1122,7 +1139,7 @@ static int try_to_force_load(struct module *mod, const char *reason)
1122 if (!test_taint(TAINT_FORCED_MODULE)) 1139 if (!test_taint(TAINT_FORCED_MODULE))
1123 printk(KERN_WARNING "%s: %s: kernel tainted.\n", 1140 printk(KERN_WARNING "%s: %s: kernel tainted.\n",
1124 mod->name, reason); 1141 mod->name, reason);
1125 add_taint_module(mod, TAINT_FORCED_MODULE); 1142 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);
1126 return 0; 1143 return 0;
1127#else 1144#else
1128 return -ENOEXEC; 1145 return -ENOEXEC;
@@ -1786,6 +1803,8 @@ void set_all_modules_text_rw(void)
1786 1803
1787 mutex_lock(&module_mutex); 1804 mutex_lock(&module_mutex);
1788 list_for_each_entry_rcu(mod, &modules, list) { 1805 list_for_each_entry_rcu(mod, &modules, list) {
1806 if (mod->state == MODULE_STATE_UNFORMED)
1807 continue;
1789 if ((mod->module_core) && (mod->core_text_size)) { 1808 if ((mod->module_core) && (mod->core_text_size)) {
1790 set_page_attributes(mod->module_core, 1809 set_page_attributes(mod->module_core,
1791 mod->module_core + mod->core_text_size, 1810 mod->module_core + mod->core_text_size,
@@ -1807,6 +1826,8 @@ void set_all_modules_text_ro(void)
1807 1826
1808 mutex_lock(&module_mutex); 1827 mutex_lock(&module_mutex);
1809 list_for_each_entry_rcu(mod, &modules, list) { 1828 list_for_each_entry_rcu(mod, &modules, list) {
1829 if (mod->state == MODULE_STATE_UNFORMED)
1830 continue;
1810 if ((mod->module_core) && (mod->core_text_size)) { 1831 if ((mod->module_core) && (mod->core_text_size)) {
1811 set_page_attributes(mod->module_core, 1832 set_page_attributes(mod->module_core,
1812 mod->module_core + mod->core_text_size, 1833 mod->module_core + mod->core_text_size,
@@ -2127,7 +2148,8 @@ static void set_license(struct module *mod, const char *license)
2127 if (!test_taint(TAINT_PROPRIETARY_MODULE)) 2148 if (!test_taint(TAINT_PROPRIETARY_MODULE))
2128 printk(KERN_WARNING "%s: module license '%s' taints " 2149 printk(KERN_WARNING "%s: module license '%s' taints "
2129 "kernel.\n", mod->name, license); 2150 "kernel.\n", mod->name, license);
2130 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2151 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2152 LOCKDEP_NOW_UNRELIABLE);
2131 } 2153 }
2132} 2154}
2133 2155
@@ -2282,7 +2304,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2282 Elf_Shdr *symsect = info->sechdrs + info->index.sym; 2304 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
2283 Elf_Shdr *strsect = info->sechdrs + info->index.str; 2305 Elf_Shdr *strsect = info->sechdrs + info->index.str;
2284 const Elf_Sym *src; 2306 const Elf_Sym *src;
2285 unsigned int i, nsrc, ndst, strtab_size; 2307 unsigned int i, nsrc, ndst, strtab_size = 0;
2286 2308
2287 /* Put symbol section at end of init part of module. */ 2309 /* Put symbol section at end of init part of module. */
2288 symsect->sh_flags |= SHF_ALLOC; 2310 symsect->sh_flags |= SHF_ALLOC;
@@ -2294,11 +2316,13 @@ static void layout_symtab(struct module *mod, struct load_info *info)
2294 nsrc = symsect->sh_size / sizeof(*src); 2316 nsrc = symsect->sh_size / sizeof(*src);
2295 2317
2296 /* Compute total space required for the core symbols' strtab. */ 2318 /* Compute total space required for the core symbols' strtab. */
2297 for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) 2319 for (ndst = i = 0; i < nsrc; i++) {
2298 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { 2320 if (i == 0 ||
2299 strtab_size += strlen(&info->strtab[src->st_name]) + 1; 2321 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
2322 strtab_size += strlen(&info->strtab[src[i].st_name])+1;
2300 ndst++; 2323 ndst++;
2301 } 2324 }
2325 }
2302 2326
2303 /* Append room for core symbols at end of core part. */ 2327 /* Append room for core symbols at end of core part. */
2304 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 2328 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
@@ -2332,15 +2356,14 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2332 mod->core_symtab = dst = mod->module_core + info->symoffs; 2356 mod->core_symtab = dst = mod->module_core + info->symoffs;
2333 mod->core_strtab = s = mod->module_core + info->stroffs; 2357 mod->core_strtab = s = mod->module_core + info->stroffs;
2334 src = mod->symtab; 2358 src = mod->symtab;
2335 *dst = *src; 2359 for (ndst = i = 0; i < mod->num_symtab; i++) {
2336 *s++ = 0; 2360 if (i == 0 ||
2337 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2361 is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
2338 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) 2362 dst[ndst] = src[i];
2339 continue; 2363 dst[ndst++].st_name = s - mod->core_strtab;
2340 2364 s += strlcpy(s, &mod->strtab[src[i].st_name],
2341 dst[ndst] = *src; 2365 KSYM_NAME_LEN) + 1;
2342 dst[ndst++].st_name = s - mod->core_strtab; 2366 }
2343 s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
2344 } 2367 }
2345 mod->core_num_syms = ndst; 2368 mod->core_num_syms = ndst;
2346} 2369}
@@ -2373,7 +2396,7 @@ static void dynamic_debug_remove(struct _ddebug *debug)
2373 2396
2374void * __weak module_alloc(unsigned long size) 2397void * __weak module_alloc(unsigned long size)
2375{ 2398{
2376 return size == 0 ? NULL : vmalloc_exec(size); 2399 return vmalloc_exec(size);
2377} 2400}
2378 2401
2379static void *module_alloc_update_bounds(unsigned long size) 2402static void *module_alloc_update_bounds(unsigned long size)
@@ -2420,18 +2443,17 @@ static inline void kmemleak_load_module(const struct module *mod,
2420#endif 2443#endif
2421 2444
2422#ifdef CONFIG_MODULE_SIG 2445#ifdef CONFIG_MODULE_SIG
2423static int module_sig_check(struct load_info *info, 2446static int module_sig_check(struct load_info *info)
2424 const void *mod, unsigned long *_len)
2425{ 2447{
2426 int err = -ENOKEY; 2448 int err = -ENOKEY;
2427 unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; 2449 const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
2428 unsigned long len = *_len; 2450 const void *mod = info->hdr;
2429 2451
2430 if (len > markerlen && 2452 if (info->len > markerlen &&
2431 memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { 2453 memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
2432 /* We truncate the module to discard the signature */ 2454 /* We truncate the module to discard the signature */
2433 *_len -= markerlen; 2455 info->len -= markerlen;
2434 err = mod_verify_sig(mod, _len); 2456 err = mod_verify_sig(mod, &info->len);
2435 } 2457 }
2436 2458
2437 if (!err) { 2459 if (!err) {
@@ -2449,59 +2471,114 @@ static int module_sig_check(struct load_info *info,
2449 return err; 2471 return err;
2450} 2472}
2451#else /* !CONFIG_MODULE_SIG */ 2473#else /* !CONFIG_MODULE_SIG */
2452static int module_sig_check(struct load_info *info, 2474static int module_sig_check(struct load_info *info)
2453 void *mod, unsigned long *len)
2454{ 2475{
2455 return 0; 2476 return 0;
2456} 2477}
2457#endif /* !CONFIG_MODULE_SIG */ 2478#endif /* !CONFIG_MODULE_SIG */
2458 2479
2459/* Sets info->hdr, info->len and info->sig_ok. */ 2480/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
2460static int copy_and_check(struct load_info *info, 2481static int elf_header_check(struct load_info *info)
2461 const void __user *umod, unsigned long len, 2482{
2462 const char __user *uargs) 2483 if (info->len < sizeof(*(info->hdr)))
2484 return -ENOEXEC;
2485
2486 if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
2487 || info->hdr->e_type != ET_REL
2488 || !elf_check_arch(info->hdr)
2489 || info->hdr->e_shentsize != sizeof(Elf_Shdr))
2490 return -ENOEXEC;
2491
2492 if (info->hdr->e_shoff >= info->len
2493 || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
2494 info->len - info->hdr->e_shoff))
2495 return -ENOEXEC;
2496
2497 return 0;
2498}
2499
2500/* Sets info->hdr and info->len. */
2501static int copy_module_from_user(const void __user *umod, unsigned long len,
2502 struct load_info *info)
2463{ 2503{
2464 int err; 2504 int err;
2465 Elf_Ehdr *hdr;
2466 2505
2467 if (len < sizeof(*hdr)) 2506 info->len = len;
2507 if (info->len < sizeof(*(info->hdr)))
2468 return -ENOEXEC; 2508 return -ENOEXEC;
2469 2509
2510 err = security_kernel_module_from_file(NULL);
2511 if (err)
2512 return err;
2513
2470 /* Suck in entire file: we'll want most of it. */ 2514 /* Suck in entire file: we'll want most of it. */
2471 if ((hdr = vmalloc(len)) == NULL) 2515 info->hdr = vmalloc(info->len);
2516 if (!info->hdr)
2472 return -ENOMEM; 2517 return -ENOMEM;
2473 2518
2474 if (copy_from_user(hdr, umod, len) != 0) { 2519 if (copy_from_user(info->hdr, umod, info->len) != 0) {
2475 err = -EFAULT; 2520 vfree(info->hdr);
2476 goto free_hdr; 2521 return -EFAULT;
2477 } 2522 }
2478 2523
2479 err = module_sig_check(info, hdr, &len); 2524 return 0;
2525}
2526
2527/* Sets info->hdr and info->len. */
2528static int copy_module_from_fd(int fd, struct load_info *info)
2529{
2530 struct file *file;
2531 int err;
2532 struct kstat stat;
2533 loff_t pos;
2534 ssize_t bytes = 0;
2535
2536 file = fget(fd);
2537 if (!file)
2538 return -ENOEXEC;
2539
2540 err = security_kernel_module_from_file(file);
2480 if (err) 2541 if (err)
2481 goto free_hdr; 2542 goto out;
2482 2543
2483 /* Sanity checks against insmoding binaries or wrong arch, 2544 err = vfs_getattr(&file->f_path, &stat);
2484 weird elf version */ 2545 if (err)
2485 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 2546 goto out;
2486 || hdr->e_type != ET_REL 2547
2487 || !elf_check_arch(hdr) 2548 if (stat.size > INT_MAX) {
2488 || hdr->e_shentsize != sizeof(Elf_Shdr)) { 2549 err = -EFBIG;
2489 err = -ENOEXEC; 2550 goto out;
2490 goto free_hdr;
2491 } 2551 }
2492 2552
2493 if (hdr->e_shoff >= len || 2553 /* Don't hand 0 to vmalloc, it whines. */
2494 hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) { 2554 if (stat.size == 0) {
2495 err = -ENOEXEC; 2555 err = -EINVAL;
2496 goto free_hdr; 2556 goto out;
2497 } 2557 }
2498 2558
2499 info->hdr = hdr; 2559 info->hdr = vmalloc(stat.size);
2500 info->len = len; 2560 if (!info->hdr) {
2501 return 0; 2561 err = -ENOMEM;
2562 goto out;
2563 }
2564
2565 pos = 0;
2566 while (pos < stat.size) {
2567 bytes = kernel_read(file, pos, (char *)(info->hdr) + pos,
2568 stat.size - pos);
2569 if (bytes < 0) {
2570 vfree(info->hdr);
2571 err = bytes;
2572 goto out;
2573 }
2574 if (bytes == 0)
2575 break;
2576 pos += bytes;
2577 }
2578 info->len = pos;
2502 2579
2503free_hdr: 2580out:
2504 vfree(hdr); 2581 fput(file);
2505 return err; 2582 return err;
2506} 2583}
2507 2584
@@ -2510,7 +2587,7 @@ static void free_copy(struct load_info *info)
2510 vfree(info->hdr); 2587 vfree(info->hdr);
2511} 2588}
2512 2589
2513static int rewrite_section_headers(struct load_info *info) 2590static int rewrite_section_headers(struct load_info *info, int flags)
2514{ 2591{
2515 unsigned int i; 2592 unsigned int i;
2516 2593
@@ -2538,7 +2615,10 @@ static int rewrite_section_headers(struct load_info *info)
2538 } 2615 }
2539 2616
2540 /* Track but don't keep modinfo and version sections. */ 2617 /* Track but don't keep modinfo and version sections. */
2541 info->index.vers = find_sec(info, "__versions"); 2618 if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
2619 info->index.vers = 0; /* Pretend no __versions section! */
2620 else
2621 info->index.vers = find_sec(info, "__versions");
2542 info->index.info = find_sec(info, ".modinfo"); 2622 info->index.info = find_sec(info, ".modinfo");
2543 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; 2623 info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
2544 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; 2624 info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -2553,7 +2633,7 @@ static int rewrite_section_headers(struct load_info *info)
2553 * Return the temporary module pointer (we'll replace it with the final 2633 * Return the temporary module pointer (we'll replace it with the final
2554 * one when we move the module sections around). 2634 * one when we move the module sections around).
2555 */ 2635 */
2556static struct module *setup_load_info(struct load_info *info) 2636static struct module *setup_load_info(struct load_info *info, int flags)
2557{ 2637{
2558 unsigned int i; 2638 unsigned int i;
2559 int err; 2639 int err;
@@ -2564,7 +2644,7 @@ static struct module *setup_load_info(struct load_info *info)
2564 info->secstrings = (void *)info->hdr 2644 info->secstrings = (void *)info->hdr
2565 + info->sechdrs[info->hdr->e_shstrndx].sh_offset; 2645 + info->sechdrs[info->hdr->e_shstrndx].sh_offset;
2566 2646
2567 err = rewrite_section_headers(info); 2647 err = rewrite_section_headers(info, flags);
2568 if (err) 2648 if (err)
2569 return ERR_PTR(err); 2649 return ERR_PTR(err);
2570 2650
@@ -2602,11 +2682,14 @@ static struct module *setup_load_info(struct load_info *info)
2602 return mod; 2682 return mod;
2603} 2683}
2604 2684
2605static int check_modinfo(struct module *mod, struct load_info *info) 2685static int check_modinfo(struct module *mod, struct load_info *info, int flags)
2606{ 2686{
2607 const char *modmagic = get_modinfo(info, "vermagic"); 2687 const char *modmagic = get_modinfo(info, "vermagic");
2608 int err; 2688 int err;
2609 2689
2690 if (flags & MODULE_INIT_IGNORE_VERMAGIC)
2691 modmagic = NULL;
2692
2610 /* This is allowed: modprobe --force will invalidate it. */ 2693 /* This is allowed: modprobe --force will invalidate it. */
2611 if (!modmagic) { 2694 if (!modmagic) {
2612 err = try_to_force_load(mod, "bad vermagic"); 2695 err = try_to_force_load(mod, "bad vermagic");
@@ -2619,10 +2702,10 @@ static int check_modinfo(struct module *mod, struct load_info *info)
2619 } 2702 }
2620 2703
2621 if (!get_modinfo(info, "intree")) 2704 if (!get_modinfo(info, "intree"))
2622 add_taint_module(mod, TAINT_OOT_MODULE); 2705 add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
2623 2706
2624 if (get_modinfo(info, "staging")) { 2707 if (get_modinfo(info, "staging")) {
2625 add_taint_module(mod, TAINT_CRAP); 2708 add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
2626 printk(KERN_WARNING "%s: module is from the staging directory," 2709 printk(KERN_WARNING "%s: module is from the staging directory,"
2627 " the quality is unknown, you have been warned.\n", 2710 " the quality is unknown, you have been warned.\n",
2628 mod->name); 2711 mod->name);
@@ -2736,20 +2819,23 @@ static int move_module(struct module *mod, struct load_info *info)
2736 memset(ptr, 0, mod->core_size); 2819 memset(ptr, 0, mod->core_size);
2737 mod->module_core = ptr; 2820 mod->module_core = ptr;
2738 2821
2739 ptr = module_alloc_update_bounds(mod->init_size); 2822 if (mod->init_size) {
2740 /* 2823 ptr = module_alloc_update_bounds(mod->init_size);
2741 * The pointer to this block is stored in the module structure 2824 /*
2742 * which is inside the block. This block doesn't need to be 2825 * The pointer to this block is stored in the module structure
2743 * scanned as it contains data and code that will be freed 2826 * which is inside the block. This block doesn't need to be
2744 * after the module is initialized. 2827 * scanned as it contains data and code that will be freed
2745 */ 2828 * after the module is initialized.
2746 kmemleak_ignore(ptr); 2829 */
2747 if (!ptr && mod->init_size) { 2830 kmemleak_ignore(ptr);
2748 module_free(mod, mod->module_core); 2831 if (!ptr) {
2749 return -ENOMEM; 2832 module_free(mod, mod->module_core);
2750 } 2833 return -ENOMEM;
2751 memset(ptr, 0, mod->init_size); 2834 }
2752 mod->module_init = ptr; 2835 memset(ptr, 0, mod->init_size);
2836 mod->module_init = ptr;
2837 } else
2838 mod->module_init = NULL;
2753 2839
2754 /* Transfer each section which specifies SHF_ALLOC */ 2840 /* Transfer each section which specifies SHF_ALLOC */
2755 pr_debug("final section addresses:\n"); 2841 pr_debug("final section addresses:\n");
@@ -2785,15 +2871,17 @@ static int check_module_license_and_versions(struct module *mod)
2785 * using GPL-only symbols it needs. 2871 * using GPL-only symbols it needs.
2786 */ 2872 */
2787 if (strcmp(mod->name, "ndiswrapper") == 0) 2873 if (strcmp(mod->name, "ndiswrapper") == 0)
2788 add_taint(TAINT_PROPRIETARY_MODULE); 2874 add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
2789 2875
2790 /* driverloader was caught wrongly pretending to be under GPL */ 2876 /* driverloader was caught wrongly pretending to be under GPL */
2791 if (strcmp(mod->name, "driverloader") == 0) 2877 if (strcmp(mod->name, "driverloader") == 0)
2792 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2878 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2879 LOCKDEP_NOW_UNRELIABLE);
2793 2880
2794 /* lve claims to be GPL but upstream won't provide source */ 2881 /* lve claims to be GPL but upstream won't provide source */
2795 if (strcmp(mod->name, "lve") == 0) 2882 if (strcmp(mod->name, "lve") == 0)
2796 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 2883 add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
2884 LOCKDEP_NOW_UNRELIABLE);
2797 2885
2798#ifdef CONFIG_MODVERSIONS 2886#ifdef CONFIG_MODVERSIONS
2799 if ((mod->num_syms && !mod->crcs) 2887 if ((mod->num_syms && !mod->crcs)
@@ -2842,18 +2930,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
2842 return 0; 2930 return 0;
2843} 2931}
2844 2932
2845static struct module *layout_and_allocate(struct load_info *info) 2933static struct module *layout_and_allocate(struct load_info *info, int flags)
2846{ 2934{
2847 /* Module within temporary copy. */ 2935 /* Module within temporary copy. */
2848 struct module *mod; 2936 struct module *mod;
2849 Elf_Shdr *pcpusec; 2937 Elf_Shdr *pcpusec;
2850 int err; 2938 int err;
2851 2939
2852 mod = setup_load_info(info); 2940 mod = setup_load_info(info, flags);
2853 if (IS_ERR(mod)) 2941 if (IS_ERR(mod))
2854 return mod; 2942 return mod;
2855 2943
2856 err = check_modinfo(mod, info); 2944 err = check_modinfo(mod, info, flags);
2857 if (err) 2945 if (err)
2858 return ERR_PTR(err); 2946 return ERR_PTR(err);
2859 2947
@@ -2933,70 +3021,255 @@ static bool finished_loading(const char *name)
2933 bool ret; 3021 bool ret;
2934 3022
2935 mutex_lock(&module_mutex); 3023 mutex_lock(&module_mutex);
2936 mod = find_module(name); 3024 mod = find_module_all(name, true);
2937 ret = !mod || mod->state != MODULE_STATE_COMING; 3025 ret = !mod || mod->state == MODULE_STATE_LIVE
3026 || mod->state == MODULE_STATE_GOING;
2938 mutex_unlock(&module_mutex); 3027 mutex_unlock(&module_mutex);
2939 3028
2940 return ret; 3029 return ret;
2941} 3030}
2942 3031
3032/* Call module constructors. */
3033static void do_mod_ctors(struct module *mod)
3034{
3035#ifdef CONFIG_CONSTRUCTORS
3036 unsigned long i;
3037
3038 for (i = 0; i < mod->num_ctors; i++)
3039 mod->ctors[i]();
3040#endif
3041}
3042
3043/* This is where the real work happens */
3044static int do_init_module(struct module *mod)
3045{
3046 int ret = 0;
3047
3048 /*
3049 * We want to find out whether @mod uses async during init. Clear
3050 * PF_USED_ASYNC. async_schedule*() will set it.
3051 */
3052 current->flags &= ~PF_USED_ASYNC;
3053
3054 blocking_notifier_call_chain(&module_notify_list,
3055 MODULE_STATE_COMING, mod);
3056
3057 /* Set RO and NX regions for core */
3058 set_section_ro_nx(mod->module_core,
3059 mod->core_text_size,
3060 mod->core_ro_size,
3061 mod->core_size);
3062
3063 /* Set RO and NX regions for init */
3064 set_section_ro_nx(mod->module_init,
3065 mod->init_text_size,
3066 mod->init_ro_size,
3067 mod->init_size);
3068
3069 do_mod_ctors(mod);
3070 /* Start the module */
3071 if (mod->init != NULL)
3072 ret = do_one_initcall(mod->init);
3073 if (ret < 0) {
3074 /* Init routine failed: abort. Try to protect us from
3075 buggy refcounters. */
3076 mod->state = MODULE_STATE_GOING;
3077 synchronize_sched();
3078 module_put(mod);
3079 blocking_notifier_call_chain(&module_notify_list,
3080 MODULE_STATE_GOING, mod);
3081 free_module(mod);
3082 wake_up_all(&module_wq);
3083 return ret;
3084 }
3085 if (ret > 0) {
3086 printk(KERN_WARNING
3087"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
3088"%s: loading module anyway...\n",
3089 __func__, mod->name, ret,
3090 __func__);
3091 dump_stack();
3092 }
3093
3094 /* Now it's a first class citizen! */
3095 mod->state = MODULE_STATE_LIVE;
3096 blocking_notifier_call_chain(&module_notify_list,
3097 MODULE_STATE_LIVE, mod);
3098
3099 /*
3100 * We need to finish all async code before the module init sequence
3101 * is done. This has potential to deadlock. For example, a newly
3102 * detected block device can trigger request_module() of the
 3103 * default iosched from an async probing task. Once the userland helper
3104 * reaches here, async_synchronize_full() will wait on the async
3105 * task waiting on request_module() and deadlock.
3106 *
 3107 * This deadlock is avoided by performing async_synchronize_full()
3108 * iff module init queued any async jobs. This isn't a full
3109 * solution as it will deadlock the same if module loading from
3110 * async jobs nests more than once; however, due to the various
3111 * constraints, this hack seems to be the best option for now.
3112 * Please refer to the following thread for details.
3113 *
3114 * http://thread.gmane.org/gmane.linux.kernel/1420814
3115 */
3116 if (current->flags & PF_USED_ASYNC)
3117 async_synchronize_full();
3118
3119 mutex_lock(&module_mutex);
3120 /* Drop initial reference. */
3121 module_put(mod);
3122 trim_init_extable(mod);
3123#ifdef CONFIG_KALLSYMS
3124 mod->num_symtab = mod->core_num_syms;
3125 mod->symtab = mod->core_symtab;
3126 mod->strtab = mod->core_strtab;
3127#endif
3128 unset_module_init_ro_nx(mod);
3129 module_free(mod, mod->module_init);
3130 mod->module_init = NULL;
3131 mod->init_size = 0;
3132 mod->init_ro_size = 0;
3133 mod->init_text_size = 0;
3134 mutex_unlock(&module_mutex);
3135 wake_up_all(&module_wq);
3136
3137 return 0;
3138}
3139
3140static int may_init_module(void)
3141{
3142 if (!capable(CAP_SYS_MODULE) || modules_disabled)
3143 return -EPERM;
3144
3145 return 0;
3146}
3147
3148/*
3149 * We try to place it in the list now to make sure it's unique before
 3150 * we dedicate too many resources; in particular, this avoids exhausting
 3151 * temporary percpu memory.
3152 */
3153static int add_unformed_module(struct module *mod)
3154{
3155 int err;
3156 struct module *old;
3157
3158 mod->state = MODULE_STATE_UNFORMED;
3159
3160again:
3161 mutex_lock(&module_mutex);
3162 if ((old = find_module_all(mod->name, true)) != NULL) {
3163 if (old->state == MODULE_STATE_COMING
3164 || old->state == MODULE_STATE_UNFORMED) {
3165 /* Wait in case it fails to load. */
3166 mutex_unlock(&module_mutex);
3167 err = wait_event_interruptible(module_wq,
3168 finished_loading(mod->name));
3169 if (err)
3170 goto out_unlocked;
3171 goto again;
3172 }
3173 err = -EEXIST;
3174 goto out;
3175 }
3176 list_add_rcu(&mod->list, &modules);
3177 err = 0;
3178
3179out:
3180 mutex_unlock(&module_mutex);
3181out_unlocked:
3182 return err;
3183}
3184
3185static int complete_formation(struct module *mod, struct load_info *info)
3186{
3187 int err;
3188
3189 mutex_lock(&module_mutex);
3190
3191 /* Find duplicate symbols (must be called under lock). */
3192 err = verify_export_symbols(mod);
3193 if (err < 0)
3194 goto out;
3195
3196 /* This relies on module_mutex for list integrity. */
3197 module_bug_finalize(info->hdr, info->sechdrs, mod);
3198
3199 /* Mark state as coming so strong_try_module_get() ignores us,
3200 * but kallsyms etc. can see us. */
3201 mod->state = MODULE_STATE_COMING;
3202
3203out:
3204 mutex_unlock(&module_mutex);
3205 return err;
3206}
3207
2943/* Allocate and load the module: note that size of section 0 is always 3208/* Allocate and load the module: note that size of section 0 is always
2944 zero, and we rely on this for optional sections. */ 3209 zero, and we rely on this for optional sections. */
2945static struct module *load_module(void __user *umod, 3210static int load_module(struct load_info *info, const char __user *uargs,
2946 unsigned long len, 3211 int flags)
2947 const char __user *uargs)
2948{ 3212{
2949 struct load_info info = { NULL, }; 3213 struct module *mod;
2950 struct module *mod, *old;
2951 long err; 3214 long err;
2952 3215
2953 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", 3216 err = module_sig_check(info);
2954 umod, len, uargs); 3217 if (err)
3218 goto free_copy;
2955 3219
2956 /* Copy in the blobs from userspace, check they are vaguely sane. */ 3220 err = elf_header_check(info);
2957 err = copy_and_check(&info, umod, len, uargs);
2958 if (err) 3221 if (err)
2959 return ERR_PTR(err); 3222 goto free_copy;
2960 3223
2961 /* Figure out module layout, and allocate all the memory. */ 3224 /* Figure out module layout, and allocate all the memory. */
2962 mod = layout_and_allocate(&info); 3225 mod = layout_and_allocate(info, flags);
2963 if (IS_ERR(mod)) { 3226 if (IS_ERR(mod)) {
2964 err = PTR_ERR(mod); 3227 err = PTR_ERR(mod);
2965 goto free_copy; 3228 goto free_copy;
2966 } 3229 }
2967 3230
3231 /* Reserve our place in the list. */
3232 err = add_unformed_module(mod);
3233 if (err)
3234 goto free_module;
3235
2968#ifdef CONFIG_MODULE_SIG 3236#ifdef CONFIG_MODULE_SIG
2969 mod->sig_ok = info.sig_ok; 3237 mod->sig_ok = info->sig_ok;
2970 if (!mod->sig_ok) 3238 if (!mod->sig_ok) {
2971 add_taint_module(mod, TAINT_FORCED_MODULE); 3239 printk_once(KERN_NOTICE
3240 "%s: module verification failed: signature and/or"
3241 " required key missing - tainting kernel\n",
3242 mod->name);
3243 add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);
3244 }
2972#endif 3245#endif
2973 3246
2974 /* Now module is in final location, initialize linked lists, etc. */ 3247 /* Now module is in final location, initialize linked lists, etc. */
2975 err = module_unload_init(mod); 3248 err = module_unload_init(mod);
2976 if (err) 3249 if (err)
2977 goto free_module; 3250 goto unlink_mod;
2978 3251
2979 /* Now we've got everything in the final locations, we can 3252 /* Now we've got everything in the final locations, we can
2980 * find optional sections. */ 3253 * find optional sections. */
2981 find_module_sections(mod, &info); 3254 find_module_sections(mod, info);
2982 3255
2983 err = check_module_license_and_versions(mod); 3256 err = check_module_license_and_versions(mod);
2984 if (err) 3257 if (err)
2985 goto free_unload; 3258 goto free_unload;
2986 3259
2987 /* Set up MODINFO_ATTR fields */ 3260 /* Set up MODINFO_ATTR fields */
2988 setup_modinfo(mod, &info); 3261 setup_modinfo(mod, info);
2989 3262
2990 /* Fix up syms, so that st_value is a pointer to location. */ 3263 /* Fix up syms, so that st_value is a pointer to location. */
2991 err = simplify_symbols(mod, &info); 3264 err = simplify_symbols(mod, info);
2992 if (err < 0) 3265 if (err < 0)
2993 goto free_modinfo; 3266 goto free_modinfo;
2994 3267
2995 err = apply_relocations(mod, &info); 3268 err = apply_relocations(mod, info);
2996 if (err < 0) 3269 if (err < 0)
2997 goto free_modinfo; 3270 goto free_modinfo;
2998 3271
2999 err = post_relocation(mod, &info); 3272 err = post_relocation(mod, info);
3000 if (err < 0) 3273 if (err < 0)
3001 goto free_modinfo; 3274 goto free_modinfo;
3002 3275
@@ -3009,72 +3282,39 @@ static struct module *load_module(void __user *umod,
3009 goto free_arch_cleanup; 3282 goto free_arch_cleanup;
3010 } 3283 }
3011 3284
3012 /* Mark state as coming so strong_try_module_get() ignores us. */ 3285 dynamic_debug_setup(info->debug, info->num_debug);
3013 mod->state = MODULE_STATE_COMING;
3014 3286
3015 /* Now sew it into the lists so we can get lockdep and oops 3287 /* Finally it's fully formed, ready to start executing. */
3016 * info during argument parsing. No one should access us, since 3288 err = complete_formation(mod, info);
3017 * strong_try_module_get() will fail. 3289 if (err)
3018 * lockdep/oops can run asynchronous, so use the RCU list insertion 3290 goto ddebug_cleanup;
3019 * function to insert in a way safe to concurrent readers.
3020 * The mutex protects against concurrent writers.
3021 */
3022again:
3023 mutex_lock(&module_mutex);
3024 if ((old = find_module(mod->name)) != NULL) {
3025 if (old->state == MODULE_STATE_COMING) {
3026 /* Wait in case it fails to load. */
3027 mutex_unlock(&module_mutex);
3028 err = wait_event_interruptible(module_wq,
3029 finished_loading(mod->name));
3030 if (err)
3031 goto free_arch_cleanup;
3032 goto again;
3033 }
3034 err = -EEXIST;
3035 goto unlock;
3036 }
3037
3038 /* This has to be done once we're sure module name is unique. */
3039 dynamic_debug_setup(info.debug, info.num_debug);
3040
3041 /* Find duplicate symbols */
3042 err = verify_export_symbols(mod);
3043 if (err < 0)
3044 goto ddebug;
3045
3046 module_bug_finalize(info.hdr, info.sechdrs, mod);
3047 list_add_rcu(&mod->list, &modules);
3048 mutex_unlock(&module_mutex);
3049 3291
3050 /* Module is ready to execute: parsing args may do that. */ 3292 /* Module is ready to execute: parsing args may do that. */
3051 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 3293 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
3052 -32768, 32767, &ddebug_dyndbg_module_param_cb); 3294 -32768, 32767, &ddebug_dyndbg_module_param_cb);
3053 if (err < 0) 3295 if (err < 0)
3054 goto unlink; 3296 goto bug_cleanup;
3055 3297
 3056 /* Link in to sysfs. */ 3298 /* Link in to sysfs. */
3057 err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); 3299 err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
3058 if (err < 0) 3300 if (err < 0)
3059 goto unlink; 3301 goto bug_cleanup;
3060 3302
3061 /* Get rid of temporary copy. */ 3303 /* Get rid of temporary copy. */
3062 free_copy(&info); 3304 free_copy(info);
3063 3305
3064 /* Done! */ 3306 /* Done! */
3065 trace_module_load(mod); 3307 trace_module_load(mod);
3066 return mod;
3067 3308
3068 unlink: 3309 return do_init_module(mod);
3310
3311 bug_cleanup:
3312 /* module_bug_cleanup needs module_mutex protection */
3069 mutex_lock(&module_mutex); 3313 mutex_lock(&module_mutex);
3070 /* Unlink carefully: kallsyms could be walking list. */
3071 list_del_rcu(&mod->list);
3072 module_bug_cleanup(mod); 3314 module_bug_cleanup(mod);
3073 wake_up_all(&module_wq);
3074 ddebug:
3075 dynamic_debug_remove(info.debug);
3076 unlock:
3077 mutex_unlock(&module_mutex); 3315 mutex_unlock(&module_mutex);
3316 ddebug_cleanup:
3317 dynamic_debug_remove(info->debug);
3078 synchronize_sched(); 3318 synchronize_sched();
3079 kfree(mod->args); 3319 kfree(mod->args);
3080 free_arch_cleanup: 3320 free_arch_cleanup:
@@ -3083,107 +3323,59 @@ again:
3083 free_modinfo(mod); 3323 free_modinfo(mod);
3084 free_unload: 3324 free_unload:
3085 module_unload_free(mod); 3325 module_unload_free(mod);
3326 unlink_mod:
3327 mutex_lock(&module_mutex);
3328 /* Unlink carefully: kallsyms could be walking list. */
3329 list_del_rcu(&mod->list);
3330 wake_up_all(&module_wq);
3331 mutex_unlock(&module_mutex);
3086 free_module: 3332 free_module:
3087 module_deallocate(mod, &info); 3333 module_deallocate(mod, info);
3088 free_copy: 3334 free_copy:
3089 free_copy(&info); 3335 free_copy(info);
3090 return ERR_PTR(err); 3336 return err;
3091}
3092
3093/* Call module constructors. */
3094static void do_mod_ctors(struct module *mod)
3095{
3096#ifdef CONFIG_CONSTRUCTORS
3097 unsigned long i;
3098
3099 for (i = 0; i < mod->num_ctors; i++)
3100 mod->ctors[i]();
3101#endif
3102} 3337}
3103 3338
3104/* This is where the real work happens */
3105SYSCALL_DEFINE3(init_module, void __user *, umod, 3339SYSCALL_DEFINE3(init_module, void __user *, umod,
3106 unsigned long, len, const char __user *, uargs) 3340 unsigned long, len, const char __user *, uargs)
3107{ 3341{
3108 struct module *mod; 3342 int err;
3109 int ret = 0; 3343 struct load_info info = { };
3110 3344
3111 /* Must have permission */ 3345 err = may_init_module();
3112 if (!capable(CAP_SYS_MODULE) || modules_disabled) 3346 if (err)
3113 return -EPERM; 3347 return err;
3114 3348
3115 /* Do all the hard work */ 3349 pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
3116 mod = load_module(umod, len, uargs); 3350 umod, len, uargs);
3117 if (IS_ERR(mod))
3118 return PTR_ERR(mod);
3119 3351
3120 blocking_notifier_call_chain(&module_notify_list, 3352 err = copy_module_from_user(umod, len, &info);
3121 MODULE_STATE_COMING, mod); 3353 if (err)
3354 return err;
3122 3355
3123 /* Set RO and NX regions for core */ 3356 return load_module(&info, uargs, 0);
3124 set_section_ro_nx(mod->module_core, 3357}
3125 mod->core_text_size,
3126 mod->core_ro_size,
3127 mod->core_size);
3128 3358
3129 /* Set RO and NX regions for init */ 3359SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
3130 set_section_ro_nx(mod->module_init, 3360{
3131 mod->init_text_size, 3361 int err;
3132 mod->init_ro_size, 3362 struct load_info info = { };
3133 mod->init_size);
3134 3363
3135 do_mod_ctors(mod); 3364 err = may_init_module();
3136 /* Start the module */ 3365 if (err)
3137 if (mod->init != NULL) 3366 return err;
3138 ret = do_one_initcall(mod->init);
3139 if (ret < 0) {
3140 /* Init routine failed: abort. Try to protect us from
3141 buggy refcounters. */
3142 mod->state = MODULE_STATE_GOING;
3143 synchronize_sched();
3144 module_put(mod);
3145 blocking_notifier_call_chain(&module_notify_list,
3146 MODULE_STATE_GOING, mod);
3147 free_module(mod);
3148 wake_up_all(&module_wq);
3149 return ret;
3150 }
3151 if (ret > 0) {
3152 printk(KERN_WARNING
3153"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
3154"%s: loading module anyway...\n",
3155 __func__, mod->name, ret,
3156 __func__);
3157 dump_stack();
3158 }
3159 3367
3160 /* Now it's a first class citizen! */ 3368 pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
3161 mod->state = MODULE_STATE_LIVE;
3162 blocking_notifier_call_chain(&module_notify_list,
3163 MODULE_STATE_LIVE, mod);
3164 3369
3165 /* We need to finish all async code before the module init sequence is done */ 3370 if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
3166 async_synchronize_full(); 3371 |MODULE_INIT_IGNORE_VERMAGIC))
3372 return -EINVAL;
3167 3373
3168 mutex_lock(&module_mutex); 3374 err = copy_module_from_fd(fd, &info);
3169 /* Drop initial reference. */ 3375 if (err)
3170 module_put(mod); 3376 return err;
3171 trim_init_extable(mod);
3172#ifdef CONFIG_KALLSYMS
3173 mod->num_symtab = mod->core_num_syms;
3174 mod->symtab = mod->core_symtab;
3175 mod->strtab = mod->core_strtab;
3176#endif
3177 unset_module_init_ro_nx(mod);
3178 module_free(mod, mod->module_init);
3179 mod->module_init = NULL;
3180 mod->init_size = 0;
3181 mod->init_ro_size = 0;
3182 mod->init_text_size = 0;
3183 mutex_unlock(&module_mutex);
3184 wake_up_all(&module_wq);
3185 3377
3186 return 0; 3378 return load_module(&info, uargs, flags);
3187} 3379}
3188 3380
3189static inline int within(unsigned long addr, void *start, unsigned long size) 3381static inline int within(unsigned long addr, void *start, unsigned long size)
@@ -3259,6 +3451,8 @@ const char *module_address_lookup(unsigned long addr,
3259 3451
3260 preempt_disable(); 3452 preempt_disable();
3261 list_for_each_entry_rcu(mod, &modules, list) { 3453 list_for_each_entry_rcu(mod, &modules, list) {
3454 if (mod->state == MODULE_STATE_UNFORMED)
3455 continue;
3262 if (within_module_init(addr, mod) || 3456 if (within_module_init(addr, mod) ||
3263 within_module_core(addr, mod)) { 3457 within_module_core(addr, mod)) {
3264 if (modname) 3458 if (modname)
@@ -3282,6 +3476,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
3282 3476
3283 preempt_disable(); 3477 preempt_disable();
3284 list_for_each_entry_rcu(mod, &modules, list) { 3478 list_for_each_entry_rcu(mod, &modules, list) {
3479 if (mod->state == MODULE_STATE_UNFORMED)
3480 continue;
3285 if (within_module_init(addr, mod) || 3481 if (within_module_init(addr, mod) ||
3286 within_module_core(addr, mod)) { 3482 within_module_core(addr, mod)) {
3287 const char *sym; 3483 const char *sym;
@@ -3306,6 +3502,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
3306 3502
3307 preempt_disable(); 3503 preempt_disable();
3308 list_for_each_entry_rcu(mod, &modules, list) { 3504 list_for_each_entry_rcu(mod, &modules, list) {
3505 if (mod->state == MODULE_STATE_UNFORMED)
3506 continue;
3309 if (within_module_init(addr, mod) || 3507 if (within_module_init(addr, mod) ||
3310 within_module_core(addr, mod)) { 3508 within_module_core(addr, mod)) {
3311 const char *sym; 3509 const char *sym;
@@ -3333,6 +3531,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
3333 3531
3334 preempt_disable(); 3532 preempt_disable();
3335 list_for_each_entry_rcu(mod, &modules, list) { 3533 list_for_each_entry_rcu(mod, &modules, list) {
3534 if (mod->state == MODULE_STATE_UNFORMED)
3535 continue;
3336 if (symnum < mod->num_symtab) { 3536 if (symnum < mod->num_symtab) {
3337 *value = mod->symtab[symnum].st_value; 3537 *value = mod->symtab[symnum].st_value;
3338 *type = mod->symtab[symnum].st_info; 3538 *type = mod->symtab[symnum].st_info;
@@ -3375,9 +3575,12 @@ unsigned long module_kallsyms_lookup_name(const char *name)
3375 ret = mod_find_symname(mod, colon+1); 3575 ret = mod_find_symname(mod, colon+1);
3376 *colon = ':'; 3576 *colon = ':';
3377 } else { 3577 } else {
3378 list_for_each_entry_rcu(mod, &modules, list) 3578 list_for_each_entry_rcu(mod, &modules, list) {
3579 if (mod->state == MODULE_STATE_UNFORMED)
3580 continue;
3379 if ((ret = mod_find_symname(mod, name)) != 0) 3581 if ((ret = mod_find_symname(mod, name)) != 0)
3380 break; 3582 break;
3583 }
3381 } 3584 }
3382 preempt_enable(); 3585 preempt_enable();
3383 return ret; 3586 return ret;
@@ -3392,6 +3595,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
3392 int ret; 3595 int ret;
3393 3596
3394 list_for_each_entry(mod, &modules, list) { 3597 list_for_each_entry(mod, &modules, list) {
3598 if (mod->state == MODULE_STATE_UNFORMED)
3599 continue;
3395 for (i = 0; i < mod->num_symtab; i++) { 3600 for (i = 0; i < mod->num_symtab; i++) {
3396 ret = fn(data, mod->strtab + mod->symtab[i].st_name, 3601 ret = fn(data, mod->strtab + mod->symtab[i].st_name,
3397 mod, mod->symtab[i].st_value); 3602 mod, mod->symtab[i].st_value);
@@ -3407,6 +3612,7 @@ static char *module_flags(struct module *mod, char *buf)
3407{ 3612{
3408 int bx = 0; 3613 int bx = 0;
3409 3614
3615 BUG_ON(mod->state == MODULE_STATE_UNFORMED);
3410 if (mod->taints || 3616 if (mod->taints ||
3411 mod->state == MODULE_STATE_GOING || 3617 mod->state == MODULE_STATE_GOING ||
3412 mod->state == MODULE_STATE_COMING) { 3618 mod->state == MODULE_STATE_COMING) {
@@ -3448,6 +3654,10 @@ static int m_show(struct seq_file *m, void *p)
3448 struct module *mod = list_entry(p, struct module, list); 3654 struct module *mod = list_entry(p, struct module, list);
3449 char buf[8]; 3655 char buf[8];
3450 3656
3657 /* We always ignore unformed modules. */
3658 if (mod->state == MODULE_STATE_UNFORMED)
3659 return 0;
3660
3451 seq_printf(m, "%s %u", 3661 seq_printf(m, "%s %u",
3452 mod->name, mod->init_size + mod->core_size); 3662 mod->name, mod->init_size + mod->core_size);
3453 print_unload_info(m, mod); 3663 print_unload_info(m, mod);
@@ -3508,6 +3718,8 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
3508 3718
3509 preempt_disable(); 3719 preempt_disable();
3510 list_for_each_entry_rcu(mod, &modules, list) { 3720 list_for_each_entry_rcu(mod, &modules, list) {
3721 if (mod->state == MODULE_STATE_UNFORMED)
3722 continue;
3511 if (mod->num_exentries == 0) 3723 if (mod->num_exentries == 0)
3512 continue; 3724 continue;
3513 3725
@@ -3556,10 +3768,13 @@ struct module *__module_address(unsigned long addr)
3556 if (addr < module_addr_min || addr > module_addr_max) 3768 if (addr < module_addr_min || addr > module_addr_max)
3557 return NULL; 3769 return NULL;
3558 3770
3559 list_for_each_entry_rcu(mod, &modules, list) 3771 list_for_each_entry_rcu(mod, &modules, list) {
3772 if (mod->state == MODULE_STATE_UNFORMED)
3773 continue;
3560 if (within_module_core(addr, mod) 3774 if (within_module_core(addr, mod)
3561 || within_module_init(addr, mod)) 3775 || within_module_init(addr, mod))
3562 return mod; 3776 return mod;
3777 }
3563 return NULL; 3778 return NULL;
3564} 3779}
3565EXPORT_SYMBOL_GPL(__module_address); 3780EXPORT_SYMBOL_GPL(__module_address);
@@ -3612,8 +3827,11 @@ void print_modules(void)
3612 printk(KERN_DEFAULT "Modules linked in:"); 3827 printk(KERN_DEFAULT "Modules linked in:");
3613 /* Most callers should already have preempt disabled, but make sure */ 3828 /* Most callers should already have preempt disabled, but make sure */
3614 preempt_disable(); 3829 preempt_disable();
3615 list_for_each_entry_rcu(mod, &modules, list) 3830 list_for_each_entry_rcu(mod, &modules, list) {
3831 if (mod->state == MODULE_STATE_UNFORMED)
3832 continue;
3616 printk(" %s%s", mod->name, module_flags(mod, buf)); 3833 printk(" %s%s", mod->name, module_flags(mod, buf));
3834 }
3617 preempt_enable(); 3835 preempt_enable();
3618 if (last_unloaded_module[0]) 3836 if (last_unloaded_module[0])
3619 printk(" [last unloaded: %s]", last_unloaded_module); 3837 printk(" [last unloaded: %s]", last_unloaded_module);
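The reworked loader above is what backs the new finit_module(2) system call: it takes an already-open file descriptor, a parameter string and MODULE_INIT_* flags instead of a copied-in image. A minimal userspace sketch of calling it; the module path is hypothetical, and flags stay 0 so the usual vermagic and modversion checks still apply:

/* Hedged sketch: load a module through finit_module(2).
 * Needs CAP_SYS_MODULE, a 3.8+ kernel and headers that define
 * __NR_finit_module; MODULE_INIT_IGNORE_VERMAGIC could be passed
 * in the last argument to relax the vermagic check. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : "example.ko";   /* hypothetical */
        int fd = open(path, O_RDONLY | O_CLOEXEC);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Empty string = no module parameters, 0 = no MODULE_INIT_* flags. */
        if (syscall(__NR_finit_module, fd, "", 0) != 0) {
                perror("finit_module");
                close(fd);
                return 1;
        }
        close(fd);
        return 0;
}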
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index ea1b1df5dbb0..f2970bddc5ea 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -27,13 +27,13 @@
27 * - Information block 27 * - Information block
28 */ 28 */
29struct module_signature { 29struct module_signature {
30 enum pkey_algo algo : 8; /* Public-key crypto algorithm */ 30 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
31 enum pkey_hash_algo hash : 8; /* Digest algorithm */ 31 u8 hash; /* Digest algorithm [enum pkey_hash_algo] */
32 enum pkey_id_type id_type : 8; /* Key identifier type */ 32 u8 id_type; /* Key identifier type [enum pkey_id_type] */
33 u8 signer_len; /* Length of signer's name */ 33 u8 signer_len; /* Length of signer's name */
34 u8 key_id_len; /* Length of key identifier */ 34 u8 key_id_len; /* Length of key identifier */
35 u8 __pad[3]; 35 u8 __pad[3];
36 __be32 sig_len; /* Length of signature data */ 36 __be32 sig_len; /* Length of signature data */
37}; 37};
38 38
39/* 39/*
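Replacing the 8-bit enum bitfields with plain u8 pins down the layout of the signature trailer appended to signed modules, which must not depend on how a particular compiler represents enum bitfields. A compile-checked illustration of the idea, using made-up names rather than the kernel's own struct:

/* Illustrative only: fixed-width fields give a guaranteed 12-byte layout. */
#include <stdint.h>
#include <stdio.h>

struct sig_trailer_example {
        uint8_t  algo;          /* public-key algorithm id */
        uint8_t  hash;          /* digest algorithm id */
        uint8_t  id_type;       /* key identifier type */
        uint8_t  signer_len;    /* length of signer's name */
        uint8_t  key_id_len;    /* length of key identifier */
        uint8_t  pad[3];
        uint32_t sig_len;       /* signature length, big-endian on disk */
};

_Static_assert(sizeof(struct sig_trailer_example) == 12,
               "trailer must stay 12 bytes regardless of compiler");

int main(void)
{
        printf("trailer size: %zu bytes\n", sizeof(struct sig_trailer_example));
        return 0;
}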
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a307cc9c9526..52f23011b6e0 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,6 +19,7 @@
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/sched/rt.h>
22#include <linux/export.h> 23#include <linux/export.h>
23#include <linux/spinlock.h> 24#include <linux/spinlock.h>
24#include <linux/interrupt.h> 25#include <linux/interrupt.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index b576f7f14bc6..afc0456f227a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -57,7 +57,8 @@ static inline struct nsproxy *create_nsproxy(void)
57 * leave it to the caller to do proper locking and attach it to task. 57 * leave it to the caller to do proper locking and attach it to task.
58 */ 58 */
59static struct nsproxy *create_new_namespaces(unsigned long flags, 59static struct nsproxy *create_new_namespaces(unsigned long flags,
60 struct task_struct *tsk, struct fs_struct *new_fs) 60 struct task_struct *tsk, struct user_namespace *user_ns,
61 struct fs_struct *new_fs)
61{ 62{
62 struct nsproxy *new_nsp; 63 struct nsproxy *new_nsp;
63 int err; 64 int err;
@@ -66,31 +67,31 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
66 if (!new_nsp) 67 if (!new_nsp)
67 return ERR_PTR(-ENOMEM); 68 return ERR_PTR(-ENOMEM);
68 69
69 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); 70 new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
70 if (IS_ERR(new_nsp->mnt_ns)) { 71 if (IS_ERR(new_nsp->mnt_ns)) {
71 err = PTR_ERR(new_nsp->mnt_ns); 72 err = PTR_ERR(new_nsp->mnt_ns);
72 goto out_ns; 73 goto out_ns;
73 } 74 }
74 75
75 new_nsp->uts_ns = copy_utsname(flags, tsk); 76 new_nsp->uts_ns = copy_utsname(flags, user_ns, tsk->nsproxy->uts_ns);
76 if (IS_ERR(new_nsp->uts_ns)) { 77 if (IS_ERR(new_nsp->uts_ns)) {
77 err = PTR_ERR(new_nsp->uts_ns); 78 err = PTR_ERR(new_nsp->uts_ns);
78 goto out_uts; 79 goto out_uts;
79 } 80 }
80 81
81 new_nsp->ipc_ns = copy_ipcs(flags, tsk); 82 new_nsp->ipc_ns = copy_ipcs(flags, user_ns, tsk->nsproxy->ipc_ns);
82 if (IS_ERR(new_nsp->ipc_ns)) { 83 if (IS_ERR(new_nsp->ipc_ns)) {
83 err = PTR_ERR(new_nsp->ipc_ns); 84 err = PTR_ERR(new_nsp->ipc_ns);
84 goto out_ipc; 85 goto out_ipc;
85 } 86 }
86 87
87 new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); 88 new_nsp->pid_ns = copy_pid_ns(flags, user_ns, tsk->nsproxy->pid_ns);
88 if (IS_ERR(new_nsp->pid_ns)) { 89 if (IS_ERR(new_nsp->pid_ns)) {
89 err = PTR_ERR(new_nsp->pid_ns); 90 err = PTR_ERR(new_nsp->pid_ns);
90 goto out_pid; 91 goto out_pid;
91 } 92 }
92 93
93 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); 94 new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
94 if (IS_ERR(new_nsp->net_ns)) { 95 if (IS_ERR(new_nsp->net_ns)) {
95 err = PTR_ERR(new_nsp->net_ns); 96 err = PTR_ERR(new_nsp->net_ns);
96 goto out_net; 97 goto out_net;
@@ -122,6 +123,7 @@ out_ns:
122int copy_namespaces(unsigned long flags, struct task_struct *tsk) 123int copy_namespaces(unsigned long flags, struct task_struct *tsk)
123{ 124{
124 struct nsproxy *old_ns = tsk->nsproxy; 125 struct nsproxy *old_ns = tsk->nsproxy;
126 struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
125 struct nsproxy *new_ns; 127 struct nsproxy *new_ns;
126 int err = 0; 128 int err = 0;
127 129
@@ -134,7 +136,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
134 CLONE_NEWPID | CLONE_NEWNET))) 136 CLONE_NEWPID | CLONE_NEWNET)))
135 return 0; 137 return 0;
136 138
137 if (!capable(CAP_SYS_ADMIN)) { 139 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
138 err = -EPERM; 140 err = -EPERM;
139 goto out; 141 goto out;
140 } 142 }
@@ -151,7 +153,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
151 goto out; 153 goto out;
152 } 154 }
153 155
154 new_ns = create_new_namespaces(flags, tsk, tsk->fs); 156 new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
155 if (IS_ERR(new_ns)) { 157 if (IS_ERR(new_ns)) {
156 err = PTR_ERR(new_ns); 158 err = PTR_ERR(new_ns);
157 goto out; 159 goto out;
@@ -183,19 +185,21 @@ void free_nsproxy(struct nsproxy *ns)
183 * On success, returns the new nsproxy. 185 * On success, returns the new nsproxy.
184 */ 186 */
185int unshare_nsproxy_namespaces(unsigned long unshare_flags, 187int unshare_nsproxy_namespaces(unsigned long unshare_flags,
186 struct nsproxy **new_nsp, struct fs_struct *new_fs) 188 struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
187{ 189{
190 struct user_namespace *user_ns;
188 int err = 0; 191 int err = 0;
189 192
190 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 193 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
191 CLONE_NEWNET))) 194 CLONE_NEWNET | CLONE_NEWPID)))
192 return 0; 195 return 0;
193 196
194 if (!capable(CAP_SYS_ADMIN)) 197 user_ns = new_cred ? new_cred->user_ns : current_user_ns();
198 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
195 return -EPERM; 199 return -EPERM;
196 200
197 *new_nsp = create_new_namespaces(unshare_flags, current, 201 *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
198 new_fs ? new_fs : current->fs); 202 new_fs ? new_fs : current->fs);
199 if (IS_ERR(*new_nsp)) { 203 if (IS_ERR(*new_nsp)) {
200 err = PTR_ERR(*new_nsp); 204 err = PTR_ERR(*new_nsp);
201 goto out; 205 goto out;
@@ -241,20 +245,17 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
241 struct file *file; 245 struct file *file;
242 int err; 246 int err;
243 247
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd); 248 file = proc_ns_fget(fd);
248 if (IS_ERR(file)) 249 if (IS_ERR(file))
249 return PTR_ERR(file); 250 return PTR_ERR(file);
250 251
251 err = -EINVAL; 252 err = -EINVAL;
252 ei = PROC_I(file->f_dentry->d_inode); 253 ei = PROC_I(file_inode(file));
253 ops = ei->ns_ops; 254 ops = ei->ns_ops;
254 if (nstype && (ops->type != nstype)) 255 if (nstype && (ops->type != nstype))
255 goto out; 256 goto out;
256 257
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); 258 new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
258 if (IS_ERR(new_nsproxy)) { 259 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy); 260 err = PTR_ERR(new_nsproxy);
260 goto out; 261 goto out;
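With CLONE_NEWPID now accepted by unshare_nsproxy_namespaces() and the capability checks done against the owning user namespace, a new pid namespace can be split off from userspace. A hedged sketch; it needs CAP_SYS_ADMIN, and the caller itself stays in its old pid namespace while the first child forked afterwards becomes pid 1 of the new one:

/* Hedged sketch: unshare the pid namespace, then fork the new init. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        if (unshare(CLONE_NEWPID) != 0) {
                perror("unshare");
                return 1;
        }

        pid_t child = fork();
        if (child == 0) {
                /* Inside the new namespace this prints 1. */
                printf("child sees itself as pid %d\n", getpid());
                _exit(0);
        }
        printf("parent sees the child as pid %d\n", child);
        waitpid(child, NULL, 0);
        return 0;
}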
diff --git a/kernel/padata.c b/kernel/padata.c
index 89fe3d1b9efb..072f4ee4eb89 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -171,7 +171,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
171{ 171{
172 int cpu, num_cpus; 172 int cpu, num_cpus;
173 unsigned int next_nr, next_index; 173 unsigned int next_nr, next_index;
174 struct padata_parallel_queue *queue, *next_queue; 174 struct padata_parallel_queue *next_queue;
175 struct padata_priv *padata; 175 struct padata_priv *padata;
176 struct padata_list *reorder; 176 struct padata_list *reorder;
177 177
@@ -204,8 +204,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
204 goto out; 204 goto out;
205 } 205 }
206 206
207 queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); 207 if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
208 if (queue->cpu_index == next_queue->cpu_index) {
209 padata = ERR_PTR(-ENODATA); 208 padata = ERR_PTR(-ENODATA);
210 goto out; 209 goto out;
211 } 210 }
diff --git a/kernel/panic.c b/kernel/panic.c
index e1b2822fff97..7c57cc9eee2c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -259,26 +259,19 @@ unsigned long get_taint(void)
259 return tainted_mask; 259 return tainted_mask;
260} 260}
261 261
262void add_taint(unsigned flag) 262/**
263 * add_taint: add a taint flag if not already set.
264 * @flag: one of the TAINT_* constants.
265 * @lockdep_ok: whether lock debugging is still OK.
266 *
 267 * If something bad has gone wrong, you'll want @lockdep_ok = LOCKDEP_NOW_UNRELIABLE,
 268 * but for some noteworthy-but-not-corrupting cases it can be LOCKDEP_STILL_OK.
269 */
270void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
263{ 271{
264 /* 272 if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off())
265 * Can't trust the integrity of the kernel anymore. 273 printk(KERN_WARNING
266 * We don't call directly debug_locks_off() because the issue 274 "Disabling lock debugging due to kernel taint\n");
267 * is not necessarily serious enough to set oops_in_progress to 1
268 * Also we want to keep up lockdep for staging/out-of-tree
269 * development and post-warning case.
270 */
271 switch (flag) {
272 case TAINT_CRAP:
273 case TAINT_OOT_MODULE:
274 case TAINT_WARN:
275 case TAINT_FIRMWARE_WORKAROUND:
276 break;
277
278 default:
279 if (__debug_locks_off())
280 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
281 }
282 275
283 set_bit(flag, &tainted_mask); 276 set_bit(flag, &tainted_mask);
284} 277}
@@ -421,7 +414,8 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
421 print_modules(); 414 print_modules();
422 dump_stack(); 415 dump_stack();
423 print_oops_end_marker(); 416 print_oops_end_marker();
424 add_taint(taint); 417 /* Just a warning, don't kill lockdep. */
418 add_taint(taint, LOCKDEP_STILL_OK);
425} 419}
426 420
427void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 421void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
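The add_taint() rework replaces the callee-side switch on the taint flag with an explicit lockdep_ok argument, so every call site states whether lock debugging can still be trusted. A standalone, deliberately simplified illustration of that calling convention; the names and flag values are illustrative, not the kernel's:

/* Simplified model of the new convention: callers pass intent explicitly. */
#include <stdbool.h>
#include <stdio.h>

enum lockdep_ok_demo { DEMO_LOCKDEP_STILL_OK, DEMO_LOCKDEP_NOW_UNRELIABLE };

static unsigned long tainted_mask;
static bool lock_debugging = true;

static void add_taint_demo(unsigned int flag, enum lockdep_ok_demo ok)
{
        if (ok == DEMO_LOCKDEP_NOW_UNRELIABLE && lock_debugging) {
                lock_debugging = false;
                printf("Disabling lock debugging due to kernel taint\n");
        }
        tainted_mask |= 1UL << flag;
}

int main(void)
{
        add_taint_demo(9, DEMO_LOCKDEP_STILL_OK);          /* a WARN-style taint */
        add_taint_demo(0, DEMO_LOCKDEP_NOW_UNRELIABLE);    /* a proprietary-module-style taint */
        printf("taint mask %#lx, lock debugging %s\n",
               tainted_mask, lock_debugging ? "on" : "off");
        return 0;
}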
diff --git a/kernel/pid.c b/kernel/pid.c
index aebd4f5aaf41..047dc6264638 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * Generic pidhash and scalable, time-bounded PID allocator 2 * Generic pidhash and scalable, time-bounded PID allocator
3 * 3 *
4 * (C) 2002-2003 William Irwin, IBM 4 * (C) 2002-2003 Nadia Yvette Chambers, IBM
5 * (C) 2004 William Irwin, Oracle 5 * (C) 2004 Nadia Yvette Chambers, Oracle
6 * (C) 2002-2004 Ingo Molnar, Red Hat 6 * (C) 2002-2004 Ingo Molnar, Red Hat
7 * 7 *
8 * pid-structures are backing objects for tasks sharing a given ID to chain 8 * pid-structures are backing objects for tasks sharing a given ID to chain
@@ -36,6 +36,7 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_fs.h>
39 40
40#define pid_hashfn(nr, ns) \ 41#define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 42 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -78,24 +79,11 @@ struct pid_namespace init_pid_ns = {
78 .last_pid = 0, 79 .last_pid = 0,
79 .level = 0, 80 .level = 0,
80 .child_reaper = &init_task, 81 .child_reaper = &init_task,
82 .user_ns = &init_user_ns,
83 .proc_inum = PROC_PID_INIT_INO,
81}; 84};
82EXPORT_SYMBOL_GPL(init_pid_ns); 85EXPORT_SYMBOL_GPL(init_pid_ns);
83 86
84int is_container_init(struct task_struct *tsk)
85{
86 int ret = 0;
87 struct pid *pid;
88
89 rcu_read_lock();
90 pid = task_pid(tsk);
91 if (pid != NULL && pid->numbers[pid->level].nr == 1)
92 ret = 1;
93 rcu_read_unlock();
94
95 return ret;
96}
97EXPORT_SYMBOL(is_container_init);
98
99/* 87/*
100 * Note: disable interrupts while the pidmap_lock is held as an 88 * Note: disable interrupts while the pidmap_lock is held as an
101 * interrupt might come in and do read_lock(&tasklist_lock). 89 * interrupt might come in and do read_lock(&tasklist_lock).
@@ -269,8 +257,23 @@ void free_pid(struct pid *pid)
269 unsigned long flags; 257 unsigned long flags;
270 258
271 spin_lock_irqsave(&pidmap_lock, flags); 259 spin_lock_irqsave(&pidmap_lock, flags);
272 for (i = 0; i <= pid->level; i++) 260 for (i = 0; i <= pid->level; i++) {
273 hlist_del_rcu(&pid->numbers[i].pid_chain); 261 struct upid *upid = pid->numbers + i;
262 struct pid_namespace *ns = upid->ns;
263 hlist_del_rcu(&upid->pid_chain);
264 switch(--ns->nr_hashed) {
265 case 1:
266 /* When all that is left in the pid namespace
267 * is the reaper wake up the reaper. The reaper
268 * may be sleeping in zap_pid_ns_processes().
269 */
270 wake_up_process(ns->child_reaper);
271 break;
272 case 0:
273 schedule_work(&ns->proc_work);
274 break;
275 }
276 }
274 spin_unlock_irqrestore(&pidmap_lock, flags); 277 spin_unlock_irqrestore(&pidmap_lock, flags);
275 278
276 for (i = 0; i <= pid->level; i++) 279 for (i = 0; i <= pid->level; i++)
@@ -292,6 +295,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
292 goto out; 295 goto out;
293 296
294 tmp = ns; 297 tmp = ns;
298 pid->level = ns->level;
295 for (i = ns->level; i >= 0; i--) { 299 for (i = ns->level; i >= 0; i--) {
296 nr = alloc_pidmap(tmp); 300 nr = alloc_pidmap(tmp);
297 if (nr < 0) 301 if (nr < 0)
@@ -302,22 +306,32 @@ struct pid *alloc_pid(struct pid_namespace *ns)
302 tmp = tmp->parent; 306 tmp = tmp->parent;
303 } 307 }
304 308
309 if (unlikely(is_child_reaper(pid))) {
310 if (pid_ns_prepare_proc(ns))
311 goto out_free;
312 }
313
305 get_pid_ns(ns); 314 get_pid_ns(ns);
306 pid->level = ns->level;
307 atomic_set(&pid->count, 1); 315 atomic_set(&pid->count, 1);
308 for (type = 0; type < PIDTYPE_MAX; ++type) 316 for (type = 0; type < PIDTYPE_MAX; ++type)
309 INIT_HLIST_HEAD(&pid->tasks[type]); 317 INIT_HLIST_HEAD(&pid->tasks[type]);
310 318
311 upid = pid->numbers + ns->level; 319 upid = pid->numbers + ns->level;
312 spin_lock_irq(&pidmap_lock); 320 spin_lock_irq(&pidmap_lock);
313 for ( ; upid >= pid->numbers; --upid) 321 if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
322 goto out_unlock;
323 for ( ; upid >= pid->numbers; --upid) {
314 hlist_add_head_rcu(&upid->pid_chain, 324 hlist_add_head_rcu(&upid->pid_chain,
315 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 325 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
326 upid->ns->nr_hashed++;
327 }
316 spin_unlock_irq(&pidmap_lock); 328 spin_unlock_irq(&pidmap_lock);
317 329
318out: 330out:
319 return pid; 331 return pid;
320 332
333out_unlock:
334 spin_unlock_irq(&pidmap_lock);
321out_free: 335out_free:
322 while (++i <= ns->level) 336 while (++i <= ns->level)
323 free_pidmap(pid->numbers + i); 337 free_pidmap(pid->numbers + i);
@@ -327,12 +341,18 @@ out_free:
327 goto out; 341 goto out;
328} 342}
329 343
344void disable_pid_allocation(struct pid_namespace *ns)
345{
346 spin_lock_irq(&pidmap_lock);
347 ns->nr_hashed &= ~PIDNS_HASH_ADDING;
348 spin_unlock_irq(&pidmap_lock);
349}
350
330struct pid *find_pid_ns(int nr, struct pid_namespace *ns) 351struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
331{ 352{
332 struct hlist_node *elem;
333 struct upid *pnr; 353 struct upid *pnr;
334 354
335 hlist_for_each_entry_rcu(pnr, elem, 355 hlist_for_each_entry_rcu(pnr,
336 &pid_hash[pid_hashfn(nr, ns)], pid_chain) 356 &pid_hash[pid_hashfn(nr, ns)], pid_chain)
337 if (pnr->nr == nr && pnr->ns == ns) 357 if (pnr->nr == nr && pnr->ns == ns)
338 return container_of(pnr, struct pid, 358 return container_of(pnr, struct pid,
@@ -344,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);
344 364
345struct pid *find_vpid(int nr) 365struct pid *find_vpid(int nr)
346{ 366{
347 return find_pid_ns(nr, current->nsproxy->pid_ns); 367 return find_pid_ns(nr, task_active_pid_ns(current));
348} 368}
349EXPORT_SYMBOL_GPL(find_vpid); 369EXPORT_SYMBOL_GPL(find_vpid);
350 370
@@ -428,7 +448,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
428 448
429struct task_struct *find_task_by_vpid(pid_t vnr) 449struct task_struct *find_task_by_vpid(pid_t vnr)
430{ 450{
431 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); 451 return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
432} 452}
433 453
434struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 454struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -483,7 +503,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns);
483 503
484pid_t pid_vnr(struct pid *pid) 504pid_t pid_vnr(struct pid *pid)
485{ 505{
486 return pid_nr_ns(pid, current->nsproxy->pid_ns); 506 return pid_nr_ns(pid, task_active_pid_ns(current));
487} 507}
488EXPORT_SYMBOL_GPL(pid_vnr); 508EXPORT_SYMBOL_GPL(pid_vnr);
489 509
@@ -494,7 +514,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
494 514
495 rcu_read_lock(); 515 rcu_read_lock();
496 if (!ns) 516 if (!ns)
497 ns = current->nsproxy->pid_ns; 517 ns = task_active_pid_ns(current);
498 if (likely(pid_alive(task))) { 518 if (likely(pid_alive(task))) {
499 if (type != PIDTYPE_PID) 519 if (type != PIDTYPE_PID)
500 task = task->group_leader; 520 task = task->group_leader;
@@ -558,6 +578,9 @@ void __init pidhash_init(void)
558 578
559void __init pidmap_init(void) 579void __init pidmap_init(void)
560{ 580{
 581 /* Verify that no one has done anything silly */
582 BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
583
561 /* bump default and minimum pid_max based on number of cpus */ 584 /* bump default and minimum pid_max based on number of cpus */
562 pid_max = min(pid_max_max, max_t(int, pid_max, 585 pid_max = min(pid_max_max, max_t(int, pid_max,
563 PIDS_PER_CPU_DEFAULT * num_possible_cpus())); 586 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
@@ -569,6 +592,7 @@ void __init pidmap_init(void)
569 /* Reserve PID 0. We never call free_pidmap(0) */ 592 /* Reserve PID 0. We never call free_pidmap(0) */
570 set_bit(0, init_pid_ns.pidmap[0].page); 593 set_bit(0, init_pid_ns.pidmap[0].page);
571 atomic_dec(&init_pid_ns.pidmap[0].nr_free); 594 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
595 init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;
572 596
573 init_pid_ns.pid_cachep = KMEM_CACHE(pid, 597 init_pid_ns.pid_cachep = KMEM_CACHE(pid,
574 SLAB_HWCACHE_ALIGN | SLAB_PANIC); 598 SLAB_HWCACHE_ALIGN | SLAB_PANIC);
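Here nr_hashed does double duty: PIDNS_HASH_ADDING is a high bit folded into the count of hashed pids, and the BUILD_BUG_ON above guarantees the real count can never collide with it, so a single field tested under pidmap_lock answers both "how many pids remain" and "may new pids still be added". A simplified, single-threaded sketch of the trick; names and values are illustrative:

/* Illustrative only: a flag bit embedded in a counter. */
#include <assert.h>
#include <stdio.h>

#define DEMO_ADDING    (1U << 31)   /* analogue of PIDNS_HASH_ADDING */
#define DEMO_MAX_COUNT (1U << 22)   /* analogue of PID_MAX_LIMIT */

_Static_assert(DEMO_MAX_COUNT < DEMO_ADDING, "count must never reach the flag bit");

static unsigned int nr_hashed = DEMO_ADDING;

static int add_one(void)
{
        if (!(nr_hashed & DEMO_ADDING))
                return -1;              /* namespace is shutting down */
        nr_hashed++;
        return 0;
}

static void disable_adding(void)
{
        nr_hashed &= ~DEMO_ADDING;      /* like disable_pid_allocation() */
}

int main(void)
{
        assert(add_one() == 0);
        disable_adding();
        assert(add_one() == -1);
        printf("hashed entries remaining: %u\n", nr_hashed);
        return 0;
}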
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7b07cc0dfb75..bea15bdf82b0 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/pid.h> 11#include <linux/pid.h>
12#include <linux/pid_namespace.h> 12#include <linux/pid_namespace.h>
13#include <linux/user_namespace.h>
13#include <linux/syscalls.h> 14#include <linux/syscalls.h>
14#include <linux/err.h> 15#include <linux/err.h>
15#include <linux/acct.h> 16#include <linux/acct.h>
@@ -71,10 +72,17 @@ err_alloc:
71 return NULL; 72 return NULL;
72} 73}
73 74
75static void proc_cleanup_work(struct work_struct *work)
76{
77 struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
78 pid_ns_release_proc(ns);
79}
80
74/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ 81/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
75#define MAX_PID_NS_LEVEL 32 82#define MAX_PID_NS_LEVEL 32
76 83
77static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) 84static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
85 struct pid_namespace *parent_pid_ns)
78{ 86{
79 struct pid_namespace *ns; 87 struct pid_namespace *ns;
80 unsigned int level = parent_pid_ns->level + 1; 88 unsigned int level = parent_pid_ns->level + 1;
@@ -99,9 +107,16 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
99 if (ns->pid_cachep == NULL) 107 if (ns->pid_cachep == NULL)
100 goto out_free_map; 108 goto out_free_map;
101 109
110 err = proc_alloc_inum(&ns->proc_inum);
111 if (err)
112 goto out_free_map;
113
102 kref_init(&ns->kref); 114 kref_init(&ns->kref);
103 ns->level = level; 115 ns->level = level;
104 ns->parent = get_pid_ns(parent_pid_ns); 116 ns->parent = get_pid_ns(parent_pid_ns);
117 ns->user_ns = get_user_ns(user_ns);
118 ns->nr_hashed = PIDNS_HASH_ADDING;
119 INIT_WORK(&ns->proc_work, proc_cleanup_work);
105 120
106 set_bit(0, ns->pidmap[0].page); 121 set_bit(0, ns->pidmap[0].page);
107 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 122 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -109,14 +124,8 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
109 for (i = 1; i < PIDMAP_ENTRIES; i++) 124 for (i = 1; i < PIDMAP_ENTRIES; i++)
110 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 125 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
111 126
112 err = pid_ns_prepare_proc(ns);
113 if (err)
114 goto out_put_parent_pid_ns;
115
116 return ns; 127 return ns;
117 128
118out_put_parent_pid_ns:
119 put_pid_ns(parent_pid_ns);
120out_free_map: 129out_free_map:
121 kfree(ns->pidmap[0].page); 130 kfree(ns->pidmap[0].page);
122out_free: 131out_free:
@@ -129,18 +138,21 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
129{ 138{
130 int i; 139 int i;
131 140
141 proc_free_inum(ns->proc_inum);
132 for (i = 0; i < PIDMAP_ENTRIES; i++) 142 for (i = 0; i < PIDMAP_ENTRIES; i++)
133 kfree(ns->pidmap[i].page); 143 kfree(ns->pidmap[i].page);
144 put_user_ns(ns->user_ns);
134 kmem_cache_free(pid_ns_cachep, ns); 145 kmem_cache_free(pid_ns_cachep, ns);
135} 146}
136 147
137struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 148struct pid_namespace *copy_pid_ns(unsigned long flags,
149 struct user_namespace *user_ns, struct pid_namespace *old_ns)
138{ 150{
139 if (!(flags & CLONE_NEWPID)) 151 if (!(flags & CLONE_NEWPID))
140 return get_pid_ns(old_ns); 152 return get_pid_ns(old_ns);
141 if (flags & (CLONE_THREAD|CLONE_PARENT)) 153 if (task_active_pid_ns(current) != old_ns)
142 return ERR_PTR(-EINVAL); 154 return ERR_PTR(-EINVAL);
143 return create_pid_namespace(old_ns); 155 return create_pid_namespace(user_ns, old_ns);
144} 156}
145 157
146static void free_pid_ns(struct kref *kref) 158static void free_pid_ns(struct kref *kref)
@@ -169,6 +181,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
169 int nr; 181 int nr;
170 int rc; 182 int rc;
171 struct task_struct *task, *me = current; 183 struct task_struct *task, *me = current;
184 int init_pids = thread_group_leader(me) ? 1 : 2;
185
186 /* Don't allow any more processes into the pid namespace */
187 disable_pid_allocation(pid_ns);
172 188
173 /* Ignore SIGCHLD causing any terminated children to autoreap */ 189 /* Ignore SIGCHLD causing any terminated children to autoreap */
174 spin_lock_irq(&me->sighand->siglock); 190 spin_lock_irq(&me->sighand->siglock);
@@ -211,22 +227,15 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
211 227
212 /* 228 /*
213 * sys_wait4() above can't reap the TASK_DEAD children. 229 * sys_wait4() above can't reap the TASK_DEAD children.
214 * Make sure they all go away, see __unhash_process(). 230 * Make sure they all go away, see free_pid().
215 */ 231 */
216 for (;;) { 232 for (;;) {
217 bool need_wait = false; 233 set_current_state(TASK_UNINTERRUPTIBLE);
218 234 if (pid_ns->nr_hashed == init_pids)
219 read_lock(&tasklist_lock);
220 if (!list_empty(&current->children)) {
221 __set_current_state(TASK_UNINTERRUPTIBLE);
222 need_wait = true;
223 }
224 read_unlock(&tasklist_lock);
225
226 if (!need_wait)
227 break; 235 break;
228 schedule(); 236 schedule();
229 } 237 }
238 __set_current_state(TASK_RUNNING);
230 239
231 if (pid_ns->reboot) 240 if (pid_ns->reboot)
232 current->signal->group_exit_code = pid_ns->reboot; 241 current->signal->group_exit_code = pid_ns->reboot;
@@ -239,9 +248,10 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
239static int pid_ns_ctl_handler(struct ctl_table *table, int write, 248static int pid_ns_ctl_handler(struct ctl_table *table, int write,
240 void __user *buffer, size_t *lenp, loff_t *ppos) 249 void __user *buffer, size_t *lenp, loff_t *ppos)
241{ 250{
251 struct pid_namespace *pid_ns = task_active_pid_ns(current);
242 struct ctl_table tmp = *table; 252 struct ctl_table tmp = *table;
243 253
244 if (write && !capable(CAP_SYS_ADMIN)) 254 if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
245 return -EPERM; 255 return -EPERM;
246 256
247 /* 257 /*
@@ -250,7 +260,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
250 * it should synchronize its usage with external means. 260 * it should synchronize its usage with external means.
251 */ 261 */
252 262
253 tmp.data = &current->nsproxy->pid_ns->last_pid; 263 tmp.data = &pid_ns->last_pid;
254 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); 264 return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
255} 265}
256 266
@@ -299,6 +309,68 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
299 return 0; 309 return 0;
300} 310}
301 311
312static void *pidns_get(struct task_struct *task)
313{
314 struct pid_namespace *ns;
315
316 rcu_read_lock();
317 ns = get_pid_ns(task_active_pid_ns(task));
318 rcu_read_unlock();
319
320 return ns;
321}
322
323static void pidns_put(void *ns)
324{
325 put_pid_ns(ns);
326}
327
328static int pidns_install(struct nsproxy *nsproxy, void *ns)
329{
330 struct pid_namespace *active = task_active_pid_ns(current);
331 struct pid_namespace *ancestor, *new = ns;
332
333 if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
334 !nsown_capable(CAP_SYS_ADMIN))
335 return -EPERM;
336
337 /*
338 * Only allow entering the current active pid namespace
339 * or a child of the current active pid namespace.
340 *
341 * This is required for fork to return a usable pid value and
342 * this maintains the property that processes and their
343 * children can not escape their current pid namespace.
344 */
345 if (new->level < active->level)
346 return -EINVAL;
347
348 ancestor = new;
349 while (ancestor->level > active->level)
350 ancestor = ancestor->parent;
351 if (ancestor != active)
352 return -EINVAL;
353
354 put_pid_ns(nsproxy->pid_ns);
355 nsproxy->pid_ns = get_pid_ns(new);
356 return 0;
357}
358
359static unsigned int pidns_inum(void *ns)
360{
361 struct pid_namespace *pid_ns = ns;
362 return pid_ns->proc_inum;
363}
364
365const struct proc_ns_operations pidns_operations = {
366 .name = "pid",
367 .type = CLONE_NEWPID,
368 .get = pidns_get,
369 .put = pidns_put,
370 .install = pidns_install,
371 .inum = pidns_inum,
372};
373
302static __init int pid_namespaces_init(void) 374static __init int pid_namespaces_init(void)
303{ 375{
304 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 376 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
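The new pidns_operations makes /proc/<pid>/ns/pid usable with setns(2). Because pidns_install() only swaps nsproxy->pid_ns, the caller keeps its own pid; only children forked afterwards are created inside the target namespace. A hedged userspace sketch; the target pid 4321 is hypothetical and both user namespaces must grant CAP_SYS_ADMIN:

/* Hedged sketch: join another task's pid namespace, then fork into it. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/4321/ns/pid", O_RDONLY | O_CLOEXEC);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (setns(fd, CLONE_NEWPID) != 0) {
                perror("setns");
                return 1;
        }
        close(fd);

        pid_t child = fork();
        if (child == 0) {
                /* getpid() reports the pid inside the joined namespace. */
                printf("child pid in the target namespace: %d\n", getpid());
                _exit(0);
        }
        waitpid(child, NULL, 0);
        return 0;
}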
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 125cb67daa21..8fd709c9bb58 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -9,6 +9,7 @@
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12#include <linux/random.h>
12 13
13/* 14/*
14 * Called after updating RLIMIT_CPU to run cpu timer and update 15 * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -154,11 +155,19 @@ static void bump_cpu_timer(struct k_itimer *timer,
154 155
155static inline cputime_t prof_ticks(struct task_struct *p) 156static inline cputime_t prof_ticks(struct task_struct *p)
156{ 157{
157 return p->utime + p->stime; 158 cputime_t utime, stime;
159
160 task_cputime(p, &utime, &stime);
161
162 return utime + stime;
158} 163}
159static inline cputime_t virt_ticks(struct task_struct *p) 164static inline cputime_t virt_ticks(struct task_struct *p)
160{ 165{
161 return p->utime; 166 cputime_t utime;
167
168 task_cputime(p, &utime, NULL);
169
170 return utime;
162} 171}
163 172
164static int 173static int
@@ -217,30 +226,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
217 return 0; 226 return 0;
218} 227}
219 228
220void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
221{
222 struct signal_struct *sig = tsk->signal;
223 struct task_struct *t;
224
225 times->utime = sig->utime;
226 times->stime = sig->stime;
227 times->sum_exec_runtime = sig->sum_sched_runtime;
228
229 rcu_read_lock();
230 /* make sure we can trust tsk->thread_group list */
231 if (!likely(pid_alive(tsk)))
232 goto out;
233
234 t = tsk;
235 do {
236 times->utime += t->utime;
237 times->stime += t->stime;
238 times->sum_exec_runtime += task_sched_runtime(t);
239 } while_each_thread(tsk, t);
240out:
241 rcu_read_unlock();
242}
243
244static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 229static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
245{ 230{
246 if (b->utime > a->utime) 231 if (b->utime > a->utime)
@@ -494,16 +479,23 @@ static void cleanup_timers(struct list_head *head,
494 */ 479 */
495void posix_cpu_timers_exit(struct task_struct *tsk) 480void posix_cpu_timers_exit(struct task_struct *tsk)
496{ 481{
482 cputime_t utime, stime;
483
484 add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
485 sizeof(unsigned long long));
486 task_cputime(tsk, &utime, &stime);
497 cleanup_timers(tsk->cpu_timers, 487 cleanup_timers(tsk->cpu_timers,
498 tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); 488 utime, stime, tsk->se.sum_exec_runtime);
499 489
500} 490}
501void posix_cpu_timers_exit_group(struct task_struct *tsk) 491void posix_cpu_timers_exit_group(struct task_struct *tsk)
502{ 492{
503 struct signal_struct *const sig = tsk->signal; 493 struct signal_struct *const sig = tsk->signal;
494 cputime_t utime, stime;
504 495
496 task_cputime(tsk, &utime, &stime);
505 cleanup_timers(tsk->signal->cpu_timers, 497 cleanup_timers(tsk->signal->cpu_timers,
506 tsk->utime + sig->utime, tsk->stime + sig->stime, 498 utime + sig->utime, stime + sig->stime,
507 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 499 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
508} 500}
509 501
@@ -1247,11 +1239,14 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
1247static inline int fastpath_timer_check(struct task_struct *tsk) 1239static inline int fastpath_timer_check(struct task_struct *tsk)
1248{ 1240{
1249 struct signal_struct *sig; 1241 struct signal_struct *sig;
1242 cputime_t utime, stime;
1243
1244 task_cputime(tsk, &utime, &stime);
1250 1245
1251 if (!task_cputime_zero(&tsk->cputime_expires)) { 1246 if (!task_cputime_zero(&tsk->cputime_expires)) {
1252 struct task_cputime task_sample = { 1247 struct task_cputime task_sample = {
1253 .utime = tsk->utime, 1248 .utime = utime,
1254 .stime = tsk->stime, 1249 .stime = stime,
1255 .sum_exec_runtime = tsk->se.sum_exec_runtime 1250 .sum_exec_runtime = tsk->se.sum_exec_runtime
1256 }; 1251 };
1257 1252
@@ -1422,8 +1417,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1422 while (!signal_pending(current)) { 1417 while (!signal_pending(current)) {
1423 if (timer.it.cpu.expires.sched == 0) { 1418 if (timer.it.cpu.expires.sched == 0) {
1424 /* 1419 /*
 1425 * Our timer fired and was reset. 1420 * Our timer fired and was reset; the
 1421 * deletion below cannot fail.
1426 */ 1422 */
1423 posix_cpu_timer_del(&timer);
1427 spin_unlock_irq(&timer.it_lock); 1424 spin_unlock_irq(&timer.it_lock);
1428 return 0; 1425 return 0;
1429 } 1426 }
@@ -1441,9 +1438,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1441 * We were interrupted by a signal. 1438 * We were interrupted by a signal.
1442 */ 1439 */
1443 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); 1440 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
1444 posix_cpu_timer_set(&timer, 0, &zero_it, it); 1441 error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
1442 if (!error) {
1443 /*
1444 * Timer is now unarmed, deletion can not fail.
1445 */
1446 posix_cpu_timer_del(&timer);
1447 }
1445 spin_unlock_irq(&timer.it_lock); 1448 spin_unlock_irq(&timer.it_lock);
1446 1449
1450 while (error == TIMER_RETRY) {
1451 /*
1452 * We need to handle case when timer was or is in the
1453 * middle of firing. In other cases we already freed
1454 * resources.
1455 */
1456 spin_lock_irq(&timer.it_lock);
1457 error = posix_cpu_timer_del(&timer);
1458 spin_unlock_irq(&timer.it_lock);
1459 }
1460
1447 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { 1461 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
1448 /* 1462 /*
1449 * It actually did fire already. 1463 * It actually did fire already.
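
[Editor's sketch] The do_cpu_nanosleep() hunks above add an explicit cleanup path: posix_cpu_timer_set() can now report TIMER_RETRY when the timer is caught in the middle of firing, and the caller keeps retrying posix_cpu_timer_del() under the timer lock until the deletion sticks. Below is a minimal userspace stand-in for that retry shape; fake_timer_del() and its "busy for two attempts" behaviour are illustrative, not kernel code.

#include <stdio.h>

#define TIMER_RETRY 1			/* stand-in for the kernel's TIMER_RETRY */

/* Pretend the timer is mid-firing for the first two attempts. */
static int fake_timer_del(int *firing)
{
	if (*firing > 0) {
		(*firing)--;
		return TIMER_RETRY;	/* caller must try again */
	}
	return 0;			/* timer gone, resources freed */
}

int main(void)
{
	int firing = 2;
	int error = fake_timer_del(&firing);

	/*
	 * Same shape as the new loop in do_cpu_nanosleep(): in the kernel
	 * each pass re-takes timer.it_lock around posix_cpu_timer_del().
	 */
	while (error == TIMER_RETRY)
		error = fake_timer_del(&firing);

	printf("deleted, error=%d\n", error);
	return 0;
}
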
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 69185ae6b701..6edbb2c55c22 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -552,24 +552,22 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
552 return -EAGAIN; 552 return -EAGAIN;
553 553
554 spin_lock_init(&new_timer->it_lock); 554 spin_lock_init(&new_timer->it_lock);
555 retry: 555
556 if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { 556 idr_preload(GFP_KERNEL);
557 error = -EAGAIN;
558 goto out;
559 }
560 spin_lock_irq(&idr_lock); 557 spin_lock_irq(&idr_lock);
561 error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); 558 error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT);
562 spin_unlock_irq(&idr_lock); 559 spin_unlock_irq(&idr_lock);
563 if (error) { 560 idr_preload_end();
564 if (error == -EAGAIN) 561 if (error < 0) {
565 goto retry;
566 /* 562 /*
567 * Weird looking, but we return EAGAIN if the IDR is 563 * Weird looking, but we return EAGAIN if the IDR is
568 * full (proper POSIX return value for this) 564 * full (proper POSIX return value for this)
569 */ 565 */
570 error = -EAGAIN; 566 if (error == -ENOSPC)
567 error = -EAGAIN;
571 goto out; 568 goto out;
572 } 569 }
570 new_timer_id = error;
573 571
574 it_id_set = IT_ID_SET; 572 it_id_set = IT_ID_SET;
575 new_timer->it_id = (timer_t) new_timer_id; 573 new_timer->it_id = (timer_t) new_timer_id;
@@ -639,6 +637,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
639{ 637{
640 struct k_itimer *timr; 638 struct k_itimer *timr;
641 639
640 /*
641 * timer_t could be any type >= int and we want to make sure any
642 * @timer_id outside positive int range fails lookup.
643 */
644 if ((unsigned long long)timer_id > INT_MAX)
645 return NULL;
646
642 rcu_read_lock(); 647 rcu_read_lock();
643 timr = idr_find(&posix_timers_id, (int)timer_id); 648 timr = idr_find(&posix_timers_id, (int)timer_id);
644 if (timr) { 649 if (timr) {
@@ -997,7 +1002,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
997 1002
998 err = kc->clock_adj(which_clock, &ktx); 1003 err = kc->clock_adj(which_clock, &ktx);
999 1004
1000 if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) 1005 if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx)))
1001 return -EFAULT; 1006 return -EFAULT;
1002 1007
1003 return err; 1008 return err;
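
[Editor's sketch] The timer_create() conversion above replaces the idr_pre_get()/idr_get_new() retry dance with the newer preload API: idr_preload() preallocates with GFP_KERNEL while sleeping is still allowed, the actual idr_alloc() then runs under the spinlock with GFP_NOWAIT, and the new id comes back as the non-negative return value. A condensed sketch of that pattern follows; my_idr, my_lock and my_alloc_id are illustrative names, not part of the patch.

#include <linux/idr.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static DEFINE_IDR(my_idr);
static DEFINE_SPINLOCK(my_lock);

static int my_alloc_id(void *obj)
{
	int id;

	idr_preload(GFP_KERNEL);		/* may sleep: preallocate here */
	spin_lock_irq(&my_lock);
	/* start = 0, end = 0 means "any available non-negative id" */
	id = idr_alloc(&my_idr, obj, 0, 0, GFP_NOWAIT);
	spin_unlock_irq(&my_lock);
	idr_preload_end();

	/*
	 * idr_alloc() returns the id or a negative errno; timer_create()
	 * maps -ENOSPC to -EAGAIN, the POSIX "table full" error.
	 */
	return id == -ENOSPC ? -EAGAIN : id;
}
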
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index ca304046d9e2..c6422ffeda9a 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -66,7 +66,7 @@ static DECLARE_WORK(suspend_work, try_to_suspend);
66 66
67void queue_up_suspend_work(void) 67void queue_up_suspend_work(void)
68{ 68{
69 if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) 69 if (autosleep_state > PM_SUSPEND_ON)
70 queue_work(autosleep_wq, &suspend_work); 70 queue_work(autosleep_wq, &suspend_work);
71} 71}
72 72
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f458238109cc..d77663bfedeb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -59,7 +59,7 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
59{ 59{
60 unsigned long val; 60 unsigned long val;
61 61
62 if (strict_strtoul(buf, 10, &val)) 62 if (kstrtoul(buf, 10, &val))
63 return -EINVAL; 63 return -EINVAL;
64 64
65 if (val > 1) 65 if (val > 1)
@@ -313,7 +313,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
313static suspend_state_t decode_state(const char *buf, size_t n) 313static suspend_state_t decode_state(const char *buf, size_t n)
314{ 314{
315#ifdef CONFIG_SUSPEND 315#ifdef CONFIG_SUSPEND
316 suspend_state_t state = PM_SUSPEND_STANDBY; 316 suspend_state_t state = PM_SUSPEND_MIN;
317 const char * const *s; 317 const char * const *s;
318#endif 318#endif
319 char *p; 319 char *p;
@@ -553,6 +553,30 @@ power_attr(pm_trace_dev_match);
553 553
554#endif /* CONFIG_PM_TRACE */ 554#endif /* CONFIG_PM_TRACE */
555 555
556#ifdef CONFIG_FREEZER
557static ssize_t pm_freeze_timeout_show(struct kobject *kobj,
558 struct kobj_attribute *attr, char *buf)
559{
560 return sprintf(buf, "%u\n", freeze_timeout_msecs);
561}
562
563static ssize_t pm_freeze_timeout_store(struct kobject *kobj,
564 struct kobj_attribute *attr,
565 const char *buf, size_t n)
566{
567 unsigned long val;
568
569 if (kstrtoul(buf, 10, &val))
570 return -EINVAL;
571
572 freeze_timeout_msecs = val;
573 return n;
574}
575
576power_attr(pm_freeze_timeout);
577
578#endif /* CONFIG_FREEZER*/
579
556static struct attribute * g[] = { 580static struct attribute * g[] = {
557 &state_attr.attr, 581 &state_attr.attr,
558#ifdef CONFIG_PM_TRACE 582#ifdef CONFIG_PM_TRACE
@@ -576,6 +600,9 @@ static struct attribute * g[] = {
576 &pm_print_times_attr.attr, 600 &pm_print_times_attr.attr,
577#endif 601#endif
578#endif 602#endif
603#ifdef CONFIG_FREEZER
604 &pm_freeze_timeout_attr.attr,
605#endif
579 NULL, 606 NULL,
580}; 607};
581 608
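
[Editor's note] pm_freeze_timeout_attr is never declared explicitly because power_attr() generates it from the _show/_store pair by token pasting. The helper lives in kernel/power/power.h, not in this diff; the sketch below is reconstructed from memory (field layout and mode may differ slightly) only to show why the naming convention matters for the new attribute.

#include <linux/kobject.h>
#include <linux/stringify.h>

/* Approximate shape of power_attr() from kernel/power/power.h. */
#define power_attr(_name)					\
static struct kobj_attribute _name##_attr = {			\
	.attr	= {						\
		.name = __stringify(_name),			\
		.mode = 0644,					\
	},							\
	.show	= _name##_show,					\
	.store	= _name##_store,				\
}
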
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 87da817f9e13..98088e0e71e8 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -21,7 +21,7 @@
21/* 21/*
22 * Timeout for stopping processes 22 * Timeout for stopping processes
23 */ 23 */
24#define TIMEOUT (20 * HZ) 24unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
25 25
26static int try_to_freeze_tasks(bool user_only) 26static int try_to_freeze_tasks(bool user_only)
27{ 27{
@@ -36,7 +36,7 @@ static int try_to_freeze_tasks(bool user_only)
36 36
37 do_gettimeofday(&start); 37 do_gettimeofday(&start);
38 38
39 end_time = jiffies + TIMEOUT; 39 end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
40 40
41 if (!user_only) 41 if (!user_only)
42 freeze_workqueues_begin(); 42 freeze_workqueues_begin();
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only)
48 if (p == current || !freeze_task(p)) 48 if (p == current || !freeze_task(p))
49 continue; 49 continue;
50 50
51 /* 51 if (!freezer_should_skip(p))
52 * Now that we've done set_freeze_flag, don't
53 * perturb a task in TASK_STOPPED or TASK_TRACED.
54 * It is "frozen enough". If the task does wake
55 * up, it will immediately call try_to_freeze.
56 *
57 * Because freeze_task() goes through p's scheduler lock, it's
58 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
59 * transition can't race with task state testing here.
60 */
61 if (!task_is_stopped_or_traced(p) &&
62 !freezer_should_skip(p))
63 todo++; 52 todo++;
64 } while_each_thread(g, p); 53 } while_each_thread(g, p);
65 read_unlock(&tasklist_lock); 54 read_unlock(&tasklist_lock);
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 846bd42c7ed1..587dddeebf15 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -213,6 +213,69 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
213} 213}
214 214
215/** 215/**
216 * pm_qos_flags_remove_req - Remove device PM QoS flags request.
217 * @pqf: Device PM QoS flags set to remove the request from.
218 * @req: Request to remove from the set.
219 */
220static void pm_qos_flags_remove_req(struct pm_qos_flags *pqf,
221 struct pm_qos_flags_request *req)
222{
223 s32 val = 0;
224
225 list_del(&req->node);
226 list_for_each_entry(req, &pqf->list, node)
227 val |= req->flags;
228
229 pqf->effective_flags = val;
230}
231
232/**
233 * pm_qos_update_flags - Update a set of PM QoS flags.
234 * @pqf: Set of flags to update.
235 * @req: Request to add to the set, to modify, or to remove from the set.
236 * @action: Action to take on the set.
237 * @val: Value of the request to add or modify.
238 *
239 * Update the given set of PM QoS flags and call notifiers if the aggregate
240 * value has changed. Returns 1 if the aggregate constraint value has changed,
241 * 0 otherwise.
242 */
243bool pm_qos_update_flags(struct pm_qos_flags *pqf,
244 struct pm_qos_flags_request *req,
245 enum pm_qos_req_action action, s32 val)
246{
247 unsigned long irqflags;
248 s32 prev_value, curr_value;
249
250 spin_lock_irqsave(&pm_qos_lock, irqflags);
251
252 prev_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
253
254 switch (action) {
255 case PM_QOS_REMOVE_REQ:
256 pm_qos_flags_remove_req(pqf, req);
257 break;
258 case PM_QOS_UPDATE_REQ:
259 pm_qos_flags_remove_req(pqf, req);
260 case PM_QOS_ADD_REQ:
261 req->flags = val;
262 INIT_LIST_HEAD(&req->node);
263 list_add_tail(&req->node, &pqf->list);
264 pqf->effective_flags |= val;
265 break;
266 default:
267 /* no action */
268 ;
269 }
270
271 curr_value = list_empty(&pqf->list) ? 0 : pqf->effective_flags;
272
273 spin_unlock_irqrestore(&pm_qos_lock, irqflags);
274
275 return prev_value != curr_value;
276}
277
278/**
216 * pm_qos_request - returns current system wide qos expectation 279 * pm_qos_request - returns current system wide qos expectation
217 * @pm_qos_class: identification of which qos value is requested 280 * @pm_qos_class: identification of which qos value is requested
218 * 281 *
@@ -296,8 +359,7 @@ void pm_qos_update_request(struct pm_qos_request *req,
296 return; 359 return;
297 } 360 }
298 361
299 if (delayed_work_pending(&req->work)) 362 cancel_delayed_work_sync(&req->work);
300 cancel_delayed_work_sync(&req->work);
301 363
302 if (new_value != req->node.prio) 364 if (new_value != req->node.prio)
303 pm_qos_update_target( 365 pm_qos_update_target(
@@ -323,8 +385,7 @@ void pm_qos_update_request_timeout(struct pm_qos_request *req, s32 new_value,
323 "%s called for unknown object.", __func__)) 385 "%s called for unknown object.", __func__))
324 return; 386 return;
325 387
326 if (delayed_work_pending(&req->work)) 388 cancel_delayed_work_sync(&req->work);
327 cancel_delayed_work_sync(&req->work);
328 389
329 if (new_value != req->node.prio) 390 if (new_value != req->node.prio)
330 pm_qos_update_target( 391 pm_qos_update_target(
@@ -353,8 +414,7 @@ void pm_qos_remove_request(struct pm_qos_request *req)
353 return; 414 return;
354 } 415 }
355 416
356 if (delayed_work_pending(&req->work)) 417 cancel_delayed_work_sync(&req->work);
357 cancel_delayed_work_sync(&req->work);
358 418
359 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, 419 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
360 &req->node, PM_QOS_REMOVE_REQ, 420 &req->node, PM_QOS_REMOVE_REQ,
@@ -500,7 +560,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
500 } else { 560 } else {
501 ascii_value[count] = '\0'; 561 ascii_value[count] = '\0';
502 } 562 }
503 ret = strict_strtoul(ascii_value, 16, &ulval); 563 ret = kstrtoul(ascii_value, 16, &ulval);
504 if (ret) { 564 if (ret) {
505 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); 565 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
506 return -EINVAL; 566 return -EINVAL;
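
[Editor's sketch] pm_qos_update_flags() above keeps effective_flags equal to the bitwise OR of every outstanding request, which is why removal re-walks the whole list instead of clearing bits: another request may still demand the same flag. A small userspace stand-in, where the array stands in for the pqf->list of requests.

#include <stdio.h>
#include <stdint.h>

/* OR together every remaining request, as pm_qos_flags_remove_req() does. */
static uint32_t effective_flags(const uint32_t *reqs, int n)
{
	uint32_t val = 0;

	for (int i = 0; i < n; i++)
		val |= reqs[i];
	return val;
}

int main(void)
{
	uint32_t reqs[] = { 0x1, 0x4, 0x4 };

	printf("aggregate of three requests: %#x\n",
	       (unsigned int)effective_flags(reqs, 3));
	/*
	 * Drop one of the 0x4 requests: the aggregate is still 0x5,
	 * so simply AND-ing out the removed request's bits would be wrong.
	 */
	printf("after removing one:         %#x\n",
	       (unsigned int)effective_flags(reqs, 2));
	return 0;
}
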
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27df..d4feda084a3a 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,12 +30,38 @@
30#include "power.h" 30#include "power.h"
31 31
32const char *const pm_states[PM_SUSPEND_MAX] = { 32const char *const pm_states[PM_SUSPEND_MAX] = {
33 [PM_SUSPEND_FREEZE] = "freeze",
33 [PM_SUSPEND_STANDBY] = "standby", 34 [PM_SUSPEND_STANDBY] = "standby",
34 [PM_SUSPEND_MEM] = "mem", 35 [PM_SUSPEND_MEM] = "mem",
35}; 36};
36 37
37static const struct platform_suspend_ops *suspend_ops; 38static const struct platform_suspend_ops *suspend_ops;
38 39
40static bool need_suspend_ops(suspend_state_t state)
41{
42 return !!(state > PM_SUSPEND_FREEZE);
43}
44
45static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
46static bool suspend_freeze_wake;
47
48static void freeze_begin(void)
49{
50 suspend_freeze_wake = false;
51}
52
53static void freeze_enter(void)
54{
55 wait_event(suspend_freeze_wait_head, suspend_freeze_wake);
56}
57
58void freeze_wake(void)
59{
60 suspend_freeze_wake = true;
61 wake_up(&suspend_freeze_wait_head);
62}
63EXPORT_SYMBOL_GPL(freeze_wake);
64
39/** 65/**
40 * suspend_set_ops - Set the global suspend method table. 66 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Suspend operations to use. 67 * @ops: Suspend operations to use.
@@ -50,8 +76,11 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
50 76
51bool valid_state(suspend_state_t state) 77bool valid_state(suspend_state_t state)
52{ 78{
79 if (state == PM_SUSPEND_FREEZE)
80 return true;
53 /* 81 /*
54 * All states need lowlevel support and need to be valid to the lowlevel 82 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
83 * support and need to be valid to the lowlevel
55 * implementation, no valid callback implies that none are valid. 84 * implementation, no valid callback implies that none are valid.
56 */ 85 */
57 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); 86 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
@@ -89,11 +118,11 @@ static int suspend_test(int level)
89 * hibernation). Run suspend notifiers, allocate the "suspend" console and 118 * hibernation). Run suspend notifiers, allocate the "suspend" console and
90 * freeze processes. 119 * freeze processes.
91 */ 120 */
92static int suspend_prepare(void) 121static int suspend_prepare(suspend_state_t state)
93{ 122{
94 int error; 123 int error;
95 124
96 if (!suspend_ops || !suspend_ops->enter) 125 if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
97 return -EPERM; 126 return -EPERM;
98 127
99 pm_prepare_console(); 128 pm_prepare_console();
@@ -137,7 +166,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
137{ 166{
138 int error; 167 int error;
139 168
140 if (suspend_ops->prepare) { 169 if (need_suspend_ops(state) && suspend_ops->prepare) {
141 error = suspend_ops->prepare(); 170 error = suspend_ops->prepare();
142 if (error) 171 if (error)
143 goto Platform_finish; 172 goto Platform_finish;
@@ -149,12 +178,23 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
149 goto Platform_finish; 178 goto Platform_finish;
150 } 179 }
151 180
152 if (suspend_ops->prepare_late) { 181 if (need_suspend_ops(state) && suspend_ops->prepare_late) {
153 error = suspend_ops->prepare_late(); 182 error = suspend_ops->prepare_late();
154 if (error) 183 if (error)
155 goto Platform_wake; 184 goto Platform_wake;
156 } 185 }
157 186
187 /*
188 * PM_SUSPEND_FREEZE equals
189 * frozen processes + suspended devices + idle processors.
190 * Thus we should invoke freeze_enter() soon after
191 * all the devices are suspended.
192 */
193 if (state == PM_SUSPEND_FREEZE) {
194 freeze_enter();
195 goto Platform_wake;
196 }
197
158 if (suspend_test(TEST_PLATFORM)) 198 if (suspend_test(TEST_PLATFORM))
159 goto Platform_wake; 199 goto Platform_wake;
160 200
@@ -182,13 +222,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
182 enable_nonboot_cpus(); 222 enable_nonboot_cpus();
183 223
184 Platform_wake: 224 Platform_wake:
185 if (suspend_ops->wake) 225 if (need_suspend_ops(state) && suspend_ops->wake)
186 suspend_ops->wake(); 226 suspend_ops->wake();
187 227
188 dpm_resume_start(PMSG_RESUME); 228 dpm_resume_start(PMSG_RESUME);
189 229
190 Platform_finish: 230 Platform_finish:
191 if (suspend_ops->finish) 231 if (need_suspend_ops(state) && suspend_ops->finish)
192 suspend_ops->finish(); 232 suspend_ops->finish();
193 233
194 return error; 234 return error;
@@ -203,11 +243,11 @@ int suspend_devices_and_enter(suspend_state_t state)
203 int error; 243 int error;
204 bool wakeup = false; 244 bool wakeup = false;
205 245
206 if (!suspend_ops) 246 if (need_suspend_ops(state) && !suspend_ops)
207 return -ENOSYS; 247 return -ENOSYS;
208 248
209 trace_machine_suspend(state); 249 trace_machine_suspend(state);
210 if (suspend_ops->begin) { 250 if (need_suspend_ops(state) && suspend_ops->begin) {
211 error = suspend_ops->begin(state); 251 error = suspend_ops->begin(state);
212 if (error) 252 if (error)
213 goto Close; 253 goto Close;
@@ -226,7 +266,7 @@ int suspend_devices_and_enter(suspend_state_t state)
226 266
227 do { 267 do {
228 error = suspend_enter(state, &wakeup); 268 error = suspend_enter(state, &wakeup);
229 } while (!error && !wakeup 269 } while (!error && !wakeup && need_suspend_ops(state)
230 && suspend_ops->suspend_again && suspend_ops->suspend_again()); 270 && suspend_ops->suspend_again && suspend_ops->suspend_again());
231 271
232 Resume_devices: 272 Resume_devices:
@@ -236,13 +276,13 @@ int suspend_devices_and_enter(suspend_state_t state)
236 ftrace_start(); 276 ftrace_start();
237 resume_console(); 277 resume_console();
238 Close: 278 Close:
239 if (suspend_ops->end) 279 if (need_suspend_ops(state) && suspend_ops->end)
240 suspend_ops->end(); 280 suspend_ops->end();
241 trace_machine_suspend(PWR_EVENT_EXIT); 281 trace_machine_suspend(PWR_EVENT_EXIT);
242 return error; 282 return error;
243 283
244 Recover_platform: 284 Recover_platform:
245 if (suspend_ops->recover) 285 if (need_suspend_ops(state) && suspend_ops->recover)
246 suspend_ops->recover(); 286 suspend_ops->recover();
247 goto Resume_devices; 287 goto Resume_devices;
248} 288}
@@ -278,12 +318,15 @@ static int enter_state(suspend_state_t state)
278 if (!mutex_trylock(&pm_mutex)) 318 if (!mutex_trylock(&pm_mutex))
279 return -EBUSY; 319 return -EBUSY;
280 320
321 if (state == PM_SUSPEND_FREEZE)
322 freeze_begin();
323
281 printk(KERN_INFO "PM: Syncing filesystems ... "); 324 printk(KERN_INFO "PM: Syncing filesystems ... ");
282 sys_sync(); 325 sys_sync();
283 printk("done.\n"); 326 printk("done.\n");
284 327
285 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 328 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
286 error = suspend_prepare(); 329 error = suspend_prepare(state);
287 if (error) 330 if (error)
288 goto Unlock; 331 goto Unlock;
289 332
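
[Editor's sketch] The new "freeze" state above parks the system in freeze_enter()'s wait_event() with processes frozen and devices suspended; freeze_wake() is exported so wakeup sources can break that wait. A hedged sketch of the consumer side, assuming a wakeup-capable driver's interrupt handler; the handler name and the <linux/suspend.h> declaration are assumptions on my part, not part of this diff.

#include <linux/interrupt.h>
#include <linux/suspend.h>	/* freeze_wake(), assumed to be declared here */

static irqreturn_t demo_wakeup_irq(int irq, void *dev_id)
{
	/* ... acknowledge the wakeup event in the hardware ... */

	/*
	 * Unblocks freeze_enter()'s wait_event(), letting suspend_enter()
	 * fall through to Platform_wake and resume devices.
	 */
	freeze_wake();

	return IRQ_HANDLED;
}
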
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 25596e450ac7..9b2a1d58558d 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -112,7 +112,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
112 rtc_set_alarm(rtc, &alm); 112 rtc_set_alarm(rtc, &alm);
113} 113}
114 114
115static int __init has_wakealarm(struct device *dev, void *name_ptr) 115static int __init has_wakealarm(struct device *dev, const void *data)
116{ 116{
117 struct rtc_device *candidate = to_rtc_device(dev); 117 struct rtc_device *candidate = to_rtc_device(dev);
118 118
@@ -121,7 +121,6 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
121 if (!device_may_wakeup(candidate->dev.parent)) 121 if (!device_may_wakeup(candidate->dev.parent))
122 return 0; 122 return 0;
123 123
124 *(const char **)name_ptr = dev_name(dev);
125 return 1; 124 return 1;
126} 125}
127 126
@@ -159,8 +158,8 @@ static int __init test_suspend(void)
159 static char warn_no_rtc[] __initdata = 158 static char warn_no_rtc[] __initdata =
160 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n"; 159 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
161 160
162 char *pony = NULL;
163 struct rtc_device *rtc = NULL; 161 struct rtc_device *rtc = NULL;
162 struct device *dev;
164 163
165 /* PM is initialized by now; is that state testable? */ 164 /* PM is initialized by now; is that state testable? */
166 if (test_state == PM_SUSPEND_ON) 165 if (test_state == PM_SUSPEND_ON)
@@ -171,9 +170,9 @@ static int __init test_suspend(void)
171 } 170 }
172 171
173 /* RTCs have initialized by now too ... can we use one? */ 172 /* RTCs have initialized by now too ... can we use one? */
174 class_find_device(rtc_class, NULL, &pony, has_wakealarm); 173 dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm);
175 if (pony) 174 if (dev)
176 rtc = rtc_class_open(pony); 175 rtc = rtc_class_open(dev_name(dev));
177 if (!rtc) { 176 if (!rtc) {
178 printk(warn_no_rtc); 177 printk(warn_no_rtc);
179 goto done; 178 goto done;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 3c9d764eb0d8..7c33ed200410 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -126,7 +126,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
126 126
127 /* Figure out where to put the new node */ 127 /* Figure out where to put the new node */
128 while (*new) { 128 while (*new) {
129 ext = container_of(*new, struct swsusp_extent, node); 129 ext = rb_entry(*new, struct swsusp_extent, node);
130 parent = *new; 130 parent = *new;
131 if (swap_offset < ext->start) { 131 if (swap_offset < ext->start) {
132 /* Try to merge */ 132 /* Try to merge */
diff --git a/kernel/printk.c b/kernel/printk.c
index 2d607f4d1797..abbdd9e2ac82 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -42,6 +42,7 @@
42#include <linux/notifier.h> 42#include <linux/notifier.h>
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h> 44#include <linux/poll.h>
45#include <linux/irq_work.h>
45 46
46#include <asm/uaccess.h> 47#include <asm/uaccess.h>
47 48
@@ -62,8 +63,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
62#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ 63#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
63#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ 64#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
64 65
65DECLARE_WAIT_QUEUE_HEAD(log_wait);
66
67int console_printk[4] = { 66int console_printk[4] = {
68 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ 67 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
69 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 68 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
@@ -87,6 +86,12 @@ static DEFINE_SEMAPHORE(console_sem);
87struct console *console_drivers; 86struct console *console_drivers;
88EXPORT_SYMBOL_GPL(console_drivers); 87EXPORT_SYMBOL_GPL(console_drivers);
89 88
89#ifdef CONFIG_LOCKDEP
90static struct lockdep_map console_lock_dep_map = {
91 .name = "console_lock"
92};
93#endif
94
90/* 95/*
91 * This is used for debugging the mess that is the VT code by 96 * This is used for debugging the mess that is the VT code by
92 * keeping track if we have the console semaphore held. It's 97 * keeping track if we have the console semaphore held. It's
@@ -217,6 +222,7 @@ struct log {
217static DEFINE_RAW_SPINLOCK(logbuf_lock); 222static DEFINE_RAW_SPINLOCK(logbuf_lock);
218 223
219#ifdef CONFIG_PRINTK 224#ifdef CONFIG_PRINTK
225DECLARE_WAIT_QUEUE_HEAD(log_wait);
220/* the next printk record to read by syslog(READ) or /proc/kmsg */ 226/* the next printk record to read by syslog(READ) or /proc/kmsg */
221static u64 syslog_seq; 227static u64 syslog_seq;
222static u32 syslog_idx; 228static u32 syslog_idx;
@@ -741,6 +747,21 @@ void __init setup_log_buf(int early)
741 free, (free * 100) / __LOG_BUF_LEN); 747 free, (free * 100) / __LOG_BUF_LEN);
742} 748}
743 749
750static bool __read_mostly ignore_loglevel;
751
752static int __init ignore_loglevel_setup(char *str)
753{
754 ignore_loglevel = 1;
755 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
756
757 return 0;
758}
759
760early_param("ignore_loglevel", ignore_loglevel_setup);
761module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
762MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
763 "print all kernel messages to the console.");
764
744#ifdef CONFIG_BOOT_PRINTK_DELAY 765#ifdef CONFIG_BOOT_PRINTK_DELAY
745 766
746static int boot_delay; /* msecs delay after each printk during bootup */ 767static int boot_delay; /* msecs delay after each printk during bootup */
@@ -764,13 +785,15 @@ static int __init boot_delay_setup(char *str)
764} 785}
765__setup("boot_delay=", boot_delay_setup); 786__setup("boot_delay=", boot_delay_setup);
766 787
767static void boot_delay_msec(void) 788static void boot_delay_msec(int level)
768{ 789{
769 unsigned long long k; 790 unsigned long long k;
770 unsigned long timeout; 791 unsigned long timeout;
771 792
772 if (boot_delay == 0 || system_state != SYSTEM_BOOTING) 793 if ((boot_delay == 0 || system_state != SYSTEM_BOOTING)
794 || (level >= console_loglevel && !ignore_loglevel)) {
773 return; 795 return;
796 }
774 797
775 k = (unsigned long long)loops_per_msec * boot_delay; 798 k = (unsigned long long)loops_per_msec * boot_delay;
776 799
@@ -789,7 +812,7 @@ static void boot_delay_msec(void)
789 } 812 }
790} 813}
791#else 814#else
792static inline void boot_delay_msec(void) 815static inline void boot_delay_msec(int level)
793{ 816{
794} 817}
795#endif 818#endif
@@ -847,10 +870,11 @@ static size_t print_time(u64 ts, char *buf)
847 if (!printk_time) 870 if (!printk_time)
848 return 0; 871 return 0;
849 872
873 rem_nsec = do_div(ts, 1000000000);
874
850 if (!buf) 875 if (!buf)
851 return 15; 876 return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
852 877
853 rem_nsec = do_div(ts, 1000000000);
854 return sprintf(buf, "[%5lu.%06lu] ", 878 return sprintf(buf, "[%5lu.%06lu] ",
855 (unsigned long)ts, rem_nsec / 1000); 879 (unsigned long)ts, rem_nsec / 1000);
856} 880}
@@ -1232,21 +1256,6 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
1232 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 1256 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
1233} 1257}
1234 1258
1235static bool __read_mostly ignore_loglevel;
1236
1237static int __init ignore_loglevel_setup(char *str)
1238{
1239 ignore_loglevel = 1;
1240 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
1241
1242 return 0;
1243}
1244
1245early_param("ignore_loglevel", ignore_loglevel_setup);
1246module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
1247MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
1248 "print all kernel messages to the console.");
1249
1250/* 1259/*
1251 * Call the console drivers, asking them to write out 1260 * Call the console drivers, asking them to write out
1252 * log_buf[start] to log_buf[end - 1]. 1261 * log_buf[start] to log_buf[end - 1].
@@ -1492,7 +1501,7 @@ asmlinkage int vprintk_emit(int facility, int level,
1492 int this_cpu; 1501 int this_cpu;
1493 int printed_len = 0; 1502 int printed_len = 0;
1494 1503
1495 boot_delay_msec(); 1504 boot_delay_msec(level);
1496 printk_delay(); 1505 printk_delay();
1497 1506
1498 /* This stops the holder of console_sem just where we want him */ 1507 /* This stops the holder of console_sem just where we want him */
@@ -1908,12 +1917,14 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
1908 */ 1917 */
1909void console_lock(void) 1918void console_lock(void)
1910{ 1919{
1911 BUG_ON(in_interrupt()); 1920 might_sleep();
1921
1912 down(&console_sem); 1922 down(&console_sem);
1913 if (console_suspended) 1923 if (console_suspended)
1914 return; 1924 return;
1915 console_locked = 1; 1925 console_locked = 1;
1916 console_may_schedule = 1; 1926 console_may_schedule = 1;
1927 mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);
1917} 1928}
1918EXPORT_SYMBOL(console_lock); 1929EXPORT_SYMBOL(console_lock);
1919 1930
@@ -1935,6 +1946,7 @@ int console_trylock(void)
1935 } 1946 }
1936 console_locked = 1; 1947 console_locked = 1;
1937 console_may_schedule = 0; 1948 console_may_schedule = 0;
1949 mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_);
1938 return 1; 1950 return 1;
1939} 1951}
1940EXPORT_SYMBOL(console_trylock); 1952EXPORT_SYMBOL(console_trylock);
@@ -1944,43 +1956,6 @@ int is_console_locked(void)
1944 return console_locked; 1956 return console_locked;
1945} 1957}
1946 1958
1947/*
1948 * Delayed printk version, for scheduler-internal messages:
1949 */
1950#define PRINTK_BUF_SIZE 512
1951
1952#define PRINTK_PENDING_WAKEUP 0x01
1953#define PRINTK_PENDING_SCHED 0x02
1954
1955static DEFINE_PER_CPU(int, printk_pending);
1956static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
1957
1958void printk_tick(void)
1959{
1960 if (__this_cpu_read(printk_pending)) {
1961 int pending = __this_cpu_xchg(printk_pending, 0);
1962 if (pending & PRINTK_PENDING_SCHED) {
1963 char *buf = __get_cpu_var(printk_sched_buf);
1964 printk(KERN_WARNING "[sched_delayed] %s", buf);
1965 }
1966 if (pending & PRINTK_PENDING_WAKEUP)
1967 wake_up_interruptible(&log_wait);
1968 }
1969}
1970
1971int printk_needs_cpu(int cpu)
1972{
1973 if (cpu_is_offline(cpu))
1974 printk_tick();
1975 return __this_cpu_read(printk_pending);
1976}
1977
1978void wake_up_klogd(void)
1979{
1980 if (waitqueue_active(&log_wait))
1981 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1982}
1983
1984static void console_cont_flush(char *text, size_t size) 1959static void console_cont_flush(char *text, size_t size)
1985{ 1960{
1986 unsigned long flags; 1961 unsigned long flags;
@@ -2095,6 +2070,7 @@ skip:
2095 local_irq_restore(flags); 2070 local_irq_restore(flags);
2096 } 2071 }
2097 console_locked = 0; 2072 console_locked = 0;
2073 mutex_release(&console_lock_dep_map, 1, _RET_IP_);
2098 2074
2099 /* Release the exclusive_console once it is used */ 2075 /* Release the exclusive_console once it is used */
2100 if (unlikely(exclusive_console)) 2076 if (unlikely(exclusive_console))
@@ -2442,6 +2418,44 @@ static int __init printk_late_init(void)
2442late_initcall(printk_late_init); 2418late_initcall(printk_late_init);
2443 2419
2444#if defined CONFIG_PRINTK 2420#if defined CONFIG_PRINTK
2421/*
2422 * Delayed printk version, for scheduler-internal messages:
2423 */
2424#define PRINTK_BUF_SIZE 512
2425
2426#define PRINTK_PENDING_WAKEUP 0x01
2427#define PRINTK_PENDING_SCHED 0x02
2428
2429static DEFINE_PER_CPU(int, printk_pending);
2430static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
2431
2432static void wake_up_klogd_work_func(struct irq_work *irq_work)
2433{
2434 int pending = __this_cpu_xchg(printk_pending, 0);
2435
2436 if (pending & PRINTK_PENDING_SCHED) {
2437 char *buf = __get_cpu_var(printk_sched_buf);
2438 printk(KERN_WARNING "[sched_delayed] %s", buf);
2439 }
2440
2441 if (pending & PRINTK_PENDING_WAKEUP)
2442 wake_up_interruptible(&log_wait);
2443}
2444
2445static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
2446 .func = wake_up_klogd_work_func,
2447 .flags = IRQ_WORK_LAZY,
2448};
2449
2450void wake_up_klogd(void)
2451{
2452 preempt_disable();
2453 if (waitqueue_active(&log_wait)) {
2454 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
2455 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2456 }
2457 preempt_enable();
2458}
2445 2459
2446int printk_sched(const char *fmt, ...) 2460int printk_sched(const char *fmt, ...)
2447{ 2461{
@@ -2458,6 +2472,7 @@ int printk_sched(const char *fmt, ...)
2458 va_end(args); 2472 va_end(args);
2459 2473
2460 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); 2474 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
2475 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2461 local_irq_restore(flags); 2476 local_irq_restore(flags);
2462 2477
2463 return r; 2478 return r;
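
[Editor's sketch] The printk rework above replaces the old printk_tick()/printk_needs_cpu() polling with lazy irq_work: the deep printk path only sets a per-CPU pending flag and queues an irq_work, and the actual klogd wakeup runs later from a context where waking sleepers is safe. A minimal sketch of that shape with demo_-prefixed names (illustrative, not from the patch):

#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wait);

/* Runs at irq_work time, i.e. outside the troublesome caller's context. */
static void demo_work_func(struct irq_work *work)
{
	wake_up_interruptible(&demo_wait);
}

static DEFINE_PER_CPU(struct irq_work, demo_work) = {
	.func	= demo_work_func,
	.flags	= IRQ_WORK_LAZY,	/* defer to the next tick, as above */
};

/* Safe to call from contexts where a direct wake_up() could deadlock. */
static void demo_poke(void)
{
	preempt_disable();
	irq_work_queue(&__get_cpu_var(demo_work));
	preempt_enable();
}
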
diff --git a/kernel/profile.c b/kernel/profile.c
index 76b8e77773ee..dc3384ee874e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -8,9 +8,10 @@
8 * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, 8 * Scheduler profiling support, Arjan van de Ven and Ingo Molnar,
9 * Red Hat, July 2004 9 * Red Hat, July 2004
10 * Consolidation of architecture support code for profiling, 10 * Consolidation of architecture support code for profiling,
11 * William Irwin, Oracle, July 2004 11 * Nadia Yvette Chambers, Oracle, July 2004
12 * Amortized hit count accounting via per-cpu open-addressed hashtables 12 * Amortized hit count accounting via per-cpu open-addressed hashtables
13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 13 * to resolve timer interrupt livelocks, Nadia Yvette Chambers,
14 * Oracle, 2004
14 */ 15 */
15 16
16#include <linux/export.h> 17#include <linux/export.h>
@@ -36,9 +37,6 @@ struct profile_hit {
36#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) 37#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit))
37#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) 38#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
38 39
39/* Oprofile timer tick hook */
40static int (*timer_hook)(struct pt_regs *) __read_mostly;
41
42static atomic_t *prof_buffer; 40static atomic_t *prof_buffer;
43static unsigned long prof_len, prof_shift; 41static unsigned long prof_len, prof_shift;
44 42
@@ -207,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
207} 205}
208EXPORT_SYMBOL_GPL(profile_event_unregister); 206EXPORT_SYMBOL_GPL(profile_event_unregister);
209 207
210int register_timer_hook(int (*hook)(struct pt_regs *))
211{
212 if (timer_hook)
213 return -EBUSY;
214 timer_hook = hook;
215 return 0;
216}
217EXPORT_SYMBOL_GPL(register_timer_hook);
218
219void unregister_timer_hook(int (*hook)(struct pt_regs *))
220{
221 WARN_ON(hook != timer_hook);
222 timer_hook = NULL;
223 /* make sure all CPUs see the NULL hook */
224 synchronize_sched(); /* Allow ongoing interrupts to complete. */
225}
226EXPORT_SYMBOL_GPL(unregister_timer_hook);
227
228
229#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
230/* 209/*
231 * Each cpu has a pair of open-addressed hashtables for pending 210 * Each cpu has a pair of open-addressed hashtables for pending
@@ -256,7 +235,7 @@ EXPORT_SYMBOL_GPL(unregister_timer_hook);
256 * pagetable hash functions, but uses a full hashtable full of finite 235 * pagetable hash functions, but uses a full hashtable full of finite
257 * collision chains, not just pairs of them. 236 * collision chains, not just pairs of them.
258 * 237 *
259 * -- wli 238 * -- nyc
260 */ 239 */
261static void __profile_flip_buffers(void *unused) 240static void __profile_flip_buffers(void *unused)
262{ 241{
@@ -435,8 +414,6 @@ void profile_tick(int type)
435{ 414{
436 struct pt_regs *regs = get_irq_regs(); 415 struct pt_regs *regs = get_irq_regs();
437 416
438 if (type == CPU_PROFILING && timer_hook)
439 timer_hook(regs);
440 if (!user_mode(regs) && prof_cpu_mask != NULL && 417 if (!user_mode(regs) && prof_cpu_mask != NULL &&
441 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask)) 418 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
442 profile_hit(type, (void *)profile_pc(regs)); 419 profile_hit(type, (void *)profile_pc(regs));
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1f5e55dda955..acbd28424d81 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -117,11 +117,45 @@ void __ptrace_unlink(struct task_struct *child)
117 * TASK_KILLABLE sleeps. 117 * TASK_KILLABLE sleeps.
118 */ 118 */
119 if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) 119 if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
120 signal_wake_up(child, task_is_traced(child)); 120 ptrace_signal_wake_up(child, true);
121 121
122 spin_unlock(&child->sighand->siglock); 122 spin_unlock(&child->sighand->siglock);
123} 123}
124 124
125/* Ensure that nothing can wake it up, even SIGKILL */
126static bool ptrace_freeze_traced(struct task_struct *task)
127{
128 bool ret = false;
129
130 /* Lockless, nobody but us can set this flag */
131 if (task->jobctl & JOBCTL_LISTENING)
132 return ret;
133
134 spin_lock_irq(&task->sighand->siglock);
135 if (task_is_traced(task) && !__fatal_signal_pending(task)) {
136 task->state = __TASK_TRACED;
137 ret = true;
138 }
139 spin_unlock_irq(&task->sighand->siglock);
140
141 return ret;
142}
143
144static void ptrace_unfreeze_traced(struct task_struct *task)
145{
146 if (task->state != __TASK_TRACED)
147 return;
148
149 WARN_ON(!task->ptrace || task->parent != current);
150
151 spin_lock_irq(&task->sighand->siglock);
152 if (__fatal_signal_pending(task))
153 wake_up_state(task, __TASK_TRACED);
154 else
155 task->state = TASK_TRACED;
156 spin_unlock_irq(&task->sighand->siglock);
157}
158
125/** 159/**
126 * ptrace_check_attach - check whether ptracee is ready for ptrace operation 160 * ptrace_check_attach - check whether ptracee is ready for ptrace operation
127 * @child: ptracee to check for 161 * @child: ptracee to check for
@@ -139,7 +173,7 @@ void __ptrace_unlink(struct task_struct *child)
139 * RETURNS: 173 * RETURNS:
140 * 0 on success, -ESRCH if %child is not ready. 174 * 0 on success, -ESRCH if %child is not ready.
141 */ 175 */
142int ptrace_check_attach(struct task_struct *child, bool ignore_state) 176static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
143{ 177{
144 int ret = -ESRCH; 178 int ret = -ESRCH;
145 179
@@ -151,24 +185,29 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
151 * be changed by us so it's not changing right after this. 185 * be changed by us so it's not changing right after this.
152 */ 186 */
153 read_lock(&tasklist_lock); 187 read_lock(&tasklist_lock);
154 if ((child->ptrace & PT_PTRACED) && child->parent == current) { 188 if (child->ptrace && child->parent == current) {
189 WARN_ON(child->state == __TASK_TRACED);
155 /* 190 /*
156 * child->sighand can't be NULL, release_task() 191 * child->sighand can't be NULL, release_task()
157 * does ptrace_unlink() before __exit_signal(). 192 * does ptrace_unlink() before __exit_signal().
158 */ 193 */
159 spin_lock_irq(&child->sighand->siglock); 194 if (ignore_state || ptrace_freeze_traced(child))
160 WARN_ON_ONCE(task_is_stopped(child));
161 if (ignore_state || (task_is_traced(child) &&
162 !(child->jobctl & JOBCTL_LISTENING)))
163 ret = 0; 195 ret = 0;
164 spin_unlock_irq(&child->sighand->siglock);
165 } 196 }
166 read_unlock(&tasklist_lock); 197 read_unlock(&tasklist_lock);
167 198
168 if (!ret && !ignore_state) 199 if (!ret && !ignore_state) {
169 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; 200 if (!wait_task_inactive(child, __TASK_TRACED)) {
201 /*
202 * This can only happen if may_ptrace_stop() fails and
203 * ptrace_stop() changes ->state back to TASK_RUNNING,
204 * so we should not worry about leaking __TASK_TRACED.
205 */
206 WARN_ON(child->state == __TASK_TRACED);
207 ret = -ESRCH;
208 }
209 }
170 210
171 /* All systems go.. */
172 return ret; 211 return ret;
173} 212}
174 213
@@ -215,8 +254,12 @@ ok:
215 smp_rmb(); 254 smp_rmb();
216 if (task->mm) 255 if (task->mm)
217 dumpable = get_dumpable(task->mm); 256 dumpable = get_dumpable(task->mm);
218 if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode)) 257 rcu_read_lock();
258 if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
259 rcu_read_unlock();
219 return -EPERM; 260 return -EPERM;
261 }
262 rcu_read_unlock();
220 263
221 return security_ptrace_access_check(task, mode); 264 return security_ptrace_access_check(task, mode);
222} 265}
@@ -280,8 +323,10 @@ static int ptrace_attach(struct task_struct *task, long request,
280 323
281 if (seize) 324 if (seize)
282 flags |= PT_SEIZED; 325 flags |= PT_SEIZED;
283 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) 326 rcu_read_lock();
327 if (ns_capable(__task_cred(task)->user_ns, CAP_SYS_PTRACE))
284 flags |= PT_PTRACE_CAP; 328 flags |= PT_PTRACE_CAP;
329 rcu_read_unlock();
285 task->ptrace = flags; 330 task->ptrace = flags;
286 331
287 __ptrace_link(task, current); 332 __ptrace_link(task, current);
@@ -311,7 +356,7 @@ static int ptrace_attach(struct task_struct *task, long request,
311 */ 356 */
312 if (task_is_stopped(task) && 357 if (task_is_stopped(task) &&
313 task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) 358 task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
314 signal_wake_up(task, 1); 359 signal_wake_up_state(task, __TASK_STOPPED);
315 360
316 spin_unlock(&task->sighand->siglock); 361 spin_unlock(&task->sighand->siglock);
317 362
@@ -457,6 +502,9 @@ void exit_ptrace(struct task_struct *tracer)
457 return; 502 return;
458 503
459 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { 504 list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
505 if (unlikely(p->ptrace & PT_EXITKILL))
506 send_sig_info(SIGKILL, SEND_SIG_FORCED, p);
507
460 if (__ptrace_detach(tracer, p)) 508 if (__ptrace_detach(tracer, p))
461 list_add(&p->ptrace_entry, &ptrace_dead); 509 list_add(&p->ptrace_entry, &ptrace_dead);
462 } 510 }
@@ -664,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
664 kiov->iov_len, kiov->iov_base); 712 kiov->iov_len, kiov->iov_base);
665} 713}
666 714
715/*
716 * This is declared in linux/regset.h and defined in machine-dependent
717 * code. We put the export here, near the primary machine-neutral use,
718 * to ensure no machine forgets it.
719 */
720EXPORT_SYMBOL_GPL(task_user_regset_view);
667#endif 721#endif
668 722
669int ptrace_request(struct task_struct *child, long request, 723int ptrace_request(struct task_struct *child, long request,
@@ -728,7 +782,7 @@ int ptrace_request(struct task_struct *child, long request,
728 * tracee into STOP. 782 * tracee into STOP.
729 */ 783 */
730 if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) 784 if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
731 signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); 785 ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
732 786
733 unlock_task_sighand(child, &flags); 787 unlock_task_sighand(child, &flags);
734 ret = 0; 788 ret = 0;
@@ -754,7 +808,7 @@ int ptrace_request(struct task_struct *child, long request,
754 * start of this trap and now. Trigger re-trap. 808 * start of this trap and now. Trigger re-trap.
755 */ 809 */
756 if (child->jobctl & JOBCTL_TRAP_NOTIFY) 810 if (child->jobctl & JOBCTL_TRAP_NOTIFY)
757 signal_wake_up(child, true); 811 ptrace_signal_wake_up(child, true);
758 ret = 0; 812 ret = 0;
759 } 813 }
760 unlock_task_sighand(child, &flags); 814 unlock_task_sighand(child, &flags);
@@ -891,6 +945,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
891 goto out_put_task_struct; 945 goto out_put_task_struct;
892 946
893 ret = arch_ptrace(child, request, addr, data); 947 ret = arch_ptrace(child, request, addr, data);
948 if (ret || request != PTRACE_DETACH)
949 ptrace_unfreeze_traced(child);
894 950
895 out_put_task_struct: 951 out_put_task_struct:
896 put_task_struct(child); 952 put_task_struct(child);
@@ -1030,8 +1086,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1030 1086
1031 ret = ptrace_check_attach(child, request == PTRACE_KILL || 1087 ret = ptrace_check_attach(child, request == PTRACE_KILL ||
1032 request == PTRACE_INTERRUPT); 1088 request == PTRACE_INTERRUPT);
1033 if (!ret) 1089 if (!ret) {
1034 ret = compat_arch_ptrace(child, request, addr, data); 1090 ret = compat_arch_ptrace(child, request, addr, data);
1091 if (ret || request != PTRACE_DETACH)
1092 ptrace_unfreeze_traced(child);
1093 }
1035 1094
1036 out_put_task_struct: 1095 out_put_task_struct:
1037 put_task_struct(child); 1096 put_task_struct(child);
diff --git a/kernel/rcu.h b/kernel/rcu.h
index 8ba99cdc6515..7f8e7590e3e5 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -109,4 +109,13 @@ static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
109 } 109 }
110} 110}
111 111
112extern int rcu_expedited;
113
114#ifdef CONFIG_RCU_STALL_COMMON
115
116extern int rcu_cpu_stall_suppress;
117int rcu_jiffies_till_stall_check(void);
118
119#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
120
112#endif /* __LINUX_RCU_H */ 121#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 29ca1c6da594..48ab70384a4c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,12 +46,15 @@
46#include <linux/export.h> 46#include <linux/export.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48#include <linux/delay.h> 48#include <linux/delay.h>
49#include <linux/module.h>
49 50
50#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
51#include <trace/events/rcu.h> 52#include <trace/events/rcu.h>
52 53
53#include "rcu.h" 54#include "rcu.h"
54 55
56module_param(rcu_expedited, int, 0);
57
55#ifdef CONFIG_PREEMPT_RCU 58#ifdef CONFIG_PREEMPT_RCU
56 59
57/* 60/*
@@ -401,11 +404,65 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
401#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 404#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
402 405
403#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) 406#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
404void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) 407void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp,
408 unsigned long secs,
409 unsigned long c_old, unsigned long c)
405{ 410{
406 trace_rcu_torture_read(rcutorturename, rhp); 411 trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
407} 412}
408EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); 413EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
409#else 414#else
410#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) 415#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
416 do { } while (0)
417#endif
418
419#ifdef CONFIG_RCU_STALL_COMMON
420
421#ifdef CONFIG_PROVE_RCU
422#define RCU_STALL_DELAY_DELTA (5 * HZ)
423#else
424#define RCU_STALL_DELAY_DELTA 0
411#endif 425#endif
426
427int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
428int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
429
430module_param(rcu_cpu_stall_suppress, int, 0644);
431module_param(rcu_cpu_stall_timeout, int, 0644);
432
433int rcu_jiffies_till_stall_check(void)
434{
435 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
436
437 /*
438 * Limit check must be consistent with the Kconfig limits
439 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
440 */
441 if (till_stall_check < 3) {
442 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
443 till_stall_check = 3;
444 } else if (till_stall_check > 300) {
445 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
446 till_stall_check = 300;
447 }
448 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
449}
450
451static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
452{
453 rcu_cpu_stall_suppress = 1;
454 return NOTIFY_DONE;
455}
456
457static struct notifier_block rcu_panic_block = {
458 .notifier_call = rcu_panic,
459};
460
461static int __init check_cpu_stall_init(void)
462{
463 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
464 return 0;
465}
466early_initcall(check_cpu_stall_init);
467
468#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
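
[Editor's sketch] rcu_jiffies_till_stall_check() above clamps the rcu_cpu_stall_timeout module parameter into the 3..300 second range (consistent with the Kconfig limits, per the comment in the patch), converts to jiffies, and adds slack when PROVE_RCU is enabled. A userspace stand-in with HZ and the slack as plain constants:

#include <stdio.h>

#define HZ			100	/* illustrative; the kernel value is config-dependent */
#define STALL_DELAY_DELTA	0	/* (5 * HZ) when CONFIG_PROVE_RCU=y */

static int stall_check_jiffies(int timeout_s)
{
	/* Mirror the clamp in rcu_jiffies_till_stall_check(). */
	if (timeout_s < 3)
		timeout_s = 3;
	else if (timeout_s > 300)
		timeout_s = 300;
	return timeout_s * HZ + STALL_DELAY_DELTA;
}

int main(void)
{
	printf("%d\n", stall_check_jiffies(1));		/* clamped up to 3s   -> 300   */
	printf("%d\n", stall_check_jiffies(21));	/* in range, 21s      -> 2100  */
	printf("%d\n", stall_check_jiffies(1000));	/* clamped down, 300s -> 30000 */
	return 0;
}
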
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index e4c6a598d6f7..a0714a51b6d7 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -51,10 +51,10 @@ static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu), 51 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp); 52 struct rcu_ctrlblk *rcp);
53 53
54#include "rcutiny_plugin.h"
55
56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 54static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
57 55
56#include "rcutiny_plugin.h"
57
58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long newval) 59static void rcu_idle_enter_common(long long newval)
60{ 60{
@@ -193,9 +193,9 @@ EXPORT_SYMBOL(rcu_is_cpu_idle);
193 * interrupts don't count, we must be running at the first interrupt 193 * interrupts don't count, we must be running at the first interrupt
194 * level. 194 * level.
195 */ 195 */
196int rcu_is_cpu_rrupt_from_idle(void) 196static int rcu_is_cpu_rrupt_from_idle(void)
197{ 197{
198 return rcu_dynticks_nesting <= 0; 198 return rcu_dynticks_nesting <= 1;
199} 199}
200 200
201/* 201/*
@@ -205,6 +205,7 @@ int rcu_is_cpu_rrupt_from_idle(void)
205 */ 205 */
206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 206static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
207{ 207{
208 reset_cpu_stall_ticks(rcp);
208 if (rcp->rcucblist != NULL && 209 if (rcp->rcucblist != NULL &&
209 rcp->donetail != rcp->curtail) { 210 rcp->donetail != rcp->curtail) {
210 rcp->donetail = rcp->curtail; 211 rcp->donetail = rcp->curtail;
@@ -251,6 +252,7 @@ void rcu_bh_qs(int cpu)
251 */ 252 */
252void rcu_check_callbacks(int cpu, int user) 253void rcu_check_callbacks(int cpu, int user)
253{ 254{
255 check_cpu_stalls();
254 if (user || rcu_is_cpu_rrupt_from_idle()) 256 if (user || rcu_is_cpu_rrupt_from_idle())
255 rcu_sched_qs(cpu); 257 rcu_sched_qs(cpu);
256 else if (!in_softirq()) 258 else if (!in_softirq())
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 3d0190282204..8a233002faeb 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -33,6 +33,9 @@ struct rcu_ctrlblk {
33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
34 struct rcu_head **curtail; /* ->next pointer of last CB. */ 34 struct rcu_head **curtail; /* ->next pointer of last CB. */
35 RCU_TRACE(long qlen); /* Number of pending CBs. */ 35 RCU_TRACE(long qlen); /* Number of pending CBs. */
36 RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */
37 RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */
38 RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */
36 RCU_TRACE(char *name); /* Name of RCU type. */ 39 RCU_TRACE(char *name); /* Name of RCU type. */
37}; 40};
38 41
@@ -54,6 +57,51 @@ int rcu_scheduler_active __read_mostly;
54EXPORT_SYMBOL_GPL(rcu_scheduler_active); 57EXPORT_SYMBOL_GPL(rcu_scheduler_active);
55#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 58#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
56 59
60#ifdef CONFIG_RCU_TRACE
61
62static void check_cpu_stall(struct rcu_ctrlblk *rcp)
63{
64 unsigned long j;
65 unsigned long js;
66
67 if (rcu_cpu_stall_suppress)
68 return;
69 rcp->ticks_this_gp++;
70 j = jiffies;
71 js = rcp->jiffies_stall;
72 if (*rcp->curtail && ULONG_CMP_GE(j, js)) {
73 pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n",
74 rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting,
75 jiffies - rcp->gp_start, rcp->qlen);
76 dump_stack();
77 }
78 if (*rcp->curtail && ULONG_CMP_GE(j, js))
79 rcp->jiffies_stall = jiffies +
80 3 * rcu_jiffies_till_stall_check() + 3;
81 else if (ULONG_CMP_GE(j, js))
82 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
83}
84
85static void check_cpu_stall_preempt(void);
86
87#endif /* #ifdef CONFIG_RCU_TRACE */
88
89static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
90{
91#ifdef CONFIG_RCU_TRACE
92 rcp->ticks_this_gp = 0;
93 rcp->gp_start = jiffies;
94 rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
95#endif /* #ifdef CONFIG_RCU_TRACE */
96}
97
98static void check_cpu_stalls(void)
99{
100 RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
101 RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
102 RCU_TRACE(check_cpu_stall_preempt());
103}
104
57#ifdef CONFIG_TINY_PREEMPT_RCU 105#ifdef CONFIG_TINY_PREEMPT_RCU
58 106
59#include <linux/delay.h> 107#include <linux/delay.h>
@@ -448,6 +496,7 @@ static void rcu_preempt_start_gp(void)
448 /* Official start of GP. */ 496 /* Official start of GP. */
449 rcu_preempt_ctrlblk.gpnum++; 497 rcu_preempt_ctrlblk.gpnum++;
450 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); 498 RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
499 reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
451 500
452 /* Any blocked RCU readers block new GP. */ 501 /* Any blocked RCU readers block new GP. */
453 if (rcu_preempt_blocked_readers_any()) 502 if (rcu_preempt_blocked_readers_any())
@@ -706,7 +755,10 @@ void synchronize_rcu(void)
706 return; 755 return;
707 756
708 /* Once we get past the fastpath checks, same code as rcu_barrier(). */ 757 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
709 rcu_barrier(); 758 if (rcu_expedited)
759 synchronize_rcu_expedited();
760 else
761 rcu_barrier();
710} 762}
711EXPORT_SYMBOL_GPL(synchronize_rcu); 763EXPORT_SYMBOL_GPL(synchronize_rcu);
712 764
@@ -1051,4 +1103,11 @@ MODULE_AUTHOR("Paul E. McKenney");
1051MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); 1103MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
1052MODULE_LICENSE("GPL"); 1104MODULE_LICENSE("GPL");
1053 1105
1106static void check_cpu_stall_preempt(void)
1107{
1108#ifdef CONFIG_TINY_PREEMPT_RCU
1109 check_cpu_stall(&rcu_preempt_ctrlblk.rcb);
1110#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */
1111}
1112
1054#endif /* #ifdef CONFIG_RCU_TRACE */ 1113#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index aaa7b9f3532a..e1f3a8c96724 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -46,6 +46,7 @@
46#include <linux/stat.h> 46#include <linux/stat.h>
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <linux/trace_clock.h>
49#include <asm/byteorder.h> 50#include <asm/byteorder.h>
50 51
51MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
@@ -207,6 +208,20 @@ MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
207#define rcu_can_boost() 0 208#define rcu_can_boost() 0
208#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 209#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
209 210
211#ifdef CONFIG_RCU_TRACE
212static u64 notrace rcu_trace_clock_local(void)
213{
214 u64 ts = trace_clock_local();
215 unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
216 return ts;
217}
218#else /* #ifdef CONFIG_RCU_TRACE */
219static u64 notrace rcu_trace_clock_local(void)
220{
221 return 0ULL;
222}
223#endif /* #else #ifdef CONFIG_RCU_TRACE */
224
210static unsigned long shutdown_time; /* jiffies to system shutdown. */ 225static unsigned long shutdown_time; /* jiffies to system shutdown. */
211static unsigned long boost_starttime; /* jiffies of next boost test start. */ 226static unsigned long boost_starttime; /* jiffies of next boost test start. */
212DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 227DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -339,7 +354,6 @@ rcu_stutter_wait(char *title)
339 354
340struct rcu_torture_ops { 355struct rcu_torture_ops {
341 void (*init)(void); 356 void (*init)(void);
342 void (*cleanup)(void);
343 int (*readlock)(void); 357 int (*readlock)(void);
344 void (*read_delay)(struct rcu_random_state *rrsp); 358 void (*read_delay)(struct rcu_random_state *rrsp);
345 void (*readunlock)(int idx); 359 void (*readunlock)(int idx);
@@ -431,7 +445,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
431 445
432static struct rcu_torture_ops rcu_ops = { 446static struct rcu_torture_ops rcu_ops = {
433 .init = NULL, 447 .init = NULL,
434 .cleanup = NULL,
435 .readlock = rcu_torture_read_lock, 448 .readlock = rcu_torture_read_lock,
436 .read_delay = rcu_read_delay, 449 .read_delay = rcu_read_delay,
437 .readunlock = rcu_torture_read_unlock, 450 .readunlock = rcu_torture_read_unlock,
@@ -475,7 +488,6 @@ static void rcu_sync_torture_init(void)
475 488
476static struct rcu_torture_ops rcu_sync_ops = { 489static struct rcu_torture_ops rcu_sync_ops = {
477 .init = rcu_sync_torture_init, 490 .init = rcu_sync_torture_init,
478 .cleanup = NULL,
479 .readlock = rcu_torture_read_lock, 491 .readlock = rcu_torture_read_lock,
480 .read_delay = rcu_read_delay, 492 .read_delay = rcu_read_delay,
481 .readunlock = rcu_torture_read_unlock, 493 .readunlock = rcu_torture_read_unlock,
@@ -493,7 +505,6 @@ static struct rcu_torture_ops rcu_sync_ops = {
493 505
494static struct rcu_torture_ops rcu_expedited_ops = { 506static struct rcu_torture_ops rcu_expedited_ops = {
495 .init = rcu_sync_torture_init, 507 .init = rcu_sync_torture_init,
496 .cleanup = NULL,
497 .readlock = rcu_torture_read_lock, 508 .readlock = rcu_torture_read_lock,
498 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 509 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
499 .readunlock = rcu_torture_read_unlock, 510 .readunlock = rcu_torture_read_unlock,
@@ -536,7 +547,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
536 547
537static struct rcu_torture_ops rcu_bh_ops = { 548static struct rcu_torture_ops rcu_bh_ops = {
538 .init = NULL, 549 .init = NULL,
539 .cleanup = NULL,
540 .readlock = rcu_bh_torture_read_lock, 550 .readlock = rcu_bh_torture_read_lock,
541 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 551 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
542 .readunlock = rcu_bh_torture_read_unlock, 552 .readunlock = rcu_bh_torture_read_unlock,
@@ -553,7 +563,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
553 563
554static struct rcu_torture_ops rcu_bh_sync_ops = { 564static struct rcu_torture_ops rcu_bh_sync_ops = {
555 .init = rcu_sync_torture_init, 565 .init = rcu_sync_torture_init,
556 .cleanup = NULL,
557 .readlock = rcu_bh_torture_read_lock, 566 .readlock = rcu_bh_torture_read_lock,
558 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 567 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
559 .readunlock = rcu_bh_torture_read_unlock, 568 .readunlock = rcu_bh_torture_read_unlock,
@@ -570,7 +579,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
570 579
571static struct rcu_torture_ops rcu_bh_expedited_ops = { 580static struct rcu_torture_ops rcu_bh_expedited_ops = {
572 .init = rcu_sync_torture_init, 581 .init = rcu_sync_torture_init,
573 .cleanup = NULL,
574 .readlock = rcu_bh_torture_read_lock, 582 .readlock = rcu_bh_torture_read_lock,
575 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 583 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
576 .readunlock = rcu_bh_torture_read_unlock, 584 .readunlock = rcu_bh_torture_read_unlock,
@@ -589,19 +597,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
589 * Definitions for srcu torture testing. 597 * Definitions for srcu torture testing.
590 */ 598 */
591 599
592static struct srcu_struct srcu_ctl; 600DEFINE_STATIC_SRCU(srcu_ctl);
593
594static void srcu_torture_init(void)
595{
596 init_srcu_struct(&srcu_ctl);
597 rcu_sync_torture_init();
598}
599
600static void srcu_torture_cleanup(void)
601{
602 synchronize_srcu(&srcu_ctl);
603 cleanup_srcu_struct(&srcu_ctl);
604}
605 601
606static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) 602static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
607{ 603{
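
[Editor's note] Replacing the open-coded srcu_torture_init()/srcu_torture_cleanup() pair with DEFINE_STATIC_SRCU() moves the setup of srcu_ctl to compile time, so the ops tables no longer need a cleanup hook at all. The hedged kernel-style sketch below shows the usual shape of code built around a statically defined SRCU domain; my_srcu, my_cfg, and the helper names are illustrative and not part of this patch.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct my_cfg {
        int value;
};

DEFINE_STATIC_SRCU(my_srcu);            /* compile-time init, nothing to clean up */
static struct my_cfg __rcu *cur_cfg;

static int my_read_value(void)
{
        struct my_cfg *cfg;
        int idx, val = -1;

        idx = srcu_read_lock(&my_srcu);         /* index must be passed to unlock */
        cfg = srcu_dereference(cur_cfg, &my_srcu);
        if (cfg)
                val = cfg->value;
        srcu_read_unlock(&my_srcu, idx);
        return val;
}

static void my_replace(struct my_cfg *new_cfg)
{
        struct my_cfg *old = rcu_dereference_protected(cur_cfg, 1);

        rcu_assign_pointer(cur_cfg, new_cfg);
        synchronize_srcu(&my_srcu);             /* wait out pre-existing SRCU readers */
        kfree(old);                             /* kfree(NULL) is a no-op */
}
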
@@ -672,8 +668,7 @@ static int srcu_torture_stats(char *page)
672} 668}
673 669
674static struct rcu_torture_ops srcu_ops = { 670static struct rcu_torture_ops srcu_ops = {
675 .init = srcu_torture_init, 671 .init = rcu_sync_torture_init,
676 .cleanup = srcu_torture_cleanup,
677 .readlock = srcu_torture_read_lock, 672 .readlock = srcu_torture_read_lock,
678 .read_delay = srcu_read_delay, 673 .read_delay = srcu_read_delay,
679 .readunlock = srcu_torture_read_unlock, 674 .readunlock = srcu_torture_read_unlock,
@@ -687,8 +682,7 @@ static struct rcu_torture_ops srcu_ops = {
687}; 682};
688 683
689static struct rcu_torture_ops srcu_sync_ops = { 684static struct rcu_torture_ops srcu_sync_ops = {
690 .init = srcu_torture_init, 685 .init = rcu_sync_torture_init,
691 .cleanup = srcu_torture_cleanup,
692 .readlock = srcu_torture_read_lock, 686 .readlock = srcu_torture_read_lock,
693 .read_delay = srcu_read_delay, 687 .read_delay = srcu_read_delay,
694 .readunlock = srcu_torture_read_unlock, 688 .readunlock = srcu_torture_read_unlock,
@@ -712,8 +706,7 @@ static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
712} 706}
713 707
714static struct rcu_torture_ops srcu_raw_ops = { 708static struct rcu_torture_ops srcu_raw_ops = {
715 .init = srcu_torture_init, 709 .init = rcu_sync_torture_init,
716 .cleanup = srcu_torture_cleanup,
717 .readlock = srcu_torture_read_lock_raw, 710 .readlock = srcu_torture_read_lock_raw,
718 .read_delay = srcu_read_delay, 711 .read_delay = srcu_read_delay,
719 .readunlock = srcu_torture_read_unlock_raw, 712 .readunlock = srcu_torture_read_unlock_raw,
@@ -727,8 +720,7 @@ static struct rcu_torture_ops srcu_raw_ops = {
727}; 720};
728 721
729static struct rcu_torture_ops srcu_raw_sync_ops = { 722static struct rcu_torture_ops srcu_raw_sync_ops = {
730 .init = srcu_torture_init, 723 .init = rcu_sync_torture_init,
731 .cleanup = srcu_torture_cleanup,
732 .readlock = srcu_torture_read_lock_raw, 724 .readlock = srcu_torture_read_lock_raw,
733 .read_delay = srcu_read_delay, 725 .read_delay = srcu_read_delay,
734 .readunlock = srcu_torture_read_unlock_raw, 726 .readunlock = srcu_torture_read_unlock_raw,
@@ -747,8 +739,7 @@ static void srcu_torture_synchronize_expedited(void)
747} 739}
748 740
749static struct rcu_torture_ops srcu_expedited_ops = { 741static struct rcu_torture_ops srcu_expedited_ops = {
750 .init = srcu_torture_init, 742 .init = rcu_sync_torture_init,
751 .cleanup = srcu_torture_cleanup,
752 .readlock = srcu_torture_read_lock, 743 .readlock = srcu_torture_read_lock,
753 .read_delay = srcu_read_delay, 744 .read_delay = srcu_read_delay,
754 .readunlock = srcu_torture_read_unlock, 745 .readunlock = srcu_torture_read_unlock,
@@ -783,7 +774,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
783 774
784static struct rcu_torture_ops sched_ops = { 775static struct rcu_torture_ops sched_ops = {
785 .init = rcu_sync_torture_init, 776 .init = rcu_sync_torture_init,
786 .cleanup = NULL,
787 .readlock = sched_torture_read_lock, 777 .readlock = sched_torture_read_lock,
788 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 778 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
789 .readunlock = sched_torture_read_unlock, 779 .readunlock = sched_torture_read_unlock,
@@ -799,7 +789,6 @@ static struct rcu_torture_ops sched_ops = {
799 789
800static struct rcu_torture_ops sched_sync_ops = { 790static struct rcu_torture_ops sched_sync_ops = {
801 .init = rcu_sync_torture_init, 791 .init = rcu_sync_torture_init,
802 .cleanup = NULL,
803 .readlock = sched_torture_read_lock, 792 .readlock = sched_torture_read_lock,
804 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 793 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
805 .readunlock = sched_torture_read_unlock, 794 .readunlock = sched_torture_read_unlock,
@@ -814,7 +803,6 @@ static struct rcu_torture_ops sched_sync_ops = {
814 803
815static struct rcu_torture_ops sched_expedited_ops = { 804static struct rcu_torture_ops sched_expedited_ops = {
816 .init = rcu_sync_torture_init, 805 .init = rcu_sync_torture_init,
817 .cleanup = NULL,
818 .readlock = sched_torture_read_lock, 806 .readlock = sched_torture_read_lock,
819 .read_delay = rcu_read_delay, /* just reuse rcu's version. */ 807 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
820 .readunlock = sched_torture_read_unlock, 808 .readunlock = sched_torture_read_unlock,
@@ -872,7 +860,7 @@ static int rcu_torture_boost(void *arg)
872 /* Wait for the next test interval. */ 860 /* Wait for the next test interval. */
873 oldstarttime = boost_starttime; 861 oldstarttime = boost_starttime;
874 while (ULONG_CMP_LT(jiffies, oldstarttime)) { 862 while (ULONG_CMP_LT(jiffies, oldstarttime)) {
875 schedule_timeout_uninterruptible(1); 863 schedule_timeout_interruptible(oldstarttime - jiffies);
876 rcu_stutter_wait("rcu_torture_boost"); 864 rcu_stutter_wait("rcu_torture_boost");
877 if (kthread_should_stop() || 865 if (kthread_should_stop() ||
878 fullstop != FULLSTOP_DONTSTOP) 866 fullstop != FULLSTOP_DONTSTOP)
@@ -1055,7 +1043,6 @@ void rcutorture_trace_dump(void)
1055 return; 1043 return;
1056 if (atomic_xchg(&beenhere, 1) != 0) 1044 if (atomic_xchg(&beenhere, 1) != 0)
1057 return; 1045 return;
1058 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
1059 ftrace_dump(DUMP_ALL); 1046 ftrace_dump(DUMP_ALL);
1060} 1047}
1061 1048
@@ -1069,13 +1056,16 @@ static void rcu_torture_timer(unsigned long unused)
1069{ 1056{
1070 int idx; 1057 int idx;
1071 int completed; 1058 int completed;
1059 int completed_end;
1072 static DEFINE_RCU_RANDOM(rand); 1060 static DEFINE_RCU_RANDOM(rand);
1073 static DEFINE_SPINLOCK(rand_lock); 1061 static DEFINE_SPINLOCK(rand_lock);
1074 struct rcu_torture *p; 1062 struct rcu_torture *p;
1075 int pipe_count; 1063 int pipe_count;
1064 unsigned long long ts;
1076 1065
1077 idx = cur_ops->readlock(); 1066 idx = cur_ops->readlock();
1078 completed = cur_ops->completed(); 1067 completed = cur_ops->completed();
1068 ts = rcu_trace_clock_local();
1079 p = rcu_dereference_check(rcu_torture_current, 1069 p = rcu_dereference_check(rcu_torture_current,
1080 rcu_read_lock_bh_held() || 1070 rcu_read_lock_bh_held() ||
1081 rcu_read_lock_sched_held() || 1071 rcu_read_lock_sched_held() ||
@@ -1085,7 +1075,6 @@ static void rcu_torture_timer(unsigned long unused)
1085 cur_ops->readunlock(idx); 1075 cur_ops->readunlock(idx);
1086 return; 1076 return;
1087 } 1077 }
1088 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1089 if (p->rtort_mbtest == 0) 1078 if (p->rtort_mbtest == 0)
1090 atomic_inc(&n_rcu_torture_mberror); 1079 atomic_inc(&n_rcu_torture_mberror);
1091 spin_lock(&rand_lock); 1080 spin_lock(&rand_lock);
@@ -1098,10 +1087,14 @@ static void rcu_torture_timer(unsigned long unused)
1098 /* Should not happen, but... */ 1087 /* Should not happen, but... */
1099 pipe_count = RCU_TORTURE_PIPE_LEN; 1088 pipe_count = RCU_TORTURE_PIPE_LEN;
1100 } 1089 }
1101 if (pipe_count > 1) 1090 completed_end = cur_ops->completed();
1091 if (pipe_count > 1) {
1092 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
1093 completed, completed_end);
1102 rcutorture_trace_dump(); 1094 rcutorture_trace_dump();
1095 }
1103 __this_cpu_inc(rcu_torture_count[pipe_count]); 1096 __this_cpu_inc(rcu_torture_count[pipe_count]);
1104 completed = cur_ops->completed() - completed; 1097 completed = completed_end - completed;
1105 if (completed > RCU_TORTURE_PIPE_LEN) { 1098 if (completed > RCU_TORTURE_PIPE_LEN) {
1106 /* Should not happen, but... */ 1099 /* Should not happen, but... */
1107 completed = RCU_TORTURE_PIPE_LEN; 1100 completed = RCU_TORTURE_PIPE_LEN;
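
[Editor's note] The timer and reader paths now sample ->completed exactly once at the end of the read-side critical section (completed_end) and reuse that single snapshot both for the new trace call and for the pipeline-delay computation, instead of calling cur_ops->completed() a second time. A trivial userspace sketch of that bookkeeping, with made-up counter values and rcutorture's pipe length of 10:

#include <stdio.h>

#define RCU_TORTURE_PIPE_LEN 10 /* as in rcutorture.c */

int main(void)
{
        int completed = 42;     /* snapshot taken just after cur_ops->readlock() */
        int completed_end = 45; /* single snapshot taken at the end of the read */
        int delta = completed_end - completed;

        if (delta > RCU_TORTURE_PIPE_LEN)       /* "Should not happen, but..." */
                delta = RCU_TORTURE_PIPE_LEN;
        printf("grace periods elapsed inside the read-side section: %d\n", delta);
        return 0;
}
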
@@ -1121,11 +1114,13 @@ static int
1121rcu_torture_reader(void *arg) 1114rcu_torture_reader(void *arg)
1122{ 1115{
1123 int completed; 1116 int completed;
1117 int completed_end;
1124 int idx; 1118 int idx;
1125 DEFINE_RCU_RANDOM(rand); 1119 DEFINE_RCU_RANDOM(rand);
1126 struct rcu_torture *p; 1120 struct rcu_torture *p;
1127 int pipe_count; 1121 int pipe_count;
1128 struct timer_list t; 1122 struct timer_list t;
1123 unsigned long long ts;
1129 1124
1130 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 1125 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
1131 set_user_nice(current, 19); 1126 set_user_nice(current, 19);
@@ -1139,6 +1134,7 @@ rcu_torture_reader(void *arg)
1139 } 1134 }
1140 idx = cur_ops->readlock(); 1135 idx = cur_ops->readlock();
1141 completed = cur_ops->completed(); 1136 completed = cur_ops->completed();
1137 ts = rcu_trace_clock_local();
1142 p = rcu_dereference_check(rcu_torture_current, 1138 p = rcu_dereference_check(rcu_torture_current,
1143 rcu_read_lock_bh_held() || 1139 rcu_read_lock_bh_held() ||
1144 rcu_read_lock_sched_held() || 1140 rcu_read_lock_sched_held() ||
@@ -1149,7 +1145,6 @@ rcu_torture_reader(void *arg)
1149 schedule_timeout_interruptible(HZ); 1145 schedule_timeout_interruptible(HZ);
1150 continue; 1146 continue;
1151 } 1147 }
1152 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1153 if (p->rtort_mbtest == 0) 1148 if (p->rtort_mbtest == 0)
1154 atomic_inc(&n_rcu_torture_mberror); 1149 atomic_inc(&n_rcu_torture_mberror);
1155 cur_ops->read_delay(&rand); 1150 cur_ops->read_delay(&rand);
@@ -1159,10 +1154,14 @@ rcu_torture_reader(void *arg)
1159 /* Should not happen, but... */ 1154 /* Should not happen, but... */
1160 pipe_count = RCU_TORTURE_PIPE_LEN; 1155 pipe_count = RCU_TORTURE_PIPE_LEN;
1161 } 1156 }
1162 if (pipe_count > 1) 1157 completed_end = cur_ops->completed();
1158 if (pipe_count > 1) {
1159 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
1160 ts, completed, completed_end);
1163 rcutorture_trace_dump(); 1161 rcutorture_trace_dump();
1162 }
1164 __this_cpu_inc(rcu_torture_count[pipe_count]); 1163 __this_cpu_inc(rcu_torture_count[pipe_count]);
1165 completed = cur_ops->completed() - completed; 1164 completed = completed_end - completed;
1166 if (completed > RCU_TORTURE_PIPE_LEN) { 1165 if (completed > RCU_TORTURE_PIPE_LEN) {
1167 /* Should not happen, but... */ 1166 /* Should not happen, but... */
1168 completed = RCU_TORTURE_PIPE_LEN; 1167 completed = RCU_TORTURE_PIPE_LEN;
@@ -1328,19 +1327,35 @@ static void rcu_torture_shuffle_tasks(void)
1328 set_cpus_allowed_ptr(reader_tasks[i], 1327 set_cpus_allowed_ptr(reader_tasks[i],
1329 shuffle_tmp_mask); 1328 shuffle_tmp_mask);
1330 } 1329 }
1331
1332 if (fakewriter_tasks) { 1330 if (fakewriter_tasks) {
1333 for (i = 0; i < nfakewriters; i++) 1331 for (i = 0; i < nfakewriters; i++)
1334 if (fakewriter_tasks[i]) 1332 if (fakewriter_tasks[i])
1335 set_cpus_allowed_ptr(fakewriter_tasks[i], 1333 set_cpus_allowed_ptr(fakewriter_tasks[i],
1336 shuffle_tmp_mask); 1334 shuffle_tmp_mask);
1337 } 1335 }
1338
1339 if (writer_task) 1336 if (writer_task)
1340 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); 1337 set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
1341
1342 if (stats_task) 1338 if (stats_task)
1343 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); 1339 set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
1340 if (stutter_task)
1341 set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
1342 if (fqs_task)
1343 set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
1344 if (shutdown_task)
1345 set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
1346#ifdef CONFIG_HOTPLUG_CPU
1347 if (onoff_task)
1348 set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
1349#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1350 if (stall_task)
1351 set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
1352 if (barrier_cbs_tasks)
1353 for (i = 0; i < n_barrier_cbs; i++)
1354 if (barrier_cbs_tasks[i])
1355 set_cpus_allowed_ptr(barrier_cbs_tasks[i],
1356 shuffle_tmp_mask);
1357 if (barrier_task)
1358 set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
1344 1359
1345 if (rcu_idle_cpu == -1) 1360 if (rcu_idle_cpu == -1)
1346 rcu_idle_cpu = num_online_cpus() - 1; 1361 rcu_idle_cpu = num_online_cpus() - 1;
@@ -1396,12 +1411,16 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1396 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1411 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1397 "test_boost=%d/%d test_boost_interval=%d " 1412 "test_boost=%d/%d test_boost_interval=%d "
1398 "test_boost_duration=%d shutdown_secs=%d " 1413 "test_boost_duration=%d shutdown_secs=%d "
1414 "stall_cpu=%d stall_cpu_holdoff=%d "
1415 "n_barrier_cbs=%d "
1399 "onoff_interval=%d onoff_holdoff=%d\n", 1416 "onoff_interval=%d onoff_holdoff=%d\n",
1400 torture_type, tag, nrealreaders, nfakewriters, 1417 torture_type, tag, nrealreaders, nfakewriters,
1401 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1418 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1402 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1419 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1403 test_boost, cur_ops->can_boost, 1420 test_boost, cur_ops->can_boost,
1404 test_boost_interval, test_boost_duration, shutdown_secs, 1421 test_boost_interval, test_boost_duration, shutdown_secs,
1422 stall_cpu, stall_cpu_holdoff,
1423 n_barrier_cbs,
1405 onoff_interval, onoff_holdoff); 1424 onoff_interval, onoff_holdoff);
1406} 1425}
1407 1426
@@ -1502,6 +1521,7 @@ rcu_torture_onoff(void *arg)
1502 unsigned long delta; 1521 unsigned long delta;
1503 int maxcpu = -1; 1522 int maxcpu = -1;
1504 DEFINE_RCU_RANDOM(rand); 1523 DEFINE_RCU_RANDOM(rand);
1524 int ret;
1505 unsigned long starttime; 1525 unsigned long starttime;
1506 1526
1507 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); 1527 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
@@ -1522,7 +1542,13 @@ rcu_torture_onoff(void *arg)
1522 torture_type, cpu); 1542 torture_type, cpu);
1523 starttime = jiffies; 1543 starttime = jiffies;
1524 n_offline_attempts++; 1544 n_offline_attempts++;
1525 if (cpu_down(cpu) == 0) { 1545 ret = cpu_down(cpu);
1546 if (ret) {
1547 if (verbose)
1548 pr_alert("%s" TORTURE_FLAG
1549 "rcu_torture_onoff task: offline %d failed: errno %d\n",
1550 torture_type, cpu, ret);
1551 } else {
1526 if (verbose) 1552 if (verbose)
1527 pr_alert("%s" TORTURE_FLAG 1553 pr_alert("%s" TORTURE_FLAG
1528 "rcu_torture_onoff task: offlined %d\n", 1554 "rcu_torture_onoff task: offlined %d\n",
@@ -1765,7 +1791,7 @@ static int rcu_torture_barrier_init(void)
1765 barrier_cbs_wq = 1791 barrier_cbs_wq =
1766 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), 1792 kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]),
1767 GFP_KERNEL); 1793 GFP_KERNEL);
1768 if (barrier_cbs_tasks == NULL || barrier_cbs_wq == 0) 1794 if (barrier_cbs_tasks == NULL || !barrier_cbs_wq)
1769 return -ENOMEM; 1795 return -ENOMEM;
1770 for (i = 0; i < n_barrier_cbs; i++) { 1796 for (i = 0; i < n_barrier_cbs; i++) {
1771 init_waitqueue_head(&barrier_cbs_wq[i]); 1797 init_waitqueue_head(&barrier_cbs_wq[i]);
@@ -1936,8 +1962,6 @@ rcu_torture_cleanup(void)
1936 1962
1937 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 1963 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
1938 1964
1939 if (cur_ops->cleanup)
1940 cur_ops->cleanup();
1941 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) 1965 if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
1942 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1966 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1943 else if (n_online_successes != n_online_attempts || 1967 else if (n_online_successes != n_online_attempts ||
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 74df86bd9204..5b8ad827fd86 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -68,9 +68,9 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
68 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 69 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 70 .fqs_state = RCU_GP_IDLE, \
71 .gpnum = -300, \ 71 .gpnum = 0UL - 300UL, \
72 .completed = -300, \ 72 .completed = 0UL - 300UL, \
73 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \ 73 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 75 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
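
[Editor's note] Writing the initializers as 0UL - 300UL keeps ->gpnum and ->completed firmly in unsigned arithmetic, which is what the wrap-safe ULONG_CMP_*() comparisons used throughout this file rely on; starting 300 steps short of wraparound also makes wrap bugs surface early. A minimal userspace sketch of the idiom, with the macros shown roughly as the RCU headers define them:

#include <assert.h>
#include <limits.h>

#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
        unsigned long completed = 0UL - 300UL;  /* same idiom as the initializer:
                                                   300 steps short of wraparound */
        unsigned long gpnum = completed + 1;

        assert(ULONG_CMP_GE(gpnum, completed)); /* "at least as recent", despite wrap */
        assert(ULONG_CMP_LT(completed, gpnum)); /* "strictly older", despite wrap */
        return 0;
}
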
@@ -105,7 +105,7 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
105 * The rcu_scheduler_active variable transitions from zero to one just 105 * The rcu_scheduler_active variable transitions from zero to one just
106 * before the first task is spawned. So when this variable is zero, RCU 106 * before the first task is spawned. So when this variable is zero, RCU
107 * can assume that there is but one task, allowing RCU to (for example) 107 * can assume that there is but one task, allowing RCU to (for example)
108 * optimized synchronize_sched() to a simple barrier(). When this variable 108 * optimize synchronize_sched() to a simple barrier(). When this variable
109 * is one, RCU must actually do all the hard work required to detect real 109 * is one, RCU must actually do all the hard work required to detect real
110 * grace periods. This variable is also used to suppress boot-time false 110 * grace periods. This variable is also used to suppress boot-time false
111 * positives from lockdep-RCU error checking. 111 * positives from lockdep-RCU error checking.
@@ -207,24 +207,15 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
207DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 207DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
208 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 208 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
209 .dynticks = ATOMIC_INIT(1), 209 .dynticks = ATOMIC_INIT(1),
210#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE)
211 .ignore_user_qs = true,
212#endif
213}; 210};
214 211
215static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 212static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */
216static int qhimark = 10000; /* If this many pending, ignore blimit. */ 213static long qhimark = 10000; /* If this many pending, ignore blimit. */
217static int qlowmark = 100; /* Once only this many pending, use blimit. */ 214static long qlowmark = 100; /* Once only this many pending, use blimit. */
218 215
219module_param(blimit, int, 0444); 216module_param(blimit, long, 0444);
220module_param(qhimark, int, 0444); 217module_param(qhimark, long, 0444);
221module_param(qlowmark, int, 0444); 218module_param(qlowmark, long, 0444);
222
223int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
224int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
225
226module_param(rcu_cpu_stall_suppress, int, 0644);
227module_param(rcu_cpu_stall_timeout, int, 0644);
228 219
229static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; 220static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
230static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; 221static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
@@ -303,18 +294,32 @@ EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
303static int 294static int
304cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) 295cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
305{ 296{
306 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; 297 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
298 rdp->nxttail[RCU_DONE_TAIL] != NULL;
307} 299}
308 300
309/* 301/*
310 * Does the current CPU require a yet-as-unscheduled grace period? 302 * Does the current CPU require a not-yet-started grace period?
303 * The caller must have disabled interrupts to prevent races with
304 * normal callback registry.
311 */ 305 */
312static int 306static int
313cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 307cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
314{ 308{
315 return *rdp->nxttail[RCU_DONE_TAIL + 309 int i;
316 ACCESS_ONCE(rsp->completed) != rdp->completed] && 310
317 !rcu_gp_in_progress(rsp); 311 if (rcu_gp_in_progress(rsp))
312 return 0; /* No, a grace period is already in progress. */
313 if (!rdp->nxttail[RCU_NEXT_TAIL])
314 return 0; /* No, this is a no-CBs (or offline) CPU. */
315 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
316 return 1; /* Yes, this CPU has newly registered callbacks. */
317 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
318 if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
319 ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
320 rdp->nxtcompleted[i]))
321 return 1; /* Yes, CBs for future grace period. */
322 return 0; /* No grace period needed. */
318} 323}
319 324
320/* 325/*
@@ -335,7 +340,7 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
335static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 340static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
336 bool user) 341 bool user)
337{ 342{
338 trace_rcu_dyntick("Start", oldval, 0); 343 trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
339 if (!user && !is_idle_task(current)) { 344 if (!user && !is_idle_task(current)) {
340 struct task_struct *idle = idle_task(smp_processor_id()); 345 struct task_struct *idle = idle_task(smp_processor_id());
341 346
@@ -416,29 +421,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
416 */ 421 */
417void rcu_user_enter(void) 422void rcu_user_enter(void)
418{ 423{
419 unsigned long flags; 424 rcu_eqs_enter(1);
420 struct rcu_dynticks *rdtp;
421
422 /*
423 * Some contexts may involve an exception occuring in an irq,
424 * leading to that nesting:
425 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
426 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
427 * helpers are enough to protect RCU uses inside the exception. So
428 * just return immediately if we detect we are in an IRQ.
429 */
430 if (in_interrupt())
431 return;
432
433 WARN_ON_ONCE(!current->mm);
434
435 local_irq_save(flags);
436 rdtp = &__get_cpu_var(rcu_dynticks);
437 if (!rdtp->ignore_user_qs && !rdtp->in_user) {
438 rdtp->in_user = true;
439 rcu_eqs_enter(true);
440 }
441 local_irq_restore(flags);
442} 425}
443 426
444/** 427/**
@@ -575,27 +558,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
575 */ 558 */
576void rcu_user_exit(void) 559void rcu_user_exit(void)
577{ 560{
578 unsigned long flags; 561 rcu_eqs_exit(1);
579 struct rcu_dynticks *rdtp;
580
581 /*
582 * Some contexts may involve an exception occuring in an irq,
583 * leading to that nesting:
584 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
585 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
586 * helpers are enough to protect RCU uses inside the exception. So
587 * just return immediately if we detect we are in an IRQ.
588 */
589 if (in_interrupt())
590 return;
591
592 local_irq_save(flags);
593 rdtp = &__get_cpu_var(rcu_dynticks);
594 if (rdtp->in_user) {
595 rdtp->in_user = false;
596 rcu_eqs_exit(true);
597 }
598 local_irq_restore(flags);
599} 562}
600 563
601/** 564/**
@@ -718,21 +681,6 @@ int rcu_is_cpu_idle(void)
718} 681}
719EXPORT_SYMBOL(rcu_is_cpu_idle); 682EXPORT_SYMBOL(rcu_is_cpu_idle);
720 683
721#ifdef CONFIG_RCU_USER_QS
722void rcu_user_hooks_switch(struct task_struct *prev,
723 struct task_struct *next)
724{
725 struct rcu_dynticks *rdtp;
726
727 /* Interrupts are disabled in context switch */
728 rdtp = &__get_cpu_var(rcu_dynticks);
729 if (!rdtp->ignore_user_qs) {
730 clear_tsk_thread_flag(prev, TIF_NOHZ);
731 set_tsk_thread_flag(next, TIF_NOHZ);
732 }
733}
734#endif /* #ifdef CONFIG_RCU_USER_QS */
735
736#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 684#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
737 685
738/* 686/*
@@ -783,7 +731,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
783 * interrupt from idle, return true. The caller must have at least 731 * interrupt from idle, return true. The caller must have at least
784 * disabled preemption. 732 * disabled preemption.
785 */ 733 */
786int rcu_is_cpu_rrupt_from_idle(void) 734static int rcu_is_cpu_rrupt_from_idle(void)
787{ 735{
788 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 736 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
789} 737}
@@ -849,28 +797,33 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
849 return 0; 797 return 0;
850} 798}
851 799
852static int jiffies_till_stall_check(void) 800static void record_gp_stall_check_time(struct rcu_state *rsp)
853{ 801{
854 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); 802 rsp->gp_start = jiffies;
855 803 rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
856 /*
857 * Limit check must be consistent with the Kconfig limits
858 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
859 */
860 if (till_stall_check < 3) {
861 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
862 till_stall_check = 3;
863 } else if (till_stall_check > 300) {
864 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
865 till_stall_check = 300;
866 }
867 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
868} 804}
869 805
870static void record_gp_stall_check_time(struct rcu_state *rsp) 806/*
807 * Dump stacks of all tasks running on stalled CPUs. This is a fallback
808 * for architectures that do not implement trigger_all_cpu_backtrace().
809 * The NMI-triggered stack traces are more accurate because they are
810 * printed by the target CPU.
811 */
812static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
871{ 813{
872 rsp->gp_start = jiffies; 814 int cpu;
873 rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); 815 unsigned long flags;
816 struct rcu_node *rnp;
817
818 rcu_for_each_leaf_node(rsp, rnp) {
819 raw_spin_lock_irqsave(&rnp->lock, flags);
820 if (rnp->qsmask != 0) {
821 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
822 if (rnp->qsmask & (1UL << cpu))
823 dump_cpu_task(rnp->grplo + cpu);
824 }
825 raw_spin_unlock_irqrestore(&rnp->lock, flags);
826 }
874} 827}
875 828
876static void print_other_cpu_stall(struct rcu_state *rsp) 829static void print_other_cpu_stall(struct rcu_state *rsp)
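
[Editor's note] The new rcu_dump_cpu_stacks() walks each leaf rcu_node and dumps one stack per CPU whose bit is still set in ->qsmask, i.e. per CPU that has not yet reported a quiescent state for the stalled grace period. The bit-to-CPU mapping is the only subtle part; a standalone sketch (printf standing in for dump_cpu_task(), made-up mask and CPU range):

#include <stdio.h>

int main(void)
{
        unsigned long qsmask = 0x15;    /* illustrative: CPUs at offsets 0, 2, 4 */
        int grplo = 8, grphi = 15;      /* illustrative leaf-node CPU range */
        int cpu;

        for (cpu = 0; cpu <= grphi - grplo; cpu++)
                if (qsmask & (1UL << cpu))
                        printf("would dump stack of CPU %d\n", grplo + cpu);
        return 0;
}
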
@@ -880,6 +833,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
880 unsigned long flags; 833 unsigned long flags;
881 int ndetected = 0; 834 int ndetected = 0;
882 struct rcu_node *rnp = rcu_get_root(rsp); 835 struct rcu_node *rnp = rcu_get_root(rsp);
836 long totqlen = 0;
883 837
884 /* Only let one CPU complain about others per time interval. */ 838 /* Only let one CPU complain about others per time interval. */
885 839
@@ -889,7 +843,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
889 raw_spin_unlock_irqrestore(&rnp->lock, flags); 843 raw_spin_unlock_irqrestore(&rnp->lock, flags);
890 return; 844 return;
891 } 845 }
892 rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; 846 rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
893 raw_spin_unlock_irqrestore(&rnp->lock, flags); 847 raw_spin_unlock_irqrestore(&rnp->lock, flags);
894 848
895 /* 849 /*
@@ -924,12 +878,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
924 raw_spin_unlock_irqrestore(&rnp->lock, flags); 878 raw_spin_unlock_irqrestore(&rnp->lock, flags);
925 879
926 print_cpu_stall_info_end(); 880 print_cpu_stall_info_end();
927 printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", 881 for_each_possible_cpu(cpu)
928 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 882 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
883 pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
884 smp_processor_id(), (long)(jiffies - rsp->gp_start),
885 rsp->gpnum, rsp->completed, totqlen);
929 if (ndetected == 0) 886 if (ndetected == 0)
930 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 887 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
931 else if (!trigger_all_cpu_backtrace()) 888 else if (!trigger_all_cpu_backtrace())
932 dump_stack(); 889 rcu_dump_cpu_stacks(rsp);
933 890
934 /* Complain about tasks blocking the grace period. */ 891 /* Complain about tasks blocking the grace period. */
935 892
@@ -940,8 +897,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
940 897
941static void print_cpu_stall(struct rcu_state *rsp) 898static void print_cpu_stall(struct rcu_state *rsp)
942{ 899{
900 int cpu;
943 unsigned long flags; 901 unsigned long flags;
944 struct rcu_node *rnp = rcu_get_root(rsp); 902 struct rcu_node *rnp = rcu_get_root(rsp);
903 long totqlen = 0;
945 904
946 /* 905 /*
947 * OK, time to rat on ourselves... 906 * OK, time to rat on ourselves...
@@ -952,14 +911,17 @@ static void print_cpu_stall(struct rcu_state *rsp)
952 print_cpu_stall_info_begin(); 911 print_cpu_stall_info_begin();
953 print_cpu_stall_info(rsp, smp_processor_id()); 912 print_cpu_stall_info(rsp, smp_processor_id());
954 print_cpu_stall_info_end(); 913 print_cpu_stall_info_end();
955 printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); 914 for_each_possible_cpu(cpu)
915 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
916 pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
917 jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
956 if (!trigger_all_cpu_backtrace()) 918 if (!trigger_all_cpu_backtrace())
957 dump_stack(); 919 dump_stack();
958 920
959 raw_spin_lock_irqsave(&rnp->lock, flags); 921 raw_spin_lock_irqsave(&rnp->lock, flags);
960 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 922 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
961 rsp->jiffies_stall = jiffies + 923 rsp->jiffies_stall = jiffies +
962 3 * jiffies_till_stall_check() + 3; 924 3 * rcu_jiffies_till_stall_check() + 3;
963 raw_spin_unlock_irqrestore(&rnp->lock, flags); 925 raw_spin_unlock_irqrestore(&rnp->lock, flags);
964 926
965 set_need_resched(); /* kick ourselves to get things going. */ 927 set_need_resched(); /* kick ourselves to get things going. */
@@ -990,12 +952,6 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
990 } 952 }
991} 953}
992 954
993static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
994{
995 rcu_cpu_stall_suppress = 1;
996 return NOTIFY_DONE;
997}
998
999/** 955/**
1000 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period 956 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
1001 * 957 *
@@ -1013,15 +969,6 @@ void rcu_cpu_stall_reset(void)
1013 rsp->jiffies_stall = jiffies + ULONG_MAX / 2; 969 rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
1014} 970}
1015 971
1016static struct notifier_block rcu_panic_block = {
1017 .notifier_call = rcu_panic,
1018};
1019
1020static void __init check_cpu_stall_init(void)
1021{
1022 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
1023}
1024
1025/* 972/*
1026 * Update CPU-local rcu_data state to record the newly noticed grace period. 973 * Update CPU-local rcu_data state to record the newly noticed grace period.
1027 * This is used both when we started the grace period and when we notice 974 * This is used both when we started the grace period and when we notice
@@ -1091,6 +1038,146 @@ static void init_callback_list(struct rcu_data *rdp)
1091 rdp->nxtlist = NULL; 1038 rdp->nxtlist = NULL;
1092 for (i = 0; i < RCU_NEXT_SIZE; i++) 1039 for (i = 0; i < RCU_NEXT_SIZE; i++)
1093 rdp->nxttail[i] = &rdp->nxtlist; 1040 rdp->nxttail[i] = &rdp->nxtlist;
1041 init_nocb_callback_list(rdp);
1042}
1043
1044/*
1045 * Determine the value that ->completed will have at the end of the
1046 * next subsequent grace period. This is used to tag callbacks so that
1047 * a CPU can invoke callbacks in a timely fashion even if that CPU has
1048 * been dyntick-idle for an extended period with callbacks under the
1049 * influence of RCU_FAST_NO_HZ.
1050 *
1051 * The caller must hold rnp->lock with interrupts disabled.
1052 */
1053static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1054 struct rcu_node *rnp)
1055{
1056 /*
1057 * If RCU is idle, we just wait for the next grace period.
1058 * But we can only be sure that RCU is idle if we are looking
1059 * at the root rcu_node structure -- otherwise, a new grace
1060 * period might have started, but just not yet gotten around
1061 * to initializing the current non-root rcu_node structure.
1062 */
1063 if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
1064 return rnp->completed + 1;
1065
1066 /*
1067 * Otherwise, wait for a possible partial grace period and
1068 * then the subsequent full grace period.
1069 */
1070 return rnp->completed + 2;
1071}
1072
1073/*
1074 * If there is room, assign a ->completed number to any callbacks on
1075 * this CPU that have not already been assigned. Also accelerate any
1076 * callbacks that were previously assigned a ->completed number that has
1077 * since proven to be too conservative, which can happen if callbacks get
1078 * assigned a ->completed number while RCU is idle, but with reference to
1079 * a non-root rcu_node structure. This function is idempotent, so it does
1080 * not hurt to call it repeatedly.
1081 *
1082 * The caller must hold rnp->lock with interrupts disabled.
1083 */
1084static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1085 struct rcu_data *rdp)
1086{
1087 unsigned long c;
1088 int i;
1089
1090 /* If the CPU has no callbacks, nothing to do. */
1091 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1092 return;
1093
1094 /*
1095 * Starting from the sublist containing the callbacks most
1096 * recently assigned a ->completed number and working down, find the
1097 * first sublist that is not assignable to an upcoming grace period.
1098 * Such a sublist has something in it (first two tests) and has
1099 * a ->completed number assigned that will complete sooner than
1100 * the ->completed number for newly arrived callbacks (last test).
1101 *
1102 * The key point is that any later sublist can be assigned the
1103 * same ->completed number as the newly arrived callbacks, which
1104 * means that the callbacks in any of these later sublist can be
1105 * grouped into a single sublist, whether or not they have already
1106 * been assigned a ->completed number.
1107 */
1108 c = rcu_cbs_completed(rsp, rnp);
1109 for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
1110 if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
1111 !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
1112 break;
1113
1114 /*
1115 * If there are no sublist for unassigned callbacks, leave.
1116 * At the same time, advance "i" one sublist, so that "i" will
1117 * index into the sublist where all the remaining callbacks should
1118 * be grouped into.
1119 */
1120 if (++i >= RCU_NEXT_TAIL)
1121 return;
1122
1123 /*
1124 * Assign all subsequent callbacks' ->completed number to the next
1125 * full grace period and group them all in the sublist initially
1126 * indexed by "i".
1127 */
1128 for (; i <= RCU_NEXT_TAIL; i++) {
1129 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
1130 rdp->nxtcompleted[i] = c;
1131 }
1132
1133 /* Trace depending on how much we were able to accelerate. */
1134 if (!*rdp->nxttail[RCU_WAIT_TAIL])
1135 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB");
1136 else
1137 trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB");
1138}
1139
1140/*
1141 * Move any callbacks whose grace period has completed to the
1142 * RCU_DONE_TAIL sublist, then compact the remaining sublists and
1143 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
1144 * sublist. This function is idempotent, so it does not hurt to
1145 * invoke it repeatedly. As long as it is not invoked -too- often...
1146 *
1147 * The caller must hold rnp->lock with interrupts disabled.
1148 */
1149static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1150 struct rcu_data *rdp)
1151{
1152 int i, j;
1153
1154 /* If the CPU has no callbacks, nothing to do. */
1155 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
1156 return;
1157
1158 /*
1159 * Find all callbacks whose ->completed numbers indicate that they
1160 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1161 */
1162 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
1163 if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
1164 break;
1165 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
1166 }
1167 /* Clean up any sublist tail pointers that were misordered above. */
1168 for (j = RCU_WAIT_TAIL; j < i; j++)
1169 rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
1170
1171 /* Copy down callbacks to fill in empty sublists. */
1172 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
1173 if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
1174 break;
1175 rdp->nxttail[j] = rdp->nxttail[i];
1176 rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
1177 }
1178
1179 /* Classify any remaining callbacks. */
1180 rcu_accelerate_cbs(rsp, rnp, rdp);
1094} 1181}
1095 1182
1096/* 1183/*
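
[Editor's note] rcu_cbs_completed() above answers one question: which ->completed value will certainly cover callbacks queued right now? Only at the root rcu_node, with no grace period in flight, is the very next grace period (completed + 1) guaranteed to suffice; anywhere else the code must allow for a grace period that may already be partially initialized, hence completed + 2. A simplified stand-alone sketch of just that decision (the struct is a stand-in, not the kernel's rcu_node):

#include <stdio.h>

/* Simplified stand-in for the relevant rcu_node fields. */
struct node {
        unsigned long gpnum;
        unsigned long completed;
        int is_root;
};

static unsigned long cbs_completed(const struct node *np)
{
        if (np->is_root && np->gpnum == np->completed)
                return np->completed + 1;       /* RCU idle: the next GP suffices */
        return np->completed + 2;               /* allow for a partial GP, then a full one */
}

int main(void)
{
        struct node idle_root = { .gpnum = 100, .completed = 100, .is_root = 1 };
        struct node busy_leaf = { .gpnum = 101, .completed = 100, .is_root = 0 };

        printf("idle root: tag new callbacks with completed == %lu\n",
               cbs_completed(&idle_root));
        printf("non-root (or busy): tag new callbacks with completed == %lu\n",
               cbs_completed(&busy_leaf));
        return 0;
}
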
@@ -1103,12 +1190,15 @@ static void
1103__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) 1190__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
1104{ 1191{
1105 /* Did another grace period end? */ 1192 /* Did another grace period end? */
1106 if (rdp->completed != rnp->completed) { 1193 if (rdp->completed == rnp->completed) {
1194
1195 /* No, so just accelerate recent callbacks. */
1196 rcu_accelerate_cbs(rsp, rnp, rdp);
1197
1198 } else {
1107 1199
1108 /* Advance callbacks. No harm if list empty. */ 1200 /* Advance callbacks. */
1109 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; 1201 rcu_advance_cbs(rsp, rnp, rdp);
1110 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
1111 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
1112 1202
1113 /* Remember that we saw this grace-period completion. */ 1203 /* Remember that we saw this grace-period completion. */
1114 rdp->completed = rnp->completed; 1204 rdp->completed = rnp->completed;
@@ -1404,15 +1494,30 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
1404 !cpu_needs_another_gp(rsp, rdp)) { 1494 !cpu_needs_another_gp(rsp, rdp)) {
1405 /* 1495 /*
1406 * Either we have not yet spawned the grace-period 1496 * Either we have not yet spawned the grace-period
1407 * task or this CPU does not need another grace period. 1497 * task, this CPU does not need another grace period,
1498 * or a grace period is already in progress.
1408 * Either way, don't start a new grace period. 1499 * Either way, don't start a new grace period.
1409 */ 1500 */
1410 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1501 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1411 return; 1502 return;
1412 } 1503 }
1413 1504
1505 /*
1506 * Because there is no grace period in progress right now,
1507 * any callbacks we have up to this point will be satisfied
1508 * by the next grace period. So this is a good place to
1509 * assign a grace period number to recently posted callbacks.
1510 */
1511 rcu_accelerate_cbs(rsp, rnp, rdp);
1512
1414 rsp->gp_flags = RCU_GP_FLAG_INIT; 1513 rsp->gp_flags = RCU_GP_FLAG_INIT;
1415 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1514 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1515
1516 /* Ensure that CPU is aware of completion of last grace period. */
1517 rcu_process_gp_end(rsp, rdp);
1518 local_irq_restore(flags);
1519
1520 /* Wake up rcu_gp_kthread() to start the grace period. */
1416 wake_up(&rsp->gp_wq); 1521 wake_up(&rsp->gp_wq);
1417} 1522}
1418 1523
@@ -1528,7 +1633,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
1528 * This GP can't end until cpu checks in, so all of our 1633 * This GP can't end until cpu checks in, so all of our
1529 * callbacks can be processed during the next GP. 1634 * callbacks can be processed during the next GP.
1530 */ 1635 */
1531 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1636 rcu_accelerate_cbs(rsp, rnp, rdp);
1532 1637
1533 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 1638 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
1534 } 1639 }
@@ -1573,16 +1678,20 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1573/* 1678/*
1574 * Send the specified CPU's RCU callbacks to the orphanage. The 1679 * Send the specified CPU's RCU callbacks to the orphanage. The
1575 * specified CPU must be offline, and the caller must hold the 1680 * specified CPU must be offline, and the caller must hold the
1576 * ->onofflock. 1681 * ->orphan_lock.
1577 */ 1682 */
1578static void 1683static void
1579rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 1684rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1580 struct rcu_node *rnp, struct rcu_data *rdp) 1685 struct rcu_node *rnp, struct rcu_data *rdp)
1581{ 1686{
1687 /* No-CBs CPUs do not have orphanable callbacks. */
1688 if (is_nocb_cpu(rdp->cpu))
1689 return;
1690
1582 /* 1691 /*
1583 * Orphan the callbacks. First adjust the counts. This is safe 1692 * Orphan the callbacks. First adjust the counts. This is safe
1584 * because ->onofflock excludes _rcu_barrier()'s adoption of 1693 * because _rcu_barrier() excludes CPU-hotplug operations, so it
1585 * the callbacks, thus no memory barrier is required. 1694 * cannot be running now. Thus no memory barrier is required.
1586 */ 1695 */
1587 if (rdp->nxtlist != NULL) { 1696 if (rdp->nxtlist != NULL) {
1588 rsp->qlen_lazy += rdp->qlen_lazy; 1697 rsp->qlen_lazy += rdp->qlen_lazy;
@@ -1623,13 +1732,17 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1623 1732
1624/* 1733/*
1625 * Adopt the RCU callbacks from the specified rcu_state structure's 1734 * Adopt the RCU callbacks from the specified rcu_state structure's
1626 * orphanage. The caller must hold the ->onofflock. 1735 * orphanage. The caller must hold the ->orphan_lock.
1627 */ 1736 */
1628static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) 1737static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
1629{ 1738{
1630 int i; 1739 int i;
1631 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1740 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1632 1741
1742 /* No-CBs CPUs are handled specially. */
1743 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp))
1744 return;
1745
1633 /* Do the accounting first. */ 1746 /* Do the accounting first. */
1634 rdp->qlen_lazy += rsp->qlen_lazy; 1747 rdp->qlen_lazy += rsp->qlen_lazy;
1635 rdp->qlen += rsp->qlen; 1748 rdp->qlen += rsp->qlen;
@@ -1702,7 +1815,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1702 1815
1703 /* Exclude any attempts to start a new grace period. */ 1816 /* Exclude any attempts to start a new grace period. */
1704 mutex_lock(&rsp->onoff_mutex); 1817 mutex_lock(&rsp->onoff_mutex);
1705 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1818 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
1706 1819
1707 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 1820 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
1708 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 1821 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
@@ -1729,10 +1842,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1729 /* 1842 /*
1730 * We still hold the leaf rcu_node structure lock here, and 1843 * We still hold the leaf rcu_node structure lock here, and
1731 * irqs are still disabled. The reason for this subterfuge is 1844 * irqs are still disabled. The reason for this subterfuge is
1732 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1845 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock
1733 * held leads to deadlock. 1846 * held leads to deadlock.
1734 */ 1847 */
1735 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1848 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */
1736 rnp = rdp->mynode; 1849 rnp = rdp->mynode;
1737 if (need_report & RCU_OFL_TASKS_NORM_GP) 1850 if (need_report & RCU_OFL_TASKS_NORM_GP)
1738 rcu_report_unblock_qs_rnp(rnp, flags); 1851 rcu_report_unblock_qs_rnp(rnp, flags);
@@ -1769,9 +1882,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1769{ 1882{
1770 unsigned long flags; 1883 unsigned long flags;
1771 struct rcu_head *next, *list, **tail; 1884 struct rcu_head *next, *list, **tail;
1772 int bl, count, count_lazy, i; 1885 long bl, count, count_lazy;
1886 int i;
1773 1887
1774 /* If no callbacks are ready, just return.*/ 1888 /* If no callbacks are ready, just return. */
1775 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1889 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1776 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); 1890 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
1777 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), 1891 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
@@ -2000,19 +2114,19 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2000 2114
2001 WARN_ON_ONCE(rdp->beenonline == 0); 2115 WARN_ON_ONCE(rdp->beenonline == 0);
2002 2116
2003 /* 2117 /* Handle the end of a grace period that some other CPU ended. */
2004 * Advance callbacks in response to end of earlier grace
2005 * period that some other CPU ended.
2006 */
2007 rcu_process_gp_end(rsp, rdp); 2118 rcu_process_gp_end(rsp, rdp);
2008 2119
2009 /* Update RCU state based on any recent quiescent states. */ 2120 /* Update RCU state based on any recent quiescent states. */
2010 rcu_check_quiescent_state(rsp, rdp); 2121 rcu_check_quiescent_state(rsp, rdp);
2011 2122
2012 /* Does this CPU require a not-yet-started grace period? */ 2123 /* Does this CPU require a not-yet-started grace period? */
2124 local_irq_save(flags);
2013 if (cpu_needs_another_gp(rsp, rdp)) { 2125 if (cpu_needs_another_gp(rsp, rdp)) {
2014 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 2126 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2015 rcu_start_gp(rsp, flags); /* releases above lock */ 2127 rcu_start_gp(rsp, flags); /* releases above lock */
2128 } else {
2129 local_irq_restore(flags);
2016 } 2130 }
2017 2131
2018 /* If there are callbacks ready, invoke them. */ 2132 /* If there are callbacks ready, invoke them. */
@@ -2107,9 +2221,15 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2107 } 2221 }
2108} 2222}
2109 2223
2224/*
2225 * Helper function for call_rcu() and friends. The cpu argument will
2226 * normally be -1, indicating "currently running CPU". It may specify
2227 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
2228 * is expected to specify a CPU.
2229 */
2110static void 2230static void
2111__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 2231__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2112 struct rcu_state *rsp, bool lazy) 2232 struct rcu_state *rsp, int cpu, bool lazy)
2113{ 2233{
2114 unsigned long flags; 2234 unsigned long flags;
2115 struct rcu_data *rdp; 2235 struct rcu_data *rdp;
@@ -2129,9 +2249,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2129 rdp = this_cpu_ptr(rsp->rda); 2249 rdp = this_cpu_ptr(rsp->rda);
2130 2250
2131 /* Add the callback to our list. */ 2251 /* Add the callback to our list. */
2132 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { 2252 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) {
2253 int offline;
2254
2255 if (cpu != -1)
2256 rdp = per_cpu_ptr(rsp->rda, cpu);
2257 offline = !__call_rcu_nocb(rdp, head, lazy);
2258 WARN_ON_ONCE(offline);
2133 /* _call_rcu() is illegal on offline CPU; leak the callback. */ 2259 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2134 WARN_ON_ONCE(1);
2135 local_irq_restore(flags); 2260 local_irq_restore(flags);
2136 return; 2261 return;
2137 } 2262 }
@@ -2160,7 +2285,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2160 */ 2285 */
2161void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2286void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2162{ 2287{
2163 __call_rcu(head, func, &rcu_sched_state, 0); 2288 __call_rcu(head, func, &rcu_sched_state, -1, 0);
2164} 2289}
2165EXPORT_SYMBOL_GPL(call_rcu_sched); 2290EXPORT_SYMBOL_GPL(call_rcu_sched);
2166 2291
@@ -2169,7 +2294,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
2169 */ 2294 */
2170void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2295void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
2171{ 2296{
2172 __call_rcu(head, func, &rcu_bh_state, 0); 2297 __call_rcu(head, func, &rcu_bh_state, -1, 0);
2173} 2298}
2174EXPORT_SYMBOL_GPL(call_rcu_bh); 2299EXPORT_SYMBOL_GPL(call_rcu_bh);
2175 2300
@@ -2205,10 +2330,28 @@ static inline int rcu_blocking_is_gp(void)
2205 * rcu_read_lock_sched(). 2330 * rcu_read_lock_sched().
2206 * 2331 *
2207 * This means that all preempt_disable code sequences, including NMI and 2332 * This means that all preempt_disable code sequences, including NMI and
2208 * hardware-interrupt handlers, in progress on entry will have completed 2333 * non-threaded hardware-interrupt handlers, in progress on entry will
2209 * before this primitive returns. However, this does not guarantee that 2334 * have completed before this primitive returns. However, this does not
2210 * softirq handlers will have completed, since in some kernels, these 2335 * guarantee that softirq handlers will have completed, since in some
2211 * handlers can run in process context, and can block. 2336 * kernels, these handlers can run in process context, and can block.
2337 *
2338 * Note that this guarantee implies further memory-ordering guarantees.
2339 * On systems with more than one CPU, when synchronize_sched() returns,
2340 * each CPU is guaranteed to have executed a full memory barrier since the
2341 * end of its last RCU-sched read-side critical section whose beginning
2342 * preceded the call to synchronize_sched(). In addition, each CPU having
2343 * an RCU read-side critical section that extends beyond the return from
2344 * synchronize_sched() is guaranteed to have executed a full memory barrier
2345 * after the beginning of synchronize_sched() and before the beginning of
2346 * that RCU read-side critical section. Note that these guarantees include
2347 * CPUs that are offline, idle, or executing in user mode, as well as CPUs
2348 * that are executing in the kernel.
2349 *
2350 * Furthermore, if CPU A invoked synchronize_sched(), which returned
2351 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
2352 * to have executed a full memory barrier during the execution of
2353 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but
2354 * again only if the system has more than one CPU).
2212 * 2355 *
2213 * This primitive provides the guarantees made by the (now removed) 2356 * This primitive provides the guarantees made by the (now removed)
2214 * synchronize_kernel() API. In contrast, synchronize_rcu() only 2357 * synchronize_kernel() API. In contrast, synchronize_rcu() only
@@ -2224,7 +2367,10 @@ void synchronize_sched(void)
2224 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 2367 "Illegal synchronize_sched() in RCU-sched read-side critical section");
2225 if (rcu_blocking_is_gp()) 2368 if (rcu_blocking_is_gp())
2226 return; 2369 return;
2227 wait_rcu_gp(call_rcu_sched); 2370 if (rcu_expedited)
2371 synchronize_sched_expedited();
2372 else
2373 wait_rcu_gp(call_rcu_sched);
2228} 2374}
2229EXPORT_SYMBOL_GPL(synchronize_sched); 2375EXPORT_SYMBOL_GPL(synchronize_sched);
2230 2376
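
[Editor's note] The expanded comment spells out what synchronize_sched() promises: every pre-existing preempt-disabled region (including non-threaded interrupt handlers) has finished by the time it returns, with full memory barriers implied on all CPUs involved. The hedged kernel-style sketch below shows the classic update pattern those guarantees exist for; my_entry, my_list, my_lock, and my_remove are illustrative names only.

#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_entry {
        struct list_head node;
        int key;
};

static LIST_HEAD(my_list);
static DEFINE_SPINLOCK(my_lock);

static void my_remove(struct my_entry *e)
{
        spin_lock(&my_lock);
        list_del_rcu(&e->node);         /* readers may still be traversing e */
        spin_unlock(&my_lock);

        synchronize_sched();            /* every pre-existing preempt-disabled
                                         * (RCU-sched) reader has now finished */
        kfree(e);                       /* safe: no reader can still see e */
}
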
@@ -2236,6 +2382,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
2236 * read-side critical sections have completed. RCU read-side critical 2382 * read-side critical sections have completed. RCU read-side critical
2237 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 2383 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
2238 * and may be nested. 2384 * and may be nested.
2385 *
2386 * See the description of synchronize_sched() for more detailed information
2387 * on memory ordering guarantees.
2239 */ 2388 */
2240void synchronize_rcu_bh(void) 2389void synchronize_rcu_bh(void)
2241{ 2390{
@@ -2245,13 +2394,13 @@ void synchronize_rcu_bh(void)
2245 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 2394 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
2246 if (rcu_blocking_is_gp()) 2395 if (rcu_blocking_is_gp())
2247 return; 2396 return;
2248 wait_rcu_gp(call_rcu_bh); 2397 if (rcu_expedited)
2398 synchronize_rcu_bh_expedited();
2399 else
2400 wait_rcu_gp(call_rcu_bh);
2249} 2401}
2250EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 2402EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
2251 2403
2252static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
2253static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
2254
2255static int synchronize_sched_expedited_cpu_stop(void *data) 2404static int synchronize_sched_expedited_cpu_stop(void *data)
2256{ 2405{
2257 /* 2406 /*
@@ -2308,10 +2457,32 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
2308 */ 2457 */
2309void synchronize_sched_expedited(void) 2458void synchronize_sched_expedited(void)
2310{ 2459{
2311 int firstsnap, s, snap, trycount = 0; 2460 long firstsnap, s, snap;
2461 int trycount = 0;
2462 struct rcu_state *rsp = &rcu_sched_state;
2463
2464 /*
2465 * If we are in danger of counter wrap, just do synchronize_sched().
2466 * By allowing sync_sched_expedited_started to advance no more than
2467 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
2468 * that more than 3.5 billion CPUs would be required to force a
2469 * counter wrap on a 32-bit system. Quite a few more CPUs would of
2470 * course be required on a 64-bit system.
2471 */
2472 if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
2473 (ulong)atomic_long_read(&rsp->expedited_done) +
2474 ULONG_MAX / 8)) {
2475 synchronize_sched();
2476 atomic_long_inc(&rsp->expedited_wrap);
2477 return;
2478 }
2312 2479
2313 /* Note that atomic_inc_return() implies full memory barrier. */ 2480 /*
2314 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); 2481 * Take a ticket. Note that atomic_inc_return() implies a
2482 * full memory barrier.
2483 */
2484 snap = atomic_long_inc_return(&rsp->expedited_start);
2485 firstsnap = snap;
2315 get_online_cpus(); 2486 get_online_cpus();
2316 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 2487 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2317 2488
@@ -2323,48 +2494,65 @@ void synchronize_sched_expedited(void)
2323 synchronize_sched_expedited_cpu_stop, 2494 synchronize_sched_expedited_cpu_stop,
2324 NULL) == -EAGAIN) { 2495 NULL) == -EAGAIN) {
2325 put_online_cpus(); 2496 put_online_cpus();
2497 atomic_long_inc(&rsp->expedited_tryfail);
2498
2499 /* Check to see if someone else did our work for us. */
2500 s = atomic_long_read(&rsp->expedited_done);
2501 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2502 /* ensure test happens before caller kfree */
2503 smp_mb__before_atomic_inc(); /* ^^^ */
2504 atomic_long_inc(&rsp->expedited_workdone1);
2505 return;
2506 }
2326 2507
2327 /* No joy, try again later. Or just synchronize_sched(). */ 2508 /* No joy, try again later. Or just synchronize_sched(). */
2328 if (trycount++ < 10) { 2509 if (trycount++ < 10) {
2329 udelay(trycount * num_online_cpus()); 2510 udelay(trycount * num_online_cpus());
2330 } else { 2511 } else {
2331 synchronize_sched(); 2512 wait_rcu_gp(call_rcu_sched);
2513 atomic_long_inc(&rsp->expedited_normal);
2332 return; 2514 return;
2333 } 2515 }
2334 2516
2335 /* Check to see if someone else did our work for us. */ 2517 /* Recheck to see if someone else did our work for us. */
2336 s = atomic_read(&sync_sched_expedited_done); 2518 s = atomic_long_read(&rsp->expedited_done);
2337 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { 2519 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
2338 smp_mb(); /* ensure test happens before caller kfree */ 2520 /* ensure test happens before caller kfree */
2521 smp_mb__before_atomic_inc(); /* ^^^ */
2522 atomic_long_inc(&rsp->expedited_workdone2);
2339 return; 2523 return;
2340 } 2524 }
2341 2525
2342 /* 2526 /*
2343 * Refetching sync_sched_expedited_started allows later 2527 * Refetching sync_sched_expedited_started allows later
2344 * callers to piggyback on our grace period. We subtract 2528 * callers to piggyback on our grace period. We retry
2345 * 1 to get the same token that the last incrementer got. 2529 * after they started, so our grace period works for them,
2346 * We retry after they started, so our grace period works 2530 * and they started after our first try, so their grace
2347 * for them, and they started after our first try, so their 2531 * period works for us.
2348 * grace period works for us.
2349 */ 2532 */
2350 get_online_cpus(); 2533 get_online_cpus();
2351 snap = atomic_read(&sync_sched_expedited_started); 2534 snap = atomic_long_read(&rsp->expedited_start);
2352 smp_mb(); /* ensure read is before try_stop_cpus(). */ 2535 smp_mb(); /* ensure read is before try_stop_cpus(). */
2353 } 2536 }
2537 atomic_long_inc(&rsp->expedited_stoppedcpus);
2354 2538
2355 /* 2539 /*
2356 * Everyone up to our most recent fetch is covered by our grace 2540 * Everyone up to our most recent fetch is covered by our grace
2357 * period. Update the counter, but only if our work is still 2541 * period. Update the counter, but only if our work is still
2358 * relevant -- which it won't be if someone who started later 2542 * relevant -- which it won't be if someone who started later
2359 * than we did beat us to the punch. 2543 * than we did already did their update.
2360 */ 2544 */
2361 do { 2545 do {
2362 s = atomic_read(&sync_sched_expedited_done); 2546 atomic_long_inc(&rsp->expedited_done_tries);
2363 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { 2547 s = atomic_long_read(&rsp->expedited_done);
2364 smp_mb(); /* ensure test happens before caller kfree */ 2548 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
2549 /* ensure test happens before caller kfree */
2550 smp_mb__before_atomic_inc(); /* ^^^ */
2551 atomic_long_inc(&rsp->expedited_done_lost);
2365 break; 2552 break;
2366 } 2553 }
2367 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); 2554 } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
2555 atomic_long_inc(&rsp->expedited_done_exit);
2368 2556
2369 put_online_cpus(); 2557 put_online_cpus();
2370} 2558}
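The function above is a ticket algorithm: every caller takes a ticket from expedited_start, a successful try_stop_cpus() pass advances expedited_done up to the caller's snapshot, and any caller whose ticket is already covered by expedited_done returns without doing further work, piggybacking on someone else's pass; after a handful of failures the real code falls back to a normal grace period. All comparisons go through ULONG_CMP_GE() so they stay correct across counter wrap. A compressed, standalone sketch of that scheme follows; it is illustrative only, with hypothetical names, a trivial stand-in for the stop-CPUs step, and C11 seq_cst atomics standing in for the kernel's explicit barriers.

/*
 * Illustrative sketch of the expedited-grace-period ticket scheme.
 * Not the kernel code: do_stop_cpus() is a trivial stand-in for
 * try_stop_cpus(), and the counters stand in for rsp->expedited_start
 * and rsp->expedited_done.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <limits.h>

#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (unsigned long)((a) - (b)))

static atomic_long start_ticket;        /* like rsp->expedited_start */
static atomic_long done_ticket;         /* like rsp->expedited_done  */

static bool do_stop_cpus(void)          /* stand-in: pretend it can fail */
{
        static int calls;
        return ++calls % 3 != 0;
}

void expedited_gp_sketch(void)
{
        long snap = atomic_fetch_add(&start_ticket, 1) + 1;  /* take a ticket */

        while (!do_stop_cpus()) {
                /* Did a concurrent caller's pass already cover our ticket? */
                if (ULONG_CMP_GE(atomic_load(&done_ticket), snap))
                        return;         /* yes: piggyback on their grace period */
                /* No: refetch the latest ticket so that the pass we are about
                 * to retry also covers callers who arrived in the meantime. */
                snap = atomic_load(&start_ticket);
        }

        /* Advance done_ticket to our snapshot, but never move it backwards. */
        for (long s = atomic_load(&done_ticket);
             !ULONG_CMP_GE(s, snap) &&
             !atomic_compare_exchange_weak(&done_ticket, &s, snap);)
                ;       /* a later caller won the race; their value covers ours */
}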
@@ -2558,9 +2746,17 @@ static void _rcu_barrier(struct rcu_state *rsp)
2558 * When that callback is invoked, we will know that all of the 2746 * When that callback is invoked, we will know that all of the
2559 * corresponding CPU's preceding callbacks have been invoked. 2747 * corresponding CPU's preceding callbacks have been invoked.
2560 */ 2748 */
2561 for_each_online_cpu(cpu) { 2749 for_each_possible_cpu(cpu) {
2750 if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
2751 continue;
2562 rdp = per_cpu_ptr(rsp->rda, cpu); 2752 rdp = per_cpu_ptr(rsp->rda, cpu);
2563 if (ACCESS_ONCE(rdp->qlen)) { 2753 if (is_nocb_cpu(cpu)) {
2754 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
2755 rsp->n_barrier_done);
2756 atomic_inc(&rsp->barrier_cpu_count);
2757 __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
2758 rsp, cpu, 0);
2759 } else if (ACCESS_ONCE(rdp->qlen)) {
2564 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 2760 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
2565 rsp->n_barrier_done); 2761 rsp->n_barrier_done);
2566 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 2762 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
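The hunk above extends rcu_barrier()'s scheme to offloaded CPUs: the barrier posts one sentinel callback behind whatever callbacks each CPU already has queued (now including the no-CBs queues), counts the sentinels it posted, and waits until the last one runs. The count starts at one so an early-finishing sentinel cannot declare completion before the initiator has finished posting. A standalone sketch of that counting discipline, with stand-ins for posting the sentinel and for the completion the kernel really waits on:

/*
 * Sketch of the rcu_barrier() counting discipline; not the kernel code.
 * enqueue_sentinel_on() is a stand-in that runs the sentinel immediately;
 * the real code queues rcu_barrier_callback() behind the CPU's callbacks
 * and waits on a struct completion rather than spinning.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

static atomic_int pending;

static void sentinel_done(void)          /* runs after a CPU's earlier callbacks */
{
        atomic_fetch_sub(&pending, 1);
}

static bool cpu_has_callbacks(int cpu)   /* stand-in */
{
        return cpu % 2 == 0;
}

static void enqueue_sentinel_on(int cpu, void (*fn)(void))  /* stand-in */
{
        (void)cpu;
        fn();
}

void barrier_sketch(int nr_cpus)
{
        atomic_store(&pending, 1);       /* initiator's own reference */
        for (int cpu = 0; cpu < nr_cpus; cpu++)
                if (cpu_has_callbacks(cpu)) {
                        atomic_fetch_add(&pending, 1);
                        enqueue_sentinel_on(cpu, sentinel_done);
                }
        atomic_fetch_sub(&pending, 1);   /* drop the initiator's reference */
        while (atomic_load(&pending) != 0)
                sched_yield();           /* kernel: wait_for_completion() */
}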
@@ -2629,11 +2825,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2629 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2825 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2630 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 2826 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2631 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2827 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2632#ifdef CONFIG_RCU_USER_QS
2633 WARN_ON_ONCE(rdp->dynticks->in_user);
2634#endif
2635 rdp->cpu = cpu; 2828 rdp->cpu = cpu;
2636 rdp->rsp = rsp; 2829 rdp->rsp = rsp;
2830 rcu_boot_init_nocb_percpu_data(rdp);
2637 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2831 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2638} 2832}
2639 2833
@@ -2715,6 +2909,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2715 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2909 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2716 struct rcu_node *rnp = rdp->mynode; 2910 struct rcu_node *rnp = rdp->mynode;
2717 struct rcu_state *rsp; 2911 struct rcu_state *rsp;
2912 int ret = NOTIFY_OK;
2718 2913
2719 trace_rcu_utilization("Start CPU hotplug"); 2914 trace_rcu_utilization("Start CPU hotplug");
2720 switch (action) { 2915 switch (action) {
@@ -2728,7 +2923,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2728 rcu_boost_kthread_setaffinity(rnp, -1); 2923 rcu_boost_kthread_setaffinity(rnp, -1);
2729 break; 2924 break;
2730 case CPU_DOWN_PREPARE: 2925 case CPU_DOWN_PREPARE:
2731 rcu_boost_kthread_setaffinity(rnp, cpu); 2926 if (nocb_cpu_expendable(cpu))
2927 rcu_boost_kthread_setaffinity(rnp, cpu);
2928 else
2929 ret = NOTIFY_BAD;
2732 break; 2930 break;
2733 case CPU_DYING: 2931 case CPU_DYING:
2734 case CPU_DYING_FROZEN: 2932 case CPU_DYING_FROZEN:
@@ -2752,7 +2950,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2752 break; 2950 break;
2753 } 2951 }
2754 trace_rcu_utilization("End CPU hotplug"); 2952 trace_rcu_utilization("End CPU hotplug");
2755 return NOTIFY_OK; 2953 return ret;
2756} 2954}
2757 2955
2758/* 2956/*
@@ -2772,6 +2970,7 @@ static int __init rcu_spawn_gp_kthread(void)
2772 raw_spin_lock_irqsave(&rnp->lock, flags); 2970 raw_spin_lock_irqsave(&rnp->lock, flags);
2773 rsp->gp_kthread = t; 2971 rsp->gp_kthread = t;
2774 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2972 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2973 rcu_spawn_nocb_kthreads(rsp);
2775 } 2974 }
2776 return 0; 2975 return 0;
2777} 2976}
@@ -2842,6 +3041,10 @@ static void __init rcu_init_one(struct rcu_state *rsp,
2842 3041
2843 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ 3042 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
2844 3043
3044 /* Silence gcc 4.8 warning about array index out of range. */
3045 if (rcu_num_lvls > RCU_NUM_LVLS)
3046 panic("rcu_init_one: rcu_num_lvls overflow");
3047
2845 /* Initialize the level-tracking arrays. */ 3048 /* Initialize the level-tracking arrays. */
2846 3049
2847 for (i = 0; i < rcu_num_lvls; i++) 3050 for (i = 0; i < rcu_num_lvls; i++)
@@ -2967,6 +3170,7 @@ void __init rcu_init(void)
2967 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 3170 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
2968 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3171 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
2969 __rcu_init_preempt(); 3172 __rcu_init_preempt();
3173 rcu_init_nocb();
2970 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 3174 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
2971 3175
2972 /* 3176 /*
@@ -2977,7 +3181,6 @@ void __init rcu_init(void)
2977 cpu_notifier(rcu_cpu_notify, 0); 3181 cpu_notifier(rcu_cpu_notify, 0);
2978 for_each_online_cpu(cpu) 3182 for_each_online_cpu(cpu)
2979 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3183 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
2980 check_cpu_stall_init();
2981} 3184}
2982 3185
2983#include "rcutree_plugin.h" 3186#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index a240f032848e..c896b5045d9d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -102,10 +102,6 @@ struct rcu_dynticks {
102 /* idle-period nonlazy_posted snapshot. */ 102 /* idle-period nonlazy_posted snapshot. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105#ifdef CONFIG_RCU_USER_QS
106 bool ignore_user_qs; /* Treat userspace as extended QS or not */
107 bool in_user; /* Is the CPU in userland from RCU POV? */
108#endif
109}; 105};
110 106
111/* RCU's kthread states for tracing. */ 107/* RCU's kthread states for tracing. */
@@ -282,11 +278,14 @@ struct rcu_data {
282 */ 278 */
283 struct rcu_head *nxtlist; 279 struct rcu_head *nxtlist;
284 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 280 struct rcu_head **nxttail[RCU_NEXT_SIZE];
281 unsigned long nxtcompleted[RCU_NEXT_SIZE];
282 /* grace periods for sublists. */
285 long qlen_lazy; /* # of lazy queued callbacks */ 283 long qlen_lazy; /* # of lazy queued callbacks */
286 long qlen; /* # of queued callbacks, incl lazy */ 284 long qlen; /* # of queued callbacks, incl lazy */
287 long qlen_last_fqs_check; 285 long qlen_last_fqs_check;
288 /* qlen at last check for QS forcing */ 286 /* qlen at last check for QS forcing */
289 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 287 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
288 unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
290 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ 289 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
291 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ 290 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
292 unsigned long n_force_qs_snap; 291 unsigned long n_force_qs_snap;
@@ -317,6 +316,18 @@ struct rcu_data {
317 struct rcu_head oom_head; 316 struct rcu_head oom_head;
318#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 317#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
319 318
319 /* 7) Callback offloading. */
320#ifdef CONFIG_RCU_NOCB_CPU
321 struct rcu_head *nocb_head; /* CBs waiting for kthread. */
322 struct rcu_head **nocb_tail;
323 atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
324 atomic_long_t nocb_q_count_lazy; /* (approximate). */
325 int nocb_p_count; /* # CBs being invoked by kthread */
326 int nocb_p_count_lazy; /* (approximate). */
327 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
328 struct task_struct *nocb_kthread;
329#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
330
320 int cpu; 331 int cpu;
321 struct rcu_state *rsp; 332 struct rcu_state *rsp;
322}; 333};
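The nocb_head/nocb_tail pair added above is the usual callback-list shape: a singly linked list of rcu_head structures plus a pointer to the last ->next field (or to the head while the list is empty), so a whole chain can be appended in O(1) and the tail pointer gives the xchg()-based enqueue added in rcutree_plugin.h something atomic to swing. A minimal single-threaded sketch of that shape, with hypothetical names:

/* Head/tail-pointer list like nocb_head/nocb_tail; single-threaded sketch.
 * The kernel swings the tail with xchg() so concurrent enqueuers stay safe. */
#include <stddef.h>

struct cb {
        struct cb *next;
};

struct cb_list {
        struct cb *head;
        struct cb **tail;       /* &head while empty, else &last_element->next */
};

void cb_list_init(struct cb_list *l)
{
        l->head = NULL;
        l->tail = &l->head;
}

/* Append the chain first .. *last_next in O(1), regardless of its length. */
void cb_list_append(struct cb_list *l, struct cb *first, struct cb **last_next)
{
        *l->tail = first;
        l->tail = last_next;
}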
@@ -330,11 +341,6 @@ struct rcu_data {
330 341
331#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 342#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
332 343
333#ifdef CONFIG_PROVE_RCU
334#define RCU_STALL_DELAY_DELTA (5 * HZ)
335#else
336#define RCU_STALL_DELAY_DELTA 0
337#endif
338#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 344#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
339 /* to take at least one */ 345 /* to take at least one */
340 /* scheduling clock irq */ 346 /* scheduling clock irq */
@@ -369,6 +375,12 @@ struct rcu_state {
369 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 375 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
370 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 376 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
371 void (*func)(struct rcu_head *head)); 377 void (*func)(struct rcu_head *head));
378#ifdef CONFIG_RCU_NOCB_CPU
379 void (*call_remote)(struct rcu_head *head,
380 void (*func)(struct rcu_head *head));
381 /* call_rcu() flavor, but for */
382 /* placing on remote CPU. */
383#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
372 384
373 /* The following fields are guarded by the root rcu_node's lock. */ 385 /* The following fields are guarded by the root rcu_node's lock. */
374 386
@@ -383,9 +395,8 @@ struct rcu_state {
383 395
384 /* End of fields guarded by root rcu_node's lock. */ 396 /* End of fields guarded by root rcu_node's lock. */
385 397
386 raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; 398 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
387 /* exclude on/offline and */ 399 /* Protect following fields. */
388 /* starting new GP. */
389 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ 400 struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
390 /* need a grace period. */ 401 /* need a grace period. */
391 struct rcu_head **orphan_nxttail; /* Tail of above. */ 402 struct rcu_head **orphan_nxttail; /* Tail of above. */
@@ -394,7 +405,7 @@ struct rcu_state {
394 struct rcu_head **orphan_donetail; /* Tail of above. */ 405 struct rcu_head **orphan_donetail; /* Tail of above. */
395 long qlen_lazy; /* Number of lazy callbacks. */ 406 long qlen_lazy; /* Number of lazy callbacks. */
396 long qlen; /* Total number of callbacks. */ 407 long qlen; /* Total number of callbacks. */
397 /* End of fields guarded by onofflock. */ 408 /* End of fields guarded by orphan_lock. */
398 409
399 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ 410 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
400 411
@@ -405,6 +416,18 @@ struct rcu_state {
405 /* _rcu_barrier(). */ 416 /* _rcu_barrier(). */
406 /* End of fields guarded by barrier_mutex. */ 417 /* End of fields guarded by barrier_mutex. */
407 418
419 atomic_long_t expedited_start; /* Starting ticket. */
420 atomic_long_t expedited_done; /* Done ticket. */
421 atomic_long_t expedited_wrap; /* # near-wrap incidents. */
422 atomic_long_t expedited_tryfail; /* # acquisition failures. */
423 atomic_long_t expedited_workdone1; /* # done by others #1. */
424 atomic_long_t expedited_workdone2; /* # done by others #2. */
425 atomic_long_t expedited_normal; /* # fallbacks to normal. */
426 atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
427 atomic_long_t expedited_done_tries; /* # tries to update _done. */
428 atomic_long_t expedited_done_lost; /* # times beaten to _done. */
429 atomic_long_t expedited_done_exit; /* # times exited _done loop. */
430
408 unsigned long jiffies_force_qs; /* Time at which to invoke */ 431 unsigned long jiffies_force_qs; /* Time at which to invoke */
409 /* force_quiescent_state(). */ 432 /* force_quiescent_state(). */
410 unsigned long n_force_qs; /* Number of calls to */ 433 unsigned long n_force_qs; /* Number of calls to */
@@ -428,6 +451,8 @@ struct rcu_state {
428#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ 451#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
429 452
430extern struct list_head rcu_struct_flavors; 453extern struct list_head rcu_struct_flavors;
454
455/* Sequence through rcu_state structures for each RCU flavor. */
431#define for_each_rcu_flavor(rsp) \ 456#define for_each_rcu_flavor(rsp) \
432 list_for_each_entry((rsp), &rcu_struct_flavors, flavors) 457 list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
433 458
@@ -504,5 +529,32 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
504static void print_cpu_stall_info_end(void); 529static void print_cpu_stall_info_end(void);
505static void zero_cpu_stall_ticks(struct rcu_data *rdp); 530static void zero_cpu_stall_ticks(struct rcu_data *rdp);
506static void increment_cpu_stall_ticks(void); 531static void increment_cpu_stall_ticks(void);
532static bool is_nocb_cpu(int cpu);
533static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
534 bool lazy);
535static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
536 struct rcu_data *rdp);
537static bool nocb_cpu_expendable(int cpu);
538static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
539static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
540static void init_nocb_callback_list(struct rcu_data *rdp);
541static void __init rcu_init_nocb(void);
507 542
508#endif /* #ifndef RCU_TREE_NONCORE */ 543#endif /* #ifndef RCU_TREE_NONCORE */
544
545#ifdef CONFIG_RCU_TRACE
546#ifdef CONFIG_RCU_NOCB_CPU
547/* Sum up queue lengths for tracing. */
548static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
549{
550 *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
551 *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
552}
553#else /* #ifdef CONFIG_RCU_NOCB_CPU */
554static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
555{
556 *ql = 0;
557 *qll = 0;
558}
559#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
560#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index f92115488187..c1cc7e17ff9d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,7 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/gfp.h>
28#include <linux/oom.h> 29#include <linux/oom.h>
29#include <linux/smpboot.h> 30#include <linux/smpboot.h>
30 31
@@ -36,6 +37,13 @@
36#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO 37#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
37#endif 38#endif
38 39
40#ifdef CONFIG_RCU_NOCB_CPU
41static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
42static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */
 43static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */
44static char __initdata nocb_buf[NR_CPUS * 5];
45#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
46
39/* 47/*
40 * Check the RCU kernel configuration parameters and print informative 48 * Check the RCU kernel configuration parameters and print informative
41 * messages about anything out of the ordinary. If you like #ifdef, you 49 * messages about anything out of the ordinary. If you like #ifdef, you
@@ -76,6 +84,18 @@ static void __init rcu_bootup_announce_oddness(void)
76 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 84 printk(KERN_INFO "\tExperimental boot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
77 if (nr_cpu_ids != NR_CPUS) 85 if (nr_cpu_ids != NR_CPUS)
78 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 86 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
87#ifdef CONFIG_RCU_NOCB_CPU
88 if (have_rcu_nocb_mask) {
89 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
90 cpumask_clear_cpu(0, rcu_nocb_mask);
91 pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
92 }
93 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
94 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
95 if (rcu_nocb_poll)
96 pr_info("\tExperimental polled no-CBs CPUs.\n");
97 }
98#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
79} 99}
80 100
81#ifdef CONFIG_TREE_PREEMPT_RCU 101#ifdef CONFIG_TREE_PREEMPT_RCU
@@ -642,7 +662,7 @@ static void rcu_preempt_do_callbacks(void)
642 */ 662 */
643void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 663void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
644{ 664{
645 __call_rcu(head, func, &rcu_preempt_state, 0); 665 __call_rcu(head, func, &rcu_preempt_state, -1, 0);
646} 666}
647EXPORT_SYMBOL_GPL(call_rcu); 667EXPORT_SYMBOL_GPL(call_rcu);
648 668
@@ -656,7 +676,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
656void kfree_call_rcu(struct rcu_head *head, 676void kfree_call_rcu(struct rcu_head *head,
657 void (*func)(struct rcu_head *rcu)) 677 void (*func)(struct rcu_head *rcu))
658{ 678{
659 __call_rcu(head, func, &rcu_preempt_state, 1); 679 __call_rcu(head, func, &rcu_preempt_state, -1, 1);
660} 680}
661EXPORT_SYMBOL_GPL(kfree_call_rcu); 681EXPORT_SYMBOL_GPL(kfree_call_rcu);
662 682
@@ -670,6 +690,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu);
670 * concurrently with new RCU read-side critical sections that began while 690 * concurrently with new RCU read-side critical sections that began while
671 * synchronize_rcu() was waiting. RCU read-side critical sections are 691 * synchronize_rcu() was waiting. RCU read-side critical sections are
672 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. 692 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
693 *
694 * See the description of synchronize_sched() for more detailed information
695 * on memory ordering guarantees.
673 */ 696 */
674void synchronize_rcu(void) 697void synchronize_rcu(void)
675{ 698{
@@ -679,7 +702,10 @@ void synchronize_rcu(void)
679 "Illegal synchronize_rcu() in RCU read-side critical section"); 702 "Illegal synchronize_rcu() in RCU read-side critical section");
680 if (!rcu_scheduler_active) 703 if (!rcu_scheduler_active)
681 return; 704 return;
682 wait_rcu_gp(call_rcu); 705 if (rcu_expedited)
706 synchronize_rcu_expedited();
707 else
708 wait_rcu_gp(call_rcu);
683} 709}
684EXPORT_SYMBOL_GPL(synchronize_rcu); 710EXPORT_SYMBOL_GPL(synchronize_rcu);
685 711
@@ -757,7 +783,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
757 * grace period for the specified rcu_node structure. If there are no such 783 * grace period for the specified rcu_node structure. If there are no such
758 * tasks, report it up the rcu_node hierarchy. 784 * tasks, report it up the rcu_node hierarchy.
759 * 785 *
760 * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. 786 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
787 * CPU hotplug operations.
761 */ 788 */
762static void 789static void
763sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 790sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -831,7 +858,7 @@ void synchronize_rcu_expedited(void)
831 udelay(trycount * num_online_cpus()); 858 udelay(trycount * num_online_cpus());
832 } else { 859 } else {
833 put_online_cpus(); 860 put_online_cpus();
834 synchronize_rcu(); 861 wait_rcu_gp(call_rcu);
835 return; 862 return;
836 } 863 }
837 } 864 }
@@ -875,6 +902,11 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
875 902
876/** 903/**
877 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 904 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
905 *
906 * Note that this primitive does not necessarily wait for an RCU grace period
907 * to complete. For example, if there are no RCU callbacks queued anywhere
908 * in the system, then rcu_barrier() is within its rights to return
909 * immediately, without waiting for anything, much less an RCU grace period.
878 */ 910 */
879void rcu_barrier(void) 911void rcu_barrier(void)
880{ 912{
@@ -1013,7 +1045,7 @@ static void rcu_preempt_check_callbacks(int cpu)
1013void kfree_call_rcu(struct rcu_head *head, 1045void kfree_call_rcu(struct rcu_head *head,
1014 void (*func)(struct rcu_head *rcu)) 1046 void (*func)(struct rcu_head *rcu))
1015{ 1047{
1016 __call_rcu(head, func, &rcu_sched_state, 1); 1048 __call_rcu(head, func, &rcu_sched_state, -1, 1);
1017} 1049}
1018EXPORT_SYMBOL_GPL(kfree_call_rcu); 1050EXPORT_SYMBOL_GPL(kfree_call_rcu);
1019 1051
@@ -2092,3 +2124,381 @@ static void increment_cpu_stall_ticks(void)
2092} 2124}
2093 2125
2094#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ 2126#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
2127
2128#ifdef CONFIG_RCU_NOCB_CPU
2129
2130/*
2131 * Offload callback processing from the boot-time-specified set of CPUs
2132 * specified by rcu_nocb_mask. For each CPU in the set, there is a
2133 * kthread created that pulls the callbacks from the corresponding CPU,
2134 * waits for a grace period to elapse, and invokes the callbacks.
2135 * The no-CBs CPUs do a wake_up() on their kthread when they insert
2136 * a callback into any empty list, unless the rcu_nocb_poll boot parameter
2137 * has been specified, in which case each kthread actively polls its
2138 * CPU. (Which isn't so great for energy efficiency, but which does
2139 * reduce RCU's overhead on that CPU.)
2140 *
2141 * This is intended to be used in conjunction with Frederic Weisbecker's
2142 * adaptive-idle work, which would seriously reduce OS jitter on CPUs
2143 * running CPU-bound user-mode computations.
2144 *
2145 * Offloading of callback processing could also in theory be used as
2146 * an energy-efficiency measure because CPUs with no RCU callbacks
2147 * queued are more aggressive about entering dyntick-idle mode.
2148 */
2149
2150
2151/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
2152static int __init rcu_nocb_setup(char *str)
2153{
2154 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
2155 have_rcu_nocb_mask = true;
2156 cpulist_parse(str, rcu_nocb_mask);
2157 return 1;
2158}
2159__setup("rcu_nocbs=", rcu_nocb_setup);
2160
2161static int __init parse_rcu_nocb_poll(char *arg)
2162{
2163 rcu_nocb_poll = 1;
2164 return 0;
2165}
2166early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2167
2168/* Is the specified CPU a no-CBs CPU? */
2169static bool is_nocb_cpu(int cpu)
2170{
2171 if (have_rcu_nocb_mask)
2172 return cpumask_test_cpu(cpu, rcu_nocb_mask);
2173 return false;
2174}
2175
2176/*
2177 * Enqueue the specified string of rcu_head structures onto the specified
2178 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2179 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
2180 * counts are supplied by rhcount and rhcount_lazy.
2181 *
2182 * If warranted, also wake up the kthread servicing this CPU's queues.
2183 */
2184static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2185 struct rcu_head *rhp,
2186 struct rcu_head **rhtp,
2187 int rhcount, int rhcount_lazy)
2188{
2189 int len;
2190 struct rcu_head **old_rhpp;
2191 struct task_struct *t;
2192
2193 /* Enqueue the callback on the nocb list and update counts. */
2194 old_rhpp = xchg(&rdp->nocb_tail, rhtp);
2195 ACCESS_ONCE(*old_rhpp) = rhp;
2196 atomic_long_add(rhcount, &rdp->nocb_q_count);
2197 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2198
2199 /* If we are not being polled and there is a kthread, awaken it ... */
2200 t = ACCESS_ONCE(rdp->nocb_kthread);
2201 if (rcu_nocb_poll | !t)
2202 return;
2203 len = atomic_long_read(&rdp->nocb_q_count);
2204 if (old_rhpp == &rdp->nocb_head) {
2205 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2206 rdp->qlen_last_fqs_check = 0;
2207 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2208 wake_up_process(t); /* ... or if many callbacks queued. */
2209 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2210 }
2211 return;
2212}
2213
2214/*
2215 * This is a helper for __call_rcu(), which invokes this when the normal
2216 * callback queue is inoperable. If this is not a no-CBs CPU, this
2217 * function returns failure back to __call_rcu(), which can complain
2218 * appropriately.
2219 *
2220 * Otherwise, this function queues the callback where the corresponding
2221 * "rcuo" kthread can find it.
2222 */
2223static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2224 bool lazy)
2225{
2226
2227 if (!is_nocb_cpu(rdp->cpu))
2228 return 0;
2229 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2230 return 1;
2231}
2232
2233/*
2234 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
2235 * not a no-CBs CPU.
2236 */
2237static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2238 struct rcu_data *rdp)
2239{
2240 long ql = rsp->qlen;
2241 long qll = rsp->qlen_lazy;
2242
2243 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2244 if (!is_nocb_cpu(smp_processor_id()))
2245 return 0;
2246 rsp->qlen = 0;
2247 rsp->qlen_lazy = 0;
2248
2249 /* First, enqueue the donelist, if any. This preserves CB ordering. */
2250 if (rsp->orphan_donelist != NULL) {
2251 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2252 rsp->orphan_donetail, ql, qll);
2253 ql = qll = 0;
2254 rsp->orphan_donelist = NULL;
2255 rsp->orphan_donetail = &rsp->orphan_donelist;
2256 }
2257 if (rsp->orphan_nxtlist != NULL) {
2258 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2259 rsp->orphan_nxttail, ql, qll);
2260 ql = qll = 0;
2261 rsp->orphan_nxtlist = NULL;
2262 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2263 }
2264 return 1;
2265}
2266
2267/*
2268 * There must be at least one non-no-CBs CPU in operation at any given
2269 * time, because no-CBs CPUs are not capable of initiating grace periods
2270 * independently. This function therefore complains if the specified
2271 * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2272 * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2273 * but you have to have a base case!)
2274 */
2275static bool nocb_cpu_expendable(int cpu)
2276{
2277 cpumask_var_t non_nocb_cpus;
2278 int ret;
2279
2280 /*
2281 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
2282 * then offlining this CPU is harmless. Let it happen.
2283 */
2284 if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
2285 return 1;
2286
2287 /* If no memory, play it safe and keep the CPU around. */
2288 if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
2289 return 0;
2290 cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
2291 cpumask_clear_cpu(cpu, non_nocb_cpus);
2292 ret = !cpumask_empty(non_nocb_cpus);
2293 free_cpumask_var(non_nocb_cpus);
2294 return ret;
2295}
2296
2297/*
2298 * Helper structure for remote registry of RCU callbacks.
2299 * This is needed for when a no-CBs CPU needs to start a grace period.
2300 * If it just invokes call_rcu(), the resulting callback will be queued,
2301 * which can result in deadlock.
2302 */
2303struct rcu_head_remote {
2304 struct rcu_head *rhp;
2305 call_rcu_func_t *crf;
2306 void (*func)(struct rcu_head *rhp);
2307};
2308
2309/*
2310 * Register a callback as specified by the rcu_head_remote struct.
2311 * This function is intended to be invoked via smp_call_function_single().
2312 */
2313static void call_rcu_local(void *arg)
2314{
2315 struct rcu_head_remote *rhrp =
2316 container_of(arg, struct rcu_head_remote, rhp);
2317
2318 rhrp->crf(rhrp->rhp, rhrp->func);
2319}
2320
2321/*
2322 * Set up an rcu_head_remote structure and then invoke call_rcu_local()
2323 * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2324 * smp_call_function_single().
2325 */
2326static void invoke_crf_remote(struct rcu_head *rhp,
2327 void (*func)(struct rcu_head *rhp),
2328 call_rcu_func_t crf)
2329{
2330 struct rcu_head_remote rhr;
2331
2332 rhr.rhp = rhp;
2333 rhr.crf = crf;
2334 rhr.func = func;
2335 smp_call_function_single(0, call_rcu_local, &rhr, 1);
2336}
2337
2338/*
2339 * Helper functions to be passed to wait_rcu_gp(), each of which
2340 * invokes invoke_crf_remote() to register a callback appropriately.
2341 */
2342static void __maybe_unused
2343call_rcu_preempt_remote(struct rcu_head *rhp,
2344 void (*func)(struct rcu_head *rhp))
2345{
2346 invoke_crf_remote(rhp, func, call_rcu);
2347}
2348static void call_rcu_bh_remote(struct rcu_head *rhp,
2349 void (*func)(struct rcu_head *rhp))
2350{
2351 invoke_crf_remote(rhp, func, call_rcu_bh);
2352}
2353static void call_rcu_sched_remote(struct rcu_head *rhp,
2354 void (*func)(struct rcu_head *rhp))
2355{
2356 invoke_crf_remote(rhp, func, call_rcu_sched);
2357}
2358
2359/*
2360 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
2361 * callbacks queued by the corresponding no-CBs CPU.
2362 */
2363static int rcu_nocb_kthread(void *arg)
2364{
2365 int c, cl;
2366 struct rcu_head *list;
2367 struct rcu_head *next;
2368 struct rcu_head **tail;
2369 struct rcu_data *rdp = arg;
2370
2371 /* Each pass through this loop invokes one batch of callbacks */
2372 for (;;) {
2373 /* If not polling, wait for next batch of callbacks. */
2374 if (!rcu_nocb_poll)
2375 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
2376 list = ACCESS_ONCE(rdp->nocb_head);
2377 if (!list) {
2378 schedule_timeout_interruptible(1);
2379 flush_signals(current);
2380 continue;
2381 }
2382
2383 /*
2384 * Extract queued callbacks, update counts, and wait
2385 * for a grace period to elapse.
2386 */
2387 ACCESS_ONCE(rdp->nocb_head) = NULL;
2388 tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2389 c = atomic_long_xchg(&rdp->nocb_q_count, 0);
2390 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2391 ACCESS_ONCE(rdp->nocb_p_count) += c;
2392 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2393 wait_rcu_gp(rdp->rsp->call_remote);
2394
2395 /* Each pass through the following loop invokes a callback. */
2396 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
2397 c = cl = 0;
2398 while (list) {
2399 next = list->next;
2400 /* Wait for enqueuing to complete, if needed. */
2401 while (next == NULL && &list->next != tail) {
2402 schedule_timeout_interruptible(1);
2403 next = list->next;
2404 }
2405 debug_rcu_head_unqueue(list);
2406 local_bh_disable();
2407 if (__rcu_reclaim(rdp->rsp->name, list))
2408 cl++;
2409 c++;
2410 local_bh_enable();
2411 list = next;
2412 }
2413 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2414 ACCESS_ONCE(rdp->nocb_p_count) -= c;
2415 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl;
2416 rdp->n_nocbs_invoked += c;
2417 }
2418 return 0;
2419}
2420
2421/* Initialize per-rcu_data variables for no-CBs CPUs. */
2422static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2423{
2424 rdp->nocb_tail = &rdp->nocb_head;
2425 init_waitqueue_head(&rdp->nocb_wq);
2426}
2427
2428/* Create a kthread for each RCU flavor for each no-CBs CPU. */
2429static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2430{
2431 int cpu;
2432 struct rcu_data *rdp;
2433 struct task_struct *t;
2434
2435 if (rcu_nocb_mask == NULL)
2436 return;
2437 for_each_cpu(cpu, rcu_nocb_mask) {
2438 rdp = per_cpu_ptr(rsp->rda, cpu);
2439 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
2440 BUG_ON(IS_ERR(t));
2441 ACCESS_ONCE(rdp->nocb_kthread) = t;
2442 }
2443}
2444
2445/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2446static void init_nocb_callback_list(struct rcu_data *rdp)
2447{
2448 if (rcu_nocb_mask == NULL ||
2449 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2450 return;
2451 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2452}
2453
2454/* Initialize the ->call_remote fields in the rcu_state structures. */
2455static void __init rcu_init_nocb(void)
2456{
2457#ifdef CONFIG_PREEMPT_RCU
2458 rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2459#endif /* #ifdef CONFIG_PREEMPT_RCU */
2460 rcu_bh_state.call_remote = call_rcu_bh_remote;
2461 rcu_sched_state.call_remote = call_rcu_sched_remote;
2462}
2463
2464#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2465
2466static bool is_nocb_cpu(int cpu)
2467{
2468 return false;
2469}
2470
2471static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2472 bool lazy)
2473{
2474 return 0;
2475}
2476
2477static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2478 struct rcu_data *rdp)
2479{
2480 return 0;
2481}
2482
2483static bool nocb_cpu_expendable(int cpu)
2484{
2485 return 1;
2486}
2487
2488static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2489{
2490}
2491
2492static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2493{
2494}
2495
2496static void init_nocb_callback_list(struct rcu_data *rdp)
2497{
2498}
2499
2500static void __init rcu_init_nocb(void)
2501{
2502}
2503
2504#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
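At the heart of the no-CBs path above is a nearly lock-free hand-off between callers and the per-CPU "rcuo" kthread: __call_rcu_nocb_enqueue() publishes a callback by xchg()ing the tail pointer and then storing into the slot it claimed (waking the kthread only when the list was empty), while rcu_nocb_kthread() detaches the whole list, swings the tail back to the head, waits for a grace period, and walks what it took, spinning briefly on any ->next slot whose enqueuer has claimed it but not yet filled it in. A compressed standalone sketch of that hand-off, with C11 atomics standing in for xchg()/ACCESS_ONCE(); a single consumer is assumed, the names are hypothetical, and the grace-period wait and callback invocation are left as comments:

/* Sketch of the no-CBs enqueue/drain hand-off; not the kernel code. */
#include <stdatomic.h>
#include <stddef.h>

struct cb {
        struct cb *_Atomic next;
};

static struct cb *_Atomic head;
static struct cb *_Atomic *_Atomic tail = &head;

void nocb_enqueue(struct cb *rhp)        /* many producers */
{
        atomic_store(&rhp->next, NULL);
        /* Claim the current tail slot, then publish the element into it. */
        struct cb *_Atomic *old = atomic_exchange(&tail, &rhp->next);
        atomic_store(old, rhp);
        /* Real code: wake the kthread only if old == &head (list was empty). */
}

void nocb_drain(void)                    /* single consumer (the kthread) */
{
        struct cb *list = atomic_load(&head);

        if (!list)
                return;                  /* real code: sleep or poll, then retry */
        atomic_store(&head, NULL);
        struct cb *_Atomic *last = atomic_exchange(&tail, &head);

        /* Real code: wait for a grace period here, then invoke each callback. */
        while (list) {
                struct cb *next = atomic_load(&list->next);

                /* An enqueuer may have claimed this slot but not stored yet. */
                while (!next && &list->next != last)
                        next = atomic_load(&list->next);
                /* invoke(list); */
                list = next;
        }
}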
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 693513bc50e6..0d095dcaa670 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,29 +46,58 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49static int show_rcubarrier(struct seq_file *m, void *unused) 49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op)
50{ 53{
51 struct rcu_state *rsp; 54 int ret = seq_open(file, op);
55 if (!ret) {
56 struct seq_file *m = (struct seq_file *)file->private_data;
57 m->private = inode->i_private;
58 }
59 return ret;
60}
61
62static void *r_start(struct seq_file *m, loff_t *pos)
63{
64 struct rcu_state *rsp = (struct rcu_state *)m->private;
65 *pos = cpumask_next(*pos - 1, cpu_possible_mask);
66 if ((*pos) < nr_cpu_ids)
67 return per_cpu_ptr(rsp->rda, *pos);
68 return NULL;
69}
52 70
53 for_each_rcu_flavor(rsp) 71static void *r_next(struct seq_file *m, void *v, loff_t *pos)
54 seq_printf(m, "%s: bcc: %d nbd: %lu\n", 72{
55 rsp->name, 73 (*pos)++;
56 atomic_read(&rsp->barrier_cpu_count), 74 return r_start(m, pos);
57 rsp->n_barrier_done); 75}
76
77static void r_stop(struct seq_file *m, void *v)
78{
79}
80
81static int show_rcubarrier(struct seq_file *m, void *v)
82{
83 struct rcu_state *rsp = (struct rcu_state *)m->private;
84 seq_printf(m, "bcc: %d nbd: %lu\n",
85 atomic_read(&rsp->barrier_cpu_count),
86 rsp->n_barrier_done);
58 return 0; 87 return 0;
59} 88}
60 89
61static int rcubarrier_open(struct inode *inode, struct file *file) 90static int rcubarrier_open(struct inode *inode, struct file *file)
62{ 91{
63 return single_open(file, show_rcubarrier, NULL); 92 return single_open(file, show_rcubarrier, inode->i_private);
64} 93}
65 94
66static const struct file_operations rcubarrier_fops = { 95static const struct file_operations rcubarrier_fops = {
67 .owner = THIS_MODULE, 96 .owner = THIS_MODULE,
68 .open = rcubarrier_open, 97 .open = rcubarrier_open,
69 .read = seq_read, 98 .read = seq_read,
70 .llseek = seq_lseek, 99 .llseek = no_llseek,
71 .release = single_release, 100 .release = seq_release,
72}; 101};
73 102
74#ifdef CONFIG_RCU_BOOST 103#ifdef CONFIG_RCU_BOOST
@@ -84,12 +113,14 @@ static char convert_kthread_status(unsigned int kthread_status)
84 113
85static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 114static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
86{ 115{
116 long ql, qll;
117
87 if (!rdp->beenonline) 118 if (!rdp->beenonline)
88 return; 119 return;
89 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", 120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d",
90 rdp->cpu, 121 rdp->cpu,
91 cpu_is_offline(rdp->cpu) ? '!' : ' ', 122 cpu_is_offline(rdp->cpu) ? '!' : ' ',
92 rdp->completed, rdp->gpnum, 123 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
93 rdp->passed_quiesce, rdp->qs_pending); 124 rdp->passed_quiesce, rdp->qs_pending);
94 seq_printf(m, " dt=%d/%llx/%d df=%lu", 125 seq_printf(m, " dt=%d/%llx/%d df=%lu",
95 atomic_read(&rdp->dynticks->dynticks), 126 atomic_read(&rdp->dynticks->dynticks),
@@ -97,8 +128,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
97 rdp->dynticks->dynticks_nmi_nesting, 128 rdp->dynticks->dynticks_nmi_nesting,
98 rdp->dynticks_fqs); 129 rdp->dynticks_fqs);
99 seq_printf(m, " of=%lu", rdp->offline_fqs); 130 seq_printf(m, " of=%lu", rdp->offline_fqs);
131 rcu_nocb_q_lengths(rdp, &ql, &qll);
132 qll += rdp->qlen_lazy;
133 ql += rdp->qlen;
100 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", 134 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
101 rdp->qlen_lazy, rdp->qlen, 135 qll, ql,
102 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 136 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
103 rdp->nxttail[RCU_NEXT_TAIL]], 137 rdp->nxttail[RCU_NEXT_TAIL]],
104 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 138 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -114,101 +148,67 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
114 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); 148 per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
115#endif /* #ifdef CONFIG_RCU_BOOST */ 149#endif /* #ifdef CONFIG_RCU_BOOST */
116 seq_printf(m, " b=%ld", rdp->blimit); 150 seq_printf(m, " b=%ld", rdp->blimit);
117 seq_printf(m, " ci=%lu co=%lu ca=%lu\n", 151 seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n",
118 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); 152 rdp->n_cbs_invoked, rdp->n_nocbs_invoked,
153 rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
119} 154}
120 155
121static int show_rcudata(struct seq_file *m, void *unused) 156static int show_rcudata(struct seq_file *m, void *v)
122{ 157{
123 int cpu; 158 print_one_rcu_data(m, (struct rcu_data *)v);
124 struct rcu_state *rsp;
125
126 for_each_rcu_flavor(rsp) {
127 seq_printf(m, "%s:\n", rsp->name);
128 for_each_possible_cpu(cpu)
129 print_one_rcu_data(m, per_cpu_ptr(rsp->rda, cpu));
130 }
131 return 0; 159 return 0;
132} 160}
133 161
162static const struct seq_operations rcudate_op = {
163 .start = r_start,
164 .next = r_next,
165 .stop = r_stop,
166 .show = show_rcudata,
167};
168
134static int rcudata_open(struct inode *inode, struct file *file) 169static int rcudata_open(struct inode *inode, struct file *file)
135{ 170{
136 return single_open(file, show_rcudata, NULL); 171 return r_open(inode, file, &rcudate_op);
137} 172}
138 173
139static const struct file_operations rcudata_fops = { 174static const struct file_operations rcudata_fops = {
140 .owner = THIS_MODULE, 175 .owner = THIS_MODULE,
141 .open = rcudata_open, 176 .open = rcudata_open,
142 .read = seq_read, 177 .read = seq_read,
143 .llseek = seq_lseek, 178 .llseek = no_llseek,
144 .release = single_release, 179 .release = seq_release,
145}; 180};
146 181
147static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) 182static int show_rcuexp(struct seq_file *m, void *v)
148{
149 if (!rdp->beenonline)
150 return;
151 seq_printf(m, "%d,%s,%lu,%lu,%d,%d",
152 rdp->cpu,
153 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
154 rdp->completed, rdp->gpnum,
155 rdp->passed_quiesce, rdp->qs_pending);
156 seq_printf(m, ",%d,%llx,%d,%lu",
157 atomic_read(&rdp->dynticks->dynticks),
158 rdp->dynticks->dynticks_nesting,
159 rdp->dynticks->dynticks_nmi_nesting,
160 rdp->dynticks_fqs);
161 seq_printf(m, ",%lu", rdp->offline_fqs);
162 seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen,
163 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
164 rdp->nxttail[RCU_NEXT_TAIL]],
165 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
166 rdp->nxttail[RCU_NEXT_READY_TAIL]],
167 ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
168 rdp->nxttail[RCU_WAIT_TAIL]],
169 ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
170#ifdef CONFIG_RCU_BOOST
171 seq_printf(m, ",%d,\"%c\"",
172 per_cpu(rcu_cpu_has_work, rdp->cpu),
173 convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
174 rdp->cpu)));
175#endif /* #ifdef CONFIG_RCU_BOOST */
176 seq_printf(m, ",%ld", rdp->blimit);
177 seq_printf(m, ",%lu,%lu,%lu\n",
178 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
179}
180
181static int show_rcudata_csv(struct seq_file *m, void *unused)
182{ 183{
183 int cpu; 184 struct rcu_state *rsp = (struct rcu_state *)m->private;
184 struct rcu_state *rsp; 185
185 186 seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
186 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); 187 atomic_long_read(&rsp->expedited_start),
187 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 188 atomic_long_read(&rsp->expedited_done),
188 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); 189 atomic_long_read(&rsp->expedited_wrap),
189#ifdef CONFIG_RCU_BOOST 190 atomic_long_read(&rsp->expedited_tryfail),
190 seq_puts(m, "\"kt\",\"ktl\""); 191 atomic_long_read(&rsp->expedited_workdone1),
191#endif /* #ifdef CONFIG_RCU_BOOST */ 192 atomic_long_read(&rsp->expedited_workdone2),
192 seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); 193 atomic_long_read(&rsp->expedited_normal),
193 for_each_rcu_flavor(rsp) { 194 atomic_long_read(&rsp->expedited_stoppedcpus),
194 seq_printf(m, "\"%s:\"\n", rsp->name); 195 atomic_long_read(&rsp->expedited_done_tries),
195 for_each_possible_cpu(cpu) 196 atomic_long_read(&rsp->expedited_done_lost),
196 print_one_rcu_data_csv(m, per_cpu_ptr(rsp->rda, cpu)); 197 atomic_long_read(&rsp->expedited_done_exit));
197 }
198 return 0; 198 return 0;
199} 199}
200 200
201static int rcudata_csv_open(struct inode *inode, struct file *file) 201static int rcuexp_open(struct inode *inode, struct file *file)
202{ 202{
203 return single_open(file, show_rcudata_csv, NULL); 203 return single_open(file, show_rcuexp, inode->i_private);
204} 204}
205 205
206static const struct file_operations rcudata_csv_fops = { 206static const struct file_operations rcuexp_fops = {
207 .owner = THIS_MODULE, 207 .owner = THIS_MODULE,
208 .open = rcudata_csv_open, 208 .open = rcuexp_open,
209 .read = seq_read, 209 .read = seq_read,
210 .llseek = seq_lseek, 210 .llseek = no_llseek,
211 .release = single_release, 211 .release = seq_release,
212}; 212};
213 213
214#ifdef CONFIG_RCU_BOOST 214#ifdef CONFIG_RCU_BOOST
@@ -254,27 +254,11 @@ static const struct file_operations rcu_node_boost_fops = {
254 .owner = THIS_MODULE, 254 .owner = THIS_MODULE,
255 .open = rcu_node_boost_open, 255 .open = rcu_node_boost_open,
256 .read = seq_read, 256 .read = seq_read,
257 .llseek = seq_lseek, 257 .llseek = no_llseek,
258 .release = single_release, 258 .release = single_release,
259}; 259};
260 260
261/* 261#endif /* #ifdef CONFIG_RCU_BOOST */
262 * Create the rcuboost debugfs entry. Standard error return.
263 */
264static int rcu_boost_trace_create_file(struct dentry *rcudir)
265{
266 return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
267 &rcu_node_boost_fops);
268}
269
270#else /* #ifdef CONFIG_RCU_BOOST */
271
272static int rcu_boost_trace_create_file(struct dentry *rcudir)
273{
274 return 0; /* There cannot be an error if we didn't create it! */
275}
276
277#endif /* #else #ifdef CONFIG_RCU_BOOST */
278 262
279static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 263static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
280{ 264{
@@ -283,8 +267,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
283 struct rcu_node *rnp; 267 struct rcu_node *rnp;
284 268
285 gpnum = rsp->gpnum; 269 gpnum = rsp->gpnum;
286 seq_printf(m, "%s: c=%lu g=%lu s=%d jfq=%ld j=%x ", 270 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
287 rsp->name, rsp->completed, gpnum, rsp->fqs_state, 271 ulong2long(rsp->completed), ulong2long(gpnum),
272 rsp->fqs_state,
288 (long)(rsp->jiffies_force_qs - jiffies), 273 (long)(rsp->jiffies_force_qs - jiffies),
289 (int)(jiffies & 0xffff)); 274 (int)(jiffies & 0xffff));
290 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 275 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
@@ -306,26 +291,24 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
306 seq_puts(m, "\n"); 291 seq_puts(m, "\n");
307} 292}
308 293
309static int show_rcuhier(struct seq_file *m, void *unused) 294static int show_rcuhier(struct seq_file *m, void *v)
310{ 295{
311 struct rcu_state *rsp; 296 struct rcu_state *rsp = (struct rcu_state *)m->private;
312 297 print_one_rcu_state(m, rsp);
313 for_each_rcu_flavor(rsp)
314 print_one_rcu_state(m, rsp);
315 return 0; 298 return 0;
316} 299}
317 300
318static int rcuhier_open(struct inode *inode, struct file *file) 301static int rcuhier_open(struct inode *inode, struct file *file)
319{ 302{
320 return single_open(file, show_rcuhier, NULL); 303 return single_open(file, show_rcuhier, inode->i_private);
321} 304}
322 305
323static const struct file_operations rcuhier_fops = { 306static const struct file_operations rcuhier_fops = {
324 .owner = THIS_MODULE, 307 .owner = THIS_MODULE,
325 .open = rcuhier_open, 308 .open = rcuhier_open,
326 .read = seq_read, 309 .read = seq_read,
327 .llseek = seq_lseek, 310 .llseek = no_llseek,
328 .release = single_release, 311 .release = seq_release,
329}; 312};
330 313
331static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) 314static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -338,42 +321,42 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
338 struct rcu_node *rnp = &rsp->node[0]; 321 struct rcu_node *rnp = &rsp->node[0];
339 322
340 raw_spin_lock_irqsave(&rnp->lock, flags); 323 raw_spin_lock_irqsave(&rnp->lock, flags);
341 completed = rsp->completed; 324 completed = ACCESS_ONCE(rsp->completed);
342 gpnum = rsp->gpnum; 325 gpnum = ACCESS_ONCE(rsp->gpnum);
343 if (rsp->completed == rsp->gpnum) 326 if (completed == gpnum)
344 gpage = 0; 327 gpage = 0;
345 else 328 else
346 gpage = jiffies - rsp->gp_start; 329 gpage = jiffies - rsp->gp_start;
347 gpmax = rsp->gp_max; 330 gpmax = rsp->gp_max;
348 raw_spin_unlock_irqrestore(&rnp->lock, flags); 331 raw_spin_unlock_irqrestore(&rnp->lock, flags);
349 seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", 332 seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n",
350 rsp->name, completed, gpnum, gpage, gpmax); 333 ulong2long(completed), ulong2long(gpnum), gpage, gpmax);
351} 334}
352 335
353static int show_rcugp(struct seq_file *m, void *unused) 336static int show_rcugp(struct seq_file *m, void *v)
354{ 337{
355 struct rcu_state *rsp; 338 struct rcu_state *rsp = (struct rcu_state *)m->private;
356 339 show_one_rcugp(m, rsp);
357 for_each_rcu_flavor(rsp)
358 show_one_rcugp(m, rsp);
359 return 0; 340 return 0;
360} 341}
361 342
362static int rcugp_open(struct inode *inode, struct file *file) 343static int rcugp_open(struct inode *inode, struct file *file)
363{ 344{
364 return single_open(file, show_rcugp, NULL); 345 return single_open(file, show_rcugp, inode->i_private);
365} 346}
366 347
367static const struct file_operations rcugp_fops = { 348static const struct file_operations rcugp_fops = {
368 .owner = THIS_MODULE, 349 .owner = THIS_MODULE,
369 .open = rcugp_open, 350 .open = rcugp_open,
370 .read = seq_read, 351 .read = seq_read,
371 .llseek = seq_lseek, 352 .llseek = no_llseek,
372 .release = single_release, 353 .release = seq_release,
373}; 354};
374 355
375static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 356static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
376{ 357{
358 if (!rdp->beenonline)
359 return;
377 seq_printf(m, "%3d%cnp=%ld ", 360 seq_printf(m, "%3d%cnp=%ld ",
378 rdp->cpu, 361 rdp->cpu,
379 cpu_is_offline(rdp->cpu) ? '!' : ' ', 362 cpu_is_offline(rdp->cpu) ? '!' : ' ',
@@ -389,34 +372,30 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
389 rdp->n_rp_need_nothing); 372 rdp->n_rp_need_nothing);
390} 373}
391 374
392static int show_rcu_pending(struct seq_file *m, void *unused) 375static int show_rcu_pending(struct seq_file *m, void *v)
393{ 376{
394 int cpu; 377 print_one_rcu_pending(m, (struct rcu_data *)v);
395 struct rcu_data *rdp;
396 struct rcu_state *rsp;
397
398 for_each_rcu_flavor(rsp) {
399 seq_printf(m, "%s:\n", rsp->name);
400 for_each_possible_cpu(cpu) {
401 rdp = per_cpu_ptr(rsp->rda, cpu);
402 if (rdp->beenonline)
403 print_one_rcu_pending(m, rdp);
404 }
405 }
406 return 0; 378 return 0;
407} 379}
408 380
381static const struct seq_operations rcu_pending_op = {
382 .start = r_start,
383 .next = r_next,
384 .stop = r_stop,
385 .show = show_rcu_pending,
386};
387
409static int rcu_pending_open(struct inode *inode, struct file *file) 388static int rcu_pending_open(struct inode *inode, struct file *file)
410{ 389{
411 return single_open(file, show_rcu_pending, NULL); 390 return r_open(inode, file, &rcu_pending_op);
412} 391}
413 392
414static const struct file_operations rcu_pending_fops = { 393static const struct file_operations rcu_pending_fops = {
415 .owner = THIS_MODULE, 394 .owner = THIS_MODULE,
416 .open = rcu_pending_open, 395 .open = rcu_pending_open,
417 .read = seq_read, 396 .read = seq_read,
418 .llseek = seq_lseek, 397 .llseek = no_llseek,
419 .release = single_release, 398 .release = seq_release,
420}; 399};
421 400
422static int show_rcutorture(struct seq_file *m, void *unused) 401static int show_rcutorture(struct seq_file *m, void *unused)
@@ -446,43 +425,58 @@ static struct dentry *rcudir;
446 425
447static int __init rcutree_trace_init(void) 426static int __init rcutree_trace_init(void)
448{ 427{
428 struct rcu_state *rsp;
449 struct dentry *retval; 429 struct dentry *retval;
430 struct dentry *rspdir;
450 431
451 rcudir = debugfs_create_dir("rcu", NULL); 432 rcudir = debugfs_create_dir("rcu", NULL);
452 if (!rcudir) 433 if (!rcudir)
453 goto free_out; 434 goto free_out;
454 435
455 retval = debugfs_create_file("rcubarrier", 0444, rcudir, 436 for_each_rcu_flavor(rsp) {
456 NULL, &rcubarrier_fops); 437 rspdir = debugfs_create_dir(rsp->name, rcudir);
457 if (!retval) 438 if (!rspdir)
458 goto free_out; 439 goto free_out;
459 440
460 retval = debugfs_create_file("rcudata", 0444, rcudir, 441 retval = debugfs_create_file("rcudata", 0444,
461 NULL, &rcudata_fops); 442 rspdir, rsp, &rcudata_fops);
462 if (!retval) 443 if (!retval)
463 goto free_out; 444 goto free_out;
464 445
465 retval = debugfs_create_file("rcudata.csv", 0444, rcudir, 446 retval = debugfs_create_file("rcuexp", 0444,
466 NULL, &rcudata_csv_fops); 447 rspdir, rsp, &rcuexp_fops);
467 if (!retval) 448 if (!retval)
468 goto free_out; 449 goto free_out;
469 450
470 if (rcu_boost_trace_create_file(rcudir)) 451 retval = debugfs_create_file("rcu_pending", 0444,
471 goto free_out; 452 rspdir, rsp, &rcu_pending_fops);
453 if (!retval)
454 goto free_out;
455
456 retval = debugfs_create_file("rcubarrier", 0444,
457 rspdir, rsp, &rcubarrier_fops);
458 if (!retval)
459 goto free_out;
472 460
473 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 461#ifdef CONFIG_RCU_BOOST
474 if (!retval) 462 if (rsp == &rcu_preempt_state) {
475 goto free_out; 463 retval = debugfs_create_file("rcuboost", 0444,
464 rspdir, NULL, &rcu_node_boost_fops);
465 if (!retval)
466 goto free_out;
467 }
468#endif
476 469
477 retval = debugfs_create_file("rcuhier", 0444, rcudir, 470 retval = debugfs_create_file("rcugp", 0444,
478 NULL, &rcuhier_fops); 471 rspdir, rsp, &rcugp_fops);
479 if (!retval) 472 if (!retval)
480 goto free_out; 473 goto free_out;
481 474
482 retval = debugfs_create_file("rcu_pending", 0444, rcudir, 475 retval = debugfs_create_file("rcuhier", 0444,
483 NULL, &rcu_pending_fops); 476 rspdir, rsp, &rcuhier_fops);
484 if (!retval) 477 if (!retval)
485 goto free_out; 478 goto free_out;
479 }
486 480
487 retval = debugfs_create_file("rcutorture", 0444, rcudir, 481 retval = debugfs_create_file("rcutorture", 0444, rcudir,
488 NULL, &rcutorture_fops); 482 NULL, &rcutorture_fops);
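The rcutree_trace_init() rework above hangs one debugfs directory per RCU flavor off "rcu/" and passes the flavor's rcu_state as the data argument to debugfs_create_file(). That is what lets a single set of file_operations serve every flavor: the pointer comes back as inode->i_private in open() and is stashed in seq_file->private for the show functions. A self-contained sketch of that per-instance debugfs pattern as a hypothetical module (not part of the patch):

/* Hypothetical module demonstrating the inode->i_private / m->private pattern. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

struct flavor {
        const char *name;
        unsigned long counter;
};

static struct flavor flavors[] = { { "alpha", 1 }, { "beta", 2 } };
static struct dentry *topdir;

static int flavor_show(struct seq_file *m, void *v)
{
        struct flavor *f = m->private;          /* came in as inode->i_private */

        seq_printf(m, "%s: %lu\n", f->name, f->counter);
        return 0;
}

static int flavor_open(struct inode *inode, struct file *file)
{
        return single_open(file, flavor_show, inode->i_private);
}

static const struct file_operations flavor_fops = {
        .owner   = THIS_MODULE,
        .open    = flavor_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};

static int __init flavor_init(void)
{
        int i;

        topdir = debugfs_create_dir("flavor_demo", NULL);
        if (!topdir)
                return -ENOMEM;
        for (i = 0; i < ARRAY_SIZE(flavors); i++)
                debugfs_create_file(flavors[i].name, 0444, topdir,
                                    &flavors[i], &flavor_fops);
        return 0;
}

static void __exit flavor_exit(void)
{
        debugfs_remove_recursive(topdir);
}

module_init(flavor_init);
module_exit(flavor_exit);
MODULE_LICENSE("GPL");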
diff --git a/kernel/relay.c b/kernel/relay.c
index e8cd2027abbd..01ab081ac53a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1139,7 +1139,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1139 if (!desc->count) 1139 if (!desc->count)
1140 return 0; 1140 return 0;
1141 1141
1142 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); 1142 mutex_lock(&file_inode(filp)->i_mutex);
1143 do { 1143 do {
1144 if (!relay_file_read_avail(buf, *ppos)) 1144 if (!relay_file_read_avail(buf, *ppos))
1145 break; 1145 break;
@@ -1159,7 +1159,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1159 *ppos = relay_file_read_end_pos(buf, read_start, ret); 1159 *ppos = relay_file_read_end_pos(buf, read_start, ret);
1160 } 1160 }
1161 } while (desc->count && ret); 1161 } while (desc->count && ret);
1162 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); 1162 mutex_unlock(&file_inode(filp)->i_mutex);
1163 1163
1164 return desc->written; 1164 return desc->written;
1165} 1165}
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ad581aa2369a..ff55247e7049 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
86 return __res_counter_charge(counter, val, limit_fail_at, true); 86 return __res_counter_charge(counter, val, limit_fail_at, true);
87} 87}
88 88
89void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) 89u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
90{ 90{
91 if (WARN_ON(counter->usage < val)) 91 if (WARN_ON(counter->usage < val))
92 val = counter->usage; 92 val = counter->usage;
93 93
94 counter->usage -= val; 94 counter->usage -= val;
95 return counter->usage;
95} 96}
96 97
97void res_counter_uncharge_until(struct res_counter *counter, 98u64 res_counter_uncharge_until(struct res_counter *counter,
98 struct res_counter *top, 99 struct res_counter *top,
99 unsigned long val) 100 unsigned long val)
100{ 101{
101 unsigned long flags; 102 unsigned long flags;
102 struct res_counter *c; 103 struct res_counter *c;
104 u64 ret = 0;
103 105
104 local_irq_save(flags); 106 local_irq_save(flags);
105 for (c = counter; c != top; c = c->parent) { 107 for (c = counter; c != top; c = c->parent) {
108 u64 r;
106 spin_lock(&c->lock); 109 spin_lock(&c->lock);
107 res_counter_uncharge_locked(c, val); 110 r = res_counter_uncharge_locked(c, val);
111 if (c == counter)
112 ret = r;
108 spin_unlock(&c->lock); 113 spin_unlock(&c->lock);
109 } 114 }
110 local_irq_restore(flags); 115 local_irq_restore(flags);
116 return ret;
111} 117}
112 118
113void res_counter_uncharge(struct res_counter *counter, unsigned long val) 119u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
114{ 120{
115 res_counter_uncharge_until(counter, NULL, val); 121 return res_counter_uncharge_until(counter, NULL, val);
116} 122}
117 123
118static inline unsigned long long * 124static inline unsigned long long *
@@ -192,25 +198,3 @@ int res_counter_memparse_write_strategy(const char *buf,
192 *res = PAGE_ALIGN(*res); 198 *res = PAGE_ALIGN(*res);
193 return 0; 199 return 0;
194} 200}
195
196int res_counter_write(struct res_counter *counter, int member,
197 const char *buf, write_strategy_fn write_strategy)
198{
199 char *end;
200 unsigned long flags;
201 unsigned long long tmp, *val;
202
203 if (write_strategy) {
204 if (write_strategy(buf, &tmp))
205 return -EINVAL;
206 } else {
207 tmp = simple_strtoull(buf, &end, 10);
208 if (*end != '\0')
209 return -EINVAL;
210 }
211 spin_lock_irqsave(&counter->lock, flags);
212 val = res_counter_member(counter, member);
213 *val = tmp;
214 spin_unlock_irqrestore(&counter->lock, flags);
215 return 0;
216}
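
The res_counter change above makes the uncharge helpers return the counter's new usage, so a caller that uncharges a child level can see in one step whether that level dropped to zero, without re-reading under the lock. A lock-free userspace sketch of the parent walk (clamping stands in for the WARN_ON path; no spinlocks or IRQ handling):

#include <stdio.h>
#include <stdint.h>

struct counter {
        uint64_t usage;
        struct counter *parent;
};

/* Uncharge val from c and every ancestor up to (not including) top, and
 * report the remaining usage of the level the caller started from, which
 * is the piece of information the new u64 return value carries. */
static uint64_t counter_uncharge_until(struct counter *counter,
                                       struct counter *top, uint64_t val)
{
        struct counter *c;
        uint64_t ret = 0;

        for (c = counter; c != top; c = c->parent) {
                uint64_t take = val > c->usage ? c->usage : val;

                c->usage -= take;
                if (c == counter)
                        ret = c->usage;
        }
        return ret;
}

int main(void)
{
        struct counter root  = { .usage = 100, .parent = NULL };
        struct counter child = { .usage =  40, .parent = &root };
        uint64_t left = counter_uncharge_until(&child, NULL, 40);

        printf("child=%llu returned=%llu root=%llu\n",
               (unsigned long long)child.usage, (unsigned long long)left,
               (unsigned long long)root.usage);
        return 0;
}
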
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 16502d3a71c8..13b243a323fa 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -17,6 +17,7 @@
17 * See rt.c in preempt-rt for proper credits and further information 17 * See rt.c in preempt-rt for proper credits and further information
18 */ 18 */
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/sched/rt.h>
20#include <linux/delay.h> 21#include <linux/delay.h>
21#include <linux/export.h> 22#include <linux/export.h>
22#include <linux/spinlock.h> 23#include <linux/spinlock.h>
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 98ec49475460..7890b10084a7 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -10,6 +10,7 @@
10#include <linux/kthread.h> 10#include <linux/kthread.h>
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/sched/rt.h>
13#include <linux/spinlock.h> 14#include <linux/spinlock.h>
14#include <linux/timer.h> 15#include <linux/timer.h>
15#include <linux/freezer.h> 16#include <linux/freezer.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a242e691c993..1e09308bf2a1 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -13,6 +13,7 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/sched/rt.h>
16#include <linux/timer.h> 17#include <linux/timer.h>
17 18
18#include "rtmutex_common.h" 19#include "rtmutex_common.h"
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 6850f53e02d8..b3c6c3fcd847 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -116,6 +116,16 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
116 116
117EXPORT_SYMBOL(down_read_nested); 117EXPORT_SYMBOL(down_read_nested);
118 118
119void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
120{
121 might_sleep();
122 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
123
124 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
125}
126
127EXPORT_SYMBOL(_down_write_nest_lock);
128
119void down_write_nested(struct rw_semaphore *sem, int subclass) 129void down_write_nested(struct rw_semaphore *sem, int subclass)
120{ 130{
121 might_sleep(); 131 might_sleep();
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21076a3..64de5f8b0c9e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
35 ag->tg->rt_se = NULL; 35 ag->tg->rt_se = NULL;
36 ag->tg->rt_rq = NULL; 36 ag->tg->rt_rq = NULL;
37#endif 37#endif
38 sched_offline_group(ag->tg);
38 sched_destroy_group(ag->tg); 39 sched_destroy_group(ag->tg);
39} 40}
40 41
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
76 if (IS_ERR(tg)) 77 if (IS_ERR(tg))
77 goto out_free; 78 goto out_free;
78 79
80 sched_online_group(tg, &root_task_group);
81
79 kref_init(&ag->kref); 82 kref_init(&ag->kref);
80 init_rwsem(&ag->lock); 83 init_rwsem(&ag->lock);
81 ag->id = atomic_inc_return(&autogroup_seq_nr); 84 ag->id = atomic_inc_return(&autogroup_seq_nr);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492df..c3ae1446461c 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
176 u64 this_clock, remote_clock; 176 u64 this_clock, remote_clock;
177 u64 *ptr, old_val, val; 177 u64 *ptr, old_val, val;
178 178
179#if BITS_PER_LONG != 64
180again:
181 /*
182 * Careful here: The local and the remote clock values need to
183 * be read out atomic as we need to compare the values and
184 * then update either the local or the remote side. So the
185 * cmpxchg64 below only protects one readout.
186 *
187 * We must reread via sched_clock_local() in the retry case on
188 * 32bit as an NMI could use sched_clock_local() via the
189 * tracer and hit between the readout of
190 * the low32bit and the high 32bit portion.
191 */
192 this_clock = sched_clock_local(my_scd);
193 /*
194 * We must enforce atomic readout on 32bit, otherwise the
                                 195	 * update on the remote cpu can hit in between the readout of
196 * the low32bit and the high 32bit portion.
197 */
198 remote_clock = cmpxchg64(&scd->clock, 0, 0);
199#else
200 /*
201 * On 64bit the read of [my]scd->clock is atomic versus the
202 * update, so we can avoid the above 32bit dance.
203 */
179 sched_clock_local(my_scd); 204 sched_clock_local(my_scd);
180again: 205again:
181 this_clock = my_scd->clock; 206 this_clock = my_scd->clock;
182 remote_clock = scd->clock; 207 remote_clock = scd->clock;
208#endif
183 209
184 /* 210 /*
185 * Use the opportunity that we have both locks 211 * Use the opportunity that we have both locks
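
The sched/clock.c hunk above avoids a torn 64-bit read on 32-bit builds by "reading" the remote clock with cmpxchg64(&scd->clock, 0, 0): even when the compare fails, the operation hands back the current value as one atomic unit. The same trick in portable C11, as a userspace sketch rather than the kernel primitive:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Read a 64-bit counter without tearing, even where plain 64-bit loads
 * are not single instructions: a failed compare-and-swap still returns
 * one consistent snapshot of the value. */
static uint64_t read_u64_atomic(_Atomic uint64_t *p)
{
        uint64_t expected = 0;

        /* If *p == 0 it is "swapped" with 0 (a no-op); otherwise the CAS
         * fails and writes the current value into 'expected'. */
        atomic_compare_exchange_strong(p, &expected, 0);
        return expected;
}

int main(void)
{
        _Atomic uint64_t clock = 0x0123456789abcdefULL;

        printf("clock = %#llx\n", (unsigned long long)read_u64_atomic(&clock));
        return 0;
}
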
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..67d04651f44b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -72,6 +72,7 @@
72#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h> 73#include <linux/init_task.h>
74#include <linux/binfmts.h> 74#include <linux/binfmts.h>
75#include <linux/context_tracking.h>
75 76
76#include <asm/switch_to.h> 77#include <asm/switch_to.h>
77#include <asm/tlb.h> 78#include <asm/tlb.h>
@@ -82,7 +83,7 @@
82#endif 83#endif
83 84
84#include "sched.h" 85#include "sched.h"
85#include "../workqueue_sched.h" 86#include "../workqueue_internal.h"
86#include "../smpboot.h" 87#include "../smpboot.h"
87 88
88#define CREATE_TRACE_POINTS 89#define CREATE_TRACE_POINTS
@@ -192,23 +193,10 @@ static void sched_feat_disable(int i) { };
192static void sched_feat_enable(int i) { }; 193static void sched_feat_enable(int i) { };
193#endif /* HAVE_JUMP_LABEL */ 194#endif /* HAVE_JUMP_LABEL */
194 195
195static ssize_t 196static int sched_feat_set(char *cmp)
196sched_feat_write(struct file *filp, const char __user *ubuf,
197 size_t cnt, loff_t *ppos)
198{ 197{
199 char buf[64];
200 char *cmp;
201 int neg = 0;
202 int i; 198 int i;
203 199 int neg = 0;
204 if (cnt > 63)
205 cnt = 63;
206
207 if (copy_from_user(&buf, ubuf, cnt))
208 return -EFAULT;
209
210 buf[cnt] = 0;
211 cmp = strstrip(buf);
212 200
213 if (strncmp(cmp, "NO_", 3) == 0) { 201 if (strncmp(cmp, "NO_", 3) == 0) {
214 neg = 1; 202 neg = 1;
@@ -228,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
228 } 216 }
229 } 217 }
230 218
219 return i;
220}
221
222static ssize_t
223sched_feat_write(struct file *filp, const char __user *ubuf,
224 size_t cnt, loff_t *ppos)
225{
226 char buf[64];
227 char *cmp;
228 int i;
229
230 if (cnt > 63)
231 cnt = 63;
232
233 if (copy_from_user(&buf, ubuf, cnt))
234 return -EFAULT;
235
236 buf[cnt] = 0;
237 cmp = strstrip(buf);
238
239 i = sched_feat_set(cmp);
231 if (i == __SCHED_FEAT_NR) 240 if (i == __SCHED_FEAT_NR)
232 return -EINVAL; 241 return -EINVAL;
233 242
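
sched_feat_set() is split out above so that code outside the debugfs write path (the new set_numabalancing_state() later in this diff) can flip scheduler features by name. A userspace sketch of the parser's shape, with a made-up feature table and bitmask standing in for sched_feat_names[] and the feature state: strip an optional NO_ prefix, look the name up, and return the one-past-the-end index when nothing matched.

#include <stdio.h>
#include <string.h>

static const char * const feat_names[] = {
        "GENTLE_FAIR_SLEEPERS", "START_DEBIT", "NUMA",
};
#define NR_FEATS (sizeof(feat_names) / sizeof(feat_names[0]))

static unsigned int feat_mask = 0x3;    /* bit i set <=> feature i enabled */

static size_t feat_set(const char *cmp)
{
        int neg = 0;
        size_t i;

        if (strncmp(cmp, "NO_", 3) == 0) {      /* "NO_FOO" clears FOO */
                neg = 1;
                cmp += 3;
        }
        for (i = 0; i < NR_FEATS; i++) {
                if (strcmp(cmp, feat_names[i]) == 0) {
                        if (neg)
                                feat_mask &= ~(1u << i);
                        else
                                feat_mask |= 1u << i;
                        break;
                }
        }
        return i;       /* == NR_FEATS means "no such feature" */
}

int main(void)
{
        feat_set("NO_START_DEBIT");
        feat_set("NUMA");
        printf("mask=%#x unknown=%zu\n", feat_mask, feat_set("BOGUS"));
        return 0;
}
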
@@ -922,6 +931,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
922 rq->skip_clock_update = 1; 931 rq->skip_clock_update = 1;
923} 932}
924 933
934static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
935
936void register_task_migration_notifier(struct notifier_block *n)
937{
938 atomic_notifier_chain_register(&task_migration_notifier, n);
939}
940
925#ifdef CONFIG_SMP 941#ifdef CONFIG_SMP
926void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 942void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
927{ 943{
@@ -952,8 +968,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
952 trace_sched_migrate_task(p, new_cpu); 968 trace_sched_migrate_task(p, new_cpu);
953 969
954 if (task_cpu(p) != new_cpu) { 970 if (task_cpu(p) != new_cpu) {
971 struct task_migration_notifier tmn;
972
973 if (p->sched_class->migrate_task_rq)
974 p->sched_class->migrate_task_rq(p, new_cpu);
955 p->se.nr_migrations++; 975 p->se.nr_migrations++;
956 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 976 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
977
978 tmn.task = p;
979 tmn.from_cpu = task_cpu(p);
980 tmn.to_cpu = new_cpu;
981
982 atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
957 } 983 }
958 984
959 __set_task_cpu(p, new_cpu); 985 __set_task_cpu(p, new_cpu);
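
The task-migration notifier added above is a plain publish/subscribe hook: interested code registers a callback once, and set_task_cpu() fires the chain with the task and the from/to CPUs whenever a task really changes runqueue. A userspace sketch of the shape, with a simple linked list standing in for the RCU-safe atomic notifier chain:

#include <stdio.h>

struct migration_event { int task_id, from_cpu, to_cpu; };

struct notifier {
        int (*call)(struct notifier *self, const struct migration_event *ev);
        struct notifier *next;
};

static struct notifier *chain;  /* the kernel uses an atomic, RCU-walked chain */

static void notifier_register(struct notifier *n)
{
        n->next = chain;
        chain = n;
}

static void notifier_call_chain(const struct migration_event *ev)
{
        for (struct notifier *n = chain; n; n = n->next)
                n->call(n, ev);
}

static int print_migration(struct notifier *self, const struct migration_event *ev)
{
        (void)self;
        printf("task %d: cpu %d -> cpu %d\n", ev->task_id, ev->from_cpu, ev->to_cpu);
        return 0;
}

int main(void)
{
        struct notifier nb = { .call = print_migration };

        notifier_register(&nb);
        notifier_call_chain(&(struct migration_event){ .task_id = 7,
                                                       .from_cpu = 0, .to_cpu = 3 });
        return 0;
}
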
@@ -1106,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
1106 */ 1132 */
1107static int select_fallback_rq(int cpu, struct task_struct *p) 1133static int select_fallback_rq(int cpu, struct task_struct *p)
1108{ 1134{
1109 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1135 int nid = cpu_to_node(cpu);
1136 const struct cpumask *nodemask = NULL;
1110 enum { cpuset, possible, fail } state = cpuset; 1137 enum { cpuset, possible, fail } state = cpuset;
1111 int dest_cpu; 1138 int dest_cpu;
1112 1139
1113 /* Look for allowed, online CPU in same node. */ 1140 /*
1114 for_each_cpu(dest_cpu, nodemask) { 1141 * If the node that the cpu is on has been offlined, cpu_to_node()
1115 if (!cpu_online(dest_cpu)) 1142 * will return -1. There is no cpu on the node, and we should
1116 continue; 1143 * select the cpu on the other node.
1117 if (!cpu_active(dest_cpu)) 1144 */
1118 continue; 1145 if (nid != -1) {
1119 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1146 nodemask = cpumask_of_node(nid);
1120 return dest_cpu; 1147
1148 /* Look for allowed, online CPU in same node. */
1149 for_each_cpu(dest_cpu, nodemask) {
1150 if (!cpu_online(dest_cpu))
1151 continue;
1152 if (!cpu_active(dest_cpu))
1153 continue;
1154 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1155 return dest_cpu;
1156 }
1121 } 1157 }
1122 1158
1123 for (;;) { 1159 for (;;) {
@@ -1462,8 +1498,10 @@ static void try_to_wake_up_local(struct task_struct *p)
1462{ 1498{
1463 struct rq *rq = task_rq(p); 1499 struct rq *rq = task_rq(p);
1464 1500
1465 BUG_ON(rq != this_rq()); 1501 if (WARN_ON_ONCE(rq != this_rq()) ||
1466 BUG_ON(p == current); 1502 WARN_ON_ONCE(p == current))
1503 return;
1504
1467 lockdep_assert_held(&rq->lock); 1505 lockdep_assert_held(&rq->lock);
1468 1506
1469 if (!raw_spin_trylock(&p->pi_lock)) { 1507 if (!raw_spin_trylock(&p->pi_lock)) {
@@ -1497,7 +1535,8 @@ out:
1497 */ 1535 */
1498int wake_up_process(struct task_struct *p) 1536int wake_up_process(struct task_struct *p)
1499{ 1537{
1500 return try_to_wake_up(p, TASK_ALL, 0); 1538 WARN_ON(task_is_stopped_or_traced(p));
1539 return try_to_wake_up(p, TASK_NORMAL, 0);
1501} 1540}
1502EXPORT_SYMBOL(wake_up_process); 1541EXPORT_SYMBOL(wake_up_process);
1503 1542
@@ -1524,6 +1563,15 @@ static void __sched_fork(struct task_struct *p)
1524 p->se.vruntime = 0; 1563 p->se.vruntime = 0;
1525 INIT_LIST_HEAD(&p->se.group_node); 1564 INIT_LIST_HEAD(&p->se.group_node);
1526 1565
1566/*
1567 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
1568 * removed when useful for applications beyond shares distribution (e.g.
1569 * load-balance).
1570 */
1571#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1572 p->se.avg.runnable_avg_period = 0;
1573 p->se.avg.runnable_avg_sum = 0;
1574#endif
1527#ifdef CONFIG_SCHEDSTATS 1575#ifdef CONFIG_SCHEDSTATS
1528 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1576 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1529#endif 1577#endif
@@ -1533,8 +1581,41 @@ static void __sched_fork(struct task_struct *p)
1533#ifdef CONFIG_PREEMPT_NOTIFIERS 1581#ifdef CONFIG_PREEMPT_NOTIFIERS
1534 INIT_HLIST_HEAD(&p->preempt_notifiers); 1582 INIT_HLIST_HEAD(&p->preempt_notifiers);
1535#endif 1583#endif
1584
1585#ifdef CONFIG_NUMA_BALANCING
1586 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1587 p->mm->numa_next_scan = jiffies;
1588 p->mm->numa_next_reset = jiffies;
1589 p->mm->numa_scan_seq = 0;
1590 }
1591
1592 p->node_stamp = 0ULL;
1593 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1594 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1595 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1596 p->numa_work.next = &p->numa_work;
1597#endif /* CONFIG_NUMA_BALANCING */
1536} 1598}
1537 1599
1600#ifdef CONFIG_NUMA_BALANCING
1601#ifdef CONFIG_SCHED_DEBUG
1602void set_numabalancing_state(bool enabled)
1603{
1604 if (enabled)
1605 sched_feat_set("NUMA");
1606 else
1607 sched_feat_set("NO_NUMA");
1608}
1609#else
1610__read_mostly bool numabalancing_enabled;
1611
1612void set_numabalancing_state(bool enabled)
1613{
1614 numabalancing_enabled = enabled;
1615}
1616#endif /* CONFIG_SCHED_DEBUG */
1617#endif /* CONFIG_NUMA_BALANCING */
1618
1538/* 1619/*
1539 * fork()/clone()-time setup: 1620 * fork()/clone()-time setup:
1540 */ 1621 */
@@ -1673,9 +1754,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1673static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1754static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1674{ 1755{
1675 struct preempt_notifier *notifier; 1756 struct preempt_notifier *notifier;
1676 struct hlist_node *node;
1677 1757
1678 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1758 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1679 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 1759 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1680} 1760}
1681 1761
@@ -1684,9 +1764,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
1684 struct task_struct *next) 1764 struct task_struct *next)
1685{ 1765{
1686 struct preempt_notifier *notifier; 1766 struct preempt_notifier *notifier;
1687 struct hlist_node *node;
1688 1767
1689 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1768 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
1690 notifier->ops->sched_out(notifier, next); 1769 notifier->ops->sched_out(notifier, next);
1691} 1770}
1692 1771
@@ -1886,8 +1965,8 @@ context_switch(struct rq *rq, struct task_struct *prev,
1886 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1965 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1887#endif 1966#endif
1888 1967
1968 context_tracking_task_switch(prev, next);
1889 /* Here we just switch the register state and the stack. */ 1969 /* Here we just switch the register state and the stack. */
1890 rcu_switch(prev, next);
1891 switch_to(prev, next, prev); 1970 switch_to(prev, next, prev);
1892 1971
1893 barrier(); 1972 barrier();
@@ -1900,11 +1979,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
1900} 1979}
1901 1980
1902/* 1981/*
1903 * nr_running, nr_uninterruptible and nr_context_switches: 1982 * nr_running and nr_context_switches:
1904 * 1983 *
1905 * externally visible scheduler statistics: current number of runnable 1984 * externally visible scheduler statistics: current number of runnable
1906 * threads, current number of uninterruptible-sleeping threads, total 1985 * threads, total number of context switches performed since bootup.
1907 * number of context switches performed since bootup.
1908 */ 1986 */
1909unsigned long nr_running(void) 1987unsigned long nr_running(void)
1910{ 1988{
@@ -1916,23 +1994,6 @@ unsigned long nr_running(void)
1916 return sum; 1994 return sum;
1917} 1995}
1918 1996
1919unsigned long nr_uninterruptible(void)
1920{
1921 unsigned long i, sum = 0;
1922
1923 for_each_possible_cpu(i)
1924 sum += cpu_rq(i)->nr_uninterruptible;
1925
1926 /*
1927 * Since we read the counters lockless, it might be slightly
1928 * inaccurate. Do not allow it to go below zero though:
1929 */
1930 if (unlikely((long)sum < 0))
1931 sum = 0;
1932
1933 return sum;
1934}
1935
1936unsigned long long nr_context_switches(void) 1997unsigned long long nr_context_switches(void)
1937{ 1998{
1938 int i; 1999 int i;
@@ -2717,7 +2778,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
2717 if (irqs_disabled()) 2778 if (irqs_disabled())
2718 print_irqtrace_events(prev); 2779 print_irqtrace_events(prev);
2719 dump_stack(); 2780 dump_stack();
2720 add_taint(TAINT_WARN); 2781 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2721} 2782}
2722 2783
2723/* 2784/*
@@ -2911,7 +2972,7 @@ asmlinkage void __sched schedule(void)
2911} 2972}
2912EXPORT_SYMBOL(schedule); 2973EXPORT_SYMBOL(schedule);
2913 2974
2914#ifdef CONFIG_RCU_USER_QS 2975#ifdef CONFIG_CONTEXT_TRACKING
2915asmlinkage void __sched schedule_user(void) 2976asmlinkage void __sched schedule_user(void)
2916{ 2977{
2917 /* 2978 /*
@@ -2920,9 +2981,9 @@ asmlinkage void __sched schedule_user(void)
2920 * we haven't yet exited the RCU idle mode. Do it here manually until 2981 * we haven't yet exited the RCU idle mode. Do it here manually until
2921 * we find a better solution. 2982 * we find a better solution.
2922 */ 2983 */
2923 rcu_user_exit(); 2984 user_exit();
2924 schedule(); 2985 schedule();
2925 rcu_user_enter(); 2986 user_enter();
2926} 2987}
2927#endif 2988#endif
2928 2989
@@ -3027,7 +3088,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
3027 /* Catch callers which need to be fixed */ 3088 /* Catch callers which need to be fixed */
3028 BUG_ON(ti->preempt_count || !irqs_disabled()); 3089 BUG_ON(ti->preempt_count || !irqs_disabled());
3029 3090
3030 rcu_user_exit(); 3091 user_exit();
3031 do { 3092 do {
3032 add_preempt_count(PREEMPT_ACTIVE); 3093 add_preempt_count(PREEMPT_ACTIVE);
3033 local_irq_enable(); 3094 local_irq_enable();
@@ -3199,7 +3260,8 @@ void complete_all(struct completion *x)
3199EXPORT_SYMBOL(complete_all); 3260EXPORT_SYMBOL(complete_all);
3200 3261
3201static inline long __sched 3262static inline long __sched
3202do_wait_for_common(struct completion *x, long timeout, int state) 3263do_wait_for_common(struct completion *x,
3264 long (*action)(long), long timeout, int state)
3203{ 3265{
3204 if (!x->done) { 3266 if (!x->done) {
3205 DECLARE_WAITQUEUE(wait, current); 3267 DECLARE_WAITQUEUE(wait, current);
@@ -3212,7 +3274,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
3212 } 3274 }
3213 __set_current_state(state); 3275 __set_current_state(state);
3214 spin_unlock_irq(&x->wait.lock); 3276 spin_unlock_irq(&x->wait.lock);
3215 timeout = schedule_timeout(timeout); 3277 timeout = action(timeout);
3216 spin_lock_irq(&x->wait.lock); 3278 spin_lock_irq(&x->wait.lock);
3217 } while (!x->done && timeout); 3279 } while (!x->done && timeout);
3218 __remove_wait_queue(&x->wait, &wait); 3280 __remove_wait_queue(&x->wait, &wait);
@@ -3223,17 +3285,30 @@ do_wait_for_common(struct completion *x, long timeout, int state)
3223 return timeout ?: 1; 3285 return timeout ?: 1;
3224} 3286}
3225 3287
3226static long __sched 3288static inline long __sched
3227wait_for_common(struct completion *x, long timeout, int state) 3289__wait_for_common(struct completion *x,
3290 long (*action)(long), long timeout, int state)
3228{ 3291{
3229 might_sleep(); 3292 might_sleep();
3230 3293
3231 spin_lock_irq(&x->wait.lock); 3294 spin_lock_irq(&x->wait.lock);
3232 timeout = do_wait_for_common(x, timeout, state); 3295 timeout = do_wait_for_common(x, action, timeout, state);
3233 spin_unlock_irq(&x->wait.lock); 3296 spin_unlock_irq(&x->wait.lock);
3234 return timeout; 3297 return timeout;
3235} 3298}
3236 3299
3300static long __sched
3301wait_for_common(struct completion *x, long timeout, int state)
3302{
3303 return __wait_for_common(x, schedule_timeout, timeout, state);
3304}
3305
3306static long __sched
3307wait_for_common_io(struct completion *x, long timeout, int state)
3308{
3309 return __wait_for_common(x, io_schedule_timeout, timeout, state);
3310}
3311
3237/** 3312/**
3238 * wait_for_completion: - waits for completion of a task 3313 * wait_for_completion: - waits for completion of a task
3239 * @x: holds the state of this particular completion 3314 * @x: holds the state of this particular completion
@@ -3270,6 +3345,39 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3270EXPORT_SYMBOL(wait_for_completion_timeout); 3345EXPORT_SYMBOL(wait_for_completion_timeout);
3271 3346
3272/** 3347/**
3348 * wait_for_completion_io: - waits for completion of a task
3349 * @x: holds the state of this particular completion
3350 *
3351 * This waits to be signaled for completion of a specific task. It is NOT
3352 * interruptible and there is no timeout. The caller is accounted as waiting
3353 * for IO.
3354 */
3355void __sched wait_for_completion_io(struct completion *x)
3356{
3357 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3358}
3359EXPORT_SYMBOL(wait_for_completion_io);
3360
3361/**
3362 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
3363 * @x: holds the state of this particular completion
3364 * @timeout: timeout value in jiffies
3365 *
3366 * This waits for either a completion of a specific task to be signaled or for a
3367 * specified timeout to expire. The timeout is in jiffies. It is not
3368 * interruptible. The caller is accounted as waiting for IO.
3369 *
3370 * The return value is 0 if timed out, and positive (at least 1, or number of
3371 * jiffies left till timeout) if completed.
3372 */
3373unsigned long __sched
3374wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
3375{
3376 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
3377}
3378EXPORT_SYMBOL(wait_for_completion_io_timeout);
3379
3380/**
3273 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 3381 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
3274 * @x: holds the state of this particular completion 3382 * @x: holds the state of this particular completion
3275 * 3383 *
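
The completion rework above threads the sleep primitive through as a function pointer, so the new wait_for_completion_io*() variants reuse the same wait loop with io_schedule_timeout() and get accounted as IO wait. A sketch of the call shape only; the stand-in sleep functions just print their name and decrement the timeout so the demo terminates:

#include <stdio.h>

static long plain_sleep(long timeout) { puts("schedule_timeout()");    return timeout - 1; }
static long io_sleep(long timeout)    { puts("io_schedule_timeout()"); return timeout - 1; }

/* One wait body, parameterized by how to sleep: this is the point of the
 * do_wait_for_common()/__wait_for_common() split above. */
static long wait_common(const int *done, long (*action)(long), long timeout)
{
        while (!*done && timeout > 0)
                timeout = action(timeout);
        return timeout;
}

int main(void)
{
        int done = 0;

        wait_common(&done, plain_sleep, 2);     /* wait_for_completion() path    */
        wait_common(&done, io_sleep, 2);        /* wait_for_completion_io() path */
        return 0;
}
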
@@ -4029,8 +4137,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4029 goto out_free_cpus_allowed; 4137 goto out_free_cpus_allowed;
4030 } 4138 }
4031 retval = -EPERM; 4139 retval = -EPERM;
4032 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) 4140 if (!check_same_owner(p)) {
4033 goto out_unlock; 4141 rcu_read_lock();
4142 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4143 rcu_read_unlock();
4144 goto out_unlock;
4145 }
4146 rcu_read_unlock();
4147 }
4034 4148
4035 retval = security_task_setscheduler(p); 4149 retval = security_task_setscheduler(p);
4036 if (retval) 4150 if (retval)
@@ -4289,20 +4403,32 @@ EXPORT_SYMBOL(yield);
4289 * It's the caller's job to ensure that the target task struct 4403 * It's the caller's job to ensure that the target task struct
4290 * can't go away on us before we can do any checks. 4404 * can't go away on us before we can do any checks.
4291 * 4405 *
4292 * Returns true if we indeed boosted the target task. 4406 * Returns:
4407 * true (>0) if we indeed boosted the target task.
4408 * false (0) if we failed to boost the target.
4409 * -ESRCH if there's no task to yield to.
4293 */ 4410 */
4294bool __sched yield_to(struct task_struct *p, bool preempt) 4411bool __sched yield_to(struct task_struct *p, bool preempt)
4295{ 4412{
4296 struct task_struct *curr = current; 4413 struct task_struct *curr = current;
4297 struct rq *rq, *p_rq; 4414 struct rq *rq, *p_rq;
4298 unsigned long flags; 4415 unsigned long flags;
4299 bool yielded = 0; 4416 int yielded = 0;
4300 4417
4301 local_irq_save(flags); 4418 local_irq_save(flags);
4302 rq = this_rq(); 4419 rq = this_rq();
4303 4420
4304again: 4421again:
4305 p_rq = task_rq(p); 4422 p_rq = task_rq(p);
4423 /*
4424 * If we're the only runnable task on the rq and target rq also
4425 * has only one task, there's absolutely no point in yielding.
4426 */
4427 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4428 yielded = -ESRCH;
4429 goto out_irq;
4430 }
4431
4306 double_rq_lock(rq, p_rq); 4432 double_rq_lock(rq, p_rq);
4307 while (task_rq(p) != p_rq) { 4433 while (task_rq(p) != p_rq) {
4308 double_rq_unlock(rq, p_rq); 4434 double_rq_unlock(rq, p_rq);
@@ -4310,13 +4436,13 @@ again:
4310 } 4436 }
4311 4437
4312 if (!curr->sched_class->yield_to_task) 4438 if (!curr->sched_class->yield_to_task)
4313 goto out; 4439 goto out_unlock;
4314 4440
4315 if (curr->sched_class != p->sched_class) 4441 if (curr->sched_class != p->sched_class)
4316 goto out; 4442 goto out_unlock;
4317 4443
4318 if (task_running(p_rq, p) || p->state) 4444 if (task_running(p_rq, p) || p->state)
4319 goto out; 4445 goto out_unlock;
4320 4446
4321 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4447 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4322 if (yielded) { 4448 if (yielded) {
@@ -4329,11 +4455,12 @@ again:
4329 resched_task(p_rq->curr); 4455 resched_task(p_rq->curr);
4330 } 4456 }
4331 4457
4332out: 4458out_unlock:
4333 double_rq_unlock(rq, p_rq); 4459 double_rq_unlock(rq, p_rq);
4460out_irq:
4334 local_irq_restore(flags); 4461 local_irq_restore(flags);
4335 4462
4336 if (yielded) 4463 if (yielded > 0)
4337 schedule(); 4464 schedule();
4338 4465
4339 return yielded; 4466 return yielded;
@@ -4474,6 +4601,7 @@ static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4474void sched_show_task(struct task_struct *p) 4601void sched_show_task(struct task_struct *p)
4475{ 4602{
4476 unsigned long free = 0; 4603 unsigned long free = 0;
4604 int ppid;
4477 unsigned state; 4605 unsigned state;
4478 4606
4479 state = p->state ? __ffs(p->state) + 1 : 0; 4607 state = p->state ? __ffs(p->state) + 1 : 0;
@@ -4493,8 +4621,11 @@ void sched_show_task(struct task_struct *p)
4493#ifdef CONFIG_DEBUG_STACK_USAGE 4621#ifdef CONFIG_DEBUG_STACK_USAGE
4494 free = stack_not_used(p); 4622 free = stack_not_used(p);
4495#endif 4623#endif
4624 rcu_read_lock();
4625 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4626 rcu_read_unlock();
4496 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4627 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4497 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), 4628 task_pid_nr(p), ppid,
4498 (unsigned long)task_thread_info(p)->flags); 4629 (unsigned long)task_thread_info(p)->flags);
4499 4630
4500 show_stack(p, NULL); 4631 show_stack(p, NULL);
@@ -4588,6 +4719,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4588 */ 4719 */
4589 idle->sched_class = &idle_sched_class; 4720 idle->sched_class = &idle_sched_class;
4590 ftrace_graph_init_idle_task(idle, cpu); 4721 ftrace_graph_init_idle_task(idle, cpu);
4722 vtime_init_idle(idle);
4591#if defined(CONFIG_SMP) 4723#if defined(CONFIG_SMP)
4592 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4724 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4593#endif 4725#endif
@@ -4869,7 +5001,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
4869} 5001}
4870 5002
4871static int min_load_idx = 0; 5003static int min_load_idx = 0;
4872static int max_load_idx = CPU_LOAD_IDX_MAX; 5004static int max_load_idx = CPU_LOAD_IDX_MAX-1;
4873 5005
4874static void 5006static void
4875set_table_entry(struct ctl_table *entry, 5007set_table_entry(struct ctl_table *entry,
@@ -7081,7 +7213,6 @@ static void free_sched_group(struct task_group *tg)
7081struct task_group *sched_create_group(struct task_group *parent) 7213struct task_group *sched_create_group(struct task_group *parent)
7082{ 7214{
7083 struct task_group *tg; 7215 struct task_group *tg;
7084 unsigned long flags;
7085 7216
7086 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7217 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7087 if (!tg) 7218 if (!tg)
@@ -7093,6 +7224,17 @@ struct task_group *sched_create_group(struct task_group *parent)
7093 if (!alloc_rt_sched_group(tg, parent)) 7224 if (!alloc_rt_sched_group(tg, parent))
7094 goto err; 7225 goto err;
7095 7226
7227 return tg;
7228
7229err:
7230 free_sched_group(tg);
7231 return ERR_PTR(-ENOMEM);
7232}
7233
7234void sched_online_group(struct task_group *tg, struct task_group *parent)
7235{
7236 unsigned long flags;
7237
7096 spin_lock_irqsave(&task_group_lock, flags); 7238 spin_lock_irqsave(&task_group_lock, flags);
7097 list_add_rcu(&tg->list, &task_groups); 7239 list_add_rcu(&tg->list, &task_groups);
7098 7240
@@ -7102,12 +7244,6 @@ struct task_group *sched_create_group(struct task_group *parent)
7102 INIT_LIST_HEAD(&tg->children); 7244 INIT_LIST_HEAD(&tg->children);
7103 list_add_rcu(&tg->siblings, &parent->children); 7245 list_add_rcu(&tg->siblings, &parent->children);
7104 spin_unlock_irqrestore(&task_group_lock, flags); 7246 spin_unlock_irqrestore(&task_group_lock, flags);
7105
7106 return tg;
7107
7108err:
7109 free_sched_group(tg);
7110 return ERR_PTR(-ENOMEM);
7111} 7247}
7112 7248
7113/* rcu callback to free various structures associated with a task group */ 7249/* rcu callback to free various structures associated with a task group */
@@ -7120,6 +7256,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
7120/* Destroy runqueue etc associated with a task group */ 7256/* Destroy runqueue etc associated with a task group */
7121void sched_destroy_group(struct task_group *tg) 7257void sched_destroy_group(struct task_group *tg)
7122{ 7258{
7259 /* wait for possible concurrent references to cfs_rqs complete */
7260 call_rcu(&tg->rcu, free_sched_group_rcu);
7261}
7262
7263void sched_offline_group(struct task_group *tg)
7264{
7123 unsigned long flags; 7265 unsigned long flags;
7124 int i; 7266 int i;
7125 7267
@@ -7131,9 +7273,6 @@ void sched_destroy_group(struct task_group *tg)
7131 list_del_rcu(&tg->list); 7273 list_del_rcu(&tg->list);
7132 list_del_rcu(&tg->siblings); 7274 list_del_rcu(&tg->siblings);
7133 spin_unlock_irqrestore(&task_group_lock, flags); 7275 spin_unlock_irqrestore(&task_group_lock, flags);
7134
7135 /* wait for possible concurrent references to cfs_rqs complete */
7136 call_rcu(&tg->rcu, free_sched_group_rcu);
7137} 7276}
7138 7277
7139/* change task's runqueue when it moves between groups. 7278/* change task's runqueue when it moves between groups.
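
The hunks above split the task-group lifecycle into four steps: sched_create_group() only allocates, sched_online_group() publishes the group on the global lists, sched_offline_group() unlinks it, and sched_destroy_group() defers the free through RCU. That lets the cgroup css_alloc/online/offline/free callbacks (and autogroup) drive each step at the right moment. A userspace outline of the same lifecycle, with a plain list and free() standing in for task_groups and call_rcu():

#include <stdio.h>
#include <stdlib.h>

struct group { const char *name; struct group *next; };

static struct group *group_list;        /* stands in for the task_groups list */

static struct group *group_create(const char *name)     /* sched_create_group()  */
{
        struct group *g = calloc(1, sizeof(*g));

        if (g)
                g->name = name;
        return g;
}

static void group_online(struct group *g)                /* sched_online_group()  */
{
        g->next = group_list;
        group_list = g;
}

static void group_offline(struct group *g)               /* sched_offline_group() */
{
        struct group **p;

        for (p = &group_list; *p && *p != g; p = &(*p)->next)
                ;
        if (*p)
                *p = g->next;
}

static void group_destroy(struct group *g)               /* sched_destroy_group() */
{
        free(g);        /* the kernel defers this through call_rcu() */
}

int main(void)
{
        struct group *g = group_create("demo");

        if (!g)
                return 1;
        group_online(g);
        group_offline(g);
        group_destroy(g);
        puts("lifecycle complete");
        return 0;
}
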
@@ -7429,6 +7568,25 @@ static int sched_rt_global_constraints(void)
7429} 7568}
7430#endif /* CONFIG_RT_GROUP_SCHED */ 7569#endif /* CONFIG_RT_GROUP_SCHED */
7431 7570
7571int sched_rr_handler(struct ctl_table *table, int write,
7572 void __user *buffer, size_t *lenp,
7573 loff_t *ppos)
7574{
7575 int ret;
7576 static DEFINE_MUTEX(mutex);
7577
7578 mutex_lock(&mutex);
7579 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7580 /* make sure that internally we keep jiffies */
7581 /* also, writing zero resets timeslice to default */
7582 if (!ret && write) {
7583 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7584 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7585 }
7586 mutex_unlock(&mutex);
7587 return ret;
7588}
7589
7432int sched_rt_handler(struct ctl_table *table, int write, 7590int sched_rt_handler(struct ctl_table *table, int write,
7433 void __user *buffer, size_t *lenp, 7591 void __user *buffer, size_t *lenp,
7434 loff_t *ppos) 7592 loff_t *ppos)
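
sched_rr_handler() above takes the new RR timeslice sysctl in milliseconds, converts it to jiffies for internal use, and treats a write of zero or a negative value as "restore the default". A small sketch of just that fixup, assuming HZ=250 and a 100 ms default purely for illustration:

#include <stdio.h>

#define HZ 250                  /* assumed tick rate for the sketch */
#define DEFAULT_RR_MS 100       /* stands in for RR_TIMESLICE       */

static int msecs_to_jiffies(int ms)
{
        return ms * HZ / 1000;
}

/* Mirror of the post-write fixup: <= 0 means "back to the default",
 * and whatever is kept internally is always in jiffies. */
static int rr_timeslice_fixup(int written_ms)
{
        return written_ms <= 0 ? msecs_to_jiffies(DEFAULT_RR_MS)
                               : msecs_to_jiffies(written_ms);
}

int main(void)
{
        printf("write 30 ms -> %d jiffies\n", rr_timeslice_fixup(30));
        printf("write 0     -> %d jiffies (default restored)\n",
               rr_timeslice_fixup(0));
        return 0;
}
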
@@ -7468,7 +7626,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7468 struct task_group, css); 7626 struct task_group, css);
7469} 7627}
7470 7628
7471static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) 7629static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
7472{ 7630{
7473 struct task_group *tg, *parent; 7631 struct task_group *tg, *parent;
7474 7632
@@ -7485,13 +7643,33 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7485 return &tg->css; 7643 return &tg->css;
7486} 7644}
7487 7645
7488static void cpu_cgroup_destroy(struct cgroup *cgrp) 7646static int cpu_cgroup_css_online(struct cgroup *cgrp)
7647{
7648 struct task_group *tg = cgroup_tg(cgrp);
7649 struct task_group *parent;
7650
7651 if (!cgrp->parent)
7652 return 0;
7653
7654 parent = cgroup_tg(cgrp->parent);
7655 sched_online_group(tg, parent);
7656 return 0;
7657}
7658
7659static void cpu_cgroup_css_free(struct cgroup *cgrp)
7489{ 7660{
7490 struct task_group *tg = cgroup_tg(cgrp); 7661 struct task_group *tg = cgroup_tg(cgrp);
7491 7662
7492 sched_destroy_group(tg); 7663 sched_destroy_group(tg);
7493} 7664}
7494 7665
7666static void cpu_cgroup_css_offline(struct cgroup *cgrp)
7667{
7668 struct task_group *tg = cgroup_tg(cgrp);
7669
7670 sched_offline_group(tg);
7671}
7672
7495static int cpu_cgroup_can_attach(struct cgroup *cgrp, 7673static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7496 struct cgroup_taskset *tset) 7674 struct cgroup_taskset *tset)
7497{ 7675{
@@ -7845,8 +8023,10 @@ static struct cftype cpu_files[] = {
7845 8023
7846struct cgroup_subsys cpu_cgroup_subsys = { 8024struct cgroup_subsys cpu_cgroup_subsys = {
7847 .name = "cpu", 8025 .name = "cpu",
7848 .create = cpu_cgroup_create, 8026 .css_alloc = cpu_cgroup_css_alloc,
7849 .destroy = cpu_cgroup_destroy, 8027 .css_free = cpu_cgroup_css_free,
8028 .css_online = cpu_cgroup_css_online,
8029 .css_offline = cpu_cgroup_css_offline,
7850 .can_attach = cpu_cgroup_can_attach, 8030 .can_attach = cpu_cgroup_can_attach,
7851 .attach = cpu_cgroup_attach, 8031 .attach = cpu_cgroup_attach,
7852 .exit = cpu_cgroup_exit, 8032 .exit = cpu_cgroup_exit,
@@ -7869,7 +8049,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7869struct cpuacct root_cpuacct; 8049struct cpuacct root_cpuacct;
7870 8050
7871/* create a new cpu accounting group */ 8051/* create a new cpu accounting group */
7872static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) 8052static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
7873{ 8053{
7874 struct cpuacct *ca; 8054 struct cpuacct *ca;
7875 8055
@@ -7899,7 +8079,7 @@ out:
7899} 8079}
7900 8080
7901/* destroy an existing cpu accounting group */ 8081/* destroy an existing cpu accounting group */
7902static void cpuacct_destroy(struct cgroup *cgrp) 8082static void cpuacct_css_free(struct cgroup *cgrp)
7903{ 8083{
7904 struct cpuacct *ca = cgroup_ca(cgrp); 8084 struct cpuacct *ca = cgroup_ca(cgrp);
7905 8085
@@ -8070,9 +8250,15 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8070 8250
8071struct cgroup_subsys cpuacct_subsys = { 8251struct cgroup_subsys cpuacct_subsys = {
8072 .name = "cpuacct", 8252 .name = "cpuacct",
8073 .create = cpuacct_create, 8253 .css_alloc = cpuacct_css_alloc,
8074 .destroy = cpuacct_destroy, 8254 .css_free = cpuacct_css_free,
8075 .subsys_id = cpuacct_subsys_id, 8255 .subsys_id = cpuacct_subsys_id,
8076 .base_cftypes = files, 8256 .base_cftypes = files,
8077}; 8257};
8078#endif /* CONFIG_CGROUP_CPUACCT */ 8258#endif /* CONFIG_CGROUP_CPUACCT */
8259
8260void dump_cpu_task(int cpu)
8261{
8262 pr_info("Task dump for CPU %d:\n", cpu);
8263 sched_show_task(cpu_curr(cpu));
8264}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 23aa789c53ee..1095e878a46f 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -28,6 +28,8 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/sched.h>
32#include <linux/sched/rt.h>
31#include "cpupri.h" 33#include "cpupri.h"
32 34
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 35/* Convert between a 140 based task->prio, and our 102 based cpupri */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 81b763ba58a6..e93cca92f38b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,7 @@
3#include <linux/tsacct_kern.h> 3#include <linux/tsacct_kern.h>
4#include <linux/kernel_stat.h> 4#include <linux/kernel_stat.h>
5#include <linux/static_key.h> 5#include <linux/static_key.h>
6#include <linux/context_tracking.h>
6#include "sched.h" 7#include "sched.h"
7 8
8 9
@@ -43,7 +44,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
43 * Called before incrementing preempt_count on {soft,}irq_enter 44 * Called before incrementing preempt_count on {soft,}irq_enter
44 * and before decrementing preempt_count on {soft,}irq_exit. 45 * and before decrementing preempt_count on {soft,}irq_exit.
45 */ 46 */
46void vtime_account(struct task_struct *curr) 47void irqtime_account_irq(struct task_struct *curr)
47{ 48{
48 unsigned long flags; 49 unsigned long flags;
49 s64 delta; 50 s64 delta;
@@ -73,7 +74,7 @@ void vtime_account(struct task_struct *curr)
73 irq_time_write_end(); 74 irq_time_write_end();
74 local_irq_restore(flags); 75 local_irq_restore(flags);
75} 76}
76EXPORT_SYMBOL_GPL(vtime_account); 77EXPORT_SYMBOL_GPL(irqtime_account_irq);
77 78
78static int irqtime_account_hi_update(void) 79static int irqtime_account_hi_update(void)
79{ 80{
@@ -163,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
163 task_group_account_field(p, index, (__force u64) cputime); 164 task_group_account_field(p, index, (__force u64) cputime);
164 165
165 /* Account for user time used */ 166 /* Account for user time used */
166 acct_update_integrals(p); 167 acct_account_cputime(p);
167} 168}
168 169
169/* 170/*
@@ -213,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
213 task_group_account_field(p, index, (__force u64) cputime); 214 task_group_account_field(p, index, (__force u64) cputime);
214 215
215 /* Account for system time used */ 216 /* Account for system time used */
216 acct_update_integrals(p); 217 acct_account_cputime(p);
217} 218}
218 219
219/* 220/*
@@ -288,7 +289,35 @@ static __always_inline bool steal_account_process_tick(void)
288 return false; 289 return false;
289} 290}
290 291
291#ifndef CONFIG_VIRT_CPU_ACCOUNTING 292/*
293 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
294 * tasks (sum on group iteration) belonging to @tsk's group.
295 */
296void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
297{
298 struct signal_struct *sig = tsk->signal;
299 cputime_t utime, stime;
300 struct task_struct *t;
301
302 times->utime = sig->utime;
303 times->stime = sig->stime;
304 times->sum_exec_runtime = sig->sum_sched_runtime;
305
306 rcu_read_lock();
307 /* make sure we can trust tsk->thread_group list */
308 if (!likely(pid_alive(tsk)))
309 goto out;
310
311 t = tsk;
312 do {
313 task_cputime(t, &utime, &stime);
314 times->utime += utime;
315 times->stime += stime;
316 times->sum_exec_runtime += task_sched_runtime(t);
317 } while_each_thread(tsk, t);
318out:
319 rcu_read_unlock();
320}
292 321
293#ifdef CONFIG_IRQ_TIME_ACCOUNTING 322#ifdef CONFIG_IRQ_TIME_ACCOUNTING
294/* 323/*
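
thread_group_cputime(), moved here in the hunk above, starts from the totals already folded into the shared signal struct by exited threads and then adds each live thread's current counters. With the RCU protection and the pid_alive() check stripped away, the accounting itself is just a sum (userspace sketch with illustrative types):

#include <stdio.h>
#include <stdint.h>

struct thread  { uint64_t utime, stime; };
struct process {
        uint64_t dead_utime, dead_stime;        /* sig->utime / sig->stime */
        struct thread *threads;                 /* live threads            */
        int nr_threads;
};

static void process_cputime(const struct process *p, uint64_t *ut, uint64_t *st)
{
        *ut = p->dead_utime;
        *st = p->dead_stime;
        for (int i = 0; i < p->nr_threads; i++) {
                *ut += p->threads[i].utime;
                *st += p->threads[i].stime;
        }
}

int main(void)
{
        struct thread t[2] = { { 10, 4 }, { 7, 1 } };
        struct process p = { .dead_utime = 100, .dead_stime = 30,
                             .threads = t, .nr_threads = 2 };
        uint64_t ut, st;

        process_cputime(&p, &ut, &st);
        printf("utime=%llu stime=%llu\n",
               (unsigned long long)ut, (unsigned long long)st);
        return 0;
}
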
@@ -354,11 +383,12 @@ static void irqtime_account_idle_ticks(int ticks)
354 irqtime_account_process_tick(current, 0, rq); 383 irqtime_account_process_tick(current, 0, rq);
355} 384}
356#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 385#else /* CONFIG_IRQ_TIME_ACCOUNTING */
357static void irqtime_account_idle_ticks(int ticks) {} 386static inline void irqtime_account_idle_ticks(int ticks) {}
358static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 387static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
359 struct rq *rq) {} 388 struct rq *rq) {}
360#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 389#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
361 390
391#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
362/* 392/*
363 * Account a single tick of cpu time. 393 * Account a single tick of cpu time.
364 * @p: the process that the cpu time gets accounted to 394 * @p: the process that the cpu time gets accounted to
@@ -369,6 +399,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
369 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 399 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
370 struct rq *rq = this_rq(); 400 struct rq *rq = this_rq();
371 401
402 if (vtime_accounting_enabled())
403 return;
404
372 if (sched_clock_irqtime) { 405 if (sched_clock_irqtime) {
373 irqtime_account_process_tick(p, user_tick, rq); 406 irqtime_account_process_tick(p, user_tick, rq);
374 return; 407 return;
@@ -410,20 +443,19 @@ void account_idle_ticks(unsigned long ticks)
410 443
411 account_idle_time(jiffies_to_cputime(ticks)); 444 account_idle_time(jiffies_to_cputime(ticks));
412} 445}
413 446#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
414#endif
415 447
416/* 448/*
417 * Use precise platform statistics if available: 449 * Use precise platform statistics if available:
418 */ 450 */
419#ifdef CONFIG_VIRT_CPU_ACCOUNTING 451#ifdef CONFIG_VIRT_CPU_ACCOUNTING
420void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 452void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
421{ 453{
422 *ut = p->utime; 454 *ut = p->utime;
423 *st = p->stime; 455 *st = p->stime;
424} 456}
425 457
426void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 458void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
427{ 459{
428 struct task_cputime cputime; 460 struct task_cputime cputime;
429 461
@@ -433,6 +465,24 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
433 *st = cputime.stime; 465 *st = cputime.stime;
434} 466}
435 467
468#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
469void vtime_task_switch(struct task_struct *prev)
470{
471 if (!vtime_accounting_enabled())
472 return;
473
474 if (is_idle_task(prev))
475 vtime_account_idle(prev);
476 else
477 vtime_account_system(prev);
478
479#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
480 vtime_account_user(prev);
481#endif
482 arch_vtime_task_switch(prev);
483}
484#endif
485
436/* 486/*
437 * Archs that account the whole time spent in the idle task 487 * Archs that account the whole time spent in the idle task
438 * (outside irq) as idle time can rely on this and just implement 488 * (outside irq) as idle time can rely on this and just implement
@@ -442,33 +492,40 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
442 * vtime_account(). 492 * vtime_account().
443 */ 493 */
444#ifndef __ARCH_HAS_VTIME_ACCOUNT 494#ifndef __ARCH_HAS_VTIME_ACCOUNT
445void vtime_account(struct task_struct *tsk) 495void vtime_account_irq_enter(struct task_struct *tsk)
446{ 496{
447 unsigned long flags; 497 if (!vtime_accounting_enabled())
448 498 return;
449 local_irq_save(flags);
450
451 if (in_interrupt() || !is_idle_task(tsk))
452 vtime_account_system(tsk);
453 else
454 vtime_account_idle(tsk);
455 499
456 local_irq_restore(flags); 500 if (!in_interrupt()) {
501 /*
502 * If we interrupted user, context_tracking_in_user()
503 * is 1 because the context tracking don't hook
504 * on irq entry/exit. This way we know if
505 * we need to flush user time on kernel entry.
506 */
507 if (context_tracking_in_user()) {
508 vtime_account_user(tsk);
509 return;
510 }
511
512 if (is_idle_task(tsk)) {
513 vtime_account_idle(tsk);
514 return;
515 }
516 }
517 vtime_account_system(tsk);
457} 518}
458EXPORT_SYMBOL_GPL(vtime_account); 519EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
459#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 520#endif /* __ARCH_HAS_VTIME_ACCOUNT */
460 521
461#else 522#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
462 523
463#ifndef nsecs_to_cputime 524static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
464# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
465#endif
466
467static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
468{ 525{
469 u64 temp = (__force u64) rtime; 526 u64 temp = (__force u64) rtime;
470 527
471 temp *= (__force u64) utime; 528 temp *= (__force u64) stime;
472 529
473 if (sizeof(cputime_t) == 4) 530 if (sizeof(cputime_t) == 4)
474 temp = div_u64(temp, (__force u32) total); 531 temp = div_u64(temp, (__force u32) total);
@@ -478,53 +535,283 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
478 return (__force cputime_t) temp; 535 return (__force cputime_t) temp;
479} 536}
480 537
481void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 538/*
539 * Adjust tick based cputime random precision against scheduler
540 * runtime accounting.
541 */
542static void cputime_adjust(struct task_cputime *curr,
543 struct cputime *prev,
544 cputime_t *ut, cputime_t *st)
482{ 545{
483 cputime_t rtime, utime = p->utime, total = utime + p->stime; 546 cputime_t rtime, stime, total;
547
548 stime = curr->stime;
549 total = stime + curr->utime;
484 550
485 /* 551 /*
486 * Use CFS's precise accounting: 552 * Tick based cputime accounting depend on random scheduling
553 * timeslices of a task to be interrupted or not by the timer.
554 * Depending on these circumstances, the number of these interrupts
555 * may be over or under-optimistic, matching the real user and system
556 * cputime with a variable precision.
557 *
558 * Fix this by scaling these tick based values against the total
559 * runtime accounted by the CFS scheduler.
487 */ 560 */
488 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 561 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
489 562
490 if (total) 563 if (total)
491 utime = scale_utime(utime, rtime, total); 564 stime = scale_stime(stime, rtime, total);
492 else 565 else
493 utime = rtime; 566 stime = rtime;
494 567
495 /* 568 /*
496 * Compare with previous values, to keep monotonicity: 569 * If the tick based count grows faster than the scheduler one,
570 * the result of the scaling may go backward.
571 * Let's enforce monotonicity.
497 */ 572 */
498 p->prev_utime = max(p->prev_utime, utime); 573 prev->stime = max(prev->stime, stime);
499 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); 574 prev->utime = max(prev->utime, rtime - prev->stime);
575
576 *ut = prev->utime;
577 *st = prev->stime;
578}
500 579
501 *ut = p->prev_utime; 580void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
502 *st = p->prev_stime; 581{
582 struct task_cputime cputime = {
583 .sum_exec_runtime = p->se.sum_exec_runtime,
584 };
585
586 task_cputime(p, &cputime.utime, &cputime.stime);
587 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
503} 588}
504 589
505/* 590/*
506 * Must be called with siglock held. 591 * Must be called with siglock held.
507 */ 592 */
508void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 593void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
509{ 594{
510 struct signal_struct *sig = p->signal;
511 struct task_cputime cputime; 595 struct task_cputime cputime;
512 cputime_t rtime, utime, total;
513 596
514 thread_group_cputime(p, &cputime); 597 thread_group_cputime(p, &cputime);
598 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
599}
600#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
515 601
516 total = cputime.utime + cputime.stime; 602#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
517 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 603static unsigned long long vtime_delta(struct task_struct *tsk)
604{
605 unsigned long long clock;
518 606
519 if (total) 607 clock = local_clock();
520 utime = scale_utime(cputime.utime, rtime, total); 608 if (clock < tsk->vtime_snap)
521 else 609 return 0;
522 utime = rtime;
523 610
524 sig->prev_utime = max(sig->prev_utime, utime); 611 return clock - tsk->vtime_snap;
525 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); 612}
526 613
527 *ut = sig->prev_utime; 614static cputime_t get_vtime_delta(struct task_struct *tsk)
528 *st = sig->prev_stime; 615{
616 unsigned long long delta = vtime_delta(tsk);
617
618 WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
619 tsk->vtime_snap += delta;
620
621 /* CHECKME: always safe to convert nsecs to cputime? */
622 return nsecs_to_cputime(delta);
529} 623}
530#endif 624
625static void __vtime_account_system(struct task_struct *tsk)
626{
627 cputime_t delta_cpu = get_vtime_delta(tsk);
628
629 account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
630}
631
632void vtime_account_system(struct task_struct *tsk)
633{
634 if (!vtime_accounting_enabled())
635 return;
636
637 write_seqlock(&tsk->vtime_seqlock);
638 __vtime_account_system(tsk);
639 write_sequnlock(&tsk->vtime_seqlock);
640}
641
642void vtime_account_irq_exit(struct task_struct *tsk)
643{
644 if (!vtime_accounting_enabled())
645 return;
646
647 write_seqlock(&tsk->vtime_seqlock);
648 if (context_tracking_in_user())
649 tsk->vtime_snap_whence = VTIME_USER;
650 __vtime_account_system(tsk);
651 write_sequnlock(&tsk->vtime_seqlock);
652}
653
654void vtime_account_user(struct task_struct *tsk)
655{
656 cputime_t delta_cpu;
657
658 if (!vtime_accounting_enabled())
659 return;
660
661 delta_cpu = get_vtime_delta(tsk);
662
663 write_seqlock(&tsk->vtime_seqlock);
664 tsk->vtime_snap_whence = VTIME_SYS;
665 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
666 write_sequnlock(&tsk->vtime_seqlock);
667}
668
669void vtime_user_enter(struct task_struct *tsk)
670{
671 if (!vtime_accounting_enabled())
672 return;
673
674 write_seqlock(&tsk->vtime_seqlock);
675 tsk->vtime_snap_whence = VTIME_USER;
676 __vtime_account_system(tsk);
677 write_sequnlock(&tsk->vtime_seqlock);
678}
679
680void vtime_guest_enter(struct task_struct *tsk)
681{
682 write_seqlock(&tsk->vtime_seqlock);
683 __vtime_account_system(tsk);
684 current->flags |= PF_VCPU;
685 write_sequnlock(&tsk->vtime_seqlock);
686}
687
688void vtime_guest_exit(struct task_struct *tsk)
689{
690 write_seqlock(&tsk->vtime_seqlock);
691 __vtime_account_system(tsk);
692 current->flags &= ~PF_VCPU;
693 write_sequnlock(&tsk->vtime_seqlock);
694}
695
696void vtime_account_idle(struct task_struct *tsk)
697{
698 cputime_t delta_cpu = get_vtime_delta(tsk);
699
700 account_idle_time(delta_cpu);
701}
702
703bool vtime_accounting_enabled(void)
704{
705 return context_tracking_active();
706}
707
708void arch_vtime_task_switch(struct task_struct *prev)
709{
710 write_seqlock(&prev->vtime_seqlock);
711 prev->vtime_snap_whence = VTIME_SLEEPING;
712 write_sequnlock(&prev->vtime_seqlock);
713
714 write_seqlock(&current->vtime_seqlock);
715 current->vtime_snap_whence = VTIME_SYS;
716 current->vtime_snap = sched_clock();
717 write_sequnlock(&current->vtime_seqlock);
718}
719
720void vtime_init_idle(struct task_struct *t)
721{
722 unsigned long flags;
723
724 write_seqlock_irqsave(&t->vtime_seqlock, flags);
725 t->vtime_snap_whence = VTIME_SYS;
726 t->vtime_snap = sched_clock();
727 write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
728}
729
730cputime_t task_gtime(struct task_struct *t)
731{
732 unsigned int seq;
733 cputime_t gtime;
734
735 do {
736 seq = read_seqbegin(&t->vtime_seqlock);
737
738 gtime = t->gtime;
739 if (t->flags & PF_VCPU)
740 gtime += vtime_delta(t);
741
742 } while (read_seqretry(&t->vtime_seqlock, seq));
743
744 return gtime;
745}
746
747/*
748 * Fetch cputime raw values from fields of task_struct and
749 * add up the pending nohz execution time since the last
750 * cputime snapshot.
751 */
752static void
753fetch_task_cputime(struct task_struct *t,
754 cputime_t *u_dst, cputime_t *s_dst,
755 cputime_t *u_src, cputime_t *s_src,
756 cputime_t *udelta, cputime_t *sdelta)
757{
758 unsigned int seq;
759 unsigned long long delta;
760
761 do {
762 *udelta = 0;
763 *sdelta = 0;
764
765 seq = read_seqbegin(&t->vtime_seqlock);
766
767 if (u_dst)
768 *u_dst = *u_src;
769 if (s_dst)
770 *s_dst = *s_src;
771
772 /* Task is sleeping, nothing to add */
773 if (t->vtime_snap_whence == VTIME_SLEEPING ||
774 is_idle_task(t))
775 continue;
776
777 delta = vtime_delta(t);
778
779 /*
780 * Task runs either in user or kernel space, add pending nohz time to
781 * the right place.
782 */
783 if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
784 *udelta = delta;
785 } else {
786 if (t->vtime_snap_whence == VTIME_SYS)
787 *sdelta = delta;
788 }
789 } while (read_seqretry(&t->vtime_seqlock, seq));
790}
791
792
793void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
794{
795 cputime_t udelta, sdelta;
796
797 fetch_task_cputime(t, utime, stime, &t->utime,
798 &t->stime, &udelta, &sdelta);
799 if (utime)
800 *utime += udelta;
801 if (stime)
802 *stime += sdelta;
803}
804
805void task_cputime_scaled(struct task_struct *t,
806 cputime_t *utimescaled, cputime_t *stimescaled)
807{
808 cputime_t udelta, sdelta;
809
810 fetch_task_cputime(t, utimescaled, stimescaled,
811 &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
812 if (utimescaled)
813 *utimescaled += cputime_to_scaled(udelta);
814 if (stimescaled)
815 *stimescaled += cputime_to_scaled(sdelta);
816}
817#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
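
The cputime_adjust() path introduced above scales the tick-sampled system time against the precise CFS runtime and then clamps both components against the previously reported values, so readers never see utime or stime go backwards. A worked userspace example of that arithmetic (it assumes rtime never falls below the stime reported last time, as the kernel's max() logic does):

#include <stdio.h>
#include <stdint.h>

struct prev_cputime { uint64_t utime, stime; };

/* Scale stime so utime + stime == rtime, then enforce monotonicity the
 * same way cputime_adjust() does with its two max() updates. */
static void cputime_adjust_sketch(uint64_t utime_ticks, uint64_t stime_ticks,
                                  uint64_t rtime, struct prev_cputime *prev,
                                  uint64_t *ut, uint64_t *st)
{
        uint64_t total = utime_ticks + stime_ticks;
        uint64_t stime = total ? stime_ticks * rtime / total : rtime;

        if (stime > prev->stime)
                prev->stime = stime;
        if (rtime - prev->stime > prev->utime)
                prev->utime = rtime - prev->stime;

        *ut = prev->utime;
        *st = prev->stime;
}

int main(void)
{
        struct prev_cputime prev = { 0, 0 };
        uint64_t ut, st;

        /* 3 user ticks and 1 system tick were sampled, but CFS says 1000
         * units actually ran: stime becomes 1000 * 1/4 = 250, utime 750. */
        cputime_adjust_sketch(3, 1, 1000, &prev, &ut, &st);
        printf("utime=%llu stime=%llu\n",
               (unsigned long long)ut, (unsigned long long)st);
        return 0;
}
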
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596e0ea9..75024a673520 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)
61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) 61static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
62{ 62{
63 struct sched_entity *se = tg->se[cpu]; 63 struct sched_entity *se = tg->se[cpu];
64 if (!se)
65 return;
66 64
67#define P(F) \ 65#define P(F) \
68 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 66 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
69#define PN(F) \ 67#define PN(F) \
70 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) 68 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
71 69
70 if (!se) {
71 struct sched_avg *avg = &cpu_rq(cpu)->avg;
72 P(avg->runnable_avg_sum);
73 P(avg->runnable_avg_period);
74 return;
75 }
76
77
72 PN(se->exec_start); 78 PN(se->exec_start);
73 PN(se->vruntime); 79 PN(se->vruntime);
74 PN(se->sum_exec_runtime); 80 PN(se->sum_exec_runtime);
@@ -85,6 +91,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
85 P(se->statistics.wait_count); 91 P(se->statistics.wait_count);
86#endif 92#endif
87 P(se->load.weight); 93 P(se->load.weight);
94#ifdef CONFIG_SMP
95 P(se->avg.runnable_avg_sum);
96 P(se->avg.runnable_avg_period);
97 P(se->avg.load_avg_contrib);
98 P(se->avg.decay_count);
99#endif
88#undef PN 100#undef PN
89#undef P 101#undef P
90} 102}
@@ -98,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
98 if (autogroup_path(tg, group_path, PATH_MAX)) 110 if (autogroup_path(tg, group_path, PATH_MAX))
99 return group_path; 111 return group_path;
100 112
101 /*
102 * May be NULL if the underlying cgroup isn't fully-created yet
103 */
104 if (!tg->css.cgroup) {
105 group_path[0] = '\0';
106 return group_path;
107 }
108 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 113 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
109 return group_path; 114 return group_path;
110} 115}
@@ -206,14 +211,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 211 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 212#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 213#ifdef CONFIG_SMP
209 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", 214 SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg",
210 SPLIT_NS(cfs_rq->load_avg)); 215 cfs_rq->runnable_load_avg);
211 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", 216 SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
212 SPLIT_NS(cfs_rq->load_period)); 217 cfs_rq->blocked_load_avg);
213 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", 218 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg",
214 cfs_rq->load_contribution); 219 (unsigned long long)atomic64_read(&cfs_rq->tg->load_avg));
215 SEQ_printf(m, " .%-30s: %d\n", "load_tg", 220 SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
216 atomic_read(&cfs_rq->tg->load_weight)); 221 cfs_rq->tg_load_contrib);
222 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
223 cfs_rq->tg_runnable_contrib);
224 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
225 atomic_read(&cfs_rq->tg->runnable_avg));
217#endif 226#endif
218 227
219 print_cfs_group_stats(m, cpu, cfs_rq->tg); 228 print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -253,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu)
253 { 262 {
254 unsigned int freq = cpu_khz ? : 1; 263 unsigned int freq = cpu_khz ? : 1;
255 264
256 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", 265 SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
257 cpu, freq / 1000, (freq % 1000)); 266 cpu, freq / 1000, (freq % 1000));
258 } 267 }
259#else 268#else
260 SEQ_printf(m, "\ncpu#%d\n", cpu); 269 SEQ_printf(m, "cpu#%d\n", cpu);
261#endif 270#endif
262 271
263#define P(x) \ 272#define P(x) \
@@ -314,6 +323,7 @@ do { \
314 print_rq(m, rq, cpu); 323 print_rq(m, rq, cpu);
315 rcu_read_unlock(); 324 rcu_read_unlock();
316 spin_unlock_irqrestore(&sched_debug_lock, flags); 325 spin_unlock_irqrestore(&sched_debug_lock, flags);
326 SEQ_printf(m, "\n");
317} 327}
318 328
319static const char *sched_tunable_scaling_names[] = { 329static const char *sched_tunable_scaling_names[] = {
@@ -322,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = {
322 "linear" 332 "linear"
323}; 333};
324 334
325static int sched_debug_show(struct seq_file *m, void *v) 335static void sched_debug_header(struct seq_file *m)
326{ 336{
327 u64 ktime, sched_clk, cpu_clk; 337 u64 ktime, sched_clk, cpu_clk;
328 unsigned long flags; 338 unsigned long flags;
329 int cpu;
330 339
331 local_irq_save(flags); 340 local_irq_save(flags);
332 ktime = ktime_to_ns(ktime_get()); 341 ktime = ktime_to_ns(ktime_get());
@@ -368,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
368#undef PN 377#undef PN
369#undef P 378#undef P
370 379
371 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", 380 SEQ_printf(m, " .%-40s: %d (%s)\n",
381 "sysctl_sched_tunable_scaling",
372 sysctl_sched_tunable_scaling, 382 sysctl_sched_tunable_scaling,
373 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); 383 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
384 SEQ_printf(m, "\n");
385}
374 386
375 for_each_online_cpu(cpu) 387static int sched_debug_show(struct seq_file *m, void *v)
376 print_cpu(m, cpu); 388{
389 int cpu = (unsigned long)(v - 2);
377 390
378 SEQ_printf(m, "\n"); 391 if (cpu != -1)
392 print_cpu(m, cpu);
393 else
394 sched_debug_header(m);
379 395
380 return 0; 396 return 0;
381} 397}
382 398
383void sysrq_sched_debug_show(void) 399void sysrq_sched_debug_show(void)
384{ 400{
385 sched_debug_show(NULL, NULL); 401 int cpu;
402
403 sched_debug_header(NULL);
404 for_each_online_cpu(cpu)
405 print_cpu(NULL, cpu);
406
407}
408
409/*
410 * This iterator needs some explanation.
411 * It returns 1 for the header position.
412 * This means 2 is cpu 0.
413 * In a hotplugged system some cpus, including cpu 0, may be missing so we have
414 * to use cpumask_* to iterate over the cpus.
415 */
416static void *sched_debug_start(struct seq_file *file, loff_t *offset)
417{
418 unsigned long n = *offset;
419
420 if (n == 0)
421 return (void *) 1;
422
423 n--;
424
425 if (n > 0)
426 n = cpumask_next(n - 1, cpu_online_mask);
427 else
428 n = cpumask_first(cpu_online_mask);
429
430 *offset = n + 1;
431
432 if (n < nr_cpu_ids)
433 return (void *)(unsigned long)(n + 2);
434 return NULL;
435}
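To make the iterator's offset arithmetic concrete (a worked example, assuming CPUs 0, 2 and 3 are online and a straight sequential read): the first call sees *offset == 0 and hands back the header token (void *)1; subsequent calls walk the online mask via cpumask_first()/cpumask_next() and return (void *)2, (void *)4 and (void *)5, i.e. cpu + 2 for CPUs 0, 2 and 3; once cpumask_next() runs past nr_cpu_ids the iterator returns NULL and the sequence ends. sched_debug_show() then undoes the encoding with (unsigned long)(v - 2), where -1 means "print the header".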
436
437static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
438{
439 (*offset)++;
440 return sched_debug_start(file, offset);
441}
442
443static void sched_debug_stop(struct seq_file *file, void *data)
444{
445}
446
447static const struct seq_operations sched_debug_sops = {
448 .start = sched_debug_start,
449 .next = sched_debug_next,
450 .stop = sched_debug_stop,
451 .show = sched_debug_show,
452};
453
454static int sched_debug_release(struct inode *inode, struct file *file)
455{
456 seq_release(inode, file);
457
458 return 0;
386} 459}
387 460
388static int sched_debug_open(struct inode *inode, struct file *filp) 461static int sched_debug_open(struct inode *inode, struct file *filp)
389{ 462{
390 return single_open(filp, sched_debug_show, NULL); 463 int ret = 0;
464
465 ret = seq_open(filp, &sched_debug_sops);
466
467 return ret;
391} 468}
392 469
393static const struct file_operations sched_debug_fops = { 470static const struct file_operations sched_debug_fops = {
394 .open = sched_debug_open, 471 .open = sched_debug_open,
395 .read = seq_read, 472 .read = seq_read,
396 .llseek = seq_lseek, 473 .llseek = seq_lseek,
397 .release = single_release, 474 .release = sched_debug_release,
398}; 475};
399 476
400static int __init init_sched_debug_procfs(void) 477static int __init init_sched_debug_procfs(void)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..7a33e5986fc5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/profile.h> 27#include <linux/profile.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/mempolicy.h>
30#include <linux/migrate.h>
31#include <linux/task_work.h>
29 32
30#include <trace/events/sched.h> 33#include <trace/events/sched.h>
31 34
@@ -259,6 +262,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
259 return grp->my_q; 262 return grp->my_q;
260} 263}
261 264
265static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
266 int force_update);
267
262static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 268static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
263{ 269{
264 if (!cfs_rq->on_list) { 270 if (!cfs_rq->on_list) {
@@ -278,6 +284,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
278 } 284 }
279 285
280 cfs_rq->on_list = 1; 286 cfs_rq->on_list = 1;
287 /* We should have no load, but we need to update last_decay. */
288 update_cfs_rq_blocked_load(cfs_rq, 0);
281 } 289 }
282} 290}
283 291
@@ -653,9 +661,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
653 return calc_delta_fair(sched_slice(cfs_rq, se), se); 661 return calc_delta_fair(sched_slice(cfs_rq, se), se);
654} 662}
655 663
656static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
657static void update_cfs_shares(struct cfs_rq *cfs_rq);
658
659/* 664/*
660 * Update the current task's runtime statistics. Skip current tasks that 665 * Update the current task's runtime statistics. Skip current tasks that
661 * are not in our scheduling class. 666 * are not in our scheduling class.
@@ -675,10 +680,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
675 680
676 curr->vruntime += delta_exec_weighted; 681 curr->vruntime += delta_exec_weighted;
677 update_min_vruntime(cfs_rq); 682 update_min_vruntime(cfs_rq);
678
679#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
680 cfs_rq->load_unacc_exec_time += delta_exec;
681#endif
682} 683}
683 684
684static void update_curr(struct cfs_rq *cfs_rq) 685static void update_curr(struct cfs_rq *cfs_rq)
@@ -776,6 +777,230 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
776 * Scheduling class queueing methods: 777 * Scheduling class queueing methods:
777 */ 778 */
778 779
780#ifdef CONFIG_NUMA_BALANCING
781/*
782 * numa task sample period in ms
783 */
784unsigned int sysctl_numa_balancing_scan_period_min = 100;
785unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
786unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
787
788/* Portion of address space to scan in MB */
789unsigned int sysctl_numa_balancing_scan_size = 256;
790
791/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
792unsigned int sysctl_numa_balancing_scan_delay = 1000;
793
794static void task_numa_placement(struct task_struct *p)
795{
796 int seq;
797
798 if (!p->mm) /* for example, ksmd faulting in a user's mm */
799 return;
800 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
801 if (p->numa_scan_seq == seq)
802 return;
803 p->numa_scan_seq = seq;
804
805 /* FIXME: Scheduling placement policy hints go here */
806}
807
808/*
809 * Got a PROT_NONE fault for a page on @node.
810 */
811void task_numa_fault(int node, int pages, bool migrated)
812{
813 struct task_struct *p = current;
814
815 if (!sched_feat_numa(NUMA))
816 return;
817
818 /* FIXME: Allocate task-specific structure for placement policy here */
819
820 /*
821 * If pages are properly placed (did not migrate) then scan slower.
822 * This is reset periodically in case of phase changes
823 */
824 if (!migrated)
825 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
826 p->numa_scan_period + jiffies_to_msecs(10));
827
828 task_numa_placement(p);
829}
830
831static void reset_ptenuma_scan(struct task_struct *p)
832{
833 ACCESS_ONCE(p->mm->numa_scan_seq)++;
834 p->mm->numa_scan_offset = 0;
835}
836
837/*
838 * The expensive part of numa migration is done from task_work context.
839 * Triggered from task_tick_numa().
840 */
841void task_numa_work(struct callback_head *work)
842{
843 unsigned long migrate, next_scan, now = jiffies;
844 struct task_struct *p = current;
845 struct mm_struct *mm = p->mm;
846 struct vm_area_struct *vma;
847 unsigned long start, end;
848 long pages;
849
850 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
851
852 work->next = work; /* protect against double add */
853 /*
854 * Who cares about NUMA placement when they're dying.
855 *
856 * NOTE: make sure not to dereference p->mm before this check,
857 * exit_task_work() happens _after_ exit_mm() so we could be called
858 * without p->mm even though we still had it when we enqueued this
859 * work.
860 */
861 if (p->flags & PF_EXITING)
862 return;
863
864 /*
865 * We do not care about task placement until a task runs on a node
866 * other than the first one used by the address space. This is
867 * largely because migrations are driven by what CPU the task
868 * is running on. If it's never scheduled on another node, it'll
869 * not migrate so why bother trapping the fault.
870 */
871 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
872 mm->first_nid = numa_node_id();
873 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
874 /* Are we running on a new node yet? */
875 if (numa_node_id() == mm->first_nid &&
876 !sched_feat_numa(NUMA_FORCE))
877 return;
878
879 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
880 }
881
882 /*
883 * Reset the scan period if enough time has gone by. Objective is that
884 * scanning will be reduced if pages are properly placed. As tasks
885 * can enter different phases this needs to be re-examined. Lacking
886 * proper tracking of reference behaviour, this blunt hammer is used.
887 */
888 migrate = mm->numa_next_reset;
889 if (time_after(now, migrate)) {
890 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
891 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
892 xchg(&mm->numa_next_reset, next_scan);
893 }
894
895 /*
896 * Enforce maximal scan/migration frequency..
897 */
898 migrate = mm->numa_next_scan;
899 if (time_before(now, migrate))
900 return;
901
902 if (p->numa_scan_period == 0)
903 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
904
905 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
906 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
907 return;
908
909 /*
910 * Do not set pte_numa if the current running node is rate-limited.
911 * This loses statistics on the fault but if we are unwilling to
912 * migrate to this node, it is less likely we can do useful work
913 */
914 if (migrate_ratelimited(numa_node_id()))
915 return;
916
917 start = mm->numa_scan_offset;
918 pages = sysctl_numa_balancing_scan_size;
919 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
920 if (!pages)
921 return;
922
923 down_read(&mm->mmap_sem);
924 vma = find_vma(mm, start);
925 if (!vma) {
926 reset_ptenuma_scan(p);
927 start = 0;
928 vma = mm->mmap;
929 }
930 for (; vma; vma = vma->vm_next) {
931 if (!vma_migratable(vma))
932 continue;
933
934 /* Skip small VMAs. They are not likely to be of relevance */
935 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
936 continue;
937
938 do {
939 start = max(start, vma->vm_start);
940 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
941 end = min(end, vma->vm_end);
942 pages -= change_prot_numa(vma, start, end);
943
944 start = end;
945 if (pages <= 0)
946 goto out;
947 } while (end != vma->vm_end);
948 }
949
950out:
951 /*
952 * It is possible to reach the end of the VMA list but the last few VMAs are
953 * not guaranteed to be vma_migratable. If they are not, we would find the
954 * !migratable VMA on the next scan but not reset the scanner to the start
955 * so check it now.
956 */
957 if (vma)
958 mm->numa_scan_offset = start;
959 else
960 reset_ptenuma_scan(p);
961 up_read(&mm->mmap_sem);
962}
963
964/*
965 * Drive the periodic memory faults..
966 */
967void task_tick_numa(struct rq *rq, struct task_struct *curr)
968{
969 struct callback_head *work = &curr->numa_work;
970 u64 period, now;
971
972 /*
973 * We don't care about NUMA placement if we don't have memory.
974 */
975 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
976 return;
977
978 /*
979 * Using runtime rather than walltime has the dual advantage that
980 * we (mostly) drive the selection from busy threads and that the
981 * task needs to have done some actual work before we bother with
982 * NUMA placement.
983 */
984 now = curr->se.sum_exec_runtime;
985 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
986
987 if (now - curr->node_stamp > period) {
988 if (!curr->node_stamp)
989 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
990 curr->node_stamp = now;
991
992 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
993 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
994 task_work_add(curr, work, true);
995 }
996 }
997}
998#else
999static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1000{
1001}
1002#endif /* CONFIG_NUMA_BALANCING */
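Some rough numbers for the defaults above (a worked example, assuming 4 KiB base pages): one pass of task_numa_work() covers up to sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT) = 256 << 8 = 65536 PTEs, i.e. 256 MB of address space, via change_prot_numa(). A continuously running task re-arms the scan at most once per numa_scan_period of runtime: the period starts at the 100 ms minimum, grows by jiffies_to_msecs(10) (10 ms at HZ=1000) for every fault on an already well-placed page up to the 5000 ms maximum, and is dropped back to the minimum every 60 s by the numa_next_reset handling.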
1003
779static void 1004static void
780account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 1005account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
781{ 1006{
@@ -801,72 +1026,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
801} 1026}
802 1027
803#ifdef CONFIG_FAIR_GROUP_SCHED 1028#ifdef CONFIG_FAIR_GROUP_SCHED
804/* we need this in update_cfs_load and load-balance functions below */
805static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
806# ifdef CONFIG_SMP 1029# ifdef CONFIG_SMP
807static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
808 int global_update)
809{
810 struct task_group *tg = cfs_rq->tg;
811 long load_avg;
812
813 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
814 load_avg -= cfs_rq->load_contribution;
815
816 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
817 atomic_add(load_avg, &tg->load_weight);
818 cfs_rq->load_contribution += load_avg;
819 }
820}
821
822static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
823{
824 u64 period = sysctl_sched_shares_window;
825 u64 now, delta;
826 unsigned long load = cfs_rq->load.weight;
827
828 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
829 return;
830
831 now = rq_of(cfs_rq)->clock_task;
832 delta = now - cfs_rq->load_stamp;
833
834 /* truncate load history at 4 idle periods */
835 if (cfs_rq->load_stamp > cfs_rq->load_last &&
836 now - cfs_rq->load_last > 4 * period) {
837 cfs_rq->load_period = 0;
838 cfs_rq->load_avg = 0;
839 delta = period - 1;
840 }
841
842 cfs_rq->load_stamp = now;
843 cfs_rq->load_unacc_exec_time = 0;
844 cfs_rq->load_period += delta;
845 if (load) {
846 cfs_rq->load_last = now;
847 cfs_rq->load_avg += delta * load;
848 }
849
850 /* consider updating load contribution on each fold or truncate */
851 if (global_update || cfs_rq->load_period > period
852 || !cfs_rq->load_period)
853 update_cfs_rq_load_contribution(cfs_rq, global_update);
854
855 while (cfs_rq->load_period > period) {
856 /*
857 * Inline assembly required to prevent the compiler
858 * optimising this loop into a divmod call.
859 * See __iter_div_u64_rem() for another example of this.
860 */
861 asm("" : "+rm" (cfs_rq->load_period));
862 cfs_rq->load_period /= 2;
863 cfs_rq->load_avg /= 2;
864 }
865
866 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
867 list_del_leaf_cfs_rq(cfs_rq);
868}
869
870static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) 1030static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
871{ 1031{
872 long tg_weight; 1032 long tg_weight;
@@ -876,8 +1036,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
876 * to gain a more accurate current total weight. See 1036 * to gain a more accurate current total weight. See
877 * update_cfs_rq_load_contribution(). 1037 * update_cfs_rq_load_contribution().
878 */ 1038 */
879 tg_weight = atomic_read(&tg->load_weight); 1039 tg_weight = atomic64_read(&tg->load_avg);
880 tg_weight -= cfs_rq->load_contribution; 1040 tg_weight -= cfs_rq->tg_load_contrib;
881 tg_weight += cfs_rq->load.weight; 1041 tg_weight += cfs_rq->load.weight;
882 1042
883 return tg_weight; 1043 return tg_weight;
@@ -901,27 +1061,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
901 1061
902 return shares; 1062 return shares;
903} 1063}
904
905static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
906{
907 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
908 update_cfs_load(cfs_rq, 0);
909 update_cfs_shares(cfs_rq);
910 }
911}
912# else /* CONFIG_SMP */ 1064# else /* CONFIG_SMP */
913static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
914{
915}
916
917static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 1065static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
918{ 1066{
919 return tg->shares; 1067 return tg->shares;
920} 1068}
921
922static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
923{
924}
925# endif /* CONFIG_SMP */ 1069# endif /* CONFIG_SMP */
926static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 1070static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
927 unsigned long weight) 1071 unsigned long weight)
@@ -939,6 +1083,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
939 account_entity_enqueue(cfs_rq, se); 1083 account_entity_enqueue(cfs_rq, se);
940} 1084}
941 1085
1086static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1087
942static void update_cfs_shares(struct cfs_rq *cfs_rq) 1088static void update_cfs_shares(struct cfs_rq *cfs_rq)
943{ 1089{
944 struct task_group *tg; 1090 struct task_group *tg;
@@ -958,18 +1104,477 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
958 reweight_entity(cfs_rq_of(se), se, shares); 1104 reweight_entity(cfs_rq_of(se), se, shares);
959} 1105}
960#else /* CONFIG_FAIR_GROUP_SCHED */ 1106#else /* CONFIG_FAIR_GROUP_SCHED */
961static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) 1107static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
962{ 1108{
963} 1109}
1110#endif /* CONFIG_FAIR_GROUP_SCHED */
964 1111
965static inline void update_cfs_shares(struct cfs_rq *cfs_rq) 1112/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
1113#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1114/*
1115 * We choose a half-life close to 1 scheduling period.
1116 * Note: The tables below are dependent on this value.
1117 */
1118#define LOAD_AVG_PERIOD 32
1119#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1120#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
1121
1122/* Precomputed fixed inverse multiplies for multiplication by y^n */
1123static const u32 runnable_avg_yN_inv[] = {
1124 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1125 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1126 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1127 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1128 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1129 0x85aac367, 0x82cd8698,
1130};
1131
1132/*
1133 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
1134 * over-estimates when re-combining.
1135 */
1136static const u32 runnable_avg_yN_sum[] = {
1137 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1138 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1139 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1140};
1141
1142/*
1143 * Approximate:
1144 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
1145 */
1146static __always_inline u64 decay_load(u64 val, u64 n)
1147{
1148 unsigned int local_n;
1149
1150 if (!n)
1151 return val;
1152 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1153 return 0;
1154
1155 /* after bounds checking we can collapse to 32-bit */
1156 local_n = n;
1157
1158 /*
1159 * As y^PERIOD = 1/2, we can combine
1160 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1161 * With a look-up table which covers k^n (n<PERIOD)
1162 *
1163 * To achieve constant time decay_load.
1164 */
1165 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1166 val >>= local_n / LOAD_AVG_PERIOD;
1167 local_n %= LOAD_AVG_PERIOD;
1168 }
1169
1170 val *= runnable_avg_yN_inv[local_n];
1171 /* We don't use SRR here since we always want to round down. */
1172 return val >> 32;
1173}
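As a sanity check on the fixed-point arithmetic, here is a small self-contained userspace sketch (an illustration, not kernel code) that mirrors decay_load() with the same table. It prints 1002, 511 and 255 for decaying a 1024-unit contribution over 1, 32 and 64 periods — roughly 1024*y, 1024/2 and 1024/4, as expected for y^32 = 0.5.

	#include <stdint.h>
	#include <stdio.h>

	#define LOAD_AVG_PERIOD 32

	/* same table as above: entry n is roughly 2^32 * y^n, 0 <= n < 32 */
	static const uint32_t runnable_avg_yN_inv[] = {
		0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
		0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
		0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
		0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
		0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
		0x85aac367, 0x82cd8698,
	};

	/* val * y^n: whole half-lives become shifts, the remainder uses the table */
	static uint64_t decay_load(uint64_t val, uint64_t n)
	{
		if (!n)
			return val;
		if (n > LOAD_AVG_PERIOD * 63)
			return 0;
		if (n >= LOAD_AVG_PERIOD) {
			val >>= n / LOAD_AVG_PERIOD;	/* y^32 = 1/2 per full period */
			n %= LOAD_AVG_PERIOD;
		}
		return (val * runnable_avg_yN_inv[n]) >> 32;
	}

	int main(void)
	{
		printf("%llu %llu %llu\n",
		       (unsigned long long)decay_load(1024, 1),	/* 1002 */
		       (unsigned long long)decay_load(1024, 32),	/* 511  */
		       (unsigned long long)decay_load(1024, 64));	/* 255  */
		return 0;
	}

Only entries 0 and 1 of the table are exercised here, but the full table is kept so the helper matches the kernel version above.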
1174
1175/*
1176 * For updates fully spanning n periods, the contribution to runnable
1177 * average will be: \Sum 1024*y^n
1178 *
1179 * We can compute this reasonably efficiently by combining:
1180 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
1181 */
1182static u32 __compute_runnable_contrib(u64 n)
1183{
1184 u32 contrib = 0;
1185
1186 if (likely(n <= LOAD_AVG_PERIOD))
1187 return runnable_avg_yN_sum[n];
1188 else if (unlikely(n >= LOAD_AVG_MAX_N))
1189 return LOAD_AVG_MAX;
1190
1191 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1192 do {
1193 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1194 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1195
1196 n -= LOAD_AVG_PERIOD;
1197 } while (n > LOAD_AVG_PERIOD);
1198
1199 contrib = decay_load(contrib, n);
1200 return contrib + runnable_avg_yN_sum[n];
1201}
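Worked numbers for the helper above: for n <= LOAD_AVG_PERIOD the answer is a straight table hit, e.g. __compute_runnable_contrib(8) = 7437 ~= \Sum_{k=1..8} 1024*y^k. For n = 40, one full 32-period block (23371) is folded in, decayed by the remaining 8 periods and topped up with runnable_avg_yN_sum[8], landing a little over 27000. Any n >= LOAD_AVG_MAX_N = 345 saturates at LOAD_AVG_MAX = 47742, the precomputed ceiling of the series.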
1202
1203/*
1204 * We can represent the historical contribution to runnable average as the
1205 * coefficients of a geometric series. To do this we sub-divide our runnable
1206 * history into segments of approximately 1ms (1024us); label the segment that
1207 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1208 *
1209 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1210 * p0 p1 p2
1211 * (now) (~1ms ago) (~2ms ago)
1212 *
1213 * Let u_i denote the fraction of p_i that the entity was runnable.
1214 *
1215 * We then designate the fractions u_i as our co-efficients, yielding the
1216 * following representation of historical load:
1217 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1218 *
1219 * We choose y based on the width of a reasonable scheduling period, fixing:
1220 * y^32 = 0.5
1221 *
1222 * This means that the contribution to load ~32ms ago (u_32) will be weighted
1223 * approximately half as much as the contribution to load within the last ms
1224 * (u_0).
1225 *
1226 * When a period "rolls over" and we have new u_0`, multiplying the previous
1227 * sum again by y is sufficient to update:
1228 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1229 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1230 */
1231static __always_inline int __update_entity_runnable_avg(u64 now,
1232 struct sched_avg *sa,
1233 int runnable)
966{ 1234{
1235 u64 delta, periods;
1236 u32 runnable_contrib;
1237 int delta_w, decayed = 0;
1238
1239 delta = now - sa->last_runnable_update;
1240 /*
1241 * This should only happen when time goes backwards, which it
1242 * unfortunately does during sched clock init when we swap over to TSC.
1243 */
1244 if ((s64)delta < 0) {
1245 sa->last_runnable_update = now;
1246 return 0;
1247 }
1248
1249 /*
1250 * Use 1024ns as the unit of measurement since it's a reasonable
1251 * approximation of 1us and fast to compute.
1252 */
1253 delta >>= 10;
1254 if (!delta)
1255 return 0;
1256 sa->last_runnable_update = now;
1257
1258 /* delta_w is the amount already accumulated against our next period */
1259 delta_w = sa->runnable_avg_period % 1024;
1260 if (delta + delta_w >= 1024) {
1261 /* period roll-over */
1262 decayed = 1;
1263
1264 /*
1265 * Now that we know we're crossing a period boundary, figure
1266 * out how much from delta we need to complete the current
1267 * period and accrue it.
1268 */
1269 delta_w = 1024 - delta_w;
1270 if (runnable)
1271 sa->runnable_avg_sum += delta_w;
1272 sa->runnable_avg_period += delta_w;
1273
1274 delta -= delta_w;
1275
1276 /* Figure out how many additional periods this update spans */
1277 periods = delta / 1024;
1278 delta %= 1024;
1279
1280 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1281 periods + 1);
1282 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1283 periods + 1);
1284
1285 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1286 runnable_contrib = __compute_runnable_contrib(periods);
1287 if (runnable)
1288 sa->runnable_avg_sum += runnable_contrib;
1289 sa->runnable_avg_period += runnable_contrib;
1290 }
1291
1292 /* Remainder of delta accrued against u_0` */
1293 if (runnable)
1294 sa->runnable_avg_sum += delta;
1295 sa->runnable_avg_period += delta;
1296
1297 return decayed;
967} 1298}
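A worked pass through the rollover logic (made-up numbers): suppose a runnable entity last updated 3,000,000 ns ago and its open period already holds delta_w = 200 units. delta >>= 10 yields 2929 ~1 us units; 200 + 2929 crosses a period boundary, so 824 units first close out the open period, both sums are decayed by periods + 1 = 3 half-life steps (2929 - 824 = 2105 leaves periods = 2 and a 57-unit remainder), __compute_runnable_contrib(2) = 1982 is credited for the two fully-spanned periods, and the trailing 57 units accrue against the new current period.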
968 1299
969static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) 1300/* Synchronize an entity's decay with its parenting cfs_rq.*/
1301static inline u64 __synchronize_entity_decay(struct sched_entity *se)
970{ 1302{
1303 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1304 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1305
1306 decays -= se->avg.decay_count;
1307 if (!decays)
1308 return 0;
1309
1310 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1311 se->avg.decay_count = 0;
1312
1313 return decays;
971} 1314}
972#endif /* CONFIG_FAIR_GROUP_SCHED */ 1315
1316#ifdef CONFIG_FAIR_GROUP_SCHED
1317static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1318 int force_update)
1319{
1320 struct task_group *tg = cfs_rq->tg;
1321 s64 tg_contrib;
1322
1323 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1324 tg_contrib -= cfs_rq->tg_load_contrib;
1325
1326 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1327 atomic64_add(tg_contrib, &tg->load_avg);
1328 cfs_rq->tg_load_contrib += tg_contrib;
1329 }
1330}
1331
1332/*
1333 * Aggregate cfs_rq runnable averages into an equivalent task_group
1334 * representation for computing load contributions.
1335 */
1336static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1337 struct cfs_rq *cfs_rq)
1338{
1339 struct task_group *tg = cfs_rq->tg;
1340 long contrib;
1341
1342 /* The fraction of a cpu used by this cfs_rq */
1343 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1344 sa->runnable_avg_period + 1);
1345 contrib -= cfs_rq->tg_runnable_contrib;
1346
1347 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1348 atomic_add(contrib, &tg->runnable_avg);
1349 cfs_rq->tg_runnable_contrib += contrib;
1350 }
1351}
1352
1353static inline void __update_group_entity_contrib(struct sched_entity *se)
1354{
1355 struct cfs_rq *cfs_rq = group_cfs_rq(se);
1356 struct task_group *tg = cfs_rq->tg;
1357 int runnable_avg;
1358
1359 u64 contrib;
1360
1361 contrib = cfs_rq->tg_load_contrib * tg->shares;
1362 se->avg.load_avg_contrib = div64_u64(contrib,
1363 atomic64_read(&tg->load_avg) + 1);
1364
1365 /*
1366 * For group entities we need to compute a correction term in the case
1367 * that they are consuming <1 cpu so that we would contribute the same
1368 * load as a task of equal weight.
1369 *
1370 * Explicitly co-ordinating this measurement would be expensive, but
1371 * fortunately the sum of each cpu's contribution forms a usable
1372 * lower-bound on the true value.
1373 *
1374 * Consider the aggregate of 2 contributions. Either they are disjoint
1375 * (and the sum represents the true value) or they overlap and we are
1376 * understating by the aggregate of their overlap.
1377 *
1378 * Extending this to N cpus, for a given overlap, the maximum amount we
1379 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
1380 * cpus that overlap for this interval and w_i is the interval width.
1381 *
1382 * On a small machine the first term is well-bounded, which bounds the
1383 * total error since w_i is a subset of the period. Whereas on a
1384 * larger machine, while this first term can be larger, a w_i of
1385 * consequential size is guaranteed to see n_i*w_i quickly converge to
1386 * our upper bound of 1-cpu.
1387 */
1388 runnable_avg = atomic_read(&tg->runnable_avg);
1389 if (runnable_avg < NICE_0_LOAD) {
1390 se->avg.load_avg_contrib *= runnable_avg;
1391 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1392 }
1393}
1394#else
1395static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1396 int force_update) {}
1397static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1398 struct cfs_rq *cfs_rq) {}
1399static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1400#endif
1401
1402static inline void __update_task_entity_contrib(struct sched_entity *se)
1403{
1404 u32 contrib;
1405
1406 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1407 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1408 contrib /= (se->avg.runnable_avg_period + 1);
1409 se->avg.load_avg_contrib = scale_load(contrib);
1410}
1411
1412/* Compute the current contribution to load_avg by se, return any delta */
1413static long __update_entity_load_avg_contrib(struct sched_entity *se)
1414{
1415 long old_contrib = se->avg.load_avg_contrib;
1416
1417 if (entity_is_task(se)) {
1418 __update_task_entity_contrib(se);
1419 } else {
1420 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1421 __update_group_entity_contrib(se);
1422 }
1423
1424 return se->avg.load_avg_contrib - old_contrib;
1425}
1426
1427static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1428 long load_contrib)
1429{
1430 if (likely(load_contrib < cfs_rq->blocked_load_avg))
1431 cfs_rq->blocked_load_avg -= load_contrib;
1432 else
1433 cfs_rq->blocked_load_avg = 0;
1434}
1435
1436static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1437
1438/* Update a sched_entity's runnable average */
1439static inline void update_entity_load_avg(struct sched_entity *se,
1440 int update_cfs_rq)
1441{
1442 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1443 long contrib_delta;
1444 u64 now;
1445
1446 /*
1447 * For a group entity we need to use their owned cfs_rq_clock_task() in
1448 * case they are the parent of a throttled hierarchy.
1449 */
1450 if (entity_is_task(se))
1451 now = cfs_rq_clock_task(cfs_rq);
1452 else
1453 now = cfs_rq_clock_task(group_cfs_rq(se));
1454
1455 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1456 return;
1457
1458 contrib_delta = __update_entity_load_avg_contrib(se);
1459
1460 if (!update_cfs_rq)
1461 return;
1462
1463 if (se->on_rq)
1464 cfs_rq->runnable_load_avg += contrib_delta;
1465 else
1466 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1467}
1468
1469/*
1470 * Decay the load contributed by all blocked children and account this so that
1471 * their contribution may be appropriately discounted when they wake up.
1472 */
1473static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1474{
1475 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1476 u64 decays;
1477
1478 decays = now - cfs_rq->last_decay;
1479 if (!decays && !force_update)
1480 return;
1481
1482 if (atomic64_read(&cfs_rq->removed_load)) {
1483 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1484 subtract_blocked_load_contrib(cfs_rq, removed_load);
1485 }
1486
1487 if (decays) {
1488 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1489 decays);
1490 atomic64_add(decays, &cfs_rq->decay_counter);
1491 cfs_rq->last_decay = now;
1492 }
1493
1494 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1495}
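A note on units: cfs_rq_clock_task() is in nanoseconds, so the ">> 20" above counts blocked-load decay periods of 2^20 ns (~1.05 ms), matching the ~1 ms (1024 us) segments used by __update_entity_runnable_avg(); decay_counter therefore advances by one per such period, and it is this counter that __synchronize_entity_decay() compares against.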
1496
1497static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1498{
1499 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
1500 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1501}
1502
1503/* Add the load generated by se into cfs_rq's child load-average */
1504static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1505 struct sched_entity *se,
1506 int wakeup)
1507{
1508 /*
1509 * We track migrations using entity decay_count <= 0, on a wake-up
1510 * migration we use a negative decay count to track the remote decays
1511 * accumulated while sleeping.
1512 */
1513 if (unlikely(se->avg.decay_count <= 0)) {
1514 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1515 if (se->avg.decay_count) {
1516 /*
1517 * In a wake-up migration we have to approximate the
1518 * time sleeping. This is because we can't synchronize
1519 * clock_task between the two cpus, and it is not
1520 * guaranteed to be read-safe. Instead, we can
1521 * approximate this using our carried decays, which are
1522 * explicitly atomically readable.
1523 */
1524 se->avg.last_runnable_update -= (-se->avg.decay_count)
1525 << 20;
1526 update_entity_load_avg(se, 0);
1527 /* Indicate that we're now synchronized and on-rq */
1528 se->avg.decay_count = 0;
1529 }
1530 wakeup = 0;
1531 } else {
1532 __synchronize_entity_decay(se);
1533 }
1534
1535 /* migrated tasks did not contribute to our blocked load */
1536 if (wakeup) {
1537 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1538 update_entity_load_avg(se, 0);
1539 }
1540
1541 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1542 /* we force update consideration on load-balancer moves */
1543 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1544}
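The "(-se->avg.decay_count) << 20" correction above converts missed ~1 ms decay periods back into nanoseconds (one period = 2^20 ns): a wakee that accumulated, say, 5 remote decays while sleeping has its last_runnable_update pushed back by roughly 5 ms, so the following update_entity_load_avg() call replays an equivalent amount of decay locally without needing the remote cpu's clock_task.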
1545
1546/*
1547 * Remove se's load from this cfs_rq child load-average, if the entity is
1548 * transitioning to a blocked state we track its projected decay using
1549 * blocked_load_avg.
1550 */
1551static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1552 struct sched_entity *se,
1553 int sleep)
1554{
1555 update_entity_load_avg(se, 1);
1556 /* we force update consideration on load-balancer moves */
1557 update_cfs_rq_blocked_load(cfs_rq, !sleep);
1558
1559 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1560 if (sleep) {
1561 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1564}
1565#else
1566static inline void update_entity_load_avg(struct sched_entity *se,
1567 int update_cfs_rq) {}
1568static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1569static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1570 struct sched_entity *se,
1571 int wakeup) {}
1572static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1573 struct sched_entity *se,
1574 int sleep) {}
1575static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1576 int force_update) {}
1577#endif
973 1578
974static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 1579static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
975{ 1580{
@@ -1075,9 +1680,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1075 } 1680 }
1076 1681
1077 /* ensure we never gain time by being placed backwards. */ 1682 /* ensure we never gain time by being placed backwards. */
1078 vruntime = max_vruntime(se->vruntime, vruntime); 1683 se->vruntime = max_vruntime(se->vruntime, vruntime);
1079
1080 se->vruntime = vruntime;
1081} 1684}
1082 1685
1083static void check_enqueue_throttle(struct cfs_rq *cfs_rq); 1686static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -1096,7 +1699,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1096 * Update run-time statistics of the 'current'. 1699 * Update run-time statistics of the 'current'.
1097 */ 1700 */
1098 update_curr(cfs_rq); 1701 update_curr(cfs_rq);
1099 update_cfs_load(cfs_rq, 0); 1702 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1100 account_entity_enqueue(cfs_rq, se); 1703 account_entity_enqueue(cfs_rq, se);
1101 update_cfs_shares(cfs_rq); 1704 update_cfs_shares(cfs_rq);
1102 1705
@@ -1171,6 +1774,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1171 * Update run-time statistics of the 'current'. 1774 * Update run-time statistics of the 'current'.
1172 */ 1775 */
1173 update_curr(cfs_rq); 1776 update_curr(cfs_rq);
1777 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1174 1778
1175 update_stats_dequeue(cfs_rq, se); 1779 update_stats_dequeue(cfs_rq, se);
1176 if (flags & DEQUEUE_SLEEP) { 1780 if (flags & DEQUEUE_SLEEP) {
@@ -1191,7 +1795,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1191 if (se != cfs_rq->curr) 1795 if (se != cfs_rq->curr)
1192 __dequeue_entity(cfs_rq, se); 1796 __dequeue_entity(cfs_rq, se);
1193 se->on_rq = 0; 1797 se->on_rq = 0;
1194 update_cfs_load(cfs_rq, 0);
1195 account_entity_dequeue(cfs_rq, se); 1798 account_entity_dequeue(cfs_rq, se);
1196 1799
1197 /* 1800 /*
@@ -1340,6 +1943,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1340 update_stats_wait_start(cfs_rq, prev); 1943 update_stats_wait_start(cfs_rq, prev);
1341 /* Put 'current' back into the tree. */ 1944 /* Put 'current' back into the tree. */
1342 __enqueue_entity(cfs_rq, prev); 1945 __enqueue_entity(cfs_rq, prev);
1946 /* in !on_rq case, update occurred at dequeue */
1947 update_entity_load_avg(prev, 1);
1343 } 1948 }
1344 cfs_rq->curr = NULL; 1949 cfs_rq->curr = NULL;
1345} 1950}
@@ -1353,9 +1958,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1353 update_curr(cfs_rq); 1958 update_curr(cfs_rq);
1354 1959
1355 /* 1960 /*
1356 * Update share accounting for long-running entities. 1961 * Ensure that runnable average is periodically updated.
1357 */ 1962 */
1358 update_entity_shares_tick(cfs_rq); 1963 update_entity_load_avg(curr, 1);
1964 update_cfs_rq_blocked_load(cfs_rq, 1);
1359 1965
1360#ifdef CONFIG_SCHED_HRTICK 1966#ifdef CONFIG_SCHED_HRTICK
1361 /* 1967 /*
@@ -1448,6 +2054,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1448 return &tg->cfs_bandwidth; 2054 return &tg->cfs_bandwidth;
1449} 2055}
1450 2056
2057/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2058static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2059{
2060 if (unlikely(cfs_rq->throttle_count))
2061 return cfs_rq->throttled_clock_task;
2062
2063 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2064}
2065
1451/* returns 0 on failure to allocate runtime */ 2066/* returns 0 on failure to allocate runtime */
1452static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 2067static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1453{ 2068{
@@ -1592,14 +2207,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
1592 cfs_rq->throttle_count--; 2207 cfs_rq->throttle_count--;
1593#ifdef CONFIG_SMP 2208#ifdef CONFIG_SMP
1594 if (!cfs_rq->throttle_count) { 2209 if (!cfs_rq->throttle_count) {
1595 u64 delta = rq->clock_task - cfs_rq->load_stamp; 2210 /* adjust cfs_rq_clock_task() */
1596 2211 cfs_rq->throttled_clock_task_time += rq->clock_task -
1597 /* leaving throttled state, advance shares averaging windows */ 2212 cfs_rq->throttled_clock_task;
1598 cfs_rq->load_stamp += delta;
1599 cfs_rq->load_last += delta;
1600
1601 /* update entity weight now that we are on_rq again */
1602 update_cfs_shares(cfs_rq);
1603 } 2213 }
1604#endif 2214#endif
1605 2215
@@ -1611,9 +2221,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
1611 struct rq *rq = data; 2221 struct rq *rq = data;
1612 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 2222 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1613 2223
1614 /* group is entering throttled state, record last load */ 2224 /* group is entering throttled state, stop time */
1615 if (!cfs_rq->throttle_count) 2225 if (!cfs_rq->throttle_count)
1616 update_cfs_load(cfs_rq, 0); 2226 cfs_rq->throttled_clock_task = rq->clock_task;
1617 cfs_rq->throttle_count++; 2227 cfs_rq->throttle_count++;
1618 2228
1619 return 0; 2229 return 0;
@@ -1628,7 +2238,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1628 2238
1629 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 2239 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1630 2240
1631 /* account load preceding throttle */ 2241 /* freeze hierarchy runnable averages while throttled */
1632 rcu_read_lock(); 2242 rcu_read_lock();
1633 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); 2243 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1634 rcu_read_unlock(); 2244 rcu_read_unlock();
@@ -1652,7 +2262,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1652 rq->nr_running -= task_delta; 2262 rq->nr_running -= task_delta;
1653 2263
1654 cfs_rq->throttled = 1; 2264 cfs_rq->throttled = 1;
1655 cfs_rq->throttled_timestamp = rq->clock; 2265 cfs_rq->throttled_clock = rq->clock;
1656 raw_spin_lock(&cfs_b->lock); 2266 raw_spin_lock(&cfs_b->lock);
1657 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 2267 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1658 raw_spin_unlock(&cfs_b->lock); 2268 raw_spin_unlock(&cfs_b->lock);
@@ -1670,10 +2280,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1670 2280
1671 cfs_rq->throttled = 0; 2281 cfs_rq->throttled = 0;
1672 raw_spin_lock(&cfs_b->lock); 2282 raw_spin_lock(&cfs_b->lock);
1673 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; 2283 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
1674 list_del_rcu(&cfs_rq->throttled_list); 2284 list_del_rcu(&cfs_rq->throttled_list);
1675 raw_spin_unlock(&cfs_b->lock); 2285 raw_spin_unlock(&cfs_b->lock);
1676 cfs_rq->throttled_timestamp = 0;
1677 2286
1678 update_rq_clock(rq); 2287 update_rq_clock(rq);
1679 /* update hierarchical throttle state */ 2288 /* update hierarchical throttle state */
@@ -2052,7 +2661,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2052 hrtimer_cancel(&cfs_b->slack_timer); 2661 hrtimer_cancel(&cfs_b->slack_timer);
2053} 2662}
2054 2663
2055static void unthrottle_offline_cfs_rqs(struct rq *rq) 2664static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2056{ 2665{
2057 struct cfs_rq *cfs_rq; 2666 struct cfs_rq *cfs_rq;
2058 2667
@@ -2073,8 +2682,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq)
2073} 2682}
2074 2683
2075#else /* CONFIG_CFS_BANDWIDTH */ 2684#else /* CONFIG_CFS_BANDWIDTH */
2076static __always_inline 2685static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2077void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} 2686{
2687 return rq_of(cfs_rq)->clock_task;
2688}
2689
2690static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2691 unsigned long delta_exec) {}
2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2692static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 2693static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2080static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2694static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -2207,12 +2821,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2207 if (cfs_rq_throttled(cfs_rq)) 2821 if (cfs_rq_throttled(cfs_rq))
2208 break; 2822 break;
2209 2823
2210 update_cfs_load(cfs_rq, 0);
2211 update_cfs_shares(cfs_rq); 2824 update_cfs_shares(cfs_rq);
2825 update_entity_load_avg(se, 1);
2212 } 2826 }
2213 2827
2214 if (!se) 2828 if (!se) {
2829 update_rq_runnable_avg(rq, rq->nr_running);
2215 inc_nr_running(rq); 2830 inc_nr_running(rq);
2831 }
2216 hrtick_update(rq); 2832 hrtick_update(rq);
2217} 2833}
2218 2834
@@ -2266,12 +2882,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2266 if (cfs_rq_throttled(cfs_rq)) 2882 if (cfs_rq_throttled(cfs_rq))
2267 break; 2883 break;
2268 2884
2269 update_cfs_load(cfs_rq, 0);
2270 update_cfs_shares(cfs_rq); 2885 update_cfs_shares(cfs_rq);
2886 update_entity_load_avg(se, 1);
2271 } 2887 }
2272 2888
2273 if (!se) 2889 if (!se) {
2274 dec_nr_running(rq); 2890 dec_nr_running(rq);
2891 update_rq_runnable_avg(rq, 1);
2892 }
2275 hrtick_update(rq); 2893 hrtick_update(rq);
2276} 2894}
2277 2895
@@ -2634,25 +3252,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2634 */ 3252 */
2635static int select_idle_sibling(struct task_struct *p, int target) 3253static int select_idle_sibling(struct task_struct *p, int target)
2636{ 3254{
2637 int cpu = smp_processor_id();
2638 int prev_cpu = task_cpu(p);
2639 struct sched_domain *sd; 3255 struct sched_domain *sd;
2640 struct sched_group *sg; 3256 struct sched_group *sg;
2641 int i; 3257 int i = task_cpu(p);
2642 3258
2643 /* 3259 if (idle_cpu(target))
2644 * If the task is going to be woken-up on this cpu and if it is 3260 return target;
2645 * already idle, then it is the right target.
2646 */
2647 if (target == cpu && idle_cpu(cpu))
2648 return cpu;
2649 3261
2650 /* 3262 /*
2651 * If the task is going to be woken-up on the cpu where it previously 3263 * If the previous cpu is cache affine and idle, don't be stupid.
2652 * ran and if it is currently idle, then it the right target.
2653 */ 3264 */
2654 if (target == prev_cpu && idle_cpu(prev_cpu)) 3265 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
2655 return prev_cpu; 3266 return i;
2656 3267
2657 /* 3268 /*
2658 * Otherwise, iterate the domains and find an eligible idle cpu. 3269 * Otherwise, iterate the domains and find an eligible idle cpu.
@@ -2666,7 +3277,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
2666 goto next; 3277 goto next;
2667 3278
2668 for_each_cpu(i, sched_group_cpus(sg)) { 3279 for_each_cpu(i, sched_group_cpus(sg)) {
2669 if (!idle_cpu(i)) 3280 if (i == target || !idle_cpu(i))
2670 goto next; 3281 goto next;
2671 } 3282 }
2672 3283
@@ -2781,6 +3392,37 @@ unlock:
2781 3392
2782 return new_cpu; 3393 return new_cpu;
2783} 3394}
3395
3396/*
3397 * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below may
3398 * be removed once load-tracking is useful for applications beyond shares
3399 * distribution (e.g. load-balance).
3400 */
3401#ifdef CONFIG_FAIR_GROUP_SCHED
3402/*
3403 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3404 * cfs_rq_of(p) references at time of call are still valid and identify the
3405 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
3406 * other assumptions, including the state of rq->lock, should be made.
3407 */
3408static void
3409migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3410{
3411 struct sched_entity *se = &p->se;
3412 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3413
3414 /*
3415 * Load tracking: accumulate removed load so that it can be processed
3416 * when we next update the owning cfs_rq under rq->lock. Tasks contribute
3417 * to blocked load iff they have a positive decay-count. It can never
3418 * be negative here since on-rq tasks have decay-count == 0.
3419 */
3420 if (se->avg.decay_count) {
3421 se->avg.decay_count = -__synchronize_entity_decay(se);
3422 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3423 }
3424}
3425#endif
2784#endif /* CONFIG_SMP */ 3426#endif /* CONFIG_SMP */
2785 3427
2786static unsigned long 3428static unsigned long
@@ -2907,7 +3549,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
2907 * Batch and idle tasks do not preempt non-idle tasks (their preemption 3549 * Batch and idle tasks do not preempt non-idle tasks (their preemption
2908 * is driven by the tick): 3550 * is driven by the tick):
2909 */ 3551 */
2910 if (unlikely(p->policy != SCHED_NORMAL)) 3552 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
2911 return; 3553 return;
2912 3554
2913 find_matching_se(&se, &pse); 3555 find_matching_se(&se, &pse);
@@ -3033,8 +3675,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3033 3675
3034#ifdef CONFIG_SMP 3676#ifdef CONFIG_SMP
3035/************************************************** 3677/**************************************************
3036 * Fair scheduling class load-balancing methods: 3678 * Fair scheduling class load-balancing methods.
3037 */ 3679 *
3680 * BASICS
3681 *
3682 * The purpose of load-balancing is to achieve the same basic fairness the
3683 * per-cpu scheduler provides, namely provide a proportional amount of compute
3684 * time to each task. This is expressed in the following equation:
3685 *
3686 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
3687 *
3688 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3689 * W_i,0 is defined as:
3690 *
3691 * W_i,0 = \Sum_j w_i,j (2)
3692 *
3693 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3694 * is derived from the nice value as per prio_to_weight[].
3695 *
3696 * The weight average is an exponential decay average of the instantaneous
3697 * weight:
3698 *
3699 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
3700 *
3701 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3702 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3703 * can also include other factors [XXX].
3704 *
3705 * To achieve this balance we define a measure of imbalance which follows
3706 * directly from (1):
3707 *
3708 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
3709 *
3710 * We then move tasks around to minimize the imbalance. In the continuous
3711 * function space it is obvious this converges; in the discrete case we get
3712 * a few fun cases generally called infeasible weight scenarios.
3713 *
3714 * [XXX expand on:
3715 * - infeasible weights;
3716 * - local vs global optima in the discrete case. ]
3717 *
3718 *
3719 * SCHED DOMAINS
3720 *
3721 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3722 * for all i,j solution, we create a tree of cpus that follows the hardware
3723 * topology where each level pairs two lower groups (or better). This results
3724 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3725 * tree to only the first of the previous level and we decrease the frequency
3726 * of load-balance at each level inv. proportional to the number of cpus in
3727 * the groups.
3728 *
3729 * This yields:
3730 *
3731 *    log_2 n
3732 *     \Sum  { (1/2^i) * (n/2^i) * 2^i }  =  O(n)                      (5)
3733 *    i = 0
3734 *
3735 *   where, per level i, 1/2^i is the balancing frequency, n/2^i is the
3736 *   number of cpus doing load-balance, and 2^i is the size of each group;
3737 *   the sum runs over all levels of the domain tree.
3738 *
3739 * Coupled with a limit on how many tasks we can migrate every balance pass,
3740 * this makes (5) the runtime complexity of the balancer.
3741 *
3742 * An important property here is that each CPU is still (indirectly) connected
3743 * to every other cpu in at most O(log n) steps:
3744 *
3745 * The adjacency matrix of the resulting graph is given by:
3746 *
3747 * log_2 n
3748 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
3749 * k = 0
3750 *
3751 * And you'll find that:
3752 *
3753 * A^(log_2 n)_i,j != 0 for all i,j (7)
3754 *
3755 * Showing there's indeed a path between every cpu in at most O(log n) steps.
3756 * The task movement gives a factor of O(m), giving a convergence complexity
3757 * of:
3758 *
3759 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
3760 *
3761 *
3762 * WORK CONSERVING
3763 *
3764 * In order to avoid CPUs going idle while there's still work to do, new idle
3765 * balancing is more aggressive and has the newly idle cpu iterate up the domain
3766 * tree itself instead of relying on other CPUs to bring it work.
3767 *
3768 * This adds some complexity to both (5) and (8) but it reduces the total idle
3769 * time.
3770 *
3771 * [XXX more?]
3772 *
3773 *
3774 * CGROUPS
3775 *
3776 * Cgroups make a horror show out of (2), instead of a simple sum we get:
3777 *
3778 *
3779 *   W_i,0 = \Sum_j  \Prod_k  w_k * ( s_k,i / S_k )                     (9)
3780 *
3781 *
3782 * Where
3783 *
3784 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
3785 *
3786 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3787 *
3788 * The big problem is S_k: it is a global sum needed to compute a local (W_i)
3789 * property.
3790 *
3791 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3792 * rewrite all of this once again.]
3793 */
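To make (5) concrete: level i balances at frequency 1/2^i with n/2^i cpus, each over a group of 2^i cpus, so it contributes (1/2^i) * (n/2^i) * 2^i = n/2^i units of work. For n = 8 that is 8 + 4 + 2 + 1 = 15 < 2n, which is the O(n) bound claimed above.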
3038 3794
3039static unsigned long __read_mostly max_load_balance_interval = HZ/10; 3795static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3040 3796
@@ -3300,52 +4056,58 @@ next:
3300/* 4056/*
3301 * update tg->load_weight by folding this cpu's load_avg 4057 * update tg->load_weight by folding this cpu's load_avg
3302 */ 4058 */
3303static int update_shares_cpu(struct task_group *tg, int cpu) 4059static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
3304{ 4060{
3305 struct cfs_rq *cfs_rq; 4061 struct sched_entity *se = tg->se[cpu];
3306 unsigned long flags; 4062 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
3307 struct rq *rq;
3308
3309 if (!tg->se[cpu])
3310 return 0;
3311
3312 rq = cpu_rq(cpu);
3313 cfs_rq = tg->cfs_rq[cpu];
3314
3315 raw_spin_lock_irqsave(&rq->lock, flags);
3316
3317 update_rq_clock(rq);
3318 update_cfs_load(cfs_rq, 1);
3319 4063
3320 /* 4064 /* throttled entities do not contribute to load */
3321 * We need to update shares after updating tg->load_weight in 4065 if (throttled_hierarchy(cfs_rq))
3322 * order to adjust the weight of groups with long running tasks. 4066 return;
3323 */
3324 update_cfs_shares(cfs_rq);
3325 4067
3326 raw_spin_unlock_irqrestore(&rq->lock, flags); 4068 update_cfs_rq_blocked_load(cfs_rq, 1);
3327 4069
3328 return 0; 4070 if (se) {
4071 update_entity_load_avg(se, 1);
4072 /*
4073 * We pivot on our runnable average having decayed to zero for
4074 * list removal. This generally implies that all our children
4075 * have also been removed (modulo rounding error or bandwidth
4076 * control); however, such cases are rare and we can fix these
4077 * at enqueue.
4078 *
4079 * TODO: fix up out-of-order children on enqueue.
4080 */
4081 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4082 list_del_leaf_cfs_rq(cfs_rq);
4083 } else {
4084 struct rq *rq = rq_of(cfs_rq);
4085 update_rq_runnable_avg(rq, rq->nr_running);
4086 }
3329} 4087}
3330 4088
3331static void update_shares(int cpu) 4089static void update_blocked_averages(int cpu)
3332{ 4090{
3333 struct cfs_rq *cfs_rq;
3334 struct rq *rq = cpu_rq(cpu); 4091 struct rq *rq = cpu_rq(cpu);
4092 struct cfs_rq *cfs_rq;
4093 unsigned long flags;
3335 4094
3336 rcu_read_lock(); 4095 raw_spin_lock_irqsave(&rq->lock, flags);
4096 update_rq_clock(rq);
3337 /* 4097 /*
3338 * Iterates the task_group tree in a bottom up fashion, see 4098 * Iterates the task_group tree in a bottom up fashion, see
3339 * list_add_leaf_cfs_rq() for details. 4099 * list_add_leaf_cfs_rq() for details.
3340 */ 4100 */
3341 for_each_leaf_cfs_rq(rq, cfs_rq) { 4101 for_each_leaf_cfs_rq(rq, cfs_rq) {
3342 /* throttled entities do not contribute to load */ 4102 /*
3343 if (throttled_hierarchy(cfs_rq)) 4103 * Note: We may want to consider periodically releasing
3344 continue; 4104 * rq->lock about these updates so that creating many task
3345 4105 * groups does not result in continually extending hold time.
3346 update_shares_cpu(cfs_rq->tg, cpu); 4106 */
4107 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
3347 } 4108 }
3348 rcu_read_unlock(); 4109
4110 raw_spin_unlock_irqrestore(&rq->lock, flags);
3349} 4111}
3350 4112
3351/* 4113/*
@@ -3397,7 +4159,7 @@ static unsigned long task_h_load(struct task_struct *p)
3397 return load; 4159 return load;
3398} 4160}
3399#else 4161#else
3400static inline void update_shares(int cpu) 4162static inline void update_blocked_averages(int cpu)
3401{ 4163{
3402} 4164}
3403 4165
@@ -4457,12 +5219,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)
4457 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5219 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4458 return; 5220 return;
4459 5221
5222 update_rq_runnable_avg(this_rq, 1);
5223
4460 /* 5224 /*
4461 * Drop the rq->lock, but keep IRQ/preempt disabled. 5225 * Drop the rq->lock, but keep IRQ/preempt disabled.
4462 */ 5226 */
4463 raw_spin_unlock(&this_rq->lock); 5227 raw_spin_unlock(&this_rq->lock);
4464 5228
4465 update_shares(this_cpu); 5229 update_blocked_averages(this_cpu);
4466 rcu_read_lock(); 5230 rcu_read_lock();
4467 for_each_domain(this_cpu, sd) { 5231 for_each_domain(this_cpu, sd) {
4468 unsigned long interval; 5232 unsigned long interval;
@@ -4717,7 +5481,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4717 int update_next_balance = 0; 5481 int update_next_balance = 0;
4718 int need_serialize; 5482 int need_serialize;
4719 5483
4720 update_shares(cpu); 5484 update_blocked_averages(cpu);
4721 5485
4722 rcu_read_lock(); 5486 rcu_read_lock();
4723 for_each_domain(cpu, sd) { 5487 for_each_domain(cpu, sd) {
@@ -4954,6 +5718,11 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4954 cfs_rq = cfs_rq_of(se); 5718 cfs_rq = cfs_rq_of(se);
4955 entity_tick(cfs_rq, se, queued); 5719 entity_tick(cfs_rq, se, queued);
4956 } 5720 }
5721
5722 if (sched_feat_numa(NUMA))
5723 task_tick_numa(rq, curr);
5724
5725 update_rq_runnable_avg(rq, 1);
4957} 5726}
4958 5727
4959/* 5728/*
@@ -5046,6 +5815,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
5046 place_entity(cfs_rq, se, 0); 5815 place_entity(cfs_rq, se, 0);
5047 se->vruntime -= cfs_rq->min_vruntime; 5816 se->vruntime -= cfs_rq->min_vruntime;
5048 } 5817 }
5818
5819#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5820 /*
5821 * Remove our load from contribution when we leave sched_fair
5822 * and ensure we don't carry in an old decay_count if we
5823 * switch back.
5824 */
5825 if (p->se.avg.decay_count) {
5826 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5827 __synchronize_entity_decay(&p->se);
5828 subtract_blocked_load_contrib(cfs_rq,
5829 p->se.avg.load_avg_contrib);
5830 }
5831#endif
5049} 5832}
5050 5833
5051/* 5834/*
@@ -5092,11 +5875,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
5092#ifndef CONFIG_64BIT 5875#ifndef CONFIG_64BIT
5093 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5876 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5094#endif 5877#endif
5878#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5879 atomic64_set(&cfs_rq->decay_counter, 1);
5880 atomic64_set(&cfs_rq->removed_load, 0);
5881#endif
5095} 5882}
5096 5883
5097#ifdef CONFIG_FAIR_GROUP_SCHED 5884#ifdef CONFIG_FAIR_GROUP_SCHED
5098static void task_move_group_fair(struct task_struct *p, int on_rq) 5885static void task_move_group_fair(struct task_struct *p, int on_rq)
5099{ 5886{
5887 struct cfs_rq *cfs_rq;
5100 /* 5888 /*
5101 * If the task was not on the rq at the time of this cgroup movement 5889 * If the task was not on the rq at the time of this cgroup movement
5102 * it must have been asleep, sleeping tasks keep their ->vruntime 5890 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5128,8 +5916,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
5128 if (!on_rq) 5916 if (!on_rq)
5129 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 5917 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5130 set_task_rq(p, task_cpu(p)); 5918 set_task_rq(p, task_cpu(p));
5131 if (!on_rq) 5919 if (!on_rq) {
5132 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5920 cfs_rq = cfs_rq_of(&p->se);
5921 p->se.vruntime += cfs_rq->min_vruntime;
5922#ifdef CONFIG_SMP
5923 /*
5924 * migrate_task_rq_fair() will have removed our previous
5925 * contribution, but we must synchronize for ongoing future
5926 * decay.
5927 */
5928 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
5929 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
5930#endif
5931 }
5133} 5932}
5134 5933
5135void free_fair_sched_group(struct task_group *tg) 5934void free_fair_sched_group(struct task_group *tg)
@@ -5214,10 +6013,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5214 6013
5215 cfs_rq->tg = tg; 6014 cfs_rq->tg = tg;
5216 cfs_rq->rq = rq; 6015 cfs_rq->rq = rq;
5217#ifdef CONFIG_SMP
5218 /* allow initial update_cfs_load() to truncate */
5219 cfs_rq->load_stamp = 1;
5220#endif
5221 init_cfs_rq_runtime(cfs_rq); 6016 init_cfs_rq_runtime(cfs_rq);
5222 6017
5223 tg->cfs_rq[cpu] = cfs_rq; 6018 tg->cfs_rq[cpu] = cfs_rq;
@@ -5297,7 +6092,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
5297 * idle runqueue: 6092 * idle runqueue:
5298 */ 6093 */
5299 if (rq->cfs.load.weight) 6094 if (rq->cfs.load.weight)
5300 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 6095 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
5301 6096
5302 return rr_interval; 6097 return rr_interval;
5303} 6098}
@@ -5319,7 +6114,9 @@ const struct sched_class fair_sched_class = {
5319 6114
5320#ifdef CONFIG_SMP 6115#ifdef CONFIG_SMP
5321 .select_task_rq = select_task_rq_fair, 6116 .select_task_rq = select_task_rq_fair,
5322 6117#ifdef CONFIG_FAIR_GROUP_SCHED
6118 .migrate_task_rq = migrate_task_rq_fair,
6119#endif
5323 .rq_online = rq_online_fair, 6120 .rq_online = rq_online_fair,
5324 .rq_offline = rq_offline_fair, 6121 .rq_offline = rq_offline_fair,
5325 6122
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..1ad1d2b5395f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -32,6 +32,11 @@ SCHED_FEAT(LAST_BUDDY, true)
32SCHED_FEAT(CACHE_HOT_BUDDY, true) 32SCHED_FEAT(CACHE_HOT_BUDDY, true)
33 33
34/* 34/*
35 * Allow wakeup-time preemption of the current task:
36 */
37SCHED_FEAT(WAKEUP_PREEMPTION, true)
38
39/*
35 * Use arch dependent cpu power functions 40 * Use arch dependent cpu power functions
36 */ 41 */
37SCHED_FEAT(ARCH_POWER, true) 42SCHED_FEAT(ARCH_POWER, true)
@@ -61,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true)
61SCHED_FEAT(FORCE_SD_OVERLAP, false) 66SCHED_FEAT(FORCE_SD_OVERLAP, false)
62SCHED_FEAT(RT_RUNTIME_SHARE, true) 67SCHED_FEAT(RT_RUNTIME_SHARE, true)
63SCHED_FEAT(LB_MIN, false) 68SCHED_FEAT(LB_MIN, false)
69
70/*
71 * Apply the automatic NUMA scheduling policy. Enabled automatically
72 * at runtime if running on a NUMA machine. Can be controlled via
73 * numa_balancing=. Allow PTE scanning to be forced on UMA machines
74 * for debugging the core machinery.
75 */
76#ifdef CONFIG_NUMA_BALANCING
77SCHED_FEAT(NUMA, false)
78SCHED_FEAT(NUMA_FORCE, false)
79#endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 418feb01344e..127a2c4cf4ab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
7 7
8#include <linux/slab.h> 8#include <linux/slab.h>
9 9
10int sched_rr_timeslice = RR_TIMESLICE;
11
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 12static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11 13
12struct rt_bandwidth def_rt_bandwidth; 14struct rt_bandwidth def_rt_bandwidth;
@@ -566,7 +568,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
566static int do_balance_runtime(struct rt_rq *rt_rq) 568static int do_balance_runtime(struct rt_rq *rt_rq)
567{ 569{
568 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 570 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
569 struct root_domain *rd = cpu_rq(smp_processor_id())->rd; 571 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
570 int i, weight, more = 0; 572 int i, weight, more = 0;
571 u64 rt_period; 573 u64 rt_period;
572 574
@@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq)
925 return; 927 return;
926 928
927 delta_exec = rq->clock_task - curr->se.exec_start; 929 delta_exec = rq->clock_task - curr->se.exec_start;
928 if (unlikely((s64)delta_exec < 0)) 930 if (unlikely((s64)delta_exec <= 0))
929 delta_exec = 0; 931 return;
930 932
931 schedstat_set(curr->se.statistics.exec_max, 933 schedstat_set(curr->se.statistics.exec_max,
932 max(curr->se.statistics.exec_max, delta_exec)); 934 max(curr->se.statistics.exec_max, delta_exec));
@@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1427static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1429static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1428{ 1430{
1429 if (!task_running(rq, p) && 1431 if (!task_running(rq, p) &&
1430 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && 1432 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1431 (p->nr_cpus_allowed > 1))
1432 return 1; 1433 return 1;
1433 return 0; 1434 return 0;
1434} 1435}
@@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1889 * we may need to handle the pulling of RT tasks 1890 * we may need to handle the pulling of RT tasks
1890 * now. 1891 * now.
1891 */ 1892 */
1892 if (p->on_rq && !rq->rt.rt_nr_running) 1893 if (!p->on_rq || rq->rt.rt_nr_running)
1893 pull_rt_task(rq); 1894 return;
1895
1896 if (pull_rt_task(rq))
1897 resched_task(rq->curr);
1894} 1898}
1895 1899
1896void init_sched_rt_class(void) 1900void init_sched_rt_class(void)
@@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1985 if (soft != RLIM_INFINITY) { 1989 if (soft != RLIM_INFINITY) {
1986 unsigned long next; 1990 unsigned long next;
1987 1991
1988 p->rt.timeout++; 1992 if (p->rt.watchdog_stamp != jiffies) {
1993 p->rt.timeout++;
1994 p->rt.watchdog_stamp = jiffies;
1995 }
1996
1989 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); 1997 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1990 if (p->rt.timeout > next) 1998 if (p->rt.timeout > next)
1991 p->cputime_expires.sched_exp = p->se.sum_exec_runtime; 1999 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
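For context, watchdog() above is what enforces RLIMIT_RTTIME (microseconds of CPU consumed without blocking) via p->rt.timeout, and the new watchdog_stamp check merely keeps that counter from being bumped more than once in the same jiffy. A hedged user-space sketch of tripping the limit (values arbitrary; SCHED_RR normally requires privileges):

#include <stdio.h>
#include <sched.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl = { .rlim_cur = 500000, .rlim_max = 1000000 }; /* us */
	struct sched_param sp = { .sched_priority = 10 };

	if (setrlimit(RLIMIT_RTTIME, &rl))
		perror("setrlimit");
	if (sched_setscheduler(0, SCHED_RR, &sp))
		perror("sched_setscheduler");

	for (;;)
		; /* busy loop: SIGXCPU after the soft limit, SIGKILL at the hard one */
}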
@@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2010 if (--p->rt.time_slice) 2018 if (--p->rt.time_slice)
2011 return; 2019 return;
2012 2020
2013 p->rt.time_slice = RR_TIMESLICE; 2021 p->rt.time_slice = sched_rr_timeslice;
2014 2022
2015 /* 2023 /*
2016 * Requeue to the end of queue if we (and all of our ancestors) are the 2024 * Requeue to the end of queue if we (and all of our ancestors) are the
@@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2041 * Time slice is 0 for SCHED_FIFO tasks 2049 * Time slice is 0 for SCHED_FIFO tasks
2042 */ 2050 */
2043 if (task->policy == SCHED_RR) 2051 if (task->policy == SCHED_RR)
2044 return RR_TIMESLICE; 2052 return sched_rr_timeslice;
2045 else 2053 else
2046 return 0; 2054 return 0;
2047} 2055}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..cc03cfdf469f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,7 @@
1 1
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h>
3#include <linux/mutex.h> 5#include <linux/mutex.h>
4#include <linux/spinlock.h> 6#include <linux/spinlock.h>
5#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
@@ -112,6 +114,8 @@ struct task_group {
112 unsigned long shares; 114 unsigned long shares;
113 115
114 atomic_t load_weight; 116 atomic_t load_weight;
117 atomic64_t load_avg;
118 atomic_t runnable_avg;
115#endif 119#endif
116 120
117#ifdef CONFIG_RT_GROUP_SCHED 121#ifdef CONFIG_RT_GROUP_SCHED
@@ -222,22 +226,29 @@ struct cfs_rq {
222 unsigned int nr_spread_over; 226 unsigned int nr_spread_over;
223#endif 227#endif
224 228
229#ifdef CONFIG_SMP
230/*
231 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
232 * removed when useful for applications beyond shares distribution (e.g.
233 * load-balance).
234 */
225#ifdef CONFIG_FAIR_GROUP_SCHED 235#ifdef CONFIG_FAIR_GROUP_SCHED
226 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
227
228 /* 236 /*
229 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 237 * CFS Load tracking
230 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 238 * Under CFS, load is tracked on a per-entity basis and aggregated up.
231 * (like users, containers etc.) 239 * This allows for the description of both thread and group usage (in
232 * 240 * the FAIR_GROUP_SCHED case).
233 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
234 * list is used during load balance.
235 */ 241 */
236 int on_list; 242 u64 runnable_load_avg, blocked_load_avg;
237 struct list_head leaf_cfs_rq_list; 243 atomic64_t decay_counter, removed_load;
238 struct task_group *tg; /* group that "owns" this runqueue */ 244 u64 last_decay;
245#endif /* CONFIG_FAIR_GROUP_SCHED */
246/* These always depend on CONFIG_FAIR_GROUP_SCHED */
247#ifdef CONFIG_FAIR_GROUP_SCHED
248 u32 tg_runnable_contrib;
249 u64 tg_load_contrib;
250#endif /* CONFIG_FAIR_GROUP_SCHED */
239 251
240#ifdef CONFIG_SMP
241 /* 252 /*
242 * h_load = weight * f(tg) 253 * h_load = weight * f(tg)
243 * 254 *
@@ -245,26 +256,30 @@ struct cfs_rq {
245 * this group. 256 * this group.
246 */ 257 */
247 unsigned long h_load; 258 unsigned long h_load;
259#endif /* CONFIG_SMP */
260
261#ifdef CONFIG_FAIR_GROUP_SCHED
262 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
248 263
249 /* 264 /*
250 * Maintaining per-cpu shares distribution for group scheduling 265 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
266 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
267 * (like users, containers etc.)
251 * 268 *
252 * load_stamp is the last time we updated the load average 269 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
253 * load_last is the last time we updated the load average and saw load 270 * list is used during load balance.
254 * load_unacc_exec_time is currently unaccounted execution time
255 */ 271 */
256 u64 load_avg; 272 int on_list;
257 u64 load_period; 273 struct list_head leaf_cfs_rq_list;
258 u64 load_stamp, load_last, load_unacc_exec_time; 274 struct task_group *tg; /* group that "owns" this runqueue */
259 275
260 unsigned long load_contribution;
261#endif /* CONFIG_SMP */
262#ifdef CONFIG_CFS_BANDWIDTH 276#ifdef CONFIG_CFS_BANDWIDTH
263 int runtime_enabled; 277 int runtime_enabled;
264 u64 runtime_expires; 278 u64 runtime_expires;
265 s64 runtime_remaining; 279 s64 runtime_remaining;
266 280
267 u64 throttled_timestamp; 281 u64 throttled_clock, throttled_clock_task;
282 u64 throttled_clock_task_time;
268 int throttled, throttle_count; 283 int throttled, throttle_count;
269 struct list_head throttled_list; 284 struct list_head throttled_list;
270#endif /* CONFIG_CFS_BANDWIDTH */ 285#endif /* CONFIG_CFS_BANDWIDTH */
@@ -467,6 +482,8 @@ struct rq {
467#ifdef CONFIG_SMP 482#ifdef CONFIG_SMP
468 struct llist_head wake_list; 483 struct llist_head wake_list;
469#endif 484#endif
485
486 struct sched_avg avg;
470}; 487};
471 488
472static inline int cpu_of(struct rq *rq) 489static inline int cpu_of(struct rq *rq)
@@ -648,6 +665,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
648#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 665#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
649#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ 666#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
650 667
668#ifdef CONFIG_NUMA_BALANCING
669#define sched_feat_numa(x) sched_feat(x)
670#ifdef CONFIG_SCHED_DEBUG
671#define numabalancing_enabled sched_feat_numa(NUMA)
672#else
673extern bool numabalancing_enabled;
674#endif /* CONFIG_SCHED_DEBUG */
675#else
676#define sched_feat_numa(x) (0)
677#define numabalancing_enabled (0)
678#endif /* CONFIG_NUMA_BALANCING */
679
651static inline u64 global_rt_period(void) 680static inline u64 global_rt_period(void)
652{ 681{
653 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 682 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
@@ -1212,4 +1241,3 @@ static inline u64 irq_time_read(int cpu)
1212} 1241}
1213#endif /* CONFIG_64BIT */ 1242#endif /* CONFIG_64BIT */
1214#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 1243#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1215
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 903ffa9e8872..e036eda1a9c9 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
21 if (mask_str == NULL) 21 if (mask_str == NULL)
22 return -ENOMEM; 22 return -ENOMEM;
23 23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 24 if (v == (void *)1) {
25 seq_printf(seq, "timestamp %lu\n", jiffies); 25 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
26 for_each_online_cpu(cpu) { 26 seq_printf(seq, "timestamp %lu\n", jiffies);
27 struct rq *rq = cpu_rq(cpu); 27 } else {
28 struct rq *rq;
28#ifdef CONFIG_SMP 29#ifdef CONFIG_SMP
29 struct sched_domain *sd; 30 struct sched_domain *sd;
30 int dcount = 0; 31 int dcount = 0;
31#endif 32#endif
33 cpu = (unsigned long)(v - 2);
34 rq = cpu_rq(cpu);
32 35
33 /* runqueue-specific stats */ 36 /* runqueue-specific stats */
34 seq_printf(seq, 37 seq_printf(seq,
@@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v)
77 return 0; 80 return 0;
78} 81}
79 82
80static int schedstat_open(struct inode *inode, struct file *file) 83/*
 84 * This iterator needs some explanation.
85 * It returns 1 for the header position.
86 * This means 2 is cpu 0.
87 * In a hotplugged system some cpus, including cpu 0, may be missing so we have
88 * to use cpumask_* to iterate over the cpus.
89 */
90static void *schedstat_start(struct seq_file *file, loff_t *offset)
81{ 91{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); 92 unsigned long n = *offset;
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86 93
87 if (!buf) 94 if (n == 0)
88 return -ENOMEM; 95 return (void *) 1;
89 res = single_open(file, show_schedstat, NULL); 96
90 if (!res) { 97 n--;
91 m = file->private_data; 98
92 m->buf = buf; 99 if (n > 0)
93 m->size = size; 100 n = cpumask_next(n - 1, cpu_online_mask);
94 } else 101 else
95 kfree(buf); 102 n = cpumask_first(cpu_online_mask);
96 return res; 103
104 *offset = n + 1;
105
106 if (n < nr_cpu_ids)
107 return (void *)(unsigned long)(n + 2);
108 return NULL;
109}
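With every CPU online and numbered contiguously, the *offset-to-token mapping implemented above reduces to: offset 0 is the header (token 1) and offset n >= 1 is cpu n-1 (token cpu + 2). A stand-alone toy sketch of just that mapping (hot-unplugged CPUs are ignored; names like token_for are invented for illustration):

#include <stdio.h>

#define NR_TOY_CPUS 4	/* pretend cpus 0..3 are online, none missing */

static long token_for(long offset)
{
	if (offset == 0)
		return 1;			/* header token */
	if (offset - 1 < NR_TOY_CPUS)
		return (offset - 1) + 2;	/* cpu token = cpu + 2 */
	return 0;				/* end of sequence */
}

int main(void)
{
	for (long off = 0, tok; (tok = token_for(off)) != 0; off++) {
		if (tok == 1)
			printf("offset %ld -> header\n", off);
		else
			printf("offset %ld -> cpu %ld\n", off, tok - 2);
	}
	return 0;
}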
110
111static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
112{
113 (*offset)++;
114 return schedstat_start(file, offset);
115}
116
117static void schedstat_stop(struct seq_file *file, void *data)
118{
119}
120
121static const struct seq_operations schedstat_sops = {
122 .start = schedstat_start,
123 .next = schedstat_next,
124 .stop = schedstat_stop,
125 .show = show_schedstat,
126};
127
128static int schedstat_open(struct inode *inode, struct file *file)
129{
130 return seq_open(file, &schedstat_sops);
97} 131}
98 132
133static int schedstat_release(struct inode *inode, struct file *file)
134{
135 return 0;
136};
137
99static const struct file_operations proc_schedstat_operations = { 138static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open, 139 .open = schedstat_open,
101 .read = seq_read, 140 .read = seq_read,
102 .llseek = seq_lseek, 141 .llseek = seq_lseek,
103 .release = single_release, 142 .release = schedstat_release,
104}; 143};
105 144
106static int __init proc_schedstat_init(void) 145static int __init proc_schedstat_init(void)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ee376beedaf9..5af44b593770 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -396,25 +396,29 @@ int __secure_computing(int this_syscall)
396#ifdef CONFIG_SECCOMP_FILTER 396#ifdef CONFIG_SECCOMP_FILTER
397 case SECCOMP_MODE_FILTER: { 397 case SECCOMP_MODE_FILTER: {
398 int data; 398 int data;
399 struct pt_regs *regs = task_pt_regs(current);
399 ret = seccomp_run_filters(this_syscall); 400 ret = seccomp_run_filters(this_syscall);
400 data = ret & SECCOMP_RET_DATA; 401 data = ret & SECCOMP_RET_DATA;
401 ret &= SECCOMP_RET_ACTION; 402 ret &= SECCOMP_RET_ACTION;
402 switch (ret) { 403 switch (ret) {
403 case SECCOMP_RET_ERRNO: 404 case SECCOMP_RET_ERRNO:
404 /* Set the low-order 16-bits as a errno. */ 405 /* Set the low-order 16-bits as a errno. */
405 syscall_set_return_value(current, task_pt_regs(current), 406 syscall_set_return_value(current, regs,
406 -data, 0); 407 -data, 0);
407 goto skip; 408 goto skip;
408 case SECCOMP_RET_TRAP: 409 case SECCOMP_RET_TRAP:
409 /* Show the handler the original registers. */ 410 /* Show the handler the original registers. */
410 syscall_rollback(current, task_pt_regs(current)); 411 syscall_rollback(current, regs);
411 /* Let the filter pass back 16 bits of data. */ 412 /* Let the filter pass back 16 bits of data. */
412 seccomp_send_sigsys(this_syscall, data); 413 seccomp_send_sigsys(this_syscall, data);
413 goto skip; 414 goto skip;
414 case SECCOMP_RET_TRACE: 415 case SECCOMP_RET_TRACE:
415 /* Skip these calls if there is no tracer. */ 416 /* Skip these calls if there is no tracer. */
416 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) 417 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
418 syscall_set_return_value(current, regs,
419 -ENOSYS, 0);
417 goto skip; 420 goto skip;
421 }
418 /* Allow the BPF to provide the event message */ 422 /* Allow the BPF to provide the event message */
419 ptrace_event(PTRACE_EVENT_SECCOMP, data); 423 ptrace_event(PTRACE_EVENT_SECCOMP, data);
420 /* 424 /*
@@ -425,6 +429,9 @@ int __secure_computing(int this_syscall)
425 */ 429 */
426 if (fatal_signal_pending(current)) 430 if (fatal_signal_pending(current))
427 break; 431 break;
432 if (syscall_get_nr(current, regs) < 0)
433 goto skip; /* Explicit request to skip. */
434
428 return 0; 435 return 0;
429 case SECCOMP_RET_ALLOW: 436 case SECCOMP_RET_ALLOW:
430 return 0; 437 return 0;
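To make the SECCOMP_RET_ERRNO path above concrete, here is a hedged user-space sketch of a filter that fails getpid() with EPERM; the kernel code masks out the low 16 bits as "data" and returns -data to the caller. Illustrative only: a real filter should also validate seccomp_data.arch.

#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter filter[] = {
		/* A = syscall number */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* getpid? fall through to RET_ERRNO, otherwise jump to RET_ALLOW */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		perror("PR_SET_SECCOMP");

	errno = 0;
	printf("getpid() = %ld, errno = %d\n", (long)syscall(__NR_getpid), errno);
	return 0;	/* expect -1 / EPERM from the filtered syscall */
}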
diff --git a/kernel/signal.c b/kernel/signal.c
index 0af8868525d6..dd72567767d9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -31,6 +31,7 @@
31#include <linux/nsproxy.h> 31#include <linux/nsproxy.h>
32#include <linux/user_namespace.h> 32#include <linux/user_namespace.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/compat.h>
34#define CREATE_TRACE_POINTS 35#define CREATE_TRACE_POINTS
35#include <trace/events/signal.h> 36#include <trace/events/signal.h>
36 37
@@ -484,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)
484 if (force_default || ka->sa.sa_handler != SIG_IGN) 485 if (force_default || ka->sa.sa_handler != SIG_IGN)
485 ka->sa.sa_handler = SIG_DFL; 486 ka->sa.sa_handler = SIG_DFL;
486 ka->sa.sa_flags = 0; 487 ka->sa.sa_flags = 0;
488#ifdef __ARCH_HAS_SA_RESTORER
489 ka->sa.sa_restorer = NULL;
490#endif
487 sigemptyset(&ka->sa.sa_mask); 491 sigemptyset(&ka->sa.sa_mask);
488 ka++; 492 ka++;
489 } 493 }
@@ -679,23 +683,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
679 * No need to set need_resched since signal event passing 683 * No need to set need_resched since signal event passing
680 * goes through ->blocked 684 * goes through ->blocked
681 */ 685 */
682void signal_wake_up(struct task_struct *t, int resume) 686void signal_wake_up_state(struct task_struct *t, unsigned int state)
683{ 687{
684 unsigned int mask;
685
686 set_tsk_thread_flag(t, TIF_SIGPENDING); 688 set_tsk_thread_flag(t, TIF_SIGPENDING);
687
688 /* 689 /*
689 * For SIGKILL, we want to wake it up in the stopped/traced/killable 690 * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
690 * case. We don't check t->state here because there is a race with it 691 * case. We don't check t->state here because there is a race with it
691 * executing another processor and just now entering stopped state. 692 * executing another processor and just now entering stopped state.
692 * By using wake_up_state, we ensure the process will wake up and 693 * By using wake_up_state, we ensure the process will wake up and
693 * handle its death signal. 694 * handle its death signal.
694 */ 695 */
695 mask = TASK_INTERRUPTIBLE; 696 if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
696 if (resume)
697 mask |= TASK_WAKEKILL;
698 if (!wake_up_state(t, mask))
699 kick_process(t); 697 kick_process(t);
700} 698}
701 699
@@ -843,7 +841,7 @@ static void ptrace_trap_notify(struct task_struct *t)
843 assert_spin_locked(&t->sighand->siglock); 841 assert_spin_locked(&t->sighand->siglock);
844 842
845 task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); 843 task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
846 signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); 844 ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
847} 845}
848 846
849/* 847/*
@@ -1159,13 +1157,14 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1159 return __send_signal(sig, info, t, group, from_ancestor_ns); 1157 return __send_signal(sig, info, t, group, from_ancestor_ns);
1160} 1158}
1161 1159
1162static void print_fatal_signal(struct pt_regs *regs, int signr) 1160static void print_fatal_signal(int signr)
1163{ 1161{
1164 printk("%s/%d: potentially unexpected fatal signal %d.\n", 1162 struct pt_regs *regs = signal_pt_regs();
1163 printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n",
1165 current->comm, task_pid_nr(current), signr); 1164 current->comm, task_pid_nr(current), signr);
1166 1165
1167#if defined(__i386__) && !defined(__arch_um__) 1166#if defined(__i386__) && !defined(__arch_um__)
1168 printk("code at %08lx: ", regs->ip); 1167 printk(KERN_INFO "code at %08lx: ", regs->ip);
1169 { 1168 {
1170 int i; 1169 int i;
1171 for (i = 0; i < 16; i++) { 1170 for (i = 0; i < 16; i++) {
@@ -1173,11 +1172,11 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
1173 1172
1174 if (get_user(insn, (unsigned char *)(regs->ip + i))) 1173 if (get_user(insn, (unsigned char *)(regs->ip + i)))
1175 break; 1174 break;
1176 printk("%02x ", insn); 1175 printk(KERN_CONT "%02x ", insn);
1177 } 1176 }
1178 } 1177 }
1178 printk(KERN_CONT "\n");
1179#endif 1179#endif
1180 printk("\n");
1181 preempt_disable(); 1180 preempt_disable();
1182 show_regs(regs); 1181 show_regs(regs);
1183 preempt_enable(); 1182 preempt_enable();
@@ -1636,6 +1635,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1636 unsigned long flags; 1635 unsigned long flags;
1637 struct sighand_struct *psig; 1636 struct sighand_struct *psig;
1638 bool autoreap = false; 1637 bool autoreap = false;
1638 cputime_t utime, stime;
1639 1639
1640 BUG_ON(sig == -1); 1640 BUG_ON(sig == -1);
1641 1641
@@ -1673,8 +1673,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1673 task_uid(tsk)); 1673 task_uid(tsk));
1674 rcu_read_unlock(); 1674 rcu_read_unlock();
1675 1675
1676 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); 1676 task_cputime(tsk, &utime, &stime);
1677 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); 1677 info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime);
1678 info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime);
1678 1679
1679 info.si_status = tsk->exit_code & 0x7f; 1680 info.si_status = tsk->exit_code & 0x7f;
1680 if (tsk->exit_code & 0x80) 1681 if (tsk->exit_code & 0x80)
@@ -1738,6 +1739,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1738 unsigned long flags; 1739 unsigned long flags;
1739 struct task_struct *parent; 1740 struct task_struct *parent;
1740 struct sighand_struct *sighand; 1741 struct sighand_struct *sighand;
1742 cputime_t utime, stime;
1741 1743
1742 if (for_ptracer) { 1744 if (for_ptracer) {
1743 parent = tsk->parent; 1745 parent = tsk->parent;
@@ -1752,12 +1754,13 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1752 * see comment in do_notify_parent() about the following 4 lines 1754 * see comment in do_notify_parent() about the following 4 lines
1753 */ 1755 */
1754 rcu_read_lock(); 1756 rcu_read_lock();
1755 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1757 info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
1756 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); 1758 info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
1757 rcu_read_unlock(); 1759 rcu_read_unlock();
1758 1760
1759 info.si_utime = cputime_to_clock_t(tsk->utime); 1761 task_cputime(tsk, &utime, &stime);
1760 info.si_stime = cputime_to_clock_t(tsk->stime); 1762 info.si_utime = cputime_to_clock_t(utime);
1763 info.si_stime = cputime_to_clock_t(stime);
1761 1764
1762 info.si_code = why; 1765 info.si_code = why;
1763 switch (why) { 1766 switch (why) {
@@ -1798,6 +1801,10 @@ static inline int may_ptrace_stop(void)
1798 * If SIGKILL was already sent before the caller unlocked 1801 * If SIGKILL was already sent before the caller unlocked
1799 * ->siglock we must see ->core_state != NULL. Otherwise it 1802 * ->siglock we must see ->core_state != NULL. Otherwise it
1800 * is safe to enter schedule(). 1803 * is safe to enter schedule().
1804 *
1805 * This is almost outdated, a task with the pending SIGKILL can't
1806 * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
1807 * after SIGKILL was already dequeued.
1801 */ 1808 */
1802 if (unlikely(current->mm->core_state) && 1809 if (unlikely(current->mm->core_state) &&
1803 unlikely(current->mm == current->parent->mm)) 1810 unlikely(current->mm == current->parent->mm))
@@ -1908,7 +1915,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1908 preempt_disable(); 1915 preempt_disable();
1909 read_unlock(&tasklist_lock); 1916 read_unlock(&tasklist_lock);
1910 preempt_enable_no_resched(); 1917 preempt_enable_no_resched();
1911 schedule(); 1918 freezable_schedule();
1912 } else { 1919 } else {
1913 /* 1920 /*
1914 * By the time we got the lock, our tracer went away. 1921 * By the time we got the lock, our tracer went away.
@@ -1923,6 +1930,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1923 if (gstop_done) 1930 if (gstop_done)
1924 do_notify_parent_cldstop(current, false, why); 1931 do_notify_parent_cldstop(current, false, why);
1925 1932
1933 /* tasklist protects us from ptrace_freeze_traced() */
1926 __set_current_state(TASK_RUNNING); 1934 __set_current_state(TASK_RUNNING);
1927 if (clear_code) 1935 if (clear_code)
1928 current->exit_code = 0; 1936 current->exit_code = 0;
@@ -1930,13 +1938,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
1930 } 1938 }
1931 1939
1932 /* 1940 /*
1933 * While in TASK_TRACED, we were considered "frozen enough".
1934 * Now that we woke up, it's crucial if we're supposed to be
1935 * frozen that we freeze now before running anything substantial.
1936 */
1937 try_to_freeze();
1938
1939 /*
1940 * We are back. Now reacquire the siglock before touching 1941 * We are back. Now reacquire the siglock before touching
1941 * last_siginfo, so that we are sure to have synchronized with 1942 * last_siginfo, so that we are sure to have synchronized with
1942 * any signal-sending on another CPU that wants to examine it. 1943 * any signal-sending on another CPU that wants to examine it.
@@ -2092,7 +2093,7 @@ static bool do_signal_stop(int signr)
2092 } 2093 }
2093 2094
2094 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2095 /* Now we don't run again until woken by SIGCONT or SIGKILL */
2095 schedule(); 2096 freezable_schedule();
2096 return true; 2097 return true;
2097 } else { 2098 } else {
2098 /* 2099 /*
@@ -2138,10 +2139,9 @@ static void do_jobctl_trap(void)
2138 } 2139 }
2139} 2140}
2140 2141
2141static int ptrace_signal(int signr, siginfo_t *info, 2142static int ptrace_signal(int signr, siginfo_t *info)
2142 struct pt_regs *regs, void *cookie)
2143{ 2143{
2144 ptrace_signal_deliver(regs, cookie); 2144 ptrace_signal_deliver();
2145 /* 2145 /*
2146 * We do not check sig_kernel_stop(signr) but set this marker 2146 * We do not check sig_kernel_stop(signr) but set this marker
2147 * unconditionally because we do not know whether debugger will 2147 * unconditionally because we do not know whether debugger will
@@ -2200,15 +2200,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
2200 if (unlikely(uprobe_deny_signal())) 2200 if (unlikely(uprobe_deny_signal()))
2201 return 0; 2201 return 0;
2202 2202
2203relock:
2204 /* 2203 /*
2205 * We'll jump back here after any time we were stopped in TASK_STOPPED. 2204 * Do this once, we can't return to user-mode if freezing() == T.
2206 * While in TASK_STOPPED, we were considered "frozen enough". 2205 * do_signal_stop() and ptrace_stop() do freezable_schedule() and
2207 * Now that we woke up, it's crucial if we're supposed to be 2206 * thus do not need another check after return.
2208 * frozen that we freeze now before running anything substantial.
2209 */ 2207 */
2210 try_to_freeze(); 2208 try_to_freeze();
2211 2209
2210relock:
2212 spin_lock_irq(&sighand->siglock); 2211 spin_lock_irq(&sighand->siglock);
2213 /* 2212 /*
2214 * Every stopped thread goes here after wakeup. Check to see if 2213 * Every stopped thread goes here after wakeup. Check to see if
@@ -2265,8 +2264,7 @@ relock:
2265 break; /* will return 0 */ 2264 break; /* will return 0 */
2266 2265
2267 if (unlikely(current->ptrace) && signr != SIGKILL) { 2266 if (unlikely(current->ptrace) && signr != SIGKILL) {
2268 signr = ptrace_signal(signr, info, 2267 signr = ptrace_signal(signr, info);
2269 regs, cookie);
2270 if (!signr) 2268 if (!signr)
2271 continue; 2269 continue;
2272 } 2270 }
@@ -2351,7 +2349,7 @@ relock:
2351 2349
2352 if (sig_kernel_coredump(signr)) { 2350 if (sig_kernel_coredump(signr)) {
2353 if (print_fatal_signals) 2351 if (print_fatal_signals)
2354 print_fatal_signal(regs, info->si_signo); 2352 print_fatal_signal(info->si_signo);
2355 /* 2353 /*
2356 * If it was able to dump core, this kills all 2354 * If it was able to dump core, this kills all
2357 * other threads in the group and synchronizes with 2355 * other threads in the group and synchronizes with
@@ -2360,7 +2358,7 @@ relock:
2360 * first and our do_group_exit call below will use 2358 * first and our do_group_exit call below will use
2361 * that value and ignore the one we pass it. 2359 * that value and ignore the one we pass it.
2362 */ 2360 */
2363 do_coredump(info, regs); 2361 do_coredump(info);
2364 } 2362 }
2365 2363
2366 /* 2364 /*
@@ -2404,6 +2402,15 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
2404 tracehook_signal_handler(sig, info, ka, regs, stepping); 2402 tracehook_signal_handler(sig, info, ka, regs, stepping);
2405} 2403}
2406 2404
2405void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
2406{
2407 if (failed)
2408 force_sigsegv(ksig->sig, current);
2409 else
2410 signal_delivered(ksig->sig, &ksig->info, &ksig->ka,
2411 signal_pt_regs(), stepping);
2412}
2413
2407/* 2414/*
2408 * It could be that complete_signal() picked us to notify about the 2415 * It could be that complete_signal() picked us to notify about the
2409 * group-wide signal. Other threads should be notified now to take 2416 * group-wide signal. Other threads should be notified now to take
@@ -2536,11 +2543,8 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
2536 */ 2543 */
2537void set_current_blocked(sigset_t *newset) 2544void set_current_blocked(sigset_t *newset)
2538{ 2545{
2539 struct task_struct *tsk = current;
2540 sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); 2546 sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
2541 spin_lock_irq(&tsk->sighand->siglock); 2547 __set_current_blocked(newset);
2542 __set_task_blocked(tsk, newset);
2543 spin_unlock_irq(&tsk->sighand->siglock);
2544} 2548}
2545 2549
2546void __set_current_blocked(const sigset_t *newset) 2550void __set_current_blocked(const sigset_t *newset)
@@ -2624,41 +2628,96 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
2624 return 0; 2628 return 0;
2625} 2629}
2626 2630
2627long do_sigpending(void __user *set, unsigned long sigsetsize) 2631#ifdef CONFIG_COMPAT
2632COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
2633 compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
2628{ 2634{
2629 long error = -EINVAL; 2635#ifdef __BIG_ENDIAN
2630 sigset_t pending; 2636 sigset_t old_set = current->blocked;
2637
2638 /* XXX: Don't preclude handling different sized sigset_t's. */
2639 if (sigsetsize != sizeof(sigset_t))
2640 return -EINVAL;
2631 2641
2642 if (nset) {
2643 compat_sigset_t new32;
2644 sigset_t new_set;
2645 int error;
2646 if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
2647 return -EFAULT;
2648
2649 sigset_from_compat(&new_set, &new32);
2650 sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
2651
2652 error = sigprocmask(how, &new_set, NULL);
2653 if (error)
2654 return error;
2655 }
2656 if (oset) {
2657 compat_sigset_t old32;
2658 sigset_to_compat(&old32, &old_set);
2659 if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
2660 return -EFAULT;
2661 }
2662 return 0;
2663#else
2664 return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
2665 (sigset_t __user *)oset, sigsetsize);
2666#endif
2667}
2668#endif
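The reason only __BIG_ENDIAN needs the sigset_{from,to}_compat() dance: viewing a 64-bit signal bitmap as two 32-bit words keeps the same word order on little-endian but swaps word significance on big-endian. A tiny illustration, with the sigset layouts simplified to one unsigned long long:

#include <stdio.h>

int main(void)
{
	union {
		unsigned long long set64;   /* native 64-bit sigset word     */
		unsigned int word32[2];     /* how compat userspace reads it */
	} u = { .set64 = 1ULL << 1 };       /* pretend signal 2 is in the set */

	printf("word32[0] = %#x, word32[1] = %#x\n", u.word32[0], u.word32[1]);
	/*
	 * little-endian: word32[0] = 0x2, word32[1] = 0   -> no conversion needed
	 * big-endian:    word32[0] = 0,   word32[1] = 0x2 -> must repack for compat
	 */
	return 0;
}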
2669
2670static int do_sigpending(void *set, unsigned long sigsetsize)
2671{
2632 if (sigsetsize > sizeof(sigset_t)) 2672 if (sigsetsize > sizeof(sigset_t))
2633 goto out; 2673 return -EINVAL;
2634 2674
2635 spin_lock_irq(&current->sighand->siglock); 2675 spin_lock_irq(&current->sighand->siglock);
2636 sigorsets(&pending, &current->pending.signal, 2676 sigorsets(set, &current->pending.signal,
2637 &current->signal->shared_pending.signal); 2677 &current->signal->shared_pending.signal);
2638 spin_unlock_irq(&current->sighand->siglock); 2678 spin_unlock_irq(&current->sighand->siglock);
2639 2679
2640 /* Outside the lock because only this thread touches it. */ 2680 /* Outside the lock because only this thread touches it. */
2641 sigandsets(&pending, &current->blocked, &pending); 2681 sigandsets(set, &current->blocked, set);
2642 2682 return 0;
2643 error = -EFAULT;
2644 if (!copy_to_user(set, &pending, sigsetsize))
2645 error = 0;
2646
2647out:
2648 return error;
2649} 2683}
2650 2684
2651/** 2685/**
2652 * sys_rt_sigpending - examine a pending signal that has been raised 2686 * sys_rt_sigpending - examine a pending signal that has been raised
2653 * while blocked 2687 * while blocked
2654 * @set: stores pending signals 2688 * @uset: stores pending signals
2655 * @sigsetsize: size of sigset_t type or larger 2689 * @sigsetsize: size of sigset_t type or larger
2656 */ 2690 */
2657SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) 2691SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
2658{ 2692{
2659 return do_sigpending(set, sigsetsize); 2693 sigset_t set;
2694 int err = do_sigpending(&set, sigsetsize);
2695 if (!err && copy_to_user(uset, &set, sigsetsize))
2696 err = -EFAULT;
2697 return err;
2660} 2698}
2661 2699
2700#ifdef CONFIG_COMPAT
2701COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
2702 compat_size_t, sigsetsize)
2703{
2704#ifdef __BIG_ENDIAN
2705 sigset_t set;
2706 int err = do_sigpending(&set, sigsetsize);
2707 if (!err) {
2708 compat_sigset_t set32;
2709 sigset_to_compat(&set32, &set);
2710 /* we can get here only if sigsetsize <= sizeof(set) */
2711 if (copy_to_user(uset, &set32, sigsetsize))
2712 err = -EFAULT;
2713 }
2714 return err;
2715#else
2716 return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
2717#endif
2718}
2719#endif
2720
2662#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER 2721#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
2663 2722
2664int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) 2723int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
@@ -2935,6 +2994,23 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
2935 return do_tkill(0, pid, sig); 2994 return do_tkill(0, pid, sig);
2936} 2995}
2937 2996
2997static int do_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t *info)
2998{
2999 /* Not even root can pretend to send signals from the kernel.
3000 * Nor can they impersonate a kill()/tgkill(), which adds source info.
3001 */
3002 if ((info->si_code >= 0 || info->si_code == SI_TKILL) &&
3003 (task_pid_vnr(current) != pid)) {
3004 /* We used to allow any < 0 si_code */
3005 WARN_ON_ONCE(info->si_code < 0);
3006 return -EPERM;
3007 }
3008 info->si_signo = sig;
3009
3010 /* POSIX.1b doesn't mention process groups. */
3011 return kill_proc_info(sig, info, pid);
3012}
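For reference, the legitimate user of this path is sigqueue(3), which the C library implements on top of rt_sigqueueinfo() with a negative si_code (SI_QUEUE), so the forgery check above lets it through. A small hedged sketch (printf in a handler is not async-signal-safe and is used here only for brevity):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void handler(int sig, siginfo_t *info, void *ctx)
{
	printf("sig=%d si_code=%d value=%d\n",
	       sig, info->si_code, info->si_value.sival_int);
}

int main(void)
{
	struct sigaction sa = { 0 };
	union sigval val = { .sival_int = 42 };

	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR1, &sa, NULL);

	sigqueue(getpid(), SIGUSR1, val);	/* ends up in do_rt_sigqueueinfo() */
	return 0;
}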
3013
2938/** 3014/**
2939 * sys_rt_sigqueueinfo - send signal information to a signal 3015 * sys_rt_sigqueueinfo - send signal information to a signal
2940 * @pid: the PID of the thread 3016 * @pid: the PID of the thread
@@ -2945,25 +3021,26 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2945 siginfo_t __user *, uinfo) 3021 siginfo_t __user *, uinfo)
2946{ 3022{
2947 siginfo_t info; 3023 siginfo_t info;
2948
2949 if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) 3024 if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
2950 return -EFAULT; 3025 return -EFAULT;
3026 return do_rt_sigqueueinfo(pid, sig, &info);
3027}
2951 3028
2952 /* Not even root can pretend to send signals from the kernel. 3029#ifdef CONFIG_COMPAT
2953 * Nor can they impersonate a kill()/tgkill(), which adds source info. 3030COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
2954 */ 3031 compat_pid_t, pid,
2955 if (info.si_code >= 0 || info.si_code == SI_TKILL) { 3032 int, sig,
2956 /* We used to allow any < 0 si_code */ 3033 struct compat_siginfo __user *, uinfo)
2957 WARN_ON_ONCE(info.si_code < 0); 3034{
2958 return -EPERM; 3035 siginfo_t info;
2959 } 3036 int ret = copy_siginfo_from_user32(&info, uinfo);
2960 info.si_signo = sig; 3037 if (unlikely(ret))
2961 3038 return ret;
2962 /* POSIX.1b doesn't mention process groups. */ 3039 return do_rt_sigqueueinfo(pid, sig, &info);
2963 return kill_proc_info(sig, &info, pid);
2964} 3040}
3041#endif
2965 3042
2966long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) 3043static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2967{ 3044{
2968 /* This is only valid for single tasks */ 3045 /* This is only valid for single tasks */
2969 if (pid <= 0 || tgid <= 0) 3046 if (pid <= 0 || tgid <= 0)
@@ -2972,7 +3049,8 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2972 /* Not even root can pretend to send signals from the kernel. 3049 /* Not even root can pretend to send signals from the kernel.
2973 * Nor can they impersonate a kill()/tgkill(), which adds source info. 3050 * Nor can they impersonate a kill()/tgkill(), which adds source info.
2974 */ 3051 */
2975 if (info->si_code >= 0 || info->si_code == SI_TKILL) { 3052 if (((info->si_code >= 0 || info->si_code == SI_TKILL)) &&
3053 (task_pid_vnr(current) != pid)) {
2976 /* We used to allow any < 0 si_code */ 3054 /* We used to allow any < 0 si_code */
2977 WARN_ON_ONCE(info->si_code < 0); 3055 WARN_ON_ONCE(info->si_code < 0);
2978 return -EPERM; 3056 return -EPERM;
@@ -2993,6 +3071,21 @@ SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
2993 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); 3071 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
2994} 3072}
2995 3073
3074#ifdef CONFIG_COMPAT
3075COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
3076 compat_pid_t, tgid,
3077 compat_pid_t, pid,
3078 int, sig,
3079 struct compat_siginfo __user *, uinfo)
3080{
3081 siginfo_t info;
3082
3083 if (copy_siginfo_from_user32(&info, uinfo))
3084 return -EFAULT;
3085 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
3086}
3087#endif
3088
2996int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 3089int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2997{ 3090{
2998 struct task_struct *t = current; 3091 struct task_struct *t = current;
@@ -3038,7 +3131,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
3038 return 0; 3131 return 0;
3039} 3132}
3040 3133
3041int 3134static int
3042do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) 3135do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
3043{ 3136{
3044 stack_t oss; 3137 stack_t oss;
@@ -3103,6 +3196,76 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
3103out: 3196out:
3104 return error; 3197 return error;
3105} 3198}
3199SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
3200{
3201 return do_sigaltstack(uss, uoss, current_user_stack_pointer());
3202}
3203
3204int restore_altstack(const stack_t __user *uss)
3205{
3206 int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
3207 /* squash all but EFAULT for now */
3208 return err == -EFAULT ? err : 0;
3209}
3210
3211int __save_altstack(stack_t __user *uss, unsigned long sp)
3212{
3213 struct task_struct *t = current;
3214 return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
3215 __put_user(sas_ss_flags(sp), &uss->ss_flags) |
3216 __put_user(t->sas_ss_size, &uss->ss_size);
3217}
3218
3219#ifdef CONFIG_COMPAT
3220COMPAT_SYSCALL_DEFINE2(sigaltstack,
3221 const compat_stack_t __user *, uss_ptr,
3222 compat_stack_t __user *, uoss_ptr)
3223{
3224 stack_t uss, uoss;
3225 int ret;
3226 mm_segment_t seg;
3227
3228 if (uss_ptr) {
3229 compat_stack_t uss32;
3230
3231 memset(&uss, 0, sizeof(stack_t));
3232 if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
3233 return -EFAULT;
3234 uss.ss_sp = compat_ptr(uss32.ss_sp);
3235 uss.ss_flags = uss32.ss_flags;
3236 uss.ss_size = uss32.ss_size;
3237 }
3238 seg = get_fs();
3239 set_fs(KERNEL_DS);
3240 ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
3241 (stack_t __force __user *) &uoss,
3242 compat_user_stack_pointer());
3243 set_fs(seg);
3244 if (ret >= 0 && uoss_ptr) {
3245 if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
3246 __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
3247 __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
3248 __put_user(uoss.ss_size, &uoss_ptr->ss_size))
3249 ret = -EFAULT;
3250 }
3251 return ret;
3252}
3253
3254int compat_restore_altstack(const compat_stack_t __user *uss)
3255{
3256 int err = compat_sys_sigaltstack(uss, NULL);
3257 /* squash all but -EFAULT for now */
3258 return err == -EFAULT ? err : 0;
3259}
3260
3261int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
3262{
3263 struct task_struct *t = current;
3264 return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
3265 __put_user(sas_ss_flags(sp), &uss->ss_flags) |
3266 __put_user(t->sas_ss_size, &uss->ss_size);
3267}
3268#endif
3106 3269
3107#ifdef __ARCH_WANT_SYS_SIGPENDING 3270#ifdef __ARCH_WANT_SYS_SIGPENDING
3108 3271
@@ -3112,7 +3275,7 @@ out:
3112 */ 3275 */
3113SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) 3276SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
3114{ 3277{
3115 return do_sigpending(set, sizeof(*set)); 3278 return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
3116} 3279}
3117 3280
3118#endif 3281#endif
@@ -3139,7 +3302,6 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3139 if (nset) { 3302 if (nset) {
3140 if (copy_from_user(&new_set, nset, sizeof(*nset))) 3303 if (copy_from_user(&new_set, nset, sizeof(*nset)))
3141 return -EFAULT; 3304 return -EFAULT;
3142 new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
3143 3305
3144 new_blocked = current->blocked; 3306 new_blocked = current->blocked;
3145 3307
@@ -3157,7 +3319,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3157 return -EINVAL; 3319 return -EINVAL;
3158 } 3320 }
3159 3321
3160 __set_current_blocked(&new_blocked); 3322 set_current_blocked(&new_blocked);
3161 } 3323 }
3162 3324
3163 if (oset) { 3325 if (oset) {
@@ -3169,7 +3331,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
3169} 3331}
3170#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 3332#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
3171 3333
3172#ifdef __ARCH_WANT_SYS_RT_SIGACTION 3334#ifndef CONFIG_ODD_RT_SIGACTION
3173/** 3335/**
3174 * sys_rt_sigaction - alter an action taken by a process 3336 * sys_rt_sigaction - alter an action taken by a process
3175 * @sig: signal to be sent 3337 * @sig: signal to be sent
@@ -3203,7 +3365,132 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig,
3203out: 3365out:
3204 return ret; 3366 return ret;
3205} 3367}
3206#endif /* __ARCH_WANT_SYS_RT_SIGACTION */ 3368#ifdef CONFIG_COMPAT
3369COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
3370 const struct compat_sigaction __user *, act,
3371 struct compat_sigaction __user *, oact,
3372 compat_size_t, sigsetsize)
3373{
3374 struct k_sigaction new_ka, old_ka;
3375 compat_sigset_t mask;
3376#ifdef __ARCH_HAS_SA_RESTORER
3377 compat_uptr_t restorer;
3378#endif
3379 int ret;
3380
3381 /* XXX: Don't preclude handling different sized sigset_t's. */
3382 if (sigsetsize != sizeof(compat_sigset_t))
3383 return -EINVAL;
3384
3385 if (act) {
3386 compat_uptr_t handler;
3387 ret = get_user(handler, &act->sa_handler);
3388 new_ka.sa.sa_handler = compat_ptr(handler);
3389#ifdef __ARCH_HAS_SA_RESTORER
3390 ret |= get_user(restorer, &act->sa_restorer);
3391 new_ka.sa.sa_restorer = compat_ptr(restorer);
3392#endif
3393 ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
3394 ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags);
3395 if (ret)
3396 return -EFAULT;
3397 sigset_from_compat(&new_ka.sa.sa_mask, &mask);
3398 }
3399
3400 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3401 if (!ret && oact) {
3402 sigset_to_compat(&mask, &old_ka.sa.sa_mask);
3403 ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
3404 &oact->sa_handler);
3405 ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
3406 ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
3407#ifdef __ARCH_HAS_SA_RESTORER
3408 ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3409 &oact->sa_restorer);
3410#endif
3411 }
3412 return ret;
3413}
3414#endif
3415#endif /* !CONFIG_ODD_RT_SIGACTION */
3416
3417#ifdef CONFIG_OLD_SIGACTION
3418SYSCALL_DEFINE3(sigaction, int, sig,
3419 const struct old_sigaction __user *, act,
3420 struct old_sigaction __user *, oact)
3421{
3422 struct k_sigaction new_ka, old_ka;
3423 int ret;
3424
3425 if (act) {
3426 old_sigset_t mask;
3427 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
3428 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
3429 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
3430 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
3431 __get_user(mask, &act->sa_mask))
3432 return -EFAULT;
3433#ifdef __ARCH_HAS_KA_RESTORER
3434 new_ka.ka_restorer = NULL;
3435#endif
3436 siginitset(&new_ka.sa.sa_mask, mask);
3437 }
3438
3439 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3440
3441 if (!ret && oact) {
3442 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
3443 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
3444 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
3445 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
3446 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
3447 return -EFAULT;
3448 }
3449
3450 return ret;
3451}
3452#endif
3453#ifdef CONFIG_COMPAT_OLD_SIGACTION
3454COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
3455 const struct compat_old_sigaction __user *, act,
3456 struct compat_old_sigaction __user *, oact)
3457{
3458 struct k_sigaction new_ka, old_ka;
3459 int ret;
3460 compat_old_sigset_t mask;
3461 compat_uptr_t handler, restorer;
3462
3463 if (act) {
3464 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
3465 __get_user(handler, &act->sa_handler) ||
3466 __get_user(restorer, &act->sa_restorer) ||
3467 __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
3468 __get_user(mask, &act->sa_mask))
3469 return -EFAULT;
3470
3471#ifdef __ARCH_HAS_KA_RESTORER
3472 new_ka.ka_restorer = NULL;
3473#endif
3474 new_ka.sa.sa_handler = compat_ptr(handler);
3475 new_ka.sa.sa_restorer = compat_ptr(restorer);
3476 siginitset(&new_ka.sa.sa_mask, mask);
3477 }
3478
3479 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
3480
3481 if (!ret && oact) {
3482 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
3483 __put_user(ptr_to_compat(old_ka.sa.sa_handler),
3484 &oact->sa_handler) ||
3485 __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
3486 &oact->sa_restorer) ||
3487 __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
3488 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
3489 return -EFAULT;
3490 }
3491 return ret;
3492}
3493#endif
3207 3494
3208#ifdef __ARCH_WANT_SYS_SGETMASK 3495#ifdef __ARCH_WANT_SYS_SGETMASK
3209 3496
@@ -3221,6 +3508,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
3221 int old = current->blocked.sig[0]; 3508 int old = current->blocked.sig[0];
3222 sigset_t newset; 3509 sigset_t newset;
3223 3510
3511 siginitset(&newset, newmask);
3224 set_current_blocked(&newset); 3512 set_current_blocked(&newset);
3225 3513
3226 return old; 3514 return old;
@@ -3270,7 +3558,6 @@ int sigsuspend(sigset_t *set)
3270 return -ERESTARTNOHAND; 3558 return -ERESTARTNOHAND;
3271} 3559}
3272 3560
3273#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
3274/** 3561/**
3275 * sys_rt_sigsuspend - replace the signal mask for a value with the 3562 * sys_rt_sigsuspend - replace the signal mask for a value with the
3276 * @unewset value until a signal is received 3563 * @unewset value until a signal is received
@@ -3289,7 +3576,45 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
3289 return -EFAULT; 3576 return -EFAULT;
3290 return sigsuspend(&newset); 3577 return sigsuspend(&newset);
3291} 3578}
3292#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 3579
3580#ifdef CONFIG_COMPAT
3581COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
3582{
3583#ifdef __BIG_ENDIAN
3584 sigset_t newset;
3585 compat_sigset_t newset32;
3586
3587 /* XXX: Don't preclude handling different sized sigset_t's. */
3588 if (sigsetsize != sizeof(sigset_t))
3589 return -EINVAL;
3590
3591 if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
3592 return -EFAULT;
3593 sigset_from_compat(&newset, &newset32);
3594 return sigsuspend(&newset);
3595#else
3596 /* on little-endian bitmaps don't care about granularity */
3597 return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
3598#endif
3599}
3600#endif
3601
3602#ifdef CONFIG_OLD_SIGSUSPEND
3603SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
3604{
3605 sigset_t blocked;
3606 siginitset(&blocked, mask);
3607 return sigsuspend(&blocked);
3608}
3609#endif
3610#ifdef CONFIG_OLD_SIGSUSPEND3
3611SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
3612{
3613 sigset_t blocked;
3614 siginitset(&blocked, mask);
3615 return sigsuspend(&blocked);
3616}
3617#endif
3293 3618
3294__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) 3619__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
3295{ 3620{
diff --git a/kernel/smp.c b/kernel/smp.c
index 29dd40a9f2f4..8e451f3ff51b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,23 +16,14 @@
16#include "smpboot.h" 16#include "smpboot.h"
17 17
18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS 18#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
19static struct {
20 struct list_head queue;
21 raw_spinlock_t lock;
22} call_function __cacheline_aligned_in_smp =
23 {
24 .queue = LIST_HEAD_INIT(call_function.queue),
25 .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
26 };
27
28enum { 19enum {
29 CSD_FLAG_LOCK = 0x01, 20 CSD_FLAG_LOCK = 0x01,
30}; 21};
31 22
32struct call_function_data { 23struct call_function_data {
33 struct call_single_data csd; 24 struct call_single_data __percpu *csd;
34 atomic_t refs;
35 cpumask_var_t cpumask; 25 cpumask_var_t cpumask;
26 cpumask_var_t cpumask_ipi;
36}; 27};
37 28
38static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); 29static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -56,6 +47,14 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
56 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 47 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
57 cpu_to_node(cpu))) 48 cpu_to_node(cpu)))
58 return notifier_from_errno(-ENOMEM); 49 return notifier_from_errno(-ENOMEM);
50 if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
51 cpu_to_node(cpu)))
52 return notifier_from_errno(-ENOMEM);
53 cfd->csd = alloc_percpu(struct call_single_data);
54 if (!cfd->csd) {
55 free_cpumask_var(cfd->cpumask);
56 return notifier_from_errno(-ENOMEM);
57 }
59 break; 58 break;
60 59
61#ifdef CONFIG_HOTPLUG_CPU 60#ifdef CONFIG_HOTPLUG_CPU
@@ -65,6 +64,8 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
65 case CPU_DEAD: 64 case CPU_DEAD:
66 case CPU_DEAD_FROZEN: 65 case CPU_DEAD_FROZEN:
67 free_cpumask_var(cfd->cpumask); 66 free_cpumask_var(cfd->cpumask);
67 free_cpumask_var(cfd->cpumask_ipi);
68 free_percpu(cfd->csd);
68 break; 69 break;
69#endif 70#endif
70 }; 71 };
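The CPU_UP_PREPARE branch above now sets up three per-CPU resources — cpumask, cpumask_ipi and a per-cpu call_single_data array — and turns an allocation failure into notifier_from_errno(-ENOMEM). A standalone sketch of that allocate-then-unwind shape, releasing whatever was already obtained before reporting failure; cfd_model and cfd_prepare are hypothetical names, and plain calloc()/free() stand in for the cpumask and percpu allocators.

```c
#include <stdlib.h>
#include <stdio.h>

struct cfd_model {
        void *cpumask;
        void *cpumask_ipi;
        void *csd;
};

/* Allocate each resource in turn; on failure free what was already
 * allocated and report an error to the caller. */
static int cfd_prepare(struct cfd_model *cfd, size_t n)
{
        cfd->cpumask = calloc(1, n);
        if (!cfd->cpumask)
                return -1;
        cfd->cpumask_ipi = calloc(1, n);
        if (!cfd->cpumask_ipi) {
                free(cfd->cpumask);
                return -1;
        }
        cfd->csd = calloc(1, n);
        if (!cfd->csd) {
                free(cfd->cpumask_ipi);
                free(cfd->cpumask);
                return -1;
        }
        return 0;
}

int main(void)
{
        struct cfd_model cfd;

        printf("prepare: %s\n", cfd_prepare(&cfd, 64) ? "failed" : "ok");
        return 0;
}
```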
@@ -166,85 +167,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
166} 167}
167 168
168/* 169/*
169 * Invoked by arch to handle an IPI for call function. Must be called with
170 * interrupts disabled.
171 */
172void generic_smp_call_function_interrupt(void)
173{
174 struct call_function_data *data;
175 int cpu = smp_processor_id();
176
177 /*
178 * Shouldn't receive this interrupt on a cpu that is not yet online.
179 */
180 WARN_ON_ONCE(!cpu_online(cpu));
181
182 /*
183 * Ensure entry is visible on call_function_queue after we have
184 * entered the IPI. See comment in smp_call_function_many.
185 * If we don't have this, then we may miss an entry on the list
186 * and never get another IPI to process it.
187 */
188 smp_mb();
189
190 /*
191 * It's ok to use list_for_each_rcu() here even though we may
192 * delete 'pos', since list_del_rcu() doesn't clear ->next
193 */
194 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
195 int refs;
196 smp_call_func_t func;
197
198 /*
199 * Since we walk the list without any locks, we might
200 * see an entry that was completed, removed from the
201 * list and is in the process of being reused.
202 *
203 * We must check that the cpu is in the cpumask before
204 * checking the refs, and both must be set before
205 * executing the callback on this cpu.
206 */
207
208 if (!cpumask_test_cpu(cpu, data->cpumask))
209 continue;
210
211 smp_rmb();
212
213 if (atomic_read(&data->refs) == 0)
214 continue;
215
216 func = data->csd.func; /* save for later warn */
217 func(data->csd.info);
218
219 /*
220 * If the cpu mask is not still set then func enabled
221 * interrupts (BUG), and this cpu took another smp call
222 * function interrupt and executed func(info) twice
223 * on this cpu. That nested execution decremented refs.
224 */
225 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
226 WARN(1, "%pf enabled interrupts and double executed\n", func);
227 continue;
228 }
229
230 refs = atomic_dec_return(&data->refs);
231 WARN_ON(refs < 0);
232
233 if (refs)
234 continue;
235
236 WARN_ON(!cpumask_empty(data->cpumask));
237
238 raw_spin_lock(&call_function.lock);
239 list_del_rcu(&data->csd.list);
240 raw_spin_unlock(&call_function.lock);
241
242 csd_unlock(&data->csd);
243 }
244
245}
246
247/*
248 * Invoked by arch to handle an IPI for call function single. Must be 170 * Invoked by arch to handle an IPI for call function single. Must be
249 * called from the arch with interrupts disabled. 171 * called from the arch with interrupts disabled.
250 */ 172 */
@@ -448,8 +370,7 @@ void smp_call_function_many(const struct cpumask *mask,
448 smp_call_func_t func, void *info, bool wait) 370 smp_call_func_t func, void *info, bool wait)
449{ 371{
450 struct call_function_data *data; 372 struct call_function_data *data;
451 unsigned long flags; 373 int cpu, next_cpu, this_cpu = smp_processor_id();
452 int refs, cpu, next_cpu, this_cpu = smp_processor_id();
453 374
454 /* 375 /*
455 * Can deadlock when called with interrupts disabled. 376 * Can deadlock when called with interrupts disabled.
@@ -481,79 +402,46 @@ void smp_call_function_many(const struct cpumask *mask,
481 } 402 }
482 403
483 data = &__get_cpu_var(cfd_data); 404 data = &__get_cpu_var(cfd_data);
484 csd_lock(&data->csd);
485
486 /* This BUG_ON verifies our reuse assertions and can be removed */
487 BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
488 405
489 /*
490 * The global call function queue list add and delete are protected
491 * by a lock, but the list is traversed without any lock, relying
492 * on the rcu list add and delete to allow safe concurrent traversal.
493 * We reuse the call function data without waiting for any grace
494 * period after some other cpu removes it from the global queue.
495 * This means a cpu might find our data block as it is being
496 * filled out.
497 *
498 * We hold off the interrupt handler on the other cpu by
499 * ordering our writes to the cpu mask vs our setting of the
500 * refs counter. We assert only the cpu owning the data block
501 * will set a bit in cpumask, and each bit will only be cleared
502 * by the subject cpu. Each cpu must first find its bit is
503 * set and then check that refs is set indicating the element is
504 * ready to be processed, otherwise it must skip the entry.
505 *
506 * On the previous iteration refs was set to 0 by another cpu.
507 * To avoid the use of transitivity, set the counter to 0 here
508 * so the wmb will pair with the rmb in the interrupt handler.
509 */
510 atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
511
512 data->csd.func = func;
513 data->csd.info = info;
514
515 /* Ensure 0 refs is visible before mask. Also orders func and info */
516 smp_wmb();
517
518 /* We rely on the "and" being processed before the store */
519 cpumask_and(data->cpumask, mask, cpu_online_mask); 406 cpumask_and(data->cpumask, mask, cpu_online_mask);
520 cpumask_clear_cpu(this_cpu, data->cpumask); 407 cpumask_clear_cpu(this_cpu, data->cpumask);
521 refs = cpumask_weight(data->cpumask);
522 408
523 /* Some callers race with other cpus changing the passed mask */ 409 /* Some callers race with other cpus changing the passed mask */
524 if (unlikely(!refs)) { 410 if (unlikely(!cpumask_weight(data->cpumask)))
525 csd_unlock(&data->csd);
526 return; 411 return;
527 }
528 412
529 raw_spin_lock_irqsave(&call_function.lock, flags);
530 /* 413 /*
531 * Place entry at the _HEAD_ of the list, so that any cpu still 414 * After we put an entry into the list, data->cpumask
532 * observing the entry in generic_smp_call_function_interrupt() 415 * may be cleared again when another CPU sends another IPI for
 533 * will not miss any other list entries: 416 * an SMP function call, so data->cpumask will be zero.
534 */ 417 */
535 list_add_rcu(&data->csd.list, &call_function.queue); 418 cpumask_copy(data->cpumask_ipi, data->cpumask);
536 /*
537 * We rely on the wmb() in list_add_rcu to complete our writes
538 * to the cpumask before this write to refs, which indicates
539 * data is on the list and is ready to be processed.
540 */
541 atomic_set(&data->refs, refs);
542 raw_spin_unlock_irqrestore(&call_function.lock, flags);
543 419
544 /* 420 for_each_cpu(cpu, data->cpumask) {
545 * Make the list addition visible before sending the ipi. 421 struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
546 * (IPIs must obey or appear to obey normal Linux cache 422 struct call_single_queue *dst =
547 * coherency rules -- see comment in generic_exec_single). 423 &per_cpu(call_single_queue, cpu);
548 */ 424 unsigned long flags;
549 smp_mb(); 425
426 csd_lock(csd);
427 csd->func = func;
428 csd->info = info;
429
430 raw_spin_lock_irqsave(&dst->lock, flags);
431 list_add_tail(&csd->list, &dst->list);
432 raw_spin_unlock_irqrestore(&dst->lock, flags);
433 }
550 434
551 /* Send a message to all CPUs in the map */ 435 /* Send a message to all CPUs in the map */
552 arch_send_call_function_ipi_mask(data->cpumask); 436 arch_send_call_function_ipi_mask(data->cpumask_ipi);
553 437
554 /* Optionally wait for the CPUs to complete */ 438 if (wait) {
555 if (wait) 439 for_each_cpu(cpu, data->cpumask) {
556 csd_lock_wait(&data->csd); 440 struct call_single_data *csd =
441 per_cpu_ptr(data->csd, cpu);
442 csd_lock_wait(csd);
443 }
444 }
557} 445}
558EXPORT_SYMBOL(smp_call_function_many); 446EXPORT_SYMBOL(smp_call_function_many);
559 447
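The rework above removes the shared call_function queue and its reference counting: smp_call_function_many() now fills one call_single_data per destination CPU from a per-cpu array and links it onto that CPU's own call_single_queue, so each CPU only ever walks entries addressed to it. A single-threaded user-space model of that fan-out follows; cpu_model, csd_model and the fixed NR_CPUS array are assumptions of the sketch, not the kernel structures, and all locking and IPIs are left out.

```c
#include <stdio.h>

#define NR_CPUS 4

typedef void (*smp_func_t)(void *info);

struct csd_model {                /* one slot per destination CPU */
        smp_func_t func;
        void *info;
        struct csd_model *next;
};

struct cpu_model {                /* models call_single_queue */
        struct csd_model *head;
};

static struct cpu_model cpus[NR_CPUS];
static struct csd_model csd[NR_CPUS];   /* models the sender's per-cpu csd array */

static void say_hello(void *info)
{
        printf("cpu runs func with info=%s\n", (const char *)info);
}

/* Models smp_call_function_many(): fill one csd per destination and
 * queue it on that destination's own list. */
static void call_function_many_model(smp_func_t func, void *info)
{
        for (int cpu = 1; cpu < NR_CPUS; cpu++) {   /* skip "this" cpu 0 */
                csd[cpu].func = func;
                csd[cpu].info = info;
                csd[cpu].next = cpus[cpu].head;
                cpus[cpu].head = &csd[cpu];
        }
}

/* Models each destination draining its own queue after the IPI. */
static void deliver_ipis(void)
{
        for (int cpu = 1; cpu < NR_CPUS; cpu++)
                for (struct csd_model *c = cpus[cpu].head; c; c = c->next)
                        c->func(c->info);
}

int main(void)
{
        call_function_many_model(say_hello, "payload");
        deliver_ipis();
        return 0;
}
```

In the real code each enqueue is protected by the destination queue's lock, and the IPI mask is sent from cpumask_ipi, a snapshot taken before other senders can clear bits in data->cpumask.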
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index d6c5fc054242..02fc5c933673 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -183,9 +183,20 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
183 kfree(td); 183 kfree(td);
184 return PTR_ERR(tsk); 184 return PTR_ERR(tsk);
185 } 185 }
186
187 get_task_struct(tsk); 186 get_task_struct(tsk);
188 *per_cpu_ptr(ht->store, cpu) = tsk; 187 *per_cpu_ptr(ht->store, cpu) = tsk;
188 if (ht->create) {
189 /*
190 * Make sure that the task has actually scheduled out
191 * into park position, before calling the create
192 * callback. At least the migration thread callback
193 * requires that the task is off the runqueue.
194 */
195 if (!wait_task_inactive(tsk, TASK_PARKED))
196 WARN_ON(1);
197 else
198 ht->create(cpu);
199 }
189 return 0; 200 return 0;
190} 201}
191 202
@@ -208,6 +219,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
208{ 219{
209 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 220 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
210 221
222 if (ht->pre_unpark)
223 ht->pre_unpark(cpu);
211 kthread_unpark(tsk); 224 kthread_unpark(tsk);
212} 225}
213 226
@@ -225,7 +238,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
225{ 238{
226 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 239 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
227 240
228 if (tsk) 241 if (tsk && !ht->selfparking)
229 kthread_park(tsk); 242 kthread_park(tsk);
230} 243}
231 244
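__smpboot_create_thread() above now waits for the new per-cpu thread to reach the parked state (wait_task_inactive(tsk, TASK_PARKED)) before invoking the ->create() callback, because callbacks such as the migration thread's need the task off the runqueue. A small pthread sketch of the same ordering — wait for the worker to report "parked", then run the callback; the names and the condition-variable handshake are illustrative, not the kernel mechanism.

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int parked;

static void *worker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        parked = 1;                     /* models reaching TASK_PARKED */
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        /* a real worker would now sleep until unparked; omitted here */
        return NULL;
}

static void create_callback(void)
{
        printf("create callback runs only after the worker parked\n");
}

int main(void)
{
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);

        /* models wait_task_inactive(): do not call ->create() early */
        pthread_mutex_lock(&lock);
        while (!parked)
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);

        create_callback();
        pthread_join(tid, NULL);
        return 0;
}
```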
diff --git a/kernel/softirq.c b/kernel/softirq.c
index cc96bdc0c2c9..14d7758074aa 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip)
195EXPORT_SYMBOL(local_bh_enable_ip); 195EXPORT_SYMBOL(local_bh_enable_ip);
196 196
197/* 197/*
198 * We restart softirq processing MAX_SOFTIRQ_RESTART times, 198 * We restart softirq processing for at most 2 ms,
199 * and we fall back to softirqd after that. 199 * and if need_resched() is not set.
200 * 200 *
201 * This number has been established via experimentation. 201 * These limits have been established via experimentation.
 202 * The two things to balance are latency and fairness - 202 * The two things to balance are latency and fairness -
203 * we want to handle softirqs as soon as possible, but they 203 * we want to handle softirqs as soon as possible, but they
204 * should not be able to lock up the box. 204 * should not be able to lock up the box.
205 */ 205 */
206#define MAX_SOFTIRQ_RESTART 10 206#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
207 207
208asmlinkage void __do_softirq(void) 208asmlinkage void __do_softirq(void)
209{ 209{
210 struct softirq_action *h; 210 struct softirq_action *h;
211 __u32 pending; 211 __u32 pending;
212 int max_restart = MAX_SOFTIRQ_RESTART; 212 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
213 int cpu; 213 int cpu;
214 unsigned long old_flags = current->flags; 214 unsigned long old_flags = current->flags;
215 215
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
221 current->flags &= ~PF_MEMALLOC; 221 current->flags &= ~PF_MEMALLOC;
222 222
223 pending = local_softirq_pending(); 223 pending = local_softirq_pending();
224 vtime_account(current); 224 account_irq_enter_time(current);
225 225
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 226 __local_bh_disable((unsigned long)__builtin_return_address(0),
227 SOFTIRQ_OFFSET); 227 SOFTIRQ_OFFSET);
@@ -264,15 +264,16 @@ restart:
264 local_irq_disable(); 264 local_irq_disable();
265 265
266 pending = local_softirq_pending(); 266 pending = local_softirq_pending();
267 if (pending && --max_restart) 267 if (pending) {
268 goto restart; 268 if (time_before(jiffies, end) && !need_resched())
269 goto restart;
269 270
270 if (pending)
271 wakeup_softirqd(); 271 wakeup_softirqd();
272 }
272 273
273 lockdep_softirq_exit(); 274 lockdep_softirq_exit();
274 275
275 vtime_account(current); 276 account_irq_exit_time(current);
276 __local_bh_enable(SOFTIRQ_OFFSET); 277 __local_bh_enable(SOFTIRQ_OFFSET);
277 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 278 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
278} 279}
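__do_softirq() above switches from a fixed restart count to a wall-clock budget: it keeps looping while work is pending, the 2 ms MAX_SOFTIRQ_TIME deadline has not passed and need_resched() is false, and otherwise wakes ksoftirqd. A user-space model of that loop, with CLOCK_MONOTONIC standing in for jiffies and stub variables for the pending/need_resched state:

```c
#include <stdio.h>
#include <time.h>

static long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static int pending = 1000000;        /* models local_softirq_pending() */
static int need_resched_flag;        /* models need_resched() */

static void handle_one_batch(void)   /* models one pass over pending softirqs */
{
        for (int i = 0; i < 1000 && pending; i++)
                pending--;
}

int main(void)
{
        const long long budget_ns = 2 * 1000000LL;  /* MAX_SOFTIRQ_TIME = 2 ms */
        long long end = now_ns() + budget_ns;
        int deferred = 0;

restart:
        handle_one_batch();
        if (pending) {
                if (now_ns() < end && !need_resched_flag)
                        goto restart;
                deferred = 1;        /* models wakeup_softirqd() */
        }
        printf("pending left: %d, deferred to softirqd: %d\n",
               pending, deferred);
        return 0;
}
```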
@@ -322,18 +323,10 @@ void irq_enter(void)
322 323
323static inline void invoke_softirq(void) 324static inline void invoke_softirq(void)
324{ 325{
325 if (!force_irqthreads) { 326 if (!force_irqthreads)
326#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
327 __do_softirq(); 327 __do_softirq();
328#else 328 else
329 do_softirq();
330#endif
331 } else {
332 __local_bh_disable((unsigned long)__builtin_return_address(0),
333 SOFTIRQ_OFFSET);
334 wakeup_softirqd(); 329 wakeup_softirqd();
335 __local_bh_enable(SOFTIRQ_OFFSET);
336 }
337} 330}
338 331
339/* 332/*
@@ -341,9 +334,15 @@ static inline void invoke_softirq(void)
341 */ 334 */
342void irq_exit(void) 335void irq_exit(void)
343{ 336{
344 vtime_account(current); 337#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
338 local_irq_disable();
339#else
340 WARN_ON_ONCE(!irqs_disabled());
341#endif
342
343 account_irq_exit_time(current);
345 trace_hardirq_exit(); 344 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 345 sub_preempt_count(HARDIRQ_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 346 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 347 invoke_softirq();
349 348
@@ -353,7 +352,6 @@ void irq_exit(void)
353 tick_nohz_irq_exit(); 352 tick_nohz_irq_exit();
354#endif 353#endif
355 rcu_irq_exit(); 354 rcu_irq_exit();
356 sched_preempt_enable_no_resched();
357} 355}
358 356
359/* 357/*
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 97c465ebd844..01d5ccb8bfe3 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -16,8 +16,10 @@
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2006 18 * Copyright (C) IBM Corporation, 2006
19 * Copyright (C) Fujitsu, 2012
19 * 20 *
20 * Author: Paul McKenney <paulmck@us.ibm.com> 21 * Author: Paul McKenney <paulmck@us.ibm.com>
22 * Lai Jiangshan <laijs@cn.fujitsu.com>
21 * 23 *
22 * For detailed explanation of Read-Copy Update mechanism see - 24 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt 25 * Documentation/RCU/ *.txt
@@ -34,6 +36,10 @@
34#include <linux/delay.h> 36#include <linux/delay.h>
35#include <linux/srcu.h> 37#include <linux/srcu.h>
36 38
39#include <trace/events/rcu.h>
40
41#include "rcu.h"
42
37/* 43/*
38 * Initialize an rcu_batch structure to empty. 44 * Initialize an rcu_batch structure to empty.
39 */ 45 */
@@ -92,9 +98,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from)
92 } 98 }
93} 99}
94 100
95/* single-thread state-machine */
96static void process_srcu(struct work_struct *work);
97
98static int init_srcu_struct_fields(struct srcu_struct *sp) 101static int init_srcu_struct_fields(struct srcu_struct *sp)
99{ 102{
100 sp->completed = 0; 103 sp->completed = 0;
@@ -279,12 +282,8 @@ static int srcu_readers_active(struct srcu_struct *sp)
279 */ 282 */
280void cleanup_srcu_struct(struct srcu_struct *sp) 283void cleanup_srcu_struct(struct srcu_struct *sp)
281{ 284{
282 int sum; 285 if (WARN_ON(srcu_readers_active(sp)))
283 286 return; /* Leakage unless caller handles error. */
284 sum = srcu_readers_active(sp);
285 WARN_ON(sum); /* Leakage unless caller handles error. */
286 if (sum != 0)
287 return;
288 free_percpu(sp->per_cpu_ref); 287 free_percpu(sp->per_cpu_ref);
289 sp->per_cpu_ref = NULL; 288 sp->per_cpu_ref = NULL;
290} 289}
@@ -299,9 +298,8 @@ int __srcu_read_lock(struct srcu_struct *sp)
299{ 298{
300 int idx; 299 int idx;
301 300
301 idx = ACCESS_ONCE(sp->completed) & 0x1;
302 preempt_disable(); 302 preempt_disable();
303 idx = rcu_dereference_index_check(sp->completed,
304 rcu_read_lock_sched_held()) & 0x1;
305 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; 303 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
306 smp_mb(); /* B */ /* Avoid leaking the critical section. */ 304 smp_mb(); /* B */ /* Avoid leaking the critical section. */
307 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; 305 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
@@ -318,10 +316,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
318 */ 316 */
319void __srcu_read_unlock(struct srcu_struct *sp, int idx) 317void __srcu_read_unlock(struct srcu_struct *sp, int idx)
320{ 318{
321 preempt_disable();
322 smp_mb(); /* C */ /* Avoid leaking the critical section. */ 319 smp_mb(); /* C */ /* Avoid leaking the critical section. */
323 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; 320 this_cpu_dec(sp->per_cpu_ref->c[idx]);
324 preempt_enable();
325} 321}
326EXPORT_SYMBOL_GPL(__srcu_read_unlock); 322EXPORT_SYMBOL_GPL(__srcu_read_unlock);
327 323
@@ -420,6 +416,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
420 !lock_is_held(&rcu_sched_lock_map), 416 !lock_is_held(&rcu_sched_lock_map),
421 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 417 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
422 418
419 might_sleep();
423 init_completion(&rcu.completion); 420 init_completion(&rcu.completion);
424 421
425 head->next = NULL; 422 head->next = NULL;
@@ -452,10 +449,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
452 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 449 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
453 * @sp: srcu_struct with which to synchronize. 450 * @sp: srcu_struct with which to synchronize.
454 * 451 *
 455 * Flip the completed counter, and wait for the old count to drain to zero. 452 * Wait for the counts of both indexes to drain to zero. To avoid
 456 * As with classic RCU, the updater must use some separate means of 453 * possible starvation of synchronize_srcu(), it first waits for the
 457 * synchronizing concurrent updates. Can block; must be called from 454 * count of index ((->completed & 1) ^ 1) to drain to zero,
 458 * process context. 455 * and then flips ->completed and waits for the count of the other index.
456 *
457 * Can block; must be called from process context.
459 * 458 *
460 * Note that it is illegal to call synchronize_srcu() from the corresponding 459 * Note that it is illegal to call synchronize_srcu() from the corresponding
461 * SRCU read-side critical section; doing so will result in deadlock. 460 * SRCU read-side critical section; doing so will result in deadlock.
@@ -464,7 +463,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
464 */ 463 */
465void synchronize_srcu(struct srcu_struct *sp) 464void synchronize_srcu(struct srcu_struct *sp)
466{ 465{
467 __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); 466 __synchronize_srcu(sp, rcu_expedited
467 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
468 : SYNCHRONIZE_SRCU_TRYCOUNT);
468} 469}
469EXPORT_SYMBOL_GPL(synchronize_srcu); 470EXPORT_SYMBOL_GPL(synchronize_srcu);
470 471
@@ -475,12 +476,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
475 * Wait for an SRCU grace period to elapse, but be more aggressive about 476 * Wait for an SRCU grace period to elapse, but be more aggressive about
476 * spinning rather than blocking when waiting. 477 * spinning rather than blocking when waiting.
477 * 478 *
478 * Note that it is illegal to call this function while holding any lock 479 * Note that it is also illegal to call synchronize_srcu_expedited()
479 * that is acquired by a CPU-hotplug notifier. It is also illegal to call 480 * from the corresponding SRCU read-side critical section;
480 * synchronize_srcu_expedited() from the corresponding SRCU read-side 481 * doing so will result in deadlock. However, it is perfectly legal
481 * critical section; doing so will result in deadlock. However, it is 482 * to call synchronize_srcu_expedited() on one srcu_struct from some
482 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 483 * other srcu_struct's read-side critical section, as long as
483 * from some other srcu_struct's read-side critical section, as long as
484 * the resulting graph of srcu_structs is acyclic. 484 * the resulting graph of srcu_structs is acyclic.
485 */ 485 */
486void synchronize_srcu_expedited(struct srcu_struct *sp) 486void synchronize_srcu_expedited(struct srcu_struct *sp)
@@ -637,7 +637,7 @@ static void srcu_reschedule(struct srcu_struct *sp)
637/* 637/*
638 * This is the work-queue function that handles SRCU grace periods. 638 * This is the work-queue function that handles SRCU grace periods.
639 */ 639 */
640static void process_srcu(struct work_struct *work) 640void process_srcu(struct work_struct *work)
641{ 641{
642 struct srcu_struct *sp; 642 struct srcu_struct *sp;
643 643
@@ -648,3 +648,4 @@ static void process_srcu(struct work_struct *work)
648 srcu_invoke_callbacks(sp); 648 srcu_invoke_callbacks(sp);
649 srcu_reschedule(sp); 649 srcu_reschedule(sp);
650} 650}
651EXPORT_SYMBOL_GPL(process_srcu);
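The SRCU changes above keep the read-side fast path tiny: __srcu_read_lock() samples ->completed, uses its low bit to pick one of two counters and increments it, and __srcu_read_unlock() now just does this_cpu_dec() without disabling preemption. A single-threaded model of the two-index counting and the grace-period flip; srcu_model collapses the per-CPU counters into plain longs and omits the seq[] counters and memory barriers.

```c
#include <stdio.h>

struct srcu_model {
        unsigned long completed;  /* grace-period counter; low bit selects index */
        long c[2];                /* readers counted against each index */
};

static int read_lock_model(struct srcu_model *sp)
{
        int idx = sp->completed & 0x1;

        sp->c[idx]++;
        return idx;               /* reader must pass the same idx to unlock */
}

static void read_unlock_model(struct srcu_model *sp, int idx)
{
        sp->c[idx]--;
}

/* Models the flip step of a grace period: new readers start charging the
 * other index, and the updater then waits for the old index to drain. */
static void flip_model(struct srcu_model *sp)
{
        sp->completed++;
}

int main(void)
{
        struct srcu_model sp = { 0, { 0, 0 } };
        int idx;

        idx = read_lock_model(&sp);  /* reader arrives on index 0 */
        flip_model(&sp);             /* updater flips; new readers use index 1 */
        printf("old-index readers still out: %ld\n", sp.c[idx]);
        read_unlock_model(&sp, idx);
        printf("old-index readers after unlock: %ld\n", sp.c[idx]);
        return 0;
}
```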
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2f194e965715..c09f2955ae30 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -18,7 +18,7 @@
18#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21 21#include <linux/smpboot.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23 23
24/* 24/*
@@ -37,10 +37,10 @@ struct cpu_stopper {
37 spinlock_t lock; 37 spinlock_t lock;
38 bool enabled; /* is this stopper enabled? */ 38 bool enabled; /* is this stopper enabled? */
39 struct list_head works; /* list of pending works */ 39 struct list_head works; /* list of pending works */
40 struct task_struct *thread; /* stopper thread */
41}; 40};
42 41
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 42static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
43static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
44static bool stop_machine_initialized = false; 44static bool stop_machine_initialized = false;
45 45
46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
@@ -62,16 +62,18 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
62} 62}
63 63
64/* queue @work to @stopper. if offline, @work is completed immediately */ 64/* queue @work to @stopper. if offline, @work is completed immediately */
65static void cpu_stop_queue_work(struct cpu_stopper *stopper, 65static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
66 struct cpu_stop_work *work)
67{ 66{
67 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
68 struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
69
68 unsigned long flags; 70 unsigned long flags;
69 71
70 spin_lock_irqsave(&stopper->lock, flags); 72 spin_lock_irqsave(&stopper->lock, flags);
71 73
72 if (stopper->enabled) { 74 if (stopper->enabled) {
73 list_add_tail(&work->list, &stopper->works); 75 list_add_tail(&work->list, &stopper->works);
74 wake_up_process(stopper->thread); 76 wake_up_process(p);
75 } else 77 } else
76 cpu_stop_signal_done(work->done, false); 78 cpu_stop_signal_done(work->done, false);
77 79
@@ -108,7 +110,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
108 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; 110 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
109 111
110 cpu_stop_init_done(&done, 1); 112 cpu_stop_init_done(&done, 1);
111 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); 113 cpu_stop_queue_work(cpu, &work);
112 wait_for_completion(&done.completion); 114 wait_for_completion(&done.completion);
113 return done.executed ? done.ret : -ENOENT; 115 return done.executed ? done.ret : -ENOENT;
114} 116}
@@ -130,7 +132,7 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
130 struct cpu_stop_work *work_buf) 132 struct cpu_stop_work *work_buf)
131{ 133{
132 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; 134 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
133 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); 135 cpu_stop_queue_work(cpu, work_buf);
134} 136}
135 137
136/* static data for stop_cpus */ 138/* static data for stop_cpus */
@@ -159,8 +161,7 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
159 */ 161 */
160 preempt_disable(); 162 preempt_disable();
161 for_each_cpu(cpu, cpumask) 163 for_each_cpu(cpu, cpumask)
162 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), 164 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
163 &per_cpu(stop_cpus_work, cpu));
164 preempt_enable(); 165 preempt_enable();
165} 166}
166 167
@@ -244,20 +245,25 @@ int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
244 return ret; 245 return ret;
245} 246}
246 247
247static int cpu_stopper_thread(void *data) 248static int cpu_stop_should_run(unsigned int cpu)
249{
250 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
251 unsigned long flags;
252 int run;
253
254 spin_lock_irqsave(&stopper->lock, flags);
255 run = !list_empty(&stopper->works);
256 spin_unlock_irqrestore(&stopper->lock, flags);
257 return run;
258}
259
260static void cpu_stopper_thread(unsigned int cpu)
248{ 261{
249 struct cpu_stopper *stopper = data; 262 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
250 struct cpu_stop_work *work; 263 struct cpu_stop_work *work;
251 int ret; 264 int ret;
252 265
253repeat: 266repeat:
254 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
255
256 if (kthread_should_stop()) {
257 __set_current_state(TASK_RUNNING);
258 return 0;
259 }
260
261 work = NULL; 267 work = NULL;
262 spin_lock_irq(&stopper->lock); 268 spin_lock_irq(&stopper->lock);
263 if (!list_empty(&stopper->works)) { 269 if (!list_empty(&stopper->works)) {
@@ -273,8 +279,6 @@ repeat:
273 struct cpu_stop_done *done = work->done; 279 struct cpu_stop_done *done = work->done;
274 char ksym_buf[KSYM_NAME_LEN] __maybe_unused; 280 char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
275 281
276 __set_current_state(TASK_RUNNING);
277
278 /* cpu stop callbacks are not allowed to sleep */ 282 /* cpu stop callbacks are not allowed to sleep */
279 preempt_disable(); 283 preempt_disable();
280 284
@@ -290,88 +294,55 @@ repeat:
290 ksym_buf), arg); 294 ksym_buf), arg);
291 295
292 cpu_stop_signal_done(done, true); 296 cpu_stop_signal_done(done, true);
293 } else 297 goto repeat;
294 schedule(); 298 }
295
296 goto repeat;
297} 299}
298 300
299extern void sched_set_stop_task(int cpu, struct task_struct *stop); 301extern void sched_set_stop_task(int cpu, struct task_struct *stop);
300 302
301/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 303static void cpu_stop_create(unsigned int cpu)
302static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 304{
303 unsigned long action, void *hcpu) 305 sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
306}
307
308static void cpu_stop_park(unsigned int cpu)
304{ 309{
305 unsigned int cpu = (unsigned long)hcpu;
306 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 310 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
307 struct task_struct *p; 311 struct cpu_stop_work *work;
308 312 unsigned long flags;
309 switch (action & ~CPU_TASKS_FROZEN) {
310 case CPU_UP_PREPARE:
311 BUG_ON(stopper->thread || stopper->enabled ||
312 !list_empty(&stopper->works));
313 p = kthread_create_on_node(cpu_stopper_thread,
314 stopper,
315 cpu_to_node(cpu),
316 "migration/%d", cpu);
317 if (IS_ERR(p))
318 return notifier_from_errno(PTR_ERR(p));
319 get_task_struct(p);
320 kthread_bind(p, cpu);
321 sched_set_stop_task(cpu, p);
322 stopper->thread = p;
323 break;
324
325 case CPU_ONLINE:
326 /* strictly unnecessary, as first user will wake it */
327 wake_up_process(stopper->thread);
328 /* mark enabled */
329 spin_lock_irq(&stopper->lock);
330 stopper->enabled = true;
331 spin_unlock_irq(&stopper->lock);
332 break;
333
334#ifdef CONFIG_HOTPLUG_CPU
335 case CPU_UP_CANCELED:
336 case CPU_POST_DEAD:
337 {
338 struct cpu_stop_work *work;
339
340 sched_set_stop_task(cpu, NULL);
341 /* kill the stopper */
342 kthread_stop(stopper->thread);
343 /* drain remaining works */
344 spin_lock_irq(&stopper->lock);
345 list_for_each_entry(work, &stopper->works, list)
346 cpu_stop_signal_done(work->done, false);
347 stopper->enabled = false;
348 spin_unlock_irq(&stopper->lock);
349 /* release the stopper */
350 put_task_struct(stopper->thread);
351 stopper->thread = NULL;
352 break;
353 }
354#endif
355 }
356 313
357 return NOTIFY_OK; 314 /* drain remaining works */
315 spin_lock_irqsave(&stopper->lock, flags);
316 list_for_each_entry(work, &stopper->works, list)
317 cpu_stop_signal_done(work->done, false);
318 stopper->enabled = false;
319 spin_unlock_irqrestore(&stopper->lock, flags);
358} 320}
359 321
360/* 322static void cpu_stop_unpark(unsigned int cpu)
361 * Give it a higher priority so that cpu stopper is available to other 323{
362 * cpu notifiers. It currently shares the same priority as sched 324 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
363 * migration_notifier. 325
364 */ 326 spin_lock_irq(&stopper->lock);
365static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { 327 stopper->enabled = true;
366 .notifier_call = cpu_stop_cpu_callback, 328 spin_unlock_irq(&stopper->lock);
367 .priority = 10, 329}
330
331static struct smp_hotplug_thread cpu_stop_threads = {
332 .store = &cpu_stopper_task,
333 .thread_should_run = cpu_stop_should_run,
334 .thread_fn = cpu_stopper_thread,
335 .thread_comm = "migration/%u",
336 .create = cpu_stop_create,
337 .setup = cpu_stop_unpark,
338 .park = cpu_stop_park,
339 .pre_unpark = cpu_stop_unpark,
340 .selfparking = true,
368}; 341};
369 342
370static int __init cpu_stop_init(void) 343static int __init cpu_stop_init(void)
371{ 344{
372 void *bcpu = (void *)(long)smp_processor_id();
373 unsigned int cpu; 345 unsigned int cpu;
374 int err;
375 346
376 for_each_possible_cpu(cpu) { 347 for_each_possible_cpu(cpu) {
377 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 348 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
@@ -380,15 +351,8 @@ static int __init cpu_stop_init(void)
380 INIT_LIST_HEAD(&stopper->works); 351 INIT_LIST_HEAD(&stopper->works);
381 } 352 }
382 353
383 /* start one for the boot cpu */ 354 BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
384 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
385 bcpu);
386 BUG_ON(err != NOTIFY_OK);
387 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
388 register_cpu_notifier(&cpu_stop_cpu_notifier);
389
390 stop_machine_initialized = true; 355 stop_machine_initialized = true;
391
392 return 0; 356 return 0;
393} 357}
394early_initcall(cpu_stop_init); 358early_initcall(cpu_stop_init);
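stop_machine above hands its per-cpu thread to the smpboot infrastructure: cpu_stop_should_run() reports whether the work list is non-empty and cpu_stopper_thread() drains it, which replaces the hand-rolled TASK_INTERRUPTIBLE/kthread_should_stop() loop and the CPU-hotplug notifier. A user-space model of that should_run/thread_fn split, driven by a plain loop in place of the smpboot thread; stopper_model, work_model and the driver loop are assumptions of the sketch.

```c
#include <stdio.h>

struct work_model {
        int (*fn)(void *arg);
        void *arg;
        struct work_model *next;
};

struct stopper_model {
        struct work_model *works;         /* models cpu_stopper.works */
};

static int should_run(struct stopper_model *s)   /* cpu_stop_should_run() */
{
        return s->works != NULL;
}

static void thread_fn(struct stopper_model *s)   /* cpu_stopper_thread() */
{
        while (s->works) {
                struct work_model *w = s->works;

                s->works = w->next;
                printf("work returned %d\n", w->fn(w->arg));
        }
}

static int say(void *arg)
{
        printf("running %s\n", (const char *)arg);
        return 0;
}

int main(void)
{
        struct work_model w2 = { say, "second", NULL };
        struct work_model w1 = { say, "first", &w2 };
        struct stopper_model s = { &w1 };

        /* Models the smpboot loop: run thread_fn only when there is work. */
        while (should_run(&s))
                thread_fn(&s);
        return 0;
}
```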
diff --git a/kernel/sys.c b/kernel/sys.c
index e6e0ece5f6a0..0da73cf73e60 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -47,6 +47,7 @@
47#include <linux/syscalls.h> 47#include <linux/syscalls.h>
48#include <linux/kprobes.h> 48#include <linux/kprobes.h>
49#include <linux/user_namespace.h> 49#include <linux/user_namespace.h>
50#include <linux/binfmts.h>
50 51
51#include <linux/kmsg_dump.h> 52#include <linux/kmsg_dump.h>
52/* Move somewhere else to avoid recompiling? */ 53/* Move somewhere else to avoid recompiling? */
@@ -323,7 +324,6 @@ void kernel_restart_prepare(char *cmd)
323 system_state = SYSTEM_RESTART; 324 system_state = SYSTEM_RESTART;
324 usermodehelper_disable(); 325 usermodehelper_disable();
325 device_shutdown(); 326 device_shutdown();
326 syscore_shutdown();
327} 327}
328 328
329/** 329/**
@@ -369,6 +369,7 @@ void kernel_restart(char *cmd)
369{ 369{
370 kernel_restart_prepare(cmd); 370 kernel_restart_prepare(cmd);
371 disable_nonboot_cpus(); 371 disable_nonboot_cpus();
372 syscore_shutdown();
372 if (!cmd) 373 if (!cmd)
373 printk(KERN_EMERG "Restarting system.\n"); 374 printk(KERN_EMERG "Restarting system.\n");
374 else 375 else
@@ -394,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state)
394void kernel_halt(void) 395void kernel_halt(void)
395{ 396{
396 kernel_shutdown_prepare(SYSTEM_HALT); 397 kernel_shutdown_prepare(SYSTEM_HALT);
398 disable_nonboot_cpus();
397 syscore_shutdown(); 399 syscore_shutdown();
398 printk(KERN_EMERG "System halted.\n"); 400 printk(KERN_EMERG "System halted.\n");
399 kmsg_dump(KMSG_DUMP_HALT); 401 kmsg_dump(KMSG_DUMP_HALT);
@@ -433,11 +435,12 @@ static DEFINE_MUTEX(reboot_mutex);
433SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, 435SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
434 void __user *, arg) 436 void __user *, arg)
435{ 437{
438 struct pid_namespace *pid_ns = task_active_pid_ns(current);
436 char buffer[256]; 439 char buffer[256];
437 int ret = 0; 440 int ret = 0;
438 441
439 /* We only trust the superuser with rebooting the system. */ 442 /* We only trust the superuser with rebooting the system. */
440 if (!capable(CAP_SYS_BOOT)) 443 if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
441 return -EPERM; 444 return -EPERM;
442 445
443 /* For safety, we require "magic" arguments. */ 446 /* For safety, we require "magic" arguments. */
@@ -453,7 +456,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
453 * pid_namespace, the command is handled by reboot_pid_ns() which will 456 * pid_namespace, the command is handled by reboot_pid_ns() which will
454 * call do_exit(). 457 * call do_exit().
455 */ 458 */
456 ret = reboot_pid_ns(task_active_pid_ns(current), cmd); 459 ret = reboot_pid_ns(pid_ns, cmd);
457 if (ret) 460 if (ret)
458 return ret; 461 return ret;
459 462
@@ -1046,7 +1049,7 @@ void do_sys_times(struct tms *tms)
1046 cputime_t tgutime, tgstime, cutime, cstime; 1049 cputime_t tgutime, tgstime, cutime, cstime;
1047 1050
1048 spin_lock_irq(&current->sighand->siglock); 1051 spin_lock_irq(&current->sighand->siglock);
1049 thread_group_times(current, &tgutime, &tgstime); 1052 thread_group_cputime_adjusted(current, &tgutime, &tgstime);
1050 cutime = current->signal->cutime; 1053 cutime = current->signal->cutime;
1051 cstime = current->signal->cstime; 1054 cstime = current->signal->cstime;
1052 spin_unlock_irq(&current->sighand->siglock); 1055 spin_unlock_irq(&current->sighand->siglock);
@@ -1704,7 +1707,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1704 utime = stime = 0; 1707 utime = stime = 0;
1705 1708
1706 if (who == RUSAGE_THREAD) { 1709 if (who == RUSAGE_THREAD) {
1707 task_times(current, &utime, &stime); 1710 task_cputime_adjusted(current, &utime, &stime);
1708 accumulate_thread_rusage(p, r); 1711 accumulate_thread_rusage(p, r);
1709 maxrss = p->signal->maxrss; 1712 maxrss = p->signal->maxrss;
1710 goto out; 1713 goto out;
@@ -1730,7 +1733,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1730 break; 1733 break;
1731 1734
1732 case RUSAGE_SELF: 1735 case RUSAGE_SELF:
1733 thread_group_times(p, &tgutime, &tgstime); 1736 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1734 utime += tgutime; 1737 utime += tgutime;
1735 stime += tgstime; 1738 stime += tgstime;
1736 r->ru_nvcsw += p->signal->nvcsw; 1739 r->ru_nvcsw += p->signal->nvcsw;
@@ -1792,14 +1795,14 @@ SYSCALL_DEFINE1(umask, int, mask)
1792static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1795static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1793{ 1796{
1794 struct fd exe; 1797 struct fd exe;
1795 struct dentry *dentry; 1798 struct inode *inode;
1796 int err; 1799 int err;
1797 1800
1798 exe = fdget(fd); 1801 exe = fdget(fd);
1799 if (!exe.file) 1802 if (!exe.file)
1800 return -EBADF; 1803 return -EBADF;
1801 1804
1802 dentry = exe.file->f_path.dentry; 1805 inode = file_inode(exe.file);
1803 1806
1804 /* 1807 /*
1805 * Because the original mm->exe_file points to executable file, make 1808 * Because the original mm->exe_file points to executable file, make
@@ -1807,11 +1810,11 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1807 * overall picture. 1810 * overall picture.
1808 */ 1811 */
1809 err = -EACCES; 1812 err = -EACCES;
1810 if (!S_ISREG(dentry->d_inode->i_mode) || 1813 if (!S_ISREG(inode->i_mode) ||
1811 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) 1814 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1812 goto exit; 1815 goto exit;
1813 1816
1814 err = inode_permission(dentry->d_inode, MAY_EXEC); 1817 err = inode_permission(inode, MAY_EXEC);
1815 if (err) 1818 if (err)
1816 goto exit; 1819 goto exit;
1817 1820
@@ -2012,160 +2015,159 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2012 2015
2013 error = 0; 2016 error = 0;
2014 switch (option) { 2017 switch (option) {
2015 case PR_SET_PDEATHSIG: 2018 case PR_SET_PDEATHSIG:
2016 if (!valid_signal(arg2)) { 2019 if (!valid_signal(arg2)) {
2017 error = -EINVAL; 2020 error = -EINVAL;
2018 break;
2019 }
2020 me->pdeath_signal = arg2;
2021 break;
2022 case PR_GET_PDEATHSIG:
2023 error = put_user(me->pdeath_signal, (int __user *)arg2);
2024 break;
2025 case PR_GET_DUMPABLE:
2026 error = get_dumpable(me->mm);
2027 break; 2021 break;
2028 case PR_SET_DUMPABLE: 2022 }
2029 if (arg2 < 0 || arg2 > 1) { 2023 me->pdeath_signal = arg2;
2030 error = -EINVAL; 2024 break;
2031 break; 2025 case PR_GET_PDEATHSIG:
2032 } 2026 error = put_user(me->pdeath_signal, (int __user *)arg2);
2033 set_dumpable(me->mm, arg2); 2027 break;
2028 case PR_GET_DUMPABLE:
2029 error = get_dumpable(me->mm);
2030 break;
2031 case PR_SET_DUMPABLE:
2032 if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
2033 error = -EINVAL;
2034 break; 2034 break;
2035 }
2036 set_dumpable(me->mm, arg2);
2037 break;
2035 2038
2036 case PR_SET_UNALIGN: 2039 case PR_SET_UNALIGN:
2037 error = SET_UNALIGN_CTL(me, arg2); 2040 error = SET_UNALIGN_CTL(me, arg2);
2038 break; 2041 break;
2039 case PR_GET_UNALIGN: 2042 case PR_GET_UNALIGN:
2040 error = GET_UNALIGN_CTL(me, arg2); 2043 error = GET_UNALIGN_CTL(me, arg2);
2041 break; 2044 break;
2042 case PR_SET_FPEMU: 2045 case PR_SET_FPEMU:
2043 error = SET_FPEMU_CTL(me, arg2); 2046 error = SET_FPEMU_CTL(me, arg2);
2044 break; 2047 break;
2045 case PR_GET_FPEMU: 2048 case PR_GET_FPEMU:
2046 error = GET_FPEMU_CTL(me, arg2); 2049 error = GET_FPEMU_CTL(me, arg2);
2047 break; 2050 break;
2048 case PR_SET_FPEXC: 2051 case PR_SET_FPEXC:
2049 error = SET_FPEXC_CTL(me, arg2); 2052 error = SET_FPEXC_CTL(me, arg2);
2050 break; 2053 break;
2051 case PR_GET_FPEXC: 2054 case PR_GET_FPEXC:
2052 error = GET_FPEXC_CTL(me, arg2); 2055 error = GET_FPEXC_CTL(me, arg2);
2053 break; 2056 break;
2054 case PR_GET_TIMING: 2057 case PR_GET_TIMING:
2055 error = PR_TIMING_STATISTICAL; 2058 error = PR_TIMING_STATISTICAL;
2056 break; 2059 break;
2057 case PR_SET_TIMING: 2060 case PR_SET_TIMING:
2058 if (arg2 != PR_TIMING_STATISTICAL) 2061 if (arg2 != PR_TIMING_STATISTICAL)
2059 error = -EINVAL; 2062 error = -EINVAL;
2060 break; 2063 break;
2061 case PR_SET_NAME: 2064 case PR_SET_NAME:
2062 comm[sizeof(me->comm)-1] = 0; 2065 comm[sizeof(me->comm) - 1] = 0;
2063 if (strncpy_from_user(comm, (char __user *)arg2, 2066 if (strncpy_from_user(comm, (char __user *)arg2,
2064 sizeof(me->comm) - 1) < 0) 2067 sizeof(me->comm) - 1) < 0)
2065 return -EFAULT; 2068 return -EFAULT;
2066 set_task_comm(me, comm); 2069 set_task_comm(me, comm);
2067 proc_comm_connector(me); 2070 proc_comm_connector(me);
2068 break; 2071 break;
2069 case PR_GET_NAME: 2072 case PR_GET_NAME:
2070 get_task_comm(comm, me); 2073 get_task_comm(comm, me);
2071 if (copy_to_user((char __user *)arg2, comm, 2074 if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
2072 sizeof(comm))) 2075 return -EFAULT;
2073 return -EFAULT; 2076 break;
2074 break; 2077 case PR_GET_ENDIAN:
2075 case PR_GET_ENDIAN: 2078 error = GET_ENDIAN(me, arg2);
2076 error = GET_ENDIAN(me, arg2); 2079 break;
2077 break; 2080 case PR_SET_ENDIAN:
2078 case PR_SET_ENDIAN: 2081 error = SET_ENDIAN(me, arg2);
2079 error = SET_ENDIAN(me, arg2); 2082 break;
2080 break; 2083 case PR_GET_SECCOMP:
2081 case PR_GET_SECCOMP: 2084 error = prctl_get_seccomp();
2082 error = prctl_get_seccomp(); 2085 break;
2083 break; 2086 case PR_SET_SECCOMP:
2084 case PR_SET_SECCOMP: 2087 error = prctl_set_seccomp(arg2, (char __user *)arg3);
2085 error = prctl_set_seccomp(arg2, (char __user *)arg3); 2088 break;
2086 break; 2089 case PR_GET_TSC:
2087 case PR_GET_TSC: 2090 error = GET_TSC_CTL(arg2);
2088 error = GET_TSC_CTL(arg2); 2091 break;
2089 break; 2092 case PR_SET_TSC:
2090 case PR_SET_TSC: 2093 error = SET_TSC_CTL(arg2);
2091 error = SET_TSC_CTL(arg2); 2094 break;
2092 break; 2095 case PR_TASK_PERF_EVENTS_DISABLE:
2093 case PR_TASK_PERF_EVENTS_DISABLE: 2096 error = perf_event_task_disable();
2094 error = perf_event_task_disable(); 2097 break;
2095 break; 2098 case PR_TASK_PERF_EVENTS_ENABLE:
2096 case PR_TASK_PERF_EVENTS_ENABLE: 2099 error = perf_event_task_enable();
2097 error = perf_event_task_enable(); 2100 break;
2098 break; 2101 case PR_GET_TIMERSLACK:
2099 case PR_GET_TIMERSLACK: 2102 error = current->timer_slack_ns;
2100 error = current->timer_slack_ns; 2103 break;
2101 break; 2104 case PR_SET_TIMERSLACK:
2102 case PR_SET_TIMERSLACK: 2105 if (arg2 <= 0)
2103 if (arg2 <= 0) 2106 current->timer_slack_ns =
2104 current->timer_slack_ns =
2105 current->default_timer_slack_ns; 2107 current->default_timer_slack_ns;
2106 else 2108 else
2107 current->timer_slack_ns = arg2; 2109 current->timer_slack_ns = arg2;
2108 break; 2110 break;
2109 case PR_MCE_KILL: 2111 case PR_MCE_KILL:
2110 if (arg4 | arg5) 2112 if (arg4 | arg5)
2111 return -EINVAL; 2113 return -EINVAL;
2112 switch (arg2) { 2114 switch (arg2) {
2113 case PR_MCE_KILL_CLEAR: 2115 case PR_MCE_KILL_CLEAR:
2114 if (arg3 != 0) 2116 if (arg3 != 0)
2115 return -EINVAL;
2116 current->flags &= ~PF_MCE_PROCESS;
2117 break;
2118 case PR_MCE_KILL_SET:
2119 current->flags |= PF_MCE_PROCESS;
2120 if (arg3 == PR_MCE_KILL_EARLY)
2121 current->flags |= PF_MCE_EARLY;
2122 else if (arg3 == PR_MCE_KILL_LATE)
2123 current->flags &= ~PF_MCE_EARLY;
2124 else if (arg3 == PR_MCE_KILL_DEFAULT)
2125 current->flags &=
2126 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
2127 else
2128 return -EINVAL;
2129 break;
2130 default:
2131 return -EINVAL; 2117 return -EINVAL;
2132 } 2118 current->flags &= ~PF_MCE_PROCESS;
2133 break; 2119 break;
2134 case PR_MCE_KILL_GET: 2120 case PR_MCE_KILL_SET:
2135 if (arg2 | arg3 | arg4 | arg5) 2121 current->flags |= PF_MCE_PROCESS;
2136 return -EINVAL; 2122 if (arg3 == PR_MCE_KILL_EARLY)
2137 if (current->flags & PF_MCE_PROCESS) 2123 current->flags |= PF_MCE_EARLY;
2138 error = (current->flags & PF_MCE_EARLY) ? 2124 else if (arg3 == PR_MCE_KILL_LATE)
2139 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; 2125 current->flags &= ~PF_MCE_EARLY;
2126 else if (arg3 == PR_MCE_KILL_DEFAULT)
2127 current->flags &=
2128 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
2140 else 2129 else
2141 error = PR_MCE_KILL_DEFAULT;
2142 break;
2143 case PR_SET_MM:
2144 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2145 break;
2146 case PR_GET_TID_ADDRESS:
2147 error = prctl_get_tid_address(me, (int __user **)arg2);
2148 break;
2149 case PR_SET_CHILD_SUBREAPER:
2150 me->signal->is_child_subreaper = !!arg2;
2151 break;
2152 case PR_GET_CHILD_SUBREAPER:
2153 error = put_user(me->signal->is_child_subreaper,
2154 (int __user *) arg2);
2155 break;
2156 case PR_SET_NO_NEW_PRIVS:
2157 if (arg2 != 1 || arg3 || arg4 || arg5)
2158 return -EINVAL; 2130 return -EINVAL;
2159
2160 current->no_new_privs = 1;
2161 break; 2131 break;
2162 case PR_GET_NO_NEW_PRIVS:
2163 if (arg2 || arg3 || arg4 || arg5)
2164 return -EINVAL;
2165 return current->no_new_privs ? 1 : 0;
2166 default: 2132 default:
2167 error = -EINVAL; 2133 return -EINVAL;
2168 break; 2134 }
2135 break;
2136 case PR_MCE_KILL_GET:
2137 if (arg2 | arg3 | arg4 | arg5)
2138 return -EINVAL;
2139 if (current->flags & PF_MCE_PROCESS)
2140 error = (current->flags & PF_MCE_EARLY) ?
2141 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
2142 else
2143 error = PR_MCE_KILL_DEFAULT;
2144 break;
2145 case PR_SET_MM:
2146 error = prctl_set_mm(arg2, arg3, arg4, arg5);
2147 break;
2148 case PR_GET_TID_ADDRESS:
2149 error = prctl_get_tid_address(me, (int __user **)arg2);
2150 break;
2151 case PR_SET_CHILD_SUBREAPER:
2152 me->signal->is_child_subreaper = !!arg2;
2153 break;
2154 case PR_GET_CHILD_SUBREAPER:
2155 error = put_user(me->signal->is_child_subreaper,
2156 (int __user *)arg2);
2157 break;
2158 case PR_SET_NO_NEW_PRIVS:
2159 if (arg2 != 1 || arg3 || arg4 || arg5)
2160 return -EINVAL;
2161
2162 current->no_new_privs = 1;
2163 break;
2164 case PR_GET_NO_NEW_PRIVS:
2165 if (arg2 || arg3 || arg4 || arg5)
2166 return -EINVAL;
2167 return current->no_new_privs ? 1 : 0;
2168 default:
2169 error = -EINVAL;
2170 break;
2169 } 2171 }
2170 return error; 2172 return error;
2171} 2173}
@@ -2184,14 +2186,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2184 2186
2185char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 2187char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2186 2188
2187static void argv_cleanup(struct subprocess_info *info) 2189static int __orderly_poweroff(bool force)
2188{ 2190{
2189 argv_free(info->argv);
2190}
2191
2192static int __orderly_poweroff(void)
2193{
2194 int argc;
2195 char **argv; 2191 char **argv;
2196 static char *envp[] = { 2192 static char *envp[] = {
2197 "HOME=/", 2193 "HOME=/",
@@ -2200,21 +2196,40 @@ static int __orderly_poweroff(void)
2200 }; 2196 };
2201 int ret; 2197 int ret;
2202 2198
2203 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); 2199 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
2204 if (argv == NULL) { 2200 if (argv) {
2201 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
2202 argv_free(argv);
2203 } else {
2205 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2204 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2206 __func__, poweroff_cmd); 2205 __func__, poweroff_cmd);
2207 return -ENOMEM; 2206 ret = -ENOMEM;
2208 } 2207 }
2209 2208
2210 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, 2209 if (ret && force) {
2211 NULL, argv_cleanup, NULL); 2210 printk(KERN_WARNING "Failed to start orderly shutdown: "
2212 if (ret == -ENOMEM) 2211 "forcing the issue\n");
2213 argv_free(argv); 2212 /*
2213 * I guess this should try to kick off some daemon to sync and
2214 * poweroff asap. Or not even bother syncing if we're doing an
2215 * emergency shutdown?
2216 */
2217 emergency_sync();
2218 kernel_power_off();
2219 }
2214 2220
2215 return ret; 2221 return ret;
2216} 2222}
2217 2223
2224static bool poweroff_force;
2225
2226static void poweroff_work_func(struct work_struct *work)
2227{
2228 __orderly_poweroff(poweroff_force);
2229}
2230
2231static DECLARE_WORK(poweroff_work, poweroff_work_func);
2232
2218/** 2233/**
2219 * orderly_poweroff - Trigger an orderly system poweroff 2234 * orderly_poweroff - Trigger an orderly system poweroff
2220 * @force: force poweroff if command execution fails 2235 * @force: force poweroff if command execution fails
@@ -2224,21 +2239,9 @@ static int __orderly_poweroff(void)
2224 */ 2239 */
2225int orderly_poweroff(bool force) 2240int orderly_poweroff(bool force)
2226{ 2241{
2227 int ret = __orderly_poweroff(); 2242 if (force) /* do not override the pending "true" */
2228 2243 poweroff_force = true;
2229 if (ret && force) { 2244 schedule_work(&poweroff_work);
2230 printk(KERN_WARNING "Failed to start orderly shutdown: " 2245 return 0;
2231 "forcing the issue\n");
2232
2233 /*
2234 * I guess this should try to kick off some daemon to sync and
2235 * poweroff asap. Or not even bother syncing if we're doing an
2236 * emergency shutdown?
2237 */
2238 emergency_sync();
2239 kernel_power_off();
2240 }
2241
2242 return ret;
2243} 2246}
2244EXPORT_SYMBOL_GPL(orderly_poweroff); 2247EXPORT_SYMBOL_GPL(orderly_poweroff);
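orderly_poweroff() above becomes non-blocking: it only latches the force flag and schedules poweroff_work, and the work item performs the usermode-helper call plus, if that fails with force set, the emergency sync and power-off. A pthread sketch of deferring the blocking step to a worker; the thread stands in for the workqueue, and run_poweroff_helper() is a hypothetical placeholder for the real helper invocation.

```c
#include <pthread.h>
#include <stdio.h>

static int poweroff_force;                 /* models the pending "force" flag */

static int run_poweroff_helper(void)       /* models __orderly_poweroff() */
{
        printf("running shutdown helper (force=%d)\n", poweroff_force);
        return 0;                          /* pretend the helper succeeded */
}

static void *poweroff_work_func(void *arg) /* models the work item */
{
        (void)arg;
        run_poweroff_helper();
        return NULL;
}

/* Models orderly_poweroff(): never blocks the caller, only queues work. */
static pthread_t orderly_poweroff_model(int force)
{
        pthread_t worker;

        if (force)                         /* do not override a pending "true" */
                poweroff_force = 1;
        pthread_create(&worker, NULL, poweroff_work_func, NULL);
        return worker;
}

int main(void)
{
        pthread_t w = orderly_poweroff_model(1);

        pthread_join(w, NULL);             /* stands in for the workqueue running it */
        return 0;
}
```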
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index dbff751e4086..395084d4ce16 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapoff);
25cond_syscall(sys_kexec_load); 25cond_syscall(sys_kexec_load);
26cond_syscall(compat_sys_kexec_load); 26cond_syscall(compat_sys_kexec_load);
27cond_syscall(sys_init_module); 27cond_syscall(sys_init_module);
28cond_syscall(sys_finit_module);
28cond_syscall(sys_delete_module); 29cond_syscall(sys_delete_module);
29cond_syscall(sys_socketpair); 30cond_syscall(sys_socketpair);
30cond_syscall(sys_bind); 31cond_syscall(sys_bind);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65eaa01f9..afc1dc60f3f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -61,6 +61,7 @@
61#include <linux/kmod.h> 61#include <linux/kmod.h>
62#include <linux/capability.h> 62#include <linux/capability.h>
63#include <linux/binfmts.h> 63#include <linux/binfmts.h>
64#include <linux/sched/sysctl.h>
64 65
65#include <asm/uaccess.h> 66#include <asm/uaccess.h>
66#include <asm/processor.h> 67#include <asm/processor.h>
@@ -104,7 +105,6 @@ extern char core_pattern[];
104extern unsigned int core_pipe_limit; 105extern unsigned int core_pipe_limit;
105#endif 106#endif
106extern int pid_max; 107extern int pid_max;
107extern int min_free_kbytes;
108extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
109extern int sysctl_drop_caches; 109extern int sysctl_drop_caches;
110extern int percpu_pagelist_fraction; 110extern int percpu_pagelist_fraction;
@@ -157,14 +157,20 @@ extern int sysctl_tsb_ratio;
157 157
158#ifdef __hppa__ 158#ifdef __hppa__
159extern int pwrsw_enabled; 159extern int pwrsw_enabled;
160#endif
161
162#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
160extern int unaligned_enabled; 163extern int unaligned_enabled;
161#endif 164#endif
162 165
163#ifdef CONFIG_IA64 166#ifdef CONFIG_IA64
164extern int no_unaligned_warning;
165extern int unaligned_dump_stack; 167extern int unaligned_dump_stack;
166#endif 168#endif
167 169
170#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
171extern int no_unaligned_warning;
172#endif
173
168#ifdef CONFIG_PROC_SYSCTL 174#ifdef CONFIG_PROC_SYSCTL
169static int proc_do_cad_pid(struct ctl_table *table, int write, 175static int proc_do_cad_pid(struct ctl_table *table, int write,
170 void __user *buffer, size_t *lenp, loff_t *ppos); 176 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -256,9 +262,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
256static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 262static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
257static int min_wakeup_granularity_ns; /* 0 usecs */ 263static int min_wakeup_granularity_ns; /* 0 usecs */
258static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 264static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
265#ifdef CONFIG_SMP
259static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 266static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
260static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 267static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
261#endif 268#endif /* CONFIG_SMP */
269#endif /* CONFIG_SCHED_DEBUG */
262 270
263#ifdef CONFIG_COMPACTION 271#ifdef CONFIG_COMPACTION
264static int min_extfrag_threshold; 272static int min_extfrag_threshold;
@@ -301,6 +309,7 @@ static struct ctl_table kern_table[] = {
301 .extra1 = &min_wakeup_granularity_ns, 309 .extra1 = &min_wakeup_granularity_ns,
302 .extra2 = &max_wakeup_granularity_ns, 310 .extra2 = &max_wakeup_granularity_ns,
303 }, 311 },
312#ifdef CONFIG_SMP
304 { 313 {
305 .procname = "sched_tunable_scaling", 314 .procname = "sched_tunable_scaling",
306 .data = &sysctl_sched_tunable_scaling, 315 .data = &sysctl_sched_tunable_scaling,
@@ -347,7 +356,45 @@ static struct ctl_table kern_table[] = {
347 .extra1 = &zero, 356 .extra1 = &zero,
348 .extra2 = &one, 357 .extra2 = &one,
349 }, 358 },
350#endif 359#endif /* CONFIG_SMP */
360#ifdef CONFIG_NUMA_BALANCING
361 {
362 .procname = "numa_balancing_scan_delay_ms",
363 .data = &sysctl_numa_balancing_scan_delay,
364 .maxlen = sizeof(unsigned int),
365 .mode = 0644,
366 .proc_handler = proc_dointvec,
367 },
368 {
369 .procname = "numa_balancing_scan_period_min_ms",
370 .data = &sysctl_numa_balancing_scan_period_min,
371 .maxlen = sizeof(unsigned int),
372 .mode = 0644,
373 .proc_handler = proc_dointvec,
374 },
375 {
376 .procname = "numa_balancing_scan_period_reset",
377 .data = &sysctl_numa_balancing_scan_period_reset,
378 .maxlen = sizeof(unsigned int),
379 .mode = 0644,
380 .proc_handler = proc_dointvec,
381 },
382 {
383 .procname = "numa_balancing_scan_period_max_ms",
384 .data = &sysctl_numa_balancing_scan_period_max,
385 .maxlen = sizeof(unsigned int),
386 .mode = 0644,
387 .proc_handler = proc_dointvec,
388 },
389 {
390 .procname = "numa_balancing_scan_size_mb",
391 .data = &sysctl_numa_balancing_scan_size,
392 .maxlen = sizeof(unsigned int),
393 .mode = 0644,
394 .proc_handler = proc_dointvec,
395 },
396#endif /* CONFIG_NUMA_BALANCING */
397#endif /* CONFIG_SCHED_DEBUG */
351 { 398 {
352 .procname = "sched_rt_period_us", 399 .procname = "sched_rt_period_us",
353 .data = &sysctl_sched_rt_period, 400 .data = &sysctl_sched_rt_period,
@@ -362,6 +409,13 @@ static struct ctl_table kern_table[] = {
362 .mode = 0644, 409 .mode = 0644,
363 .proc_handler = sched_rt_handler, 410 .proc_handler = sched_rt_handler,
364 }, 411 },
412 {
413 .procname = "sched_rr_timeslice_ms",
414 .data = &sched_rr_timeslice,
415 .maxlen = sizeof(int),
416 .mode = 0644,
417 .proc_handler = sched_rr_handler,
418 },
365#ifdef CONFIG_SCHED_AUTOGROUP 419#ifdef CONFIG_SCHED_AUTOGROUP
366 { 420 {
367 .procname = "sched_autogroup_enabled", 421 .procname = "sched_autogroup_enabled",
@@ -504,6 +558,8 @@ static struct ctl_table kern_table[] = {
504 .mode = 0644, 558 .mode = 0644,
505 .proc_handler = proc_dointvec, 559 .proc_handler = proc_dointvec,
506 }, 560 },
561#endif
562#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW
507 { 563 {
508 .procname = "unaligned-trap", 564 .procname = "unaligned-trap",
509 .data = &unaligned_enabled, 565 .data = &unaligned_enabled,
@@ -565,7 +621,7 @@ static struct ctl_table kern_table[] = {
565 .extra2 = &one, 621 .extra2 = &one,
566 }, 622 },
567#endif 623#endif
568#ifdef CONFIG_HOTPLUG 624
569 { 625 {
570 .procname = "hotplug", 626 .procname = "hotplug",
571 .data = &uevent_helper, 627 .data = &uevent_helper,
@@ -573,7 +629,7 @@ static struct ctl_table kern_table[] = {
573 .mode = 0644, 629 .mode = 0644,
574 .proc_handler = proc_dostring, 630 .proc_handler = proc_dostring,
575 }, 631 },
576#endif 632
577#ifdef CONFIG_CHR_DEV_SG 633#ifdef CONFIG_CHR_DEV_SG
578 { 634 {
579 .procname = "sg-big-buff", 635 .procname = "sg-big-buff",
@@ -870,7 +926,7 @@ static struct ctl_table kern_table[] = {
870 .proc_handler = proc_doulongvec_minmax, 926 .proc_handler = proc_doulongvec_minmax,
871 }, 927 },
872#endif 928#endif
873#ifdef CONFIG_IA64 929#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
874 { 930 {
875 .procname = "ignore-unaligned-usertrap", 931 .procname = "ignore-unaligned-usertrap",
876 .data = &no_unaligned_warning, 932 .data = &no_unaligned_warning,
@@ -878,6 +934,8 @@ static struct ctl_table kern_table[] = {
878 .mode = 0644, 934 .mode = 0644,
879 .proc_handler = proc_dointvec, 935 .proc_handler = proc_dointvec,
880 }, 936 },
937#endif
938#ifdef CONFIG_IA64
881 { 939 {
882 .procname = "unaligned-dump-stack", 940 .procname = "unaligned-dump-stack",
883 .data = &unaligned_dump_stack, 941 .data = &unaligned_dump_stack,
@@ -1965,7 +2023,7 @@ static int proc_taint(struct ctl_table *table, int write,
1965 int i; 2023 int i;
1966 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { 2024 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
1967 if ((tmptaint >> i) & 1) 2025 if ((tmptaint >> i) & 1)
1968 add_taint(i); 2026 add_taint(i, LOCKDEP_STILL_OK);
1969 } 2027 }
1970 } 2028 }
1971 2029
@@ -2042,7 +2100,7 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
2042static void validate_coredump_safety(void) 2100static void validate_coredump_safety(void)
2043{ 2101{
2044#ifdef CONFIG_COREDUMP 2102#ifdef CONFIG_COREDUMP
2045 if (suid_dumpable == SUID_DUMPABLE_SAFE && 2103 if (suid_dumpable == SUID_DUMP_ROOT &&
2046 core_pattern[0] != '/' && core_pattern[0] != '|') { 2104 core_pattern[0] != '/' && core_pattern[0] != '|') {
2047 printk(KERN_WARNING "Unsafe core_pattern used with "\ 2105 printk(KERN_WARNING "Unsafe core_pattern used with "\
2048 "suid_dumpable=2. Pipe handler or fully qualified "\ 2106 "suid_dumpable=2. Pipe handler or fully qualified "\
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 65bdcf198d4e..ebf72358e86a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -387,7 +387,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, 387 { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" },
388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, 388 { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" },
389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, 389 { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" },
390 { CTL_INT, NET_TCP_ABC, "tcp_abc" },
391 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, 390 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
392 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, 391 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
393 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, 392 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
@@ -971,7 +970,6 @@ out:
971static ssize_t bin_intvec(struct file *file, 970static ssize_t bin_intvec(struct file *file,
972 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 971 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
973{ 972{
974 mm_segment_t old_fs = get_fs();
975 ssize_t copied = 0; 973 ssize_t copied = 0;
976 char *buffer; 974 char *buffer;
977 ssize_t result; 975 ssize_t result;
@@ -984,13 +982,10 @@ static ssize_t bin_intvec(struct file *file,
984 if (oldval && oldlen) { 982 if (oldval && oldlen) {
985 unsigned __user *vec = oldval; 983 unsigned __user *vec = oldval;
986 size_t length = oldlen / sizeof(*vec); 984 size_t length = oldlen / sizeof(*vec);
987 loff_t pos = 0;
988 char *str, *end; 985 char *str, *end;
989 int i; 986 int i;
990 987
991 set_fs(KERNEL_DS); 988 result = kernel_read(file, 0, buffer, BUFSZ - 1);
992 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
993 set_fs(old_fs);
994 if (result < 0) 989 if (result < 0)
995 goto out_kfree; 990 goto out_kfree;
996 991
@@ -1017,7 +1012,6 @@ static ssize_t bin_intvec(struct file *file,
1017 if (newval && newlen) { 1012 if (newval && newlen) {
1018 unsigned __user *vec = newval; 1013 unsigned __user *vec = newval;
1019 size_t length = newlen / sizeof(*vec); 1014 size_t length = newlen / sizeof(*vec);
1020 loff_t pos = 0;
1021 char *str, *end; 1015 char *str, *end;
1022 int i; 1016 int i;
1023 1017
@@ -1033,9 +1027,7 @@ static ssize_t bin_intvec(struct file *file,
1033 str += snprintf(str, end - str, "%lu\t", value); 1027 str += snprintf(str, end - str, "%lu\t", value);
1034 } 1028 }
1035 1029
1036 set_fs(KERNEL_DS); 1030 result = kernel_write(file, buffer, str - buffer, 0);
1037 result = vfs_write(file, buffer, str - buffer, &pos);
1038 set_fs(old_fs);
1039 if (result < 0) 1031 if (result < 0)
1040 goto out_kfree; 1032 goto out_kfree;
1041 } 1033 }
@@ -1049,7 +1041,6 @@ out:
1049static ssize_t bin_ulongvec(struct file *file, 1041static ssize_t bin_ulongvec(struct file *file,
1050 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1042 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1051{ 1043{
1052 mm_segment_t old_fs = get_fs();
1053 ssize_t copied = 0; 1044 ssize_t copied = 0;
1054 char *buffer; 1045 char *buffer;
1055 ssize_t result; 1046 ssize_t result;
@@ -1062,13 +1053,10 @@ static ssize_t bin_ulongvec(struct file *file,
1062 if (oldval && oldlen) { 1053 if (oldval && oldlen) {
1063 unsigned long __user *vec = oldval; 1054 unsigned long __user *vec = oldval;
1064 size_t length = oldlen / sizeof(*vec); 1055 size_t length = oldlen / sizeof(*vec);
1065 loff_t pos = 0;
1066 char *str, *end; 1056 char *str, *end;
1067 int i; 1057 int i;
1068 1058
1069 set_fs(KERNEL_DS); 1059 result = kernel_read(file, 0, buffer, BUFSZ - 1);
1070 result = vfs_read(file, buffer, BUFSZ - 1, &pos);
1071 set_fs(old_fs);
1072 if (result < 0) 1060 if (result < 0)
1073 goto out_kfree; 1061 goto out_kfree;
1074 1062
@@ -1095,7 +1083,6 @@ static ssize_t bin_ulongvec(struct file *file,
1095 if (newval && newlen) { 1083 if (newval && newlen) {
1096 unsigned long __user *vec = newval; 1084 unsigned long __user *vec = newval;
1097 size_t length = newlen / sizeof(*vec); 1085 size_t length = newlen / sizeof(*vec);
1098 loff_t pos = 0;
1099 char *str, *end; 1086 char *str, *end;
1100 int i; 1087 int i;
1101 1088
@@ -1111,9 +1098,7 @@ static ssize_t bin_ulongvec(struct file *file,
1111 str += snprintf(str, end - str, "%lu\t", value); 1098 str += snprintf(str, end - str, "%lu\t", value);
1112 } 1099 }
1113 1100
1114 set_fs(KERNEL_DS); 1101 result = kernel_write(file, buffer, str - buffer, 0);
1115 result = vfs_write(file, buffer, str - buffer, &pos);
1116 set_fs(old_fs);
1117 if (result < 0) 1102 if (result < 0)
1118 goto out_kfree; 1103 goto out_kfree;
1119 } 1104 }
@@ -1127,19 +1112,15 @@ out:
1127static ssize_t bin_uuid(struct file *file, 1112static ssize_t bin_uuid(struct file *file,
1128 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1113 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1129{ 1114{
1130 mm_segment_t old_fs = get_fs();
1131 ssize_t result, copied = 0; 1115 ssize_t result, copied = 0;
1132 1116
1133 /* Only supports reads */ 1117 /* Only supports reads */
1134 if (oldval && oldlen) { 1118 if (oldval && oldlen) {
1135 loff_t pos = 0;
1136 char buf[40], *str = buf; 1119 char buf[40], *str = buf;
1137 unsigned char uuid[16]; 1120 unsigned char uuid[16];
1138 int i; 1121 int i;
1139 1122
1140 set_fs(KERNEL_DS); 1123 result = kernel_read(file, 0, buf, sizeof(buf) - 1);
1141 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1142 set_fs(old_fs);
1143 if (result < 0) 1124 if (result < 0)
1144 goto out; 1125 goto out;
1145 1126
@@ -1175,18 +1156,14 @@ out:
1175static ssize_t bin_dn_node_address(struct file *file, 1156static ssize_t bin_dn_node_address(struct file *file,
1176 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1157 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1177{ 1158{
1178 mm_segment_t old_fs = get_fs();
1179 ssize_t result, copied = 0; 1159 ssize_t result, copied = 0;
1180 1160
1181 if (oldval && oldlen) { 1161 if (oldval && oldlen) {
1182 loff_t pos = 0;
1183 char buf[15], *nodep; 1162 char buf[15], *nodep;
1184 unsigned long area, node; 1163 unsigned long area, node;
1185 __le16 dnaddr; 1164 __le16 dnaddr;
1186 1165
1187 set_fs(KERNEL_DS); 1166 result = kernel_read(file, 0, buf, sizeof(buf) - 1);
1188 result = vfs_read(file, buf, sizeof(buf) - 1, &pos);
1189 set_fs(old_fs);
1190 if (result < 0) 1167 if (result < 0)
1191 goto out; 1168 goto out;
1192 1169
@@ -1194,9 +1171,10 @@ static ssize_t bin_dn_node_address(struct file *file,
1194 1171
1195 /* Convert the decnet address to binary */ 1172 /* Convert the decnet address to binary */
1196 result = -EIO; 1173 result = -EIO;
1197 nodep = strchr(buf, '.') + 1; 1174 nodep = strchr(buf, '.');
1198 if (!nodep) 1175 if (!nodep)
1199 goto out; 1176 goto out;
1177 ++nodep;
1200 1178
1201 area = simple_strtoul(buf, NULL, 10); 1179 area = simple_strtoul(buf, NULL, 10);
1202 node = simple_strtoul(nodep, NULL, 10); 1180 node = simple_strtoul(nodep, NULL, 10);
@@ -1215,7 +1193,6 @@ static ssize_t bin_dn_node_address(struct file *file,
1215 } 1193 }
1216 1194
1217 if (newval && newlen) { 1195 if (newval && newlen) {
1218 loff_t pos = 0;
1219 __le16 dnaddr; 1196 __le16 dnaddr;
1220 char buf[15]; 1197 char buf[15];
1221 int len; 1198 int len;
@@ -1232,9 +1209,7 @@ static ssize_t bin_dn_node_address(struct file *file,
1232 le16_to_cpu(dnaddr) >> 10, 1209 le16_to_cpu(dnaddr) >> 10,
1233 le16_to_cpu(dnaddr) & 0x3ff); 1210 le16_to_cpu(dnaddr) & 0x3ff);
1234 1211
1235 set_fs(KERNEL_DS); 1212 result = kernel_write(file, buf, len, 0);
1236 result = vfs_write(file, buf, len, &pos);
1237 set_fs(old_fs);
1238 if (result < 0) 1213 if (result < 0)
1239 goto out; 1214 goto out;
1240 } 1215 }
@@ -1344,7 +1319,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1344 goto out_putname; 1319 goto out_putname;
1345 } 1320 }
1346 1321
1347 mnt = current->nsproxy->pid_ns->proc_mnt; 1322 mnt = task_active_pid_ns(current)->proc_mnt;
1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags); 1323 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1349 result = PTR_ERR(file); 1324 result = PTR_ERR(file);
1350 if (IS_ERR(file)) 1325 if (IS_ERR(file))
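
The sysctl_binary.c hunks all remove the same idiom: widening the address limit with set_fs(KERNEL_DS) so that vfs_read()/vfs_write() accept a kernel buffer. kernel_read()/kernel_write() handle the kernel buffer directly, so each caller shrinks to one line. A side-by-side sketch of the two patterns (the helper names are illustrative):

#include <linux/fs.h>
#include <linux/uaccess.h>

/* Old idiom: pretend the kernel buffer is a user pointer for vfs_read(). */
static ssize_t kbuf_read_old(struct file *file, char *buf, size_t len)
{
	mm_segment_t old_fs = get_fs();
	loff_t pos = 0;
	ssize_t ret;

	set_fs(KERNEL_DS);
	ret = vfs_read(file, (char __user *)buf, len, &pos);
	set_fs(old_fs);
	return ret;
}

/* New idiom: kernel_read() takes a kernel buffer and an offset directly
 * (offset 0 here, as in the conversions above). */
static ssize_t kbuf_read_new(struct file *file, char *buf, size_t len)
{
	return kernel_read(file, 0, buf, len);
}
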
diff --git a/kernel/time.c b/kernel/time.c
index d226c6a3fd28..f8342a41efa6 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -115,6 +115,12 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
115} 115}
116 116
117/* 117/*
118 * Indicates if there is an offset between the system clock and the hardware
119 * clock/persistent clock/rtc.
120 */
121int persistent_clock_is_local;
122
123/*
118 * Adjust the time obtained from the CMOS to be UTC time instead of 124 * Adjust the time obtained from the CMOS to be UTC time instead of
119 * local time. 125 * local time.
120 * 126 *
@@ -135,6 +141,8 @@ static inline void warp_clock(void)
135 struct timespec adjust; 141 struct timespec adjust;
136 142
137 adjust = current_kernel_time(); 143 adjust = current_kernel_time();
144 if (sys_tz.tz_minuteswest != 0)
145 persistent_clock_is_local = 1;
138 adjust.tv_sec += sys_tz.tz_minuteswest * 60; 146 adjust.tv_sec += sys_tz.tz_minuteswest * 60;
139 do_settimeofday(&adjust); 147 do_settimeofday(&adjust);
140} 148}
@@ -232,7 +240,7 @@ EXPORT_SYMBOL(current_fs_time);
232 * Avoid unnecessary multiplications/divisions in the 240 * Avoid unnecessary multiplications/divisions in the
233 * two most common HZ cases: 241 * two most common HZ cases:
234 */ 242 */
235inline unsigned int jiffies_to_msecs(const unsigned long j) 243unsigned int jiffies_to_msecs(const unsigned long j)
236{ 244{
237#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) 245#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
238 return (MSEC_PER_SEC / HZ) * j; 246 return (MSEC_PER_SEC / HZ) * j;
@@ -248,7 +256,7 @@ inline unsigned int jiffies_to_msecs(const unsigned long j)
248} 256}
249EXPORT_SYMBOL(jiffies_to_msecs); 257EXPORT_SYMBOL(jiffies_to_msecs);
250 258
251inline unsigned int jiffies_to_usecs(const unsigned long j) 259unsigned int jiffies_to_usecs(const unsigned long j)
252{ 260{
253#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 261#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
254 return (USEC_PER_SEC / HZ) * j; 262 return (USEC_PER_SEC / HZ) * j;
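
persistent_clock_is_local, set in warp_clock() the first time userspace installs a non-zero timezone, records that the CMOS/RTC runs in local time rather than UTC. Code reading the hardware clock then adds sys_tz.tz_minuteswest * 60 to obtain UTC, and code writing it back subtracts the same offset (as sync_cmos_clock() does later in this series). A small sketch of both directions, with illustrative helper names:

#include <linux/time.h>

extern int persistent_clock_is_local;		/* added by this patch */

/* RTC value -> UTC: the direction warp_clock() handles at boot. */
static time_t rtc_local_to_utc(time_t rtc_sec)
{
	if (persistent_clock_is_local)
		rtc_sec += sys_tz.tz_minuteswest * 60;
	return rtc_sec;
}

/* UTC -> RTC value: the direction used when syncing the clock back. */
static time_t utc_to_rtc_local(time_t utc_sec)
{
	if (persistent_clock_is_local)
		utc_sec -= sys_tz.tz_minuteswest * 60;
	return utc_sec;
}
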
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8601f0db1261..24510d84efd7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
12config ARCH_CLOCKSOURCE_DATA 12config ARCH_CLOCKSOURCE_DATA
13 bool 13 bool
14 14
15# Platforms that have a persistent clock

16config ALWAYS_USE_PERSISTENT_CLOCK
17 bool
18 default n
19
15# Timekeeping vsyscall support 20# Timekeeping vsyscall support
16config GENERIC_TIME_VSYSCALL 21config GENERIC_TIME_VSYSCALL
17 bool 22 bool
@@ -38,6 +43,10 @@ config GENERIC_CLOCKEVENTS_BUILD
38 default y 43 default y
39 depends on GENERIC_CLOCKEVENTS 44 depends on GENERIC_CLOCKEVENTS
40 45
46# Architecture can handle broadcast in a driver-agnostic way
47config ARCH_HAS_TICK_BROADCAST
48 bool
49
41# Clockevents broadcasting infrastructure 50# Clockevents broadcasting infrastructure
42config GENERIC_CLOCKEVENTS_BROADCAST 51config GENERIC_CLOCKEVENTS_BROADCAST
43 bool 52 bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index e2fd74b8e8c2..ff7d9d2ab504 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
2obj-y += timeconv.o posix-clock.o alarmtimer.o 2obj-y += timeconv.o posix-clock.o alarmtimer.o
3 3
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 30b6de0d977c..c6d6400ee137 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -339,6 +339,7 @@ void clockevents_config_and_register(struct clock_event_device *dev,
339 clockevents_config(dev, freq); 339 clockevents_config(dev, freq);
340 clockevents_register_device(dev); 340 clockevents_register_device(dev);
341} 341}
342EXPORT_SYMBOL_GPL(clockevents_config_and_register);
342 343
343/** 344/**
344 * clockevents_update_freq - Update frequency and reprogram a clock event device. 345 * clockevents_update_freq - Update frequency and reprogram a clock event device.
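
Exporting clockevents_config_and_register() lets clock-event drivers built as modules use the combined configure-and-register helper instead of open-coding the mult/shift math. A sketch of a driver-side call; the device fields and names are illustrative, not from this patch:

#include <linux/clockchips.h>

static struct clock_event_device example_clkevt = {
	.name		= "example-timer",
	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.rating		= 300,
	/* .set_mode and .set_next_event are supplied by the real driver */
};

static void example_timer_setup(u32 clock_rate_hz)
{
	/* Arguments: frequency in Hz, then the minimum and maximum
	 * programmable delta in timer cycles; mult/shift are derived
	 * internally before the device is registered. */
	clockevents_config_and_register(&example_clkevt, clock_rate_hz,
					0xf, 0x7fffffff);
}
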
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 6629bf7b5285..7a925ba456fb 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -58,7 +58,7 @@ static cycle_t jiffies_read(struct clocksource *cs)
58 return (cycle_t) jiffies; 58 return (cycle_t) jiffies;
59} 59}
60 60
61struct clocksource clocksource_jiffies = { 61static struct clocksource clocksource_jiffies = {
62 .name = "jiffies", 62 .name = "jiffies",
63 .rating = 1, /* lowest valid rating*/ 63 .rating = 1, /* lowest valid rating*/
64 .read = jiffies_read, 64 .read = jiffies_read,
@@ -67,6 +67,8 @@ struct clocksource clocksource_jiffies = {
67 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
68}; 68};
69 69
70__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
71
70#if (BITS_PER_LONG < 64) 72#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void) 73u64 get_jiffies_64(void)
72{ 74{
@@ -74,9 +76,9 @@ u64 get_jiffies_64(void)
74 u64 ret; 76 u64 ret;
75 77
76 do { 78 do {
77 seq = read_seqbegin(&xtime_lock); 79 seq = read_seqbegin(&jiffies_lock);
78 ret = jiffies_64; 80 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq)); 81 } while (read_seqretry(&jiffies_lock, seq));
80 return ret; 82 return ret;
81} 83}
82EXPORT_SYMBOL(get_jiffies_64); 84EXPORT_SYMBOL(get_jiffies_64);
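
With xtime_lock gone, jiffies_64 is published under its own seqlock. The reader side is shown above in get_jiffies_64(); the writer side must take jiffies_lock for write around do_timer(), which is exactly what xtime_update() does after the timekeeping.c conversion further down. A condensed sketch of that pairing:

#include <linux/seqlock.h>

extern seqlock_t jiffies_lock;			/* defined above */
extern void do_timer(unsigned long ticks);	/* declared in tick-internal.h */

/* Writer: advance jiffies_64 only while holding jiffies_lock for write. */
static void example_advance_jiffies(unsigned long ticks)
{
	write_seqlock(&jiffies_lock);
	do_timer(ticks);
	write_sequnlock(&jiffies_lock);
}
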
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 24174b4d669b..072bb066bb7d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -15,6 +15,7 @@
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rtc.h>
18 19
19#include "tick-internal.h" 20#include "tick-internal.h"
20 21
@@ -22,7 +23,7 @@
22 * NTP timekeeping variables: 23 * NTP timekeeping variables:
23 */ 24 */
24 25
25DEFINE_SPINLOCK(ntp_lock); 26DEFINE_RAW_SPINLOCK(ntp_lock);
26 27
27 28
28/* USER_HZ period (usecs): */ 29/* USER_HZ period (usecs): */
@@ -347,7 +348,7 @@ void ntp_clear(void)
347{ 348{
348 unsigned long flags; 349 unsigned long flags;
349 350
350 spin_lock_irqsave(&ntp_lock, flags); 351 raw_spin_lock_irqsave(&ntp_lock, flags);
351 352
352 time_adjust = 0; /* stop active adjtime() */ 353 time_adjust = 0; /* stop active adjtime() */
353 time_status |= STA_UNSYNC; 354 time_status |= STA_UNSYNC;
@@ -361,7 +362,7 @@ void ntp_clear(void)
361 362
362 /* Clear PPS state variables */ 363 /* Clear PPS state variables */
363 pps_clear(); 364 pps_clear();
364 spin_unlock_irqrestore(&ntp_lock, flags); 365 raw_spin_unlock_irqrestore(&ntp_lock, flags);
365 366
366} 367}
367 368
@@ -371,9 +372,9 @@ u64 ntp_tick_length(void)
371 unsigned long flags; 372 unsigned long flags;
372 s64 ret; 373 s64 ret;
373 374
374 spin_lock_irqsave(&ntp_lock, flags); 375 raw_spin_lock_irqsave(&ntp_lock, flags);
375 ret = tick_length; 376 ret = tick_length;
376 spin_unlock_irqrestore(&ntp_lock, flags); 377 raw_spin_unlock_irqrestore(&ntp_lock, flags);
377 return ret; 378 return ret;
378} 379}
379 380
@@ -394,7 +395,7 @@ int second_overflow(unsigned long secs)
394 int leap = 0; 395 int leap = 0;
395 unsigned long flags; 396 unsigned long flags;
396 397
397 spin_lock_irqsave(&ntp_lock, flags); 398 raw_spin_lock_irqsave(&ntp_lock, flags);
398 399
399 /* 400 /*
400 * Leap second processing. If in leap-insert state at the end of the 401 * Leap second processing. If in leap-insert state at the end of the
@@ -478,13 +479,12 @@ int second_overflow(unsigned long secs)
478 time_adjust = 0; 479 time_adjust = 0;
479 480
480out: 481out:
481 spin_unlock_irqrestore(&ntp_lock, flags); 482 raw_spin_unlock_irqrestore(&ntp_lock, flags);
482 483
483 return leap; 484 return leap;
484} 485}
485 486
486#ifdef CONFIG_GENERIC_CMOS_UPDATE 487#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
487
488static void sync_cmos_clock(struct work_struct *work); 488static void sync_cmos_clock(struct work_struct *work);
489 489
490static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); 490static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -510,14 +510,26 @@ static void sync_cmos_clock(struct work_struct *work)
510 } 510 }
511 511
512 getnstimeofday(&now); 512 getnstimeofday(&now);
513 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 513 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
514 fail = update_persistent_clock(now); 514 struct timespec adjust = now;
515
516 fail = -ENODEV;
517 if (persistent_clock_is_local)
518 adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
519#ifdef CONFIG_GENERIC_CMOS_UPDATE
520 fail = update_persistent_clock(adjust);
521#endif
522#ifdef CONFIG_RTC_SYSTOHC
523 if (fail == -ENODEV)
524 fail = rtc_set_ntp_time(adjust);
525#endif
526 }
515 527
516 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); 528 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
517 if (next.tv_nsec <= 0) 529 if (next.tv_nsec <= 0)
518 next.tv_nsec += NSEC_PER_SEC; 530 next.tv_nsec += NSEC_PER_SEC;
519 531
520 if (!fail) 532 if (!fail || fail == -ENODEV)
521 next.tv_sec = 659; 533 next.tv_sec = 659;
522 else 534 else
523 next.tv_sec = 0; 535 next.tv_sec = 0;
@@ -660,7 +672,7 @@ int do_adjtimex(struct timex *txc)
660 672
661 getnstimeofday(&ts); 673 getnstimeofday(&ts);
662 674
663 spin_lock_irq(&ntp_lock); 675 raw_spin_lock_irq(&ntp_lock);
664 676
665 if (txc->modes & ADJ_ADJTIME) { 677 if (txc->modes & ADJ_ADJTIME) {
666 long save_adjust = time_adjust; 678 long save_adjust = time_adjust;
@@ -702,7 +714,7 @@ int do_adjtimex(struct timex *txc)
702 /* fill PPS status fields */ 714 /* fill PPS status fields */
703 pps_fill_timex(txc); 715 pps_fill_timex(txc);
704 716
705 spin_unlock_irq(&ntp_lock); 717 raw_spin_unlock_irq(&ntp_lock);
706 718
707 txc->time.tv_sec = ts.tv_sec; 719 txc->time.tv_sec = ts.tv_sec;
708 txc->time.tv_usec = ts.tv_nsec; 720 txc->time.tv_usec = ts.tv_nsec;
@@ -900,7 +912,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
900 912
901 pts_norm = pps_normalize_ts(*phase_ts); 913 pts_norm = pps_normalize_ts(*phase_ts);
902 914
903 spin_lock_irqsave(&ntp_lock, flags); 915 raw_spin_lock_irqsave(&ntp_lock, flags);
904 916
905 /* clear the error bits, they will be set again if needed */ 917 /* clear the error bits, they will be set again if needed */
906 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 918 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -913,7 +925,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
913 * just start the frequency interval */ 925 * just start the frequency interval */
914 if (unlikely(pps_fbase.tv_sec == 0)) { 926 if (unlikely(pps_fbase.tv_sec == 0)) {
915 pps_fbase = *raw_ts; 927 pps_fbase = *raw_ts;
916 spin_unlock_irqrestore(&ntp_lock, flags); 928 raw_spin_unlock_irqrestore(&ntp_lock, flags);
917 return; 929 return;
918 } 930 }
919 931
@@ -928,7 +940,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
928 time_status |= STA_PPSJITTER; 940 time_status |= STA_PPSJITTER;
929 /* restart the frequency calibration interval */ 941 /* restart the frequency calibration interval */
930 pps_fbase = *raw_ts; 942 pps_fbase = *raw_ts;
931 spin_unlock_irqrestore(&ntp_lock, flags); 943 raw_spin_unlock_irqrestore(&ntp_lock, flags);
932 pr_err("hardpps: PPSJITTER: bad pulse\n"); 944 pr_err("hardpps: PPSJITTER: bad pulse\n");
933 return; 945 return;
934 } 946 }
@@ -945,7 +957,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
945 957
946 hardpps_update_phase(pts_norm.nsec); 958 hardpps_update_phase(pts_norm.nsec);
947 959
948 spin_unlock_irqrestore(&ntp_lock, flags); 960 raw_spin_unlock_irqrestore(&ntp_lock, flags);
949} 961}
950EXPORT_SYMBOL(hardpps); 962EXPORT_SYMBOL(hardpps);
951 963
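
Two things change in ntp.c: ntp_lock becomes a raw spinlock (it is taken from hard interrupt and timekeeping context, so it must not turn into a sleeping lock on RT), and sync_cmos_clock() grows a fallback so that platforms without a CONFIG_GENERIC_CMOS_UPDATE hook can still persist time through the RTC framework. The fallback chain condenses to roughly the sketch below (timezone adjustment omitted; the helper name is illustrative):

#include <linux/time.h>
#include <linux/rtc.h>
#include <linux/errno.h>

/* Try the architecture CMOS hook first; if no such device exists,
 * hand the timestamp to the RTC core instead. */
static int example_persist_time(struct timespec adjust)
{
	int fail = -ENODEV;

#ifdef CONFIG_GENERIC_CMOS_UPDATE
	fail = update_persistent_clock(adjust);
#endif
#ifdef CONFIG_RTC_SYSTOHC
	if (fail == -ENODEV)
		fail = rtc_set_ntp_time(adjust);
#endif
	/* -ENODEV here means "nothing to sync", which the caller treats
	 * like success when scheduling the next ~11 minute sync slot. */
	return fail;
}
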
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f113755695e2..7f32fe0e52cd 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,6 +18,7 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h>
21 22
22#include "tick-internal.h" 23#include "tick-internal.h"
23 24
@@ -66,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
66 */ 67 */
67int tick_check_broadcast_device(struct clock_event_device *dev) 68int tick_check_broadcast_device(struct clock_event_device *dev)
68{ 69{
69 if ((tick_broadcast_device.evtdev && 70 if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
71 (tick_broadcast_device.evtdev &&
70 tick_broadcast_device.evtdev->rating >= dev->rating) || 72 tick_broadcast_device.evtdev->rating >= dev->rating) ||
71 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 73 (dev->features & CLOCK_EVT_FEAT_C3STOP))
72 return 0; 74 return 0;
@@ -86,6 +88,22 @@ int tick_is_broadcast_device(struct clock_event_device *dev)
86 return (dev && tick_broadcast_device.evtdev == dev); 88 return (dev && tick_broadcast_device.evtdev == dev);
87} 89}
88 90
91static void err_broadcast(const struct cpumask *mask)
92{
93 pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n");
94}
95
96static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
97{
98 if (!dev->broadcast)
99 dev->broadcast = tick_broadcast;
100 if (!dev->broadcast) {
101 pr_warn_once("%s depends on broadcast, but no broadcast function available\n",
102 dev->name);
103 dev->broadcast = err_broadcast;
104 }
105}
106
89/* 107/*
90 * Check, if the device is disfunctional and a place holder, which 108 * Check, if the device is disfunctional and a place holder, which
91 * needs to be handled by the broadcast device. 109 * needs to be handled by the broadcast device.
@@ -105,6 +123,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
105 */ 123 */
106 if (!tick_device_is_functional(dev)) { 124 if (!tick_device_is_functional(dev)) {
107 dev->event_handler = tick_handle_periodic; 125 dev->event_handler = tick_handle_periodic;
126 tick_device_setup_broadcast_func(dev);
108 cpumask_set_cpu(cpu, tick_get_broadcast_mask()); 127 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
109 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 128 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
110 ret = 1; 129 ret = 1;
@@ -116,15 +135,33 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
116 */ 135 */
117 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 136 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
118 int cpu = smp_processor_id(); 137 int cpu = smp_processor_id();
119
120 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 138 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
121 tick_broadcast_clear_oneshot(cpu); 139 tick_broadcast_clear_oneshot(cpu);
140 } else {
141 tick_device_setup_broadcast_func(dev);
122 } 142 }
123 } 143 }
124 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 144 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
125 return ret; 145 return ret;
126} 146}
127 147
148#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
149int tick_receive_broadcast(void)
150{
151 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
152 struct clock_event_device *evt = td->evtdev;
153
154 if (!evt)
155 return -ENODEV;
156
157 if (!evt->event_handler)
158 return -EINVAL;
159
160 evt->event_handler(evt);
161 return 0;
162}
163#endif
164
128/* 165/*
129 * Broadcast the event to the cpus, which are set in the mask (mangled). 166 * Broadcast the event to the cpus, which are set in the mask (mangled).
130 */ 167 */
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index da6c9ecad4e4..b1600a6973f4 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -63,13 +63,13 @@ int tick_is_oneshot_available(void)
63static void tick_periodic(int cpu) 63static void tick_periodic(int cpu)
64{ 64{
65 if (tick_do_timer_cpu == cpu) { 65 if (tick_do_timer_cpu == cpu) {
66 write_seqlock(&xtime_lock); 66 write_seqlock(&jiffies_lock);
67 67
68 /* Keep track of the next tick event */ 68 /* Keep track of the next tick event */
69 tick_next_period = ktime_add(tick_next_period, tick_period); 69 tick_next_period = ktime_add(tick_next_period, tick_period);
70 70
71 do_timer(1); 71 do_timer(1);
72 write_sequnlock(&xtime_lock); 72 write_sequnlock(&jiffies_lock);
73 } 73 }
74 74
75 update_process_times(user_mode(get_irq_regs())); 75 update_process_times(user_mode(get_irq_regs()));
@@ -130,9 +130,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
130 ktime_t next; 130 ktime_t next;
131 131
132 do { 132 do {
133 seq = read_seqbegin(&xtime_lock); 133 seq = read_seqbegin(&jiffies_lock);
134 next = tick_next_period; 134 next = tick_next_period;
135 } while (read_seqretry(&xtime_lock, seq)); 135 } while (read_seqretry(&jiffies_lock, seq));
136 136
137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 137 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
138 138
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 4e265b901fed..cf3e59ed6dc0 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -141,4 +141,3 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
141#endif 141#endif
142 142
143extern void do_timer(unsigned long ticks); 143extern void do_timer(unsigned long ticks);
144extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a40260885265..a19a39952c1b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/irq_work.h>
23 24
24#include <asm/irq_regs.h> 25#include <asm/irq_regs.h>
25 26
@@ -28,10 +29,10 @@
28/* 29/*
29 * Per cpu nohz control structure 30 * Per cpu nohz control structure
30 */ 31 */
31static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); 32DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
32 33
33/* 34/*
34 * The time, when the last jiffy update happened. Protected by xtime_lock. 35 * The time, when the last jiffy update happened. Protected by jiffies_lock.
35 */ 36 */
36static ktime_t last_jiffies_update; 37static ktime_t last_jiffies_update;
37 38
@@ -49,14 +50,14 @@ static void tick_do_update_jiffies64(ktime_t now)
49 ktime_t delta; 50 ktime_t delta;
50 51
51 /* 52 /*
52 * Do a quick check without holding xtime_lock: 53 * Do a quick check without holding jiffies_lock:
53 */ 54 */
54 delta = ktime_sub(now, last_jiffies_update); 55 delta = ktime_sub(now, last_jiffies_update);
55 if (delta.tv64 < tick_period.tv64) 56 if (delta.tv64 < tick_period.tv64)
56 return; 57 return;
57 58
58 /* Reevalute with xtime_lock held */ 59 /* Reevaluate with jiffies_lock held */
59 write_seqlock(&xtime_lock); 60 write_seqlock(&jiffies_lock);
60 61
61 delta = ktime_sub(now, last_jiffies_update); 62 delta = ktime_sub(now, last_jiffies_update);
62 if (delta.tv64 >= tick_period.tv64) { 63 if (delta.tv64 >= tick_period.tv64) {
@@ -79,7 +80,7 @@ static void tick_do_update_jiffies64(ktime_t now)
79 /* Keep the tick_next_period variable up to date */ 80 /* Keep the tick_next_period variable up to date */
80 tick_next_period = ktime_add(last_jiffies_update, tick_period); 81 tick_next_period = ktime_add(last_jiffies_update, tick_period);
81 } 82 }
82 write_sequnlock(&xtime_lock); 83 write_sequnlock(&jiffies_lock);
83} 84}
84 85
85/* 86/*
@@ -89,15 +90,58 @@ static ktime_t tick_init_jiffy_update(void)
89{ 90{
90 ktime_t period; 91 ktime_t period;
91 92
92 write_seqlock(&xtime_lock); 93 write_seqlock(&jiffies_lock);
93 /* Did we start the jiffies update yet ? */ 94 /* Did we start the jiffies update yet ? */
94 if (last_jiffies_update.tv64 == 0) 95 if (last_jiffies_update.tv64 == 0)
95 last_jiffies_update = tick_next_period; 96 last_jiffies_update = tick_next_period;
96 period = last_jiffies_update; 97 period = last_jiffies_update;
97 write_sequnlock(&xtime_lock); 98 write_sequnlock(&jiffies_lock);
98 return period; 99 return period;
99} 100}
100 101
102
103static void tick_sched_do_timer(ktime_t now)
104{
105 int cpu = smp_processor_id();
106
107#ifdef CONFIG_NO_HZ
108 /*
109 * Check if the do_timer duty was dropped. We don't care about
110 * concurrency: This happens only when the cpu in charge went
111 * into a long sleep. If two cpus happen to assign themselves to
112 * this duty, then the jiffies update is still serialized by
113 * jiffies_lock.
114 */
115 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
116 tick_do_timer_cpu = cpu;
117#endif
118
119 /* Check, if the jiffies need an update */
120 if (tick_do_timer_cpu == cpu)
121 tick_do_update_jiffies64(now);
122}
123
124static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
125{
126#ifdef CONFIG_NO_HZ
127 /*
128 * When we are idle and the tick is stopped, we have to touch
129 * the watchdog as we might not schedule for a really long
130 * time. This happens on completely idle SMP systems while
131 * waiting on the login prompt. We also increment the "start of
132 * idle" jiffy stamp so the idle accounting adjustment we do
133 * when we go busy again does not account for too many ticks.
134 */
135 if (ts->tick_stopped) {
136 touch_softlockup_watchdog();
137 if (is_idle_task(current))
138 ts->idle_jiffies++;
139 }
140#endif
141 update_process_times(user_mode(regs));
142 profile_tick(CPU_PROFILING);
143}
144
101/* 145/*
102 * NOHZ - aka dynamic tick functionality 146 * NOHZ - aka dynamic tick functionality
103 */ 147 */
@@ -282,14 +326,14 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
282 326
283 /* Read jiffies and the time when jiffies were updated last */ 327 /* Read jiffies and the time when jiffies were updated last */
284 do { 328 do {
285 seq = read_seqbegin(&xtime_lock); 329 seq = read_seqbegin(&jiffies_lock);
286 last_update = last_jiffies_update; 330 last_update = last_jiffies_update;
287 last_jiffies = jiffies; 331 last_jiffies = jiffies;
288 time_delta = timekeeping_max_deferment(); 332 time_delta = timekeeping_max_deferment();
289 } while (read_seqretry(&xtime_lock, seq)); 333 } while (read_seqretry(&jiffies_lock, seq));
290 334
291 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || 335 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
292 arch_needs_cpu(cpu)) { 336 arch_needs_cpu(cpu) || irq_work_needs_cpu()) {
293 next_jiffies = last_jiffies + 1; 337 next_jiffies = last_jiffies + 1;
294 delta_jiffies = 1; 338 delta_jiffies = 1;
295 } else { 339 } else {
@@ -510,6 +554,7 @@ void tick_nohz_idle_enter(void)
510 554
511 local_irq_enable(); 555 local_irq_enable();
512} 556}
557EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
513 558
514/** 559/**
515 * tick_nohz_irq_exit - update next tick event from interrupt exit 560 * tick_nohz_irq_exit - update next tick event from interrupt exit
@@ -526,6 +571,8 @@ void tick_nohz_irq_exit(void)
526 if (!ts->inidle) 571 if (!ts->inidle)
527 return; 572 return;
528 573
574 /* Cancel the timer because the CPU has already woken up from the C-states */
575 menu_hrtimer_cancel();
529 __tick_nohz_idle_enter(ts); 576 __tick_nohz_idle_enter(ts);
530} 577}
531 578
@@ -586,8 +633,11 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
586 633
587static void tick_nohz_account_idle_ticks(struct tick_sched *ts) 634static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
588{ 635{
589#ifndef CONFIG_VIRT_CPU_ACCOUNTING 636#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
590 unsigned long ticks; 637 unsigned long ticks;
638
639 if (vtime_accounting_enabled())
640 return;
591 /* 641 /*
592 * We stopped the tick in idle. Update process times would miss the 642 * We stopped the tick in idle. Update process times would miss the
593 * time we slept as update_process_times does only a 1 tick 643 * time we slept as update_process_times does only a 1 tick
@@ -621,6 +671,8 @@ void tick_nohz_idle_exit(void)
621 671
622 ts->inidle = 0; 672 ts->inidle = 0;
623 673
674 /* Cancel the timer because the CPU has already woken up from the C-states */
675 menu_hrtimer_cancel();
624 if (ts->idle_active || ts->tick_stopped) 676 if (ts->idle_active || ts->tick_stopped)
625 now = ktime_get(); 677 now = ktime_get();
626 678
@@ -634,6 +686,7 @@ void tick_nohz_idle_exit(void)
634 686
635 local_irq_enable(); 687 local_irq_enable();
636} 688}
689EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
637 690
638static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) 691static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
639{ 692{
@@ -648,40 +701,12 @@ static void tick_nohz_handler(struct clock_event_device *dev)
648{ 701{
649 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 702 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
650 struct pt_regs *regs = get_irq_regs(); 703 struct pt_regs *regs = get_irq_regs();
651 int cpu = smp_processor_id();
652 ktime_t now = ktime_get(); 704 ktime_t now = ktime_get();
653 705
654 dev->next_event.tv64 = KTIME_MAX; 706 dev->next_event.tv64 = KTIME_MAX;
655 707
656 /* 708 tick_sched_do_timer(now);
657 * Check if the do_timer duty was dropped. We don't care about 709 tick_sched_handle(ts, regs);
658 * concurrency: This happens only when the cpu in charge went
659 * into a long sleep. If two cpus happen to assign themself to
660 * this duty, then the jiffies update is still serialized by
661 * xtime_lock.
662 */
663 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
664 tick_do_timer_cpu = cpu;
665
666 /* Check, if the jiffies need an update */
667 if (tick_do_timer_cpu == cpu)
668 tick_do_update_jiffies64(now);
669
670 /*
671 * When we are idle and the tick is stopped, we have to touch
672 * the watchdog as we might not schedule for a really long
673 * time. This happens on complete idle SMP systems while
674 * waiting on the login prompt. We also increment the "start
675 * of idle" jiffy stamp so the idle accounting adjustment we
676 * do when we go busy again does not account too much ticks.
677 */
678 if (ts->tick_stopped) {
679 touch_softlockup_watchdog();
680 ts->idle_jiffies++;
681 }
682
683 update_process_times(user_mode(regs));
684 profile_tick(CPU_PROFILING);
685 710
686 while (tick_nohz_reprogram(ts, now)) { 711 while (tick_nohz_reprogram(ts, now)) {
687 now = ktime_get(); 712 now = ktime_get();
@@ -794,7 +819,7 @@ void tick_check_idle(int cpu)
794#ifdef CONFIG_HIGH_RES_TIMERS 819#ifdef CONFIG_HIGH_RES_TIMERS
795/* 820/*
796 * We rearm the timer until we get disabled by the idle code. 821 * We rearm the timer until we get disabled by the idle code.
797 * Called with interrupts disabled and timer->base->cpu_base->lock held. 822 * Called with interrupts disabled.
798 */ 823 */
799static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 824static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
800{ 825{
@@ -802,45 +827,15 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
802 container_of(timer, struct tick_sched, sched_timer); 827 container_of(timer, struct tick_sched, sched_timer);
803 struct pt_regs *regs = get_irq_regs(); 828 struct pt_regs *regs = get_irq_regs();
804 ktime_t now = ktime_get(); 829 ktime_t now = ktime_get();
805 int cpu = smp_processor_id();
806
807#ifdef CONFIG_NO_HZ
808 /*
809 * Check if the do_timer duty was dropped. We don't care about
810 * concurrency: This happens only when the cpu in charge went
811 * into a long sleep. If two cpus happen to assign themself to
812 * this duty, then the jiffies update is still serialized by
813 * xtime_lock.
814 */
815 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
816 tick_do_timer_cpu = cpu;
817#endif
818 830
819 /* Check, if the jiffies need an update */ 831 tick_sched_do_timer(now);
820 if (tick_do_timer_cpu == cpu)
821 tick_do_update_jiffies64(now);
822 832
823 /* 833 /*
824 * Do not call, when we are not in irq context and have 834 * Do not call, when we are not in irq context and have
825 * no valid regs pointer 835 * no valid regs pointer
826 */ 836 */
827 if (regs) { 837 if (regs)
828 /* 838 tick_sched_handle(ts, regs);
829 * When we are idle and the tick is stopped, we have to touch
830 * the watchdog as we might not schedule for a really long
831 * time. This happens on complete idle SMP systems while
832 * waiting on the login prompt. We also increment the "start of
833 * idle" jiffy stamp so the idle accounting adjustment we do
834 * when we go busy again does not account too much ticks.
835 */
836 if (ts->tick_stopped) {
837 touch_softlockup_watchdog();
838 if (is_idle_task(current))
839 ts->idle_jiffies++;
840 }
841 update_process_times(user_mode(regs));
842 profile_tick(CPU_PROFILING);
843 }
844 839
845 hrtimer_forward(timer, now, tick_period); 840 hrtimer_forward(timer, now, tick_period);
846 841
@@ -874,7 +869,7 @@ void tick_setup_sched_timer(void)
874 /* Get the next period (per cpu) */ 869 /* Get the next period (per cpu) */
875 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 870 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
876 871
877 /* Offset the tick to avert xtime_lock contention. */ 872 /* Offset the tick to avert jiffies_lock contention. */
878 if (sched_skew_tick) { 873 if (sched_skew_tick) {
879 u64 offset = ktime_to_ns(tick_period) >> 1; 874 u64 offset = ktime_to_ns(tick_period) >> 1;
880 do_div(offset, num_possible_cpus()); 875 do_div(offset, num_possible_cpus());
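
The tick-sched.c hunks are mostly deduplication: the do_timer takeover and the idle-watchdog bookkeeping that used to be copied into both tick_nohz_handler() (low resolution) and tick_sched_timer() (high resolution) now live in tick_sched_do_timer() and tick_sched_handle(). Condensed, both handlers share the shape below; this is only a sketch referring to the static helpers added above, with reprogramming and the hrtimer return value left out:

/* Common skeleton of the two tick handlers after the refactor. */
static void example_tick_handler_body(struct tick_sched *ts)
{
	struct pt_regs *regs = get_irq_regs();
	ktime_t now = ktime_get();

	tick_sched_do_timer(now);	/* claim do_timer duty, update jiffies */

	/* the high-resolution path only handles the tick with valid regs */
	if (regs)
		tick_sched_handle(ts, regs);
}
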
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
deleted file mode 100644
index a9ae369925ce..000000000000
--- a/kernel/time/timecompare.c
+++ /dev/null
@@ -1,193 +0,0 @@
1/*
2 * Copyright (C) 2009 Intel Corporation.
3 * Author: Patrick Ohly <patrick.ohly@intel.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20#include <linux/timecompare.h>
21#include <linux/module.h>
22#include <linux/slab.h>
23#include <linux/math64.h>
24#include <linux/kernel.h>
25
26/*
27 * fixed point arithmetic scale factor for skew
28 *
29 * Usually one would measure skew in ppb (parts per billion, 1e9), but
30 * using a factor of 2 simplifies the math.
31 */
32#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30)
33
34ktime_t timecompare_transform(struct timecompare *sync,
35 u64 source_tstamp)
36{
37 u64 nsec;
38
39 nsec = source_tstamp + sync->offset;
40 nsec += (s64)(source_tstamp - sync->last_update) * sync->skew /
41 TIMECOMPARE_SKEW_RESOLUTION;
42
43 return ns_to_ktime(nsec);
44}
45EXPORT_SYMBOL_GPL(timecompare_transform);
46
47int timecompare_offset(struct timecompare *sync,
48 s64 *offset,
49 u64 *source_tstamp)
50{
51 u64 start_source = 0, end_source = 0;
52 struct {
53 s64 offset;
54 s64 duration_target;
55 } buffer[10], sample, *samples;
56 int counter = 0, i;
57 int used;
58 int index;
59 int num_samples = sync->num_samples;
60
61 if (num_samples > ARRAY_SIZE(buffer)) {
62 samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
63 if (!samples) {
64 samples = buffer;
65 num_samples = ARRAY_SIZE(buffer);
66 }
67 } else {
68 samples = buffer;
69 }
70
71 /* run until we have enough valid samples, but do not try forever */
72 i = 0;
73 counter = 0;
74 while (1) {
75 u64 ts;
76 ktime_t start, end;
77
78 start = sync->target();
79 ts = timecounter_read(sync->source);
80 end = sync->target();
81
82 if (!i)
83 start_source = ts;
84
85 /* ignore negative durations */
86 sample.duration_target = ktime_to_ns(ktime_sub(end, start));
87 if (sample.duration_target >= 0) {
88 /*
89 * assume symetric delay to and from source:
90 * average target time corresponds to measured
91 * source time
92 */
93 sample.offset =
94 (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
95 ts;
96
97 /* simple insertion sort based on duration */
98 index = counter - 1;
99 while (index >= 0) {
100 if (samples[index].duration_target <
101 sample.duration_target)
102 break;
103 samples[index + 1] = samples[index];
104 index--;
105 }
106 samples[index + 1] = sample;
107 counter++;
108 }
109
110 i++;
111 if (counter >= num_samples || i >= 100000) {
112 end_source = ts;
113 break;
114 }
115 }
116
117 *source_tstamp = (end_source + start_source) / 2;
118
119 /* remove outliers by only using 75% of the samples */
120 used = counter * 3 / 4;
121 if (!used)
122 used = counter;
123 if (used) {
124 /* calculate average */
125 s64 off = 0;
126 for (index = 0; index < used; index++)
127 off += samples[index].offset;
128 *offset = div_s64(off, used);
129 }
130
131 if (samples && samples != buffer)
132 kfree(samples);
133
134 return used;
135}
136EXPORT_SYMBOL_GPL(timecompare_offset);
137
138void __timecompare_update(struct timecompare *sync,
139 u64 source_tstamp)
140{
141 s64 offset;
142 u64 average_time;
143
144 if (!timecompare_offset(sync, &offset, &average_time))
145 return;
146
147 if (!sync->last_update) {
148 sync->last_update = average_time;
149 sync->offset = offset;
150 sync->skew = 0;
151 } else {
152 s64 delta_nsec = average_time - sync->last_update;
153
154 /* avoid division by negative or small deltas */
155 if (delta_nsec >= 10000) {
156 s64 delta_offset_nsec = offset - sync->offset;
157 s64 skew; /* delta_offset_nsec *
158 TIMECOMPARE_SKEW_RESOLUTION /
159 delta_nsec */
160 u64 divisor;
161
162 /* div_s64() is limited to 32 bit divisor */
163 skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION;
164 divisor = delta_nsec;
165 while (unlikely(divisor >= ((s64)1) << 32)) {
166 /* divide both by 2; beware, right shift
167 of negative value has undefined
168 behavior and can only be used for
169 the positive divisor */
170 skew = div_s64(skew, 2);
171 divisor >>= 1;
172 }
173 skew = div_s64(skew, divisor);
174
175 /*
176 * Calculate new overall skew as 4/16 the
177 * old value and 12/16 the new one. This is
178 * a rather arbitrary tradeoff between
179 * only using the latest measurement (0/16 and
180 * 16/16) and even more weight on past measurements.
181 */
182#define TIMECOMPARE_NEW_SKEW_PER_16 12
183 sync->skew =
184 div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) *
185 sync->skew +
186 TIMECOMPARE_NEW_SKEW_PER_16 * skew,
187 16);
188 sync->last_update = average_time;
189 sync->offset = offset;
190 }
191 }
192}
193EXPORT_SYMBOL_GPL(__timecompare_update);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e424970bb562..9a0bc98fbe1d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -21,19 +21,17 @@
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
24#include <linux/pvclock_gtod.h>
24 25
25 26
26static struct timekeeper timekeeper; 27static struct timekeeper timekeeper;
27 28
28/*
29 * This read-write spinlock protects us from races in SMP while
30 * playing with xtime.
31 */
32__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
33
34/* flag for if timekeeping is suspended */ 29/* flag for if timekeeping is suspended */
35int __read_mostly timekeeping_suspended; 30int __read_mostly timekeeping_suspended;
36 31
32/* Flag for if there is a persistent clock on this platform */
33bool __read_mostly persistent_clock_exist = false;
34
37static inline void tk_normalize_xtime(struct timekeeper *tk) 35static inline void tk_normalize_xtime(struct timekeeper *tk)
38{ 36{
39 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { 37 while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
@@ -140,6 +138,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
140} 138}
141 139
142/* Timekeeper helper functions. */ 140/* Timekeeper helper functions. */
141
142#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
143u32 (*arch_gettimeoffset)(void);
144
145u32 get_arch_timeoffset(void)
146{
147 if (likely(arch_gettimeoffset))
148 return arch_gettimeoffset();
149 return 0;
150}
151#else
152static inline u32 get_arch_timeoffset(void) { return 0; }
153#endif
154
143static inline s64 timekeeping_get_ns(struct timekeeper *tk) 155static inline s64 timekeeping_get_ns(struct timekeeper *tk)
144{ 156{
145 cycle_t cycle_now, cycle_delta; 157 cycle_t cycle_now, cycle_delta;
@@ -156,8 +168,8 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk)
156 nsec = cycle_delta * tk->mult + tk->xtime_nsec; 168 nsec = cycle_delta * tk->mult + tk->xtime_nsec;
157 nsec >>= tk->shift; 169 nsec >>= tk->shift;
158 170
159 /* If arch requires, add in gettimeoffset() */ 171 /* If arch requires, add in get_arch_timeoffset() */
160 return nsec + arch_gettimeoffset(); 172 return nsec + get_arch_timeoffset();
161} 173}
162 174
163static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) 175static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
@@ -176,9 +188,57 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
176 /* convert delta to nanoseconds. */ 188 /* convert delta to nanoseconds. */
177 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 189 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
178 190
179 /* If arch requires, add in gettimeoffset() */ 191 /* If arch requires, add in get_arch_timeoffset() */
180 return nsec + arch_gettimeoffset(); 192 return nsec + get_arch_timeoffset();
193}
194
195static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
196
197static void update_pvclock_gtod(struct timekeeper *tk)
198{
199 raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk);
200}
201
202/**
203 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
204 *
205 * Must hold write on timekeeper.lock
206 */
207int pvclock_gtod_register_notifier(struct notifier_block *nb)
208{
209 struct timekeeper *tk = &timekeeper;
210 unsigned long flags;
211 int ret;
212
213 write_seqlock_irqsave(&tk->lock, flags);
214 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
215 /* update timekeeping data */
216 update_pvclock_gtod(tk);
217 write_sequnlock_irqrestore(&tk->lock, flags);
218
219 return ret;
181} 220}
221EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
222
223/**
224 * pvclock_gtod_unregister_notifier - unregister a pvclock
225 * timedata update listener
226 *
227 * Must hold write on timekeeper.lock
228 */
229int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
230{
231 struct timekeeper *tk = &timekeeper;
232 unsigned long flags;
233 int ret;
234
235 write_seqlock_irqsave(&tk->lock, flags);
236 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
237 write_sequnlock_irqrestore(&tk->lock, flags);
238
239 return ret;
240}
241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
182 242
183/* must hold write on timekeeper.lock */ 243/* must hold write on timekeeper.lock */
184static void timekeeping_update(struct timekeeper *tk, bool clearntp) 244static void timekeeping_update(struct timekeeper *tk, bool clearntp)
@@ -188,6 +248,7 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
188 ntp_clear(); 248 ntp_clear();
189 } 249 }
190 update_vsyscall(tk); 250 update_vsyscall(tk);
251 update_pvclock_gtod(tk);
191} 252}
192 253
193/** 254/**
@@ -210,8 +271,8 @@ static void timekeeping_forward_now(struct timekeeper *tk)
210 271
211 tk->xtime_nsec += cycle_delta * tk->mult; 272 tk->xtime_nsec += cycle_delta * tk->mult;
212 273
213 /* If arch requires, add in gettimeoffset() */ 274 /* If arch requires, add in get_arch_timeoffset() */
214 tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; 275 tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift;
215 276
216 tk_normalize_xtime(tk); 277 tk_normalize_xtime(tk);
217 278
@@ -220,19 +281,18 @@ static void timekeeping_forward_now(struct timekeeper *tk)
220} 281}
221 282
222/** 283/**
223 * getnstimeofday - Returns the time of day in a timespec 284 * __getnstimeofday - Returns the time of day in a timespec.
224 * @ts: pointer to the timespec to be set 285 * @ts: pointer to the timespec to be set
225 * 286 *
226 * Returns the time of day in a timespec. 287 * Updates the time of day in the timespec.
288 * Returns 0 on success, or -ve when suspended (timespec will be undefined).
227 */ 289 */
228void getnstimeofday(struct timespec *ts) 290int __getnstimeofday(struct timespec *ts)
229{ 291{
230 struct timekeeper *tk = &timekeeper; 292 struct timekeeper *tk = &timekeeper;
231 unsigned long seq; 293 unsigned long seq;
232 s64 nsecs = 0; 294 s64 nsecs = 0;
233 295
234 WARN_ON(timekeeping_suspended);
235
236 do { 296 do {
237 seq = read_seqbegin(&tk->lock); 297 seq = read_seqbegin(&tk->lock);
238 298
@@ -243,6 +303,26 @@ void getnstimeofday(struct timespec *ts)
243 303
244 ts->tv_nsec = 0; 304 ts->tv_nsec = 0;
245 timespec_add_ns(ts, nsecs); 305 timespec_add_ns(ts, nsecs);
306
307 /*
308 * Do not bail out early, in case there were callers still using
309 * the value, even in the face of the WARN_ON.
310 */
311 if (unlikely(timekeeping_suspended))
312 return -EAGAIN;
313 return 0;
314}
315EXPORT_SYMBOL(__getnstimeofday);
316
317/**
318 * getnstimeofday - Returns the time of day in a timespec.
319 * @ts: pointer to the timespec to be set
320 *
321 * Returns the time of day in a timespec (WARN if suspended).
322 */
323void getnstimeofday(struct timespec *ts)
324{
325 WARN_ON(__getnstimeofday(ts));
246} 326}
247EXPORT_SYMBOL(getnstimeofday); 327EXPORT_SYMBOL(getnstimeofday);
248 328
@@ -596,12 +676,14 @@ void __init timekeeping_init(void)
596 struct timespec now, boot, tmp; 676 struct timespec now, boot, tmp;
597 677
598 read_persistent_clock(&now); 678 read_persistent_clock(&now);
679
599 if (!timespec_valid_strict(&now)) { 680 if (!timespec_valid_strict(&now)) {
600 pr_warn("WARNING: Persistent clock returned invalid value!\n" 681 pr_warn("WARNING: Persistent clock returned invalid value!\n"
601 " Check your CMOS/BIOS settings.\n"); 682 " Check your CMOS/BIOS settings.\n");
602 now.tv_sec = 0; 683 now.tv_sec = 0;
603 now.tv_nsec = 0; 684 now.tv_nsec = 0;
604 } 685 } else if (now.tv_sec || now.tv_nsec)
686 persistent_clock_exist = true;
605 687
606 read_boot_clock(&boot); 688 read_boot_clock(&boot);
607 if (!timespec_valid_strict(&boot)) { 689 if (!timespec_valid_strict(&boot)) {
@@ -674,11 +756,12 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
674{ 756{
675 struct timekeeper *tk = &timekeeper; 757 struct timekeeper *tk = &timekeeper;
676 unsigned long flags; 758 unsigned long flags;
677 struct timespec ts;
678 759
679 /* Make sure we don't set the clock twice */ 760 /*
680 read_persistent_clock(&ts); 761 * Make sure we don't set the clock twice, as timekeeping_resume()
681 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 762 * already did it
763 */
764 if (has_persistent_clock())
682 return; 765 return;
683 766
684 write_seqlock_irqsave(&tk->lock, flags); 767 write_seqlock_irqsave(&tk->lock, flags);
@@ -1299,9 +1382,7 @@ struct timespec get_monotonic_coarse(void)
1299} 1382}
1300 1383
1301/* 1384/*
1302 * The 64-bit jiffies value is not atomic - you MUST NOT read it 1385 * Must hold jiffies_lock
1303 * without sampling the sequence number in xtime_lock.
1304 * jiffies is defined in the linker script...
1305 */ 1386 */
1306void do_timer(unsigned long ticks) 1387void do_timer(unsigned long ticks)
1307{ 1388{
@@ -1389,7 +1470,7 @@ EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1389 */ 1470 */
1390void xtime_update(unsigned long ticks) 1471void xtime_update(unsigned long ticks)
1391{ 1472{
1392 write_seqlock(&xtime_lock); 1473 write_seqlock(&jiffies_lock);
1393 do_timer(ticks); 1474 do_timer(ticks);
1394 write_sequnlock(&xtime_lock); 1475 write_sequnlock(&jiffies_lock);
1395} 1476}
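
__getnstimeofday() gives callers that may run while timekeeping is suspended a way to detect that condition instead of tripping the WARN_ON in getnstimeofday(). A caller that prefers a soft failure would use it roughly as follows (the surrounding function is illustrative):

#include <linux/time.h>
#include <linux/errno.h>

/* Fill @ts with the current time of day; report -EAGAIN instead of
 * warning if timekeeping is currently suspended. */
static int example_get_wall_time(struct timespec *ts)
{
	if (__getnstimeofday(ts) < 0)
		return -EAGAIN;		/* suspended: *ts is undefined */
	return 0;
}
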
diff --git a/kernel/timeconst.bc b/kernel/timeconst.bc
new file mode 100644
index 000000000000..511bdf2cafda
--- /dev/null
+++ b/kernel/timeconst.bc
@@ -0,0 +1,108 @@
1scale=0
2
3define gcd(a,b) {
4 auto t;
5 while (b) {
6 t = b;
7 b = a % b;
8 a = t;
9 }
10 return a;
11}
12
13/* Division by reciprocal multiplication. */
14define fmul(b,n,d) {
15 return (2^b*n+d-1)/d;
16}
17
18/* Adjustment factor when a ceiling value is used. Use as:
19 (imul * n) + (fmulxx * n + fadjxx) >> xx) */
20define fadj(b,n,d) {
21 auto v;
22 d = d/gcd(n,d);
23 v = 2^b*(d-1)/d;
24 return v;
25}
26
27/* Compute the appropriate mul/adj values as well as a shift count,
28 which brings the mul value into the range 2^b-1 <= x < 2^b. Such
29 a shift value will be correct in the signed integer range and off
30 by at most one in the upper half of the unsigned range. */
31define fmuls(b,n,d) {
32 auto s, m;
33 for (s = 0; 1; s++) {
34 m = fmul(s,n,d);
35 if (m >= 2^(b-1))
36 return s;
37 }
38 return 0;
39}
40
41define timeconst(hz) {
42 print "/* Automatically generated by kernel/timeconst.bc */\n"
43 print "/* Time conversion constants for HZ == ", hz, " */\n"
44 print "\n"
45
46 print "#ifndef KERNEL_TIMECONST_H\n"
47 print "#define KERNEL_TIMECONST_H\n\n"
48
49 print "#include <linux/param.h>\n"
50 print "#include <linux/types.h>\n\n"
51
52 print "#if HZ != ", hz, "\n"
53 print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
54 print "#endif\n\n"
55
56 if (hz < 2) {
57 print "#error Totally bogus HZ value!\n"
58 } else {
59 s=fmuls(32,1000,hz)
60 obase=16
61 print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n"
62 print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n"
63 obase=10
64 print "#define HZ_TO_MSEC_SHR32\t", s, "\n"
65
66 s=fmuls(32,hz,1000)
67 obase=16
68 print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n"
69 print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n"
70 obase=10
71 print "#define MSEC_TO_HZ_SHR32\t", s, "\n"
72
73 obase=10
74 cd=gcd(hz,1000)
75 print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n"
76 print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n"
77 print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
78 print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n"
79 print "\n"
80
81 s=fmuls(32,1000000,hz)
82 obase=16
83 print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n"
84 print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n"
85 obase=10
86 print "#define HZ_TO_USEC_SHR32\t", s, "\n"
87
88 s=fmuls(32,hz,1000000)
89 obase=16
90 print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n"
91 print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n"
92 obase=10
93 print "#define USEC_TO_HZ_SHR32\t", s, "\n"
94
95 obase=10
96 cd=gcd(hz,1000000)
97 print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n"
98 print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
99 print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
100 print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
101 print "\n"
102
103 print "#endif /* KERNEL_TIMECONST_H */\n"
104 }
105 halt
106}
107
108timeconst(hz)
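
The new bc script (and the Perl script it replaces below) emits MUL/ADJ/SHR constants so that jiffies/msec/usec conversions become a multiply and a shift instead of a division. As a rough, self-contained illustration of that arithmetic — userspace C, not part of the patch; the helper names simply mirror the bc functions, and intermediate values stay within 64 bits for the HZ value exercised here — the MSEC_TO_HZ constants generated for HZ == 250 reproduce DIV_ROUND_UP(m * HZ, 1000):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t gcd(uint64_t a, uint64_t b)
{
        while (b) {
                uint64_t t = b;
                b = a % b;
                a = t;
        }
        return a;
}

/* ceil(2^s * n / d), cf. fmul() in timeconst.bc */
static uint64_t fmul(unsigned int s, uint64_t n, uint64_t d)
{
        return ((n << s) + d - 1) / d;
}

/* rounding adjustment for the ceiling case, cf. fadj() in timeconst.bc */
static uint64_t fadj(unsigned int s, uint64_t n, uint64_t d)
{
        d /= gcd(n, d);
        return ((d - 1) << s) / d;
}

/* smallest shift that puts the multiplier at or above 2^(bits-1), cf. fmuls() */
static unsigned int fmuls(unsigned int bits, uint64_t n, uint64_t d)
{
        unsigned int s;

        for (s = 0; ; s++)
                if (fmul(s, n, d) >= (1ULL << (bits - 1)))
                        return s;
}

int main(void)
{
        const uint64_t hz = 250;
        unsigned int shr = fmuls(32, hz, 1000);  /* MSEC_TO_HZ_SHR32 */
        uint64_t mul = fmul(shr, hz, 1000);      /* MSEC_TO_HZ_MUL32 */
        uint64_t adj = fadj(shr, hz, 1000);      /* MSEC_TO_HZ_ADJ32 */
        uint64_t m;

        printf("MSEC_TO_HZ: mul=%#llx adj=%#llx shr=%u\n",
               (unsigned long long)mul, (unsigned long long)adj, shr);

        /* (mul * m + adj) >> shr gives the same result as DIV_ROUND_UP(m * hz, 1000) */
        for (m = 0; m < 1000000; m++)
                assert(((mul * m + adj) >> shr) == (m * hz + 999) / 1000);

        return 0;
}

For HZ == 250 this prints mul=0x80000000, adj=0x180000000, shr=33, matching the canned values in the Perl script being removed below.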
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
deleted file mode 100644
index eb51d76e058a..000000000000
--- a/kernel/timeconst.pl
+++ /dev/null
@@ -1,378 +0,0 @@
1#!/usr/bin/perl
2# -----------------------------------------------------------------------
3#
4# Copyright 2007-2008 rPath, Inc. - All Rights Reserved
5#
6# This file is part of the Linux kernel, and is made available under
7# the terms of the GNU General Public License version 2 or (at your
8# option) any later version; incorporated herein by reference.
9#
10# -----------------------------------------------------------------------
11#
12
13#
14# Usage: timeconst.pl HZ > timeconst.h
15#
16
17# Precomputed values for systems without Math::BigInt
18# Generated by:
19# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200
20%canned_values = (
21 24 => [
22 '0xa6aaaaab','0x2aaaaaa',26,
23 125,3,
24 '0xc49ba5e4','0x1fbe76c8b4',37,
25 3,125,
26 '0xa2c2aaab','0xaaaa',16,
27 125000,3,
28 '0xc9539b89','0x7fffbce4217d',47,
29 3,125000,
30 ], 32 => [
31 '0xfa000000','0x6000000',27,
32 125,4,
33 '0x83126e98','0xfdf3b645a',36,
34 4,125,
35 '0xf4240000','0x0',17,
36 31250,1,
37 '0x8637bd06','0x3fff79c842fa',46,
38 1,31250,
39 ], 48 => [
40 '0xa6aaaaab','0x6aaaaaa',27,
41 125,6,
42 '0xc49ba5e4','0xfdf3b645a',36,
43 6,125,
44 '0xa2c2aaab','0x15555',17,
45 62500,3,
46 '0xc9539b89','0x3fffbce4217d',46,
47 3,62500,
48 ], 64 => [
49 '0xfa000000','0xe000000',28,
50 125,8,
51 '0x83126e98','0x7ef9db22d',35,
52 8,125,
53 '0xf4240000','0x0',18,
54 15625,1,
55 '0x8637bd06','0x1fff79c842fa',45,
56 1,15625,
57 ], 100 => [
58 '0xa0000000','0x0',28,
59 10,1,
60 '0xcccccccd','0x733333333',35,
61 1,10,
62 '0x9c400000','0x0',18,
63 10000,1,
64 '0xd1b71759','0x1fff2e48e8a7',45,
65 1,10000,
66 ], 122 => [
67 '0x8325c53f','0xfbcda3a',28,
68 500,61,
69 '0xf9db22d1','0x7fbe76c8b',35,
70 61,500,
71 '0x8012e2a0','0x3ef36',18,
72 500000,61,
73 '0xffda4053','0x1ffffbce4217',45,
74 61,500000,
75 ], 128 => [
76 '0xfa000000','0x1e000000',29,
77 125,16,
78 '0x83126e98','0x3f7ced916',34,
79 16,125,
80 '0xf4240000','0x40000',19,
81 15625,2,
82 '0x8637bd06','0xfffbce4217d',44,
83 2,15625,
84 ], 200 => [
85 '0xa0000000','0x0',29,
86 5,1,
87 '0xcccccccd','0x333333333',34,
88 1,5,
89 '0x9c400000','0x0',19,
90 5000,1,
91 '0xd1b71759','0xfff2e48e8a7',44,
92 1,5000,
93 ], 250 => [
94 '0x80000000','0x0',29,
95 4,1,
96 '0x80000000','0x180000000',33,
97 1,4,
98 '0xfa000000','0x0',20,
99 4000,1,
100 '0x83126e98','0x7ff7ced9168',43,
101 1,4000,
102 ], 256 => [
103 '0xfa000000','0x3e000000',30,
104 125,32,
105 '0x83126e98','0x1fbe76c8b',33,
106 32,125,
107 '0xf4240000','0xc0000',20,
108 15625,4,
109 '0x8637bd06','0x7ffde7210be',43,
110 4,15625,
111 ], 300 => [
112 '0xd5555556','0x2aaaaaaa',30,
113 10,3,
114 '0x9999999a','0x1cccccccc',33,
115 3,10,
116 '0xd0555556','0xaaaaa',20,
117 10000,3,
118 '0x9d495183','0x7ffcb923a29',43,
119 3,10000,
120 ], 512 => [
121 '0xfa000000','0x7e000000',31,
122 125,64,
123 '0x83126e98','0xfdf3b645',32,
124 64,125,
125 '0xf4240000','0x1c0000',21,
126 15625,8,
127 '0x8637bd06','0x3ffef39085f',42,
128 8,15625,
129 ], 1000 => [
130 '0x80000000','0x0',31,
131 1,1,
132 '0x80000000','0x0',31,
133 1,1,
134 '0xfa000000','0x0',22,
135 1000,1,
136 '0x83126e98','0x1ff7ced9168',41,
137 1,1000,
138 ], 1024 => [
139 '0xfa000000','0xfe000000',32,
140 125,128,
141 '0x83126e98','0x7ef9db22',31,
142 128,125,
143 '0xf4240000','0x3c0000',22,
144 15625,16,
145 '0x8637bd06','0x1fff79c842f',41,
146 16,15625,
147 ], 1200 => [
148 '0xd5555556','0xd5555555',32,
149 5,6,
150 '0x9999999a','0x66666666',31,
151 6,5,
152 '0xd0555556','0x2aaaaa',22,
153 2500,3,
154 '0x9d495183','0x1ffcb923a29',41,
155 3,2500,
156 ]
157);
158
159$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;';
160
161sub bint($)
162{
163 my($x) = @_;
164 return Math::BigInt->new($x);
165}
166
167#
168# Constants for division by reciprocal multiplication.
169# (bits, numerator, denominator)
170#
171sub fmul($$$)
172{
173 my ($b,$n,$d) = @_;
174
175 $n = bint($n);
176 $d = bint($d);
177
178 return scalar (($n << $b)+$d-bint(1))/$d;
179}
180
181sub fadj($$$)
182{
183 my($b,$n,$d) = @_;
184
185 $n = bint($n);
186 $d = bint($d);
187
188 $d = $d/bgcd($n, $d);
189 return scalar (($d-bint(1)) << $b)/$d;
190}
191
192sub fmuls($$$) {
193 my($b,$n,$d) = @_;
194 my($s,$m);
195 my($thres) = bint(1) << ($b-1);
196
197 $n = bint($n);
198 $d = bint($d);
199
200 for ($s = 0; 1; $s++) {
201 $m = fmul($s,$n,$d);
202 return $s if ($m >= $thres);
203 }
204 return 0;
205}
206
207# Generate a hex value if the result fits in 64 bits;
208# otherwise skip.
209sub bignum_hex($) {
210 my($x) = @_;
211 my $s = $x->as_hex();
212
213 return (length($s) > 18) ? undef : $s;
214}
215
216# Provides mul, adj, and shr factors for a specific
217# (bit, time, hz) combination
218sub muladj($$$) {
219 my($b, $t, $hz) = @_;
220 my $s = fmuls($b, $t, $hz);
221 my $m = fmul($s, $t, $hz);
222 my $a = fadj($s, $t, $hz);
223 return (bignum_hex($m), bignum_hex($a), $s);
224}
225
226# Provides numerator, denominator values
227sub numden($$) {
228 my($n, $d) = @_;
229 my $g = bgcd($n, $d);
230 return ($n/$g, $d/$g);
231}
232
233# All values for a specific (time, hz) combo
234sub conversions($$) {
235 my ($t, $hz) = @_;
236 my @val = ();
237
238 # HZ_TO_xx
239 push(@val, muladj(32, $t, $hz));
240 push(@val, numden($t, $hz));
241
242 # xx_TO_HZ
243 push(@val, muladj(32, $hz, $t));
244 push(@val, numden($hz, $t));
245
246 return @val;
247}
248
249sub compute_values($) {
250 my($hz) = @_;
251 my @val = ();
252 my $s, $m, $a, $g;
253
254 if (!$has_bigint) {
255 die "$0: HZ == $hz not canned and ".
256 "Math::BigInt not available\n";
257 }
258
259 # MSEC conversions
260 push(@val, conversions(1000, $hz));
261
262 # USEC conversions
263 push(@val, conversions(1000000, $hz));
264
265 return @val;
266}
267
268sub outputval($$)
269{
270 my($name, $val) = @_;
271 my $csuf;
272
273 if (defined($val)) {
274 if ($name !~ /SHR/) {
275 $val = "U64_C($val)";
276 }
277 printf "#define %-23s %s\n", $name.$csuf, $val.$csuf;
278 }
279}
280
281sub output($@)
282{
283 my($hz, @val) = @_;
284 my $pfx, $bit, $suf, $s, $m, $a;
285
286 print "/* Automatically generated by kernel/timeconst.pl */\n";
287 print "/* Conversion constants for HZ == $hz */\n";
288 print "\n";
289 print "#ifndef KERNEL_TIMECONST_H\n";
290 print "#define KERNEL_TIMECONST_H\n";
291 print "\n";
292
293 print "#include <linux/param.h>\n";
294 print "#include <linux/types.h>\n";
295
296 print "\n";
297 print "#if HZ != $hz\n";
298 print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n";
299 print "#endif\n";
300 print "\n";
301
302 foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
303 'HZ_TO_USEC','USEC_TO_HZ') {
304 foreach $bit (32) {
305 foreach $suf ('MUL', 'ADJ', 'SHR') {
306 outputval("${pfx}_$suf$bit", shift(@val));
307 }
308 }
309 foreach $suf ('NUM', 'DEN') {
310 outputval("${pfx}_$suf", shift(@val));
311 }
312 }
313
314 print "\n";
315 print "#endif /* KERNEL_TIMECONST_H */\n";
316}
317
318# Pretty-print Perl values
319sub perlvals(@) {
320 my $v;
321 my @l = ();
322
323 foreach $v (@_) {
324 if (!defined($v)) {
325 push(@l, 'undef');
326 } elsif ($v =~ /^0x/) {
327 push(@l, "\'".$v."\'");
328 } else {
329 push(@l, $v.'');
330 }
331 }
332 return join(',', @l);
333}
334
335($hz) = @ARGV;
336
337# Use this to generate the %canned_values structure
338if ($hz eq '--can') {
339 shift(@ARGV);
340 @hzlist = sort {$a <=> $b} (@ARGV);
341
342 print "# Precomputed values for systems without Math::BigInt\n";
343 print "# Generated by:\n";
344 print "# timeconst.pl --can ", join(' ', @hzlist), "\n";
345 print "\%canned_values = (\n";
346 my $pf = "\t";
347 foreach $hz (@hzlist) {
348 my @values = compute_values($hz);
349 print "$pf$hz => [\n";
350 while (scalar(@values)) {
351 my $bit;
352 foreach $bit (32) {
353 my $m = shift(@values);
354 my $a = shift(@values);
355 my $s = shift(@values);
356 print "\t\t", perlvals($m,$a,$s), ",\n";
357 }
358 my $n = shift(@values);
359 my $d = shift(@values);
360 print "\t\t", perlvals($n,$d), ",\n";
361 }
362 print "\t]";
363 $pf = ', ';
364 }
365 print "\n);\n";
366} else {
367 $hz += 0; # Force to number
368 if ($hz < 1) {
369 die "Usage: $0 HZ\n";
370 }
371
372 @val = @{$canned_values{$hz}};
373 if (!defined(@val)) {
374 @val = compute_values($hz);
375 }
376 output($hz, @val);
377}
378exit 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index 367d00858482..dbf7a78a1ef1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/irq_work.h> 40#include <linux/irq_work.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/sched/sysctl.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
43 44
44#include <asm/uaccess.h> 45#include <asm/uaccess.h>
@@ -1351,7 +1352,6 @@ void update_process_times(int user_tick)
1351 account_process_tick(p, user_tick); 1352 account_process_tick(p, user_tick);
1352 run_local_timers(); 1353 run_local_timers();
1353 rcu_check_callbacks(cpu, user_tick); 1354 rcu_check_callbacks(cpu, user_tick);
1354 printk_tick();
1355#ifdef CONFIG_IRQ_WORK 1355#ifdef CONFIG_IRQ_WORK
1356 if (in_irq()) 1356 if (in_irq())
1357 irq_work_run(); 1357 irq_work_run();
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 4cea4f41c1d9..fc382d6e2765 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE
39 help 39 help
40 See Documentation/trace/ftrace-design.txt 40 See Documentation/trace/ftrace-design.txt
41 41
42config HAVE_DYNAMIC_FTRACE_WITH_REGS
43 bool
44
42config HAVE_FTRACE_MCOUNT_RECORD 45config HAVE_FTRACE_MCOUNT_RECORD
43 bool 46 bool
44 help 47 help
@@ -78,21 +81,6 @@ config EVENT_TRACING
78 select CONTEXT_SWITCH_TRACER 81 select CONTEXT_SWITCH_TRACER
79 bool 82 bool
80 83
81config EVENT_POWER_TRACING_DEPRECATED
82 depends on EVENT_TRACING
83 bool "Deprecated power event trace API, to be removed"
84 default y
85 help
86 Provides old power event types:
87 C-state/idle accounting events:
88 power:power_start
89 power:power_end
90 and old cpufreq accounting event:
91 power:power_frequency
92 This is for userspace compatibility
93 and will vanish after 5 kernel iterations,
94 namely 3.1.
95
96config CONTEXT_SWITCH_TRACER 84config CONTEXT_SWITCH_TRACER
97 bool 85 bool
98 86
@@ -119,6 +107,7 @@ config TRACING
119 select BINARY_PRINTF 107 select BINARY_PRINTF
120 select EVENT_TRACING 108 select EVENT_TRACING
121 select TRACE_CLOCK 109 select TRACE_CLOCK
110 select IRQ_WORK
122 111
123config GENERIC_TRACER 112config GENERIC_TRACER
124 bool 113 bool
@@ -249,6 +238,16 @@ config FTRACE_SYSCALLS
249 help 238 help
250 Basic tracer to catch the syscall entry and exit events. 239 Basic tracer to catch the syscall entry and exit events.
251 240
241config TRACER_SNAPSHOT
242 bool "Create a snapshot trace buffer"
243 select TRACER_MAX_TRACE
244 help
245 Allow tracing users to take snapshot of the current buffer using the
246 ftrace interface, e.g.:
247
248 echo 1 > /sys/kernel/debug/tracing/snapshot
249 cat snapshot
250
252config TRACE_BRANCH_PROFILING 251config TRACE_BRANCH_PROFILING
253 bool 252 bool
254 select GENERIC_TRACER 253 select GENERIC_TRACER
@@ -415,23 +414,32 @@ config PROBE_EVENTS
415 def_bool n 414 def_bool n
416 415
417config DYNAMIC_FTRACE 416config DYNAMIC_FTRACE
418 bool "enable/disable ftrace tracepoints dynamically" 417 bool "enable/disable function tracing dynamically"
419 depends on FUNCTION_TRACER 418 depends on FUNCTION_TRACER
420 depends on HAVE_DYNAMIC_FTRACE 419 depends on HAVE_DYNAMIC_FTRACE
421 default y 420 default y
422 help 421 help
423 This option will modify all the calls to ftrace dynamically 422 This option will modify all the calls to function tracing
424 (will patch them out of the binary image and replace them 423 dynamically (will patch them out of the binary image and
425 with a No-Op instruction) as they are called. A table is 424 replace them with a No-Op instruction) on boot up. During
426 created to dynamically enable them again. 425 compile time, a table is made of all the locations that ftrace
426 can function trace, and this table is linked into the kernel
427 image. When this is enabled, functions can be individually
428 enabled, and the functions not enabled will not affect
429 performance of the system.
430
431 See the files in /sys/kernel/debug/tracing:
432 available_filter_functions
433 set_ftrace_filter
434 set_ftrace_notrace
427 435
428 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but 436 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
429 otherwise has native performance as long as no tracing is active. 437 otherwise has native performance as long as no tracing is active.
430 438
431 The changes to the code are done by a kernel thread that 439config DYNAMIC_FTRACE_WITH_REGS
432 wakes up once a second and checks to see if any ftrace calls 440 def_bool y
433 were made. If so, it runs stop_machine (stops all CPUS) 441 depends on DYNAMIC_FTRACE
434 and modifies the code to jump over the call to ftrace. 442 depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
435 443
436config FUNCTION_PROFILER 444config FUNCTION_PROFILER
437 bool "Kernel function profiler" 445 bool "Kernel function profiler"
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c0bd0308741c..9e5b8c272eec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
147 return; 147 return;
148 148
149 local_irq_save(flags); 149 local_irq_save(flags);
150 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 150 buf = this_cpu_ptr(bt->msg_data);
151 va_start(args, fmt); 151 va_start(args, fmt);
152 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); 152 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
153 va_end(args); 153 va_end(args);
@@ -739,6 +739,12 @@ static void blk_add_trace_rq_complete(void *ignore,
739 struct request_queue *q, 739 struct request_queue *q,
740 struct request *rq) 740 struct request *rq)
741{ 741{
742 struct blk_trace *bt = q->blk_trace;
743
744 /* if control ever passes through here, it's a request based driver */
745 if (unlikely(bt && !bt->rq_based))
746 bt->rq_based = true;
747
742 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 748 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
743} 749}
744 750
@@ -774,15 +780,30 @@ static void blk_add_trace_bio_bounce(void *ignore,
774 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); 780 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
775} 781}
776 782
777static void blk_add_trace_bio_complete(void *ignore, 783static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error)
778 struct request_queue *q, struct bio *bio,
779 int error)
780{ 784{
785 struct request_queue *q;
786 struct blk_trace *bt;
787
788 if (!bio->bi_bdev)
789 return;
790
791 q = bdev_get_queue(bio->bi_bdev);
792 bt = q->blk_trace;
793
794 /*
795 * Request based drivers will generate both rq and bio completions.
796 * Ignore bio ones.
797 */
798 if (likely(!bt) || bt->rq_based)
799 return;
800
781 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); 801 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
782} 802}
783 803
784static void blk_add_trace_bio_backmerge(void *ignore, 804static void blk_add_trace_bio_backmerge(void *ignore,
785 struct request_queue *q, 805 struct request_queue *q,
806 struct request *rq,
786 struct bio *bio) 807 struct bio *bio)
787{ 808{
788 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); 809 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
@@ -790,6 +811,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
790 811
791static void blk_add_trace_bio_frontmerge(void *ignore, 812static void blk_add_trace_bio_frontmerge(void *ignore,
792 struct request_queue *q, 813 struct request_queue *q,
814 struct request *rq,
793 struct bio *bio) 815 struct bio *bio)
794{ 816{
795 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); 817 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9dcf15d38380..b3fde6d7b7fc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -10,7 +10,7 @@
10 * Based on code in the latency_tracer, that is: 10 * Based on code in the latency_tracer, that is:
11 * 11 *
12 * Copyright (C) 2004-2006 Ingo Molnar 12 * Copyright (C) 2004-2006 Ingo Molnar
13 * Copyright (C) 2004 William Lee Irwin III 13 * Copyright (C) 2004 Nadia Yvette Chambers
14 */ 14 */
15 15
16#include <linux/stop_machine.h> 16#include <linux/stop_machine.h>
@@ -66,7 +66,7 @@
66 66
67static struct ftrace_ops ftrace_list_end __read_mostly = { 67static struct ftrace_ops ftrace_list_end __read_mostly = {
68 .func = ftrace_stub, 68 .func = ftrace_stub,
69 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 69 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
70}; 70};
71 71
72/* ftrace_enabled is a method to turn ftrace on or off */ 72/* ftrace_enabled is a method to turn ftrace on or off */
@@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) 111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
112#endif 112#endif
113 113
114/*
115 * Traverse the ftrace_global_list, invoking all entries. The reason that we
116 * can use rcu_dereference_raw() is that elements removed from this list
117 * are simply leaked, so there is no need to interact with a grace-period
118 * mechanism. The rcu_dereference_raw() calls are needed to handle
119 * concurrent insertions into the ftrace_global_list.
120 *
121 * Silly Alpha and silly pointer-speculation compiler optimizations!
122 */
123#define do_for_each_ftrace_op(op, list) \
124 op = rcu_dereference_raw(list); \
125 do
126
127/*
128 * Optimized for just a single item in the list (as that is the normal case).
129 */
130#define while_for_each_ftrace_op(op) \
131 while (likely(op = rcu_dereference_raw((op)->next)) && \
132 unlikely((op) != &ftrace_list_end))
133
114/** 134/**
115 * ftrace_nr_registered_ops - return number of ops registered 135 * ftrace_nr_registered_ops - return number of ops registered
116 * 136 *
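
The two helper macros introduced above hide the open-coded list walk that later hunks in this file remove from the individual callers. Expanded by hand — a sketch of the resulting control flow, not actual preprocessor output — a caller such as the updated __ftrace_ops_list_func() further down ends up with:

        op = rcu_dereference_raw(ftrace_ops_list);
        do {
                if (ftrace_ops_test(op, ip))
                        op->func(ip, parent_ip, op, regs);
        } while (likely(op = rcu_dereference_raw((op)->next)) &&
                 unlikely((op) != &ftrace_list_end));

Because ftrace_list_end is a stub entry whose handler is ftrace_stub (and this patch also tags it FTRACE_OPS_FL_STUB), executing the body once for it when the list is otherwise empty is harmless; the control-ops walk later in this patch additionally checks the STUB flag to skip it outright.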
@@ -132,29 +152,21 @@ int ftrace_nr_registered_ops(void)
132 return cnt; 152 return cnt;
133} 153}
134 154
135/*
136 * Traverse the ftrace_global_list, invoking all entries. The reason that we
137 * can use rcu_dereference_raw() is that elements removed from this list
138 * are simply leaked, so there is no need to interact with a grace-period
139 * mechanism. The rcu_dereference_raw() calls are needed to handle
140 * concurrent insertions into the ftrace_global_list.
141 *
142 * Silly Alpha and silly pointer-speculation compiler optimizations!
143 */
144static void 155static void
145ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, 156ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
146 struct ftrace_ops *op, struct pt_regs *regs) 157 struct ftrace_ops *op, struct pt_regs *regs)
147{ 158{
148 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) 159 int bit;
160
161 bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
162 if (bit < 0)
149 return; 163 return;
150 164
151 trace_recursion_set(TRACE_GLOBAL_BIT); 165 do_for_each_ftrace_op(op, ftrace_global_list) {
152 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
153 while (op != &ftrace_list_end) {
154 op->func(ip, parent_ip, op, regs); 166 op->func(ip, parent_ip, op, regs);
155 op = rcu_dereference_raw(op->next); /*see above*/ 167 } while_for_each_ftrace_op(op);
156 }; 168
157 trace_recursion_clear(TRACE_GLOBAL_BIT); 169 trace_clear_recursion(bit);
158} 170}
159 171
160static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 172static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
@@ -221,10 +233,24 @@ static void update_global_ops(void)
221 * registered callers. 233 * registered callers.
222 */ 234 */
223 if (ftrace_global_list == &ftrace_list_end || 235 if (ftrace_global_list == &ftrace_list_end ||
224 ftrace_global_list->next == &ftrace_list_end) 236 ftrace_global_list->next == &ftrace_list_end) {
225 func = ftrace_global_list->func; 237 func = ftrace_global_list->func;
226 else 238 /*
239 * As we are calling the function directly.
240 * If it does not have recursion protection,
241 * the function_trace_op needs to be updated
242 * accordingly.
243 */
244 if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)
245 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
246 else
247 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
248 } else {
227 func = ftrace_global_list_func; 249 func = ftrace_global_list_func;
250 /* The list has its own recursion protection. */
251 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
252 }
253
228 254
229 /* If we filter on pids, update to use the pid function */ 255 /* If we filter on pids, update to use the pid function */
230 if (!list_empty(&ftrace_pids)) { 256 if (!list_empty(&ftrace_pids)) {
@@ -337,7 +363,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
337 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) 363 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
338 return -EINVAL; 364 return -EINVAL;
339 365
340#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS 366#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
341 /* 367 /*
342 * If the ftrace_ops specifies SAVE_REGS, then it only can be used 368 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
343 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. 369 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
@@ -668,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
668 free_page(tmp); 694 free_page(tmp);
669 } 695 }
670 696
671 free_page((unsigned long)stat->pages);
672 stat->pages = NULL; 697 stat->pages = NULL;
673 stat->start = NULL; 698 stat->start = NULL;
674 699
@@ -736,7 +761,6 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
736{ 761{
737 struct ftrace_profile *rec; 762 struct ftrace_profile *rec;
738 struct hlist_head *hhd; 763 struct hlist_head *hhd;
739 struct hlist_node *n;
740 unsigned long key; 764 unsigned long key;
741 765
742 key = hash_long(ip, ftrace_profile_bits); 766 key = hash_long(ip, ftrace_profile_bits);
@@ -745,7 +769,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
745 if (hlist_empty(hhd)) 769 if (hlist_empty(hhd))
746 return NULL; 770 return NULL;
747 771
748 hlist_for_each_entry_rcu(rec, n, hhd, node) { 772 hlist_for_each_entry_rcu(rec, hhd, node) {
749 if (rec->ip == ip) 773 if (rec->ip == ip)
750 return rec; 774 return rec;
751 } 775 }
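
This hunk, and the similar hlist_for_each_entry(), _safe() and _rcu() hunks that follow, track the tree-wide hlist iterator change that drops the separate struct hlist_node cursor argument. The cursor can go away because typeof() plus container_of() let the macro walk the raw nodes internally and hand back the typed entry directly; the self-contained userspace sketch below (simplified, no RCU, names invented for illustration) shows the idea.

#include <stdio.h>
#include <stddef.h>

struct hlist_node { struct hlist_node *next; };
struct hlist_head { struct hlist_node *first; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* new-style iteration: entry pointer only, no separate node cursor */
#define hlist_for_each_entry(pos, head, member)                               \
        for (pos = (head)->first ?                                            \
                container_of((head)->first, typeof(*(pos)), member) : NULL;   \
             pos;                                                             \
             pos = (pos)->member.next ?                                       \
                container_of((pos)->member.next, typeof(*(pos)), member) : NULL)

struct rec {
        unsigned long ip;
        struct hlist_node node;
};

int main(void)
{
        struct rec a = { .ip = 0x1000 }, b = { .ip = 0x2000 };
        struct hlist_head head = { .first = &a.node };
        struct rec *rec;

        a.node.next = &b.node;
        b.node.next = NULL;

        hlist_for_each_entry(rec, &head, node)
                printf("ip = %#lx\n", rec->ip);
        return 0;
}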
@@ -1028,6 +1052,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
1028 1052
1029static struct pid * const ftrace_swapper_pid = &init_struct_pid; 1053static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1030 1054
1055loff_t
1056ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
1057{
1058 loff_t ret;
1059
1060 if (file->f_mode & FMODE_READ)
1061 ret = seq_lseek(file, offset, whence);
1062 else
1063 file->f_pos = ret = 1;
1064
1065 return ret;
1066}
1067
1031#ifdef CONFIG_DYNAMIC_FTRACE 1068#ifdef CONFIG_DYNAMIC_FTRACE
1032 1069
1033#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1070#ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -1107,7 +1144,6 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1107 unsigned long key; 1144 unsigned long key;
1108 struct ftrace_func_entry *entry; 1145 struct ftrace_func_entry *entry;
1109 struct hlist_head *hhd; 1146 struct hlist_head *hhd;
1110 struct hlist_node *n;
1111 1147
1112 if (ftrace_hash_empty(hash)) 1148 if (ftrace_hash_empty(hash))
1113 return NULL; 1149 return NULL;
@@ -1119,7 +1155,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1119 1155
1120 hhd = &hash->buckets[key]; 1156 hhd = &hash->buckets[key];
1121 1157
1122 hlist_for_each_entry_rcu(entry, n, hhd, hlist) { 1158 hlist_for_each_entry_rcu(entry, hhd, hlist) {
1123 if (entry->ip == ip) 1159 if (entry->ip == ip)
1124 return entry; 1160 return entry;
1125 } 1161 }
@@ -1176,7 +1212,7 @@ remove_hash_entry(struct ftrace_hash *hash,
1176static void ftrace_hash_clear(struct ftrace_hash *hash) 1212static void ftrace_hash_clear(struct ftrace_hash *hash)
1177{ 1213{
1178 struct hlist_head *hhd; 1214 struct hlist_head *hhd;
1179 struct hlist_node *tp, *tn; 1215 struct hlist_node *tn;
1180 struct ftrace_func_entry *entry; 1216 struct ftrace_func_entry *entry;
1181 int size = 1 << hash->size_bits; 1217 int size = 1 << hash->size_bits;
1182 int i; 1218 int i;
@@ -1186,7 +1222,7 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
1186 1222
1187 for (i = 0; i < size; i++) { 1223 for (i = 0; i < size; i++) {
1188 hhd = &hash->buckets[i]; 1224 hhd = &hash->buckets[i];
1189 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) 1225 hlist_for_each_entry_safe(entry, tn, hhd, hlist)
1190 free_hash_entry(hash, entry); 1226 free_hash_entry(hash, entry);
1191 } 1227 }
1192 FTRACE_WARN_ON(hash->count); 1228 FTRACE_WARN_ON(hash->count);
@@ -1249,7 +1285,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1249{ 1285{
1250 struct ftrace_func_entry *entry; 1286 struct ftrace_func_entry *entry;
1251 struct ftrace_hash *new_hash; 1287 struct ftrace_hash *new_hash;
1252 struct hlist_node *tp;
1253 int size; 1288 int size;
1254 int ret; 1289 int ret;
1255 int i; 1290 int i;
@@ -1264,7 +1299,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1264 1299
1265 size = 1 << hash->size_bits; 1300 size = 1 << hash->size_bits;
1266 for (i = 0; i < size; i++) { 1301 for (i = 0; i < size; i++) {
1267 hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { 1302 hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
1268 ret = add_hash_entry(new_hash, entry->ip); 1303 ret = add_hash_entry(new_hash, entry->ip);
1269 if (ret < 0) 1304 if (ret < 0)
1270 goto free_hash; 1305 goto free_hash;
@@ -1290,7 +1325,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1290 struct ftrace_hash **dst, struct ftrace_hash *src) 1325 struct ftrace_hash **dst, struct ftrace_hash *src)
1291{ 1326{
1292 struct ftrace_func_entry *entry; 1327 struct ftrace_func_entry *entry;
1293 struct hlist_node *tp, *tn; 1328 struct hlist_node *tn;
1294 struct hlist_head *hhd; 1329 struct hlist_head *hhd;
1295 struct ftrace_hash *old_hash; 1330 struct ftrace_hash *old_hash;
1296 struct ftrace_hash *new_hash; 1331 struct ftrace_hash *new_hash;
@@ -1336,7 +1371,7 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1336 size = 1 << src->size_bits; 1371 size = 1 << src->size_bits;
1337 for (i = 0; i < size; i++) { 1372 for (i = 0; i < size; i++) {
1338 hhd = &src->buckets[i]; 1373 hhd = &src->buckets[i];
1339 hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { 1374 hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
1340 if (bits > 0) 1375 if (bits > 0)
1341 key = hash_long(entry->ip, bits); 1376 key = hash_long(entry->ip, bits);
1342 else 1377 else
@@ -2437,7 +2472,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
2437{ 2472{
2438 iter->pos = 0; 2473 iter->pos = 0;
2439 iter->func_pos = 0; 2474 iter->func_pos = 0;
2440 iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); 2475 iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
2441} 2476}
2442 2477
2443static void *t_start(struct seq_file *m, loff_t *pos) 2478static void *t_start(struct seq_file *m, loff_t *pos)
@@ -2590,7 +2625,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2590 * routine, you can use ftrace_filter_write() for the write 2625 * routine, you can use ftrace_filter_write() for the write
2591 * routine if @flag has FTRACE_ITER_FILTER set, or 2626 * routine if @flag has FTRACE_ITER_FILTER set, or
2592 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. 2627 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2593 * ftrace_regex_lseek() should be used as the lseek routine, and 2628 * ftrace_filter_lseek() should be used as the lseek routine, and
2594 * release must call ftrace_regex_release(). 2629 * release must call ftrace_regex_release().
2595 */ 2630 */
2596int 2631int
@@ -2674,19 +2709,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2674 inode, file); 2709 inode, file);
2675} 2710}
2676 2711
2677loff_t
2678ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
2679{
2680 loff_t ret;
2681
2682 if (file->f_mode & FMODE_READ)
2683 ret = seq_lseek(file, offset, origin);
2684 else
2685 file->f_pos = ret = 1;
2686
2687 return ret;
2688}
2689
2690static int ftrace_match(char *str, char *regex, int len, int type) 2712static int ftrace_match(char *str, char *regex, int len, int type)
2691{ 2713{
2692 int matched = 0; 2714 int matched = 0;
@@ -2868,14 +2890,13 @@ static int __init ftrace_mod_cmd_init(void)
2868{ 2890{
2869 return register_ftrace_command(&ftrace_mod_cmd); 2891 return register_ftrace_command(&ftrace_mod_cmd);
2870} 2892}
2871device_initcall(ftrace_mod_cmd_init); 2893core_initcall(ftrace_mod_cmd_init);
2872 2894
2873static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, 2895static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2874 struct ftrace_ops *op, struct pt_regs *pt_regs) 2896 struct ftrace_ops *op, struct pt_regs *pt_regs)
2875{ 2897{
2876 struct ftrace_func_probe *entry; 2898 struct ftrace_func_probe *entry;
2877 struct hlist_head *hhd; 2899 struct hlist_head *hhd;
2878 struct hlist_node *n;
2879 unsigned long key; 2900 unsigned long key;
2880 2901
2881 key = hash_long(ip, FTRACE_HASH_BITS); 2902 key = hash_long(ip, FTRACE_HASH_BITS);
@@ -2891,7 +2912,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
2891 * on the hash. rcu_read_lock is too dangerous here. 2912 * on the hash. rcu_read_lock is too dangerous here.
2892 */ 2913 */
2893 preempt_disable_notrace(); 2914 preempt_disable_notrace();
2894 hlist_for_each_entry_rcu(entry, n, hhd, node) { 2915 hlist_for_each_entry_rcu(entry, hhd, node) {
2895 if (entry->ip == ip) 2916 if (entry->ip == ip)
2896 entry->ops->func(ip, parent_ip, &entry->data); 2917 entry->ops->func(ip, parent_ip, &entry->data);
2897 } 2918 }
@@ -3042,7 +3063,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3042 void *data, int flags) 3063 void *data, int flags)
3043{ 3064{
3044 struct ftrace_func_probe *entry; 3065 struct ftrace_func_probe *entry;
3045 struct hlist_node *n, *tmp; 3066 struct hlist_node *tmp;
3046 char str[KSYM_SYMBOL_LEN]; 3067 char str[KSYM_SYMBOL_LEN];
3047 int type = MATCH_FULL; 3068 int type = MATCH_FULL;
3048 int i, len = 0; 3069 int i, len = 0;
@@ -3065,7 +3086,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3065 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3086 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3066 struct hlist_head *hhd = &ftrace_func_hash[i]; 3087 struct hlist_head *hhd = &ftrace_func_hash[i];
3067 3088
3068 hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { 3089 hlist_for_each_entry_safe(entry, tmp, hhd, node) {
3069 3090
3070 /* break up if statements for readability */ 3091 /* break up if statements for readability */
3071 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) 3092 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
@@ -3082,8 +3103,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3082 continue; 3103 continue;
3083 } 3104 }
3084 3105
3085 hlist_del(&entry->node); 3106 hlist_del_rcu(&entry->node);
3086 call_rcu(&entry->rcu, ftrace_free_entry_rcu); 3107 call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu);
3087 } 3108 }
3088 } 3109 }
3089 __disable_ftrace_function_probe(); 3110 __disable_ftrace_function_probe();
@@ -3419,14 +3440,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
3419 3440
3420static int __init set_ftrace_notrace(char *str) 3441static int __init set_ftrace_notrace(char *str)
3421{ 3442{
3422 strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); 3443 strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
3423 return 1; 3444 return 1;
3424} 3445}
3425__setup("ftrace_notrace=", set_ftrace_notrace); 3446__setup("ftrace_notrace=", set_ftrace_notrace);
3426 3447
3427static int __init set_ftrace_filter(char *str) 3448static int __init set_ftrace_filter(char *str)
3428{ 3449{
3429 strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); 3450 strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
3430 return 1; 3451 return 1;
3431} 3452}
3432__setup("ftrace_filter=", set_ftrace_filter); 3453__setup("ftrace_filter=", set_ftrace_filter);
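
The switch from strncpy() to strlcpy() in the two __setup() handlers above guards against an unterminated buffer when the boot argument is as long as FTRACE_FILTER_SIZE. A small userspace illustration of the difference follows; the minimal my_strlcpy() here is only a stand-in with the same contract as the kernel helper, not the kernel's implementation.

#include <stdio.h>
#include <string.h>

/* minimal strlcpy: always NUL-terminates, returns the length it tried to make */
static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
        size_t len = strlen(src);

        if (size) {
                size_t n = (len >= size) ? size - 1 : len;

                memcpy(dst, src, n);
                dst[n] = '\0';
        }
        return len;
}

int main(void)
{
        char a[8], b[8];
        const char *long_opt = "function_graph_tracer";  /* longer than 8 */

        strncpy(a, long_opt, sizeof(a));     /* a is NOT NUL-terminated */
        my_strlcpy(b, long_opt, sizeof(b));  /* b is "functio" plus '\0' */

        printf("strlcpy result: \"%s\" (always terminated)\n", b);
        /* printing 'a' with %s here would read past the buffer */
        return 0;
}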
@@ -3549,7 +3570,7 @@ static const struct file_operations ftrace_filter_fops = {
3549 .open = ftrace_filter_open, 3570 .open = ftrace_filter_open,
3550 .read = seq_read, 3571 .read = seq_read,
3551 .write = ftrace_filter_write, 3572 .write = ftrace_filter_write,
3552 .llseek = ftrace_regex_lseek, 3573 .llseek = ftrace_filter_lseek,
3553 .release = ftrace_regex_release, 3574 .release = ftrace_regex_release,
3554}; 3575};
3555 3576
@@ -3557,7 +3578,7 @@ static const struct file_operations ftrace_notrace_fops = {
3557 .open = ftrace_notrace_open, 3578 .open = ftrace_notrace_open,
3558 .read = seq_read, 3579 .read = seq_read,
3559 .write = ftrace_notrace_write, 3580 .write = ftrace_notrace_write,
3560 .llseek = ftrace_regex_lseek, 3581 .llseek = ftrace_filter_lseek,
3561 .release = ftrace_regex_release, 3582 .release = ftrace_regex_release,
3562}; 3583};
3563 3584
@@ -3762,8 +3783,8 @@ static const struct file_operations ftrace_graph_fops = {
3762 .open = ftrace_graph_open, 3783 .open = ftrace_graph_open,
3763 .read = seq_read, 3784 .read = seq_read,
3764 .write = ftrace_graph_write, 3785 .write = ftrace_graph_write,
3786 .llseek = ftrace_filter_lseek,
3765 .release = ftrace_graph_release, 3787 .release = ftrace_graph_release,
3766 .llseek = seq_lseek,
3767}; 3788};
3768#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3789#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3769 3790
@@ -3970,35 +3991,49 @@ static void ftrace_init_module(struct module *mod,
3970 ftrace_process_locs(mod, start, end); 3991 ftrace_process_locs(mod, start, end);
3971} 3992}
3972 3993
3973static int ftrace_module_notify(struct notifier_block *self, 3994static int ftrace_module_notify_enter(struct notifier_block *self,
3974 unsigned long val, void *data) 3995 unsigned long val, void *data)
3975{ 3996{
3976 struct module *mod = data; 3997 struct module *mod = data;
3977 3998
3978 switch (val) { 3999 if (val == MODULE_STATE_COMING)
3979 case MODULE_STATE_COMING:
3980 ftrace_init_module(mod, mod->ftrace_callsites, 4000 ftrace_init_module(mod, mod->ftrace_callsites,
3981 mod->ftrace_callsites + 4001 mod->ftrace_callsites +
3982 mod->num_ftrace_callsites); 4002 mod->num_ftrace_callsites);
3983 break; 4003 return 0;
3984 case MODULE_STATE_GOING: 4004}
4005
4006static int ftrace_module_notify_exit(struct notifier_block *self,
4007 unsigned long val, void *data)
4008{
4009 struct module *mod = data;
4010
4011 if (val == MODULE_STATE_GOING)
3985 ftrace_release_mod(mod); 4012 ftrace_release_mod(mod);
3986 break;
3987 }
3988 4013
3989 return 0; 4014 return 0;
3990} 4015}
3991#else 4016#else
3992static int ftrace_module_notify(struct notifier_block *self, 4017static int ftrace_module_notify_enter(struct notifier_block *self,
3993 unsigned long val, void *data) 4018 unsigned long val, void *data)
4019{
4020 return 0;
4021}
4022static int ftrace_module_notify_exit(struct notifier_block *self,
4023 unsigned long val, void *data)
3994{ 4024{
3995 return 0; 4025 return 0;
3996} 4026}
3997#endif /* CONFIG_MODULES */ 4027#endif /* CONFIG_MODULES */
3998 4028
3999struct notifier_block ftrace_module_nb = { 4029struct notifier_block ftrace_module_enter_nb = {
4000 .notifier_call = ftrace_module_notify, 4030 .notifier_call = ftrace_module_notify_enter,
4001 .priority = 0, 4031 .priority = INT_MAX, /* Run before anything that can use kprobes */
4032};
4033
4034struct notifier_block ftrace_module_exit_nb = {
4035 .notifier_call = ftrace_module_notify_exit,
4036 .priority = INT_MIN, /* Run after anything that can remove kprobes */
4002}; 4037};
4003 4038
4004extern unsigned long __start_mcount_loc[]; 4039extern unsigned long __start_mcount_loc[];
@@ -4032,9 +4067,13 @@ void __init ftrace_init(void)
4032 __start_mcount_loc, 4067 __start_mcount_loc,
4033 __stop_mcount_loc); 4068 __stop_mcount_loc);
4034 4069
4035 ret = register_module_notifier(&ftrace_module_nb); 4070 ret = register_module_notifier(&ftrace_module_enter_nb);
4036 if (ret) 4071 if (ret)
4037 pr_warning("Failed to register trace ftrace module notifier\n"); 4072 pr_warning("Failed to register trace ftrace module enter notifier\n");
4073
4074 ret = register_module_notifier(&ftrace_module_exit_nb);
4075 if (ret)
4076 pr_warning("Failed to register trace ftrace module exit notifier\n");
4038 4077
4039 set_ftrace_early_filters(); 4078 set_ftrace_early_filters();
4040 4079
@@ -4055,7 +4094,7 @@ static int __init ftrace_nodyn_init(void)
4055 ftrace_enabled = 1; 4094 ftrace_enabled = 1;
4056 return 0; 4095 return 0;
4057} 4096}
4058device_initcall(ftrace_nodyn_init); 4097core_initcall(ftrace_nodyn_init);
4059 4098
4060static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4099static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4061static inline void ftrace_startup_enable(int command) { } 4100static inline void ftrace_startup_enable(int command) { }
@@ -4090,14 +4129,12 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4090 */ 4129 */
4091 preempt_disable_notrace(); 4130 preempt_disable_notrace();
4092 trace_recursion_set(TRACE_CONTROL_BIT); 4131 trace_recursion_set(TRACE_CONTROL_BIT);
4093 op = rcu_dereference_raw(ftrace_control_list); 4132 do_for_each_ftrace_op(op, ftrace_control_list) {
4094 while (op != &ftrace_list_end) { 4133 if (!(op->flags & FTRACE_OPS_FL_STUB) &&
4095 if (!ftrace_function_local_disabled(op) && 4134 !ftrace_function_local_disabled(op) &&
4096 ftrace_ops_test(op, ip)) 4135 ftrace_ops_test(op, ip))
4097 op->func(ip, parent_ip, op, regs); 4136 op->func(ip, parent_ip, op, regs);
4098 4137 } while_for_each_ftrace_op(op);
4099 op = rcu_dereference_raw(op->next);
4100 };
4101 trace_recursion_clear(TRACE_CONTROL_BIT); 4138 trace_recursion_clear(TRACE_CONTROL_BIT);
4102 preempt_enable_notrace(); 4139 preempt_enable_notrace();
4103} 4140}
@@ -4112,27 +4149,26 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4112 struct ftrace_ops *ignored, struct pt_regs *regs) 4149 struct ftrace_ops *ignored, struct pt_regs *regs)
4113{ 4150{
4114 struct ftrace_ops *op; 4151 struct ftrace_ops *op;
4152 int bit;
4115 4153
4116 if (function_trace_stop) 4154 if (function_trace_stop)
4117 return; 4155 return;
4118 4156
4119 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) 4157 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
4158 if (bit < 0)
4120 return; 4159 return;
4121 4160
4122 trace_recursion_set(TRACE_INTERNAL_BIT);
4123 /* 4161 /*
4124 * Some of the ops may be dynamically allocated, 4162 * Some of the ops may be dynamically allocated,
4125 * they must be freed after a synchronize_sched(). 4163 * they must be freed after a synchronize_sched().
4126 */ 4164 */
4127 preempt_disable_notrace(); 4165 preempt_disable_notrace();
4128 op = rcu_dereference_raw(ftrace_ops_list); 4166 do_for_each_ftrace_op(op, ftrace_ops_list) {
4129 while (op != &ftrace_list_end) {
4130 if (ftrace_ops_test(op, ip)) 4167 if (ftrace_ops_test(op, ip))
4131 op->func(ip, parent_ip, op, regs); 4168 op->func(ip, parent_ip, op, regs);
4132 op = rcu_dereference_raw(op->next); 4169 } while_for_each_ftrace_op(op);
4133 };
4134 preempt_enable_notrace(); 4170 preempt_enable_notrace();
4135 trace_recursion_clear(TRACE_INTERNAL_BIT); 4171 trace_clear_recursion(bit);
4136} 4172}
4137 4173
4138/* 4174/*
@@ -4143,8 +4179,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4143 * Archs are to support both the regs and ftrace_ops at the same time. 4179 * Archs are to support both the regs and ftrace_ops at the same time.
4144 * If they support ftrace_ops, it is assumed they support regs. 4180 * If they support ftrace_ops, it is assumed they support regs.
4145 * If call backs want to use regs, they must either check for regs 4181 * If call backs want to use regs, they must either check for regs
4146 * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. 4182 * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
4147 * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. 4183 * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
4148 * An architecture can pass partial regs with ftrace_ops and still 4184 * An architecture can pass partial regs with ftrace_ops and still
4149 * set the ARCH_SUPPORT_FTARCE_OPS. 4185 * set the ARCH_SUPPORT_FTARCE_OPS.
4150 */ 4186 */
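
The comment above spells out the contract between an ftrace_ops callback and the (possibly NULL) pt_regs argument now that the gate is CONFIG_DYNAMIC_FTRACE_WITH_REGS. A hedged sketch of a callback written against that contract — illustrative module-style code, not part of this patch; my_callback/my_ops are invented names:

static void my_callback(unsigned long ip, unsigned long parent_ip,
                        struct ftrace_ops *op, struct pt_regs *regs)
{
        /* regs is only valid if the arch actually saved them for us */
        if (regs)
                pr_info("hit %ps, pc=%lx\n",
                        (void *)ip, instruction_pointer(regs));
        else
                pr_info("hit %ps (no regs available)\n", (void *)ip);
}

static struct ftrace_ops my_ops = {
        .func  = my_callback,
        /* ask for regs where supported, fall back to NULL regs elsewhere */
        .flags = FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED,
};

/* ... register_ftrace_function(&my_ops) from module init ... */

Requesting plain FTRACE_OPS_FL_SAVE_REGS instead would make registration fail on architectures without the WITH_REGS support, which is exactly the check __register_ftrace_function() performs earlier in this file.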
@@ -4381,7 +4417,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
4381 if (strlen(tmp) == 0) 4417 if (strlen(tmp) == 0)
4382 return 1; 4418 return 1;
4383 4419
4384 ret = strict_strtol(tmp, 10, &val); 4420 ret = kstrtol(tmp, 10, &val);
4385 if (ret < 0) 4421 if (ret < 0)
4386 return ret; 4422 return ret;
4387 4423
@@ -4403,7 +4439,7 @@ static const struct file_operations ftrace_pid_fops = {
4403 .open = ftrace_pid_open, 4439 .open = ftrace_pid_open,
4404 .write = ftrace_pid_write, 4440 .write = ftrace_pid_write,
4405 .read = seq_read, 4441 .read = seq_read,
4406 .llseek = seq_lseek, 4442 .llseek = ftrace_filter_lseek,
4407 .release = ftrace_pid_release, 4443 .release = ftrace_pid_release,
4408}; 4444};
4409 4445
@@ -4519,12 +4555,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
4519 ftrace_startup_sysctl(); 4555 ftrace_startup_sysctl();
4520 4556
4521 /* we are starting ftrace again */ 4557 /* we are starting ftrace again */
4522 if (ftrace_ops_list != &ftrace_list_end) { 4558 if (ftrace_ops_list != &ftrace_list_end)
4523 if (ftrace_ops_list->next == &ftrace_list_end) 4559 update_ftrace_function();
4524 ftrace_trace_function = ftrace_ops_list->func;
4525 else
4526 ftrace_trace_function = ftrace_ops_list_func;
4527 }
4528 4560
4529 } else { 4561 } else {
4530 /* stopping ftrace calls (just send to ftrace_stub) */ 4562 /* stopping ftrace calls (just send to ftrace_stub) */
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index f55fcf61b223..1c71382b283d 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -13,8 +13,5 @@
13#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
14#include <trace/events/power.h> 14#include <trace/events/power.h>
15 15
16#ifdef EVENT_POWER_TRACING_DEPRECATED
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18#endif
19EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); 16EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
20 17
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b979426d16c6..6989df2ba194 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,8 +3,10 @@
3 * 3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */ 5 */
6#include <linux/ftrace_event.h>
6#include <linux/ring_buffer.h> 7#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 8#include <linux/trace_clock.h>
9#include <linux/trace_seq.h>
8#include <linux/spinlock.h> 10#include <linux/spinlock.h>
9#include <linux/debugfs.h> 11#include <linux/debugfs.h>
10#include <linux/uaccess.h> 12#include <linux/uaccess.h>
@@ -21,7 +23,6 @@
21#include <linux/fs.h> 23#include <linux/fs.h>
22 24
23#include <asm/local.h> 25#include <asm/local.h>
24#include "trace.h"
25 26
26static void update_pages_handler(struct work_struct *work); 27static void update_pages_handler(struct work_struct *work);
27 28
@@ -177,7 +178,7 @@ void tracing_off_permanent(void)
177#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 178#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
178#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ 179#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
179 180
180#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 181#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
181# define RB_FORCE_8BYTE_ALIGNMENT 0 182# define RB_FORCE_8BYTE_ALIGNMENT 0
182# define RB_ARCH_ALIGNMENT RB_ALIGNMENT 183# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
183#else 184#else
@@ -185,6 +186,8 @@ void tracing_off_permanent(void)
185# define RB_ARCH_ALIGNMENT 8U 186# define RB_ARCH_ALIGNMENT 8U
186#endif 187#endif
187 188
189#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
190
188/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 191/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
189#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 192#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
190 193
@@ -333,7 +336,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
333struct buffer_data_page { 336struct buffer_data_page {
334 u64 time_stamp; /* page time stamp */ 337 u64 time_stamp; /* page time stamp */
335 local_t commit; /* write committed index */ 338 local_t commit; /* write committed index */
336 unsigned char data[]; /* data of buffer page */ 339 unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
337}; 340};
338 341
339/* 342/*
@@ -460,9 +463,10 @@ struct ring_buffer_per_cpu {
460 unsigned long lost_events; 463 unsigned long lost_events;
461 unsigned long last_overrun; 464 unsigned long last_overrun;
462 local_t entries_bytes; 465 local_t entries_bytes;
463 local_t commit_overrun;
464 local_t overrun;
465 local_t entries; 466 local_t entries;
467 local_t overrun;
468 local_t commit_overrun;
469 local_t dropped_events;
466 local_t committing; 470 local_t committing;
467 local_t commits; 471 local_t commits;
468 unsigned long read; 472 unsigned long read;
@@ -1396,6 +1400,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
1396 struct list_head *head_page_with_bit; 1400 struct list_head *head_page_with_bit;
1397 1401
1398 head_page = &rb_set_head_page(cpu_buffer)->list; 1402 head_page = &rb_set_head_page(cpu_buffer)->list;
1403 if (!head_page)
1404 break;
1399 prev_page = head_page->prev; 1405 prev_page = head_page->prev;
1400 1406
1401 first_page = pages->next; 1407 first_page = pages->next;
@@ -1820,7 +1826,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1820} 1826}
1821 1827
1822/** 1828/**
1823 * ring_buffer_update_event - update event type and data 1829 * rb_update_event - update event type and data
1824 * @event: the even to update 1830 * @event: the even to update
1825 * @type: the type of event 1831 * @type: the type of event
1826 * @length: the size of the event field in the ring buffer 1832 * @length: the size of the event field in the ring buffer
@@ -2155,8 +2161,10 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
2155 * If we are not in overwrite mode, 2161 * If we are not in overwrite mode,
2156 * this is easy, just stop here. 2162 * this is easy, just stop here.
2157 */ 2163 */
2158 if (!(buffer->flags & RB_FL_OVERWRITE)) 2164 if (!(buffer->flags & RB_FL_OVERWRITE)) {
2165 local_inc(&cpu_buffer->dropped_events);
2159 goto out_reset; 2166 goto out_reset;
2167 }
2160 2168
2161 ret = rb_handle_head_page(cpu_buffer, 2169 ret = rb_handle_head_page(cpu_buffer,
2162 tail_page, 2170 tail_page,
@@ -2427,41 +2435,76 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2427 2435
2428#ifdef CONFIG_TRACING 2436#ifdef CONFIG_TRACING
2429 2437
2430#define TRACE_RECURSIVE_DEPTH 16 2438/*
2439 * The lock and unlock are done within a preempt disable section.
2440 * The current_context per_cpu variable can only be modified
2441 * by the current task between lock and unlock. But it can
2442 * be modified more than once via an interrupt. To pass this
2443 * information from the lock to the unlock without having to
2444 * access the 'in_interrupt()' functions again (which do show
2445 * a bit of overhead in something as critical as function tracing,
2446 * we use a bitmask trick.
2447 *
2448 * bit 0 = NMI context
2449 * bit 1 = IRQ context
2450 * bit 2 = SoftIRQ context
2451 * bit 3 = normal context.
2452 *
2453 * This works because this is the order of contexts that can
2454 * preempt other contexts. A SoftIRQ never preempts an IRQ
2455 * context.
2456 *
2457 * When the context is determined, the corresponding bit is
2458 * checked and set (if it was set, then a recursion of that context
2459 * happened).
2460 *
2461 * On unlock, we need to clear this bit. To do so, just subtract
2462 * 1 from the current_context and AND it to itself.
2463 *
2464 * (binary)
2465 * 101 - 1 = 100
2466 * 101 & 100 = 100 (clearing bit zero)
2467 *
2468 * 1010 - 1 = 1001
2469 * 1010 & 1001 = 1000 (clearing bit 1)
2470 *
2471 * The least significant bit can be cleared this way, and it
2472 * just so happens that it is the same bit corresponding to
2473 * the current context.
2474 */
2475static DEFINE_PER_CPU(unsigned int, current_context);
2431 2476
2432/* Keep this code out of the fast path cache */ 2477static __always_inline int trace_recursive_lock(void)
2433static noinline void trace_recursive_fail(void)
2434{ 2478{
2435 /* Disable all tracing before we do anything else */ 2479 unsigned int val = this_cpu_read(current_context);
2436 tracing_off_permanent(); 2480 int bit;
2437 2481
2438 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" 2482 if (in_interrupt()) {
2439 "HC[%lu]:SC[%lu]:NMI[%lu]\n", 2483 if (in_nmi())
2440 trace_recursion_buffer(), 2484 bit = 0;
2441 hardirq_count() >> HARDIRQ_SHIFT, 2485 else if (in_irq())
2442 softirq_count() >> SOFTIRQ_SHIFT, 2486 bit = 1;
2443 in_nmi()); 2487 else
2444 2488 bit = 2;
2445 WARN_ON_ONCE(1); 2489 } else
2446} 2490 bit = 3;
2447
2448static inline int trace_recursive_lock(void)
2449{
2450 trace_recursion_inc();
2451 2491
2452 if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) 2492 if (unlikely(val & (1 << bit)))
2453 return 0; 2493 return 1;
2454 2494
2455 trace_recursive_fail(); 2495 val |= (1 << bit);
2496 this_cpu_write(current_context, val);
2456 2497
2457 return -1; 2498 return 0;
2458} 2499}
2459 2500
2460static inline void trace_recursive_unlock(void) 2501static __always_inline void trace_recursive_unlock(void)
2461{ 2502{
2462 WARN_ON_ONCE(!trace_recursion_buffer()); 2503 unsigned int val = this_cpu_read(current_context);
2463 2504
2464 trace_recursion_dec(); 2505 val--;
2506 val &= this_cpu_read(current_context);
2507 this_cpu_write(current_context, val);
2465} 2508}
2466 2509
2467#else 2510#else
@@ -2720,8 +2763,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
2720 * and not the length of the event which would hold the header. 2763 * and not the length of the event which would hold the header.
2721 */ 2764 */
2722int ring_buffer_write(struct ring_buffer *buffer, 2765int ring_buffer_write(struct ring_buffer *buffer,
2723 unsigned long length, 2766 unsigned long length,
2724 void *data) 2767 void *data)
2725{ 2768{
2726 struct ring_buffer_per_cpu *cpu_buffer; 2769 struct ring_buffer_per_cpu *cpu_buffer;
2727 struct ring_buffer_event *event; 2770 struct ring_buffer_event *event;
@@ -2929,12 +2972,12 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2929 * @buffer: The ring buffer 2972 * @buffer: The ring buffer
2930 * @cpu: The per CPU buffer to read from. 2973 * @cpu: The per CPU buffer to read from.
2931 */ 2974 */
2932unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) 2975u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2933{ 2976{
2934 unsigned long flags; 2977 unsigned long flags;
2935 struct ring_buffer_per_cpu *cpu_buffer; 2978 struct ring_buffer_per_cpu *cpu_buffer;
2936 struct buffer_page *bpage; 2979 struct buffer_page *bpage;
2937 unsigned long ret; 2980 u64 ret = 0;
2938 2981
2939 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2982 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2940 return 0; 2983 return 0;
@@ -2949,7 +2992,8 @@ unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2949 bpage = cpu_buffer->reader_page; 2992 bpage = cpu_buffer->reader_page;
2950 else 2993 else
2951 bpage = rb_set_head_page(cpu_buffer); 2994 bpage = rb_set_head_page(cpu_buffer);
2952 ret = bpage->page->time_stamp; 2995 if (bpage)
2996 ret = bpage->page->time_stamp;
2953 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2997 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2954 2998
2955 return ret; 2999 return ret;
@@ -2995,7 +3039,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2995EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 3039EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2996 3040
2997/** 3041/**
2998 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer 3042 * ring_buffer_overrun_cpu - get the number of overruns caused by the ring
3043 * buffer wrapping around (only if RB_FL_OVERWRITE is on).
2999 * @buffer: The ring buffer 3044 * @buffer: The ring buffer
3000 * @cpu: The per CPU buffer to get the number of overruns from 3045 * @cpu: The per CPU buffer to get the number of overruns from
3001 */ 3046 */
@@ -3015,7 +3060,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
3015EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 3060EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
3016 3061
3017/** 3062/**
3018 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 3063 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by
3064 * commits failing due to the buffer wrapping around while there are uncommitted
3065 * events, such as during an interrupt storm.
3019 * @buffer: The ring buffer 3066 * @buffer: The ring buffer
3020 * @cpu: The per CPU buffer to get the number of overruns from 3067 * @cpu: The per CPU buffer to get the number of overruns from
3021 */ 3068 */
@@ -3036,6 +3083,46 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
3036EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); 3083EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
3037 3084
3038/** 3085/**
3086 * ring_buffer_dropped_events_cpu - get the number of dropped events caused by
3087 * the ring buffer filling up (only if RB_FL_OVERWRITE is off).
3088 * @buffer: The ring buffer
3089 * @cpu: The per CPU buffer to get the number of overruns from
3090 */
3091unsigned long
3092ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
3093{
3094 struct ring_buffer_per_cpu *cpu_buffer;
3095 unsigned long ret;
3096
3097 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3098 return 0;
3099
3100 cpu_buffer = buffer->buffers[cpu];
3101 ret = local_read(&cpu_buffer->dropped_events);
3102
3103 return ret;
3104}
3105EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
3106
3107/**
3108 * ring_buffer_read_events_cpu - get the number of events successfully read
3109 * @buffer: The ring buffer
3110 * @cpu: The per CPU buffer to get the number of events read
3111 */
3112unsigned long
3113ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
3114{
3115 struct ring_buffer_per_cpu *cpu_buffer;
3116
3117 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3118 return 0;
3119
3120 cpu_buffer = buffer->buffers[cpu];
3121 return cpu_buffer->read;
3122}
3123EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
3124
3125/**
3039 * ring_buffer_entries - get the number of entries in a buffer 3126 * ring_buffer_entries - get the number of entries in a buffer
3040 * @buffer: The ring buffer 3127 * @buffer: The ring buffer
3041 * 3128 *
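
ring_buffer_dropped_events_cpu() and ring_buffer_read_events_cpu() above add two more per-CPU counters alongside the existing entries/overrun ones, which the per-CPU stats output can then report. A hypothetical consumer (the function name is invented, only exported ring-buffer accessors are used) would gather them like this:

static void report_cpu_stats(struct ring_buffer *buffer, int cpu)
{
        pr_info("cpu%d: entries=%lu overrun=%lu commit_overrun=%lu dropped=%lu read=%lu\n",
                cpu,
                ring_buffer_entries_cpu(buffer, cpu),
                ring_buffer_overrun_cpu(buffer, cpu),
                ring_buffer_commit_overrun_cpu(buffer, cpu),
                ring_buffer_dropped_events_cpu(buffer, cpu),
                ring_buffer_read_events_cpu(buffer, cpu));
}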
@@ -3260,6 +3347,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
3260 * Splice the empty reader page into the list around the head. 3347 * Splice the empty reader page into the list around the head.
3261 */ 3348 */
3262 reader = rb_set_head_page(cpu_buffer); 3349 reader = rb_set_head_page(cpu_buffer);
3350 if (!reader)
3351 goto out;
3263 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); 3352 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
3264 cpu_buffer->reader_page->list.prev = reader->list.prev; 3353 cpu_buffer->reader_page->list.prev = reader->list.prev;
3265 3354
@@ -3392,7 +3481,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
3392 /* check for end of page padding */ 3481 /* check for end of page padding */
3393 if ((iter->head >= rb_page_size(iter->head_page)) && 3482 if ((iter->head >= rb_page_size(iter->head_page)) &&
3394 (iter->head_page != cpu_buffer->commit_page)) 3483 (iter->head_page != cpu_buffer->commit_page))
3395 rb_advance_iter(iter); 3484 rb_inc_iter(iter);
3396} 3485}
3397 3486
3398static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) 3487static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
@@ -3778,12 +3867,17 @@ void
3778ring_buffer_read_finish(struct ring_buffer_iter *iter) 3867ring_buffer_read_finish(struct ring_buffer_iter *iter)
3779{ 3868{
3780 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3869 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3870 unsigned long flags;
3781 3871
3782 /* 3872 /*
3783 * Ring buffer is disabled from recording, here's a good place 3873 * Ring buffer is disabled from recording, here's a good place
3784 * to check the integrity of the ring buffer. 3874 * to check the integrity of the ring buffer.
3875 * Must prevent readers from trying to read, as the check
3876 * clears the HEAD page and readers require it.
3785 */ 3877 */
3878 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3786 rb_check_pages(cpu_buffer); 3879 rb_check_pages(cpu_buffer);
3880 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3787 3881
3788 atomic_dec(&cpu_buffer->record_disabled); 3882 atomic_dec(&cpu_buffer->record_disabled);
3789 atomic_dec(&cpu_buffer->buffer->resize_disabled); 3883 atomic_dec(&cpu_buffer->buffer->resize_disabled);
@@ -3864,9 +3958,10 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3864 local_set(&cpu_buffer->reader_page->page->commit, 0); 3958 local_set(&cpu_buffer->reader_page->page->commit, 0);
3865 cpu_buffer->reader_page->read = 0; 3959 cpu_buffer->reader_page->read = 0;
3866 3960
3867 local_set(&cpu_buffer->commit_overrun, 0);
3868 local_set(&cpu_buffer->entries_bytes, 0); 3961 local_set(&cpu_buffer->entries_bytes, 0);
3869 local_set(&cpu_buffer->overrun, 0); 3962 local_set(&cpu_buffer->overrun, 0);
3963 local_set(&cpu_buffer->commit_overrun, 0);
3964 local_set(&cpu_buffer->dropped_events, 0);
3870 local_set(&cpu_buffer->entries, 0); 3965 local_set(&cpu_buffer->entries, 0);
3871 local_set(&cpu_buffer->committing, 0); 3966 local_set(&cpu_buffer->committing, 0);
3872 local_set(&cpu_buffer->commits, 0); 3967 local_set(&cpu_buffer->commits, 0);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 31e4f55773f1..66338c4f7f4b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * Based on code from the latency_tracer, that is: 10 * Based on code from the latency_tracer, that is:
11 * Copyright (C) 2004-2006 Ingo Molnar 11 * Copyright (C) 2004-2006 Ingo Molnar
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 Nadia Yvette Chambers
13 */ 13 */
14#include <linux/ring_buffer.h> 14#include <linux/ring_buffer.h>
15#include <generated/utsrelease.h> 15#include <generated/utsrelease.h>
@@ -19,6 +19,7 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/irq_work.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
23#include <linux/pagemap.h> 24#include <linux/pagemap.h>
24#include <linux/hardirq.h> 25#include <linux/hardirq.h>
@@ -38,6 +39,7 @@
38#include <linux/poll.h> 39#include <linux/poll.h>
39#include <linux/nmi.h> 40#include <linux/nmi.h>
40#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/sched/rt.h>
41 43
42#include "trace.h" 44#include "trace.h"
43#include "trace_output.h" 45#include "trace_output.h"
@@ -78,6 +80,21 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
78} 80}
79 81
80/* 82/*
83 * To prevent the comm cache from being overwritten when no
84 * tracing is active, only save the comm when a trace event
85 * occurred.
86 */
87static DEFINE_PER_CPU(bool, trace_cmdline_save);
88
89/*
90 * When a reader is waiting for data, then this variable is
91 * set to true.
92 */
93static bool trace_wakeup_needed;
94
95static struct irq_work trace_work_wakeup;
96
97/*
81 * Kill all tracing for good (never come back). 98 * Kill all tracing for good (never come back).
82 * It is initialized to 1 but will turn to zero if the initialization 99 * It is initialized to 1 but will turn to zero if the initialization
83 * of the tracer is successful. But that is the only place that sets 100 * of the tracer is successful. But that is the only place that sets
@@ -115,7 +132,7 @@ static char *default_bootup_tracer;
115 132
116static int __init set_cmdline_ftrace(char *str) 133static int __init set_cmdline_ftrace(char *str)
117{ 134{
118 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); 135 strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
119 default_bootup_tracer = bootup_tracer_buf; 136 default_bootup_tracer = bootup_tracer_buf;
120 /* We are using ftrace early, expand it */ 137 /* We are using ftrace early, expand it */
121 ring_buffer_expanded = 1; 138 ring_buffer_expanded = 1;
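The strncpy() to strlcpy() change matters because strncpy() leaves the destination without a terminating NUL whenever the source is at least as long as the buffer, whereas strlcpy() always terminates and truncates. A tiny illustration with a hypothetical 8-byte buffer:

    #include <linux/string.h>

    static void copy_example(void)
    {
            char dst[8];

            /* strncpy(): no trailing '\0' here, since the source is longer */
            strncpy(dst, "function_graph", sizeof(dst));

            /* strlcpy(): always terminated, dst ends up as "functio" */
            strlcpy(dst, "function_graph", sizeof(dst));
    }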
@@ -139,6 +156,18 @@ static int __init set_ftrace_dump_on_oops(char *str)
139} 156}
140__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
141 158
159
160static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
161static char *trace_boot_options __initdata;
162
163static int __init set_trace_boot_options(char *str)
164{
165 strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
166 trace_boot_options = trace_boot_options_buf;
167 return 0;
168}
169__setup("trace_options=", set_trace_boot_options);
170
142unsigned long long ns2usecs(cycle_t nsec) 171unsigned long long ns2usecs(cycle_t nsec)
143{ 172{
144 nsec += 500; 173 nsec += 500;
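The new trace_options= boot parameter accepts the same strings that can be written to the trace_options debugfs file created later in this file, e.g. booting with trace_options=stacktrace (any entry of trace_options[], optionally prefixed with "no", or a tracer-specific option should work). The run-time equivalent from userspace is a plain write; a minimal sketch, assuming debugfs is mounted at /sys/kernel/debug:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char *opt = "stacktrace";   /* prefix with "no" to clear it */
            int fd = open("/sys/kernel/debug/tracing/trace_options", O_WRONLY);

            if (fd < 0)
                    return 1;
            if (write(fd, opt, strlen(opt)) < 0) {
                    close(fd);
                    return 1;
            }
            return close(fd) ? 1 : 0;
    }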
@@ -198,20 +227,9 @@ static struct trace_array max_tr;
198 227
199static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); 228static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
200 229
201/* tracer_enabled is used to toggle activation of a tracer */
202static int tracer_enabled = 1;
203
204/**
205 * tracing_is_enabled - return tracer_enabled status
206 *
207 * This function is used by other tracers to know the status
208 * of the tracer_enabled flag. Tracers may use this function
209 * to know if it should enable their features when starting
210 * up. See irqsoff tracer for an example (start_irqsoff_tracer).
211 */
212int tracing_is_enabled(void) 230int tracing_is_enabled(void)
213{ 231{
214 return tracer_enabled; 232 return tracing_is_on();
215} 233}
216 234
217/* 235/*
@@ -232,7 +250,7 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
232static struct tracer *trace_types __read_mostly; 250static struct tracer *trace_types __read_mostly;
233 251
234/* current_trace points to the tracer that is currently active */ 252/* current_trace points to the tracer that is currently active */
235static struct tracer *current_trace __read_mostly; 253static struct tracer *current_trace __read_mostly = &nop_trace;
236 254
237/* 255/*
238 * trace_types_lock is used to protect the trace_types list. 256 * trace_types_lock is used to protect the trace_types list.
@@ -333,12 +351,18 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
333static int trace_stop_count; 351static int trace_stop_count;
334static DEFINE_RAW_SPINLOCK(tracing_start_lock); 352static DEFINE_RAW_SPINLOCK(tracing_start_lock);
335 353
336static void wakeup_work_handler(struct work_struct *work) 354/**
355 * trace_wake_up - wake up tasks waiting for trace input
356 *
357 * Runs as an irq_work callback to wake up any task that is blocked on the
358 * trace_wait queue. This is used with trace_poll for tasks polling the
359 * trace.
360 */
361static void trace_wake_up(struct irq_work *work)
337{ 362{
338 wake_up(&trace_wait); 363 wake_up_all(&trace_wait);
339}
340 364
341static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); 365}
342 366
343/** 367/**
344 * tracing_on - enable tracing buffers 368 * tracing_on - enable tracing buffers
@@ -393,22 +417,6 @@ int tracing_is_on(void)
393} 417}
394EXPORT_SYMBOL_GPL(tracing_is_on); 418EXPORT_SYMBOL_GPL(tracing_is_on);
395 419
396/**
397 * trace_wake_up - wake up tasks waiting for trace input
398 *
399 * Schedules a delayed work to wake up any task that is blocked on the
400 * trace_wait queue. These is used with trace_poll for tasks polling the
401 * trace.
402 */
403void trace_wake_up(void)
404{
405 const unsigned long delay = msecs_to_jiffies(2);
406
407 if (trace_flags & TRACE_ITER_BLOCK)
408 return;
409 schedule_delayed_work(&wakeup_work, delay);
410}
411
412static int __init set_buf_size(char *str) 420static int __init set_buf_size(char *str)
413{ 421{
414 unsigned long buf_size; 422 unsigned long buf_size;
@@ -431,7 +439,7 @@ static int __init set_tracing_thresh(char *str)
431 439
432 if (!str) 440 if (!str)
433 return 0; 441 return 0;
434 ret = strict_strtoul(str, 0, &threshold); 442 ret = kstrtoul(str, 0, &threshold);
435 if (ret < 0) 443 if (ret < 0)
436 return 0; 444 return 0;
437 tracing_thresh = threshold * 1000; 445 tracing_thresh = threshold * 1000;
@@ -477,10 +485,12 @@ static const char *trace_options[] = {
477static struct { 485static struct {
478 u64 (*func)(void); 486 u64 (*func)(void);
479 const char *name; 487 const char *name;
488 int in_ns; /* is this clock in nanoseconds? */
480} trace_clocks[] = { 489} trace_clocks[] = {
481 { trace_clock_local, "local" }, 490 { trace_clock_local, "local", 1 },
482 { trace_clock_global, "global" }, 491 { trace_clock_global, "global", 1 },
483 { trace_clock_counter, "counter" }, 492 { trace_clock_counter, "counter", 0 },
493 ARCH_TRACE_CLOCKS
484}; 494};
485 495
486int trace_clock_id; 496int trace_clock_id;
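The new in_ns field is what later hunks consult to decide whether a timestamp can be converted with ns2usecs() or has to be printed as a raw counter value (the counter clock above, and any TSC-style clock an architecture appends through ARCH_TRACE_CLOCKS, set it to 0). Condensed from the tracing_stats_read() change further down, the consumer side looks roughly like this (print_ts() is an illustrative wrapper inside trace.c, not a function in the patch):

    /* condensed from the tracing_stats_read() change further down */
    static void print_ts(struct trace_seq *s, u64 ts)
    {
            if (trace_clocks[trace_clock_id].in_ns) {
                    unsigned long usec_rem;

                    ts = ns2usecs(ts);
                    usec_rem = do_div(ts, USEC_PER_SEC);
                    trace_seq_printf(s, "ts: %5llu.%06lu\n", ts, usec_rem);
            } else {
                    /* counter/TSC style clocks: print the raw value */
                    trace_seq_printf(s, "ts: %llu\n", ts);
            }
    }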
@@ -694,18 +704,22 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
694void 704void
695update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 705update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
696{ 706{
697 struct ring_buffer *buf = tr->buffer; 707 struct ring_buffer *buf;
698 708
699 if (trace_stop_count) 709 if (trace_stop_count)
700 return; 710 return;
701 711
702 WARN_ON_ONCE(!irqs_disabled()); 712 WARN_ON_ONCE(!irqs_disabled());
703 if (!current_trace->use_max_tr) { 713
704 WARN_ON_ONCE(1); 714 if (!current_trace->allocated_snapshot) {
715 /* Only the nop tracer should hit this when disabling */
716 WARN_ON_ONCE(current_trace != &nop_trace);
705 return; 717 return;
706 } 718 }
719
707 arch_spin_lock(&ftrace_max_lock); 720 arch_spin_lock(&ftrace_max_lock);
708 721
722 buf = tr->buffer;
709 tr->buffer = max_tr.buffer; 723 tr->buffer = max_tr.buffer;
710 max_tr.buffer = buf; 724 max_tr.buffer = buf;
711 725
@@ -730,8 +744,9 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
730 return; 744 return;
731 745
732 WARN_ON_ONCE(!irqs_disabled()); 746 WARN_ON_ONCE(!irqs_disabled());
733 if (!current_trace->use_max_tr) { 747 if (!current_trace->allocated_snapshot) {
734 WARN_ON_ONCE(1); 748 /* Only the nop tracer should hit this when disabling */
749 WARN_ON_ONCE(current_trace != &nop_trace);
735 return; 750 return;
736 } 751 }
737 752
@@ -757,6 +772,40 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
757} 772}
758#endif /* CONFIG_TRACER_MAX_TRACE */ 773#endif /* CONFIG_TRACER_MAX_TRACE */
759 774
775static void default_wait_pipe(struct trace_iterator *iter)
776{
777 DEFINE_WAIT(wait);
778
779 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
780
781 /*
782 * The events can happen in critical sections where
783 * checking a work queue can cause deadlocks.
784 * After adding a task to the queue, this flag is set
785 * only to notify events to try to wake up the queue
786 * using irq_work.
787 *
788 * We don't clear it even if the buffer is no longer
789 * empty. The flag only causes the next event to run
790 * irq_work to do the work queue wake up. The worst
791 * that can happen if we race with !trace_empty() is that
792 * an event will cause an irq_work to try to wake up
793 * an empty queue.
794 *
795 * There's no reason to protect this flag either, as
796 * the work queue and irq_work logic will do the necessary
797 * synchronization for the wake ups. The only thing
798 * that is necessary is that the wake up happens after
799 * a task has been queued. It's OK for spurious wake ups.
800 */
801 trace_wakeup_needed = true;
802
803 if (trace_empty(iter))
804 schedule();
805
806 finish_wait(&trace_wait, &wait);
807}
808
760/** 809/**
761 * register_tracer - register a tracer with the ftrace system. 810 * register_tracer - register a tracer with the ftrace system.
762 * @type - the plugin for the tracer 811 * @type - the plugin for the tracer
@@ -819,10 +868,13 @@ int register_tracer(struct tracer *type)
819 868
820 current_trace = type; 869 current_trace = type;
821 870
822 /* If we expanded the buffers, make sure the max is expanded too */ 871 if (type->use_max_tr) {
823 if (ring_buffer_expanded && type->use_max_tr) 872 /* If we expanded the buffers, make sure the max is expanded too */
824 ring_buffer_resize(max_tr.buffer, trace_buf_size, 873 if (ring_buffer_expanded)
825 RING_BUFFER_ALL_CPUS); 874 ring_buffer_resize(max_tr.buffer, trace_buf_size,
875 RING_BUFFER_ALL_CPUS);
876 type->allocated_snapshot = true;
877 }
826 878
827 /* the test is responsible for initializing and enabling */ 879 /* the test is responsible for initializing and enabling */
828 pr_info("Testing tracer %s: ", type->name); 880 pr_info("Testing tracer %s: ", type->name);
@@ -838,10 +890,14 @@ int register_tracer(struct tracer *type)
838 /* Only reset on passing, to avoid touching corrupted buffers */ 890 /* Only reset on passing, to avoid touching corrupted buffers */
839 tracing_reset_online_cpus(tr); 891 tracing_reset_online_cpus(tr);
840 892
841 /* Shrink the max buffer again */ 893 if (type->use_max_tr) {
842 if (ring_buffer_expanded && type->use_max_tr) 894 type->allocated_snapshot = false;
843 ring_buffer_resize(max_tr.buffer, 1, 895
844 RING_BUFFER_ALL_CPUS); 896 /* Shrink the max buffer again */
897 if (ring_buffer_expanded)
898 ring_buffer_resize(max_tr.buffer, 1,
899 RING_BUFFER_ALL_CPUS);
900 }
845 901
846 printk(KERN_CONT "PASSED\n"); 902 printk(KERN_CONT "PASSED\n");
847 } 903 }
@@ -875,36 +931,13 @@ int register_tracer(struct tracer *type)
875 return ret; 931 return ret;
876} 932}
877 933
878void unregister_tracer(struct tracer *type)
879{
880 struct tracer **t;
881
882 mutex_lock(&trace_types_lock);
883 for (t = &trace_types; *t; t = &(*t)->next) {
884 if (*t == type)
885 goto found;
886 }
887 pr_info("Tracer %s not registered\n", type->name);
888 goto out;
889
890 found:
891 *t = (*t)->next;
892
893 if (type == current_trace && tracer_enabled) {
894 tracer_enabled = 0;
895 tracing_stop();
896 if (current_trace->stop)
897 current_trace->stop(&global_trace);
898 current_trace = &nop_trace;
899 }
900out:
901 mutex_unlock(&trace_types_lock);
902}
903
904void tracing_reset(struct trace_array *tr, int cpu) 934void tracing_reset(struct trace_array *tr, int cpu)
905{ 935{
906 struct ring_buffer *buffer = tr->buffer; 936 struct ring_buffer *buffer = tr->buffer;
907 937
938 if (!buffer)
939 return;
940
908 ring_buffer_record_disable(buffer); 941 ring_buffer_record_disable(buffer);
909 942
910 /* Make sure all commits have finished */ 943 /* Make sure all commits have finished */
@@ -919,6 +952,9 @@ void tracing_reset_online_cpus(struct trace_array *tr)
919 struct ring_buffer *buffer = tr->buffer; 952 struct ring_buffer *buffer = tr->buffer;
920 int cpu; 953 int cpu;
921 954
955 if (!buffer)
956 return;
957
922 ring_buffer_record_disable(buffer); 958 ring_buffer_record_disable(buffer);
923 959
924 /* Make sure all commits have finished */ 960 /* Make sure all commits have finished */
@@ -1131,10 +1167,14 @@ void trace_find_cmdline(int pid, char comm[])
1131 1167
1132void tracing_record_cmdline(struct task_struct *tsk) 1168void tracing_record_cmdline(struct task_struct *tsk)
1133{ 1169{
1134 if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || 1170 if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
1135 !tracing_is_on())
1136 return; 1171 return;
1137 1172
1173 if (!__this_cpu_read(trace_cmdline_save))
1174 return;
1175
1176 __this_cpu_write(trace_cmdline_save, false);
1177
1138 trace_save_cmdline(tsk); 1178 trace_save_cmdline(tsk);
1139} 1179}
1140 1180
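The per-CPU trace_cmdline_save flag turns comm recording into a cheap handshake between two hunks of this patch: the commit path (__buffer_unlock_commit(), added below) raises the flag on every event, and tracing_record_cmdline() above only pays for trace_save_cmdline() when it finds the flag raised, then lowers it. Condensed, the two sides of that handshake are (a restatement of code in this patch, not new API):

    /* writer/commit side (__buffer_unlock_commit(), added below), per event: */
    __this_cpu_write(trace_cmdline_save, true);

    /* sched-switch side, in tracing_record_cmdline() above: */
    if (__this_cpu_read(trace_cmdline_save)) {
            __this_cpu_write(trace_cmdline_save, false);
            trace_save_cmdline(tsk);        /* only now pay for the pid->comm save */
    }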
@@ -1146,7 +1186,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1146 1186
1147 entry->preempt_count = pc & 0xff; 1187 entry->preempt_count = pc & 0xff;
1148 entry->pid = (tsk) ? tsk->pid : 0; 1188 entry->pid = (tsk) ? tsk->pid : 0;
1149 entry->padding = 0;
1150 entry->flags = 1189 entry->flags =
1151#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1190#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1152 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1191 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1178,27 +1217,36 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
1178 return event; 1217 return event;
1179} 1218}
1180 1219
1220void
1221__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
1222{
1223 __this_cpu_write(trace_cmdline_save, true);
1224 if (trace_wakeup_needed) {
1225 trace_wakeup_needed = false;
1226 /* irq_work_queue() supplies its own memory barriers */
1227 irq_work_queue(&trace_work_wakeup);
1228 }
1229 ring_buffer_unlock_commit(buffer, event);
1230}
1231
1181static inline void 1232static inline void
1182__trace_buffer_unlock_commit(struct ring_buffer *buffer, 1233__trace_buffer_unlock_commit(struct ring_buffer *buffer,
1183 struct ring_buffer_event *event, 1234 struct ring_buffer_event *event,
1184 unsigned long flags, int pc, 1235 unsigned long flags, int pc)
1185 int wake)
1186{ 1236{
1187 ring_buffer_unlock_commit(buffer, event); 1237 __buffer_unlock_commit(buffer, event);
1188 1238
1189 ftrace_trace_stack(buffer, flags, 6, pc); 1239 ftrace_trace_stack(buffer, flags, 6, pc);
1190 ftrace_trace_userstack(buffer, flags, pc); 1240 ftrace_trace_userstack(buffer, flags, pc);
1191
1192 if (wake)
1193 trace_wake_up();
1194} 1241}
1195 1242
1196void trace_buffer_unlock_commit(struct ring_buffer *buffer, 1243void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1197 struct ring_buffer_event *event, 1244 struct ring_buffer_event *event,
1198 unsigned long flags, int pc) 1245 unsigned long flags, int pc)
1199{ 1246{
1200 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); 1247 __trace_buffer_unlock_commit(buffer, event, flags, pc);
1201} 1248}
1249EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1202 1250
1203struct ring_buffer_event * 1251struct ring_buffer_event *
1204trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, 1252trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
@@ -1215,29 +1263,21 @@ void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
1215 struct ring_buffer_event *event, 1263 struct ring_buffer_event *event,
1216 unsigned long flags, int pc) 1264 unsigned long flags, int pc)
1217{ 1265{
1218 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); 1266 __trace_buffer_unlock_commit(buffer, event, flags, pc);
1219} 1267}
1220EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 1268EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
1221 1269
1222void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, 1270void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1223 struct ring_buffer_event *event, 1271 struct ring_buffer_event *event,
1224 unsigned long flags, int pc) 1272 unsigned long flags, int pc,
1225{ 1273 struct pt_regs *regs)
1226 __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
1227}
1228EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
1229
1230void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
1231 struct ring_buffer_event *event,
1232 unsigned long flags, int pc,
1233 struct pt_regs *regs)
1234{ 1274{
1235 ring_buffer_unlock_commit(buffer, event); 1275 __buffer_unlock_commit(buffer, event);
1236 1276
1237 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); 1277 ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
1238 ftrace_trace_userstack(buffer, flags, pc); 1278 ftrace_trace_userstack(buffer, flags, pc);
1239} 1279}
1240EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); 1280EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
1241 1281
1242void trace_current_buffer_discard_commit(struct ring_buffer *buffer, 1282void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1243 struct ring_buffer_event *event) 1283 struct ring_buffer_event *event)
@@ -1269,7 +1309,7 @@ trace_function(struct trace_array *tr,
1269 entry->parent_ip = parent_ip; 1309 entry->parent_ip = parent_ip;
1270 1310
1271 if (!filter_check_discard(call, entry, buffer, event)) 1311 if (!filter_check_discard(call, entry, buffer, event))
1272 ring_buffer_unlock_commit(buffer, event); 1312 __buffer_unlock_commit(buffer, event);
1273} 1313}
1274 1314
1275void 1315void
@@ -1313,7 +1353,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1313 */ 1353 */
1314 preempt_disable_notrace(); 1354 preempt_disable_notrace();
1315 1355
1316 use_stack = ++__get_cpu_var(ftrace_stack_reserve); 1356 use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
1317 /* 1357 /*
1318 * We don't need any atomic variables, just a barrier. 1358 * We don't need any atomic variables, just a barrier.
1319 * If an interrupt comes in, we don't care, because it would 1359 * If an interrupt comes in, we don't care, because it would
@@ -1362,12 +1402,12 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1362 entry->size = trace.nr_entries; 1402 entry->size = trace.nr_entries;
1363 1403
1364 if (!filter_check_discard(call, entry, buffer, event)) 1404 if (!filter_check_discard(call, entry, buffer, event))
1365 ring_buffer_unlock_commit(buffer, event); 1405 __buffer_unlock_commit(buffer, event);
1366 1406
1367 out: 1407 out:
1368 /* Again, don't let gcc optimize things here */ 1408 /* Again, don't let gcc optimize things here */
1369 barrier(); 1409 barrier();
1370 __get_cpu_var(ftrace_stack_reserve)--; 1410 __this_cpu_dec(ftrace_stack_reserve);
1371 preempt_enable_notrace(); 1411 preempt_enable_notrace();
1372 1412
1373} 1413}
@@ -1458,7 +1498,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1458 1498
1459 save_stack_trace_user(&trace); 1499 save_stack_trace_user(&trace);
1460 if (!filter_check_discard(call, entry, buffer, event)) 1500 if (!filter_check_discard(call, entry, buffer, event))
1461 ring_buffer_unlock_commit(buffer, event); 1501 __buffer_unlock_commit(buffer, event);
1462 1502
1463 out_drop_count: 1503 out_drop_count:
1464 __this_cpu_dec(user_stack_count); 1504 __this_cpu_dec(user_stack_count);
@@ -1495,7 +1535,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer;
1495static char *get_trace_buf(void) 1535static char *get_trace_buf(void)
1496{ 1536{
1497 struct trace_buffer_struct *percpu_buffer; 1537 struct trace_buffer_struct *percpu_buffer;
1498 struct trace_buffer_struct *buffer;
1499 1538
1500 /* 1539 /*
1501 * If we have allocated per cpu buffers, then we do not 1540 * If we have allocated per cpu buffers, then we do not
@@ -1513,9 +1552,7 @@ static char *get_trace_buf(void)
1513 if (!percpu_buffer) 1552 if (!percpu_buffer)
1514 return NULL; 1553 return NULL;
1515 1554
1516 buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); 1555 return this_cpu_ptr(&percpu_buffer->buffer[0]);
1517
1518 return buffer->buffer;
1519} 1556}
1520 1557
1521static int alloc_percpu_trace_buffer(void) 1558static int alloc_percpu_trace_buffer(void)
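The stack-reserve and get_trace_buf() hunks above replace open-coded __get_cpu_var() and per_cpu_ptr(..., smp_processor_id()) arithmetic with the this_cpu_*() accessors. A small sketch of the idiom on hypothetical per-CPU data (my_depth, my_bufs and grab_buf() are illustrative; as in the patch, callers are assumed to run with preemption disabled):

    #include <linux/percpu.h>

    struct my_buf {
            char data[128];
    };

    static DEFINE_PER_CPU(int, my_depth);
    static DEFINE_PER_CPU(struct my_buf, my_bufs);

    static char *grab_buf(void)
    {
            /* bump and read this CPU's nesting counter in one operation */
            if (__this_cpu_inc_return(my_depth) > 1)
                    return NULL;                    /* already in use on this CPU */

            return this_cpu_ptr(&my_bufs)->data;    /* this CPU's copy, no smp_processor_id() */
    }

    static void drop_buf(void)
    {
            __this_cpu_dec(my_depth);
    }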
@@ -1559,10 +1596,10 @@ static int alloc_percpu_trace_buffer(void)
1559 return -ENOMEM; 1596 return -ENOMEM;
1560} 1597}
1561 1598
1599static int buffers_allocated;
1600
1562void trace_printk_init_buffers(void) 1601void trace_printk_init_buffers(void)
1563{ 1602{
1564 static int buffers_allocated;
1565
1566 if (buffers_allocated) 1603 if (buffers_allocated)
1567 return; 1604 return;
1568 1605
@@ -1571,7 +1608,38 @@ void trace_printk_init_buffers(void)
1571 1608
1572 pr_info("ftrace: Allocated trace_printk buffers\n"); 1609 pr_info("ftrace: Allocated trace_printk buffers\n");
1573 1610
1611 /* Expand the buffers to set size */
1612 tracing_update_buffers();
1613
1574 buffers_allocated = 1; 1614 buffers_allocated = 1;
1615
1616 /*
1617 * trace_printk_init_buffers() can be called by modules.
1618 * If that happens, then we need to start cmdline recording
1619 * directly here. If the global_trace.buffer is already
1620 * allocated here, then this was called by module code.
1621 */
1622 if (global_trace.buffer)
1623 tracing_start_cmdline_record();
1624}
1625
1626void trace_printk_start_comm(void)
1627{
1628 /* Start tracing comms if trace printk is set */
1629 if (!buffers_allocated)
1630 return;
1631 tracing_start_cmdline_record();
1632}
1633
1634static void trace_printk_start_stop_comm(int enabled)
1635{
1636 if (!buffers_allocated)
1637 return;
1638
1639 if (enabled)
1640 tracing_start_cmdline_record();
1641 else
1642 tracing_stop_cmdline_record();
1575} 1643}
1576 1644
1577/** 1645/**
@@ -1622,7 +1690,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1622 1690
1623 memcpy(entry->buf, tbuffer, sizeof(u32) * len); 1691 memcpy(entry->buf, tbuffer, sizeof(u32) * len);
1624 if (!filter_check_discard(call, entry, buffer, event)) { 1692 if (!filter_check_discard(call, entry, buffer, event)) {
1625 ring_buffer_unlock_commit(buffer, event); 1693 __buffer_unlock_commit(buffer, event);
1626 ftrace_trace_stack(buffer, flags, 6, pc); 1694 ftrace_trace_stack(buffer, flags, 6, pc);
1627 } 1695 }
1628 1696
@@ -1693,7 +1761,7 @@ int trace_array_vprintk(struct trace_array *tr,
1693 memcpy(&entry->buf, tbuffer, len); 1761 memcpy(&entry->buf, tbuffer, len);
1694 entry->buf[len] = '\0'; 1762 entry->buf[len] = '\0';
1695 if (!filter_check_discard(call, entry, buffer, event)) { 1763 if (!filter_check_discard(call, entry, buffer, event)) {
1696 ring_buffer_unlock_commit(buffer, event); 1764 __buffer_unlock_commit(buffer, event);
1697 ftrace_trace_stack(buffer, flags, 6, pc); 1765 ftrace_trace_stack(buffer, flags, 6, pc);
1698 } 1766 }
1699 out: 1767 out:
@@ -1889,21 +1957,27 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1889static void *s_start(struct seq_file *m, loff_t *pos) 1957static void *s_start(struct seq_file *m, loff_t *pos)
1890{ 1958{
1891 struct trace_iterator *iter = m->private; 1959 struct trace_iterator *iter = m->private;
1892 static struct tracer *old_tracer;
1893 int cpu_file = iter->cpu_file; 1960 int cpu_file = iter->cpu_file;
1894 void *p = NULL; 1961 void *p = NULL;
1895 loff_t l = 0; 1962 loff_t l = 0;
1896 int cpu; 1963 int cpu;
1897 1964
1898 /* copy the tracer to avoid using a global lock all around */ 1965 /*
1966 * copy the tracer to avoid using a global lock all around.
1967 * iter->trace is a copy of current_trace, the pointer to the
1968 * name may be used instead of a strcmp(), as iter->trace->name
1969 * will point to the same string as current_trace->name.
1970 */
1899 mutex_lock(&trace_types_lock); 1971 mutex_lock(&trace_types_lock);
1900 if (unlikely(old_tracer != current_trace && current_trace)) { 1972 if (unlikely(current_trace && iter->trace->name != current_trace->name))
1901 old_tracer = current_trace;
1902 *iter->trace = *current_trace; 1973 *iter->trace = *current_trace;
1903 }
1904 mutex_unlock(&trace_types_lock); 1974 mutex_unlock(&trace_types_lock);
1905 1975
1906 atomic_inc(&trace_record_cmdline_disabled); 1976 if (iter->snapshot && iter->trace->use_max_tr)
1977 return ERR_PTR(-EBUSY);
1978
1979 if (!iter->snapshot)
1980 atomic_inc(&trace_record_cmdline_disabled);
1907 1981
1908 if (*pos != iter->pos) { 1982 if (*pos != iter->pos) {
1909 iter->ent = NULL; 1983 iter->ent = NULL;
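The rewritten s_start() relies on iter->trace being a structure copy of *current_trace: until the tracer is switched, iter->trace->name aliases the very same string as current_trace->name, so a pointer comparison is enough and no strcmp() is needed. The same trick in miniature, on an illustrative struct:

    struct thing {
            const char *name;
    };

    static struct thing current_thing = { .name = "nop" };

    static void refresh(struct thing *snapshot)
    {
            /*
             * snapshot was taken with a structure copy, so snapshot->name
             * aliases current_thing.name; the pointers only diverge once
             * current_thing is replaced by something with a different name.
             */
            if (snapshot->name != current_thing.name)
                    *snapshot = current_thing;
    }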
@@ -1942,7 +2016,11 @@ static void s_stop(struct seq_file *m, void *p)
1942{ 2016{
1943 struct trace_iterator *iter = m->private; 2017 struct trace_iterator *iter = m->private;
1944 2018
1945 atomic_dec(&trace_record_cmdline_disabled); 2019 if (iter->snapshot && iter->trace->use_max_tr)
2020 return;
2021
2022 if (!iter->snapshot)
2023 atomic_dec(&trace_record_cmdline_disabled);
1946 trace_access_unlock(iter->cpu_file); 2024 trace_access_unlock(iter->cpu_file);
1947 trace_event_read_unlock(); 2025 trace_event_read_unlock();
1948} 2026}
@@ -2027,8 +2105,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2027 unsigned long total; 2105 unsigned long total;
2028 const char *name = "preemption"; 2106 const char *name = "preemption";
2029 2107
2030 if (type) 2108 name = type->name;
2031 name = type->name;
2032 2109
2033 get_total_entries(tr, &total, &entries); 2110 get_total_entries(tr, &total, &entries);
2034 2111
@@ -2327,6 +2404,27 @@ static void test_ftrace_alive(struct seq_file *m)
2327 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); 2404 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
2328} 2405}
2329 2406
2407#ifdef CONFIG_TRACER_MAX_TRACE
2408static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2409{
2410 if (iter->trace->allocated_snapshot)
2411 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
2412 else
2413 seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
2414
2415 seq_printf(m, "# Snapshot commands:\n");
2416 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
2417 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2418 seq_printf(m, "# Takes a snapshot of the main buffer.\n");
2419 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n");
2420 seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
2421 seq_printf(m, "# is not a '0' or '1')\n");
2422}
2423#else
2424/* Should never be called */
2425static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
2426#endif
2427
2330static int s_show(struct seq_file *m, void *v) 2428static int s_show(struct seq_file *m, void *v)
2331{ 2429{
2332 struct trace_iterator *iter = v; 2430 struct trace_iterator *iter = v;
@@ -2338,7 +2436,9 @@ static int s_show(struct seq_file *m, void *v)
2338 seq_puts(m, "#\n"); 2436 seq_puts(m, "#\n");
2339 test_ftrace_alive(m); 2437 test_ftrace_alive(m);
2340 } 2438 }
2341 if (iter->trace && iter->trace->print_header) 2439 if (iter->snapshot && trace_empty(iter))
2440 print_snapshot_help(m, iter);
2441 else if (iter->trace && iter->trace->print_header)
2342 iter->trace->print_header(m); 2442 iter->trace->print_header(m);
2343 else 2443 else
2344 trace_default_header(m); 2444 trace_default_header(m);
@@ -2377,7 +2477,7 @@ static const struct seq_operations tracer_seq_ops = {
2377}; 2477};
2378 2478
2379static struct trace_iterator * 2479static struct trace_iterator *
2380__tracing_open(struct inode *inode, struct file *file) 2480__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2381{ 2481{
2382 long cpu_file = (long) inode->i_private; 2482 long cpu_file = (long) inode->i_private;
2383 struct trace_iterator *iter; 2483 struct trace_iterator *iter;
@@ -2404,16 +2504,16 @@ __tracing_open(struct inode *inode, struct file *file)
2404 if (!iter->trace) 2504 if (!iter->trace)
2405 goto fail; 2505 goto fail;
2406 2506
2407 if (current_trace) 2507 *iter->trace = *current_trace;
2408 *iter->trace = *current_trace;
2409 2508
2410 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) 2509 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
2411 goto fail; 2510 goto fail;
2412 2511
2413 if (current_trace && current_trace->print_max) 2512 if (current_trace->print_max || snapshot)
2414 iter->tr = &max_tr; 2513 iter->tr = &max_tr;
2415 else 2514 else
2416 iter->tr = &global_trace; 2515 iter->tr = &global_trace;
2516 iter->snapshot = snapshot;
2417 iter->pos = -1; 2517 iter->pos = -1;
2418 mutex_init(&iter->mutex); 2518 mutex_init(&iter->mutex);
2419 iter->cpu_file = cpu_file; 2519 iter->cpu_file = cpu_file;
@@ -2426,8 +2526,13 @@ __tracing_open(struct inode *inode, struct file *file)
2426 if (ring_buffer_overruns(iter->tr->buffer)) 2526 if (ring_buffer_overruns(iter->tr->buffer))
2427 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2527 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2428 2528
2429 /* stop the trace while dumping */ 2529 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
2430 tracing_stop(); 2530 if (trace_clocks[trace_clock_id].in_ns)
2531 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2532
2533 /* stop the trace while dumping if we are not opening "snapshot" */
2534 if (!iter->snapshot)
2535 tracing_stop();
2431 2536
2432 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2537 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2433 for_each_tracing_cpu(cpu) { 2538 for_each_tracing_cpu(cpu) {
@@ -2490,8 +2595,9 @@ static int tracing_release(struct inode *inode, struct file *file)
2490 if (iter->trace && iter->trace->close) 2595 if (iter->trace && iter->trace->close)
2491 iter->trace->close(iter); 2596 iter->trace->close(iter);
2492 2597
2493 /* reenable tracing if it was previously enabled */ 2598 if (!iter->snapshot)
2494 tracing_start(); 2599 /* reenable tracing if it was previously enabled */
2600 tracing_start();
2495 mutex_unlock(&trace_types_lock); 2601 mutex_unlock(&trace_types_lock);
2496 2602
2497 mutex_destroy(&iter->mutex); 2603 mutex_destroy(&iter->mutex);
@@ -2519,7 +2625,7 @@ static int tracing_open(struct inode *inode, struct file *file)
2519 } 2625 }
2520 2626
2521 if (file->f_mode & FMODE_READ) { 2627 if (file->f_mode & FMODE_READ) {
2522 iter = __tracing_open(inode, file); 2628 iter = __tracing_open(inode, file, false);
2523 if (IS_ERR(iter)) 2629 if (IS_ERR(iter))
2524 ret = PTR_ERR(iter); 2630 ret = PTR_ERR(iter);
2525 else if (trace_flags & TRACE_ITER_LATENCY_FMT) 2631 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
@@ -2778,11 +2884,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2778 return -EINVAL; 2884 return -EINVAL;
2779} 2885}
2780 2886
2781static void set_tracer_flags(unsigned int mask, int enabled) 2887/* Some tracers require overwrite to stay enabled */
2888int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
2889{
2890 if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
2891 return -1;
2892
2893 return 0;
2894}
2895
2896int set_tracer_flag(unsigned int mask, int enabled)
2782{ 2897{
2783 /* do nothing if flag is already set */ 2898 /* do nothing if flag is already set */
2784 if (!!(trace_flags & mask) == !!enabled) 2899 if (!!(trace_flags & mask) == !!enabled)
2785 return; 2900 return 0;
2901
2902 /* Give the tracer a chance to approve the change */
2903 if (current_trace->flag_changed)
2904 if (current_trace->flag_changed(current_trace, mask, !!enabled))
2905 return -EINVAL;
2786 2906
2787 if (enabled) 2907 if (enabled)
2788 trace_flags |= mask; 2908 trace_flags |= mask;
@@ -2792,49 +2912,69 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2792 if (mask == TRACE_ITER_RECORD_CMD) 2912 if (mask == TRACE_ITER_RECORD_CMD)
2793 trace_event_enable_cmd_record(enabled); 2913 trace_event_enable_cmd_record(enabled);
2794 2914
2795 if (mask == TRACE_ITER_OVERWRITE) 2915 if (mask == TRACE_ITER_OVERWRITE) {
2796 ring_buffer_change_overwrite(global_trace.buffer, enabled); 2916 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2917#ifdef CONFIG_TRACER_MAX_TRACE
2918 ring_buffer_change_overwrite(max_tr.buffer, enabled);
2919#endif
2920 }
2921
2922 if (mask == TRACE_ITER_PRINTK)
2923 trace_printk_start_stop_comm(enabled);
2924
2925 return 0;
2797} 2926}
2798 2927
2799static ssize_t 2928static int trace_set_options(char *option)
2800tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2801 size_t cnt, loff_t *ppos)
2802{ 2929{
2803 char buf[64];
2804 char *cmp; 2930 char *cmp;
2805 int neg = 0; 2931 int neg = 0;
2806 int ret; 2932 int ret = -ENODEV;
2807 int i; 2933 int i;
2808 2934
2809 if (cnt >= sizeof(buf)) 2935 cmp = strstrip(option);
2810 return -EINVAL;
2811
2812 if (copy_from_user(&buf, ubuf, cnt))
2813 return -EFAULT;
2814
2815 buf[cnt] = 0;
2816 cmp = strstrip(buf);
2817 2936
2818 if (strncmp(cmp, "no", 2) == 0) { 2937 if (strncmp(cmp, "no", 2) == 0) {
2819 neg = 1; 2938 neg = 1;
2820 cmp += 2; 2939 cmp += 2;
2821 } 2940 }
2822 2941
2942 mutex_lock(&trace_types_lock);
2943
2823 for (i = 0; trace_options[i]; i++) { 2944 for (i = 0; trace_options[i]; i++) {
2824 if (strcmp(cmp, trace_options[i]) == 0) { 2945 if (strcmp(cmp, trace_options[i]) == 0) {
2825 set_tracer_flags(1 << i, !neg); 2946 ret = set_tracer_flag(1 << i, !neg);
2826 break; 2947 break;
2827 } 2948 }
2828 } 2949 }
2829 2950
2830 /* If no option could be set, test the specific tracer options */ 2951 /* If no option could be set, test the specific tracer options */
2831 if (!trace_options[i]) { 2952 if (!trace_options[i])
2832 mutex_lock(&trace_types_lock);
2833 ret = set_tracer_option(current_trace, cmp, neg); 2953 ret = set_tracer_option(current_trace, cmp, neg);
2834 mutex_unlock(&trace_types_lock); 2954
2835 if (ret) 2955 mutex_unlock(&trace_types_lock);
2836 return ret; 2956
2837 } 2957 return ret;
2958}
2959
2960static ssize_t
2961tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2962 size_t cnt, loff_t *ppos)
2963{
2964 char buf[64];
2965 int ret;
2966
2967 if (cnt >= sizeof(buf))
2968 return -EINVAL;
2969
2970 if (copy_from_user(&buf, ubuf, cnt))
2971 return -EFAULT;
2972
2973 buf[cnt] = 0;
2974
2975 ret = trace_set_options(buf);
2976 if (ret < 0)
2977 return ret;
2838 2978
2839 *ppos += cnt; 2979 *ppos += cnt;
2840 2980
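set_tracer_flag() now lets the current tracer veto an option change through the new flag_changed callback, and trace_keep_overwrite() is the stock veto for tracers that cannot run with TRACE_ITER_OVERWRITE cleared. A tracer would presumably opt in roughly like this (a hypothetical tracer definition; the real users, such as the latency tracers, are wired up outside this hunk):

    static int  my_latency_init(struct trace_array *tr) { return 0; }      /* placeholder */
    static void my_latency_reset(struct trace_array *tr) { }               /* placeholder */

    static struct tracer my_latency_tracer __read_mostly = {
            .name           = "my_latency",
            .init           = my_latency_init,
            .reset          = my_latency_reset,
            .use_max_tr     = true,
            /* veto clearing the overwrite option while this tracer is enabled */
            .flag_changed   = trace_keep_overwrite,
    };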
@@ -2940,56 +3080,6 @@ static const struct file_operations tracing_saved_cmdlines_fops = {
2940}; 3080};
2941 3081
2942static ssize_t 3082static ssize_t
2943tracing_ctrl_read(struct file *filp, char __user *ubuf,
2944 size_t cnt, loff_t *ppos)
2945{
2946 char buf[64];
2947 int r;
2948
2949 r = sprintf(buf, "%u\n", tracer_enabled);
2950 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2951}
2952
2953static ssize_t
2954tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2955 size_t cnt, loff_t *ppos)
2956{
2957 struct trace_array *tr = filp->private_data;
2958 unsigned long val;
2959 int ret;
2960
2961 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
2962 if (ret)
2963 return ret;
2964
2965 val = !!val;
2966
2967 mutex_lock(&trace_types_lock);
2968 if (tracer_enabled ^ val) {
2969
2970 /* Only need to warn if this is used to change the state */
2971 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2972
2973 if (val) {
2974 tracer_enabled = 1;
2975 if (current_trace->start)
2976 current_trace->start(tr);
2977 tracing_start();
2978 } else {
2979 tracer_enabled = 0;
2980 tracing_stop();
2981 if (current_trace->stop)
2982 current_trace->stop(tr);
2983 }
2984 }
2985 mutex_unlock(&trace_types_lock);
2986
2987 *ppos += cnt;
2988
2989 return cnt;
2990}
2991
2992static ssize_t
2993tracing_set_trace_read(struct file *filp, char __user *ubuf, 3083tracing_set_trace_read(struct file *filp, char __user *ubuf,
2994 size_t cnt, loff_t *ppos) 3084 size_t cnt, loff_t *ppos)
2995{ 3085{
@@ -2997,10 +3087,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
2997 int r; 3087 int r;
2998 3088
2999 mutex_lock(&trace_types_lock); 3089 mutex_lock(&trace_types_lock);
3000 if (current_trace) 3090 r = sprintf(buf, "%s\n", current_trace->name);
3001 r = sprintf(buf, "%s\n", current_trace->name);
3002 else
3003 r = sprintf(buf, "\n");
3004 mutex_unlock(&trace_types_lock); 3091 mutex_unlock(&trace_types_lock);
3005 3092
3006 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3093 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3019,6 +3106,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val)
3019 tr->data[cpu]->entries = val; 3106 tr->data[cpu]->entries = val;
3020} 3107}
3021 3108
3109/* resize @tr's buffer to the size of @size_tr's entries */
3110static int resize_buffer_duplicate_size(struct trace_array *tr,
3111 struct trace_array *size_tr, int cpu_id)
3112{
3113 int cpu, ret = 0;
3114
3115 if (cpu_id == RING_BUFFER_ALL_CPUS) {
3116 for_each_tracing_cpu(cpu) {
3117 ret = ring_buffer_resize(tr->buffer,
3118 size_tr->data[cpu]->entries, cpu);
3119 if (ret < 0)
3120 break;
3121 tr->data[cpu]->entries = size_tr->data[cpu]->entries;
3122 }
3123 } else {
3124 ret = ring_buffer_resize(tr->buffer,
3125 size_tr->data[cpu_id]->entries, cpu_id);
3126 if (ret == 0)
3127 tr->data[cpu_id]->entries =
3128 size_tr->data[cpu_id]->entries;
3129 }
3130
3131 return ret;
3132}
3133
3022static int __tracing_resize_ring_buffer(unsigned long size, int cpu) 3134static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3023{ 3135{
3024 int ret; 3136 int ret;
@@ -3030,6 +3142,10 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3030 */ 3142 */
3031 ring_buffer_expanded = 1; 3143 ring_buffer_expanded = 1;
3032 3144
3145 /* May be called before buffers are initialized */
3146 if (!global_trace.buffer)
3147 return 0;
3148
3033 ret = ring_buffer_resize(global_trace.buffer, size, cpu); 3149 ret = ring_buffer_resize(global_trace.buffer, size, cpu);
3034 if (ret < 0) 3150 if (ret < 0)
3035 return ret; 3151 return ret;
@@ -3039,23 +3155,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3039 3155
3040 ret = ring_buffer_resize(max_tr.buffer, size, cpu); 3156 ret = ring_buffer_resize(max_tr.buffer, size, cpu);
3041 if (ret < 0) { 3157 if (ret < 0) {
3042 int r = 0; 3158 int r = resize_buffer_duplicate_size(&global_trace,
3043 3159 &global_trace, cpu);
3044 if (cpu == RING_BUFFER_ALL_CPUS) {
3045 int i;
3046 for_each_tracing_cpu(i) {
3047 r = ring_buffer_resize(global_trace.buffer,
3048 global_trace.data[i]->entries,
3049 i);
3050 if (r < 0)
3051 break;
3052 }
3053 } else {
3054 r = ring_buffer_resize(global_trace.buffer,
3055 global_trace.data[cpu]->entries,
3056 cpu);
3057 }
3058
3059 if (r < 0) { 3160 if (r < 0) {
3060 /* 3161 /*
3061 * AARGH! We are left with different 3162 * AARGH! We are left with different
@@ -3152,6 +3253,7 @@ static int tracing_set_tracer(const char *buf)
3152 static struct trace_option_dentry *topts; 3253 static struct trace_option_dentry *topts;
3153 struct trace_array *tr = &global_trace; 3254 struct trace_array *tr = &global_trace;
3154 struct tracer *t; 3255 struct tracer *t;
3256 bool had_max_tr;
3155 int ret = 0; 3257 int ret = 0;
3156 3258
3157 mutex_lock(&trace_types_lock); 3259 mutex_lock(&trace_types_lock);
@@ -3176,9 +3278,24 @@ static int tracing_set_tracer(const char *buf)
3176 goto out; 3278 goto out;
3177 3279
3178 trace_branch_disable(); 3280 trace_branch_disable();
3179 if (current_trace && current_trace->reset) 3281
3282 current_trace->enabled = false;
3283
3284 if (current_trace->reset)
3180 current_trace->reset(tr); 3285 current_trace->reset(tr);
3181 if (current_trace && current_trace->use_max_tr) { 3286
3287 had_max_tr = current_trace->allocated_snapshot;
3288 current_trace = &nop_trace;
3289
3290 if (had_max_tr && !t->use_max_tr) {
3291 /*
3292 * We need to make sure that the update_max_tr sees that
3293 * current_trace changed to nop_trace to keep it from
3294 * swapping the buffers after we resize it.
3295 * The update_max_tr is called with interrupts disabled,
3296 * so a synchronize_sched() is sufficient.
3297 */
3298 synchronize_sched();
3182 /* 3299 /*
3183 * We don't free the ring buffer. instead, resize it because 3300 * We don't free the ring buffer. instead, resize it because
3184 * The max_tr ring buffer has some state (e.g. ring->clock) and 3301 * The max_tr ring buffer has some state (e.g. ring->clock) and
@@ -3186,24 +3303,19 @@ static int tracing_set_tracer(const char *buf)
3186 */ 3303 */
3187 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); 3304 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3188 set_buffer_entries(&max_tr, 1); 3305 set_buffer_entries(&max_tr, 1);
3306 tracing_reset_online_cpus(&max_tr);
3307 current_trace->allocated_snapshot = false;
3189 } 3308 }
3190 destroy_trace_option_files(topts); 3309 destroy_trace_option_files(topts);
3191 3310
3192 current_trace = &nop_trace;
3193
3194 topts = create_trace_option_files(t); 3311 topts = create_trace_option_files(t);
3195 if (t->use_max_tr) { 3312 if (t->use_max_tr && !had_max_tr) {
3196 int cpu;
3197 /* we need to make per cpu buffer sizes equivalent */ 3313 /* we need to make per cpu buffer sizes equivalent */
3198 for_each_tracing_cpu(cpu) { 3314 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3199 ret = ring_buffer_resize(max_tr.buffer, 3315 RING_BUFFER_ALL_CPUS);
3200 global_trace.data[cpu]->entries, 3316 if (ret < 0)
3201 cpu); 3317 goto out;
3202 if (ret < 0) 3318 t->allocated_snapshot = true;
3203 goto out;
3204 max_tr.data[cpu]->entries =
3205 global_trace.data[cpu]->entries;
3206 }
3207 } 3319 }
3208 3320
3209 if (t->init) { 3321 if (t->init) {
@@ -3213,6 +3325,7 @@ static int tracing_set_tracer(const char *buf)
3213 } 3325 }
3214 3326
3215 current_trace = t; 3327 current_trace = t;
3328 current_trace->enabled = true;
3216 trace_branch_enable(tr); 3329 trace_branch_enable(tr);
3217 out: 3330 out:
3218 mutex_unlock(&trace_types_lock); 3331 mutex_unlock(&trace_types_lock);
@@ -3311,8 +3424,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3311 ret = -ENOMEM; 3424 ret = -ENOMEM;
3312 goto fail; 3425 goto fail;
3313 } 3426 }
3314 if (current_trace) 3427 *iter->trace = *current_trace;
3315 *iter->trace = *current_trace;
3316 3428
3317 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 3429 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3318 ret = -ENOMEM; 3430 ret = -ENOMEM;
@@ -3325,6 +3437,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3325 if (trace_flags & TRACE_ITER_LATENCY_FMT) 3437 if (trace_flags & TRACE_ITER_LATENCY_FMT)
3326 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3438 iter->iter_flags |= TRACE_FILE_LAT_FMT;
3327 3439
3440 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
3441 if (trace_clocks[trace_clock_id].in_ns)
3442 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3443
3328 iter->cpu_file = cpu_file; 3444 iter->cpu_file = cpu_file;
3329 iter->tr = &global_trace; 3445 iter->tr = &global_trace;
3330 mutex_init(&iter->mutex); 3446 mutex_init(&iter->mutex);
@@ -3385,19 +3501,6 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3385 } 3501 }
3386} 3502}
3387 3503
3388
3389void default_wait_pipe(struct trace_iterator *iter)
3390{
3391 DEFINE_WAIT(wait);
3392
3393 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
3394
3395 if (trace_empty(iter))
3396 schedule();
3397
3398 finish_wait(&trace_wait, &wait);
3399}
3400
3401/* 3504/*
3402 * This is a make-shift waitqueue. 3505 * This is a make-shift waitqueue.
3403 * A tracer might use this callback on some rare cases: 3506 * A tracer might use this callback on some rare cases:
@@ -3446,7 +3549,7 @@ static int tracing_wait_pipe(struct file *filp)
3446 * 3549 *
3447 * iter->pos will be 0 if we haven't read anything. 3550 * iter->pos will be 0 if we haven't read anything.
3448 */ 3551 */
3449 if (!tracer_enabled && iter->pos) 3552 if (!tracing_is_enabled() && iter->pos)
3450 break; 3553 break;
3451 } 3554 }
3452 3555
@@ -3461,7 +3564,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3461 size_t cnt, loff_t *ppos) 3564 size_t cnt, loff_t *ppos)
3462{ 3565{
3463 struct trace_iterator *iter = filp->private_data; 3566 struct trace_iterator *iter = filp->private_data;
3464 static struct tracer *old_tracer;
3465 ssize_t sret; 3567 ssize_t sret;
3466 3568
3467 /* return any leftover data */ 3569 /* return any leftover data */
@@ -3473,10 +3575,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3473 3575
3474 /* copy the tracer to avoid using a global lock all around */ 3576 /* copy the tracer to avoid using a global lock all around */
3475 mutex_lock(&trace_types_lock); 3577 mutex_lock(&trace_types_lock);
3476 if (unlikely(old_tracer != current_trace && current_trace)) { 3578 if (unlikely(iter->trace->name != current_trace->name))
3477 old_tracer = current_trace;
3478 *iter->trace = *current_trace; 3579 *iter->trace = *current_trace;
3479 }
3480 mutex_unlock(&trace_types_lock); 3580 mutex_unlock(&trace_types_lock);
3481 3581
3482 /* 3582 /*
@@ -3632,7 +3732,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3632 .ops = &tracing_pipe_buf_ops, 3732 .ops = &tracing_pipe_buf_ops,
3633 .spd_release = tracing_spd_release_pipe, 3733 .spd_release = tracing_spd_release_pipe,
3634 }; 3734 };
3635 static struct tracer *old_tracer;
3636 ssize_t ret; 3735 ssize_t ret;
3637 size_t rem; 3736 size_t rem;
3638 unsigned int i; 3737 unsigned int i;
@@ -3642,10 +3741,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3642 3741
3643 /* copy the tracer to avoid using a global lock all around */ 3742 /* copy the tracer to avoid using a global lock all around */
3644 mutex_lock(&trace_types_lock); 3743 mutex_lock(&trace_types_lock);
3645 if (unlikely(old_tracer != current_trace && current_trace)) { 3744 if (unlikely(iter->trace->name != current_trace->name))
3646 old_tracer = current_trace;
3647 *iter->trace = *current_trace; 3745 *iter->trace = *current_trace;
3648 }
3649 mutex_unlock(&trace_types_lock); 3746 mutex_unlock(&trace_types_lock);
3650 3747
3651 mutex_lock(&iter->mutex); 3748 mutex_lock(&iter->mutex);
@@ -3955,7 +4052,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3955 } else 4052 } else
3956 entry->buf[cnt] = '\0'; 4053 entry->buf[cnt] = '\0';
3957 4054
3958 ring_buffer_unlock_commit(buffer, event); 4055 __buffer_unlock_commit(buffer, event);
3959 4056
3960 written = cnt; 4057 written = cnt;
3961 4058
@@ -4016,6 +4113,13 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4016 if (max_tr.buffer) 4113 if (max_tr.buffer)
4017 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); 4114 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
4018 4115
4116 /*
4117 * New clock may not be consistent with the previous clock.
4118 * Reset the buffer so that it doesn't have incomparable timestamps.
4119 */
4120 tracing_reset_online_cpus(&global_trace);
4121 tracing_reset_online_cpus(&max_tr);
4122
4019 mutex_unlock(&trace_types_lock); 4123 mutex_unlock(&trace_types_lock);
4020 4124
4021 *fpos += cnt; 4125 *fpos += cnt;
@@ -4030,6 +4134,85 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
4030 return single_open(file, tracing_clock_show, NULL); 4134 return single_open(file, tracing_clock_show, NULL);
4031} 4135}
4032 4136
4137#ifdef CONFIG_TRACER_SNAPSHOT
4138static int tracing_snapshot_open(struct inode *inode, struct file *file)
4139{
4140 struct trace_iterator *iter;
4141 int ret = 0;
4142
4143 if (file->f_mode & FMODE_READ) {
4144 iter = __tracing_open(inode, file, true);
4145 if (IS_ERR(iter))
4146 ret = PTR_ERR(iter);
4147 }
4148 return ret;
4149}
4150
4151static ssize_t
4152tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4153 loff_t *ppos)
4154{
4155 unsigned long val;
4156 int ret;
4157
4158 ret = tracing_update_buffers();
4159 if (ret < 0)
4160 return ret;
4161
4162 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4163 if (ret)
4164 return ret;
4165
4166 mutex_lock(&trace_types_lock);
4167
4168 if (current_trace->use_max_tr) {
4169 ret = -EBUSY;
4170 goto out;
4171 }
4172
4173 switch (val) {
4174 case 0:
4175 if (current_trace->allocated_snapshot) {
4176 /* free spare buffer */
4177 ring_buffer_resize(max_tr.buffer, 1,
4178 RING_BUFFER_ALL_CPUS);
4179 set_buffer_entries(&max_tr, 1);
4180 tracing_reset_online_cpus(&max_tr);
4181 current_trace->allocated_snapshot = false;
4182 }
4183 break;
4184 case 1:
4185 if (!current_trace->allocated_snapshot) {
4186 /* allocate spare buffer */
4187 ret = resize_buffer_duplicate_size(&max_tr,
4188 &global_trace, RING_BUFFER_ALL_CPUS);
4189 if (ret < 0)
4190 break;
4191 current_trace->allocated_snapshot = true;
4192 }
4193
4194 local_irq_disable();
4195 /* Now, we're going to swap */
4196 update_max_tr(&global_trace, current, smp_processor_id());
4197 local_irq_enable();
4198 break;
4199 default:
4200 if (current_trace->allocated_snapshot)
4201 tracing_reset_online_cpus(&max_tr);
4202 break;
4203 }
4204
4205 if (ret >= 0) {
4206 *ppos += cnt;
4207 ret = cnt;
4208 }
4209out:
4210 mutex_unlock(&trace_types_lock);
4211 return ret;
4212}
4213#endif /* CONFIG_TRACER_SNAPSHOT */
4214
4215
4033static const struct file_operations tracing_max_lat_fops = { 4216static const struct file_operations tracing_max_lat_fops = {
4034 .open = tracing_open_generic, 4217 .open = tracing_open_generic,
4035 .read = tracing_max_lat_read, 4218 .read = tracing_max_lat_read,
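Together with snapshot_fops and the debugfs registration added further down, tracing_snapshot_write() backs the documented echo 0/1/2 interface. A minimal userspace sketch that allocates and takes a snapshot, then dumps it, assuming CONFIG_TRACER_SNAPSHOT=y and debugfs mounted at /sys/kernel/debug:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    #define SNAP "/sys/kernel/debug/tracing/snapshot"

    int main(void)
    {
            char buf[4096];
            ssize_t n;
            int fd;

            /* "1": allocate the spare buffer if necessary and swap it in */
            fd = open(SNAP, O_WRONLY);
            if (fd < 0 || write(fd, "1\n", 2) < 0)
                    return 1;
            close(fd);

            /* read back the frozen copy of the trace */
            fd = open(SNAP, O_RDONLY);
            if (fd < 0)
                    return 1;
            while ((n = read(fd, buf, sizeof(buf))) > 0)
                    fwrite(buf, 1, n, stdout);
            close(fd);
            return 0;
    }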
@@ -4037,13 +4220,6 @@ static const struct file_operations tracing_max_lat_fops = {
4037 .llseek = generic_file_llseek, 4220 .llseek = generic_file_llseek,
4038}; 4221};
4039 4222
4040static const struct file_operations tracing_ctrl_fops = {
4041 .open = tracing_open_generic,
4042 .read = tracing_ctrl_read,
4043 .write = tracing_ctrl_write,
4044 .llseek = generic_file_llseek,
4045};
4046
4047static const struct file_operations set_tracer_fops = { 4223static const struct file_operations set_tracer_fops = {
4048 .open = tracing_open_generic, 4224 .open = tracing_open_generic,
4049 .read = tracing_set_trace_read, 4225 .read = tracing_set_trace_read,
@@ -4093,6 +4269,16 @@ static const struct file_operations trace_clock_fops = {
4093 .write = tracing_clock_write, 4269 .write = tracing_clock_write,
4094}; 4270};
4095 4271
4272#ifdef CONFIG_TRACER_SNAPSHOT
4273static const struct file_operations snapshot_fops = {
4274 .open = tracing_snapshot_open,
4275 .read = seq_read,
4276 .write = tracing_snapshot_write,
4277 .llseek = tracing_seek,
4278 .release = tracing_release,
4279};
4280#endif /* CONFIG_TRACER_SNAPSHOT */
4281
4096struct ftrace_buffer_info { 4282struct ftrace_buffer_info {
4097 struct trace_array *tr; 4283 struct trace_array *tr;
4098 void *spare; 4284 void *spare;
@@ -4260,13 +4446,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4260 return -ENOMEM; 4446 return -ENOMEM;
4261 4447
4262 if (*ppos & (PAGE_SIZE - 1)) { 4448 if (*ppos & (PAGE_SIZE - 1)) {
4263 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
4264 ret = -EINVAL; 4449 ret = -EINVAL;
4265 goto out; 4450 goto out;
4266 } 4451 }
4267 4452
4268 if (len & (PAGE_SIZE - 1)) { 4453 if (len & (PAGE_SIZE - 1)) {
4269 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
4270 if (len < PAGE_SIZE) { 4454 if (len < PAGE_SIZE) {
4271 ret = -EINVAL; 4455 ret = -EINVAL;
4272 goto out; 4456 goto out;
@@ -4377,13 +4561,30 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4377 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); 4561 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
4378 trace_seq_printf(s, "bytes: %ld\n", cnt); 4562 trace_seq_printf(s, "bytes: %ld\n", cnt);
4379 4563
4380 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); 4564 if (trace_clocks[trace_clock_id].in_ns) {
4381 usec_rem = do_div(t, USEC_PER_SEC); 4565 /* local or global for trace_clock */
4382 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); 4566 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
4567 usec_rem = do_div(t, USEC_PER_SEC);
4568 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
4569 t, usec_rem);
4570
4571 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
4572 usec_rem = do_div(t, USEC_PER_SEC);
4573 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4574 } else {
4575 /* counter or tsc mode for trace_clock */
4576 trace_seq_printf(s, "oldest event ts: %llu\n",
4577 ring_buffer_oldest_event_ts(tr->buffer, cpu));
4578
4579 trace_seq_printf(s, "now ts: %llu\n",
4580 ring_buffer_time_stamp(tr->buffer, cpu));
4581 }
4383 4582
4384 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); 4583 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
4385 usec_rem = do_div(t, USEC_PER_SEC); 4584 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4386 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); 4585
4586 cnt = ring_buffer_read_events_cpu(tr->buffer, cpu);
4587 trace_seq_printf(s, "read events: %ld\n", cnt);
4387 4588
4388 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4589 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4389 4590
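The new "dropped events:" and "read events:" lines (and the clock-aware timestamp formatting) appear in the existing per-CPU stats files, so they can be observed without any new interface. A quick userspace check, again assuming the usual debugfs mount point; per_cpu/cpu0/stats stands in for any CPU:

    #include <stdio.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/sys/kernel/debug/tracing/per_cpu/cpu0/stats", "r");

            if (!f)
                    return 1;
            /* prints entries, overrun, commit overrun, bytes, the timestamps,
             * and the new "dropped events:" / "read events:" counters */
            while (fgets(line, sizeof(line), f))
                    fputs(line, stdout);
            fclose(f);
            return 0;
    }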
@@ -4461,7 +4662,7 @@ struct dentry *tracing_init_dentry(void)
4461 4662
4462static struct dentry *d_percpu; 4663static struct dentry *d_percpu;
4463 4664
4464struct dentry *tracing_dentry_percpu(void) 4665static struct dentry *tracing_dentry_percpu(void)
4465{ 4666{
4466 static int once; 4667 static int once;
4467 struct dentry *d_tracer; 4668 struct dentry *d_tracer;
@@ -4611,7 +4812,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4611 4812
4612 if (val != 0 && val != 1) 4813 if (val != 0 && val != 1)
4613 return -EINVAL; 4814 return -EINVAL;
4614 set_tracer_flags(1 << index, val); 4815
4816 mutex_lock(&trace_types_lock);
4817 ret = set_tracer_flag(1 << index, val);
4818 mutex_unlock(&trace_types_lock);
4819
4820 if (ret < 0)
4821 return ret;
4615 4822
4616 *ppos += cnt; 4823 *ppos += cnt;
4617 4824
@@ -4788,10 +4995,17 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
4788 return ret; 4995 return ret;
4789 4996
4790 if (buffer) { 4997 if (buffer) {
4791 if (val) 4998 mutex_lock(&trace_types_lock);
4999 if (val) {
4792 ring_buffer_record_on(buffer); 5000 ring_buffer_record_on(buffer);
4793 else 5001 if (current_trace->start)
5002 current_trace->start(tr);
5003 } else {
4794 ring_buffer_record_off(buffer); 5004 ring_buffer_record_off(buffer);
5005 if (current_trace->stop)
5006 current_trace->stop(tr);
5007 }
5008 mutex_unlock(&trace_types_lock);
4795 } 5009 }
4796 5010
4797 (*ppos)++; 5011 (*ppos)++;
@@ -4815,9 +5029,6 @@ static __init int tracer_init_debugfs(void)
4815 5029
4816 d_tracer = tracing_init_dentry(); 5030 d_tracer = tracing_init_dentry();
4817 5031
4818 trace_create_file("tracing_enabled", 0644, d_tracer,
4819 &global_trace, &tracing_ctrl_fops);
4820
4821 trace_create_file("trace_options", 0644, d_tracer, 5032 trace_create_file("trace_options", 0644, d_tracer,
4822 NULL, &tracing_iter_fops); 5033 NULL, &tracing_iter_fops);
4823 5034
@@ -4873,6 +5084,11 @@ static __init int tracer_init_debugfs(void)
4873 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 5084 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
4874#endif 5085#endif
4875 5086
5087#ifdef CONFIG_TRACER_SNAPSHOT
5088 trace_create_file("snapshot", 0644, d_tracer,
5089 (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
5090#endif
5091
4876 create_trace_options_dir(); 5092 create_trace_options_dir();
4877 5093
4878 for_each_tracing_cpu(cpu) 5094 for_each_tracing_cpu(cpu)
@@ -4981,6 +5197,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4981 if (disable_tracing) 5197 if (disable_tracing)
4982 ftrace_kill(); 5198 ftrace_kill();
4983 5199
5200 /* Simulate the iterator */
4984 trace_init_global_iter(&iter); 5201 trace_init_global_iter(&iter);
4985 5202
4986 for_each_tracing_cpu(cpu) { 5203 for_each_tracing_cpu(cpu) {
@@ -4992,10 +5209,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4992 /* don't look at user memory in panic mode */ 5209 /* don't look at user memory in panic mode */
4993 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 5210 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
4994 5211
4995 /* Simulate the iterator */
4996 iter.tr = &global_trace;
4997 iter.trace = current_trace;
4998
4999 switch (oops_dump_mode) { 5212 switch (oops_dump_mode) {
5000 case DUMP_ALL: 5213 case DUMP_ALL:
5001 iter.cpu_file = TRACE_PIPE_ALL_CPU; 5214 iter.cpu_file = TRACE_PIPE_ALL_CPU;
@@ -5089,6 +5302,7 @@ __init static int tracer_alloc_buffers(void)
5089 5302
5090 /* Only allocate trace_printk buffers if a trace_printk exists */ 5303 /* Only allocate trace_printk buffers if a trace_printk exists */
5091 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt) 5304 if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
5305 /* Must be called before global_trace.buffer is allocated */
5092 trace_printk_init_buffers(); 5306 trace_printk_init_buffers();
5093 5307
5094 /* To save memory, keep the ring buffer size to its minimum */ 5308 /* To save memory, keep the ring buffer size to its minimum */
@@ -5136,9 +5350,10 @@ __init static int tracer_alloc_buffers(void)
5136#endif 5350#endif
5137 5351
5138 trace_init_cmdlines(); 5352 trace_init_cmdlines();
5353 init_irq_work(&trace_work_wakeup, trace_wake_up);
5139 5354
5140 register_tracer(&nop_trace); 5355 register_tracer(&nop_trace);
5141 current_trace = &nop_trace; 5356
5142 /* All seems OK, enable tracing */ 5357 /* All seems OK, enable tracing */
5143 tracing_disabled = 0; 5358 tracing_disabled = 0;
5144 5359
@@ -5147,6 +5362,13 @@ __init static int tracer_alloc_buffers(void)
5147 5362
5148 register_die_notifier(&trace_die_notifier); 5363 register_die_notifier(&trace_die_notifier);
5149 5364
5365 while (trace_boot_options) {
5366 char *option;
5367
5368 option = strsep(&trace_boot_options, ",");
5369 trace_set_options(option);
5370 }
5371
5150 return 0; 5372 return 0;
5151 5373
5152out_free_cpumask: 5374out_free_cpumask:
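For context on the new CONFIG_TRACER_SNAPSHOT file registered above, here is a minimal userspace sketch (not part of the patch) that exercises it. The path is an assumption: it presumes debugfs is mounted at /sys/kernel/debug and that, as the write handler suggests, a nonzero value asks the tracer to snapshot the live ring buffer.

/*
 * Minimal userspace sketch -- not part of the patch. Assumes debugfs is
 * mounted at /sys/kernel/debug and CONFIG_TRACER_SNAPSHOT is enabled.
 */
#include <stdio.h>
#include <stdlib.h>

static const char *snap = "/sys/kernel/debug/tracing/snapshot";

int main(void)
{
	FILE *f;
	char line[256];

	/* Writing a nonzero value asks the tracer to take a snapshot. */
	f = fopen(snap, "w");
	if (!f) {
		perror("open snapshot for write");
		return EXIT_FAILURE;
	}
	fputs("1\n", f);
	fclose(f);

	/* Reading the file dumps the snapshotted events via seq_read(). */
	f = fopen(snap, "r");
	if (!f) {
		perror("open snapshot for read");
		return EXIT_FAILURE;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);

	return EXIT_SUCCESS;
}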
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c15f528c1af4..2081971367ea 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -283,24 +283,70 @@ struct tracer {
283 enum print_line_t (*print_line)(struct trace_iterator *iter); 283 enum print_line_t (*print_line)(struct trace_iterator *iter);
284 /* If you handled the flag setting, return 0 */ 284 /* If you handled the flag setting, return 0 */
285 int (*set_flag)(u32 old_flags, u32 bit, int set); 285 int (*set_flag)(u32 old_flags, u32 bit, int set);
286 /* Return 0 if OK with change, else return non-zero */
287 int (*flag_changed)(struct tracer *tracer,
288 u32 mask, int set);
286 struct tracer *next; 289 struct tracer *next;
287 struct tracer_flags *flags; 290 struct tracer_flags *flags;
288 int print_max; 291 bool print_max;
289 int use_max_tr; 292 bool use_max_tr;
293 bool allocated_snapshot;
294 bool enabled;
290}; 295};
291 296
292 297
293/* Only current can touch trace_recursion */ 298/* Only current can touch trace_recursion */
294#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
295#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
296 299
297/* Ring buffer has the 10 LSB bits to count */ 300/*
298#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) 301 * For function tracing recursion:
299 302 * The order of these bits is important.
300/* for function tracing recursion */ 303 *
301#define TRACE_INTERNAL_BIT (1<<11) 304 * When function tracing occurs, the following steps are taken:
302#define TRACE_GLOBAL_BIT (1<<12) 305 * If arch does not support a ftrace feature:
303#define TRACE_CONTROL_BIT (1<<13) 306 * call internal function (uses INTERNAL bits) which calls...
307 * If callback is registered to the "global" list, the list
308 * function is called and recursion checks the GLOBAL bits.
309 * then this function calls...
310 * The function callback, which can use the FTRACE bits to
311 * check for recursion.
312 *
313 * Now if the arch does not support a feature, and it calls
314 * the global list function which calls the ftrace callback
315 * all three of these steps will do a recursion protection.
316 * There's no reason to do one if the previous caller already
317 * did. The recursion that we are protecting against will
318 * go through the same steps again.
319 *
320 * To prevent the multiple recursion checks, if a recursion
321 * bit is set that is higher than the MAX bit of the current
322 * check, then we know that the check was made by the previous
323 * caller, and we can skip the current check.
324 */
325enum {
326 TRACE_BUFFER_BIT,
327 TRACE_BUFFER_NMI_BIT,
328 TRACE_BUFFER_IRQ_BIT,
329 TRACE_BUFFER_SIRQ_BIT,
330
331 /* Start of function recursion bits */
332 TRACE_FTRACE_BIT,
333 TRACE_FTRACE_NMI_BIT,
334 TRACE_FTRACE_IRQ_BIT,
335 TRACE_FTRACE_SIRQ_BIT,
336
337 /* GLOBAL_BITs must be greater than FTRACE_BITs */
338 TRACE_GLOBAL_BIT,
339 TRACE_GLOBAL_NMI_BIT,
340 TRACE_GLOBAL_IRQ_BIT,
341 TRACE_GLOBAL_SIRQ_BIT,
342
343 /* INTERNAL_BITs must be greater than GLOBAL_BITs */
344 TRACE_INTERNAL_BIT,
345 TRACE_INTERNAL_NMI_BIT,
346 TRACE_INTERNAL_IRQ_BIT,
347 TRACE_INTERNAL_SIRQ_BIT,
348
349 TRACE_CONTROL_BIT,
304 350
305/* 351/*
306 * Abuse of the trace_recursion. 352 * Abuse of the trace_recursion.
@@ -309,11 +355,77 @@ struct tracer {
309 * was called in irq context but we have irq tracing off. Since this 355 * was called in irq context but we have irq tracing off. Since this
310 * can only be modified by current, we can reuse trace_recursion. 356 * can only be modified by current, we can reuse trace_recursion.
311 */ 357 */
312#define TRACE_IRQ_BIT (1<<13) 358 TRACE_IRQ_BIT,
359};
360
361#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
362#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
363#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
313 364
314#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) 365#define TRACE_CONTEXT_BITS 4
315#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) 366
316#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) 367#define TRACE_FTRACE_START TRACE_FTRACE_BIT
368#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
369
370#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
371#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
372
373#define TRACE_LIST_START TRACE_INTERNAL_BIT
374#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
375
376#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
377
378static __always_inline int trace_get_context_bit(void)
379{
380 int bit;
381
382 if (in_interrupt()) {
383 if (in_nmi())
384 bit = 0;
385
386 else if (in_irq())
387 bit = 1;
388 else
389 bit = 2;
390 } else
391 bit = 3;
392
393 return bit;
394}
395
396static __always_inline int trace_test_and_set_recursion(int start, int max)
397{
398 unsigned int val = current->trace_recursion;
399 int bit;
400
401 /* A previous recursion check was made */
402 if ((val & TRACE_CONTEXT_MASK) > max)
403 return 0;
404
405 bit = trace_get_context_bit() + start;
406 if (unlikely(val & (1 << bit)))
407 return -1;
408
409 val |= 1 << bit;
410 current->trace_recursion = val;
411 barrier();
412
413 return bit;
414}
415
416static __always_inline void trace_clear_recursion(int bit)
417{
418 unsigned int val = current->trace_recursion;
419
420 if (!bit)
421 return;
422
423 bit = 1 << bit;
424 val &= ~bit;
425
426 barrier();
427 current->trace_recursion = val;
428}
317 429
318#define TRACE_PIPE_ALL_CPU -1 430#define TRACE_PIPE_ALL_CPU -1
319 431
@@ -327,7 +439,6 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
327 439
328int tracer_init(struct tracer *t, struct trace_array *tr); 440int tracer_init(struct tracer *t, struct trace_array *tr);
329int tracing_is_enabled(void); 441int tracing_is_enabled(void);
330void trace_wake_up(void);
331void tracing_reset(struct trace_array *tr, int cpu); 442void tracing_reset(struct trace_array *tr, int cpu);
332void tracing_reset_online_cpus(struct trace_array *tr); 443void tracing_reset_online_cpus(struct trace_array *tr);
333void tracing_reset_current(int cpu); 444void tracing_reset_current(int cpu);
@@ -349,9 +460,6 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
349 unsigned long len, 460 unsigned long len,
350 unsigned long flags, 461 unsigned long flags,
351 int pc); 462 int pc);
352void trace_buffer_unlock_commit(struct ring_buffer *buffer,
353 struct ring_buffer_event *event,
354 unsigned long flags, int pc);
355 463
356struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 464struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
357 struct trace_array_cpu *data); 465 struct trace_array_cpu *data);
@@ -359,6 +467,9 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
359struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 467struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
360 int *ent_cpu, u64 *ent_ts); 468 int *ent_cpu, u64 *ent_ts);
361 469
470void __buffer_unlock_commit(struct ring_buffer *buffer,
471 struct ring_buffer_event *event);
472
362int trace_empty(struct trace_iterator *iter); 473int trace_empty(struct trace_iterator *iter);
363 474
364void *trace_find_next_entry_inc(struct trace_iterator *iter); 475void *trace_find_next_entry_inc(struct trace_iterator *iter);
@@ -367,7 +478,6 @@ void trace_init_global_iter(struct trace_iterator *iter);
367 478
368void tracing_iter_reset(struct trace_iterator *iter, int cpu); 479void tracing_iter_reset(struct trace_iterator *iter, int cpu);
369 480
370void default_wait_pipe(struct trace_iterator *iter);
371void poll_wait_pipe(struct trace_iterator *iter); 481void poll_wait_pipe(struct trace_iterator *iter);
372 482
373void ftrace(struct trace_array *tr, 483void ftrace(struct trace_array *tr,
@@ -407,12 +517,7 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr);
407void tracing_stop_sched_switch_record(void); 517void tracing_stop_sched_switch_record(void);
408void tracing_start_sched_switch_record(void); 518void tracing_start_sched_switch_record(void);
409int register_tracer(struct tracer *type); 519int register_tracer(struct tracer *type);
410void unregister_tracer(struct tracer *type);
411int is_tracing_stopped(void); 520int is_tracing_stopped(void);
412enum trace_file_type {
413 TRACE_FILE_LAT_FMT = 1,
414 TRACE_FILE_ANNOTATE = 2,
415};
416 521
417extern cpumask_var_t __read_mostly tracing_buffer_mask; 522extern cpumask_var_t __read_mostly tracing_buffer_mask;
418 523
@@ -841,6 +946,9 @@ extern const char *__start___trace_bprintk_fmt[];
841extern const char *__stop___trace_bprintk_fmt[]; 946extern const char *__stop___trace_bprintk_fmt[];
842 947
843void trace_printk_init_buffers(void); 948void trace_printk_init_buffers(void);
949void trace_printk_start_comm(void);
950int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
951int set_tracer_flag(unsigned int mask, int enabled);
844 952
845#undef FTRACE_ENTRY 953#undef FTRACE_ENTRY
846#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 954#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
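The comment block and inline helpers above are the heart of this change. The following standalone C sketch (an illustration only, not kernel code) mimics the per-context recursion guard so the bit layout is easier to follow: the context argument stands in for the in_nmi()/in_irq()/in_serving_softirq() checks, and the "skip the check if a higher layer already claimed a bit" optimization is omitted for brevity.

/*
 * Standalone illustration of the per-context recursion guard: each tracing
 * layer owns one bit per context (NMI, hard irq, soft irq, normal), and a
 * callback bails out if its bit for the current context is already set.
 */
#include <stdio.h>

enum ctx { CTX_NMI, CTX_IRQ, CTX_SIRQ, CTX_NORMAL, CTX_BITS };

/* One recursion word per "task"; the kernel keeps it in task_struct. */
static unsigned int trace_recursion;

/* Returns the claimed bit, or -1 if this context is already inside the tracer. */
static int test_and_set_recursion(int start, enum ctx context)
{
	int bit = start + context;

	if (trace_recursion & (1u << bit))
		return -1;
	trace_recursion |= 1u << bit;
	return bit;
}

static void clear_recursion(int bit)
{
	if (bit >= 0)
		trace_recursion &= ~(1u << bit);
}

static void traced_callback(enum ctx context)
{
	int bit = test_and_set_recursion(0, context);

	if (bit < 0) {
		puts("recursion detected, bailing out");
		return;
	}
	puts("doing the real tracing work");
	/* A nested call from the same context hits the bit we just set. */
	traced_callback(context);
	clear_recursion(bit);
}

int main(void)
{
	traced_callback(CTX_NORMAL);	/* does the work once, then bails out */
	return 0;
}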
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 8d3538b4ea5f..95e96842ed29 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -77,7 +77,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
77 entry->correct = val == expect; 77 entry->correct = val == expect;
78 78
79 if (!filter_check_discard(call, entry, buffer, event)) 79 if (!filter_check_discard(call, entry, buffer, event))
80 ring_buffer_unlock_commit(buffer, event); 80 __buffer_unlock_commit(buffer, event);
81 81
82 out: 82 out:
83 atomic_dec(&tr->data[cpu]->disabled); 83 atomic_dec(&tr->data[cpu]->disabled);
@@ -199,7 +199,7 @@ __init static int init_branch_tracer(void)
199 } 199 }
200 return register_tracer(&branch_trace); 200 return register_tracer(&branch_trace);
201} 201}
202device_initcall(init_branch_tracer); 202core_initcall(init_branch_tracer);
203 203
204#else 204#else
205static inline 205static inline
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 394783531cbb..aa8f5f48dae6 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -21,8 +21,6 @@
21#include <linux/ktime.h> 21#include <linux/ktime.h>
22#include <linux/trace_clock.h> 22#include <linux/trace_clock.h>
23 23
24#include "trace.h"
25
26/* 24/*
27 * trace_clock_local(): the simplest and least coherent tracing clock. 25 * trace_clock_local(): the simplest and least coherent tracing clock.
28 * 26 *
@@ -44,6 +42,7 @@ u64 notrace trace_clock_local(void)
44 42
45 return clock; 43 return clock;
46} 44}
45EXPORT_SYMBOL_GPL(trace_clock_local);
47 46
48/* 47/*
49 * trace_clock(): 'between' trace clock. Not completely serialized, 48 * trace_clock(): 'between' trace clock. Not completely serialized,
@@ -86,7 +85,7 @@ u64 notrace trace_clock_global(void)
86 local_irq_save(flags); 85 local_irq_save(flags);
87 86
88 this_cpu = raw_smp_processor_id(); 87 this_cpu = raw_smp_processor_id();
89 now = cpu_clock(this_cpu); 88 now = sched_clock_cpu(this_cpu);
90 /* 89 /*
91 * If in an NMI context then don't risk lockups and return the 90 * cpu_clock() time:
92 * cpu_clock() time: 91 * cpu_clock() time:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d608d09d08c0..57e9b284250c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, padding);
120 119
121 return ret; 120 return ret;
122} 121}
@@ -491,19 +490,6 @@ static void t_stop(struct seq_file *m, void *p)
491 mutex_unlock(&event_mutex); 490 mutex_unlock(&event_mutex);
492} 491}
493 492
494static int
495ftrace_event_seq_open(struct inode *inode, struct file *file)
496{
497 const struct seq_operations *seq_ops;
498
499 if ((file->f_mode & FMODE_WRITE) &&
500 (file->f_flags & O_TRUNC))
501 ftrace_clear_events();
502
503 seq_ops = inode->i_private;
504 return seq_open(file, seq_ops);
505}
506
507static ssize_t 493static ssize_t
508event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 494event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
509 loff_t *ppos) 495 loff_t *ppos)
@@ -980,6 +966,9 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
980 return r; 966 return r;
981} 967}
982 968
969static int ftrace_event_avail_open(struct inode *inode, struct file *file);
970static int ftrace_event_set_open(struct inode *inode, struct file *file);
971
983static const struct seq_operations show_event_seq_ops = { 972static const struct seq_operations show_event_seq_ops = {
984 .start = t_start, 973 .start = t_start,
985 .next = t_next, 974 .next = t_next,
@@ -995,14 +984,14 @@ static const struct seq_operations show_set_event_seq_ops = {
995}; 984};
996 985
997static const struct file_operations ftrace_avail_fops = { 986static const struct file_operations ftrace_avail_fops = {
998 .open = ftrace_event_seq_open, 987 .open = ftrace_event_avail_open,
999 .read = seq_read, 988 .read = seq_read,
1000 .llseek = seq_lseek, 989 .llseek = seq_lseek,
1001 .release = seq_release, 990 .release = seq_release,
1002}; 991};
1003 992
1004static const struct file_operations ftrace_set_event_fops = { 993static const struct file_operations ftrace_set_event_fops = {
1005 .open = ftrace_event_seq_open, 994 .open = ftrace_event_set_open,
1006 .read = seq_read, 995 .read = seq_read,
1007 .write = ftrace_event_write, 996 .write = ftrace_event_write,
1008 .llseek = seq_lseek, 997 .llseek = seq_lseek,
@@ -1078,6 +1067,26 @@ static struct dentry *event_trace_events_dir(void)
1078 return d_events; 1067 return d_events;
1079} 1068}
1080 1069
1070static int
1071ftrace_event_avail_open(struct inode *inode, struct file *file)
1072{
1073 const struct seq_operations *seq_ops = &show_event_seq_ops;
1074
1075 return seq_open(file, seq_ops);
1076}
1077
1078static int
1079ftrace_event_set_open(struct inode *inode, struct file *file)
1080{
1081 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1082
1083 if ((file->f_mode & FMODE_WRITE) &&
1084 (file->f_flags & O_TRUNC))
1085 ftrace_clear_events();
1086
1087 return seq_open(file, seq_ops);
1088}
1089
1081static struct dentry * 1090static struct dentry *
1082event_subsystem_dir(const char *name, struct dentry *d_events) 1091event_subsystem_dir(const char *name, struct dentry *d_events)
1083{ 1092{
@@ -1489,6 +1498,9 @@ static __init int event_trace_enable(void)
1489 if (ret) 1498 if (ret)
1490 pr_warn("Failed to enable trace event: %s\n", token); 1499 pr_warn("Failed to enable trace event: %s\n", token);
1491 } 1500 }
1501
1502 trace_printk_start_comm();
1503
1492 return 0; 1504 return 0;
1493} 1505}
1494 1506
@@ -1505,15 +1517,13 @@ static __init int event_trace_init(void)
1505 return 0; 1517 return 0;
1506 1518
1507 entry = debugfs_create_file("available_events", 0444, d_tracer, 1519 entry = debugfs_create_file("available_events", 0444, d_tracer,
1508 (void *)&show_event_seq_ops, 1520 NULL, &ftrace_avail_fops);
1509 &ftrace_avail_fops);
1510 if (!entry) 1521 if (!entry)
1511 pr_warning("Could not create debugfs " 1522 pr_warning("Could not create debugfs "
1512 "'available_events' entry\n"); 1523 "'available_events' entry\n");
1513 1524
1514 entry = debugfs_create_file("set_event", 0644, d_tracer, 1525 entry = debugfs_create_file("set_event", 0644, d_tracer,
1515 (void *)&show_set_event_seq_ops, 1526 NULL, &ftrace_set_event_fops);
1516 &ftrace_set_event_fops);
1517 if (!entry) 1527 if (!entry)
1518 pr_warning("Could not create debugfs " 1528 pr_warning("Could not create debugfs "
1519 "'set_event' entry\n"); 1529 "'set_event' entry\n");
@@ -1749,7 +1759,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
1749 entry->ip = ip; 1759 entry->ip = ip;
1750 entry->parent_ip = parent_ip; 1760 entry->parent_ip = parent_ip;
1751 1761
1752 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); 1762 trace_buffer_unlock_commit(buffer, event, flags, pc);
1753 1763
1754 out: 1764 out:
1755 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 1765 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index c154797a7ff7..e5b0ca8b8d4d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1000,9 +1000,9 @@ static int init_pred(struct filter_parse_state *ps,
1000 } 1000 }
1001 } else { 1001 } else {
1002 if (field->is_signed) 1002 if (field->is_signed)
1003 ret = strict_strtoll(pred->regex.pattern, 0, &val); 1003 ret = kstrtoll(pred->regex.pattern, 0, &val);
1004 else 1004 else
1005 ret = strict_strtoull(pred->regex.pattern, 0, &val); 1005 ret = kstrtoull(pred->regex.pattern, 0, &val);
1006 if (ret) { 1006 if (ret) {
1007 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 1007 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
1008 return -EINVAL; 1008 return -EINVAL;
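The strict_strtoll()/strict_strtoull() calls here (and in the probe and function tracer files below) are switched to the kstrto*() helpers. As a rough userspace analogue of what those helpers enforce -- the whole string must be numeric, with at most a trailing newline -- here is a hedged sketch using strtoll(); the kstrto* functions themselves are kernel-only, so this is an approximation rather than the kernel implementation.

/*
 * Userspace analogue (not kernel code) of kstrtoll()-style strict parsing:
 * reject any trailing junk other than a single newline.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Returns 0 on success and stores the value, or -EINVAL/-ERANGE on error. */
static int parse_ll(const char *s, long long *val)
{
	char *end;

	errno = 0;
	*val = strtoll(s, &end, 0);
	if (errno == ERANGE)
		return -ERANGE;
	if (end == s || (*end != '\0' && strcmp(end, "\n") != 0))
		return -EINVAL;
	return 0;
}

int main(void)
{
	long long v;

	printf("\"42\"   -> %d\n", parse_ll("42", &v));	/* 0: ok */
	printf("\"42x\"  -> %d\n", parse_ll("42x", &v));	/* -EINVAL: junk */
	printf("\"0x10\" -> %d\n", parse_ll("0x10", &v));	/* 0: base 0 accepts hex */
	return 0;
}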
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 507a7a9630bf..601152523326 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -7,7 +7,7 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/ring_buffer.h> 12#include <linux/ring_buffer.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr)
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(tr);
48} 48}
49 49
50static void
51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
52 struct ftrace_ops *op, struct pt_regs *pt_regs)
53{
54 struct trace_array *tr = func_trace;
55 struct trace_array_cpu *data;
56 unsigned long flags;
57 long disabled;
58 int cpu;
59 int pc;
60
61 if (unlikely(!ftrace_function_enabled))
62 return;
63
64 pc = preempt_count();
65 preempt_disable_notrace();
66 local_save_flags(flags);
67 cpu = raw_smp_processor_id();
68 data = tr->data[cpu];
69 disabled = atomic_inc_return(&data->disabled);
70
71 if (likely(disabled == 1))
72 trace_function(tr, ip, parent_ip, flags, pc);
73
74 atomic_dec(&data->disabled);
75 preempt_enable_notrace();
76}
77
78/* Our option */ 50/* Our option */
79enum { 51enum {
80 TRACE_FUNC_OPT_STACK = 0x1, 52 TRACE_FUNC_OPT_STACK = 0x1,
@@ -85,34 +57,34 @@ static struct tracer_flags func_flags;
85static void 57static void
86function_trace_call(unsigned long ip, unsigned long parent_ip, 58function_trace_call(unsigned long ip, unsigned long parent_ip,
87 struct ftrace_ops *op, struct pt_regs *pt_regs) 59 struct ftrace_ops *op, struct pt_regs *pt_regs)
88
89{ 60{
90 struct trace_array *tr = func_trace; 61 struct trace_array *tr = func_trace;
91 struct trace_array_cpu *data; 62 struct trace_array_cpu *data;
92 unsigned long flags; 63 unsigned long flags;
93 long disabled; 64 int bit;
94 int cpu; 65 int cpu;
95 int pc; 66 int pc;
96 67
97 if (unlikely(!ftrace_function_enabled)) 68 if (unlikely(!ftrace_function_enabled))
98 return; 69 return;
99 70
100 /* 71 pc = preempt_count();
101 * Need to use raw, since this must be called before the 72 preempt_disable_notrace();
102 * recursive protection is performed.
103 */
104 local_irq_save(flags);
105 cpu = raw_smp_processor_id();
106 data = tr->data[cpu];
107 disabled = atomic_inc_return(&data->disabled);
108 73
109 if (likely(disabled == 1)) { 74 bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
110 pc = preempt_count(); 75 if (bit < 0)
76 goto out;
77
78 cpu = smp_processor_id();
79 data = tr->data[cpu];
80 if (!atomic_read(&data->disabled)) {
81 local_save_flags(flags);
111 trace_function(tr, ip, parent_ip, flags, pc); 82 trace_function(tr, ip, parent_ip, flags, pc);
112 } 83 }
84 trace_clear_recursion(bit);
113 85
114 atomic_dec(&data->disabled); 86 out:
115 local_irq_restore(flags); 87 preempt_enable_notrace();
116} 88}
117 89
118static void 90static void
@@ -185,11 +157,6 @@ static void tracing_start_function_trace(void)
185{ 157{
186 ftrace_function_enabled = 0; 158 ftrace_function_enabled = 0;
187 159
188 if (trace_flags & TRACE_ITER_PREEMPTONLY)
189 trace_ops.func = function_trace_call_preempt_only;
190 else
191 trace_ops.func = function_trace_call;
192
193 if (func_flags.val & TRACE_FUNC_OPT_STACK) 160 if (func_flags.val & TRACE_FUNC_OPT_STACK)
194 register_ftrace_function(&trace_stack_ops); 161 register_ftrace_function(&trace_stack_ops);
195 else 162 else
@@ -366,7 +333,7 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
366 * We use the callback data field (which is a pointer) 333 * We use the callback data field (which is a pointer)
367 * as our counter. 334 * as our counter.
368 */ 335 */
369 ret = strict_strtoul(number, 0, (unsigned long *)&count); 336 ret = kstrtoul(number, 0, (unsigned long *)&count);
370 if (ret) 337 if (ret)
371 return ret; 338 return ret;
372 339
@@ -411,5 +378,4 @@ static __init int init_function_trace(void)
411 init_func_cmd_traceon(); 378 init_func_cmd_traceon();
412 return register_tracer(&function_trace); 379 return register_tracer(&function_trace);
413} 380}
414device_initcall(init_function_trace); 381core_initcall(init_function_trace);
415
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 99b4378393d5..39ada66389cc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -47,6 +47,8 @@ struct fgraph_data {
47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40 48#define TRACE_GRAPH_PRINT_IRQS 0x40
49 49
50static unsigned int max_depth;
51
50static struct tracer_opt trace_opts[] = { 52static struct tracer_opt trace_opts[] = {
51 /* Display overruns? (for self-debug purpose) */ 53 /* Display overruns? (for self-debug purpose) */
52 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, 54 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
@@ -189,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
189 191
190 ftrace_pop_return_trace(&trace, &ret, frame_pointer); 192 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
191 trace.rettime = trace_clock_local(); 193 trace.rettime = trace_clock_local();
192 ftrace_graph_return(&trace);
193 barrier(); 194 barrier();
194 current->curr_ret_stack--; 195 current->curr_ret_stack--;
195 196
197 /*
198 * The trace should run after decrementing the ret counter
199 * in case an interrupt were to come in. We don't want to
200 * lose the interrupt if max_depth is set.
201 */
202 ftrace_graph_return(&trace);
203
196 if (unlikely(!ret)) { 204 if (unlikely(!ret)) {
197 ftrace_graph_stop(); 205 ftrace_graph_stop();
198 WARN_ON(1); 206 WARN_ON(1);
@@ -223,7 +231,7 @@ int __trace_graph_entry(struct trace_array *tr,
223 entry = ring_buffer_event_data(event); 231 entry = ring_buffer_event_data(event);
224 entry->graph_ent = *trace; 232 entry->graph_ent = *trace;
225 if (!filter_current_check_discard(buffer, call, entry, event)) 233 if (!filter_current_check_discard(buffer, call, entry, event))
226 ring_buffer_unlock_commit(buffer, event); 234 __buffer_unlock_commit(buffer, event);
227 235
228 return 1; 236 return 1;
229} 237}
@@ -250,8 +258,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
250 return 0; 258 return 0;
251 259
252 /* trace it when it is-nested-in or is a function enabled. */ 260 /* trace it when it is-nested-in or is a function enabled. */
253 if (!(trace->depth || ftrace_graph_addr(trace->func)) || 261 if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
254 ftrace_graph_ignore_irqs()) 262 ftrace_graph_ignore_irqs()) ||
263 (max_depth && trace->depth >= max_depth))
255 return 0; 264 return 0;
256 265
257 local_irq_save(flags); 266 local_irq_save(flags);
@@ -327,7 +336,7 @@ void __trace_graph_return(struct trace_array *tr,
327 entry = ring_buffer_event_data(event); 336 entry = ring_buffer_event_data(event);
328 entry->ret = *trace; 337 entry->ret = *trace;
329 if (!filter_current_check_discard(buffer, call, entry, event)) 338 if (!filter_current_check_discard(buffer, call, entry, event))
330 ring_buffer_unlock_commit(buffer, event); 339 __buffer_unlock_commit(buffer, event);
331} 340}
332 341
333void trace_graph_return(struct ftrace_graph_ret *trace) 342void trace_graph_return(struct ftrace_graph_ret *trace)
@@ -1457,6 +1466,59 @@ static struct tracer graph_trace __read_mostly = {
1457#endif 1466#endif
1458}; 1467};
1459 1468
1469
1470static ssize_t
1471graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
1472 loff_t *ppos)
1473{
1474 unsigned long val;
1475 int ret;
1476
1477 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
1478 if (ret)
1479 return ret;
1480
1481 max_depth = val;
1482
1483 *ppos += cnt;
1484
1485 return cnt;
1486}
1487
1488static ssize_t
1489graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
1490 loff_t *ppos)
1491{
1492 char buf[15]; /* More than enough to hold UINT_MAX + "\n"*/
1493 int n;
1494
1495 n = sprintf(buf, "%d\n", max_depth);
1496
1497 return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
1498}
1499
1500static const struct file_operations graph_depth_fops = {
1501 .open = tracing_open_generic,
1502 .write = graph_depth_write,
1503 .read = graph_depth_read,
1504 .llseek = generic_file_llseek,
1505};
1506
1507static __init int init_graph_debugfs(void)
1508{
1509 struct dentry *d_tracer;
1510
1511 d_tracer = tracing_init_dentry();
1512 if (!d_tracer)
1513 return 0;
1514
1515 trace_create_file("max_graph_depth", 0644, d_tracer,
1516 NULL, &graph_depth_fops);
1517
1518 return 0;
1519}
1520fs_initcall(init_graph_debugfs);
1521
1460static __init int init_graph_trace(void) 1522static __init int init_graph_trace(void)
1461{ 1523{
1462 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1524 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
@@ -1474,4 +1536,4 @@ static __init int init_graph_trace(void)
1474 return register_tracer(&graph_trace); 1536 return register_tracer(&graph_trace);
1475} 1537}
1476 1538
1477device_initcall(init_graph_trace); 1539core_initcall(init_graph_trace);
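The new max_graph_depth control registered above can be driven from userspace. A minimal sketch follows; the path is an assumption (it presumes debugfs is mounted at /sys/kernel/debug), and per the code above a depth of 0 leaves the graph tracer unlimited.

/*
 * Userspace sketch -- not part of the patch. Limits the function graph
 * tracer to a fixed call depth via the new max_graph_depth file.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/tracing/max_graph_depth";
	const char *depth = "3\n";	/* trace at most three call levels deep */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open max_graph_depth");
		return 1;
	}
	if (write(fd, depth, strlen(depth)) < 0)
		perror("write max_graph_depth");
	close(fd);

	/* Writing "0" restores the default "no limit" behaviour. */
	return 0;
}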
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index d98ee8283b29..443b25b43b4f 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -7,7 +7,7 @@
7 * From code in the latency_tracer, that is: 7 * From code in the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/kallsyms.h> 12#include <linux/kallsyms.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
@@ -32,7 +32,7 @@ enum {
32 32
33static int trace_type __read_mostly; 33static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_flags;
36 36
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph); 37static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph); 38static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -558,8 +558,11 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
558 558
559static void __irqsoff_tracer_init(struct trace_array *tr) 559static void __irqsoff_tracer_init(struct trace_array *tr)
560{ 560{
561 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; 561 save_flags = trace_flags;
562 trace_flags |= TRACE_ITER_LATENCY_FMT; 562
563 /* non overwrite screws up the latency tracers */
564 set_tracer_flag(TRACE_ITER_OVERWRITE, 1);
565 set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1);
563 566
564 tracing_max_latency = 0; 567 tracing_max_latency = 0;
565 irqsoff_trace = tr; 568 irqsoff_trace = tr;
@@ -573,10 +576,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
573 576
574static void irqsoff_tracer_reset(struct trace_array *tr) 577static void irqsoff_tracer_reset(struct trace_array *tr)
575{ 578{
579 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
580 int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
581
576 stop_irqsoff_tracer(tr, is_graph()); 582 stop_irqsoff_tracer(tr, is_graph());
577 583
578 if (!save_lat_flag) 584 set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag);
579 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 585 set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag);
580} 586}
581 587
582static void irqsoff_tracer_start(struct trace_array *tr) 588static void irqsoff_tracer_start(struct trace_array *tr)
@@ -604,17 +610,18 @@ static struct tracer irqsoff_tracer __read_mostly =
604 .reset = irqsoff_tracer_reset, 610 .reset = irqsoff_tracer_reset,
605 .start = irqsoff_tracer_start, 611 .start = irqsoff_tracer_start,
606 .stop = irqsoff_tracer_stop, 612 .stop = irqsoff_tracer_stop,
607 .print_max = 1, 613 .print_max = true,
608 .print_header = irqsoff_print_header, 614 .print_header = irqsoff_print_header,
609 .print_line = irqsoff_print_line, 615 .print_line = irqsoff_print_line,
610 .flags = &tracer_flags, 616 .flags = &tracer_flags,
611 .set_flag = irqsoff_set_flag, 617 .set_flag = irqsoff_set_flag,
618 .flag_changed = trace_keep_overwrite,
612#ifdef CONFIG_FTRACE_SELFTEST 619#ifdef CONFIG_FTRACE_SELFTEST
613 .selftest = trace_selftest_startup_irqsoff, 620 .selftest = trace_selftest_startup_irqsoff,
614#endif 621#endif
615 .open = irqsoff_trace_open, 622 .open = irqsoff_trace_open,
616 .close = irqsoff_trace_close, 623 .close = irqsoff_trace_close,
617 .use_max_tr = 1, 624 .use_max_tr = true,
618}; 625};
619# define register_irqsoff(trace) register_tracer(&trace) 626# define register_irqsoff(trace) register_tracer(&trace)
620#else 627#else
@@ -637,17 +644,18 @@ static struct tracer preemptoff_tracer __read_mostly =
637 .reset = irqsoff_tracer_reset, 644 .reset = irqsoff_tracer_reset,
638 .start = irqsoff_tracer_start, 645 .start = irqsoff_tracer_start,
639 .stop = irqsoff_tracer_stop, 646 .stop = irqsoff_tracer_stop,
640 .print_max = 1, 647 .print_max = true,
641 .print_header = irqsoff_print_header, 648 .print_header = irqsoff_print_header,
642 .print_line = irqsoff_print_line, 649 .print_line = irqsoff_print_line,
643 .flags = &tracer_flags, 650 .flags = &tracer_flags,
644 .set_flag = irqsoff_set_flag, 651 .set_flag = irqsoff_set_flag,
652 .flag_changed = trace_keep_overwrite,
645#ifdef CONFIG_FTRACE_SELFTEST 653#ifdef CONFIG_FTRACE_SELFTEST
646 .selftest = trace_selftest_startup_preemptoff, 654 .selftest = trace_selftest_startup_preemptoff,
647#endif 655#endif
648 .open = irqsoff_trace_open, 656 .open = irqsoff_trace_open,
649 .close = irqsoff_trace_close, 657 .close = irqsoff_trace_close,
650 .use_max_tr = 1, 658 .use_max_tr = true,
651}; 659};
652# define register_preemptoff(trace) register_tracer(&trace) 660# define register_preemptoff(trace) register_tracer(&trace)
653#else 661#else
@@ -672,17 +680,18 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
672 .reset = irqsoff_tracer_reset, 680 .reset = irqsoff_tracer_reset,
673 .start = irqsoff_tracer_start, 681 .start = irqsoff_tracer_start,
674 .stop = irqsoff_tracer_stop, 682 .stop = irqsoff_tracer_stop,
675 .print_max = 1, 683 .print_max = true,
676 .print_header = irqsoff_print_header, 684 .print_header = irqsoff_print_header,
677 .print_line = irqsoff_print_line, 685 .print_line = irqsoff_print_line,
678 .flags = &tracer_flags, 686 .flags = &tracer_flags,
679 .set_flag = irqsoff_set_flag, 687 .set_flag = irqsoff_set_flag,
688 .flag_changed = trace_keep_overwrite,
680#ifdef CONFIG_FTRACE_SELFTEST 689#ifdef CONFIG_FTRACE_SELFTEST
681 .selftest = trace_selftest_startup_preemptirqsoff, 690 .selftest = trace_selftest_startup_preemptirqsoff,
682#endif 691#endif
683 .open = irqsoff_trace_open, 692 .open = irqsoff_trace_open,
684 .close = irqsoff_trace_close, 693 .close = irqsoff_trace_close,
685 .use_max_tr = 1, 694 .use_max_tr = true,
686}; 695};
687 696
688# define register_preemptirqsoff(trace) register_tracer(&trace) 697# define register_preemptirqsoff(trace) register_tracer(&trace)
@@ -698,4 +707,4 @@ __init static int init_irqsoff_tracer(void)
698 707
699 return 0; 708 return 0;
700} 709}
701device_initcall(init_irqsoff_tracer); 710core_initcall(init_irqsoff_tracer);
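The irqsoff tracers (and, below, the wakeup tracers) now snapshot trace_flags at init, force the OVERWRITE and LATENCY_FMT bits via set_tracer_flag(), and restore the user's choices on reset. The following standalone sketch shows that save/force/restore pattern with made-up flag names; it is an illustration of the idea, not the kernel's API.

/*
 * Standalone sketch of the flag save/force/restore pattern used by the
 * latency tracers above.
 */
#include <stdio.h>

#define ITER_OVERWRITE   (1u << 0)
#define ITER_LATENCY_FMT (1u << 1)

static unsigned int trace_flags = ITER_OVERWRITE;	/* user had overwrite on */
static unsigned int save_flags;

static void set_flag(unsigned int mask, int enabled)
{
	if (enabled)
		trace_flags |= mask;
	else
		trace_flags &= ~mask;
}

static void tracer_init_flags(void)
{
	save_flags = trace_flags;
	set_flag(ITER_OVERWRITE, 1);	/* non-overwrite breaks latency tracing */
	set_flag(ITER_LATENCY_FMT, 1);
}

static void tracer_reset_flags(void)
{
	/* Put each bit back to whatever the user had before init. */
	set_flag(ITER_LATENCY_FMT, save_flags & ITER_LATENCY_FMT);
	set_flag(ITER_OVERWRITE, save_flags & ITER_OVERWRITE);
}

int main(void)
{
	tracer_init_flags();
	printf("while tracing: %#x\n", trace_flags);
	tracer_reset_flags();
	printf("after reset:   %#x\n", trace_flags);	/* back to the user's value */
	return 0;
}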
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1a2117043bb1..1865d5f76538 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -444,7 +444,7 @@ static int create_trace_probe(int argc, char **argv)
444 return -EINVAL; 444 return -EINVAL;
445 } 445 }
446 /* an address specified */ 446 /* an address specified */
447 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); 447 ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
448 if (ret) { 448 if (ret) {
449 pr_info("Failed to parse address.\n"); 449 pr_info("Failed to parse address.\n");
450 return ret; 450 return ret;
@@ -751,8 +751,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
751 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 751 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
752 752
753 if (!filter_current_check_discard(buffer, call, entry, event)) 753 if (!filter_current_check_discard(buffer, call, entry, event))
754 trace_nowake_buffer_unlock_commit_regs(buffer, event, 754 trace_buffer_unlock_commit_regs(buffer, event,
755 irq_flags, pc, regs); 755 irq_flags, pc, regs);
756} 756}
757 757
758/* Kretprobe handler */ 758/* Kretprobe handler */
@@ -784,8 +784,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
784 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 784 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
785 785
786 if (!filter_current_check_discard(buffer, call, entry, event)) 786 if (!filter_current_check_discard(buffer, call, entry, event))
787 trace_nowake_buffer_unlock_commit_regs(buffer, event, 787 trace_buffer_unlock_commit_regs(buffer, event,
788 irq_flags, pc, regs); 788 irq_flags, pc, regs);
789} 789}
790 790
791/* Event entry printers */ 791/* Event entry printers */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 123b189c732c..697e88d13907 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -610,24 +610,54 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
610 return trace_print_lat_fmt(s, entry); 610 return trace_print_lat_fmt(s, entry);
611} 611}
612 612
613static unsigned long preempt_mark_thresh = 100; 613static unsigned long preempt_mark_thresh_us = 100;
614 614
615static int 615static int
616lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, 616lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
617 unsigned long rel_usecs)
618{ 617{
619 return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, 618 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
620 rel_usecs > preempt_mark_thresh ? '!' : 619 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
621 rel_usecs > 1 ? '+' : ' '); 620 unsigned long long abs_ts = iter->ts - iter->tr->time_start;
621 unsigned long long rel_ts = next_ts - iter->ts;
622 struct trace_seq *s = &iter->seq;
623
624 if (in_ns) {
625 abs_ts = ns2usecs(abs_ts);
626 rel_ts = ns2usecs(rel_ts);
627 }
628
629 if (verbose && in_ns) {
630 unsigned long abs_usec = do_div(abs_ts, USEC_PER_MSEC);
631 unsigned long abs_msec = (unsigned long)abs_ts;
632 unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC);
633 unsigned long rel_msec = (unsigned long)rel_ts;
634
635 return trace_seq_printf(
636 s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ",
637 ns2usecs(iter->ts),
638 abs_msec, abs_usec,
639 rel_msec, rel_usec);
640 } else if (verbose && !in_ns) {
641 return trace_seq_printf(
642 s, "[%016llx] %lld (+%lld): ",
643 iter->ts, abs_ts, rel_ts);
644 } else if (!verbose && in_ns) {
645 return trace_seq_printf(
646 s, " %4lldus%c: ",
647 abs_ts,
648 rel_ts > preempt_mark_thresh_us ? '!' :
649 rel_ts > 1 ? '+' : ' ');
650 } else { /* !verbose && !in_ns */
651 return trace_seq_printf(s, " %4lld: ", abs_ts);
652 }
622} 653}
623 654
624int trace_print_context(struct trace_iterator *iter) 655int trace_print_context(struct trace_iterator *iter)
625{ 656{
626 struct trace_seq *s = &iter->seq; 657 struct trace_seq *s = &iter->seq;
627 struct trace_entry *entry = iter->ent; 658 struct trace_entry *entry = iter->ent;
628 unsigned long long t = ns2usecs(iter->ts); 659 unsigned long long t;
629 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 660 unsigned long secs, usec_rem;
630 unsigned long secs = (unsigned long)t;
631 char comm[TASK_COMM_LEN]; 661 char comm[TASK_COMM_LEN];
632 int ret; 662 int ret;
633 663
@@ -644,8 +674,13 @@ int trace_print_context(struct trace_iterator *iter)
644 return 0; 674 return 0;
645 } 675 }
646 676
647 return trace_seq_printf(s, " %5lu.%06lu: ", 677 if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
648 secs, usec_rem); 678 t = ns2usecs(iter->ts);
679 usec_rem = do_div(t, USEC_PER_SEC);
680 secs = (unsigned long)t;
681 return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem);
682 } else
683 return trace_seq_printf(s, " %12llu: ", iter->ts);
649} 684}
650 685
651int trace_print_lat_context(struct trace_iterator *iter) 686int trace_print_lat_context(struct trace_iterator *iter)
@@ -659,36 +694,29 @@ int trace_print_lat_context(struct trace_iterator *iter)
659 *next_entry = trace_find_next_entry(iter, NULL, 694 *next_entry = trace_find_next_entry(iter, NULL,
660 &next_ts); 695 &next_ts);
661 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); 696 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
662 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
663 unsigned long rel_usecs;
664 697
665 /* Restore the original ent_size */ 698 /* Restore the original ent_size */
666 iter->ent_size = ent_size; 699 iter->ent_size = ent_size;
667 700
668 if (!next_entry) 701 if (!next_entry)
669 next_ts = iter->ts; 702 next_ts = iter->ts;
670 rel_usecs = ns2usecs(next_ts - iter->ts);
671 703
672 if (verbose) { 704 if (verbose) {
673 char comm[TASK_COMM_LEN]; 705 char comm[TASK_COMM_LEN];
674 706
675 trace_find_cmdline(entry->pid, comm); 707 trace_find_cmdline(entry->pid, comm);
676 708
677 ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" 709 ret = trace_seq_printf(
678 " %ld.%03ldms (+%ld.%03ldms): ", comm, 710 s, "%16s %5d %3d %d %08x %08lx ",
679 entry->pid, iter->cpu, entry->flags, 711 comm, entry->pid, iter->cpu, entry->flags,
680 entry->preempt_count, iter->idx, 712 entry->preempt_count, iter->idx);
681 ns2usecs(iter->ts),
682 abs_usecs / USEC_PER_MSEC,
683 abs_usecs % USEC_PER_MSEC,
684 rel_usecs / USEC_PER_MSEC,
685 rel_usecs % USEC_PER_MSEC);
686 } else { 713 } else {
687 ret = lat_print_generic(s, entry, iter->cpu); 714 ret = lat_print_generic(s, entry, iter->cpu);
688 if (ret)
689 ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
690 } 715 }
691 716
717 if (ret)
718 ret = lat_print_timestamp(iter, next_ts);
719
692 return ret; 720 return ret;
693} 721}
694 722
@@ -711,12 +739,11 @@ static int task_state_char(unsigned long state)
711struct trace_event *ftrace_find_event(int type) 739struct trace_event *ftrace_find_event(int type)
712{ 740{
713 struct trace_event *event; 741 struct trace_event *event;
714 struct hlist_node *n;
715 unsigned key; 742 unsigned key;
716 743
717 key = type & (EVENT_HASHSIZE - 1); 744 key = type & (EVENT_HASHSIZE - 1);
718 745
719 hlist_for_each_entry(event, n, &event_hash[key], node) { 746 hlist_for_each_entry(event, &event_hash[key], node) {
720 if (event->type == type) 747 if (event->type == type)
721 return event; 748 return event;
722 } 749 }
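To make the new lat_print_timestamp() output paths concrete, here is a small standalone sketch of the formatting it performs for nanosecond clocks: convert to microseconds, split into msec.usec for verbose mode, and mark large relative deltas with '+' or '!'. The 100us threshold mirrors preempt_mark_thresh_us above; everything else is plain C rather than kernel API.

/*
 * Standalone sketch of the latency timestamp formatting: ns -> us, then
 * either a verbose msec.usec split or a terse value with a delta marker.
 */
#include <stdio.h>

#define USEC_PER_MSEC  1000ULL
#define MARK_THRESH_US 100ULL

static void print_rel(unsigned long long rel_ns)
{
	unsigned long long rel_us = rel_ns / 1000;	/* ns -> us */
	unsigned long long rel_msec = rel_us / USEC_PER_MSEC;
	unsigned long long rel_usec = rel_us % USEC_PER_MSEC;
	char mark = rel_us > MARK_THRESH_US ? '!' : rel_us > 1 ? '+' : ' ';

	printf("verbose: +%llu.%03llums   terse: %4lluus%c\n",
	       rel_msec, rel_usec, rel_us, mark);
}

int main(void)
{
	print_rel(1500);	/* 1us   -> no marker */
	print_rel(50000);	/* 50us  -> '+' */
	print_rel(250000);	/* 250us -> '!' (past the 100us threshold) */
	return 0;
}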
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index daa9980153af..412e959709b4 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -441,7 +441,7 @@ static const struct fetch_type *find_fetch_type(const char *type)
441 goto fail; 441 goto fail;
442 442
443 type++; 443 type++;
444 if (strict_strtoul(type, 0, &bs)) 444 if (kstrtoul(type, 0, &bs))
445 goto fail; 445 goto fail;
446 446
447 switch (bs) { 447 switch (bs) {
@@ -501,8 +501,8 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset)
501 501
502 tmp = strchr(symbol, '+'); 502 tmp = strchr(symbol, '+');
503 if (tmp) { 503 if (tmp) {
504 /* skip sign because strict_strtol doesn't accept '+' */ 504 /* skip sign because kstrtoul doesn't accept '+' */
505 ret = strict_strtoul(tmp + 1, 0, offset); 505 ret = kstrtoul(tmp + 1, 0, offset);
506 if (ret) 506 if (ret)
507 return ret; 507 return ret;
508 508
@@ -533,7 +533,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t,
533 else 533 else
534 ret = -EINVAL; 534 ret = -EINVAL;
535 } else if (isdigit(arg[5])) { 535 } else if (isdigit(arg[5])) {
536 ret = strict_strtoul(arg + 5, 10, &param); 536 ret = kstrtoul(arg + 5, 10, &param);
537 if (ret || param > PARAM_MAX_STACK) 537 if (ret || param > PARAM_MAX_STACK)
538 ret = -EINVAL; 538 ret = -EINVAL;
539 else { 539 else {
@@ -579,7 +579,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
579 579
580 case '@': /* memory or symbol */ 580 case '@': /* memory or symbol */
581 if (isdigit(arg[1])) { 581 if (isdigit(arg[1])) {
582 ret = strict_strtoul(arg + 1, 0, &param); 582 ret = kstrtoul(arg + 1, 0, &param);
583 if (ret) 583 if (ret)
584 break; 584 break;
585 585
@@ -597,14 +597,14 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t,
597 break; 597 break;
598 598
599 case '+': /* deref memory */ 599 case '+': /* deref memory */
600 arg++; /* Skip '+', because strict_strtol() rejects it. */ 600 arg++; /* Skip '+', because kstrtol() rejects it. */
601 case '-': 601 case '-':
602 tmp = strchr(arg, '('); 602 tmp = strchr(arg, '(');
603 if (!tmp) 603 if (!tmp)
604 break; 604 break;
605 605
606 *tmp = '\0'; 606 *tmp = '\0';
607 ret = strict_strtol(arg, 0, &offset); 607 ret = kstrtol(arg, 0, &offset);
608 608
609 if (ret) 609 if (ret)
610 break; 610 break;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 933708677814..5c7e09d10d74 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -66,7 +66,6 @@
66#define TP_FLAG_TRACE 1 66#define TP_FLAG_TRACE 1
67#define TP_FLAG_PROFILE 2 67#define TP_FLAG_PROFILE 2
68#define TP_FLAG_REGISTERED 4 68#define TP_FLAG_REGISTERED 4
69#define TP_FLAG_UPROBE 8
70 69
71 70
72/* data_rloc: data relative location, compatible with u32 */ 71/* data_rloc: data relative location, compatible with u32 */
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 7e62c0a18456..3374c792ccd8 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -102,9 +102,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
102 entry->next_cpu = task_cpu(wakee); 102 entry->next_cpu = task_cpu(wakee);
103 103
104 if (!filter_check_discard(call, entry, buffer, event)) 104 if (!filter_check_discard(call, entry, buffer, event))
105 ring_buffer_unlock_commit(buffer, event); 105 trace_buffer_unlock_commit(buffer, event, flags, pc);
106 ftrace_trace_stack(tr->buffer, flags, 6, pc);
107 ftrace_trace_userstack(tr->buffer, flags, pc);
108} 106}
109 107
110static void 108static void
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 02170c00c413..fde652c9a511 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -7,7 +7,7 @@
7 * Based on code from the latency_tracer, that is: 7 * Based on code from the latency_tracer, that is:
8 * 8 *
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 Nadia Yvette Chambers
11 */ 11 */
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
@@ -15,8 +15,8 @@
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <linux/sched/rt.h>
18#include <trace/events/sched.h> 19#include <trace/events/sched.h>
19
20#include "trace.h" 20#include "trace.h"
21 21
22static struct trace_array *wakeup_trace; 22static struct trace_array *wakeup_trace;
@@ -36,7 +36,7 @@ static void __wakeup_reset(struct trace_array *tr);
36static int wakeup_graph_entry(struct ftrace_graph_ent *trace); 36static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace); 37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
38 38
39static int save_lat_flag; 39static int save_flags;
40 40
41#define TRACE_DISPLAY_GRAPH 1 41#define TRACE_DISPLAY_GRAPH 1
42 42
@@ -540,8 +540,11 @@ static void stop_wakeup_tracer(struct trace_array *tr)
540 540
541static int __wakeup_tracer_init(struct trace_array *tr) 541static int __wakeup_tracer_init(struct trace_array *tr)
542{ 542{
543 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; 543 save_flags = trace_flags;
544 trace_flags |= TRACE_ITER_LATENCY_FMT; 544
545 /* non overwrite screws up the latency tracers */
546 set_tracer_flag(TRACE_ITER_OVERWRITE, 1);
547 set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1);
545 548
546 tracing_max_latency = 0; 549 tracing_max_latency = 0;
547 wakeup_trace = tr; 550 wakeup_trace = tr;
@@ -563,12 +566,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
563 566
564static void wakeup_tracer_reset(struct trace_array *tr) 567static void wakeup_tracer_reset(struct trace_array *tr)
565{ 568{
569 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
570 int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
571
566 stop_wakeup_tracer(tr); 572 stop_wakeup_tracer(tr);
567 /* make sure we put back any tasks we are tracing */ 573 /* make sure we put back any tasks we are tracing */
568 wakeup_reset(tr); 574 wakeup_reset(tr);
569 575
570 if (!save_lat_flag) 576 set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag);
571 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 577 set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag);
572} 578}
573 579
574static void wakeup_tracer_start(struct trace_array *tr) 580static void wakeup_tracer_start(struct trace_array *tr)
@@ -589,17 +595,18 @@ static struct tracer wakeup_tracer __read_mostly =
589 .reset = wakeup_tracer_reset, 595 .reset = wakeup_tracer_reset,
590 .start = wakeup_tracer_start, 596 .start = wakeup_tracer_start,
591 .stop = wakeup_tracer_stop, 597 .stop = wakeup_tracer_stop,
592 .print_max = 1, 598 .print_max = true,
593 .print_header = wakeup_print_header, 599 .print_header = wakeup_print_header,
594 .print_line = wakeup_print_line, 600 .print_line = wakeup_print_line,
595 .flags = &tracer_flags, 601 .flags = &tracer_flags,
596 .set_flag = wakeup_set_flag, 602 .set_flag = wakeup_set_flag,
603 .flag_changed = trace_keep_overwrite,
597#ifdef CONFIG_FTRACE_SELFTEST 604#ifdef CONFIG_FTRACE_SELFTEST
598 .selftest = trace_selftest_startup_wakeup, 605 .selftest = trace_selftest_startup_wakeup,
599#endif 606#endif
600 .open = wakeup_trace_open, 607 .open = wakeup_trace_open,
601 .close = wakeup_trace_close, 608 .close = wakeup_trace_close,
602 .use_max_tr = 1, 609 .use_max_tr = true,
603}; 610};
604 611
605static struct tracer wakeup_rt_tracer __read_mostly = 612static struct tracer wakeup_rt_tracer __read_mostly =
@@ -610,17 +617,18 @@ static struct tracer wakeup_rt_tracer __read_mostly =
610 .start = wakeup_tracer_start, 617 .start = wakeup_tracer_start,
611 .stop = wakeup_tracer_stop, 618 .stop = wakeup_tracer_stop,
612 .wait_pipe = poll_wait_pipe, 619 .wait_pipe = poll_wait_pipe,
613 .print_max = 1, 620 .print_max = true,
614 .print_header = wakeup_print_header, 621 .print_header = wakeup_print_header,
615 .print_line = wakeup_print_line, 622 .print_line = wakeup_print_line,
616 .flags = &tracer_flags, 623 .flags = &tracer_flags,
617 .set_flag = wakeup_set_flag, 624 .set_flag = wakeup_set_flag,
625 .flag_changed = trace_keep_overwrite,
618#ifdef CONFIG_FTRACE_SELFTEST 626#ifdef CONFIG_FTRACE_SELFTEST
619 .selftest = trace_selftest_startup_wakeup, 627 .selftest = trace_selftest_startup_wakeup,
620#endif 628#endif
621 .open = wakeup_trace_open, 629 .open = wakeup_trace_open,
622 .close = wakeup_trace_close, 630 .close = wakeup_trace_close,
623 .use_max_tr = 1, 631 .use_max_tr = true,
624}; 632};
625 633
626__init static int init_wakeup_tracer(void) 634__init static int init_wakeup_tracer(void)
@@ -637,4 +645,4 @@ __init static int init_wakeup_tracer(void)
637 645
638 return 0; 646 return 0;
639} 647}
640device_initcall(init_wakeup_tracer); 648core_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 2c00a691a540..51c819c12c29 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -320,7 +320,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
320 int (*func)(void)) 320 int (*func)(void))
321{ 321{
322 int save_ftrace_enabled = ftrace_enabled; 322 int save_ftrace_enabled = ftrace_enabled;
323 int save_tracer_enabled = tracer_enabled;
324 unsigned long count; 323 unsigned long count;
325 char *func_name; 324 char *func_name;
326 int ret; 325 int ret;
@@ -331,7 +330,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
331 330
332 /* enable tracing, and record the filter function */ 331 /* enable tracing, and record the filter function */
333 ftrace_enabled = 1; 332 ftrace_enabled = 1;
334 tracer_enabled = 1;
335 333
336 /* passed in by parameter to fool gcc from optimizing */ 334 /* passed in by parameter to fool gcc from optimizing */
337 func(); 335 func();
@@ -395,7 +393,6 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
395 393
396 out: 394 out:
397 ftrace_enabled = save_ftrace_enabled; 395 ftrace_enabled = save_ftrace_enabled;
398 tracer_enabled = save_tracer_enabled;
399 396
400 /* Enable tracing on all functions again */ 397 /* Enable tracing on all functions again */
401 ftrace_set_global_filter(NULL, 0, 1); 398 ftrace_set_global_filter(NULL, 0, 1);
@@ -418,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip,
418 * The ftrace infrastructure should provide the recursion 415 * The ftrace infrastructure should provide the recursion
419 * protection. If not, this will crash the kernel! 416 * protection. If not, this will crash the kernel!
420 */ 417 */
421 trace_selftest_recursion_cnt++; 418 if (trace_selftest_recursion_cnt++ > 10)
419 return;
422 DYN_FTRACE_TEST_NAME(); 420 DYN_FTRACE_TEST_NAME();
423} 421}
424 422
@@ -452,11 +450,9 @@ static int
452trace_selftest_function_recursion(void) 450trace_selftest_function_recursion(void)
453{ 451{
454 int save_ftrace_enabled = ftrace_enabled; 452 int save_ftrace_enabled = ftrace_enabled;
455 int save_tracer_enabled = tracer_enabled;
456 char *func_name; 453 char *func_name;
457 int len; 454 int len;
458 int ret; 455 int ret;
459 int cnt;
460 456
461 /* The previous test PASSED */ 457 /* The previous test PASSED */
462 pr_cont("PASSED\n"); 458 pr_cont("PASSED\n");
@@ -465,7 +461,6 @@ trace_selftest_function_recursion(void)
465 461
466 /* enable tracing, and record the filter function */ 462 /* enable tracing, and record the filter function */
467 ftrace_enabled = 1; 463 ftrace_enabled = 1;
468 tracer_enabled = 1;
469 464
470 /* Handle PPC64 '.' name */ 465 /* Handle PPC64 '.' name */
471 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 466 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
@@ -515,26 +510,16 @@ trace_selftest_function_recursion(void)
515 510
516 unregister_ftrace_function(&test_recsafe_probe); 511 unregister_ftrace_function(&test_recsafe_probe);
517 512
518 /*
519 * If arch supports all ftrace features, and no other task
520 * was on the list, we should be fine.
521 */
522 if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
523 cnt = 2; /* Should have recursed */
524 else
525 cnt = 1;
526
527 ret = -1; 513 ret = -1;
528 if (trace_selftest_recursion_cnt != cnt) { 514 if (trace_selftest_recursion_cnt != 2) {
529 pr_cont("*callback not called expected %d times (%d)* ", 515 pr_cont("*callback not called expected 2 times (%d)* ",
530 cnt, trace_selftest_recursion_cnt); 516 trace_selftest_recursion_cnt);
531 goto out; 517 goto out;
532 } 518 }
533 519
534 ret = 0; 520 ret = 0;
535out: 521out:
536 ftrace_enabled = save_ftrace_enabled; 522 ftrace_enabled = save_ftrace_enabled;
537 tracer_enabled = save_tracer_enabled;
538 523
539 return ret; 524 return ret;
540} 525}
@@ -569,13 +554,12 @@ static int
569trace_selftest_function_regs(void) 554trace_selftest_function_regs(void)
570{ 555{
571 int save_ftrace_enabled = ftrace_enabled; 556 int save_ftrace_enabled = ftrace_enabled;
572 int save_tracer_enabled = tracer_enabled;
573 char *func_name; 557 char *func_name;
574 int len; 558 int len;
575 int ret; 559 int ret;
576 int supported = 0; 560 int supported = 0;
577 561
578#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS 562#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
579 supported = 1; 563 supported = 1;
580#endif 564#endif
581 565
@@ -586,7 +570,6 @@ trace_selftest_function_regs(void)
586 570
587 /* enable tracing, and record the filter function */ 571 /* enable tracing, and record the filter function */
588 ftrace_enabled = 1; 572 ftrace_enabled = 1;
589 tracer_enabled = 1;
590 573
591 /* Handle PPC64 '.' name */ 574 /* Handle PPC64 '.' name */
592 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); 575 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
@@ -648,7 +631,6 @@ trace_selftest_function_regs(void)
648 ret = 0; 631 ret = 0;
649out: 632out:
650 ftrace_enabled = save_ftrace_enabled; 633 ftrace_enabled = save_ftrace_enabled;
651 tracer_enabled = save_tracer_enabled;
652 634
653 return ret; 635 return ret;
654} 636}
@@ -662,7 +644,6 @@ int
662trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 644trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
663{ 645{
664 int save_ftrace_enabled = ftrace_enabled; 646 int save_ftrace_enabled = ftrace_enabled;
665 int save_tracer_enabled = tracer_enabled;
666 unsigned long count; 647 unsigned long count;
667 int ret; 648 int ret;
668 649
@@ -671,7 +652,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
671 652
672 /* start the tracing */ 653 /* start the tracing */
673 ftrace_enabled = 1; 654 ftrace_enabled = 1;
674 tracer_enabled = 1;
675 655
676 ret = tracer_init(trace, tr); 656 ret = tracer_init(trace, tr);
677 if (ret) { 657 if (ret) {
@@ -708,7 +688,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
708 ret = trace_selftest_function_regs(); 688 ret = trace_selftest_function_regs();
709 out: 689 out:
710 ftrace_enabled = save_ftrace_enabled; 690 ftrace_enabled = save_ftrace_enabled;
711 tracer_enabled = save_tracer_enabled;
712 691
713 /* kill ftrace totally if we failed */ 692 /* kill ftrace totally if we failed */
714 if (ret) 693 if (ret)
@@ -1106,6 +1085,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1106 tracing_stop(); 1085 tracing_stop();
1107 /* check both trace buffers */ 1086 /* check both trace buffers */
1108 ret = trace_test_buffer(tr, NULL); 1087 ret = trace_test_buffer(tr, NULL);
1088 printk("ret = %d\n", ret);
1109 if (!ret) 1089 if (!ret)
1110 ret = trace_test_buffer(&max_tr, &count); 1090 ret = trace_test_buffer(&max_tr, &count);
1111 1091
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0c1b165778e5..83a8b5b7bd35 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -33,7 +33,6 @@ static unsigned long max_stack_size;
33static arch_spinlock_t max_stack_lock = 33static arch_spinlock_t max_stack_lock =
34 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 34 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
35 35
36static int stack_trace_disabled __read_mostly;
37static DEFINE_PER_CPU(int, trace_active); 36static DEFINE_PER_CPU(int, trace_active);
38static DEFINE_MUTEX(stack_sysctl_mutex); 37static DEFINE_MUTEX(stack_sysctl_mutex);
39 38
@@ -116,9 +115,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
116{ 115{
117 int cpu; 116 int cpu;
118 117
119 if (unlikely(!ftrace_enabled || stack_trace_disabled))
120 return;
121
122 preempt_disable_notrace(); 118 preempt_disable_notrace();
123 119
124 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
@@ -326,7 +322,7 @@ static const struct file_operations stack_trace_filter_fops = {
326 .open = stack_trace_filter_open, 322 .open = stack_trace_filter_open,
327 .read = seq_read, 323 .read = seq_read,
328 .write = ftrace_filter_write, 324 .write = ftrace_filter_write,
329 .llseek = ftrace_regex_lseek, 325 .llseek = ftrace_filter_lseek,
330 .release = ftrace_regex_release, 326 .release = ftrace_regex_release,
331}; 327};
332 328
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 2485a7d09b11..7a809e321058 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/syscalls.h>
3#include <linux/slab.h> 4#include <linux/slab.h>
4#include <linux/kernel.h> 5#include <linux/kernel.h>
5#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ 6#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
@@ -21,9 +22,6 @@ static int syscall_enter_register(struct ftrace_event_call *event,
21static int syscall_exit_register(struct ftrace_event_call *event, 22static int syscall_exit_register(struct ftrace_event_call *event,
22 enum trace_reg type, void *data); 23 enum trace_reg type, void *data);
23 24
24static int syscall_enter_define_fields(struct ftrace_event_call *call);
25static int syscall_exit_define_fields(struct ftrace_event_call *call);
26
27static struct list_head * 25static struct list_head *
28syscall_get_enter_fields(struct ftrace_event_call *call) 26syscall_get_enter_fields(struct ftrace_event_call *call)
29{ 27{
@@ -32,30 +30,6 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
32 return &entry->enter_fields; 30 return &entry->enter_fields;
33} 31}
34 32
35struct trace_event_functions enter_syscall_print_funcs = {
36 .trace = print_syscall_enter,
37};
38
39struct trace_event_functions exit_syscall_print_funcs = {
40 .trace = print_syscall_exit,
41};
42
43struct ftrace_event_class event_class_syscall_enter = {
44 .system = "syscalls",
45 .reg = syscall_enter_register,
46 .define_fields = syscall_enter_define_fields,
47 .get_fields = syscall_get_enter_fields,
48 .raw_init = init_syscall_trace,
49};
50
51struct ftrace_event_class event_class_syscall_exit = {
52 .system = "syscalls",
53 .reg = syscall_exit_register,
54 .define_fields = syscall_exit_define_fields,
55 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
56 .raw_init = init_syscall_trace,
57};
58
59extern struct syscall_metadata *__start_syscalls_metadata[]; 33extern struct syscall_metadata *__start_syscalls_metadata[];
60extern struct syscall_metadata *__stop_syscalls_metadata[]; 34extern struct syscall_metadata *__stop_syscalls_metadata[];
61 35
@@ -74,6 +48,38 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
74} 48}
75#endif 49#endif
76 50
51#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
52/*
53 * Some architectures that allow for 32bit applications
 54 * to run on a 64bit kernel do not map the syscalls for
55 * the 32bit tasks the same as they do for 64bit tasks.
56 *
57 * *cough*x86*cough*
58 *
59 * In such a case, instead of reporting the wrong syscalls,
60 * simply ignore them.
61 *
62 * For an arch to ignore the compat syscalls it needs to
63 * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
64 * define the function arch_trace_is_compat_syscall() to let
65 * the tracing system know that it should ignore it.
66 */
67static int
68trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
69{
70 if (unlikely(arch_trace_is_compat_syscall(regs)))
71 return -1;
72
73 return syscall_get_nr(task, regs);
74}
75#else
76static inline int
77trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
78{
79 return syscall_get_nr(task, regs);
80}
81#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
82
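
The comment above also spells out the contract an architecture has to meet: provide ARCH_TRACE_IGNORE_COMPAT_SYSCALLS plus an arch_trace_is_compat_syscall() helper. A minimal sketch of that arch-side hook, assuming an is_compat_task()-style predicate (loosely modeled on x86, not the literal header):

#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1

static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
{
	/* assumed helper: true for a 32bit task on a 64bit kernel */
	return is_compat_task();
}

With such a definition in place, trace_get_syscall_nr() returns -1 for compat tasks and the enter/exit hooks further down bail out before touching syscalls_metadata.
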
77static __init struct syscall_metadata * 83static __init struct syscall_metadata *
78find_syscall_meta(unsigned long syscall) 84find_syscall_meta(unsigned long syscall)
79{ 85{
@@ -104,7 +110,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
104 return syscalls_metadata[nr]; 110 return syscalls_metadata[nr];
105} 111}
106 112
107enum print_line_t 113static enum print_line_t
108print_syscall_enter(struct trace_iterator *iter, int flags, 114print_syscall_enter(struct trace_iterator *iter, int flags,
109 struct trace_event *event) 115 struct trace_event *event)
110{ 116{
@@ -157,7 +163,7 @@ end:
157 return TRACE_TYPE_HANDLED; 163 return TRACE_TYPE_HANDLED;
158} 164}
159 165
160enum print_line_t 166static enum print_line_t
161print_syscall_exit(struct trace_iterator *iter, int flags, 167print_syscall_exit(struct trace_iterator *iter, int flags,
162 struct trace_event *event) 168 struct trace_event *event)
163{ 169{
@@ -297,16 +303,16 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
297 return ret; 303 return ret;
298} 304}
299 305
300void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 306static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
301{ 307{
302 struct syscall_trace_enter *entry; 308 struct syscall_trace_enter *entry;
303 struct syscall_metadata *sys_data; 309 struct syscall_metadata *sys_data;
304 struct ring_buffer_event *event; 310 struct ring_buffer_event *event;
305 struct ring_buffer *buffer; 311 struct ring_buffer *buffer;
306 int size;
307 int syscall_nr; 312 int syscall_nr;
313 int size;
308 314
309 syscall_nr = syscall_get_nr(current, regs); 315 syscall_nr = trace_get_syscall_nr(current, regs);
310 if (syscall_nr < 0) 316 if (syscall_nr < 0)
311 return; 317 return;
312 if (!test_bit(syscall_nr, enabled_enter_syscalls)) 318 if (!test_bit(syscall_nr, enabled_enter_syscalls))
@@ -332,7 +338,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
332 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 338 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
333} 339}
334 340
335void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 341static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
336{ 342{
337 struct syscall_trace_exit *entry; 343 struct syscall_trace_exit *entry;
338 struct syscall_metadata *sys_data; 344 struct syscall_metadata *sys_data;
@@ -340,7 +346,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
340 struct ring_buffer *buffer; 346 struct ring_buffer *buffer;
341 int syscall_nr; 347 int syscall_nr;
342 348
343 syscall_nr = syscall_get_nr(current, regs); 349 syscall_nr = trace_get_syscall_nr(current, regs);
344 if (syscall_nr < 0) 350 if (syscall_nr < 0)
345 return; 351 return;
346 if (!test_bit(syscall_nr, enabled_exit_syscalls)) 352 if (!test_bit(syscall_nr, enabled_exit_syscalls))
@@ -364,7 +370,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
364 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 370 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
365} 371}
366 372
367int reg_event_syscall_enter(struct ftrace_event_call *call) 373static int reg_event_syscall_enter(struct ftrace_event_call *call)
368{ 374{
369 int ret = 0; 375 int ret = 0;
370 int num; 376 int num;
@@ -383,7 +389,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
383 return ret; 389 return ret;
384} 390}
385 391
386void unreg_event_syscall_enter(struct ftrace_event_call *call) 392static void unreg_event_syscall_enter(struct ftrace_event_call *call)
387{ 393{
388 int num; 394 int num;
389 395
@@ -398,7 +404,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
398 mutex_unlock(&syscall_trace_lock); 404 mutex_unlock(&syscall_trace_lock);
399} 405}
400 406
401int reg_event_syscall_exit(struct ftrace_event_call *call) 407static int reg_event_syscall_exit(struct ftrace_event_call *call)
402{ 408{
403 int ret = 0; 409 int ret = 0;
404 int num; 410 int num;
@@ -417,7 +423,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
417 return ret; 423 return ret;
418} 424}
419 425
420void unreg_event_syscall_exit(struct ftrace_event_call *call) 426static void unreg_event_syscall_exit(struct ftrace_event_call *call)
421{ 427{
422 int num; 428 int num;
423 429
@@ -432,7 +438,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
432 mutex_unlock(&syscall_trace_lock); 438 mutex_unlock(&syscall_trace_lock);
433} 439}
434 440
435int init_syscall_trace(struct ftrace_event_call *call) 441static int init_syscall_trace(struct ftrace_event_call *call)
436{ 442{
437 int id; 443 int id;
438 int num; 444 int num;
@@ -457,12 +463,36 @@ int init_syscall_trace(struct ftrace_event_call *call)
457 return id; 463 return id;
458} 464}
459 465
466struct trace_event_functions enter_syscall_print_funcs = {
467 .trace = print_syscall_enter,
468};
469
470struct trace_event_functions exit_syscall_print_funcs = {
471 .trace = print_syscall_exit,
472};
473
474struct ftrace_event_class event_class_syscall_enter = {
475 .system = "syscalls",
476 .reg = syscall_enter_register,
477 .define_fields = syscall_enter_define_fields,
478 .get_fields = syscall_get_enter_fields,
479 .raw_init = init_syscall_trace,
480};
481
482struct ftrace_event_class event_class_syscall_exit = {
483 .system = "syscalls",
484 .reg = syscall_exit_register,
485 .define_fields = syscall_exit_define_fields,
486 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
487 .raw_init = init_syscall_trace,
488};
489
460unsigned long __init __weak arch_syscall_addr(int nr) 490unsigned long __init __weak arch_syscall_addr(int nr)
461{ 491{
462 return (unsigned long)sys_call_table[nr]; 492 return (unsigned long)sys_call_table[nr];
463} 493}
464 494
465int __init init_ftrace_syscalls(void) 495static int __init init_ftrace_syscalls(void)
466{ 496{
467 struct syscall_metadata *meta; 497 struct syscall_metadata *meta;
468 unsigned long addr; 498 unsigned long addr;
@@ -505,7 +535,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
505 int rctx; 535 int rctx;
506 int size; 536 int size;
507 537
508 syscall_nr = syscall_get_nr(current, regs); 538 syscall_nr = trace_get_syscall_nr(current, regs);
509 if (syscall_nr < 0) 539 if (syscall_nr < 0)
510 return; 540 return;
511 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 541 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
@@ -537,7 +567,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
537 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 567 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
538} 568}
539 569
540int perf_sysenter_enable(struct ftrace_event_call *call) 570static int perf_sysenter_enable(struct ftrace_event_call *call)
541{ 571{
542 int ret = 0; 572 int ret = 0;
543 int num; 573 int num;
@@ -558,7 +588,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
558 return ret; 588 return ret;
559} 589}
560 590
561void perf_sysenter_disable(struct ftrace_event_call *call) 591static void perf_sysenter_disable(struct ftrace_event_call *call)
562{ 592{
563 int num; 593 int num;
564 594
@@ -581,7 +611,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
581 int rctx; 611 int rctx;
582 int size; 612 int size;
583 613
584 syscall_nr = syscall_get_nr(current, regs); 614 syscall_nr = trace_get_syscall_nr(current, regs);
585 if (syscall_nr < 0) 615 if (syscall_nr < 0)
586 return; 616 return;
587 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 617 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
@@ -615,7 +645,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
615 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 645 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
616} 646}
617 647
618int perf_sysexit_enable(struct ftrace_event_call *call) 648static int perf_sysexit_enable(struct ftrace_event_call *call)
619{ 649{
620 int ret = 0; 650 int ret = 0;
621 int num; 651 int num;
@@ -636,7 +666,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
636 return ret; 666 return ret;
637} 667}
638 668
639void perf_sysexit_disable(struct ftrace_event_call *call) 669static void perf_sysexit_disable(struct ftrace_event_call *call)
640{ 670{
641 int num; 671 int num;
642 672
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 03003cd7dd96..8dad2a92dee9 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -22,25 +22,27 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/uprobes.h> 23#include <linux/uprobes.h>
24#include <linux/namei.h> 24#include <linux/namei.h>
25#include <linux/string.h>
25 26
26#include "trace_probe.h" 27#include "trace_probe.h"
27 28
28#define UPROBE_EVENT_SYSTEM "uprobes" 29#define UPROBE_EVENT_SYSTEM "uprobes"
29 30
31struct trace_uprobe_filter {
32 rwlock_t rwlock;
33 int nr_systemwide;
34 struct list_head perf_events;
35};
36
30/* 37/*
31 * uprobe event core functions 38 * uprobe event core functions
32 */ 39 */
33struct trace_uprobe;
34struct uprobe_trace_consumer {
35 struct uprobe_consumer cons;
36 struct trace_uprobe *tu;
37};
38
39struct trace_uprobe { 40struct trace_uprobe {
40 struct list_head list; 41 struct list_head list;
41 struct ftrace_event_class class; 42 struct ftrace_event_class class;
42 struct ftrace_event_call call; 43 struct ftrace_event_call call;
43 struct uprobe_trace_consumer *consumer; 44 struct trace_uprobe_filter filter;
45 struct uprobe_consumer consumer;
44 struct inode *inode; 46 struct inode *inode;
45 char *filename; 47 char *filename;
46 unsigned long offset; 48 unsigned long offset;
@@ -63,6 +65,18 @@ static LIST_HEAD(uprobe_list);
63 65
64static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 66static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
65 67
68static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
69{
70 rwlock_init(&filter->rwlock);
71 filter->nr_systemwide = 0;
72 INIT_LIST_HEAD(&filter->perf_events);
73}
74
75static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
76{
77 return !filter->nr_systemwide && list_empty(&filter->perf_events);
78}
79
66/* 80/*
67 * Allocate new trace_uprobe and initialize it (including uprobes). 81 * Allocate new trace_uprobe and initialize it (including uprobes).
68 */ 82 */
@@ -91,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
91 goto error; 105 goto error;
92 106
93 INIT_LIST_HEAD(&tu->list); 107 INIT_LIST_HEAD(&tu->list);
108 tu->consumer.handler = uprobe_dispatcher;
109 init_trace_uprobe_filter(&tu->filter);
94 return tu; 110 return tu;
95 111
96error: 112error:
@@ -189,7 +205,7 @@ static int create_trace_uprobe(int argc, char **argv)
189 if (argv[0][0] == '-') 205 if (argv[0][0] == '-')
190 is_delete = true; 206 is_delete = true;
191 else if (argv[0][0] != 'p') { 207 else if (argv[0][0] != 'p') {
192 pr_info("Probe definition must be started with 'p', 'r' or" " '-'.\n"); 208 pr_info("Probe definition must be started with 'p' or '-'.\n");
193 return -EINVAL; 209 return -EINVAL;
194 } 210 }
195 211
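
For context, create_trace_uprobe() parses what userspace writes into the uprobe_events control file: 'p' registers a probe at PATH:OFFSET, '-' deletes one. A userspace sketch, with the debugfs mount point, binary path and offset all illustrative assumptions:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/uprobe_events", "w");

	if (!f)
		return 1;
	fprintf(f, "p:uprobes/bash_probe /bin/bash:0x4245c0\n");	/* add */
	fprintf(f, "-:uprobes/bash_probe\n");				/* delete */
	return fclose(f) ? 1 : 0;
}
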
@@ -252,27 +268,32 @@ static int create_trace_uprobe(int argc, char **argv)
252 if (ret) 268 if (ret)
253 goto fail_address_parse; 269 goto fail_address_parse;
254 270
255 ret = strict_strtoul(arg, 0, &offset); 271 inode = igrab(path.dentry->d_inode);
256 if (ret) 272 path_put(&path);
273
274 if (!inode || !S_ISREG(inode->i_mode)) {
275 ret = -EINVAL;
257 goto fail_address_parse; 276 goto fail_address_parse;
277 }
258 278
259 inode = igrab(path.dentry->d_inode); 279 ret = kstrtoul(arg, 0, &offset);
280 if (ret)
281 goto fail_address_parse;
260 282
261 argc -= 2; 283 argc -= 2;
262 argv += 2; 284 argv += 2;
263 285
264 /* setup a probe */ 286 /* setup a probe */
265 if (!event) { 287 if (!event) {
266 char *tail = strrchr(filename, '/'); 288 char *tail;
267 char *ptr; 289 char *ptr;
268 290
269 ptr = kstrdup((tail ? tail + 1 : filename), GFP_KERNEL); 291 tail = kstrdup(kbasename(filename), GFP_KERNEL);
270 if (!ptr) { 292 if (!tail) {
271 ret = -ENOMEM; 293 ret = -ENOMEM;
272 goto fail_address_parse; 294 goto fail_address_parse;
273 } 295 }
274 296
275 tail = ptr;
276 ptr = strpbrk(tail, ".-_"); 297 ptr = strpbrk(tail, ".-_");
277 if (ptr) 298 if (ptr)
278 *ptr = '\0'; 299 *ptr = '\0';
@@ -356,7 +377,7 @@ fail_address_parse:
356 if (inode) 377 if (inode)
357 iput(inode); 378 iput(inode);
358 379
359 pr_info("Failed to parse address.\n"); 380 pr_info("Failed to parse address or file.\n");
360 381
361 return ret; 382 return ret;
362} 383}
@@ -465,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = {
465}; 486};
466 487
467/* uprobe handler */ 488/* uprobe handler */
468static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) 489static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
469{ 490{
470 struct uprobe_trace_entry_head *entry; 491 struct uprobe_trace_entry_head *entry;
471 struct ring_buffer_event *event; 492 struct ring_buffer_event *event;
@@ -475,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
475 unsigned long irq_flags; 496 unsigned long irq_flags;
476 struct ftrace_event_call *call = &tu->call; 497 struct ftrace_event_call *call = &tu->call;
477 498
478 tu->nhit++;
479
480 local_save_flags(irq_flags); 499 local_save_flags(irq_flags);
481 pc = preempt_count(); 500 pc = preempt_count();
482 501
@@ -485,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
485 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 504 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
486 size, irq_flags, pc); 505 size, irq_flags, pc);
487 if (!event) 506 if (!event)
488 return; 507 return 0;
489 508
490 entry = ring_buffer_event_data(event); 509 entry = ring_buffer_event_data(event);
491 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); 510 entry->ip = instruction_pointer(task_pt_regs(current));
492 data = (u8 *)&entry[1]; 511 data = (u8 *)&entry[1];
493 for (i = 0; i < tu->nr_args; i++) 512 for (i = 0; i < tu->nr_args; i++)
494 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 513 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
495 514
496 if (!filter_current_check_discard(buffer, call, entry, event)) 515 if (!filter_current_check_discard(buffer, call, entry, event))
497 trace_buffer_unlock_commit(buffer, event, irq_flags, pc); 516 trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
517
518 return 0;
498} 519}
499 520
500/* Event entry printers */ 521/* Event entry printers */
@@ -533,42 +554,43 @@ partial:
533 return TRACE_TYPE_PARTIAL_LINE; 554 return TRACE_TYPE_PARTIAL_LINE;
534} 555}
535 556
536static int probe_event_enable(struct trace_uprobe *tu, int flag) 557static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
537{ 558{
538 struct uprobe_trace_consumer *utc; 559 return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
539 int ret = 0; 560}
540 561
541 if (!tu->inode || tu->consumer) 562typedef bool (*filter_func_t)(struct uprobe_consumer *self,
542 return -EINTR; 563 enum uprobe_filter_ctx ctx,
564 struct mm_struct *mm);
543 565
544 utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); 566static int
545 if (!utc) 567probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
568{
569 int ret = 0;
570
571 if (is_trace_uprobe_enabled(tu))
546 return -EINTR; 572 return -EINTR;
547 573
548 utc->cons.handler = uprobe_dispatcher; 574 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
549 utc->cons.filter = NULL;
550 ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
551 if (ret) {
552 kfree(utc);
553 return ret;
554 }
555 575
556 tu->flags |= flag; 576 tu->flags |= flag;
557 utc->tu = tu; 577 tu->consumer.filter = filter;
558 tu->consumer = utc; 578 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
579 if (ret)
580 tu->flags &= ~flag;
559 581
560 return 0; 582 return ret;
561} 583}
562 584
563static void probe_event_disable(struct trace_uprobe *tu, int flag) 585static void probe_event_disable(struct trace_uprobe *tu, int flag)
564{ 586{
565 if (!tu->inode || !tu->consumer) 587 if (!is_trace_uprobe_enabled(tu))
566 return; 588 return;
567 589
568 uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); 590 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
591
592 uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
569 tu->flags &= ~flag; 593 tu->flags &= ~flag;
570 kfree(tu->consumer);
571 tu->consumer = NULL;
572} 594}
573 595
574static int uprobe_event_define_fields(struct ftrace_event_call *event_call) 596static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -642,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu)
642} 664}
643 665
644#ifdef CONFIG_PERF_EVENTS 666#ifdef CONFIG_PERF_EVENTS
667static bool
668__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
669{
670 struct perf_event *event;
671
672 if (filter->nr_systemwide)
673 return true;
674
675 list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
676 if (event->hw.tp_target->mm == mm)
677 return true;
678 }
679
680 return false;
681}
682
683static inline bool
684uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
685{
686 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
687}
688
689static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
690{
691 bool done;
692
693 write_lock(&tu->filter.rwlock);
694 if (event->hw.tp_target) {
695 /*
 696 * event->parent != NULL means copy_process(), so we can avoid
697 * uprobe_apply(). current->mm must be probed and we can rely
698 * on dup_mmap() which preserves the already installed bp's.
699 *
700 * attr.enable_on_exec means that exec/mmap will install the
701 * breakpoints we need.
702 */
703 done = tu->filter.nr_systemwide ||
704 event->parent || event->attr.enable_on_exec ||
705 uprobe_filter_event(tu, event);
706 list_add(&event->hw.tp_list, &tu->filter.perf_events);
707 } else {
708 done = tu->filter.nr_systemwide;
709 tu->filter.nr_systemwide++;
710 }
711 write_unlock(&tu->filter.rwlock);
712
713 if (!done)
714 uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
715
716 return 0;
717}
718
719static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
720{
721 bool done;
722
723 write_lock(&tu->filter.rwlock);
724 if (event->hw.tp_target) {
725 list_del(&event->hw.tp_list);
726 done = tu->filter.nr_systemwide ||
727 (event->hw.tp_target->flags & PF_EXITING) ||
728 uprobe_filter_event(tu, event);
729 } else {
730 tu->filter.nr_systemwide--;
731 done = tu->filter.nr_systemwide;
732 }
733 write_unlock(&tu->filter.rwlock);
734
735 if (!done)
736 uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
737
738 return 0;
739}
740
741static bool uprobe_perf_filter(struct uprobe_consumer *uc,
742 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
743{
744 struct trace_uprobe *tu;
745 int ret;
746
747 tu = container_of(uc, struct trace_uprobe, consumer);
748 read_lock(&tu->filter.rwlock);
749 ret = __uprobe_perf_filter(&tu->filter, mm);
750 read_unlock(&tu->filter.rwlock);
751
752 return ret;
753}
754
645/* uprobe profile handler */ 755/* uprobe profile handler */
646static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) 756static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
647{ 757{
648 struct ftrace_event_call *call = &tu->call; 758 struct ftrace_event_call *call = &tu->call;
649 struct uprobe_trace_entry_head *entry; 759 struct uprobe_trace_entry_head *entry;
@@ -652,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
652 int size, __size, i; 762 int size, __size, i;
653 int rctx; 763 int rctx;
654 764
765 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
766 return UPROBE_HANDLER_REMOVE;
767
655 __size = sizeof(*entry) + tu->size; 768 __size = sizeof(*entry) + tu->size;
656 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 769 size = ALIGN(__size + sizeof(u32), sizeof(u64));
657 size -= sizeof(u32); 770 size -= sizeof(u32);
658 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) 771 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
659 return; 772 return 0;
660 773
661 preempt_disable(); 774 preempt_disable();
662 775
@@ -664,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
664 if (!entry) 777 if (!entry)
665 goto out; 778 goto out;
666 779
667 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); 780 entry->ip = instruction_pointer(task_pt_regs(current));
668 data = (u8 *)&entry[1]; 781 data = (u8 *)&entry[1];
669 for (i = 0; i < tu->nr_args; i++) 782 for (i = 0; i < tu->nr_args; i++)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 783 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
@@ -674,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
674 787
675 out: 788 out:
676 preempt_enable(); 789 preempt_enable();
790 return 0;
677} 791}
678#endif /* CONFIG_PERF_EVENTS */ 792#endif /* CONFIG_PERF_EVENTS */
679 793
@@ -684,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
684 798
685 switch (type) { 799 switch (type) {
686 case TRACE_REG_REGISTER: 800 case TRACE_REG_REGISTER:
687 return probe_event_enable(tu, TP_FLAG_TRACE); 801 return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
688 802
689 case TRACE_REG_UNREGISTER: 803 case TRACE_REG_UNREGISTER:
690 probe_event_disable(tu, TP_FLAG_TRACE); 804 probe_event_disable(tu, TP_FLAG_TRACE);
@@ -692,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
692 806
693#ifdef CONFIG_PERF_EVENTS 807#ifdef CONFIG_PERF_EVENTS
694 case TRACE_REG_PERF_REGISTER: 808 case TRACE_REG_PERF_REGISTER:
695 return probe_event_enable(tu, TP_FLAG_PROFILE); 809 return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
696 810
697 case TRACE_REG_PERF_UNREGISTER: 811 case TRACE_REG_PERF_UNREGISTER:
698 probe_event_disable(tu, TP_FLAG_PROFILE); 812 probe_event_disable(tu, TP_FLAG_PROFILE);
699 return 0; 813 return 0;
814
815 case TRACE_REG_PERF_OPEN:
816 return uprobe_perf_open(tu, data);
817
818 case TRACE_REG_PERF_CLOSE:
819 return uprobe_perf_close(tu, data);
820
700#endif 821#endif
701 default: 822 default:
702 return 0; 823 return 0;
@@ -706,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
706 827
707static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) 828static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
708{ 829{
709 struct uprobe_trace_consumer *utc;
710 struct trace_uprobe *tu; 830 struct trace_uprobe *tu;
831 int ret = 0;
711 832
712 utc = container_of(con, struct uprobe_trace_consumer, cons); 833 tu = container_of(con, struct trace_uprobe, consumer);
713 tu = utc->tu; 834 tu->nhit++;
714 if (!tu || tu->consumer != utc)
715 return 0;
716 835
717 if (tu->flags & TP_FLAG_TRACE) 836 if (tu->flags & TP_FLAG_TRACE)
718 uprobe_trace_func(tu, regs); 837 ret |= uprobe_trace_func(tu, regs);
719 838
720#ifdef CONFIG_PERF_EVENTS 839#ifdef CONFIG_PERF_EVENTS
721 if (tu->flags & TP_FLAG_PROFILE) 840 if (tu->flags & TP_FLAG_PROFILE)
722 uprobe_perf_func(tu, regs); 841 ret |= uprobe_perf_func(tu, regs);
723#endif 842#endif
724 return 0; 843 return ret;
725} 844}
726 845
727static struct trace_event_functions uprobe_funcs = { 846static struct trace_event_functions uprobe_funcs = {
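
The reworked dispatcher above ORs its handlers' return values, so a consumer can now ask the uprobe core to drop the breakpoint from an uninteresting mm. A hedged sketch of a consumer using that convention (watched_tgid and my_consumer are illustrative, not part of this patch):

static pid_t watched_tgid;	/* illustrative filter state */

static int my_handler(struct uprobe_consumer *uc, struct pt_regs *regs)
{
	if (current->tgid != watched_tgid)
		return UPROBE_HANDLER_REMOVE;	/* unhook from this mm */

	pr_info("probe hit at %lx\n", instruction_pointer(regs));
	return 0;
}

static struct uprobe_consumer my_consumer = {
	.handler = my_handler,
};

Arming it would go through uprobe_register(inode, offset, &my_consumer), which is what probe_event_enable() now does with tu->consumer.
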
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index d96ba22dabfa..0c05a4592047 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -192,12 +192,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
192static struct tracepoint_entry *get_tracepoint(const char *name) 192static struct tracepoint_entry *get_tracepoint(const char *name)
193{ 193{
194 struct hlist_head *head; 194 struct hlist_head *head;
195 struct hlist_node *node;
196 struct tracepoint_entry *e; 195 struct tracepoint_entry *e;
197 u32 hash = jhash(name, strlen(name), 0); 196 u32 hash = jhash(name, strlen(name), 0);
198 197
199 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; 198 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
200 hlist_for_each_entry(e, node, head, hlist) { 199 hlist_for_each_entry(e, head, hlist) {
201 if (!strcmp(name, e->name)) 200 if (!strcmp(name, e->name))
202 return e; 201 return e;
203 } 202 }
@@ -211,13 +210,12 @@ static struct tracepoint_entry *get_tracepoint(const char *name)
211static struct tracepoint_entry *add_tracepoint(const char *name) 210static struct tracepoint_entry *add_tracepoint(const char *name)
212{ 211{
213 struct hlist_head *head; 212 struct hlist_head *head;
214 struct hlist_node *node;
215 struct tracepoint_entry *e; 213 struct tracepoint_entry *e;
216 size_t name_len = strlen(name) + 1; 214 size_t name_len = strlen(name) + 1;
217 u32 hash = jhash(name, name_len-1, 0); 215 u32 hash = jhash(name, name_len-1, 0);
218 216
219 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; 217 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
220 hlist_for_each_entry(e, node, head, hlist) { 218 hlist_for_each_entry(e, head, hlist) {
221 if (!strcmp(name, e->name)) { 219 if (!strcmp(name, e->name)) {
222 printk(KERN_NOTICE 220 printk(KERN_NOTICE
223 "tracepoint %s busy\n", name); 221 "tracepoint %s busy\n", name);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 625df0b44690..a1dd9a1b1327 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -32,6 +32,7 @@ void bacct_add_tsk(struct user_namespace *user_ns,
32{ 32{
33 const struct cred *tcred; 33 const struct cred *tcred;
34 struct timespec uptime, ts; 34 struct timespec uptime, ts;
35 cputime_t utime, stime, utimescaled, stimescaled;
35 u64 ac_etime; 36 u64 ac_etime;
36 37
37 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); 38 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
@@ -65,10 +66,15 @@ void bacct_add_tsk(struct user_namespace *user_ns,
65 stats->ac_ppid = pid_alive(tsk) ? 66 stats->ac_ppid = pid_alive(tsk) ?
66 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; 67 task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;
67 rcu_read_unlock(); 68 rcu_read_unlock();
68 stats->ac_utime = cputime_to_usecs(tsk->utime); 69
69 stats->ac_stime = cputime_to_usecs(tsk->stime); 70 task_cputime(tsk, &utime, &stime);
70 stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); 71 stats->ac_utime = cputime_to_usecs(utime);
71 stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); 72 stats->ac_stime = cputime_to_usecs(stime);
73
74 task_cputime_scaled(tsk, &utimescaled, &stimescaled);
75 stats->ac_utimescaled = cputime_to_usecs(utimescaled);
76 stats->ac_stimescaled = cputime_to_usecs(stimescaled);
77
72 stats->ac_minflt = tsk->min_flt; 78 stats->ac_minflt = tsk->min_flt;
73 stats->ac_majflt = tsk->maj_flt; 79 stats->ac_majflt = tsk->maj_flt;
74 80
@@ -115,11 +121,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
115#undef KB 121#undef KB
116#undef MB 122#undef MB
117 123
118/** 124static void __acct_update_integrals(struct task_struct *tsk,
119 * acct_update_integrals - update mm integral fields in task_struct 125 cputime_t utime, cputime_t stime)
120 * @tsk: task_struct for accounting
121 */
122void acct_update_integrals(struct task_struct *tsk)
123{ 126{
124 if (likely(tsk->mm)) { 127 if (likely(tsk->mm)) {
125 cputime_t time, dtime; 128 cputime_t time, dtime;
@@ -128,7 +131,7 @@ void acct_update_integrals(struct task_struct *tsk)
128 u64 delta; 131 u64 delta;
129 132
130 local_irq_save(flags); 133 local_irq_save(flags);
131 time = tsk->stime + tsk->utime; 134 time = stime + utime;
132 dtime = time - tsk->acct_timexpd; 135 dtime = time - tsk->acct_timexpd;
133 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 136 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
134 delta = value.tv_sec; 137 delta = value.tv_sec;
@@ -145,6 +148,27 @@ void acct_update_integrals(struct task_struct *tsk)
145} 148}
146 149
147/** 150/**
151 * acct_update_integrals - update mm integral fields in task_struct
152 * @tsk: task_struct for accounting
153 */
154void acct_update_integrals(struct task_struct *tsk)
155{
156 cputime_t utime, stime;
157
158 task_cputime(tsk, &utime, &stime);
159 __acct_update_integrals(tsk, utime, stime);
160}
161
162/**
163 * acct_account_cputime - update mm integral after cputime update
164 * @tsk: task_struct for accounting
165 */
166void acct_account_cputime(struct task_struct *tsk)
167{
168 __acct_update_integrals(tsk, tsk->utime, tsk->stime);
169}
170
171/**
148 * acct_clear_integrals - clear the mm integral fields in task_struct 172 * acct_clear_integrals - clear the mm integral fields in task_struct
149 * @tsk: task_struct whose accounting fields are cleared 173 * @tsk: task_struct whose accounting fields are cleared
150 */ 174 */
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 1744bb80f1fb..394f70b17162 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -34,11 +34,11 @@ EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
34void fire_user_return_notifiers(void) 34void fire_user_return_notifiers(void)
35{ 35{
36 struct user_return_notifier *urn; 36 struct user_return_notifier *urn;
37 struct hlist_node *tmp1, *tmp2; 37 struct hlist_node *tmp2;
38 struct hlist_head *head; 38 struct hlist_head *head;
39 39
40 head = &get_cpu_var(return_notifier_list); 40 head = &get_cpu_var(return_notifier_list);
41 hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link) 41 hlist_for_each_entry_safe(urn, tmp2, head, link)
42 urn->on_user_return(urn); 42 urn->on_user_return(urn);
43 put_cpu_var(return_notifier_list); 43 put_cpu_var(return_notifier_list);
44} 44}
diff --git a/kernel/user.c b/kernel/user.c
index 750acffbe9ec..8e635a18ab52 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/export.h> 17#include <linux/export.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include <linux/proc_fs.h>
19 20
20/* 21/*
21 * userns count is 1 for root user, 1 for init_uts_ns, 22 * userns count is 1 for root user, 1 for init_uts_ns,
@@ -46,11 +47,12 @@ struct user_namespace init_user_ns = {
46 .count = 4294967295U, 47 .count = 4294967295U,
47 }, 48 },
48 }, 49 },
49 .kref = { 50 .count = ATOMIC_INIT(3),
50 .refcount = ATOMIC_INIT(3),
51 },
52 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
53 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO,
54 .may_mount_sysfs = true,
55 .may_mount_proc = true,
54}; 56};
55EXPORT_SYMBOL_GPL(init_user_ns); 57EXPORT_SYMBOL_GPL(init_user_ns);
56 58
@@ -105,9 +107,8 @@ static void uid_hash_remove(struct user_struct *up)
105static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) 107static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
106{ 108{
107 struct user_struct *user; 109 struct user_struct *user;
108 struct hlist_node *h;
109 110
110 hlist_for_each_entry(user, h, hashent, uidhash_node) { 111 hlist_for_each_entry(user, hashent, uidhash_node) {
111 if (uid_eq(user->uid, uid)) { 112 if (uid_eq(user->uid, uid)) {
112 atomic_inc(&user->__count); 113 atomic_inc(&user->__count);
113 return user; 114 return user;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 456a6b9fba34..a54f26f82eb2 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/proc_fs.h>
12#include <linux/highuid.h> 13#include <linux/highuid.h>
13#include <linux/cred.h> 14#include <linux/cred.h>
14#include <linux/securebits.h> 15#include <linux/securebits.h>
@@ -20,12 +21,31 @@
20#include <linux/uaccess.h> 21#include <linux/uaccess.h>
21#include <linux/ctype.h> 22#include <linux/ctype.h>
22#include <linux/projid.h> 23#include <linux/projid.h>
24#include <linux/fs_struct.h>
23 25
24static struct kmem_cache *user_ns_cachep __read_mostly; 26static struct kmem_cache *user_ns_cachep __read_mostly;
25 27
26static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 28static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
27 struct uid_gid_map *map); 29 struct uid_gid_map *map);
28 30
31static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
32{
33 /* Start with the same capabilities as init but useless for doing
34 * anything as the capabilities are bound to the new user namespace.
35 */
36 cred->securebits = SECUREBITS_DEFAULT;
37 cred->cap_inheritable = CAP_EMPTY_SET;
38 cred->cap_permitted = CAP_FULL_SET;
39 cred->cap_effective = CAP_FULL_SET;
40 cred->cap_bset = CAP_FULL_SET;
41#ifdef CONFIG_KEYS
42 key_put(cred->request_key_auth);
43 cred->request_key_auth = NULL;
44#endif
45 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
46 cred->user_ns = user_ns;
47}
48
29/* 49/*
30 * Create a new user namespace, deriving the creator from the user in the 50 * Create a new user namespace, deriving the creator from the user in the
31 * passed credentials, and replacing that user with the new root user for the 51 * passed credentials, and replacing that user with the new root user for the
@@ -39,6 +59,16 @@ int create_user_ns(struct cred *new)
39 struct user_namespace *ns, *parent_ns = new->user_ns; 59 struct user_namespace *ns, *parent_ns = new->user_ns;
40 kuid_t owner = new->euid; 60 kuid_t owner = new->euid;
41 kgid_t group = new->egid; 61 kgid_t group = new->egid;
62 int ret;
63
64 /*
65 * Verify that we can not violate the policy of which files
66 * may be accessed that is specified by the root directory,
 67 * by verifying that the root directory is at the root of the
68 * mount namespace which allows all files to be accessed.
69 */
70 if (current_chrooted())
71 return -EPERM;
42 72
43 /* The creator needs a mapping in the parent user namespace 73 /* The creator needs a mapping in the parent user namespace
44 * or else we won't be able to reasonably tell userspace who 74 * or else we won't be able to reasonably tell userspace who
@@ -52,40 +82,50 @@ int create_user_ns(struct cred *new)
52 if (!ns) 82 if (!ns)
53 return -ENOMEM; 83 return -ENOMEM;
54 84
55 kref_init(&ns->kref); 85 ret = proc_alloc_inum(&ns->proc_inum);
86 if (ret) {
87 kmem_cache_free(user_ns_cachep, ns);
88 return ret;
89 }
90
91 atomic_set(&ns->count, 1);
92 /* Leave the new->user_ns reference with the new user namespace. */
56 ns->parent = parent_ns; 93 ns->parent = parent_ns;
57 ns->owner = owner; 94 ns->owner = owner;
58 ns->group = group; 95 ns->group = group;
59 96
60 /* Start with the same capabilities as init but useless for doing 97 set_cred_user_ns(new, ns);
61 * anything as the capabilities are bound to the new user namespace.
62 */
63 new->securebits = SECUREBITS_DEFAULT;
64 new->cap_inheritable = CAP_EMPTY_SET;
65 new->cap_permitted = CAP_FULL_SET;
66 new->cap_effective = CAP_FULL_SET;
67 new->cap_bset = CAP_FULL_SET;
68#ifdef CONFIG_KEYS
69 key_put(new->request_key_auth);
70 new->request_key_auth = NULL;
71#endif
72 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
73 98
74 /* Leave the new->user_ns reference with the new user namespace. */ 99 update_mnt_policy(ns);
75 /* Leave the reference to our user_ns with the new cred. */
76 new->user_ns = ns;
77 100
78 return 0; 101 return 0;
79} 102}
80 103
81void free_user_ns(struct kref *kref) 104int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
82{ 105{
83 struct user_namespace *parent, *ns = 106 struct cred *cred;
84 container_of(kref, struct user_namespace, kref); 107
108 if (!(unshare_flags & CLONE_NEWUSER))
109 return 0;
85 110
86 parent = ns->parent; 111 cred = prepare_creds();
87 kmem_cache_free(user_ns_cachep, ns); 112 if (!cred)
88 put_user_ns(parent); 113 return -ENOMEM;
114
115 *new_cred = cred;
116 return create_user_ns(cred);
117}
118
119void free_user_ns(struct user_namespace *ns)
120{
121 struct user_namespace *parent;
122
123 do {
124 parent = ns->parent;
125 proc_free_inum(ns->proc_inum);
126 kmem_cache_free(user_ns_cachep, ns);
127 ns = parent;
128 } while (atomic_dec_and_test(&parent->count));
89} 129}
90EXPORT_SYMBOL(free_user_ns); 130EXPORT_SYMBOL(free_user_ns);
91 131
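
free_user_ns() now tears down the parent chain iteratively instead of recursing. The entry point that feeds it is the put side, sketched here from the expected header helper (details may differ):

static inline void put_user_ns(struct user_namespace *ns)
{
	if (ns && atomic_dec_and_test(&ns->count))
		free_user_ns(ns);	/* walks ns->parent, freeing as it goes */
}
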
@@ -372,7 +412,7 @@ static int uid_m_show(struct seq_file *seq, void *v)
372 struct user_namespace *lower_ns; 412 struct user_namespace *lower_ns;
373 uid_t lower; 413 uid_t lower;
374 414
375 lower_ns = current_user_ns(); 415 lower_ns = seq_user_ns(seq);
376 if ((lower_ns == ns) && lower_ns->parent) 416 if ((lower_ns == ns) && lower_ns->parent)
377 lower_ns = lower_ns->parent; 417 lower_ns = lower_ns->parent;
378 418
@@ -393,7 +433,7 @@ static int gid_m_show(struct seq_file *seq, void *v)
393 struct user_namespace *lower_ns; 433 struct user_namespace *lower_ns;
394 gid_t lower; 434 gid_t lower;
395 435
396 lower_ns = current_user_ns(); 436 lower_ns = seq_user_ns(seq);
397 if ((lower_ns == ns) && lower_ns->parent) 437 if ((lower_ns == ns) && lower_ns->parent)
398 lower_ns = lower_ns->parent; 438 lower_ns = lower_ns->parent;
399 439
@@ -492,6 +532,42 @@ struct seq_operations proc_projid_seq_operations = {
492 .show = projid_m_show, 532 .show = projid_m_show,
493}; 533};
494 534
535static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent)
536{
537 u32 upper_first, lower_first, upper_last, lower_last;
538 unsigned idx;
539
540 upper_first = extent->first;
541 lower_first = extent->lower_first;
542 upper_last = upper_first + extent->count - 1;
543 lower_last = lower_first + extent->count - 1;
544
545 for (idx = 0; idx < new_map->nr_extents; idx++) {
546 u32 prev_upper_first, prev_lower_first;
547 u32 prev_upper_last, prev_lower_last;
548 struct uid_gid_extent *prev;
549
550 prev = &new_map->extent[idx];
551
552 prev_upper_first = prev->first;
553 prev_lower_first = prev->lower_first;
554 prev_upper_last = prev_upper_first + prev->count - 1;
555 prev_lower_last = prev_lower_first + prev->count - 1;
556
557 /* Does the upper range intersect a previous extent? */
558 if ((prev_upper_first <= upper_last) &&
559 (prev_upper_last >= upper_first))
560 return true;
561
562 /* Does the lower range intersect a previous extent? */
563 if ((prev_lower_first <= lower_last) &&
564 (prev_lower_last >= lower_first))
565 return true;
566 }
567 return false;
568}
569
570
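
mappings_overlap() is the classic closed-interval intersection test, applied once to the upper (namespace-side) range and once to the lower (parent-side) range of every previously accepted extent. Boiled down:

/* [a_first, a_last] and [b_first, b_last] intersect iff: */
static bool ranges_intersect(u32 a_first, u32 a_last, u32 b_first, u32 b_last)
{
	return a_first <= b_last && a_last >= b_first;
}

/* e.g. 0..999 vs 500..1499 -> true, 0..999 vs 1000..1999 -> false */
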
495static DEFINE_MUTEX(id_map_mutex); 571static DEFINE_MUTEX(id_map_mutex);
496 572
497static ssize_t map_write(struct file *file, const char __user *buf, 573static ssize_t map_write(struct file *file, const char __user *buf,
@@ -504,7 +580,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
504 struct user_namespace *ns = seq->private; 580 struct user_namespace *ns = seq->private;
505 struct uid_gid_map new_map; 581 struct uid_gid_map new_map;
506 unsigned idx; 582 unsigned idx;
507 struct uid_gid_extent *extent, *last = NULL; 583 struct uid_gid_extent *extent = NULL;
508 unsigned long page = 0; 584 unsigned long page = 0;
509 char *kbuf, *pos, *next_line; 585 char *kbuf, *pos, *next_line;
510 ssize_t ret = -EINVAL; 586 ssize_t ret = -EINVAL;
@@ -607,14 +683,11 @@ static ssize_t map_write(struct file *file, const char __user *buf,
607 if ((extent->lower_first + extent->count) <= extent->lower_first) 683 if ((extent->lower_first + extent->count) <= extent->lower_first)
608 goto out; 684 goto out;
609 685
610 /* For now only accept extents that are strictly in order */ 686 /* Do the ranges in extent overlap any previous extents? */
611 if (last && 687 if (mappings_overlap(&new_map, extent))
612 (((last->first + last->count) > extent->first) ||
613 ((last->lower_first + last->count) > extent->lower_first)))
614 goto out; 688 goto out;
615 689
616 new_map.nr_extents++; 690 new_map.nr_extents++;
617 last = extent;
618 691
619 /* Fail if the file contains too many extents */ 692 /* Fail if the file contains too many extents */
620 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) && 693 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
@@ -669,10 +742,14 @@ ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t siz
669{ 742{
670 struct seq_file *seq = file->private_data; 743 struct seq_file *seq = file->private_data;
671 struct user_namespace *ns = seq->private; 744 struct user_namespace *ns = seq->private;
745 struct user_namespace *seq_ns = seq_user_ns(seq);
672 746
673 if (!ns->parent) 747 if (!ns->parent)
674 return -EPERM; 748 return -EPERM;
675 749
750 if ((seq_ns != ns) && (seq_ns != ns->parent))
751 return -EPERM;
752
676 return map_write(file, buf, size, ppos, CAP_SETUID, 753 return map_write(file, buf, size, ppos, CAP_SETUID,
677 &ns->uid_map, &ns->parent->uid_map); 754 &ns->uid_map, &ns->parent->uid_map);
678} 755}
@@ -681,10 +758,14 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz
681{ 758{
682 struct seq_file *seq = file->private_data; 759 struct seq_file *seq = file->private_data;
683 struct user_namespace *ns = seq->private; 760 struct user_namespace *ns = seq->private;
761 struct user_namespace *seq_ns = seq_user_ns(seq);
684 762
685 if (!ns->parent) 763 if (!ns->parent)
686 return -EPERM; 764 return -EPERM;
687 765
766 if ((seq_ns != ns) && (seq_ns != ns->parent))
767 return -EPERM;
768
688 return map_write(file, buf, size, ppos, CAP_SETGID, 769 return map_write(file, buf, size, ppos, CAP_SETGID,
689 &ns->gid_map, &ns->parent->gid_map); 770 &ns->gid_map, &ns->parent->gid_map);
690} 771}
@@ -709,6 +790,21 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
709static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 790static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
710 struct uid_gid_map *new_map) 791 struct uid_gid_map *new_map)
711{ 792{
793 /* Allow mapping to your own filesystem ids */
794 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) {
795 u32 id = new_map->extent[0].lower_first;
796 if (cap_setid == CAP_SETUID) {
797 kuid_t uid = make_kuid(ns->parent, id);
798 if (uid_eq(uid, current_fsuid()))
799 return true;
800 }
801 else if (cap_setid == CAP_SETGID) {
802 kgid_t gid = make_kgid(ns->parent, id);
803 if (gid_eq(gid, current_fsgid()))
804 return true;
805 }
806 }
807
712 /* Allow anyone to set a mapping that doesn't require privilege */ 808 /* Allow anyone to set a mapping that doesn't require privilege */
713 if (!cap_valid(cap_setid)) 809 if (!cap_valid(cap_setid))
714 return true; 810 return true;
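
The new single-extent rule is what lets a completely unprivileged process set up its own mapping after unshare(CLONE_NEWUSER). A userspace sketch under that assumption (later kernels additionally require handling /proc/self/setgroups before writing gid_map):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	int fd;

	if (unshare(CLONE_NEWUSER))		/* become root of a new user ns */
		return 1;
	snprintf(buf, sizeof(buf), "0 %u 1\n", (unsigned)getuid());
	fd = open("/proc/self/uid_map", O_WRONLY);
	if (fd < 0 || write(fd, buf, strlen(buf)) < 0)
		return 1;			/* single extent: 0 -> our fsuid */
	close(fd);
	return 0;
}
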
@@ -722,6 +818,68 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
722 return false; 818 return false;
723} 819}
724 820
821static void *userns_get(struct task_struct *task)
822{
823 struct user_namespace *user_ns;
824
825 rcu_read_lock();
826 user_ns = get_user_ns(__task_cred(task)->user_ns);
827 rcu_read_unlock();
828
829 return user_ns;
830}
831
832static void userns_put(void *ns)
833{
834 put_user_ns(ns);
835}
836
837static int userns_install(struct nsproxy *nsproxy, void *ns)
838{
839 struct user_namespace *user_ns = ns;
840 struct cred *cred;
841
842 /* Don't allow gaining capabilities by reentering
843 * the same user namespace.
844 */
845 if (user_ns == current_user_ns())
846 return -EINVAL;
847
848 /* Threaded processes may not enter a different user namespace */
849 if (atomic_read(&current->mm->mm_users) > 1)
850 return -EINVAL;
851
852 if (current->fs->users != 1)
853 return -EINVAL;
854
855 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
856 return -EPERM;
857
858 cred = prepare_creds();
859 if (!cred)
860 return -ENOMEM;
861
862 put_user_ns(cred->user_ns);
863 set_cred_user_ns(cred, get_user_ns(user_ns));
864
865 return commit_creds(cred);
866}
867
868static unsigned int userns_inum(void *ns)
869{
870 struct user_namespace *user_ns = ns;
871 return user_ns->proc_inum;
872}
873
874const struct proc_ns_operations userns_operations = {
875 .name = "user",
876 .type = CLONE_NEWUSER,
877 .get = userns_get,
878 .put = userns_put,
879 .install = userns_install,
880 .inum = userns_inum,
881};
882
725static __init int user_namespaces_init(void) 883static __init int user_namespaces_init(void)
726{ 884{
727 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); 885 user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
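
userns_install() spells out the conditions for joining an existing user namespace with setns(2): not the caller's current namespace, single-threaded, an unshared fs_struct, and CAP_SYS_ADMIN in the target. A userspace sketch (the pid is hypothetical and the glibc setns() wrapper is assumed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static int join_userns(pid_t pid)
{
	char path[64];
	int fd;

	snprintf(path, sizeof(path), "/proc/%d/ns/user", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	if (setns(fd, CLONE_NEWUSER)) {	/* fails if any rule above is violated */
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}
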
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 679d97a5d3fd..a47fc5de3113 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -30,20 +30,27 @@ static struct uts_namespace *create_uts_ns(void)
30/* 30/*
31 * Clone a new ns copying an original utsname, setting refcount to 1 31 * Clone a new ns copying an original utsname, setting refcount to 1
32 * @old_ns: namespace to clone 32 * @old_ns: namespace to clone
33 * Return NULL on error (failure to kmalloc), new ns otherwise 33 * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise
34 */ 34 */
35static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, 35static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
36 struct uts_namespace *old_ns) 36 struct uts_namespace *old_ns)
37{ 37{
38 struct uts_namespace *ns; 38 struct uts_namespace *ns;
39 int err;
39 40
40 ns = create_uts_ns(); 41 ns = create_uts_ns();
41 if (!ns) 42 if (!ns)
42 return ERR_PTR(-ENOMEM); 43 return ERR_PTR(-ENOMEM);
43 44
45 err = proc_alloc_inum(&ns->proc_inum);
46 if (err) {
47 kfree(ns);
48 return ERR_PTR(err);
49 }
50
44 down_read(&uts_sem); 51 down_read(&uts_sem);
45 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 52 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
46 ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns)); 53 ns->user_ns = get_user_ns(user_ns);
47 up_read(&uts_sem); 54 up_read(&uts_sem);
48 return ns; 55 return ns;
49} 56}
@@ -55,9 +62,8 @@ static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
55 * versa. 62 * versa.
56 */ 63 */
57struct uts_namespace *copy_utsname(unsigned long flags, 64struct uts_namespace *copy_utsname(unsigned long flags,
58 struct task_struct *tsk) 65 struct user_namespace *user_ns, struct uts_namespace *old_ns)
59{ 66{
60 struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
61 struct uts_namespace *new_ns; 67 struct uts_namespace *new_ns;
62 68
63 BUG_ON(!old_ns); 69 BUG_ON(!old_ns);
@@ -66,7 +72,7 @@ struct uts_namespace *copy_utsname(unsigned long flags,
66 if (!(flags & CLONE_NEWUTS)) 72 if (!(flags & CLONE_NEWUTS))
67 return old_ns; 73 return old_ns;
68 74
69 new_ns = clone_uts_ns(tsk, old_ns); 75 new_ns = clone_uts_ns(user_ns, old_ns);
70 76
71 put_uts_ns(old_ns); 77 put_uts_ns(old_ns);
72 return new_ns; 78 return new_ns;
@@ -78,6 +84,7 @@ void free_uts_ns(struct kref *kref)
78 84
79 ns = container_of(kref, struct uts_namespace, kref); 85 ns = container_of(kref, struct uts_namespace, kref);
80 put_user_ns(ns->user_ns); 86 put_user_ns(ns->user_ns);
87 proc_free_inum(ns->proc_inum);
81 kfree(ns); 88 kfree(ns);
82} 89}
83 90
@@ -102,19 +109,32 @@ static void utsns_put(void *ns)
102 put_uts_ns(ns); 109 put_uts_ns(ns);
103} 110}
104 111
105static int utsns_install(struct nsproxy *nsproxy, void *ns) 112static int utsns_install(struct nsproxy *nsproxy, void *new)
106{ 113{
114 struct uts_namespace *ns = new;
115
116 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
117 !nsown_capable(CAP_SYS_ADMIN))
118 return -EPERM;
119
107 get_uts_ns(ns); 120 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns); 121 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns; 122 nsproxy->uts_ns = ns;
110 return 0; 123 return 0;
111} 124}
112 125
126static unsigned int utsns_inum(void *vp)
127{
128 struct uts_namespace *ns = vp;
129
130 return ns->proc_inum;
131}
132
113const struct proc_ns_operations utsns_operations = { 133const struct proc_ns_operations utsns_operations = {
114 .name = "uts", 134 .name = "uts",
115 .type = CLONE_NEWUTS, 135 .type = CLONE_NEWUTS,
116 .get = utsns_get, 136 .get = utsns_get,
117 .put = utsns_put, 137 .put = utsns_put,
118 .install = utsns_install, 138 .install = utsns_install,
139 .inum = utsns_inum,
119}; 140};
120
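
utsns_install() now refuses a setns() unless the caller has CAP_SYS_ADMIN both over the target namespace's owning user namespace and in its own user namespace, and the new utsns_inum() callback exposes the proc inode number allocated in clone_uts_ns(). The sketch below only illustrates the ops-table shape that proc_ns_operations follows; demo_ns and demo_ops are hypothetical names, not kernel structures.

#include <stdio.h>

struct ns_ops {
        const char *name;
        int  (*install)(void *ns);              /* may fail, e.g. with -EPERM */
        unsigned int (*inum)(void *ns);         /* inode shown in /proc/<pid>/ns */
};

struct demo_ns { unsigned int proc_inum; int privileged; };

static int demo_install(void *p)
{
        struct demo_ns *ns = p;

        return ns->privileged ? 0 : -1;         /* stand-in for the EPERM check */
}

static unsigned int demo_inum(void *p)
{
        return ((struct demo_ns *)p)->proc_inum;
}

static const struct ns_ops demo_ops = {
        .name    = "demo",
        .install = demo_install,
        .inum    = demo_inum,
};

int main(void)
{
        struct demo_ns ns = { .proc_inum = 0xeffffffa, .privileged = 1 };

        printf("%s ns inum=%u install=%d\n",
               demo_ops.name, demo_ops.inum(&ns), demo_ops.install(&ns));
        return 0;
}

The uniform get/put/install/inum interface is what lets the /proc/<pid>/ns/* files and setns(2) treat every namespace type the same way.
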
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 63da38c2d820..4f69f9a5e221 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -15,6 +15,8 @@
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17 17
18#ifdef CONFIG_PROC_SYSCTL
19
18static void *get_uts(ctl_table *table, int write) 20static void *get_uts(ctl_table *table, int write)
19{ 21{
20 char *which = table->data; 22 char *which = table->data;
@@ -38,7 +40,6 @@ static void put_uts(ctl_table *table, int write, void *which)
38 up_write(&uts_sem); 40 up_write(&uts_sem);
39} 41}
40 42
41#ifdef CONFIG_PROC_SYSCTL
42/* 43/*
43 * Special case of dostring for the UTS structure. This has locks 44 * Special case of dostring for the UTS structure. This has locks
44 * to observe. Should this be in kernel/sys.c ???? 45 * to observe. Should this be in kernel/sys.c ????
diff --git a/kernel/wait.c b/kernel/wait.c
index 7fdd9eaca2c3..6698e0c04ead 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Generic waiting primitives. 2 * Generic waiting primitives.
3 * 3 *
4 * (C) 2004 William Irwin, Oracle 4 * (C) 2004 Nadia Yvette Chambers, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/export.h> 7#include <linux/export.h>
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9d4c8d5a1f53..4a944676358e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -23,6 +23,7 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/sysctl.h> 24#include <linux/sysctl.h>
25#include <linux/smpboot.h> 25#include <linux/smpboot.h>
26#include <linux/sched/rt.h>
26 27
27#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
28#include <linux/kvm_para.h> 29#include <linux/kvm_para.h>
@@ -31,6 +32,7 @@
31int watchdog_enabled = 1; 32int watchdog_enabled = 1;
32int __read_mostly watchdog_thresh = 10; 33int __read_mostly watchdog_thresh = 10;
33static int __read_mostly watchdog_disabled; 34static int __read_mostly watchdog_disabled;
35static u64 __read_mostly sample_period;
34 36
35static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 37static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
36static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 38static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -111,12 +113,12 @@ static int get_softlockup_thresh(void)
111 * resolution, and we don't need to waste time with a big divide when 113 * resolution, and we don't need to waste time with a big divide when
112 * 2^30ns == 1.074s. 114 * 2^30ns == 1.074s.
113 */ 115 */
114static unsigned long get_timestamp(int this_cpu) 116static unsigned long get_timestamp(void)
115{ 117{
116 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ 118 return local_clock() >> 30LL; /* 2^30 ~= 10^9 */
117} 119}
118 120
119static unsigned long get_sample_period(void) 121static void set_sample_period(void)
120{ 122{
121 /* 123 /*
122 * convert watchdog_thresh from seconds to ns 124 * convert watchdog_thresh from seconds to ns
@@ -125,15 +127,13 @@ static unsigned long get_sample_period(void)
125 * and hard thresholds) to increment before the 127 * and hard thresholds) to increment before the
126 * hardlockup detector generates a warning 128 * hardlockup detector generates a warning
127 */ 129 */
128 return get_softlockup_thresh() * (NSEC_PER_SEC / 5); 130 sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
129} 131}
130 132
131/* Commands for resetting the watchdog */ 133/* Commands for resetting the watchdog */
132static void __touch_watchdog(void) 134static void __touch_watchdog(void)
133{ 135{
134 int this_cpu = smp_processor_id(); 136 __this_cpu_write(watchdog_touch_ts, get_timestamp());
135
136 __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));
137} 137}
138 138
139void touch_softlockup_watchdog(void) 139void touch_softlockup_watchdog(void)
@@ -194,7 +194,7 @@ static int is_hardlockup(void)
194 194
195static int is_softlockup(unsigned long touch_ts) 195static int is_softlockup(unsigned long touch_ts)
196{ 196{
197 unsigned long now = get_timestamp(smp_processor_id()); 197 unsigned long now = get_timestamp();
198 198
199 /* Warn about unreasonable delays: */ 199 /* Warn about unreasonable delays: */
200 if (time_after(now, touch_ts + get_softlockup_thresh())) 200 if (time_after(now, touch_ts + get_softlockup_thresh()))
@@ -275,7 +275,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
275 wake_up_process(__this_cpu_read(softlockup_watchdog)); 275 wake_up_process(__this_cpu_read(softlockup_watchdog));
276 276
277 /* .. and repeat */ 277 /* .. and repeat */
278 hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); 278 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
279 279
280 if (touch_ts == 0) { 280 if (touch_ts == 0) {
281 if (unlikely(__this_cpu_read(softlockup_touch_sync))) { 281 if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
@@ -343,6 +343,10 @@ static void watchdog_enable(unsigned int cpu)
343{ 343{
344 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 344 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
345 345
346 /* kick off the timer for the hardlockup detector */
347 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
348 hrtimer->function = watchdog_timer_fn;
349
346 if (!watchdog_enabled) { 350 if (!watchdog_enabled) {
347 kthread_park(current); 351 kthread_park(current);
348 return; 352 return;
@@ -351,12 +355,8 @@ static void watchdog_enable(unsigned int cpu)
351 /* Enable the perf event */ 355 /* Enable the perf event */
352 watchdog_nmi_enable(cpu); 356 watchdog_nmi_enable(cpu);
353 357
354 /* kick off the timer for the hardlockup detector */
355 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
356 hrtimer->function = watchdog_timer_fn;
357
358 /* done here because hrtimer_start can only pin to smp_processor_id() */ 358 /* done here because hrtimer_start can only pin to smp_processor_id() */
359 hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), 359 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
360 HRTIMER_MODE_REL_PINNED); 360 HRTIMER_MODE_REL_PINNED);
361 361
362 /* initialize timestamp */ 362 /* initialize timestamp */
@@ -383,7 +383,7 @@ static int watchdog_should_run(unsigned int cpu)
383/* 383/*
384 * The watchdog thread function - touches the timestamp. 384 * The watchdog thread function - touches the timestamp.
385 * 385 *
386 * It only runs once every get_sample_period() seconds (4 seconds by 386 * It only runs once every sample_period seconds (4 seconds by
387 * default) to reset the softlockup timestamp. If this gets delayed 387 * default) to reset the softlockup timestamp. If this gets delayed
388 * for more than 2*watchdog_thresh seconds then the debug-printout 388 * for more than 2*watchdog_thresh seconds then the debug-printout
389 * triggers in watchdog_timer_fn(). 389 * triggers in watchdog_timer_fn().
@@ -516,6 +516,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
516 if (ret || !write) 516 if (ret || !write)
517 return ret; 517 return ret;
518 518
519 set_sample_period();
519 if (watchdog_enabled && watchdog_thresh) 520 if (watchdog_enabled && watchdog_thresh)
520 watchdog_enable_all_cpus(); 521 watchdog_enable_all_cpus();
521 else 522 else
@@ -537,6 +538,7 @@ static struct smp_hotplug_thread watchdog_threads = {
537 538
538void __init lockup_detector_init(void) 539void __init lockup_detector_init(void)
539{ 540{
541 set_sample_period();
540 if (smpboot_register_percpu_thread(&watchdog_threads)) { 542 if (smpboot_register_percpu_thread(&watchdog_threads)) {
541 pr_err("Failed to create watchdog threads, disabled\n"); 543 pr_err("Failed to create watchdog threads, disabled\n");
542 watchdog_disabled = -ENODEV; 544 watchdog_disabled = -ENODEV;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 042d221d33cc..b48cd597145d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,32 +41,31 @@
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/hashtable.h>
44 45
45#include "workqueue_sched.h" 46#include "workqueue_internal.h"
46 47
47enum { 48enum {
48 /* 49 /*
49 * global_cwq flags 50 * worker_pool flags
50 * 51 *
51 * A bound gcwq is either associated or disassociated with its CPU. 52 * A bound pool is either associated or disassociated with its CPU.
52 * While associated (!DISASSOCIATED), all workers are bound to the 53 * While associated (!DISASSOCIATED), all workers are bound to the
53 * CPU and none has %WORKER_UNBOUND set and concurrency management 54 * CPU and none has %WORKER_UNBOUND set and concurrency management
54 * is in effect. 55 * is in effect.
55 * 56 *
56 * While DISASSOCIATED, the cpu may be offline and all workers have 57 * While DISASSOCIATED, the cpu may be offline and all workers have
57 * %WORKER_UNBOUND set and concurrency management disabled, and may 58 * %WORKER_UNBOUND set and concurrency management disabled, and may
58 * be executing on any CPU. The gcwq behaves as an unbound one. 59 * be executing on any CPU. The pool behaves as an unbound one.
59 * 60 *
60 * Note that DISASSOCIATED can be flipped only while holding 61 * Note that DISASSOCIATED can be flipped only while holding
61 * assoc_mutex of all pools on the gcwq to avoid changing binding 62 * assoc_mutex to avoid changing binding state while
62 * state while create_worker() is in progress. 63 * create_worker() is in progress.
63 */ 64 */
64 GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */
65 GCWQ_FREEZING = 1 << 1, /* freeze in progress */
66
67 /* pool flags */
68 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 65 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
69 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ 66 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
67 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
68 POOL_FREEZING = 1 << 3, /* freeze in progress */
70 69
71 /* worker flags */ 70 /* worker flags */
72 WORKER_STARTED = 1 << 0, /* started */ 71 WORKER_STARTED = 1 << 0, /* started */
@@ -79,11 +78,9 @@ enum {
79 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | 78 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
80 WORKER_CPU_INTENSIVE, 79 WORKER_CPU_INTENSIVE,
81 80
82 NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ 81 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
83 82
84 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 83 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
85 BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER,
86 BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1,
87 84
88 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 85 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
89 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ 86 IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */
@@ -111,48 +108,24 @@ enum {
111 * P: Preemption protected. Disabling preemption is enough and should 108 * P: Preemption protected. Disabling preemption is enough and should
112 * only be modified and accessed from the local cpu. 109 * only be modified and accessed from the local cpu.
113 * 110 *
114 * L: gcwq->lock protected. Access with gcwq->lock held. 111 * L: pool->lock protected. Access with pool->lock held.
115 * 112 *
116 * X: During normal operation, modification requires gcwq->lock and 113 * X: During normal operation, modification requires pool->lock and should
117 * should be done only from local cpu. Either disabling preemption 114 * be done only from local cpu. Either disabling preemption on local
118 * on local cpu or grabbing gcwq->lock is enough for read access. 115 * cpu or grabbing pool->lock is enough for read access. If
119 * If GCWQ_DISASSOCIATED is set, it's identical to L. 116 * POOL_DISASSOCIATED is set, it's identical to L.
120 * 117 *
121 * F: wq->flush_mutex protected. 118 * F: wq->flush_mutex protected.
122 * 119 *
123 * W: workqueue_lock protected. 120 * W: workqueue_lock protected.
124 */ 121 */
125 122
126struct global_cwq; 123/* struct worker is defined in workqueue_internal.h */
127struct worker_pool;
128
129/*
130 * The poor guys doing the actual heavy lifting. All on-duty workers
131 * are either serving the manager role, on idle list or on busy hash.
132 */
133struct worker {
134 /* on idle list while idle, on busy hash table while busy */
135 union {
136 struct list_head entry; /* L: while idle */
137 struct hlist_node hentry; /* L: while busy */
138 };
139
140 struct work_struct *current_work; /* L: work being processed */
141 struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */
142 struct list_head scheduled; /* L: scheduled works */
143 struct task_struct *task; /* I: worker task */
144 struct worker_pool *pool; /* I: the associated pool */
145 /* 64 bytes boundary on 64bit, 32 on 32bit */
146 unsigned long last_active; /* L: last active timestamp */
147 unsigned int flags; /* X: flags */
148 int id; /* I: worker id */
149
150 /* for rebinding worker to CPU */
151 struct work_struct rebind_work; /* L: for busy worker */
152};
153 124
154struct worker_pool { 125struct worker_pool {
155 struct global_cwq *gcwq; /* I: the owning gcwq */ 126 spinlock_t lock; /* the pool lock */
127 unsigned int cpu; /* I: the associated cpu */
128 int id; /* I: pool ID */
156 unsigned int flags; /* X: flags */ 129 unsigned int flags; /* X: flags */
157 130
158 struct list_head worklist; /* L: list of pending works */ 131 struct list_head worklist; /* L: list of pending works */
@@ -165,34 +138,28 @@ struct worker_pool {
165 struct timer_list idle_timer; /* L: worker idle timeout */ 138 struct timer_list idle_timer; /* L: worker idle timeout */
166 struct timer_list mayday_timer; /* L: SOS timer for workers */ 139 struct timer_list mayday_timer; /* L: SOS timer for workers */
167 140
168 struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ 141 /* workers are chained either in busy_hash or idle_list */
169 struct ida worker_ida; /* L: for worker IDs */ 142 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
170};
171
172/*
173 * Global per-cpu workqueue. There's one and only one for each cpu
174 * and all works are queued and processed here regardless of their
175 * target workqueues.
176 */
177struct global_cwq {
178 spinlock_t lock; /* the gcwq lock */
179 unsigned int cpu; /* I: the associated cpu */
180 unsigned int flags; /* L: GCWQ_* flags */
181
182 /* workers are chained either in busy_hash or pool idle_list */
183 struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE];
184 /* L: hash of busy workers */ 143 /* L: hash of busy workers */
185 144
186 struct worker_pool pools[NR_WORKER_POOLS]; 145 struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */
187 /* normal and highpri pools */ 146 struct ida worker_ida; /* L: for worker IDs */
147
148 /*
149 * The current concurrency level. As it's likely to be accessed
150 * from other CPUs during try_to_wake_up(), put it in a separate
151 * cacheline.
152 */
153 atomic_t nr_running ____cacheline_aligned_in_smp;
188} ____cacheline_aligned_in_smp; 154} ____cacheline_aligned_in_smp;
189 155
190/* 156/*
191 * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of 157 * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
192 * work_struct->data are used for flags and thus cwqs need to be 158 * of work_struct->data are used for flags and the remaining high bits
193 * aligned at two's power of the number of flag bits. 159 * point to the pwq; thus, pwqs need to be aligned at two's power of the
160 * number of flag bits.
194 */ 161 */
195struct cpu_workqueue_struct { 162struct pool_workqueue {
196 struct worker_pool *pool; /* I: the associated pool */ 163 struct worker_pool *pool; /* I: the associated pool */
197 struct workqueue_struct *wq; /* I: the owning workqueue */ 164 struct workqueue_struct *wq; /* I: the owning workqueue */
198 int work_color; /* L: current color */ 165 int work_color; /* L: current color */
@@ -241,16 +208,16 @@ typedef unsigned long mayday_mask_t;
241struct workqueue_struct { 208struct workqueue_struct {
242 unsigned int flags; /* W: WQ_* flags */ 209 unsigned int flags; /* W: WQ_* flags */
243 union { 210 union {
244 struct cpu_workqueue_struct __percpu *pcpu; 211 struct pool_workqueue __percpu *pcpu;
245 struct cpu_workqueue_struct *single; 212 struct pool_workqueue *single;
246 unsigned long v; 213 unsigned long v;
247 } cpu_wq; /* I: cwq's */ 214 } pool_wq; /* I: pwq's */
248 struct list_head list; /* W: list of all workqueues */ 215 struct list_head list; /* W: list of all workqueues */
249 216
250 struct mutex flush_mutex; /* protects wq flushing */ 217 struct mutex flush_mutex; /* protects wq flushing */
251 int work_color; /* F: current work color */ 218 int work_color; /* F: current work color */
252 int flush_color; /* F: current flush color */ 219 int flush_color; /* F: current flush color */
253 atomic_t nr_cwqs_to_flush; /* flush in progress */ 220 atomic_t nr_pwqs_to_flush; /* flush in progress */
254 struct wq_flusher *first_flusher; /* F: first flusher */ 221 struct wq_flusher *first_flusher; /* F: first flusher */
255 struct list_head flusher_queue; /* F: flush waiters */ 222 struct list_head flusher_queue; /* F: flush waiters */
256 struct list_head flusher_overflow; /* F: flush overflow list */ 223 struct list_head flusher_overflow; /* F: flush overflow list */
@@ -259,7 +226,7 @@ struct workqueue_struct {
259 struct worker *rescuer; /* I: rescue worker */ 226 struct worker *rescuer; /* I: rescue worker */
260 227
261 int nr_drainers; /* W: drain in progress */ 228 int nr_drainers; /* W: drain in progress */
262 int saved_max_active; /* W: saved cwq max_active */ 229 int saved_max_active; /* W: saved pwq max_active */
263#ifdef CONFIG_LOCKDEP 230#ifdef CONFIG_LOCKDEP
264 struct lockdep_map lockdep_map; 231 struct lockdep_map lockdep_map;
265#endif 232#endif
@@ -280,16 +247,15 @@ EXPORT_SYMBOL_GPL(system_freezable_wq);
280#define CREATE_TRACE_POINTS 247#define CREATE_TRACE_POINTS
281#include <trace/events/workqueue.h> 248#include <trace/events/workqueue.h>
282 249
283#define for_each_worker_pool(pool, gcwq) \ 250#define for_each_std_worker_pool(pool, cpu) \
284 for ((pool) = &(gcwq)->pools[0]; \ 251 for ((pool) = &std_worker_pools(cpu)[0]; \
285 (pool) < &(gcwq)->pools[NR_WORKER_POOLS]; (pool)++) 252 (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
286 253
287#define for_each_busy_worker(worker, i, pos, gcwq) \ 254#define for_each_busy_worker(worker, i, pool) \
288 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ 255 hash_for_each(pool->busy_hash, i, worker, hentry)
289 hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
290 256
291static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, 257static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
292 unsigned int sw) 258 unsigned int sw)
293{ 259{
294 if (cpu < nr_cpu_ids) { 260 if (cpu < nr_cpu_ids) {
295 if (sw & 1) { 261 if (sw & 1) {
@@ -300,42 +266,42 @@ static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask,
300 if (sw & 2) 266 if (sw & 2)
301 return WORK_CPU_UNBOUND; 267 return WORK_CPU_UNBOUND;
302 } 268 }
303 return WORK_CPU_NONE; 269 return WORK_CPU_END;
304} 270}
305 271
306static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, 272static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask,
307 struct workqueue_struct *wq) 273 struct workqueue_struct *wq)
308{ 274{
309 return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); 275 return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
310} 276}
311 277
312/* 278/*
313 * CPU iterators 279 * CPU iterators
314 * 280 *
315 * An extra gcwq is defined for an invalid cpu number 281 * An extra cpu number is defined using an invalid cpu number
316 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any 282 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
317 * specific CPU. The following iterators are similar to 283 * specific CPU. The following iterators are similar to for_each_*_cpu()
318 * for_each_*_cpu() iterators but also considers the unbound gcwq. 284 * iterators but also considers the unbound CPU.
319 * 285 *
320 * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND 286 * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND
321 * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND 287 * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND
322 * for_each_cwq_cpu() : possible CPUs for bound workqueues, 288 * for_each_pwq_cpu() : possible CPUs for bound workqueues,
323 * WORK_CPU_UNBOUND for unbound workqueues 289 * WORK_CPU_UNBOUND for unbound workqueues
324 */ 290 */
325#define for_each_gcwq_cpu(cpu) \ 291#define for_each_wq_cpu(cpu) \
326 for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ 292 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \
327 (cpu) < WORK_CPU_NONE; \ 293 (cpu) < WORK_CPU_END; \
328 (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) 294 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3))
329 295
330#define for_each_online_gcwq_cpu(cpu) \ 296#define for_each_online_wq_cpu(cpu) \
331 for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ 297 for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \
332 (cpu) < WORK_CPU_NONE; \ 298 (cpu) < WORK_CPU_END; \
333 (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) 299 (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3))
334 300
335#define for_each_cwq_cpu(cpu, wq) \ 301#define for_each_pwq_cpu(cpu, wq) \
336 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ 302 for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \
337 (cpu) < WORK_CPU_NONE; \ 303 (cpu) < WORK_CPU_END; \
338 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) 304 (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq)))
339 305
340#ifdef CONFIG_DEBUG_OBJECTS_WORK 306#ifdef CONFIG_DEBUG_OBJECTS_WORK
341 307
@@ -459,57 +425,70 @@ static LIST_HEAD(workqueues);
459static bool workqueue_freezing; /* W: have wqs started freezing? */ 425static bool workqueue_freezing; /* W: have wqs started freezing? */
460 426
461/* 427/*
462 * The almighty global cpu workqueues. nr_running is the only field 428 * The CPU and unbound standard worker pools. The unbound ones have
463 * which is expected to be used frequently by other cpus via 429 * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
464 * try_to_wake_up(). Put it in a separate cacheline.
465 */ 430 */
466static DEFINE_PER_CPU(struct global_cwq, global_cwq); 431static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
467static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, pool_nr_running[NR_WORKER_POOLS]); 432 cpu_std_worker_pools);
433static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
468 434
469/* 435/* idr of all pools */
470 * Global cpu workqueue and nr_running counter for unbound gcwq. The 436static DEFINE_MUTEX(worker_pool_idr_mutex);
471 * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its 437static DEFINE_IDR(worker_pool_idr);
472 * workers have WORKER_UNBOUND set.
473 */
474static struct global_cwq unbound_global_cwq;
475static atomic_t unbound_pool_nr_running[NR_WORKER_POOLS] = {
476 [0 ... NR_WORKER_POOLS - 1] = ATOMIC_INIT(0), /* always 0 */
477};
478 438
479static int worker_thread(void *__worker); 439static int worker_thread(void *__worker);
480 440
481static int worker_pool_pri(struct worker_pool *pool) 441static struct worker_pool *std_worker_pools(int cpu)
442{
443 if (cpu != WORK_CPU_UNBOUND)
444 return per_cpu(cpu_std_worker_pools, cpu);
445 else
446 return unbound_std_worker_pools;
447}
448
449static int std_worker_pool_pri(struct worker_pool *pool)
482{ 450{
483 return pool - pool->gcwq->pools; 451 return pool - std_worker_pools(pool->cpu);
484} 452}
485 453
486static struct global_cwq *get_gcwq(unsigned int cpu) 454/* allocate ID and assign it to @pool */
455static int worker_pool_assign_id(struct worker_pool *pool)
487{ 456{
488 if (cpu != WORK_CPU_UNBOUND) 457 int ret;
489 return &per_cpu(global_cwq, cpu); 458
490 else 459 mutex_lock(&worker_pool_idr_mutex);
491 return &unbound_global_cwq; 460 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
461 if (ret >= 0)
462 pool->id = ret;
463 mutex_unlock(&worker_pool_idr_mutex);
464
465 return ret < 0 ? ret : 0;
492} 466}
493 467
494static atomic_t *get_pool_nr_running(struct worker_pool *pool) 468/*
469 * Lookup worker_pool by id. The idr currently is built during boot and
470 * never modified. Don't worry about locking for now.
471 */
472static struct worker_pool *worker_pool_by_id(int pool_id)
495{ 473{
496 int cpu = pool->gcwq->cpu; 474 return idr_find(&worker_pool_idr, pool_id);
497 int idx = worker_pool_pri(pool); 475}
498 476
499 if (cpu != WORK_CPU_UNBOUND) 477static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
500 return &per_cpu(pool_nr_running, cpu)[idx]; 478{
501 else 479 struct worker_pool *pools = std_worker_pools(cpu);
502 return &unbound_pool_nr_running[idx]; 480
481 return &pools[highpri];
503} 482}
504 483
505static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, 484static struct pool_workqueue *get_pwq(unsigned int cpu,
506 struct workqueue_struct *wq) 485 struct workqueue_struct *wq)
507{ 486{
508 if (!(wq->flags & WQ_UNBOUND)) { 487 if (!(wq->flags & WQ_UNBOUND)) {
509 if (likely(cpu < nr_cpu_ids)) 488 if (likely(cpu < nr_cpu_ids))
510 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); 489 return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
511 } else if (likely(cpu == WORK_CPU_UNBOUND)) 490 } else if (likely(cpu == WORK_CPU_UNBOUND))
512 return wq->cpu_wq.single; 491 return wq->pool_wq.single;
513 return NULL; 492 return NULL;
514} 493}
515 494
@@ -530,19 +509,19 @@ static int work_next_color(int color)
530} 509}
531 510
532/* 511/*
533 * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data 512 * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
534 * contain the pointer to the queued cwq. Once execution starts, the flag 513 * contain the pointer to the queued pwq. Once execution starts, the flag
535 * is cleared and the high bits contain OFFQ flags and CPU number. 514 * is cleared and the high bits contain OFFQ flags and pool ID.
536 * 515 *
537 * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() 516 * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling()
538 * and clear_work_data() can be used to set the cwq, cpu or clear 517 * and clear_work_data() can be used to set the pwq, pool or clear
539 * work->data. These functions should only be called while the work is 518 * work->data. These functions should only be called while the work is
540 * owned - ie. while the PENDING bit is set. 519 * owned - ie. while the PENDING bit is set.
541 * 520 *
542 * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to 521 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
543 * a work. gcwq is available once the work has been queued anywhere after 522 * corresponding to a work. Pool is available once the work has been
544 * initialization until it is sync canceled. cwq is available only while 523 * queued anywhere after initialization until it is sync canceled. pwq is
545 * the work item is queued. 524 * available only while the work item is queued.
546 * 525 *
547 * %WORK_OFFQ_CANCELING is used to mark a work item which is being 526 * %WORK_OFFQ_CANCELING is used to mark a work item which is being
548 * canceled. While being canceled, a work item may have its PENDING set 527 * canceled. While being canceled, a work item may have its PENDING set
@@ -556,16 +535,22 @@ static inline void set_work_data(struct work_struct *work, unsigned long data,
556 atomic_long_set(&work->data, data | flags | work_static(work)); 535 atomic_long_set(&work->data, data | flags | work_static(work));
557} 536}
558 537
559static void set_work_cwq(struct work_struct *work, 538static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
560 struct cpu_workqueue_struct *cwq,
561 unsigned long extra_flags) 539 unsigned long extra_flags)
562{ 540{
563 set_work_data(work, (unsigned long)cwq, 541 set_work_data(work, (unsigned long)pwq,
564 WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); 542 WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags);
565} 543}
566 544
567static void set_work_cpu_and_clear_pending(struct work_struct *work, 545static void set_work_pool_and_keep_pending(struct work_struct *work,
568 unsigned int cpu) 546 int pool_id)
547{
548 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT,
549 WORK_STRUCT_PENDING);
550}
551
552static void set_work_pool_and_clear_pending(struct work_struct *work,
553 int pool_id)
569{ 554{
570 /* 555 /*
571 * The following wmb is paired with the implied mb in 556 * The following wmb is paired with the implied mb in
@@ -574,67 +559,92 @@ static void set_work_cpu_and_clear_pending(struct work_struct *work,
574 * owner. 559 * owner.
575 */ 560 */
576 smp_wmb(); 561 smp_wmb();
577 set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); 562 set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
578} 563}
579 564
580static void clear_work_data(struct work_struct *work) 565static void clear_work_data(struct work_struct *work)
581{ 566{
582 smp_wmb(); /* see set_work_cpu_and_clear_pending() */ 567 smp_wmb(); /* see set_work_pool_and_clear_pending() */
583 set_work_data(work, WORK_STRUCT_NO_CPU, 0); 568 set_work_data(work, WORK_STRUCT_NO_POOL, 0);
584} 569}
585 570
586static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) 571static struct pool_workqueue *get_work_pwq(struct work_struct *work)
587{ 572{
588 unsigned long data = atomic_long_read(&work->data); 573 unsigned long data = atomic_long_read(&work->data);
589 574
590 if (data & WORK_STRUCT_CWQ) 575 if (data & WORK_STRUCT_PWQ)
591 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); 576 return (void *)(data & WORK_STRUCT_WQ_DATA_MASK);
592 else 577 else
593 return NULL; 578 return NULL;
594} 579}
595 580
596static struct global_cwq *get_work_gcwq(struct work_struct *work) 581/**
582 * get_work_pool - return the worker_pool a given work was associated with
583 * @work: the work item of interest
584 *
585 * Return the worker_pool @work was last associated with. %NULL if none.
586 */
587static struct worker_pool *get_work_pool(struct work_struct *work)
597{ 588{
598 unsigned long data = atomic_long_read(&work->data); 589 unsigned long data = atomic_long_read(&work->data);
599 unsigned int cpu; 590 struct worker_pool *pool;
591 int pool_id;
600 592
601 if (data & WORK_STRUCT_CWQ) 593 if (data & WORK_STRUCT_PWQ)
602 return ((struct cpu_workqueue_struct *) 594 return ((struct pool_workqueue *)
603 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; 595 (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
604 596
605 cpu = data >> WORK_OFFQ_CPU_SHIFT; 597 pool_id = data >> WORK_OFFQ_POOL_SHIFT;
606 if (cpu == WORK_CPU_NONE) 598 if (pool_id == WORK_OFFQ_POOL_NONE)
607 return NULL; 599 return NULL;
608 600
609 BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); 601 pool = worker_pool_by_id(pool_id);
610 return get_gcwq(cpu); 602 WARN_ON_ONCE(!pool);
603 return pool;
604}
605
606/**
607 * get_work_pool_id - return the worker pool ID a given work is associated with
608 * @work: the work item of interest
609 *
610 * Return the worker_pool ID @work was last associated with.
611 * %WORK_OFFQ_POOL_NONE if none.
612 */
613static int get_work_pool_id(struct work_struct *work)
614{
615 unsigned long data = atomic_long_read(&work->data);
616
617 if (data & WORK_STRUCT_PWQ)
618 return ((struct pool_workqueue *)
619 (data & WORK_STRUCT_WQ_DATA_MASK))->pool->id;
620
621 return data >> WORK_OFFQ_POOL_SHIFT;
611} 622}
612 623
613static void mark_work_canceling(struct work_struct *work) 624static void mark_work_canceling(struct work_struct *work)
614{ 625{
615 struct global_cwq *gcwq = get_work_gcwq(work); 626 unsigned long pool_id = get_work_pool_id(work);
616 unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE;
617 627
618 set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, 628 pool_id <<= WORK_OFFQ_POOL_SHIFT;
619 WORK_STRUCT_PENDING); 629 set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING);
620} 630}
621 631
622static bool work_is_canceling(struct work_struct *work) 632static bool work_is_canceling(struct work_struct *work)
623{ 633{
624 unsigned long data = atomic_long_read(&work->data); 634 unsigned long data = atomic_long_read(&work->data);
625 635
626 return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); 636 return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
627} 637}
628 638
629/* 639/*
630 * Policy functions. These define the policies on how the global worker 640 * Policy functions. These define the policies on how the global worker
631 * pools are managed. Unless noted otherwise, these functions assume that 641 * pools are managed. Unless noted otherwise, these functions assume that
632 * they're being called with gcwq->lock held. 642 * they're being called with pool->lock held.
633 */ 643 */
634 644
635static bool __need_more_worker(struct worker_pool *pool) 645static bool __need_more_worker(struct worker_pool *pool)
636{ 646{
637 return !atomic_read(get_pool_nr_running(pool)); 647 return !atomic_read(&pool->nr_running);
638} 648}
639 649
640/* 650/*
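
With the gcwq gone, an off-queue work item can no longer name its last location by CPU number, so the high bits of work->data now carry either an aligned pool_workqueue pointer (while queued, %WORK_STRUCT_PWQ set) or the last pool's ID shifted by WORK_OFFQ_POOL_SHIFT. A compact userspace sketch of that tagging scheme follows; the bit positions, shift, mask and the GCC-style aligned attribute are illustrative, not the exact values from include/linux/workqueue.h.

#include <stdio.h>

#define WORK_STRUCT_PENDING      (1UL << 0)
#define WORK_STRUCT_PWQ          (1UL << 2)
#define WORK_OFFQ_POOL_SHIFT     5
#define WORK_STRUCT_WQ_DATA_MASK (~0x1fUL)      /* pwqs are 32-byte aligned */

struct pool_workqueue { int pool_id; } __attribute__((aligned(32)));

static unsigned long set_work_pwq(struct pool_workqueue *pwq)
{
        return (unsigned long)pwq | WORK_STRUCT_PENDING | WORK_STRUCT_PWQ;
}

static unsigned long set_work_pool_and_clear_pending(int pool_id)
{
        return (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT;
}

static int get_work_pool_id(unsigned long data)
{
        if (data & WORK_STRUCT_PWQ)
                return ((struct pool_workqueue *)
                        (data & WORK_STRUCT_WQ_DATA_MASK))->pool_id;
        return data >> WORK_OFFQ_POOL_SHIFT;
}

int main(void)
{
        static struct pool_workqueue pwq = { .pool_id = 7 };

        unsigned long queued = set_work_pwq(&pwq);
        unsigned long idle   = set_work_pool_and_clear_pending(7);

        printf("queued -> pool %d, off-queue -> pool %d\n",
               get_work_pool_id(queued), get_work_pool_id(idle));
        return 0;
}

Either way, get_work_pool_id() recovers the same pool ID, which is what lets mark_work_canceling() and try_to_grab_pending() work on items regardless of whether they are currently queued.
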
@@ -642,7 +652,7 @@ static bool __need_more_worker(struct worker_pool *pool)
642 * running workers. 652 * running workers.
643 * 653 *
644 * Note that, because unbound workers never contribute to nr_running, this 654 * Note that, because unbound workers never contribute to nr_running, this
645 * function will always return %true for unbound gcwq as long as the 655 * function will always return %true for unbound pools as long as the
646 * worklist isn't empty. 656 * worklist isn't empty.
647 */ 657 */
648static bool need_more_worker(struct worker_pool *pool) 658static bool need_more_worker(struct worker_pool *pool)
@@ -659,9 +669,8 @@ static bool may_start_working(struct worker_pool *pool)
659/* Do I need to keep working? Called from currently running workers. */ 669/* Do I need to keep working? Called from currently running workers. */
660static bool keep_working(struct worker_pool *pool) 670static bool keep_working(struct worker_pool *pool)
661{ 671{
662 atomic_t *nr_running = get_pool_nr_running(pool); 672 return !list_empty(&pool->worklist) &&
663 673 atomic_read(&pool->nr_running) <= 1;
664 return !list_empty(&pool->worklist) && atomic_read(nr_running) <= 1;
665} 674}
666 675
667/* Do we need a new worker? Called from manager. */ 676/* Do we need a new worker? Called from manager. */
@@ -714,7 +723,7 @@ static struct worker *first_worker(struct worker_pool *pool)
714 * Wake up the first idle worker of @pool. 723 * Wake up the first idle worker of @pool.
715 * 724 *
716 * CONTEXT: 725 * CONTEXT:
717 * spin_lock_irq(gcwq->lock). 726 * spin_lock_irq(pool->lock).
718 */ 727 */
719static void wake_up_worker(struct worker_pool *pool) 728static void wake_up_worker(struct worker_pool *pool)
720{ 729{
@@ -739,8 +748,10 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
739{ 748{
740 struct worker *worker = kthread_data(task); 749 struct worker *worker = kthread_data(task);
741 750
742 if (!(worker->flags & WORKER_NOT_RUNNING)) 751 if (!(worker->flags & WORKER_NOT_RUNNING)) {
743 atomic_inc(get_pool_nr_running(worker->pool)); 752 WARN_ON_ONCE(worker->pool->cpu != cpu);
753 atomic_inc(&worker->pool->nr_running);
754 }
744} 755}
745 756
746/** 757/**
@@ -762,12 +773,18 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
762 unsigned int cpu) 773 unsigned int cpu)
763{ 774{
764 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 775 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
765 struct worker_pool *pool = worker->pool; 776 struct worker_pool *pool;
766 atomic_t *nr_running = get_pool_nr_running(pool);
767 777
778 /*
779 * Rescuers, which may not have all the fields set up like normal
780 * workers, also reach here, let's not access anything before
781 * checking NOT_RUNNING.
782 */
768 if (worker->flags & WORKER_NOT_RUNNING) 783 if (worker->flags & WORKER_NOT_RUNNING)
769 return NULL; 784 return NULL;
770 785
786 pool = worker->pool;
787
771 /* this can only happen on the local cpu */ 788 /* this can only happen on the local cpu */
772 BUG_ON(cpu != raw_smp_processor_id()); 789 BUG_ON(cpu != raw_smp_processor_id());
773 790
@@ -779,10 +796,11 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
779 * NOT_RUNNING is clear. This means that we're bound to and 796 * NOT_RUNNING is clear. This means that we're bound to and
780 * running on the local cpu w/ rq lock held and preemption 797 * running on the local cpu w/ rq lock held and preemption
781 * disabled, which in turn means that none else could be 798 * disabled, which in turn means that none else could be
782 * manipulating idle_list, so dereferencing idle_list without gcwq 799 * manipulating idle_list, so dereferencing idle_list without pool
783 * lock is safe. 800 * lock is safe.
784 */ 801 */
785 if (atomic_dec_and_test(nr_running) && !list_empty(&pool->worklist)) 802 if (atomic_dec_and_test(&pool->nr_running) &&
803 !list_empty(&pool->worklist))
786 to_wakeup = first_worker(pool); 804 to_wakeup = first_worker(pool);
787 return to_wakeup ? to_wakeup->task : NULL; 805 return to_wakeup ? to_wakeup->task : NULL;
788} 806}
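
wq_worker_sleeping() is one half of the concurrency-management protocol: nr_running now lives inside the pool, and when the last running worker blocks while work is still pending, the first idle worker is woken. A rough userspace model of that rule, using C11 atomics as a stand-in for the kernel's atomic_t and plain integers for the lists:

#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

struct worker_pool {
        atomic_int nr_running;          /* runnable, non-NOT_RUNNING workers */
        int        worklist_len;        /* stand-in for !list_empty(&worklist) */
        int        idle_workers;
};

/* mirrors the decision in wq_worker_sleeping(): wake someone iff we were
 * the last running worker and work is still pending */
static bool sleeping_worker_needs_wakeup(struct worker_pool *pool)
{
        bool was_last = atomic_fetch_sub(&pool->nr_running, 1) == 1;

        return was_last && pool->worklist_len > 0 && pool->idle_workers > 0;
}

int main(void)
{
        struct worker_pool pool = { .worklist_len = 3, .idle_workers = 2 };

        atomic_store(&pool.nr_running, 1);
        printf("wake another worker: %s\n",
               sleeping_worker_needs_wakeup(&pool) ? "yes" : "no");
        return 0;
}

The early NOT_RUNNING check added in the hunk above matters because rescuer threads reach this hook without a fully initialized worker->pool, so nothing beyond the flags may be touched before that test.
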
@@ -798,7 +816,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
798 * woken up. 816 * woken up.
799 * 817 *
800 * CONTEXT: 818 * CONTEXT:
801 * spin_lock_irq(gcwq->lock) 819 * spin_lock_irq(pool->lock)
802 */ 820 */
803static inline void worker_set_flags(struct worker *worker, unsigned int flags, 821static inline void worker_set_flags(struct worker *worker, unsigned int flags,
804 bool wakeup) 822 bool wakeup)
@@ -814,14 +832,12 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
814 */ 832 */
815 if ((flags & WORKER_NOT_RUNNING) && 833 if ((flags & WORKER_NOT_RUNNING) &&
816 !(worker->flags & WORKER_NOT_RUNNING)) { 834 !(worker->flags & WORKER_NOT_RUNNING)) {
817 atomic_t *nr_running = get_pool_nr_running(pool);
818
819 if (wakeup) { 835 if (wakeup) {
820 if (atomic_dec_and_test(nr_running) && 836 if (atomic_dec_and_test(&pool->nr_running) &&
821 !list_empty(&pool->worklist)) 837 !list_empty(&pool->worklist))
822 wake_up_worker(pool); 838 wake_up_worker(pool);
823 } else 839 } else
824 atomic_dec(nr_running); 840 atomic_dec(&pool->nr_running);
825 } 841 }
826 842
827 worker->flags |= flags; 843 worker->flags |= flags;
@@ -835,7 +851,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags,
835 * Clear @flags in @worker->flags and adjust nr_running accordingly. 851 * Clear @flags in @worker->flags and adjust nr_running accordingly.
836 * 852 *
837 * CONTEXT: 853 * CONTEXT:
838 * spin_lock_irq(gcwq->lock) 854 * spin_lock_irq(pool->lock)
839 */ 855 */
840static inline void worker_clr_flags(struct worker *worker, unsigned int flags) 856static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
841{ 857{
@@ -853,87 +869,55 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
853 */ 869 */
854 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 870 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
855 if (!(worker->flags & WORKER_NOT_RUNNING)) 871 if (!(worker->flags & WORKER_NOT_RUNNING))
856 atomic_inc(get_pool_nr_running(pool)); 872 atomic_inc(&pool->nr_running);
857} 873}
858 874
859/** 875/**
860 * busy_worker_head - return the busy hash head for a work 876 * find_worker_executing_work - find worker which is executing a work
861 * @gcwq: gcwq of interest 877 * @pool: pool of interest
862 * @work: work to be hashed
863 *
864 * Return hash head of @gcwq for @work.
865 *
866 * CONTEXT:
867 * spin_lock_irq(gcwq->lock).
868 *
869 * RETURNS:
870 * Pointer to the hash head.
871 */
872static struct hlist_head *busy_worker_head(struct global_cwq *gcwq,
873 struct work_struct *work)
874{
875 const int base_shift = ilog2(sizeof(struct work_struct));
876 unsigned long v = (unsigned long)work;
877
878 /* simple shift and fold hash, do we need something better? */
879 v >>= base_shift;
880 v += v >> BUSY_WORKER_HASH_ORDER;
881 v &= BUSY_WORKER_HASH_MASK;
882
883 return &gcwq->busy_hash[v];
884}
885
886/**
887 * __find_worker_executing_work - find worker which is executing a work
888 * @gcwq: gcwq of interest
889 * @bwh: hash head as returned by busy_worker_head()
890 * @work: work to find worker for 878 * @work: work to find worker for
891 * 879 *
892 * Find a worker which is executing @work on @gcwq. @bwh should be 880 * Find a worker which is executing @work on @pool by searching
893 * the hash head obtained by calling busy_worker_head() with the same 881 * @pool->busy_hash which is keyed by the address of @work. For a worker
894 * work. 882 * to match, its current execution should match the address of @work and
883 * its work function. This is to avoid unwanted dependency between
884 * unrelated work executions through a work item being recycled while still
885 * being executed.
886 *
887 * This is a bit tricky. A work item may be freed once its execution
888 * starts and nothing prevents the freed area from being recycled for
889 * another work item. If the same work item address ends up being reused
890 * before the original execution finishes, workqueue will identify the
891 * recycled work item as currently executing and make it wait until the
892 * current execution finishes, introducing an unwanted dependency.
893 *
894 * This function checks the work item address, work function and workqueue
895 * to avoid false positives. Note that this isn't complete as one may
896 * construct a work function which can introduce dependency onto itself
897 * through a recycled work item. Well, if somebody wants to shoot oneself
898 * in the foot that badly, there's only so much we can do, and if such
899 * deadlock actually occurs, it should be easy to locate the culprit work
900 * function.
895 * 901 *
896 * CONTEXT: 902 * CONTEXT:
897 * spin_lock_irq(gcwq->lock). 903 * spin_lock_irq(pool->lock).
898 * 904 *
899 * RETURNS: 905 * RETURNS:
900 * Pointer to worker which is executing @work if found, NULL 906 * Pointer to worker which is executing @work if found, NULL
901 * otherwise. 907 * otherwise.
902 */ 908 */
903static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, 909static struct worker *find_worker_executing_work(struct worker_pool *pool,
904 struct hlist_head *bwh, 910 struct work_struct *work)
905 struct work_struct *work)
906{ 911{
907 struct worker *worker; 912 struct worker *worker;
908 struct hlist_node *tmp;
909 913
910 hlist_for_each_entry(worker, tmp, bwh, hentry) 914 hash_for_each_possible(pool->busy_hash, worker, hentry,
911 if (worker->current_work == work) 915 (unsigned long)work)
916 if (worker->current_work == work &&
917 worker->current_func == work->func)
912 return worker; 918 return worker;
913 return NULL;
914}
915 919
916/** 920 return NULL;
917 * find_worker_executing_work - find worker which is executing a work
918 * @gcwq: gcwq of interest
919 * @work: work to find worker for
920 *
921 * Find a worker which is executing @work on @gcwq. This function is
922 * identical to __find_worker_executing_work() except that this
923 * function calculates @bwh itself.
924 *
925 * CONTEXT:
926 * spin_lock_irq(gcwq->lock).
927 *
928 * RETURNS:
929 * Pointer to worker which is executing @work if found, NULL
930 * otherwise.
931 */
932static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
933 struct work_struct *work)
934{
935 return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work),
936 work);
937} 921}
938 922
939/** 923/**
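
find_worker_executing_work() now searches a pool-local hash keyed by the work item's address and, new in this series, also compares the worker's current_func, so a freed-and-recycled work_struct at the same address is not misidentified as the one still running. The sketch below models that lookup with a tiny chained hash table; hash_ptr() and the structures are simplified stand-ins, not the kernel's hashtable.h API.

#include <stdio.h>
#include <stdint.h>

#define HASH_BITS 6
#define HASH_SIZE (1 << HASH_BITS)

typedef void (*work_func_t)(void *);

struct worker {
        void          *current_work;
        work_func_t    current_func;
        struct worker *next;                    /* hash chain */
};

static struct worker *busy_hash[HASH_SIZE];

static unsigned int hash_ptr(const void *p)
{
        return ((uintptr_t)p >> 4) & (HASH_SIZE - 1);   /* toy hash */
}

static struct worker *find_worker_executing_work(void *work, work_func_t fn)
{
        for (struct worker *w = busy_hash[hash_ptr(work)]; w; w = w->next)
                if (w->current_work == work && w->current_func == fn)
                        return w;
        return NULL;
}

static void fn_a(void *arg) { (void)arg; }
static void fn_b(void *arg) { (void)arg; }

int main(void)
{
        static int work;                        /* stands in for a work_struct */
        static struct worker w = { .current_work = &work, .current_func = fn_a };

        busy_hash[hash_ptr(&work)] = &w;

        printf("same func: %p, recycled item: %p\n",
               (void *)find_worker_executing_work(&work, fn_a),
               (void *)find_worker_executing_work(&work, fn_b));
        return 0;
}

In the example, the "recycled" item reuses the address but carries a different function, so the second lookup correctly returns NULL and no false execution dependency is created.
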
@@ -951,7 +935,7 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,
951 * nested inside outer list_for_each_entry_safe(). 935 * nested inside outer list_for_each_entry_safe().
952 * 936 *
953 * CONTEXT: 937 * CONTEXT:
954 * spin_lock_irq(gcwq->lock). 938 * spin_lock_irq(pool->lock).
955 */ 939 */
956static void move_linked_works(struct work_struct *work, struct list_head *head, 940static void move_linked_works(struct work_struct *work, struct list_head *head,
957 struct work_struct **nextp) 941 struct work_struct **nextp)
@@ -977,67 +961,67 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
977 *nextp = n; 961 *nextp = n;
978} 962}
979 963
980static void cwq_activate_delayed_work(struct work_struct *work) 964static void pwq_activate_delayed_work(struct work_struct *work)
981{ 965{
982 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 966 struct pool_workqueue *pwq = get_work_pwq(work);
983 967
984 trace_workqueue_activate_work(work); 968 trace_workqueue_activate_work(work);
985 move_linked_works(work, &cwq->pool->worklist, NULL); 969 move_linked_works(work, &pwq->pool->worklist, NULL);
986 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); 970 __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
987 cwq->nr_active++; 971 pwq->nr_active++;
988} 972}
989 973
990static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) 974static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
991{ 975{
992 struct work_struct *work = list_first_entry(&cwq->delayed_works, 976 struct work_struct *work = list_first_entry(&pwq->delayed_works,
993 struct work_struct, entry); 977 struct work_struct, entry);
994 978
995 cwq_activate_delayed_work(work); 979 pwq_activate_delayed_work(work);
996} 980}
997 981
998/** 982/**
999 * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight 983 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
1000 * @cwq: cwq of interest 984 * @pwq: pwq of interest
1001 * @color: color of work which left the queue 985 * @color: color of work which left the queue
1002 * 986 *
1003 * A work either has completed or is removed from pending queue, 987 * A work either has completed or is removed from pending queue,
1004 * decrement nr_in_flight of its cwq and handle workqueue flushing. 988 * decrement nr_in_flight of its pwq and handle workqueue flushing.
1005 * 989 *
1006 * CONTEXT: 990 * CONTEXT:
1007 * spin_lock_irq(gcwq->lock). 991 * spin_lock_irq(pool->lock).
1008 */ 992 */
1009static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) 993static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1010{ 994{
1011 /* ignore uncolored works */ 995 /* ignore uncolored works */
1012 if (color == WORK_NO_COLOR) 996 if (color == WORK_NO_COLOR)
1013 return; 997 return;
1014 998
1015 cwq->nr_in_flight[color]--; 999 pwq->nr_in_flight[color]--;
1016 1000
1017 cwq->nr_active--; 1001 pwq->nr_active--;
1018 if (!list_empty(&cwq->delayed_works)) { 1002 if (!list_empty(&pwq->delayed_works)) {
1019 /* one down, submit a delayed one */ 1003 /* one down, submit a delayed one */
1020 if (cwq->nr_active < cwq->max_active) 1004 if (pwq->nr_active < pwq->max_active)
1021 cwq_activate_first_delayed(cwq); 1005 pwq_activate_first_delayed(pwq);
1022 } 1006 }
1023 1007
1024 /* is flush in progress and are we at the flushing tip? */ 1008 /* is flush in progress and are we at the flushing tip? */
1025 if (likely(cwq->flush_color != color)) 1009 if (likely(pwq->flush_color != color))
1026 return; 1010 return;
1027 1011
1028 /* are there still in-flight works? */ 1012 /* are there still in-flight works? */
1029 if (cwq->nr_in_flight[color]) 1013 if (pwq->nr_in_flight[color])
1030 return; 1014 return;
1031 1015
1032 /* this cwq is done, clear flush_color */ 1016 /* this pwq is done, clear flush_color */
1033 cwq->flush_color = -1; 1017 pwq->flush_color = -1;
1034 1018
1035 /* 1019 /*
1036 * If this was the last cwq, wake up the first flusher. It 1020 * If this was the last pwq, wake up the first flusher. It
1037 * will handle the rest. 1021 * will handle the rest.
1038 */ 1022 */
1039 if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) 1023 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1040 complete(&cwq->wq->first_flusher->done); 1024 complete(&pwq->wq->first_flusher->done);
1041} 1025}
1042 1026
1043/** 1027/**
@@ -1068,7 +1052,8 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
1068static int try_to_grab_pending(struct work_struct *work, bool is_dwork, 1052static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1069 unsigned long *flags) 1053 unsigned long *flags)
1070{ 1054{
1071 struct global_cwq *gcwq; 1055 struct worker_pool *pool;
1056 struct pool_workqueue *pwq;
1072 1057
1073 local_irq_save(*flags); 1058 local_irq_save(*flags);
1074 1059
@@ -1093,41 +1078,43 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
1093 * The queueing is in progress, or it is already queued. Try to 1078 * The queueing is in progress, or it is already queued. Try to
1094 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. 1079 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
1095 */ 1080 */
1096 gcwq = get_work_gcwq(work); 1081 pool = get_work_pool(work);
1097 if (!gcwq) 1082 if (!pool)
1098 goto fail; 1083 goto fail;
1099 1084
1100 spin_lock(&gcwq->lock); 1085 spin_lock(&pool->lock);
1101 if (!list_empty(&work->entry)) { 1086 /*
1087 * work->data is guaranteed to point to pwq only while the work
1088 * item is queued on pwq->wq, and both updating work->data to point
1089 * to pwq on queueing and to pool on dequeueing are done under
1090 * pwq->pool->lock. This in turn guarantees that, if work->data
1091 * points to pwq which is associated with a locked pool, the work
1092 * item is currently queued on that pool.
1093 */
1094 pwq = get_work_pwq(work);
1095 if (pwq && pwq->pool == pool) {
1096 debug_work_deactivate(work);
1097
1102 /* 1098 /*
1103 * This work is queued, but perhaps we locked the wrong gcwq. 1099 * A delayed work item cannot be grabbed directly because
1104 * In that case we must see the new value after rmb(), see 1100 * it might have linked NO_COLOR work items which, if left
1105 * insert_work()->wmb(). 1101 * on the delayed_list, will confuse pwq->nr_active
1102 * management later on and cause stall. Make sure the work
1103 * item is activated before grabbing.
1106 */ 1104 */
1107 smp_rmb(); 1105 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1108 if (gcwq == get_work_gcwq(work)) { 1106 pwq_activate_delayed_work(work);
1109 debug_work_deactivate(work);
1110 1107
1111 /* 1108 list_del_init(&work->entry);
1112 * A delayed work item cannot be grabbed directly 1109 pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
1113 * because it might have linked NO_COLOR work items
1114 * which, if left on the delayed_list, will confuse
1115 * cwq->nr_active management later on and cause
1116 * stall. Make sure the work item is activated
1117 * before grabbing.
1118 */
1119 if (*work_data_bits(work) & WORK_STRUCT_DELAYED)
1120 cwq_activate_delayed_work(work);
1121 1110
1122 list_del_init(&work->entry); 1111 /* work->data points to pwq iff queued, point to pool */
1123 cwq_dec_nr_in_flight(get_work_cwq(work), 1112 set_work_pool_and_keep_pending(work, pool->id);
1124 get_work_color(work));
1125 1113
1126 spin_unlock(&gcwq->lock); 1114 spin_unlock(&pool->lock);
1127 return 1; 1115 return 1;
1128 }
1129 } 1116 }
1130 spin_unlock(&gcwq->lock); 1117 spin_unlock(&pool->lock);
1131fail: 1118fail:
1132 local_irq_restore(*flags); 1119 local_irq_restore(*flags);
1133 if (work_is_canceling(work)) 1120 if (work_is_canceling(work))
@@ -1137,33 +1124,25 @@ fail:
1137} 1124}
1138 1125
1139/** 1126/**
1140 * insert_work - insert a work into gcwq 1127 * insert_work - insert a work into a pool
1141 * @cwq: cwq @work belongs to 1128 * @pwq: pwq @work belongs to
1142 * @work: work to insert 1129 * @work: work to insert
1143 * @head: insertion point 1130 * @head: insertion point
1144 * @extra_flags: extra WORK_STRUCT_* flags to set 1131 * @extra_flags: extra WORK_STRUCT_* flags to set
1145 * 1132 *
1146 * Insert @work which belongs to @cwq into @gcwq after @head. 1133 * Insert @work which belongs to @pwq after @head. @extra_flags is or'd to
1147 * @extra_flags is or'd to work_struct flags. 1134 * work_struct flags.
1148 * 1135 *
1149 * CONTEXT: 1136 * CONTEXT:
1150 * spin_lock_irq(gcwq->lock). 1137 * spin_lock_irq(pool->lock).
1151 */ 1138 */
1152static void insert_work(struct cpu_workqueue_struct *cwq, 1139static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
1153 struct work_struct *work, struct list_head *head, 1140 struct list_head *head, unsigned int extra_flags)
1154 unsigned int extra_flags)
1155{ 1141{
1156 struct worker_pool *pool = cwq->pool; 1142 struct worker_pool *pool = pwq->pool;
1157 1143
1158 /* we own @work, set data and link */ 1144 /* we own @work, set data and link */
1159 set_work_cwq(work, cwq, extra_flags); 1145 set_work_pwq(work, pwq, extra_flags);
1160
1161 /*
1162 * Ensure that we get the right work->data if we see the
1163 * result of list_add() below, see try_to_grab_pending().
1164 */
1165 smp_wmb();
1166
1167 list_add_tail(&work->entry, head); 1146 list_add_tail(&work->entry, head);
1168 1147
1169 /* 1148 /*
@@ -1179,41 +1158,24 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
1179 1158
1180/* 1159/*
1181 * Test whether @work is being queued from another work executing on the 1160 * Test whether @work is being queued from another work executing on the
1182 * same workqueue. This is rather expensive and should only be used from 1161 * same workqueue.
1183 * cold paths.
1184 */ 1162 */
1185static bool is_chained_work(struct workqueue_struct *wq) 1163static bool is_chained_work(struct workqueue_struct *wq)
1186{ 1164{
1187 unsigned long flags; 1165 struct worker *worker;
1188 unsigned int cpu;
1189
1190 for_each_gcwq_cpu(cpu) {
1191 struct global_cwq *gcwq = get_gcwq(cpu);
1192 struct worker *worker;
1193 struct hlist_node *pos;
1194 int i;
1195 1166
1196 spin_lock_irqsave(&gcwq->lock, flags); 1167 worker = current_wq_worker();
1197 for_each_busy_worker(worker, i, pos, gcwq) { 1168 /*
 1198 if (worker->task != current)	 1169 * Return %true iff I'm a worker executing a work item on @wq. If
1199 continue; 1170 * I'm @worker, it's safe to dereference it without locking.
1200 spin_unlock_irqrestore(&gcwq->lock, flags); 1171 */
1201 /* 1172 return worker && worker->current_pwq->wq == wq;
1202 * I'm @worker, no locking necessary. See if @work
1203 * is headed to the same workqueue.
1204 */
1205 return worker->current_cwq->wq == wq;
1206 }
1207 spin_unlock_irqrestore(&gcwq->lock, flags);
1208 }
1209 return false;
1210} 1173}
1211 1174
1212static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 1175static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1213 struct work_struct *work) 1176 struct work_struct *work)
1214{ 1177{
1215 struct global_cwq *gcwq; 1178 struct pool_workqueue *pwq;
1216 struct cpu_workqueue_struct *cwq;
1217 struct list_head *worklist; 1179 struct list_head *worklist;
1218 unsigned int work_flags; 1180 unsigned int work_flags;
1219 unsigned int req_cpu = cpu; 1181 unsigned int req_cpu = cpu;
@@ -1233,9 +1195,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1233 WARN_ON_ONCE(!is_chained_work(wq))) 1195 WARN_ON_ONCE(!is_chained_work(wq)))
1234 return; 1196 return;
1235 1197
1236 /* determine gcwq to use */ 1198 /* determine the pwq to use */
1237 if (!(wq->flags & WQ_UNBOUND)) { 1199 if (!(wq->flags & WQ_UNBOUND)) {
1238 struct global_cwq *last_gcwq; 1200 struct worker_pool *last_pool;
1239 1201
1240 if (cpu == WORK_CPU_UNBOUND) 1202 if (cpu == WORK_CPU_UNBOUND)
1241 cpu = raw_smp_processor_id(); 1203 cpu = raw_smp_processor_id();
@@ -1246,55 +1208,54 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1246 * work needs to be queued on that cpu to guarantee 1208 * work needs to be queued on that cpu to guarantee
1247 * non-reentrancy. 1209 * non-reentrancy.
1248 */ 1210 */
1249 gcwq = get_gcwq(cpu); 1211 pwq = get_pwq(cpu, wq);
1250 last_gcwq = get_work_gcwq(work); 1212 last_pool = get_work_pool(work);
1251 1213
1252 if (last_gcwq && last_gcwq != gcwq) { 1214 if (last_pool && last_pool != pwq->pool) {
1253 struct worker *worker; 1215 struct worker *worker;
1254 1216
1255 spin_lock(&last_gcwq->lock); 1217 spin_lock(&last_pool->lock);
1256 1218
1257 worker = find_worker_executing_work(last_gcwq, work); 1219 worker = find_worker_executing_work(last_pool, work);
1258 1220
1259 if (worker && worker->current_cwq->wq == wq) 1221 if (worker && worker->current_pwq->wq == wq) {
1260 gcwq = last_gcwq; 1222 pwq = get_pwq(last_pool->cpu, wq);
1261 else { 1223 } else {
1262 /* meh... not running there, queue here */ 1224 /* meh... not running there, queue here */
1263 spin_unlock(&last_gcwq->lock); 1225 spin_unlock(&last_pool->lock);
1264 spin_lock(&gcwq->lock); 1226 spin_lock(&pwq->pool->lock);
1265 } 1227 }
1266 } else { 1228 } else {
1267 spin_lock(&gcwq->lock); 1229 spin_lock(&pwq->pool->lock);
1268 } 1230 }
1269 } else { 1231 } else {
1270 gcwq = get_gcwq(WORK_CPU_UNBOUND); 1232 pwq = get_pwq(WORK_CPU_UNBOUND, wq);
1271 spin_lock(&gcwq->lock); 1233 spin_lock(&pwq->pool->lock);
1272 } 1234 }
1273 1235
1274 /* gcwq determined, get cwq and queue */ 1236 /* pwq determined, queue */
1275 cwq = get_cwq(gcwq->cpu, wq); 1237 trace_workqueue_queue_work(req_cpu, pwq, work);
1276 trace_workqueue_queue_work(req_cpu, cwq, work);
1277 1238
1278 if (WARN_ON(!list_empty(&work->entry))) { 1239 if (WARN_ON(!list_empty(&work->entry))) {
1279 spin_unlock(&gcwq->lock); 1240 spin_unlock(&pwq->pool->lock);
1280 return; 1241 return;
1281 } 1242 }
1282 1243
1283 cwq->nr_in_flight[cwq->work_color]++; 1244 pwq->nr_in_flight[pwq->work_color]++;
1284 work_flags = work_color_to_flags(cwq->work_color); 1245 work_flags = work_color_to_flags(pwq->work_color);
1285 1246
1286 if (likely(cwq->nr_active < cwq->max_active)) { 1247 if (likely(pwq->nr_active < pwq->max_active)) {
1287 trace_workqueue_activate_work(work); 1248 trace_workqueue_activate_work(work);
1288 cwq->nr_active++; 1249 pwq->nr_active++;
1289 worklist = &cwq->pool->worklist; 1250 worklist = &pwq->pool->worklist;
1290 } else { 1251 } else {
1291 work_flags |= WORK_STRUCT_DELAYED; 1252 work_flags |= WORK_STRUCT_DELAYED;
1292 worklist = &cwq->delayed_works; 1253 worklist = &pwq->delayed_works;
1293 } 1254 }
1294 1255
1295 insert_work(cwq, work, worklist, work_flags); 1256 insert_work(pwq, work, worklist, work_flags);
1296 1257
1297 spin_unlock(&gcwq->lock); 1258 spin_unlock(&pwq->pool->lock);
1298} 1259}
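
The tail of __queue_work() above throttles each pwq: while nr_active < max_active a new item goes straight onto the pool's worklist, otherwise it is parked on pwq->delayed_works with WORK_STRUCT_DELAYED set, to be activated when an active slot frees up. The sketch below is a single-threaded toy model of just that gate; the names and fixed-size arrays are invented stand-ins for the kernel's lists.

#include <stdio.h>

#define TOY_MAX_ACTIVE 2

struct toy_pwq {
	int nr_active;
	int max_active;
	int active[16], nr_queued_active;	/* stand-in for pool->worklist */
	int delayed[16], nr_delayed;		/* stand-in for pwq->delayed_works */
};

static void toy_queue(struct toy_pwq *pwq, int work_id)
{
	if (pwq->nr_active < pwq->max_active) {
		pwq->nr_active++;
		pwq->active[pwq->nr_queued_active++] = work_id;
		printf("work %d -> active list\n", work_id);
	} else {
		pwq->delayed[pwq->nr_delayed++] = work_id;
		printf("work %d -> delayed list (max_active reached)\n", work_id);
	}
}

/* called when a work item finishes: activate one delayed item, if any */
static void toy_work_done(struct toy_pwq *pwq)
{
	pwq->nr_active--;
	if (pwq->nr_delayed) {
		int work_id = pwq->delayed[--pwq->nr_delayed];

		pwq->nr_active++;
		pwq->active[pwq->nr_queued_active++] = work_id;
		printf("work %d activated from delayed list\n", work_id);
	}
}

int main(void)
{
	struct toy_pwq pwq = { .max_active = TOY_MAX_ACTIVE };

	for (int i = 1; i <= 4; i++)
		toy_queue(&pwq, i);
	toy_work_done(&pwq);	/* finishing one active work pulls in a delayed one */
	return 0;
}
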
1299 1260
1300/** 1261/**
@@ -1345,51 +1306,37 @@ EXPORT_SYMBOL_GPL(queue_work);
1345void delayed_work_timer_fn(unsigned long __data) 1306void delayed_work_timer_fn(unsigned long __data)
1346{ 1307{
1347 struct delayed_work *dwork = (struct delayed_work *)__data; 1308 struct delayed_work *dwork = (struct delayed_work *)__data;
1348 struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work);
1349 1309
1350 /* should have been called from irqsafe timer with irq already off */ 1310 /* should have been called from irqsafe timer with irq already off */
1351 __queue_work(dwork->cpu, cwq->wq, &dwork->work); 1311 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
1352} 1312}
1353EXPORT_SYMBOL_GPL(delayed_work_timer_fn); 1313EXPORT_SYMBOL(delayed_work_timer_fn);
1354 1314
1355static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, 1315static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1356 struct delayed_work *dwork, unsigned long delay) 1316 struct delayed_work *dwork, unsigned long delay)
1357{ 1317{
1358 struct timer_list *timer = &dwork->timer; 1318 struct timer_list *timer = &dwork->timer;
1359 struct work_struct *work = &dwork->work; 1319 struct work_struct *work = &dwork->work;
1360 unsigned int lcpu;
1361 1320
1362 WARN_ON_ONCE(timer->function != delayed_work_timer_fn || 1321 WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
1363 timer->data != (unsigned long)dwork); 1322 timer->data != (unsigned long)dwork);
1364 BUG_ON(timer_pending(timer)); 1323 WARN_ON_ONCE(timer_pending(timer));
1365 BUG_ON(!list_empty(&work->entry)); 1324 WARN_ON_ONCE(!list_empty(&work->entry));
1366
1367 timer_stats_timer_set_start_info(&dwork->timer);
1368 1325
1369 /* 1326 /*
1370 * This stores cwq for the moment, for the timer_fn. Note that the 1327 * If @delay is 0, queue @dwork->work immediately. This is for
1371 * work's gcwq is preserved to allow reentrance detection for 1328 * both optimization and correctness. The earliest @timer can
1372 * delayed works. 1329 * expire is on the closest next tick and delayed_work users depend
1330 * on that there's no such delay when @delay is 0.
1373 */ 1331 */
1374 if (!(wq->flags & WQ_UNBOUND)) { 1332 if (!delay) {
1375 struct global_cwq *gcwq = get_work_gcwq(work); 1333 __queue_work(cpu, wq, &dwork->work);
1376 1334 return;
1377 /*
1378 * If we cannot get the last gcwq from @work directly,
1379 * select the last CPU such that it avoids unnecessarily
1380 * triggering non-reentrancy check in __queue_work().
1381 */
1382 lcpu = cpu;
1383 if (gcwq)
1384 lcpu = gcwq->cpu;
1385 if (lcpu == WORK_CPU_UNBOUND)
1386 lcpu = raw_smp_processor_id();
1387 } else {
1388 lcpu = WORK_CPU_UNBOUND;
1389 } 1335 }
1390 1336
1391 set_work_cwq(work, get_cwq(lcpu, wq), 0); 1337 timer_stats_timer_set_start_info(&dwork->timer);
1392 1338
1339 dwork->wq = wq;
1393 dwork->cpu = cpu; 1340 dwork->cpu = cpu;
1394 timer->expires = jiffies + delay; 1341 timer->expires = jiffies + delay;
1395 1342
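
The replacement comment explains why __queue_delayed_work() now special-cases @delay == 0: a timer cannot expire before the next tick, so only bypassing the timer really gives "queue now" semantics. That decision, modelled in a few lines of plain C (queue_now() and arm_timer() are invented stand-ins, not kernel functions):

#include <stdio.h>

typedef unsigned long ticks_t;

static ticks_t jiffies = 1000;		/* pretend current tick counter */

static void queue_now(int work_id)
{
	printf("work %d queued immediately\n", work_id);
}

static void arm_timer(int work_id, ticks_t expires)
{
	printf("work %d timer armed for tick %lu\n", work_id, expires);
}

static void queue_delayed(int work_id, ticks_t delay)
{
	/*
	 * A timer fires no earlier than the next tick, so delay == 0 must
	 * bypass the timer entirely to really mean "no delay".
	 */
	if (!delay) {
		queue_now(work_id);
		return;
	}
	arm_timer(work_id, jiffies + delay);
}

int main(void)
{
	queue_delayed(1, 0);
	queue_delayed(2, 5);
	return 0;
}
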
@@ -1417,9 +1364,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1417 bool ret = false; 1364 bool ret = false;
1418 unsigned long flags; 1365 unsigned long flags;
1419 1366
1420 if (!delay)
1421 return queue_work_on(cpu, wq, &dwork->work);
1422
1423 /* read the comment in __queue_work() */ 1367 /* read the comment in __queue_work() */
1424 local_irq_save(flags); 1368 local_irq_save(flags);
1425 1369
@@ -1509,12 +1453,11 @@ EXPORT_SYMBOL_GPL(mod_delayed_work);
1509 * necessary. 1453 * necessary.
1510 * 1454 *
1511 * LOCKING: 1455 * LOCKING:
1512 * spin_lock_irq(gcwq->lock). 1456 * spin_lock_irq(pool->lock).
1513 */ 1457 */
1514static void worker_enter_idle(struct worker *worker) 1458static void worker_enter_idle(struct worker *worker)
1515{ 1459{
1516 struct worker_pool *pool = worker->pool; 1460 struct worker_pool *pool = worker->pool;
1517 struct global_cwq *gcwq = pool->gcwq;
1518 1461
1519 BUG_ON(worker->flags & WORKER_IDLE); 1462 BUG_ON(worker->flags & WORKER_IDLE);
1520 BUG_ON(!list_empty(&worker->entry) && 1463 BUG_ON(!list_empty(&worker->entry) &&
@@ -1532,14 +1475,14 @@ static void worker_enter_idle(struct worker *worker)
1532 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); 1475 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1533 1476
1534 /* 1477 /*
1535 * Sanity check nr_running. Because gcwq_unbind_fn() releases 1478 * Sanity check nr_running. Because wq_unbind_fn() releases
1536 * gcwq->lock between setting %WORKER_UNBOUND and zapping 1479 * pool->lock between setting %WORKER_UNBOUND and zapping
1537 * nr_running, the warning may trigger spuriously. Check iff 1480 * nr_running, the warning may trigger spuriously. Check iff
1538 * unbind is not in progress. 1481 * unbind is not in progress.
1539 */ 1482 */
1540 WARN_ON_ONCE(!(gcwq->flags & GCWQ_DISASSOCIATED) && 1483 WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
1541 pool->nr_workers == pool->nr_idle && 1484 pool->nr_workers == pool->nr_idle &&
1542 atomic_read(get_pool_nr_running(pool))); 1485 atomic_read(&pool->nr_running));
1543} 1486}
1544 1487
1545/** 1488/**
@@ -1549,7 +1492,7 @@ static void worker_enter_idle(struct worker *worker)
1549 * @worker is leaving idle state. Update stats. 1492 * @worker is leaving idle state. Update stats.
1550 * 1493 *
1551 * LOCKING: 1494 * LOCKING:
1552 * spin_lock_irq(gcwq->lock). 1495 * spin_lock_irq(pool->lock).
1553 */ 1496 */
1554static void worker_leave_idle(struct worker *worker) 1497static void worker_leave_idle(struct worker *worker)
1555{ 1498{
@@ -1562,7 +1505,7 @@ static void worker_leave_idle(struct worker *worker)
1562} 1505}
1563 1506
1564/** 1507/**
1565 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq 1508 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool
1566 * @worker: self 1509 * @worker: self
1567 * 1510 *
1568 * Works which are scheduled while the cpu is online must at least be 1511 * Works which are scheduled while the cpu is online must at least be
@@ -1574,27 +1517,27 @@ static void worker_leave_idle(struct worker *worker)
1574 * themselves to the target cpu and may race with cpu going down or 1517 * themselves to the target cpu and may race with cpu going down or
1575 * coming online. kthread_bind() can't be used because it may put the 1518 * coming online. kthread_bind() can't be used because it may put the
1576 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used 1519 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
1577 * verbatim as it's best effort and blocking and gcwq may be 1520 * verbatim as it's best effort and blocking and pool may be
1578 * [dis]associated in the meantime. 1521 * [dis]associated in the meantime.
1579 * 1522 *
1580 * This function tries set_cpus_allowed() and locks gcwq and verifies the 1523 * This function tries set_cpus_allowed() and locks pool and verifies the
1581 * binding against %GCWQ_DISASSOCIATED which is set during 1524 * binding against %POOL_DISASSOCIATED which is set during
1582 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker 1525 * %CPU_DOWN_PREPARE and cleared during %CPU_ONLINE, so if the worker
1583 * enters idle state or fetches works without dropping lock, it can 1526 * enters idle state or fetches works without dropping lock, it can
1584 * guarantee the scheduling requirement described in the first paragraph. 1527 * guarantee the scheduling requirement described in the first paragraph.
1585 * 1528 *
1586 * CONTEXT: 1529 * CONTEXT:
1587 * Might sleep. Called without any lock but returns with gcwq->lock 1530 * Might sleep. Called without any lock but returns with pool->lock
1588 * held. 1531 * held.
1589 * 1532 *
1590 * RETURNS: 1533 * RETURNS:
1591 * %true if the associated gcwq is online (@worker is successfully 1534 * %true if the associated pool is online (@worker is successfully
1592 * bound), %false if offline. 1535 * bound), %false if offline.
1593 */ 1536 */
1594static bool worker_maybe_bind_and_lock(struct worker *worker) 1537static bool worker_maybe_bind_and_lock(struct worker *worker)
1595__acquires(&gcwq->lock) 1538__acquires(&pool->lock)
1596{ 1539{
1597 struct global_cwq *gcwq = worker->pool->gcwq; 1540 struct worker_pool *pool = worker->pool;
1598 struct task_struct *task = worker->task; 1541 struct task_struct *task = worker->task;
1599 1542
1600 while (true) { 1543 while (true) {
@@ -1602,19 +1545,19 @@ __acquires(&gcwq->lock)
1602 * The following call may fail, succeed or succeed 1545 * The following call may fail, succeed or succeed
1603 * without actually migrating the task to the cpu if 1546 * without actually migrating the task to the cpu if
1604 * it races with cpu hotunplug operation. Verify 1547 * it races with cpu hotunplug operation. Verify
1605 * against GCWQ_DISASSOCIATED. 1548 * against POOL_DISASSOCIATED.
1606 */ 1549 */
1607 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) 1550 if (!(pool->flags & POOL_DISASSOCIATED))
1608 set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); 1551 set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu));
1609 1552
1610 spin_lock_irq(&gcwq->lock); 1553 spin_lock_irq(&pool->lock);
1611 if (gcwq->flags & GCWQ_DISASSOCIATED) 1554 if (pool->flags & POOL_DISASSOCIATED)
1612 return false; 1555 return false;
1613 if (task_cpu(task) == gcwq->cpu && 1556 if (task_cpu(task) == pool->cpu &&
1614 cpumask_equal(&current->cpus_allowed, 1557 cpumask_equal(&current->cpus_allowed,
1615 get_cpu_mask(gcwq->cpu))) 1558 get_cpu_mask(pool->cpu)))
1616 return true; 1559 return true;
1617 spin_unlock_irq(&gcwq->lock); 1560 spin_unlock_irq(&pool->lock);
1618 1561
1619 /* 1562 /*
1620 * We've raced with CPU hot[un]plug. Give it a breather 1563 * We've raced with CPU hot[un]plug. Give it a breather
@@ -1633,15 +1576,13 @@ __acquires(&gcwq->lock)
1633 */ 1576 */
1634static void idle_worker_rebind(struct worker *worker) 1577static void idle_worker_rebind(struct worker *worker)
1635{ 1578{
1636 struct global_cwq *gcwq = worker->pool->gcwq;
1637
1638 /* CPU may go down again inbetween, clear UNBOUND only on success */ 1579 /* CPU may go down again inbetween, clear UNBOUND only on success */
1639 if (worker_maybe_bind_and_lock(worker)) 1580 if (worker_maybe_bind_and_lock(worker))
1640 worker_clr_flags(worker, WORKER_UNBOUND); 1581 worker_clr_flags(worker, WORKER_UNBOUND);
1641 1582
1642 /* rebind complete, become available again */ 1583 /* rebind complete, become available again */
1643 list_add(&worker->entry, &worker->pool->idle_list); 1584 list_add(&worker->entry, &worker->pool->idle_list);
1644 spin_unlock_irq(&gcwq->lock); 1585 spin_unlock_irq(&worker->pool->lock);
1645} 1586}
1646 1587
1647/* 1588/*
@@ -1653,19 +1594,18 @@ static void idle_worker_rebind(struct worker *worker)
1653static void busy_worker_rebind_fn(struct work_struct *work) 1594static void busy_worker_rebind_fn(struct work_struct *work)
1654{ 1595{
1655 struct worker *worker = container_of(work, struct worker, rebind_work); 1596 struct worker *worker = container_of(work, struct worker, rebind_work);
1656 struct global_cwq *gcwq = worker->pool->gcwq;
1657 1597
1658 if (worker_maybe_bind_and_lock(worker)) 1598 if (worker_maybe_bind_and_lock(worker))
1659 worker_clr_flags(worker, WORKER_UNBOUND); 1599 worker_clr_flags(worker, WORKER_UNBOUND);
1660 1600
1661 spin_unlock_irq(&gcwq->lock); 1601 spin_unlock_irq(&worker->pool->lock);
1662} 1602}
1663 1603
1664/** 1604/**
1665 * rebind_workers - rebind all workers of a gcwq to the associated CPU 1605 * rebind_workers - rebind all workers of a pool to the associated CPU
1666 * @gcwq: gcwq of interest 1606 * @pool: pool of interest
1667 * 1607 *
1668 * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding 1608 * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding
1669 * is different for idle and busy ones. 1609 * is different for idle and busy ones.
1670 * 1610 *
1671 * Idle ones will be removed from the idle_list and woken up. They will 1611 * Idle ones will be removed from the idle_list and woken up. They will
@@ -1683,38 +1623,31 @@ static void busy_worker_rebind_fn(struct work_struct *work)
1683 * including the manager will not appear on @idle_list until rebind is 1623 * including the manager will not appear on @idle_list until rebind is
1684 * complete, making local wake-ups safe. 1624 * complete, making local wake-ups safe.
1685 */ 1625 */
1686static void rebind_workers(struct global_cwq *gcwq) 1626static void rebind_workers(struct worker_pool *pool)
1687{ 1627{
1688 struct worker_pool *pool;
1689 struct worker *worker, *n; 1628 struct worker *worker, *n;
1690 struct hlist_node *pos;
1691 int i; 1629 int i;
1692 1630
1693 lockdep_assert_held(&gcwq->lock); 1631 lockdep_assert_held(&pool->assoc_mutex);
1694 1632 lockdep_assert_held(&pool->lock);
1695 for_each_worker_pool(pool, gcwq)
1696 lockdep_assert_held(&pool->assoc_mutex);
1697 1633
1698 /* dequeue and kick idle ones */ 1634 /* dequeue and kick idle ones */
1699 for_each_worker_pool(pool, gcwq) { 1635 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1700 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { 1636 /*
1701 /* 1637 * idle workers should be off @pool->idle_list until rebind
1702 * idle workers should be off @pool->idle_list 1638 * is complete to avoid receiving premature local wake-ups.
1703 * until rebind is complete to avoid receiving 1639 */
1704 * premature local wake-ups. 1640 list_del_init(&worker->entry);
1705 */
1706 list_del_init(&worker->entry);
1707 1641
1708 /* 1642 /*
1709 * worker_thread() will see the above dequeuing 1643 * worker_thread() will see the above dequeuing and call
1710 * and call idle_worker_rebind(). 1644 * idle_worker_rebind().
1711 */ 1645 */
1712 wake_up_process(worker->task); 1646 wake_up_process(worker->task);
1713 }
1714 } 1647 }
1715 1648
1716 /* rebind busy workers */ 1649 /* rebind busy workers */
1717 for_each_busy_worker(worker, i, pos, gcwq) { 1650 for_each_busy_worker(worker, i, pool) {
1718 struct work_struct *rebind_work = &worker->rebind_work; 1651 struct work_struct *rebind_work = &worker->rebind_work;
1719 struct workqueue_struct *wq; 1652 struct workqueue_struct *wq;
1720 1653
@@ -1726,16 +1659,16 @@ static void rebind_workers(struct global_cwq *gcwq)
1726 1659
1727 /* 1660 /*
1728 * wq doesn't really matter but let's keep @worker->pool 1661 * wq doesn't really matter but let's keep @worker->pool
1729 * and @cwq->pool consistent for sanity. 1662 * and @pwq->pool consistent for sanity.
1730 */ 1663 */
1731 if (worker_pool_pri(worker->pool)) 1664 if (std_worker_pool_pri(worker->pool))
1732 wq = system_highpri_wq; 1665 wq = system_highpri_wq;
1733 else 1666 else
1734 wq = system_wq; 1667 wq = system_wq;
1735 1668
1736 insert_work(get_cwq(gcwq->cpu, wq), rebind_work, 1669 insert_work(get_pwq(pool->cpu, wq), rebind_work,
1737 worker->scheduled.next, 1670 worker->scheduled.next,
1738 work_color_to_flags(WORK_NO_COLOR)); 1671 work_color_to_flags(WORK_NO_COLOR));
1739 } 1672 }
1740} 1673}
1741 1674
@@ -1770,19 +1703,18 @@ static struct worker *alloc_worker(void)
1770 */ 1703 */
1771static struct worker *create_worker(struct worker_pool *pool) 1704static struct worker *create_worker(struct worker_pool *pool)
1772{ 1705{
1773 struct global_cwq *gcwq = pool->gcwq; 1706 const char *pri = std_worker_pool_pri(pool) ? "H" : "";
1774 const char *pri = worker_pool_pri(pool) ? "H" : "";
1775 struct worker *worker = NULL; 1707 struct worker *worker = NULL;
1776 int id = -1; 1708 int id = -1;
1777 1709
1778 spin_lock_irq(&gcwq->lock); 1710 spin_lock_irq(&pool->lock);
1779 while (ida_get_new(&pool->worker_ida, &id)) { 1711 while (ida_get_new(&pool->worker_ida, &id)) {
1780 spin_unlock_irq(&gcwq->lock); 1712 spin_unlock_irq(&pool->lock);
1781 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) 1713 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
1782 goto fail; 1714 goto fail;
1783 spin_lock_irq(&gcwq->lock); 1715 spin_lock_irq(&pool->lock);
1784 } 1716 }
1785 spin_unlock_irq(&gcwq->lock); 1717 spin_unlock_irq(&pool->lock);
1786 1718
1787 worker = alloc_worker(); 1719 worker = alloc_worker();
1788 if (!worker) 1720 if (!worker)
@@ -1791,30 +1723,30 @@ static struct worker *create_worker(struct worker_pool *pool)
1791 worker->pool = pool; 1723 worker->pool = pool;
1792 worker->id = id; 1724 worker->id = id;
1793 1725
1794 if (gcwq->cpu != WORK_CPU_UNBOUND) 1726 if (pool->cpu != WORK_CPU_UNBOUND)
1795 worker->task = kthread_create_on_node(worker_thread, 1727 worker->task = kthread_create_on_node(worker_thread,
1796 worker, cpu_to_node(gcwq->cpu), 1728 worker, cpu_to_node(pool->cpu),
1797 "kworker/%u:%d%s", gcwq->cpu, id, pri); 1729 "kworker/%u:%d%s", pool->cpu, id, pri);
1798 else 1730 else
1799 worker->task = kthread_create(worker_thread, worker, 1731 worker->task = kthread_create(worker_thread, worker,
1800 "kworker/u:%d%s", id, pri); 1732 "kworker/u:%d%s", id, pri);
1801 if (IS_ERR(worker->task)) 1733 if (IS_ERR(worker->task))
1802 goto fail; 1734 goto fail;
1803 1735
1804 if (worker_pool_pri(pool)) 1736 if (std_worker_pool_pri(pool))
1805 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); 1737 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
1806 1738
1807 /* 1739 /*
1808 * Determine CPU binding of the new worker depending on 1740 * Determine CPU binding of the new worker depending on
1809 * %GCWQ_DISASSOCIATED. The caller is responsible for ensuring the 1741 * %POOL_DISASSOCIATED. The caller is responsible for ensuring the
1810 * flag remains stable across this function. See the comments 1742 * flag remains stable across this function. See the comments
1811 * above the flag definition for details. 1743 * above the flag definition for details.
1812 * 1744 *
1813 * As an unbound worker may later become a regular one if CPU comes 1745 * As an unbound worker may later become a regular one if CPU comes
1814 * online, make sure every worker has %PF_THREAD_BOUND set. 1746 * online, make sure every worker has %PF_THREAD_BOUND set.
1815 */ 1747 */
1816 if (!(gcwq->flags & GCWQ_DISASSOCIATED)) { 1748 if (!(pool->flags & POOL_DISASSOCIATED)) {
1817 kthread_bind(worker->task, gcwq->cpu); 1749 kthread_bind(worker->task, pool->cpu);
1818 } else { 1750 } else {
1819 worker->task->flags |= PF_THREAD_BOUND; 1751 worker->task->flags |= PF_THREAD_BOUND;
1820 worker->flags |= WORKER_UNBOUND; 1752 worker->flags |= WORKER_UNBOUND;
@@ -1823,9 +1755,9 @@ static struct worker *create_worker(struct worker_pool *pool)
1823 return worker; 1755 return worker;
1824fail: 1756fail:
1825 if (id >= 0) { 1757 if (id >= 0) {
1826 spin_lock_irq(&gcwq->lock); 1758 spin_lock_irq(&pool->lock);
1827 ida_remove(&pool->worker_ida, id); 1759 ida_remove(&pool->worker_ida, id);
1828 spin_unlock_irq(&gcwq->lock); 1760 spin_unlock_irq(&pool->lock);
1829 } 1761 }
1830 kfree(worker); 1762 kfree(worker);
1831 return NULL; 1763 return NULL;
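
create_worker() shows the preload-outside-the-lock pattern around ida_get_new()/ida_pre_get(): pool->lock is a spinlock and cannot be held across a GFP_KERNEL allocation, so when ID allocation fails the lock is dropped, storage is reserved, and the allocation is retried under the lock. Below is a rough userspace analogue with a pthread mutex and an invented toy_ida; it is not the kernel's ida API, just the same shape.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_ida {
	int *slots;		/* slots[i] != 0 means id i is in use */
	int capacity;
};

/* cheap step, safe under the lock: succeeds only if capacity suffices */
static bool toy_ida_get_new(struct toy_ida *ida, int *id)
{
	for (int i = 0; i < ida->capacity; i++) {
		if (!ida->slots[i]) {
			ida->slots[i] = 1;
			*id = i;
			return true;
		}
	}
	return false;	/* caller must grow the pool outside the lock */
}

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static struct toy_ida worker_ida;

static int alloc_worker_id(void)
{
	int id;

	pthread_mutex_lock(&pool_lock);
	while (!toy_ida_get_new(&worker_ida, &id)) {
		int newcap = worker_ida.capacity ? worker_ida.capacity * 2 : 4;
		int *spare;

		/* drop the lock before the possibly-sleeping allocation */
		pthread_mutex_unlock(&pool_lock);
		spare = calloc(newcap, sizeof(*spare));
		if (!spare)
			return -1;
		pthread_mutex_lock(&pool_lock);

		/* install the spare storage under the lock, then retry */
		if (newcap > worker_ida.capacity) {
			for (int i = 0; i < worker_ida.capacity; i++)
				spare[i] = worker_ida.slots[i];
			free(worker_ida.slots);
			worker_ida.slots = spare;
			worker_ida.capacity = newcap;
		} else {
			free(spare);	/* someone else grew it meanwhile */
		}
	}
	pthread_mutex_unlock(&pool_lock);
	return id;
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		printf("worker id %d\n", alloc_worker_id());
	free(worker_ida.slots);
	return 0;
}
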
@@ -1835,10 +1767,10 @@ fail:
1835 * start_worker - start a newly created worker 1767 * start_worker - start a newly created worker
1836 * @worker: worker to start 1768 * @worker: worker to start
1837 * 1769 *
1838 * Make the gcwq aware of @worker and start it. 1770 * Make the pool aware of @worker and start it.
1839 * 1771 *
1840 * CONTEXT: 1772 * CONTEXT:
1841 * spin_lock_irq(gcwq->lock). 1773 * spin_lock_irq(pool->lock).
1842 */ 1774 */
1843static void start_worker(struct worker *worker) 1775static void start_worker(struct worker *worker)
1844{ 1776{
@@ -1852,15 +1784,14 @@ static void start_worker(struct worker *worker)
1852 * destroy_worker - destroy a workqueue worker 1784 * destroy_worker - destroy a workqueue worker
1853 * @worker: worker to be destroyed 1785 * @worker: worker to be destroyed
1854 * 1786 *
1855 * Destroy @worker and adjust @gcwq stats accordingly. 1787 * Destroy @worker and adjust @pool stats accordingly.
1856 * 1788 *
1857 * CONTEXT: 1789 * CONTEXT:
1858 * spin_lock_irq(gcwq->lock) which is released and regrabbed. 1790 * spin_lock_irq(pool->lock) which is released and regrabbed.
1859 */ 1791 */
1860static void destroy_worker(struct worker *worker) 1792static void destroy_worker(struct worker *worker)
1861{ 1793{
1862 struct worker_pool *pool = worker->pool; 1794 struct worker_pool *pool = worker->pool;
1863 struct global_cwq *gcwq = pool->gcwq;
1864 int id = worker->id; 1795 int id = worker->id;
1865 1796
1866 /* sanity check frenzy */ 1797 /* sanity check frenzy */
@@ -1875,21 +1806,20 @@ static void destroy_worker(struct worker *worker)
1875 list_del_init(&worker->entry); 1806 list_del_init(&worker->entry);
1876 worker->flags |= WORKER_DIE; 1807 worker->flags |= WORKER_DIE;
1877 1808
1878 spin_unlock_irq(&gcwq->lock); 1809 spin_unlock_irq(&pool->lock);
1879 1810
1880 kthread_stop(worker->task); 1811 kthread_stop(worker->task);
1881 kfree(worker); 1812 kfree(worker);
1882 1813
1883 spin_lock_irq(&gcwq->lock); 1814 spin_lock_irq(&pool->lock);
1884 ida_remove(&pool->worker_ida, id); 1815 ida_remove(&pool->worker_ida, id);
1885} 1816}
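
destroy_worker() drops pool->lock around kthread_stop() because stopping the worker sleeps until the worker thread exits, and the exiting worker needs pool->lock itself; joining with the lock held would deadlock. The same shape in plain pthreads, with all names invented, is sketched here.

#define _DEFAULT_SOURCE
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static bool worker_should_die;

static void *worker_fn(void *arg)
{
	(void)arg;
	for (;;) {
		/* the worker needs pool_lock to notice the DIE flag */
		pthread_mutex_lock(&pool_lock);
		bool die = worker_should_die;
		pthread_mutex_unlock(&pool_lock);
		if (die)
			break;
		usleep(1000);
	}
	printf("worker exiting\n");
	return NULL;
}

static void destroy_worker(pthread_t worker)
{
	pthread_mutex_lock(&pool_lock);
	worker_should_die = true;	/* analogous to setting WORKER_DIE */

	/*
	 * Drop the lock before joining: the worker must take pool_lock to
	 * see the flag and exit, so joining with the lock held would
	 * deadlock.  This mirrors dropping pool->lock around kthread_stop().
	 */
	pthread_mutex_unlock(&pool_lock);
	pthread_join(worker, NULL);

	pthread_mutex_lock(&pool_lock);
	/* ...release per-worker resources (worker id etc.) under the lock... */
	pthread_mutex_unlock(&pool_lock);
}

int main(void)
{
	pthread_t worker;

	pthread_create(&worker, NULL, worker_fn, NULL);
	usleep(10000);
	destroy_worker(worker);
	return 0;
}
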
1886 1817
1887static void idle_worker_timeout(unsigned long __pool) 1818static void idle_worker_timeout(unsigned long __pool)
1888{ 1819{
1889 struct worker_pool *pool = (void *)__pool; 1820 struct worker_pool *pool = (void *)__pool;
1890 struct global_cwq *gcwq = pool->gcwq;
1891 1821
1892 spin_lock_irq(&gcwq->lock); 1822 spin_lock_irq(&pool->lock);
1893 1823
1894 if (too_many_workers(pool)) { 1824 if (too_many_workers(pool)) {
1895 struct worker *worker; 1825 struct worker *worker;
@@ -1908,20 +1838,20 @@ static void idle_worker_timeout(unsigned long __pool)
1908 } 1838 }
1909 } 1839 }
1910 1840
1911 spin_unlock_irq(&gcwq->lock); 1841 spin_unlock_irq(&pool->lock);
1912} 1842}
1913 1843
1914static bool send_mayday(struct work_struct *work) 1844static bool send_mayday(struct work_struct *work)
1915{ 1845{
1916 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 1846 struct pool_workqueue *pwq = get_work_pwq(work);
1917 struct workqueue_struct *wq = cwq->wq; 1847 struct workqueue_struct *wq = pwq->wq;
1918 unsigned int cpu; 1848 unsigned int cpu;
1919 1849
1920 if (!(wq->flags & WQ_RESCUER)) 1850 if (!(wq->flags & WQ_RESCUER))
1921 return false; 1851 return false;
1922 1852
1923 /* mayday mayday mayday */ 1853 /* mayday mayday mayday */
1924 cpu = cwq->pool->gcwq->cpu; 1854 cpu = pwq->pool->cpu;
1925 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1855 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
1926 if (cpu == WORK_CPU_UNBOUND) 1856 if (cpu == WORK_CPU_UNBOUND)
1927 cpu = 0; 1857 cpu = 0;
@@ -1930,13 +1860,12 @@ static bool send_mayday(struct work_struct *work)
1930 return true; 1860 return true;
1931} 1861}
1932 1862
1933static void gcwq_mayday_timeout(unsigned long __pool) 1863static void pool_mayday_timeout(unsigned long __pool)
1934{ 1864{
1935 struct worker_pool *pool = (void *)__pool; 1865 struct worker_pool *pool = (void *)__pool;
1936 struct global_cwq *gcwq = pool->gcwq;
1937 struct work_struct *work; 1866 struct work_struct *work;
1938 1867
1939 spin_lock_irq(&gcwq->lock); 1868 spin_lock_irq(&pool->lock);
1940 1869
1941 if (need_to_create_worker(pool)) { 1870 if (need_to_create_worker(pool)) {
1942 /* 1871 /*
@@ -1949,7 +1878,7 @@ static void gcwq_mayday_timeout(unsigned long __pool)
1949 send_mayday(work); 1878 send_mayday(work);
1950 } 1879 }
1951 1880
1952 spin_unlock_irq(&gcwq->lock); 1881 spin_unlock_irq(&pool->lock);
1953 1882
1954 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1883 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1955} 1884}
@@ -1968,24 +1897,22 @@ static void gcwq_mayday_timeout(unsigned long __pool)
1968 * may_start_working() true. 1897 * may_start_working() true.
1969 * 1898 *
1970 * LOCKING: 1899 * LOCKING:
1971 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 1900 * spin_lock_irq(pool->lock) which may be released and regrabbed
1972 * multiple times. Does GFP_KERNEL allocations. Called only from 1901 * multiple times. Does GFP_KERNEL allocations. Called only from
1973 * manager. 1902 * manager.
1974 * 1903 *
1975 * RETURNS: 1904 * RETURNS:
1976 * false if no action was taken and gcwq->lock stayed locked, true 1905 * false if no action was taken and pool->lock stayed locked, true
1977 * otherwise. 1906 * otherwise.
1978 */ 1907 */
1979static bool maybe_create_worker(struct worker_pool *pool) 1908static bool maybe_create_worker(struct worker_pool *pool)
1980__releases(&gcwq->lock) 1909__releases(&pool->lock)
1981__acquires(&gcwq->lock) 1910__acquires(&pool->lock)
1982{ 1911{
1983 struct global_cwq *gcwq = pool->gcwq;
1984
1985 if (!need_to_create_worker(pool)) 1912 if (!need_to_create_worker(pool))
1986 return false; 1913 return false;
1987restart: 1914restart:
1988 spin_unlock_irq(&gcwq->lock); 1915 spin_unlock_irq(&pool->lock);
1989 1916
1990 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ 1917 /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
1991 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); 1918 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
@@ -1996,7 +1923,7 @@ restart:
1996 worker = create_worker(pool); 1923 worker = create_worker(pool);
1997 if (worker) { 1924 if (worker) {
1998 del_timer_sync(&pool->mayday_timer); 1925 del_timer_sync(&pool->mayday_timer);
1999 spin_lock_irq(&gcwq->lock); 1926 spin_lock_irq(&pool->lock);
2000 start_worker(worker); 1927 start_worker(worker);
2001 BUG_ON(need_to_create_worker(pool)); 1928 BUG_ON(need_to_create_worker(pool));
2002 return true; 1929 return true;
@@ -2013,7 +1940,7 @@ restart:
2013 } 1940 }
2014 1941
2015 del_timer_sync(&pool->mayday_timer); 1942 del_timer_sync(&pool->mayday_timer);
2016 spin_lock_irq(&gcwq->lock); 1943 spin_lock_irq(&pool->lock);
2017 if (need_to_create_worker(pool)) 1944 if (need_to_create_worker(pool))
2018 goto restart; 1945 goto restart;
2019 return true; 1946 return true;
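
maybe_create_worker() keeps retrying worker creation while a mayday timer is armed: if creation doesn't succeed within MAYDAY_INITIAL_TIMEOUT the rescuers are summoned, but the loop never gives up while a worker is still needed. The userspace approximation below polls a deadline instead of using a timer; create_worker() and send_mayday() here are fakes, and the constants are invented.

#define _POSIX_C_SOURCE 200809L
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define CREATE_COOLDOWN_MS	100	/* stand-in for CREATE_COOLDOWN */
#define MAYDAY_TIMEOUT_MS	300	/* stand-in for MAYDAY_INITIAL_TIMEOUT */

static int attempts;

/* pretend worker creation that fails a few times (e.g. memory pressure) */
static bool create_worker(void)
{
	return ++attempts >= 5;
}

static void send_mayday(void)
{
	printf("mayday: asking rescuers for help\n");
}

static long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000L + ts.tv_nsec / 1000000L;
}

static void sleep_ms(long ms)
{
	struct timespec ts = { .tv_sec = ms / 1000,
			       .tv_nsec = (ms % 1000) * 1000000L };

	nanosleep(&ts, NULL);
}

/* keep retrying; past the deadline, summon help but don't give up */
static void maybe_create_worker(void)
{
	long deadline = now_ms() + MAYDAY_TIMEOUT_MS;
	bool mayday_sent = false;

	while (!create_worker()) {
		if (!mayday_sent && now_ms() >= deadline) {
			send_mayday();	/* the kernel does this from a timer */
			mayday_sent = true;
		}
		sleep_ms(CREATE_COOLDOWN_MS);
	}
	printf("worker created after %d attempts\n", attempts);
}

int main(void)
{
	maybe_create_worker();
	return 0;
}
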
@@ -2027,11 +1954,11 @@ restart:
2027 * IDLE_WORKER_TIMEOUT. 1954 * IDLE_WORKER_TIMEOUT.
2028 * 1955 *
2029 * LOCKING: 1956 * LOCKING:
2030 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 1957 * spin_lock_irq(pool->lock) which may be released and regrabbed
2031 * multiple times. Called only from manager. 1958 * multiple times. Called only from manager.
2032 * 1959 *
2033 * RETURNS: 1960 * RETURNS:
2034 * false if no action was taken and gcwq->lock stayed locked, true 1961 * false if no action was taken and pool->lock stayed locked, true
2035 * otherwise. 1962 * otherwise.
2036 */ 1963 */
2037static bool maybe_destroy_workers(struct worker_pool *pool) 1964static bool maybe_destroy_workers(struct worker_pool *pool)
@@ -2061,21 +1988,21 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
2061 * manage_workers - manage worker pool 1988 * manage_workers - manage worker pool
2062 * @worker: self 1989 * @worker: self
2063 * 1990 *
2064 * Assume the manager role and manage gcwq worker pool @worker belongs 1991 * Assume the manager role and manage the worker pool @worker belongs
2065 * to. At any given time, there can be only zero or one manager per 1992 * to. At any given time, there can be only zero or one manager per
2066 * gcwq. The exclusion is handled automatically by this function. 1993 * pool. The exclusion is handled automatically by this function.
2067 * 1994 *
2068 * The caller can safely start processing works on false return. On 1995 * The caller can safely start processing works on false return. On
2069 * true return, it's guaranteed that need_to_create_worker() is false 1996 * true return, it's guaranteed that need_to_create_worker() is false
2070 * and may_start_working() is true. 1997 * and may_start_working() is true.
2071 * 1998 *
2072 * CONTEXT: 1999 * CONTEXT:
2073 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 2000 * spin_lock_irq(pool->lock) which may be released and regrabbed
2074 * multiple times. Does GFP_KERNEL allocations. 2001 * multiple times. Does GFP_KERNEL allocations.
2075 * 2002 *
2076 * RETURNS: 2003 * RETURNS:
 2077 * false if no action was taken and gcwq->lock stayed locked, true if 2004 * false if no action was taken and pool->lock stayed locked, true if
 2078 * some action was taken. 2005 * some action was taken.
2079 */ 2006 */
2080static bool manage_workers(struct worker *worker) 2007static bool manage_workers(struct worker *worker)
2081{ 2008{
@@ -2097,20 +2024,20 @@ static bool manage_workers(struct worker *worker)
2097 * manager against CPU hotplug. 2024 * manager against CPU hotplug.
2098 * 2025 *
2099 * assoc_mutex would always be free unless CPU hotplug is in 2026 * assoc_mutex would always be free unless CPU hotplug is in
2100 * progress. trylock first without dropping @gcwq->lock. 2027 * progress. trylock first without dropping @pool->lock.
2101 */ 2028 */
2102 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { 2029 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
2103 spin_unlock_irq(&pool->gcwq->lock); 2030 spin_unlock_irq(&pool->lock);
2104 mutex_lock(&pool->assoc_mutex); 2031 mutex_lock(&pool->assoc_mutex);
2105 /* 2032 /*
2106 * CPU hotplug could have happened while we were waiting 2033 * CPU hotplug could have happened while we were waiting
2107 * for assoc_mutex. Hotplug itself can't handle us 2034 * for assoc_mutex. Hotplug itself can't handle us
2108 * because manager isn't either on idle or busy list, and 2035 * because manager isn't either on idle or busy list, and
2109 * @gcwq's state and ours could have deviated. 2036 * @pool's state and ours could have deviated.
2110 * 2037 *
2111 * As hotplug is now excluded via assoc_mutex, we can 2038 * As hotplug is now excluded via assoc_mutex, we can
2112 * simply try to bind. It will succeed or fail depending 2039 * simply try to bind. It will succeed or fail depending
2113 * on @gcwq's current state. Try it and adjust 2040 * on @pool's current state. Try it and adjust
2114 * %WORKER_UNBOUND accordingly. 2041 * %WORKER_UNBOUND accordingly.
2115 */ 2042 */
2116 if (worker_maybe_bind_and_lock(worker)) 2043 if (worker_maybe_bind_and_lock(worker))
@@ -2147,18 +2074,15 @@ static bool manage_workers(struct worker *worker)
2147 * call this function to process a work. 2074 * call this function to process a work.
2148 * 2075 *
2149 * CONTEXT: 2076 * CONTEXT:
2150 * spin_lock_irq(gcwq->lock) which is released and regrabbed. 2077 * spin_lock_irq(pool->lock) which is released and regrabbed.
2151 */ 2078 */
2152static void process_one_work(struct worker *worker, struct work_struct *work) 2079static void process_one_work(struct worker *worker, struct work_struct *work)
2153__releases(&gcwq->lock) 2080__releases(&pool->lock)
2154__acquires(&gcwq->lock) 2081__acquires(&pool->lock)
2155{ 2082{
2156 struct cpu_workqueue_struct *cwq = get_work_cwq(work); 2083 struct pool_workqueue *pwq = get_work_pwq(work);
2157 struct worker_pool *pool = worker->pool; 2084 struct worker_pool *pool = worker->pool;
2158 struct global_cwq *gcwq = pool->gcwq; 2085 bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
2159 struct hlist_head *bwh = busy_worker_head(gcwq, work);
2160 bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
2161 work_func_t f = work->func;
2162 int work_color; 2086 int work_color;
2163 struct worker *collision; 2087 struct worker *collision;
2164#ifdef CONFIG_LOCKDEP 2088#ifdef CONFIG_LOCKDEP
@@ -2176,11 +2100,11 @@ __acquires(&gcwq->lock)
2176 /* 2100 /*
2177 * Ensure we're on the correct CPU. DISASSOCIATED test is 2101 * Ensure we're on the correct CPU. DISASSOCIATED test is
2178 * necessary to avoid spurious warnings from rescuers servicing the 2102 * necessary to avoid spurious warnings from rescuers servicing the
2179 * unbound or a disassociated gcwq. 2103 * unbound or a disassociated pool.
2180 */ 2104 */
2181 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && 2105 WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
2182 !(gcwq->flags & GCWQ_DISASSOCIATED) && 2106 !(pool->flags & POOL_DISASSOCIATED) &&
2183 raw_smp_processor_id() != gcwq->cpu); 2107 raw_smp_processor_id() != pool->cpu);
2184 2108
2185 /* 2109 /*
2186 * A single work shouldn't be executed concurrently by 2110 * A single work shouldn't be executed concurrently by
@@ -2188,7 +2112,7 @@ __acquires(&gcwq->lock)
2188 * already processing the work. If so, defer the work to the 2112 * already processing the work. If so, defer the work to the
2189 * currently executing one. 2113 * currently executing one.
2190 */ 2114 */
2191 collision = __find_worker_executing_work(gcwq, bwh, work); 2115 collision = find_worker_executing_work(pool, work);
2192 if (unlikely(collision)) { 2116 if (unlikely(collision)) {
2193 move_linked_works(work, &collision->scheduled, NULL); 2117 move_linked_works(work, &collision->scheduled, NULL);
2194 return; 2118 return;
@@ -2196,9 +2120,10 @@ __acquires(&gcwq->lock)
2196 2120
2197 /* claim and dequeue */ 2121 /* claim and dequeue */
2198 debug_work_deactivate(work); 2122 debug_work_deactivate(work);
2199 hlist_add_head(&worker->hentry, bwh); 2123 hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
2200 worker->current_work = work; 2124 worker->current_work = work;
2201 worker->current_cwq = cwq; 2125 worker->current_func = work->func;
2126 worker->current_pwq = pwq;
2202 work_color = get_work_color(work); 2127 work_color = get_work_color(work);
2203 2128
2204 list_del_init(&work->entry); 2129 list_del_init(&work->entry);
@@ -2211,53 +2136,55 @@ __acquires(&gcwq->lock)
2211 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); 2136 worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
2212 2137
2213 /* 2138 /*
2214 * Unbound gcwq isn't concurrency managed and work items should be 2139 * Unbound pool isn't concurrency managed and work items should be
2215 * executed ASAP. Wake up another worker if necessary. 2140 * executed ASAP. Wake up another worker if necessary.
2216 */ 2141 */
2217 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) 2142 if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
2218 wake_up_worker(pool); 2143 wake_up_worker(pool);
2219 2144
2220 /* 2145 /*
2221 * Record the last CPU and clear PENDING which should be the last 2146 * Record the last pool and clear PENDING which should be the last
2222 * update to @work. Also, do this inside @gcwq->lock so that 2147 * update to @work. Also, do this inside @pool->lock so that
2223 * PENDING and queued state changes happen together while IRQ is 2148 * PENDING and queued state changes happen together while IRQ is
2224 * disabled. 2149 * disabled.
2225 */ 2150 */
2226 set_work_cpu_and_clear_pending(work, gcwq->cpu); 2151 set_work_pool_and_clear_pending(work, pool->id);
2227 2152
2228 spin_unlock_irq(&gcwq->lock); 2153 spin_unlock_irq(&pool->lock);
2229 2154
2230 lock_map_acquire_read(&cwq->wq->lockdep_map); 2155 lock_map_acquire_read(&pwq->wq->lockdep_map);
2231 lock_map_acquire(&lockdep_map); 2156 lock_map_acquire(&lockdep_map);
2232 trace_workqueue_execute_start(work); 2157 trace_workqueue_execute_start(work);
2233 f(work); 2158 worker->current_func(work);
2234 /* 2159 /*
2235 * While we must be careful to not use "work" after this, the trace 2160 * While we must be careful to not use "work" after this, the trace
2236 * point will only record its address. 2161 * point will only record its address.
2237 */ 2162 */
2238 trace_workqueue_execute_end(work); 2163 trace_workqueue_execute_end(work);
2239 lock_map_release(&lockdep_map); 2164 lock_map_release(&lockdep_map);
2240 lock_map_release(&cwq->wq->lockdep_map); 2165 lock_map_release(&pwq->wq->lockdep_map);
2241 2166
2242 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 2167 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
2243 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" 2168 pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
2244 " last function: %pf\n", 2169 " last function: %pf\n",
2245 current->comm, preempt_count(), task_pid_nr(current), f); 2170 current->comm, preempt_count(), task_pid_nr(current),
2171 worker->current_func);
2246 debug_show_held_locks(current); 2172 debug_show_held_locks(current);
2247 dump_stack(); 2173 dump_stack();
2248 } 2174 }
2249 2175
2250 spin_lock_irq(&gcwq->lock); 2176 spin_lock_irq(&pool->lock);
2251 2177
2252 /* clear cpu intensive status */ 2178 /* clear cpu intensive status */
2253 if (unlikely(cpu_intensive)) 2179 if (unlikely(cpu_intensive))
2254 worker_clr_flags(worker, WORKER_CPU_INTENSIVE); 2180 worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
2255 2181
2256 /* we're done with it, release */ 2182 /* we're done with it, release */
2257 hlist_del_init(&worker->hentry); 2183 hash_del(&worker->hentry);
2258 worker->current_work = NULL; 2184 worker->current_work = NULL;
2259 worker->current_cwq = NULL; 2185 worker->current_func = NULL;
2260 cwq_dec_nr_in_flight(cwq, work_color); 2186 worker->current_pwq = NULL;
2187 pwq_dec_nr_in_flight(pwq, work_color);
2261} 2188}
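
process_one_work() now tracks busy workers in pool->busy_hash keyed by the work item's address (hash_add(..., (unsigned long)work)), so find_worker_executing_work() can spot a work item that is already running and defer it to that worker instead of executing it twice. A small userspace model of that lookup, using an invented chained hash rather than the kernel hashtable API:

#include <stdint.h>
#include <stdio.h>

#define BUSY_HASH_BITS 4
#define BUSY_HASH_SIZE (1u << BUSY_HASH_BITS)

struct toy_work {
	const char *name;
	void (*func)(struct toy_work *work);
};

struct toy_worker {
	const char *name;
	struct toy_work *current_work;
	struct toy_worker *hash_next;	/* chains workers within one bucket */
};

static struct toy_worker *busy_hash[BUSY_HASH_SIZE];

static unsigned int hash_work(struct toy_work *work)
{
	/* keyed by the work item's address, like the kernel's busy_hash */
	return ((uintptr_t)work / sizeof(void *)) & (BUSY_HASH_SIZE - 1);
}

static struct toy_worker *find_worker_executing_work(struct toy_work *work)
{
	for (struct toy_worker *w = busy_hash[hash_work(work)]; w; w = w->hash_next)
		if (w->current_work == work)
			return w;
	return NULL;
}

static void mark_busy(struct toy_worker *worker, struct toy_work *work)
{
	unsigned int b = hash_work(work);

	worker->current_work = work;
	worker->hash_next = busy_hash[b];
	busy_hash[b] = worker;
}

static void hello(struct toy_work *work)
{
	printf("executing %s\n", work->name);
}

int main(void)
{
	struct toy_work work = { .name = "demo", .func = hello };
	struct toy_worker w1 = { .name = "worker1" };
	struct toy_worker w2 = { .name = "worker2" };
	struct toy_worker *collision;

	/* worker1 claims the work and runs it */
	mark_busy(&w1, &work);
	work.func(&work);

	/* worker2 sees the collision and would defer to worker1 instead */
	collision = find_worker_executing_work(&work);
	if (collision && collision != &w2)
		printf("%s defers: %s already executes this work\n",
		       w2.name, collision->name);
	return 0;
}
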
2262 2189
2263/** 2190/**
@@ -2269,7 +2196,7 @@ __acquires(&gcwq->lock)
2269 * fetches a work from the top and executes it. 2196 * fetches a work from the top and executes it.
2270 * 2197 *
2271 * CONTEXT: 2198 * CONTEXT:
2272 * spin_lock_irq(gcwq->lock) which may be released and regrabbed 2199 * spin_lock_irq(pool->lock) which may be released and regrabbed
2273 * multiple times. 2200 * multiple times.
2274 */ 2201 */
2275static void process_scheduled_works(struct worker *worker) 2202static void process_scheduled_works(struct worker *worker)
@@ -2285,8 +2212,8 @@ static void process_scheduled_works(struct worker *worker)
2285 * worker_thread - the worker thread function 2212 * worker_thread - the worker thread function
2286 * @__worker: self 2213 * @__worker: self
2287 * 2214 *
2288 * The gcwq worker thread function. There's a single dynamic pool of 2215 * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools
2289 * these per each cpu. These workers process all works regardless of 2216 * of these per each cpu. These workers process all works regardless of
2290 * their specific target workqueue. The only exception is works which 2217 * their specific target workqueue. The only exception is works which
2291 * belong to workqueues with a rescuer which will be explained in 2218 * belong to workqueues with a rescuer which will be explained in
2292 * rescuer_thread(). 2219 * rescuer_thread().
@@ -2295,16 +2222,15 @@ static int worker_thread(void *__worker)
2295{ 2222{
2296 struct worker *worker = __worker; 2223 struct worker *worker = __worker;
2297 struct worker_pool *pool = worker->pool; 2224 struct worker_pool *pool = worker->pool;
2298 struct global_cwq *gcwq = pool->gcwq;
2299 2225
2300 /* tell the scheduler that this is a workqueue worker */ 2226 /* tell the scheduler that this is a workqueue worker */
2301 worker->task->flags |= PF_WQ_WORKER; 2227 worker->task->flags |= PF_WQ_WORKER;
2302woke_up: 2228woke_up:
2303 spin_lock_irq(&gcwq->lock); 2229 spin_lock_irq(&pool->lock);
2304 2230
2305 /* we are off idle list if destruction or rebind is requested */ 2231 /* we are off idle list if destruction or rebind is requested */
2306 if (unlikely(list_empty(&worker->entry))) { 2232 if (unlikely(list_empty(&worker->entry))) {
2307 spin_unlock_irq(&gcwq->lock); 2233 spin_unlock_irq(&pool->lock);
2308 2234
2309 /* if DIE is set, destruction is requested */ 2235 /* if DIE is set, destruction is requested */
2310 if (worker->flags & WORKER_DIE) { 2236 if (worker->flags & WORKER_DIE) {
@@ -2363,52 +2289,61 @@ sleep:
2363 goto recheck; 2289 goto recheck;
2364 2290
2365 /* 2291 /*
2366 * gcwq->lock is held and there's no work to process and no 2292 * pool->lock is held and there's no work to process and no need to
2367 * need to manage, sleep. Workers are woken up only while 2293 * manage, sleep. Workers are woken up only while holding
2368 * holding gcwq->lock or from local cpu, so setting the 2294 * pool->lock or from local cpu, so setting the current state
2369 * current state before releasing gcwq->lock is enough to 2295 * before releasing pool->lock is enough to prevent losing any
2370 * prevent losing any event. 2296 * event.
2371 */ 2297 */
2372 worker_enter_idle(worker); 2298 worker_enter_idle(worker);
2373 __set_current_state(TASK_INTERRUPTIBLE); 2299 __set_current_state(TASK_INTERRUPTIBLE);
2374 spin_unlock_irq(&gcwq->lock); 2300 spin_unlock_irq(&pool->lock);
2375 schedule(); 2301 schedule();
2376 goto woke_up; 2302 goto woke_up;
2377} 2303}
2378 2304
2379/** 2305/**
2380 * rescuer_thread - the rescuer thread function 2306 * rescuer_thread - the rescuer thread function
2381 * @__wq: the associated workqueue 2307 * @__rescuer: self
2382 * 2308 *
2383 * Workqueue rescuer thread function. There's one rescuer for each 2309 * Workqueue rescuer thread function. There's one rescuer for each
2384 * workqueue which has WQ_RESCUER set. 2310 * workqueue which has WQ_RESCUER set.
2385 * 2311 *
2386 * Regular work processing on a gcwq may block trying to create a new 2312 * Regular work processing on a pool may block trying to create a new
2387 * worker which uses GFP_KERNEL allocation which has slight chance of 2313 * worker which uses GFP_KERNEL allocation which has slight chance of
2388 * developing into deadlock if some works currently on the same queue 2314 * developing into deadlock if some works currently on the same queue
2389 * need to be processed to satisfy the GFP_KERNEL allocation. This is 2315 * need to be processed to satisfy the GFP_KERNEL allocation. This is
2390 * the problem rescuer solves. 2316 * the problem rescuer solves.
2391 * 2317 *
2392 * When such condition is possible, the gcwq summons rescuers of all 2318 * When such condition is possible, the pool summons rescuers of all
2393 * workqueues which have works queued on the gcwq and let them process 2319 * workqueues which have works queued on the pool and let them process
2394 * those works so that forward progress can be guaranteed. 2320 * those works so that forward progress can be guaranteed.
2395 * 2321 *
2396 * This should happen rarely. 2322 * This should happen rarely.
2397 */ 2323 */
2398static int rescuer_thread(void *__wq) 2324static int rescuer_thread(void *__rescuer)
2399{ 2325{
2400 struct workqueue_struct *wq = __wq; 2326 struct worker *rescuer = __rescuer;
2401 struct worker *rescuer = wq->rescuer; 2327 struct workqueue_struct *wq = rescuer->rescue_wq;
2402 struct list_head *scheduled = &rescuer->scheduled; 2328 struct list_head *scheduled = &rescuer->scheduled;
2403 bool is_unbound = wq->flags & WQ_UNBOUND; 2329 bool is_unbound = wq->flags & WQ_UNBOUND;
2404 unsigned int cpu; 2330 unsigned int cpu;
2405 2331
2406 set_user_nice(current, RESCUER_NICE_LEVEL); 2332 set_user_nice(current, RESCUER_NICE_LEVEL);
2333
2334 /*
2335 * Mark rescuer as worker too. As WORKER_PREP is never cleared, it
2336 * doesn't participate in concurrency management.
2337 */
2338 rescuer->task->flags |= PF_WQ_WORKER;
2407repeat: 2339repeat:
2408 set_current_state(TASK_INTERRUPTIBLE); 2340 set_current_state(TASK_INTERRUPTIBLE);
2409 2341
2410 if (kthread_should_stop()) 2342 if (kthread_should_stop()) {
2343 __set_current_state(TASK_RUNNING);
2344 rescuer->task->flags &= ~PF_WQ_WORKER;
2411 return 0; 2345 return 0;
2346 }
2412 2347
2413 /* 2348 /*
2414 * See whether any cpu is asking for help. Unbounded 2349 * See whether any cpu is asking for help. Unbounded
@@ -2416,9 +2351,8 @@ repeat:
2416 */ 2351 */
2417 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2352 for_each_mayday_cpu(cpu, wq->mayday_mask) {
2418 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2353 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
2419 struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); 2354 struct pool_workqueue *pwq = get_pwq(tcpu, wq);
2420 struct worker_pool *pool = cwq->pool; 2355 struct worker_pool *pool = pwq->pool;
2421 struct global_cwq *gcwq = pool->gcwq;
2422 struct work_struct *work, *n; 2356 struct work_struct *work, *n;
2423 2357
2424 __set_current_state(TASK_RUNNING); 2358 __set_current_state(TASK_RUNNING);
@@ -2434,22 +2368,24 @@ repeat:
2434 */ 2368 */
2435 BUG_ON(!list_empty(&rescuer->scheduled)); 2369 BUG_ON(!list_empty(&rescuer->scheduled));
2436 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2370 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2437 if (get_work_cwq(work) == cwq) 2371 if (get_work_pwq(work) == pwq)
2438 move_linked_works(work, scheduled, &n); 2372 move_linked_works(work, scheduled, &n);
2439 2373
2440 process_scheduled_works(rescuer); 2374 process_scheduled_works(rescuer);
2441 2375
2442 /* 2376 /*
2443 * Leave this gcwq. If keep_working() is %true, notify a 2377 * Leave this pool. If keep_working() is %true, notify a
2444 * regular worker; otherwise, we end up with 0 concurrency 2378 * regular worker; otherwise, we end up with 0 concurrency
2445 * and stalling the execution. 2379 * and stalling the execution.
2446 */ 2380 */
2447 if (keep_working(pool)) 2381 if (keep_working(pool))
2448 wake_up_worker(pool); 2382 wake_up_worker(pool);
2449 2383
2450 spin_unlock_irq(&gcwq->lock); 2384 spin_unlock_irq(&pool->lock);
2451 } 2385 }
2452 2386
2387 /* rescuers should never participate in concurrency management */
2388 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2453 schedule(); 2389 schedule();
2454 goto repeat; 2390 goto repeat;
2455} 2391}
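
rescuer_thread() walks the distressed pool's worklist and claims only the items whose pwq belongs to its own workqueue (get_work_pwq(work) == pwq), moving them onto the rescuer's private scheduled list before processing them. The list surgery is modelled below with a plain singly linked list; this is a sketch, not the kernel's list_head machinery.

#include <stdio.h>

struct toy_work {
	int id;
	int wq;			/* which workqueue this work belongs to */
	struct toy_work *next;
};

/* unlink every work belonging to @wq from *src and push it onto *dst */
static void move_matching_works(struct toy_work **src, struct toy_work **dst, int wq)
{
	struct toy_work **pp = src;

	while (*pp) {
		struct toy_work *work = *pp;

		if (work->wq == wq) {
			*pp = work->next;	/* unlink from the pool worklist */
			work->next = *dst;	/* push onto the rescuer's list */
			*dst = work;
		} else {
			pp = &work->next;
		}
	}
}

int main(void)
{
	struct toy_work w3 = { .id = 3, .wq = 1 };
	struct toy_work w2 = { .id = 2, .wq = 2, .next = &w3 };
	struct toy_work w1 = { .id = 1, .wq = 1, .next = &w2 };
	struct toy_work *worklist = &w1;	/* distressed pool's worklist */
	struct toy_work *scheduled = NULL;	/* rescuer's private list */

	move_matching_works(&worklist, &scheduled, 1);

	printf("rescuer took:");
	for (struct toy_work *w = scheduled; w; w = w->next)
		printf(" %d", w->id);
	printf("\nleft behind:");
	for (struct toy_work *w = worklist; w; w = w->next)
		printf(" %d", w->id);
	printf("\n");
	return 0;
}
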
@@ -2467,7 +2403,7 @@ static void wq_barrier_func(struct work_struct *work)
2467 2403
2468/** 2404/**
2469 * insert_wq_barrier - insert a barrier work 2405 * insert_wq_barrier - insert a barrier work
2470 * @cwq: cwq to insert barrier into 2406 * @pwq: pwq to insert barrier into
2471 * @barr: wq_barrier to insert 2407 * @barr: wq_barrier to insert
2472 * @target: target work to attach @barr to 2408 * @target: target work to attach @barr to
2473 * @worker: worker currently executing @target, NULL if @target is not executing 2409 * @worker: worker currently executing @target, NULL if @target is not executing
@@ -2484,12 +2420,12 @@ static void wq_barrier_func(struct work_struct *work)
2484 * after a work with LINKED flag set. 2420 * after a work with LINKED flag set.
2485 * 2421 *
2486 * Note that when @worker is non-NULL, @target may be modified 2422 * Note that when @worker is non-NULL, @target may be modified
2487 * underneath us, so we can't reliably determine cwq from @target. 2423 * underneath us, so we can't reliably determine pwq from @target.
2488 * 2424 *
2489 * CONTEXT: 2425 * CONTEXT:
2490 * spin_lock_irq(gcwq->lock). 2426 * spin_lock_irq(pool->lock).
2491 */ 2427 */
2492static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 2428static void insert_wq_barrier(struct pool_workqueue *pwq,
2493 struct wq_barrier *barr, 2429 struct wq_barrier *barr,
2494 struct work_struct *target, struct worker *worker) 2430 struct work_struct *target, struct worker *worker)
2495{ 2431{
@@ -2497,7 +2433,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2497 unsigned int linked = 0; 2433 unsigned int linked = 0;
2498 2434
2499 /* 2435 /*
2500 * debugobject calls are safe here even with gcwq->lock locked 2436 * debugobject calls are safe here even with pool->lock locked
2501 * as we know for sure that this will not trigger any of the 2437 * as we know for sure that this will not trigger any of the
2502 * checks and call back into the fixup functions where we 2438 * checks and call back into the fixup functions where we
2503 * might deadlock. 2439 * might deadlock.
@@ -2522,23 +2458,23 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2522 } 2458 }
2523 2459
2524 debug_work_activate(&barr->work); 2460 debug_work_activate(&barr->work);
2525 insert_work(cwq, &barr->work, head, 2461 insert_work(pwq, &barr->work, head,
2526 work_color_to_flags(WORK_NO_COLOR) | linked); 2462 work_color_to_flags(WORK_NO_COLOR) | linked);
2527} 2463}
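
insert_wq_barrier() queues a special barrier work directly behind the target (or at the head of the executing worker's scheduled list); the barrier's function only completes a completion, and the flusher sleeps on that completion, which is what makes flush_work() wait exactly until the target has run. A compact pthread model of the barrier-plus-completion idea follows; toy_completion is invented and stands in for the kernel's struct completion.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* a minimal "completion": done flag + condition variable */
struct toy_completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	bool done;
};

struct toy_barrier {
	struct toy_completion done;
};

static void complete(struct toy_completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = true;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct toy_completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

/* the barrier work's function: just flag completion, like wq_barrier_func() */
static void barrier_func(struct toy_barrier *barr)
{
	complete(&barr->done);
}

/* stands in for the worker thread draining its scheduled list */
static void *worker_fn(void *arg)
{
	struct toy_barrier *barr = arg;

	printf("worker: running the flushed work\n");
	printf("worker: running the barrier queued right after it\n");
	barrier_func(barr);
	return NULL;
}

int main(void)
{
	struct toy_barrier barr = {
		.done = { .lock = PTHREAD_MUTEX_INITIALIZER,
			  .cond = PTHREAD_COND_INITIALIZER,
			  .done = false },
	};
	pthread_t worker;

	pthread_create(&worker, NULL, worker_fn, &barr);
	wait_for_completion(&barr.done);	/* flush_work() waits here */
	printf("flusher: barrier completed, target work has finished\n");
	pthread_join(worker, NULL);
	return 0;
}
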
2528 2464
2529/** 2465/**
2530 * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing 2466 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
2531 * @wq: workqueue being flushed 2467 * @wq: workqueue being flushed
2532 * @flush_color: new flush color, < 0 for no-op 2468 * @flush_color: new flush color, < 0 for no-op
2533 * @work_color: new work color, < 0 for no-op 2469 * @work_color: new work color, < 0 for no-op
2534 * 2470 *
2535 * Prepare cwqs for workqueue flushing. 2471 * Prepare pwqs for workqueue flushing.
2536 * 2472 *
2537 * If @flush_color is non-negative, flush_color on all cwqs should be 2473 * If @flush_color is non-negative, flush_color on all pwqs should be
2538 * -1. If no cwq has in-flight commands at the specified color, all 2474 * -1. If no pwq has in-flight commands at the specified color, all
2539 * cwq->flush_color's stay at -1 and %false is returned. If any cwq 2475 * pwq->flush_color's stay at -1 and %false is returned. If any pwq
2540 * has in flight commands, its cwq->flush_color is set to 2476 * has in flight commands, its pwq->flush_color is set to
2541 * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq 2477 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
2542 * wakeup logic is armed and %true is returned. 2478 * wakeup logic is armed and %true is returned.
2543 * 2479 *
2544 * The caller should have initialized @wq->first_flusher prior to 2480 * The caller should have initialized @wq->first_flusher prior to
@@ -2546,7 +2482,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2546 * @flush_color is negative, no flush color update is done and %false 2482 * @flush_color is negative, no flush color update is done and %false
2547 * is returned. 2483 * is returned.
2548 * 2484 *
2549 * If @work_color is non-negative, all cwqs should have the same 2485 * If @work_color is non-negative, all pwqs should have the same
2550 * work_color which is previous to @work_color and all will be 2486 * work_color which is previous to @work_color and all will be
2551 * advanced to @work_color. 2487 * advanced to @work_color.
2552 * 2488 *
@@ -2557,42 +2493,42 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
2557 * %true if @flush_color >= 0 and there's something to flush. %false 2493 * %true if @flush_color >= 0 and there's something to flush. %false
2558 * otherwise. 2494 * otherwise.
2559 */ 2495 */
2560static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, 2496static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2561 int flush_color, int work_color) 2497 int flush_color, int work_color)
2562{ 2498{
2563 bool wait = false; 2499 bool wait = false;
2564 unsigned int cpu; 2500 unsigned int cpu;
2565 2501
2566 if (flush_color >= 0) { 2502 if (flush_color >= 0) {
2567 BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); 2503 BUG_ON(atomic_read(&wq->nr_pwqs_to_flush));
2568 atomic_set(&wq->nr_cwqs_to_flush, 1); 2504 atomic_set(&wq->nr_pwqs_to_flush, 1);
2569 } 2505 }
2570 2506
2571 for_each_cwq_cpu(cpu, wq) { 2507 for_each_pwq_cpu(cpu, wq) {
2572 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2508 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2573 struct global_cwq *gcwq = cwq->pool->gcwq; 2509 struct worker_pool *pool = pwq->pool;
2574 2510
2575 spin_lock_irq(&gcwq->lock); 2511 spin_lock_irq(&pool->lock);
2576 2512
2577 if (flush_color >= 0) { 2513 if (flush_color >= 0) {
2578 BUG_ON(cwq->flush_color != -1); 2514 BUG_ON(pwq->flush_color != -1);
2579 2515
2580 if (cwq->nr_in_flight[flush_color]) { 2516 if (pwq->nr_in_flight[flush_color]) {
2581 cwq->flush_color = flush_color; 2517 pwq->flush_color = flush_color;
2582 atomic_inc(&wq->nr_cwqs_to_flush); 2518 atomic_inc(&wq->nr_pwqs_to_flush);
2583 wait = true; 2519 wait = true;
2584 } 2520 }
2585 } 2521 }
2586 2522
2587 if (work_color >= 0) { 2523 if (work_color >= 0) {
2588 BUG_ON(work_color != work_next_color(cwq->work_color)); 2524 BUG_ON(work_color != work_next_color(pwq->work_color));
2589 cwq->work_color = work_color; 2525 pwq->work_color = work_color;
2590 } 2526 }
2591 2527
2592 spin_unlock_irq(&gcwq->lock); 2528 spin_unlock_irq(&pool->lock);
2593 } 2529 }
2594 2530
2595 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) 2531 if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
2596 complete(&wq->first_flusher->done); 2532 complete(&wq->first_flusher->done);
2597 2533
2598 return wait; 2534 return wait;
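
flush_workqueue_prep_pwqs() uses a biased counter: nr_pwqs_to_flush starts at 1, every pwq with in-flight work of the flush color adds a reference, and the initial bias is dropped at the end, so the first flusher's completion fires only after the last busy pwq drains. The same start-at-one pattern in C11 atomics, as a sketch rather than the kernel code, with invented data:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_PWQS 4

static atomic_int nr_pwqs_to_flush;
static int in_flight[NR_PWQS] = { 0, 3, 0, 1 };	/* works still running per pwq */

static void complete_first_flusher(void)
{
	printf("flush color complete, waking the first flusher\n");
}

/* called by a pwq when its last in-flight work of this color retires */
static void pwq_flush_done(int pwq)
{
	printf("pwq %d drained\n", pwq);
	if (atomic_fetch_sub(&nr_pwqs_to_flush, 1) == 1)
		complete_first_flusher();
}

static bool prep_flush(void)
{
	bool wait = false;

	/*
	 * Start at 1: the extra reference ("bias") keeps the counter from
	 * hitting zero while we are still walking the pwqs below.
	 */
	atomic_store(&nr_pwqs_to_flush, 1);

	for (int pwq = 0; pwq < NR_PWQS; pwq++) {
		if (in_flight[pwq]) {
			atomic_fetch_add(&nr_pwqs_to_flush, 1);
			wait = true;
		}
	}

	/* drop the bias; if nothing was in flight this completes immediately */
	if (atomic_fetch_sub(&nr_pwqs_to_flush, 1) == 1)
		complete_first_flusher();

	return wait;
}

int main(void)
{
	if (prep_flush()) {
		/* later, as each busy pwq retires its colored works... */
		pwq_flush_done(1);
		pwq_flush_done(3);
	}
	return 0;
}
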
@@ -2643,7 +2579,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2643 2579
2644 wq->first_flusher = &this_flusher; 2580 wq->first_flusher = &this_flusher;
2645 2581
2646 if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, 2582 if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
2647 wq->work_color)) { 2583 wq->work_color)) {
2648 /* nothing to flush, done */ 2584 /* nothing to flush, done */
2649 wq->flush_color = next_color; 2585 wq->flush_color = next_color;
@@ -2654,7 +2590,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2654 /* wait in queue */ 2590 /* wait in queue */
2655 BUG_ON(wq->flush_color == this_flusher.flush_color); 2591 BUG_ON(wq->flush_color == this_flusher.flush_color);
2656 list_add_tail(&this_flusher.list, &wq->flusher_queue); 2592 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2657 flush_workqueue_prep_cwqs(wq, -1, wq->work_color); 2593 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2658 } 2594 }
2659 } else { 2595 } else {
2660 /* 2596 /*
@@ -2721,7 +2657,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2721 2657
2722 list_splice_tail_init(&wq->flusher_overflow, 2658 list_splice_tail_init(&wq->flusher_overflow,
2723 &wq->flusher_queue); 2659 &wq->flusher_queue);
2724 flush_workqueue_prep_cwqs(wq, -1, wq->work_color); 2660 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2725 } 2661 }
2726 2662
2727 if (list_empty(&wq->flusher_queue)) { 2663 if (list_empty(&wq->flusher_queue)) {
@@ -2731,7 +2667,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2731 2667
2732 /* 2668 /*
2733 * Need to flush more colors. Make the next flusher 2669 * Need to flush more colors. Make the next flusher
2734 * the new first flusher and arm cwqs. 2670 * the new first flusher and arm pwqs.
2735 */ 2671 */
2736 BUG_ON(wq->flush_color == wq->work_color); 2672 BUG_ON(wq->flush_color == wq->work_color);
2737 BUG_ON(wq->flush_color != next->flush_color); 2673 BUG_ON(wq->flush_color != next->flush_color);
@@ -2739,7 +2675,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2739 list_del_init(&next->list); 2675 list_del_init(&next->list);
2740 wq->first_flusher = next; 2676 wq->first_flusher = next;
2741 2677
2742 if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) 2678 if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
2743 break; 2679 break;
2744 2680
2745 /* 2681 /*
@@ -2782,13 +2718,13 @@ void drain_workqueue(struct workqueue_struct *wq)
2782reflush: 2718reflush:
2783 flush_workqueue(wq); 2719 flush_workqueue(wq);
2784 2720
2785 for_each_cwq_cpu(cpu, wq) { 2721 for_each_pwq_cpu(cpu, wq) {
2786 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2722 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2787 bool drained; 2723 bool drained;
2788 2724
2789 spin_lock_irq(&cwq->pool->gcwq->lock); 2725 spin_lock_irq(&pwq->pool->lock);
2790 drained = !cwq->nr_active && list_empty(&cwq->delayed_works); 2726 drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
2791 spin_unlock_irq(&cwq->pool->gcwq->lock); 2727 spin_unlock_irq(&pwq->pool->lock);
2792 2728
2793 if (drained) 2729 if (drained)
2794 continue; 2730 continue;
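
[Editor's note, not part of the patch] The loop above is what lets drain_workqueue() keep reflushing until no pool_workqueue has active or delayed work left, which matters when work items requeue themselves. A hedged teardown sketch (hypothetical names; destroy_workqueue() performs a drain of its own, the explicit call is shown only to illustrate the API):

        #include <linux/workqueue.h>

        static void my_shutdown(struct workqueue_struct *my_wq)
        {
                /* self-requeueing works are flushed repeatedly until they stop */
                drain_workqueue(my_wq);
                destroy_workqueue(my_wq);
        }
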
@@ -2810,34 +2746,29 @@ EXPORT_SYMBOL_GPL(drain_workqueue);
2810static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) 2746static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2811{ 2747{
2812 struct worker *worker = NULL; 2748 struct worker *worker = NULL;
2813 struct global_cwq *gcwq; 2749 struct worker_pool *pool;
2814 struct cpu_workqueue_struct *cwq; 2750 struct pool_workqueue *pwq;
2815 2751
2816 might_sleep(); 2752 might_sleep();
2817 gcwq = get_work_gcwq(work); 2753 pool = get_work_pool(work);
2818 if (!gcwq) 2754 if (!pool)
2819 return false; 2755 return false;
2820 2756
2821 spin_lock_irq(&gcwq->lock); 2757 spin_lock_irq(&pool->lock);
2822 if (!list_empty(&work->entry)) { 2758 /* see the comment in try_to_grab_pending() with the same code */
2823 /* 2759 pwq = get_work_pwq(work);
2824 * See the comment near try_to_grab_pending()->smp_rmb(). 2760 if (pwq) {
2825 * If it was re-queued to a different gcwq under us, we 2761 if (unlikely(pwq->pool != pool))
2826 * are not going to wait.
2827 */
2828 smp_rmb();
2829 cwq = get_work_cwq(work);
2830 if (unlikely(!cwq || gcwq != cwq->pool->gcwq))
2831 goto already_gone; 2762 goto already_gone;
2832 } else { 2763 } else {
2833 worker = find_worker_executing_work(gcwq, work); 2764 worker = find_worker_executing_work(pool, work);
2834 if (!worker) 2765 if (!worker)
2835 goto already_gone; 2766 goto already_gone;
2836 cwq = worker->current_cwq; 2767 pwq = worker->current_pwq;
2837 } 2768 }
2838 2769
2839 insert_wq_barrier(cwq, barr, work, worker); 2770 insert_wq_barrier(pwq, barr, work, worker);
2840 spin_unlock_irq(&gcwq->lock); 2771 spin_unlock_irq(&pool->lock);
2841 2772
2842 /* 2773 /*
2843 * If @max_active is 1 or rescuer is in use, flushing another work 2774 * If @max_active is 1 or rescuer is in use, flushing another work
@@ -2845,15 +2776,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2845 * flusher is not running on the same workqueue by verifying write 2776 * flusher is not running on the same workqueue by verifying write
2846 * access. 2777 * access.
2847 */ 2778 */
2848 if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) 2779 if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER)
2849 lock_map_acquire(&cwq->wq->lockdep_map); 2780 lock_map_acquire(&pwq->wq->lockdep_map);
2850 else 2781 else
2851 lock_map_acquire_read(&cwq->wq->lockdep_map); 2782 lock_map_acquire_read(&pwq->wq->lockdep_map);
2852 lock_map_release(&cwq->wq->lockdep_map); 2783 lock_map_release(&pwq->wq->lockdep_map);
2853 2784
2854 return true; 2785 return true;
2855already_gone: 2786already_gone:
2856 spin_unlock_irq(&gcwq->lock); 2787 spin_unlock_irq(&pool->lock);
2857 return false; 2788 return false;
2858} 2789}
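
[Editor's note, not part of the patch] start_flush_work() is the backend of flush_work(); from a caller's point of view flush_work() simply blocks until that one item is done, whichever pool it ran on. Sketch with an assumed caller:

        #include <linux/workqueue.h>
        #include <linux/printk.h>

        static void my_wait_for(struct work_struct *work)
        {
                if (!flush_work(work))
                        pr_debug("work was idle, nothing to wait for\n");
        }
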
2859 2790
@@ -2949,8 +2880,7 @@ bool flush_delayed_work(struct delayed_work *dwork)
2949{ 2880{
2950 local_irq_disable(); 2881 local_irq_disable();
2951 if (del_timer_sync(&dwork->timer)) 2882 if (del_timer_sync(&dwork->timer))
2952 __queue_work(dwork->cpu, 2883 __queue_work(dwork->cpu, dwork->wq, &dwork->work);
2953 get_work_cwq(&dwork->work)->wq, &dwork->work);
2954 local_irq_enable(); 2884 local_irq_enable();
2955 return flush_work(&dwork->work); 2885 return flush_work(&dwork->work);
2956} 2886}
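
[Editor's note, not part of the patch] With dwork->wq recorded at queueing time, flush_delayed_work() no longer has to derive the workqueue from the work's pwq. Caller-side usage is unchanged; a sketch assuming a delayed_work already set up with INIT_DELAYED_WORK():

        #include <linux/workqueue.h>

        static void my_expedite(struct delayed_work *dwork)
        {
                queue_delayed_work(system_wq, dwork, 10 * HZ);
                /* ... later: cancel the timer, run the work now and wait for it */
                flush_delayed_work(dwork);
        }
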
@@ -2980,7 +2910,8 @@ bool cancel_delayed_work(struct delayed_work *dwork)
2980 if (unlikely(ret < 0)) 2910 if (unlikely(ret < 0))
2981 return false; 2911 return false;
2982 2912
2983 set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); 2913 set_work_pool_and_clear_pending(&dwork->work,
2914 get_work_pool_id(&dwork->work));
2984 local_irq_restore(flags); 2915 local_irq_restore(flags);
2985 return ret; 2916 return ret;
2986} 2917}
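
[Editor's note, not part of the patch] cancel_delayed_work() is non-blocking; the change above merely records the last pool ID instead of a CPU while clearing PENDING. A typical rearm pattern, sketched with assumed names (mod_delayed_work() expresses the same thing in one call):

        static void my_rearm(struct delayed_work *dwork, unsigned long delay)
        {
                cancel_delayed_work(dwork);     /* does not wait for a running instance */
                queue_delayed_work(system_wq, dwork, delay);
        }
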
@@ -3159,46 +3090,46 @@ int keventd_up(void)
3159 return system_wq != NULL; 3090 return system_wq != NULL;
3160} 3091}
3161 3092
3162static int alloc_cwqs(struct workqueue_struct *wq) 3093static int alloc_pwqs(struct workqueue_struct *wq)
3163{ 3094{
3164 /* 3095 /*
3165 * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. 3096 * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
3166 * Make sure that the alignment isn't lower than that of 3097 * Make sure that the alignment isn't lower than that of
3167 * unsigned long long. 3098 * unsigned long long.
3168 */ 3099 */
3169 const size_t size = sizeof(struct cpu_workqueue_struct); 3100 const size_t size = sizeof(struct pool_workqueue);
3170 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 3101 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
3171 __alignof__(unsigned long long)); 3102 __alignof__(unsigned long long));
3172 3103
3173 if (!(wq->flags & WQ_UNBOUND)) 3104 if (!(wq->flags & WQ_UNBOUND))
3174 wq->cpu_wq.pcpu = __alloc_percpu(size, align); 3105 wq->pool_wq.pcpu = __alloc_percpu(size, align);
3175 else { 3106 else {
3176 void *ptr; 3107 void *ptr;
3177 3108
3178 /* 3109 /*
3179 * Allocate enough room to align cwq and put an extra 3110 * Allocate enough room to align pwq and put an extra
3180 * pointer at the end pointing back to the originally 3111 * pointer at the end pointing back to the originally
3181 * allocated pointer which will be used for free. 3112 * allocated pointer which will be used for free.
3182 */ 3113 */
3183 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); 3114 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
3184 if (ptr) { 3115 if (ptr) {
3185 wq->cpu_wq.single = PTR_ALIGN(ptr, align); 3116 wq->pool_wq.single = PTR_ALIGN(ptr, align);
3186 *(void **)(wq->cpu_wq.single + 1) = ptr; 3117 *(void **)(wq->pool_wq.single + 1) = ptr;
3187 } 3118 }
3188 } 3119 }
3189 3120
3190 /* just in case, make sure it's actually aligned */ 3121 /* just in case, make sure it's actually aligned */
3191 BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); 3122 BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align));
3192 return wq->cpu_wq.v ? 0 : -ENOMEM; 3123 return wq->pool_wq.v ? 0 : -ENOMEM;
3193} 3124}
3194 3125
3195static void free_cwqs(struct workqueue_struct *wq) 3126static void free_pwqs(struct workqueue_struct *wq)
3196{ 3127{
3197 if (!(wq->flags & WQ_UNBOUND)) 3128 if (!(wq->flags & WQ_UNBOUND))
3198 free_percpu(wq->cpu_wq.pcpu); 3129 free_percpu(wq->pool_wq.pcpu);
3199 else if (wq->cpu_wq.single) { 3130 else if (wq->pool_wq.single) {
3200 /* the pointer to free is stored right after the cwq */ 3131 /* the pointer to free is stored right after the pwq */
3201 kfree(*(void **)(wq->cpu_wq.single + 1)); 3132 kfree(*(void **)(wq->pool_wq.single + 1));
3202 } 3133 }
3203} 3134}
3204 3135
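
[Editor's note, not part of the patch] In the unbound case alloc_pwqs() over-allocates, aligns the object with PTR_ALIGN() and stashes the original pointer right behind it so free_pwqs() can find it. The same trick in isolation, as an illustrative helper that is not kernel code:

        #include <linux/slab.h>
        #include <linux/kernel.h>

        /* illustrative only: return a size-byte object aligned to align */
        static void *my_alloc_aligned(size_t size, size_t align)
        {
                void *raw = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
                void *obj;

                if (!raw)
                        return NULL;
                obj = PTR_ALIGN(raw, align);
                *(void **)((char *)obj + size) = raw;   /* remember what to kfree() */
                return obj;
        }

        static void my_free_aligned(void *obj, size_t size)
        {
                if (obj)
                        kfree(*(void **)((char *)obj + size));
        }
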
@@ -3252,27 +3183,25 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3252 wq->flags = flags; 3183 wq->flags = flags;
3253 wq->saved_max_active = max_active; 3184 wq->saved_max_active = max_active;
3254 mutex_init(&wq->flush_mutex); 3185 mutex_init(&wq->flush_mutex);
3255 atomic_set(&wq->nr_cwqs_to_flush, 0); 3186 atomic_set(&wq->nr_pwqs_to_flush, 0);
3256 INIT_LIST_HEAD(&wq->flusher_queue); 3187 INIT_LIST_HEAD(&wq->flusher_queue);
3257 INIT_LIST_HEAD(&wq->flusher_overflow); 3188 INIT_LIST_HEAD(&wq->flusher_overflow);
3258 3189
3259 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 3190 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3260 INIT_LIST_HEAD(&wq->list); 3191 INIT_LIST_HEAD(&wq->list);
3261 3192
3262 if (alloc_cwqs(wq) < 0) 3193 if (alloc_pwqs(wq) < 0)
3263 goto err; 3194 goto err;
3264 3195
3265 for_each_cwq_cpu(cpu, wq) { 3196 for_each_pwq_cpu(cpu, wq) {
3266 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3197 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3267 struct global_cwq *gcwq = get_gcwq(cpu); 3198
3268 int pool_idx = (bool)(flags & WQ_HIGHPRI); 3199 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3269 3200 pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI);
3270 BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); 3201 pwq->wq = wq;
3271 cwq->pool = &gcwq->pools[pool_idx]; 3202 pwq->flush_color = -1;
3272 cwq->wq = wq; 3203 pwq->max_active = max_active;
3273 cwq->flush_color = -1; 3204 INIT_LIST_HEAD(&pwq->delayed_works);
3274 cwq->max_active = max_active;
3275 INIT_LIST_HEAD(&cwq->delayed_works);
3276 } 3205 }
3277 3206
3278 if (flags & WQ_RESCUER) { 3207 if (flags & WQ_RESCUER) {
@@ -3285,7 +3214,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3285 if (!rescuer) 3214 if (!rescuer)
3286 goto err; 3215 goto err;
3287 3216
3288 rescuer->task = kthread_create(rescuer_thread, wq, "%s", 3217 rescuer->rescue_wq = wq;
3218 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
3289 wq->name); 3219 wq->name);
3290 if (IS_ERR(rescuer->task)) 3220 if (IS_ERR(rescuer->task))
3291 goto err; 3221 goto err;
@@ -3302,8 +3232,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3302 spin_lock(&workqueue_lock); 3232 spin_lock(&workqueue_lock);
3303 3233
3304 if (workqueue_freezing && wq->flags & WQ_FREEZABLE) 3234 if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
3305 for_each_cwq_cpu(cpu, wq) 3235 for_each_pwq_cpu(cpu, wq)
3306 get_cwq(cpu, wq)->max_active = 0; 3236 get_pwq(cpu, wq)->max_active = 0;
3307 3237
3308 list_add(&wq->list, &workqueues); 3238 list_add(&wq->list, &workqueues);
3309 3239
@@ -3312,7 +3242,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3312 return wq; 3242 return wq;
3313err: 3243err:
3314 if (wq) { 3244 if (wq) {
3315 free_cwqs(wq); 3245 free_pwqs(wq);
3316 free_mayday_mask(wq->mayday_mask); 3246 free_mayday_mask(wq->mayday_mask);
3317 kfree(wq->rescuer); 3247 kfree(wq->rescuer);
3318 kfree(wq); 3248 kfree(wq);
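
[Editor's note, not part of the patch] The error path above releases whatever __alloc_workqueue_key() managed to set up. On the caller side the usual pattern is to check the returned pointer and pair the allocation with destroy_workqueue(); a sketch with made-up names:

        #include <linux/workqueue.h>
        #include <linux/errno.h>

        static struct workqueue_struct *my_wq;

        static int my_init(void)
        {
                /* freezable, per-CPU, at most one work item in flight per CPU */
                my_wq = alloc_workqueue("my_wq", WQ_FREEZABLE, 1);
                if (!my_wq)
                        return -ENOMEM;
                return 0;
        }

        static void my_exit(void)
        {
                destroy_workqueue(my_wq);
        }
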
@@ -3343,14 +3273,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
3343 spin_unlock(&workqueue_lock); 3273 spin_unlock(&workqueue_lock);
3344 3274
3345 /* sanity check */ 3275 /* sanity check */
3346 for_each_cwq_cpu(cpu, wq) { 3276 for_each_pwq_cpu(cpu, wq) {
3347 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3277 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3348 int i; 3278 int i;
3349 3279
3350 for (i = 0; i < WORK_NR_COLORS; i++) 3280 for (i = 0; i < WORK_NR_COLORS; i++)
3351 BUG_ON(cwq->nr_in_flight[i]); 3281 BUG_ON(pwq->nr_in_flight[i]);
3352 BUG_ON(cwq->nr_active); 3282 BUG_ON(pwq->nr_active);
3353 BUG_ON(!list_empty(&cwq->delayed_works)); 3283 BUG_ON(!list_empty(&pwq->delayed_works));
3354 } 3284 }
3355 3285
3356 if (wq->flags & WQ_RESCUER) { 3286 if (wq->flags & WQ_RESCUER) {
@@ -3359,29 +3289,29 @@ void destroy_workqueue(struct workqueue_struct *wq)
3359 kfree(wq->rescuer); 3289 kfree(wq->rescuer);
3360 } 3290 }
3361 3291
3362 free_cwqs(wq); 3292 free_pwqs(wq);
3363 kfree(wq); 3293 kfree(wq);
3364} 3294}
3365EXPORT_SYMBOL_GPL(destroy_workqueue); 3295EXPORT_SYMBOL_GPL(destroy_workqueue);
3366 3296
3367/** 3297/**
3368 * cwq_set_max_active - adjust max_active of a cwq 3298 * pwq_set_max_active - adjust max_active of a pwq
3369 * @cwq: target cpu_workqueue_struct 3299 * @pwq: target pool_workqueue
3370 * @max_active: new max_active value. 3300 * @max_active: new max_active value.
3371 * 3301 *
3372 * Set @cwq->max_active to @max_active and activate delayed works if 3302 * Set @pwq->max_active to @max_active and activate delayed works if
3373 * increased. 3303 * increased.
3374 * 3304 *
3375 * CONTEXT: 3305 * CONTEXT:
3376 * spin_lock_irq(gcwq->lock). 3306 * spin_lock_irq(pool->lock).
3377 */ 3307 */
3378static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) 3308static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
3379{ 3309{
3380 cwq->max_active = max_active; 3310 pwq->max_active = max_active;
3381 3311
3382 while (!list_empty(&cwq->delayed_works) && 3312 while (!list_empty(&pwq->delayed_works) &&
3383 cwq->nr_active < cwq->max_active) 3313 pwq->nr_active < pwq->max_active)
3384 cwq_activate_first_delayed(cwq); 3314 pwq_activate_first_delayed(pwq);
3385} 3315}
3386 3316
3387/** 3317/**
@@ -3404,16 +3334,17 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3404 3334
3405 wq->saved_max_active = max_active; 3335 wq->saved_max_active = max_active;
3406 3336
3407 for_each_cwq_cpu(cpu, wq) { 3337 for_each_pwq_cpu(cpu, wq) {
3408 struct global_cwq *gcwq = get_gcwq(cpu); 3338 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3339 struct worker_pool *pool = pwq->pool;
3409 3340
3410 spin_lock_irq(&gcwq->lock); 3341 spin_lock_irq(&pool->lock);
3411 3342
3412 if (!(wq->flags & WQ_FREEZABLE) || 3343 if (!(wq->flags & WQ_FREEZABLE) ||
3413 !(gcwq->flags & GCWQ_FREEZING)) 3344 !(pool->flags & POOL_FREEZING))
3414 cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); 3345 pwq_set_max_active(pwq, max_active);
3415 3346
3416 spin_unlock_irq(&gcwq->lock); 3347 spin_unlock_irq(&pool->lock);
3417 } 3348 }
3418 3349
3419 spin_unlock(&workqueue_lock); 3350 spin_unlock(&workqueue_lock);
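
[Editor's note, not part of the patch] workqueue_set_max_active() may be called at runtime; raising the limit immediately activates delayed works through pwq_set_max_active() above. A hypothetical tuning hook (the knob wiring is assumed, not in the patch):

        /* e.g. wired to a module parameter or debugfs attribute */
        static void my_set_limit(struct workqueue_struct *my_wq, int limit)
        {
                /* the core clamps the value to a sane range internally */
                workqueue_set_max_active(my_wq, limit);
        }
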
@@ -3434,57 +3365,38 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3434 */ 3365 */
3435bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) 3366bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
3436{ 3367{
3437 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3368 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3438 3369
3439 return !list_empty(&cwq->delayed_works); 3370 return !list_empty(&pwq->delayed_works);
3440} 3371}
3441EXPORT_SYMBOL_GPL(workqueue_congested); 3372EXPORT_SYMBOL_GPL(workqueue_congested);
3442 3373
3443/** 3374/**
3444 * work_cpu - return the last known associated cpu for @work
3445 * @work: the work of interest
3446 *
3447 * RETURNS:
3448 * CPU number if @work was ever queued. WORK_CPU_NONE otherwise.
3449 */
3450unsigned int work_cpu(struct work_struct *work)
3451{
3452 struct global_cwq *gcwq = get_work_gcwq(work);
3453
3454 return gcwq ? gcwq->cpu : WORK_CPU_NONE;
3455}
3456EXPORT_SYMBOL_GPL(work_cpu);
3457
3458/**
3459 * work_busy - test whether a work is currently pending or running 3375 * work_busy - test whether a work is currently pending or running
3460 * @work: the work to be tested 3376 * @work: the work to be tested
3461 * 3377 *
3462 * Test whether @work is currently pending or running. There is no 3378 * Test whether @work is currently pending or running. There is no
3463 * synchronization around this function and the test result is 3379 * synchronization around this function and the test result is
3464 * unreliable and only useful as advisory hints or for debugging. 3380 * unreliable and only useful as advisory hints or for debugging.
3465 * Especially for reentrant wqs, the pending state might hide the
3466 * running state.
3467 * 3381 *
3468 * RETURNS: 3382 * RETURNS:
3469 * OR'd bitmask of WORK_BUSY_* bits. 3383 * OR'd bitmask of WORK_BUSY_* bits.
3470 */ 3384 */
3471unsigned int work_busy(struct work_struct *work) 3385unsigned int work_busy(struct work_struct *work)
3472{ 3386{
3473 struct global_cwq *gcwq = get_work_gcwq(work); 3387 struct worker_pool *pool = get_work_pool(work);
3474 unsigned long flags; 3388 unsigned long flags;
3475 unsigned int ret = 0; 3389 unsigned int ret = 0;
3476 3390
3477 if (!gcwq)
3478 return false;
3479
3480 spin_lock_irqsave(&gcwq->lock, flags);
3481
3482 if (work_pending(work)) 3391 if (work_pending(work))
3483 ret |= WORK_BUSY_PENDING; 3392 ret |= WORK_BUSY_PENDING;
3484 if (find_worker_executing_work(gcwq, work))
3485 ret |= WORK_BUSY_RUNNING;
3486 3393
3487 spin_unlock_irqrestore(&gcwq->lock, flags); 3394 if (pool) {
3395 spin_lock_irqsave(&pool->lock, flags);
3396 if (find_worker_executing_work(pool, work))
3397 ret |= WORK_BUSY_RUNNING;
3398 spin_unlock_irqrestore(&pool->lock, flags);
3399 }
3488 3400
3489 return ret; 3401 return ret;
3490} 3402}
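
[Editor's note, not part of the patch] As the comment says, work_busy() is advisory only; a debugging sketch that just reports the returned bits (hypothetical caller):

        #include <linux/workqueue.h>
        #include <linux/printk.h>

        static void my_report(struct work_struct *work)
        {
                unsigned int busy = work_busy(work);    /* snapshot, may be stale */

                pr_info("work %p:%s%s\n", work,
                        (busy & WORK_BUSY_PENDING) ? " pending" : "",
                        (busy & WORK_BUSY_RUNNING) ? " running" : "");
        }
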
@@ -3494,86 +3406,75 @@ EXPORT_SYMBOL_GPL(work_busy);
3494 * CPU hotplug. 3406 * CPU hotplug.
3495 * 3407 *
3496 * There are two challenges in supporting CPU hotplug. Firstly, there 3408 * There are two challenges in supporting CPU hotplug. Firstly, there
3497 * are a lot of assumptions on strong associations among work, cwq and 3409 * are a lot of assumptions on strong associations among work, pwq and
3498 * gcwq which make migrating pending and scheduled works very 3410 * pool which make migrating pending and scheduled works very
3499 * difficult to implement without impacting hot paths. Secondly, 3411 * difficult to implement without impacting hot paths. Secondly,
3500 * gcwqs serve mix of short, long and very long running works making 3412 * worker pools serve mix of short, long and very long running works making
3501 * blocked draining impractical. 3413 * blocked draining impractical.
3502 * 3414 *
3503 * This is solved by allowing a gcwq to be disassociated from the CPU 3415 * This is solved by allowing the pools to be disassociated from the CPU
3504 * running as an unbound one and allowing it to be reattached later if the 3416 * running as an unbound one and allowing it to be reattached later if the
3505 * cpu comes back online. 3417 * cpu comes back online.
3506 */ 3418 */
3507 3419
3508/* claim manager positions of all pools */ 3420static void wq_unbind_fn(struct work_struct *work)
3509static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)
3510{
3511 struct worker_pool *pool;
3512
3513 for_each_worker_pool(pool, gcwq)
3514 mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);
3515 spin_lock_irq(&gcwq->lock);
3516}
3517
3518/* release manager positions */
3519static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)
3520{
3521 struct worker_pool *pool;
3522
3523 spin_unlock_irq(&gcwq->lock);
3524 for_each_worker_pool(pool, gcwq)
3525 mutex_unlock(&pool->assoc_mutex);
3526}
3527
3528static void gcwq_unbind_fn(struct work_struct *work)
3529{ 3421{
3530 struct global_cwq *gcwq = get_gcwq(smp_processor_id()); 3422 int cpu = smp_processor_id();
3531 struct worker_pool *pool; 3423 struct worker_pool *pool;
3532 struct worker *worker; 3424 struct worker *worker;
3533 struct hlist_node *pos;
3534 int i; 3425 int i;
3535 3426
3536 BUG_ON(gcwq->cpu != smp_processor_id()); 3427 for_each_std_worker_pool(pool, cpu) {
3428 BUG_ON(cpu != smp_processor_id());
3537 3429
3538 gcwq_claim_assoc_and_lock(gcwq); 3430 mutex_lock(&pool->assoc_mutex);
3431 spin_lock_irq(&pool->lock);
3539 3432
3540 /* 3433 /*
3541 * We've claimed all manager positions. Make all workers unbound 3434 * We've claimed all manager positions. Make all workers
3542 * and set DISASSOCIATED. Before this, all workers except for the 3435 * unbound and set DISASSOCIATED. Before this, all workers
3543 * ones which are still executing works from before the last CPU 3436 * except for the ones which are still executing works from
3544 * down must be on the cpu. After this, they may become diasporas. 3437 * before the last CPU down must be on the cpu. After
3545 */ 3438 * this, they may become diasporas.
3546 for_each_worker_pool(pool, gcwq) 3439 */
3547 list_for_each_entry(worker, &pool->idle_list, entry) 3440 list_for_each_entry(worker, &pool->idle_list, entry)
3548 worker->flags |= WORKER_UNBOUND; 3441 worker->flags |= WORKER_UNBOUND;
3549 3442
3550 for_each_busy_worker(worker, i, pos, gcwq) 3443 for_each_busy_worker(worker, i, pool)
3551 worker->flags |= WORKER_UNBOUND; 3444 worker->flags |= WORKER_UNBOUND;
3552 3445
3553 gcwq->flags |= GCWQ_DISASSOCIATED; 3446 pool->flags |= POOL_DISASSOCIATED;
3554 3447
3555 gcwq_release_assoc_and_unlock(gcwq); 3448 spin_unlock_irq(&pool->lock);
3449 mutex_unlock(&pool->assoc_mutex);
3556 3450
3557 /* 3451 /*
3558 * Call schedule() so that we cross rq->lock and thus can guarantee 3452 * Call schedule() so that we cross rq->lock and thus can
3559 * sched callbacks see the %WORKER_UNBOUND flag. This is necessary 3453 * guarantee sched callbacks see the %WORKER_UNBOUND flag.
3560 * as scheduler callbacks may be invoked from other cpus. 3454 * This is necessary as scheduler callbacks may be invoked
3561 */ 3455 * from other cpus.
3562 schedule(); 3456 */
3457 schedule();
3563 3458
3564 /* 3459 /*
3565 * Sched callbacks are disabled now. Zap nr_running. After this, 3460 * Sched callbacks are disabled now. Zap nr_running.
3566 * nr_running stays zero and need_more_worker() and keep_working() 3461 * After this, nr_running stays zero and need_more_worker()
3567 * are always true as long as the worklist is not empty. @gcwq now 3462 * and keep_working() are always true as long as the
3568 * behaves as unbound (in terms of concurrency management) gcwq 3463 * worklist is not empty. This pool now behaves as an
3569 * which is served by workers tied to the CPU. 3464 * unbound (in terms of concurrency management) pool which
3570 * 3465 * are served by workers tied to the pool.
3571 * On return from this function, the current worker would trigger 3466 */
3572 * unbound chain execution of pending work items if other workers 3467 atomic_set(&pool->nr_running, 0);
3573 * didn't already. 3468
3574 */ 3469 /*
3575 for_each_worker_pool(pool, gcwq) 3470 * With concurrency management just turned off, a busy
3576 atomic_set(get_pool_nr_running(pool), 0); 3471 * worker blocking could lead to lengthy stalls. Kick off
3472 * unbound chain execution of currently pending work items.
3473 */
3474 spin_lock_irq(&pool->lock);
3475 wake_up_worker(pool);
3476 spin_unlock_irq(&pool->lock);
3477 }
3577} 3478}
3578 3479
3579/* 3480/*
@@ -3585,12 +3486,11 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3585 void *hcpu) 3486 void *hcpu)
3586{ 3487{
3587 unsigned int cpu = (unsigned long)hcpu; 3488 unsigned int cpu = (unsigned long)hcpu;
3588 struct global_cwq *gcwq = get_gcwq(cpu);
3589 struct worker_pool *pool; 3489 struct worker_pool *pool;
3590 3490
3591 switch (action & ~CPU_TASKS_FROZEN) { 3491 switch (action & ~CPU_TASKS_FROZEN) {
3592 case CPU_UP_PREPARE: 3492 case CPU_UP_PREPARE:
3593 for_each_worker_pool(pool, gcwq) { 3493 for_each_std_worker_pool(pool, cpu) {
3594 struct worker *worker; 3494 struct worker *worker;
3595 3495
3596 if (pool->nr_workers) 3496 if (pool->nr_workers)
@@ -3600,18 +3500,24 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3600 if (!worker) 3500 if (!worker)
3601 return NOTIFY_BAD; 3501 return NOTIFY_BAD;
3602 3502
3603 spin_lock_irq(&gcwq->lock); 3503 spin_lock_irq(&pool->lock);
3604 start_worker(worker); 3504 start_worker(worker);
3605 spin_unlock_irq(&gcwq->lock); 3505 spin_unlock_irq(&pool->lock);
3606 } 3506 }
3607 break; 3507 break;
3608 3508
3609 case CPU_DOWN_FAILED: 3509 case CPU_DOWN_FAILED:
3610 case CPU_ONLINE: 3510 case CPU_ONLINE:
3611 gcwq_claim_assoc_and_lock(gcwq); 3511 for_each_std_worker_pool(pool, cpu) {
3612 gcwq->flags &= ~GCWQ_DISASSOCIATED; 3512 mutex_lock(&pool->assoc_mutex);
3613 rebind_workers(gcwq); 3513 spin_lock_irq(&pool->lock);
3614 gcwq_release_assoc_and_unlock(gcwq); 3514
3515 pool->flags &= ~POOL_DISASSOCIATED;
3516 rebind_workers(pool);
3517
3518 spin_unlock_irq(&pool->lock);
3519 mutex_unlock(&pool->assoc_mutex);
3520 }
3615 break; 3521 break;
3616 } 3522 }
3617 return NOTIFY_OK; 3523 return NOTIFY_OK;
@@ -3631,7 +3537,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3631 switch (action & ~CPU_TASKS_FROZEN) { 3537 switch (action & ~CPU_TASKS_FROZEN) {
3632 case CPU_DOWN_PREPARE: 3538 case CPU_DOWN_PREPARE:
3633 /* unbinding should happen on the local CPU */ 3539 /* unbinding should happen on the local CPU */
3634 INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); 3540 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
3635 queue_work_on(cpu, system_highpri_wq, &unbind_work); 3541 queue_work_on(cpu, system_highpri_wq, &unbind_work);
3636 flush_work(&unbind_work); 3542 flush_work(&unbind_work);
3637 break; 3543 break;
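
[Editor's note, not part of the patch] The DOWN_PREPARE path runs wq_unbind_fn() on the dying CPU by queueing an on-stack work item there and flushing it; work_on_cpu() wraps the same pattern for general use. A stripped-down sketch of the technique (function names assumed):

        #include <linux/workqueue.h>

        static void my_percpu_fn(struct work_struct *work)
        {
                /* executes on the CPU the work was queued on */
        }

        static void my_run_on(int cpu)
        {
                struct work_struct w;

                INIT_WORK_ONSTACK(&w, my_percpu_fn);
                queue_work_on(cpu, system_highpri_wq, &w);
                flush_work(&w);
        }
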
@@ -3684,10 +3590,10 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3684 * 3590 *
3685 * Start freezing workqueues. After this function returns, all freezable 3591 * Start freezing workqueues. After this function returns, all freezable
3686 * workqueues will queue new works to their frozen_works list instead of 3592 * workqueues will queue new works to their frozen_works list instead of
3687 * gcwq->worklist. 3593 * pool->worklist.
3688 * 3594 *
3689 * CONTEXT: 3595 * CONTEXT:
3690 * Grabs and releases workqueue_lock and gcwq->lock's. 3596 * Grabs and releases workqueue_lock and pool->lock's.
3691 */ 3597 */
3692void freeze_workqueues_begin(void) 3598void freeze_workqueues_begin(void)
3693{ 3599{
@@ -3698,23 +3604,26 @@ void freeze_workqueues_begin(void)
3698 BUG_ON(workqueue_freezing); 3604 BUG_ON(workqueue_freezing);
3699 workqueue_freezing = true; 3605 workqueue_freezing = true;
3700 3606
3701 for_each_gcwq_cpu(cpu) { 3607 for_each_wq_cpu(cpu) {
3702 struct global_cwq *gcwq = get_gcwq(cpu); 3608 struct worker_pool *pool;
3703 struct workqueue_struct *wq; 3609 struct workqueue_struct *wq;
3704 3610
3705 spin_lock_irq(&gcwq->lock); 3611 for_each_std_worker_pool(pool, cpu) {
3612 spin_lock_irq(&pool->lock);
3706 3613
3707 BUG_ON(gcwq->flags & GCWQ_FREEZING); 3614 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3708 gcwq->flags |= GCWQ_FREEZING; 3615 pool->flags |= POOL_FREEZING;
3709 3616
3710 list_for_each_entry(wq, &workqueues, list) { 3617 list_for_each_entry(wq, &workqueues, list) {
3711 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3618 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3712 3619
3713 if (cwq && wq->flags & WQ_FREEZABLE) 3620 if (pwq && pwq->pool == pool &&
3714 cwq->max_active = 0; 3621 (wq->flags & WQ_FREEZABLE))
3715 } 3622 pwq->max_active = 0;
3623 }
3716 3624
3717 spin_unlock_irq(&gcwq->lock); 3625 spin_unlock_irq(&pool->lock);
3626 }
3718 } 3627 }
3719 3628
3720 spin_unlock(&workqueue_lock); 3629 spin_unlock(&workqueue_lock);
@@ -3742,20 +3651,20 @@ bool freeze_workqueues_busy(void)
3742 3651
3743 BUG_ON(!workqueue_freezing); 3652 BUG_ON(!workqueue_freezing);
3744 3653
3745 for_each_gcwq_cpu(cpu) { 3654 for_each_wq_cpu(cpu) {
3746 struct workqueue_struct *wq; 3655 struct workqueue_struct *wq;
3747 /* 3656 /*
3748 * nr_active is monotonically decreasing. It's safe 3657 * nr_active is monotonically decreasing. It's safe
3749 * to peek without lock. 3658 * to peek without lock.
3750 */ 3659 */
3751 list_for_each_entry(wq, &workqueues, list) { 3660 list_for_each_entry(wq, &workqueues, list) {
3752 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3661 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3753 3662
3754 if (!cwq || !(wq->flags & WQ_FREEZABLE)) 3663 if (!pwq || !(wq->flags & WQ_FREEZABLE))
3755 continue; 3664 continue;
3756 3665
3757 BUG_ON(cwq->nr_active < 0); 3666 BUG_ON(pwq->nr_active < 0);
3758 if (cwq->nr_active) { 3667 if (pwq->nr_active) {
3759 busy = true; 3668 busy = true;
3760 goto out_unlock; 3669 goto out_unlock;
3761 } 3670 }
@@ -3770,10 +3679,10 @@ out_unlock:
3770 * thaw_workqueues - thaw workqueues 3679 * thaw_workqueues - thaw workqueues
3771 * 3680 *
3772 * Thaw workqueues. Normal queueing is restored and all collected 3681 * Thaw workqueues. Normal queueing is restored and all collected
3773 * frozen works are transferred to their respective gcwq worklists. 3682 * frozen works are transferred to their respective pool worklists.
3774 * 3683 *
3775 * CONTEXT: 3684 * CONTEXT:
3776 * Grabs and releases workqueue_lock and gcwq->lock's. 3685 * Grabs and releases workqueue_lock and pool->lock's.
3777 */ 3686 */
3778void thaw_workqueues(void) 3687void thaw_workqueues(void)
3779{ 3688{
@@ -3784,30 +3693,31 @@ void thaw_workqueues(void)
3784 if (!workqueue_freezing) 3693 if (!workqueue_freezing)
3785 goto out_unlock; 3694 goto out_unlock;
3786 3695
3787 for_each_gcwq_cpu(cpu) { 3696 for_each_wq_cpu(cpu) {
3788 struct global_cwq *gcwq = get_gcwq(cpu);
3789 struct worker_pool *pool; 3697 struct worker_pool *pool;
3790 struct workqueue_struct *wq; 3698 struct workqueue_struct *wq;
3791 3699
3792 spin_lock_irq(&gcwq->lock); 3700 for_each_std_worker_pool(pool, cpu) {
3701 spin_lock_irq(&pool->lock);
3793 3702
3794 BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); 3703 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
3795 gcwq->flags &= ~GCWQ_FREEZING; 3704 pool->flags &= ~POOL_FREEZING;
3796 3705
3797 list_for_each_entry(wq, &workqueues, list) { 3706 list_for_each_entry(wq, &workqueues, list) {
3798 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 3707 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3799 3708
3800 if (!cwq || !(wq->flags & WQ_FREEZABLE)) 3709 if (!pwq || pwq->pool != pool ||
3801 continue; 3710 !(wq->flags & WQ_FREEZABLE))
3711 continue;
3802 3712
3803 /* restore max_active and repopulate worklist */ 3713 /* restore max_active and repopulate worklist */
3804 cwq_set_max_active(cwq, wq->saved_max_active); 3714 pwq_set_max_active(pwq, wq->saved_max_active);
3805 } 3715 }
3806 3716
3807 for_each_worker_pool(pool, gcwq)
3808 wake_up_worker(pool); 3717 wake_up_worker(pool);
3809 3718
3810 spin_unlock_irq(&gcwq->lock); 3719 spin_unlock_irq(&pool->lock);
3720 }
3811 } 3721 }
3812 3722
3813 workqueue_freezing = false; 3723 workqueue_freezing = false;
@@ -3819,60 +3729,56 @@ out_unlock:
3819static int __init init_workqueues(void) 3729static int __init init_workqueues(void)
3820{ 3730{
3821 unsigned int cpu; 3731 unsigned int cpu;
3822 int i;
3823 3732
3824 /* make sure we have enough bits for OFFQ CPU number */ 3733 /* make sure we have enough bits for OFFQ pool ID */
3825 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < 3734 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
3826 WORK_CPU_LAST); 3735 WORK_CPU_END * NR_STD_WORKER_POOLS);
3827 3736
3828 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); 3737 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3829 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); 3738 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3830 3739
3831 /* initialize gcwqs */ 3740 /* initialize CPU pools */
3832 for_each_gcwq_cpu(cpu) { 3741 for_each_wq_cpu(cpu) {
3833 struct global_cwq *gcwq = get_gcwq(cpu);
3834 struct worker_pool *pool; 3742 struct worker_pool *pool;
3835 3743
3836 spin_lock_init(&gcwq->lock); 3744 for_each_std_worker_pool(pool, cpu) {
3837 gcwq->cpu = cpu; 3745 spin_lock_init(&pool->lock);
3838 gcwq->flags |= GCWQ_DISASSOCIATED; 3746 pool->cpu = cpu;
3839 3747 pool->flags |= POOL_DISASSOCIATED;
3840 for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++)
3841 INIT_HLIST_HEAD(&gcwq->busy_hash[i]);
3842
3843 for_each_worker_pool(pool, gcwq) {
3844 pool->gcwq = gcwq;
3845 INIT_LIST_HEAD(&pool->worklist); 3748 INIT_LIST_HEAD(&pool->worklist);
3846 INIT_LIST_HEAD(&pool->idle_list); 3749 INIT_LIST_HEAD(&pool->idle_list);
3750 hash_init(pool->busy_hash);
3847 3751
3848 init_timer_deferrable(&pool->idle_timer); 3752 init_timer_deferrable(&pool->idle_timer);
3849 pool->idle_timer.function = idle_worker_timeout; 3753 pool->idle_timer.function = idle_worker_timeout;
3850 pool->idle_timer.data = (unsigned long)pool; 3754 pool->idle_timer.data = (unsigned long)pool;
3851 3755
3852 setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, 3756 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3853 (unsigned long)pool); 3757 (unsigned long)pool);
3854 3758
3855 mutex_init(&pool->assoc_mutex); 3759 mutex_init(&pool->assoc_mutex);
3856 ida_init(&pool->worker_ida); 3760 ida_init(&pool->worker_ida);
3761
3762 /* alloc pool ID */
3763 BUG_ON(worker_pool_assign_id(pool));
3857 } 3764 }
3858 } 3765 }
3859 3766
3860 /* create the initial worker */ 3767 /* create the initial worker */
3861 for_each_online_gcwq_cpu(cpu) { 3768 for_each_online_wq_cpu(cpu) {
3862 struct global_cwq *gcwq = get_gcwq(cpu);
3863 struct worker_pool *pool; 3769 struct worker_pool *pool;
3864 3770
3865 if (cpu != WORK_CPU_UNBOUND) 3771 for_each_std_worker_pool(pool, cpu) {
3866 gcwq->flags &= ~GCWQ_DISASSOCIATED;
3867
3868 for_each_worker_pool(pool, gcwq) {
3869 struct worker *worker; 3772 struct worker *worker;
3870 3773
3774 if (cpu != WORK_CPU_UNBOUND)
3775 pool->flags &= ~POOL_DISASSOCIATED;
3776
3871 worker = create_worker(pool); 3777 worker = create_worker(pool);
3872 BUG_ON(!worker); 3778 BUG_ON(!worker);
3873 spin_lock_irq(&gcwq->lock); 3779 spin_lock_irq(&pool->lock);
3874 start_worker(worker); 3780 start_worker(worker);
3875 spin_unlock_irq(&gcwq->lock); 3781 spin_unlock_irq(&pool->lock);
3876 } 3782 }
3877 } 3783 }
3878 3784
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
new file mode 100644
index 000000000000..07650264ec15
--- /dev/null
+++ b/kernel/workqueue_internal.h
@@ -0,0 +1,65 @@
1/*
2 * kernel/workqueue_internal.h
3 *
4 * Workqueue internal header file. Only to be included by workqueue and
5 * core kernel subsystems.
6 */
7#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
8#define _KERNEL_WORKQUEUE_INTERNAL_H
9
10#include <linux/workqueue.h>
11#include <linux/kthread.h>
12
13struct worker_pool;
14
15/*
16 * The poor guys doing the actual heavy lifting. All on-duty workers are
17 * either serving the manager role, on idle list or on busy hash. For
18 * details on the locking annotation (L, I, X...), refer to workqueue.c.
19 *
20 * Only to be used in workqueue and async.
21 */
22struct worker {
23 /* on idle list while idle, on busy hash table while busy */
24 union {
25 struct list_head entry; /* L: while idle */
26 struct hlist_node hentry; /* L: while busy */
27 };
28
29 struct work_struct *current_work; /* L: work being processed */
30 work_func_t current_func; /* L: current_work's fn */
31 struct pool_workqueue *current_pwq; /* L: current_work's pwq */
32 struct list_head scheduled; /* L: scheduled works */
33 struct task_struct *task; /* I: worker task */
34 struct worker_pool *pool; /* I: the associated pool */
35 /* 64 bytes boundary on 64bit, 32 on 32bit */
36 unsigned long last_active; /* L: last active timestamp */
37 unsigned int flags; /* X: flags */
38 int id; /* I: worker id */
39
40 /* for rebinding worker to CPU */
41 struct work_struct rebind_work; /* L: for busy worker */
42
43 /* used only by rescuers to point to the target workqueue */
44 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
45};
46
47/**
48 * current_wq_worker - return struct worker if %current is a workqueue worker
49 */
50static inline struct worker *current_wq_worker(void)
51{
52 if (current->flags & PF_WQ_WORKER)
53 return kthread_data(current);
54 return NULL;
55}
56
57/*
58 * Scheduler hooks for concurrency managed workqueue. Only to be used from
59 * sched.c and workqueue.c.
60 */
61void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
62struct task_struct *wq_worker_sleeping(struct task_struct *task,
63 unsigned int cpu);
64
65#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
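
[Editor's note, not part of the patch] The struct worker comment notes that a worker sits either on its pool's idle list or in the busy hash, never both, which is why entry and hentry can share a union. An illustrative sketch of the same membership scheme using the generic list and hashtable helpers (assumed names, not code from this patch):

        #include <linux/list.h>
        #include <linux/hashtable.h>

        struct my_item {
                union {                         /* member of exactly one structure at a time */
                        struct list_head entry;         /* while idle */
                        struct hlist_node hentry;       /* while busy */
                };
                unsigned long key;
        };

        static LIST_HEAD(my_idle);
        static DEFINE_HASHTABLE(my_busy, 6);

        static void my_mark_busy(struct my_item *it)
        {
                list_del(&it->entry);
                hash_add(my_busy, &it->hentry, it->key);
        }

        static void my_mark_idle(struct my_item *it)
        {
                hash_del(&it->hentry);
                list_add(&it->entry, &my_idle);
        }
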
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
deleted file mode 100644
index 2d10fc98dc79..000000000000
--- a/kernel/workqueue_sched.h
+++ /dev/null
@@ -1,9 +0,0 @@
1/*
2 * kernel/workqueue_sched.h
3 *
4 * Scheduler hooks for concurrency managed workqueue. Only to be
5 * included from sched.c and workqueue.c.
6 */
7void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
8struct task_struct *wq_worker_sleeping(struct task_struct *task,
9 unsigned int cpu);