Diffstat (limited to 'kernel')
32 files changed, 647 insertions, 466 deletions
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 3a5ca582ba1e..ed206fd88cca 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -250,7 +250,6 @@ static void untag_chunk(struct node *p) | |||
250 | spin_unlock(&hash_lock); | 250 | spin_unlock(&hash_lock); |
251 | spin_unlock(&entry->lock); | 251 | spin_unlock(&entry->lock); |
252 | fsnotify_destroy_mark(entry); | 252 | fsnotify_destroy_mark(entry); |
253 | fsnotify_put_mark(entry); | ||
254 | goto out; | 253 | goto out; |
255 | } | 254 | } |
256 | 255 | ||
@@ -259,7 +258,7 @@ static void untag_chunk(struct node *p) | |||
259 | 258 | ||
260 | fsnotify_duplicate_mark(&new->mark, entry); | 259 | fsnotify_duplicate_mark(&new->mark, entry); |
261 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { | 260 | if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { |
262 | free_chunk(new); | 261 | fsnotify_put_mark(&new->mark); |
263 | goto Fallback; | 262 | goto Fallback; |
264 | } | 263 | } |
265 | 264 | ||
@@ -293,7 +292,7 @@ static void untag_chunk(struct node *p) | |||
293 | spin_unlock(&hash_lock); | 292 | spin_unlock(&hash_lock); |
294 | spin_unlock(&entry->lock); | 293 | spin_unlock(&entry->lock); |
295 | fsnotify_destroy_mark(entry); | 294 | fsnotify_destroy_mark(entry); |
296 | fsnotify_put_mark(entry); | 295 | fsnotify_put_mark(&new->mark); /* drop initial reference */ |
297 | goto out; | 296 | goto out; |
298 | 297 | ||
299 | Fallback: | 298 | Fallback: |
@@ -322,7 +321,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) | |||
322 | 321 | ||
323 | entry = &chunk->mark; | 322 | entry = &chunk->mark; |
324 | if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { | 323 | if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { |
325 | free_chunk(chunk); | 324 | fsnotify_put_mark(entry); |
326 | return -ENOSPC; | 325 | return -ENOSPC; |
327 | } | 326 | } |
328 | 327 | ||
@@ -347,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) | |||
347 | insert_hash(chunk); | 346 | insert_hash(chunk); |
348 | spin_unlock(&hash_lock); | 347 | spin_unlock(&hash_lock); |
349 | spin_unlock(&entry->lock); | 348 | spin_unlock(&entry->lock); |
349 | fsnotify_put_mark(entry); /* drop initial reference */ | ||
350 | return 0; | 350 | return 0; |
351 | } | 351 | } |
352 | 352 | ||
@@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
396 | fsnotify_duplicate_mark(chunk_entry, old_entry); | 396 | fsnotify_duplicate_mark(chunk_entry, old_entry); |
397 | if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { | 397 | if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { |
398 | spin_unlock(&old_entry->lock); | 398 | spin_unlock(&old_entry->lock); |
399 | free_chunk(chunk); | 399 | fsnotify_put_mark(chunk_entry); |
400 | fsnotify_put_mark(old_entry); | 400 | fsnotify_put_mark(old_entry); |
401 | return -ENOSPC; | 401 | return -ENOSPC; |
402 | } | 402 | } |
@@ -444,8 +444,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) | |||
444 | spin_unlock(&chunk_entry->lock); | 444 | spin_unlock(&chunk_entry->lock); |
445 | spin_unlock(&old_entry->lock); | 445 | spin_unlock(&old_entry->lock); |
446 | fsnotify_destroy_mark(old_entry); | 446 | fsnotify_destroy_mark(old_entry); |
447 | fsnotify_put_mark(chunk_entry); /* drop initial reference */ | ||
447 | fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ | 448 | fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ |
448 | fsnotify_put_mark(old_entry); /* and kill it */ | ||
449 | return 0; | 449 | return 0; |
450 | } | 450 | } |
451 | 451 | ||
@@ -916,7 +916,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify | |||
916 | struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); | 916 | struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); |
917 | 917 | ||
918 | evict_chunk(chunk); | 918 | evict_chunk(chunk); |
919 | fsnotify_put_mark(entry); | 919 | |
920 | /* | ||
921 | * We are guaranteed to have at least one reference to the mark from | ||
922 | * either the inode or the caller of fsnotify_destroy_mark(). | ||
923 | */ | ||
924 | BUG_ON(atomic_read(&entry->refcnt) < 1); | ||
920 | } | 925 | } |
921 | 926 | ||
922 | static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, | 927 | static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, |
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8b68ce78ff17..be7b33b73d30 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/kdb.h> | 12 | #include <linux/kdb.h> |
13 | #include <linux/kdebug.h> | 13 | #include <linux/kdebug.h> |
14 | #include <linux/export.h> | 14 | #include <linux/export.h> |
15 | #include <linux/hardirq.h> | ||
15 | #include "kdb_private.h" | 16 | #include "kdb_private.h" |
16 | #include "../debug_core.h" | 17 | #include "../debug_core.h" |
17 | 18 | ||
@@ -52,6 +53,9 @@ int kdb_stub(struct kgdb_state *ks) | |||
52 | if (atomic_read(&kgdb_setting_breakpoint)) | 53 | if (atomic_read(&kgdb_setting_breakpoint)) |
53 | reason = KDB_REASON_KEYBOARD; | 54 | reason = KDB_REASON_KEYBOARD; |
54 | 55 | ||
56 | if (in_nmi()) | ||
57 | reason = KDB_REASON_NMI; | ||
58 | |||
55 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { | 59 | for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { |
56 | if ((bp->bp_enabled) && (bp->bp_addr == addr)) { | 60 | if ((bp->bp_enabled) && (bp->bp_addr == addr)) { |
57 | reason = KDB_REASON_BREAK; | 61 | reason = KDB_REASON_BREAK; |
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index bb9520f0f6ff..0a69d2adc4f3 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -715,9 +715,6 @@ kdb_printit: | |||
715 | /* check for having reached the LINES number of printed lines */ | 715 | /* check for having reached the LINES number of printed lines */ |
716 | if (kdb_nextline == linecount) { | 716 | if (kdb_nextline == linecount) { |
717 | char buf1[16] = ""; | 717 | char buf1[16] = ""; |
718 | #if defined(CONFIG_SMP) | ||
719 | char buf2[32]; | ||
720 | #endif | ||
721 | 718 | ||
722 | /* Watch out for recursion here. Any routine that calls | 719 | /* Watch out for recursion here. Any routine that calls |
723 | * kdb_printf will come back through here. And kdb_read | 720 | * kdb_printf will come back through here. And kdb_read |
@@ -732,14 +729,6 @@ kdb_printit: | |||
732 | if (moreprompt == NULL) | 729 | if (moreprompt == NULL) |
733 | moreprompt = "more> "; | 730 | moreprompt = "more> "; |
734 | 731 | ||
735 | #if defined(CONFIG_SMP) | ||
736 | if (strchr(moreprompt, '%')) { | ||
737 | sprintf(buf2, moreprompt, get_cpu()); | ||
738 | put_cpu(); | ||
739 | moreprompt = buf2; | ||
740 | } | ||
741 | #endif | ||
742 | |||
743 | kdb_input_flush(); | 732 | kdb_input_flush(); |
744 | c = console_drivers; | 733 | c = console_drivers; |
745 | 734 | ||
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 1f91413edb87..31df1706b9a9 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -139,11 +139,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); | |||
139 | static char *__env[] = { | 139 | static char *__env[] = { |
140 | #if defined(CONFIG_SMP) | 140 | #if defined(CONFIG_SMP) |
141 | "PROMPT=[%d]kdb> ", | 141 | "PROMPT=[%d]kdb> ", |
142 | "MOREPROMPT=[%d]more> ", | ||
143 | #else | 142 | #else |
144 | "PROMPT=kdb> ", | 143 | "PROMPT=kdb> ", |
145 | "MOREPROMPT=more> ", | ||
146 | #endif | 144 | #endif |
145 | "MOREPROMPT=more> ", | ||
147 | "RADIX=16", | 146 | "RADIX=16", |
148 | "MDCOUNT=8", /* lines of md output */ | 147 | "MDCOUNT=8", /* lines of md output */ |
149 | KDB_PLATFORM_ENV, | 148 | KDB_PLATFORM_ENV, |
@@ -1236,18 +1235,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, | |||
1236 | *cmdbuf = '\0'; | 1235 | *cmdbuf = '\0'; |
1237 | *(cmd_hist[cmd_head]) = '\0'; | 1236 | *(cmd_hist[cmd_head]) = '\0'; |
1238 | 1237 | ||
1239 | if (KDB_FLAG(ONLY_DO_DUMP)) { | ||
1240 | /* kdb is off but a catastrophic error requires a dump. | ||
1241 | * Take the dump and reboot. | ||
1242 | * Turn on logging so the kdb output appears in the log | ||
1243 | * buffer in the dump. | ||
1244 | */ | ||
1245 | const char *setargs[] = { "set", "LOGGING", "1" }; | ||
1246 | kdb_set(2, setargs); | ||
1247 | kdb_reboot(0, NULL); | ||
1248 | /*NOTREACHED*/ | ||
1249 | } | ||
1250 | |||
1251 | do_full_getstr: | 1238 | do_full_getstr: |
1252 | #if defined(CONFIG_SMP) | 1239 | #if defined(CONFIG_SMP) |
1253 | snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), | 1240 | snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), |
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6581a040f399..98d4597f43d6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -153,7 +153,8 @@ put_callchain_entry(int rctx) | |||
153 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | 153 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); |
154 | } | 154 | } |
155 | 155 | ||
156 | struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | 156 | struct perf_callchain_entry * |
157 | perf_callchain(struct perf_event *event, struct pt_regs *regs) | ||
157 | { | 158 | { |
158 | int rctx; | 159 | int rctx; |
159 | struct perf_callchain_entry *entry; | 160 | struct perf_callchain_entry *entry; |
@@ -178,6 +179,12 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | |||
178 | } | 179 | } |
179 | 180 | ||
180 | if (regs) { | 181 | if (regs) { |
182 | /* | ||
183 | * Disallow cross-task user callchains. | ||
184 | */ | ||
185 | if (event->ctx->task && event->ctx->task != current) | ||
186 | goto exit_put; | ||
187 | |||
181 | perf_callchain_store(entry, PERF_CONTEXT_USER); | 188 | perf_callchain_store(entry, PERF_CONTEXT_USER); |
182 | perf_callchain_user(entry, regs); | 189 | perf_callchain_user(entry, regs); |
183 | } | 190 | } |
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f1cf0edeb39a..7fee567153f0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1253,7 +1253,7 @@ retry: | |||
1253 | /* | 1253 | /* |
1254 | * Cross CPU call to disable a performance event | 1254 | * Cross CPU call to disable a performance event |
1255 | */ | 1255 | */ |
1256 | static int __perf_event_disable(void *info) | 1256 | int __perf_event_disable(void *info) |
1257 | { | 1257 | { |
1258 | struct perf_event *event = info; | 1258 | struct perf_event *event = info; |
1259 | struct perf_event_context *ctx = event->ctx; | 1259 | struct perf_event_context *ctx = event->ctx; |
@@ -2935,12 +2935,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); | |||
2935 | /* | 2935 | /* |
2936 | * Called when the last reference to the file is gone. | 2936 | * Called when the last reference to the file is gone. |
2937 | */ | 2937 | */ |
2938 | static int perf_release(struct inode *inode, struct file *file) | 2938 | static void put_event(struct perf_event *event) |
2939 | { | 2939 | { |
2940 | struct perf_event *event = file->private_data; | ||
2941 | struct task_struct *owner; | 2940 | struct task_struct *owner; |
2942 | 2941 | ||
2943 | file->private_data = NULL; | 2942 | if (!atomic_long_dec_and_test(&event->refcount)) |
2943 | return; | ||
2944 | 2944 | ||
2945 | rcu_read_lock(); | 2945 | rcu_read_lock(); |
2946 | owner = ACCESS_ONCE(event->owner); | 2946 | owner = ACCESS_ONCE(event->owner); |
@@ -2975,7 +2975,13 @@ static int perf_release(struct inode *inode, struct file *file) | |||
2975 | put_task_struct(owner); | 2975 | put_task_struct(owner); |
2976 | } | 2976 | } |
2977 | 2977 | ||
2978 | return perf_event_release_kernel(event); | 2978 | perf_event_release_kernel(event); |
2979 | } | ||
2980 | |||
2981 | static int perf_release(struct inode *inode, struct file *file) | ||
2982 | { | ||
2983 | put_event(file->private_data); | ||
2984 | return 0; | ||
2979 | } | 2985 | } |
2980 | 2986 | ||
2981 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) | 2987 | u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) |
@@ -3227,7 +3233,7 @@ unlock: | |||
3227 | 3233 | ||
3228 | static const struct file_operations perf_fops; | 3234 | static const struct file_operations perf_fops; |
3229 | 3235 | ||
3230 | static struct perf_event *perf_fget_light(int fd, int *fput_needed) | 3236 | static struct file *perf_fget_light(int fd, int *fput_needed) |
3231 | { | 3237 | { |
3232 | struct file *file; | 3238 | struct file *file; |
3233 | 3239 | ||
@@ -3241,7 +3247,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed) | |||
3241 | return ERR_PTR(-EBADF); | 3247 | return ERR_PTR(-EBADF); |
3242 | } | 3248 | } |
3243 | 3249 | ||
3244 | return file->private_data; | 3250 | return file; |
3245 | } | 3251 | } |
3246 | 3252 | ||
3247 | static int perf_event_set_output(struct perf_event *event, | 3253 | static int perf_event_set_output(struct perf_event *event, |
@@ -3273,19 +3279,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
3273 | 3279 | ||
3274 | case PERF_EVENT_IOC_SET_OUTPUT: | 3280 | case PERF_EVENT_IOC_SET_OUTPUT: |
3275 | { | 3281 | { |
3282 | struct file *output_file = NULL; | ||
3276 | struct perf_event *output_event = NULL; | 3283 | struct perf_event *output_event = NULL; |
3277 | int fput_needed = 0; | 3284 | int fput_needed = 0; |
3278 | int ret; | 3285 | int ret; |
3279 | 3286 | ||
3280 | if (arg != -1) { | 3287 | if (arg != -1) { |
3281 | output_event = perf_fget_light(arg, &fput_needed); | 3288 | output_file = perf_fget_light(arg, &fput_needed); |
3282 | if (IS_ERR(output_event)) | 3289 | if (IS_ERR(output_file)) |
3283 | return PTR_ERR(output_event); | 3290 | return PTR_ERR(output_file); |
3291 | output_event = output_file->private_data; | ||
3284 | } | 3292 | } |
3285 | 3293 | ||
3286 | ret = perf_event_set_output(event, output_event); | 3294 | ret = perf_event_set_output(event, output_event); |
3287 | if (output_event) | 3295 | if (output_event) |
3288 | fput_light(output_event->filp, fput_needed); | 3296 | fput_light(output_file, fput_needed); |
3289 | 3297 | ||
3290 | return ret; | 3298 | return ret; |
3291 | } | 3299 | } |
@@ -4039,7 +4047,7 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
4039 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { | 4047 | if (sample_type & PERF_SAMPLE_CALLCHAIN) { |
4040 | int size = 1; | 4048 | int size = 1; |
4041 | 4049 | ||
4042 | data->callchain = perf_callchain(regs); | 4050 | data->callchain = perf_callchain(event, regs); |
4043 | 4051 | ||
4044 | if (data->callchain) | 4052 | if (data->callchain) |
4045 | size += data->callchain->nr; | 4053 | size += data->callchain->nr; |
@@ -5209,7 +5217,8 @@ static int perf_tp_event_match(struct perf_event *event, | |||
5209 | } | 5217 | } |
5210 | 5218 | ||
5211 | void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | 5219 | void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, |
5212 | struct pt_regs *regs, struct hlist_head *head, int rctx) | 5220 | struct pt_regs *regs, struct hlist_head *head, int rctx, |
5221 | struct task_struct *task) | ||
5213 | { | 5222 | { |
5214 | struct perf_sample_data data; | 5223 | struct perf_sample_data data; |
5215 | struct perf_event *event; | 5224 | struct perf_event *event; |
@@ -5228,6 +5237,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
5228 | perf_swevent_event(event, count, &data, regs); | 5237 | perf_swevent_event(event, count, &data, regs); |
5229 | } | 5238 | } |
5230 | 5239 | ||
5240 | /* | ||
5241 | * If we got specified a target task, also iterate its context and | ||
5242 | * deliver this event there too. | ||
5243 | */ | ||
5244 | if (task && task != current) { | ||
5245 | struct perf_event_context *ctx; | ||
5246 | struct trace_entry *entry = record; | ||
5247 | |||
5248 | rcu_read_lock(); | ||
5249 | ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); | ||
5250 | if (!ctx) | ||
5251 | goto unlock; | ||
5252 | |||
5253 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | ||
5254 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | ||
5255 | continue; | ||
5256 | if (event->attr.config != entry->type) | ||
5257 | continue; | ||
5258 | if (perf_tp_event_match(event, &data, regs)) | ||
5259 | perf_swevent_event(event, count, &data, regs); | ||
5260 | } | ||
5261 | unlock: | ||
5262 | rcu_read_unlock(); | ||
5263 | } | ||
5264 | |||
5231 | perf_swevent_put_recursion_context(rctx); | 5265 | perf_swevent_put_recursion_context(rctx); |
5232 | } | 5266 | } |
5233 | EXPORT_SYMBOL_GPL(perf_tp_event); | 5267 | EXPORT_SYMBOL_GPL(perf_tp_event); |
@@ -5924,6 +5958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
5924 | 5958 | ||
5925 | mutex_init(&event->mmap_mutex); | 5959 | mutex_init(&event->mmap_mutex); |
5926 | 5960 | ||
5961 | atomic_long_set(&event->refcount, 1); | ||
5927 | event->cpu = cpu; | 5962 | event->cpu = cpu; |
5928 | event->attr = *attr; | 5963 | event->attr = *attr; |
5929 | event->group_leader = group_leader; | 5964 | event->group_leader = group_leader; |
@@ -6234,12 +6269,12 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6234 | return event_fd; | 6269 | return event_fd; |
6235 | 6270 | ||
6236 | if (group_fd != -1) { | 6271 | if (group_fd != -1) { |
6237 | group_leader = perf_fget_light(group_fd, &fput_needed); | 6272 | group_file = perf_fget_light(group_fd, &fput_needed); |
6238 | if (IS_ERR(group_leader)) { | 6273 | if (IS_ERR(group_file)) { |
6239 | err = PTR_ERR(group_leader); | 6274 | err = PTR_ERR(group_file); |
6240 | goto err_fd; | 6275 | goto err_fd; |
6241 | } | 6276 | } |
6242 | group_file = group_leader->filp; | 6277 | group_leader = group_file->private_data; |
6243 | if (flags & PERF_FLAG_FD_OUTPUT) | 6278 | if (flags & PERF_FLAG_FD_OUTPUT) |
6244 | output_event = group_leader; | 6279 | output_event = group_leader; |
6245 | if (flags & PERF_FLAG_FD_NO_GROUP) | 6280 | if (flags & PERF_FLAG_FD_NO_GROUP) |
@@ -6376,7 +6411,6 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6376 | put_ctx(gctx); | 6411 | put_ctx(gctx); |
6377 | } | 6412 | } |
6378 | 6413 | ||
6379 | event->filp = event_file; | ||
6380 | WARN_ON_ONCE(ctx->parent_ctx); | 6414 | WARN_ON_ONCE(ctx->parent_ctx); |
6381 | mutex_lock(&ctx->mutex); | 6415 | mutex_lock(&ctx->mutex); |
6382 | 6416 | ||
@@ -6470,7 +6504,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
6470 | goto err_free; | 6504 | goto err_free; |
6471 | } | 6505 | } |
6472 | 6506 | ||
6473 | event->filp = NULL; | ||
6474 | WARN_ON_ONCE(ctx->parent_ctx); | 6507 | WARN_ON_ONCE(ctx->parent_ctx); |
6475 | mutex_lock(&ctx->mutex); | 6508 | mutex_lock(&ctx->mutex); |
6476 | perf_install_in_context(ctx, event, cpu); | 6509 | perf_install_in_context(ctx, event, cpu); |
@@ -6552,7 +6585,7 @@ static void sync_child_event(struct perf_event *child_event, | |||
6552 | * Release the parent event, if this was the last | 6585 | * Release the parent event, if this was the last |
6553 | * reference to it. | 6586 | * reference to it. |
6554 | */ | 6587 | */ |
6555 | fput(parent_event->filp); | 6588 | put_event(parent_event); |
6556 | } | 6589 | } |
6557 | 6590 | ||
6558 | static void | 6591 | static void |
@@ -6628,9 +6661,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
6628 | * | 6661 | * |
6629 | * __perf_event_exit_task() | 6662 | * __perf_event_exit_task() |
6630 | * sync_child_event() | 6663 | * sync_child_event() |
6631 | * fput(parent_event->filp) | 6664 | * put_event() |
6632 | * perf_release() | 6665 | * mutex_lock(&ctx->mutex) |
6633 | * mutex_lock(&ctx->mutex) | ||
6634 | * | 6666 | * |
6635 | * But since its the parent context it won't be the same instance. | 6667 | * But since its the parent context it won't be the same instance. |
6636 | */ | 6668 | */ |
@@ -6698,7 +6730,7 @@ static void perf_free_event(struct perf_event *event, | |||
6698 | list_del_init(&event->child_list); | 6730 | list_del_init(&event->child_list); |
6699 | mutex_unlock(&parent->child_mutex); | 6731 | mutex_unlock(&parent->child_mutex); |
6700 | 6732 | ||
6701 | fput(parent->filp); | 6733 | put_event(parent); |
6702 | 6734 | ||
6703 | perf_group_detach(event); | 6735 | perf_group_detach(event); |
6704 | list_del_event(event, ctx); | 6736 | list_del_event(event, ctx); |
@@ -6778,6 +6810,12 @@ inherit_event(struct perf_event *parent_event, | |||
6778 | NULL, NULL); | 6810 | NULL, NULL); |
6779 | if (IS_ERR(child_event)) | 6811 | if (IS_ERR(child_event)) |
6780 | return child_event; | 6812 | return child_event; |
6813 | |||
6814 | if (!atomic_long_inc_not_zero(&parent_event->refcount)) { | ||
6815 | free_event(child_event); | ||
6816 | return NULL; | ||
6817 | } | ||
6818 | |||
6781 | get_ctx(child_ctx); | 6819 | get_ctx(child_ctx); |
6782 | 6820 | ||
6783 | /* | 6821 | /* |
@@ -6819,14 +6857,6 @@ inherit_event(struct perf_event *parent_event, | |||
6819 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); | 6857 | raw_spin_unlock_irqrestore(&child_ctx->lock, flags); |
6820 | 6858 | ||
6821 | /* | 6859 | /* |
6822 | * Get a reference to the parent filp - we will fput it | ||
6823 | * when the child event exits. This is safe to do because | ||
6824 | * we are in the parent and we know that the filp still | ||
6825 | * exists and has a nonzero count: | ||
6826 | */ | ||
6827 | atomic_long_inc(&parent_event->filp->f_count); | ||
6828 | |||
6829 | /* | ||
6830 | * Link this into the parent event's child list | 6860 | * Link this into the parent event's child list |
6831 | */ | 6861 | */ |
6832 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); | 6862 | WARN_ON_ONCE(parent_event->ctx->parent_ctx); |
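The events/core.c hunks above replace the old scheme of pinning a perf_event through its file's reference count (event->filp and fput) with a dedicated event->refcount: put_event() drops a reference and only tears the event down on the final put, and inherit_event() takes its reference with atomic_long_inc_not_zero() so a child can never latch onto a parent that is already on its way out. As a rough sketch of that get/put idiom, a userspace illustration only with a made-up object type rather than code from this patch, the pattern looks like this:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative object with a lifetime reference count, analogous in
 * spirit to the event->refcount introduced by the patch. */
struct object {
    atomic_long refcount;   /* holds 1 for the creator */
    int payload;
};

/* Take a reference only if the count is not already zero, in the
 * spirit of atomic_long_inc_not_zero(): never resurrect an object
 * whose last reference has been dropped. */
static bool object_get(struct object *obj)
{
    long old = atomic_load(&obj->refcount);

    while (old != 0) {
        if (atomic_compare_exchange_weak(&obj->refcount, &old, old + 1))
            return true;
    }
    return false;
}

/* Drop a reference; the final put frees the object, as put_event()
 * does for a perf_event. */
static void object_put(struct object *obj)
{
    if (atomic_fetch_sub(&obj->refcount, 1) == 1)
        free(obj);
}

int main(void)
{
    struct object *obj = malloc(sizeof(*obj));

    if (!obj)
        return 1;
    atomic_init(&obj->refcount, 1); /* creator's reference */
    if (object_get(obj))            /* a second user takes a reference */
        object_put(obj);            /* ...and drops it */
    object_put(obj);                /* creator's put frees the object */
    printf("done\n");
    return 0;
}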
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index bb38c4d3ee12..9a7b487c6fe2 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att | |||
453 | int old_type = bp->attr.bp_type; | 453 | int old_type = bp->attr.bp_type; |
454 | int err = 0; | 454 | int err = 0; |
455 | 455 | ||
456 | perf_event_disable(bp); | 456 | /* |
457 | * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it | ||
458 | * will not be possible to raise IPIs that invoke __perf_event_disable. | ||
459 | * So call the function directly after making sure we are targeting the | ||
460 | * current task. | ||
461 | */ | ||
462 | if (irqs_disabled() && bp->ctx && bp->ctx->task == current) | ||
463 | __perf_event_disable(bp); | ||
464 | else | ||
465 | perf_event_disable(bp); | ||
457 | 466 | ||
458 | bp->attr.bp_addr = attr->bp_addr; | 467 | bp->attr.bp_addr = attr->bp_addr; |
459 | bp->attr.bp_type = attr->bp_type; | 468 | bp->attr.bp_type = attr->bp_type; |
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index b0b107f90afc..a096c19f2c2a 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -101,7 +101,8 @@ __output_copy(struct perf_output_handle *handle, | |||
101 | } | 101 | } |
102 | 102 | ||
103 | /* Callchain handling */ | 103 | /* Callchain handling */ |
104 | extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); | 104 | extern struct perf_callchain_entry * |
105 | perf_callchain(struct perf_event *event, struct pt_regs *regs); | ||
105 | extern int get_callchain_buffers(void); | 106 | extern int get_callchain_buffers(void); |
106 | extern void put_callchain_buffers(void); | 107 | extern void put_callchain_buffers(void); |
107 | 108 | ||
diff --git a/kernel/fork.c b/kernel/fork.c
index 3bd2280d79f6..2c8857e12855 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -455,8 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
455 | if (retval) | 455 | if (retval) |
456 | goto out; | 456 | goto out; |
457 | 457 | ||
458 | if (file && uprobe_mmap(tmp)) | 458 | if (file) |
459 | goto out; | 459 | uprobe_mmap(tmp); |
460 | } | 460 | } |
461 | /* a new mm has just been created */ | 461 | /* a new mm has just been created */ |
462 | arch_dup_mmap(oldmm, mm); | 462 | arch_dup_mmap(oldmm, mm); |
diff --git a/kernel/futex.c b/kernel/futex.c
index e2b0fb9a0b3b..3717e7b306e0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, | |||
2231 | * @uaddr2: the pi futex we will take prior to returning to user-space | 2231 | * @uaddr2: the pi futex we will take prior to returning to user-space |
2232 | * | 2232 | * |
2233 | * The caller will wait on uaddr and will be requeued by futex_requeue() to | 2233 | * The caller will wait on uaddr and will be requeued by futex_requeue() to |
2234 | * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and | 2234 | * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake |
2235 | * complete the acquisition of the rt_mutex prior to returning to userspace. | 2235 | * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to |
2236 | * This ensures the rt_mutex maintains an owner when it has waiters; without | 2236 | * userspace. This ensures the rt_mutex maintains an owner when it has waiters; |
2237 | * one, the pi logic wouldn't know which task to boost/deboost, if there was a | 2237 | * without one, the pi logic would not know which task to boost/deboost, if |
2238 | * need to. | 2238 | * there was a need to. |
2239 | * | 2239 | * |
2240 | * We call schedule in futex_wait_queue_me() when we enqueue and return there | 2240 | * We call schedule in futex_wait_queue_me() when we enqueue and return there |
2241 | * via the following: | 2241 | * via the following: |
@@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2272 | struct futex_q q = futex_q_init; | 2272 | struct futex_q q = futex_q_init; |
2273 | int res, ret; | 2273 | int res, ret; |
2274 | 2274 | ||
2275 | if (uaddr == uaddr2) | ||
2276 | return -EINVAL; | ||
2277 | |||
2275 | if (!bitset) | 2278 | if (!bitset) |
2276 | return -EINVAL; | 2279 | return -EINVAL; |
2277 | 2280 | ||
@@ -2343,7 +2346,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2343 | * signal. futex_unlock_pi() will not destroy the lock_ptr nor | 2346 | * signal. futex_unlock_pi() will not destroy the lock_ptr nor |
2344 | * the pi_state. | 2347 | * the pi_state. |
2345 | */ | 2348 | */ |
2346 | WARN_ON(!&q.pi_state); | 2349 | WARN_ON(!q.pi_state); |
2347 | pi_mutex = &q.pi_state->pi_mutex; | 2350 | pi_mutex = &q.pi_state->pi_mutex; |
2348 | ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); | 2351 | ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); |
2349 | debug_rt_mutex_free_waiter(&rt_waiter); | 2352 | debug_rt_mutex_free_waiter(&rt_waiter); |
@@ -2370,7 +2373,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
2370 | * fault, unlock the rt_mutex and return the fault to userspace. | 2373 | * fault, unlock the rt_mutex and return the fault to userspace. |
2371 | */ | 2374 | */ |
2372 | if (ret == -EFAULT) { | 2375 | if (ret == -EFAULT) { |
2373 | if (rt_mutex_owner(pi_mutex) == current) | 2376 | if (pi_mutex && rt_mutex_owner(pi_mutex) == current) |
2374 | rt_mutex_unlock(pi_mutex); | 2377 | rt_mutex_unlock(pi_mutex); |
2375 | } else if (ret == -EINTR) { | 2378 | } else if (ret == -EINTR) { |
2376 | /* | 2379 | /* |
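The futex.c change adds an explicit uaddr == uaddr2 check to futex_wait_requeue_pi(), since waiting on a futex and requeueing onto that same futex is a combination the requeue-PI code was not designed to handle (the updated comment now says uaddr2 must be "unique from uaddr"), and it also guards the -EFAULT unwind against a NULL pi_mutex. Assuming the usual raw six-argument futex syscall convention (glibc provides no wrapper), a minimal userspace probe of the new check might look like the sketch below; on a kernel with this change the call fails immediately with EINVAL instead of blocking:

#include <errno.h>
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint32_t f;      /* futex word, value 0 */

int main(void)
{
    /* The same word is passed as both the wait futex and the requeue
     * target: with the check above this returns EINVAL right away
     * (an unpatched kernel would block here, since *uaddr == val). */
    long ret = syscall(SYS_futex, &f, FUTEX_WAIT_REQUEUE_PI, 0,
                       NULL, &f, 0);

    if (ret < 0)
        printf("futex: %s\n", strerror(errno));
    return 0;
}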
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0a8e8f059627..4c69326aa773 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -944,6 +944,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
944 | } | 944 | } |
945 | 945 | ||
946 | /* | 946 | /* |
947 | * Drivers are often written to work w/o knowledge about the | ||
948 | * underlying irq chip implementation, so a request for a | ||
949 | * threaded irq without a primary hard irq context handler | ||
950 | * requires the ONESHOT flag to be set. Some irq chips like | ||
951 | * MSI based interrupts are per se one shot safe. Check the | ||
952 | * chip flags, so we can avoid the unmask dance at the end of | ||
953 | * the threaded handler for those. | ||
954 | */ | ||
955 | if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) | ||
956 | new->flags &= ~IRQF_ONESHOT; | ||
957 | |||
958 | /* | ||
947 | * The following block of code has to be executed atomically | 959 | * The following block of code has to be executed atomically |
948 | */ | 960 | */ |
949 | raw_spin_lock_irqsave(&desc->lock, flags); | 961 | raw_spin_lock_irqsave(&desc->lock, flags); |
@@ -1017,7 +1029,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1017 | */ | 1029 | */ |
1018 | new->thread_mask = 1 << ffz(thread_mask); | 1030 | new->thread_mask = 1 << ffz(thread_mask); |
1019 | 1031 | ||
1020 | } else if (new->handler == irq_default_primary_handler) { | 1032 | } else if (new->handler == irq_default_primary_handler && |
1033 | !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { | ||
1021 | /* | 1034 | /* |
1022 | * The interrupt was requested with handler = NULL, so | 1035 | * The interrupt was requested with handler = NULL, so |
1023 | * we use the default primary handler for it. But it | 1036 | * we use the default primary handler for it. But it |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index b3c7fd554250..6144bab8fd8e 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -232,15 +232,19 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, | |||
232 | */ | 232 | */ |
233 | 233 | ||
234 | tmp.data = ¤t->nsproxy->pid_ns->last_pid; | 234 | tmp.data = ¤t->nsproxy->pid_ns->last_pid; |
235 | return proc_dointvec(&tmp, write, buffer, lenp, ppos); | 235 | return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); |
236 | } | 236 | } |
237 | 237 | ||
238 | extern int pid_max; | ||
239 | static int zero = 0; | ||
238 | static struct ctl_table pid_ns_ctl_table[] = { | 240 | static struct ctl_table pid_ns_ctl_table[] = { |
239 | { | 241 | { |
240 | .procname = "ns_last_pid", | 242 | .procname = "ns_last_pid", |
241 | .maxlen = sizeof(int), | 243 | .maxlen = sizeof(int), |
242 | .mode = 0666, /* permissions are checked in the handler */ | 244 | .mode = 0666, /* permissions are checked in the handler */ |
243 | .proc_handler = pid_ns_ctl_handler, | 245 | .proc_handler = pid_ns_ctl_handler, |
246 | .extra1 = &zero, | ||
247 | .extra2 = &pid_max, | ||
244 | }, | 248 | }, |
245 | { } | 249 | { } |
246 | }; | 250 | }; |
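Switching ns_last_pid to proc_dointvec_minmax() with extra1/extra2 bounds means a write outside [0, pid_max] is now rejected with EINVAL rather than stored verbatim. A small userspace check of that behaviour (an illustrative probe, not part of the patch; it needs sufficient privileges despite the 0666 mode, since the handler does its own permission check, and error handling is kept minimal):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/proc/sys/kernel/ns_last_pid", O_WRONLY);

    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* -1 is below the new lower bound of 0, so the write is now
     * rejected (EINVAL) instead of being stored as the last pid. */
    if (write(fd, "-1", 2) < 0)
        printf("write rejected: %s\n", strerror(errno));
    close(fd);
    return 0;
}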
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1da39ea248fd..c8b7446b27df 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -178,9 +178,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |||
178 | arch_suspend_enable_irqs(); | 178 | arch_suspend_enable_irqs(); |
179 | BUG_ON(irqs_disabled()); | 179 | BUG_ON(irqs_disabled()); |
180 | 180 | ||
181 | /* Kick the lockup detector */ | ||
182 | lockup_detector_bootcpu_resume(); | ||
183 | |||
184 | Enable_cpus: | 181 | Enable_cpus: |
185 | enable_nonboot_cpus(); | 182 | enable_nonboot_cpus(); |
186 | 183 | ||
diff --git a/kernel/printk.c b/kernel/printk.c
index 6a76ab9d4476..66a2ea37b576 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1034,6 +1034,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
1034 | struct log *msg = log_from_idx(idx); | 1034 | struct log *msg = log_from_idx(idx); |
1035 | 1035 | ||
1036 | len += msg_print_text(msg, prev, true, NULL, 0); | 1036 | len += msg_print_text(msg, prev, true, NULL, 0); |
1037 | prev = msg->flags; | ||
1037 | idx = log_next(idx); | 1038 | idx = log_next(idx); |
1038 | seq++; | 1039 | seq++; |
1039 | } | 1040 | } |
@@ -1046,6 +1047,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |||
1046 | struct log *msg = log_from_idx(idx); | 1047 | struct log *msg = log_from_idx(idx); |
1047 | 1048 | ||
1048 | len -= msg_print_text(msg, prev, true, NULL, 0); | 1049 | len -= msg_print_text(msg, prev, true, NULL, 0); |
1050 | prev = msg->flags; | ||
1049 | idx = log_next(idx); | 1051 | idx = log_next(idx); |
1050 | seq++; | 1052 | seq++; |
1051 | } | 1053 | } |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d325c4b2dcbb..649c9f876cb1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3142 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | 3142 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) |
3143 | #endif | 3143 | #endif |
3144 | 3144 | ||
3145 | static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) | ||
3146 | { | ||
3147 | u64 temp = (__force u64) rtime; | ||
3148 | |||
3149 | temp *= (__force u64) utime; | ||
3150 | |||
3151 | if (sizeof(cputime_t) == 4) | ||
3152 | temp = div_u64(temp, (__force u32) total); | ||
3153 | else | ||
3154 | temp = div64_u64(temp, (__force u64) total); | ||
3155 | |||
3156 | return (__force cputime_t) temp; | ||
3157 | } | ||
3158 | |||
3145 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 3159 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
3146 | { | 3160 | { |
3147 | cputime_t rtime, utime = p->utime, total = utime + p->stime; | 3161 | cputime_t rtime, utime = p->utime, total = utime + p->stime; |
@@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3151 | */ | 3165 | */ |
3152 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | 3166 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); |
3153 | 3167 | ||
3154 | if (total) { | 3168 | if (total) |
3155 | u64 temp = (__force u64) rtime; | 3169 | utime = scale_utime(utime, rtime, total); |
3156 | 3170 | else | |
3157 | temp *= (__force u64) utime; | ||
3158 | do_div(temp, (__force u32) total); | ||
3159 | utime = (__force cputime_t) temp; | ||
3160 | } else | ||
3161 | utime = rtime; | 3171 | utime = rtime; |
3162 | 3172 | ||
3163 | /* | 3173 | /* |
@@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3184 | total = cputime.utime + cputime.stime; | 3194 | total = cputime.utime + cputime.stime; |
3185 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | 3195 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); |
3186 | 3196 | ||
3187 | if (total) { | 3197 | if (total) |
3188 | u64 temp = (__force u64) rtime; | 3198 | utime = scale_utime(cputime.utime, rtime, total); |
3189 | 3199 | else | |
3190 | temp *= (__force u64) cputime.utime; | ||
3191 | do_div(temp, (__force u32) total); | ||
3192 | utime = (__force cputime_t) temp; | ||
3193 | } else | ||
3194 | utime = rtime; | 3200 | utime = rtime; |
3195 | 3201 | ||
3196 | sig->prev_utime = max(sig->prev_utime, utime); | 3202 | sig->prev_utime = max(sig->prev_utime, utime); |
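The scale_utime() helper introduced above computes rtime * utime / total, picking div_u64() when cputime_t is four bytes and div64_u64() otherwise. The apparent motivation (an inference from the diff, not stated in it) is that the old inline code handed total to do_div(), which only accepts a 32-bit divisor, so a cputime_t wider than 32 bits was silently truncated. A standalone C sketch of what that truncation does to the scaled result:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Values picked so that rtime * utime still fits in 64 bits,
     * while total is larger than 32 bits can hold. */
    uint64_t rtime = 5000000000ULL;
    uint64_t utime = 3000000000ULL;
    uint64_t total = 5000000000ULL; /* utime + stime */

    /* Full 64-by-64 division, what div64_u64() provides. */
    uint64_t scaled = rtime * utime / total;

    /* Divisor truncated to 32 bits, the effect of handing a wide
     * total to do_div(temp, (u32)total). */
    uint64_t truncated = rtime * utime / (uint32_t)total;

    printf("64-bit divisor: %" PRIu64 "\n", scaled);    /* 3000000000 */
    printf("32-bit divisor: %" PRIu64 "\n", truncated); /* wildly inflated */
    return 0;
}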
@@ -4340,9 +4346,7 @@ recheck: | |||
4340 | */ | 4346 | */ |
4341 | if (unlikely(policy == p->policy && (!rt_policy(policy) || | 4347 | if (unlikely(policy == p->policy && (!rt_policy(policy) || |
4342 | param->sched_priority == p->rt_priority))) { | 4348 | param->sched_priority == p->rt_priority))) { |
4343 | 4349 | task_rq_unlock(rq, p, &flags); | |
4344 | __task_rq_unlock(rq); | ||
4345 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4346 | return 0; | 4350 | return 0; |
4347 | } | 4351 | } |
4348 | 4352 | ||
@@ -5300,27 +5304,17 @@ void idle_task_exit(void) | |||
5300 | } | 5304 | } |
5301 | 5305 | ||
5302 | /* | 5306 | /* |
5303 | * While a dead CPU has no uninterruptible tasks queued at this point, | 5307 | * Since this CPU is going 'away' for a while, fold any nr_active delta |
5304 | * it might still have a nonzero ->nr_uninterruptible counter, because | 5308 | * we might have. Assumes we're called after migrate_tasks() so that the |
5305 | * for performance reasons the counter is not stricly tracking tasks to | 5309 | * nr_active count is stable. |
5306 | * their home CPUs. So we just add the counter to another CPU's counter, | 5310 | * |
5307 | * to keep the global sum constant after CPU-down: | 5311 | * Also see the comment "Global load-average calculations". |
5308 | */ | ||
5309 | static void migrate_nr_uninterruptible(struct rq *rq_src) | ||
5310 | { | ||
5311 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); | ||
5312 | |||
5313 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | ||
5314 | rq_src->nr_uninterruptible = 0; | ||
5315 | } | ||
5316 | |||
5317 | /* | ||
5318 | * remove the tasks which were accounted by rq from calc_load_tasks. | ||
5319 | */ | 5312 | */ |
5320 | static void calc_global_load_remove(struct rq *rq) | 5313 | static void calc_load_migrate(struct rq *rq) |
5321 | { | 5314 | { |
5322 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); | 5315 | long delta = calc_load_fold_active(rq); |
5323 | rq->calc_load_active = 0; | 5316 | if (delta) |
5317 | atomic_long_add(delta, &calc_load_tasks); | ||
5324 | } | 5318 | } |
5325 | 5319 | ||
5326 | /* | 5320 | /* |
@@ -5348,9 +5342,6 @@ static void migrate_tasks(unsigned int dead_cpu) | |||
5348 | */ | 5342 | */ |
5349 | rq->stop = NULL; | 5343 | rq->stop = NULL; |
5350 | 5344 | ||
5351 | /* Ensure any throttled groups are reachable by pick_next_task */ | ||
5352 | unthrottle_offline_cfs_rqs(rq); | ||
5353 | |||
5354 | for ( ; ; ) { | 5345 | for ( ; ; ) { |
5355 | /* | 5346 | /* |
5356 | * There's this thread running, bail when that's the only | 5347 | * There's this thread running, bail when that's the only |
@@ -5614,8 +5605,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5614 | BUG_ON(rq->nr_running != 1); /* the migration thread */ | 5605 | BUG_ON(rq->nr_running != 1); /* the migration thread */ |
5615 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 5606 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
5616 | 5607 | ||
5617 | migrate_nr_uninterruptible(rq); | 5608 | calc_load_migrate(rq); |
5618 | calc_global_load_remove(rq); | ||
5619 | break; | 5609 | break; |
5620 | #endif | 5610 | #endif |
5621 | } | 5611 | } |
@@ -6024,11 +6014,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
6024 | * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this | 6014 | * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this |
6025 | * allows us to avoid some pointer chasing select_idle_sibling(). | 6015 | * allows us to avoid some pointer chasing select_idle_sibling(). |
6026 | * | 6016 | * |
6027 | * Iterate domains and sched_groups downward, assigning CPUs to be | ||
6028 | * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing | ||
6029 | * due to random perturbation self canceling, ie sw buddies pull | ||
6030 | * their counterpart to their CPU's hw counterpart. | ||
6031 | * | ||
6032 | * Also keep a unique ID per domain (we use the first cpu number in | 6017 | * Also keep a unique ID per domain (we use the first cpu number in |
6033 | * the cpumask of the domain), this allows us to quickly tell if | 6018 | * the cpumask of the domain), this allows us to quickly tell if |
6034 | * two cpus are in the same cache domain, see cpus_share_cache(). | 6019 | * two cpus are in the same cache domain, see cpus_share_cache(). |
@@ -6042,40 +6027,8 @@ static void update_top_cache_domain(int cpu) | |||
6042 | int id = cpu; | 6027 | int id = cpu; |
6043 | 6028 | ||
6044 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | 6029 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); |
6045 | if (sd) { | 6030 | if (sd) |
6046 | struct sched_domain *tmp = sd; | ||
6047 | struct sched_group *sg, *prev; | ||
6048 | bool right; | ||
6049 | |||
6050 | /* | ||
6051 | * Traverse to first CPU in group, and count hops | ||
6052 | * to cpu from there, switching direction on each | ||
6053 | * hop, never ever pointing the last CPU rightward. | ||
6054 | */ | ||
6055 | do { | ||
6056 | id = cpumask_first(sched_domain_span(tmp)); | ||
6057 | prev = sg = tmp->groups; | ||
6058 | right = 1; | ||
6059 | |||
6060 | while (cpumask_first(sched_group_cpus(sg)) != id) | ||
6061 | sg = sg->next; | ||
6062 | |||
6063 | while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { | ||
6064 | prev = sg; | ||
6065 | sg = sg->next; | ||
6066 | right = !right; | ||
6067 | } | ||
6068 | |||
6069 | /* A CPU went down, never point back to domain start. */ | ||
6070 | if (right && cpumask_first(sched_group_cpus(sg->next)) == id) | ||
6071 | right = false; | ||
6072 | |||
6073 | sg = right ? sg->next : prev; | ||
6074 | tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); | ||
6075 | } while ((tmp = tmp->child)); | ||
6076 | |||
6077 | id = cpumask_first(sched_domain_span(sd)); | 6031 | id = cpumask_first(sched_domain_span(sd)); |
6078 | } | ||
6079 | 6032 | ||
6080 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | 6033 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
6081 | per_cpu(sd_llc_id, cpu) = id; | 6034 | per_cpu(sd_llc_id, cpu) = id; |
@@ -7248,6 +7201,7 @@ int in_sched_functions(unsigned long addr) | |||
7248 | 7201 | ||
7249 | #ifdef CONFIG_CGROUP_SCHED | 7202 | #ifdef CONFIG_CGROUP_SCHED |
7250 | struct task_group root_task_group; | 7203 | struct task_group root_task_group; |
7204 | LIST_HEAD(task_groups); | ||
7251 | #endif | 7205 | #endif |
7252 | 7206 | ||
7253 | DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 7207 | DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d72586fdf660..23aa789c53ee 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -65,8 +65,8 @@ static int convert_prio(int prio) | |||
65 | int cpupri_find(struct cpupri *cp, struct task_struct *p, | 65 | int cpupri_find(struct cpupri *cp, struct task_struct *p, |
66 | struct cpumask *lowest_mask) | 66 | struct cpumask *lowest_mask) |
67 | { | 67 | { |
68 | int idx = 0; | 68 | int idx = 0; |
69 | int task_pri = convert_prio(p->prio); | 69 | int task_pri = convert_prio(p->prio); |
70 | 70 | ||
71 | if (task_pri >= MAX_RT_PRIO) | 71 | if (task_pri >= MAX_RT_PRIO) |
72 | return 0; | 72 | return 0; |
@@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
137 | */ | 137 | */ |
138 | void cpupri_set(struct cpupri *cp, int cpu, int newpri) | 138 | void cpupri_set(struct cpupri *cp, int cpu, int newpri) |
139 | { | 139 | { |
140 | int *currpri = &cp->cpu_to_pri[cpu]; | 140 | int *currpri = &cp->cpu_to_pri[cpu]; |
141 | int oldpri = *currpri; | 141 | int oldpri = *currpri; |
142 | int do_mb = 0; | 142 | int do_mb = 0; |
143 | 143 | ||
144 | newpri = convert_prio(newpri); | 144 | newpri = convert_prio(newpri); |
145 | 145 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 22321db64952..96e2b18b6283 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
2052 | hrtimer_cancel(&cfs_b->slack_timer); | 2052 | hrtimer_cancel(&cfs_b->slack_timer); |
2053 | } | 2053 | } |
2054 | 2054 | ||
2055 | void unthrottle_offline_cfs_rqs(struct rq *rq) | 2055 | static void unthrottle_offline_cfs_rqs(struct rq *rq) |
2056 | { | 2056 | { |
2057 | struct cfs_rq *cfs_rq; | 2057 | struct cfs_rq *cfs_rq; |
2058 | 2058 | ||
@@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | |||
2106 | return NULL; | 2106 | return NULL; |
2107 | } | 2107 | } |
2108 | static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | 2108 | static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} |
2109 | void unthrottle_offline_cfs_rqs(struct rq *rq) {} | 2109 | static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} |
2110 | 2110 | ||
2111 | #endif /* CONFIG_CFS_BANDWIDTH */ | 2111 | #endif /* CONFIG_CFS_BANDWIDTH */ |
2112 | 2112 | ||
@@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
2637 | int cpu = smp_processor_id(); | 2637 | int cpu = smp_processor_id(); |
2638 | int prev_cpu = task_cpu(p); | 2638 | int prev_cpu = task_cpu(p); |
2639 | struct sched_domain *sd; | 2639 | struct sched_domain *sd; |
2640 | struct sched_group *sg; | ||
2641 | int i; | ||
2640 | 2642 | ||
2641 | /* | 2643 | /* |
2642 | * If the task is going to be woken-up on this cpu and if it is | 2644 | * If the task is going to be woken-up on this cpu and if it is |
@@ -2653,17 +2655,29 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
2653 | return prev_cpu; | 2655 | return prev_cpu; |
2654 | 2656 | ||
2655 | /* | 2657 | /* |
2656 | * Otherwise, check assigned siblings to find an elegible idle cpu. | 2658 | * Otherwise, iterate the domains and find an elegible idle cpu. |
2657 | */ | 2659 | */ |
2658 | sd = rcu_dereference(per_cpu(sd_llc, target)); | 2660 | sd = rcu_dereference(per_cpu(sd_llc, target)); |
2659 | |||
2660 | for_each_lower_domain(sd) { | 2661 | for_each_lower_domain(sd) { |
2661 | if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) | 2662 | sg = sd->groups; |
2662 | continue; | 2663 | do { |
2663 | if (idle_cpu(sd->idle_buddy)) | 2664 | if (!cpumask_intersects(sched_group_cpus(sg), |
2664 | return sd->idle_buddy; | 2665 | tsk_cpus_allowed(p))) |
2665 | } | 2666 | goto next; |
2666 | 2667 | ||
2668 | for_each_cpu(i, sched_group_cpus(sg)) { | ||
2669 | if (!idle_cpu(i)) | ||
2670 | goto next; | ||
2671 | } | ||
2672 | |||
2673 | target = cpumask_first_and(sched_group_cpus(sg), | ||
2674 | tsk_cpus_allowed(p)); | ||
2675 | goto done; | ||
2676 | next: | ||
2677 | sg = sg->next; | ||
2678 | } while (sg != sd->groups); | ||
2679 | } | ||
2680 | done: | ||
2667 | return target; | 2681 | return target; |
2668 | } | 2682 | } |
2669 | 2683 | ||
@@ -3069,6 +3083,9 @@ struct lb_env { | |||
3069 | int new_dst_cpu; | 3083 | int new_dst_cpu; |
3070 | enum cpu_idle_type idle; | 3084 | enum cpu_idle_type idle; |
3071 | long imbalance; | 3085 | long imbalance; |
3086 | /* The set of CPUs under consideration for load-balancing */ | ||
3087 | struct cpumask *cpus; | ||
3088 | |||
3072 | unsigned int flags; | 3089 | unsigned int flags; |
3073 | 3090 | ||
3074 | unsigned int loop; | 3091 | unsigned int loop; |
@@ -3384,6 +3401,14 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
3384 | 3401 | ||
3385 | static void update_h_load(long cpu) | 3402 | static void update_h_load(long cpu) |
3386 | { | 3403 | { |
3404 | struct rq *rq = cpu_rq(cpu); | ||
3405 | unsigned long now = jiffies; | ||
3406 | |||
3407 | if (rq->h_load_throttle == now) | ||
3408 | return; | ||
3409 | |||
3410 | rq->h_load_throttle = now; | ||
3411 | |||
3387 | rcu_read_lock(); | 3412 | rcu_read_lock(); |
3388 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 3413 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
3389 | rcu_read_unlock(); | 3414 | rcu_read_unlock(); |
@@ -3647,14 +3672,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
3647 | * @group: sched_group whose statistics are to be updated. | 3672 | * @group: sched_group whose statistics are to be updated. |
3648 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 3673 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
3649 | * @local_group: Does group contain this_cpu. | 3674 | * @local_group: Does group contain this_cpu. |
3650 | * @cpus: Set of cpus considered for load balancing. | ||
3651 | * @balance: Should we balance. | 3675 | * @balance: Should we balance. |
3652 | * @sgs: variable to hold the statistics for this group. | 3676 | * @sgs: variable to hold the statistics for this group. |
3653 | */ | 3677 | */ |
3654 | static inline void update_sg_lb_stats(struct lb_env *env, | 3678 | static inline void update_sg_lb_stats(struct lb_env *env, |
3655 | struct sched_group *group, int load_idx, | 3679 | struct sched_group *group, int load_idx, |
3656 | int local_group, const struct cpumask *cpus, | 3680 | int local_group, int *balance, struct sg_lb_stats *sgs) |
3657 | int *balance, struct sg_lb_stats *sgs) | ||
3658 | { | 3681 | { |
3659 | unsigned long nr_running, max_nr_running, min_nr_running; | 3682 | unsigned long nr_running, max_nr_running, min_nr_running; |
3660 | unsigned long load, max_cpu_load, min_cpu_load; | 3683 | unsigned long load, max_cpu_load, min_cpu_load; |
@@ -3671,7 +3694,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
3671 | max_nr_running = 0; | 3694 | max_nr_running = 0; |
3672 | min_nr_running = ~0UL; | 3695 | min_nr_running = ~0UL; |
3673 | 3696 | ||
3674 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 3697 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { |
3675 | struct rq *rq = cpu_rq(i); | 3698 | struct rq *rq = cpu_rq(i); |
3676 | 3699 | ||
3677 | nr_running = rq->nr_running; | 3700 | nr_running = rq->nr_running; |
@@ -3795,13 +3818,11 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
3795 | /** | 3818 | /** |
3796 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 3819 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
3797 | * @env: The load balancing environment. | 3820 | * @env: The load balancing environment. |
3798 | * @cpus: Set of cpus considered for load balancing. | ||
3799 | * @balance: Should we balance. | 3821 | * @balance: Should we balance. |
3800 | * @sds: variable to hold the statistics for this sched_domain. | 3822 | * @sds: variable to hold the statistics for this sched_domain. |
3801 | */ | 3823 | */ |
3802 | static inline void update_sd_lb_stats(struct lb_env *env, | 3824 | static inline void update_sd_lb_stats(struct lb_env *env, |
3803 | const struct cpumask *cpus, | 3825 | int *balance, struct sd_lb_stats *sds) |
3804 | int *balance, struct sd_lb_stats *sds) | ||
3805 | { | 3826 | { |
3806 | struct sched_domain *child = env->sd->child; | 3827 | struct sched_domain *child = env->sd->child; |
3807 | struct sched_group *sg = env->sd->groups; | 3828 | struct sched_group *sg = env->sd->groups; |
@@ -3818,8 +3839,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
3818 | 3839 | ||
3819 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); | 3840 | local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); |
3820 | memset(&sgs, 0, sizeof(sgs)); | 3841 | memset(&sgs, 0, sizeof(sgs)); |
3821 | update_sg_lb_stats(env, sg, load_idx, local_group, | 3842 | update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); |
3822 | cpus, balance, &sgs); | ||
3823 | 3843 | ||
3824 | if (local_group && !(*balance)) | 3844 | if (local_group && !(*balance)) |
3825 | return; | 3845 | return; |
@@ -4055,7 +4075,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4055 | * to restore balance. | 4075 | * to restore balance. |
4056 | * | 4076 | * |
4057 | * @env: The load balancing environment. | 4077 | * @env: The load balancing environment. |
4058 | * @cpus: The set of CPUs under consideration for load-balancing. | ||
4059 | * @balance: Pointer to a variable indicating if this_cpu | 4078 | * @balance: Pointer to a variable indicating if this_cpu |
4060 | * is the appropriate cpu to perform load balancing at this_level. | 4079 | * is the appropriate cpu to perform load balancing at this_level. |
4061 | * | 4080 | * |
@@ -4065,7 +4084,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
4065 | * put to idle by rebalancing its tasks onto our group. | 4084 | * put to idle by rebalancing its tasks onto our group. |
4066 | */ | 4085 | */ |
4067 | static struct sched_group * | 4086 | static struct sched_group * |
4068 | find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) | 4087 | find_busiest_group(struct lb_env *env, int *balance) |
4069 | { | 4088 | { |
4070 | struct sd_lb_stats sds; | 4089 | struct sd_lb_stats sds; |
4071 | 4090 | ||
@@ -4075,7 +4094,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance) | |||
4075 | * Compute the various statistics relavent for load balancing at | 4094 | * Compute the various statistics relavent for load balancing at |
4076 | * this level. | 4095 | * this level. |
4077 | */ | 4096 | */ |
4078 | update_sd_lb_stats(env, cpus, balance, &sds); | 4097 | update_sd_lb_stats(env, balance, &sds); |
4079 | 4098 | ||
4080 | /* | 4099 | /* |
4081 | * this_cpu is not the appropriate cpu to perform load balancing at | 4100 | * this_cpu is not the appropriate cpu to perform load balancing at |
@@ -4155,8 +4174,7 @@ ret: | |||
4155 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 4174 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
4156 | */ | 4175 | */ |
4157 | static struct rq *find_busiest_queue(struct lb_env *env, | 4176 | static struct rq *find_busiest_queue(struct lb_env *env, |
4158 | struct sched_group *group, | 4177 | struct sched_group *group) |
4159 | const struct cpumask *cpus) | ||
4160 | { | 4178 | { |
4161 | struct rq *busiest = NULL, *rq; | 4179 | struct rq *busiest = NULL, *rq; |
4162 | unsigned long max_load = 0; | 4180 | unsigned long max_load = 0; |
@@ -4171,7 +4189,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4171 | if (!capacity) | 4189 | if (!capacity) |
4172 | capacity = fix_small_capacity(env->sd, group); | 4190 | capacity = fix_small_capacity(env->sd, group); |
4173 | 4191 | ||
4174 | if (!cpumask_test_cpu(i, cpus)) | 4192 | if (!cpumask_test_cpu(i, env->cpus)) |
4175 | continue; | 4193 | continue; |
4176 | 4194 | ||
4177 | rq = cpu_rq(i); | 4195 | rq = cpu_rq(i); |
@@ -4252,6 +4270,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4252 | .dst_grpmask = sched_group_cpus(sd->groups), | 4270 | .dst_grpmask = sched_group_cpus(sd->groups), |
4253 | .idle = idle, | 4271 | .idle = idle, |
4254 | .loop_break = sched_nr_migrate_break, | 4272 | .loop_break = sched_nr_migrate_break, |
4273 | .cpus = cpus, | ||
4255 | }; | 4274 | }; |
4256 | 4275 | ||
4257 | cpumask_copy(cpus, cpu_active_mask); | 4276 | cpumask_copy(cpus, cpu_active_mask); |
@@ -4260,7 +4279,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4260 | schedstat_inc(sd, lb_count[idle]); | 4279 | schedstat_inc(sd, lb_count[idle]); |
4261 | 4280 | ||
4262 | redo: | 4281 | redo: |
4263 | group = find_busiest_group(&env, cpus, balance); | 4282 | group = find_busiest_group(&env, balance); |
4264 | 4283 | ||
4265 | if (*balance == 0) | 4284 | if (*balance == 0) |
4266 | goto out_balanced; | 4285 | goto out_balanced; |
@@ -4270,7 +4289,7 @@ redo: | |||
4270 | goto out_balanced; | 4289 | goto out_balanced; |
4271 | } | 4290 | } |
4272 | 4291 | ||
4273 | busiest = find_busiest_queue(&env, group, cpus); | 4292 | busiest = find_busiest_queue(&env, group); |
4274 | if (!busiest) { | 4293 | if (!busiest) { |
4275 | schedstat_inc(sd, lb_nobusyq[idle]); | 4294 | schedstat_inc(sd, lb_nobusyq[idle]); |
4276 | goto out_balanced; | 4295 | goto out_balanced; |
@@ -4294,11 +4313,10 @@ redo: | |||
4294 | env.src_rq = busiest; | 4313 | env.src_rq = busiest; |
4295 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 4314 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); |
4296 | 4315 | ||
4316 | update_h_load(env.src_cpu); | ||
4297 | more_balance: | 4317 | more_balance: |
4298 | local_irq_save(flags); | 4318 | local_irq_save(flags); |
4299 | double_rq_lock(this_rq, busiest); | 4319 | double_rq_lock(this_rq, busiest); |
4300 | if (!env.loop) | ||
4301 | update_h_load(env.src_cpu); | ||
4302 | 4320 | ||
4303 | /* | 4321 | /* |
4304 | * cur_ld_moved - load moved in current iteration | 4322 | * cur_ld_moved - load moved in current iteration |
@@ -4950,6 +4968,9 @@ static void rq_online_fair(struct rq *rq) | |||
4950 | static void rq_offline_fair(struct rq *rq) | 4968 | static void rq_offline_fair(struct rq *rq) |
4951 | { | 4969 | { |
4952 | update_sysctl(); | 4970 | update_sysctl(); |
4971 | |||
4972 | /* Ensure any throttled groups are reachable by pick_next_task */ | ||
4973 | unthrottle_offline_cfs_rqs(rq); | ||
4953 | } | 4974 | } |
4954 | 4975 | ||
4955 | #endif /* CONFIG_SMP */ | 4976 | #endif /* CONFIG_SMP */ |
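Note: the fair.c hunks above fold the candidate-CPU mask into struct lb_env (the new .cpus member), so find_busiest_group() and find_busiest_queue() no longer take it as an extra parameter. A minimal userspace sketch of that refactoring pattern, with toy types and hypothetical names rather than the kernel's real definitions:

    #include <stdbool.h>

    struct cpumask { unsigned long bits; };          /* toy stand-in for the kernel type */

    static bool cpumask_test_cpu(int cpu, const struct cpumask *m)
    {
            return m->bits & (1UL << cpu);
    }

    /* The mask rides along in the balancing environment instead of
     * being threaded through every helper's argument list. */
    struct lb_env {
            int dst_cpu;
            struct cpumask *cpus;    /* candidate CPUs, set once by the caller */
    };

    static int find_busiest_cpu(struct lb_env *env, int ncpus)
    {
            int i, busiest = -1;

            for (i = 0; i < ncpus; i++) {
                    if (!cpumask_test_cpu(i, env->cpus))  /* was: cpumask_test_cpu(i, cpus) */
                            continue;
                    busiest = i;     /* the real code compares weighted load here */
            }
            return busiest;
    }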
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 573e1ca01102..e0b7ba9c040f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -691,6 +691,7 @@ balanced: | |||
691 | * runtime - in which case borrowing doesn't make sense. | 691 | * runtime - in which case borrowing doesn't make sense. |
692 | */ | 692 | */ |
693 | rt_rq->rt_runtime = RUNTIME_INF; | 693 | rt_rq->rt_runtime = RUNTIME_INF; |
694 | rt_rq->rt_throttled = 0; | ||
694 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 695 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
695 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 696 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
696 | } | 697 | } |
@@ -788,6 +789,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
788 | const struct cpumask *span; | 789 | const struct cpumask *span; |
789 | 790 | ||
790 | span = sched_rt_period_mask(); | 791 | span = sched_rt_period_mask(); |
792 | #ifdef CONFIG_RT_GROUP_SCHED | ||
793 | /* | ||
794 | * FIXME: isolated CPUs should really leave the root task group, | ||
795 | * whether they are isolcpus or were isolated via cpusets, lest | ||
796 | * the timer run on a CPU which does not service all runqueues, | ||
797 | * potentially leaving other CPUs indefinitely throttled. If | ||
798 | * isolation is really required, the user will turn the throttle | ||
799 | * off to kill the perturbations it causes anyway. Meanwhile, | ||
800 | * this maintains functionality for boot and/or troubleshooting. | ||
801 | */ | ||
802 | if (rt_b == &root_task_group.rt_bandwidth) | ||
803 | span = cpu_online_mask; | ||
804 | #endif | ||
791 | for_each_cpu(i, span) { | 805 | for_each_cpu(i, span) { |
792 | int enqueue = 0; | 806 | int enqueue = 0; |
793 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); | 807 | struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c35a1a7dd4d6..0848fa36c383 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex; | |||
80 | struct cfs_rq; | 80 | struct cfs_rq; |
81 | struct rt_rq; | 81 | struct rt_rq; |
82 | 82 | ||
83 | static LIST_HEAD(task_groups); | 83 | extern struct list_head task_groups; |
84 | 84 | ||
85 | struct cfs_bandwidth { | 85 | struct cfs_bandwidth { |
86 | #ifdef CONFIG_CFS_BANDWIDTH | 86 | #ifdef CONFIG_CFS_BANDWIDTH |
@@ -374,7 +374,11 @@ struct rq { | |||
374 | #ifdef CONFIG_FAIR_GROUP_SCHED | 374 | #ifdef CONFIG_FAIR_GROUP_SCHED |
375 | /* list of leaf cfs_rq on this cpu: */ | 375 | /* list of leaf cfs_rq on this cpu: */ |
376 | struct list_head leaf_cfs_rq_list; | 376 | struct list_head leaf_cfs_rq_list; |
377 | #endif | 377 | #ifdef CONFIG_SMP |
378 | unsigned long h_load_throttle; | ||
379 | #endif /* CONFIG_SMP */ | ||
380 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
381 | |||
378 | #ifdef CONFIG_RT_GROUP_SCHED | 382 | #ifdef CONFIG_RT_GROUP_SCHED |
379 | struct list_head leaf_rt_rq_list; | 383 | struct list_head leaf_rt_rq_list; |
380 | #endif | 384 | #endif |
@@ -1140,7 +1144,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
1140 | 1144 | ||
1141 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1145 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); |
1142 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1146 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); |
1143 | extern void unthrottle_offline_cfs_rqs(struct rq *rq); | ||
1144 | 1147 | ||
1145 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | 1148 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); |
1146 | 1149 | ||
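Note: the sched.h hunk above replaces "static LIST_HEAD(task_groups)" in the header with an extern declaration. A static definition in a header gives every translation unit that includes it a private, separate list, so a walker in one file never sees groups registered from another. A sketch of the intended pattern, assuming the single definition lives in one scheduler .c file (presumably kernel/sched/core.c in this series):

    #include <linux/list.h>

    /* In the shared header: declare, do not define, the list. */
    extern struct list_head task_groups;

    /* In exactly one .c file: the single instance everyone links against. */
    LIST_HEAD(task_groups);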
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 7b386e86fd23..da5eb5bed84a 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) | |||
27 | { | 27 | { |
28 | struct task_struct *stop = rq->stop; | 28 | struct task_struct *stop = rq->stop; |
29 | 29 | ||
30 | if (stop && stop->on_rq) | 30 | if (stop && stop->on_rq) { |
31 | stop->se.exec_start = rq->clock_task; | ||
31 | return stop; | 32 | return stop; |
33 | } | ||
32 | 34 | ||
33 | return NULL; | 35 | return NULL; |
34 | } | 36 | } |
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq) | |||
52 | 54 | ||
53 | static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) | 55 | static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) |
54 | { | 56 | { |
57 | struct task_struct *curr = rq->curr; | ||
58 | u64 delta_exec; | ||
59 | |||
60 | delta_exec = rq->clock_task - curr->se.exec_start; | ||
61 | if (unlikely((s64)delta_exec < 0)) | ||
62 | delta_exec = 0; | ||
63 | |||
64 | schedstat_set(curr->se.statistics.exec_max, | ||
65 | max(curr->se.statistics.exec_max, delta_exec)); | ||
66 | |||
67 | curr->se.sum_exec_runtime += delta_exec; | ||
68 | account_group_exec_runtime(curr, delta_exec); | ||
69 | |||
70 | curr->se.exec_start = rq->clock_task; | ||
71 | cpuacct_charge(curr, delta_exec); | ||
55 | } | 72 | } |
56 | 73 | ||
57 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) | 74 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) |
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) | |||
60 | 77 | ||
61 | static void set_curr_task_stop(struct rq *rq) | 78 | static void set_curr_task_stop(struct rq *rq) |
62 | { | 79 | { |
80 | struct task_struct *stop = rq->stop; | ||
81 | |||
82 | stop->se.exec_start = rq->clock_task; | ||
63 | } | 83 | } |
64 | 84 | ||
65 | static void switched_to_stop(struct rq *rq, struct task_struct *p) | 85 | static void switched_to_stop(struct rq *rq, struct task_struct *p) |
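Note: put_prev_task_stop() above gains the same exec-runtime bookkeeping the other scheduling classes already do: take the delta between rq->clock_task and the last exec_start, clamp a negative delta to zero, and fold it into sum_exec_runtime before restarting the window. A stand-alone sketch of that accounting step, using plain userspace types and hypothetical names:

    #include <stdint.h>

    struct entity {
            uint64_t exec_start;         /* timestamp when the task last started running */
            uint64_t sum_exec_runtime;   /* accumulated CPU time */
    };

    static void account_runtime(struct entity *se, uint64_t now)
    {
            int64_t delta = (int64_t)(now - se->exec_start);

            if (delta < 0)               /* clock not set yet or went backwards: charge nothing */
                    delta = 0;

            se->sum_exec_runtime += delta;
            se->exec_start = now;        /* open the window for the next slice */
    }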
diff --git a/kernel/task_work.c b/kernel/task_work.c index 91d4e1742a0c..d320d44903bd 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c | |||
@@ -75,6 +75,7 @@ void task_work_run(void) | |||
75 | p = q->next; | 75 | p = q->next; |
76 | q->func(q); | 76 | q->func(q); |
77 | q = p; | 77 | q = p; |
78 | cond_resched(); | ||
78 | } | 79 | } |
79 | } | 80 | } |
80 | } | 81 | } |
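Note: the task_work.c change drops a cond_resched() between callbacks so a long work list cannot monopolize the CPU. The shape of that loop, reduced to a sketch (the next pointer is saved before the callback runs because the callback may free its node):

    struct callback_head {
            struct callback_head *next;
            void (*func)(struct callback_head *);
    };

    static void run_work_list(struct callback_head *q)
    {
            while (q) {
                    struct callback_head *next = q->next;  /* grab next before func() may free q */

                    q->func(q);
                    q = next;
                    /* cond_resched();  -- in the kernel, yield here if something else should run */
            }
    }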
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a470154e0408..46da0537c10b 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -37,7 +37,7 @@ | |||
37 | * requested HZ value. It is also not recommended | 37 | * requested HZ value. It is also not recommended |
38 | * for "tick-less" systems. | 38 | * for "tick-less" systems. |
39 | */ | 39 | */ |
40 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) | 40 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) |
41 | 41 | ||
42 | /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier | 42 | /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier |
43 | * conversion, the .shift value could be zero. However | 43 | * conversion, the .shift value could be zero. However |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b7fbadc5c973..24174b4d669b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock); | |||
28 | /* USER_HZ period (usecs): */ | 28 | /* USER_HZ period (usecs): */ |
29 | unsigned long tick_usec = TICK_USEC; | 29 | unsigned long tick_usec = TICK_USEC; |
30 | 30 | ||
31 | /* ACTHZ period (nsecs): */ | 31 | /* SHIFTED_HZ period (nsecs): */ |
32 | unsigned long tick_nsec; | 32 | unsigned long tick_nsec; |
33 | 33 | ||
34 | static u64 tick_length; | 34 | static u64 tick_length; |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 024540f97f74..3a9e5d5c1091 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -573,6 +573,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | |||
573 | tick_do_update_jiffies64(now); | 573 | tick_do_update_jiffies64(now); |
574 | update_cpu_load_nohz(); | 574 | update_cpu_load_nohz(); |
575 | 575 | ||
576 | calc_load_exit_idle(); | ||
576 | touch_softlockup_watchdog(); | 577 | touch_softlockup_watchdog(); |
577 | /* | 578 | /* |
578 | * Cancel the scheduled timer and restore the tick | 579 | * Cancel the scheduled timer and restore the tick |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f045cc50832d..d3b91e75cecd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -65,14 +65,14 @@ struct timekeeper { | |||
65 | * used instead. | 65 | * used instead. |
66 | */ | 66 | */ |
67 | struct timespec wall_to_monotonic; | 67 | struct timespec wall_to_monotonic; |
68 | /* time spent in suspend */ | ||
69 | struct timespec total_sleep_time; | ||
70 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ | ||
71 | struct timespec raw_time; | ||
72 | /* Offset clock monotonic -> clock realtime */ | 68 | /* Offset clock monotonic -> clock realtime */ |
73 | ktime_t offs_real; | 69 | ktime_t offs_real; |
70 | /* time spent in suspend */ | ||
71 | struct timespec total_sleep_time; | ||
74 | /* Offset clock monotonic -> clock boottime */ | 72 | /* Offset clock monotonic -> clock boottime */ |
75 | ktime_t offs_boot; | 73 | ktime_t offs_boot; |
74 | /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ | ||
75 | struct timespec raw_time; | ||
76 | /* Seqlock for all timekeeper values */ | 76 | /* Seqlock for all timekeeper values */ |
77 | seqlock_t lock; | 77 | seqlock_t lock; |
78 | }; | 78 | }; |
@@ -108,13 +108,39 @@ static struct timespec tk_xtime(struct timekeeper *tk) | |||
108 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) | 108 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) |
109 | { | 109 | { |
110 | tk->xtime_sec = ts->tv_sec; | 110 | tk->xtime_sec = ts->tv_sec; |
111 | tk->xtime_nsec = ts->tv_nsec << tk->shift; | 111 | tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; |
112 | } | 112 | } |
113 | 113 | ||
114 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) | 114 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) |
115 | { | 115 | { |
116 | tk->xtime_sec += ts->tv_sec; | 116 | tk->xtime_sec += ts->tv_sec; |
117 | tk->xtime_nsec += ts->tv_nsec << tk->shift; | 117 | tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; |
118 | tk_normalize_xtime(tk); | ||
119 | } | ||
120 | |||
121 | static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) | ||
122 | { | ||
123 | struct timespec tmp; | ||
124 | |||
125 | /* | ||
126 | * Verify consistency of: offset_real = -wall_to_monotonic | ||
127 | * before modifying anything | ||
128 | */ | ||
129 | set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, | ||
130 | -tk->wall_to_monotonic.tv_nsec); | ||
131 | WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); | ||
132 | tk->wall_to_monotonic = wtm; | ||
133 | set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); | ||
134 | tk->offs_real = timespec_to_ktime(tmp); | ||
135 | } | ||
136 | |||
137 | static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) | ||
138 | { | ||
139 | /* Verify consistency before modifying */ | ||
140 | WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); | ||
141 | |||
142 | tk->total_sleep_time = t; | ||
143 | tk->offs_boot = timespec_to_ktime(t); | ||
118 | } | 144 | } |
119 | 145 | ||
120 | /** | 146 | /** |
@@ -217,14 +243,6 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
217 | return nsec + arch_gettimeoffset(); | 243 | return nsec + arch_gettimeoffset(); |
218 | } | 244 | } |
219 | 245 | ||
220 | static void update_rt_offset(struct timekeeper *tk) | ||
221 | { | ||
222 | struct timespec tmp, *wtm = &tk->wall_to_monotonic; | ||
223 | |||
224 | set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); | ||
225 | tk->offs_real = timespec_to_ktime(tmp); | ||
226 | } | ||
227 | |||
228 | /* must hold write on timekeeper.lock */ | 246 | /* must hold write on timekeeper.lock */ |
229 | static void timekeeping_update(struct timekeeper *tk, bool clearntp) | 247 | static void timekeeping_update(struct timekeeper *tk, bool clearntp) |
230 | { | 248 | { |
@@ -234,12 +252,10 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp) | |||
234 | tk->ntp_error = 0; | 252 | tk->ntp_error = 0; |
235 | ntp_clear(); | 253 | ntp_clear(); |
236 | } | 254 | } |
237 | update_rt_offset(tk); | ||
238 | xt = tk_xtime(tk); | 255 | xt = tk_xtime(tk); |
239 | update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); | 256 | update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); |
240 | } | 257 | } |
241 | 258 | ||
242 | |||
243 | /** | 259 | /** |
244 | * timekeeping_forward_now - update clock to the current time | 260 | * timekeeping_forward_now - update clock to the current time |
245 | * | 261 | * |
@@ -261,7 +277,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) | |||
261 | tk->xtime_nsec += cycle_delta * tk->mult; | 277 | tk->xtime_nsec += cycle_delta * tk->mult; |
262 | 278 | ||
263 | /* If arch requires, add in gettimeoffset() */ | 279 | /* If arch requires, add in gettimeoffset() */ |
264 | tk->xtime_nsec += arch_gettimeoffset() << tk->shift; | 280 | tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; |
265 | 281 | ||
266 | tk_normalize_xtime(tk); | 282 | tk_normalize_xtime(tk); |
267 | 283 | ||
@@ -277,38 +293,39 @@ static void timekeeping_forward_now(struct timekeeper *tk) | |||
277 | */ | 293 | */ |
278 | void getnstimeofday(struct timespec *ts) | 294 | void getnstimeofday(struct timespec *ts) |
279 | { | 295 | { |
296 | struct timekeeper *tk = &timekeeper; | ||
280 | unsigned long seq; | 297 | unsigned long seq; |
281 | s64 nsecs = 0; | 298 | s64 nsecs = 0; |
282 | 299 | ||
283 | WARN_ON(timekeeping_suspended); | 300 | WARN_ON(timekeeping_suspended); |
284 | 301 | ||
285 | do { | 302 | do { |
286 | seq = read_seqbegin(&timekeeper.lock); | 303 | seq = read_seqbegin(&tk->lock); |
287 | 304 | ||
288 | ts->tv_sec = timekeeper.xtime_sec; | 305 | ts->tv_sec = tk->xtime_sec; |
289 | ts->tv_nsec = timekeeping_get_ns(&timekeeper); | 306 | nsecs = timekeeping_get_ns(tk); |
290 | 307 | ||
291 | } while (read_seqretry(&timekeeper.lock, seq)); | 308 | } while (read_seqretry(&tk->lock, seq)); |
292 | 309 | ||
310 | ts->tv_nsec = 0; | ||
293 | timespec_add_ns(ts, nsecs); | 311 | timespec_add_ns(ts, nsecs); |
294 | } | 312 | } |
295 | EXPORT_SYMBOL(getnstimeofday); | 313 | EXPORT_SYMBOL(getnstimeofday); |
296 | 314 | ||
297 | ktime_t ktime_get(void) | 315 | ktime_t ktime_get(void) |
298 | { | 316 | { |
317 | struct timekeeper *tk = &timekeeper; | ||
299 | unsigned int seq; | 318 | unsigned int seq; |
300 | s64 secs, nsecs; | 319 | s64 secs, nsecs; |
301 | 320 | ||
302 | WARN_ON(timekeeping_suspended); | 321 | WARN_ON(timekeeping_suspended); |
303 | 322 | ||
304 | do { | 323 | do { |
305 | seq = read_seqbegin(&timekeeper.lock); | 324 | seq = read_seqbegin(&tk->lock); |
306 | secs = timekeeper.xtime_sec + | 325 | secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; |
307 | timekeeper.wall_to_monotonic.tv_sec; | 326 | nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; |
308 | nsecs = timekeeping_get_ns(&timekeeper) + | ||
309 | timekeeper.wall_to_monotonic.tv_nsec; | ||
310 | 327 | ||
311 | } while (read_seqretry(&timekeeper.lock, seq)); | 328 | } while (read_seqretry(&tk->lock, seq)); |
312 | /* | 329 | /* |
313 | * Use ktime_set/ktime_add_ns to create a proper ktime on | 330 | * Use ktime_set/ktime_add_ns to create a proper ktime on |
314 | * 32-bit architectures without CONFIG_KTIME_SCALAR. | 331 | * 32-bit architectures without CONFIG_KTIME_SCALAR. |
@@ -327,21 +344,24 @@ EXPORT_SYMBOL_GPL(ktime_get); | |||
327 | */ | 344 | */ |
328 | void ktime_get_ts(struct timespec *ts) | 345 | void ktime_get_ts(struct timespec *ts) |
329 | { | 346 | { |
347 | struct timekeeper *tk = &timekeeper; | ||
330 | struct timespec tomono; | 348 | struct timespec tomono; |
349 | s64 nsec; | ||
331 | unsigned int seq; | 350 | unsigned int seq; |
332 | 351 | ||
333 | WARN_ON(timekeeping_suspended); | 352 | WARN_ON(timekeeping_suspended); |
334 | 353 | ||
335 | do { | 354 | do { |
336 | seq = read_seqbegin(&timekeeper.lock); | 355 | seq = read_seqbegin(&tk->lock); |
337 | ts->tv_sec = timekeeper.xtime_sec; | 356 | ts->tv_sec = tk->xtime_sec; |
338 | ts->tv_nsec = timekeeping_get_ns(&timekeeper); | 357 | nsec = timekeeping_get_ns(tk); |
339 | tomono = timekeeper.wall_to_monotonic; | 358 | tomono = tk->wall_to_monotonic; |
340 | 359 | ||
341 | } while (read_seqretry(&timekeeper.lock, seq)); | 360 | } while (read_seqretry(&tk->lock, seq)); |
342 | 361 | ||
343 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, | 362 | ts->tv_sec += tomono.tv_sec; |
344 | ts->tv_nsec + tomono.tv_nsec); | 363 | ts->tv_nsec = 0; |
364 | timespec_add_ns(ts, nsec + tomono.tv_nsec); | ||
345 | } | 365 | } |
346 | EXPORT_SYMBOL_GPL(ktime_get_ts); | 366 | EXPORT_SYMBOL_GPL(ktime_get_ts); |
347 | 367 | ||
@@ -358,22 +378,23 @@ EXPORT_SYMBOL_GPL(ktime_get_ts); | |||
358 | */ | 378 | */ |
359 | void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | 379 | void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) |
360 | { | 380 | { |
381 | struct timekeeper *tk = &timekeeper; | ||
361 | unsigned long seq; | 382 | unsigned long seq; |
362 | s64 nsecs_raw, nsecs_real; | 383 | s64 nsecs_raw, nsecs_real; |
363 | 384 | ||
364 | WARN_ON_ONCE(timekeeping_suspended); | 385 | WARN_ON_ONCE(timekeeping_suspended); |
365 | 386 | ||
366 | do { | 387 | do { |
367 | seq = read_seqbegin(&timekeeper.lock); | 388 | seq = read_seqbegin(&tk->lock); |
368 | 389 | ||
369 | *ts_raw = timekeeper.raw_time; | 390 | *ts_raw = tk->raw_time; |
370 | ts_real->tv_sec = timekeeper.xtime_sec; | 391 | ts_real->tv_sec = tk->xtime_sec; |
371 | ts_real->tv_nsec = 0; | 392 | ts_real->tv_nsec = 0; |
372 | 393 | ||
373 | nsecs_raw = timekeeping_get_ns_raw(&timekeeper); | 394 | nsecs_raw = timekeeping_get_ns_raw(tk); |
374 | nsecs_real = timekeeping_get_ns(&timekeeper); | 395 | nsecs_real = timekeeping_get_ns(tk); |
375 | 396 | ||
376 | } while (read_seqretry(&timekeeper.lock, seq)); | 397 | } while (read_seqretry(&tk->lock, seq)); |
377 | 398 | ||
378 | timespec_add_ns(ts_raw, nsecs_raw); | 399 | timespec_add_ns(ts_raw, nsecs_raw); |
379 | timespec_add_ns(ts_real, nsecs_real); | 400 | timespec_add_ns(ts_real, nsecs_real); |
@@ -406,28 +427,28 @@ EXPORT_SYMBOL(do_gettimeofday); | |||
406 | */ | 427 | */ |
407 | int do_settimeofday(const struct timespec *tv) | 428 | int do_settimeofday(const struct timespec *tv) |
408 | { | 429 | { |
430 | struct timekeeper *tk = &timekeeper; | ||
409 | struct timespec ts_delta, xt; | 431 | struct timespec ts_delta, xt; |
410 | unsigned long flags; | 432 | unsigned long flags; |
411 | 433 | ||
412 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | 434 | if (!timespec_valid_strict(tv)) |
413 | return -EINVAL; | 435 | return -EINVAL; |
414 | 436 | ||
415 | write_seqlock_irqsave(&timekeeper.lock, flags); | 437 | write_seqlock_irqsave(&tk->lock, flags); |
416 | 438 | ||
417 | timekeeping_forward_now(&timekeeper); | 439 | timekeeping_forward_now(tk); |
418 | 440 | ||
419 | xt = tk_xtime(&timekeeper); | 441 | xt = tk_xtime(tk); |
420 | ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; | 442 | ts_delta.tv_sec = tv->tv_sec - xt.tv_sec; |
421 | ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; | 443 | ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; |
422 | 444 | ||
423 | timekeeper.wall_to_monotonic = | 445 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); |
424 | timespec_sub(timekeeper.wall_to_monotonic, ts_delta); | ||
425 | 446 | ||
426 | tk_set_xtime(&timekeeper, tv); | 447 | tk_set_xtime(tk, tv); |
427 | 448 | ||
428 | timekeeping_update(&timekeeper, true); | 449 | timekeeping_update(tk, true); |
429 | 450 | ||
430 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 451 | write_sequnlock_irqrestore(&tk->lock, flags); |
431 | 452 | ||
432 | /* signal hrtimers about time change */ | 453 | /* signal hrtimers about time change */ |
433 | clock_was_set(); | 454 | clock_was_set(); |
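Note: do_settimeofday() above switches from an open-coded tv_nsec range check to timespec_valid_strict(), which also rejects negative seconds and values too large to represent as a ktime_t. Roughly, and assuming the helper mirrors the kernel's checks of that era, it amounts to:

    #include <stdbool.h>
    #include <time.h>

    #define NSEC_PER_SEC 1000000000L
    /* Approximate bound: seconds that still fit in a signed 64-bit nanosecond count. */
    #define KTIME_SEC_MAX_APPROX (0x7fffffffffffffffLL / NSEC_PER_SEC)

    static bool my_timespec_valid_strict(const struct timespec *ts)
    {
            if (ts->tv_sec < 0)                              /* no negative times */
                    return false;
            if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)  /* nanoseconds in range */
                    return false;
            if ((unsigned long long)ts->tv_sec >= (unsigned long long)KTIME_SEC_MAX_APPROX)
                    return false;                            /* would overflow a ktime_t */
            return true;
    }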
@@ -436,7 +457,6 @@ int do_settimeofday(const struct timespec *tv) | |||
436 | } | 457 | } |
437 | EXPORT_SYMBOL(do_settimeofday); | 458 | EXPORT_SYMBOL(do_settimeofday); |
438 | 459 | ||
439 | |||
440 | /** | 460 | /** |
441 | * timekeeping_inject_offset - Adds or subtracts from the current time. | 461 | * timekeeping_inject_offset - Adds or subtracts from the current time. |
442 | * @tv: pointer to the timespec variable containing the offset | 462 | * @tv: pointer to the timespec variable containing the offset |
@@ -445,28 +465,37 @@ EXPORT_SYMBOL(do_settimeofday); | |||
445 | */ | 465 | */ |
446 | int timekeeping_inject_offset(struct timespec *ts) | 466 | int timekeeping_inject_offset(struct timespec *ts) |
447 | { | 467 | { |
468 | struct timekeeper *tk = &timekeeper; | ||
448 | unsigned long flags; | 469 | unsigned long flags; |
470 | struct timespec tmp; | ||
471 | int ret = 0; | ||
449 | 472 | ||
450 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) | 473 | if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) |
451 | return -EINVAL; | 474 | return -EINVAL; |
452 | 475 | ||
453 | write_seqlock_irqsave(&timekeeper.lock, flags); | 476 | write_seqlock_irqsave(&tk->lock, flags); |
454 | 477 | ||
455 | timekeeping_forward_now(&timekeeper); | 478 | timekeeping_forward_now(tk); |
456 | 479 | ||
480 | /* Make sure the proposed value is valid */ | ||
481 | tmp = timespec_add(tk_xtime(tk), *ts); | ||
482 | if (!timespec_valid_strict(&tmp)) { | ||
483 | ret = -EINVAL; | ||
484 | goto error; | ||
485 | } | ||
457 | 486 | ||
458 | tk_xtime_add(&timekeeper, ts); | 487 | tk_xtime_add(tk, ts); |
459 | timekeeper.wall_to_monotonic = | 488 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); |
460 | timespec_sub(timekeeper.wall_to_monotonic, *ts); | ||
461 | 489 | ||
462 | timekeeping_update(&timekeeper, true); | 490 | error: /* even if we error out, we forwarded the time, so call update */ |
491 | timekeeping_update(tk, true); | ||
463 | 492 | ||
464 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 493 | write_sequnlock_irqrestore(&tk->lock, flags); |
465 | 494 | ||
466 | /* signal hrtimers about time change */ | 495 | /* signal hrtimers about time change */ |
467 | clock_was_set(); | 496 | clock_was_set(); |
468 | 497 | ||
469 | return 0; | 498 | return ret; |
470 | } | 499 | } |
471 | EXPORT_SYMBOL(timekeeping_inject_offset); | 500 | EXPORT_SYMBOL(timekeeping_inject_offset); |
472 | 501 | ||
@@ -477,23 +506,24 @@ EXPORT_SYMBOL(timekeeping_inject_offset); | |||
477 | */ | 506 | */ |
478 | static int change_clocksource(void *data) | 507 | static int change_clocksource(void *data) |
479 | { | 508 | { |
509 | struct timekeeper *tk = &timekeeper; | ||
480 | struct clocksource *new, *old; | 510 | struct clocksource *new, *old; |
481 | unsigned long flags; | 511 | unsigned long flags; |
482 | 512 | ||
483 | new = (struct clocksource *) data; | 513 | new = (struct clocksource *) data; |
484 | 514 | ||
485 | write_seqlock_irqsave(&timekeeper.lock, flags); | 515 | write_seqlock_irqsave(&tk->lock, flags); |
486 | 516 | ||
487 | timekeeping_forward_now(&timekeeper); | 517 | timekeeping_forward_now(tk); |
488 | if (!new->enable || new->enable(new) == 0) { | 518 | if (!new->enable || new->enable(new) == 0) { |
489 | old = timekeeper.clock; | 519 | old = tk->clock; |
490 | tk_setup_internals(&timekeeper, new); | 520 | tk_setup_internals(tk, new); |
491 | if (old->disable) | 521 | if (old->disable) |
492 | old->disable(old); | 522 | old->disable(old); |
493 | } | 523 | } |
494 | timekeeping_update(&timekeeper, true); | 524 | timekeeping_update(tk, true); |
495 | 525 | ||
496 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 526 | write_sequnlock_irqrestore(&tk->lock, flags); |
497 | 527 | ||
498 | return 0; | 528 | return 0; |
499 | } | 529 | } |
@@ -507,7 +537,9 @@ static int change_clocksource(void *data) | |||
507 | */ | 537 | */ |
508 | void timekeeping_notify(struct clocksource *clock) | 538 | void timekeeping_notify(struct clocksource *clock) |
509 | { | 539 | { |
510 | if (timekeeper.clock == clock) | 540 | struct timekeeper *tk = &timekeeper; |
541 | |||
542 | if (tk->clock == clock) | ||
511 | return; | 543 | return; |
512 | stop_machine(change_clocksource, clock, NULL); | 544 | stop_machine(change_clocksource, clock, NULL); |
513 | tick_clock_notify(); | 545 | tick_clock_notify(); |
@@ -536,35 +568,36 @@ EXPORT_SYMBOL_GPL(ktime_get_real); | |||
536 | */ | 568 | */ |
537 | void getrawmonotonic(struct timespec *ts) | 569 | void getrawmonotonic(struct timespec *ts) |
538 | { | 570 | { |
571 | struct timekeeper *tk = &timekeeper; | ||
539 | unsigned long seq; | 572 | unsigned long seq; |
540 | s64 nsecs; | 573 | s64 nsecs; |
541 | 574 | ||
542 | do { | 575 | do { |
543 | seq = read_seqbegin(&timekeeper.lock); | 576 | seq = read_seqbegin(&tk->lock); |
544 | nsecs = timekeeping_get_ns_raw(&timekeeper); | 577 | nsecs = timekeeping_get_ns_raw(tk); |
545 | *ts = timekeeper.raw_time; | 578 | *ts = tk->raw_time; |
546 | 579 | ||
547 | } while (read_seqretry(&timekeeper.lock, seq)); | 580 | } while (read_seqretry(&tk->lock, seq)); |
548 | 581 | ||
549 | timespec_add_ns(ts, nsecs); | 582 | timespec_add_ns(ts, nsecs); |
550 | } | 583 | } |
551 | EXPORT_SYMBOL(getrawmonotonic); | 584 | EXPORT_SYMBOL(getrawmonotonic); |
552 | 585 | ||
553 | |||
554 | /** | 586 | /** |
555 | * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres | 587 | * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres |
556 | */ | 588 | */ |
557 | int timekeeping_valid_for_hres(void) | 589 | int timekeeping_valid_for_hres(void) |
558 | { | 590 | { |
591 | struct timekeeper *tk = &timekeeper; | ||
559 | unsigned long seq; | 592 | unsigned long seq; |
560 | int ret; | 593 | int ret; |
561 | 594 | ||
562 | do { | 595 | do { |
563 | seq = read_seqbegin(&timekeeper.lock); | 596 | seq = read_seqbegin(&tk->lock); |
564 | 597 | ||
565 | ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | 598 | ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; |
566 | 599 | ||
567 | } while (read_seqretry(&timekeeper.lock, seq)); | 600 | } while (read_seqretry(&tk->lock, seq)); |
568 | 601 | ||
569 | return ret; | 602 | return ret; |
570 | } | 603 | } |
@@ -574,15 +607,16 @@ int timekeeping_valid_for_hres(void) | |||
574 | */ | 607 | */ |
575 | u64 timekeeping_max_deferment(void) | 608 | u64 timekeeping_max_deferment(void) |
576 | { | 609 | { |
610 | struct timekeeper *tk = &timekeeper; | ||
577 | unsigned long seq; | 611 | unsigned long seq; |
578 | u64 ret; | 612 | u64 ret; |
579 | 613 | ||
580 | do { | 614 | do { |
581 | seq = read_seqbegin(&timekeeper.lock); | 615 | seq = read_seqbegin(&tk->lock); |
582 | 616 | ||
583 | ret = timekeeper.clock->max_idle_ns; | 617 | ret = tk->clock->max_idle_ns; |
584 | 618 | ||
585 | } while (read_seqretry(&timekeeper.lock, seq)); | 619 | } while (read_seqretry(&tk->lock, seq)); |
586 | 620 | ||
587 | return ret; | 621 | return ret; |
588 | } | 622 | } |
@@ -622,46 +656,56 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts) | |||
622 | */ | 656 | */ |
623 | void __init timekeeping_init(void) | 657 | void __init timekeeping_init(void) |
624 | { | 658 | { |
659 | struct timekeeper *tk = &timekeeper; | ||
625 | struct clocksource *clock; | 660 | struct clocksource *clock; |
626 | unsigned long flags; | 661 | unsigned long flags; |
627 | struct timespec now, boot; | 662 | struct timespec now, boot, tmp; |
628 | 663 | ||
629 | read_persistent_clock(&now); | 664 | read_persistent_clock(&now); |
665 | if (!timespec_valid_strict(&now)) { | ||
666 | pr_warn("WARNING: Persistent clock returned invalid value!\n" | ||
667 | " Check your CMOS/BIOS settings.\n"); | ||
668 | now.tv_sec = 0; | ||
669 | now.tv_nsec = 0; | ||
670 | } | ||
671 | |||
630 | read_boot_clock(&boot); | 672 | read_boot_clock(&boot); |
673 | if (!timespec_valid_strict(&boot)) { | ||
674 | pr_warn("WARNING: Boot clock returned invalid value!\n" | ||
675 | " Check your CMOS/BIOS settings.\n"); | ||
676 | boot.tv_sec = 0; | ||
677 | boot.tv_nsec = 0; | ||
678 | } | ||
631 | 679 | ||
632 | seqlock_init(&timekeeper.lock); | 680 | seqlock_init(&tk->lock); |
633 | 681 | ||
634 | ntp_init(); | 682 | ntp_init(); |
635 | 683 | ||
636 | write_seqlock_irqsave(&timekeeper.lock, flags); | 684 | write_seqlock_irqsave(&tk->lock, flags); |
637 | clock = clocksource_default_clock(); | 685 | clock = clocksource_default_clock(); |
638 | if (clock->enable) | 686 | if (clock->enable) |
639 | clock->enable(clock); | 687 | clock->enable(clock); |
640 | tk_setup_internals(&timekeeper, clock); | 688 | tk_setup_internals(tk, clock); |
641 | 689 | ||
642 | tk_set_xtime(&timekeeper, &now); | 690 | tk_set_xtime(tk, &now); |
643 | timekeeper.raw_time.tv_sec = 0; | 691 | tk->raw_time.tv_sec = 0; |
644 | timekeeper.raw_time.tv_nsec = 0; | 692 | tk->raw_time.tv_nsec = 0; |
645 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) | 693 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) |
646 | boot = tk_xtime(&timekeeper); | 694 | boot = tk_xtime(tk); |
647 | 695 | ||
648 | set_normalized_timespec(&timekeeper.wall_to_monotonic, | 696 | set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); |
649 | -boot.tv_sec, -boot.tv_nsec); | 697 | tk_set_wall_to_mono(tk, tmp); |
650 | update_rt_offset(&timekeeper); | 698 | |
651 | timekeeper.total_sleep_time.tv_sec = 0; | 699 | tmp.tv_sec = 0; |
652 | timekeeper.total_sleep_time.tv_nsec = 0; | 700 | tmp.tv_nsec = 0; |
653 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 701 | tk_set_sleep_time(tk, tmp); |
702 | |||
703 | write_sequnlock_irqrestore(&tk->lock, flags); | ||
654 | } | 704 | } |
655 | 705 | ||
656 | /* time in seconds when suspend began */ | 706 | /* time in seconds when suspend began */ |
657 | static struct timespec timekeeping_suspend_time; | 707 | static struct timespec timekeeping_suspend_time; |
658 | 708 | ||
659 | static void update_sleep_time(struct timespec t) | ||
660 | { | ||
661 | timekeeper.total_sleep_time = t; | ||
662 | timekeeper.offs_boot = timespec_to_ktime(t); | ||
663 | } | ||
664 | |||
665 | /** | 709 | /** |
666 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval | 710 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval |
667 | * @delta: pointer to a timespec delta value | 711 | * @delta: pointer to a timespec delta value |
@@ -672,18 +716,16 @@ static void update_sleep_time(struct timespec t) | |||
672 | static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | 716 | static void __timekeeping_inject_sleeptime(struct timekeeper *tk, |
673 | struct timespec *delta) | 717 | struct timespec *delta) |
674 | { | 718 | { |
675 | if (!timespec_valid(delta)) { | 719 | if (!timespec_valid_strict(delta)) { |
676 | printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " | 720 | printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " |
677 | "sleep delta value!\n"); | 721 | "sleep delta value!\n"); |
678 | return; | 722 | return; |
679 | } | 723 | } |
680 | |||
681 | tk_xtime_add(tk, delta); | 724 | tk_xtime_add(tk, delta); |
682 | tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta); | 725 | tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); |
683 | update_sleep_time(timespec_add(tk->total_sleep_time, *delta)); | 726 | tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); |
684 | } | 727 | } |
685 | 728 | ||
686 | |||
687 | /** | 729 | /** |
688 | * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values | 730 | * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values |
689 | * @delta: pointer to a timespec delta value | 731 | * @delta: pointer to a timespec delta value |
@@ -696,6 +738,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
696 | */ | 738 | */ |
697 | void timekeeping_inject_sleeptime(struct timespec *delta) | 739 | void timekeeping_inject_sleeptime(struct timespec *delta) |
698 | { | 740 | { |
741 | struct timekeeper *tk = &timekeeper; | ||
699 | unsigned long flags; | 742 | unsigned long flags; |
700 | struct timespec ts; | 743 | struct timespec ts; |
701 | 744 | ||
@@ -704,21 +747,20 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
704 | if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) | 747 | if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) |
705 | return; | 748 | return; |
706 | 749 | ||
707 | write_seqlock_irqsave(&timekeeper.lock, flags); | 750 | write_seqlock_irqsave(&tk->lock, flags); |
708 | 751 | ||
709 | timekeeping_forward_now(&timekeeper); | 752 | timekeeping_forward_now(tk); |
710 | 753 | ||
711 | __timekeeping_inject_sleeptime(&timekeeper, delta); | 754 | __timekeeping_inject_sleeptime(tk, delta); |
712 | 755 | ||
713 | timekeeping_update(&timekeeper, true); | 756 | timekeeping_update(tk, true); |
714 | 757 | ||
715 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 758 | write_sequnlock_irqrestore(&tk->lock, flags); |
716 | 759 | ||
717 | /* signal hrtimers about time change */ | 760 | /* signal hrtimers about time change */ |
718 | clock_was_set(); | 761 | clock_was_set(); |
719 | } | 762 | } |
720 | 763 | ||
721 | |||
722 | /** | 764 | /** |
723 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 765 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
724 | * | 766 | * |
@@ -728,6 +770,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) | |||
728 | */ | 770 | */ |
729 | static void timekeeping_resume(void) | 771 | static void timekeeping_resume(void) |
730 | { | 772 | { |
773 | struct timekeeper *tk = &timekeeper; | ||
731 | unsigned long flags; | 774 | unsigned long flags; |
732 | struct timespec ts; | 775 | struct timespec ts; |
733 | 776 | ||
@@ -735,18 +778,18 @@ static void timekeeping_resume(void) | |||
735 | 778 | ||
736 | clocksource_resume(); | 779 | clocksource_resume(); |
737 | 780 | ||
738 | write_seqlock_irqsave(&timekeeper.lock, flags); | 781 | write_seqlock_irqsave(&tk->lock, flags); |
739 | 782 | ||
740 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { | 783 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { |
741 | ts = timespec_sub(ts, timekeeping_suspend_time); | 784 | ts = timespec_sub(ts, timekeeping_suspend_time); |
742 | __timekeeping_inject_sleeptime(&timekeeper, &ts); | 785 | __timekeeping_inject_sleeptime(tk, &ts); |
743 | } | 786 | } |
744 | /* re-base the last cycle value */ | 787 | /* re-base the last cycle value */ |
745 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 788 | tk->clock->cycle_last = tk->clock->read(tk->clock); |
746 | timekeeper.ntp_error = 0; | 789 | tk->ntp_error = 0; |
747 | timekeeping_suspended = 0; | 790 | timekeeping_suspended = 0; |
748 | timekeeping_update(&timekeeper, false); | 791 | timekeeping_update(tk, false); |
749 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 792 | write_sequnlock_irqrestore(&tk->lock, flags); |
750 | 793 | ||
751 | touch_softlockup_watchdog(); | 794 | touch_softlockup_watchdog(); |
752 | 795 | ||
@@ -758,14 +801,15 @@ static void timekeeping_resume(void) | |||
758 | 801 | ||
759 | static int timekeeping_suspend(void) | 802 | static int timekeeping_suspend(void) |
760 | { | 803 | { |
804 | struct timekeeper *tk = &timekeeper; | ||
761 | unsigned long flags; | 805 | unsigned long flags; |
762 | struct timespec delta, delta_delta; | 806 | struct timespec delta, delta_delta; |
763 | static struct timespec old_delta; | 807 | static struct timespec old_delta; |
764 | 808 | ||
765 | read_persistent_clock(&timekeeping_suspend_time); | 809 | read_persistent_clock(&timekeeping_suspend_time); |
766 | 810 | ||
767 | write_seqlock_irqsave(&timekeeper.lock, flags); | 811 | write_seqlock_irqsave(&tk->lock, flags); |
768 | timekeeping_forward_now(&timekeeper); | 812 | timekeeping_forward_now(tk); |
769 | timekeeping_suspended = 1; | 813 | timekeeping_suspended = 1; |
770 | 814 | ||
771 | /* | 815 | /* |
@@ -774,7 +818,7 @@ static int timekeeping_suspend(void) | |||
774 | * try to compensate so the difference in system time | 818 | * try to compensate so the difference in system time |
775 | * and persistent_clock time stays close to constant. | 819 | * and persistent_clock time stays close to constant. |
776 | */ | 820 | */ |
777 | delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time); | 821 | delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); |
778 | delta_delta = timespec_sub(delta, old_delta); | 822 | delta_delta = timespec_sub(delta, old_delta); |
779 | if (abs(delta_delta.tv_sec) >= 2) { | 823 | if (abs(delta_delta.tv_sec) >= 2) { |
780 | /* | 824 | /* |
@@ -787,7 +831,7 @@ static int timekeeping_suspend(void) | |||
787 | timekeeping_suspend_time = | 831 | timekeeping_suspend_time = |
788 | timespec_add(timekeeping_suspend_time, delta_delta); | 832 | timespec_add(timekeeping_suspend_time, delta_delta); |
789 | } | 833 | } |
790 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 834 | write_sequnlock_irqrestore(&tk->lock, flags); |
791 | 835 | ||
792 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 836 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
793 | clocksource_suspend(); | 837 | clocksource_suspend(); |
@@ -898,27 +942,29 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
898 | * the error. This causes the likely below to be unlikely. | 942 | * the error. This causes the likely below to be unlikely. |
899 | * | 943 | * |
900 | * The proper fix is to avoid rounding up by using | 944 | * The proper fix is to avoid rounding up by using |
901 | * the high precision timekeeper.xtime_nsec instead of | 945 | * the high precision tk->xtime_nsec instead of |
902 | * xtime.tv_nsec everywhere. Fixing this will take some | 946 | * xtime.tv_nsec everywhere. Fixing this will take some |
903 | * time. | 947 | * time. |
904 | */ | 948 | */ |
905 | if (likely(error <= interval)) | 949 | if (likely(error <= interval)) |
906 | adj = 1; | 950 | adj = 1; |
907 | else | 951 | else |
908 | adj = timekeeping_bigadjust(tk, error, &interval, | 952 | adj = timekeeping_bigadjust(tk, error, &interval, &offset); |
909 | &offset); | 953 | } else { |
910 | } else if (error < -interval) { | 954 | if (error < -interval) { |
911 | /* See comment above, this is just switched for the negative */ | 955 | /* See comment above, this is just switched for the negative */ |
912 | error >>= 2; | 956 | error >>= 2; |
913 | if (likely(error >= -interval)) { | 957 | if (likely(error >= -interval)) { |
914 | adj = -1; | 958 | adj = -1; |
915 | interval = -interval; | 959 | interval = -interval; |
916 | offset = -offset; | 960 | offset = -offset; |
917 | } else | 961 | } else { |
918 | adj = timekeeping_bigadjust(tk, error, &interval, | 962 | adj = timekeeping_bigadjust(tk, error, &interval, &offset); |
919 | &offset); | 963 | } |
920 | } else | 964 | } else { |
921 | return; | 965 | goto out_adjust; |
966 | } | ||
967 | } | ||
922 | 968 | ||
923 | if (unlikely(tk->clock->maxadj && | 969 | if (unlikely(tk->clock->maxadj && |
924 | (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { | 970 | (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { |
@@ -981,6 +1027,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
981 | tk->xtime_nsec -= offset; | 1027 | tk->xtime_nsec -= offset; |
982 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; | 1028 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; |
983 | 1029 | ||
1030 | out_adjust: | ||
984 | /* | 1031 | /* |
985 | * It may be possible that when we entered this function, xtime_nsec | 1032 | * It may be possible that when we entered this function, xtime_nsec |
986 | * was very small. Further, if we're slightly speeding the clocksource | 1033 | * was very small. Further, if we're slightly speeding the clocksource |
@@ -1003,7 +1050,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1003 | 1050 | ||
1004 | } | 1051 | } |
1005 | 1052 | ||
1006 | |||
1007 | /** | 1053 | /** |
1008 | * accumulate_nsecs_to_secs - Accumulates nsecs into secs | 1054 | * accumulate_nsecs_to_secs - Accumulates nsecs into secs |
1009 | * | 1055 | * |
@@ -1024,15 +1070,21 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) | |||
1024 | 1070 | ||
1025 | /* Figure out if its a leap sec and apply if needed */ | 1071 | /* Figure out if its a leap sec and apply if needed */ |
1026 | leap = second_overflow(tk->xtime_sec); | 1072 | leap = second_overflow(tk->xtime_sec); |
1027 | tk->xtime_sec += leap; | 1073 | if (unlikely(leap)) { |
1028 | tk->wall_to_monotonic.tv_sec -= leap; | 1074 | struct timespec ts; |
1029 | if (leap) | 1075 | |
1030 | clock_was_set_delayed(); | 1076 | tk->xtime_sec += leap; |
1077 | |||
1078 | ts.tv_sec = leap; | ||
1079 | ts.tv_nsec = 0; | ||
1080 | tk_set_wall_to_mono(tk, | ||
1081 | timespec_sub(tk->wall_to_monotonic, ts)); | ||
1031 | 1082 | ||
1083 | clock_was_set_delayed(); | ||
1084 | } | ||
1032 | } | 1085 | } |
1033 | } | 1086 | } |
1034 | 1087 | ||
1035 | |||
1036 | /** | 1088 | /** |
1037 | * logarithmic_accumulation - shifted accumulation of cycles | 1089 | * logarithmic_accumulation - shifted accumulation of cycles |
1038 | * | 1090 | * |
@@ -1076,7 +1128,6 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | |||
1076 | return offset; | 1128 | return offset; |
1077 | } | 1129 | } |
1078 | 1130 | ||
1079 | |||
1080 | /** | 1131 | /** |
1081 | * update_wall_time - Uses the current clocksource to increment the wall time | 1132 | * update_wall_time - Uses the current clocksource to increment the wall time |
1082 | * | 1133 | * |
@@ -1084,25 +1135,30 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | |||
1084 | static void update_wall_time(void) | 1135 | static void update_wall_time(void) |
1085 | { | 1136 | { |
1086 | struct clocksource *clock; | 1137 | struct clocksource *clock; |
1138 | struct timekeeper *tk = &timekeeper; | ||
1087 | cycle_t offset; | 1139 | cycle_t offset; |
1088 | int shift = 0, maxshift; | 1140 | int shift = 0, maxshift; |
1089 | unsigned long flags; | 1141 | unsigned long flags; |
1090 | s64 remainder; | 1142 | s64 remainder; |
1091 | 1143 | ||
1092 | write_seqlock_irqsave(&timekeeper.lock, flags); | 1144 | write_seqlock_irqsave(&tk->lock, flags); |
1093 | 1145 | ||
1094 | /* Make sure we're fully resumed: */ | 1146 | /* Make sure we're fully resumed: */ |
1095 | if (unlikely(timekeeping_suspended)) | 1147 | if (unlikely(timekeeping_suspended)) |
1096 | goto out; | 1148 | goto out; |
1097 | 1149 | ||
1098 | clock = timekeeper.clock; | 1150 | clock = tk->clock; |
1099 | 1151 | ||
1100 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | 1152 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET |
1101 | offset = timekeeper.cycle_interval; | 1153 | offset = tk->cycle_interval; |
1102 | #else | 1154 | #else |
1103 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; | 1155 | offset = (clock->read(clock) - clock->cycle_last) & clock->mask; |
1104 | #endif | 1156 | #endif |
1105 | 1157 | ||
1158 | /* Check if there's really nothing to do */ | ||
1159 | if (offset < tk->cycle_interval) | ||
1160 | goto out; | ||
1161 | |||
1106 | /* | 1162 | /* |
1107 | * With NO_HZ we may have to accumulate many cycle_intervals | 1163 | * With NO_HZ we may have to accumulate many cycle_intervals |
1108 | * (think "ticks") worth of time at once. To do this efficiently, | 1164 | * (think "ticks") worth of time at once. To do this efficiently, |
@@ -1111,19 +1167,19 @@ static void update_wall_time(void) | |||
1111 | * chunk in one go, and then try to consume the next smaller | 1167 | * chunk in one go, and then try to consume the next smaller |
1112 | * doubled multiple. | 1168 | * doubled multiple. |
1113 | */ | 1169 | */ |
1114 | shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); | 1170 | shift = ilog2(offset) - ilog2(tk->cycle_interval); |
1115 | shift = max(0, shift); | 1171 | shift = max(0, shift); |
1116 | /* Bound shift to one less than what overflows tick_length */ | 1172 | /* Bound shift to one less than what overflows tick_length */ |
1117 | maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; | 1173 | maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; |
1118 | shift = min(shift, maxshift); | 1174 | shift = min(shift, maxshift); |
1119 | while (offset >= timekeeper.cycle_interval) { | 1175 | while (offset >= tk->cycle_interval) { |
1120 | offset = logarithmic_accumulation(&timekeeper, offset, shift); | 1176 | offset = logarithmic_accumulation(tk, offset, shift); |
1121 | if(offset < timekeeper.cycle_interval<<shift) | 1177 | if (offset < tk->cycle_interval<<shift) |
1122 | shift--; | 1178 | shift--; |
1123 | } | 1179 | } |
1124 | 1180 | ||
1125 | /* correct the clock when NTP error is too big */ | 1181 | /* correct the clock when NTP error is too big */ |
1126 | timekeeping_adjust(&timekeeper, offset); | 1182 | timekeeping_adjust(tk, offset); |
1127 | 1183 | ||
1128 | 1184 | ||
1129 | /* | 1185 | /* |
@@ -1135,21 +1191,21 @@ static void update_wall_time(void) | |||
1135 | * the vsyscall implementations are converted to use xtime_nsec | 1191 | * the vsyscall implementations are converted to use xtime_nsec |
1136 | * (shifted nanoseconds), this can be killed. | 1192 | * (shifted nanoseconds), this can be killed. |
1137 | */ | 1193 | */ |
1138 | remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1); | 1194 | remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); |
1139 | timekeeper.xtime_nsec -= remainder; | 1195 | tk->xtime_nsec -= remainder; |
1140 | timekeeper.xtime_nsec += 1 << timekeeper.shift; | 1196 | tk->xtime_nsec += 1ULL << tk->shift; |
1141 | timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift; | 1197 | tk->ntp_error += remainder << tk->ntp_error_shift; |
1142 | 1198 | ||
1143 | /* | 1199 | /* |
1144 | * Finally, make sure that after the rounding | 1200 | * Finally, make sure that after the rounding |
1145 | * xtime_nsec isn't larger than NSEC_PER_SEC | 1201 | * xtime_nsec isn't larger than NSEC_PER_SEC |
1146 | */ | 1202 | */ |
1147 | accumulate_nsecs_to_secs(&timekeeper); | 1203 | accumulate_nsecs_to_secs(tk); |
1148 | 1204 | ||
1149 | timekeeping_update(&timekeeper, false); | 1205 | timekeeping_update(tk, false); |
1150 | 1206 | ||
1151 | out: | 1207 | out: |
1152 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | 1208 | write_sequnlock_irqrestore(&tk->lock, flags); |
1153 | 1209 | ||
1154 | } | 1210 | } |
1155 | 1211 | ||
@@ -1166,18 +1222,18 @@ out: | |||
1166 | */ | 1222 | */ |
1167 | void getboottime(struct timespec *ts) | 1223 | void getboottime(struct timespec *ts) |
1168 | { | 1224 | { |
1225 | struct timekeeper *tk = &timekeeper; | ||
1169 | struct timespec boottime = { | 1226 | struct timespec boottime = { |
1170 | .tv_sec = timekeeper.wall_to_monotonic.tv_sec + | 1227 | .tv_sec = tk->wall_to_monotonic.tv_sec + |
1171 | timekeeper.total_sleep_time.tv_sec, | 1228 | tk->total_sleep_time.tv_sec, |
1172 | .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + | 1229 | .tv_nsec = tk->wall_to_monotonic.tv_nsec + |
1173 | timekeeper.total_sleep_time.tv_nsec | 1230 | tk->total_sleep_time.tv_nsec |
1174 | }; | 1231 | }; |
1175 | 1232 | ||
1176 | set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); | 1233 | set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); |
1177 | } | 1234 | } |
1178 | EXPORT_SYMBOL_GPL(getboottime); | 1235 | EXPORT_SYMBOL_GPL(getboottime); |
1179 | 1236 | ||
1180 | |||
1181 | /** | 1237 | /** |
1182 | * get_monotonic_boottime - Returns monotonic time since boot | 1238 | * get_monotonic_boottime - Returns monotonic time since boot |
1183 | * @ts: pointer to the timespec to be set | 1239 | * @ts: pointer to the timespec to be set |
@@ -1189,22 +1245,25 @@ EXPORT_SYMBOL_GPL(getboottime); | |||
1189 | */ | 1245 | */ |
1190 | void get_monotonic_boottime(struct timespec *ts) | 1246 | void get_monotonic_boottime(struct timespec *ts) |
1191 | { | 1247 | { |
1248 | struct timekeeper *tk = &timekeeper; | ||
1192 | struct timespec tomono, sleep; | 1249 | struct timespec tomono, sleep; |
1250 | s64 nsec; | ||
1193 | unsigned int seq; | 1251 | unsigned int seq; |
1194 | 1252 | ||
1195 | WARN_ON(timekeeping_suspended); | 1253 | WARN_ON(timekeeping_suspended); |
1196 | 1254 | ||
1197 | do { | 1255 | do { |
1198 | seq = read_seqbegin(&timekeeper.lock); | 1256 | seq = read_seqbegin(&tk->lock); |
1199 | ts->tv_sec = timekeeper.xtime_sec; | 1257 | ts->tv_sec = tk->xtime_sec; |
1200 | ts->tv_nsec = timekeeping_get_ns(&timekeeper); | 1258 | nsec = timekeeping_get_ns(tk); |
1201 | tomono = timekeeper.wall_to_monotonic; | 1259 | tomono = tk->wall_to_monotonic; |
1202 | sleep = timekeeper.total_sleep_time; | 1260 | sleep = tk->total_sleep_time; |
1203 | 1261 | ||
1204 | } while (read_seqretry(&timekeeper.lock, seq)); | 1262 | } while (read_seqretry(&tk->lock, seq)); |
1205 | 1263 | ||
1206 | set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, | 1264 | ts->tv_sec += tomono.tv_sec + sleep.tv_sec; |
1207 | ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); | 1265 | ts->tv_nsec = 0; |
1266 | timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec); | ||
1208 | } | 1267 | } |
1209 | EXPORT_SYMBOL_GPL(get_monotonic_boottime); | 1268 | EXPORT_SYMBOL_GPL(get_monotonic_boottime); |
1210 | 1269 | ||
@@ -1231,31 +1290,38 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime); | |||
1231 | */ | 1290 | */ |
1232 | void monotonic_to_bootbased(struct timespec *ts) | 1291 | void monotonic_to_bootbased(struct timespec *ts) |
1233 | { | 1292 | { |
1234 | *ts = timespec_add(*ts, timekeeper.total_sleep_time); | 1293 | struct timekeeper *tk = &timekeeper; |
1294 | |||
1295 | *ts = timespec_add(*ts, tk->total_sleep_time); | ||
1235 | } | 1296 | } |
1236 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); | 1297 | EXPORT_SYMBOL_GPL(monotonic_to_bootbased); |
1237 | 1298 | ||
1238 | unsigned long get_seconds(void) | 1299 | unsigned long get_seconds(void) |
1239 | { | 1300 | { |
1240 | return timekeeper.xtime_sec; | 1301 | struct timekeeper *tk = &timekeeper; |
1302 | |||
1303 | return tk->xtime_sec; | ||
1241 | } | 1304 | } |
1242 | EXPORT_SYMBOL(get_seconds); | 1305 | EXPORT_SYMBOL(get_seconds); |
1243 | 1306 | ||
1244 | struct timespec __current_kernel_time(void) | 1307 | struct timespec __current_kernel_time(void) |
1245 | { | 1308 | { |
1246 | return tk_xtime(&timekeeper); | 1309 | struct timekeeper *tk = &timekeeper; |
1310 | |||
1311 | return tk_xtime(tk); | ||
1247 | } | 1312 | } |
1248 | 1313 | ||
1249 | struct timespec current_kernel_time(void) | 1314 | struct timespec current_kernel_time(void) |
1250 | { | 1315 | { |
1316 | struct timekeeper *tk = &timekeeper; | ||
1251 | struct timespec now; | 1317 | struct timespec now; |
1252 | unsigned long seq; | 1318 | unsigned long seq; |
1253 | 1319 | ||
1254 | do { | 1320 | do { |
1255 | seq = read_seqbegin(&timekeeper.lock); | 1321 | seq = read_seqbegin(&tk->lock); |
1256 | 1322 | ||
1257 | now = tk_xtime(&timekeeper); | 1323 | now = tk_xtime(tk); |
1258 | } while (read_seqretry(&timekeeper.lock, seq)); | 1324 | } while (read_seqretry(&tk->lock, seq)); |
1259 | 1325 | ||
1260 | return now; | 1326 | return now; |
1261 | } | 1327 | } |
@@ -1263,15 +1329,16 @@ EXPORT_SYMBOL(current_kernel_time); | |||
1263 | 1329 | ||
1264 | struct timespec get_monotonic_coarse(void) | 1330 | struct timespec get_monotonic_coarse(void) |
1265 | { | 1331 | { |
1332 | struct timekeeper *tk = &timekeeper; | ||
1266 | struct timespec now, mono; | 1333 | struct timespec now, mono; |
1267 | unsigned long seq; | 1334 | unsigned long seq; |
1268 | 1335 | ||
1269 | do { | 1336 | do { |
1270 | seq = read_seqbegin(&timekeeper.lock); | 1337 | seq = read_seqbegin(&tk->lock); |
1271 | 1338 | ||
1272 | now = tk_xtime(&timekeeper); | 1339 | now = tk_xtime(tk); |
1273 | mono = timekeeper.wall_to_monotonic; | 1340 | mono = tk->wall_to_monotonic; |
1274 | } while (read_seqretry(&timekeeper.lock, seq)); | 1341 | } while (read_seqretry(&tk->lock, seq)); |
1275 | 1342 | ||
1276 | set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, | 1343 | set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, |
1277 | now.tv_nsec + mono.tv_nsec); | 1344 | now.tv_nsec + mono.tv_nsec); |
@@ -1300,14 +1367,15 @@ void do_timer(unsigned long ticks) | |||
1300 | void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | 1367 | void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, |
1301 | struct timespec *wtom, struct timespec *sleep) | 1368 | struct timespec *wtom, struct timespec *sleep) |
1302 | { | 1369 | { |
1370 | struct timekeeper *tk = &timekeeper; | ||
1303 | unsigned long seq; | 1371 | unsigned long seq; |
1304 | 1372 | ||
1305 | do { | 1373 | do { |
1306 | seq = read_seqbegin(&timekeeper.lock); | 1374 | seq = read_seqbegin(&tk->lock); |
1307 | *xtim = tk_xtime(&timekeeper); | 1375 | *xtim = tk_xtime(tk); |
1308 | *wtom = timekeeper.wall_to_monotonic; | 1376 | *wtom = tk->wall_to_monotonic; |
1309 | *sleep = timekeeper.total_sleep_time; | 1377 | *sleep = tk->total_sleep_time; |
1310 | } while (read_seqretry(&timekeeper.lock, seq)); | 1378 | } while (read_seqretry(&tk->lock, seq)); |
1311 | } | 1379 | } |
1312 | 1380 | ||
1313 | #ifdef CONFIG_HIGH_RES_TIMERS | 1381 | #ifdef CONFIG_HIGH_RES_TIMERS |
@@ -1321,19 +1389,20 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
1321 | */ | 1389 | */ |
1322 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) | 1390 | ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) |
1323 | { | 1391 | { |
1392 | struct timekeeper *tk = &timekeeper; | ||
1324 | ktime_t now; | 1393 | ktime_t now; |
1325 | unsigned int seq; | 1394 | unsigned int seq; |
1326 | u64 secs, nsecs; | 1395 | u64 secs, nsecs; |
1327 | 1396 | ||
1328 | do { | 1397 | do { |
1329 | seq = read_seqbegin(&timekeeper.lock); | 1398 | seq = read_seqbegin(&tk->lock); |
1330 | 1399 | ||
1331 | secs = timekeeper.xtime_sec; | 1400 | secs = tk->xtime_sec; |
1332 | nsecs = timekeeping_get_ns(&timekeeper); | 1401 | nsecs = timekeeping_get_ns(tk); |
1333 | 1402 | ||
1334 | *offs_real = timekeeper.offs_real; | 1403 | *offs_real = tk->offs_real; |
1335 | *offs_boot = timekeeper.offs_boot; | 1404 | *offs_boot = tk->offs_boot; |
1336 | } while (read_seqretry(&timekeeper.lock, seq)); | 1405 | } while (read_seqretry(&tk->lock, seq)); |
1337 | 1406 | ||
1338 | now = ktime_add_ns(ktime_set(secs, 0), nsecs); | 1407 | now = ktime_add_ns(ktime_set(secs, 0), nsecs); |
1339 | now = ktime_sub(now, *offs_real); | 1408 | now = ktime_sub(now, *offs_real); |
@@ -1346,19 +1415,19 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) | |||
1346 | */ | 1415 | */ |
1347 | ktime_t ktime_get_monotonic_offset(void) | 1416 | ktime_t ktime_get_monotonic_offset(void) |
1348 | { | 1417 | { |
1418 | struct timekeeper *tk = &timekeeper; | ||
1349 | unsigned long seq; | 1419 | unsigned long seq; |
1350 | struct timespec wtom; | 1420 | struct timespec wtom; |
1351 | 1421 | ||
1352 | do { | 1422 | do { |
1353 | seq = read_seqbegin(&timekeeper.lock); | 1423 | seq = read_seqbegin(&tk->lock); |
1354 | wtom = timekeeper.wall_to_monotonic; | 1424 | wtom = tk->wall_to_monotonic; |
1355 | } while (read_seqretry(&timekeeper.lock, seq)); | 1425 | } while (read_seqretry(&tk->lock, seq)); |
1356 | 1426 | ||
1357 | return timespec_to_ktime(wtom); | 1427 | return timespec_to_ktime(wtom); |
1358 | } | 1428 | } |
1359 | EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); | 1429 | EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); |
1360 | 1430 | ||
1361 | |||
1362 | /** | 1431 | /** |
1363 | * xtime_update() - advances the timekeeping infrastructure | 1432 | * xtime_update() - advances the timekeeping infrastructure |
1364 | * @ticks: number of ticks, that have elapsed since the last call. | 1433 | * @ticks: number of ticks, that have elapsed since the last call. |
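Note: all of the timekeeping getters converted above share one access pattern: snapshot the needed fields under read_seqbegin()/read_seqretry() and only then do arithmetic on the copies, retrying if a writer raced the read. A reduced sketch of that reader loop, using the real seqlock API but a toy payload:

    #include <linux/seqlock.h>

    struct tk_sample {
            seqlock_t lock;
            long long sec;
            long long nsec;
    };

    static void tk_sample_read(struct tk_sample *s, long long *sec, long long *nsec)
    {
            unsigned long seq;

            do {
                    seq = read_seqbegin(&s->lock);  /* open a read-side critical section */
                    *sec = s->sec;                  /* copy everything needed... */
                    *nsec = s->nsec;
            } while (read_seqretry(&s->lock, seq)); /* ...and retry if a writer intervened */
    }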
diff --git a/kernel/timer.c b/kernel/timer.c index a61c09374eba..8c5e7b908c68 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -1407,13 +1407,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) | |||
1407 | 1407 | ||
1408 | #endif | 1408 | #endif |
1409 | 1409 | ||
1410 | #ifndef __alpha__ | ||
1411 | |||
1412 | /* | ||
1413 | * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this | ||
1414 | * should be moved into arch/i386 instead? | ||
1415 | */ | ||
1416 | |||
1417 | /** | 1410 | /** |
1418 | * sys_getpid - return the thread group id of the current process | 1411 | * sys_getpid - return the thread group id of the current process |
1419 | * | 1412 | * |
@@ -1469,8 +1462,6 @@ SYSCALL_DEFINE0(getegid) | |||
1469 | return from_kgid_munged(current_user_ns(), current_egid()); | 1462 | return from_kgid_munged(current_user_ns(), current_egid()); |
1470 | } | 1463 | } |
1471 | 1464 | ||
1472 | #endif | ||
1473 | |||
1474 | static void process_timeout(unsigned long __data) | 1465 | static void process_timeout(unsigned long __data) |
1475 | { | 1466 | { |
1476 | wake_up_process((struct task_struct *)__data); | 1467 | wake_up_process((struct task_struct *)__data); |
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fee3752ae8f6..8a6d2ee2086c 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -281,7 +281,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) | |||
281 | 281 | ||
282 | head = this_cpu_ptr(event_function.perf_events); | 282 | head = this_cpu_ptr(event_function.perf_events); |
283 | perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, | 283 | perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, |
284 | 1, &regs, head); | 284 | 1, &regs, head, NULL); |
285 | 285 | ||
286 | #undef ENTRY_SIZE | 286 | #undef ENTRY_SIZE |
287 | } | 287 | } |
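Every perf tracing call site touched in this series gains a trailing NULL argument to perf_trace_buf_submit(). Presumably the helper grew a final struct task_struct *task parameter so a sample can be directed at a specific task, with NULL preserving the old behaviour; the exact prototype is an assumption here, not quoted from this patch. A hedged sketch of a call site under that assumption:

#include <linux/ftrace_event.h>
#include <linux/percpu.h>

/*
 * Assumed post-series prototype (not taken from this diff):
 *   void perf_trace_buf_submit(void *raw_data, int size, int rctx,
 *                              u64 addr, u64 count, struct pt_regs *regs,
 *                              void *head, struct task_struct *task);
 */
static void example_submit(struct ftrace_event_call *call, void *entry,
			   int size, int rctx, struct pt_regs *regs)
{
	struct hlist_head *head = this_cpu_ptr(call->perf_events);

	/* NULL: no specific target task, sample is attributed as before */
	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
}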
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b31d3d5699fe..1a2117043bb1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, | |||
1002 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1002 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1003 | 1003 | ||
1004 | head = this_cpu_ptr(call->perf_events); | 1004 | head = this_cpu_ptr(call->perf_events); |
1005 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); | 1005 | perf_trace_buf_submit(entry, size, rctx, |
1006 | entry->ip, 1, regs, head, NULL); | ||
1006 | } | 1007 | } |
1007 | 1008 | ||
1008 | /* Kretprobe profile handler */ | 1009 | /* Kretprobe profile handler */ |
@@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, | |||
1033 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1034 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
1034 | 1035 | ||
1035 | head = this_cpu_ptr(call->perf_events); | 1036 | head = this_cpu_ptr(call->perf_events); |
1036 | perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); | 1037 | perf_trace_buf_submit(entry, size, rctx, |
1038 | entry->ret_ip, 1, regs, head, NULL); | ||
1037 | } | 1039 | } |
1038 | #endif /* CONFIG_PERF_EVENTS */ | 1040 | #endif /* CONFIG_PERF_EVENTS */ |
1039 | 1041 | ||
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 96fc73369099..6b245f64c8dd 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -506,6 +506,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
506 | int size; | 506 | int size; |
507 | 507 | ||
508 | syscall_nr = syscall_get_nr(current, regs); | 508 | syscall_nr = syscall_get_nr(current, regs); |
509 | if (syscall_nr < 0) | ||
510 | return; | ||
509 | if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) | 511 | if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) |
510 | return; | 512 | return; |
511 | 513 | ||
@@ -532,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
532 | (unsigned long *)&rec->args); | 534 | (unsigned long *)&rec->args); |
533 | 535 | ||
534 | head = this_cpu_ptr(sys_data->enter_event->perf_events); | 536 | head = this_cpu_ptr(sys_data->enter_event->perf_events); |
535 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); | 537 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); |
536 | } | 538 | } |
537 | 539 | ||
538 | int perf_sysenter_enable(struct ftrace_event_call *call) | 540 | int perf_sysenter_enable(struct ftrace_event_call *call) |
@@ -580,6 +582,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
580 | int size; | 582 | int size; |
581 | 583 | ||
582 | syscall_nr = syscall_get_nr(current, regs); | 584 | syscall_nr = syscall_get_nr(current, regs); |
585 | if (syscall_nr < 0) | ||
586 | return; | ||
583 | if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) | 587 | if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) |
584 | return; | 588 | return; |
585 | 589 | ||
@@ -608,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
608 | rec->ret = syscall_get_return_value(current, regs); | 612 | rec->ret = syscall_get_return_value(current, regs); |
609 | 613 | ||
610 | head = this_cpu_ptr(sys_data->exit_event->perf_events); | 614 | head = this_cpu_ptr(sys_data->exit_event->perf_events); |
611 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); | 615 | perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); |
612 | } | 616 | } |
613 | 617 | ||
614 | int perf_sysexit_enable(struct ftrace_event_call *call) | 618 | int perf_sysexit_enable(struct ftrace_event_call *call) |
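Both perf_syscall_enter() and perf_syscall_exit() now return early when syscall_get_nr() reports a negative number, which it can do when the hook fires outside a real system call; using that value as a bitmap index would otherwise read past the enabled_perf_*_syscalls bitmaps. A condensed sketch of the guard (helper name and bitmap parameter are illustrative):

#include <asm/syscall.h>
#include <linux/bitops.h>
#include <linux/sched.h>

/* Never index a per-syscall table with a negative syscall number. */
static bool syscall_event_wanted(struct pt_regs *regs,
				 const unsigned long *enabled_bitmap)
{
	int syscall_nr = syscall_get_nr(current, regs);

	if (syscall_nr < 0)		/* not inside a real syscall */
		return false;

	return test_bit(syscall_nr, enabled_bitmap);
}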
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2b36ac68549e..03003cd7dd96 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) | |||
670 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); | 670 | call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); |
671 | 671 | ||
672 | head = this_cpu_ptr(call->perf_events); | 672 | head = this_cpu_ptr(call->perf_events); |
673 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); | 673 | perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); |
674 | 674 | ||
675 | out: | 675 | out: |
676 | preempt_enable(); | 676 | preempt_enable(); |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 69add8a9da68..4b1dfba70f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -575,7 +575,7 @@ out: | |||
575 | /* | 575 | /* |
576 | * Create/destroy watchdog threads as CPUs come and go: | 576 | * Create/destroy watchdog threads as CPUs come and go: |
577 | */ | 577 | */ |
578 | static int | 578 | static int __cpuinit |
579 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | 579 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) |
580 | { | 580 | { |
581 | int hotcpu = (unsigned long)hcpu; | 581 | int hotcpu = (unsigned long)hcpu; |
@@ -610,27 +610,10 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
610 | return NOTIFY_OK; | 610 | return NOTIFY_OK; |
611 | } | 611 | } |
612 | 612 | ||
613 | static struct notifier_block cpu_nfb = { | 613 | static struct notifier_block __cpuinitdata cpu_nfb = { |
614 | .notifier_call = cpu_callback | 614 | .notifier_call = cpu_callback |
615 | }; | 615 | }; |
616 | 616 | ||
617 | #ifdef CONFIG_SUSPEND | ||
618 | /* | ||
619 | * On exit from suspend we force an offline->online transition on the boot CPU | ||
620 | * so that the PMU state that was lost while in suspended state gets set up | ||
621 | * properly for the boot CPU. This information is required for restarting the | ||
622 | * NMI watchdog. | ||
623 | */ | ||
624 | void lockup_detector_bootcpu_resume(void) | ||
625 | { | ||
626 | void *cpu = (void *)(long)smp_processor_id(); | ||
627 | |||
628 | cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu); | ||
629 | cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu); | ||
630 | cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu); | ||
631 | } | ||
632 | #endif | ||
633 | |||
634 | void __init lockup_detector_init(void) | 617 | void __init lockup_detector_init(void) |
635 | { | 618 | { |
636 | void *cpu = (void *)(long)smp_processor_id(); | 619 | void *cpu = (void *)(long)smp_processor_id(); |
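The watchdog hunks re-apply __cpuinit/__cpuinitdata to the hotplug notifier so the callback and its notifier_block can be discarded when CPU hotplug support is compiled out, and they drop the suspend-time boot-CPU resume hook. A small sketch of the annotation pattern on an otherwise empty notifier (names are illustrative):

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>

/* __cpuinit code may be freed after boot when CONFIG_HOTPLUG_CPU=n. */
static int __cpuinit example_cpu_callback(struct notifier_block *nfb,
					  unsigned long action, void *hcpu)
{
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata example_cpu_nfb = {
	.notifier_call = example_cpu_callback,
};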
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 692d97628a10..3c5a79e2134c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -66,6 +66,7 @@ enum { | |||
66 | 66 | ||
67 | /* pool flags */ | 67 | /* pool flags */ |
68 | POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ | 68 | POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ |
69 | POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ | ||
69 | 70 | ||
70 | /* worker flags */ | 71 | /* worker flags */ |
71 | WORKER_STARTED = 1 << 0, /* started */ | 72 | WORKER_STARTED = 1 << 0, /* started */ |
@@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) | |||
652 | /* Do we have too many workers and should some go away? */ | 653 | /* Do we have too many workers and should some go away? */ |
653 | static bool too_many_workers(struct worker_pool *pool) | 654 | static bool too_many_workers(struct worker_pool *pool) |
654 | { | 655 | { |
655 | bool managing = mutex_is_locked(&pool->manager_mutex); | 656 | bool managing = pool->flags & POOL_MANAGING_WORKERS; |
656 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ | 657 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ |
657 | int nr_busy = pool->nr_workers - nr_idle; | 658 | int nr_busy = pool->nr_workers - nr_idle; |
658 | 659 | ||
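too_many_workers() previously inferred "a manager is running" from mutex_is_locked(&pool->manager_mutex); with the new POOL_MANAGING_WORKERS flag that state is tracked explicitly under gcwq->lock, and manager_mutex is freed up for hotplug exclusion (see the manage_workers() hunk further below). A sketch of the resulting idle/busy heuristic; the ratio constant is an assumption about workqueue.c's existing tuning, not something this patch introduces:

#include <linux/types.h>

#define EX_MAX_IDLE_WORKERS_RATIO 4	/* assumed value of the in-tree constant */
#define EX_POOL_MANAGING_WORKERS (1 << 1)

struct ex_pool {
	unsigned int flags;		/* POOL_* flags, protected by gcwq->lock */
	int nr_idle;
	int nr_workers;
};

static bool ex_too_many_workers(struct ex_pool *pool)
{
	bool managing = pool->flags & EX_POOL_MANAGING_WORKERS;
	int nr_idle = pool->nr_idle + managing;	/* manager counts as idle */
	int nr_busy = pool->nr_workers - nr_idle;

	return nr_idle > 2 && (nr_idle - 2) * EX_MAX_IDLE_WORKERS_RATIO >= nr_busy;
}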
@@ -1326,6 +1327,15 @@ static void idle_worker_rebind(struct worker *worker) | |||
1326 | 1327 | ||
1327 | /* we did our part, wait for rebind_workers() to finish up */ | 1328 | /* we did our part, wait for rebind_workers() to finish up */ |
1328 | wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); | 1329 | wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); |
1330 | |||
1331 | /* | ||
1332 | * rebind_workers() shouldn't finish until all workers passed the | ||
1333 | * above WORKER_REBIND wait. Tell it when done. | ||
1334 | */ | ||
1335 | spin_lock_irq(&worker->pool->gcwq->lock); | ||
1336 | if (!--worker->idle_rebind->cnt) | ||
1337 | complete(&worker->idle_rebind->done); | ||
1338 | spin_unlock_irq(&worker->pool->gcwq->lock); | ||
1329 | } | 1339 | } |
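idle_worker_rebind() now ends with one side of a handshake: every idle worker that has passed the WORKER_REBIND wait decrements a shared counter under gcwq->lock and fires a completion when the count hits zero, so rebind_workers() (further below) can be sure all idle workers are past the wait before another hotplug cycle may begin. A minimal sketch of that counted-completion idiom with stand-in names:

#include <linux/completion.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct rebind_sync {
	int cnt;			/* outstanding waiters, +1 for the releaser */
	struct completion done;
	spinlock_t lock;
};

static void rebind_sync_init(struct rebind_sync *sync, int nr_waiters)
{
	sync->cnt = nr_waiters + 1;	/* +1: the releaser holds a reference too */
	init_completion(&sync->done);
	spin_lock_init(&sync->lock);
}

/* Called by each waiter once it has passed the barrier. */
static void rebind_sync_done(struct rebind_sync *sync)
{
	spin_lock_irq(&sync->lock);
	if (!--sync->cnt)
		complete(&sync->done);
	spin_unlock_irq(&sync->lock);
}

/* Called by the releaser after waking all waiters. */
static void rebind_sync_wait(struct rebind_sync *sync)
{
	bool wait;

	spin_lock_irq(&sync->lock);
	wait = --sync->cnt != 0;	/* drop the releaser's own reference */
	spin_unlock_irq(&sync->lock);

	if (wait)
		wait_for_completion(&sync->done);
}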
1330 | 1340 | ||
1331 | /* | 1341 | /* |
@@ -1339,8 +1349,16 @@ static void busy_worker_rebind_fn(struct work_struct *work) | |||
1339 | struct worker *worker = container_of(work, struct worker, rebind_work); | 1349 | struct worker *worker = container_of(work, struct worker, rebind_work); |
1340 | struct global_cwq *gcwq = worker->pool->gcwq; | 1350 | struct global_cwq *gcwq = worker->pool->gcwq; |
1341 | 1351 | ||
1342 | if (worker_maybe_bind_and_lock(worker)) | 1352 | worker_maybe_bind_and_lock(worker); |
1343 | worker_clr_flags(worker, WORKER_REBIND); | 1353 | |
1354 | /* | ||
1355 | * %WORKER_REBIND must be cleared even if the above binding failed; | ||
1356 | * otherwise, we may confuse the next CPU_UP cycle or oops / get | ||
1357 | * stuck by calling idle_worker_rebind() prematurely. If CPU went | ||
1358 | * down again inbetween, %WORKER_UNBOUND would be set, so clearing | ||
1359 | * %WORKER_REBIND is always safe. | ||
1360 | */ | ||
1361 | worker_clr_flags(worker, WORKER_REBIND); | ||
1344 | 1362 | ||
1345 | spin_unlock_irq(&gcwq->lock); | 1363 | spin_unlock_irq(&gcwq->lock); |
1346 | } | 1364 | } |
@@ -1396,12 +1414,15 @@ retry: | |||
1396 | /* set REBIND and kick idle ones, we'll wait for these later */ | 1414 | /* set REBIND and kick idle ones, we'll wait for these later */ |
1397 | for_each_worker_pool(pool, gcwq) { | 1415 | for_each_worker_pool(pool, gcwq) { |
1398 | list_for_each_entry(worker, &pool->idle_list, entry) { | 1416 | list_for_each_entry(worker, &pool->idle_list, entry) { |
1417 | unsigned long worker_flags = worker->flags; | ||
1418 | |||
1399 | if (worker->flags & WORKER_REBIND) | 1419 | if (worker->flags & WORKER_REBIND) |
1400 | continue; | 1420 | continue; |
1401 | 1421 | ||
1402 | /* morph UNBOUND to REBIND */ | 1422 | /* morph UNBOUND to REBIND atomically */ |
1403 | worker->flags &= ~WORKER_UNBOUND; | 1423 | worker_flags &= ~WORKER_UNBOUND; |
1404 | worker->flags |= WORKER_REBIND; | 1424 | worker_flags |= WORKER_REBIND; |
1425 | ACCESS_ONCE(worker->flags) = worker_flags; | ||
1405 | 1426 | ||
1406 | idle_rebind.cnt++; | 1427 | idle_rebind.cnt++; |
1407 | worker->idle_rebind = &idle_rebind; | 1428 | worker->idle_rebind = &idle_rebind; |
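Morphing WORKER_UNBOUND into WORKER_REBIND used to take two separate read-modify-write statements on worker->flags; both rebind loops now compute the new value in a local and publish it with a single ACCESS_ONCE() store, so no observer can see a window where both bits, or neither bit, are set. A small sketch of the idiom with stand-in flag names:

#include <linux/compiler.h>

#define EX_UNBOUND (1 << 0)
#define EX_REBIND  (1 << 1)

static void morph_unbound_to_rebind(unsigned long *flags)
{
	unsigned long new_flags = *flags;

	/* compute the final value privately ... */
	new_flags &= ~EX_UNBOUND;
	new_flags |= EX_REBIND;

	/* ... then publish it with one store, never an intermediate state */
	ACCESS_ONCE(*flags) = new_flags;
}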
@@ -1419,25 +1440,15 @@ retry: | |||
1419 | goto retry; | 1440 | goto retry; |
1420 | } | 1441 | } |
1421 | 1442 | ||
1422 | /* | 1443 | /* all idle workers are rebound, rebind busy workers */ |
1423 | * All idle workers are rebound and waiting for %WORKER_REBIND to | ||
1424 | * be cleared inside idle_worker_rebind(). Clear and release. | ||
1425 | * Clearing %WORKER_REBIND from this foreign context is safe | ||
1426 | * because these workers are still guaranteed to be idle. | ||
1427 | */ | ||
1428 | for_each_worker_pool(pool, gcwq) | ||
1429 | list_for_each_entry(worker, &pool->idle_list, entry) | ||
1430 | worker->flags &= ~WORKER_REBIND; | ||
1431 | |||
1432 | wake_up_all(&gcwq->rebind_hold); | ||
1433 | |||
1434 | /* rebind busy workers */ | ||
1435 | for_each_busy_worker(worker, i, pos, gcwq) { | 1444 | for_each_busy_worker(worker, i, pos, gcwq) { |
1436 | struct work_struct *rebind_work = &worker->rebind_work; | 1445 | struct work_struct *rebind_work = &worker->rebind_work; |
1446 | unsigned long worker_flags = worker->flags; | ||
1437 | 1447 | ||
1438 | /* morph UNBOUND to REBIND */ | 1448 | /* morph UNBOUND to REBIND atomically */ |
1439 | worker->flags &= ~WORKER_UNBOUND; | 1449 | worker_flags &= ~WORKER_UNBOUND; |
1440 | worker->flags |= WORKER_REBIND; | 1450 | worker_flags |= WORKER_REBIND; |
1451 | ACCESS_ONCE(worker->flags) = worker_flags; | ||
1441 | 1452 | ||
1442 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, | 1453 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, |
1443 | work_data_bits(rebind_work))) | 1454 | work_data_bits(rebind_work))) |
@@ -1449,6 +1460,34 @@ retry: | |||
1449 | worker->scheduled.next, | 1460 | worker->scheduled.next, |
1450 | work_color_to_flags(WORK_NO_COLOR)); | 1461 | work_color_to_flags(WORK_NO_COLOR)); |
1451 | } | 1462 | } |
1463 | |||
1464 | /* | ||
1465 | * All idle workers are rebound and waiting for %WORKER_REBIND to | ||
1466 | * be cleared inside idle_worker_rebind(). Clear and release. | ||
1467 | * Clearing %WORKER_REBIND from this foreign context is safe | ||
1468 | * because these workers are still guaranteed to be idle. | ||
1469 | * | ||
1470 | * We need to make sure all idle workers passed WORKER_REBIND wait | ||
1471 | * in idle_worker_rebind() before returning; otherwise, workers can | ||
1472 | * get stuck at the wait if hotplug cycle repeats. | ||
1473 | */ | ||
1474 | idle_rebind.cnt = 1; | ||
1475 | INIT_COMPLETION(idle_rebind.done); | ||
1476 | |||
1477 | for_each_worker_pool(pool, gcwq) { | ||
1478 | list_for_each_entry(worker, &pool->idle_list, entry) { | ||
1479 | worker->flags &= ~WORKER_REBIND; | ||
1480 | idle_rebind.cnt++; | ||
1481 | } | ||
1482 | } | ||
1483 | |||
1484 | wake_up_all(&gcwq->rebind_hold); | ||
1485 | |||
1486 | if (--idle_rebind.cnt) { | ||
1487 | spin_unlock_irq(&gcwq->lock); | ||
1488 | wait_for_completion(&idle_rebind.done); | ||
1489 | spin_lock_irq(&gcwq->lock); | ||
1490 | } | ||
1452 | } | 1491 | } |
1453 | 1492 | ||
1454 | static struct worker *alloc_worker(void) | 1493 | static struct worker *alloc_worker(void) |
@@ -1794,9 +1833,45 @@ static bool manage_workers(struct worker *worker) | |||
1794 | struct worker_pool *pool = worker->pool; | 1833 | struct worker_pool *pool = worker->pool; |
1795 | bool ret = false; | 1834 | bool ret = false; |
1796 | 1835 | ||
1797 | if (!mutex_trylock(&pool->manager_mutex)) | 1836 | if (pool->flags & POOL_MANAGING_WORKERS) |
1798 | return ret; | 1837 | return ret; |
1799 | 1838 | ||
1839 | pool->flags |= POOL_MANAGING_WORKERS; | ||
1840 | |||
1841 | /* | ||
1842 | * To simplify both worker management and CPU hotplug, hold off | ||
1843 | * management while hotplug is in progress. CPU hotplug path can't | ||
1844 | * grab %POOL_MANAGING_WORKERS to achieve this because that can | ||
1845 | * lead to idle worker depletion (all become busy thinking someone | ||
1846 | * else is managing) which in turn can result in deadlock under | ||
1847 | * extreme circumstances. Use @pool->manager_mutex to synchronize | ||
1848 | * manager against CPU hotplug. | ||
1849 | * | ||
1850 | * manager_mutex would always be free unless CPU hotplug is in | ||
1851 | * progress. trylock first without dropping @gcwq->lock. | ||
1852 | */ | ||
1853 | if (unlikely(!mutex_trylock(&pool->manager_mutex))) { | ||
1854 | spin_unlock_irq(&pool->gcwq->lock); | ||
1855 | mutex_lock(&pool->manager_mutex); | ||
1856 | /* | ||
1857 | * CPU hotplug could have happened while we were waiting | ||
1858 | * for manager_mutex. Hotplug itself can't handle us | ||
1859 | * because manager isn't either on idle or busy list, and | ||
1860 | * @gcwq's state and ours could have deviated. | ||
1861 | * | ||
1862 | * As hotplug is now excluded via manager_mutex, we can | ||
1863 | * simply try to bind. It will succeed or fail depending | ||
1864 | * on @gcwq's current state. Try it and adjust | ||
1865 | * %WORKER_UNBOUND accordingly. | ||
1866 | */ | ||
1867 | if (worker_maybe_bind_and_lock(worker)) | ||
1868 | worker->flags &= ~WORKER_UNBOUND; | ||
1869 | else | ||
1870 | worker->flags |= WORKER_UNBOUND; | ||
1871 | |||
1872 | ret = true; | ||
1873 | } | ||
1874 | |||
1800 | pool->flags &= ~POOL_MANAGE_WORKERS; | 1875 | pool->flags &= ~POOL_MANAGE_WORKERS; |
1801 | 1876 | ||
1802 | /* | 1877 | /* |
@@ -1806,6 +1881,7 @@ static bool manage_workers(struct worker *worker) | |||
1806 | ret |= maybe_destroy_workers(pool); | 1881 | ret |= maybe_destroy_workers(pool); |
1807 | ret |= maybe_create_worker(pool); | 1882 | ret |= maybe_create_worker(pool); |
1808 | 1883 | ||
1884 | pool->flags &= ~POOL_MANAGING_WORKERS; | ||
1809 | mutex_unlock(&pool->manager_mutex); | 1885 | mutex_unlock(&pool->manager_mutex); |
1810 | return ret; | 1886 | return ret; |
1811 | } | 1887 | } |
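manage_workers() now separates two concerns: POOL_MANAGING_WORKERS is the cheap "someone is already managing" marker tested under gcwq->lock, while manager_mutex exists only to exclude CPU hotplug. Since hotplug rarely holds the mutex, the manager trylocks first without dropping gcwq->lock and only falls into the slow path (drop the lock, sleep on the mutex, then re-bind and fix up WORKER_UNBOUND) when hotplug really is in flight. A compressed sketch of that trylock-then-block shape, with stand-in types and the rebinding step omitted:

#include <linux/compiler.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/types.h>

#define EX_MANAGING (1 << 1)

struct ex_mgr_pool {
	spinlock_t *busy_lock;		/* stands in for gcwq->lock, held on entry */
	struct mutex hotplug_mutex;	/* stands in for manager_mutex */
	unsigned int flags;
};

/* Returns true if we slept; the caller must then revalidate its state. */
static bool ex_start_managing(struct ex_mgr_pool *pool)
{
	bool slept = false;

	pool->flags |= EX_MANAGING;	/* visible to the idle/busy heuristic */

	if (unlikely(!mutex_trylock(&pool->hotplug_mutex))) {
		spin_unlock_irq(pool->busy_lock);
		mutex_lock(&pool->hotplug_mutex);	/* wait out hotplug */
		spin_lock_irq(pool->busy_lock);
		slept = true;
	}
	return slept;
}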
@@ -3500,18 +3576,17 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, | |||
3500 | #ifdef CONFIG_SMP | 3576 | #ifdef CONFIG_SMP |
3501 | 3577 | ||
3502 | struct work_for_cpu { | 3578 | struct work_for_cpu { |
3503 | struct completion completion; | 3579 | struct work_struct work; |
3504 | long (*fn)(void *); | 3580 | long (*fn)(void *); |
3505 | void *arg; | 3581 | void *arg; |
3506 | long ret; | 3582 | long ret; |
3507 | }; | 3583 | }; |
3508 | 3584 | ||
3509 | static int do_work_for_cpu(void *_wfc) | 3585 | static void work_for_cpu_fn(struct work_struct *work) |
3510 | { | 3586 | { |
3511 | struct work_for_cpu *wfc = _wfc; | 3587 | struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); |
3588 | |||
3512 | wfc->ret = wfc->fn(wfc->arg); | 3589 | wfc->ret = wfc->fn(wfc->arg); |
3513 | complete(&wfc->completion); | ||
3514 | return 0; | ||
3515 | } | 3590 | } |
3516 | 3591 | ||
3517 | /** | 3592 | /** |
@@ -3526,19 +3601,11 @@ static int do_work_for_cpu(void *_wfc) | |||
3526 | */ | 3601 | */ |
3527 | long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) | 3602 | long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) |
3528 | { | 3603 | { |
3529 | struct task_struct *sub_thread; | 3604 | struct work_for_cpu wfc = { .fn = fn, .arg = arg }; |
3530 | struct work_for_cpu wfc = { | ||
3531 | .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion), | ||
3532 | .fn = fn, | ||
3533 | .arg = arg, | ||
3534 | }; | ||
3535 | 3605 | ||
3536 | sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu"); | 3606 | INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); |
3537 | if (IS_ERR(sub_thread)) | 3607 | schedule_work_on(cpu, &wfc.work); |
3538 | return PTR_ERR(sub_thread); | 3608 | flush_work(&wfc.work); |
3539 | kthread_bind(sub_thread, cpu); | ||
3540 | wake_up_process(sub_thread); | ||
3541 | wait_for_completion(&wfc.completion); | ||
3542 | return wfc.ret; | 3609 | return wfc.ret; |
3543 | } | 3610 | } |
3544 | EXPORT_SYMBOL_GPL(work_on_cpu); | 3611 | EXPORT_SYMBOL_GPL(work_on_cpu); |
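work_on_cpu() keeps its signature but is now built on the workqueue itself: an on-stack work item set up with INIT_WORK_ONSTACK() is queued to the target CPU via schedule_work_on(), and flush_work() supplies the wait-for-result behaviour the dedicated kthread and completion used to provide. A hypothetical caller (function and argument names made up for illustration) is unchanged by the rewrite:

#include <linux/workqueue.h>

/* Hypothetical probe that must run on the CPU whose data it inspects. */
static long read_local_counter(void *arg)
{
	unsigned int *slot = arg;

	return (long)*slot;	/* executed by a worker pinned to the target CPU */
}

static long sample_cpu(unsigned int cpu, unsigned int *slot)
{
	/*
	 * Blocks until read_local_counter() has run on @cpu and returns
	 * its result.
	 */
	return work_on_cpu(cpu, read_local_counter, slot);
}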