Diffstat (limited to 'kernel')
-rw-r--r--  kernel/auditfilter.c          |   2
-rw-r--r--  kernel/exit.c                 |  31
-rw-r--r--  kernel/fork.c                 |   9
-rw-r--r--  kernel/futex.c                | 269
-rw-r--r--  kernel/futex_compat.c         |   9
-rw-r--r--  kernel/irq/spurious.c         |  46
-rw-r--r--  kernel/kallsyms.c             |   3
-rw-r--r--  kernel/kthread.c              |   7
-rw-r--r--  kernel/power/disk.c           |   3
-rw-r--r--  kernel/power/main.c           |  19
-rw-r--r--  kernel/power/process.c        |  57
-rw-r--r--  kernel/power/swap.c           |   2
-rw-r--r--  kernel/profile.c              |   1
-rw-r--r--  kernel/rtmutex.c              |  24
-rw-r--r--  kernel/sched.c                |   4
-rw-r--r--  kernel/signal.c               |  38
-rw-r--r--  kernel/sysctl.c               |   2
-rw-r--r--  kernel/time/clocksource.c     |  10
-rw-r--r--  kernel/time/ntp.c             |   2
-rw-r--r--  kernel/time/tick-broadcast.c  |  17
-rw-r--r--  kernel/time/tick-sched.c      |  28
-rw-r--r--  kernel/time/timekeeping.c     |   2
-rw-r--r--  kernel/time/timer_stats.c     |  44
-rw-r--r--  kernel/timer.c                |  12
-rw-r--r--  kernel/workqueue.c            |  84
25 files changed, 457 insertions(+), 268 deletions(-)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6c61263ff96d..74cc0fc6bb81 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -311,6 +311,7 @@ int audit_match_class(int class, unsigned syscall)
 	return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall);
 }
 
+#ifdef CONFIG_AUDITSYSCALL
 static inline int audit_match_class_bits(int class, u32 *mask)
 {
 	int i;
@@ -347,6 +348,7 @@ static int audit_match_signal(struct audit_entry *entry)
 		return 1;
 	}
 }
+#endif
 
 /* Common user-space to kernel rule translation. */
 static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
diff --git a/kernel/exit.c b/kernel/exit.c
index c6d14b8008dd..5c8ecbaa19a5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -762,11 +762,8 @@ static void exit_notify(struct task_struct *tsk)
 	read_lock(&tasklist_lock);
 	spin_lock_irq(&tsk->sighand->siglock);
 	for (t = next_thread(tsk); t != tsk; t = next_thread(t))
-		if (!signal_pending(t) && !(t->flags & PF_EXITING)) {
-			recalc_sigpending_tsk(t);
-			if (signal_pending(t))
-				signal_wake_up(t, 0);
-		}
+		if (!signal_pending(t) && !(t->flags & PF_EXITING))
+			recalc_sigpending_and_wake(t);
 	spin_unlock_irq(&tsk->sighand->siglock);
 	read_unlock(&tasklist_lock);
 }
@@ -895,13 +892,29 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (unlikely(tsk->flags & PF_EXITING)) {
 		printk(KERN_ALERT
 			"Fixing recursive fault but reboot is needed!\n");
+		/*
+		 * We can do this unlocked here. The futex code uses
+		 * this flag just to verify whether the pi state
+		 * cleanup has been done or not. In the worst case it
+		 * loops once more. We pretend that the cleanup was
+		 * done as there is no way to return. Either the
+		 * OWNER_DIED bit is set by now or we push the blocked
+		 * task into the wait for ever nirwana as well.
+		 */
+		tsk->flags |= PF_EXITPIDONE;
 		if (tsk->io_context)
 			exit_io_context();
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule();
 	}
 
+	/*
+	 * tsk->flags are checked in the futex code to protect against
+	 * an exiting task cleaning up the robust pi futexes.
+	 */
+	spin_lock_irq(&tsk->pi_lock);
 	tsk->flags |= PF_EXITING;
+	spin_unlock_irq(&tsk->pi_lock);
 
 	if (unlikely(in_atomic()))
 		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -915,7 +928,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	}
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
-		hrtimer_cancel(&tsk->signal->real_timer);
+		hrtimer_cancel(&tsk->signal->real_timer);
 		exit_itimers(tsk->signal);
 	}
 	acct_collect(code, group_dead);
@@ -968,6 +981,12 @@ fastcall NORET_TYPE void do_exit(long code)
 	 * Make sure we are holding no locks:
 	 */
 	debug_check_no_locks_held(tsk);
+	/*
+	 * We can do this unlocked here. The futex code uses this flag
+	 * just to verify whether the pi state cleanup has been done
+	 * or not. In the worst case it loops once more.
+	 */
+	tsk->flags |= PF_EXITPIDONE;
 
 	if (tsk->io_context)
 		exit_io_context();
diff --git a/kernel/fork.c b/kernel/fork.c
index 49530e40ea8b..73ad5cda1bcd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -45,6 +45,7 @@
 #include <linux/acct.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
+#include <linux/freezer.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
@@ -1405,7 +1406,9 @@ long do_fork(unsigned long clone_flags,
 	}
 
 	if (clone_flags & CLONE_VFORK) {
+		freezer_do_not_count();
 		wait_for_completion(&vfork);
+		freezer_count();
 		if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
 			current->ptrace_message = nr;
 			ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
@@ -1427,10 +1430,8 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep,
 {
 	struct sighand_struct *sighand = data;
 
-	if (flags & SLAB_CTOR_CONSTRUCTOR) {
-		spin_lock_init(&sighand->siglock);
-		INIT_LIST_HEAD(&sighand->signalfd_list);
-	}
+	spin_lock_init(&sighand->siglock);
+	INIT_LIST_HEAD(&sighand->signalfd_list);
 }
 
 void __init proc_caches_init(void)
diff --git a/kernel/futex.c b/kernel/futex.c
index b7ce15c67e32..3b7f7713d9a4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -430,10 +430,6 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 		p = NULL;
 		goto out_unlock;
 	}
-	if (p->exit_state != 0) {
-		p = NULL;
-		goto out_unlock;
-	}
 	get_task_struct(p);
 out_unlock:
 	rcu_read_unlock();
@@ -502,7 +498,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 	struct futex_q *this, *next;
 	struct plist_head *head;
 	struct task_struct *p;
-	pid_t pid;
+	pid_t pid = uval & FUTEX_TID_MASK;
 
 	head = &hb->chain;
 
@@ -520,6 +516,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 				return -EINVAL;
 
 			WARN_ON(!atomic_read(&pi_state->refcount));
+			WARN_ON(pid && pi_state->owner &&
+				pi_state->owner->pid != pid);
 
 			atomic_inc(&pi_state->refcount);
 			*ps = pi_state;
@@ -530,15 +528,33 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 
 	/*
 	 * We are the first waiter - try to look up the real owner and attach
-	 * the new pi_state to it, but bail out when the owner died bit is set
-	 * and TID = 0:
+	 * the new pi_state to it, but bail out when TID = 0
 	 */
-	pid = uval & FUTEX_TID_MASK;
-	if (!pid && (uval & FUTEX_OWNER_DIED))
+	if (!pid)
 		return -ESRCH;
 	p = futex_find_get_task(pid);
-	if (!p)
-		return -ESRCH;
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	/*
+	 * We need to look at the task state flags to figure out,
+	 * whether the task is exiting. To protect against the do_exit
+	 * change of the task flags, we do this protected by
+	 * p->pi_lock:
+	 */
+	spin_lock_irq(&p->pi_lock);
+	if (unlikely(p->flags & PF_EXITING)) {
+		/*
+		 * The task is on the way out. When PF_EXITPIDONE is
+		 * set, we know that the task has finished the
+		 * cleanup:
+		 */
+		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+
+		spin_unlock_irq(&p->pi_lock);
+		put_task_struct(p);
+		return ret;
+	}
 
 	pi_state = alloc_pi_state();
 
@@ -551,7 +567,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 	/* Store the key for possible exit cleanups: */
 	pi_state->key = *key;
 
-	spin_lock_irq(&p->pi_lock);
 	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &p->pi_state_list);
 	pi_state->owner = p;
@@ -618,6 +633,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	 * preserve the owner died bit.)
 	 */
 	if (!(uval & FUTEX_OWNER_DIED)) {
+		int ret = 0;
+
 		newval = FUTEX_WAITERS | new_owner->pid;
 		/* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
 		newval |= (uval & FUTEX_WAITER_REQUEUED);
@@ -625,10 +642,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 		pagefault_disable();
 		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
 		pagefault_enable();
+
 		if (curval == -EFAULT)
-			return -EFAULT;
+			ret = -EFAULT;
 		if (curval != uval)
-			return -EINVAL;
+			ret = -EINVAL;
+		if (ret) {
+			spin_unlock(&pi_state->pi_mutex.wait_lock);
+			return ret;
+		}
 	}
 
 	spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1174,7 +1196,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
 #ifdef CONFIG_DEBUG_PI_LIST
 			this->list.plist.lock = &hb2->lock;
 #endif
-		}
+		}
 		this->key = key2;
 		get_futex_key_refs(&key2);
 		drop_count++;
@@ -1326,12 +1348,10 @@ static void unqueue_me_pi(struct futex_q *q)
 /*
  * Fixup the pi_state owner with current.
  *
- * The cur->mm semaphore must be held, it is released at return of this
- * function.
+ * Must be called with hash bucket lock held and mm->sem held for non
+ * private futexes.
  */
-static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
-				struct futex_q *q,
-				struct futex_hash_bucket *hb,
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 				struct task_struct *curr)
 {
 	u32 newtid = curr->pid | FUTEX_WAITERS;
@@ -1355,23 +1375,24 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
 	list_add(&pi_state->list, &curr->pi_state_list);
 	spin_unlock_irq(&curr->pi_lock);
 
-	/* Unqueue and drop the lock */
-	unqueue_me_pi(q);
-	if (fshared)
-		up_read(fshared);
 	/*
 	 * We own it, so we have to replace the pending owner
 	 * TID. This must be atomic as we have preserve the
 	 * owner died bit here.
 	 */
-	ret = get_user(uval, uaddr);
+	ret = get_futex_value_locked(&uval, uaddr);
+
 	while (!ret) {
 		newval = (uval & FUTEX_OWNER_DIED) | newtid;
 		newval |= (uval & FUTEX_WAITER_REQUEUED);
+
+		pagefault_disable();
 		curval = futex_atomic_cmpxchg_inatomic(uaddr,
 						       uval, newval);
+		pagefault_enable();
+
 		if (curval == -EFAULT)
 			ret = -EFAULT;
 		if (curval == uval)
 			break;
 		uval = curval;
@@ -1553,10 +1574,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 			 */
 			uaddr = q.pi_state->key.uaddr;
 
-			/* mmap_sem and hash_bucket lock are unlocked at
-			   return of this function */
-			ret = fixup_pi_state_owner(uaddr, fshared,
-						   &q, hb, curr);
+			ret = fixup_pi_state_owner(uaddr, &q, curr);
 		} else {
 			/*
 			 * Catch the rare case, where the lock was released
@@ -1567,12 +1585,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
 				ret = 0;
 		}
-		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q);
-		if (fshared)
-			up_read(fshared);
 	}
 
+	/* Unqueue and drop the lock */
+	unqueue_me_pi(&q);
+	if (fshared)
+		up_read(fshared);
+
 	debug_rt_mutex_free_waiter(&q.waiter);
 
 	return ret;
@@ -1688,7 +1707,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	struct futex_hash_bucket *hb;
 	u32 uval, newval, curval;
 	struct futex_q q;
-	int ret, lock_held, attempt = 0;
+	int ret, lock_taken, ownerdied = 0, attempt = 0;
 
 	if (refill_pi_state_cache())
 		return -ENOMEM;
@@ -1709,10 +1728,11 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
+retry_unlocked:
 	hb = queue_lock(&q, -1, NULL);
 
 retry_locked:
-	lock_held = 0;
+	ret = lock_taken = 0;
 
 	/*
 	 * To avoid races, we attempt to take the lock here again
@@ -1728,43 +1748,44 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	if (unlikely(curval == -EFAULT))
 		goto uaddr_faulted;
 
-	/* We own the lock already */
+	/*
+	 * Detect deadlocks. In case of REQUEUE_PI this is a valid
+	 * situation and we return success to user space.
+	 */
 	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
-		if (!detect && 0)
-			force_sig(SIGKILL, current);
-		/*
-		 * Normally, this check is done in user space.
-		 * In case of requeue, the owner may attempt to lock this futex,
-		 * even if the ownership has already been given by the previous
-		 * waker.
-		 * In the usual case, this is a case of deadlock, but not in case
-		 * of REQUEUE_PI.
-		 */
 		if (!(curval & FUTEX_WAITER_REQUEUED))
 			ret = -EDEADLK;
 		goto out_unlock_release_sem;
 	}
 
 	/*
-	 * Surprise - we got the lock. Just return
-	 * to userspace:
+	 * Surprise - we got the lock. Just return to userspace:
 	 */
 	if (unlikely(!curval))
 		goto out_unlock_release_sem;
 
 	uval = curval;
+
 	/*
-	 * In case of a requeue, check if there already is an owner
-	 * If not, just take the futex.
+	 * Set the WAITERS flag, so the owner will know it has someone
+	 * to wake at next unlock
 	 */
-	if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
-		/* set current as futex owner */
-		newval = curval | current->pid;
-		lock_held = 1;
-	} else
-		/* Set the WAITERS flag, so the owner will know it has someone
-		   to wake at next unlock */
-		newval = curval | FUTEX_WAITERS;
+	newval = curval | FUTEX_WAITERS;
+
+	/*
+	 * There are two cases, where a futex might have no owner (the
+	 * owner TID is 0): OWNER_DIED or REQUEUE. We take over the
+	 * futex in this case. We also do an unconditional take over,
+	 * when the owner of the futex died.
+	 *
+	 * This is safe as we are protected by the hash bucket lock !
+	 */
+	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+		/* Keep the OWNER_DIED and REQUEUE bits */
+		newval = (curval & ~FUTEX_TID_MASK) | current->pid;
+		ownerdied = 0;
+		lock_taken = 1;
+	}
 
 	pagefault_disable();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -1775,8 +1796,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	if (unlikely(curval != uval))
 		goto retry_locked;
 
-	if (lock_held) {
-		set_pi_futex_owner(hb, &q.key, curr);
+	/*
+	 * We took the lock due to requeue or owner died take over.
+	 */
+	if (unlikely(lock_taken)) {
+		/* For requeue we need to fixup the pi_futex */
+		if (curval & FUTEX_WAITER_REQUEUED)
+			set_pi_futex_owner(hb, &q.key, curr);
 		goto out_unlock_release_sem;
 	}
 
@@ -1787,34 +1813,40 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
 
 	if (unlikely(ret)) {
-		/*
-		 * There were no waiters and the owner task lookup
-		 * failed. When the OWNER_DIED bit is set, then we
-		 * know that this is a robust futex and we actually
-		 * take the lock. This is safe as we are protected by
-		 * the hash bucket lock. We also set the waiters bit
-		 * unconditionally here, to simplify glibc handling of
-		 * multiple tasks racing to acquire the lock and
-		 * cleanup the problems which were left by the dead
-		 * owner.
-		 */
-		if (curval & FUTEX_OWNER_DIED) {
-			uval = newval;
-			newval = current->pid |
-				FUTEX_OWNER_DIED | FUTEX_WAITERS;
+		switch (ret) {
 
-			pagefault_disable();
-			curval = futex_atomic_cmpxchg_inatomic(uaddr,
-							       uval, newval);
-			pagefault_enable();
+		case -EAGAIN:
+			/*
+			 * Task is exiting and we just wait for the
+			 * exit to complete.
+			 */
+			queue_unlock(&q, hb);
+			if (fshared)
+				up_read(fshared);
+			cond_resched();
+			goto retry;
 
-			if (unlikely(curval == -EFAULT))
+		case -ESRCH:
+			/*
+			 * No owner found for this futex. Check if the
+			 * OWNER_DIED bit is set to figure out whether
+			 * this is a robust futex or not.
+			 */
+			if (get_futex_value_locked(&curval, uaddr))
 				goto uaddr_faulted;
-			if (unlikely(curval != uval))
+
+			/*
+			 * We simply start over in case of a robust
+			 * futex. The code above will take the futex
+			 * and return happy.
+			 */
+			if (curval & FUTEX_OWNER_DIED) {
+				ownerdied = 1;
 				goto retry_locked;
-			ret = 0;
+			}
+		default:
+			goto out_unlock_release_sem;
 		}
-		goto out_unlock_release_sem;
 	}
 
 	/*
@@ -1845,31 +1877,42 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 		down_read(fshared);
 	spin_lock(q.lock_ptr);
 
-	/*
-	 * Got the lock. We might not be the anticipated owner if we
-	 * did a lock-steal - fix up the PI-state in that case.
-	 */
-	if (!ret && q.pi_state->owner != curr)
-		/* mmap_sem is unlocked at return of this function */
-		ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
-	else {
+	if (!ret) {
+		/*
+		 * Got the lock. We might not be the anticipated owner
+		 * if we did a lock-steal - fix up the PI-state in
+		 * that case:
+		 */
+		if (q.pi_state->owner != curr)
+			ret = fixup_pi_state_owner(uaddr, &q, curr);
+	} else {
 		/*
 		 * Catch the rare case, where the lock was released
-		 * when we were on the way back before we locked
-		 * the hash bucket.
+		 * when we were on the way back before we locked the
+		 * hash bucket.
 		 */
-		if (ret && q.pi_state->owner == curr) {
-			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
-				ret = 0;
+		if (q.pi_state->owner == curr &&
+		    rt_mutex_trylock(&q.pi_state->pi_mutex)) {
+			ret = 0;
+		} else {
+			/*
+			 * Paranoia check. If we did not take the lock
+			 * in the trylock above, then we should not be
+			 * the owner of the rtmutex, neither the real
+			 * nor the pending one:
+			 */
+			if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
+				printk(KERN_ERR "futex_lock_pi: ret = %d "
+				       "pi-mutex: %p pi-state %p\n", ret,
+				       q.pi_state->pi_mutex.owner,
+				       q.pi_state->owner);
 		}
-		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q);
-		if (fshared)
-			up_read(fshared);
 	}
 
-	if (!detect && ret == -EDEADLK && 0)
-		force_sig(SIGKILL, current);
+	/* Unqueue and drop the lock */
+	unqueue_me_pi(&q);
+	if (fshared)
+		up_read(fshared);
 
 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
@@ -1887,16 +1930,19 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	 * non-atomically. Therefore, if get_user below is not
 	 * enough, we need to handle the fault ourselves, while
 	 * still holding the mmap_sem.
+	 *
+	 * ... and hb->lock. :-) --ANK
 	 */
+	queue_unlock(&q, hb);
+
 	if (attempt++) {
 		ret = futex_handle_fault((unsigned long)uaddr, fshared,
 					 attempt);
 		if (ret)
-			goto out_unlock_release_sem;
-		goto retry_locked;
+			goto out_release_sem;
+		goto retry_unlocked;
 	}
 
-	queue_unlock(&q, hb);
 	if (fshared)
 		up_read(fshared);
 
@@ -1940,9 +1986,9 @@ retry:
 		goto out;
 
 	hb = hash_futex(&key);
+retry_unlocked:
 	spin_lock(&hb->lock);
 
-retry_locked:
 	/*
 	 * To avoid races, try to do the TID -> 0 atomic transition
 	 * again. If it succeeds then we can return without waking
@@ -2005,16 +2051,19 @@ pi_faulted:
 	 * non-atomically. Therefore, if get_user below is not
 	 * enough, we need to handle the fault ourselves, while
 	 * still holding the mmap_sem.
+	 *
+	 * ... and hb->lock. --ANK
 	 */
+	spin_unlock(&hb->lock);
+
 	if (attempt++) {
 		ret = futex_handle_fault((unsigned long)uaddr, fshared,
 					 attempt);
 		if (ret)
-			goto out_unlock;
-		goto retry_locked;
+			goto out;
+		goto retry_unlocked;
 	}
 
-	spin_unlock(&hb->lock);
 	if (fshared)
 		up_read(fshared);
 
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 338a9b489fbc..27478948b318 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -144,20 +144,21 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 	struct timespec ts;
 	ktime_t t, *tp = NULL;
 	int val2 = 0;
+	int cmd = op & FUTEX_CMD_MASK;
 
-	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
+	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
 		if (get_compat_timespec(&ts, utime))
 			return -EFAULT;
 		if (!timespec_valid(&ts))
 			return -EINVAL;
 
 		t = timespec_to_ktime(ts);
-		if (op == FUTEX_WAIT)
+		if (cmd == FUTEX_WAIT)
 			t = ktime_add(ktime_get(), t);
 		tp = &t;
 	}
-	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
-	    || op == FUTEX_CMP_REQUEUE_PI)
+	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
+	    || cmd == FUTEX_CMP_REQUEUE_PI)
 		val2 = (int) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index b0d81aae472f..bd9e272d55e9 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -135,6 +135,39 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
 	}
 }
 
+static inline int try_misrouted_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
+{
+	struct irqaction *action;
+
+	if (!irqfixup)
+		return 0;
+
+	/* We didn't actually handle the IRQ - see if it was misrouted? */
+	if (action_ret == IRQ_NONE)
+		return 1;
+
+	/*
+	 * But for 'irqfixup == 2' we also do it for handled interrupts if
+	 * they are marked as IRQF_IRQPOLL (or for irq zero, which is the
+	 * traditional PC timer interrupt.. Legacy)
+	 */
+	if (irqfixup < 2)
+		return 0;
+
+	if (!irq)
+		return 1;
+
+	/*
+	 * Since we don't get the descriptor lock, "action" can
+	 * change under us. We don't really care, but we don't
+	 * want to follow a NULL pointer. So tell the compiler to
+	 * just load it once by using a barrier.
+	 */
+	action = desc->action;
+	barrier();
+	return action && (action->flags & IRQF_IRQPOLL);
+}
+
 void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		    irqreturn_t action_ret)
 {
@@ -144,15 +177,10 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		report_bad_irq(irq, desc, action_ret);
 	}
 
-	if (unlikely(irqfixup)) {
-		/* Don't punish working computers */
-		if ((irqfixup == 2 && ((irq == 0) ||
-				(desc->action->flags & IRQF_IRQPOLL))) ||
-				action_ret == IRQ_NONE) {
-			int ok = misrouted_irq(irq);
-			if (action_ret == IRQ_NONE)
-				desc->irqs_unhandled -= ok;
-		}
+	if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
+		int ok = misrouted_irq(irq);
+		if (action_ret == IRQ_NONE)
+			desc->irqs_unhandled -= ok;
 	}
 
 	desc->irq_count++;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index f1bda23140b2..fed54418626c 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -257,7 +257,8 @@ const char *kallsyms_lookup(unsigned long addr,
 		pos = get_symbol_pos(addr, symbolsize, offset);
 		/* Grab name */
 		kallsyms_expand_symbol(get_symbol_offset(pos), namebuf);
-		*modname = NULL;
+		if (modname)
+			*modname = NULL;
 		return namebuf;
 	}
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index df8a8e8f6ca4..bbd51b81a3e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -70,7 +70,7 @@ static int kthread(void *_create)
 	data = create->data;
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
-	__set_current_state(TASK_INTERRUPTIBLE);
+	__set_current_state(TASK_UNINTERRUPTIBLE);
 	complete(&create->started);
 	schedule();
 
@@ -162,7 +162,10 @@ EXPORT_SYMBOL(kthread_create);
  */
 void kthread_bind(struct task_struct *k, unsigned int cpu)
 {
-	BUG_ON(k->state != TASK_INTERRUPTIBLE);
+	if (k->state != TASK_UNINTERRUPTIBLE) {
+		WARN_ON(1);
+		return;
+	}
 	/* Must have done schedule() in kthread() before we set_task_cpu */
 	wait_task_inactive(k);
 	set_task_cpu(k, cpu);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index b5f0543ed84d..f445b9cd60fb 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -416,7 +416,8 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
 
 	mutex_lock(&pm_mutex);
 	for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
-		if (!strncmp(buf, hibernation_modes[i], len)) {
+		if (len == strlen(hibernation_modes[i])
+		    && !strncmp(buf, hibernation_modes[i], len)) {
 			mode = i;
 			break;
 		}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 40d56a31245e..8812985f3029 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -97,25 +97,26 @@ static int suspend_prepare(suspend_state_t state)
 		}
 	}
 
-	if (pm_ops->prepare) {
-		if ((error = pm_ops->prepare(state)))
-			goto Thaw;
-	}
-
 	suspend_console();
 	error = device_suspend(PMSG_SUSPEND);
 	if (error) {
 		printk(KERN_ERR "Some devices failed to suspend\n");
-		goto Resume_devices;
+		goto Resume_console;
 	}
+	if (pm_ops->prepare) {
+		if ((error = pm_ops->prepare(state)))
+			goto Resume_devices;
+	}
+
 	error = disable_nonboot_cpus();
 	if (!error)
 		return 0;
 
 	enable_nonboot_cpus();
- Resume_devices:
 	pm_finish(state);
+ Resume_devices:
 	device_resume();
+ Resume_console:
 	resume_console();
  Thaw:
 	thaw_processes();
@@ -289,13 +290,13 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
 	len = p ? p - buf : n;
 
 	/* First, check if we are requested to hibernate */
-	if (!strncmp(buf, "disk", len)) {
+	if (len == 4 && !strncmp(buf, "disk", len)) {
 		error = hibernate();
 		return error ? error : n;
 	}
 
 	for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
-		if (*s && !strncmp(buf, *s, len))
+		if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
 			break;
 	}
 	if (state < PM_SUSPEND_MAX && *s)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 088419387388..e0233d8422b9 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -31,16 +31,36 @@ static inline int freezeable(struct task_struct * p)
 	return 1;
 }
 
+/*
+ * freezing is complete, mark current process as frozen
+ */
+static inline void frozen_process(void)
+{
+	if (!unlikely(current->flags & PF_NOFREEZE)) {
+		current->flags |= PF_FROZEN;
+		wmb();
+	}
+	clear_tsk_thread_flag(current, TIF_FREEZE);
+}
+
 /* Refrigerator is place where frozen processes are stored :-). */
 void refrigerator(void)
 {
 	/* Hmm, should we be allowed to suspend when there are realtime
 	   processes around? */
 	long save;
+
+	task_lock(current);
+	if (freezing(current)) {
+		frozen_process();
+		task_unlock(current);
+	} else {
+		task_unlock(current);
+		return;
+	}
 	save = current->state;
 	pr_debug("%s entered refrigerator\n", current->comm);
 
-	frozen_process(current);
 	spin_lock_irq(&current->sighand->siglock);
 	recalc_sigpending(); /* We sent fake signal, clean it up */
 	spin_unlock_irq(&current->sighand->siglock);
@@ -81,7 +101,7 @@ static void cancel_freezing(struct task_struct *p)
 		pr_debug(" clean up: %s\n", p->comm);
 		do_not_freeze(p);
 		spin_lock_irqsave(&p->sighand->siglock, flags);
-		recalc_sigpending_tsk(p);
+		recalc_sigpending_and_wake(p);
 		spin_unlock_irqrestore(&p->sighand->siglock, flags);
 	}
 }
@@ -112,22 +132,12 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space)
 				cancel_freezing(p);
 				continue;
 			}
-			if (is_user_space(p)) {
-				if (!freeze_user_space)
-					continue;
-
-				/* Freeze the task unless there is a vfork
-				 * completion pending
-				 */
-				if (!p->vfork_done)
-					freeze_process(p);
-			} else {
-				if (freeze_user_space)
-					continue;
-
-				freeze_process(p);
-			}
-			todo++;
+			if (freeze_user_space && !is_user_space(p))
+				continue;
+
+			freeze_process(p);
+			if (!freezer_should_skip(p))
+				todo++;
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
 		yield();	/* Yield is okay here */
@@ -149,13 +159,16 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space)
 				TIMEOUT / HZ, todo);
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
-			if (is_user_space(p) == !freeze_user_space)
+			if (freeze_user_space && !is_user_space(p))
 				continue;
 
-			if (freezeable(p) && !frozen(p))
+			task_lock(p);
+			if (freezeable(p) && !frozen(p) &&
+			    !freezer_should_skip(p))
 				printk(KERN_ERR " %s\n", p->comm);
 
 			cancel_freezing(p);
+			task_unlock(p);
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
 	}
@@ -200,9 +213,7 @@ static void thaw_tasks(int thaw_user_space)
 		if (is_user_space(p) == !thaw_user_space)
 			continue;
 
-		if (!thaw_process(p))
-			printk(KERN_WARNING " Strange, %s not stopped\n",
-				p->comm );
+		thaw_process(p);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
 }
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b8b235cc19d1..8b1a1b837145 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -584,7 +584,7 @@ int swsusp_check(void)
 	resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
 	if (!IS_ERR(resume_bdev)) {
 		set_blocksize(resume_bdev, PAGE_SIZE);
-		memset(swsusp_header, 0, sizeof(PAGE_SIZE));
+		memset(swsusp_header, 0, PAGE_SIZE);
 		error = bio_read_page(swsusp_resume_block,
 					swsusp_header, NULL);
 		if (error)
diff --git a/kernel/profile.c b/kernel/profile.c
index cc91b9bf759d..5b20fe977bed 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -26,6 +26,7 @@
 #include <asm/sections.h>
 #include <asm/semaphore.h>
 #include <asm/irq_regs.h>
+#include <asm/ptrace.h>
 
 struct profile_hit {
 	u32 pc, hits;
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 12879f6c1ec3..a6fbb4130521 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -189,6 +189,19 @@ int rt_mutex_adjust_prio_chain(struct task_struct *task,
 	if (!waiter || !waiter->task)
 		goto out_unlock_pi;
 
+	/*
+	 * Check the orig_waiter state. After we dropped the locks,
+	 * the previous owner of the lock might have released the lock
+	 * and made us the pending owner:
+	 */
+	if (orig_waiter && !orig_waiter->task)
+		goto out_unlock_pi;
+
+	/*
+	 * Drop out, when the task has no waiters. Note,
+	 * top_waiter can be NULL, when we are in the deboosting
+	 * mode!
+	 */
 	if (top_waiter && (!task_has_pi_waiters(task) ||
 			   top_waiter != task_top_pi_waiter(task)))
 		goto out_unlock_pi;
@@ -636,9 +649,16 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 			 * all over without going into schedule to try
 			 * to get the lock now:
 			 */
-			if (unlikely(!waiter.task))
+			if (unlikely(!waiter.task)) {
+				/*
+				 * Reset the return value. We might
+				 * have returned with -EDEADLK and the
+				 * owner released the lock while we
+				 * were walking the pi chain.
+				 */
+				ret = 0;
 				continue;
-
+			}
 			if (unlikely(ret))
 				break;
 		}
diff --git a/kernel/sched.c b/kernel/sched.c
index 799d23b4e35d..13cdab3b4c48 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4775,9 +4775,7 @@ int __sched cond_resched_softirq(void)
 	BUG_ON(!in_softirq());
 
 	if (need_resched() && system_state == SYSTEM_RUNNING) {
-		raw_local_irq_disable();
-		_local_bh_enable();
-		raw_local_irq_enable();
+		local_bh_enable();
 		__cond_resched();
 		local_bh_disable();
 		return 1;
diff --git a/kernel/signal.c b/kernel/signal.c
index 364fc95bf97c..fe590e00db8d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -96,20 +96,38 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
 
 #define PENDING(p,b) has_pending_signals(&(p)->signal, (b))
 
-fastcall void recalc_sigpending_tsk(struct task_struct *t)
+static int recalc_sigpending_tsk(struct task_struct *t)
 {
 	if (t->signal->group_stop_count > 0 ||
 	    (freezing(t)) ||
 	    PENDING(&t->pending, &t->blocked) ||
-	    PENDING(&t->signal->shared_pending, &t->blocked))
+	    PENDING(&t->signal->shared_pending, &t->blocked)) {
 		set_tsk_thread_flag(t, TIF_SIGPENDING);
-	else
-		clear_tsk_thread_flag(t, TIF_SIGPENDING);
+		return 1;
+	}
+	/*
+	 * We must never clear the flag in another thread, or in current
+	 * when it's possible the current syscall is returning -ERESTART*.
+	 * So we don't clear it here, and only callers who know they should do.
+	 */
+	return 0;
+}
+
+/*
+ * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up.
+ * This is superfluous when called on current, the wakeup is a harmless no-op.
+ */
+void recalc_sigpending_and_wake(struct task_struct *t)
+{
+	if (recalc_sigpending_tsk(t))
+		signal_wake_up(t, 0);
 }
 
 void recalc_sigpending(void)
 {
-	recalc_sigpending_tsk(current);
+	if (!recalc_sigpending_tsk(current))
+		clear_thread_flag(TIF_SIGPENDING);
+
 }
 
 /* Given the mask, find the first available signal that should be serviced. */
@@ -373,7 +391,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 			}
 		}
 	}
-	recalc_sigpending_tsk(tsk);
+	if (likely(tsk == current))
+		recalc_sigpending();
 	if (signr && unlikely(sig_kernel_stop(signr))) {
 		/*
 		 * Set a marker that we have dequeued a stop signal. Our
@@ -744,7 +763,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 		action->sa.sa_handler = SIG_DFL;
 		if (blocked) {
 			sigdelset(&t->blocked, sig);
-			recalc_sigpending_tsk(t);
+			recalc_sigpending_and_wake(t);
 		}
 	}
 	ret = specific_send_sig_info(sig, info, t);
@@ -1568,8 +1587,9 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
 	/*
 	 * Queued signals ignored us while we were stopped for tracing.
 	 * So check for any that we should take before resuming user mode.
+	 * This sets TIF_SIGPENDING, but never clears it.
 	 */
-	recalc_sigpending();
+	recalc_sigpending_tsk(current);
 }
 
 void ptrace_notify(int exit_code)
@@ -2273,7 +2293,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 		rm_from_queue_full(&mask, &t->signal->shared_pending);
 		do {
 			rm_from_queue_full(&mask, &t->pending);
-			recalc_sigpending_tsk(t);
+			recalc_sigpending_and_wake(t);
 			t = next_thread(t);
 		} while (t != current);
 	}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4073353abd4f..30ee462ee79f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -227,7 +227,7 @@ static ctl_table kern_table[] = {
 		.ctl_name	= KERN_CORE_PATTERN,
 		.procname	= "core_pattern",
 		.data		= core_pattern,
-		.maxlen		= 128,
+		.maxlen		= CORENAME_MAX_SIZE,
 		.mode		= 0644,
 		.proc_handler	= &proc_dostring,
 		.strategy	= &sysctl_string,
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 3db5c3c460d7..51b6a6a6158c 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -74,7 +74,7 @@ static struct clocksource *watchdog;
 static struct timer_list watchdog_timer;
 static DEFINE_SPINLOCK(watchdog_lock);
 static cycle_t watchdog_last;
-static int watchdog_resumed;
+static unsigned long watchdog_resumed;
 
 /*
  * Interval: 0.5sec Threshold: 0.0625s
@@ -104,9 +104,7 @@ static void clocksource_watchdog(unsigned long data)
 
 	spin_lock(&watchdog_lock);
 
-	resumed = watchdog_resumed;
-	if (unlikely(resumed))
-		watchdog_resumed = 0;
+	resumed = test_and_clear_bit(0, &watchdog_resumed);
 
 	wdnow = watchdog->read();
 	wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
@@ -151,9 +149,7 @@ static void clocksource_watchdog(unsigned long data)
 }
 static void clocksource_resume_watchdog(void)
 {
-	spin_lock(&watchdog_lock);
-	watchdog_resumed = 1;
-	spin_unlock(&watchdog_lock);
+	set_bit(0, &watchdog_resumed);
 }
 
 static void clocksource_check_watchdog(struct clocksource *cs)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index cb25649c6f50..87aa5ff931e0 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -11,6 +11,8 @@
 #include <linux/mm.h>
 #include <linux/time.h>
 #include <linux/timex.h>
+#include <linux/jiffies.h>
+#include <linux/hrtimer.h>
 
 #include <asm/div64.h>
 #include <asm/timex.h>
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index eadfce2fff74..8001d37071f5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -243,11 +243,18 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu) | |||
243 | { | 243 | { |
244 | int cpu = get_cpu(); | 244 | int cpu = get_cpu(); |
245 | 245 | ||
246 | if (cpu == *oncpu) | 246 | if (!cpu_isset(*oncpu, cpu_online_map)) { |
247 | tick_do_broadcast_on_off(&reason); | 247 | printk(KERN_ERR "tick-braodcast: ignoring broadcast for " |
248 | else | 248 | "offline CPU #%d\n", *oncpu); |
249 | smp_call_function_single(*oncpu, tick_do_broadcast_on_off, | 249 | } else { |
250 | &reason, 1, 1); | 250 | |
251 | if (cpu == *oncpu) | ||
252 | tick_do_broadcast_on_off(&reason); | ||
253 | else | ||
254 | smp_call_function_single(*oncpu, | ||
255 | tick_do_broadcast_on_off, | ||
256 | &reason, 1, 1); | ||
257 | } | ||
251 | put_cpu(); | 258 | put_cpu(); |
252 | } | 259 | } |
253 | 260 | ||
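The tick-broadcast.c hunk adds a guard before the cross-CPU call: asking smp_call_function_single() to run on a CPU that is not in cpu_online_map is invalid, so the request is now logged and dropped. A condensed sketch of that guard, using a hypothetical helper name and the five-argument smp_call_function_single() of this tree:

/* run fn(arg) on 'cpu', but only if that CPU is actually online */
static void run_on_cpu(int cpu, void (*fn)(void *), void *arg)
{
        int this_cpu = get_cpu();

        if (!cpu_isset(cpu, cpu_online_map))
                printk(KERN_ERR "ignoring request for offline CPU #%d\n", cpu);
        else if (cpu == this_cpu)
                fn(arg);                                /* run locally */
        else
                smp_call_function_single(cpu, fn, arg, 1, 1);   /* remote, wait */

        put_cpu();
}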
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3483e6cb9549..52db9e3c526e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -167,9 +167,15 @@ void tick_nohz_stop_sched_tick(void) | |||
167 | goto end; | 167 | goto end; |
168 | 168 | ||
169 | cpu = smp_processor_id(); | 169 | cpu = smp_processor_id(); |
170 | if (unlikely(local_softirq_pending())) | 170 | if (unlikely(local_softirq_pending())) { |
171 | printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", | 171 | static int ratelimit; |
172 | local_softirq_pending()); | 172 | |
173 | if (ratelimit < 10) { | ||
174 | printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", | ||
175 | local_softirq_pending()); | ||
176 | ratelimit++; | ||
177 | } | ||
178 | } | ||
173 | 179 | ||
174 | now = ktime_get(); | 180 | now = ktime_get(); |
175 | /* | 181 | /* |
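The first tick-sched.c hunk rate-limits the softirq warning with a function-local static counter, so a CPU that repeatedly enters the idle path with softirqs pending can emit at most ten messages instead of flooding the log. The same pattern in isolation (illustrative name):

static void warn_softirq_pending(unsigned int pending)
{
        static int ratelimit;           /* per call site, not per CPU */

        if (ratelimit < 10) {
                printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", pending);
                ratelimit++;
        }
}

The counter is not atomic; a racing increment can at worst let a couple of extra messages through, which is acceptable for a diagnostic.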
@@ -241,6 +247,21 @@ void tick_nohz_stop_sched_tick(void) | |||
241 | if (cpu == tick_do_timer_cpu) | 247 | if (cpu == tick_do_timer_cpu) |
242 | tick_do_timer_cpu = -1; | 248 | tick_do_timer_cpu = -1; |
243 | 249 | ||
250 | ts->idle_sleeps++; | ||
251 | |||
252 | /* | ||
253 | * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that | ||
254 | * there is no timer pending or at least extremely far | ||
255 | * into the future (12 days for HZ=1000). In this case | ||
256 | * we simply stop the tick timer: | ||
257 | */ | ||
258 | if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { | ||
259 | ts->idle_expires.tv64 = KTIME_MAX; | ||
260 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) | ||
261 | hrtimer_cancel(&ts->sched_timer); | ||
262 | goto out; | ||
263 | } | ||
264 | |||
244 | /* | 265 | /* |
245 | * calculate the expiry time for the next timer wheel | 266 | * calculate the expiry time for the next timer wheel |
246 | * timer | 267 | * timer |
@@ -248,7 +269,6 @@ void tick_nohz_stop_sched_tick(void) | |||
248 | expires = ktime_add_ns(last_update, tick_period.tv64 * | 269 | expires = ktime_add_ns(last_update, tick_period.tv64 * |
249 | delta_jiffies); | 270 | delta_jiffies); |
250 | ts->idle_expires = expires; | 271 | ts->idle_expires = expires; |
251 | ts->idle_sleeps++; | ||
252 | 272 | ||
253 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { | 273 | if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { |
254 | hrtimer_start(&ts->sched_timer, expires, | 274 | hrtimer_start(&ts->sched_timer, expires, |
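Taken together, the remaining tick-sched.c hunks count ts->idle_sleeps before any early exit and treat a delta of NEXT_TIMER_MAX_DELTA jiffies or more as "no timer pending": rather than programming the sched timer roughly 12 days out (at HZ=1000), the tick is cancelled outright. A condensed sketch of that decision, assuming delta_jiffies and last_update are computed as in the original function:

        ts->idle_sleeps++;

        if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) {
                /* nothing pending: switch the tick off completely */
                ts->idle_expires.tv64 = KTIME_MAX;
                if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
                        hrtimer_cancel(&ts->sched_timer);
        } else {
                /* re-arm the tick for the next pending timer wheel event */
                ktime_t expires = ktime_add_ns(last_update,
                                               tick_period.tv64 * delta_jiffies);

                ts->idle_expires = expires;
                if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
                        hrtimer_start(&ts->sched_timer, expires,
                                      HRTIMER_MODE_ABS);
        }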
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f9217bf644f6..3d1042f82a68 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -273,6 +273,8 @@ static int timekeeping_resume(struct sys_device *dev) | |||
273 | unsigned long flags; | 273 | unsigned long flags; |
274 | unsigned long now = read_persistent_clock(); | 274 | unsigned long now = read_persistent_clock(); |
275 | 275 | ||
276 | clocksource_resume(); | ||
277 | |||
276 | write_seqlock_irqsave(&xtime_lock, flags); | 278 | write_seqlock_irqsave(&xtime_lock, flags); |
277 | 279 | ||
278 | if (now && (now > timekeeping_suspend_time)) { | 280 | if (now && (now > timekeeping_suspend_time)) { |
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 868f1bceb07f..321693724ad7 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -117,21 +117,6 @@ static struct entry entries[MAX_ENTRIES]; | |||
117 | 117 | ||
118 | static atomic_t overflow_count; | 118 | static atomic_t overflow_count; |
119 | 119 | ||
120 | static void reset_entries(void) | ||
121 | { | ||
122 | nr_entries = 0; | ||
123 | memset(entries, 0, sizeof(entries)); | ||
124 | atomic_set(&overflow_count, 0); | ||
125 | } | ||
126 | |||
127 | static struct entry *alloc_entry(void) | ||
128 | { | ||
129 | if (nr_entries >= MAX_ENTRIES) | ||
130 | return NULL; | ||
131 | |||
132 | return entries + nr_entries++; | ||
133 | } | ||
134 | |||
135 | /* | 120 | /* |
136 | * The entries are in a hash-table, for fast lookup: | 121 | * The entries are in a hash-table, for fast lookup: |
137 | */ | 122 | */ |
@@ -149,6 +134,22 @@ static struct entry *alloc_entry(void) | |||
149 | 134 | ||
150 | static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; | 135 | static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; |
151 | 136 | ||
137 | static void reset_entries(void) | ||
138 | { | ||
139 | nr_entries = 0; | ||
140 | memset(entries, 0, sizeof(entries)); | ||
141 | memset(tstat_hash_table, 0, sizeof(tstat_hash_table)); | ||
142 | atomic_set(&overflow_count, 0); | ||
143 | } | ||
144 | |||
145 | static struct entry *alloc_entry(void) | ||
146 | { | ||
147 | if (nr_entries >= MAX_ENTRIES) | ||
148 | return NULL; | ||
149 | |||
150 | return entries + nr_entries++; | ||
151 | } | ||
152 | |||
152 | static int match_entries(struct entry *entry1, struct entry *entry2) | 153 | static int match_entries(struct entry *entry1, struct entry *entry2) |
153 | { | 154 | { |
154 | return entry1->timer == entry2->timer && | 155 | return entry1->timer == entry2->timer && |
@@ -202,12 +203,15 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) | |||
202 | if (curr) { | 203 | if (curr) { |
203 | *curr = *entry; | 204 | *curr = *entry; |
204 | curr->count = 0; | 205 | curr->count = 0; |
206 | curr->next = NULL; | ||
205 | memcpy(curr->comm, comm, TASK_COMM_LEN); | 207 | memcpy(curr->comm, comm, TASK_COMM_LEN); |
208 | |||
209 | smp_mb(); /* Ensure that curr is initialized before insert */ | ||
210 | |||
206 | if (prev) | 211 | if (prev) |
207 | prev->next = curr; | 212 | prev->next = curr; |
208 | else | 213 | else |
209 | *head = curr; | 214 | *head = curr; |
210 | curr->next = NULL; | ||
211 | } | 215 | } |
212 | out_unlock: | 216 | out_unlock: |
213 | spin_unlock(&table_lock); | 217 | spin_unlock(&table_lock); |
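In tstat_lookup() the new entry is now fully initialised, including curr->next = NULL, before an smp_mb(), and only afterwards linked into the hash chain; the old code set curr->next after the entry was already reachable. The general publish pattern, sketched with a hypothetical node type:

struct node {
        void *key;
        struct node *next;
};

/* writer side, called with the table lock held */
static void publish(struct node **link, struct node *node, void *key)
{
        node->key  = key;
        node->next = NULL;

        smp_mb();       /* finish the node before it becomes reachable */

        *link = node;   /* link is &head or &prev->next */
}

A lock-less reader walking ->next can then never observe a half-initialised entry (strictly it would also need a matching read-side ordering; the hunk only adds the writer side).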
@@ -232,10 +236,15 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, | |||
232 | /* | 236 | /* |
233 | * It doesn't matter which lock we take: | 237 | * It doesn't matter which lock we take: |
234 | */ | 238 | */ |
235 | spinlock_t *lock = &per_cpu(lookup_lock, raw_smp_processor_id()); | 239 | spinlock_t *lock; |
236 | struct entry *entry, input; | 240 | struct entry *entry, input; |
237 | unsigned long flags; | 241 | unsigned long flags; |
238 | 242 | ||
243 | if (likely(!active)) | ||
244 | return; | ||
245 | |||
246 | lock = &per_cpu(lookup_lock, raw_smp_processor_id()); | ||
247 | |||
239 | input.timer = timer; | 248 | input.timer = timer; |
240 | input.start_func = startf; | 249 | input.start_func = startf; |
241 | input.expire_func = timerf; | 250 | input.expire_func = timerf; |
@@ -360,6 +369,7 @@ static ssize_t tstats_write(struct file *file, const char __user *buf, | |||
360 | if (!active) { | 369 | if (!active) { |
361 | reset_entries(); | 370 | reset_entries(); |
362 | time_start = ktime_get(); | 371 | time_start = ktime_get(); |
372 | smp_mb(); | ||
363 | active = 1; | 373 | active = 1; |
364 | } | 374 | } |
365 | break; | 375 | break; |
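Two related timer_stats.c changes: timer_stats_update_stats() now returns before touching the per-CPU lookup_lock when collection is disabled, and tstats_write() issues smp_mb() after resetting the tables but before setting active = 1, so a CPU that sees the flag set also sees the freshly reset state. A sketch of the flag/barrier pairing with illustrative names:

static int collector_active;

/* slow path: (re)start collection, as in tstats_write() */
static void collector_start(void)
{
        reset_tables();                 /* hypothetical, like reset_entries() */
        smp_mb();                       /* reset must be visible before the flag */
        collector_active = 1;
}

/* hot path: cheap early exit keeps disabled collection nearly free */
static void collector_update(void *sample)
{
        if (likely(!collector_active))
                return;

        /* ... take the per-CPU lock and account the sample ... */
}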
diff --git a/kernel/timer.c b/kernel/timer.c index a6c580ac084b..1a69705c2fb9 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -666,7 +666,7 @@ static inline void __run_timers(tvec_base_t *base) | |||
666 | static unsigned long __next_timer_interrupt(tvec_base_t *base) | 666 | static unsigned long __next_timer_interrupt(tvec_base_t *base) |
667 | { | 667 | { |
668 | unsigned long timer_jiffies = base->timer_jiffies; | 668 | unsigned long timer_jiffies = base->timer_jiffies; |
669 | unsigned long expires = timer_jiffies + (LONG_MAX >> 1); | 669 | unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; |
670 | int index, slot, array, found = 0; | 670 | int index, slot, array, found = 0; |
671 | struct timer_list *nte; | 671 | struct timer_list *nte; |
672 | tvec_t *varray[4]; | 672 | tvec_t *varray[4]; |
@@ -752,6 +752,14 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, | |||
752 | 752 | ||
753 | tsdelta = ktime_to_timespec(hr_delta); | 753 | tsdelta = ktime_to_timespec(hr_delta); |
754 | delta = timespec_to_jiffies(&tsdelta); | 754 | delta = timespec_to_jiffies(&tsdelta); |
755 | |||
756 | /* | ||
757 | * Limit the delta to the max value, which is checked in | ||
758 | * tick_nohz_stop_sched_tick(): | ||
759 | */ | ||
760 | if (delta > NEXT_TIMER_MAX_DELTA) | ||
761 | delta = NEXT_TIMER_MAX_DELTA; | ||
762 | |||
755 | /* | 763 | /* |
756 | * Take rounding errors into account and make sure that it | 764 | * Take rounding errors into account and make sure that it |
757 | * expires in the next tick. Otherwise we go into an endless | 765 | * expires in the next tick. Otherwise we go into an endless |
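The timer.c hunks make NEXT_TIMER_MAX_DELTA the sentinel shared with the NOHZ code: __next_timer_interrupt() starts from it when nothing is queued, and cmp_next_hrtimer_event() clamps the hrtimer-derived delta to it, so the check in tick_nohz_stop_sched_tick() shown earlier always fires for the "nothing pending" case. The producer-side clamp in isolation:

        /* never report an event further out than the NOHZ code tests for */
        if (delta > NEXT_TIMER_MAX_DELTA)
                delta = NEXT_TIMER_MAX_DELTA;

At the time of this series NEXT_TIMER_MAX_DELTA lives in include/linux/timer.h (as ((1UL << 30) - 1), if memory serves), which at HZ=1000 is the "12 days" the tick-sched.c comment refers to.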
@@ -1499,8 +1507,6 @@ unregister_time_interpolator(struct time_interpolator *ti) | |||
1499 | prev = &curr->next; | 1507 | prev = &curr->next; |
1500 | } | 1508 | } |
1501 | 1509 | ||
1502 | clocksource_resume(); | ||
1503 | |||
1504 | write_seqlock_irqsave(&xtime_lock, flags); | 1510 | write_seqlock_irqsave(&xtime_lock, flags); |
1505 | if (ti == time_interpolator) { | 1511 | if (ti == time_interpolator) { |
1506 | /* we lost the best time-interpolator: */ | 1512 | /* we lost the best time-interpolator: */ |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fb56fedd5c02..3bebf73be976 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -47,7 +47,6 @@ struct cpu_workqueue_struct { | |||
47 | 47 | ||
48 | struct workqueue_struct *wq; | 48 | struct workqueue_struct *wq; |
49 | struct task_struct *thread; | 49 | struct task_struct *thread; |
50 | int should_stop; | ||
51 | 50 | ||
52 | int run_depth; /* Detect run_workqueue() recursion depth */ | 51 | int run_depth; /* Detect run_workqueue() recursion depth */ |
53 | } ____cacheline_aligned; | 52 | } ____cacheline_aligned; |
@@ -71,7 +70,13 @@ static LIST_HEAD(workqueues); | |||
71 | 70 | ||
72 | static int singlethread_cpu __read_mostly; | 71 | static int singlethread_cpu __read_mostly; |
73 | static cpumask_t cpu_singlethread_map __read_mostly; | 72 | static cpumask_t cpu_singlethread_map __read_mostly; |
74 | /* optimization, we could use cpu_possible_map */ | 73 | /* |
74 | * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD | ||
75 | * flushes cwq->worklist. This means that flush_workqueue/wait_on_work | ||
76 | * which comes in between can't use for_each_online_cpu(). We could | ||
77 | * use cpu_possible_map; the cpumask below is more documentation | ||
78 | * than optimization. | ||
79 | */ | ||
75 | static cpumask_t cpu_populated_map __read_mostly; | 80 | static cpumask_t cpu_populated_map __read_mostly; |
76 | 81 | ||
77 | /* If it's single threaded, it isn't in the list of workqueues. */ | 82 | /* If it's single threaded, it isn't in the list of workqueues. */ |
@@ -272,24 +277,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) | |||
272 | spin_unlock_irq(&cwq->lock); | 277 | spin_unlock_irq(&cwq->lock); |
273 | } | 278 | } |
274 | 279 | ||
275 | /* | ||
276 | * NOTE: the caller must not touch *cwq if this func returns true | ||
277 | */ | ||
278 | static int cwq_should_stop(struct cpu_workqueue_struct *cwq) | ||
279 | { | ||
280 | int should_stop = cwq->should_stop; | ||
281 | |||
282 | if (unlikely(should_stop)) { | ||
283 | spin_lock_irq(&cwq->lock); | ||
284 | should_stop = cwq->should_stop && list_empty(&cwq->worklist); | ||
285 | if (should_stop) | ||
286 | cwq->thread = NULL; | ||
287 | spin_unlock_irq(&cwq->lock); | ||
288 | } | ||
289 | |||
290 | return should_stop; | ||
291 | } | ||
292 | |||
293 | static int worker_thread(void *__cwq) | 280 | static int worker_thread(void *__cwq) |
294 | { | 281 | { |
295 | struct cpu_workqueue_struct *cwq = __cwq; | 282 | struct cpu_workqueue_struct *cwq = __cwq; |
@@ -302,14 +289,15 @@ static int worker_thread(void *__cwq) | |||
302 | 289 | ||
303 | for (;;) { | 290 | for (;;) { |
304 | prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); | 291 | prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); |
305 | if (!freezing(current) && !cwq->should_stop | 292 | if (!freezing(current) && |
306 | && list_empty(&cwq->worklist)) | 293 | !kthread_should_stop() && |
294 | list_empty(&cwq->worklist)) | ||
307 | schedule(); | 295 | schedule(); |
308 | finish_wait(&cwq->more_work, &wait); | 296 | finish_wait(&cwq->more_work, &wait); |
309 | 297 | ||
310 | try_to_freeze(); | 298 | try_to_freeze(); |
311 | 299 | ||
312 | if (cwq_should_stop(cwq)) | 300 | if (kthread_should_stop()) |
313 | break; | 301 | break; |
314 | 302 | ||
315 | run_workqueue(cwq); | 303 | run_workqueue(cwq); |
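With cwq->should_stop gone, worker_thread() leans entirely on the kthread infrastructure: it sleeps unless it is being frozen, asked to stop, or has work queued, and the stop request arrives via kthread_stop() from cleanup_workqueue_thread() below. The canonical shape of such a loop, as a generic sketch (my_ctx and process_work() are hypothetical, not the workqueue code itself):

#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/wait.h>

struct my_ctx {
        wait_queue_head_t more_work;
        struct list_head  worklist;
};

void process_work(struct my_ctx *ctx);  /* hypothetical: drains ctx->worklist */

static int worker_fn(void *data)
{
        struct my_ctx *ctx = data;

        while (!kthread_should_stop()) {
                /* sleep until stopped or until work shows up */
                wait_event_interruptible(ctx->more_work,
                                         kthread_should_stop() ||
                                         !list_empty(&ctx->worklist));
                if (kthread_should_stop())
                        break;

                process_work(ctx);
        }
        return 0;
}

/*
 * Lifecycle:
 *      task = kthread_run(worker_fn, ctx, "my-worker");
 *      ...
 *      kthread_stop(task);     sets the stop flag, wakes the thread, waits
 */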
@@ -340,18 +328,21 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, | |||
340 | insert_work(cwq, &barr->work, tail); | 328 | insert_work(cwq, &barr->work, tail); |
341 | } | 329 | } |
342 | 330 | ||
343 | static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) | 331 | static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) |
344 | { | 332 | { |
333 | int active; | ||
334 | |||
345 | if (cwq->thread == current) { | 335 | if (cwq->thread == current) { |
346 | /* | 336 | /* |
347 | * Probably keventd trying to flush its own queue. So simply run | 337 | * Probably keventd trying to flush its own queue. So simply run |
348 | * it by hand rather than deadlocking. | 338 | * it by hand rather than deadlocking. |
349 | */ | 339 | */ |
350 | run_workqueue(cwq); | 340 | run_workqueue(cwq); |
341 | active = 1; | ||
351 | } else { | 342 | } else { |
352 | struct wq_barrier barr; | 343 | struct wq_barrier barr; |
353 | int active = 0; | ||
354 | 344 | ||
345 | active = 0; | ||
355 | spin_lock_irq(&cwq->lock); | 346 | spin_lock_irq(&cwq->lock); |
356 | if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { | 347 | if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { |
357 | insert_wq_barrier(cwq, &barr, 1); | 348 | insert_wq_barrier(cwq, &barr, 1); |
@@ -362,6 +353,8 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) | |||
362 | if (active) | 353 | if (active) |
363 | wait_for_completion(&barr.done); | 354 | wait_for_completion(&barr.done); |
364 | } | 355 | } |
356 | |||
357 | return active; | ||
365 | } | 358 | } |
366 | 359 | ||
367 | /** | 360 | /** |
@@ -674,7 +667,6 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | |||
674 | return PTR_ERR(p); | 667 | return PTR_ERR(p); |
675 | 668 | ||
676 | cwq->thread = p; | 669 | cwq->thread = p; |
677 | cwq->should_stop = 0; | ||
678 | 670 | ||
679 | return 0; | 671 | return 0; |
680 | } | 672 | } |
@@ -740,29 +732,27 @@ EXPORT_SYMBOL_GPL(__create_workqueue); | |||
740 | 732 | ||
741 | static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | 733 | static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) |
742 | { | 734 | { |
743 | struct wq_barrier barr; | 735 | /* |
744 | int alive = 0; | 736 | * Our caller is either destroy_workqueue() or CPU_DEAD, |
745 | 737 | * workqueue_mutex protects cwq->thread | |
746 | spin_lock_irq(&cwq->lock); | 738 | */ |
747 | if (cwq->thread != NULL) { | 739 | if (cwq->thread == NULL) |
748 | insert_wq_barrier(cwq, &barr, 1); | 740 | return; |
749 | cwq->should_stop = 1; | ||
750 | alive = 1; | ||
751 | } | ||
752 | spin_unlock_irq(&cwq->lock); | ||
753 | 741 | ||
754 | if (alive) { | 742 | /* |
755 | wait_for_completion(&barr.done); | 743 | * If the caller is CPU_DEAD the single flush_cpu_workqueue() |
744 | * is not enough, a concurrent flush_workqueue() can insert a | ||
745 | * barrier after us. | ||
746 | * When ->worklist becomes empty it is safe to exit because no | ||
747 | * more work_structs can be queued on this cwq: flush_workqueue | ||
748 | * checks list_empty(), and a "normal" queue_work() can't use | ||
749 | * a dead CPU. | ||
750 | */ | ||
751 | while (flush_cpu_workqueue(cwq)) | ||
752 | ; | ||
756 | 753 | ||
757 | while (unlikely(cwq->thread != NULL)) | 754 | kthread_stop(cwq->thread); |
758 | cpu_relax(); | 755 | cwq->thread = NULL; |
759 | /* | ||
760 | * Wait until cwq->thread unlocks cwq->lock, | ||
761 | * it won't touch *cwq after that. | ||
762 | */ | ||
763 | smp_rmb(); | ||
764 | spin_unlock_wait(&cwq->lock); | ||
765 | } | ||
766 | } | 756 | } |
767 | 757 | ||
768 | /** | 758 | /** |
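The cleanup_workqueue_thread() rewrite above boils worker shutdown down to two steps: flush repeatedly until flush_cpu_workqueue() reports the queue was already idle (a flush racing in from another CPU may keep inserting barriers, hence the loop), then kthread_stop() the idle worker. The same shutdown shape in miniature, assuming a hypothetical flush_once() that returns non-zero whenever it actually had to wait for work:

static void shutdown_worker(struct my_queue *q) /* my_queue is illustrative */
{
        if (q->thread == NULL)
                return;

        /*
         * Drain until a flush finds nothing to do; only then is the
         * queue guaranteed empty, because no new work can be queued
         * on a queue that is being destroyed (or on a dead CPU).
         */
        while (flush_once(q))
                ;

        kthread_stop(q->thread);        /* sets the stop flag, wakes, waits */
        q->thread = NULL;
}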