Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt          4
-rw-r--r--  kernel/configs.c               15
-rw-r--r--  kernel/cpu.c                   66
-rw-r--r--  kernel/cpuset.c                 7
-rw-r--r--  kernel/exit.c                  18
-rw-r--r--  kernel/fork.c                   4
-rw-r--r--  kernel/futex.c                988
-rw-r--r--  kernel/futex_compat.c          22
-rw-r--r--  kernel/hrtimer.c                2
-rw-r--r--  kernel/irq/handle.c             1
-rw-r--r--  kernel/kmod.c                   6
-rw-r--r--  kernel/kthread.c              113
-rw-r--r--  kernel/module.c                10
-rw-r--r--  kernel/mutex.c                  8
-rw-r--r--  kernel/power/disk.c           195
-rw-r--r--  kernel/power/main.c            42
-rw-r--r--  kernel/power/power.h            7
-rw-r--r--  kernel/power/snapshot.c         2
-rw-r--r--  kernel/power/user.c            13
-rw-r--r--  kernel/profile.c                4
-rw-r--r--  kernel/rcupdate.c               2
-rw-r--r--  kernel/relay.c                 37
-rw-r--r--  kernel/rtmutex.c               41
-rw-r--r--  kernel/rtmutex_common.h        34
-rw-r--r--  kernel/sched.c                 38
-rw-r--r--  kernel/signal.c               140
-rw-r--r--  kernel/softirq.c                4
-rw-r--r--  kernel/softlockup.c             4
-rw-r--r--  kernel/sys.c                   98
-rw-r--r--  kernel/sysctl.c                12
-rw-r--r--  kernel/time/clocksource.c      51
-rw-r--r--  kernel/time/timer_list.c       25
-rw-r--r--  kernel/timer.c                 14
-rw-r--r--  kernel/wait.c                   2
-rw-r--r--  kernel/workqueue.c            783
35 files changed, 1711 insertions, 1101 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0b46a5dff4c0..c64ce9c14207 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -23,7 +23,7 @@ config PREEMPT_VOLUNTARY
23 "explicit preemption points" to the kernel code. These new 23 "explicit preemption points" to the kernel code. These new
24 preemption points have been selected to reduce the maximum 24 preemption points have been selected to reduce the maximum
25 latency of rescheduling, providing faster application reactions, 25 latency of rescheduling, providing faster application reactions,
26 at the cost of slighly lower throughput. 26 at the cost of slightly lower throughput.
27 27
28 This allows reaction to interactive events by allowing a 28 This allows reaction to interactive events by allowing a
29 low priority process to voluntarily preempt itself even if it 29 low priority process to voluntarily preempt itself even if it
@@ -43,7 +43,7 @@ config PREEMPT
43 even if it is in kernel mode executing a system call and would 43 even if it is in kernel mode executing a system call and would
44 otherwise not be about to reach a natural preemption point. 44 otherwise not be about to reach a natural preemption point.
45 This allows applications to run more 'smoothly' even when the 45 This allows applications to run more 'smoothly' even when the
46 system is under load, at the cost of slighly lower throughput 46 system is under load, at the cost of slightly lower throughput
47 and a slight runtime overhead to kernel code. 47 and a slight runtime overhead to kernel code.
48 48
49 Select this if you are building a kernel for a desktop or 49 Select this if you are building a kernel for a desktop or
diff --git a/kernel/configs.c b/kernel/configs.c
index 8fa1fb28f8a7..e84d3f9c6c7b 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -61,18 +61,9 @@ static ssize_t
61ikconfig_read_current(struct file *file, char __user *buf, 61ikconfig_read_current(struct file *file, char __user *buf,
62 size_t len, loff_t * offset) 62 size_t len, loff_t * offset)
63{ 63{
64 loff_t pos = *offset; 64 return simple_read_from_buffer(buf, len, offset,
65 ssize_t count; 65 kernel_config_data + MAGIC_SIZE,
66 66 kernel_config_data_size);
67 if (pos >= kernel_config_data_size)
68 return 0;
69
70 count = min(len, (size_t)(kernel_config_data_size - pos));
71 if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count))
72 return -EFAULT;
73
74 *offset += count;
75 return count;
76} 67}
77 68
78static const struct file_operations ikconfig_file_ops = { 69static const struct file_operations ikconfig_file_ops = {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 36e70845cfc3..208cf3497c10 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -97,7 +97,7 @@ static inline void check_for_tasks(int cpu)
97 (!cputime_eq(p->utime, cputime_zero) || 97 (!cputime_eq(p->utime, cputime_zero) ||
98 !cputime_eq(p->stime, cputime_zero))) 98 !cputime_eq(p->stime, cputime_zero)))
99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
100 (state = %ld, flags = %lx) \n", 100 (state = %ld, flags = %x) \n",
101 p->comm, p->pid, cpu, p->state, p->flags); 101 p->comm, p->pid, cpu, p->state, p->flags);
102 } 102 }
103 write_unlock_irq(&tasklist_lock); 103 write_unlock_irq(&tasklist_lock);
@@ -120,11 +120,13 @@ static int take_cpu_down(void *unused)
120} 120}
121 121
122/* Requires cpu_add_remove_lock to be held */ 122/* Requires cpu_add_remove_lock to be held */
123static int _cpu_down(unsigned int cpu) 123static int _cpu_down(unsigned int cpu, int tasks_frozen)
124{ 124{
125 int err; 125 int err, nr_calls = 0;
126 struct task_struct *p; 126 struct task_struct *p;
127 cpumask_t old_allowed, tmp; 127 cpumask_t old_allowed, tmp;
128 void *hcpu = (void *)(long)cpu;
129 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
128 130
129 if (num_online_cpus() == 1) 131 if (num_online_cpus() == 1)
130 return -EBUSY; 132 return -EBUSY;
@@ -132,12 +134,16 @@ static int _cpu_down(unsigned int cpu)
132 if (!cpu_online(cpu)) 134 if (!cpu_online(cpu))
133 return -EINVAL; 135 return -EINVAL;
134 136
135 err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, 137 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
136 (void *)(long)cpu); 138 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
139 hcpu, -1, &nr_calls);
137 if (err == NOTIFY_BAD) { 140 if (err == NOTIFY_BAD) {
141 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
142 hcpu, nr_calls, NULL);
138 printk("%s: attempt to take down CPU %u failed\n", 143 printk("%s: attempt to take down CPU %u failed\n",
139 __FUNCTION__, cpu); 144 __FUNCTION__, cpu);
140 return -EINVAL; 145 err = -EINVAL;
146 goto out_release;
141 } 147 }
142 148
143 /* Ensure that we are not runnable on dying cpu */ 149 /* Ensure that we are not runnable on dying cpu */
@@ -152,8 +158,8 @@ static int _cpu_down(unsigned int cpu)
152 158
153 if (IS_ERR(p) || cpu_online(cpu)) { 159 if (IS_ERR(p) || cpu_online(cpu)) {
154 /* CPU didn't die: tell everyone. Can't complain. */ 160 /* CPU didn't die: tell everyone. Can't complain. */
155 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, 161 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
156 (void *)(long)cpu) == NOTIFY_BAD) 162 hcpu) == NOTIFY_BAD)
157 BUG(); 163 BUG();
158 164
159 if (IS_ERR(p)) { 165 if (IS_ERR(p)) {
@@ -170,13 +176,9 @@ static int _cpu_down(unsigned int cpu)
170 /* This actually kills the CPU. */ 176 /* This actually kills the CPU. */
171 __cpu_die(cpu); 177 __cpu_die(cpu);
172 178
173 /* Move it here so it can run. */
174 kthread_bind(p, get_cpu());
175 put_cpu();
176
177 /* CPU is completely dead: tell everyone. Too late to complain. */ 179 /* CPU is completely dead: tell everyone. Too late to complain. */
178 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, 180 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
179 (void *)(long)cpu) == NOTIFY_BAD) 181 hcpu) == NOTIFY_BAD)
180 BUG(); 182 BUG();
181 183
182 check_for_tasks(cpu); 184 check_for_tasks(cpu);
@@ -185,6 +187,8 @@ out_thread:
185 err = kthread_stop(p); 187 err = kthread_stop(p);
186out_allowed: 188out_allowed:
187 set_cpus_allowed(current, old_allowed); 189 set_cpus_allowed(current, old_allowed);
190out_release:
191 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
188 return err; 192 return err;
189} 193}
190 194
@@ -196,7 +200,7 @@ int cpu_down(unsigned int cpu)
196 if (cpu_hotplug_disabled) 200 if (cpu_hotplug_disabled)
197 err = -EBUSY; 201 err = -EBUSY;
198 else 202 else
199 err = _cpu_down(cpu); 203 err = _cpu_down(cpu, 0);
200 204
201 mutex_unlock(&cpu_add_remove_lock); 205 mutex_unlock(&cpu_add_remove_lock);
202 return err; 206 return err;
@@ -204,15 +208,18 @@ int cpu_down(unsigned int cpu)
204#endif /*CONFIG_HOTPLUG_CPU*/ 208#endif /*CONFIG_HOTPLUG_CPU*/
205 209
206/* Requires cpu_add_remove_lock to be held */ 210/* Requires cpu_add_remove_lock to be held */
207static int __cpuinit _cpu_up(unsigned int cpu) 211static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
208{ 212{
209 int ret; 213 int ret, nr_calls = 0;
210 void *hcpu = (void *)(long)cpu; 214 void *hcpu = (void *)(long)cpu;
215 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
211 216
212 if (cpu_online(cpu) || !cpu_present(cpu)) 217 if (cpu_online(cpu) || !cpu_present(cpu))
213 return -EINVAL; 218 return -EINVAL;
214 219
215 ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 220 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
221 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
222 -1, &nr_calls);
216 if (ret == NOTIFY_BAD) { 223 if (ret == NOTIFY_BAD) {
217 printk("%s: attempt to bring up CPU %u failed\n", 224 printk("%s: attempt to bring up CPU %u failed\n",
218 __FUNCTION__, cpu); 225 __FUNCTION__, cpu);
@@ -229,12 +236,13 @@ static int __cpuinit _cpu_up(unsigned int cpu)
229 BUG_ON(!cpu_online(cpu)); 236 BUG_ON(!cpu_online(cpu));
230 237
231 /* Now call notifier in preparation. */ 238 /* Now call notifier in preparation. */
232 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); 239 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
233 240
234out_notify: 241out_notify:
235 if (ret != 0) 242 if (ret != 0)
236 raw_notifier_call_chain(&cpu_chain, 243 __raw_notifier_call_chain(&cpu_chain,
237 CPU_UP_CANCELED, hcpu); 244 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
245 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
238 246
239 return ret; 247 return ret;
240} 248}
@@ -247,19 +255,13 @@ int __cpuinit cpu_up(unsigned int cpu)
247 if (cpu_hotplug_disabled) 255 if (cpu_hotplug_disabled)
248 err = -EBUSY; 256 err = -EBUSY;
249 else 257 else
250 err = _cpu_up(cpu); 258 err = _cpu_up(cpu, 0);
251 259
252 mutex_unlock(&cpu_add_remove_lock); 260 mutex_unlock(&cpu_add_remove_lock);
253 return err; 261 return err;
254} 262}
255 263
256#ifdef CONFIG_SUSPEND_SMP 264#ifdef CONFIG_SUSPEND_SMP
257/* Needed to prevent the microcode driver from requesting firmware in its CPU
258 * hotplug notifier during the suspend/resume.
259 */
260int suspend_cpu_hotplug;
261EXPORT_SYMBOL(suspend_cpu_hotplug);
262
263static cpumask_t frozen_cpus; 265static cpumask_t frozen_cpus;
264 266
265int disable_nonboot_cpus(void) 267int disable_nonboot_cpus(void)
@@ -267,7 +269,6 @@ int disable_nonboot_cpus(void)
267 int cpu, first_cpu, error = 0; 269 int cpu, first_cpu, error = 0;
268 270
269 mutex_lock(&cpu_add_remove_lock); 271 mutex_lock(&cpu_add_remove_lock);
270 suspend_cpu_hotplug = 1;
271 first_cpu = first_cpu(cpu_online_map); 272 first_cpu = first_cpu(cpu_online_map);
272 /* We take down all of the non-boot CPUs in one shot to avoid races 273 /* We take down all of the non-boot CPUs in one shot to avoid races
273 * with the userspace trying to use the CPU hotplug at the same time 274 * with the userspace trying to use the CPU hotplug at the same time
@@ -277,7 +278,7 @@ int disable_nonboot_cpus(void)
277 for_each_online_cpu(cpu) { 278 for_each_online_cpu(cpu) {
278 if (cpu == first_cpu) 279 if (cpu == first_cpu)
279 continue; 280 continue;
280 error = _cpu_down(cpu); 281 error = _cpu_down(cpu, 1);
281 if (!error) { 282 if (!error) {
282 cpu_set(cpu, frozen_cpus); 283 cpu_set(cpu, frozen_cpus);
283 printk("CPU%d is down\n", cpu); 284 printk("CPU%d is down\n", cpu);
@@ -294,7 +295,6 @@ int disable_nonboot_cpus(void)
294 } else { 295 } else {
295 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 296 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
296 } 297 }
297 suspend_cpu_hotplug = 0;
298 mutex_unlock(&cpu_add_remove_lock); 298 mutex_unlock(&cpu_add_remove_lock);
299 return error; 299 return error;
300} 300}
@@ -309,10 +309,9 @@ void enable_nonboot_cpus(void)
309 if (cpus_empty(frozen_cpus)) 309 if (cpus_empty(frozen_cpus))
310 goto out; 310 goto out;
311 311
312 suspend_cpu_hotplug = 1;
313 printk("Enabling non-boot CPUs ...\n"); 312 printk("Enabling non-boot CPUs ...\n");
314 for_each_cpu_mask(cpu, frozen_cpus) { 313 for_each_cpu_mask(cpu, frozen_cpus) {
315 error = _cpu_up(cpu); 314 error = _cpu_up(cpu, 1);
316 if (!error) { 315 if (!error) {
317 printk("CPU%d is up\n", cpu); 316 printk("CPU%d is up\n", cpu);
318 continue; 317 continue;
@@ -320,7 +319,6 @@ void enable_nonboot_cpus(void)
320 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 319 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
321 } 320 }
322 cpus_clear(frozen_cpus); 321 cpus_clear(frozen_cpus);
323 suspend_cpu_hotplug = 0;
324out: 322out:
325 mutex_unlock(&cpu_add_remove_lock); 323 mutex_unlock(&cpu_add_remove_lock);
326} 324}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 88b416dfbc72..f57854b08922 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1772,12 +1772,7 @@ static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
1772{ 1772{
1773 struct ctr_struct *ctr = file->private_data; 1773 struct ctr_struct *ctr = file->private_data;
1774 1774
1775 if (*ppos + nbytes > ctr->bufsz) 1775 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
1776 nbytes = ctr->bufsz - *ppos;
1777 if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
1778 return -EFAULT;
1779 *ppos += nbytes;
1780 return nbytes;
1781} 1776}
1782 1777
1783static int cpuset_tasks_release(struct inode *unused_inode, struct file *file) 1778static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
diff --git a/kernel/exit.c b/kernel/exit.c
index f5a7abb621f3..b0c6f0c3a2df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -26,6 +26,7 @@
26#include <linux/profile.h> 26#include <linux/profile.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
29#include <linux/kthread.h>
29#include <linux/mempolicy.h> 30#include <linux/mempolicy.h>
30#include <linux/taskstats_kern.h> 31#include <linux/taskstats_kern.h>
31#include <linux/delayacct.h> 32#include <linux/delayacct.h>
@@ -254,26 +255,25 @@ static int has_stopped_jobs(struct pid *pgrp)
254} 255}
255 256
256/** 257/**
257 * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to. 258 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
258 * 259 *
259 * If a kernel thread is launched as a result of a system call, or if 260 * If a kernel thread is launched as a result of a system call, or if
260 * it ever exits, it should generally reparent itself to init so that 261 * it ever exits, it should generally reparent itself to kthreadd so it
261 * it is correctly cleaned up on exit. 262 * isn't in the way of other processes and is correctly cleaned up on exit.
262 * 263 *
263 * The various task state such as scheduling policy and priority may have 264 * The various task state such as scheduling policy and priority may have
264 * been inherited from a user process, so we reset them to sane values here. 265 * been inherited from a user process, so we reset them to sane values here.
265 * 266 *
266 * NOTE that reparent_to_init() gives the caller full capabilities. 267 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
267 */ 268 */
268static void reparent_to_init(void) 269static void reparent_to_kthreadd(void)
269{ 270{
270 write_lock_irq(&tasklist_lock); 271 write_lock_irq(&tasklist_lock);
271 272
272 ptrace_unlink(current); 273 ptrace_unlink(current);
273 /* Reparent to init */ 274 /* Reparent to init */
274 remove_parent(current); 275 remove_parent(current);
275 current->parent = child_reaper(current); 276 current->real_parent = current->parent = kthreadd_task;
276 current->real_parent = child_reaper(current);
277 add_parent(current); 277 add_parent(current);
278 278
279 /* Set the exit signal to SIGCHLD so we signal init on exit */ 279 /* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -347,7 +347,7 @@ int disallow_signal(int sig)
347 return -EINVAL; 347 return -EINVAL;
348 348
349 spin_lock_irq(&current->sighand->siglock); 349 spin_lock_irq(&current->sighand->siglock);
350 sigaddset(&current->blocked, sig); 350 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
351 recalc_sigpending(); 351 recalc_sigpending();
352 spin_unlock_irq(&current->sighand->siglock); 352 spin_unlock_irq(&current->sighand->siglock);
353 return 0; 353 return 0;
@@ -400,7 +400,7 @@ void daemonize(const char *name, ...)
400 current->files = init_task.files; 400 current->files = init_task.files;
401 atomic_inc(&current->files->count); 401 atomic_inc(&current->files->count);
402 402
403 reparent_to_init(); 403 reparent_to_kthreadd();
404} 404}
405 405
406EXPORT_SYMBOL(daemonize); 406EXPORT_SYMBOL(daemonize);
diff --git a/kernel/fork.c b/kernel/fork.c
index a8dd75d4992b..5dd3979747f5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,7 +105,7 @@ static struct kmem_cache *mm_cachep;
105 105
106void free_task(struct task_struct *tsk) 106void free_task(struct task_struct *tsk)
107{ 107{
108 free_thread_info(tsk->thread_info); 108 free_thread_info(tsk->stack);
109 rt_mutex_debug_task_free(tsk); 109 rt_mutex_debug_task_free(tsk);
110 free_task_struct(tsk); 110 free_task_struct(tsk);
111} 111}
@@ -175,7 +175,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
175 } 175 }
176 176
177 *tsk = *orig; 177 *tsk = *orig;
178 tsk->thread_info = ti; 178 tsk->stack = ti;
179 setup_thread_stack(tsk, orig); 179 setup_thread_stack(tsk, orig);
180 180
181#ifdef CONFIG_CC_STACKPROTECTOR 181#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/futex.c b/kernel/futex.c
index 600bc9d801f2..b7ce15c67e32 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -16,6 +16,9 @@
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 * 18 *
19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 *
19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
20 * enough at me, Linus for the original (flawed) idea, Matthew 23 * enough at me, Linus for the original (flawed) idea, Matthew
21 * Kirkwood for proof-of-concept implementation. 24 * Kirkwood for proof-of-concept implementation.
@@ -53,6 +56,12 @@
53 56
54#include "rtmutex_common.h" 57#include "rtmutex_common.h"
55 58
59#ifdef CONFIG_DEBUG_RT_MUTEXES
60# include "rtmutex-debug.h"
61#else
62# include "rtmutex.h"
63#endif
64
56#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 65#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
57 66
58/* 67/*
@@ -81,12 +90,12 @@ struct futex_pi_state {
81 * we can wake only the relevant ones (hashed queues may be shared). 90 * we can wake only the relevant ones (hashed queues may be shared).
82 * 91 *
83 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 92 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
84 * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0. 93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
85 * The order of wakup is always to make the first condition true, then 94 * The order of wakup is always to make the first condition true, then
86 * wake up q->waiters, then make the second condition true. 95 * wake up q->waiters, then make the second condition true.
87 */ 96 */
88struct futex_q { 97struct futex_q {
89 struct list_head list; 98 struct plist_node list;
90 wait_queue_head_t waiters; 99 wait_queue_head_t waiters;
91 100
92 /* Which hash list lock to use: */ 101 /* Which hash list lock to use: */
@@ -102,14 +111,20 @@ struct futex_q {
102 /* Optional priority inheritance state: */ 111 /* Optional priority inheritance state: */
103 struct futex_pi_state *pi_state; 112 struct futex_pi_state *pi_state;
104 struct task_struct *task; 113 struct task_struct *task;
114
115 /*
116 * This waiter is used in case of requeue from a
117 * normal futex to a PI-futex
118 */
119 struct rt_mutex_waiter waiter;
105}; 120};
106 121
107/* 122/*
108 * Split the global futex_lock into every hash list lock. 123 * Split the global futex_lock into every hash list lock.
109 */ 124 */
110struct futex_hash_bucket { 125struct futex_hash_bucket {
111 spinlock_t lock; 126 spinlock_t lock;
112 struct list_head chain; 127 struct plist_head chain;
113}; 128};
114 129
115static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 130static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
@@ -138,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
138 && key1->both.offset == key2->both.offset); 153 && key1->both.offset == key2->both.offset);
139} 154}
140 155
141/* 156/**
142 * Get parameters which are the keys for a futex. 157 * get_futex_key - Get parameters which are the keys for a futex.
158 * @uaddr: virtual address of the futex
159 * @shared: NULL for a PROCESS_PRIVATE futex,
160 * &current->mm->mmap_sem for a PROCESS_SHARED futex
161 * @key: address where result is stored.
162 *
163 * Returns a negative error code or 0
164 * The key words are stored in *key on success.
143 * 165 *
144 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, 166 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
145 * offset_within_page). For private mappings, it's (uaddr, current->mm). 167 * offset_within_page). For private mappings, it's (uaddr, current->mm).
146 * We can usually work out the index without swapping in the page. 168 * We can usually work out the index without swapping in the page.
147 * 169 *
148 * Returns: 0, or negative error code. 170 * fshared is NULL for PROCESS_PRIVATE futexes
149 * The key words are stored in *key on success. 171 * For other futexes, it points to &current->mm->mmap_sem and
150 * 172 * caller must have taken the reader lock. but NOT any spinlocks.
151 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
152 */ 173 */
153int get_futex_key(u32 __user *uaddr, union futex_key *key) 174int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
175 union futex_key *key)
154{ 176{
155 unsigned long address = (unsigned long)uaddr; 177 unsigned long address = (unsigned long)uaddr;
156 struct mm_struct *mm = current->mm; 178 struct mm_struct *mm = current->mm;
@@ -162,11 +184,25 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
162 * The futex address must be "naturally" aligned. 184 * The futex address must be "naturally" aligned.
163 */ 185 */
164 key->both.offset = address % PAGE_SIZE; 186 key->both.offset = address % PAGE_SIZE;
165 if (unlikely((key->both.offset % sizeof(u32)) != 0)) 187 if (unlikely((address % sizeof(u32)) != 0))
166 return -EINVAL; 188 return -EINVAL;
167 address -= key->both.offset; 189 address -= key->both.offset;
168 190
169 /* 191 /*
192 * PROCESS_PRIVATE futexes are fast.
193 * As the mm cannot disappear under us and the 'key' only needs
194 * virtual address, we dont even have to find the underlying vma.
195 * Note : We do have to check 'uaddr' is a valid user address,
196 * but access_ok() should be faster than find_vma()
197 */
198 if (!fshared) {
199 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
200 return -EFAULT;
201 key->private.mm = mm;
202 key->private.address = address;
203 return 0;
204 }
205 /*
170 * The futex is hashed differently depending on whether 206 * The futex is hashed differently depending on whether
171 * it's in a shared or private mapping. So check vma first. 207 * it's in a shared or private mapping. So check vma first.
172 */ 208 */
@@ -180,6 +216,9 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
180 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 216 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
181 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 217 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
182 218
219 /* Save the user address in the ley */
220 key->uaddr = uaddr;
221
183 /* 222 /*
184 * Private mappings are handled in a simple way. 223 * Private mappings are handled in a simple way.
185 * 224 *
@@ -190,6 +229,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
190 * mappings of _writable_ handles. 229 * mappings of _writable_ handles.
191 */ 230 */
192 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 231 if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
232 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
193 key->private.mm = mm; 233 key->private.mm = mm;
194 key->private.address = address; 234 key->private.address = address;
195 return 0; 235 return 0;
@@ -199,7 +239,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
199 * Linear file mappings are also simple. 239 * Linear file mappings are also simple.
200 */ 240 */
201 key->shared.inode = vma->vm_file->f_path.dentry->d_inode; 241 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
202 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 242 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
203 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 243 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
204 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) 244 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
205 + vma->vm_pgoff); 245 + vma->vm_pgoff);
@@ -227,16 +267,18 @@ EXPORT_SYMBOL_GPL(get_futex_key);
227 * Take a reference to the resource addressed by a key. 267 * Take a reference to the resource addressed by a key.
228 * Can be called while holding spinlocks. 268 * Can be called while holding spinlocks.
229 * 269 *
230 * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
231 * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
232 */ 270 */
233inline void get_futex_key_refs(union futex_key *key) 271inline void get_futex_key_refs(union futex_key *key)
234{ 272{
235 if (key->both.ptr != 0) { 273 if (key->both.ptr == 0)
236 if (key->both.offset & 1) 274 return;
275 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
276 case FUT_OFF_INODE:
237 atomic_inc(&key->shared.inode->i_count); 277 atomic_inc(&key->shared.inode->i_count);
238 else 278 break;
279 case FUT_OFF_MMSHARED:
239 atomic_inc(&key->private.mm->mm_count); 280 atomic_inc(&key->private.mm->mm_count);
281 break;
240 } 282 }
241} 283}
242EXPORT_SYMBOL_GPL(get_futex_key_refs); 284EXPORT_SYMBOL_GPL(get_futex_key_refs);
@@ -247,11 +289,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);
247 */ 289 */
248void drop_futex_key_refs(union futex_key *key) 290void drop_futex_key_refs(union futex_key *key)
249{ 291{
250 if (key->both.ptr != 0) { 292 if (key->both.ptr == 0)
251 if (key->both.offset & 1) 293 return;
294 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
295 case FUT_OFF_INODE:
252 iput(key->shared.inode); 296 iput(key->shared.inode);
253 else 297 break;
298 case FUT_OFF_MMSHARED:
254 mmdrop(key->private.mm); 299 mmdrop(key->private.mm);
300 break;
255 } 301 }
256} 302}
257EXPORT_SYMBOL_GPL(drop_futex_key_refs); 303EXPORT_SYMBOL_GPL(drop_futex_key_refs);
@@ -268,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
268} 314}
269 315
270/* 316/*
271 * Fault handling. Called with current->mm->mmap_sem held. 317 * Fault handling.
318 * if fshared is non NULL, current->mm->mmap_sem is already held
272 */ 319 */
273static int futex_handle_fault(unsigned long address, int attempt) 320static int futex_handle_fault(unsigned long address,
321 struct rw_semaphore *fshared, int attempt)
274{ 322{
275 struct vm_area_struct * vma; 323 struct vm_area_struct * vma;
276 struct mm_struct *mm = current->mm; 324 struct mm_struct *mm = current->mm;
325 int ret = -EFAULT;
277 326
278 if (attempt > 2 || !(vma = find_vma(mm, address)) || 327 if (attempt > 2)
279 vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) 328 return ret;
280 return -EFAULT;
281 329
282 switch (handle_mm_fault(mm, vma, address, 1)) { 330 if (!fshared)
283 case VM_FAULT_MINOR: 331 down_read(&mm->mmap_sem);
284 current->min_flt++; 332 vma = find_vma(mm, address);
285 break; 333 if (vma && address >= vma->vm_start &&
286 case VM_FAULT_MAJOR: 334 (vma->vm_flags & VM_WRITE)) {
287 current->maj_flt++; 335 switch (handle_mm_fault(mm, vma, address, 1)) {
288 break; 336 case VM_FAULT_MINOR:
289 default: 337 ret = 0;
290 return -EFAULT; 338 current->min_flt++;
339 break;
340 case VM_FAULT_MAJOR:
341 ret = 0;
342 current->maj_flt++;
343 break;
344 }
291 } 345 }
292 return 0; 346 if (!fshared)
347 up_read(&mm->mmap_sem);
348 return ret;
293} 349}
294 350
295/* 351/*
@@ -439,18 +495,19 @@ void exit_pi_state_list(struct task_struct *curr)
439} 495}
440 496
441static int 497static int
442lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) 498lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
499 union futex_key *key, struct futex_pi_state **ps)
443{ 500{
444 struct futex_pi_state *pi_state = NULL; 501 struct futex_pi_state *pi_state = NULL;
445 struct futex_q *this, *next; 502 struct futex_q *this, *next;
446 struct list_head *head; 503 struct plist_head *head;
447 struct task_struct *p; 504 struct task_struct *p;
448 pid_t pid; 505 pid_t pid;
449 506
450 head = &hb->chain; 507 head = &hb->chain;
451 508
452 list_for_each_entry_safe(this, next, head, list) { 509 plist_for_each_entry_safe(this, next, head, list) {
453 if (match_futex(&this->key, &me->key)) { 510 if (match_futex(&this->key, key)) {
454 /* 511 /*
455 * Another waiter already exists - bump up 512 * Another waiter already exists - bump up
456 * the refcount and return its pi_state: 513 * the refcount and return its pi_state:
@@ -465,7 +522,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
465 WARN_ON(!atomic_read(&pi_state->refcount)); 522 WARN_ON(!atomic_read(&pi_state->refcount));
466 523
467 atomic_inc(&pi_state->refcount); 524 atomic_inc(&pi_state->refcount);
468 me->pi_state = pi_state; 525 *ps = pi_state;
469 526
470 return 0; 527 return 0;
471 } 528 }
@@ -492,7 +549,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
492 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 549 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
493 550
494 /* Store the key for possible exit cleanups: */ 551 /* Store the key for possible exit cleanups: */
495 pi_state->key = me->key; 552 pi_state->key = *key;
496 553
497 spin_lock_irq(&p->pi_lock); 554 spin_lock_irq(&p->pi_lock);
498 WARN_ON(!list_empty(&pi_state->list)); 555 WARN_ON(!list_empty(&pi_state->list));
@@ -502,7 +559,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
502 559
503 put_task_struct(p); 560 put_task_struct(p);
504 561
505 me->pi_state = pi_state; 562 *ps = pi_state;
506 563
507 return 0; 564 return 0;
508} 565}
@@ -513,12 +570,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
513 */ 570 */
514static void wake_futex(struct futex_q *q) 571static void wake_futex(struct futex_q *q)
515{ 572{
516 list_del_init(&q->list); 573 plist_del(&q->list, &q->list.plist);
517 if (q->filp) 574 if (q->filp)
518 send_sigio(&q->filp->f_owner, q->fd, POLL_IN); 575 send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
519 /* 576 /*
520 * The lock in wake_up_all() is a crucial memory barrier after the 577 * The lock in wake_up_all() is a crucial memory barrier after the
521 * list_del_init() and also before assigning to q->lock_ptr. 578 * plist_del() and also before assigning to q->lock_ptr.
522 */ 579 */
523 wake_up_all(&q->waiters); 580 wake_up_all(&q->waiters);
524 /* 581 /*
@@ -562,6 +619,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
562 */ 619 */
563 if (!(uval & FUTEX_OWNER_DIED)) { 620 if (!(uval & FUTEX_OWNER_DIED)) {
564 newval = FUTEX_WAITERS | new_owner->pid; 621 newval = FUTEX_WAITERS | new_owner->pid;
622 /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
623 newval |= (uval & FUTEX_WAITER_REQUEUED);
565 624
566 pagefault_disable(); 625 pagefault_disable();
567 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 626 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -629,17 +688,19 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
629 * Wake up all waiters hashed on the physical page that is mapped 688 * Wake up all waiters hashed on the physical page that is mapped
630 * to this virtual address: 689 * to this virtual address:
631 */ 690 */
632static int futex_wake(u32 __user *uaddr, int nr_wake) 691static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
692 int nr_wake)
633{ 693{
634 struct futex_hash_bucket *hb; 694 struct futex_hash_bucket *hb;
635 struct futex_q *this, *next; 695 struct futex_q *this, *next;
636 struct list_head *head; 696 struct plist_head *head;
637 union futex_key key; 697 union futex_key key;
638 int ret; 698 int ret;
639 699
640 down_read(&current->mm->mmap_sem); 700 if (fshared)
701 down_read(fshared);
641 702
642 ret = get_futex_key(uaddr, &key); 703 ret = get_futex_key(uaddr, fshared, &key);
643 if (unlikely(ret != 0)) 704 if (unlikely(ret != 0))
644 goto out; 705 goto out;
645 706
@@ -647,7 +708,7 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
647 spin_lock(&hb->lock); 708 spin_lock(&hb->lock);
648 head = &hb->chain; 709 head = &hb->chain;
649 710
650 list_for_each_entry_safe(this, next, head, list) { 711 plist_for_each_entry_safe(this, next, head, list) {
651 if (match_futex (&this->key, &key)) { 712 if (match_futex (&this->key, &key)) {
652 if (this->pi_state) { 713 if (this->pi_state) {
653 ret = -EINVAL; 714 ret = -EINVAL;
@@ -661,7 +722,261 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
661 722
662 spin_unlock(&hb->lock); 723 spin_unlock(&hb->lock);
663out: 724out:
664 up_read(&current->mm->mmap_sem); 725 if (fshared)
726 up_read(fshared);
727 return ret;
728}
729
730/*
731 * Called from futex_requeue_pi.
732 * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
733 * PI-futex value; search its associated pi_state if an owner exist
734 * or create a new one without owner.
735 */
736static inline int
737lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
738 union futex_key *key,
739 struct futex_pi_state **pi_state)
740{
741 u32 curval, uval, newval;
742
743retry:
744 /*
745 * We can't handle a fault cleanly because we can't
746 * release the locks here. Simply return the fault.
747 */
748 if (get_futex_value_locked(&curval, uaddr))
749 return -EFAULT;
750
751 /* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
752 if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
753 != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
754 /*
755 * No waiters yet, we prepare the futex to have some waiters.
756 */
757
758 uval = curval;
759 newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
760
761 pagefault_disable();
762 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
763 pagefault_enable();
764
765 if (unlikely(curval == -EFAULT))
766 return -EFAULT;
767 if (unlikely(curval != uval))
768 goto retry;
769 }
770
771 if (!(curval & FUTEX_TID_MASK)
772 || lookup_pi_state(curval, hb, key, pi_state)) {
773 /* the futex has no owner (yet) or the lookup failed:
774 allocate one pi_state without owner */
775
776 *pi_state = alloc_pi_state();
777
778 /* Already stores the key: */
779 (*pi_state)->key = *key;
780
781 /* init the mutex without owner */
782 __rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
783 }
784
785 return 0;
786}
787
788/*
789 * Keep the first nr_wake waiter from futex1, wake up one,
790 * and requeue the next nr_requeue waiters following hashed on
791 * one physical page to another physical page (PI-futex uaddr2)
792 */
793static int futex_requeue_pi(u32 __user *uaddr1,
794 struct rw_semaphore *fshared,
795 u32 __user *uaddr2,
796 int nr_wake, int nr_requeue, u32 *cmpval)
797{
798 union futex_key key1, key2;
799 struct futex_hash_bucket *hb1, *hb2;
800 struct plist_head *head1;
801 struct futex_q *this, *next;
802 struct futex_pi_state *pi_state2 = NULL;
803 struct rt_mutex_waiter *waiter, *top_waiter = NULL;
804 struct rt_mutex *lock2 = NULL;
805 int ret, drop_count = 0;
806
807 if (refill_pi_state_cache())
808 return -ENOMEM;
809
810retry:
811 /*
812 * First take all the futex related locks:
813 */
814 if (fshared)
815 down_read(fshared);
816
817 ret = get_futex_key(uaddr1, fshared, &key1);
818 if (unlikely(ret != 0))
819 goto out;
820 ret = get_futex_key(uaddr2, fshared, &key2);
821 if (unlikely(ret != 0))
822 goto out;
823
824 hb1 = hash_futex(&key1);
825 hb2 = hash_futex(&key2);
826
827 double_lock_hb(hb1, hb2);
828
829 if (likely(cmpval != NULL)) {
830 u32 curval;
831
832 ret = get_futex_value_locked(&curval, uaddr1);
833
834 if (unlikely(ret)) {
835 spin_unlock(&hb1->lock);
836 if (hb1 != hb2)
837 spin_unlock(&hb2->lock);
838
839 /*
840 * If we would have faulted, release mmap_sem, fault
841 * it in and start all over again.
842 */
843 if (fshared)
844 up_read(fshared);
845
846 ret = get_user(curval, uaddr1);
847
848 if (!ret)
849 goto retry;
850
851 return ret;
852 }
853 if (curval != *cmpval) {
854 ret = -EAGAIN;
855 goto out_unlock;
856 }
857 }
858
859 head1 = &hb1->chain;
860 plist_for_each_entry_safe(this, next, head1, list) {
861 if (!match_futex (&this->key, &key1))
862 continue;
863 if (++ret <= nr_wake) {
864 wake_futex(this);
865 } else {
866 /*
867 * FIRST: get and set the pi_state
868 */
869 if (!pi_state2) {
870 int s;
871 /* do this only the first time we requeue someone */
872 s = lookup_pi_state_for_requeue(uaddr2, hb2,
873 &key2, &pi_state2);
874 if (s) {
875 ret = s;
876 goto out_unlock;
877 }
878
879 lock2 = &pi_state2->pi_mutex;
880 spin_lock(&lock2->wait_lock);
881
882 /* Save the top waiter of the wait_list */
883 if (rt_mutex_has_waiters(lock2))
884 top_waiter = rt_mutex_top_waiter(lock2);
885 } else
886 atomic_inc(&pi_state2->refcount);
887
888
889 this->pi_state = pi_state2;
890
891 /*
892 * SECOND: requeue futex_q to the correct hashbucket
893 */
894
895 /*
896 * If key1 and key2 hash to the same bucket, no need to
897 * requeue.
898 */
899 if (likely(head1 != &hb2->chain)) {
900 plist_del(&this->list, &hb1->chain);
901 plist_add(&this->list, &hb2->chain);
902 this->lock_ptr = &hb2->lock;
903#ifdef CONFIG_DEBUG_PI_LIST
904 this->list.plist.lock = &hb2->lock;
905#endif
906 }
907 this->key = key2;
908 get_futex_key_refs(&key2);
909 drop_count++;
910
911
912 /*
913 * THIRD: queue it to lock2
914 */
915 spin_lock_irq(&this->task->pi_lock);
916 waiter = &this->waiter;
917 waiter->task = this->task;
918 waiter->lock = lock2;
919 plist_node_init(&waiter->list_entry, this->task->prio);
920 plist_node_init(&waiter->pi_list_entry, this->task->prio);
921 plist_add(&waiter->list_entry, &lock2->wait_list);
922 this->task->pi_blocked_on = waiter;
923 spin_unlock_irq(&this->task->pi_lock);
924
925 if (ret - nr_wake >= nr_requeue)
926 break;
927 }
928 }
929
930 /* If we've requeued some tasks and the top_waiter of the rt_mutex
931 has changed, we must adjust the priority of the owner, if any */
932 if (drop_count) {
933 struct task_struct *owner = rt_mutex_owner(lock2);
934 if (owner &&
935 (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
936 int chain_walk = 0;
937
938 spin_lock_irq(&owner->pi_lock);
939 if (top_waiter)
940 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
941 else
942 /*
943 * There was no waiters before the requeue,
944 * the flag must be updated
945 */
946 mark_rt_mutex_waiters(lock2);
947
948 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
949 __rt_mutex_adjust_prio(owner);
950 if (owner->pi_blocked_on) {
951 chain_walk = 1;
952 get_task_struct(owner);
953 }
954
955 spin_unlock_irq(&owner->pi_lock);
956 spin_unlock(&lock2->wait_lock);
957
958 if (chain_walk)
959 rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
960 current);
961 } else {
962 /* No owner or the top_waiter does not change */
963 mark_rt_mutex_waiters(lock2);
964 spin_unlock(&lock2->wait_lock);
965 }
966 }
967
968out_unlock:
969 spin_unlock(&hb1->lock);
970 if (hb1 != hb2)
971 spin_unlock(&hb2->lock);
972
973 /* drop_futex_key_refs() must be called outside the spinlocks. */
974 while (--drop_count >= 0)
975 drop_futex_key_refs(&key1);
976
977out:
978 if (fshared)
979 up_read(fshared);
665 return ret; 980 return ret;
666} 981}
667 982
@@ -670,22 +985,24 @@ out:
670 * to this virtual address: 985 * to this virtual address:
671 */ 986 */
672static int 987static int
673futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, 988futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
989 u32 __user *uaddr2,
674 int nr_wake, int nr_wake2, int op) 990 int nr_wake, int nr_wake2, int op)
675{ 991{
676 union futex_key key1, key2; 992 union futex_key key1, key2;
677 struct futex_hash_bucket *hb1, *hb2; 993 struct futex_hash_bucket *hb1, *hb2;
678 struct list_head *head; 994 struct plist_head *head;
679 struct futex_q *this, *next; 995 struct futex_q *this, *next;
680 int ret, op_ret, attempt = 0; 996 int ret, op_ret, attempt = 0;
681 997
682retryfull: 998retryfull:
683 down_read(&current->mm->mmap_sem); 999 if (fshared)
1000 down_read(fshared);
684 1001
685 ret = get_futex_key(uaddr1, &key1); 1002 ret = get_futex_key(uaddr1, fshared, &key1);
686 if (unlikely(ret != 0)) 1003 if (unlikely(ret != 0))
687 goto out; 1004 goto out;
688 ret = get_futex_key(uaddr2, &key2); 1005 ret = get_futex_key(uaddr2, fshared, &key2);
689 if (unlikely(ret != 0)) 1006 if (unlikely(ret != 0))
690 goto out; 1007 goto out;
691 1008
@@ -725,11 +1042,10 @@ retry:
725 * still holding the mmap_sem. 1042 * still holding the mmap_sem.
726 */ 1043 */
727 if (attempt++) { 1044 if (attempt++) {
728 if (futex_handle_fault((unsigned long)uaddr2, 1045 ret = futex_handle_fault((unsigned long)uaddr2,
729 attempt)) { 1046 fshared, attempt);
730 ret = -EFAULT; 1047 if (ret)
731 goto out; 1048 goto out;
732 }
733 goto retry; 1049 goto retry;
734 } 1050 }
735 1051
@@ -737,7 +1053,8 @@ retry:
737 * If we would have faulted, release mmap_sem, 1053 * If we would have faulted, release mmap_sem,
738 * fault it in and start all over again. 1054 * fault it in and start all over again.
739 */ 1055 */
740 up_read(&current->mm->mmap_sem); 1056 if (fshared)
1057 up_read(fshared);
741 1058
742 ret = get_user(dummy, uaddr2); 1059 ret = get_user(dummy, uaddr2);
743 if (ret) 1060 if (ret)
@@ -748,7 +1065,7 @@ retry:
748 1065
749 head = &hb1->chain; 1066 head = &hb1->chain;
750 1067
751 list_for_each_entry_safe(this, next, head, list) { 1068 plist_for_each_entry_safe(this, next, head, list) {
752 if (match_futex (&this->key, &key1)) { 1069 if (match_futex (&this->key, &key1)) {
753 wake_futex(this); 1070 wake_futex(this);
754 if (++ret >= nr_wake) 1071 if (++ret >= nr_wake)
@@ -760,7 +1077,7 @@ retry:
760 head = &hb2->chain; 1077 head = &hb2->chain;
761 1078
762 op_ret = 0; 1079 op_ret = 0;
763 list_for_each_entry_safe(this, next, head, list) { 1080 plist_for_each_entry_safe(this, next, head, list) {
764 if (match_futex (&this->key, &key2)) { 1081 if (match_futex (&this->key, &key2)) {
765 wake_futex(this); 1082 wake_futex(this);
766 if (++op_ret >= nr_wake2) 1083 if (++op_ret >= nr_wake2)
@@ -774,7 +1091,8 @@ retry:
774 if (hb1 != hb2) 1091 if (hb1 != hb2)
775 spin_unlock(&hb2->lock); 1092 spin_unlock(&hb2->lock);
776out: 1093out:
777 up_read(&current->mm->mmap_sem); 1094 if (fshared)
1095 up_read(fshared);
778 return ret; 1096 return ret;
779} 1097}
780 1098
@@ -782,22 +1100,24 @@ out:
782 * Requeue all waiters hashed on one physical page to another 1100 * Requeue all waiters hashed on one physical page to another
783 * physical page. 1101 * physical page.
784 */ 1102 */
785static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, 1103static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
1104 u32 __user *uaddr2,
786 int nr_wake, int nr_requeue, u32 *cmpval) 1105 int nr_wake, int nr_requeue, u32 *cmpval)
787{ 1106{
788 union futex_key key1, key2; 1107 union futex_key key1, key2;
789 struct futex_hash_bucket *hb1, *hb2; 1108 struct futex_hash_bucket *hb1, *hb2;
790 struct list_head *head1; 1109 struct plist_head *head1;
791 struct futex_q *this, *next; 1110 struct futex_q *this, *next;
792 int ret, drop_count = 0; 1111 int ret, drop_count = 0;
793 1112
794 retry: 1113 retry:
795 down_read(&current->mm->mmap_sem); 1114 if (fshared)
1115 down_read(fshared);
796 1116
797 ret = get_futex_key(uaddr1, &key1); 1117 ret = get_futex_key(uaddr1, fshared, &key1);
798 if (unlikely(ret != 0)) 1118 if (unlikely(ret != 0))
799 goto out; 1119 goto out;
800 ret = get_futex_key(uaddr2, &key2); 1120 ret = get_futex_key(uaddr2, fshared, &key2);
801 if (unlikely(ret != 0)) 1121 if (unlikely(ret != 0))
802 goto out; 1122 goto out;
803 1123
@@ -820,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
820 * If we would have faulted, release mmap_sem, fault 1140 * If we would have faulted, release mmap_sem, fault
821 * it in and start all over again. 1141 * it in and start all over again.
822 */ 1142 */
823 up_read(&current->mm->mmap_sem); 1143 if (fshared)
1144 up_read(fshared);
824 1145
825 ret = get_user(curval, uaddr1); 1146 ret = get_user(curval, uaddr1);
826 1147
@@ -836,7 +1157,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
836 } 1157 }
837 1158
838 head1 = &hb1->chain; 1159 head1 = &hb1->chain;
839 list_for_each_entry_safe(this, next, head1, list) { 1160 plist_for_each_entry_safe(this, next, head1, list) {
840 if (!match_futex (&this->key, &key1)) 1161 if (!match_futex (&this->key, &key1))
841 continue; 1162 continue;
842 if (++ret <= nr_wake) { 1163 if (++ret <= nr_wake) {
@@ -847,9 +1168,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
847 * requeue. 1168 * requeue.
848 */ 1169 */
849 if (likely(head1 != &hb2->chain)) { 1170 if (likely(head1 != &hb2->chain)) {
850 list_move_tail(&this->list, &hb2->chain); 1171 plist_del(&this->list, &hb1->chain);
1172 plist_add(&this->list, &hb2->chain);
851 this->lock_ptr = &hb2->lock; 1173 this->lock_ptr = &hb2->lock;
852 } 1174#ifdef CONFIG_DEBUG_PI_LIST
1175 this->list.plist.lock = &hb2->lock;
1176#endif
1177 }
853 this->key = key2; 1178 this->key = key2;
854 get_futex_key_refs(&key2); 1179 get_futex_key_refs(&key2);
855 drop_count++; 1180 drop_count++;
@@ -869,7 +1194,8 @@ out_unlock:
869 drop_futex_key_refs(&key1); 1194 drop_futex_key_refs(&key1);
870 1195
871out: 1196out:
872 up_read(&current->mm->mmap_sem); 1197 if (fshared)
1198 up_read(fshared);
873 return ret; 1199 return ret;
874} 1200}
875 1201
@@ -894,7 +1220,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
894 1220
895static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1221static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
896{ 1222{
897 list_add_tail(&q->list, &hb->chain); 1223 int prio;
1224
1225 /*
1226 * The priority used to register this element is
1227 * - either the real thread-priority for the real-time threads
1228 * (i.e. threads with a priority lower than MAX_RT_PRIO)
1229 * - or MAX_RT_PRIO for non-RT threads.
1230 * Thus, all RT-threads are woken first in priority order, and
1231 * the others are woken last, in FIFO order.
1232 */
1233 prio = min(current->normal_prio, MAX_RT_PRIO);
1234
1235 plist_node_init(&q->list, prio);
1236#ifdef CONFIG_DEBUG_PI_LIST
1237 q->list.plist.lock = &hb->lock;
1238#endif
1239 plist_add(&q->list, &hb->chain);
898 q->task = current; 1240 q->task = current;
899 spin_unlock(&hb->lock); 1241 spin_unlock(&hb->lock);
900} 1242}
@@ -949,8 +1291,8 @@ static int unqueue_me(struct futex_q *q)
949 spin_unlock(lock_ptr); 1291 spin_unlock(lock_ptr);
950 goto retry; 1292 goto retry;
951 } 1293 }
952 WARN_ON(list_empty(&q->list)); 1294 WARN_ON(plist_node_empty(&q->list));
953 list_del(&q->list); 1295 plist_del(&q->list, &q->list.plist);
954 1296
955 BUG_ON(q->pi_state); 1297 BUG_ON(q->pi_state);
956 1298
@@ -964,39 +1306,104 @@ static int unqueue_me(struct futex_q *q)
964 1306
965/* 1307/*
966 * PI futexes can not be requeued and must remove themself from the 1308 * PI futexes can not be requeued and must remove themself from the
967 * hash bucket. The hash bucket lock is held on entry and dropped here. 1309 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1310 * and dropped here.
968 */ 1311 */
969static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) 1312static void unqueue_me_pi(struct futex_q *q)
970{ 1313{
971 WARN_ON(list_empty(&q->list)); 1314 WARN_ON(plist_node_empty(&q->list));
972 list_del(&q->list); 1315 plist_del(&q->list, &q->list.plist);
973 1316
974 BUG_ON(!q->pi_state); 1317 BUG_ON(!q->pi_state);
975 free_pi_state(q->pi_state); 1318 free_pi_state(q->pi_state);
976 q->pi_state = NULL; 1319 q->pi_state = NULL;
977 1320
978 spin_unlock(&hb->lock); 1321 spin_unlock(q->lock_ptr);
979 1322
980 drop_futex_key_refs(&q->key); 1323 drop_futex_key_refs(&q->key);
981} 1324}
982 1325
1326/*
1327 * Fixup the pi_state owner with current.
1328 *
1329 * The cur->mm semaphore must be held, it is released at return of this
1330 * function.
1331 */
1332static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
1333 struct futex_q *q,
1334 struct futex_hash_bucket *hb,
1335 struct task_struct *curr)
1336{
1337 u32 newtid = curr->pid | FUTEX_WAITERS;
1338 struct futex_pi_state *pi_state = q->pi_state;
1339 u32 uval, curval, newval;
1340 int ret;
1341
1342 /* Owner died? */
1343 if (pi_state->owner != NULL) {
1344 spin_lock_irq(&pi_state->owner->pi_lock);
1345 WARN_ON(list_empty(&pi_state->list));
1346 list_del_init(&pi_state->list);
1347 spin_unlock_irq(&pi_state->owner->pi_lock);
1348 } else
1349 newtid |= FUTEX_OWNER_DIED;
1350
1351 pi_state->owner = curr;
1352
1353 spin_lock_irq(&curr->pi_lock);
1354 WARN_ON(!list_empty(&pi_state->list));
1355 list_add(&pi_state->list, &curr->pi_state_list);
1356 spin_unlock_irq(&curr->pi_lock);
1357
1358 /* Unqueue and drop the lock */
1359 unqueue_me_pi(q);
1360 if (fshared)
1361 up_read(fshared);
1362 /*
1363 * We own it, so we have to replace the pending owner
1364 * TID. This must be atomic as we have preserve the
1365 * owner died bit here.
1366 */
1367 ret = get_user(uval, uaddr);
1368 while (!ret) {
1369 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1370 newval |= (uval & FUTEX_WAITER_REQUEUED);
1371 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1372 uval, newval);
1373 if (curval == -EFAULT)
1374 ret = -EFAULT;
1375 if (curval == uval)
1376 break;
1377 uval = curval;
1378 }
1379 return ret;
1380}
1381
1382/*
1383 * In case we must use restart_block to restart a futex_wait,
1384 * we encode in the 'arg3' shared capability
1385 */
1386#define ARG3_SHARED 1
1387
983static long futex_wait_restart(struct restart_block *restart); 1388static long futex_wait_restart(struct restart_block *restart);
984static int futex_wait_abstime(u32 __user *uaddr, u32 val, 1389static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
985 int timed, unsigned long abs_time) 1390 u32 val, ktime_t *abs_time)
986{ 1391{
987 struct task_struct *curr = current; 1392 struct task_struct *curr = current;
988 DECLARE_WAITQUEUE(wait, curr); 1393 DECLARE_WAITQUEUE(wait, curr);
989 struct futex_hash_bucket *hb; 1394 struct futex_hash_bucket *hb;
990 struct futex_q q; 1395 struct futex_q q;
991 unsigned long time_left = 0;
992 u32 uval; 1396 u32 uval;
993 int ret; 1397 int ret;
1398 struct hrtimer_sleeper t, *to = NULL;
1399 int rem = 0;
994 1400
995 q.pi_state = NULL; 1401 q.pi_state = NULL;
996 retry: 1402 retry:
997 down_read(&curr->mm->mmap_sem); 1403 if (fshared)
1404 down_read(fshared);
998 1405
999 ret = get_futex_key(uaddr, &q.key); 1406 ret = get_futex_key(uaddr, fshared, &q.key);
1000 if (unlikely(ret != 0)) 1407 if (unlikely(ret != 0))
1001 goto out_release_sem; 1408 goto out_release_sem;
1002 1409
@@ -1019,8 +1426,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1019 * a wakeup when *uaddr != val on entry to the syscall. This is 1426 * a wakeup when *uaddr != val on entry to the syscall. This is
1020 * rare, but normal. 1427 * rare, but normal.
1021 * 1428 *
1022 * We hold the mmap semaphore, so the mapping cannot have changed 1429 * for shared futexes, we hold the mmap semaphore, so the mapping
1023 * since we looked it up in get_futex_key. 1430 * cannot have changed since we looked it up in get_futex_key.
1024 */ 1431 */
1025 ret = get_futex_value_locked(&uval, uaddr); 1432 ret = get_futex_value_locked(&uval, uaddr);
1026 1433
@@ -1031,7 +1438,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1031 * If we would have faulted, release mmap_sem, fault it in and 1438 * If we would have faulted, release mmap_sem, fault it in and
1032 * start all over again. 1439 * start all over again.
1033 */ 1440 */
1034 up_read(&curr->mm->mmap_sem); 1441 if (fshared)
1442 up_read(fshared);
1035 1443
1036 ret = get_user(uval, uaddr); 1444 ret = get_user(uval, uaddr);
1037 1445
@@ -1043,6 +1451,14 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1043 if (uval != val) 1451 if (uval != val)
1044 goto out_unlock_release_sem; 1452 goto out_unlock_release_sem;
1045 1453
1454 /*
1455 * This rt_mutex_waiter structure is prepared here and will
1456 * be used only if this task is requeued from a normal futex to
1457 * a PI-futex with futex_requeue_pi.
1458 */
1459 debug_rt_mutex_init_waiter(&q.waiter);
1460 q.waiter.task = NULL;
1461
1046 /* Only actually queue if *uaddr contained val. */ 1462 /* Only actually queue if *uaddr contained val. */
1047 __queue_me(&q, hb); 1463 __queue_me(&q, hb);
1048 1464
@@ -1050,7 +1466,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1050 * Now the futex is queued and we have checked the data, we 1466 * Now the futex is queued and we have checked the data, we
1051 * don't want to hold mmap_sem while we sleep. 1467 * don't want to hold mmap_sem while we sleep.
1052 */ 1468 */
1053 up_read(&curr->mm->mmap_sem); 1469 if (fshared)
1470 up_read(fshared);
1054 1471
1055 /* 1472 /*
1056 * There might have been scheduling since the queue_me(), as we 1473 * There might have been scheduling since the queue_me(), as we
@@ -1065,23 +1482,33 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1065 __set_current_state(TASK_INTERRUPTIBLE); 1482 __set_current_state(TASK_INTERRUPTIBLE);
1066 add_wait_queue(&q.waiters, &wait); 1483 add_wait_queue(&q.waiters, &wait);
1067 /* 1484 /*
1068 * !list_empty() is safe here without any lock. 1485 * !plist_node_empty() is safe here without any lock.
1069 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1486 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1070 */ 1487 */
1071 time_left = 0; 1488 if (likely(!plist_node_empty(&q.list))) {
1072 if (likely(!list_empty(&q.list))) { 1489 if (!abs_time)
1073 unsigned long rel_time; 1490 schedule();
1074 1491 else {
1075 if (timed) { 1492 to = &t;
1076 unsigned long now = jiffies; 1493 hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1077 if (time_after(now, abs_time)) 1494 hrtimer_init_sleeper(&t, current);
1078 rel_time = 0; 1495 t.timer.expires = *abs_time;
1079 else
1080 rel_time = abs_time - now;
1081 } else
1082 rel_time = MAX_SCHEDULE_TIMEOUT;
1083 1496
1084 time_left = schedule_timeout(rel_time); 1497 hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
1498
1499 /*
1500 * the timer could have already expired, in which
1501 * case current would be flagged for rescheduling.
1502 * Don't bother calling schedule.
1503 */
1504 if (likely(t.task))
1505 schedule();
1506
1507 hrtimer_cancel(&t.timer);
1508
1509 /* Flag if a timeout occured */
1510 rem = (t.task == NULL);
1511 }
1085 } 1512 }
1086 __set_current_state(TASK_RUNNING); 1513 __set_current_state(TASK_RUNNING);
1087 1514
@@ -1090,17 +1517,80 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1090 * we are the only user of it. 1517 * we are the only user of it.
1091 */ 1518 */
1092 1519
1520 if (q.pi_state) {
1521 /*
1522 * We were woken but have been requeued on a PI-futex.
1523 * We have to complete the lock acquisition by taking
1524 * the rtmutex.
1525 */
1526
1527 struct rt_mutex *lock = &q.pi_state->pi_mutex;
1528
1529 spin_lock(&lock->wait_lock);
1530 if (unlikely(q.waiter.task)) {
1531 remove_waiter(lock, &q.waiter);
1532 }
1533 spin_unlock(&lock->wait_lock);
1534
1535 if (rem)
1536 ret = -ETIMEDOUT;
1537 else
1538 ret = rt_mutex_timed_lock(lock, to, 1);
1539
1540 if (fshared)
1541 down_read(fshared);
1542 spin_lock(q.lock_ptr);
1543
1544 /*
1545 * Got the lock. We might not be the anticipated owner if we
1546 * did a lock-steal - fix up the PI-state in that case.
1547 */
1548 if (!ret && q.pi_state->owner != curr) {
1549 /*
1550 * We MUST play with the futex we were requeued on,
1551 * NOT the current futex.
1552 * We can retrieve it from the key of the pi_state
1553 */
1554 uaddr = q.pi_state->key.uaddr;
1555
1556 /* mmap_sem and hash_bucket lock are unlocked at
1557 return of this function */
1558 ret = fixup_pi_state_owner(uaddr, fshared,
1559 &q, hb, curr);
1560 } else {
1561 /*
1562 * Catch the rare case, where the lock was released
1563 * when we were on the way back before we locked
1564 * the hash bucket.
1565 */
1566 if (ret && q.pi_state->owner == curr) {
1567 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1568 ret = 0;
1569 }
1570 /* Unqueue and drop the lock */
1571 unqueue_me_pi(&q);
1572 if (fshared)
1573 up_read(fshared);
1574 }
1575
1576 debug_rt_mutex_free_waiter(&q.waiter);
1577
1578 return ret;
1579 }
1580
1581 debug_rt_mutex_free_waiter(&q.waiter);
1582
1093 /* If we were woken (and unqueued), we succeeded, whatever. */ 1583 /* If we were woken (and unqueued), we succeeded, whatever. */
1094 if (!unqueue_me(&q)) 1584 if (!unqueue_me(&q))
1095 return 0; 1585 return 0;
1096 if (time_left == 0) 1586 if (rem)
1097 return -ETIMEDOUT; 1587 return -ETIMEDOUT;
1098 1588
1099 /* 1589 /*
1100 * We expect signal_pending(current), but another thread may 1590 * We expect signal_pending(current), but another thread may
1101 * have handled it for us already. 1591 * have handled it for us already.
1102 */ 1592 */
1103 if (time_left == MAX_SCHEDULE_TIMEOUT) 1593 if (!abs_time)
1104 return -ERESTARTSYS; 1594 return -ERESTARTSYS;
1105 else { 1595 else {
1106 struct restart_block *restart; 1596 struct restart_block *restart;
@@ -1108,8 +1598,10 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1108 restart->fn = futex_wait_restart; 1598 restart->fn = futex_wait_restart;
1109 restart->arg0 = (unsigned long)uaddr; 1599 restart->arg0 = (unsigned long)uaddr;
1110 restart->arg1 = (unsigned long)val; 1600 restart->arg1 = (unsigned long)val;
1111 restart->arg2 = (unsigned long)timed; 1601 restart->arg2 = (unsigned long)abs_time;
1112 restart->arg3 = abs_time; 1602 restart->arg3 = 0;
1603 if (fshared)
1604 restart->arg3 |= ARG3_SHARED;
1113 return -ERESTART_RESTARTBLOCK; 1605 return -ERESTART_RESTARTBLOCK;
1114 } 1606 }
1115 1607
@@ -1117,65 +1609,111 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1117 queue_unlock(&q, hb); 1609 queue_unlock(&q, hb);
1118 1610
1119 out_release_sem: 1611 out_release_sem:
1120 up_read(&curr->mm->mmap_sem); 1612 if (fshared)
1613 up_read(fshared);
1121 return ret; 1614 return ret;
1122} 1615}
1123 1616
1124static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time)
1125{
1126 int timed = (rel_time != MAX_SCHEDULE_TIMEOUT);
1127 return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time);
1128}
1129 1617
1130static long futex_wait_restart(struct restart_block *restart) 1618static long futex_wait_restart(struct restart_block *restart)
1131{ 1619{
1132 u32 __user *uaddr = (u32 __user *)restart->arg0; 1620 u32 __user *uaddr = (u32 __user *)restart->arg0;
1133 u32 val = (u32)restart->arg1; 1621 u32 val = (u32)restart->arg1;
1134 int timed = (int)restart->arg2; 1622 ktime_t *abs_time = (ktime_t *)restart->arg2;
1135 unsigned long abs_time = restart->arg3; 1623 struct rw_semaphore *fshared = NULL;
1136 1624
1137 restart->fn = do_no_restart_syscall; 1625 restart->fn = do_no_restart_syscall;
1138 return (long)futex_wait_abstime(uaddr, val, timed, abs_time); 1626 if (restart->arg3 & ARG3_SHARED)
1627 fshared = &current->mm->mmap_sem;
1628 return (long)futex_wait(uaddr, fshared, val, abs_time);
1139} 1629}
1140 1630
1141 1631
1632static void set_pi_futex_owner(struct futex_hash_bucket *hb,
1633 union futex_key *key, struct task_struct *p)
1634{
1635 struct plist_head *head;
1636 struct futex_q *this, *next;
1637 struct futex_pi_state *pi_state = NULL;
1638 struct rt_mutex *lock;
1639
 1640 	/* Search for a waiter that should already exist */
1641
1642 head = &hb->chain;
1643
1644 plist_for_each_entry_safe(this, next, head, list) {
1645 if (match_futex (&this->key, key)) {
1646 pi_state = this->pi_state;
1647 break;
1648 }
1649 }
1650
1651 BUG_ON(!pi_state);
1652
1653 /* set p as pi_state's owner */
1654 lock = &pi_state->pi_mutex;
1655
1656 spin_lock(&lock->wait_lock);
1657 spin_lock_irq(&p->pi_lock);
1658
1659 list_add(&pi_state->list, &p->pi_state_list);
1660 pi_state->owner = p;
1661
1662
1663 /* set p as pi_mutex's owner */
1664 debug_rt_mutex_proxy_lock(lock, p);
1665 WARN_ON(rt_mutex_owner(lock));
1666 rt_mutex_set_owner(lock, p, 0);
1667 rt_mutex_deadlock_account_lock(lock, p);
1668
1669 plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry,
1670 &p->pi_waiters);
1671 __rt_mutex_adjust_prio(p);
1672
1673 spin_unlock_irq(&p->pi_lock);
1674 spin_unlock(&lock->wait_lock);
1675}
1676
1142/* 1677/*
1143 * Userspace tried a 0 -> TID atomic transition of the futex value 1678 * Userspace tried a 0 -> TID atomic transition of the futex value
1144 * and failed. The kernel side here does the whole locking operation: 1679 * and failed. The kernel side here does the whole locking operation:
1145 * if there are waiters then it will block, it does PI, etc. (Due to 1680 * if there are waiters then it will block, it does PI, etc. (Due to
1146 * races the kernel might see a 0 value of the futex too.) 1681 * races the kernel might see a 0 value of the futex too.)
1147 */ 1682 */
1148static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, 1683static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1149 long nsec, int trylock) 1684 int detect, ktime_t *time, int trylock)
1150{ 1685{
1151 struct hrtimer_sleeper timeout, *to = NULL; 1686 struct hrtimer_sleeper timeout, *to = NULL;
1152 struct task_struct *curr = current; 1687 struct task_struct *curr = current;
1153 struct futex_hash_bucket *hb; 1688 struct futex_hash_bucket *hb;
1154 u32 uval, newval, curval; 1689 u32 uval, newval, curval;
1155 struct futex_q q; 1690 struct futex_q q;
1156 int ret, attempt = 0; 1691 int ret, lock_held, attempt = 0;
1157 1692
1158 if (refill_pi_state_cache()) 1693 if (refill_pi_state_cache())
1159 return -ENOMEM; 1694 return -ENOMEM;
1160 1695
1161 if (sec != MAX_SCHEDULE_TIMEOUT) { 1696 if (time) {
1162 to = &timeout; 1697 to = &timeout;
1163 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 1698 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
1164 hrtimer_init_sleeper(to, current); 1699 hrtimer_init_sleeper(to, current);
1165 to->timer.expires = ktime_set(sec, nsec); 1700 to->timer.expires = *time;
1166 } 1701 }
1167 1702
1168 q.pi_state = NULL; 1703 q.pi_state = NULL;
1169 retry: 1704 retry:
1170 down_read(&curr->mm->mmap_sem); 1705 if (fshared)
1706 down_read(fshared);
1171 1707
1172 ret = get_futex_key(uaddr, &q.key); 1708 ret = get_futex_key(uaddr, fshared, &q.key);
1173 if (unlikely(ret != 0)) 1709 if (unlikely(ret != 0))
1174 goto out_release_sem; 1710 goto out_release_sem;
1175 1711
1176 hb = queue_lock(&q, -1, NULL); 1712 hb = queue_lock(&q, -1, NULL);
1177 1713
1178 retry_locked: 1714 retry_locked:
1715 lock_held = 0;
1716
1179 /* 1717 /*
1180 * To avoid races, we attempt to take the lock here again 1718 * To avoid races, we attempt to take the lock here again
1181 * (by doing a 0 -> TID atomic cmpxchg), while holding all 1719 * (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1194,7 +1732,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1194 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { 1732 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
1195 if (!detect && 0) 1733 if (!detect && 0)
1196 force_sig(SIGKILL, current); 1734 force_sig(SIGKILL, current);
1197 ret = -EDEADLK; 1735 /*
1736 * Normally, this check is done in user space.
1737 * In case of requeue, the owner may attempt to lock this futex,
1738 * even if the ownership has already been given by the previous
1739 * waker.
1740 * In the usual case, this is a case of deadlock, but not in case
1741 * of REQUEUE_PI.
1742 */
1743 if (!(curval & FUTEX_WAITER_REQUEUED))
1744 ret = -EDEADLK;
1198 goto out_unlock_release_sem; 1745 goto out_unlock_release_sem;
1199 } 1746 }
1200 1747
@@ -1206,7 +1753,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1206 goto out_unlock_release_sem; 1753 goto out_unlock_release_sem;
1207 1754
1208 uval = curval; 1755 uval = curval;
1209 newval = uval | FUTEX_WAITERS; 1756 /*
1757 * In case of a requeue, check if there already is an owner
1758 * If not, just take the futex.
1759 */
1760 if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
1761 /* set current as futex owner */
1762 newval = curval | current->pid;
1763 lock_held = 1;
1764 } else
1765 /* Set the WAITERS flag, so the owner will know it has someone
1766 to wake at next unlock */
1767 newval = curval | FUTEX_WAITERS;
1210 1768
1211 pagefault_disable(); 1769 pagefault_disable();
1212 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 1770 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -1217,11 +1775,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1217 if (unlikely(curval != uval)) 1775 if (unlikely(curval != uval))
1218 goto retry_locked; 1776 goto retry_locked;
1219 1777
1778 if (lock_held) {
1779 set_pi_futex_owner(hb, &q.key, curr);
1780 goto out_unlock_release_sem;
1781 }
1782
1220 /* 1783 /*
1221 * We dont have the lock. Look up the PI state (or create it if 1784 * We dont have the lock. Look up the PI state (or create it if
1222 * we are the first waiter): 1785 * we are the first waiter):
1223 */ 1786 */
1224 ret = lookup_pi_state(uval, hb, &q); 1787 ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
1225 1788
1226 if (unlikely(ret)) { 1789 if (unlikely(ret)) {
1227 /* 1790 /*
@@ -1263,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1263 * Now the futex is queued and we have checked the data, we 1826 * Now the futex is queued and we have checked the data, we
1264 * don't want to hold mmap_sem while we sleep. 1827 * don't want to hold mmap_sem while we sleep.
1265 */ 1828 */
1266 up_read(&curr->mm->mmap_sem); 1829 if (fshared)
1830 up_read(fshared);
1267 1831
1268 WARN_ON(!q.pi_state); 1832 WARN_ON(!q.pi_state);
1269 /* 1833 /*
@@ -1277,52 +1841,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1277 ret = ret ? 0 : -EWOULDBLOCK; 1841 ret = ret ? 0 : -EWOULDBLOCK;
1278 } 1842 }
1279 1843
1280 down_read(&curr->mm->mmap_sem); 1844 if (fshared)
1845 down_read(fshared);
1281 spin_lock(q.lock_ptr); 1846 spin_lock(q.lock_ptr);
1282 1847
1283 /* 1848 /*
1284 * Got the lock. We might not be the anticipated owner if we 1849 * Got the lock. We might not be the anticipated owner if we
1285 * did a lock-steal - fix up the PI-state in that case. 1850 * did a lock-steal - fix up the PI-state in that case.
1286 */ 1851 */
1287 if (!ret && q.pi_state->owner != curr) { 1852 if (!ret && q.pi_state->owner != curr)
1288 u32 newtid = current->pid | FUTEX_WAITERS; 1853 /* mmap_sem is unlocked at return of this function */
1289 1854 ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
1290 /* Owner died? */ 1855 else {
1291 if (q.pi_state->owner != NULL) {
1292 spin_lock_irq(&q.pi_state->owner->pi_lock);
1293 WARN_ON(list_empty(&q.pi_state->list));
1294 list_del_init(&q.pi_state->list);
1295 spin_unlock_irq(&q.pi_state->owner->pi_lock);
1296 } else
1297 newtid |= FUTEX_OWNER_DIED;
1298
1299 q.pi_state->owner = current;
1300
1301 spin_lock_irq(&current->pi_lock);
1302 WARN_ON(!list_empty(&q.pi_state->list));
1303 list_add(&q.pi_state->list, &current->pi_state_list);
1304 spin_unlock_irq(&current->pi_lock);
1305
1306 /* Unqueue and drop the lock */
1307 unqueue_me_pi(&q, hb);
1308 up_read(&curr->mm->mmap_sem);
1309 /*
1310 * We own it, so we have to replace the pending owner
1311 * TID. This must be atomic as we have preserve the
1312 * owner died bit here.
1313 */
1314 ret = get_user(uval, uaddr);
1315 while (!ret) {
1316 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1317 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1318 uval, newval);
1319 if (curval == -EFAULT)
1320 ret = -EFAULT;
1321 if (curval == uval)
1322 break;
1323 uval = curval;
1324 }
1325 } else {
1326 /* 1856 /*
1327 * Catch the rare case, where the lock was released 1857 * Catch the rare case, where the lock was released
1328 * when we were on the way back before we locked 1858 * when we were on the way back before we locked
@@ -1333,8 +1863,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1333 ret = 0; 1863 ret = 0;
1334 } 1864 }
1335 /* Unqueue and drop the lock */ 1865 /* Unqueue and drop the lock */
1336 unqueue_me_pi(&q, hb); 1866 unqueue_me_pi(&q);
1337 up_read(&curr->mm->mmap_sem); 1867 if (fshared)
1868 up_read(fshared);
1338 } 1869 }
1339 1870
1340 if (!detect && ret == -EDEADLK && 0) 1871 if (!detect && ret == -EDEADLK && 0)
@@ -1346,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1346 queue_unlock(&q, hb); 1877 queue_unlock(&q, hb);
1347 1878
1348 out_release_sem: 1879 out_release_sem:
1349 up_read(&curr->mm->mmap_sem); 1880 if (fshared)
1881 up_read(fshared);
1350 return ret; 1882 return ret;
1351 1883
1352 uaddr_faulted: 1884 uaddr_faulted:
@@ -1357,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1357 * still holding the mmap_sem. 1889 * still holding the mmap_sem.
1358 */ 1890 */
1359 if (attempt++) { 1891 if (attempt++) {
1360 if (futex_handle_fault((unsigned long)uaddr, attempt)) { 1892 ret = futex_handle_fault((unsigned long)uaddr, fshared,
1361 ret = -EFAULT; 1893 attempt);
1894 if (ret)
1362 goto out_unlock_release_sem; 1895 goto out_unlock_release_sem;
1363 }
1364 goto retry_locked; 1896 goto retry_locked;
1365 } 1897 }
1366 1898
1367 queue_unlock(&q, hb); 1899 queue_unlock(&q, hb);
1368 up_read(&curr->mm->mmap_sem); 1900 if (fshared)
1901 up_read(fshared);
1369 1902
1370 ret = get_user(uval, uaddr); 1903 ret = get_user(uval, uaddr);
1371 if (!ret && (uval != -EFAULT)) 1904 if (!ret && (uval != -EFAULT))
@@ -1379,12 +1912,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1379 * This is the in-kernel slowpath: we look up the PI state (if any), 1912 * This is the in-kernel slowpath: we look up the PI state (if any),
1380 * and do the rt-mutex unlock. 1913 * and do the rt-mutex unlock.
1381 */ 1914 */
1382static int futex_unlock_pi(u32 __user *uaddr) 1915static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
1383{ 1916{
1384 struct futex_hash_bucket *hb; 1917 struct futex_hash_bucket *hb;
1385 struct futex_q *this, *next; 1918 struct futex_q *this, *next;
1386 u32 uval; 1919 u32 uval;
1387 struct list_head *head; 1920 struct plist_head *head;
1388 union futex_key key; 1921 union futex_key key;
1389 int ret, attempt = 0; 1922 int ret, attempt = 0;
1390 1923
@@ -1399,9 +1932,10 @@ retry:
1399 /* 1932 /*
1400 * First take all the futex related locks: 1933 * First take all the futex related locks:
1401 */ 1934 */
1402 down_read(&current->mm->mmap_sem); 1935 if (fshared)
1936 down_read(fshared);
1403 1937
1404 ret = get_futex_key(uaddr, &key); 1938 ret = get_futex_key(uaddr, fshared, &key);
1405 if (unlikely(ret != 0)) 1939 if (unlikely(ret != 0))
1406 goto out; 1940 goto out;
1407 1941
@@ -1435,7 +1969,7 @@ retry_locked:
1435 */ 1969 */
1436 head = &hb->chain; 1970 head = &hb->chain;
1437 1971
1438 list_for_each_entry_safe(this, next, head, list) { 1972 plist_for_each_entry_safe(this, next, head, list) {
1439 if (!match_futex (&this->key, &key)) 1973 if (!match_futex (&this->key, &key))
1440 continue; 1974 continue;
1441 ret = wake_futex_pi(uaddr, uval, this); 1975 ret = wake_futex_pi(uaddr, uval, this);
@@ -1460,7 +1994,8 @@ retry_locked:
1460out_unlock: 1994out_unlock:
1461 spin_unlock(&hb->lock); 1995 spin_unlock(&hb->lock);
1462out: 1996out:
1463 up_read(&current->mm->mmap_sem); 1997 if (fshared)
1998 up_read(fshared);
1464 1999
1465 return ret; 2000 return ret;
1466 2001
@@ -1472,15 +2007,16 @@ pi_faulted:
1472 * still holding the mmap_sem. 2007 * still holding the mmap_sem.
1473 */ 2008 */
1474 if (attempt++) { 2009 if (attempt++) {
1475 if (futex_handle_fault((unsigned long)uaddr, attempt)) { 2010 ret = futex_handle_fault((unsigned long)uaddr, fshared,
1476 ret = -EFAULT; 2011 attempt);
2012 if (ret)
1477 goto out_unlock; 2013 goto out_unlock;
1478 }
1479 goto retry_locked; 2014 goto retry_locked;
1480 } 2015 }
1481 2016
1482 spin_unlock(&hb->lock); 2017 spin_unlock(&hb->lock);
1483 up_read(&current->mm->mmap_sem); 2018 if (fshared)
2019 up_read(fshared);
1484 2020
1485 ret = get_user(uval, uaddr); 2021 ret = get_user(uval, uaddr);
1486 if (!ret && (uval != -EFAULT)) 2022 if (!ret && (uval != -EFAULT))
@@ -1509,10 +2045,10 @@ static unsigned int futex_poll(struct file *filp,
1509 poll_wait(filp, &q->waiters, wait); 2045 poll_wait(filp, &q->waiters, wait);
1510 2046
1511 /* 2047 /*
1512 * list_empty() is safe here without any lock. 2048 * plist_node_empty() is safe here without any lock.
1513 * q->lock_ptr != 0 is not safe, because of ordering against wakeup. 2049 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
1514 */ 2050 */
1515 if (list_empty(&q->list)) 2051 if (plist_node_empty(&q->list))
1516 ret = POLLIN | POLLRDNORM; 2052 ret = POLLIN | POLLRDNORM;
1517 2053
1518 return ret; 2054 return ret;
@@ -1532,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
1532 struct futex_q *q; 2068 struct futex_q *q;
1533 struct file *filp; 2069 struct file *filp;
1534 int ret, err; 2070 int ret, err;
2071 struct rw_semaphore *fshared;
1535 static unsigned long printk_interval; 2072 static unsigned long printk_interval;
1536 2073
1537 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { 2074 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
@@ -1573,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal)
1573 } 2110 }
1574 q->pi_state = NULL; 2111 q->pi_state = NULL;
1575 2112
1576 down_read(&current->mm->mmap_sem); 2113 fshared = &current->mm->mmap_sem;
1577 err = get_futex_key(uaddr, &q->key); 2114 down_read(fshared);
2115 err = get_futex_key(uaddr, fshared, &q->key);
1578 2116
1579 if (unlikely(err != 0)) { 2117 if (unlikely(err != 0)) {
1580 up_read(&current->mm->mmap_sem); 2118 up_read(fshared);
1581 kfree(q); 2119 kfree(q);
1582 goto error; 2120 goto error;
1583 } 2121 }
@@ -1589,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
1589 filp->private_data = q; 2127 filp->private_data = q;
1590 2128
1591 queue_me(q, ret, filp); 2129 queue_me(q, ret, filp);
1592 up_read(&current->mm->mmap_sem); 2130 up_read(fshared);
1593 2131
1594 /* Now we map fd to filp, so userspace can access it */ 2132 /* Now we map fd to filp, so userspace can access it */
1595 fd_install(ret, filp); 2133 fd_install(ret, filp);
@@ -1702,6 +2240,8 @@ retry:
1702 * userspace. 2240 * userspace.
1703 */ 2241 */
1704 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2242 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2243 /* Also keep the FUTEX_WAITER_REQUEUED flag if set */
2244 mval |= (uval & FUTEX_WAITER_REQUEUED);
1705 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2245 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
1706 2246
1707 if (nval == -EFAULT) 2247 if (nval == -EFAULT)
@@ -1716,7 +2256,7 @@ retry:
1716 */ 2256 */
1717 if (!pi) { 2257 if (!pi) {
1718 if (uval & FUTEX_WAITERS) 2258 if (uval & FUTEX_WAITERS)
1719 futex_wake(uaddr, 1); 2259 futex_wake(uaddr, &curr->mm->mmap_sem, 1);
1720 } 2260 }
1721 } 2261 }
1722 return 0; 2262 return 0;
@@ -1772,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr)
1772 return; 2312 return;
1773 2313
1774 if (pending) 2314 if (pending)
1775 handle_futex_death((void __user *)pending + futex_offset, curr, pip); 2315 handle_futex_death((void __user *)pending + futex_offset,
2316 curr, pip);
1776 2317
1777 while (entry != &head->list) { 2318 while (entry != &head->list) {
1778 /* 2319 /*
@@ -1798,39 +2339,47 @@ void exit_robust_list(struct task_struct *curr)
1798 } 2339 }
1799} 2340}
1800 2341
1801long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, 2342long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1802 u32 __user *uaddr2, u32 val2, u32 val3) 2343 u32 __user *uaddr2, u32 val2, u32 val3)
1803{ 2344{
1804 int ret; 2345 int ret;
2346 int cmd = op & FUTEX_CMD_MASK;
2347 struct rw_semaphore *fshared = NULL;
2348
2349 if (!(op & FUTEX_PRIVATE_FLAG))
2350 fshared = &current->mm->mmap_sem;
1805 2351
1806 switch (op) { 2352 switch (cmd) {
1807 case FUTEX_WAIT: 2353 case FUTEX_WAIT:
1808 ret = futex_wait(uaddr, val, timeout); 2354 ret = futex_wait(uaddr, fshared, val, timeout);
1809 break; 2355 break;
1810 case FUTEX_WAKE: 2356 case FUTEX_WAKE:
1811 ret = futex_wake(uaddr, val); 2357 ret = futex_wake(uaddr, fshared, val);
1812 break; 2358 break;
1813 case FUTEX_FD: 2359 case FUTEX_FD:
1814 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ 2360 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
1815 ret = futex_fd(uaddr, val); 2361 ret = futex_fd(uaddr, val);
1816 break; 2362 break;
1817 case FUTEX_REQUEUE: 2363 case FUTEX_REQUEUE:
1818 ret = futex_requeue(uaddr, uaddr2, val, val2, NULL); 2364 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
1819 break; 2365 break;
1820 case FUTEX_CMP_REQUEUE: 2366 case FUTEX_CMP_REQUEUE:
1821 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); 2367 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
1822 break; 2368 break;
1823 case FUTEX_WAKE_OP: 2369 case FUTEX_WAKE_OP:
1824 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 2370 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
1825 break; 2371 break;
1826 case FUTEX_LOCK_PI: 2372 case FUTEX_LOCK_PI:
1827 ret = futex_lock_pi(uaddr, val, timeout, val2, 0); 2373 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
1828 break; 2374 break;
1829 case FUTEX_UNLOCK_PI: 2375 case FUTEX_UNLOCK_PI:
1830 ret = futex_unlock_pi(uaddr); 2376 ret = futex_unlock_pi(uaddr, fshared);
1831 break; 2377 break;
1832 case FUTEX_TRYLOCK_PI: 2378 case FUTEX_TRYLOCK_PI:
1833 ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); 2379 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
2380 break;
2381 case FUTEX_CMP_REQUEUE_PI:
2382 ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
1834 break; 2383 break;
1835 default: 2384 default:
1836 ret = -ENOSYS; 2385 ret = -ENOSYS;
@@ -1843,29 +2392,30 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1843 struct timespec __user *utime, u32 __user *uaddr2, 2392 struct timespec __user *utime, u32 __user *uaddr2,
1844 u32 val3) 2393 u32 val3)
1845{ 2394{
1846 struct timespec t; 2395 struct timespec ts;
1847 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 2396 ktime_t t, *tp = NULL;
1848 u32 val2 = 0; 2397 u32 val2 = 0;
2398 int cmd = op & FUTEX_CMD_MASK;
1849 2399
1850 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { 2400 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
1851 if (copy_from_user(&t, utime, sizeof(t)) != 0) 2401 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
1852 return -EFAULT; 2402 return -EFAULT;
1853 if (!timespec_valid(&t)) 2403 if (!timespec_valid(&ts))
1854 return -EINVAL; 2404 return -EINVAL;
1855 if (op == FUTEX_WAIT) 2405
1856 timeout = timespec_to_jiffies(&t) + 1; 2406 t = timespec_to_ktime(ts);
1857 else { 2407 if (cmd == FUTEX_WAIT)
1858 timeout = t.tv_sec; 2408 t = ktime_add(ktime_get(), t);
1859 val2 = t.tv_nsec; 2409 tp = &t;
1860 }
1861 } 2410 }
1862 /* 2411 /*
1863 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 2412 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
1864 */ 2413 */
1865 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) 2414 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
2415 || cmd == FUTEX_CMP_REQUEUE_PI)
1866 val2 = (u32) (unsigned long) utime; 2416 val2 = (u32) (unsigned long) utime;
1867 2417
1868 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); 2418 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
1869} 2419}
1870 2420
1871static int futexfs_get_sb(struct file_system_type *fs_type, 2421static int futexfs_get_sb(struct file_system_type *fs_type,
@@ -1895,7 +2445,7 @@ static int __init init(void)
1895 } 2445 }
1896 2446
1897 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2447 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
1898 INIT_LIST_HEAD(&futex_queues[i].chain); 2448 plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
1899 spin_lock_init(&futex_queues[i].lock); 2449 spin_lock_init(&futex_queues[i].lock);
1900 } 2450 }
1901 return 0; 2451 return 0;
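
The do_futex() switch above keys everything off FUTEX_PRIVATE_FLAG: when the flag is set, fshared stays NULL and every down_read(fshared)/up_read(fshared) in the paths above becomes a no-op. A minimal userspace sketch of exercising that fast path follows; it assumes a libc with syscall() and a <linux/futex.h> that already exports FUTEX_PRIVATE_FLAG, and the two wrapper helpers are invented for illustration, not part of this patch.

    #define _GNU_SOURCE
    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <time.h>
    #include <errno.h>

    static long futex_syscall(int *uaddr, int op, int val,
                              const struct timespec *timeout)
    {
            return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
    }

    /* Block while *uaddr == val; a matching FUTEX_WAKE resumes us. */
    static void futex_wait_private(int *uaddr, int val)
    {
            while (futex_syscall(uaddr, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
                                 val, NULL) == -1 && errno == EINTR)
                    ;       /* retry if interrupted by a signal */
    }

    /* Wake at most one thread blocked on uaddr. */
    static void futex_wake_private(int *uaddr)
    {
            futex_syscall(uaddr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1, NULL);
    }

Dropping FUTEX_PRIVATE_FLAG sends the same calls through the shared path, which still takes current->mm->mmap_sem exactly as before.
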
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 50f24eea6cd0..338a9b489fbc 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -141,24 +141,24 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
141 struct compat_timespec __user *utime, u32 __user *uaddr2, 141 struct compat_timespec __user *utime, u32 __user *uaddr2,
142 u32 val3) 142 u32 val3)
143{ 143{
144 struct timespec t; 144 struct timespec ts;
145 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 145 ktime_t t, *tp = NULL;
146 int val2 = 0; 146 int val2 = 0;
147 147
148 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { 148 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
149 if (get_compat_timespec(&t, utime)) 149 if (get_compat_timespec(&ts, utime))
150 return -EFAULT; 150 return -EFAULT;
151 if (!timespec_valid(&t)) 151 if (!timespec_valid(&ts))
152 return -EINVAL; 152 return -EINVAL;
153
154 t = timespec_to_ktime(ts);
153 if (op == FUTEX_WAIT) 155 if (op == FUTEX_WAIT)
154 timeout = timespec_to_jiffies(&t) + 1; 156 t = ktime_add(ktime_get(), t);
155 else { 157 tp = &t;
156 timeout = t.tv_sec;
157 val2 = t.tv_nsec;
158 }
159 } 158 }
160 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) 159 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
160 || op == FUTEX_CMP_REQUEUE_PI)
161 val2 = (int) (unsigned long) utime; 161 val2 = (int) (unsigned long) utime;
162 162
163 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); 163 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
164} 164}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c9f4f044a8a8..23c03f43e196 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1411,11 +1411,13 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1411 switch (action) { 1411 switch (action) {
1412 1412
1413 case CPU_UP_PREPARE: 1413 case CPU_UP_PREPARE:
1414 case CPU_UP_PREPARE_FROZEN:
1414 init_hrtimers_cpu(cpu); 1415 init_hrtimers_cpu(cpu);
1415 break; 1416 break;
1416 1417
1417#ifdef CONFIG_HOTPLUG_CPU 1418#ifdef CONFIG_HOTPLUG_CPU
1418 case CPU_DEAD: 1419 case CPU_DEAD:
1420 case CPU_DEAD_FROZEN:
1419 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); 1421 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
1420 migrate_hrtimers(cpu); 1422 migrate_hrtimers(cpu);
1421 break; 1423 break;
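
The hrtimer hunk here (and the profile.c hunk at the end of this diff) just extends existing notifier cases with the new *_FROZEN actions that the suspend/resume machinery sends. A sketch of the same pattern for an arbitrary subsystem is below; my_cpu_init()/my_cpu_teardown() are placeholder names, not functions from this patch.

    #include <linux/init.h>
    #include <linux/notifier.h>
    #include <linux/cpu.h>

    /* Placeholder per-CPU setup/teardown, for illustration only. */
    static void my_cpu_init(long cpu)
    {
    }

    static void my_cpu_teardown(long cpu)
    {
    }

    static int __cpuinit my_cpu_notify(struct notifier_block *self,
                                       unsigned long action, void *hcpu)
    {
            long cpu = (long)hcpu;

            switch (action) {
            case CPU_UP_PREPARE:
            case CPU_UP_PREPARE_FROZEN:     /* same work when thawing after suspend */
                    my_cpu_init(cpu);
                    break;
            case CPU_DEAD:
            case CPU_DEAD_FROZEN:
                    my_cpu_teardown(cpu);
                    break;
            }
            return NOTIFY_OK;
    }
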
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 32e1ab1477d1..e391cbb1f566 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -22,7 +22,6 @@
22 * handle_bad_irq - handle spurious and unhandled irqs 22 * handle_bad_irq - handle spurious and unhandled irqs
23 * @irq: the interrupt number 23 * @irq: the interrupt number
24 * @desc: description of the interrupt 24 * @desc: description of the interrupt
25 * @regs: pointer to a register structure
26 * 25 *
27 * Handles spurious and unhandled IRQ's. It also prints a debugmessage. 26 * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
28 */ 27 */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 49cc4b9c1a8d..4d32eb077179 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -135,7 +135,6 @@ static int ____call_usermodehelper(void *data)
135 135
136 /* Unblock all signals and set the session keyring. */ 136 /* Unblock all signals and set the session keyring. */
137 new_session = key_get(sub_info->ring); 137 new_session = key_get(sub_info->ring);
138 flush_signals(current);
139 spin_lock_irq(&current->sighand->siglock); 138 spin_lock_irq(&current->sighand->siglock);
140 old_session = __install_session_keyring(current, new_session); 139 old_session = __install_session_keyring(current, new_session);
141 flush_signal_handlers(current, 1); 140 flush_signal_handlers(current, 1);
@@ -186,14 +185,9 @@ static int wait_for_helper(void *data)
186{ 185{
187 struct subprocess_info *sub_info = data; 186 struct subprocess_info *sub_info = data;
188 pid_t pid; 187 pid_t pid;
189 struct k_sigaction sa;
190 188
191 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't 189 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't
192 * populate the status, but will return -ECHILD. */ 190 * populate the status, but will return -ECHILD. */
193 sa.sa.sa_handler = SIG_IGN;
194 sa.sa.sa_flags = 0;
195 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
196 do_sigaction(SIGCHLD, &sa, NULL);
197 allow_signal(SIGCHLD); 191 allow_signal(SIGCHLD);
198 192
199 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 193 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 87c50ccd1d4e..df8a8e8f6ca4 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,7 +1,7 @@
1/* Kernel thread helper functions. 1/* Kernel thread helper functions.
2 * Copyright (C) 2004 IBM Corporation, Rusty Russell. 2 * Copyright (C) 2004 IBM Corporation, Rusty Russell.
3 * 3 *
4 * Creation is done via keventd, so that we get a clean environment 4 * Creation is done via kthreadd, so that we get a clean environment
5 * even if we're invoked from userspace (think modprobe, hotplug cpu, 5 * even if we're invoked from userspace (think modprobe, hotplug cpu,
6 * etc.). 6 * etc.).
7 */ 7 */
@@ -15,24 +15,22 @@
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <asm/semaphore.h> 16#include <asm/semaphore.h>
17 17
18/* 18static DEFINE_SPINLOCK(kthread_create_lock);
19 * We dont want to execute off keventd since it might 19static LIST_HEAD(kthread_create_list);
20 * hold a semaphore our callers hold too: 20struct task_struct *kthreadd_task;
21 */
22static struct workqueue_struct *helper_wq;
23 21
24struct kthread_create_info 22struct kthread_create_info
25{ 23{
26 /* Information passed to kthread() from keventd. */ 24 /* Information passed to kthread() from kthreadd. */
27 int (*threadfn)(void *data); 25 int (*threadfn)(void *data);
28 void *data; 26 void *data;
29 struct completion started; 27 struct completion started;
30 28
31 /* Result passed back to kthread_create() from keventd. */ 29 /* Result passed back to kthread_create() from kthreadd. */
32 struct task_struct *result; 30 struct task_struct *result;
33 struct completion done; 31 struct completion done;
34 32
35 struct work_struct work; 33 struct list_head list;
36}; 34};
37 35
38struct kthread_stop_info 36struct kthread_stop_info
@@ -60,42 +58,17 @@ int kthread_should_stop(void)
60} 58}
61EXPORT_SYMBOL(kthread_should_stop); 59EXPORT_SYMBOL(kthread_should_stop);
62 60
63static void kthread_exit_files(void)
64{
65 struct fs_struct *fs;
66 struct task_struct *tsk = current;
67
68 exit_fs(tsk); /* current->fs->count--; */
69 fs = init_task.fs;
70 tsk->fs = fs;
71 atomic_inc(&fs->count);
72 exit_files(tsk);
73 current->files = init_task.files;
74 atomic_inc(&tsk->files->count);
75}
76
77static int kthread(void *_create) 61static int kthread(void *_create)
78{ 62{
79 struct kthread_create_info *create = _create; 63 struct kthread_create_info *create = _create;
80 int (*threadfn)(void *data); 64 int (*threadfn)(void *data);
81 void *data; 65 void *data;
82 sigset_t blocked;
83 int ret = -EINTR; 66 int ret = -EINTR;
84 67
85 kthread_exit_files(); 68 /* Copy data: it's on kthread's stack */
86
87 /* Copy data: it's on keventd's stack */
88 threadfn = create->threadfn; 69 threadfn = create->threadfn;
89 data = create->data; 70 data = create->data;
90 71
91 /* Block and flush all signals (in case we're not from keventd). */
92 sigfillset(&blocked);
93 sigprocmask(SIG_BLOCK, &blocked, NULL);
94 flush_signals(current);
95
96 /* By default we can run anywhere, unlike keventd. */
97 set_cpus_allowed(current, CPU_MASK_ALL);
98
99 /* OK, tell user we're spawned, wait for stop or wakeup */ 72 /* OK, tell user we're spawned, wait for stop or wakeup */
100 __set_current_state(TASK_INTERRUPTIBLE); 73 __set_current_state(TASK_INTERRUPTIBLE);
101 complete(&create->started); 74 complete(&create->started);
@@ -112,11 +85,8 @@ static int kthread(void *_create)
112 return 0; 85 return 0;
113} 86}
114 87
115/* We are keventd: create a thread. */ 88static void create_kthread(struct kthread_create_info *create)
116static void keventd_create_kthread(struct work_struct *work)
117{ 89{
118 struct kthread_create_info *create =
119 container_of(work, struct kthread_create_info, work);
120 int pid; 90 int pid;
121 91
122 /* We want our own signal handler (we take no signals by default). */ 92 /* We want our own signal handler (we take no signals by default). */
@@ -162,17 +132,14 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
162 create.data = data; 132 create.data = data;
163 init_completion(&create.started); 133 init_completion(&create.started);
164 init_completion(&create.done); 134 init_completion(&create.done);
165 INIT_WORK(&create.work, keventd_create_kthread); 135
166 136 spin_lock(&kthread_create_lock);
167 /* 137 list_add_tail(&create.list, &kthread_create_list);
168 * The workqueue needs to start up first: 138 wake_up_process(kthreadd_task);
169 */ 139 spin_unlock(&kthread_create_lock);
170 if (!helper_wq) 140
171 create.work.func(&create.work); 141 wait_for_completion(&create.done);
172 else { 142
173 queue_work(helper_wq, &create.work);
174 wait_for_completion(&create.done);
175 }
176 if (!IS_ERR(create.result)) { 143 if (!IS_ERR(create.result)) {
177 va_list args; 144 va_list args;
178 va_start(args, namefmt); 145 va_start(args, namefmt);
@@ -180,7 +147,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
180 namefmt, args); 147 namefmt, args);
181 va_end(args); 148 va_end(args);
182 } 149 }
183
184 return create.result; 150 return create.result;
185} 151}
186EXPORT_SYMBOL(kthread_create); 152EXPORT_SYMBOL(kthread_create);
@@ -245,12 +211,47 @@ int kthread_stop(struct task_struct *k)
245} 211}
246EXPORT_SYMBOL(kthread_stop); 212EXPORT_SYMBOL(kthread_stop);
247 213
248static __init int helper_init(void) 214
215static __init void kthreadd_setup(void)
249{ 216{
250 helper_wq = create_singlethread_workqueue("kthread"); 217 struct task_struct *tsk = current;
251 BUG_ON(!helper_wq);
252 218
253 return 0; 219 set_task_comm(tsk, "kthreadd");
220
221 ignore_signals(tsk);
222
223 set_user_nice(tsk, -5);
224 set_cpus_allowed(tsk, CPU_MASK_ALL);
254} 225}
255 226
256core_initcall(helper_init); 227int kthreadd(void *unused)
228{
229 /* Setup a clean context for our children to inherit. */
230 kthreadd_setup();
231
232 current->flags |= PF_NOFREEZE;
233
234 for (;;) {
235 set_current_state(TASK_INTERRUPTIBLE);
236 if (list_empty(&kthread_create_list))
237 schedule();
238 __set_current_state(TASK_RUNNING);
239
240 spin_lock(&kthread_create_lock);
241 while (!list_empty(&kthread_create_list)) {
242 struct kthread_create_info *create;
243
244 create = list_entry(kthread_create_list.next,
245 struct kthread_create_info, list);
246 list_del_init(&create->list);
247 spin_unlock(&kthread_create_lock);
248
249 create_kthread(create);
250
251 spin_lock(&kthread_create_lock);
252 }
253 spin_unlock(&kthread_create_lock);
254 }
255
256 return 0;
257}
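
For context, this is the caller-side pattern that the new kthreadd loop ends up servicing: kthread_create() adds a request to kthread_create_list, kthreadd spawns the thread via create_kthread(), and the new thread sleeps in TASK_INTERRUPTIBLE until its creator wakes it. The sketch below is standard kthread API usage rather than code from this patch; my_thread_fn, my_task and the "my_worker" name are illustrative.

    #include <linux/init.h>
    #include <linux/err.h>
    #include <linux/sched.h>
    #include <linux/kthread.h>

    static struct task_struct *my_task;

    static int my_thread_fn(void *data)
    {
            while (!kthread_should_stop()) {
                    /* do one unit of work, then sleep for a second */
                    schedule_timeout_interruptible(HZ);
            }
            return 0;
    }

    static int __init my_worker_init(void)
    {
            my_task = kthread_create(my_thread_fn, NULL, "my_worker");
            if (IS_ERR(my_task))
                    return PTR_ERR(my_task);
            wake_up_process(my_task);       /* kthread() is waiting for this */
            return 0;
    }

    static void __exit my_worker_exit(void)
    {
            kthread_stop(my_task);          /* sets should_stop and waits for exit */
    }
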
diff --git a/kernel/module.c b/kernel/module.c
index d36e45477fac..9bd93de01f4a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -96,9 +96,9 @@ static inline void add_taint_module(struct module *mod, unsigned flag)
96 mod->taints |= flag; 96 mod->taints |= flag;
97} 97}
98 98
99/* A thread that wants to hold a reference to a module only while it 99/*
100 * is running can call ths to safely exit. 100 * A thread that wants to hold a reference to a module only while it
101 * nfsd and lockd use this. 101 * is running can call this to safely exit. nfsd and lockd use this.
102 */ 102 */
103void __module_put_and_exit(struct module *mod, long code) 103void __module_put_and_exit(struct module *mod, long code)
104{ 104{
@@ -1199,7 +1199,7 @@ static int __unlink_module(void *_mod)
1199 return 0; 1199 return 0;
1200} 1200}
1201 1201
1202/* Free a module, remove from lists, etc (must hold module mutex). */ 1202/* Free a module, remove from lists, etc (must hold module_mutex). */
1203static void free_module(struct module *mod) 1203static void free_module(struct module *mod)
1204{ 1204{
1205 /* Delete from various lists */ 1205 /* Delete from various lists */
@@ -1246,7 +1246,7 @@ EXPORT_SYMBOL_GPL(__symbol_get);
1246 1246
1247/* 1247/*
1248 * Ensure that an exported symbol [global namespace] does not already exist 1248 * Ensure that an exported symbol [global namespace] does not already exist
1249 * in the Kernel or in some other modules exported symbol table. 1249 * in the kernel or in some other module's exported symbol table.
1250 */ 1250 */
1251static int verify_export_symbols(struct module *mod) 1251static int verify_export_symbols(struct module *mod)
1252{ 1252{
diff --git a/kernel/mutex.c b/kernel/mutex.c
index e7cbbb82765b..303eab18484b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -133,7 +133,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
133 133
134 debug_mutex_lock_common(lock, &waiter); 134 debug_mutex_lock_common(lock, &waiter);
135 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 135 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
136 debug_mutex_add_waiter(lock, &waiter, task->thread_info); 136 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
137 137
138 /* add waiting tasks to the end of the waitqueue (FIFO): */ 138 /* add waiting tasks to the end of the waitqueue (FIFO): */
139 list_add_tail(&waiter.list, &lock->wait_list); 139 list_add_tail(&waiter.list, &lock->wait_list);
@@ -159,7 +159,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
159 */ 159 */
160 if (unlikely(state == TASK_INTERRUPTIBLE && 160 if (unlikely(state == TASK_INTERRUPTIBLE &&
161 signal_pending(task))) { 161 signal_pending(task))) {
162 mutex_remove_waiter(lock, &waiter, task->thread_info); 162 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
163 mutex_release(&lock->dep_map, 1, _RET_IP_); 163 mutex_release(&lock->dep_map, 1, _RET_IP_);
164 spin_unlock_mutex(&lock->wait_lock, flags); 164 spin_unlock_mutex(&lock->wait_lock, flags);
165 165
@@ -175,8 +175,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
175 } 175 }
176 176
177 /* got the lock - rejoice! */ 177 /* got the lock - rejoice! */
178 mutex_remove_waiter(lock, &waiter, task->thread_info); 178 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
179 debug_mutex_set_owner(lock, task->thread_info); 179 debug_mutex_set_owner(lock, task_thread_info(task));
180 180
181 /* set it to 0 if there are no waiters left: */ 181 /* set it to 0 if there are no waiters left: */
182 if (likely(list_empty(&lock->wait_list))) 182 if (likely(list_empty(&lock->wait_list)))
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 06331374d862..b5f0543ed84d 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -30,30 +30,69 @@ char resume_file[256] = CONFIG_PM_STD_PARTITION;
30dev_t swsusp_resume_device; 30dev_t swsusp_resume_device;
31sector_t swsusp_resume_block; 31sector_t swsusp_resume_block;
32 32
33enum {
34 HIBERNATION_INVALID,
35 HIBERNATION_PLATFORM,
36 HIBERNATION_TEST,
37 HIBERNATION_TESTPROC,
38 HIBERNATION_SHUTDOWN,
39 HIBERNATION_REBOOT,
40 /* keep last */
41 __HIBERNATION_AFTER_LAST
42};
43#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
44#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
45
46static int hibernation_mode = HIBERNATION_SHUTDOWN;
47
48struct hibernation_ops *hibernation_ops;
49
50/**
51 * hibernation_set_ops - set the global hibernate operations
52 * @ops: the hibernation operations to use in subsequent hibernation transitions
53 */
54
55void hibernation_set_ops(struct hibernation_ops *ops)
56{
57 if (ops && !(ops->prepare && ops->enter && ops->finish)) {
58 WARN_ON(1);
59 return;
60 }
61 mutex_lock(&pm_mutex);
62 hibernation_ops = ops;
63 if (ops)
64 hibernation_mode = HIBERNATION_PLATFORM;
65 else if (hibernation_mode == HIBERNATION_PLATFORM)
66 hibernation_mode = HIBERNATION_SHUTDOWN;
67
68 mutex_unlock(&pm_mutex);
69}
70
71
33/** 72/**
34 * platform_prepare - prepare the machine for hibernation using the 73 * platform_prepare - prepare the machine for hibernation using the
35 * platform driver if so configured and return an error code if it fails 74 * platform driver if so configured and return an error code if it fails
36 */ 75 */
37 76
38static inline int platform_prepare(void) 77static int platform_prepare(void)
39{ 78{
40 int error = 0; 79 return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ?
80 hibernation_ops->prepare() : 0;
81}
41 82
42 switch (pm_disk_mode) { 83/**
43 case PM_DISK_TEST: 84 * platform_finish - switch the machine to the normal mode of operation
44 case PM_DISK_TESTPROC: 85 * using the platform driver (must be called after platform_prepare())
45 case PM_DISK_SHUTDOWN: 86 */
46 case PM_DISK_REBOOT: 87
47 break; 88static void platform_finish(void)
48 default: 89{
49 if (pm_ops && pm_ops->prepare) 90 if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops)
50 error = pm_ops->prepare(PM_SUSPEND_DISK); 91 hibernation_ops->finish();
51 }
52 return error;
53} 92}
54 93
55/** 94/**
56 * power_down - Shut machine down for hibernate. 95 * power_down - Shut the machine down for hibernation.
57 * 96 *
58 * Use the platform driver, if configured so; otherwise try 97 * Use the platform driver, if configured so; otherwise try
59 * to power off or reboot. 98 * to power off or reboot.
@@ -61,20 +100,20 @@ static inline int platform_prepare(void)
61 100
62static void power_down(void) 101static void power_down(void)
63{ 102{
64 switch (pm_disk_mode) { 103 switch (hibernation_mode) {
65 case PM_DISK_TEST: 104 case HIBERNATION_TEST:
66 case PM_DISK_TESTPROC: 105 case HIBERNATION_TESTPROC:
67 break; 106 break;
68 case PM_DISK_SHUTDOWN: 107 case HIBERNATION_SHUTDOWN:
69 kernel_power_off(); 108 kernel_power_off();
70 break; 109 break;
71 case PM_DISK_REBOOT: 110 case HIBERNATION_REBOOT:
72 kernel_restart(NULL); 111 kernel_restart(NULL);
73 break; 112 break;
74 default: 113 case HIBERNATION_PLATFORM:
75 if (pm_ops && pm_ops->enter) { 114 if (hibernation_ops) {
76 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 115 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
77 pm_ops->enter(PM_SUSPEND_DISK); 116 hibernation_ops->enter();
78 break; 117 break;
79 } 118 }
80 } 119 }
@@ -87,20 +126,6 @@ static void power_down(void)
87 while(1); 126 while(1);
88} 127}
89 128
90static inline void platform_finish(void)
91{
92 switch (pm_disk_mode) {
93 case PM_DISK_TEST:
94 case PM_DISK_TESTPROC:
95 case PM_DISK_SHUTDOWN:
96 case PM_DISK_REBOOT:
97 break;
98 default:
99 if (pm_ops && pm_ops->finish)
100 pm_ops->finish(PM_SUSPEND_DISK);
101 }
102}
103
104static void unprepare_processes(void) 129static void unprepare_processes(void)
105{ 130{
106 thaw_processes(); 131 thaw_processes();
@@ -120,13 +145,10 @@ static int prepare_processes(void)
120} 145}
121 146
122/** 147/**
123 * pm_suspend_disk - The granpappy of hibernation power management. 148 * hibernate - The granpappy of the built-in hibernation management
124 *
125 * If not, then call swsusp to do its thing, then figure out how
126 * to power down the system.
127 */ 149 */
128 150
129int pm_suspend_disk(void) 151int hibernate(void)
130{ 152{
131 int error; 153 int error;
132 154
@@ -143,7 +165,8 @@ int pm_suspend_disk(void)
143 if (error) 165 if (error)
144 goto Finish; 166 goto Finish;
145 167
146 if (pm_disk_mode == PM_DISK_TESTPROC) { 168 mutex_lock(&pm_mutex);
169 if (hibernation_mode == HIBERNATION_TESTPROC) {
147 printk("swsusp debug: Waiting for 5 seconds.\n"); 170 printk("swsusp debug: Waiting for 5 seconds.\n");
148 mdelay(5000); 171 mdelay(5000);
149 goto Thaw; 172 goto Thaw;
@@ -168,7 +191,7 @@ int pm_suspend_disk(void)
168 if (error) 191 if (error)
169 goto Enable_cpus; 192 goto Enable_cpus;
170 193
171 if (pm_disk_mode == PM_DISK_TEST) { 194 if (hibernation_mode == HIBERNATION_TEST) {
172 printk("swsusp debug: Waiting for 5 seconds.\n"); 195 printk("swsusp debug: Waiting for 5 seconds.\n");
173 mdelay(5000); 196 mdelay(5000);
174 goto Enable_cpus; 197 goto Enable_cpus;
@@ -205,6 +228,7 @@ int pm_suspend_disk(void)
205 device_resume(); 228 device_resume();
206 resume_console(); 229 resume_console();
207 Thaw: 230 Thaw:
231 mutex_unlock(&pm_mutex);
208 unprepare_processes(); 232 unprepare_processes();
209 Finish: 233 Finish:
210 free_basic_memory_bitmaps(); 234 free_basic_memory_bitmaps();
@@ -220,7 +244,7 @@ int pm_suspend_disk(void)
220 * Called as a late_initcall (so all devices are discovered and 244 * Called as a late_initcall (so all devices are discovered and
221 * initialized), we call swsusp to see if we have a saved image or not. 245 * initialized), we call swsusp to see if we have a saved image or not.
 222 * If so, we quiesce devices, then restore the saved image. We will 246
223 * return above (in pm_suspend_disk() ) if everything goes well. 247 * return above (in hibernate() ) if everything goes well.
224 * Otherwise, we fail gracefully and return to the normally 248 * Otherwise, we fail gracefully and return to the normally
225 * scheduled program. 249 * scheduled program.
226 * 250 *
@@ -315,25 +339,26 @@ static int software_resume(void)
315late_initcall(software_resume); 339late_initcall(software_resume);
316 340
317 341
318static const char * const pm_disk_modes[] = { 342static const char * const hibernation_modes[] = {
319 [PM_DISK_PLATFORM] = "platform", 343 [HIBERNATION_PLATFORM] = "platform",
320 [PM_DISK_SHUTDOWN] = "shutdown", 344 [HIBERNATION_SHUTDOWN] = "shutdown",
321 [PM_DISK_REBOOT] = "reboot", 345 [HIBERNATION_REBOOT] = "reboot",
322 [PM_DISK_TEST] = "test", 346 [HIBERNATION_TEST] = "test",
323 [PM_DISK_TESTPROC] = "testproc", 347 [HIBERNATION_TESTPROC] = "testproc",
324}; 348};
325 349
326/** 350/**
327 * disk - Control suspend-to-disk mode 351 * disk - Control hibernation mode
328 * 352 *
329 * Suspend-to-disk can be handled in several ways. We have a few options 353 * Suspend-to-disk can be handled in several ways. We have a few options
330 * for putting the system to sleep - using the platform driver (e.g. ACPI 354 * for putting the system to sleep - using the platform driver (e.g. ACPI
331 * or other pm_ops), powering off the system or rebooting the system 355 * or other hibernation_ops), powering off the system or rebooting the
332 * (for testing) as well as the two test modes. 356 * system (for testing) as well as the two test modes.
333 * 357 *
334 * The system can support 'platform', and that is known a priori (and 358 * The system can support 'platform', and that is known a priori (and
335 * encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot' 359 * encoded by the presence of hibernation_ops). However, the user may
 336 * as alternatives, as well as the test modes 'test' and 'testproc'. 360 * choose 'shutdown' or 'reboot' as alternatives, as well as one of the
361 * test modes, 'test' or 'testproc'.
337 * 362 *
338 * show() will display what the mode is currently set to. 363 * show() will display what the mode is currently set to.
339 * store() will accept one of 364 * store() will accept one of
@@ -345,7 +370,7 @@ static const char * const pm_disk_modes[] = {
345 * 'testproc' 370 * 'testproc'
346 * 371 *
347 * It will only change to 'platform' if the system 372 * It will only change to 'platform' if the system
348 * supports it (as determined from pm_ops->pm_disk_mode). 373 * supports it (as determined by having hibernation_ops).
349 */ 374 */
350 375
351static ssize_t disk_show(struct kset *kset, char *buf) 376static ssize_t disk_show(struct kset *kset, char *buf)
@@ -353,28 +378,25 @@ static ssize_t disk_show(struct kset *kset, char *buf)
353 int i; 378 int i;
354 char *start = buf; 379 char *start = buf;
355 380
356 for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { 381 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
357 if (!pm_disk_modes[i]) 382 if (!hibernation_modes[i])
358 continue; 383 continue;
359 switch (i) { 384 switch (i) {
360 case PM_DISK_SHUTDOWN: 385 case HIBERNATION_SHUTDOWN:
361 case PM_DISK_REBOOT: 386 case HIBERNATION_REBOOT:
362 case PM_DISK_TEST: 387 case HIBERNATION_TEST:
363 case PM_DISK_TESTPROC: 388 case HIBERNATION_TESTPROC:
364 break; 389 break;
365 default: 390 case HIBERNATION_PLATFORM:
366 if (pm_ops && pm_ops->enter && 391 if (hibernation_ops)
367 (i == pm_ops->pm_disk_mode))
368 break; 392 break;
369 /* not a valid mode, continue with loop */ 393 /* not a valid mode, continue with loop */
370 continue; 394 continue;
371 } 395 }
372 if (i == pm_disk_mode) 396 if (i == hibernation_mode)
373 buf += sprintf(buf, "[%s]", pm_disk_modes[i]); 397 buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
374 else 398 else
375 buf += sprintf(buf, "%s", pm_disk_modes[i]); 399 buf += sprintf(buf, "%s ", hibernation_modes[i]);
376 if (i+1 != PM_DISK_MAX)
377 buf += sprintf(buf, " ");
378 } 400 }
379 buf += sprintf(buf, "\n"); 401 buf += sprintf(buf, "\n");
380 return buf-start; 402 return buf-start;
@@ -387,39 +409,38 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
387 int i; 409 int i;
388 int len; 410 int len;
389 char *p; 411 char *p;
390 suspend_disk_method_t mode = 0; 412 int mode = HIBERNATION_INVALID;
391 413
392 p = memchr(buf, '\n', n); 414 p = memchr(buf, '\n', n);
393 len = p ? p - buf : n; 415 len = p ? p - buf : n;
394 416
395 mutex_lock(&pm_mutex); 417 mutex_lock(&pm_mutex);
396 for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { 418 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
397 if (!strncmp(buf, pm_disk_modes[i], len)) { 419 if (!strncmp(buf, hibernation_modes[i], len)) {
398 mode = i; 420 mode = i;
399 break; 421 break;
400 } 422 }
401 } 423 }
402 if (mode) { 424 if (mode != HIBERNATION_INVALID) {
403 switch (mode) { 425 switch (mode) {
404 case PM_DISK_SHUTDOWN: 426 case HIBERNATION_SHUTDOWN:
405 case PM_DISK_REBOOT: 427 case HIBERNATION_REBOOT:
406 case PM_DISK_TEST: 428 case HIBERNATION_TEST:
407 case PM_DISK_TESTPROC: 429 case HIBERNATION_TESTPROC:
408 pm_disk_mode = mode; 430 hibernation_mode = mode;
409 break; 431 break;
410 default: 432 case HIBERNATION_PLATFORM:
411 if (pm_ops && pm_ops->enter && 433 if (hibernation_ops)
412 (mode == pm_ops->pm_disk_mode)) 434 hibernation_mode = mode;
413 pm_disk_mode = mode;
414 else 435 else
415 error = -EINVAL; 436 error = -EINVAL;
416 } 437 }
417 } else { 438 } else
418 error = -EINVAL; 439 error = -EINVAL;
419 }
420 440
421 pr_debug("PM: suspend-to-disk mode set to '%s'\n", 441 if (!error)
422 pm_disk_modes[mode]); 442 pr_debug("PM: suspend-to-disk mode set to '%s'\n",
443 hibernation_modes[mode]);
423 mutex_unlock(&pm_mutex); 444 mutex_unlock(&pm_mutex);
424 return error ? error : n; 445 return error ? error : n;
425} 446}
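
The new hibernation_ops interface is what a platform driver (ACPI in practice) registers instead of steering hibernation through pm_ops->pm_disk_mode. A hedged sketch of such a registration is below; struct hibernation_ops and hibernation_set_ops() come from this patch, while the acme_* callbacks are invented and the <linux/suspend.h> location of the declarations is an assumption.

    #include <linux/init.h>
    #include <linux/suspend.h>

    static int acme_hibernation_prepare(void)
    {
            /* put the firmware into its S4-capable state; 0 on success */
            return 0;
    }

    static int acme_hibernation_enter(void)
    {
            /* actually enter the platform hibernation state */
            return 0;
    }

    static void acme_hibernation_finish(void)
    {
            /* return to normal operation after resume or failure */
    }

    static struct hibernation_ops acme_hibernation_ops = {
            .prepare = acme_hibernation_prepare,
            .enter   = acme_hibernation_enter,
            .finish  = acme_hibernation_finish,
    };

    static int __init acme_pm_init(void)
    {
            /* all three callbacks must be set, or the ops are rejected;
             * registering also flips the default mode to HIBERNATION_PLATFORM */
            hibernation_set_ops(&acme_hibernation_ops);
            return 0;
    }
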
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f6dda685e7e2..40d56a31245e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -30,7 +30,6 @@
30DEFINE_MUTEX(pm_mutex); 30DEFINE_MUTEX(pm_mutex);
31 31
32struct pm_ops *pm_ops; 32struct pm_ops *pm_ops;
33suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
34 33
35/** 34/**
36 * pm_set_ops - Set the global power method table. 35 * pm_set_ops - Set the global power method table.
@@ -41,10 +40,6 @@ void pm_set_ops(struct pm_ops * ops)
41{ 40{
42 mutex_lock(&pm_mutex); 41 mutex_lock(&pm_mutex);
43 pm_ops = ops; 42 pm_ops = ops;
44 if (ops && ops->pm_disk_mode != PM_DISK_INVALID) {
45 pm_disk_mode = ops->pm_disk_mode;
46 } else
47 pm_disk_mode = PM_DISK_SHUTDOWN;
48 mutex_unlock(&pm_mutex); 43 mutex_unlock(&pm_mutex);
49} 44}
50 45
@@ -184,24 +179,12 @@ static void suspend_finish(suspend_state_t state)
184static const char * const pm_states[PM_SUSPEND_MAX] = { 179static const char * const pm_states[PM_SUSPEND_MAX] = {
185 [PM_SUSPEND_STANDBY] = "standby", 180 [PM_SUSPEND_STANDBY] = "standby",
186 [PM_SUSPEND_MEM] = "mem", 181 [PM_SUSPEND_MEM] = "mem",
187 [PM_SUSPEND_DISK] = "disk",
188}; 182};
189 183
190static inline int valid_state(suspend_state_t state) 184static inline int valid_state(suspend_state_t state)
191{ 185{
192 /* Suspend-to-disk does not really need low-level support. 186 /* All states need lowlevel support and need to be valid
193 * It can work with shutdown/reboot if needed. If it isn't 187 * to the lowlevel implementation, no valid callback
194 * configured, then it cannot be supported.
195 */
196 if (state == PM_SUSPEND_DISK)
197#ifdef CONFIG_SOFTWARE_SUSPEND
198 return 1;
199#else
200 return 0;
201#endif
202
203 /* all other states need lowlevel support and need to be
204 * valid to the lowlevel implementation, no valid callback
205 * implies that none are valid. */ 188 * implies that none are valid. */
206 if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state)) 189 if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state))
207 return 0; 190 return 0;
@@ -229,11 +212,6 @@ static int enter_state(suspend_state_t state)
229 if (!mutex_trylock(&pm_mutex)) 212 if (!mutex_trylock(&pm_mutex))
230 return -EBUSY; 213 return -EBUSY;
231 214
232 if (state == PM_SUSPEND_DISK) {
233 error = pm_suspend_disk();
234 goto Unlock;
235 }
236
237 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 215 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
238 if ((error = suspend_prepare(state))) 216 if ((error = suspend_prepare(state)))
239 goto Unlock; 217 goto Unlock;
@@ -251,7 +229,7 @@ static int enter_state(suspend_state_t state)
251 229
252/** 230/**
253 * pm_suspend - Externally visible function for suspending system. 231 * pm_suspend - Externally visible function for suspending system.
254 * @state: Enumarted value of state to enter. 232 * @state: Enumerated value of state to enter.
255 * 233 *
256 * Determine whether or not value is within range, get state 234 * Determine whether or not value is within range, get state
257 * structure, and enter (above). 235 * structure, and enter (above).
@@ -289,7 +267,13 @@ static ssize_t state_show(struct kset *kset, char *buf)
289 if (pm_states[i] && valid_state(i)) 267 if (pm_states[i] && valid_state(i))
290 s += sprintf(s,"%s ", pm_states[i]); 268 s += sprintf(s,"%s ", pm_states[i]);
291 } 269 }
292 s += sprintf(s,"\n"); 270#ifdef CONFIG_SOFTWARE_SUSPEND
271 s += sprintf(s, "%s\n", "disk");
272#else
273 if (s != buf)
274 /* convert the last space to a newline */
275 *(s-1) = '\n';
276#endif
293 return (s - buf); 277 return (s - buf);
294} 278}
295 279
@@ -304,6 +288,12 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
304 p = memchr(buf, '\n', n); 288 p = memchr(buf, '\n', n);
305 len = p ? p - buf : n; 289 len = p ? p - buf : n;
306 290
291 /* First, check if we are requested to hibernate */
292 if (!strncmp(buf, "disk", len)) {
293 error = hibernate();
294 return error ? error : n;
295 }
296
307 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 297 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
308 if (*s && !strncmp(buf, *s, len)) 298 if (*s && !strncmp(buf, *s, len))
309 break; 299 break;
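
With "disk" removed from pm_states[], state_store() now special-cases that string and calls hibernate() directly. A small userspace sketch of triggering that path follows, assuming sysfs is mounted at /sys and the caller has the required privileges; echo disk > /sys/power/state does the same from a shell.

    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/power/state", O_WRONLY);

            if (fd < 0) {
                    perror("open /sys/power/state");
                    return 1;
            }
            /* the kernel routes "disk" to hibernate(), other strings to enter_state() */
            if (write(fd, "disk", strlen("disk")) < 0)
                    perror("write");
            close(fd);
            return 0;
    }
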
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 34b43542785a..51381487103f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -25,12 +25,7 @@ struct swsusp_info {
25 */ 25 */
26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
27 27
28extern int pm_suspend_disk(void); 28extern struct hibernation_ops *hibernation_ops;
29#else
30static inline int pm_suspend_disk(void)
31{
32 return -EPERM;
33}
34#endif 29#endif
35 30
36extern int pfn_is_nosave(unsigned long); 31extern int pfn_is_nosave(unsigned long);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 59fb89ba9a4d..a3b7854b8f7c 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1233,7 +1233,7 @@ asmlinkage int swsusp_save(void)
1233 nr_copy_pages = nr_pages; 1233 nr_copy_pages = nr_pages;
1234 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); 1234 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
1235 1235
1236 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 1236 printk("swsusp: critical section: done (%d pages copied)\n", nr_pages);
1237 1237
1238 return 0; 1238 return 0;
1239} 1239}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 040560d9c312..24d7d78e6f42 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -130,16 +130,16 @@ static inline int platform_prepare(void)
130{ 130{
131 int error = 0; 131 int error = 0;
132 132
133 if (pm_ops && pm_ops->prepare) 133 if (hibernation_ops)
134 error = pm_ops->prepare(PM_SUSPEND_DISK); 134 error = hibernation_ops->prepare();
135 135
136 return error; 136 return error;
137} 137}
138 138
139static inline void platform_finish(void) 139static inline void platform_finish(void)
140{ 140{
141 if (pm_ops && pm_ops->finish) 141 if (hibernation_ops)
142 pm_ops->finish(PM_SUSPEND_DISK); 142 hibernation_ops->finish();
143} 143}
144 144
145static inline int snapshot_suspend(int platform_suspend) 145static inline int snapshot_suspend(int platform_suspend)
@@ -384,7 +384,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
384 switch (arg) { 384 switch (arg) {
385 385
386 case PMOPS_PREPARE: 386 case PMOPS_PREPARE:
387 if (pm_ops && pm_ops->enter) { 387 if (hibernation_ops) {
388 data->platform_suspend = 1; 388 data->platform_suspend = 1;
389 error = 0; 389 error = 0;
390 } else { 390 } else {
@@ -395,8 +395,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
395 case PMOPS_ENTER: 395 case PMOPS_ENTER:
396 if (data->platform_suspend) { 396 if (data->platform_suspend) {
397 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 397 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
398 error = pm_ops->enter(PM_SUSPEND_DISK); 398 error = hibernation_ops->enter();
399 error = 0;
400 } 399 }
401 break; 400 break;
402 401
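
The user.c hunks replace the per-callback pm_ops checks with a single hibernation_ops pointer that is simply tested for NULL. A small sketch of that ops-table indirection, with invented names standing in for the kernel structures:

#include <stdio.h>

struct demo_hibernation_ops {
	int  (*prepare)(void);
	int  (*enter)(void);
	void (*finish)(void);
};

static struct demo_hibernation_ops *demo_ops;	/* NULL until registered */

static int demo_platform_prepare(void)
{
	return demo_ops ? demo_ops->prepare() : 0;
}

static void demo_platform_finish(void)
{
	if (demo_ops)
		demo_ops->finish();
}

/* A platform backend registering itself. */
static int  backend_prepare(void) { puts("prepare"); return 0; }
static int  backend_enter(void)   { puts("enter");   return 0; }
static void backend_finish(void)  { puts("finish"); }

static struct demo_hibernation_ops backend_ops = {
	.prepare = backend_prepare,
	.enter   = backend_enter,
	.finish  = backend_finish,
};

int main(void)
{
	demo_platform_prepare();	/* no-op: nothing registered yet */
	demo_ops = &backend_ops;
	demo_platform_prepare();
	demo_platform_finish();
	return 0;
}

One benefit of this shape is that the policy code needs neither #ifdefs nor per-callback checks: a platform with nothing to do simply never registers an ops table.
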
diff --git a/kernel/profile.c b/kernel/profile.c
index 9bfadb248dd8..cc91b9bf759d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -340,6 +340,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
340 340
341 switch (action) { 341 switch (action) {
342 case CPU_UP_PREPARE: 342 case CPU_UP_PREPARE:
343 case CPU_UP_PREPARE_FROZEN:
343 node = cpu_to_node(cpu); 344 node = cpu_to_node(cpu);
344 per_cpu(cpu_profile_flip, cpu) = 0; 345 per_cpu(cpu_profile_flip, cpu) = 0;
345 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 346 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
@@ -365,10 +366,13 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
365 __free_page(page); 366 __free_page(page);
366 return NOTIFY_BAD; 367 return NOTIFY_BAD;
367 case CPU_ONLINE: 368 case CPU_ONLINE:
369 case CPU_ONLINE_FROZEN:
368 cpu_set(cpu, prof_cpu_mask); 370 cpu_set(cpu, prof_cpu_mask);
369 break; 371 break;
370 case CPU_UP_CANCELED: 372 case CPU_UP_CANCELED:
373 case CPU_UP_CANCELED_FROZEN:
371 case CPU_DEAD: 374 case CPU_DEAD:
375 case CPU_DEAD_FROZEN:
372 cpu_clear(cpu, prof_cpu_mask); 376 cpu_clear(cpu, prof_cpu_mask);
373 if (per_cpu(cpu_profile_hits, cpu)[0]) { 377 if (per_cpu(cpu_profile_hits, cpu)[0]) {
374 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); 378 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
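
This and the following hunks add *_FROZEN labels next to the existing CPU hotplug cases. Those variants appear to be the ordinary actions with an extra "tasks frozen" flag set when the hotplug happens during suspend/resume; a callback that treats both the same can list both labels, as these hunks do, or mask the flag off first. An illustrative userspace sketch (the flag and action values below are invented for the demo):

#include <stdio.h>

#define DEMO_UP_PREPARE		0x03
#define DEMO_ONLINE		0x02
#define DEMO_DEAD		0x07
#define DEMO_TASKS_FROZEN	0x10

static void demo_cpu_callback(unsigned long action, int cpu)
{
	switch (action & ~DEMO_TASKS_FROZEN) {	/* frozen or not: same work */
	case DEMO_UP_PREPARE:
		printf("cpu%d: allocate per-cpu state\n", cpu);
		break;
	case DEMO_ONLINE:
		printf("cpu%d: start using it\n", cpu);
		break;
	case DEMO_DEAD:
		printf("cpu%d: free per-cpu state\n", cpu);
		break;
	}
}

int main(void)
{
	demo_cpu_callback(DEMO_ONLINE, 1);
	demo_cpu_callback(DEMO_ONLINE | DEMO_TASKS_FROZEN, 1);	/* during resume */
	return 0;
}
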
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 3554b76da84c..2c2dd8410dc4 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -558,9 +558,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
558 long cpu = (long)hcpu; 558 long cpu = (long)hcpu;
559 switch (action) { 559 switch (action) {
560 case CPU_UP_PREPARE: 560 case CPU_UP_PREPARE:
561 case CPU_UP_PREPARE_FROZEN:
561 rcu_online_cpu(cpu); 562 rcu_online_cpu(cpu);
562 break; 563 break;
563 case CPU_DEAD: 564 case CPU_DEAD:
565 case CPU_DEAD_FROZEN:
564 rcu_offline_cpu(cpu); 566 rcu_offline_cpu(cpu);
565 break; 567 break;
566 default: 568 default:
diff --git a/kernel/relay.c b/kernel/relay.c
index 577f251c7e28..4311101b0ca7 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -310,16 +310,13 @@ static struct rchan_callbacks default_channel_callbacks = {
310 310
311/** 311/**
312 * wakeup_readers - wake up readers waiting on a channel 312 * wakeup_readers - wake up readers waiting on a channel
313 * @work: work struct that contains the the channel buffer 313 * @data: contains the channel buffer
314 * 314 *
315 * This is the work function used to defer reader waking. The 315 * This is the timer function used to defer reader waking.
316 * reason waking is deferred is that calling directly from write
317 * causes problems if you're writing from say the scheduler.
318 */ 316 */
319static void wakeup_readers(struct work_struct *work) 317static void wakeup_readers(unsigned long data)
320{ 318{
321 struct rchan_buf *buf = 319 struct rchan_buf *buf = (struct rchan_buf *)data;
322 container_of(work, struct rchan_buf, wake_readers.work);
323 wake_up_interruptible(&buf->read_wait); 320 wake_up_interruptible(&buf->read_wait);
324} 321}
325 322
@@ -337,11 +334,9 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
337 if (init) { 334 if (init) {
338 init_waitqueue_head(&buf->read_wait); 335 init_waitqueue_head(&buf->read_wait);
339 kref_init(&buf->kref); 336 kref_init(&buf->kref);
340 INIT_DELAYED_WORK(&buf->wake_readers, NULL); 337 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
341 } else { 338 } else
342 cancel_delayed_work(&buf->wake_readers); 339 del_timer_sync(&buf->timer);
343 flush_scheduled_work();
344 }
345 340
346 buf->subbufs_produced = 0; 341 buf->subbufs_produced = 0;
347 buf->subbufs_consumed = 0; 342 buf->subbufs_consumed = 0;
@@ -447,8 +442,7 @@ end:
447static void relay_close_buf(struct rchan_buf *buf) 442static void relay_close_buf(struct rchan_buf *buf)
448{ 443{
449 buf->finalized = 1; 444 buf->finalized = 1;
450 cancel_delayed_work(&buf->wake_readers); 445 del_timer_sync(&buf->timer);
451 flush_scheduled_work();
452 kref_put(&buf->kref, relay_remove_buf); 446 kref_put(&buf->kref, relay_remove_buf);
453} 447}
454 448
@@ -490,6 +484,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
490 484
491 switch(action) { 485 switch(action) {
492 case CPU_UP_PREPARE: 486 case CPU_UP_PREPARE:
487 case CPU_UP_PREPARE_FROZEN:
493 mutex_lock(&relay_channels_mutex); 488 mutex_lock(&relay_channels_mutex);
494 list_for_each_entry(chan, &relay_channels, list) { 489 list_for_each_entry(chan, &relay_channels, list) {
495 if (chan->buf[hotcpu]) 490 if (chan->buf[hotcpu])
@@ -506,6 +501,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
506 mutex_unlock(&relay_channels_mutex); 501 mutex_unlock(&relay_channels_mutex);
507 break; 502 break;
508 case CPU_DEAD: 503 case CPU_DEAD:
504 case CPU_DEAD_FROZEN:
509 /* No need to flush the cpu : will be flushed upon 505 /* No need to flush the cpu : will be flushed upon
510 * final relay_flush() call. */ 506 * final relay_flush() call. */
511 break; 507 break;
@@ -608,11 +604,14 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
608 buf->dentry->d_inode->i_size += buf->chan->subbuf_size - 604 buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
609 buf->padding[old_subbuf]; 605 buf->padding[old_subbuf];
610 smp_mb(); 606 smp_mb();
611 if (waitqueue_active(&buf->read_wait)) { 607 if (waitqueue_active(&buf->read_wait))
612 PREPARE_DELAYED_WORK(&buf->wake_readers, 608 /*
613 wakeup_readers); 609 * Calling wake_up_interruptible() from here
614 schedule_delayed_work(&buf->wake_readers, 1); 610 * will deadlock if we happen to be logging
615 } 611 * from the scheduler (trying to re-grab
612 * rq->lock), so defer it.
613 */
614 __mod_timer(&buf->timer, jiffies + 1);
616 } 615 }
617 616
618 old = buf->data; 617 old = buf->data;
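
The relay changes replace the delayed work item with a plain kernel timer: the comment added in relay_switch_subbuf() explains that waking readers directly from the write path can deadlock when the write happens from inside the scheduler, so the wakeup is pushed out to a timer that fires a jiffy later. A rough userspace analogue of "defer the wakeup" using POSIX timers (all names invented; link with -lrt on older glibc):

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

struct demo_buf {
	timer_t timer;
	int subbufs_produced;
};

/* Timer callback: the deferred "wake up readers" step. */
static void demo_wakeup_readers(union sigval sv)
{
	struct demo_buf *buf = sv.sival_ptr;

	printf("waking readers, %d sub-buffer(s) produced\n",
	       buf->subbufs_produced);
}

/* "Write" path: too risky to notify readers directly, so arm a timer. */
static void demo_switch_subbuf(struct demo_buf *buf)
{
	struct itimerspec its = {
		.it_value = { .tv_sec = 0, .tv_nsec = 1000 * 1000 },	/* ~1ms */
	};

	buf->subbufs_produced++;
	timer_settime(buf->timer, 0, &its, NULL);
}

int main(void)
{
	struct demo_buf buf = { .subbufs_produced = 0 };
	struct sigevent sev;

	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_THREAD;
	sev.sigev_notify_function = demo_wakeup_readers;
	sev.sigev_value.sival_ptr = &buf;

	timer_create(CLOCK_MONOTONIC, &sev, &buf.timer);
	demo_switch_subbuf(&buf);
	sleep(1);		/* give the deferred wakeup time to fire */
	return 0;
}
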
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 180978cb2f75..12879f6c1ec3 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -56,7 +56,7 @@
56 * state. 56 * state.
57 */ 57 */
58 58
59static void 59void
60rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, 60rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
61 unsigned long mask) 61 unsigned long mask)
62{ 62{
@@ -81,29 +81,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
81} 81}
82 82
83/* 83/*
84 * We can speed up the acquire/release, if the architecture
85 * supports cmpxchg and if there's no debugging state to be set up
86 */
87#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
88# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
89static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
90{
91 unsigned long owner, *p = (unsigned long *) &lock->owner;
92
93 do {
94 owner = *p;
95 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
96}
97#else
98# define rt_mutex_cmpxchg(l,c,n) (0)
99static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
100{
101 lock->owner = (struct task_struct *)
102 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
103}
104#endif
105
106/*
107 * Calculate task priority from the waiter list priority 84 * Calculate task priority from the waiter list priority
108 * 85 *
109 * Return task->normal_prio when the waiter list is empty or when 86 * Return task->normal_prio when the waiter list is empty or when
@@ -123,7 +100,7 @@ int rt_mutex_getprio(struct task_struct *task)
123 * 100 *
124 * This can be both boosting and unboosting. task->pi_lock must be held. 101 * This can be both boosting and unboosting. task->pi_lock must be held.
125 */ 102 */
126static void __rt_mutex_adjust_prio(struct task_struct *task) 103void __rt_mutex_adjust_prio(struct task_struct *task)
127{ 104{
128 int prio = rt_mutex_getprio(task); 105 int prio = rt_mutex_getprio(task);
129 106
@@ -159,11 +136,11 @@ int max_lock_depth = 1024;
159 * Decreases task's usage by one - may thus free the task. 136 * Decreases task's usage by one - may thus free the task.
160 * Returns 0 or -EDEADLK. 137 * Returns 0 or -EDEADLK.
161 */ 138 */
162static int rt_mutex_adjust_prio_chain(struct task_struct *task, 139int rt_mutex_adjust_prio_chain(struct task_struct *task,
163 int deadlock_detect, 140 int deadlock_detect,
164 struct rt_mutex *orig_lock, 141 struct rt_mutex *orig_lock,
165 struct rt_mutex_waiter *orig_waiter, 142 struct rt_mutex_waiter *orig_waiter,
166 struct task_struct *top_task) 143 struct task_struct *top_task)
167{ 144{
168 struct rt_mutex *lock; 145 struct rt_mutex *lock;
169 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; 146 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
@@ -524,8 +501,8 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
524 * 501 *
525 * Must be called with lock->wait_lock held 502 * Must be called with lock->wait_lock held
526 */ 503 */
527static void remove_waiter(struct rt_mutex *lock, 504void remove_waiter(struct rt_mutex *lock,
528 struct rt_mutex_waiter *waiter) 505 struct rt_mutex_waiter *waiter)
529{ 506{
530 int first = (waiter == rt_mutex_top_waiter(lock)); 507 int first = (waiter == rt_mutex_top_waiter(lock));
531 struct task_struct *owner = rt_mutex_owner(lock); 508 struct task_struct *owner = rt_mutex_owner(lock);
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 9c75856e791e..242ec7ee740b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -113,6 +113,29 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
113} 113}
114 114
115/* 115/*
116 * We can speed up the acquire/release, if the architecture
117 * supports cmpxchg and if there's no debugging state to be set up
118 */
119#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
120# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
121static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
122{
123 unsigned long owner, *p = (unsigned long *) &lock->owner;
124
125 do {
126 owner = *p;
127 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
128}
129#else
130# define rt_mutex_cmpxchg(l,c,n) (0)
131static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
132{
133 lock->owner = (struct task_struct *)
134 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
135}
136#endif
137
138/*
116 * PI-futex support (proxy locking functions, etc.): 139 * PI-futex support (proxy locking functions, etc.):
117 */ 140 */
118extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); 141extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -120,4 +143,15 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner); 143 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 144extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner); 145 struct task_struct *proxy_owner);
146
147extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
148 unsigned long mask);
149extern void __rt_mutex_adjust_prio(struct task_struct *task);
150extern int rt_mutex_adjust_prio_chain(struct task_struct *task,
151 int deadlock_detect,
152 struct rt_mutex *orig_lock,
153 struct rt_mutex_waiter *orig_waiter,
154 struct task_struct *top_task);
155extern void remove_waiter(struct rt_mutex *lock,
156 struct rt_mutex_waiter *waiter);
123#endif 157#endif
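
The helpers moved into rtmutex_common.h rely on the owner field doing double duty: the low bit of the pointer-sized word is the "has waiters" flag, and mark_rt_mutex_waiters() sets it with a cmpxchg loop so a concurrent fast-path unlock cannot overwrite it unnoticed. A sketch of the same idea with C11 atomics standing in for the kernel's cmpxchg() (compile with -std=c11; names invented):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_HAS_WAITERS	1UL

static _Atomic uintptr_t demo_owner;	/* 0 = unlocked, else owner | flag */

static void demo_mark_waiters(void)
{
	uintptr_t old = atomic_load(&demo_owner);

	while (!atomic_compare_exchange_weak(&demo_owner, &old,
					     old | DEMO_HAS_WAITERS))
		;	/* 'old' was reloaded by the failed CAS; retry */
}

/* Fast-path trylock: only succeeds while the word is completely clear. */
static int demo_trylock(uintptr_t me)
{
	uintptr_t expected = 0;

	return atomic_compare_exchange_strong(&demo_owner, &expected, me);
}

int main(void)
{
	printf("trylock: %d\n", demo_trylock((uintptr_t)&demo_owner));
	demo_mark_waiters();
	printf("owner word: %#lx\n",
	       (unsigned long)atomic_load(&demo_owner));
	return 0;
}
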
diff --git a/kernel/sched.c b/kernel/sched.c
index 66bd7ff23f18..799d23b4e35d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -305,6 +305,7 @@ struct rq {
305}; 305};
306 306
307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
308static DEFINE_MUTEX(sched_hotcpu_mutex);
308 309
309static inline int cpu_of(struct rq *rq) 310static inline int cpu_of(struct rq *rq)
310{ 311{
@@ -4520,13 +4521,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4520 struct task_struct *p; 4521 struct task_struct *p;
4521 int retval; 4522 int retval;
4522 4523
4523 lock_cpu_hotplug(); 4524 mutex_lock(&sched_hotcpu_mutex);
4524 read_lock(&tasklist_lock); 4525 read_lock(&tasklist_lock);
4525 4526
4526 p = find_process_by_pid(pid); 4527 p = find_process_by_pid(pid);
4527 if (!p) { 4528 if (!p) {
4528 read_unlock(&tasklist_lock); 4529 read_unlock(&tasklist_lock);
4529 unlock_cpu_hotplug(); 4530 mutex_unlock(&sched_hotcpu_mutex);
4530 return -ESRCH; 4531 return -ESRCH;
4531 } 4532 }
4532 4533
@@ -4553,7 +4554,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4553 4554
4554out_unlock: 4555out_unlock:
4555 put_task_struct(p); 4556 put_task_struct(p);
4556 unlock_cpu_hotplug(); 4557 mutex_unlock(&sched_hotcpu_mutex);
4557 return retval; 4558 return retval;
4558} 4559}
4559 4560
@@ -4610,7 +4611,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4610 struct task_struct *p; 4611 struct task_struct *p;
4611 int retval; 4612 int retval;
4612 4613
4613 lock_cpu_hotplug(); 4614 mutex_lock(&sched_hotcpu_mutex);
4614 read_lock(&tasklist_lock); 4615 read_lock(&tasklist_lock);
4615 4616
4616 retval = -ESRCH; 4617 retval = -ESRCH;
@@ -4626,7 +4627,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4626 4627
4627out_unlock: 4628out_unlock:
4628 read_unlock(&tasklist_lock); 4629 read_unlock(&tasklist_lock);
4629 unlock_cpu_hotplug(); 4630 mutex_unlock(&sched_hotcpu_mutex);
4630 if (retval) 4631 if (retval)
4631 return retval; 4632 return retval;
4632 4633
@@ -5388,7 +5389,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5388 struct rq *rq; 5389 struct rq *rq;
5389 5390
5390 switch (action) { 5391 switch (action) {
5392 case CPU_LOCK_ACQUIRE:
5393 mutex_lock(&sched_hotcpu_mutex);
5394 break;
5395
5391 case CPU_UP_PREPARE: 5396 case CPU_UP_PREPARE:
5397 case CPU_UP_PREPARE_FROZEN:
5392 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5398 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
5393 if (IS_ERR(p)) 5399 if (IS_ERR(p))
5394 return NOTIFY_BAD; 5400 return NOTIFY_BAD;
@@ -5402,12 +5408,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5402 break; 5408 break;
5403 5409
5404 case CPU_ONLINE: 5410 case CPU_ONLINE:
5411 case CPU_ONLINE_FROZEN:
5405 /* Strictly unneccessary, as first user will wake it. */ 5412 /* Strictly unneccessary, as first user will wake it. */
5406 wake_up_process(cpu_rq(cpu)->migration_thread); 5413 wake_up_process(cpu_rq(cpu)->migration_thread);
5407 break; 5414 break;
5408 5415
5409#ifdef CONFIG_HOTPLUG_CPU 5416#ifdef CONFIG_HOTPLUG_CPU
5410 case CPU_UP_CANCELED: 5417 case CPU_UP_CANCELED:
5418 case CPU_UP_CANCELED_FROZEN:
5411 if (!cpu_rq(cpu)->migration_thread) 5419 if (!cpu_rq(cpu)->migration_thread)
5412 break; 5420 break;
5413 /* Unbind it from offline cpu so it can run. Fall thru. */ 5421 /* Unbind it from offline cpu so it can run. Fall thru. */
@@ -5418,6 +5426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5418 break; 5426 break;
5419 5427
5420 case CPU_DEAD: 5428 case CPU_DEAD:
5429 case CPU_DEAD_FROZEN:
5421 migrate_live_tasks(cpu); 5430 migrate_live_tasks(cpu);
5422 rq = cpu_rq(cpu); 5431 rq = cpu_rq(cpu);
5423 kthread_stop(rq->migration_thread); 5432 kthread_stop(rq->migration_thread);
@@ -5433,7 +5442,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5433 BUG_ON(rq->nr_running != 0); 5442 BUG_ON(rq->nr_running != 0);
5434 5443
5435 /* No need to migrate the tasks: it was best-effort if 5444 /* No need to migrate the tasks: it was best-effort if
5436 * they didn't do lock_cpu_hotplug(). Just wake up 5445 * they didn't take sched_hotcpu_mutex. Just wake up
5437 * the requestors. */ 5446 * the requestors. */
5438 spin_lock_irq(&rq->lock); 5447 spin_lock_irq(&rq->lock);
5439 while (!list_empty(&rq->migration_queue)) { 5448 while (!list_empty(&rq->migration_queue)) {
@@ -5447,6 +5456,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5447 spin_unlock_irq(&rq->lock); 5456 spin_unlock_irq(&rq->lock);
5448 break; 5457 break;
5449#endif 5458#endif
5459 case CPU_LOCK_RELEASE:
5460 mutex_unlock(&sched_hotcpu_mutex);
5461 break;
5450 } 5462 }
5451 return NOTIFY_OK; 5463 return NOTIFY_OK;
5452} 5464}
@@ -6822,10 +6834,10 @@ int arch_reinit_sched_domains(void)
6822{ 6834{
6823 int err; 6835 int err;
6824 6836
6825 lock_cpu_hotplug(); 6837 mutex_lock(&sched_hotcpu_mutex);
6826 detach_destroy_domains(&cpu_online_map); 6838 detach_destroy_domains(&cpu_online_map);
6827 err = arch_init_sched_domains(&cpu_online_map); 6839 err = arch_init_sched_domains(&cpu_online_map);
6828 unlock_cpu_hotplug(); 6840 mutex_unlock(&sched_hotcpu_mutex);
6829 6841
6830 return err; 6842 return err;
6831} 6843}
@@ -6904,14 +6916,20 @@ static int update_sched_domains(struct notifier_block *nfb,
6904{ 6916{
6905 switch (action) { 6917 switch (action) {
6906 case CPU_UP_PREPARE: 6918 case CPU_UP_PREPARE:
6919 case CPU_UP_PREPARE_FROZEN:
6907 case CPU_DOWN_PREPARE: 6920 case CPU_DOWN_PREPARE:
6921 case CPU_DOWN_PREPARE_FROZEN:
6908 detach_destroy_domains(&cpu_online_map); 6922 detach_destroy_domains(&cpu_online_map);
6909 return NOTIFY_OK; 6923 return NOTIFY_OK;
6910 6924
6911 case CPU_UP_CANCELED: 6925 case CPU_UP_CANCELED:
6926 case CPU_UP_CANCELED_FROZEN:
6912 case CPU_DOWN_FAILED: 6927 case CPU_DOWN_FAILED:
6928 case CPU_DOWN_FAILED_FROZEN:
6913 case CPU_ONLINE: 6929 case CPU_ONLINE:
6930 case CPU_ONLINE_FROZEN:
6914 case CPU_DEAD: 6931 case CPU_DEAD:
6932 case CPU_DEAD_FROZEN:
6915 /* 6933 /*
6916 * Fall through and re-initialise the domains. 6934 * Fall through and re-initialise the domains.
6917 */ 6935 */
@@ -6930,12 +6948,12 @@ void __init sched_init_smp(void)
6930{ 6948{
6931 cpumask_t non_isolated_cpus; 6949 cpumask_t non_isolated_cpus;
6932 6950
6933 lock_cpu_hotplug(); 6951 mutex_lock(&sched_hotcpu_mutex);
6934 arch_init_sched_domains(&cpu_online_map); 6952 arch_init_sched_domains(&cpu_online_map);
6935 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 6953 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6936 if (cpus_empty(non_isolated_cpus)) 6954 if (cpus_empty(non_isolated_cpus))
6937 cpu_set(smp_processor_id(), non_isolated_cpus); 6955 cpu_set(smp_processor_id(), non_isolated_cpus);
6938 unlock_cpu_hotplug(); 6956 mutex_unlock(&sched_hotcpu_mutex);
6939 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6957 /* XXX: Theoretical race here - CPU may be hotplugged now */
6940 hotcpu_notifier(update_sched_domains, 0); 6958 hotcpu_notifier(update_sched_domains, 0);
6941 6959
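
The sched.c changes drop lock_cpu_hotplug() in favour of a scheduler-private sched_hotcpu_mutex, and the new CPU_LOCK_ACQUIRE/CPU_LOCK_RELEASE cases suggest the hotplug core now brackets the whole operation with lock events so each subsystem can hold its own mutex for the duration. A userspace sketch of that bracketing (event values and names invented; build with -pthread):

#include <pthread.h>
#include <stdio.h>

enum { DEMO_LOCK_ACQUIRE, DEMO_UP_PREPARE, DEMO_ONLINE, DEMO_LOCK_RELEASE };

static pthread_mutex_t demo_hotcpu_mutex = PTHREAD_MUTEX_INITIALIZER;

static void demo_sched_callback(int action, int cpu)
{
	switch (action) {
	case DEMO_LOCK_ACQUIRE:
		pthread_mutex_lock(&demo_hotcpu_mutex);
		break;
	case DEMO_UP_PREPARE:
		printf("cpu%d: set up runqueue state\n", cpu);
		break;
	case DEMO_ONLINE:
		printf("cpu%d: online\n", cpu);
		break;
	case DEMO_LOCK_RELEASE:
		pthread_mutex_unlock(&demo_hotcpu_mutex);
		break;
	}
}

/* What the hotplug core does for one CPU coming up. */
static void demo_cpu_up(int cpu)
{
	demo_sched_callback(DEMO_LOCK_ACQUIRE, cpu);
	demo_sched_callback(DEMO_UP_PREPARE, cpu);
	demo_sched_callback(DEMO_ONLINE, cpu);
	demo_sched_callback(DEMO_LOCK_RELEASE, cpu);
}

int main(void)
{
	demo_cpu_up(1);
	return 0;
}
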
diff --git a/kernel/signal.c b/kernel/signal.c
index 1368e67c8482..2ac3a668d9dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -38,125 +38,6 @@
38 38
39static struct kmem_cache *sigqueue_cachep; 39static struct kmem_cache *sigqueue_cachep;
40 40
41/*
42 * In POSIX a signal is sent either to a specific thread (Linux task)
43 * or to the process as a whole (Linux thread group). How the signal
44 * is sent determines whether it's to one thread or the whole group,
45 * which determines which signal mask(s) are involved in blocking it
46 * from being delivered until later. When the signal is delivered,
47 * either it's caught or ignored by a user handler or it has a default
48 * effect that applies to the whole thread group (POSIX process).
49 *
50 * The possible effects an unblocked signal set to SIG_DFL can have are:
51 * ignore - Nothing Happens
52 * terminate - kill the process, i.e. all threads in the group,
53 * similar to exit_group. The group leader (only) reports
54 * WIFSIGNALED status to its parent.
55 * coredump - write a core dump file describing all threads using
56 * the same mm and then kill all those threads
57 * stop - stop all the threads in the group, i.e. TASK_STOPPED state
58 *
59 * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
60 * Other signals when not blocked and set to SIG_DFL behaves as follows.
61 * The job control signals also have other special effects.
62 *
63 * +--------------------+------------------+
64 * | POSIX signal | default action |
65 * +--------------------+------------------+
66 * | SIGHUP | terminate |
67 * | SIGINT | terminate |
68 * | SIGQUIT | coredump |
69 * | SIGILL | coredump |
70 * | SIGTRAP | coredump |
71 * | SIGABRT/SIGIOT | coredump |
72 * | SIGBUS | coredump |
73 * | SIGFPE | coredump |
74 * | SIGKILL | terminate(+) |
75 * | SIGUSR1 | terminate |
76 * | SIGSEGV | coredump |
77 * | SIGUSR2 | terminate |
78 * | SIGPIPE | terminate |
79 * | SIGALRM | terminate |
80 * | SIGTERM | terminate |
81 * | SIGCHLD | ignore |
82 * | SIGCONT | ignore(*) |
83 * | SIGSTOP | stop(*)(+) |
84 * | SIGTSTP | stop(*) |
85 * | SIGTTIN | stop(*) |
86 * | SIGTTOU | stop(*) |
87 * | SIGURG | ignore |
88 * | SIGXCPU | coredump |
89 * | SIGXFSZ | coredump |
90 * | SIGVTALRM | terminate |
91 * | SIGPROF | terminate |
92 * | SIGPOLL/SIGIO | terminate |
93 * | SIGSYS/SIGUNUSED | coredump |
94 * | SIGSTKFLT | terminate |
95 * | SIGWINCH | ignore |
96 * | SIGPWR | terminate |
97 * | SIGRTMIN-SIGRTMAX | terminate |
98 * +--------------------+------------------+
99 * | non-POSIX signal | default action |
100 * +--------------------+------------------+
101 * | SIGEMT | coredump |
102 * +--------------------+------------------+
103 *
104 * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
105 * (*) Special job control effects:
106 * When SIGCONT is sent, it resumes the process (all threads in the group)
107 * from TASK_STOPPED state and also clears any pending/queued stop signals
108 * (any of those marked with "stop(*)"). This happens regardless of blocking,
109 * catching, or ignoring SIGCONT. When any stop signal is sent, it clears
110 * any pending/queued SIGCONT signals; this happens regardless of blocking,
111 * catching, or ignored the stop signal, though (except for SIGSTOP) the
112 * default action of stopping the process may happen later or never.
113 */
114
115#ifdef SIGEMT
116#define M_SIGEMT M(SIGEMT)
117#else
118#define M_SIGEMT 0
119#endif
120
121#if SIGRTMIN > BITS_PER_LONG
122#define M(sig) (1ULL << ((sig)-1))
123#else
124#define M(sig) (1UL << ((sig)-1))
125#endif
126#define T(sig, mask) (M(sig) & (mask))
127
128#define SIG_KERNEL_ONLY_MASK (\
129 M(SIGKILL) | M(SIGSTOP) )
130
131#define SIG_KERNEL_STOP_MASK (\
132 M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) )
133
134#define SIG_KERNEL_COREDUMP_MASK (\
135 M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \
136 M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \
137 M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT )
138
139#define SIG_KERNEL_IGNORE_MASK (\
140 M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) )
141
142#define sig_kernel_only(sig) \
143 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK))
144#define sig_kernel_coredump(sig) \
145 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK))
146#define sig_kernel_ignore(sig) \
147 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK))
148#define sig_kernel_stop(sig) \
149 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
150
151#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
152
153#define sig_user_defined(t, signr) \
154 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
155 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
156
157#define sig_fatal(t, signr) \
158 (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
159 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
160 41
161static int sig_ignored(struct task_struct *t, int sig) 42static int sig_ignored(struct task_struct *t, int sig)
162{ 43{
@@ -328,6 +209,16 @@ void flush_signals(struct task_struct *t)
328 spin_unlock_irqrestore(&t->sighand->siglock, flags); 209 spin_unlock_irqrestore(&t->sighand->siglock, flags);
329} 210}
330 211
212void ignore_signals(struct task_struct *t)
213{
214 int i;
215
216 for (i = 0; i < _NSIG; ++i)
217 t->sighand->action[i].sa.sa_handler = SIG_IGN;
218
219 flush_signals(t);
220}
221
331/* 222/*
332 * Flush all handlers for a task. 223 * Flush all handlers for a task.
333 */ 224 */
@@ -1032,17 +923,6 @@ void zap_other_threads(struct task_struct *p)
1032 if (t->exit_state) 923 if (t->exit_state)
1033 continue; 924 continue;
1034 925
1035 /*
1036 * We don't want to notify the parent, since we are
1037 * killed as part of a thread group due to another
1038 * thread doing an execve() or similar. So set the
1039 * exit signal to -1 to allow immediate reaping of
1040 * the process. But don't detach the thread group
1041 * leader.
1042 */
1043 if (t != p->group_leader)
1044 t->exit_signal = -1;
1045
1046 /* SIGKILL will be handled before any pending SIGSTOP */ 926 /* SIGKILL will be handled before any pending SIGSTOP */
1047 sigaddset(&t->pending.signal, SIGKILL); 927 sigaddset(&t->pending.signal, SIGKILL);
1048 signal_wake_up(t, 1); 928 signal_wake_up(t, 1);
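
The new ignore_signals() helper simply points every entry of the task's handler table at SIG_IGN and then flushes anything already queued. A rough userspace analogue using sigaction() (illustrative only; unlike the in-kernel helper it must skip SIGKILL/SIGSTOP, which can never be ignored, and it has no direct way to flush signals that are already pending):

#include <signal.h>
#include <stdio.h>
#include <string.h>

static void demo_ignore_signals(void)
{
	struct sigaction sa;
	int sig;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = SIG_IGN;
	sigemptyset(&sa.sa_mask);

	/* Classic (non-realtime) signals only, for simplicity. */
	for (sig = 1; sig < 32; sig++) {
		if (sig == SIGKILL || sig == SIGSTOP)
			continue;
		sigaction(sig, &sa, NULL);
	}
}

int main(void)
{
	demo_ignore_signals();
	raise(SIGTERM);		/* ignored, so execution continues */
	puts("still alive after SIGTERM");
	return 0;
}
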
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8b75008e2bd8..0b9886a00e74 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -593,6 +593,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
593 593
594 switch (action) { 594 switch (action) {
595 case CPU_UP_PREPARE: 595 case CPU_UP_PREPARE:
596 case CPU_UP_PREPARE_FROZEN:
596 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 597 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
597 if (IS_ERR(p)) { 598 if (IS_ERR(p)) {
598 printk("ksoftirqd for %i failed\n", hotcpu); 599 printk("ksoftirqd for %i failed\n", hotcpu);
@@ -602,16 +603,19 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
602 per_cpu(ksoftirqd, hotcpu) = p; 603 per_cpu(ksoftirqd, hotcpu) = p;
603 break; 604 break;
604 case CPU_ONLINE: 605 case CPU_ONLINE:
606 case CPU_ONLINE_FROZEN:
605 wake_up_process(per_cpu(ksoftirqd, hotcpu)); 607 wake_up_process(per_cpu(ksoftirqd, hotcpu));
606 break; 608 break;
607#ifdef CONFIG_HOTPLUG_CPU 609#ifdef CONFIG_HOTPLUG_CPU
608 case CPU_UP_CANCELED: 610 case CPU_UP_CANCELED:
611 case CPU_UP_CANCELED_FROZEN:
609 if (!per_cpu(ksoftirqd, hotcpu)) 612 if (!per_cpu(ksoftirqd, hotcpu))
610 break; 613 break;
611 /* Unbind so it can run. Fall thru. */ 614 /* Unbind so it can run. Fall thru. */
612 kthread_bind(per_cpu(ksoftirqd, hotcpu), 615 kthread_bind(per_cpu(ksoftirqd, hotcpu),
613 any_online_cpu(cpu_online_map)); 616 any_online_cpu(cpu_online_map));
614 case CPU_DEAD: 617 case CPU_DEAD:
618 case CPU_DEAD_FROZEN:
615 p = per_cpu(ksoftirqd, hotcpu); 619 p = per_cpu(ksoftirqd, hotcpu);
616 per_cpu(ksoftirqd, hotcpu) = NULL; 620 per_cpu(ksoftirqd, hotcpu) = NULL;
617 kthread_stop(p); 621 kthread_stop(p);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 8fa7040247ad..0131e296ffb4 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -146,6 +146,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
146 146
147 switch (action) { 147 switch (action) {
148 case CPU_UP_PREPARE: 148 case CPU_UP_PREPARE:
149 case CPU_UP_PREPARE_FROZEN:
149 BUG_ON(per_cpu(watchdog_task, hotcpu)); 150 BUG_ON(per_cpu(watchdog_task, hotcpu));
150 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); 151 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
151 if (IS_ERR(p)) { 152 if (IS_ERR(p)) {
@@ -157,16 +158,19 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
157 kthread_bind(p, hotcpu); 158 kthread_bind(p, hotcpu);
158 break; 159 break;
159 case CPU_ONLINE: 160 case CPU_ONLINE:
161 case CPU_ONLINE_FROZEN:
160 wake_up_process(per_cpu(watchdog_task, hotcpu)); 162 wake_up_process(per_cpu(watchdog_task, hotcpu));
161 break; 163 break;
162#ifdef CONFIG_HOTPLUG_CPU 164#ifdef CONFIG_HOTPLUG_CPU
163 case CPU_UP_CANCELED: 165 case CPU_UP_CANCELED:
166 case CPU_UP_CANCELED_FROZEN:
164 if (!per_cpu(watchdog_task, hotcpu)) 167 if (!per_cpu(watchdog_task, hotcpu))
165 break; 168 break;
166 /* Unbind so it can run. Fall thru. */ 169 /* Unbind so it can run. Fall thru. */
167 kthread_bind(per_cpu(watchdog_task, hotcpu), 170 kthread_bind(per_cpu(watchdog_task, hotcpu),
168 any_online_cpu(cpu_online_map)); 171 any_online_cpu(cpu_online_map));
169 case CPU_DEAD: 172 case CPU_DEAD:
173 case CPU_DEAD_FROZEN:
170 p = per_cpu(watchdog_task, hotcpu); 174 p = per_cpu(watchdog_task, hotcpu);
171 per_cpu(watchdog_task, hotcpu) = NULL; 175 per_cpu(watchdog_task, hotcpu) = NULL;
172 kthread_stop(p); 176 kthread_stop(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 926bf9d7ac45..cdb7e9457ba6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -134,19 +134,39 @@ static int notifier_chain_unregister(struct notifier_block **nl,
134 return -ENOENT; 134 return -ENOENT;
135} 135}
136 136
137/**
138 * notifier_call_chain - Informs the registered notifiers about an event.
139 * @nl: Pointer to head of the blocking notifier chain
140 * @val: Value passed unmodified to notifier function
141 * @v: Pointer passed unmodified to notifier function
142 * @nr_to_call: Number of notifier functions to be called. Pass -1 to
143 * call every function on the chain.
144 * @nr_calls: Records the number of notifications sent. Pass NULL if
145 * the count is not needed.
146 * @returns: notifier_call_chain returns the value returned by the
147 * last notifier function called.
148 */
149
137static int __kprobes notifier_call_chain(struct notifier_block **nl, 150static int __kprobes notifier_call_chain(struct notifier_block **nl,
138 unsigned long val, void *v) 151 unsigned long val, void *v,
152 int nr_to_call, int *nr_calls)
139{ 153{
140 int ret = NOTIFY_DONE; 154 int ret = NOTIFY_DONE;
141 struct notifier_block *nb, *next_nb; 155 struct notifier_block *nb, *next_nb;
142 156
143 nb = rcu_dereference(*nl); 157 nb = rcu_dereference(*nl);
144 while (nb) { 158
159 while (nb && nr_to_call) {
145 next_nb = rcu_dereference(nb->next); 160 next_nb = rcu_dereference(nb->next);
146 ret = nb->notifier_call(nb, val, v); 161 ret = nb->notifier_call(nb, val, v);
162
163 if (nr_calls)
164 (*nr_calls)++;
165
147 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) 166 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
148 break; 167 break;
149 nb = next_nb; 168 nb = next_nb;
169 nr_to_call--;
150 } 170 }
151 return ret; 171 return ret;
152} 172}
@@ -205,10 +225,12 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
205EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); 225EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
206 226
207/** 227/**
208 * atomic_notifier_call_chain - Call functions in an atomic notifier chain 228 * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
209 * @nh: Pointer to head of the atomic notifier chain 229 * @nh: Pointer to head of the atomic notifier chain
210 * @val: Value passed unmodified to notifier function 230 * @val: Value passed unmodified to notifier function
211 * @v: Pointer passed unmodified to notifier function 231 * @v: Pointer passed unmodified to notifier function
232 * @nr_to_call: See the comment for notifier_call_chain.
233 * @nr_calls: See the comment for notifier_call_chain.
212 * 234 *
213 * Calls each function in a notifier chain in turn. The functions 235 * Calls each function in a notifier chain in turn. The functions
214 * run in an atomic context, so they must not block. 236 * run in an atomic context, so they must not block.
@@ -222,19 +244,27 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
222 * of the last notifier function called. 244 * of the last notifier function called.
223 */ 245 */
224 246
225int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, 247int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
226 unsigned long val, void *v) 248 unsigned long val, void *v,
249 int nr_to_call, int *nr_calls)
227{ 250{
228 int ret; 251 int ret;
229 252
230 rcu_read_lock(); 253 rcu_read_lock();
231 ret = notifier_call_chain(&nh->head, val, v); 254 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
232 rcu_read_unlock(); 255 rcu_read_unlock();
233 return ret; 256 return ret;
234} 257}
235 258
236EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); 259EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
260
261int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
262 unsigned long val, void *v)
263{
264 return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
265}
237 266
267EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
238/* 268/*
239 * Blocking notifier chain routines. All access to the chain is 269 * Blocking notifier chain routines. All access to the chain is
240 * synchronized by an rwsem. 270 * synchronized by an rwsem.
@@ -304,10 +334,12 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
304EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); 334EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
305 335
306/** 336/**
307 * blocking_notifier_call_chain - Call functions in a blocking notifier chain 337 * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
308 * @nh: Pointer to head of the blocking notifier chain 338 * @nh: Pointer to head of the blocking notifier chain
309 * @val: Value passed unmodified to notifier function 339 * @val: Value passed unmodified to notifier function
310 * @v: Pointer passed unmodified to notifier function 340 * @v: Pointer passed unmodified to notifier function
341 * @nr_to_call: See comment for notifier_call_chain.
342 * @nr_calls: See comment for notifier_call_chain.
311 * 343 *
312 * Calls each function in a notifier chain in turn. The functions 344 * Calls each function in a notifier chain in turn. The functions
313 * run in a process context, so they are allowed to block. 345 * run in a process context, so they are allowed to block.
@@ -320,8 +352,9 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
320 * of the last notifier function called. 352 * of the last notifier function called.
321 */ 353 */
322 354
323int blocking_notifier_call_chain(struct blocking_notifier_head *nh, 355int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
324 unsigned long val, void *v) 356 unsigned long val, void *v,
357 int nr_to_call, int *nr_calls)
325{ 358{
326 int ret = NOTIFY_DONE; 359 int ret = NOTIFY_DONE;
327 360
@@ -332,12 +365,19 @@ int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
332 */ 365 */
333 if (rcu_dereference(nh->head)) { 366 if (rcu_dereference(nh->head)) {
334 down_read(&nh->rwsem); 367 down_read(&nh->rwsem);
335 ret = notifier_call_chain(&nh->head, val, v); 368 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
369 nr_calls);
336 up_read(&nh->rwsem); 370 up_read(&nh->rwsem);
337 } 371 }
338 return ret; 372 return ret;
339} 373}
374EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
340 375
376int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
377 unsigned long val, void *v)
378{
379 return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
380}
341EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); 381EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
342 382
343/* 383/*
@@ -383,10 +423,12 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
383EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); 423EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
384 424
385/** 425/**
386 * raw_notifier_call_chain - Call functions in a raw notifier chain 426 * __raw_notifier_call_chain - Call functions in a raw notifier chain
387 * @nh: Pointer to head of the raw notifier chain 427 * @nh: Pointer to head of the raw notifier chain
388 * @val: Value passed unmodified to notifier function 428 * @val: Value passed unmodified to notifier function
389 * @v: Pointer passed unmodified to notifier function 429 * @v: Pointer passed unmodified to notifier function
430 * @nr_to_call: See comment for notifier_call_chain.
431 * @nr_calls: See comment for notifier_call_chain
390 * 432 *
391 * Calls each function in a notifier chain in turn. The functions 433 * Calls each function in a notifier chain in turn. The functions
392 * run in an undefined context. 434 * run in an undefined context.
@@ -400,10 +442,19 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
400 * of the last notifier function called. 442 * of the last notifier function called.
401 */ 443 */
402 444
445int __raw_notifier_call_chain(struct raw_notifier_head *nh,
446 unsigned long val, void *v,
447 int nr_to_call, int *nr_calls)
448{
449 return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
450}
451
452EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
453
403int raw_notifier_call_chain(struct raw_notifier_head *nh, 454int raw_notifier_call_chain(struct raw_notifier_head *nh,
404 unsigned long val, void *v) 455 unsigned long val, void *v)
405{ 456{
406 return notifier_call_chain(&nh->head, val, v); 457 return __raw_notifier_call_chain(nh, val, v, -1, NULL);
407} 458}
408 459
409EXPORT_SYMBOL_GPL(raw_notifier_call_chain); 460EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
@@ -478,10 +529,12 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
478EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); 529EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
479 530
480/** 531/**
481 * srcu_notifier_call_chain - Call functions in an SRCU notifier chain 532 * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
482 * @nh: Pointer to head of the SRCU notifier chain 533 * @nh: Pointer to head of the SRCU notifier chain
483 * @val: Value passed unmodified to notifier function 534 * @val: Value passed unmodified to notifier function
484 * @v: Pointer passed unmodified to notifier function 535 * @v: Pointer passed unmodified to notifier function
536 * @nr_to_call: See comment for notifier_call_chain.
537 * @nr_calls: See comment for notifier_call_chain
485 * 538 *
486 * Calls each function in a notifier chain in turn. The functions 539 * Calls each function in a notifier chain in turn. The functions
487 * run in a process context, so they are allowed to block. 540 * run in a process context, so they are allowed to block.
@@ -494,18 +547,25 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
494 * of the last notifier function called. 547 * of the last notifier function called.
495 */ 548 */
496 549
497int srcu_notifier_call_chain(struct srcu_notifier_head *nh, 550int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
498 unsigned long val, void *v) 551 unsigned long val, void *v,
552 int nr_to_call, int *nr_calls)
499{ 553{
500 int ret; 554 int ret;
501 int idx; 555 int idx;
502 556
503 idx = srcu_read_lock(&nh->srcu); 557 idx = srcu_read_lock(&nh->srcu);
504 ret = notifier_call_chain(&nh->head, val, v); 558 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
505 srcu_read_unlock(&nh->srcu, idx); 559 srcu_read_unlock(&nh->srcu, idx);
506 return ret; 560 return ret;
507} 561}
562EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
508 563
564int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
565 unsigned long val, void *v)
566{
567 return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
568}
509EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); 569EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
510 570
511/** 571/**
@@ -881,7 +941,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
881#ifdef CONFIG_SOFTWARE_SUSPEND 941#ifdef CONFIG_SOFTWARE_SUSPEND
882 case LINUX_REBOOT_CMD_SW_SUSPEND: 942 case LINUX_REBOOT_CMD_SW_SUSPEND:
883 { 943 {
884 int ret = pm_suspend(PM_SUSPEND_DISK); 944 int ret = hibernate();
885 unlock_kernel(); 945 unlock_kernel();
886 return ret; 946 return ret;
887 } 947 }
@@ -1292,7 +1352,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
1292} 1352}
1293 1353
1294/* 1354/*
1295 * Samma på svenska.. 1355 * Samma på svenska..
1296 */ 1356 */
1297asmlinkage long sys_setfsgid(gid_t gid) 1357asmlinkage long sys_setfsgid(gid_t gid)
1298{ 1358{
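
The reworked notifier_call_chain() takes an optional limit (nr_to_call) and an optional counter (nr_calls); -1 and NULL keep the old behaviour. That combination lets a caller that failed part-way through a chain re-walk exactly the entries already notified, for instance with a cancel event. A compact userspace sketch (types and names invented):

#include <stdio.h>

struct demo_notifier {
	int (*call)(unsigned long event);
	struct demo_notifier *next;
};

#define DEMO_STOP 0x8000

static int demo_call_chain(struct demo_notifier *nb, unsigned long event,
			   int nr_to_call, int *nr_calls)
{
	int ret = 0;

	while (nb && nr_to_call) {
		ret = nb->call(event);
		if (nr_calls)
			(*nr_calls)++;
		if (ret & DEMO_STOP)
			break;
		nb = nb->next;
		nr_to_call--;
	}
	return ret;
}

static int ok(unsigned long e)   { printf("ok(%lu)\n", e);   return 0; }
static int fail(unsigned long e) { printf("fail(%lu)\n", e); return DEMO_STOP; }

int main(void)
{
	struct demo_notifier c = { ok, NULL };
	struct demo_notifier b = { fail, &c };
	struct demo_notifier a = { ok, &b };
	int called = 0;

	demo_call_chain(&a, 1, -1, &called);	/* stops at 'fail' */
	printf("notified %d entries, rolling them back\n", called);
	demo_call_chain(&a, 2, called, NULL);	/* cancel only those */
	return 0;
}

The -1 convention works because the walker only tests nr_to_call against zero and decrements it, so a negative limit never terminates the loop early.
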
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f0664bd5011c..4073353abd4f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -77,6 +77,7 @@ extern int sysctl_drop_caches;
77extern int percpu_pagelist_fraction; 77extern int percpu_pagelist_fraction;
78extern int compat_log; 78extern int compat_log;
79extern int maps_protect; 79extern int maps_protect;
80extern int sysctl_stat_interval;
80 81
81/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 82/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
82static int maxolduid = 65535; 83static int maxolduid = 65535;
@@ -857,6 +858,17 @@ static ctl_table vm_table[] = {
857 .extra2 = &one_hundred, 858 .extra2 = &one_hundred,
858 }, 859 },
859#endif 860#endif
861#ifdef CONFIG_SMP
862 {
863 .ctl_name = CTL_UNNUMBERED,
864 .procname = "stat_interval",
865 .data = &sysctl_stat_interval,
866 .maxlen = sizeof(sysctl_stat_interval),
867 .mode = 0644,
868 .proc_handler = &proc_dointvec_jiffies,
869 .strategy = &sysctl_jiffies,
870 },
871#endif
860#if defined(CONFIG_X86_32) || \ 872#if defined(CONFIG_X86_32) || \
861 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 873 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
862 { 874 {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index fe5c7db24247..3db5c3c460d7 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -74,15 +74,17 @@ static struct clocksource *watchdog;
74static struct timer_list watchdog_timer; 74static struct timer_list watchdog_timer;
75static DEFINE_SPINLOCK(watchdog_lock); 75static DEFINE_SPINLOCK(watchdog_lock);
76static cycle_t watchdog_last; 76static cycle_t watchdog_last;
77static int watchdog_resumed;
78
77/* 79/*
78 * Interval: 0.5sec Treshold: 0.0625s 80 * Interval: 0.5sec Threshold: 0.0625s
79 */ 81 */
80#define WATCHDOG_INTERVAL (HZ >> 1) 82#define WATCHDOG_INTERVAL (HZ >> 1)
81#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4) 83#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
82 84
83static void clocksource_ratewd(struct clocksource *cs, int64_t delta) 85static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
84{ 86{
85 if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD) 87 if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD)
86 return; 88 return;
87 89
88 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", 90 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
@@ -98,15 +100,26 @@ static void clocksource_watchdog(unsigned long data)
98 struct clocksource *cs, *tmp; 100 struct clocksource *cs, *tmp;
99 cycle_t csnow, wdnow; 101 cycle_t csnow, wdnow;
100 int64_t wd_nsec, cs_nsec; 102 int64_t wd_nsec, cs_nsec;
103 int resumed;
101 104
102 spin_lock(&watchdog_lock); 105 spin_lock(&watchdog_lock);
103 106
107 resumed = watchdog_resumed;
108 if (unlikely(resumed))
109 watchdog_resumed = 0;
110
104 wdnow = watchdog->read(); 111 wdnow = watchdog->read();
105 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 112 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
106 watchdog_last = wdnow; 113 watchdog_last = wdnow;
107 114
108 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 115 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
109 csnow = cs->read(); 116 csnow = cs->read();
117
118 if (unlikely(resumed)) {
119 cs->wd_last = csnow;
120 continue;
121 }
122
110 /* Initialized ? */ 123 /* Initialized ? */
111 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 124 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
112 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && 125 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
@@ -136,6 +149,13 @@ static void clocksource_watchdog(unsigned long data)
136 } 149 }
137 spin_unlock(&watchdog_lock); 150 spin_unlock(&watchdog_lock);
138} 151}
152static void clocksource_resume_watchdog(void)
153{
154 spin_lock(&watchdog_lock);
155 watchdog_resumed = 1;
156 spin_unlock(&watchdog_lock);
157}
158
139static void clocksource_check_watchdog(struct clocksource *cs) 159static void clocksource_check_watchdog(struct clocksource *cs)
140{ 160{
141 struct clocksource *cse; 161 struct clocksource *cse;
@@ -182,9 +202,34 @@ static void clocksource_check_watchdog(struct clocksource *cs)
182 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 202 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
183 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 203 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
184} 204}
205
206static inline void clocksource_resume_watchdog(void) { }
185#endif 207#endif
186 208
187/** 209/**
210 * clocksource_resume - resume the clocksource(s)
211 */
212void clocksource_resume(void)
213{
214 struct list_head *tmp;
215 unsigned long flags;
216
217 spin_lock_irqsave(&clocksource_lock, flags);
218
219 list_for_each(tmp, &clocksource_list) {
220 struct clocksource *cs;
221
222 cs = list_entry(tmp, struct clocksource, list);
223 if (cs->resume)
224 cs->resume();
225 }
226
227 clocksource_resume_watchdog();
228
229 spin_unlock_irqrestore(&clocksource_lock, flags);
230}
231
232/**
188 * clocksource_get_next - Returns the selected clocksource 233 * clocksource_get_next - Returns the selected clocksource
189 * 234 *
190 */ 235 */
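
clocksource_resume() walks the registered sources, invokes any optional ->resume() hook, and flags the watchdog so that its next run only re-bases its last readings instead of treating the whole suspend interval as drift. A userspace sketch of that flow (structures and names invented):

#include <stdio.h>

struct demo_clocksource {
	const char *name;
	void (*resume)(void);
	unsigned long wd_last;
	struct demo_clocksource *next;
};

static struct demo_clocksource *demo_list;
static int demo_watchdog_resumed;

static void demo_clocksource_resume(void)
{
	struct demo_clocksource *cs;

	for (cs = demo_list; cs; cs = cs->next)
		if (cs->resume)
			cs->resume();
	demo_watchdog_resumed = 1;	/* consumed by the next watchdog run */
}

static void demo_watchdog(unsigned long now)
{
	struct demo_clocksource *cs;
	int resumed = demo_watchdog_resumed;

	demo_watchdog_resumed = 0;
	for (cs = demo_list; cs; cs = cs->next) {
		if (resumed) {		/* skip the comparison, just re-base */
			cs->wd_last = now;
			continue;
		}
		printf("%s drift sample: %lu\n", cs->name, now - cs->wd_last);
		cs->wd_last = now;
	}
}

static void backend_resume(void) { puts("counter reprogrammed"); }

int main(void)
{
	struct demo_clocksource src = { "demo-clk", backend_resume, 100, NULL };

	demo_list = &src;
	demo_watchdog(150);
	demo_clocksource_resume();	/* e.g. after suspend-to-RAM */
	demo_watchdog(1000150);		/* no bogus 1000000-unit drift report */
	demo_watchdog(1000200);
	return 0;
}
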
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index b734ca4bc75e..8bbcfb77f7d2 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -65,7 +65,7 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
65 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); 65 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
66#endif 66#endif
67 SEQ_printf(m, "\n"); 67 SEQ_printf(m, "\n");
68 SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", 68 SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n",
69 (unsigned long long)ktime_to_ns(timer->expires), 69 (unsigned long long)ktime_to_ns(timer->expires),
70 (unsigned long long)(ktime_to_ns(timer->expires) - now)); 70 (unsigned long long)(ktime_to_ns(timer->expires) - now));
71} 71}
@@ -111,14 +111,14 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
111{ 111{
112 SEQ_printf(m, " .index: %d\n", 112 SEQ_printf(m, " .index: %d\n",
113 base->index); 113 base->index);
114 SEQ_printf(m, " .resolution: %Ld nsecs\n", 114 SEQ_printf(m, " .resolution: %Lu nsecs\n",
115 (unsigned long long)ktime_to_ns(base->resolution)); 115 (unsigned long long)ktime_to_ns(base->resolution));
116 SEQ_printf(m, " .get_time: "); 116 SEQ_printf(m, " .get_time: ");
117 print_name_offset(m, base->get_time); 117 print_name_offset(m, base->get_time);
118 SEQ_printf(m, "\n"); 118 SEQ_printf(m, "\n");
119#ifdef CONFIG_HIGH_RES_TIMERS 119#ifdef CONFIG_HIGH_RES_TIMERS
120 SEQ_printf(m, " .offset: %Ld nsecs\n", 120 SEQ_printf(m, " .offset: %Lu nsecs\n",
121 ktime_to_ns(base->offset)); 121 (unsigned long long) ktime_to_ns(base->offset));
122#endif 122#endif
123 SEQ_printf(m, "active timers:\n"); 123 SEQ_printf(m, "active timers:\n");
124 print_active_timers(m, base, now); 124 print_active_timers(m, base, now);
@@ -135,10 +135,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
135 print_base(m, cpu_base->clock_base + i, now); 135 print_base(m, cpu_base->clock_base + i, now);
136 } 136 }
137#define P(x) \ 137#define P(x) \
138 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) 138 SEQ_printf(m, " .%-15s: %Lu\n", #x, \
139 (unsigned long long)(cpu_base->x))
139#define P_ns(x) \ 140#define P_ns(x) \
140 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ 141 SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
141 (u64)(ktime_to_ns(cpu_base->x))) 142 (unsigned long long)(ktime_to_ns(cpu_base->x)))
142 143
143#ifdef CONFIG_HIGH_RES_TIMERS 144#ifdef CONFIG_HIGH_RES_TIMERS
144 P_ns(expires_next); 145 P_ns(expires_next);
@@ -150,10 +151,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 151
151#ifdef CONFIG_TICK_ONESHOT 152#ifdef CONFIG_TICK_ONESHOT
152# define P(x) \ 153# define P(x) \
153 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x)) 154 SEQ_printf(m, " .%-15s: %Lu\n", #x, \
155 (unsigned long long)(ts->x))
154# define P_ns(x) \ 156# define P_ns(x) \
155 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ 157 SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
156 (u64)(ktime_to_ns(ts->x))) 158 (unsigned long long)(ktime_to_ns(ts->x)))
157 { 159 {
158 struct tick_sched *ts = tick_get_tick_sched(cpu); 160 struct tick_sched *ts = tick_get_tick_sched(cpu);
159 P(nohz_mode); 161 P(nohz_mode);
@@ -167,7 +169,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
167 P(last_jiffies); 169 P(last_jiffies);
168 P(next_jiffies); 170 P(next_jiffies);
169 P_ns(idle_expires); 171 P_ns(idle_expires);
170 SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); 172 SEQ_printf(m, "jiffies: %Lu\n",
173 (unsigned long long)jiffies);
171 } 174 }
172#endif 175#endif
173 176
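
The timer_list.c hunks switch the statistics output to an unsigned conversion and add explicit unsigned long long casts, the usual recipe for printing a 64-bit quantity whose underlying type differs between architectures. A tiny userspace illustration of the two portable spellings:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t expires = 18446744073709551000ULL;	/* negative if printed signed */

	printf("cast:     %llu nsecs\n", (unsigned long long)expires);
	printf("inttypes: %" PRIu64 " nsecs\n", expires);
	return 0;
}
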
diff --git a/kernel/timer.c b/kernel/timer.c
index 7a6448340f90..59a28b1752f8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -92,24 +92,24 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
92/* Functions below help us manage 'deferrable' flag */ 92/* Functions below help us manage 'deferrable' flag */
93static inline unsigned int tbase_get_deferrable(tvec_base_t *base) 93static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
94{ 94{
95 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); 95 return (unsigned int)((unsigned long)base & TBASE_DEFERRABLE_FLAG);
96} 96}
97 97
98static inline tvec_base_t *tbase_get_base(tvec_base_t *base) 98static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
99{ 99{
100 return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); 100 return (tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG);
101} 101}
102 102
103static inline void timer_set_deferrable(struct timer_list *timer) 103static inline void timer_set_deferrable(struct timer_list *timer)
104{ 104{
105 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | 105 timer->base = (tvec_base_t *)((unsigned long)timer->base |
106 TBASE_DEFERRABLE_FLAG)); 106 TBASE_DEFERRABLE_FLAG);
107} 107}
108 108
109static inline void 109static inline void
110timer_set_base(struct timer_list *timer, tvec_base_t *new_base) 110timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
111{ 111{
112 timer->base = (tvec_base_t *)((unsigned long)(new_base) | 112 timer->base = (tvec_base_t *)((unsigned long)new_base |
113 tbase_get_deferrable(timer->base)); 113 tbase_get_deferrable(timer->base));
114} 114}
115 115
@@ -1293,11 +1293,13 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1293 long cpu = (long)hcpu; 1293 long cpu = (long)hcpu;
1294 switch(action) { 1294 switch(action) {
1295 case CPU_UP_PREPARE: 1295 case CPU_UP_PREPARE:
1296 case CPU_UP_PREPARE_FROZEN:
1296 if (init_timers_cpu(cpu) < 0) 1297 if (init_timers_cpu(cpu) < 0)
1297 return NOTIFY_BAD; 1298 return NOTIFY_BAD;
1298 break; 1299 break;
1299#ifdef CONFIG_HOTPLUG_CPU 1300#ifdef CONFIG_HOTPLUG_CPU
1300 case CPU_DEAD: 1301 case CPU_DEAD:
1302 case CPU_DEAD_FROZEN:
1301 migrate_timers(cpu); 1303 migrate_timers(cpu);
1302 break; 1304 break;
1303#endif 1305#endif
@@ -1497,6 +1499,8 @@ unregister_time_interpolator(struct time_interpolator *ti)
1497 prev = &curr->next; 1499 prev = &curr->next;
1498 } 1500 }
1499 1501
1502 clocksource_resume();
1503
1500 write_seqlock_irqsave(&xtime_lock, flags); 1504 write_seqlock_irqsave(&xtime_lock, flags);
1501 if (ti == time_interpolator) { 1505 if (ti == time_interpolator) {
1502 /* we lost the best time-interpolator: */ 1506 /* we lost the best time-interpolator: */
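
The tvec_base_t helpers cleaned up above keep the "deferrable" flag in the low bit of the base pointer, which works because the base structure is allocated with enough alignment that the bit is otherwise always zero. A generic userspace sketch of that pointer-tagging trick (names invented):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_FLAG	1UL

struct demo_base {
	int dummy;
} __attribute__((aligned(4)));		/* guarantees the low bit is free */

static inline unsigned int demo_get_flag(struct demo_base *tagged)
{
	return (unsigned int)((uintptr_t)tagged & DEMO_FLAG);
}

static inline struct demo_base *demo_get_base(struct demo_base *tagged)
{
	return (struct demo_base *)((uintptr_t)tagged & ~DEMO_FLAG);
}

static inline struct demo_base *demo_set_flag(struct demo_base *base)
{
	return (struct demo_base *)((uintptr_t)base | DEMO_FLAG);
}

int main(void)
{
	static struct demo_base base;
	struct demo_base *tagged = demo_set_flag(&base);

	assert(demo_get_flag(tagged) == 1);
	assert(demo_get_base(tagged) == &base);
	printf("flag=%u base=%p\n", demo_get_flag(tagged),
	       (void *)demo_get_base(tagged));
	return 0;
}
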
diff --git a/kernel/wait.c b/kernel/wait.c
index 59a82f63275d..444ddbfaefc4 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -61,7 +61,7 @@ EXPORT_SYMBOL(remove_wait_queue);
61 * The spin_unlock() itself is semi-permeable and only protects 61 * The spin_unlock() itself is semi-permeable and only protects
62 * one way (it only protects stuff inside the critical region and 62 * one way (it only protects stuff inside the critical region and
63 * stops them from bleeding out - it would still allow subsequent 63 * stops them from bleeding out - it would still allow subsequent
64 * loads to move into the the critical region). 64 * loads to move into the critical region).
65 */ 65 */
66void fastcall 66void fastcall
67prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) 67prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b6fa5e63085d..fb56fedd5c02 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -36,30 +36,20 @@
36/* 36/*
37 * The per-CPU workqueue (if single thread, we always use the first 37 * The per-CPU workqueue (if single thread, we always use the first
38 * possible cpu). 38 * possible cpu).
39 *
40 * The sequence counters are for flush_scheduled_work(). It wants to wait
41 * until all currently-scheduled works are completed, but it doesn't
42 * want to be livelocked by new, incoming ones. So it waits until
43 * remove_sequence is >= the insert_sequence which pertained when
44 * flush_scheduled_work() was called.
45 */ 39 */
46struct cpu_workqueue_struct { 40struct cpu_workqueue_struct {
47 41
48 spinlock_t lock; 42 spinlock_t lock;
49 43
50 long remove_sequence; /* Least-recently added (next to run) */
51 long insert_sequence; /* Next to add */
52
53 struct list_head worklist; 44 struct list_head worklist;
54 wait_queue_head_t more_work; 45 wait_queue_head_t more_work;
55 wait_queue_head_t work_done; 46 struct work_struct *current_work;
56 47
57 struct workqueue_struct *wq; 48 struct workqueue_struct *wq;
58 struct task_struct *thread; 49 struct task_struct *thread;
50 int should_stop;
59 51
60 int run_depth; /* Detect run_workqueue() recursion depth */ 52 int run_depth; /* Detect run_workqueue() recursion depth */
61
62 int freezeable; /* Freeze the thread during suspend */
63} ____cacheline_aligned; 53} ____cacheline_aligned;
64 54
65/* 55/*
@@ -68,8 +58,10 @@ struct cpu_workqueue_struct {
68 */ 58 */
69struct workqueue_struct { 59struct workqueue_struct {
70 struct cpu_workqueue_struct *cpu_wq; 60 struct cpu_workqueue_struct *cpu_wq;
61 struct list_head list;
71 const char *name; 62 const char *name;
72 struct list_head list; /* Empty if single thread */ 63 int singlethread;
64 int freezeable; /* Freeze threads during suspend */
73}; 65};
74 66
75/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove 67/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
@@ -77,106 +69,68 @@ struct workqueue_struct {
77static DEFINE_MUTEX(workqueue_mutex); 69static DEFINE_MUTEX(workqueue_mutex);
78static LIST_HEAD(workqueues); 70static LIST_HEAD(workqueues);
79 71
80static int singlethread_cpu; 72static int singlethread_cpu __read_mostly;
73static cpumask_t cpu_singlethread_map __read_mostly;
74/* optimization, we could use cpu_possible_map */
75static cpumask_t cpu_populated_map __read_mostly;
81 76
82/* If it's single threaded, it isn't in the list of workqueues. */ 77/* If it's single threaded, it isn't in the list of workqueues. */
83static inline int is_single_threaded(struct workqueue_struct *wq) 78static inline int is_single_threaded(struct workqueue_struct *wq)
84{ 79{
85 return list_empty(&wq->list); 80 return wq->singlethread;
81}
82
83static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
84{
85 return is_single_threaded(wq)
86 ? &cpu_singlethread_map : &cpu_populated_map;
87}
88
89static
90struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
91{
92 if (unlikely(is_single_threaded(wq)))
93 cpu = singlethread_cpu;
94 return per_cpu_ptr(wq->cpu_wq, cpu);
86} 95}
87 96
88/* 97/*
89 * Set the workqueue on which a work item is to be run 98 * Set the workqueue on which a work item is to be run
90 * - Must *only* be called if the pending flag is set 99 * - Must *only* be called if the pending flag is set
91 */ 100 */
92static inline void set_wq_data(struct work_struct *work, void *wq) 101static inline void set_wq_data(struct work_struct *work,
102 struct cpu_workqueue_struct *cwq)
93{ 103{
94 unsigned long new; 104 unsigned long new;
95 105
96 BUG_ON(!work_pending(work)); 106 BUG_ON(!work_pending(work));
97 107
98 new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); 108 new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING);
99 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); 109 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
100 atomic_long_set(&work->data, new); 110 atomic_long_set(&work->data, new);
101} 111}
102 112
103static inline void *get_wq_data(struct work_struct *work) 113static inline
114struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
104{ 115{
105 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 116 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
106} 117}
107 118
108static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) 119static void insert_work(struct cpu_workqueue_struct *cwq,
120 struct work_struct *work, int tail)
109{ 121{
110 int ret = 0; 122 set_wq_data(work, cwq);
111 unsigned long flags;
112
113 spin_lock_irqsave(&cwq->lock, flags);
114 /* 123 /*
115 * We need to re-validate the work info after we've gotten 124 * Ensure that we get the right work->data if we see the
116 * the cpu_workqueue lock. We can run the work now iff: 125 * result of list_add() below, see try_to_grab_pending().
117 *
118 * - the wq_data still matches the cpu_workqueue_struct
119 * - AND the work is still marked pending
120 * - AND the work is still on a list (which will be this
121 * workqueue_struct list)
122 *
123 * All these conditions are important, because we
124 * need to protect against the work being run right
125 * now on another CPU (all but the last one might be
126 * true if it's currently running and has not been
127 * released yet, for example).
128 */ 126 */
129 if (get_wq_data(work) == cwq 127 smp_wmb();
130 && work_pending(work) 128 if (tail)
131 && !list_empty(&work->entry)) { 129 list_add_tail(&work->entry, &cwq->worklist);
132 work_func_t f = work->func; 130 else
133 list_del_init(&work->entry); 131 list_add(&work->entry, &cwq->worklist);
134 spin_unlock_irqrestore(&cwq->lock, flags); 132 wake_up(&cwq->more_work);
135
136 if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
137 work_release(work);
138 f(work);
139
140 spin_lock_irqsave(&cwq->lock, flags);
141 cwq->remove_sequence++;
142 wake_up(&cwq->work_done);
143 ret = 1;
144 }
145 spin_unlock_irqrestore(&cwq->lock, flags);
146 return ret;
147}
148
149/**
150 * run_scheduled_work - run scheduled work synchronously
151 * @work: work to run
152 *
153 * This checks if the work was pending, and runs it
154 * synchronously if so. It returns a boolean to indicate
155 * whether it had any scheduled work to run or not.
156 *
157 * NOTE! This _only_ works for normal work_structs. You
158 * CANNOT use this for delayed work, because the wq data
159 * for delayed work will not point properly to the per-
160 * CPU workqueue struct, but will change!
161 */
162int fastcall run_scheduled_work(struct work_struct *work)
163{
164 for (;;) {
165 struct cpu_workqueue_struct *cwq;
166
167 if (!work_pending(work))
168 return 0;
169 if (list_empty(&work->entry))
170 return 0;
171 /* NOTE! This depends intimately on __queue_work! */
172 cwq = get_wq_data(work);
173 if (!cwq)
174 return 0;
175 if (__run_work(cwq, work))
176 return 1;
177 }
178} 133}
179EXPORT_SYMBOL(run_scheduled_work);
180 134
181/* Preempt must be disabled. */ 135/* Preempt must be disabled. */
182static void __queue_work(struct cpu_workqueue_struct *cwq, 136static void __queue_work(struct cpu_workqueue_struct *cwq,
@@ -185,10 +139,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
185 unsigned long flags; 139 unsigned long flags;
186 140
187 spin_lock_irqsave(&cwq->lock, flags); 141 spin_lock_irqsave(&cwq->lock, flags);
188 set_wq_data(work, cwq); 142 insert_work(cwq, work, 1);
189 list_add_tail(&work->entry, &cwq->worklist);
190 cwq->insert_sequence++;
191 wake_up(&cwq->more_work);
192 spin_unlock_irqrestore(&cwq->lock, flags); 143 spin_unlock_irqrestore(&cwq->lock, flags);
193} 144}
194 145
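set_wq_data()/get_wq_data() above keep the cpu_workqueue_struct pointer and the WORK_STRUCT_* flag bits in the single word work->data; this works because the cacheline-aligned cwq leaves the low bits of the pointer zero. A freestanding illustration of that packing trick in plain C (not kernel code; the mask and bit values are placeholders rather than the kernel's WORK_STRUCT_* constants):

#include <stdint.h>
#include <assert.h>

#define MY_PENDING_BIT  0x1UL   /* stands in for WORK_STRUCT_PENDING */
#define MY_FLAG_MASK    0x3UL   /* stands in for WORK_STRUCT_FLAG_MASK */

static uintptr_t pack(void *ptr, int pending)
{
        uintptr_t v = (uintptr_t)ptr;

        assert((v & MY_FLAG_MASK) == 0);        /* pointer must be aligned */
        return v | (pending ? MY_PENDING_BIT : 0);
}

static void *unpack_ptr(uintptr_t v)
{
        return (void *)(v & ~MY_FLAG_MASK);
}

static int unpack_pending(uintptr_t v)
{
        return (int)(v & MY_PENDING_BIT);
}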
@@ -204,16 +155,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
204 */ 155 */
205int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) 156int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
206{ 157{
207 int ret = 0, cpu = get_cpu(); 158 int ret = 0;
208 159
209 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 160 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
210 if (unlikely(is_single_threaded(wq)))
211 cpu = singlethread_cpu;
212 BUG_ON(!list_empty(&work->entry)); 161 BUG_ON(!list_empty(&work->entry));
213 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 162 __queue_work(wq_per_cpu(wq, get_cpu()), work);
163 put_cpu();
214 ret = 1; 164 ret = 1;
215 } 165 }
216 put_cpu();
217 return ret; 166 return ret;
218} 167}
219EXPORT_SYMBOL_GPL(queue_work); 168EXPORT_SYMBOL_GPL(queue_work);
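A minimal call-site sketch for the queue_work() path above, written against the workqueue API of this kernel generation; the my_* identifiers and the queue name are illustrative, not part of the patch:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;

static void my_work_fn(struct work_struct *work)
{
        /* runs in process context on the workqueue's worker thread */
        printk(KERN_DEBUG "my_work_fn ran\n");
}

static DECLARE_WORK(my_work, my_work_fn);

static int __init my_init(void)
{
        my_wq = create_singlethread_workqueue("my_wq");
        if (!my_wq)
                return -ENOMEM;

        queue_work(my_wq, &my_work);    /* returns 0 if already pending */
        return 0;
}

static void __exit my_exit(void)
{
        destroy_workqueue(my_wq);       /* runs any still-pending work first */
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");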
@@ -221,13 +170,10 @@ EXPORT_SYMBOL_GPL(queue_work);
221void delayed_work_timer_fn(unsigned long __data) 170void delayed_work_timer_fn(unsigned long __data)
222{ 171{
223 struct delayed_work *dwork = (struct delayed_work *)__data; 172 struct delayed_work *dwork = (struct delayed_work *)__data;
224 struct workqueue_struct *wq = get_wq_data(&dwork->work); 173 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
225 int cpu = smp_processor_id(); 174 struct workqueue_struct *wq = cwq->wq;
226 175
227 if (unlikely(is_single_threaded(wq))) 176 __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);
228 cpu = singlethread_cpu;
229
230 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
231} 177}
232 178
233/** 179/**
@@ -241,27 +187,11 @@ void delayed_work_timer_fn(unsigned long __data)
241int fastcall queue_delayed_work(struct workqueue_struct *wq, 187int fastcall queue_delayed_work(struct workqueue_struct *wq,
242 struct delayed_work *dwork, unsigned long delay) 188 struct delayed_work *dwork, unsigned long delay)
243{ 189{
244 int ret = 0; 190 timer_stats_timer_set_start_info(&dwork->timer);
245 struct timer_list *timer = &dwork->timer;
246 struct work_struct *work = &dwork->work;
247
248 timer_stats_timer_set_start_info(timer);
249 if (delay == 0) 191 if (delay == 0)
250 return queue_work(wq, work); 192 return queue_work(wq, &dwork->work);
251
252 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
253 BUG_ON(timer_pending(timer));
254 BUG_ON(!list_empty(&work->entry));
255 193
256 /* This stores wq for the moment, for the timer_fn */ 194 return queue_delayed_work_on(-1, wq, dwork, delay);
257 set_wq_data(work, wq);
258 timer->expires = jiffies + delay;
259 timer->data = (unsigned long)dwork;
260 timer->function = delayed_work_timer_fn;
261 add_timer(timer);
262 ret = 1;
263 }
264 return ret;
265} 195}
266EXPORT_SYMBOL_GPL(queue_delayed_work); 196EXPORT_SYMBOL_GPL(queue_delayed_work);
267 197
@@ -285,12 +215,16 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
285 BUG_ON(timer_pending(timer)); 215 BUG_ON(timer_pending(timer));
286 BUG_ON(!list_empty(&work->entry)); 216 BUG_ON(!list_empty(&work->entry));
287 217
288 /* This stores wq for the moment, for the timer_fn */ 218 /* This stores cwq for the moment, for the timer_fn */
289 set_wq_data(work, wq); 219 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
290 timer->expires = jiffies + delay; 220 timer->expires = jiffies + delay;
291 timer->data = (unsigned long)dwork; 221 timer->data = (unsigned long)dwork;
292 timer->function = delayed_work_timer_fn; 222 timer->function = delayed_work_timer_fn;
293 add_timer_on(timer, cpu); 223
224 if (unlikely(cpu >= 0))
225 add_timer_on(timer, cpu);
226 else
227 add_timer(timer);
294 ret = 1; 228 ret = 1;
295 } 229 }
296 return ret; 230 return ret;
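With the changes above, queue_delayed_work() is a thin wrapper that forwards to queue_delayed_work_on(-1, ...), and a negative CPU now means "no CPU preference", taking the add_timer() branch instead of add_timer_on(). A short call-site sketch for the delayed path (illustrative names; the two-second delay is arbitrary):

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void my_delayed_fn(struct work_struct *work)
{
        /* runs roughly two seconds after my_start(), in process context */
}

static DECLARE_DELAYED_WORK(my_dwork, my_delayed_fn);

static void my_start(struct workqueue_struct *wq)
{
        /* returns 0 if the delayed work was already pending */
        queue_delayed_work(wq, &my_dwork, msecs_to_jiffies(2000));
}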
@@ -299,13 +233,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
299 233
300static void run_workqueue(struct cpu_workqueue_struct *cwq) 234static void run_workqueue(struct cpu_workqueue_struct *cwq)
301{ 235{
302 unsigned long flags; 236 spin_lock_irq(&cwq->lock);
303
304 /*
305 * Keep taking off work from the queue until
306 * done.
307 */
308 spin_lock_irqsave(&cwq->lock, flags);
309 cwq->run_depth++; 237 cwq->run_depth++;
310 if (cwq->run_depth > 3) { 238 if (cwq->run_depth > 3) {
311 /* morton gets to eat his hat */ 239 /* morton gets to eat his hat */
@@ -318,12 +246,12 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
318 struct work_struct, entry); 246 struct work_struct, entry);
319 work_func_t f = work->func; 247 work_func_t f = work->func;
320 248
249 cwq->current_work = work;
321 list_del_init(cwq->worklist.next); 250 list_del_init(cwq->worklist.next);
322 spin_unlock_irqrestore(&cwq->lock, flags); 251 spin_unlock_irq(&cwq->lock);
323 252
324 BUG_ON(get_wq_data(work) != cwq); 253 BUG_ON(get_wq_data(work) != cwq);
325 if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) 254 work_clear_pending(work);
326 work_release(work);
327 f(work); 255 f(work);
328 256
329 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 257 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
@@ -337,63 +265,81 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
337 dump_stack(); 265 dump_stack();
338 } 266 }
339 267
340 spin_lock_irqsave(&cwq->lock, flags); 268 spin_lock_irq(&cwq->lock);
341 cwq->remove_sequence++; 269 cwq->current_work = NULL;
342 wake_up(&cwq->work_done);
343 } 270 }
344 cwq->run_depth--; 271 cwq->run_depth--;
345 spin_unlock_irqrestore(&cwq->lock, flags); 272 spin_unlock_irq(&cwq->lock);
273}
274
275/*
276 * NOTE: the caller must not touch *cwq if this func returns true
277 */
278static int cwq_should_stop(struct cpu_workqueue_struct *cwq)
279{
280 int should_stop = cwq->should_stop;
281
282 if (unlikely(should_stop)) {
283 spin_lock_irq(&cwq->lock);
284 should_stop = cwq->should_stop && list_empty(&cwq->worklist);
285 if (should_stop)
286 cwq->thread = NULL;
287 spin_unlock_irq(&cwq->lock);
288 }
289
290 return should_stop;
346} 291}
347 292
348static int worker_thread(void *__cwq) 293static int worker_thread(void *__cwq)
349{ 294{
350 struct cpu_workqueue_struct *cwq = __cwq; 295 struct cpu_workqueue_struct *cwq = __cwq;
351 DECLARE_WAITQUEUE(wait, current); 296 DEFINE_WAIT(wait);
352 struct k_sigaction sa;
353 sigset_t blocked;
354 297
355 if (!cwq->freezeable) 298 if (!cwq->wq->freezeable)
356 current->flags |= PF_NOFREEZE; 299 current->flags |= PF_NOFREEZE;
357 300
358 set_user_nice(current, -5); 301 set_user_nice(current, -5);
359 302
360 /* Block and flush all signals */ 303 for (;;) {
361 sigfillset(&blocked); 304 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
362 sigprocmask(SIG_BLOCK, &blocked, NULL); 305 if (!freezing(current) && !cwq->should_stop
363 flush_signals(current); 306 && list_empty(&cwq->worklist))
364 307 schedule();
365 /* 308 finish_wait(&cwq->more_work, &wait);
366 * We inherited MPOL_INTERLEAVE from the booting kernel.
367 * Set MPOL_DEFAULT to insure node local allocations.
368 */
369 numa_default_policy();
370
371 /* SIG_IGN makes children autoreap: see do_notify_parent(). */
372 sa.sa.sa_handler = SIG_IGN;
373 sa.sa.sa_flags = 0;
374 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
375 do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
376 309
377 set_current_state(TASK_INTERRUPTIBLE); 310 try_to_freeze();
378 while (!kthread_should_stop()) {
379 if (cwq->freezeable)
380 try_to_freeze();
381 311
382 add_wait_queue(&cwq->more_work, &wait); 312 if (cwq_should_stop(cwq))
383 if (list_empty(&cwq->worklist)) 313 break;
384 schedule();
385 else
386 __set_current_state(TASK_RUNNING);
387 remove_wait_queue(&cwq->more_work, &wait);
388 314
389 if (!list_empty(&cwq->worklist)) 315 run_workqueue(cwq);
390 run_workqueue(cwq);
391 set_current_state(TASK_INTERRUPTIBLE);
392 } 316 }
393 __set_current_state(TASK_RUNNING); 317
394 return 0; 318 return 0;
395} 319}
396 320
321struct wq_barrier {
322 struct work_struct work;
323 struct completion done;
324};
325
326static void wq_barrier_func(struct work_struct *work)
327{
328 struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
329 complete(&barr->done);
330}
331
332static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
333 struct wq_barrier *barr, int tail)
334{
335 INIT_WORK(&barr->work, wq_barrier_func);
336 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
337
338 init_completion(&barr->done);
339
340 insert_work(cwq, &barr->work, tail);
341}
342
397static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 343static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
398{ 344{
399 if (cwq->thread == current) { 345 if (cwq->thread == current) {
@@ -403,21 +349,18 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
403 */ 349 */
404 run_workqueue(cwq); 350 run_workqueue(cwq);
405 } else { 351 } else {
406 DEFINE_WAIT(wait); 352 struct wq_barrier barr;
407 long sequence_needed; 353 int active = 0;
408 354
409 spin_lock_irq(&cwq->lock); 355 spin_lock_irq(&cwq->lock);
410 sequence_needed = cwq->insert_sequence; 356 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
411 357 insert_wq_barrier(cwq, &barr, 1);
412 while (sequence_needed - cwq->remove_sequence > 0) { 358 active = 1;
413 prepare_to_wait(&cwq->work_done, &wait,
414 TASK_UNINTERRUPTIBLE);
415 spin_unlock_irq(&cwq->lock);
416 schedule();
417 spin_lock_irq(&cwq->lock);
418 } 359 }
419 finish_wait(&cwq->work_done, &wait);
420 spin_unlock_irq(&cwq->lock); 360 spin_unlock_irq(&cwq->lock);
361
362 if (active)
363 wait_for_completion(&barr.done);
421 } 364 }
422} 365}
423 366
@@ -428,151 +371,145 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
428 * Forces execution of the workqueue and blocks until its completion. 371 * Forces execution of the workqueue and blocks until its completion.
429 * This is typically used in driver shutdown handlers. 372 * This is typically used in driver shutdown handlers.
430 * 373 *
431 * This function will sample each workqueue's current insert_sequence number and 374 * We sleep until all works which were queued on entry have been handled,
432 * will sleep until the head sequence is greater than or equal to that. This 375 * but we are not livelocked by new incoming ones.
433 * means that we sleep until all works which were queued on entry have been
434 * handled, but we are not livelocked by new incoming ones.
435 * 376 *
436 * This function used to run the workqueues itself. Now we just wait for the 377 * This function used to run the workqueues itself. Now we just wait for the
437 * helper threads to do it. 378 * helper threads to do it.
438 */ 379 */
439void fastcall flush_workqueue(struct workqueue_struct *wq) 380void fastcall flush_workqueue(struct workqueue_struct *wq)
440{ 381{
382 const cpumask_t *cpu_map = wq_cpu_map(wq);
383 int cpu;
384
441 might_sleep(); 385 might_sleep();
386 for_each_cpu_mask(cpu, *cpu_map)
387 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
388}
389EXPORT_SYMBOL_GPL(flush_workqueue);
442 390
443 if (is_single_threaded(wq)) { 391/*
444 /* Always use first cpu's area. */ 392 * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit,
445 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); 393 * so this work can't be re-armed in any way.
446 } else { 394 */
447 int cpu; 395static int try_to_grab_pending(struct work_struct *work)
396{
397 struct cpu_workqueue_struct *cwq;
398 int ret = 0;
448 399
449 mutex_lock(&workqueue_mutex); 400 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
450 for_each_online_cpu(cpu) 401 return 1;
451 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 402
452 mutex_unlock(&workqueue_mutex); 403 /*
404 * The queueing is in progress, or it is already queued. Try to
405 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
406 */
407
408 cwq = get_wq_data(work);
409 if (!cwq)
410 return ret;
411
412 spin_lock_irq(&cwq->lock);
413 if (!list_empty(&work->entry)) {
414 /*
415 * This work is queued, but perhaps we locked the wrong cwq.
416 * In that case we must see the new value after rmb(), see
417 * insert_work()->wmb().
418 */
419 smp_rmb();
420 if (cwq == get_wq_data(work)) {
421 list_del_init(&work->entry);
422 ret = 1;
423 }
453 } 424 }
425 spin_unlock_irq(&cwq->lock);
426
427 return ret;
454} 428}
455EXPORT_SYMBOL_GPL(flush_workqueue);
456 429
457static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 430static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
458 int cpu, int freezeable) 431 struct work_struct *work)
459{ 432{
460 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 433 struct wq_barrier barr;
461 struct task_struct *p; 434 int running = 0;
462 435
463 spin_lock_init(&cwq->lock); 436 spin_lock_irq(&cwq->lock);
464 cwq->wq = wq; 437 if (unlikely(cwq->current_work == work)) {
465 cwq->thread = NULL; 438 insert_wq_barrier(cwq, &barr, 0);
466 cwq->insert_sequence = 0; 439 running = 1;
467 cwq->remove_sequence = 0; 440 }
468 cwq->freezeable = freezeable; 441 spin_unlock_irq(&cwq->lock);
469 INIT_LIST_HEAD(&cwq->worklist);
470 init_waitqueue_head(&cwq->more_work);
471 init_waitqueue_head(&cwq->work_done);
472 442
473 if (is_single_threaded(wq)) 443 if (unlikely(running))
474 p = kthread_create(worker_thread, cwq, "%s", wq->name); 444 wait_for_completion(&barr.done);
475 else
476 p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);
477 if (IS_ERR(p))
478 return NULL;
479 cwq->thread = p;
480 return p;
481} 445}
482 446
483struct workqueue_struct *__create_workqueue(const char *name, 447static void wait_on_work(struct work_struct *work)
484 int singlethread, int freezeable)
485{ 448{
486 int cpu, destroy = 0; 449 struct cpu_workqueue_struct *cwq;
487 struct workqueue_struct *wq; 450 struct workqueue_struct *wq;
488 struct task_struct *p; 451 const cpumask_t *cpu_map;
452 int cpu;
489 453
490 wq = kzalloc(sizeof(*wq), GFP_KERNEL); 454 might_sleep();
491 if (!wq)
492 return NULL;
493 455
494 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); 456 cwq = get_wq_data(work);
495 if (!wq->cpu_wq) { 457 if (!cwq)
496 kfree(wq); 458 return;
497 return NULL;
498 }
499 459
500 wq->name = name; 460 wq = cwq->wq;
501 mutex_lock(&workqueue_mutex); 461 cpu_map = wq_cpu_map(wq);
502 if (singlethread) {
503 INIT_LIST_HEAD(&wq->list);
504 p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
505 if (!p)
506 destroy = 1;
507 else
508 wake_up_process(p);
509 } else {
510 list_add(&wq->list, &workqueues);
511 for_each_online_cpu(cpu) {
512 p = create_workqueue_thread(wq, cpu, freezeable);
513 if (p) {
514 kthread_bind(p, cpu);
515 wake_up_process(p);
516 } else
517 destroy = 1;
518 }
519 }
520 mutex_unlock(&workqueue_mutex);
521 462
522 /* 463 for_each_cpu_mask(cpu, *cpu_map)
523 * Was there any error during startup? If yes then clean up: 464 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
524 */
525 if (destroy) {
526 destroy_workqueue(wq);
527 wq = NULL;
528 }
529 return wq;
530} 465}
531EXPORT_SYMBOL_GPL(__create_workqueue);
532 466
533static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) 467/**
468 * cancel_work_sync - block until a work_struct's callback has terminated
469 * @work: the work which is to be flushed
470 *
471 * cancel_work_sync() will cancel the work if it is queued. If the work's
472 * callback appears to be running, cancel_work_sync() will block until it
473 * has completed.
474 *
475 * It is possible to use this function if the work re-queues itself. It can
476 * cancel the work even if it migrates to another workqueue, however in that
477 * case it only guarantees that work->func() has completed on the last queued
478 * workqueue.
479 *
480 * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
481 * pending, otherwise it goes into a busy-wait loop until the timer expires.
482 *
483 * The caller must ensure that workqueue_struct on which this work was last
484 * queued can't be destroyed before this function returns.
485 */
486void cancel_work_sync(struct work_struct *work)
534{ 487{
535 struct cpu_workqueue_struct *cwq; 488 while (!try_to_grab_pending(work))
536 unsigned long flags; 489 cpu_relax();
537 struct task_struct *p; 490 wait_on_work(work);
538 491 work_clear_pending(work);
539 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
540 spin_lock_irqsave(&cwq->lock, flags);
541 p = cwq->thread;
542 cwq->thread = NULL;
543 spin_unlock_irqrestore(&cwq->lock, flags);
544 if (p)
545 kthread_stop(p);
546} 492}
493EXPORT_SYMBOL_GPL(cancel_work_sync);
547 494
548/** 495/**
549 * destroy_workqueue - safely terminate a workqueue 496 * cancel_rearming_delayed_work - reliably kill off a delayed work.
550 * @wq: target workqueue 497 * @dwork: the delayed work struct
551 * 498 *
552 * Safely destroy a workqueue. All work currently pending will be done first. 499 * It is possible to use this function if @dwork rearms itself via queue_work()
500 * or queue_delayed_work(). See also the comment for cancel_work_sync().
553 */ 501 */
554void destroy_workqueue(struct workqueue_struct *wq) 502void cancel_rearming_delayed_work(struct delayed_work *dwork)
555{ 503{
556 int cpu; 504 while (!del_timer(&dwork->timer) &&
557 505 !try_to_grab_pending(&dwork->work))
558 flush_workqueue(wq); 506 cpu_relax();
559 507 wait_on_work(&dwork->work);
560 /* We don't need the distraction of CPUs appearing and vanishing. */ 508 work_clear_pending(&dwork->work);
561 mutex_lock(&workqueue_mutex);
562 if (is_single_threaded(wq))
563 cleanup_workqueue_thread(wq, singlethread_cpu);
564 else {
565 for_each_online_cpu(cpu)
566 cleanup_workqueue_thread(wq, cpu);
567 list_del(&wq->list);
568 }
569 mutex_unlock(&workqueue_mutex);
570 free_percpu(wq->cpu_wq);
571 kfree(wq);
572} 509}
573EXPORT_SYMBOL_GPL(destroy_workqueue); 510EXPORT_SYMBOL(cancel_rearming_delayed_work);
574 511
575static struct workqueue_struct *keventd_wq; 512static struct workqueue_struct *keventd_wq __read_mostly;
576 513
577/** 514/**
578 * schedule_work - put work task in global workqueue 515 * schedule_work - put work task in global workqueue
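Putting the new cancellation primitives together: the kernel-doc above requires that the workqueue outlive cancel_work_sync(), so a teardown path cancels first and destroys the queue last. A sketch of a module-exit sequence, reusing the illustrative my_wq/my_work/my_dwork names from the earlier sketches:

static void my_teardown(void)
{
        /*
         * Grab the PENDING bit so my_work cannot be re-queued, and wait
         * for a callback that is already running to finish.
         */
        cancel_work_sync(&my_work);

        /*
         * For the delayed, possibly self-rearming item, use the new
         * cancel_rearming_delayed_work(), which kills the timer (or
         * steals the pending work) and then waits, per its
         * implementation above.
         */
        cancel_rearming_delayed_work(&my_dwork);

        /* Only now is it safe to destroy the workqueue itself. */
        destroy_workqueue(my_wq);
}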
@@ -638,7 +575,7 @@ int schedule_on_each_cpu(work_func_t func)
638 if (!works) 575 if (!works)
639 return -ENOMEM; 576 return -ENOMEM;
640 577
641 mutex_lock(&workqueue_mutex); 578 preempt_disable(); /* CPU hotplug */
642 for_each_online_cpu(cpu) { 579 for_each_online_cpu(cpu) {
643 struct work_struct *work = per_cpu_ptr(works, cpu); 580 struct work_struct *work = per_cpu_ptr(works, cpu);
644 581
@@ -646,7 +583,7 @@ int schedule_on_each_cpu(work_func_t func)
646 set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); 583 set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
647 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); 584 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
648 } 585 }
649 mutex_unlock(&workqueue_mutex); 586 preempt_enable();
650 flush_workqueue(keventd_wq); 587 flush_workqueue(keventd_wq);
651 free_percpu(works); 588 free_percpu(works);
652 return 0; 589 return 0;
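schedule_on_each_cpu() above now pins the caller with preempt_disable() while it queues, instead of taking workqueue_mutex; the call-site contract is unchanged, since the final flush_workqueue() still makes it return only after the function has completed on every online CPU. A minimal usage sketch (the callback name is illustrative):

#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static void my_percpu_fn(struct work_struct *unused)
{
        /* each keventd worker is bound to its CPU, so this is stable */
        printk(KERN_INFO "my_percpu_fn on cpu %d\n", smp_processor_id());
}

static int my_run_on_all_cpus(void)
{
        /* blocks until my_percpu_fn has completed on every online CPU */
        return schedule_on_each_cpu(my_percpu_fn);
}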
@@ -659,29 +596,6 @@ void flush_scheduled_work(void)
659EXPORT_SYMBOL(flush_scheduled_work); 596EXPORT_SYMBOL(flush_scheduled_work);
660 597
661/** 598/**
662 * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work.
663 * @wq: the controlling workqueue structure
664 * @dwork: the delayed work struct
665 */
666void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
667 struct delayed_work *dwork)
668{
669 while (!cancel_delayed_work(dwork))
670 flush_workqueue(wq);
671}
672EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
673
674/**
675 * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work.
676 * @dwork: the delayed work struct
677 */
678void cancel_rearming_delayed_work(struct delayed_work *dwork)
679{
680 cancel_rearming_delayed_workqueue(keventd_wq, dwork);
681}
682EXPORT_SYMBOL(cancel_rearming_delayed_work);
683
684/**
685 * execute_in_process_context - reliably execute the routine with user context 599 * execute_in_process_context - reliably execute the routine with user context
686 * @fn: the function to execute 600 * @fn: the function to execute
687 * @ew: guaranteed storage for the execute work structure (must 601 * @ew: guaranteed storage for the execute work structure (must
@@ -728,94 +642,209 @@ int current_is_keventd(void)
728 642
729} 643}
730 644
731/* Take the work from this (downed) CPU. */ 645static struct cpu_workqueue_struct *
732static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 646init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
733{ 647{
734 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 648 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
735 struct list_head list;
736 struct work_struct *work;
737 649
738 spin_lock_irq(&cwq->lock); 650 cwq->wq = wq;
739 list_replace_init(&cwq->worklist, &list); 651 spin_lock_init(&cwq->lock);
652 INIT_LIST_HEAD(&cwq->worklist);
653 init_waitqueue_head(&cwq->more_work);
654
655 return cwq;
656}
657
658static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
659{
660 struct workqueue_struct *wq = cwq->wq;
661 const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
662 struct task_struct *p;
663
664 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
665 /*
666 * Nobody can add the work_struct to this cwq,
667 * if (caller is __create_workqueue)
668 * nobody should see this wq
669 * else // caller is CPU_UP_PREPARE
670 * cpu is not on cpu_online_map
671 * so we can abort safely.
672 */
673 if (IS_ERR(p))
674 return PTR_ERR(p);
675
676 cwq->thread = p;
677 cwq->should_stop = 0;
678
679 return 0;
680}
681
682static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
683{
684 struct task_struct *p = cwq->thread;
740 685
741 while (!list_empty(&list)) { 686 if (p != NULL) {
742 printk("Taking work for %s\n", wq->name); 687 if (cpu >= 0)
743 work = list_entry(list.next,struct work_struct,entry); 688 kthread_bind(p, cpu);
744 list_del(&work->entry); 689 wake_up_process(p);
745 __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work);
746 } 690 }
747 spin_unlock_irq(&cwq->lock);
748} 691}
749 692
750/* We're holding the cpucontrol mutex here */ 693struct workqueue_struct *__create_workqueue(const char *name,
751static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, 694 int singlethread, int freezeable)
752 unsigned long action,
753 void *hcpu)
754{ 695{
755 unsigned int hotcpu = (unsigned long)hcpu;
756 struct workqueue_struct *wq; 696 struct workqueue_struct *wq;
697 struct cpu_workqueue_struct *cwq;
698 int err = 0, cpu;
757 699
758 switch (action) { 700 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
759 case CPU_UP_PREPARE: 701 if (!wq)
760 mutex_lock(&workqueue_mutex); 702 return NULL;
761 /* Create a new workqueue thread for it. */
762 list_for_each_entry(wq, &workqueues, list) {
763 if (!create_workqueue_thread(wq, hotcpu, 0)) {
764 printk("workqueue for %i failed\n", hotcpu);
765 return NOTIFY_BAD;
766 }
767 }
768 break;
769 703
770 case CPU_ONLINE: 704 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
771 /* Kick off worker threads. */ 705 if (!wq->cpu_wq) {
772 list_for_each_entry(wq, &workqueues, list) { 706 kfree(wq);
773 struct cpu_workqueue_struct *cwq; 707 return NULL;
708 }
774 709
775 cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); 710 wq->name = name;
776 kthread_bind(cwq->thread, hotcpu); 711 wq->singlethread = singlethread;
777 wake_up_process(cwq->thread); 712 wq->freezeable = freezeable;
778 } 713 INIT_LIST_HEAD(&wq->list);
779 mutex_unlock(&workqueue_mutex);
780 break;
781 714
782 case CPU_UP_CANCELED: 715 if (singlethread) {
783 list_for_each_entry(wq, &workqueues, list) { 716 cwq = init_cpu_workqueue(wq, singlethread_cpu);
784 if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) 717 err = create_workqueue_thread(cwq, singlethread_cpu);
718 start_workqueue_thread(cwq, -1);
719 } else {
720 mutex_lock(&workqueue_mutex);
721 list_add(&wq->list, &workqueues);
722
723 for_each_possible_cpu(cpu) {
724 cwq = init_cpu_workqueue(wq, cpu);
725 if (err || !cpu_online(cpu))
785 continue; 726 continue;
786 /* Unbind so it can run. */ 727 err = create_workqueue_thread(cwq, cpu);
787 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, 728 start_workqueue_thread(cwq, cpu);
788 any_online_cpu(cpu_online_map));
789 cleanup_workqueue_thread(wq, hotcpu);
790 } 729 }
791 mutex_unlock(&workqueue_mutex); 730 mutex_unlock(&workqueue_mutex);
792 break; 731 }
732
733 if (err) {
734 destroy_workqueue(wq);
735 wq = NULL;
736 }
737 return wq;
738}
739EXPORT_SYMBOL_GPL(__create_workqueue);
740
741static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
742{
743 struct wq_barrier barr;
744 int alive = 0;
745
746 spin_lock_irq(&cwq->lock);
747 if (cwq->thread != NULL) {
748 insert_wq_barrier(cwq, &barr, 1);
749 cwq->should_stop = 1;
750 alive = 1;
751 }
752 spin_unlock_irq(&cwq->lock);
753
754 if (alive) {
755 wait_for_completion(&barr.done);
793 756
794 case CPU_DOWN_PREPARE: 757 while (unlikely(cwq->thread != NULL))
758 cpu_relax();
759 /*
760 * Wait until cwq->thread unlocks cwq->lock,
761 * it won't touch *cwq after that.
762 */
763 smp_rmb();
764 spin_unlock_wait(&cwq->lock);
765 }
766}
767
768/**
769 * destroy_workqueue - safely terminate a workqueue
770 * @wq: target workqueue
771 *
772 * Safely destroy a workqueue. All work currently pending will be done first.
773 */
774void destroy_workqueue(struct workqueue_struct *wq)
775{
776 const cpumask_t *cpu_map = wq_cpu_map(wq);
777 struct cpu_workqueue_struct *cwq;
778 int cpu;
779
780 mutex_lock(&workqueue_mutex);
781 list_del(&wq->list);
782 mutex_unlock(&workqueue_mutex);
783
784 for_each_cpu_mask(cpu, *cpu_map) {
785 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
786 cleanup_workqueue_thread(cwq, cpu);
787 }
788
789 free_percpu(wq->cpu_wq);
790 kfree(wq);
791}
792EXPORT_SYMBOL_GPL(destroy_workqueue);
793
794static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
795 unsigned long action,
796 void *hcpu)
797{
798 unsigned int cpu = (unsigned long)hcpu;
799 struct cpu_workqueue_struct *cwq;
800 struct workqueue_struct *wq;
801
802 action &= ~CPU_TASKS_FROZEN;
803
804 switch (action) {
805 case CPU_LOCK_ACQUIRE:
795 mutex_lock(&workqueue_mutex); 806 mutex_lock(&workqueue_mutex);
796 break; 807 return NOTIFY_OK;
797 808
798 case CPU_DOWN_FAILED: 809 case CPU_LOCK_RELEASE:
799 mutex_unlock(&workqueue_mutex); 810 mutex_unlock(&workqueue_mutex);
800 break; 811 return NOTIFY_OK;
801 812
802 case CPU_DEAD: 813 case CPU_UP_PREPARE:
803 list_for_each_entry(wq, &workqueues, list) 814 cpu_set(cpu, cpu_populated_map);
804 cleanup_workqueue_thread(wq, hotcpu); 815 }
805 list_for_each_entry(wq, &workqueues, list) 816
806 take_over_work(wq, hotcpu); 817 list_for_each_entry(wq, &workqueues, list) {
807 mutex_unlock(&workqueue_mutex); 818 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
808 break; 819
820 switch (action) {
821 case CPU_UP_PREPARE:
822 if (!create_workqueue_thread(cwq, cpu))
823 break;
824 printk(KERN_ERR "workqueue for %i failed\n", cpu);
825 return NOTIFY_BAD;
826
827 case CPU_ONLINE:
828 start_workqueue_thread(cwq, cpu);
829 break;
830
831 case CPU_UP_CANCELED:
832 start_workqueue_thread(cwq, -1);
833 case CPU_DEAD:
834 cleanup_workqueue_thread(cwq, cpu);
835 break;
836 }
809 } 837 }
810 838
811 return NOTIFY_OK; 839 return NOTIFY_OK;
812} 840}
813 841
814void init_workqueues(void) 842void __init init_workqueues(void)
815{ 843{
844 cpu_populated_map = cpu_online_map;
816 singlethread_cpu = first_cpu(cpu_possible_map); 845 singlethread_cpu = first_cpu(cpu_possible_map);
846 cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu);
817 hotcpu_notifier(workqueue_cpu_callback, 0); 847 hotcpu_notifier(workqueue_cpu_callback, 0);
818 keventd_wq = create_workqueue("events"); 848 keventd_wq = create_workqueue("events");
819 BUG_ON(!keventd_wq); 849 BUG_ON(!keventd_wq);
820} 850}
821