Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz           |  20
-rw-r--r--  kernel/acct.c               |   3
-rw-r--r--  kernel/audit.c              |   1
-rw-r--r--  kernel/auditfilter.c        |   3
-rw-r--r--  kernel/configs.c            |   2
-rw-r--r--  kernel/cpu.c                |   6
-rw-r--r--  kernel/cpuset.c             |  22
-rw-r--r--  kernel/delayacct.c          |   4
-rw-r--r--  kernel/dma.c                |   2
-rw-r--r--  kernel/exit.c               |   8
-rw-r--r--  kernel/fork.c               |  42
-rw-r--r--  kernel/futex.c              |  45
-rw-r--r--  kernel/irq/handle.c         |   2
-rw-r--r--  kernel/kallsyms.c           |  17
-rw-r--r--  kernel/kexec.c              |  59
-rw-r--r--  kernel/kmod.c               |  16
-rw-r--r--  kernel/kprobes.c            | 117
-rw-r--r--  kernel/kthread.c            |  13
-rw-r--r--  kernel/lockdep.c            |  48
-rw-r--r--  kernel/lockdep_internals.h  |   2
-rw-r--r--  kernel/lockdep_proc.c       |   6
-rw-r--r--  kernel/module.c             |   2
-rw-r--r--  kernel/mutex-debug.c        |   3
-rw-r--r--  kernel/pid.c                |   2
-rw-r--r--  kernel/posix-timers.c       |   2
-rw-r--r--  kernel/power/Kconfig        |   2
-rw-r--r--  kernel/power/disk.c         |  66
-rw-r--r--  kernel/power/main.c         |  14
-rw-r--r--  kernel/power/power.h        |  32
-rw-r--r--  kernel/power/poweroff.c     |   4
-rw-r--r--  kernel/power/process.c      | 130
-rw-r--r--  kernel/power/snapshot.c     | 860
-rw-r--r--  kernel/power/swap.c         | 347
-rw-r--r--  kernel/power/swsusp.c       |  98
-rw-r--r--  kernel/power/user.c         | 102
-rw-r--r--  kernel/printk.c             |  24
-rw-r--r--  kernel/profile.c            |  47
-rw-r--r--  kernel/rcupdate.c           |   4
-rw-r--r--  kernel/rcutorture.c         |   4
-rw-r--r--  kernel/relay.c              |  12
-rw-r--r--  kernel/resource.c           |   6
-rw-r--r--  kernel/rtmutex-tester.c     |   1
-rw-r--r--  kernel/sched.c              |  39
-rw-r--r--  kernel/signal.c             |   6
-rw-r--r--  kernel/softirq.c            |   2
-rw-r--r--  kernel/sys.c                |   8
-rw-r--r--  kernel/sysctl.c             |  17
-rw-r--r--  kernel/taskstats.c          | 169
-rw-r--r--  kernel/user.c               |   4
-rw-r--r--  kernel/workqueue.c          | 144
50 files changed, 1729 insertions, 860 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 248e1c396f8b..4af15802ccd4 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -7,7 +7,7 @@ choice
 	default HZ_250
 	help
 	 Allows the configuration of the timer frequency. It is customary
-	 to have the timer interrupt run at 1000 HZ but 100 HZ may be more
+	 to have the timer interrupt run at 1000 Hz but 100 Hz may be more
 	 beneficial for servers and NUMA systems that do not need to have
 	 a fast response for user interaction and that may experience bus
 	 contention and cacheline bounces as a result of timer interrupts.
@@ -19,21 +19,30 @@ choice
 	config HZ_100
 		bool "100 HZ"
 	help
-	  100 HZ is a typical choice for servers, SMP and NUMA systems
+	  100 Hz is a typical choice for servers, SMP and NUMA systems
 	  with lots of processors that may show reduced performance if
 	  too many timer interrupts are occurring.
 
 	config HZ_250
 		bool "250 HZ"
 	help
-	  250 HZ is a good compromise choice allowing server performance
+	  250 Hz is a good compromise choice allowing server performance
 	  while also showing good interactive responsiveness even
-	  on SMP and NUMA systems.
+	  on SMP and NUMA systems. If you are going to be using NTSC video
+	  or multimedia, select 300Hz instead.
+
+	config HZ_300
+		bool "300 HZ"
+	help
+	  300 Hz is a good compromise choice allowing server performance
+	  while also showing good interactive responsiveness even
+	  on SMP and NUMA systems and exactly dividing by both PAL and
+	  NTSC frame rates for video and multimedia work.
 
 	config HZ_1000
 		bool "1000 HZ"
 	help
-	  1000 HZ is the preferred choice for desktop systems and other
+	  1000 Hz is the preferred choice for desktop systems and other
 	  systems requiring fast interactive responses to events.
 
 endchoice
@@ -42,5 +51,6 @@ config HZ
 	int
 	default 100 if HZ_100
 	default 250 if HZ_250
+	default 300 if HZ_300
 	default 1000 if HZ_1000
 
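The new HZ_300 option is aimed at video work: 300 divides evenly by both the PAL frame rate (25 fps) and the nominal NTSC rate (30 fps), so a frame period lands on a whole number of timer ticks. A standalone userspace C sketch of that arithmetic, using the rates named in the help text above:

#include <stdio.h>

int main(void)
{
	const int hz[]  = { 100, 250, 300, 1000 };	/* the selectable HZ values */
	const int fps[] = { 25, 30 };			/* PAL, nominal NTSC */

	for (int i = 0; i < 4; i++)
		for (int j = 0; j < 2; j++)
			printf("HZ=%-4d fps=%d -> %2d ticks/frame, remainder %d\n",
			       hz[i], fps[j], hz[i] / fps[j], hz[i] % fps[j]);
	return 0;
}

Only HZ=300 gives a zero remainder for both rates; every other option drifts against the 30 fps stream.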
diff --git a/kernel/acct.c b/kernel/acct.c
index 0aad5ca36a81..dc12db8600e7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -89,7 +89,8 @@ struct acct_glbs {
 	struct timer_list timer;
 };
 
-static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED};
+static struct acct_glbs acct_globals __cacheline_aligned =
+	{__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
 
 /*
  * Called whenever the timer says to check the free space.
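__SPIN_LOCK_UNLOCKED(name) replaces the old SPIN_LOCK_UNLOCKED initializer so that every statically initialized lock gets its own lockdep key. A minimal sketch of the pattern for a lock embedded in a struct (the struct and field names here are illustrative, not from acct.c):

#include <linux/spinlock.h>

struct my_globals {
	spinlock_t	lock;
	unsigned long	active;
};

/* Naming the lock (my_globals.lock) gives lockdep a distinct key for
 * this instance instead of one key shared by every static spinlock. */
static struct my_globals my_globals = {
	.lock = __SPIN_LOCK_UNLOCKED(my_globals.lock),
};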
diff --git a/kernel/audit.c b/kernel/audit.c
index 98106f6078b0..d9b690ac684b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -57,6 +57,7 @@
 #include <linux/netlink.h>
 #include <linux/selinux.h>
 #include <linux/inotify.h>
+#include <linux/freezer.h>
 
 #include "audit.h"
 
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f40d923af8e..2e896f8ae29e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
 	struct audit_rule *rule;
 	int i;
 
-	rule = kmalloc(sizeof(*rule), GFP_KERNEL);
+	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
 	if (unlikely(!rule))
 		return NULL;
-	memset(rule, 0, sizeof(*rule));
 
 	rule->flags = krule->flags | krule->listnr;
 	rule->action = krule->action;
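kzalloc() is kmalloc() plus a memset() to zero, folded into one allocator call; the same substitution recurs in futex.c and kexec.c below. A minimal before/after sketch of the idiom:

#include <linux/slab.h>
#include <linux/string.h>

static void *alloc_zeroed_old(size_t size)
{
	void *p = kmalloc(size, GFP_KERNEL);	/* may return NULL */

	if (p)
		memset(p, 0, size);		/* second pass over the object */
	return p;
}

static void *alloc_zeroed_new(size_t size)
{
	return kzalloc(size, GFP_KERNEL);	/* zeroed (or NULL) in one call */
}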
diff --git a/kernel/configs.c b/kernel/configs.c
index f9e31974f4ad..8fa1fb28f8a7 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
 	return count;
 }
 
-static struct file_operations ikconfig_file_ops = {
+static const struct file_operations ikconfig_file_ops = {
 	.owner = THIS_MODULE,
 	.read = ikconfig_read_current,
 };
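Constifying the operations tables (here and again in cpuset.c, dma.c, futex.c, kallsyms.c, lockdep_proc.c and module.c below) lets the compiler place them in read-only data. A minimal sketch of the pattern, with an illustrative stub handler:

#include <linux/fs.h>
#include <linux/module.h>

static ssize_t example_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	return 0;	/* stub: report EOF */
}

/* const: the method table lands in .rodata and cannot be
 * overwritten at runtime, accidentally or otherwise. */
static const struct file_operations example_fops = {
	.owner	= THIS_MODULE,
	.read	= example_read,
};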
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 272254f20d97..9124669f4586 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -270,11 +270,7 @@ int disable_nonboot_cpus(void)
 			goto out;
 		}
 	}
-	error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu));
-	if (error) {
-		printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
-		goto out;
-	}
+
 	/* We take down all of the non-boot CPUs in one shot to avoid races
 	 * with the userspace trying to use the CPU hotplug at the same time
 	 */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6313c38c930e..0a6b4d89f9a0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 	}
 
 	/* Remaining checks don't apply to root cpuset */
-	if ((par = cur->parent) == NULL)
+	if (cur == &top_cpuset)
 		return 0;
 
+	par = cur->parent;
+
 	/* We must be a subset of our parent cpuset */
 	if (!is_cpuset_subset(trial, par))
 		return -EACCES;
@@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 	cpu_exclusive_changed =
 		(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
 	mutex_lock(&callback_mutex);
-	if (turning_on)
-		set_bit(bit, &cs->flags);
-	else
-		clear_bit(bit, &cs->flags);
+	cs->flags = trialcs.flags;
 	mutex_unlock(&callback_mutex);
 
 	if (cpu_exclusive_changed)
@@ -1281,7 +1280,8 @@ typedef enum {
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
-static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf,
+static ssize_t cpuset_common_file_write(struct file *file,
+					const char __user *userbuf,
 					size_t nbytes, loff_t *unused_ppos)
 {
 	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
@@ -1292,7 +1292,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	int retval = 0;
 
 	/* Crude upper limit on largest legitimate cpulist user might write. */
-	if (nbytes > 100 + 6 * NR_CPUS)
+	if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
 		return -E2BIG;
 
 	/* +1 for nul-terminator */
@@ -1532,7 +1532,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
 }
 
-static struct file_operations cpuset_file_operations = {
+static const struct file_operations cpuset_file_operations = {
 	.read = cpuset_file_read,
 	.write = cpuset_file_write,
 	.llseek = generic_file_llseek,
@@ -2045,7 +2045,6 @@ out:
 	return err;
 }
 
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
 /*
  * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2109,9 +2108,7 @@ static void common_cpu_mem_hotplug_unplug(void)
 	mutex_unlock(&callback_mutex);
 	mutex_unlock(&manage_mutex);
 }
-#endif
 
-#ifdef CONFIG_HOTPLUG_CPU
 /*
  * The top_cpuset tracks what CPUs and Memory Nodes are online,
  * period.  This is necessary in order to make cpusets transparent
@@ -2128,7 +2125,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
 	common_cpu_mem_hotplug_unplug();
 	return 0;
 }
-#endif
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
@@ -2610,7 +2606,7 @@ static int cpuset_open(struct inode *inode, struct file *file)
 	return single_open(file, proc_cpuset_show, pid);
 }
 
-struct file_operations proc_cpuset_operations = {
+const struct file_operations proc_cpuset_operations = {
 	.open = cpuset_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 66a0ea48751d..766d5912b26a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -20,7 +20,7 @@
 #include <linux/delayacct.h>
 
 int delayacct_on __read_mostly = 1;	/* Delay accounting turned on/off */
-kmem_cache_t *delayacct_cache;
+struct kmem_cache *delayacct_cache;
 
 static int __init delayacct_setup_disable(char *str)
 {
@@ -41,7 +41,7 @@ void delayacct_init(void)
 
 void __delayacct_tsk_init(struct task_struct *tsk)
 {
-	tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
+	tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
 	if (tsk->delays)
 		spin_lock_init(&tsk->delays->lock);
 }
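Two retirements meet in this hunk: the kmem_cache_t typedef gives way to the plain struct kmem_cache, and the SLAB_* allocation flags give way to GFP_* (SLAB_KERNEL was an alias for GFP_KERNEL). A hedged sketch of the slab-cache lifecycle in the newer spelling; the cache and struct names are illustrative:

#include <linux/slab.h>

struct delay_rec {
	spinlock_t	lock;
	u64		blkio_delay;
};

static struct kmem_cache *delay_cachep;		/* was: kmem_cache_t * */

static void delay_cache_init(void)
{
	delay_cachep = kmem_cache_create("delay_rec",
					 sizeof(struct delay_rec),
					 0, SLAB_PANIC, NULL, NULL);
}

static struct delay_rec *delay_alloc(void)
{
	/* GFP_KERNEL names the allocation context, not the cache type. */
	return kmem_cache_zalloc(delay_cachep, GFP_KERNEL);
}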
diff --git a/kernel/dma.c b/kernel/dma.c
index 2020644c938a..937b13ca33ba 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -140,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file)
 	return single_open(file, proc_dma_show, NULL);
 }
 
-static struct file_operations proc_dma_operations = {
+static const struct file_operations proc_dma_operations = {
 	.open = proc_dma_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/kernel/exit.c b/kernel/exit.c
index 06de6c4e8ca3..4e3f919edc48 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -850,9 +850,7 @@ static void exit_notify(struct task_struct *tsk)
 fastcall NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
-	struct taskstats *tidstats;
 	int group_dead;
-	unsigned int mycpu;
 
 	profile_task_exit(tsk);
 
@@ -890,8 +888,6 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
-	taskstats_exit_alloc(&tidstats, &mycpu);
-
 	acct_update_integrals(tsk);
 	if (tsk->mm) {
 		update_hiwater_rss(tsk->mm);
@@ -911,8 +907,8 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
-	taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
-	taskstats_exit_free(tidstats);
+
+	taskstats_exit(tsk, group_dead);
 
 	exit_mm(tsk);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index fd22245e3881..7f2e31ba33af 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -82,26 +82,26 @@ int nr_processes(void)
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 # define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
 # define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
-static kmem_cache_t *task_struct_cachep;
+static struct kmem_cache *task_struct_cachep;
 #endif
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
-static kmem_cache_t *signal_cachep;
+static struct kmem_cache *signal_cachep;
 
 /* SLAB cache for sighand_struct structures (tsk->sighand) */
-kmem_cache_t *sighand_cachep;
+struct kmem_cache *sighand_cachep;
 
 /* SLAB cache for files_struct structures (tsk->files) */
-kmem_cache_t *files_cachep;
+struct kmem_cache *files_cachep;
 
 /* SLAB cache for fs_struct structures (tsk->fs) */
-kmem_cache_t *fs_cachep;
+struct kmem_cache *fs_cachep;
 
 /* SLAB cache for vm_area_struct structures */
-kmem_cache_t *vm_area_cachep;
+struct kmem_cache *vm_area_cachep;
 
 /* SLAB cache for mm_struct structures (tsk->mm) */
-static kmem_cache_t *mm_cachep;
+static struct kmem_cache *mm_cachep;
 
 void free_task(struct task_struct *tsk)
 {
@@ -237,7 +237,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 				goto fail_nomem;
 			charge = len;
 		}
-		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
@@ -319,7 +319,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
-#define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
+#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
 #include <linux/init_task.h>
@@ -448,7 +448,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 		tsk->vfork_done = NULL;
 		complete(vfork_done);
 	}
-	if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
+
+	/*
+	 * If we're exiting normally, clear a user-space tid field if
+	 * requested.  We leave this alone when dying by signal, to leave
+	 * the value intact in a core dump, and to save the unnecessary
+	 * trouble otherwise.  Userland only wants this done for a sys_exit.
+	 */
+	if (tsk->clear_child_tid
+	    && !(tsk->flags & PF_SIGNALED)
+	    && atomic_read(&mm->mm_users) > 1) {
 		u32 __user * tidptr = tsk->clear_child_tid;
 		tsk->clear_child_tid = NULL;
 
@@ -479,6 +488,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
 
 	memcpy(mm, oldmm, sizeof(*mm));
 
+	/* Initializing for Swap token stuff */
+	mm->token_priority = 0;
+	mm->last_interval = 0;
+
 	if (!mm_init(mm))
 		goto fail_nomem;
 
@@ -542,6 +555,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 		goto fail_nomem;
 
 good_mm:
+	/* Initializing for Swap token stuff */
+	mm->token_priority = 0;
+	mm->last_interval = 0;
+
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	return 0;
@@ -613,7 +630,7 @@ static struct files_struct *alloc_files(void)
 	struct files_struct *newf;
 	struct fdtable *fdt;
 
-	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
 	if (!newf)
 		goto out;
 
@@ -830,7 +847,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	if (clone_flags & CLONE_THREAD) {
 		atomic_inc(&current->signal->count);
 		atomic_inc(&current->signal->live);
-		taskstats_tgid_alloc(current);
 		return 0;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -1413,7 +1429,7 @@ long do_fork(unsigned long clone_flags,
 #define ARCH_MIN_MMSTRUCT_ALIGN	0
 #endif
 
-static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags)
 {
 	struct sighand_struct *sighand = data;
 
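The extra PF_SIGNALED test means the child-tid word is cleared, and its futex woken, only on a normal exit, so a core dump keeps the value intact. A condensed sketch of the path the new condition guards, paraphrased from mm_release(); the FUTEX_WAKE is what lets a pthread_join() caller return:

#include <linux/futex.h>
#include <linux/sched.h>
#include <asm/uaccess.h>

static void clear_child_tid_sketch(struct task_struct *tsk,
				   struct mm_struct *mm)
{
	if (tsk->clear_child_tid
	    && !(tsk->flags & PF_SIGNALED)	  /* normal exit only */
	    && atomic_read(&mm->mm_users) > 1) {  /* word is still mapped */
		u32 __user *tidptr = tsk->clear_child_tid;

		tsk->clear_child_tid = NULL;
		put_user(0, tidptr);		  /* zero the TID word... */
		sys_futex(tidptr, FUTEX_WAKE, 1,  /* ...and wake one waiter */
			  NULL, NULL, 0);
	}
}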
diff --git a/kernel/futex.c b/kernel/futex.c
index 93ef30ba209f..95989a3b4168 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 {
 	int ret;
 
-	inc_preempt_count();
+	pagefault_disable();
 	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
-	dec_preempt_count();
+	pagefault_enable();
 
 	return ret ? -EFAULT : 0;
 }
@@ -324,12 +324,11 @@ static int refill_pi_state_cache(void)
 	if (likely(current->pi_state_cache))
 		return 0;
 
-	pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
 
 	if (!pi_state)
 		return -ENOMEM;
 
-	memset(pi_state, 0, sizeof(*pi_state));
 	INIT_LIST_HEAD(&pi_state->list);
 	/* pi_mutex gets initialized later */
 	pi_state->owner = NULL;
@@ -553,7 +552,7 @@ static void wake_futex(struct futex_q *q)
 	 * at the end of wake_up_all() does not prevent this store from
 	 * moving.
 	 */
-	wmb();
+	smp_wmb();
 	q->lock_ptr = NULL;
 }
 
@@ -585,9 +584,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	if (!(uval & FUTEX_OWNER_DIED)) {
 		newval = FUTEX_WAITERS | new_owner->pid;
 
-		inc_preempt_count();
+		pagefault_disable();
 		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-		dec_preempt_count();
+		pagefault_enable();
 		if (curval == -EFAULT)
 			return -EFAULT;
 		if (curval != uval)
@@ -618,9 +617,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
 	 * There is no waiter, so we unlock the futex. The owner died
 	 * bit has not to be preserved here. We are the owner:
 	 */
-	inc_preempt_count();
+	pagefault_disable();
 	oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (oldval == -EFAULT)
 		return oldval;
@@ -1158,9 +1157,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	 */
 	newval = current->pid;
 
-	inc_preempt_count();
+	pagefault_disable();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (unlikely(curval == -EFAULT))
 		goto uaddr_faulted;
@@ -1183,9 +1182,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	uval = curval;
 	newval = uval | FUTEX_WAITERS;
 
-	inc_preempt_count();
+	pagefault_disable();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-	dec_preempt_count();
+	pagefault_enable();
 
 	if (unlikely(curval == -EFAULT))
 		goto uaddr_faulted;
@@ -1215,10 +1214,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 			newval = current->pid |
 				FUTEX_OWNER_DIED | FUTEX_WAITERS;
 
-			inc_preempt_count();
+			pagefault_disable();
 			curval = futex_atomic_cmpxchg_inatomic(uaddr,
 							       uval, newval);
-			dec_preempt_count();
+			pagefault_enable();
 
 			if (unlikely(curval == -EFAULT))
 				goto uaddr_faulted;
@@ -1390,9 +1389,9 @@ retry_locked:
 	 * anyone else up:
 	 */
 	if (!(uval & FUTEX_OWNER_DIED)) {
-		inc_preempt_count();
+		pagefault_disable();
 		uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
-		dec_preempt_count();
+		pagefault_enable();
 	}
 
 	if (unlikely(uval == -EFAULT))
@@ -1493,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp,
 	return ret;
 }
 
-static struct file_operations futex_fops = {
+static const struct file_operations futex_fops = {
 	.release	= futex_close,
 	.poll		= futex_poll,
 };
@@ -1858,10 +1857,16 @@ static struct file_system_type futex_fs_type = {
 
 static int __init init(void)
 {
-	unsigned int i;
+	int i = register_filesystem(&futex_fs_type);
+
+	if (i)
+		return i;
 
-	register_filesystem(&futex_fs_type);
 	futex_mnt = kern_mount(&futex_fs_type);
+	if (IS_ERR(futex_mnt)) {
+		unregister_filesystem(&futex_fs_type);
+		return PTR_ERR(futex_mnt);
+	}
 
 	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
 		INIT_LIST_HEAD(&futex_queues[i].chain);
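pagefault_disable()/pagefault_enable() are the 2.6.20-era replacements for bumping the preempt count by hand: they state the intent (force the fault handler down its atomic, non-sleeping path) rather than the mechanism. A minimal sketch of the idiom, mirroring get_futex_value_locked() above:

#include <linux/uaccess.h>

/* Attempt an atomic read of user memory while a spinlock is held;
 * -EFAULT tells the caller to drop locks and retry the slow way. */
static int get_user_u32_atomic(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();	/* faults now fail fast instead of sleeping */
	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
	pagefault_enable();

	return ret ? -EFAULT : 0;
}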
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a681912bc89a..aff1f0fabb0d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
 		.chip = &no_irq_chip,
 		.handle_irq = handle_bad_irq,
 		.depth = 1,
-		.lock = SPIN_LOCK_UNLOCKED,
+		.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
 #ifdef CONFIG_SMP
 		.affinity = CPU_MASK_ALL
 #endif
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index eeac3e313b2b..ab63cfc42992 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -20,6 +20,7 @@
 #include <linux/proc_fs.h>
 #include <linux/sched.h>	/* for cond_resched */
 #include <linux/mm.h>
+#include <linux/ctype.h>
 
 #include <asm/sections.h>
 
@@ -301,13 +302,6 @@ struct kallsym_iter
 	char name[KSYM_NAME_LEN+1];
 };
 
-/* Only label it "global" if it is exported. */
-static void upcase_if_global(struct kallsym_iter *iter)
-{
-	if (is_exported(iter->name, iter->owner))
-		iter->type += 'A' - 'a';
-}
-
 static int get_ksymbol_mod(struct kallsym_iter *iter)
 {
 	iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
@@ -316,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
 	if (iter->owner == NULL)
 		return 0;
 
-	upcase_if_global(iter);
+	/* Label it "global" if it is exported, "local" if not exported. */
+	iter->type = is_exported(iter->name, iter->owner)
+		     ? toupper(iter->type) : tolower(iter->type);
+
 	return 1;
 }
 
@@ -401,7 +398,7 @@ static int s_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-static struct seq_operations kallsyms_op = {
+static const struct seq_operations kallsyms_op = {
 	.start = s_start,
 	.next = s_next,
 	.stop = s_stop,
@@ -436,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file)
 	return seq_release(inode, file);
 }
 
-static struct file_operations kallsyms_operations = {
+static const struct file_operations kallsyms_operations = {
 	.open = kallsyms_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index fcdd5d2bc3f4..afbbbe981be2 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -20,6 +20,8 @@
 #include <linux/syscalls.h>
 #include <linux/ioport.h>
 #include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -108,11 +110,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 
 	/* Allocate a controlling structure */
 	result = -ENOMEM;
-	image = kmalloc(sizeof(*image), GFP_KERNEL);
+	image = kzalloc(sizeof(*image), GFP_KERNEL);
 	if (!image)
 		goto out;
 
-	memset(image, 0, sizeof(*image));
 	image->head = 0;
 	image->entry = &image->head;
 	image->last_entry = &image->head;
@@ -1067,6 +1068,60 @@ void crash_kexec(struct pt_regs *regs)
 	}
 }
 
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+			    size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) + 3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *buf;
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
+		return;
+
+	/* Using ELF notes here is opportunistic.
+	 * I need a well defined structure format
+	 * for the data I pass, and I need tags
+	 * on the data to indicate what information I have
+	 * squirrelled away.  ELF notes happen to provide
+	 * all of that, so there is no need to invent something new.
+	 */
+	buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+	if (!buf)
+		return;
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+			      sizeof(prstatus));
+	final_note(buf);
+}
+
 static int __init crash_notes_memory_init(void)
 {
 	/* Allocate memory for saving cpu registers. */
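Each ELF note is a fixed header followed by a name and a descriptor, each padded out to a 4-byte boundary; that is why append_elf_note() advances its u32 cursor by (len + 3)/4 words after every field. A standalone userspace sketch of the same size arithmetic (the "CORE" name and the 148-byte descriptor are just example inputs):

#include <elf.h>
#include <stdio.h>
#include <string.h>

static size_t note_words(const char *name, size_t descsz)
{
	size_t namesz = strlen(name) + 1;	/* NUL included */

	return sizeof(Elf32_Nhdr) / 4		/* 3 header words */
	       + (namesz + 3) / 4		/* name, padded to 4 bytes */
	       + (descsz + 3) / 4;		/* descriptor, padded */
}

int main(void)
{
	printf("note occupies %zu 32-bit words\n", note_words("CORE", 148));
	return 0;
}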
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2b76dee28496..8d2bea09a4ec 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -114,6 +114,7 @@ EXPORT_SYMBOL(request_module);
 #endif /* CONFIG_KMOD */
 
 struct subprocess_info {
+	struct work_struct work;
 	struct completion *complete;
 	char *path;
 	char **argv;
@@ -221,9 +222,10 @@ static int wait_for_helper(void *data)
 }
 
 /* This is run by khelper thread  */
-static void __call_usermodehelper(void *data)
+static void __call_usermodehelper(struct work_struct *work)
 {
-	struct subprocess_info *sub_info = data;
+	struct subprocess_info *sub_info =
+		container_of(work, struct subprocess_info, work);
 	pid_t pid;
 	int wait = sub_info->wait;
 
@@ -264,6 +266,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct subprocess_info sub_info = {
+		.work		= __WORK_INITIALIZER(sub_info.work,
+						     __call_usermodehelper),
 		.complete	= &done,
 		.path		= path,
 		.argv		= argv,
@@ -272,7 +276,6 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 		.wait		= wait,
 		.retval		= 0,
 	};
-	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
 
 	if (!khelper_wq)
 		return -EBUSY;
@@ -280,7 +283,7 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
 	if (path[0] == '\0')
 		return 0;
 
-	queue_work(khelper_wq, &work);
+	queue_work(khelper_wq, &sub_info.work);
 	wait_for_completion(&done);
 	return sub_info.retval;
 }
@@ -291,6 +294,8 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 {
 	DECLARE_COMPLETION(done);
 	struct subprocess_info sub_info = {
+		.work		= __WORK_INITIALIZER(sub_info.work,
+						     __call_usermodehelper),
 		.complete	= &done,
 		.path		= path,
 		.argv		= argv,
@@ -298,7 +303,6 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 		.retval		= 0,
 	};
 	struct file *f;
-	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
 
 	if (!khelper_wq)
 		return -EBUSY;
@@ -318,7 +322,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
 	}
 	sub_info.stdin = f;
 
-	queue_work(khelper_wq, &work);
+	queue_work(khelper_wq, &sub_info.work);
 	wait_for_completion(&done);
 	return sub_info.retval;
 }
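This file (and kthread.c below) tracks the workqueue API change in which handlers receive the struct work_struct pointer itself instead of a void * cookie; per-request data travels by embedding the work item and recovering the container with container_of(). A minimal sketch with illustrative names:

#include <linux/workqueue.h>
#include <linux/slab.h>

struct helper_req {
	struct work_struct work;	/* embedded, replaces DECLARE_WORK(..., data) */
	int arg;
};

static void helper_fn(struct work_struct *work)
{
	/* Recover the enclosing request from the embedded member. */
	struct helper_req *req = container_of(work, struct helper_req, work);

	printk(KERN_INFO "helper arg=%d\n", req->arg);
	kfree(req);
}

static int submit_helper(int arg)
{
	struct helper_req *req = kmalloc(sizeof(*req), GFP_KERNEL);

	if (!req)
		return -ENOMEM;
	req->arg = arg;
	INIT_WORK(&req->work, helper_fn);	/* no data pointer any more */
	schedule_work(&req->work);
	return 0;
}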
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 610c837ad9e0..17ec4afb0994 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -38,6 +38,7 @@
 #include <linux/module.h>
 #include <linux/moduleloader.h>
 #include <linux/kallsyms.h>
+#include <linux/freezer.h>
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
 #include <asm/errno.h>
@@ -83,9 +84,36 @@ struct kprobe_insn_page {
 	kprobe_opcode_t *insns;		/* Page of instruction slots */
 	char slot_used[INSNS_PER_PAGE];
 	int nused;
+	int ngarbage;
 };
 
 static struct hlist_head kprobe_insn_pages;
+static int kprobe_garbage_slots;
+static int collect_garbage_slots(void);
+
+static int __kprobes check_safety(void)
+{
+	int ret = 0;
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM)
+	ret = freeze_processes();
+	if (ret == 0) {
+		struct task_struct *p, *q;
+		do_each_thread(p, q) {
+			if (p != current && p->state == TASK_RUNNING &&
+			    p->pid != 0) {
+				printk("Check failed: %s is running\n", p->comm);
+				ret = -1;
+				goto loop_end;
+			}
+		} while_each_thread(p, q);
+	}
+loop_end:
+	thaw_processes();
+#else
+	synchronize_sched();
+#endif
+	return ret;
+}
 
 /**
  * get_insn_slot() - Find a slot on an executable page for an instruction.
@@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
 
+ retry:
 	hlist_for_each(pos, &kprobe_insn_pages) {
 		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
 		if (kip->nused < INSNS_PER_PAGE) {
@@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 		}
 	}
 
-	/* All out of space.  Need to allocate a new page. Use slot 0. */
+	/* If there are any garbage slots, collect them and try again. */
+	if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
+		goto retry;
+	}
+	/* All out of space.  Need to allocate a new page. Use slot 0. */
 	kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
 	if (!kip) {
 		return NULL;
@@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
 	memset(kip->slot_used, 0, INSNS_PER_PAGE);
 	kip->slot_used[0] = 1;
 	kip->nused = 1;
+	kip->ngarbage = 0;
 	return kip->insns;
 }
 
-void __kprobes free_insn_slot(kprobe_opcode_t *slot)
+/* Return 1 if all garbage slots are collected, otherwise 0. */
+static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
+{
+	kip->slot_used[idx] = 0;
+	kip->nused--;
+	if (kip->nused == 0) {
+		/*
+		 * Page is no longer in use.  Free it unless
+		 * it's the last one.  We keep the last one
+		 * so as not to have to set it up again the
+		 * next time somebody inserts a probe.
+		 */
+		hlist_del(&kip->hlist);
+		if (hlist_empty(&kprobe_insn_pages)) {
+			INIT_HLIST_NODE(&kip->hlist);
+			hlist_add_head(&kip->hlist,
+				       &kprobe_insn_pages);
+		} else {
+			module_free(NULL, kip->insns);
+			kfree(kip);
+		}
+		return 1;
+	}
+	return 0;
+}
+
+static int __kprobes collect_garbage_slots(void)
+{
+	struct kprobe_insn_page *kip;
+	struct hlist_node *pos, *next;
+
+	/* Ensure no-one is preempted on the garbage slots */
+	if (check_safety() != 0)
+		return -EAGAIN;
+
+	hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
+		int i;
+		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+		if (kip->ngarbage == 0)
+			continue;
+		kip->ngarbage = 0;	/* we will collect all garbage slots */
+		for (i = 0; i < INSNS_PER_PAGE; i++) {
+			if (kip->slot_used[i] == -1 &&
+			    collect_one_slot(kip, i))
+				break;
+		}
+	}
+	kprobe_garbage_slots = 0;
+	return 0;
+}
+
+void __kprobes free_insn_slot(kprobe_opcode_t *slot, int dirty)
 {
 	struct kprobe_insn_page *kip;
 	struct hlist_node *pos;
@@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
 		if (kip->insns <= slot &&
 		    slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
 			int i = (slot - kip->insns) / MAX_INSN_SIZE;
-			kip->slot_used[i] = 0;
-			kip->nused--;
-			if (kip->nused == 0) {
-				/*
-				 * Page is no longer in use.  Free it unless
-				 * it's the last one.  We keep the last one
-				 * so as not to have to set it up again the
-				 * next time somebody inserts a probe.
-				 */
-				hlist_del(&kip->hlist);
-				if (hlist_empty(&kprobe_insn_pages)) {
-					INIT_HLIST_NODE(&kip->hlist);
-					hlist_add_head(&kip->hlist,
-						       &kprobe_insn_pages);
-				} else {
-					module_free(NULL, kip->insns);
-					kfree(kip);
-				}
+			if (dirty) {
+				kip->slot_used[i] = -1;
+				kip->ngarbage++;
+			} else {
+				collect_one_slot(kip, i);
 			}
-			return;
+			break;
 		}
 	}
+	if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
+		collect_garbage_slots();
+	}
 }
 #endif
 
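With the collector in place, each slot_used[] entry encodes one of three states; the code above stores them as bare char values, spelled out here only for clarity:

/* Illustrative names only; kprobes.c uses the raw values directly. */
enum slot_state {
	SLOT_CLEAN = 0,		/* free; get_insn_slot() may hand it out */
	SLOT_USED  = 1,		/* holds a live instruction copy */
	SLOT_DIRTY = -1,	/* freed while possibly still executing;
				   reclaimed later by collect_garbage_slots() */
};

A dirty slot is reclaimed only after check_safety() has ruled out a preempted thread still single-stepping inside it.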
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4f9c60ef95e8..1db8c72d0d38 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -31,6 +31,8 @@ struct kthread_create_info
 	/* Result passed back to kthread_create() from keventd. */
 	struct task_struct *result;
 	struct completion done;
+
+	struct work_struct work;
 };
 
 struct kthread_stop_info
@@ -111,9 +113,10 @@ static int kthread(void *_create)
 }
 
 /* We are keventd: create a thread. */
-static void keventd_create_kthread(void *_create)
+static void keventd_create_kthread(struct work_struct *work)
 {
-	struct kthread_create_info *create = _create;
+	struct kthread_create_info *create =
+		container_of(work, struct kthread_create_info, work);
 	int pid;
 
 	/* We want our own signal handler (we take no signals by default). */
@@ -154,20 +157,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 				   ...)
{
 	struct kthread_create_info create;
-	DECLARE_WORK(work, keventd_create_kthread, &create);
 
 	create.threadfn = threadfn;
 	create.data = data;
 	init_completion(&create.started);
 	init_completion(&create.done);
+	INIT_WORK(&create.work, keventd_create_kthread);
 
 	/*
 	 * The workqueue needs to start up first:
 	 */
 	if (!helper_wq)
-		work.func(work.data);
+		create.work.func(&create.work);
 	else {
-		queue_work(helper_wq, &work);
+		queue_work(helper_wq, &create.work);
 		wait_for_completion(&create.done);
 	}
 	if (!IS_ERR(create.result)) {
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9bb8d784eb02..b02032476dc2 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -140,13 +140,6 @@ void lockdep_on(void)
 
 EXPORT_SYMBOL(lockdep_on);
 
-int lockdep_internal(void)
-{
-	return current->lockdep_recursion != 0;
-}
-
-EXPORT_SYMBOL(lockdep_internal);
-
 /*
  * Debugging switches:
  */
@@ -233,8 +226,10 @@ static int save_trace(struct stack_trace *trace)
 	trace->max_entries = trace->nr_entries;
 
 	nr_stack_trace_entries += trace->nr_entries;
-	if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES))
+	if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) {
+		__raw_spin_unlock(&hash_lock);
 		return 0;
+	}
 
 	if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
 		__raw_spin_unlock(&hash_lock);
@@ -353,7 +348,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
 
 static void print_lock_name(struct lock_class *class)
 {
-	char str[128], c1, c2, c3, c4;
+	char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4;
 	const char *name;
 
 	get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -375,7 +370,7 @@ static void print_lock_name(struct lock_class *class)
 static void print_lockdep_cache(struct lockdep_map *lock)
 {
 	const char *name;
-	char str[128];
+	char str[KSYM_NAME_LEN + 1];
 
 	name = lock->name;
 	if (!name)
@@ -445,7 +440,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
 	print_lock_class_header(class, depth);
 
 	list_for_each_entry(entry, &class->locks_after, entry) {
-		DEBUG_LOCKS_WARN_ON(!entry->class);
+		if (DEBUG_LOCKS_WARN_ON(!entry->class))
+			return;
+
 		print_lock_dependencies(entry->class, depth + 1);
 
 		printk("%*s ... acquired at:\n",depth,"");
@@ -470,7 +467,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
 		return 0;
 
 	entry->class = this;
-	save_trace(&entry->trace);
+	if (!save_trace(&entry->trace))
+		return 0;
 
 	/*
 	 * Since we never remove from the dependency list, the list can
@@ -558,8 +556,12 @@ static noinline int print_circular_bug_tail(void)
 	if (debug_locks_silent)
 		return 0;
 
+	/* hash_lock unlocked by the header */
+	__raw_spin_lock(&hash_lock);
 	this.class = check_source->class;
-	save_trace(&this.trace);
+	if (!save_trace(&this.trace))
+		return 0;
+	__raw_spin_unlock(&hash_lock);
 	print_circular_bug_entry(&this, 0);
 
 	printk("\nother info that might help us debug this:\n\n");
@@ -962,14 +964,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
 			       &prev->class->locks_after, next->acquire_ip);
 	if (!ret)
 		return 0;
-	/*
-	 * Return value of 2 signals 'dependency already added',
-	 * in that case we dont have to add the backlink either.
-	 */
-	if (ret == 2)
-		return 2;
+
 	ret = add_lock_to_list(next->class, prev->class,
 			       &next->class->locks_before, next->acquire_ip);
+	if (!ret)
+		return 0;
 
 	/*
 	 * Debugging printouts:
@@ -1021,7 +1020,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
 		 * added:
 		 */
 		if (hlock->read != 2) {
-			check_prev_add(curr, hlock, next);
+			if (!check_prev_add(curr, hlock, next))
+				return 0;
 			/*
 			 * Stop after the first non-trylock entry,
 			 * as non-trylock entries have added their
@@ -1178,6 +1178,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	struct lockdep_subclass_key *key;
 	struct list_head *hash_head;
 	struct lock_class *class;
+	unsigned long flags;
 
 	class = look_up_lock_class(lock, subclass);
 	if (likely(class))
@@ -1199,6 +1200,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	key = lock->key->subkeys + subclass;
 	hash_head = classhashentry(key);
 
+	raw_local_irq_save(flags);
 	__raw_spin_lock(&hash_lock);
 	/*
 	 * We have to do the hash-walk again, to avoid races
@@ -1213,6 +1215,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	 */
 	if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
 		__raw_spin_unlock(&hash_lock);
+		raw_local_irq_restore(flags);
 		debug_locks_off();
 		printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
 		printk("turning off the locking correctness validator.\n");
@@ -1235,15 +1238,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 
 	if (verbose(class)) {
 		__raw_spin_unlock(&hash_lock);
+		raw_local_irq_restore(flags);
 		printk("\nnew class %p: %s", class->key, class->name);
 		if (class->name_version > 1)
 			printk("#%d", class->name_version);
 		printk("\n");
 		dump_stack();
+		raw_local_irq_save(flags);
 		__raw_spin_lock(&hash_lock);
 	}
 out_unlock_set:
 	__raw_spin_unlock(&hash_lock);
+	raw_local_irq_restore(flags);
 
 	if (!subclass || force)
 		lock->class_cache = class;
@@ -1724,6 +1730,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 		debug_atomic_dec(&nr_unused_locks);
 		break;
 	default:
+		__raw_spin_unlock(&hash_lock);
 		debug_locks_off();
 		WARN_ON(1);
 		return 0;
@@ -2641,6 +2648,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
 	}
 	local_irq_restore(flags);
 }
+EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
 
 static void print_held_locks_bug(struct task_struct *curr)
 {
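The register_lock_class() hunks all follow one rule: hash_lock is a raw lock taken with IRQs disabled, and the saved flags must be restored on every exit path, including the early-unlock error and verbose paths. A condensed sketch of the pairing; the hash_lock declaration mirrors the one in lockdep.c of this era:

#include <linux/irqflags.h>
#include <linux/spinlock.h>

static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;

static void graph_update_sketch(void)
{
	unsigned long flags;

	raw_local_irq_save(flags);	/* no interrupts while the raw lock is held */
	__raw_spin_lock(&hash_lock);

	/* ... mutate the dependency graph ... */

	__raw_spin_unlock(&hash_lock);
	raw_local_irq_restore(flags);	/* mirrored on every return path */
}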
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index eab043c83bb2..8ce09bc4613d 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -20,7 +20,7 @@
 #define MAX_LOCKDEP_KEYS_BITS	11
 #define MAX_LOCKDEP_KEYS	(1UL << MAX_LOCKDEP_KEYS_BITS)
 
-#define MAX_LOCKDEP_CHAINS_BITS	13
+#define MAX_LOCKDEP_CHAINS_BITS	14
 #define MAX_LOCKDEP_CHAINS	(1UL << MAX_LOCKDEP_CHAINS_BITS)
 
 /*
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index f6e72eaab3fa..b554b40a4aa6 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations lockdep_ops = {
+static const struct seq_operations lockdep_ops = {
 	.start	= l_start,
 	.next	= l_next,
 	.stop	= l_stop,
@@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file)
 	return res;
 }
 
-static struct file_operations proc_lockdep_operations = {
+static const struct file_operations proc_lockdep_operations = {
 	.open	= lockdep_open,
 	.read	= seq_read,
 	.llseek	= seq_lseek,
@@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file)
 	return single_open(file, lockdep_stats_show, NULL);
 }
 
-static struct file_operations proc_lockdep_stats_operations = {
+static const struct file_operations proc_lockdep_stats_operations = {
 	.open	= lockdep_stats_open,
 	.read	= seq_read,
 	.llseek	= seq_lseek,
diff --git a/kernel/module.c b/kernel/module.c
index e2d09d604ca0..d9eae45d0145 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2209,7 +2209,7 @@ static int m_show(struct seq_file *m, void *p)
    Where refcount is a number or -, and deps is a comma-separated list
    of depends or -.
 */
-struct seq_operations modules_op = {
+const struct seq_operations modules_op = {
 	.start	= m_start,
 	.next	= m_next,
 	.stop	= m_stop,
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 18651641a7b5..841539d72c55 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 
 void debug_mutex_unlock(struct mutex *lock)
 {
+	if (unlikely(!debug_locks))
+		return;
+
 	DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
 	DEBUG_LOCKS_WARN_ON(lock->magic != lock);
 	DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
diff --git a/kernel/pid.c b/kernel/pid.c
index b914392085f9..a48879b0b921 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -31,7 +31,7 @@
 #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
 static struct hlist_head *pid_hash;
 static int pidhash_shift;
-static kmem_cache_t *pid_cachep;
+static struct kmem_cache *pid_cachep;
 
 int pid_max = PID_MAX_DEFAULT;
 
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9cbb5d1be06f..5fe87de10ff0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -70,7 +70,7 @@
 /*
  * Lets keep our timers in a slab cache :-)
  */
-static kmem_cache_t *posix_timers_cache;
+static struct kmem_cache *posix_timers_cache;
 static struct idr posix_timers_id;
 static DEFINE_SPINLOCK(idr_lock);
 
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 825068ca3479..710ed084e7c5 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -78,7 +78,7 @@ config PM_SYSFS_DEPRECATED
 
 config SOFTWARE_SUSPEND
 	bool "Software Suspend"
-	depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP))
+	depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
 	---help---
 	  Enable the possibility of suspending the machine.
 	  It doesn't need ACPI or APM.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index b1fb7866b0b3..0b00f56c2ad0 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -20,6 +20,7 @@
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/console.h> 21#include <linux/console.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/freezer.h>
23 24
24#include "power.h" 25#include "power.h"
25 26
@@ -27,6 +28,23 @@
27static int noresume = 0; 28static int noresume = 0;
28char resume_file[256] = CONFIG_PM_STD_PARTITION; 29char resume_file[256] = CONFIG_PM_STD_PARTITION;
29dev_t swsusp_resume_device; 30dev_t swsusp_resume_device;
31sector_t swsusp_resume_block;
32
33/**
34 * platform_prepare - prepare the machine for hibernation using the
35 * platform driver if so configured and return an error code if it fails
36 */
37
38static inline int platform_prepare(void)
39{
40 int error = 0;
41
42 if (pm_disk_mode == PM_DISK_PLATFORM) {
43 if (pm_ops && pm_ops->prepare)
44 error = pm_ops->prepare(PM_SUSPEND_DISK);
45 }
46 return error;
47}
30 48
31/** 49/**
32 * power_down - Shut machine down for hibernate. 50 * power_down - Shut machine down for hibernate.
@@ -40,12 +58,10 @@ dev_t swsusp_resume_device;
40 58
41static void power_down(suspend_disk_method_t mode) 59static void power_down(suspend_disk_method_t mode)
42{ 60{
43 int error = 0;
44
45 switch(mode) { 61 switch(mode) {
46 case PM_DISK_PLATFORM: 62 case PM_DISK_PLATFORM:
47 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 63 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
48 error = pm_ops->enter(PM_SUSPEND_DISK); 64 pm_ops->enter(PM_SUSPEND_DISK);
49 break; 65 break;
50 case PM_DISK_SHUTDOWN: 66 case PM_DISK_SHUTDOWN:
51 kernel_power_off(); 67 kernel_power_off();
@@ -90,12 +106,18 @@ static int prepare_processes(void)
90 goto thaw; 106 goto thaw;
91 } 107 }
92 108
109 error = platform_prepare();
110 if (error)
111 goto thaw;
112
93 /* Free memory before shutting down devices. */ 113 /* Free memory before shutting down devices. */
94 if (!(error = swsusp_shrink_memory())) 114 if (!(error = swsusp_shrink_memory()))
95 return 0; 115 return 0;
96thaw: 116
117 platform_finish();
118 thaw:
97 thaw_processes(); 119 thaw_processes();
98enable_cpus: 120 enable_cpus:
99 enable_nonboot_cpus(); 121 enable_nonboot_cpus();
100 pm_restore_console(); 122 pm_restore_console();
101 return error; 123 return error;
@@ -127,7 +149,7 @@ int pm_suspend_disk(void)
127 return error; 149 return error;
128 150
129 if (pm_disk_mode == PM_DISK_TESTPROC) 151 if (pm_disk_mode == PM_DISK_TESTPROC)
130 goto Thaw; 152 return 0;
131 153
132 suspend_console(); 154 suspend_console();
133 error = device_suspend(PMSG_FREEZE); 155 error = device_suspend(PMSG_FREEZE);
@@ -189,10 +211,10 @@ static int software_resume(void)
189{ 211{
190 int error; 212 int error;
191 213
192 down(&pm_sem); 214 mutex_lock(&pm_mutex);
193 if (!swsusp_resume_device) { 215 if (!swsusp_resume_device) {
194 if (!strlen(resume_file)) { 216 if (!strlen(resume_file)) {
195 up(&pm_sem); 217 mutex_unlock(&pm_mutex);
196 return -ENOENT; 218 return -ENOENT;
197 } 219 }
198 swsusp_resume_device = name_to_dev_t(resume_file); 220 swsusp_resume_device = name_to_dev_t(resume_file);
@@ -207,7 +229,7 @@ static int software_resume(void)
207 * FIXME: If noresume is specified, we need to find the partition 229 * FIXME: If noresume is specified, we need to find the partition
208 * and reset it back to normal swap space. 230 * and reset it back to normal swap space.
209 */ 231 */
210 up(&pm_sem); 232 mutex_unlock(&pm_mutex);
211 return 0; 233 return 0;
212 } 234 }
213 235
@@ -251,7 +273,7 @@ static int software_resume(void)
251 unprepare_processes(); 273 unprepare_processes();
252 Done: 274 Done:
253 /* For success case, the suspend path will release the lock */ 275 /* For success case, the suspend path will release the lock */
254 up(&pm_sem); 276 mutex_unlock(&pm_mutex);
255 pr_debug("PM: Resume from disk failed.\n"); 277 pr_debug("PM: Resume from disk failed.\n");
256 return 0; 278 return 0;
257} 279}
@@ -312,7 +334,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
312 p = memchr(buf, '\n', n); 334 p = memchr(buf, '\n', n);
313 len = p ? p - buf : n; 335 len = p ? p - buf : n;
314 336
315 down(&pm_sem); 337 mutex_lock(&pm_mutex);
316 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { 338 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
317 if (!strncmp(buf, pm_disk_modes[i], len)) { 339 if (!strncmp(buf, pm_disk_modes[i], len)) {
318 mode = i; 340 mode = i;
@@ -336,7 +358,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
336 358
337 pr_debug("PM: suspend-to-disk mode set to '%s'\n", 359 pr_debug("PM: suspend-to-disk mode set to '%s'\n",
338 pm_disk_modes[mode]); 360 pm_disk_modes[mode]);
339 up(&pm_sem); 361 mutex_unlock(&pm_mutex);
340 return error ? error : n; 362 return error ? error : n;
341} 363}
342 364
@@ -361,14 +383,14 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
361 if (maj != MAJOR(res) || min != MINOR(res)) 383 if (maj != MAJOR(res) || min != MINOR(res))
362 goto out; 384 goto out;
363 385
364 down(&pm_sem); 386 mutex_lock(&pm_mutex);
365 swsusp_resume_device = res; 387 swsusp_resume_device = res;
366 up(&pm_sem); 388 mutex_unlock(&pm_mutex);
367 printk("Attempting manual resume\n"); 389 printk("Attempting manual resume\n");
368 noresume = 0; 390 noresume = 0;
369 software_resume(); 391 software_resume();
370 ret = n; 392 ret = n;
371out: 393 out:
372 return ret; 394 return ret;
373} 395}
374 396
@@ -423,6 +445,19 @@ static int __init resume_setup(char *str)
423 return 1; 445 return 1;
424} 446}
425 447
448static int __init resume_offset_setup(char *str)
449{
450 unsigned long long offset;
451
452 if (noresume)
453 return 1;
454
455 if (sscanf(str, "%llu", &offset) == 1)
456 swsusp_resume_block = offset;
457
458 return 1;
459}
460
426static int __init noresume_setup(char *str) 461static int __init noresume_setup(char *str)
427{ 462{
428 noresume = 1; 463 noresume = 1;
@@ -430,4 +465,5 @@ static int __init noresume_setup(char *str)
430} 465}
431 466
432__setup("noresume", noresume_setup); 467__setup("noresume", noresume_setup);
468__setup("resume_offset=", resume_offset_setup);
433__setup("resume=", resume_setup); 469__setup("resume=", resume_setup);
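
resume_offset_setup() follows the usual __setup() convention: parse the value, store it, and return 1 so the option is treated as consumed. A sketch of the same convention under an illustrative name:

#include <linux/init.h>
#include <linux/kernel.h>

static unsigned long long example_offset;

static int __init example_offset_setup(char *str)
{
	unsigned long long offset;

	if (sscanf(str, "%llu", &offset) == 1)
		example_offset = offset;

	return 1;	/* option consumed, do not pass it on to init */
}
__setup("example_offset=", example_offset_setup);

Booting with example_offset=12345 on the command line would then populate the variable before initcalls run, which is how resume_offset= feeds swsusp_resume_block above.
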
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 873228c71dab..500eb87f643d 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/module.h>
11#include <linux/suspend.h> 12#include <linux/suspend.h>
12#include <linux/kobject.h> 13#include <linux/kobject.h>
13#include <linux/string.h> 14#include <linux/string.h>
@@ -18,13 +19,14 @@
18#include <linux/console.h> 19#include <linux/console.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/resume-trace.h> 21#include <linux/resume-trace.h>
22#include <linux/freezer.h>
21 23
22#include "power.h" 24#include "power.h"
23 25
24/*This is just an arbitrary number */ 26/*This is just an arbitrary number */
25#define FREE_PAGE_NUMBER (100) 27#define FREE_PAGE_NUMBER (100)
26 28
27DECLARE_MUTEX(pm_sem); 29DEFINE_MUTEX(pm_mutex);
28 30
29struct pm_ops *pm_ops; 31struct pm_ops *pm_ops;
30suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; 32suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
@@ -36,9 +38,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
36 38
37void pm_set_ops(struct pm_ops * ops) 39void pm_set_ops(struct pm_ops * ops)
38{ 40{
39 down(&pm_sem); 41 mutex_lock(&pm_mutex);
40 pm_ops = ops; 42 pm_ops = ops;
41 up(&pm_sem); 43 mutex_unlock(&pm_mutex);
42} 44}
43 45
44 46
@@ -182,7 +184,7 @@ static int enter_state(suspend_state_t state)
182 184
183 if (!valid_state(state)) 185 if (!valid_state(state))
184 return -ENODEV; 186 return -ENODEV;
185 if (down_trylock(&pm_sem)) 187 if (!mutex_trylock(&pm_mutex))
186 return -EBUSY; 188 return -EBUSY;
187 189
188 if (state == PM_SUSPEND_DISK) { 190 if (state == PM_SUSPEND_DISK) {
@@ -200,7 +202,7 @@ static int enter_state(suspend_state_t state)
200 pr_debug("PM: Finishing wakeup.\n"); 202 pr_debug("PM: Finishing wakeup.\n");
201 suspend_finish(state); 203 suspend_finish(state);
202 Unlock: 204 Unlock:
203 up(&pm_sem); 205 mutex_unlock(&pm_mutex);
204 return error; 206 return error;
205} 207}
206 208
@@ -229,7 +231,7 @@ int pm_suspend(suspend_state_t state)
229 return -EINVAL; 231 return -EINVAL;
230} 232}
231 233
232 234EXPORT_SYMBOL(pm_suspend);
233 235
234decl_subsys(power,NULL,NULL); 236decl_subsys(power,NULL,NULL);
235 237
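
Note the inverted test in enter_state(): down_trylock() returns nonzero on failure, while mutex_trylock() returns nonzero on success, so the conversion flips the condition. A minimal sketch of the converted pattern (example_mutex and example_enter are illustrative):

#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(example_mutex);

static int example_enter(void)
{
	/* mutex_trylock() returns 1 on success and 0 on contention --
	 * the opposite sense of the old down_trylock().
	 */
	if (!mutex_trylock(&example_mutex))
		return -EBUSY;

	/* ... critical section ... */

	mutex_unlock(&example_mutex);
	return 0;
}
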
diff --git a/kernel/power/power.h b/kernel/power/power.h
index bfe999f7b272..eb461b816bf4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void)
22 return -EPERM; 22 return -EPERM;
23} 23}
24#endif 24#endif
25extern struct semaphore pm_sem; 25
26extern struct mutex pm_mutex;
27
26#define power_attr(_name) \ 28#define power_attr(_name) \
27static struct subsys_attribute _name##_attr = { \ 29static struct subsys_attribute _name##_attr = { \
28 .attr = { \ 30 .attr = { \
@@ -42,6 +44,7 @@ extern const void __nosave_begin, __nosave_end;
42extern unsigned long image_size; 44extern unsigned long image_size;
43extern int in_suspend; 45extern int in_suspend;
44extern dev_t swsusp_resume_device; 46extern dev_t swsusp_resume_device;
47extern sector_t swsusp_resume_block;
45 48
46extern asmlinkage int swsusp_arch_suspend(void); 49extern asmlinkage int swsusp_arch_suspend(void);
47extern asmlinkage int swsusp_arch_resume(void); 50extern asmlinkage int swsusp_arch_resume(void);
@@ -102,8 +105,18 @@ struct snapshot_handle {
102extern unsigned int snapshot_additional_pages(struct zone *zone); 105extern unsigned int snapshot_additional_pages(struct zone *zone);
103extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 106extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
104extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 107extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
108extern void snapshot_write_finalize(struct snapshot_handle *handle);
105extern int snapshot_image_loaded(struct snapshot_handle *handle); 109extern int snapshot_image_loaded(struct snapshot_handle *handle);
106extern void snapshot_free_unused_memory(struct snapshot_handle *handle); 110
111/*
112 * This structure is used to pass the values needed for the identification
113 * of the resume swap area from a user space to the kernel via the
114 * SNAPSHOT_SET_SWAP_AREA ioctl
115 */
116struct resume_swap_area {
117 loff_t offset;
118 u_int32_t dev;
119} __attribute__((packed));
107 120
108#define SNAPSHOT_IOC_MAGIC '3' 121#define SNAPSHOT_IOC_MAGIC '3'
109#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) 122#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
@@ -117,7 +130,14 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
117#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) 130#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
118#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) 131#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
119#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) 132#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
120#define SNAPSHOT_IOC_MAXNR 11 133#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
134#define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \
135 struct resume_swap_area)
136#define SNAPSHOT_IOC_MAXNR 13
137
138#define PMOPS_PREPARE 1
139#define PMOPS_ENTER 2
140#define PMOPS_FINISH 3
121 141
122/** 142/**
123 * The bitmap is used for tracing allocated swap pages 143 * The bitmap is used for tracing allocated swap pages
@@ -141,7 +161,7 @@ struct bitmap_page {
141 161
142extern void free_bitmap(struct bitmap_page *bitmap); 162extern void free_bitmap(struct bitmap_page *bitmap);
143extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); 163extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
144extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); 164extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap);
145extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); 165extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
146 166
147extern int swsusp_check(void); 167extern int swsusp_check(void);
@@ -153,3 +173,7 @@ extern int swsusp_read(void);
153extern int swsusp_write(void); 173extern int swsusp_write(void);
154extern void swsusp_close(void); 174extern void swsusp_close(void);
155extern int suspend_enter(suspend_state_t state); 175extern int suspend_enter(suspend_state_t state);
176
177struct timeval;
178extern void swsusp_show_speed(struct timeval *, struct timeval *,
179 unsigned int, char *);
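
The new SNAPSHOT_SET_SWAP_AREA ioctl carries the packed resume_swap_area structure in from user space. A rough user-space sketch of issuing it — /dev/snapshot is the usual node for this interface, and the device numbers below are purely illustrative:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/sysmacros.h>
#include <linux/ioctl.h>

/* Layout-compatible mirror of the kernel's packed structure */
struct resume_swap_area {
	long long offset;		/* loff_t in the kernel header */
	unsigned int dev;		/* u_int32_t in the kernel header */
} __attribute__((packed));

#define SNAPSHOT_IOC_MAGIC	'3'
#define SNAPSHOT_SET_SWAP_AREA	_IOW(SNAPSHOT_IOC_MAGIC, 13, \
					struct resume_swap_area)

int main(void)
{
	struct resume_swap_area swap_area = {
		.offset	= 0,			/* 0 = start of a swap partition */
		.dev	= makedev(8, 2),	/* e.g. /dev/sda2; illustrative */
	};
	int fd = open("/dev/snapshot", O_RDONLY);

	if (fd < 0 || ioctl(fd, SNAPSHOT_SET_SWAP_AREA, &swap_area)) {
		perror("SNAPSHOT_SET_SWAP_AREA");
		return 1;
	}
	close(fd);
	return 0;
}
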
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index f1f900ac3164..678ec736076b 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -16,12 +16,12 @@
16 * callback we use. 16 * callback we use.
17 */ 17 */
18 18
19static void do_poweroff(void *dummy) 19static void do_poweroff(struct work_struct *dummy)
20{ 20{
21 kernel_power_off(); 21 kernel_power_off();
22} 22}
23 23
24static DECLARE_WORK(poweroff_work, do_poweroff, NULL); 24static DECLARE_WORK(poweroff_work, do_poweroff);
25 25
26static void handle_poweroff(int key, struct tty_struct *tty) 26static void handle_poweroff(int key, struct tty_struct *tty)
27{ 27{
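
do_poweroff() takes no payload, so its conversion to the reworked workqueue API is mechanical; handlers that did carry data now embed the work_struct in their context and recover it with container_of(). A sketch of that general pattern with illustrative names:

#include <linux/workqueue.h>
#include <linux/kernel.h>

struct example_ctx {
	int value;
	struct work_struct work;
};

static void example_handler(struct work_struct *work)
{
	/* The old void *data argument is gone; the handler gets the
	 * work item itself and digs out its container.
	 */
	struct example_ctx *ctx = container_of(work, struct example_ctx, work);

	printk(KERN_INFO "example: value=%d\n", ctx->value);
}

static void example_submit(struct example_ctx *ctx)
{
	INIT_WORK(&ctx->work, example_handler);
	schedule_work(&ctx->work);
}
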
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 72e72d2c61e6..99eeb119b06d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -13,12 +13,15 @@
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h>
16 17
17/* 18/*
18 * Timeout for stopping processes 19 * Timeout for stopping processes
19 */ 20 */
20#define TIMEOUT (20 * HZ) 21#define TIMEOUT (20 * HZ)
21 22
23#define FREEZER_KERNEL_THREADS 0
24#define FREEZER_USER_SPACE 1
22 25
23static inline int freezeable(struct task_struct * p) 26static inline int freezeable(struct task_struct * p)
24{ 27{
@@ -39,7 +42,6 @@ void refrigerator(void)
39 long save; 42 long save;
40 save = current->state; 43 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm); 44 pr_debug("%s entered refrigerator\n", current->comm);
42 printk("=");
43 45
44 frozen_process(current); 46 frozen_process(current);
45 spin_lock_irq(&current->sighand->siglock); 47 spin_lock_irq(&current->sighand->siglock);
@@ -79,96 +81,136 @@ static void cancel_freezing(struct task_struct *p)
79 } 81 }
80} 82}
81 83
82/* 0 = success, else # of processes that we failed to stop */ 84static inline int is_user_space(struct task_struct *p)
83int freeze_processes(void) 85{
86 return p->mm && !(p->flags & PF_BORROWED_MM);
87}
88
89static unsigned int try_to_freeze_tasks(int freeze_user_space)
84{ 90{
85 int todo, nr_user, user_frozen;
86 unsigned long start_time;
87 struct task_struct *g, *p; 91 struct task_struct *g, *p;
92 unsigned long end_time;
93 unsigned int todo;
88 94
89 printk( "Stopping tasks: " ); 95 end_time = jiffies + TIMEOUT;
90 start_time = jiffies;
91 user_frozen = 0;
92 do { 96 do {
93 nr_user = todo = 0; 97 todo = 0;
94 read_lock(&tasklist_lock); 98 read_lock(&tasklist_lock);
95 do_each_thread(g, p) { 99 do_each_thread(g, p) {
96 if (!freezeable(p)) 100 if (!freezeable(p))
97 continue; 101 continue;
102
98 if (frozen(p)) 103 if (frozen(p))
99 continue; 104 continue;
100 if (p->state == TASK_TRACED && frozen(p->parent)) { 105
106 if (p->state == TASK_TRACED &&
107 (frozen(p->parent) ||
108 p->parent->state == TASK_STOPPED)) {
101 cancel_freezing(p); 109 cancel_freezing(p);
102 continue; 110 continue;
103 } 111 }
104 if (p->mm && !(p->flags & PF_BORROWED_MM)) { 112 if (is_user_space(p)) {
105 /* The task is a user-space one. 113 if (!freeze_user_space)
106 * Freeze it unless there's a vfork completion 114 continue;
107 * pending 115
116 /* Freeze the task unless there is a vfork
117 * completion pending
108 */ 118 */
109 if (!p->vfork_done) 119 if (!p->vfork_done)
110 freeze_process(p); 120 freeze_process(p);
111 nr_user++;
112 } else { 121 } else {
113 /* Freeze only if the user space is frozen */ 122 if (freeze_user_space)
114 if (user_frozen) 123 continue;
115 freeze_process(p); 124
116 todo++; 125 freeze_process(p);
117 } 126 }
127 todo++;
118 } while_each_thread(g, p); 128 } while_each_thread(g, p);
119 read_unlock(&tasklist_lock); 129 read_unlock(&tasklist_lock);
120 todo += nr_user;
121 if (!user_frozen && !nr_user) {
122 sys_sync();
123 start_time = jiffies;
124 }
125 user_frozen = !nr_user;
126 yield(); /* Yield is okay here */ 130 yield(); /* Yield is okay here */
127 if (todo && time_after(jiffies, start_time + TIMEOUT)) 131 if (todo && time_after(jiffies, end_time))
128 break; 132 break;
129 } while(todo); 133 } while (todo);
130 134
131 /* This does not unfreeze processes that are already frozen
132 * (we have slightly ugly calling convention in that respect,
133 * and caller must call thaw_processes() if something fails),
134 * but it cleans up leftover PF_FREEZE requests.
135 */
136 if (todo) { 135 if (todo) {
137 printk( "\n" ); 136 /* This does not unfreeze processes that are already frozen
138 printk(KERN_ERR " stopping tasks timed out " 137 * (we have slightly ugly calling convention in that respect,
139 "after %d seconds (%d tasks remaining):\n", 138 * and caller must call thaw_processes() if something fails),
140 TIMEOUT / HZ, todo); 139 * but it cleans up leftover PF_FREEZE requests.
140 */
141 printk("\n");
142 printk(KERN_ERR "Stopping %s timed out after %d seconds "
143 "(%d tasks refusing to freeze):\n",
144 freeze_user_space ? "user space processes" :
145 "kernel threads",
146 TIMEOUT / HZ, todo);
141 read_lock(&tasklist_lock); 147 read_lock(&tasklist_lock);
142 do_each_thread(g, p) { 148 do_each_thread(g, p) {
149 if (is_user_space(p) == !freeze_user_space)
150 continue;
151
143 if (freezeable(p) && !frozen(p)) 152 if (freezeable(p) && !frozen(p))
144 printk(KERN_ERR " %s\n", p->comm); 153 printk(KERN_ERR " %s\n", p->comm);
154
145 cancel_freezing(p); 155 cancel_freezing(p);
146 } while_each_thread(g, p); 156 } while_each_thread(g, p);
147 read_unlock(&tasklist_lock); 157 read_unlock(&tasklist_lock);
148 return todo;
149 } 158 }
150 159
151 printk( "|\n" ); 160 return todo;
161}
162
163/**
164 * freeze_processes - tell processes to enter the refrigerator
165 *
166 * Returns 0 on success, or the number of processes that didn't freeze,
167 * although they were told to.
168 */
169int freeze_processes(void)
170{
171 unsigned int nr_unfrozen;
172
173 printk("Stopping tasks ... ");
174 nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE);
175 if (nr_unfrozen)
176 return nr_unfrozen;
177
178 sys_sync();
179 nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
180 if (nr_unfrozen)
181 return nr_unfrozen;
182
183 printk("done.\n");
152 BUG_ON(in_atomic()); 184 BUG_ON(in_atomic());
153 return 0; 185 return 0;
154} 186}
155 187
156void thaw_processes(void) 188static void thaw_tasks(int thaw_user_space)
157{ 189{
158 struct task_struct *g, *p; 190 struct task_struct *g, *p;
159 191
160 printk( "Restarting tasks..." );
161 read_lock(&tasklist_lock); 192 read_lock(&tasklist_lock);
162 do_each_thread(g, p) { 193 do_each_thread(g, p) {
163 if (!freezeable(p)) 194 if (!freezeable(p))
164 continue; 195 continue;
196
197 if (is_user_space(p) == !thaw_user_space)
198 continue;
199
165 if (!thaw_process(p)) 200 if (!thaw_process(p))
166 printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); 201 printk(KERN_WARNING " Strange, %s not stopped\n",
202 p->comm );
167 } while_each_thread(g, p); 203 } while_each_thread(g, p);
168
169 read_unlock(&tasklist_lock); 204 read_unlock(&tasklist_lock);
205}
206
207void thaw_processes(void)
208{
209 printk("Restarting tasks ... ");
210 thaw_tasks(FREEZER_KERNEL_THREADS);
211 thaw_tasks(FREEZER_USER_SPACE);
170 schedule(); 212 schedule();
171 printk( " done\n" ); 213 printk("done.\n");
172} 214}
173 215
174EXPORT_SYMBOL(refrigerator); 216EXPORT_SYMBOL(refrigerator);
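
With user space and kernel threads now frozen in separate passes, a cooperating kernel thread still only needs to poll try_to_freeze() in its main loop; refrigerator() parks it until thaw_processes() runs. A sketch of such a loop (example_thread is illustrative):

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int example_thread(void *unused)
{
	while (!kthread_should_stop()) {
		try_to_freeze();	/* sits in refrigerator() while freezing */

		/* ... do one unit of work ... */

		schedule_timeout_interruptible(HZ);
	}
	return 0;
}

A thread started with kthread_run(example_thread, NULL, "example") would then be stopped in the kernel-thread pass of try_to_freeze_tasks() above.
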
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 99f9b7d177d6..c024606221c4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,15 +1,15 @@
1/* 1/*
2 * linux/kernel/power/snapshot.c 2 * linux/kernel/power/snapshot.c
3 * 3 *
4 * This file provide system snapshot/restore functionality. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 * 8 *
8 * This file is released under the GPLv2, and is based on swsusp.c. 9 * This file is released under the GPLv2.
9 * 10 *
10 */ 11 */
11 12
12
13#include <linux/version.h> 13#include <linux/version.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
@@ -34,137 +34,24 @@
34 34
35#include "power.h" 35#include "power.h"
36 36
37/* List of PBEs used for creating and restoring the suspend image */ 37/* List of PBEs needed for restoring the pages that were allocated before
38 * the suspend and included in the suspend image, but have also been
39 * allocated by the "resume" kernel, so their contents cannot be written
40 * directly to their "original" page frames.
41 */
38struct pbe *restore_pblist; 42struct pbe *restore_pblist;
39 43
40static unsigned int nr_copy_pages; 44/* Pointer to an auxiliary buffer (1 page) */
41static unsigned int nr_meta_pages;
42static void *buffer; 45static void *buffer;
43 46
44#ifdef CONFIG_HIGHMEM
45unsigned int count_highmem_pages(void)
46{
47 struct zone *zone;
48 unsigned long zone_pfn;
49 unsigned int n = 0;
50
51 for_each_zone (zone)
52 if (is_highmem(zone)) {
53 mark_free_pages(zone);
54 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
55 struct page *page;
56 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
57 if (!pfn_valid(pfn))
58 continue;
59 page = pfn_to_page(pfn);
60 if (PageReserved(page))
61 continue;
62 if (PageNosaveFree(page))
63 continue;
64 n++;
65 }
66 }
67 return n;
68}
69
70struct highmem_page {
71 char *data;
72 struct page *page;
73 struct highmem_page *next;
74};
75
76static struct highmem_page *highmem_copy;
77
78static int save_highmem_zone(struct zone *zone)
79{
80 unsigned long zone_pfn;
81 mark_free_pages(zone);
82 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
83 struct page *page;
84 struct highmem_page *save;
85 void *kaddr;
86 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
87
88 if (!(pfn%10000))
89 printk(".");
90 if (!pfn_valid(pfn))
91 continue;
92 page = pfn_to_page(pfn);
93 /*
94 * This condition results from rvmalloc() sans vmalloc_32()
95 * and architectural memory reservations. This should be
96 * corrected eventually when the cases giving rise to this
97 * are better understood.
98 */
99 if (PageReserved(page))
100 continue;
101 BUG_ON(PageNosave(page));
102 if (PageNosaveFree(page))
103 continue;
104 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
105 if (!save)
106 return -ENOMEM;
107 save->next = highmem_copy;
108 save->page = page;
109 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
110 if (!save->data) {
111 kfree(save);
112 return -ENOMEM;
113 }
114 kaddr = kmap_atomic(page, KM_USER0);
115 memcpy(save->data, kaddr, PAGE_SIZE);
116 kunmap_atomic(kaddr, KM_USER0);
117 highmem_copy = save;
118 }
119 return 0;
120}
121
122int save_highmem(void)
123{
124 struct zone *zone;
125 int res = 0;
126
127 pr_debug("swsusp: Saving Highmem");
128 drain_local_pages();
129 for_each_zone (zone) {
130 if (is_highmem(zone))
131 res = save_highmem_zone(zone);
132 if (res)
133 return res;
134 }
135 printk("\n");
136 return 0;
137}
138
139int restore_highmem(void)
140{
141 printk("swsusp: Restoring Highmem\n");
142 while (highmem_copy) {
143 struct highmem_page *save = highmem_copy;
144 void *kaddr;
145 highmem_copy = save->next;
146
147 kaddr = kmap_atomic(save->page, KM_USER0);
148 memcpy(kaddr, save->data, PAGE_SIZE);
149 kunmap_atomic(kaddr, KM_USER0);
150 free_page((long) save->data);
151 kfree(save);
152 }
153 return 0;
154}
155#else
156static inline unsigned int count_highmem_pages(void) {return 0;}
157static inline int save_highmem(void) {return 0;}
158static inline int restore_highmem(void) {return 0;}
159#endif
160
161/** 47/**
162 * @safe_needed - on resume, for storing the PBE list and the image, 48 * @safe_needed - on resume, for storing the PBE list and the image,
163 * we can only use memory pages that do not conflict with the pages 49 * we can only use memory pages that do not conflict with the pages
164 * used before suspend. 50 * used before suspend. The unsafe pages have PageNosaveFree set
51 * and we count them using unsafe_pages.
165 * 52 *
166 * The unsafe pages are marked with the PG_nosave_free flag 53 * Each allocated image page is marked as PageNosave and PageNosaveFree
167 * and we count them using unsafe_pages 54 * so that swsusp_free() can release it.
168 */ 55 */
169 56
170#define PG_ANY 0 57#define PG_ANY 0
@@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;}
174 61
175static unsigned int allocated_unsafe_pages; 62static unsigned int allocated_unsafe_pages;
176 63
177static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 64static void *get_image_page(gfp_t gfp_mask, int safe_needed)
178{ 65{
179 void *res; 66 void *res;
180 67
@@ -195,20 +82,39 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
195 82
196unsigned long get_safe_page(gfp_t gfp_mask) 83unsigned long get_safe_page(gfp_t gfp_mask)
197{ 84{
198 return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); 85 return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
86}
87
88static struct page *alloc_image_page(gfp_t gfp_mask)
89{
90 struct page *page;
91
92 page = alloc_page(gfp_mask);
93 if (page) {
94 SetPageNosave(page);
95 SetPageNosaveFree(page);
96 }
97 return page;
199} 98}
200 99
201/** 100/**
202 * free_image_page - free page represented by @addr, allocated with 101 * free_image_page - free page represented by @addr, allocated with
203 * alloc_image_page (page flags set by it must be cleared) 102 * get_image_page (page flags set by it must be cleared)
204 */ 103 */
205 104
206static inline void free_image_page(void *addr, int clear_nosave_free) 105static inline void free_image_page(void *addr, int clear_nosave_free)
207{ 106{
208 ClearPageNosave(virt_to_page(addr)); 107 struct page *page;
108
109 BUG_ON(!virt_addr_valid(addr));
110
111 page = virt_to_page(addr);
112
113 ClearPageNosave(page);
209 if (clear_nosave_free) 114 if (clear_nosave_free)
210 ClearPageNosaveFree(virt_to_page(addr)); 115 ClearPageNosaveFree(page);
211 free_page((unsigned long)addr); 116
117 __free_page(page);
212} 118}
213 119
214/* struct linked_page is used to build chains of pages */ 120/* struct linked_page is used to build chains of pages */
@@ -269,7 +175,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
269 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { 175 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
270 struct linked_page *lp; 176 struct linked_page *lp;
271 177
272 lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); 178 lp = get_image_page(ca->gfp_mask, ca->safe_needed);
273 if (!lp) 179 if (!lp)
274 return NULL; 180 return NULL;
275 181
@@ -446,8 +352,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
446 352
447 /* Compute the number of zones */ 353 /* Compute the number of zones */
448 nr = 0; 354 nr = 0;
449 for_each_zone (zone) 355 for_each_zone(zone)
450 if (populated_zone(zone) && !is_highmem(zone)) 356 if (populated_zone(zone))
451 nr++; 357 nr++;
452 358
453 /* Allocate the list of zones bitmap objects */ 359 /* Allocate the list of zones bitmap objects */
@@ -459,10 +365,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
459 } 365 }
460 366
461 /* Initialize the zone bitmap objects */ 367 /* Initialize the zone bitmap objects */
462 for_each_zone (zone) { 368 for_each_zone(zone) {
463 unsigned long pfn; 369 unsigned long pfn;
464 370
465 if (!populated_zone(zone) || is_highmem(zone)) 371 if (!populated_zone(zone))
466 continue; 372 continue;
467 373
468 zone_bm->start_pfn = zone->zone_start_pfn; 374 zone_bm->start_pfn = zone->zone_start_pfn;
@@ -481,7 +387,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
481 while (bb) { 387 while (bb) {
482 unsigned long *ptr; 388 unsigned long *ptr;
483 389
484 ptr = alloc_image_page(gfp_mask, safe_needed); 390 ptr = get_image_page(gfp_mask, safe_needed);
485 bb->data = ptr; 391 bb->data = ptr;
486 if (!ptr) 392 if (!ptr)
487 goto Free; 393 goto Free;
@@ -505,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
505 memory_bm_position_reset(bm); 411 memory_bm_position_reset(bm);
506 return 0; 412 return 0;
507 413
508Free: 414 Free:
509 bm->p_list = ca.chain; 415 bm->p_list = ca.chain;
510 memory_bm_free(bm, PG_UNSAFE_CLEAR); 416 memory_bm_free(bm, PG_UNSAFE_CLEAR);
511 return -ENOMEM; 417 return -ENOMEM;
@@ -651,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
651 memory_bm_position_reset(bm); 557 memory_bm_position_reset(bm);
652 return BM_END_OF_MAP; 558 return BM_END_OF_MAP;
653 559
654Return_pfn: 560 Return_pfn:
655 bm->cur.chunk = chunk; 561 bm->cur.chunk = chunk;
656 bm->cur.bit = bit; 562 bm->cur.bit = bit;
657 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; 563 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
@@ -669,10 +575,82 @@ unsigned int snapshot_additional_pages(struct zone *zone)
669 575
670 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 576 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
671 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); 577 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
672 return res; 578 return 2 * res;
579}
580
581#ifdef CONFIG_HIGHMEM
582/**
583 * count_free_highmem_pages - compute the total number of free highmem
584 * pages, system-wide.
585 */
586
587static unsigned int count_free_highmem_pages(void)
588{
589 struct zone *zone;
590 unsigned int cnt = 0;
591
592 for_each_zone(zone)
593 if (populated_zone(zone) && is_highmem(zone))
594 cnt += zone->free_pages;
595
596 return cnt;
597}
598
599/**
600 * saveable_highmem_page - Determine whether a highmem page should be
601 * included in the suspend image.
602 *
603 * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
604 * and it isn't a part of a free chunk of pages.
605 */
606
607static struct page *saveable_highmem_page(unsigned long pfn)
608{
609 struct page *page;
610
611 if (!pfn_valid(pfn))
612 return NULL;
613
614 page = pfn_to_page(pfn);
615
616 BUG_ON(!PageHighMem(page));
617
618 if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page))
619 return NULL;
620
621 return page;
673} 622}
674 623
675/** 624/**
625 * count_highmem_pages - compute the total number of saveable highmem
626 * pages.
627 */
628
629unsigned int count_highmem_pages(void)
630{
631 struct zone *zone;
632 unsigned int n = 0;
633
634 for_each_zone(zone) {
635 unsigned long pfn, max_zone_pfn;
636
637 if (!is_highmem(zone))
638 continue;
639
640 mark_free_pages(zone);
641 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
642 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
643 if (saveable_highmem_page(pfn))
644 n++;
645 }
646 return n;
647}
648#else
649static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
650static inline unsigned int count_highmem_pages(void) { return 0; }
651#endif /* CONFIG_HIGHMEM */
652
653/**
676 * pfn_is_nosave - check if given pfn is in the 'nosave' section 654 * pfn_is_nosave - check if given pfn is in the 'nosave' section
677 */ 655 */
678 656
@@ -684,12 +662,12 @@ static inline int pfn_is_nosave(unsigned long pfn)
684} 662}
685 663
686/** 664/**
687 * saveable - Determine whether a page should be cloned or not. 665 * saveable - Determine whether a non-highmem page should be included in
688 * @pfn: The page 666 * the suspend image.
689 * 667 *
690 * We save a page if it isn't Nosave, and is not in the range of pages 668 * We should save the page if it isn't Nosave, and is not in the range
691 * statically defined as 'unsaveable', and it 669 * of pages statically defined as 'unsaveable', and it isn't a part of
692 * isn't a part of a free chunk of pages. 670 * a free chunk of pages.
693 */ 671 */
694 672
695static struct page *saveable_page(unsigned long pfn) 673static struct page *saveable_page(unsigned long pfn)
@@ -701,76 +679,130 @@ static struct page *saveable_page(unsigned long pfn)
701 679
702 page = pfn_to_page(pfn); 680 page = pfn_to_page(pfn);
703 681
704 if (PageNosave(page)) 682 BUG_ON(PageHighMem(page));
683
684 if (PageNosave(page) || PageNosaveFree(page))
705 return NULL; 685 return NULL;
686
706 if (PageReserved(page) && pfn_is_nosave(pfn)) 687 if (PageReserved(page) && pfn_is_nosave(pfn))
707 return NULL; 688 return NULL;
708 if (PageNosaveFree(page))
709 return NULL;
710 689
711 return page; 690 return page;
712} 691}
713 692
693/**
694 * count_data_pages - compute the total number of saveable non-highmem
695 * pages.
696 */
697
714unsigned int count_data_pages(void) 698unsigned int count_data_pages(void)
715{ 699{
716 struct zone *zone; 700 struct zone *zone;
717 unsigned long pfn, max_zone_pfn; 701 unsigned long pfn, max_zone_pfn;
718 unsigned int n = 0; 702 unsigned int n = 0;
719 703
720 for_each_zone (zone) { 704 for_each_zone(zone) {
721 if (is_highmem(zone)) 705 if (is_highmem(zone))
722 continue; 706 continue;
707
723 mark_free_pages(zone); 708 mark_free_pages(zone);
724 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 709 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
725 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 710 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 726 n += !!saveable_page(pfn); 711 if (saveable_page(pfn))
712 n++;
727 } 713 }
728 return n; 714 return n;
729} 715}
730 716
731static inline void copy_data_page(long *dst, long *src) 717/* This is needed, because copy_page and memcpy are not usable for copying
718 * task structs.
719 */
720static inline void do_copy_page(long *dst, long *src)
732{ 721{
733 int n; 722 int n;
734 723
735 /* copy_page and memcpy are not usable for copying task structs. */
736 for (n = PAGE_SIZE / sizeof(long); n; n--) 724 for (n = PAGE_SIZE / sizeof(long); n; n--)
737 *dst++ = *src++; 725 *dst++ = *src++;
738} 726}
739 727
728#ifdef CONFIG_HIGHMEM
729static inline struct page *
730page_is_saveable(struct zone *zone, unsigned long pfn)
731{
732 return is_highmem(zone) ?
733 saveable_highmem_page(pfn) : saveable_page(pfn);
734}
735
736static inline void
737copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
738{
739 struct page *s_page, *d_page;
740 void *src, *dst;
741
742 s_page = pfn_to_page(src_pfn);
743 d_page = pfn_to_page(dst_pfn);
744 if (PageHighMem(s_page)) {
745 src = kmap_atomic(s_page, KM_USER0);
746 dst = kmap_atomic(d_page, KM_USER1);
747 do_copy_page(dst, src);
748 kunmap_atomic(src, KM_USER0);
749 kunmap_atomic(dst, KM_USER1);
750 } else {
751 src = page_address(s_page);
752 if (PageHighMem(d_page)) {
753 /* Page pointed to by src may contain some kernel
754 * data modified by kmap_atomic()
755 */
756 do_copy_page(buffer, src);
757 dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
758 memcpy(dst, buffer, PAGE_SIZE);
759 kunmap_atomic(dst, KM_USER0);
760 } else {
761 dst = page_address(d_page);
762 do_copy_page(dst, src);
763 }
764 }
765}
766#else
767#define page_is_saveable(zone, pfn) saveable_page(pfn)
768
769static inline void
770copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
771{
772 do_copy_page(page_address(pfn_to_page(dst_pfn)),
773 page_address(pfn_to_page(src_pfn)));
774}
775#endif /* CONFIG_HIGHMEM */
776
740static void 777static void
741copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) 778copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
742{ 779{
743 struct zone *zone; 780 struct zone *zone;
744 unsigned long pfn; 781 unsigned long pfn;
745 782
746 for_each_zone (zone) { 783 for_each_zone(zone) {
747 unsigned long max_zone_pfn; 784 unsigned long max_zone_pfn;
748 785
749 if (is_highmem(zone))
750 continue;
751
752 mark_free_pages(zone); 786 mark_free_pages(zone);
753 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 787 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
754 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 788 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
755 if (saveable_page(pfn)) 789 if (page_is_saveable(zone, pfn))
756 memory_bm_set_bit(orig_bm, pfn); 790 memory_bm_set_bit(orig_bm, pfn);
757 } 791 }
758 memory_bm_position_reset(orig_bm); 792 memory_bm_position_reset(orig_bm);
759 memory_bm_position_reset(copy_bm); 793 memory_bm_position_reset(copy_bm);
760 do { 794 do {
761 pfn = memory_bm_next_pfn(orig_bm); 795 pfn = memory_bm_next_pfn(orig_bm);
762 if (likely(pfn != BM_END_OF_MAP)) { 796 if (likely(pfn != BM_END_OF_MAP))
763 struct page *page; 797 copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
764 void *src;
765
766 page = pfn_to_page(pfn);
767 src = page_address(page);
768 page = pfn_to_page(memory_bm_next_pfn(copy_bm));
769 copy_data_page(page_address(page), src);
770 }
771 } while (pfn != BM_END_OF_MAP); 798 } while (pfn != BM_END_OF_MAP);
772} 799}
773 800
801/* Total number of image pages */
802static unsigned int nr_copy_pages;
803/* Number of pages needed for saving the original pfns of the image pages */
804static unsigned int nr_meta_pages;
805
774/** 806/**
775 * swsusp_free - free pages allocated for the suspend. 807 * swsusp_free - free pages allocated for the suspend.
776 * 808 *
@@ -792,7 +824,7 @@ void swsusp_free(void)
792 if (PageNosave(page) && PageNosaveFree(page)) { 824 if (PageNosave(page) && PageNosaveFree(page)) {
793 ClearPageNosave(page); 825 ClearPageNosave(page);
794 ClearPageNosaveFree(page); 826 ClearPageNosaveFree(page);
795 free_page((long) page_address(page)); 827 __free_page(page);
796 } 828 }
797 } 829 }
798 } 830 }
@@ -802,34 +834,108 @@ void swsusp_free(void)
802 buffer = NULL; 834 buffer = NULL;
803} 835}
804 836
837#ifdef CONFIG_HIGHMEM
838/**
839 * count_pages_for_highmem - compute the number of non-highmem pages
840 * that will be necessary for creating copies of highmem pages.
841 */
842
843static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
844{
845 unsigned int free_highmem = count_free_highmem_pages();
846
847 if (free_highmem >= nr_highmem)
848 nr_highmem = 0;
849 else
850 nr_highmem -= free_highmem;
851
852 return nr_highmem;
853}
854#else
855static unsigned int
856count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
857#endif /* CONFIG_HIGHMEM */
805 858
806/** 859/**
807 * enough_free_mem - Make sure we enough free memory to snapshot. 860 * enough_free_mem - Make sure we have enough free memory for the
808 * 861 * snapshot image.
809 * Returns TRUE or FALSE after checking the number of available
810 * free pages.
811 */ 862 */
812 863
813static int enough_free_mem(unsigned int nr_pages) 864static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
814{ 865{
815 struct zone *zone; 866 struct zone *zone;
816 unsigned int free = 0, meta = 0; 867 unsigned int free = 0, meta = 0;
817 868
818 for_each_zone (zone) 869 for_each_zone(zone) {
819 if (!is_highmem(zone)) { 870 meta += snapshot_additional_pages(zone);
871 if (!is_highmem(zone))
820 free += zone->free_pages; 872 free += zone->free_pages;
821 meta += snapshot_additional_pages(zone); 873 }
822 }
823 874
824 pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", 875 nr_pages += count_pages_for_highmem(nr_highmem);
876 pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n",
825 nr_pages, PAGES_FOR_IO, meta, free); 877 nr_pages, PAGES_FOR_IO, meta, free);
826 878
827 return free > nr_pages + PAGES_FOR_IO + meta; 879 return free > nr_pages + PAGES_FOR_IO + meta;
828} 880}
829 881
882#ifdef CONFIG_HIGHMEM
883/**
884 * get_highmem_buffer - if there are some highmem pages in the suspend
885 * image, we may need the buffer to copy them and/or load their data.
886 */
887
888static inline int get_highmem_buffer(int safe_needed)
889{
890 buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
891 return buffer ? 0 : -ENOMEM;
892}
893
894/**
895 * alloc_highmem_image_pages - allocate some highmem pages for the image.
896 * Try to allocate as many pages as needed, but if the number of free
 897 * highmem pages is less than that, allocate them all.
898 */
899
900static inline unsigned int
901alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
902{
903 unsigned int to_alloc = count_free_highmem_pages();
904
905 if (to_alloc > nr_highmem)
906 to_alloc = nr_highmem;
907
908 nr_highmem -= to_alloc;
909 while (to_alloc-- > 0) {
910 struct page *page;
911
912 page = alloc_image_page(__GFP_HIGHMEM);
913 memory_bm_set_bit(bm, page_to_pfn(page));
914 }
915 return nr_highmem;
916}
917#else
918static inline int get_highmem_buffer(int safe_needed) { return 0; }
919
920static inline unsigned int
921alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
922#endif /* CONFIG_HIGHMEM */
923
924/**
925 * swsusp_alloc - allocate memory for the suspend image
926 *
927 * We first try to allocate as many highmem pages as there are
928 * saveable highmem pages in the system. If that fails, we allocate
929 * non-highmem pages for the copies of the remaining highmem ones.
930 *
931 * In this approach it is likely that the copies of highmem pages will
932 * also be located in the high memory, because of the way in which
933 * copy_data_pages() works.
934 */
935
830static int 936static int
831swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 937swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
832 unsigned int nr_pages) 938 unsigned int nr_pages, unsigned int nr_highmem)
833{ 939{
834 int error; 940 int error;
835 941
@@ -841,46 +947,61 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
841 if (error) 947 if (error)
842 goto Free; 948 goto Free;
843 949
950 if (nr_highmem > 0) {
951 error = get_highmem_buffer(PG_ANY);
952 if (error)
953 goto Free;
954
955 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
956 }
844 while (nr_pages-- > 0) { 957 while (nr_pages-- > 0) {
845 struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD); 958 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
959
846 if (!page) 960 if (!page)
847 goto Free; 961 goto Free;
848 962
849 SetPageNosave(page);
850 SetPageNosaveFree(page);
851 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 963 memory_bm_set_bit(copy_bm, page_to_pfn(page));
852 } 964 }
853 return 0; 965 return 0;
854 966
855Free: 967 Free:
856 swsusp_free(); 968 swsusp_free();
857 return -ENOMEM; 969 return -ENOMEM;
858} 970}
859 971
860/* Memory bitmap used for marking saveable pages */ 972/* Memory bitmap used for marking saveable pages (during suspend) or the
973 * suspend image pages (during resume)
974 */
861static struct memory_bitmap orig_bm; 975static struct memory_bitmap orig_bm;
862/* Memory bitmap used for marking allocated pages that will contain the copies 976/* Memory bitmap used on suspend for marking allocated pages that will contain
863 * of saveable pages 977 * the copies of saveable pages. During resume it is initially used for
978 * marking the suspend image pages, but then its set bits are duplicated in
979 * @orig_bm and it is released. Next, on systems with high memory, it may be
980 * used for marking "safe" highmem pages, but it has to be reinitialized for
981 * this purpose.
864 */ 982 */
865static struct memory_bitmap copy_bm; 983static struct memory_bitmap copy_bm;
866 984
867asmlinkage int swsusp_save(void) 985asmlinkage int swsusp_save(void)
868{ 986{
869 unsigned int nr_pages; 987 unsigned int nr_pages, nr_highmem;
870 988
871 pr_debug("swsusp: critical section: \n"); 989 printk("swsusp: critical section: \n");
872 990
873 drain_local_pages(); 991 drain_local_pages();
874 nr_pages = count_data_pages(); 992 nr_pages = count_data_pages();
875 printk("swsusp: Need to copy %u pages\n", nr_pages); 993 nr_highmem = count_highmem_pages();
994 printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem);
876 995
877 if (!enough_free_mem(nr_pages)) { 996 if (!enough_free_mem(nr_pages, nr_highmem)) {
878 printk(KERN_ERR "swsusp: Not enough free memory\n"); 997 printk(KERN_ERR "swsusp: Not enough free memory\n");
879 return -ENOMEM; 998 return -ENOMEM;
880 } 999 }
881 1000
882 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages)) 1001 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
1002 printk(KERN_ERR "swsusp: Memory allocation failed\n");
883 return -ENOMEM; 1003 return -ENOMEM;
1004 }
884 1005
885 /* During allocating of suspend pagedir, new cold pages may appear. 1006 /* During allocating of suspend pagedir, new cold pages may appear.
886 * Kill them. 1007 * Kill them.
@@ -894,10 +1015,12 @@ asmlinkage int swsusp_save(void)
894 * touch swap space! Except we must write out our image of course. 1015 * touch swap space! Except we must write out our image of course.
895 */ 1016 */
896 1017
1018 nr_pages += nr_highmem;
897 nr_copy_pages = nr_pages; 1019 nr_copy_pages = nr_pages;
898 nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; 1020 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
899 1021
900 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 1022 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
1023
901 return 0; 1024 return 0;
902} 1025}
903 1026
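
The metadata-page count now uses DIV_ROUND_UP() instead of the open-coded add-and-shift; both round nr_pages * sizeof(long) bytes up to whole pages. A worked instance, assuming the common case of 4 KB pages and 8-byte longs:

/* DIV_ROUND_UP(n, d) expands to ((n) + (d) - 1) / (d), so for
 * 100000 image pages:
 *
 *	DIV_ROUND_UP(100000 * sizeof(long), PAGE_SIZE)
 *	= (800000 + 4095) / 4096
 *	= 196 metadata pages
 *
 * which matches the old ((n * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT).
 */
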
@@ -960,7 +1083,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
960 1083
961 if (!buffer) { 1084 if (!buffer) {
962 /* This makes the buffer be freed by swsusp_free() */ 1085 /* This makes the buffer be freed by swsusp_free() */
963 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1086 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
964 if (!buffer) 1087 if (!buffer)
965 return -ENOMEM; 1088 return -ENOMEM;
966 } 1089 }
@@ -975,9 +1098,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
975 memset(buffer, 0, PAGE_SIZE); 1098 memset(buffer, 0, PAGE_SIZE);
976 pack_pfns(buffer, &orig_bm); 1099 pack_pfns(buffer, &orig_bm);
977 } else { 1100 } else {
978 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1101 struct page *page;
979 1102
980 handle->buffer = page_address(pfn_to_page(pfn)); 1103 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1104 if (PageHighMem(page)) {
1105 /* Highmem pages are copied to the buffer,
1106 * because we can't return with a kmapped
1107 * highmem page (we may not be called again).
1108 */
1109 void *kaddr;
1110
1111 kaddr = kmap_atomic(page, KM_USER0);
1112 memcpy(buffer, kaddr, PAGE_SIZE);
1113 kunmap_atomic(kaddr, KM_USER0);
1114 handle->buffer = buffer;
1115 } else {
1116 handle->buffer = page_address(page);
1117 }
981 } 1118 }
982 handle->prev = handle->cur; 1119 handle->prev = handle->cur;
983 } 1120 }
@@ -1005,7 +1142,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1005 unsigned long pfn, max_zone_pfn; 1142 unsigned long pfn, max_zone_pfn;
1006 1143
1007 /* Clear page flags */ 1144 /* Clear page flags */
1008 for_each_zone (zone) { 1145 for_each_zone(zone) {
1009 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1146 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1010 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1147 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1011 if (pfn_valid(pfn)) 1148 if (pfn_valid(pfn))
@@ -1101,6 +1238,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1101 } 1238 }
1102} 1239}
1103 1240
1241/* List of "safe" pages that may be used to store data loaded from the suspend
1242 * image
1243 */
1244static struct linked_page *safe_pages_list;
1245
1246#ifdef CONFIG_HIGHMEM
1247/* struct highmem_pbe is used for creating the list of highmem pages that
1248 * should be restored atomically during the resume from disk, because the page
1249 * frames they have occupied before the suspend are in use.
1250 */
1251struct highmem_pbe {
1252 struct page *copy_page; /* data is here now */
1253 struct page *orig_page; /* data was here before the suspend */
1254 struct highmem_pbe *next;
1255};
1256
1257/* List of highmem PBEs needed for restoring the highmem pages that were
1258 * allocated before the suspend and included in the suspend image, but have
1259 * also been allocated by the "resume" kernel, so their contents cannot be
1260 * written directly to their "original" page frames.
1261 */
1262static struct highmem_pbe *highmem_pblist;
1263
1264/**
1265 * count_highmem_image_pages - compute the number of highmem pages in the
1266 * suspend image. The bits in the memory bitmap @bm that correspond to the
1267 * image pages are assumed to be set.
1268 */
1269
1270static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
1271{
1272 unsigned long pfn;
1273 unsigned int cnt = 0;
1274
1275 memory_bm_position_reset(bm);
1276 pfn = memory_bm_next_pfn(bm);
1277 while (pfn != BM_END_OF_MAP) {
1278 if (PageHighMem(pfn_to_page(pfn)))
1279 cnt++;
1280
1281 pfn = memory_bm_next_pfn(bm);
1282 }
1283 return cnt;
1284}
1285
1286/**
1287 * prepare_highmem_image - try to allocate as many highmem pages as
1288 * there are highmem image pages (@nr_highmem_p points to the variable
1289 * containing the number of highmem image pages). The pages that are
1290 * "safe" (ie. will not be overwritten when the suspend image is
1291 * restored) have the corresponding bits set in @bm (it must be
 1292 * uninitialized).
1293 *
1294 * NOTE: This function should not be called if there are no highmem
1295 * image pages.
1296 */
1297
1298static unsigned int safe_highmem_pages;
1299
1300static struct memory_bitmap *safe_highmem_bm;
1301
1302static int
1303prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1304{
1305 unsigned int to_alloc;
1306
1307 if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
1308 return -ENOMEM;
1309
1310 if (get_highmem_buffer(PG_SAFE))
1311 return -ENOMEM;
1312
1313 to_alloc = count_free_highmem_pages();
1314 if (to_alloc > *nr_highmem_p)
1315 to_alloc = *nr_highmem_p;
1316 else
1317 *nr_highmem_p = to_alloc;
1318
1319 safe_highmem_pages = 0;
1320 while (to_alloc-- > 0) {
1321 struct page *page;
1322
1323 page = alloc_page(__GFP_HIGHMEM);
1324 if (!PageNosaveFree(page)) {
 1325 /* The page is "safe", set its bit in the bitmap */
1326 memory_bm_set_bit(bm, page_to_pfn(page));
1327 safe_highmem_pages++;
1328 }
1329 /* Mark the page as allocated */
1330 SetPageNosave(page);
1331 SetPageNosaveFree(page);
1332 }
1333 memory_bm_position_reset(bm);
1334 safe_highmem_bm = bm;
1335 return 0;
1336}
1337
1338/**
1339 * get_highmem_page_buffer - for given highmem image page find the buffer
1340 * that suspend_write_next() should set for its caller to write to.
1341 *
1342 * If the page is to be saved to its "original" page frame or a copy of
1343 * the page is to be made in the highmem, @buffer is returned. Otherwise,
1344 * the copy of the page is to be made in normal memory, so the address of
1345 * the copy is returned.
1346 *
1347 * If @buffer is returned, the caller of suspend_write_next() will write
1348 * the page's contents to @buffer, so they will have to be copied to the
1349 * right location on the next call to suspend_write_next() and it is done
1350 * with the help of copy_last_highmem_page(). For this purpose, if
1351 * @buffer is returned, @last_highmem page is set to the page to which
1352 * the data will have to be copied from @buffer.
1353 */
1354
1355static struct page *last_highmem_page;
1356
1357static void *
1358get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1359{
1360 struct highmem_pbe *pbe;
1361 void *kaddr;
1362
1363 if (PageNosave(page) && PageNosaveFree(page)) {
1364 /* We have allocated the "original" page frame and we can
1365 * use it directly to store the loaded page.
1366 */
1367 last_highmem_page = page;
1368 return buffer;
1369 }
1370 /* The "original" page frame has not been allocated and we have to
1371 * use a "safe" page frame to store the loaded page.
1372 */
1373 pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
1374 if (!pbe) {
1375 swsusp_free();
1376 return NULL;
1377 }
1378 pbe->orig_page = page;
1379 if (safe_highmem_pages > 0) {
1380 struct page *tmp;
1381
1382 /* Copy of the page will be stored in high memory */
1383 kaddr = buffer;
1384 tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
1385 safe_highmem_pages--;
1386 last_highmem_page = tmp;
1387 pbe->copy_page = tmp;
1388 } else {
1389 /* Copy of the page will be stored in normal memory */
1390 kaddr = safe_pages_list;
1391 safe_pages_list = safe_pages_list->next;
1392 pbe->copy_page = virt_to_page(kaddr);
1393 }
1394 pbe->next = highmem_pblist;
1395 highmem_pblist = pbe;
1396 return kaddr;
1397}
1398
1399/**
1400 * copy_last_highmem_page - copy the contents of a highmem image from
 1401 * @buffer, where the caller of snapshot_write_next() has placed them,
 1402 * to the right location represented by @last_highmem_page.
1403 */
1404
1405static void copy_last_highmem_page(void)
1406{
1407 if (last_highmem_page) {
1408 void *dst;
1409
1410 dst = kmap_atomic(last_highmem_page, KM_USER0);
1411 memcpy(dst, buffer, PAGE_SIZE);
1412 kunmap_atomic(dst, KM_USER0);
1413 last_highmem_page = NULL;
1414 }
1415}
1416
1417static inline int last_highmem_page_copied(void)
1418{
1419 return !last_highmem_page;
1420}
1421
1422static inline void free_highmem_data(void)
1423{
1424 if (safe_highmem_bm)
1425 memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
1426
1427 if (buffer)
1428 free_image_page(buffer, PG_UNSAFE_CLEAR);
1429}
1430#else
1431static inline int get_safe_write_buffer(void) { return 0; }
1432
1433static unsigned int
1434count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
1435
1436static inline int
1437prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1438{
1439 return 0;
1440}
1441
1442static inline void *
1443get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1444{
1445 return NULL;
1446}
1447
1448static inline void copy_last_highmem_page(void) {}
1449static inline int last_highmem_page_copied(void) { return 1; }
1450static inline void free_highmem_data(void) {}
1451#endif /* CONFIG_HIGHMEM */
1452
1104/** 1453/**
1105 * prepare_image - use the memory bitmap @bm to mark the pages that will 1454 * prepare_image - use the memory bitmap @bm to mark the pages that will
1106 * be overwritten in the process of restoring the system memory state 1455 * be overwritten in the process of restoring the system memory state
@@ -1110,20 +1459,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1110 * The idea is to allocate a new memory bitmap first and then allocate 1459 * The idea is to allocate a new memory bitmap first and then allocate
1111 * as many pages as needed for the image data, but not to assign these 1460 * as many pages as needed for the image data, but not to assign these
1112 * pages to specific tasks initially. Instead, we just mark them as 1461 * pages to specific tasks initially. Instead, we just mark them as
 1113 * allocated and create a list of "safe" pages that will be used later. 1462 * allocated and create lists of "safe" pages that will be used
1463 * later. On systems with high memory a list of "safe" highmem pages is
1464 * also created.
1114 */ 1465 */
1115 1466
1116#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) 1467#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
1117 1468
1118static struct linked_page *safe_pages_list;
1119
1120static int 1469static int
1121prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) 1470prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1122{ 1471{
1123 unsigned int nr_pages; 1472 unsigned int nr_pages, nr_highmem;
1124 struct linked_page *sp_list, *lp; 1473 struct linked_page *sp_list, *lp;
1125 int error; 1474 int error;
1126 1475
1476 /* If there is no highmem, the buffer will not be necessary */
1477 free_image_page(buffer, PG_UNSAFE_CLEAR);
1478 buffer = NULL;
1479
1480 nr_highmem = count_highmem_image_pages(bm);
1127 error = mark_unsafe_pages(bm); 1481 error = mark_unsafe_pages(bm);
1128 if (error) 1482 if (error)
1129 goto Free; 1483 goto Free;
@@ -1134,6 +1488,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1134 1488
1135 duplicate_memory_bitmap(new_bm, bm); 1489 duplicate_memory_bitmap(new_bm, bm);
1136 memory_bm_free(bm, PG_UNSAFE_KEEP); 1490 memory_bm_free(bm, PG_UNSAFE_KEEP);
1491 if (nr_highmem > 0) {
1492 error = prepare_highmem_image(bm, &nr_highmem);
1493 if (error)
1494 goto Free;
1495 }
1137 /* Reserve some safe pages for potential later use. 1496 /* Reserve some safe pages for potential later use.
1138 * 1497 *
1139 * NOTE: This way we make sure there will be enough safe pages for the 1498 * NOTE: This way we make sure there will be enough safe pages for the
@@ -1142,10 +1501,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1142 */ 1501 */
1143 sp_list = NULL; 1502 sp_list = NULL;
1144 /* nr_copy_pages cannot be less than allocated_unsafe_pages */ 1503
1145 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1504 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1146 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); 1505 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
1147 while (nr_pages > 0) { 1506 while (nr_pages > 0) {
1148 lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); 1507 lp = get_image_page(GFP_ATOMIC, PG_SAFE);
1149 if (!lp) { 1508 if (!lp) {
1150 error = -ENOMEM; 1509 error = -ENOMEM;
1151 goto Free; 1510 goto Free;
@@ -1156,7 +1515,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1156 } 1515 }
1157 /* Preallocate memory for the image */ 1516 /* Preallocate memory for the image */
1158 safe_pages_list = NULL; 1517 safe_pages_list = NULL;
1159 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1518 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1160 while (nr_pages > 0) { 1519 while (nr_pages > 0) {
1161 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); 1520 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
1162 if (!lp) { 1521 if (!lp) {
@@ -1181,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1181 } 1540 }
1182 return 0; 1541 return 0;
1183 1542
1184Free: 1543 Free:
1185 swsusp_free(); 1544 swsusp_free();
1186 return error; 1545 return error;
1187} 1546}
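The comment above prepare_image() describes the core trick: memory for the restored image is grabbed early and threaded into a list of "safe" pages that later consumers simply pop from, with each free page doubling as its own list node. A minimal user-space model of that reservation scheme, assuming a fixed PAGE_SIZE and illustrative names (a sketch, not the kernel code):

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096

    /* A free page doubles as a list node, as with struct linked_page. */
    struct linked_page {
        struct linked_page *next;
    };

    static struct linked_page *safe_pages_list;

    /* Reserve nr page-sized buffers up front; returns 0 on success. */
    static int reserve_safe_pages(unsigned int nr)
    {
        while (nr-- > 0) {
            struct linked_page *lp = malloc(PAGE_SIZE);
            if (!lp)
                return -1;
            lp->next = safe_pages_list;
            safe_pages_list = lp;
        }
        return 0;
    }

    /* Pop one reserved page; the caller now owns the whole buffer. */
    static void *get_safe_page(void)
    {
        struct linked_page *lp = safe_pages_list;
        if (lp)
            safe_pages_list = lp->next;
        return lp;
    }

    int main(void)
    {
        if (reserve_safe_pages(4))
            return 1;
        printf("first safe page at %p\n", get_safe_page());
        return 0;
    }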
@@ -1196,6 +1555,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1196 struct pbe *pbe; 1555 struct pbe *pbe;
1197 struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); 1556 struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
1198 1557
1558 if (PageHighMem(page))
1559 return get_highmem_page_buffer(page, ca);
1560
1199 if (PageNosave(page) && PageNosaveFree(page)) 1561 if (PageNosave(page) && PageNosaveFree(page))
1200 /* We have allocated the "original" page frame and we can 1562 /* We have allocated the "original" page frame and we can
1201 * use it directly to store the loaded page. 1563 * use it directly to store the loaded page.
@@ -1210,12 +1572,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1210 swsusp_free(); 1572 swsusp_free();
1211 return NULL; 1573 return NULL;
1212 } 1574 }
1213 pbe->orig_address = (unsigned long)page_address(page); 1575 pbe->orig_address = page_address(page);
1214 pbe->address = (unsigned long)safe_pages_list; 1576 pbe->address = safe_pages_list;
1215 safe_pages_list = safe_pages_list->next; 1577 safe_pages_list = safe_pages_list->next;
1216 pbe->next = restore_pblist; 1578 pbe->next = restore_pblist;
1217 restore_pblist = pbe; 1579 restore_pblist = pbe;
1218 return (void *)pbe->address; 1580 return pbe->address;
1219} 1581}
1220 1582
1221/** 1583/**
@@ -1249,14 +1611,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1249 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1611 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
1250 return 0; 1612 return 0;
1251 1613
1252 if (!buffer) { 1614 if (handle->offset == 0) {
1253 /* This makes the buffer be freed by swsusp_free() */ 1615 if (!buffer)
1254 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1616 /* This makes the buffer be freed by swsusp_free() */
1617 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
1618
1255 if (!buffer) 1619 if (!buffer)
1256 return -ENOMEM; 1620 return -ENOMEM;
1257 } 1621
1258 if (!handle->offset)
1259 handle->buffer = buffer; 1622 handle->buffer = buffer;
1623 }
1260 handle->sync_read = 1; 1624 handle->sync_read = 1;
1261 if (handle->prev < handle->cur) { 1625 if (handle->prev < handle->cur) {
1262 if (handle->prev == 0) { 1626 if (handle->prev == 0) {
@@ -1284,8 +1648,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1284 return -ENOMEM; 1648 return -ENOMEM;
1285 } 1649 }
1286 } else { 1650 } else {
1651 copy_last_highmem_page();
1287 handle->buffer = get_buffer(&orig_bm, &ca); 1652 handle->buffer = get_buffer(&orig_bm, &ca);
1288 handle->sync_read = 0; 1653 if (handle->buffer != buffer)
1654 handle->sync_read = 0;
1289 } 1655 }
1290 handle->prev = handle->cur; 1656 handle->prev = handle->cur;
1291 } 1657 }
@@ -1301,15 +1667,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1301 return count; 1667 return count;
1302} 1668}
1303 1669
1670/**
1671 * snapshot_write_finalize - must be called after the last call to
1672 * snapshot_write_next() in case the last page in the image happens
1673 * to be a highmem page and its contents should be stored in
1674 * highmem. Additionally, it releases the memory that will not be
1675 * used any more.
1676 */
1677
1678void snapshot_write_finalize(struct snapshot_handle *handle)
1679{
1680 copy_last_highmem_page();
1681 /* Free only if we have loaded the image entirely */
1682 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) {
1683 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
1684 free_highmem_data();
1685 }
1686}
1687
1304int snapshot_image_loaded(struct snapshot_handle *handle) 1688int snapshot_image_loaded(struct snapshot_handle *handle)
1305{ 1689{
1306 return !(!nr_copy_pages || 1690 return !(!nr_copy_pages || !last_highmem_page_copied() ||
1307 handle->cur <= nr_meta_pages + nr_copy_pages); 1691 handle->cur <= nr_meta_pages + nr_copy_pages);
1308} 1692}
1309 1693
1310void snapshot_free_unused_memory(struct snapshot_handle *handle) 1694#ifdef CONFIG_HIGHMEM
1695/* Assumes that @buf is ready and points to a "safe" page */
1696static inline void
1697swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
1311{ 1698{
1312 /* Free only if we have loaded the image entirely */ 1699 void *kaddr1, *kaddr2;
1313 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1700
1314 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 1701 kaddr1 = kmap_atomic(p1, KM_USER0);
1702 kaddr2 = kmap_atomic(p2, KM_USER1);
1703 memcpy(buf, kaddr1, PAGE_SIZE);
1704 memcpy(kaddr1, kaddr2, PAGE_SIZE);
1705 memcpy(kaddr2, buf, PAGE_SIZE);
1706 kunmap_atomic(kaddr1, KM_USER0);
1707 kunmap_atomic(kaddr2, KM_USER1);
1708}
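swap_two_pages_data() exchanges two page frames through a bounce buffer with three memcpy() calls: the frames must keep their physical locations, so only the contents can move. The same exchange on ordinary buffers, as a stand-alone sketch:

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 4096

    /* Exchange the contents of p1 and p2 using buf as scratch space. */
    static void swap_data(void *p1, void *p2, void *buf)
    {
        memcpy(buf, p1, PAGE_SIZE);   /* save p1 */
        memcpy(p1, p2, PAGE_SIZE);    /* p2 -> p1 */
        memcpy(p2, buf, PAGE_SIZE);   /* saved p1 -> p2 */
    }

    int main(void)
    {
        static char a[PAGE_SIZE] = "before resume";
        static char b[PAGE_SIZE] = "before suspend";
        char scratch[PAGE_SIZE];

        swap_data(a, b, scratch);
        printf("a: %s, b: %s\n", a, b); /* a: before suspend, b: before resume */
        return 0;
    }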
1709
1710/**
1711 * restore_highmem - for each highmem page that was allocated before
1712 * the suspend and included in the suspend image, and also has been
1713 * allocated by the "resume" kernel, swap its current (i.e. "before
1714 * resume") contents with the previous (i.e. "before suspend") one.
1715 *
1716 * If the resume eventually fails, we can call this function once
1717 * again and restore the "before resume" highmem state.
1718 */
1719
1720int restore_highmem(void)
1721{
1722 struct highmem_pbe *pbe = highmem_pblist;
1723 void *buf;
1724
1725 if (!pbe)
1726 return 0;
1727
1728 buf = get_image_page(GFP_ATOMIC, PG_SAFE);
1729 if (!buf)
1730 return -ENOMEM;
1731
1732 while (pbe) {
1733 swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
1734 pbe = pbe->next;
1735 }
1736 free_image_page(buf, PG_UNSAFE_CLEAR);
1737 return 0;
1315} 1738}
1739#endif /* CONFIG_HIGHMEM */
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1a3b0dd2c3fc..f133d4a6d817 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -34,34 +34,123 @@ extern char resume_file[];
34#define SWSUSP_SIG "S1SUSPEND" 34#define SWSUSP_SIG "S1SUSPEND"
35 35
36static struct swsusp_header { 36static struct swsusp_header {
37 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; 37 char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
38 swp_entry_t image; 38 sector_t image;
39 char orig_sig[10]; 39 char orig_sig[10];
40 char sig[10]; 40 char sig[10];
41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; 41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
42 42
43/* 43/*
44 * Saving part... 44 * General things
45 */ 45 */
46 46
47static unsigned short root_swap = 0xffff; 47static unsigned short root_swap = 0xffff;
48static struct block_device *resume_bdev;
49
50/**
51 * submit - submit BIO request.
52 * @rw: READ or WRITE.
53 * @page_off: physical offset of page.
54 * @page: page we're reading or writing.
55 * @bio_chain: list of pending bios (for async reading)
56 *
57 * Straight from the textbook - allocate and initialize the bio.
58 * If we're reading, make sure the page is marked as dirty.
59 * Then submit it and, if @bio_chain == NULL, wait.
60 */
61static int submit(int rw, pgoff_t page_off, struct page *page,
62 struct bio **bio_chain)
63{
64 struct bio *bio;
65
66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
67 if (!bio)
68 return -ENOMEM;
69 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
70 bio->bi_bdev = resume_bdev;
71 bio->bi_end_io = end_swap_bio_read;
72
73 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
74 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
75 bio_put(bio);
76 return -EFAULT;
77 }
78
79 lock_page(page);
80 bio_get(bio);
81
82 if (bio_chain == NULL) {
83 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
84 wait_on_page_locked(page);
85 if (rw == READ)
86 bio_set_pages_dirty(bio);
87 bio_put(bio);
88 } else {
89 if (rw == READ)
90 get_page(page); /* These pages are freed later */
91 bio->bi_private = *bio_chain;
92 *bio_chain = bio;
93 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
94 }
95 return 0;
96}
97
98static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
99{
100 return submit(READ, page_off, virt_to_page(addr), bio_chain);
101}
102
103static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
104{
105 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
106}
107
108static int wait_on_bio_chain(struct bio **bio_chain)
109{
110 struct bio *bio;
111 struct bio *next_bio;
112 int ret = 0;
113
114 if (bio_chain == NULL)
115 return 0;
116
117 bio = *bio_chain;
118 if (bio == NULL)
119 return 0;
120 while (bio) {
121 struct page *page;
122
123 next_bio = bio->bi_private;
124 page = bio->bi_io_vec[0].bv_page;
125 wait_on_page_locked(page);
126 if (!PageUptodate(page) || PageError(page))
127 ret = -EIO;
128 put_page(page);
129 bio_put(bio);
130 bio = next_bio;
131 }
132 *bio_chain = NULL;
133 return ret;
134}
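submit() and wait_on_bio_chain() above form a fire-and-collect pair: asynchronous requests are linked into a caller-owned chain via bi_private, and a single later pass waits on and releases every request. A user-space model of the same pattern, with a plain linked list standing in for the bio chain (hypothetical names, no real I/O):

    #include <stdio.h>
    #include <stdlib.h>

    struct request {
        int id;
        int done;               /* set by the "completion" */
        struct request *next;   /* plays the role of bi_private */
    };

    /* Start a request; if chain is NULL, "wait" for it immediately. */
    static int submit(int id, struct request **chain)
    {
        struct request *rq = malloc(sizeof(*rq));
        if (!rq)
            return -1;
        rq->id = id;
        rq->done = 1;           /* pretend the request completed */
        if (!chain) {           /* synchronous mode */
            printf("request %d done synchronously\n", rq->id);
            free(rq);
            return 0;
        }
        rq->next = *chain;      /* link into the pending chain */
        *chain = rq;
        return 0;
    }

    /* Wait for every request on the chain, then empty it. */
    static int wait_on_chain(struct request **chain)
    {
        int ret = 0;
        struct request *rq = chain ? *chain : NULL;

        while (rq) {
            struct request *next = rq->next;
            if (!rq->done)
                ret = -1;       /* would be -EIO in the kernel */
            printf("collected request %d\n", rq->id);
            free(rq);
            rq = next;
        }
        if (chain)
            *chain = NULL;
        return ret;
    }

    int main(void)
    {
        struct request *chain = NULL;
        submit(1, &chain);
        submit(2, &chain);
        submit(3, NULL);        /* synchronous */
        return wait_on_chain(&chain);
    }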
135
136/*
137 * Saving part
138 */
48 139
49static int mark_swapfiles(swp_entry_t start) 140static int mark_swapfiles(sector_t start)
50{ 141{
51 int error; 142 int error;
52 143
53 rw_swap_page_sync(READ, swp_entry(root_swap, 0), 144 bio_read_page(swsusp_resume_block, &swsusp_header, NULL);
54 virt_to_page((unsigned long)&swsusp_header), NULL);
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 145 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 146 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 147 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 148 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start; 149 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), 150 error = bio_write_page(swsusp_resume_block,
61 virt_to_page((unsigned long)&swsusp_header), 151 &swsusp_header, NULL);
62 NULL);
63 } else { 152 } else {
64 pr_debug("swsusp: Partition is not swap space.\n"); 153 printk(KERN_ERR "swsusp: Swap header not found!\n");
65 error = -ENODEV; 154 error = -ENODEV;
66 } 155 }
67 return error; 156 return error;
@@ -74,12 +163,21 @@ static int mark_swapfiles(swp_entry_t start)
74 163
75static int swsusp_swap_check(void) /* This is called before saving image */ 164static int swsusp_swap_check(void) /* This is called before saving image */
76{ 165{
77 int res = swap_type_of(swsusp_resume_device); 166 int res;
167
168 res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
169 if (res < 0)
170 return res;
171
172 root_swap = res;
173 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE);
174 if (IS_ERR(resume_bdev))
175 return PTR_ERR(resume_bdev);
176
177 res = set_blocksize(resume_bdev, PAGE_SIZE);
178 if (res < 0)
179 blkdev_put(resume_bdev);
78 180
79 if (res >= 0) {
80 root_swap = res;
81 return 0;
82 }
83 return res; 181 return res;
84} 182}
85 183
@@ -90,36 +188,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */
90 * @bio_chain: Link the next write BIO here 188 * @bio_chain: Link the next write BIO here
91 */ 189 */
92 190
93static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) 191static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
94{ 192{
95 swp_entry_t entry; 193 void *src;
96 int error = -ENOSPC; 194
97 195 if (!offset)
98 if (offset) { 196 return -ENOSPC;
99 struct page *page = virt_to_page(buf); 197
100 198 if (bio_chain) {
101 if (bio_chain) { 199 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
102 /* 200 if (src) {
103 * Whether or not we successfully allocated a copy page, 201 memcpy(src, buf, PAGE_SIZE);
104 * we take a ref on the page here. It gets undone in 202 } else {
105 * wait_on_bio_chain(). 203 WARN_ON_ONCE(1);
106 */ 204 bio_chain = NULL; /* Go synchronous */
107 struct page *page_copy; 205 src = buf;
108 page_copy = alloc_page(GFP_ATOMIC);
109 if (page_copy == NULL) {
110 WARN_ON_ONCE(1);
111 bio_chain = NULL; /* Go synchronous */
112 get_page(page);
113 } else {
114 memcpy(page_address(page_copy),
115 page_address(page), PAGE_SIZE);
116 page = page_copy;
117 }
118 } 206 }
119 entry = swp_entry(root_swap, offset); 207 } else {
120 error = rw_swap_page_sync(WRITE, entry, page, bio_chain); 208 src = buf;
121 } 209 }
122 return error; 210 return bio_write_page(offset, src, bio_chain);
123} 211}
124 212
125/* 213/*
@@ -137,11 +225,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
137 * at a time. 225 * at a time.
138 */ 226 */
139 227
140#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) 228#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
141 229
142struct swap_map_page { 230struct swap_map_page {
143 unsigned long entries[MAP_PAGE_ENTRIES]; 231 sector_t entries[MAP_PAGE_ENTRIES];
144 unsigned long next_swap; 232 sector_t next_swap;
145}; 233};
146 234
147/** 235/**
@@ -151,7 +239,7 @@ struct swap_map_page {
151 239
152struct swap_map_handle { 240struct swap_map_handle {
153 struct swap_map_page *cur; 241 struct swap_map_page *cur;
154 unsigned long cur_swap; 242 sector_t cur_swap;
155 struct bitmap_page *bitmap; 243 struct bitmap_page *bitmap;
156 unsigned int k; 244 unsigned int k;
157}; 245};
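A swap_map_page is an index block: MAP_PAGE_ENTRIES data sectors plus the sector of the next index block, so an image can be read back knowing nothing but the starting sector. A user-space sketch of walking such a chain, with in-memory structs standing in for on-disk sectors (illustrative only):

    #include <stdio.h>

    /* Tiny for the demo; really PAGE_SIZE / sizeof(sector_t) - 1. */
    #define MAP_PAGE_ENTRIES 3

    struct swap_map_page {
        unsigned long entries[MAP_PAGE_ENTRIES]; /* data "sectors" */
        struct swap_map_page *next;              /* stands in for next_swap */
    };

    /* Visit every data sector in image order, as swap_read_page() does. */
    static void walk_swap_map(struct swap_map_page *cur)
    {
        while (cur) {
            unsigned int k;
            for (k = 0; k < MAP_PAGE_ENTRIES && cur->entries[k]; k++)
                printf("read data sector %lu\n", cur->entries[k]);
            cur = cur->next;
        }
    }

    int main(void)
    {
        struct swap_map_page second = { { 40, 41, 0 }, NULL };
        struct swap_map_page first = { { 10, 11, 12 }, &second };
        walk_swap_map(&first);
        return 0;
    }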
@@ -166,26 +254,6 @@ static void release_swap_writer(struct swap_map_handle *handle)
166 handle->bitmap = NULL; 254 handle->bitmap = NULL;
167} 255}
168 256
169static void show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
185 centisecs / 100, centisecs % 100,
186 kps / 1000, (kps % 1000) / 10);
187}
188
189static int get_swap_writer(struct swap_map_handle *handle) 257static int get_swap_writer(struct swap_map_handle *handle)
190{ 258{
191 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 259 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -196,7 +264,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
196 release_swap_writer(handle); 264 release_swap_writer(handle);
197 return -ENOMEM; 265 return -ENOMEM;
198 } 266 }
199 handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); 267 handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap);
200 if (!handle->cur_swap) { 268 if (!handle->cur_swap) {
201 release_swap_writer(handle); 269 release_swap_writer(handle);
202 return -ENOSPC; 270 return -ENOSPC;
@@ -205,43 +273,15 @@ static int get_swap_writer(struct swap_map_handle *handle)
205 return 0; 273 return 0;
206} 274}
207 275
208static int wait_on_bio_chain(struct bio **bio_chain)
209{
210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235
236static int swap_write_page(struct swap_map_handle *handle, void *buf, 276static int swap_write_page(struct swap_map_handle *handle, void *buf,
237 struct bio **bio_chain) 277 struct bio **bio_chain)
238{ 278{
239 int error = 0; 279 int error = 0;
240 unsigned long offset; 280 sector_t offset;
241 281
242 if (!handle->cur) 282 if (!handle->cur)
243 return -EINVAL; 283 return -EINVAL;
244 offset = alloc_swap_page(root_swap, handle->bitmap); 284 offset = alloc_swapdev_block(root_swap, handle->bitmap);
245 error = write_page(buf, offset, bio_chain); 285 error = write_page(buf, offset, bio_chain);
246 if (error) 286 if (error)
247 return error; 287 return error;
@@ -250,7 +290,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
250 error = wait_on_bio_chain(bio_chain); 290 error = wait_on_bio_chain(bio_chain);
251 if (error) 291 if (error)
252 goto out; 292 goto out;
253 offset = alloc_swap_page(root_swap, handle->bitmap); 293 offset = alloc_swapdev_block(root_swap, handle->bitmap);
254 if (!offset) 294 if (!offset)
255 return -ENOSPC; 295 return -ENOSPC;
256 handle->cur->next_swap = offset; 296 handle->cur->next_swap = offset;
@@ -261,7 +301,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
261 handle->cur_swap = offset; 301 handle->cur_swap = offset;
262 handle->k = 0; 302 handle->k = 0;
263 } 303 }
264out: 304 out:
265 return error; 305 return error;
266} 306}
267 307
@@ -315,7 +355,7 @@ static int save_image(struct swap_map_handle *handle,
315 error = err2; 355 error = err2;
316 if (!error) 356 if (!error)
317 printk("\b\b\b\bdone\n"); 357 printk("\b\b\b\bdone\n");
318 show_speed(&start, &stop, nr_to_write, "Wrote"); 358 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
319 return error; 359 return error;
320} 360}
321 361
@@ -350,100 +390,50 @@ int swsusp_write(void)
350 struct swsusp_info *header; 390 struct swsusp_info *header;
351 int error; 391 int error;
352 392
353 if ((error = swsusp_swap_check())) { 393 error = swsusp_swap_check();
394 if (error) {
354 printk(KERN_ERR "swsusp: Cannot find swap device, try " 395 printk(KERN_ERR "swsusp: Cannot find swap device, try "
355 "swapon -a.\n"); 396 "swapon -a.\n");
356 return error; 397 return error;
357 } 398 }
358 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 399 memset(&snapshot, 0, sizeof(struct snapshot_handle));
359 error = snapshot_read_next(&snapshot, PAGE_SIZE); 400 error = snapshot_read_next(&snapshot, PAGE_SIZE);
360 if (error < PAGE_SIZE) 401 if (error < PAGE_SIZE) {
361 return error < 0 ? error : -EFAULT; 402 if (error >= 0)
403 error = -EFAULT;
404
405 goto out;
406 }
362 header = (struct swsusp_info *)data_of(snapshot); 407 header = (struct swsusp_info *)data_of(snapshot);
363 if (!enough_swap(header->pages)) { 408 if (!enough_swap(header->pages)) {
364 printk(KERN_ERR "swsusp: Not enough free swap\n"); 409 printk(KERN_ERR "swsusp: Not enough free swap\n");
365 return -ENOSPC; 410 error = -ENOSPC;
411 goto out;
366 } 412 }
367 error = get_swap_writer(&handle); 413 error = get_swap_writer(&handle);
368 if (!error) { 414 if (!error) {
369 unsigned long start = handle.cur_swap; 415 sector_t start = handle.cur_swap;
416
370 error = swap_write_page(&handle, header, NULL); 417 error = swap_write_page(&handle, header, NULL);
371 if (!error) 418 if (!error)
372 error = save_image(&handle, &snapshot, 419 error = save_image(&handle, &snapshot,
373 header->pages - 1); 420 header->pages - 1);
421
374 if (!error) { 422 if (!error) {
375 flush_swap_writer(&handle); 423 flush_swap_writer(&handle);
376 printk("S"); 424 printk("S");
377 error = mark_swapfiles(swp_entry(root_swap, start)); 425 error = mark_swapfiles(start);
378 printk("|\n"); 426 printk("|\n");
379 } 427 }
380 } 428 }
381 if (error) 429 if (error)
382 free_all_swap_pages(root_swap, handle.bitmap); 430 free_all_swap_pages(root_swap, handle.bitmap);
383 release_swap_writer(&handle); 431 release_swap_writer(&handle);
432 out:
433 swsusp_close();
384 return error; 434 return error;
385} 435}
386 436
387static struct block_device *resume_bdev;
388
389/**
390 * submit - submit BIO request.
391 * @rw: READ or WRITE.
392 * @off physical offset of page.
393 * @page: page we're reading or writing.
394 * @bio_chain: list of pending biod (for async reading)
395 *
396 * Straight from the textbook - allocate and initialize the bio.
397 * If we're reading, make sure the page is marked as dirty.
398 * Then submit it and, if @bio_chain == NULL, wait.
399 */
400static int submit(int rw, pgoff_t page_off, struct page *page,
401 struct bio **bio_chain)
402{
403 struct bio *bio;
404
405 bio = bio_alloc(GFP_ATOMIC, 1);
406 if (!bio)
407 return -ENOMEM;
408 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
409 bio->bi_bdev = resume_bdev;
410 bio->bi_end_io = end_swap_bio_read;
411
412 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
413 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
414 bio_put(bio);
415 return -EFAULT;
416 }
417
418 lock_page(page);
419 bio_get(bio);
420
421 if (bio_chain == NULL) {
422 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
423 wait_on_page_locked(page);
424 if (rw == READ)
425 bio_set_pages_dirty(bio);
426 bio_put(bio);
427 } else {
428 if (rw == READ)
429 get_page(page); /* These pages are freed later */
430 bio->bi_private = *bio_chain;
431 *bio_chain = bio;
432 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
433 }
434 return 0;
435}
436
437static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
438{
439 return submit(READ, page_off, virt_to_page(addr), bio_chain);
440}
441
442static int bio_write_page(pgoff_t page_off, void *addr)
443{
444 return submit(WRITE, page_off, virt_to_page(addr), NULL);
445}
446
447/** 437/**
448 * The following functions allow us to read data using a swap map 438 * The following functions allow us to read data using a swap map
449 * in a file-alike way 439 * in a file-alike way
@@ -456,17 +446,18 @@ static void release_swap_reader(struct swap_map_handle *handle)
456 handle->cur = NULL; 446 handle->cur = NULL;
457} 447}
458 448
459static int get_swap_reader(struct swap_map_handle *handle, 449static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
460 swp_entry_t start)
461{ 450{
462 int error; 451 int error;
463 452
464 if (!swp_offset(start)) 453 if (!start)
465 return -EINVAL; 454 return -EINVAL;
466 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 455
456 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
467 if (!handle->cur) 457 if (!handle->cur)
468 return -ENOMEM; 458 return -ENOMEM;
469 error = bio_read_page(swp_offset(start), handle->cur, NULL); 459
460 error = bio_read_page(start, handle->cur, NULL);
470 if (error) { 461 if (error) {
471 release_swap_reader(handle); 462 release_swap_reader(handle);
472 return error; 463 return error;
@@ -478,7 +469,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
478static int swap_read_page(struct swap_map_handle *handle, void *buf, 469static int swap_read_page(struct swap_map_handle *handle, void *buf,
479 struct bio **bio_chain) 470 struct bio **bio_chain)
480{ 471{
481 unsigned long offset; 472 sector_t offset;
482 int error; 473 int error;
483 474
484 if (!handle->cur) 475 if (!handle->cur)
@@ -547,11 +538,11 @@ static int load_image(struct swap_map_handle *handle,
547 error = err2; 538 error = err2;
548 if (!error) { 539 if (!error) {
549 printk("\b\b\b\bdone\n"); 540 printk("\b\b\b\bdone\n");
550 snapshot_free_unused_memory(snapshot); 541 snapshot_write_finalize(snapshot);
551 if (!snapshot_image_loaded(snapshot)) 542 if (!snapshot_image_loaded(snapshot))
552 error = -ENODATA; 543 error = -ENODATA;
553 } 544 }
554 show_speed(&start, &stop, nr_to_read, "Read"); 545 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
555 return error; 546 return error;
556} 547}
557 548
@@ -600,12 +591,16 @@ int swsusp_check(void)
600 if (!IS_ERR(resume_bdev)) { 591 if (!IS_ERR(resume_bdev)) {
601 set_blocksize(resume_bdev, PAGE_SIZE); 592 set_blocksize(resume_bdev, PAGE_SIZE);
602 memset(&swsusp_header, 0, sizeof(swsusp_header)); 593 memset(&swsusp_header, 0, sizeof(swsusp_header));
603 if ((error = bio_read_page(0, &swsusp_header, NULL))) 594 error = bio_read_page(swsusp_resume_block,
595 &swsusp_header, NULL);
596 if (error)
604 return error; 597 return error;
598
605 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 599 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
606 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 600 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
607 /* Reset swap signature now */ 601 /* Reset swap signature now */
608 error = bio_write_page(0, &swsusp_header); 602 error = bio_write_page(swsusp_resume_block,
603 &swsusp_header, NULL);
609 } else { 604 } else {
610 return -EINVAL; 605 return -EINVAL;
611 } 606 }
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 0b66659dc516..31aa0390c777 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -49,6 +49,7 @@
49#include <linux/bootmem.h> 49#include <linux/bootmem.h>
50#include <linux/syscalls.h> 50#include <linux/syscalls.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/time.h>
52 53
53#include "power.h" 54#include "power.h"
54 55
@@ -64,10 +65,8 @@ int in_suspend __nosavedata = 0;
64 65
65#ifdef CONFIG_HIGHMEM 66#ifdef CONFIG_HIGHMEM
66unsigned int count_highmem_pages(void); 67unsigned int count_highmem_pages(void);
67int save_highmem(void);
68int restore_highmem(void); 68int restore_highmem(void);
69#else 69#else
70static inline int save_highmem(void) { return 0; }
71static inline int restore_highmem(void) { return 0; } 70static inline int restore_highmem(void) { return 0; }
72static inline unsigned int count_highmem_pages(void) { return 0; } 71static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif 72#endif
@@ -134,18 +133,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
134 return 0; 133 return 0;
135} 134}
136 135
137unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) 136sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
138{ 137{
139 unsigned long offset; 138 unsigned long offset;
140 139
141 offset = swp_offset(get_swap_page_of_type(swap)); 140 offset = swp_offset(get_swap_page_of_type(swap));
142 if (offset) { 141 if (offset) {
143 if (bitmap_set(bitmap, offset)) { 142 if (bitmap_set(bitmap, offset))
144 swap_free(swp_entry(swap, offset)); 143 swap_free(swp_entry(swap, offset));
145 offset = 0; 144 else
146 } 145 return swapdev_block(swap, offset);
147 } 146 }
148 return offset; 147 return 0;
149} 148}
150 149
151void free_all_swap_pages(int swap, struct bitmap_page *bitmap) 150void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
@@ -166,6 +165,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
166} 165}
167 166
168/** 167/**
168 * swsusp_show_speed - print the time elapsed between two events represented by
169 * @start and @stop
170 *
171 * @nr_pages - number of pages processed between @start and @stop
172 * @msg - introductory message to print
173 */
174
175void swsusp_show_speed(struct timeval *start, struct timeval *stop,
176 unsigned nr_pages, char *msg)
177{
178 s64 elapsed_centisecs64;
179 int centisecs;
180 int k;
181 int kps;
182
183 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
184 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
185 centisecs = elapsed_centisecs64;
186 if (centisecs == 0)
187 centisecs = 1; /* avoid div-by-zero */
188 k = nr_pages * (PAGE_SIZE / 1024);
189 kps = (k * 100) / centisecs;
190 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
191 centisecs / 100, centisecs % 100,
192 kps / 1000, (kps % 1000) / 10);
193}
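The computation in swsusp_show_speed() is pure integer arithmetic: elapsed time is first reduced to centiseconds, kilobytes per second fall out of k * 100 / centisecs, and MB/s with two decimals is printed by splitting the KB/s value. A stand-alone check of the formula with sample numbers (5000 pages of 4 KiB over 2.56 s):

    #include <stdio.h>

    int main(void)
    {
        int centisecs = 256;                /* 2.56 s elapsed */
        int nr_pages = 5000;                /* 4 KiB pages */
        int k = nr_pages * (4096 / 1024);   /* 20000 kbytes */
        int kps = (k * 100) / centisecs;    /* 7812 kbytes/s */

        printf("%d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
               k, centisecs / 100, centisecs % 100,
               kps / 1000, (kps % 1000) / 10); /* 7.81 MB/s */
        return 0;
    }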
194
195/**
169 * swsusp_shrink_memory - Try to free as much memory as needed 196 * swsusp_shrink_memory - Try to free as much memory as needed
170 * 197 *
171 * ... but do not OOM-kill anyone 198 * ... but do not OOM-kill anyone
@@ -184,23 +211,37 @@ static inline unsigned long __shrink_memory(long tmp)
184 211
185int swsusp_shrink_memory(void) 212int swsusp_shrink_memory(void)
186{ 213{
187 long size, tmp; 214 long tmp;
188 struct zone *zone; 215 struct zone *zone;
189 unsigned long pages = 0; 216 unsigned long pages = 0;
190 unsigned int i = 0; 217 unsigned int i = 0;
191 char *p = "-\\|/"; 218 char *p = "-\\|/";
219 struct timeval start, stop;
192 220
193 printk("Shrinking memory... "); 221 printk("Shrinking memory... ");
222 do_gettimeofday(&start);
194 do { 223 do {
195 size = 2 * count_highmem_pages(); 224 long size, highmem_size;
196 size += size / 50 + count_data_pages() + PAGES_FOR_IO; 225
226 highmem_size = count_highmem_pages();
227 size = count_data_pages() + PAGES_FOR_IO;
197 tmp = size; 228 tmp = size;
229 size += highmem_size;
198 for_each_zone (zone) 230 for_each_zone (zone)
199 if (!is_highmem(zone) && populated_zone(zone)) { 231 if (populated_zone(zone)) {
200 tmp -= zone->free_pages; 232 if (is_highmem(zone)) {
201 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 233 highmem_size -= zone->free_pages;
202 tmp += snapshot_additional_pages(zone); 234 } else {
235 tmp -= zone->free_pages;
236 tmp += zone->lowmem_reserve[ZONE_NORMAL];
237 tmp += snapshot_additional_pages(zone);
238 }
203 } 239 }
240
241 if (highmem_size < 0)
242 highmem_size = 0;
243
244 tmp += highmem_size;
204 if (tmp > 0) { 245 if (tmp > 0) {
205 tmp = __shrink_memory(tmp); 246 tmp = __shrink_memory(tmp);
206 if (!tmp) 247 if (!tmp)
@@ -212,7 +253,9 @@ int swsusp_shrink_memory(void)
212 } 253 }
213 printk("\b%c", p[i++%4]); 254 printk("\b%c", p[i++%4]);
214 } while (tmp > 0); 255 } while (tmp > 0);
256 do_gettimeofday(&stop);
215 printk("\bdone (%lu pages freed)\n", pages); 257 printk("\bdone (%lu pages freed)\n", pages);
258 swsusp_show_speed(&start, &stop, pages, "Freed");
216 259
217 return 0; 260 return 0;
218} 261}
@@ -223,6 +266,7 @@ int swsusp_suspend(void)
223 266
224 if ((error = arch_prepare_suspend())) 267 if ((error = arch_prepare_suspend()))
225 return error; 268 return error;
269
226 local_irq_disable(); 270 local_irq_disable();
227 /* At this point, device_suspend() has been called, but *not* 271 /* At this point, device_suspend() has been called, but *not*
228 * device_power_down(). We *must* device_power_down() now. 272 * device_power_down(). We *must* device_power_down() now.
@@ -235,23 +279,16 @@ int swsusp_suspend(void)
235 goto Enable_irqs; 279 goto Enable_irqs;
236 } 280 }
237 281
238 if ((error = save_highmem())) {
239 printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
240 goto Restore_highmem;
241 }
242
243 save_processor_state(); 282 save_processor_state();
244 if ((error = swsusp_arch_suspend())) 283 if ((error = swsusp_arch_suspend()))
245 printk(KERN_ERR "Error %d suspending\n", error); 284 printk(KERN_ERR "Error %d suspending\n", error);
246 /* Restore control flow magically appears here */ 285 /* Restore control flow magically appears here */
247 restore_processor_state(); 286 restore_processor_state();
248Restore_highmem:
249 restore_highmem();
250 /* NOTE: device_power_up() is just a resume() for devices 287 /* NOTE: device_power_up() is just a resume() for devices
251 * that suspended with irqs off ... no overall powerup. 288 * that suspended with irqs off ... no overall powerup.
252 */ 289 */
253 device_power_up(); 290 device_power_up();
254Enable_irqs: 291 Enable_irqs:
255 local_irq_enable(); 292 local_irq_enable();
256 return error; 293 return error;
257} 294}
@@ -268,18 +305,23 @@ int swsusp_resume(void)
268 printk(KERN_ERR "Some devices failed to power down, very bad\n"); 305 printk(KERN_ERR "Some devices failed to power down, very bad\n");
269 /* We'll ignore saved state, but this gets preempt count (etc) right */ 306 /* We'll ignore saved state, but this gets preempt count (etc) right */
270 save_processor_state(); 307 save_processor_state();
271 error = swsusp_arch_resume(); 308 error = restore_highmem();
272 /* Code below is only ever reached in case of failure. Otherwise 309 if (!error) {
273 * execution continues at place where swsusp_arch_suspend was called 310 error = swsusp_arch_resume();
274 */ 311 /* The code below is only ever reached in case of a failure.
275 BUG_ON(!error); 312 * Otherwise execution continues at place where
313 * swsusp_arch_suspend() was called
314 */
315 BUG_ON(!error);
316 /* This call to restore_highmem() undoes the previous one */
317 restore_highmem();
318 }
276 /* The only reason why swsusp_arch_resume() can fail is memory being 319 /* The only reason why swsusp_arch_resume() can fail is memory being
277 * very tight, so we have to free it as soon as we can to avoid 320 * very tight, so we have to free it as soon as we can to avoid
278 * subsequent failures 321 * subsequent failures
279 */ 322 */
280 swsusp_free(); 323 swsusp_free();
281 restore_processor_state(); 324 restore_processor_state();
282 restore_highmem();
283 touch_softlockup_watchdog(); 325 touch_softlockup_watchdog();
284 device_power_up(); 326 device_power_up();
285 local_irq_enable(); 327 local_irq_enable();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d991d3b0e5a4..89443b85163b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/reboot.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/device.h> 16#include <linux/device.h>
16#include <linux/miscdevice.h> 17#include <linux/miscdevice.h>
@@ -21,6 +22,7 @@
21#include <linux/fs.h> 22#include <linux/fs.h>
22#include <linux/console.h> 23#include <linux/console.h>
23#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h>
24 26
25#include <asm/uaccess.h> 27#include <asm/uaccess.h>
26 28
@@ -54,7 +56,8 @@ static int snapshot_open(struct inode *inode, struct file *filp)
54 filp->private_data = data; 56 filp->private_data = data;
55 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 57 memset(&data->handle, 0, sizeof(struct snapshot_handle));
56 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { 58 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
57 data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; 59 data->swap = swsusp_resume_device ?
60 swap_type_of(swsusp_resume_device, 0) : -1;
58 data->mode = O_RDONLY; 61 data->mode = O_RDONLY;
59 } else { 62 } else {
60 data->swap = -1; 63 data->swap = -1;
@@ -76,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
76 free_all_swap_pages(data->swap, data->bitmap); 79 free_all_swap_pages(data->swap, data->bitmap);
77 free_bitmap(data->bitmap); 80 free_bitmap(data->bitmap);
78 if (data->frozen) { 81 if (data->frozen) {
79 down(&pm_sem); 82 mutex_lock(&pm_mutex);
80 thaw_processes(); 83 thaw_processes();
81 enable_nonboot_cpus(); 84 enable_nonboot_cpus();
82 up(&pm_sem); 85 mutex_unlock(&pm_mutex);
83 } 86 }
84 atomic_inc(&device_available); 87 atomic_inc(&device_available);
85 return 0; 88 return 0;
@@ -124,7 +127,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
124{ 127{
125 int error = 0; 128 int error = 0;
126 struct snapshot_data *data; 129 struct snapshot_data *data;
127 loff_t offset, avail; 130 loff_t avail;
131 sector_t offset;
128 132
129 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) 133 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
130 return -ENOTTY; 134 return -ENOTTY;
@@ -140,7 +144,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
140 case SNAPSHOT_FREEZE: 144 case SNAPSHOT_FREEZE:
141 if (data->frozen) 145 if (data->frozen)
142 break; 146 break;
143 down(&pm_sem); 147 mutex_lock(&pm_mutex);
144 error = disable_nonboot_cpus(); 148 error = disable_nonboot_cpus();
145 if (!error) { 149 if (!error) {
146 error = freeze_processes(); 150 error = freeze_processes();
@@ -150,7 +154,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
150 error = -EBUSY; 154 error = -EBUSY;
151 } 155 }
152 } 156 }
153 up(&pm_sem); 157 mutex_unlock(&pm_mutex);
154 if (!error) 158 if (!error)
155 data->frozen = 1; 159 data->frozen = 1;
156 break; 160 break;
@@ -158,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
158 case SNAPSHOT_UNFREEZE: 162 case SNAPSHOT_UNFREEZE:
159 if (!data->frozen) 163 if (!data->frozen)
160 break; 164 break;
161 down(&pm_sem); 165 mutex_lock(&pm_mutex);
162 thaw_processes(); 166 thaw_processes();
163 enable_nonboot_cpus(); 167 enable_nonboot_cpus();
164 up(&pm_sem); 168 mutex_unlock(&pm_mutex);
165 data->frozen = 0; 169 data->frozen = 0;
166 break; 170 break;
167 171
@@ -170,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
170 error = -EPERM; 174 error = -EPERM;
171 break; 175 break;
172 } 176 }
173 down(&pm_sem); 177 mutex_lock(&pm_mutex);
174 /* Free memory before shutting down devices. */ 178 /* Free memory before shutting down devices. */
175 error = swsusp_shrink_memory(); 179 error = swsusp_shrink_memory();
176 if (!error) { 180 if (!error) {
@@ -183,7 +187,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
183 } 187 }
184 resume_console(); 188 resume_console();
185 } 189 }
186 up(&pm_sem); 190 mutex_unlock(&pm_mutex);
187 if (!error) 191 if (!error)
188 error = put_user(in_suspend, (unsigned int __user *)arg); 192 error = put_user(in_suspend, (unsigned int __user *)arg);
189 if (!error) 193 if (!error)
@@ -191,13 +195,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
191 break; 195 break;
192 196
193 case SNAPSHOT_ATOMIC_RESTORE: 197 case SNAPSHOT_ATOMIC_RESTORE:
198 snapshot_write_finalize(&data->handle);
194 if (data->mode != O_WRONLY || !data->frozen || 199 if (data->mode != O_WRONLY || !data->frozen ||
195 !snapshot_image_loaded(&data->handle)) { 200 !snapshot_image_loaded(&data->handle)) {
196 error = -EPERM; 201 error = -EPERM;
197 break; 202 break;
198 } 203 }
199 snapshot_free_unused_memory(&data->handle); 204 mutex_lock(&pm_mutex);
200 down(&pm_sem);
201 pm_prepare_console(); 205 pm_prepare_console();
202 suspend_console(); 206 suspend_console();
203 error = device_suspend(PMSG_PRETHAW); 207 error = device_suspend(PMSG_PRETHAW);
@@ -207,7 +211,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
207 } 211 }
208 resume_console(); 212 resume_console();
209 pm_restore_console(); 213 pm_restore_console();
210 up(&pm_sem); 214 mutex_unlock(&pm_mutex);
211 break; 215 break;
212 216
213 case SNAPSHOT_FREE: 217 case SNAPSHOT_FREE:
@@ -238,10 +242,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
238 break; 242 break;
239 } 243 }
240 } 244 }
241 offset = alloc_swap_page(data->swap, data->bitmap); 245 offset = alloc_swapdev_block(data->swap, data->bitmap);
242 if (offset) { 246 if (offset) {
243 offset <<= PAGE_SHIFT; 247 offset <<= PAGE_SHIFT;
244 error = put_user(offset, (loff_t __user *)arg); 248 error = put_user(offset, (sector_t __user *)arg);
245 } else { 249 } else {
246 error = -ENOSPC; 250 error = -ENOSPC;
247 } 251 }
@@ -264,7 +268,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
264 * so we need to recode them 268 * so we need to recode them
265 */ 269 */
266 if (old_decode_dev(arg)) { 270 if (old_decode_dev(arg)) {
267 data->swap = swap_type_of(old_decode_dev(arg)); 271 data->swap = swap_type_of(old_decode_dev(arg), 0);
268 if (data->swap < 0) 272 if (data->swap < 0)
269 error = -ENODEV; 273 error = -ENODEV;
270 } else { 274 } else {
@@ -282,7 +286,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
282 break; 286 break;
283 } 287 }
284 288
285 if (down_trylock(&pm_sem)) { 289 if (!mutex_trylock(&pm_mutex)) {
286 error = -EBUSY; 290 error = -EBUSY;
287 break; 291 break;
288 } 292 }
@@ -309,8 +313,66 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
309 if (pm_ops->finish) 313 if (pm_ops->finish)
310 pm_ops->finish(PM_SUSPEND_MEM); 314 pm_ops->finish(PM_SUSPEND_MEM);
311 315
312OutS3: 316 OutS3:
313 up(&pm_sem); 317 mutex_unlock(&pm_mutex);
318 break;
319
320 case SNAPSHOT_PMOPS:
321 switch (arg) {
322
323 case PMOPS_PREPARE:
324 if (pm_ops->prepare) {
325 error = pm_ops->prepare(PM_SUSPEND_DISK);
326 }
327 break;
328
329 case PMOPS_ENTER:
330 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
331 error = pm_ops->enter(PM_SUSPEND_DISK);
332 break;
333
334 case PMOPS_FINISH:
335 if (pm_ops && pm_ops->finish) {
336 pm_ops->finish(PM_SUSPEND_DISK);
337 }
338 break;
339
340 default:
341 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
342 error = -EINVAL;
343
344 }
345 break;
346
347 case SNAPSHOT_SET_SWAP_AREA:
348 if (data->bitmap) {
349 error = -EPERM;
350 } else {
351 struct resume_swap_area swap_area;
352 dev_t swdev;
353
354 error = copy_from_user(&swap_area, (void __user *)arg,
355 sizeof(struct resume_swap_area));
356 if (error) {
357 error = -EFAULT;
358 break;
359 }
360
361 /*
362 * User space encodes device types as two-byte values,
363 * so we need to recode them
364 */
365 swdev = old_decode_dev(swap_area.dev);
366 if (swdev) {
367 offset = swap_area.offset;
368 data->swap = swap_type_of(swdev, offset);
369 if (data->swap < 0)
370 error = -ENODEV;
371 } else {
372 data->swap = -1;
373 error = -EINVAL;
374 }
375 }
314 break; 376 break;
315 377
316 default: 378 default:
@@ -321,7 +383,7 @@ OutS3:
321 return error; 383 return error;
322} 384}
323 385
324static struct file_operations snapshot_fops = { 386static const struct file_operations snapshot_fops = {
325 .open = snapshot_open, 387 .open = snapshot_open,
326 .release = snapshot_release, 388 .release = snapshot_release,
327 .read = snapshot_read, 389 .read = snapshot_read,
diff --git a/kernel/printk.c b/kernel/printk.c
index 66426552fbfe..185bb45eacf7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -53,8 +53,6 @@ int console_printk[4] = {
53 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 53 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
54}; 54};
55 55
56EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
57
58/* 56/*
59 * Low level drivers may need that to know if they can schedule in 57
60 * their unblank() callback or not. So let's export it. 58 * their unblank() callback or not. So let's export it.
@@ -335,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
335 } 333 }
336} 334}
337 335
336static int __read_mostly ignore_loglevel;
337
338int __init ignore_loglevel_setup(char *str)
339{
340 ignore_loglevel = 1;
341 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
342
343 return 1;
344}
345
346__setup("ignore_loglevel", ignore_loglevel_setup);
347
338/* 348/*
339 * Write out chars from start to end - 1 inclusive 349 * Write out chars from start to end - 1 inclusive
340 */ 350 */
341static void _call_console_drivers(unsigned long start, 351static void _call_console_drivers(unsigned long start,
342 unsigned long end, int msg_log_level) 352 unsigned long end, int msg_log_level)
343{ 353{
344 if (msg_log_level < console_loglevel && 354 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
345 console_drivers && start != end) { 355 console_drivers && start != end) {
346 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { 356 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
347 /* wrapped write */ 357 /* wrapped write */
@@ -631,12 +641,7 @@ EXPORT_SYMBOL(vprintk);
631 641
632asmlinkage long sys_syslog(int type, char __user *buf, int len) 642asmlinkage long sys_syslog(int type, char __user *buf, int len)
633{ 643{
634 return 0; 644 return -ENOSYS;
635}
636
637int do_syslog(int type, char __user *buf, int len)
638{
639 return 0;
640} 645}
641 646
642static void call_console_drivers(unsigned long start, unsigned long end) 647static void call_console_drivers(unsigned long start, unsigned long end)
@@ -777,7 +782,6 @@ int is_console_locked(void)
777{ 782{
778 return console_locked; 783 return console_locked;
779} 784}
780EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
781 785
782/** 786/**
783 * release_console_sem - unlock the console system 787 * release_console_sem - unlock the console system
diff --git a/kernel/profile.c b/kernel/profile.c
index f940b462eec9..fb5e03d57e9d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly;
40 40
41static atomic_t *prof_buffer; 41static atomic_t *prof_buffer;
42static unsigned long prof_len, prof_shift; 42static unsigned long prof_len, prof_shift;
43static int prof_on __read_mostly; 43int prof_on __read_mostly;
44static cpumask_t prof_cpu_mask = CPU_MASK_ALL; 44static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
45#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex);
51static int __init profile_setup(char * str) 51static int __init profile_setup(char * str)
52{ 52{
53 static char __initdata schedstr[] = "schedule"; 53 static char __initdata schedstr[] = "schedule";
54 static char __initdata sleepstr[] = "sleep";
54 int par; 55 int par;
55 56
56 if (!strncmp(str, schedstr, strlen(schedstr))) { 57 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
58 prof_on = SLEEP_PROFILING;
59 if (str[strlen(sleepstr)] == ',')
60 str += strlen(sleepstr) + 1;
61 if (get_option(&str, &par))
62 prof_shift = par;
63 printk(KERN_INFO
64 "kernel sleep profiling enabled (shift: %ld)\n",
65 prof_shift);
66 } else if (!strncmp(str, schedstr, strlen(schedstr))) {
57 prof_on = SCHED_PROFILING; 67 prof_on = SCHED_PROFILING;
58 if (str[strlen(schedstr)] == ',') 68 if (str[strlen(schedstr)] == ',')
59 str += strlen(schedstr) + 1; 69 str += strlen(schedstr) + 1;
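profile_setup() follows the usual boot-option idiom: match a keyword prefix, then optionally consume a ",N" suffix as the shift value. A user-space sketch of the same parsing, with strtol() standing in for the kernel's get_option() (the default shift here is made up for the demo):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Parse "schedule[,shift]" or "sleep[,shift]", as profile_setup() does. */
    static void parse_profile(const char *str)
    {
        static const char schedstr[] = "schedule";
        static const char sleepstr[] = "sleep";
        long shift = 2;   /* hypothetical default */

        if (!strncmp(str, sleepstr, strlen(sleepstr))) {
            str += strlen(sleepstr);
            if (*str == ',')
                shift = strtol(str + 1, NULL, 10);
            printf("sleep profiling, shift %ld\n", shift);
        } else if (!strncmp(str, schedstr, strlen(schedstr))) {
            str += strlen(schedstr);
            if (*str == ',')
                shift = strtol(str + 1, NULL, 10);
            printf("schedule profiling, shift %ld\n", shift);
        } else {
            printf("unrecognized option: %s\n", str);
        }
    }

    int main(void)
    {
        parse_profile("sleep,3");   /* sleep profiling, shift 3 */
        parse_profile("schedule");  /* schedule profiling, shift 2 */
        return 0;
    }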
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
204 * positions to which hits are accounted during short intervals (e.g. 214 * positions to which hits are accounted during short intervals (e.g.
205 * several seconds) is usually very small. Exclusion from buffer 215 * several seconds) is usually very small. Exclusion from buffer
206 * flipping is provided by interrupt disablement (note that for 216 * flipping is provided by interrupt disablement (note that for
207 * SCHED_PROFILING profile_hit() may be called from process context). 217 * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
218 * process context).
208 * The hash function is meant to be lightweight as opposed to strong, 219 * The hash function is meant to be lightweight as opposed to strong,
209 * and was vaguely inspired by ppc64 firmware-supported inverted 220 * and was vaguely inspired by ppc64 firmware-supported inverted
210 * pagetable hash functions, but uses a full hashtable full of finite 221 * pagetable hash functions, but uses a full hashtable full of finite
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void)
257 mutex_unlock(&profile_flip_mutex); 268 mutex_unlock(&profile_flip_mutex);
258} 269}
259 270
260void profile_hit(int type, void *__pc) 271void profile_hits(int type, void *__pc, unsigned int nr_hits)
261{ 272{
262 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 273 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
263 int i, j, cpu; 274 int i, j, cpu;
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc)
274 put_cpu(); 285 put_cpu();
275 return; 286 return;
276 } 287 }
288 /*
289 * We buffer the global profiler buffer into a per-CPU
290 * queue and thus reduce the number of global (and possibly
291 * NUMA-alien) accesses. The write-queue is self-coalescing:
292 */
277 local_irq_save(flags); 293 local_irq_save(flags);
278 do { 294 do {
279 for (j = 0; j < PROFILE_GRPSZ; ++j) { 295 for (j = 0; j < PROFILE_GRPSZ; ++j) {
280 if (hits[i + j].pc == pc) { 296 if (hits[i + j].pc == pc) {
281 hits[i + j].hits++; 297 hits[i + j].hits += nr_hits;
282 goto out; 298 goto out;
283 } else if (!hits[i + j].hits) { 299 } else if (!hits[i + j].hits) {
284 hits[i + j].pc = pc; 300 hits[i + j].pc = pc;
285 hits[i + j].hits = 1; 301 hits[i + j].hits = nr_hits;
286 goto out; 302 goto out;
287 } 303 }
288 } 304 }
289 i = (i + secondary) & (NR_PROFILE_HIT - 1); 305 i = (i + secondary) & (NR_PROFILE_HIT - 1);
290 } while (i != primary); 306 } while (i != primary);
291 atomic_inc(&prof_buffer[pc]); 307
308 /*
309 * Add the current hit(s) and flush the write-queue out
310 * to the global buffer:
311 */
312 atomic_add(nr_hits, &prof_buffer[pc]);
292 for (i = 0; i < NR_PROFILE_HIT; ++i) { 313 for (i = 0; i < NR_PROFILE_HIT; ++i) {
293 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); 314 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
294 hits[i].pc = hits[i].hits = 0; 315 hits[i].pc = hits[i].hits = 0;
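The per-CPU write queue described in the comment above behaves like a small open-addressed table: a repeated pc coalesces into its existing slot, a new pc claims the first empty slot, and a full table is drained into the global buffer. A single-threaded model of that policy (sizes and hash are arbitrary for the demo; the real code additionally handles buffer flipping and IRQ masking):

    #include <stdio.h>

    #define NR_SLOTS 4
    #define GLOBAL_LEN 16

    struct hit { unsigned long pc; unsigned int hits; };

    static struct hit queue[NR_SLOTS];          /* per-CPU write queue */
    static unsigned int global_buf[GLOBAL_LEN]; /* shared profile buffer */

    static void flush_queue(void)
    {
        int i;

        /* Empty slots have hits == 0 and contribute nothing. */
        for (i = 0; i < NR_SLOTS; i++) {
            global_buf[queue[i].pc % GLOBAL_LEN] += queue[i].hits;
            queue[i].pc = 0;
            queue[i].hits = 0;
        }
    }

    static void profile_hit(unsigned long pc)
    {
        int i;

        for (i = 0; i < NR_SLOTS; i++) {
            if (queue[i].pc == pc) {  /* coalesce with an earlier hit */
                queue[i].hits++;
                return;
            }
            if (!queue[i].hits) {     /* claim an empty slot */
                queue[i].pc = pc;
                queue[i].hits = 1;
                return;
            }
        }
        /* Queue full: account this hit directly and drain the queue. */
        global_buf[pc % GLOBAL_LEN]++;
        flush_queue();
    }

    int main(void)
    {
        unsigned long samples[] = { 5, 5, 9, 5, 13, 17, 2, 5 };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
            profile_hit(samples[i]);
        flush_queue();
        printf("pc 5 accumulated %u hits\n", global_buf[5]); /* 4 */
        return 0;
    }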
@@ -298,7 +319,6 @@ out:
298 put_cpu(); 319 put_cpu();
299} 320}
300 321
301#ifdef CONFIG_HOTPLUG_CPU
302static int __devinit profile_cpu_callback(struct notifier_block *info, 322static int __devinit profile_cpu_callback(struct notifier_block *info,
303 unsigned long action, void *__cpu) 323 unsigned long action, void *__cpu)
304{ 324{
@@ -351,19 +371,19 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
351 } 371 }
352 return NOTIFY_OK; 372 return NOTIFY_OK;
353} 373}
354#endif /* CONFIG_HOTPLUG_CPU */
355#else /* !CONFIG_SMP */ 374#else /* !CONFIG_SMP */
356#define profile_flip_buffers() do { } while (0) 375#define profile_flip_buffers() do { } while (0)
357#define profile_discard_flip_buffers() do { } while (0) 376#define profile_discard_flip_buffers() do { } while (0)
377#define profile_cpu_callback NULL
358 378
359void profile_hit(int type, void *__pc) 379void profile_hits(int type, void *__pc, unsigned int nr_hits)
360{ 380{
361 unsigned long pc; 381 unsigned long pc;
362 382
363 if (prof_on != type || !prof_buffer) 383 if (prof_on != type || !prof_buffer)
364 return; 384 return;
365 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 385 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
366 atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); 386 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
367} 387}
368#endif /* !CONFIG_SMP */ 388#endif /* !CONFIG_SMP */
369 389
@@ -442,7 +462,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
442 read = 0; 462 read = 0;
443 463
444 while (p < sizeof(unsigned int) && count > 0) { 464 while (p < sizeof(unsigned int) && count > 0) {
445 put_user(*((char *)(&sample_step)+p),buf); 465 if (put_user(*((char *)(&sample_step)+p),buf))
466 return -EFAULT;
446 buf++; p++; count--; read++; 467 buf++; p++; count--; read++;
447 } 468 }
448 pnt = (char *)prof_buffer + p - sizeof(atomic_t); 469 pnt = (char *)prof_buffer + p - sizeof(atomic_t);
@@ -480,7 +501,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
480 return count; 501 return count;
481} 502}
482 503
483static struct file_operations proc_profile_operations = { 504static const struct file_operations proc_profile_operations = {
484 .read = read_profile, 505 .read = read_profile,
485 .write = write_profile, 506 .write = write_profile,
486}; 507};
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 26bb5ffe1ef1..3554b76da84c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -235,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
235 235
236 list = rdp->donelist; 236 list = rdp->donelist;
237 while (list) { 237 while (list) {
238 next = rdp->donelist = list->next; 238 next = list->next;
239 prefetch(next);
239 list->func(list); 240 list->func(list);
240 list = next; 241 list = next;
241 if (++count >= rdp->blimit) 242 if (++count >= rdp->blimit)
242 break; 243 break;
243 } 244 }
245 rdp->donelist = list;
244 246
245 local_irq_disable(); 247 local_irq_disable();
246 rdp->qlen -= count; 248 rdp->qlen -= count;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e2bda18f6f42..c52f981ea008 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -401,7 +401,7 @@ static void srcu_torture_cleanup(void)
401 cleanup_srcu_struct(&srcu_ctl); 401 cleanup_srcu_struct(&srcu_ctl);
402} 402}
403 403
404static int srcu_torture_read_lock(void) 404static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
405{ 405{
406 return srcu_read_lock(&srcu_ctl); 406 return srcu_read_lock(&srcu_ctl);
407} 407}
@@ -419,7 +419,7 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
419 schedule_timeout_interruptible(longdelay); 419 schedule_timeout_interruptible(longdelay);
420} 420}
421 421
422static void srcu_torture_read_unlock(int idx) 422static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
423{ 423{
424 srcu_read_unlock(&srcu_ctl, idx); 424 srcu_read_unlock(&srcu_ctl, idx);
425} 425}
diff --git a/kernel/relay.c b/kernel/relay.c
index f04bbdb56ac2..75a3a9a7efc2 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -308,9 +308,10 @@ static struct rchan_callbacks default_channel_callbacks = {
308 * reason waking is deferred is that calling directly from write 308 * reason waking is deferred is that calling directly from write
309 * causes problems if you're writing from say the scheduler. 309 * causes problems if you're writing from say the scheduler.
310 */ 310 */
311static void wakeup_readers(void *private) 311static void wakeup_readers(struct work_struct *work)
312{ 312{
313 struct rchan_buf *buf = private; 313 struct rchan_buf *buf =
314 container_of(work, struct rchan_buf, wake_readers.work);
314 wake_up_interruptible(&buf->read_wait); 315 wake_up_interruptible(&buf->read_wait);
315} 316}
316 317
@@ -328,7 +329,7 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
328 if (init) { 329 if (init) {
329 init_waitqueue_head(&buf->read_wait); 330 init_waitqueue_head(&buf->read_wait);
330 kref_init(&buf->kref); 331 kref_init(&buf->kref);
331 INIT_WORK(&buf->wake_readers, NULL, NULL); 332 INIT_DELAYED_WORK(&buf->wake_readers, NULL);
332 } else { 333 } else {
333 cancel_delayed_work(&buf->wake_readers); 334 cancel_delayed_work(&buf->wake_readers);
334 flush_scheduled_work(); 335 flush_scheduled_work();
@@ -549,7 +550,8 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
549 buf->padding[old_subbuf]; 550 buf->padding[old_subbuf];
550 smp_mb(); 551 smp_mb();
551 if (waitqueue_active(&buf->read_wait)) { 552 if (waitqueue_active(&buf->read_wait)) {
552 PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf); 553 PREPARE_DELAYED_WORK(&buf->wake_readers,
554 wakeup_readers);
553 schedule_delayed_work(&buf->wake_readers, 1); 555 schedule_delayed_work(&buf->wake_readers, 1);
554 } 556 }
555 } 557 }
@@ -1011,7 +1013,7 @@ static ssize_t relay_file_sendfile(struct file *filp,
1011 actor, &desc); 1013 actor, &desc);
1012} 1014}
1013 1015
1014struct file_operations relay_file_operations = { 1016const struct file_operations relay_file_operations = {
1015 .open = relay_file_open, 1017 .open = relay_file_open,
1016 .poll = relay_file_poll, 1018 .poll = relay_file_poll,
1017 .mmap = relay_file_mmap, 1019 .mmap = relay_file_mmap,
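The relay changes are the mechanical half of the 2.6.20 work_struct rework: handlers now receive the work_struct itself rather than a void * payload, recover their enclosing object with container_of() (as wakeup_readers() does above), and delayed items move to the dedicated struct delayed_work, hence INIT_DELAYED_WORK()/PREPARE_DELAYED_WORK(). A standalone illustration of the container_of() recovery step, with demo type names invented here:

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct work { int pending; };

	struct buf_demo {
		int id;
		struct work wake_readers;	/* work item embedded in its owner */
	};

	static void handler(struct work *w)
	{
		/* step back from the member to the structure embedding it */
		struct buf_demo *buf = container_of(w, struct buf_demo, wake_readers);

		printf("woke buf %d\n", buf->id);
	}

	int main(void)
	{
		struct buf_demo b = { .id = 42 };

		handler(&b.wake_readers);	/* prints "woke buf 42" */
		return 0;
	}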
diff --git a/kernel/resource.c b/kernel/resource.c
index 6de60c12143e..7b9a497419d9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v)
88 return 0; 88 return 0;
89} 89}
90 90
91static struct seq_operations resource_op = { 91static const struct seq_operations resource_op = {
92 .start = r_start, 92 .start = r_start,
93 .next = r_next, 93 .next = r_next,
94 .stop = r_stop, 94 .stop = r_stop,
@@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file)
115 return res; 115 return res;
116} 116}
117 117
118static struct file_operations proc_ioports_operations = { 118static const struct file_operations proc_ioports_operations = {
119 .open = ioports_open, 119 .open = ioports_open,
120 .read = seq_read, 120 .read = seq_read,
121 .llseek = seq_lseek, 121 .llseek = seq_lseek,
122 .release = seq_release, 122 .release = seq_release,
123}; 123};
124 124
125static struct file_operations proc_iomem_operations = { 125static const struct file_operations proc_iomem_operations = {
126 .open = iomem_open, 126 .open = iomem_open,
127 .read = seq_read, 127 .read = seq_read,
128 .llseek = seq_lseek, 128 .llseek = seq_lseek,
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 6dcea9dd8c94..015fc633c96c 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -13,6 +13,7 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/sysdev.h> 14#include <linux/sysdev.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16#include <linux/freezer.h>
16 17
17#include "rtmutex.h" 18#include "rtmutex.h"
18 19
diff --git a/kernel/sched.c b/kernel/sched.c
index 3399701c680e..f385eff4682d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,7 +34,7 @@
34#include <linux/security.h> 34#include <linux/security.h>
35#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/profile.h> 36#include <linux/profile.h>
37#include <linux/suspend.h> 37#include <linux/freezer.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/delay.h> 40#include <linux/delay.h>
@@ -505,7 +505,7 @@ static int schedstat_open(struct inode *inode, struct file *file)
505 return res; 505 return res;
506} 506}
507 507
508struct file_operations proc_schedstat_operations = { 508const struct file_operations proc_schedstat_operations = {
509 .open = schedstat_open, 509 .open = schedstat_open,
510 .read = seq_read, 510 .read = seq_read,
511 .llseek = seq_lseek, 511 .llseek = seq_lseek,
@@ -948,6 +948,17 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
948 } 948 }
949#endif 949#endif
950 950
951 /*
952 * Sleep time is in units of nanosecs, so shift by 20 to get a
953 * milliseconds-range estimation of the amount of time that the task
954 * spent sleeping:
955 */
956 if (unlikely(prof_on == SLEEP_PROFILING)) {
957 if (p->state == TASK_UNINTERRUPTIBLE)
958 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
959 (now - p->timestamp) >> 20);
960 }
961
951 if (!rt_task(p)) 962 if (!rt_task(p))
952 p->prio = recalc_task_prio(p, now); 963 p->prio = recalc_task_prio(p, now);
953 964
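The shift is a cheap nanoseconds-to-milliseconds conversion: dividing by 2^20 = 1,048,576 instead of 1,000,000 needs no 64-bit division in this hot path, at the cost of under-reporting by about 4.6%, which is fine for profiling. Each hit credits the task's sleep site (get_wchan()) with roughly that many milliseconds:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long ns = 1500ULL * 1000 * 1000;	/* slept 1.5 s */

		printf("exact ms: %llu\n", ns / 1000000);	/* 1500 */
		printf("ns >> 20: %llu\n", ns >> 20);		/* 1430, ~4.6% low */
		return 0;
	}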
@@ -3333,6 +3344,7 @@ asmlinkage void __sched schedule(void)
3333 printk(KERN_ERR "BUG: scheduling while atomic: " 3344 printk(KERN_ERR "BUG: scheduling while atomic: "
3334 "%s/0x%08x/%d\n", 3345 "%s/0x%08x/%d\n",
3335 current->comm, preempt_count(), current->pid); 3346 current->comm, preempt_count(), current->pid);
3347 debug_show_held_locks(current);
3336 dump_stack(); 3348 dump_stack();
3337 } 3349 }
3338 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3350 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4804,18 +4816,18 @@ static void show_task(struct task_struct *p)
4804 show_stack(p, NULL); 4816 show_stack(p, NULL);
4805} 4817}
4806 4818
4807void show_state(void) 4819void show_state_filter(unsigned long state_filter)
4808{ 4820{
4809 struct task_struct *g, *p; 4821 struct task_struct *g, *p;
4810 4822
4811#if (BITS_PER_LONG == 32) 4823#if (BITS_PER_LONG == 32)
4812 printk("\n" 4824 printk("\n"
4813 " sibling\n"); 4825 " free sibling\n");
4814 printk(" task PC pid father child younger older\n"); 4826 printk(" task PC stack pid father child younger older\n");
4815#else 4827#else
4816 printk("\n" 4828 printk("\n"
4817 " sibling\n"); 4829 " free sibling\n");
4818 printk(" task PC pid father child younger older\n"); 4830 printk(" task PC stack pid father child younger older\n");
4819#endif 4831#endif
4820 read_lock(&tasklist_lock); 4832 read_lock(&tasklist_lock);
4821 do_each_thread(g, p) { 4833 do_each_thread(g, p) {
@@ -4824,11 +4836,16 @@ void show_state(void)
 4824 * console might take a lot of time: 4836
4825 */ 4837 */
4826 touch_nmi_watchdog(); 4838 touch_nmi_watchdog();
4827 show_task(p); 4839 if (p->state & state_filter)
4840 show_task(p);
4828 } while_each_thread(g, p); 4841 } while_each_thread(g, p);
4829 4842
4830 read_unlock(&tasklist_lock); 4843 read_unlock(&tasklist_lock);
4831 debug_show_all_locks(); 4844 /*
4845 * Only show locks if all tasks are dumped:
4846 */
4847 if (state_filter == -1)
4848 debug_show_all_locks();
4832} 4849}
4833 4850
4834/** 4851/**
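show_state_filter() makes the all-task dump selective: callers pass a TASK_* bitmask, e.g. TASK_UNINTERRUPTIBLE to list only D-state tasks, and the expensive lock dump is reserved for the dump-everything mask of -1. One subtlety of a pure bitmask test: TASK_RUNNING is 0, so `p->state & state_filter` can never match runnable tasks. Sketch using the real state values:

	#include <stdio.h>

	#define TASK_RUNNING		0
	#define TASK_INTERRUPTIBLE	1
	#define TASK_UNINTERRUPTIBLE	2

	int main(void)
	{
		long state[] = { TASK_RUNNING, TASK_INTERRUPTIBLE,
				 TASK_UNINTERRUPTIBLE };
		const char *name[] = { "R", "S", "D" };
		unsigned long filter = TASK_UNINTERRUPTIBLE;	/* "only D-state" */

		for (int i = 0; i < 3; i++)
			if (state[i] & filter)
				printf("would show %s task\n", name[i]);
		return 0;
	}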
@@ -6723,8 +6740,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6723 sched_smt_power_savings_store); 6740 sched_smt_power_savings_store);
6724#endif 6741#endif
6725 6742
6726
6727#ifdef CONFIG_HOTPLUG_CPU
6728/* 6743/*
6729 * Force a reinitialization of the sched domains hierarchy. The domains 6744 * Force a reinitialization of the sched domains hierarchy. The domains
6730 * and groups cannot be updated in place without racing with the balancing 6745 * and groups cannot be updated in place without racing with the balancing
@@ -6757,7 +6772,6 @@ static int update_sched_domains(struct notifier_block *nfb,
6757 6772
6758 return NOTIFY_OK; 6773 return NOTIFY_OK;
6759} 6774}
6760#endif
6761 6775
6762void __init sched_init_smp(void) 6776void __init sched_init_smp(void)
6763{ 6777{
@@ -6867,6 +6881,7 @@ void __might_sleep(char *file, int line)
6867 " context at %s:%d\n", file, line); 6881 " context at %s:%d\n", file, line);
6868 printk("in_atomic():%d, irqs_disabled():%d\n", 6882 printk("in_atomic():%d, irqs_disabled():%d\n",
6869 in_atomic(), irqs_disabled()); 6883 in_atomic(), irqs_disabled());
6884 debug_show_held_locks(current);
6870 dump_stack(); 6885 dump_stack();
6871 } 6886 }
6872#endif 6887#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index df18c167a2a7..ec81defde339 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,7 @@
23#include <linux/ptrace.h> 23#include <linux/ptrace.h>
24#include <linux/signal.h> 24#include <linux/signal.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/freezer.h>
26#include <asm/param.h> 27#include <asm/param.h>
27#include <asm/uaccess.h> 28#include <asm/uaccess.h>
28#include <asm/unistd.h> 29#include <asm/unistd.h>
@@ -33,7 +34,7 @@
33 * SLAB caches for signal bits. 34 * SLAB caches for signal bits.
34 */ 35 */
35 36
36static kmem_cache_t *sigqueue_cachep; 37static struct kmem_cache *sigqueue_cachep;
37 38
38/* 39/*
39 * In POSIX a signal is sent either to a specific thread (Linux task) 40 * In POSIX a signal is sent either to a specific thread (Linux task)
@@ -1133,8 +1134,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1133 return error; 1134 return error;
1134} 1135}
1135 1136
1136int 1137static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1137kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1138{ 1138{
1139 int error; 1139 int error;
1140 rcu_read_lock(); 1140 rcu_read_lock();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bf25015dce16..918e52df090e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
574 574
575 switch (action) { 575 switch (action) {
576 case CPU_UP_PREPARE: 576 case CPU_UP_PREPARE:
577 BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
578 BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
579 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 577 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
580 if (IS_ERR(p)) { 578 if (IS_ERR(p)) {
581 printk("ksoftirqd for %i failed\n", hotcpu); 579 printk("ksoftirqd for %i failed\n", hotcpu);
diff --git a/kernel/sys.c b/kernel/sys.c
index 98489d82801b..a0c1a29a507f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -880,7 +880,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
880 return 0; 880 return 0;
881} 881}
882 882
883static void deferred_cad(void *dummy) 883static void deferred_cad(struct work_struct *dummy)
884{ 884{
885 kernel_restart(NULL); 885 kernel_restart(NULL);
886} 886}
@@ -892,7 +892,7 @@ static void deferred_cad(void *dummy)
892 */ 892 */
893void ctrl_alt_del(void) 893void ctrl_alt_del(void)
894{ 894{
895 static DECLARE_WORK(cad_work, deferred_cad, NULL); 895 static DECLARE_WORK(cad_work, deferred_cad);
896 896
897 if (C_A_D) 897 if (C_A_D)
898 schedule_work(&cad_work); 898 schedule_work(&cad_work);
@@ -1102,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
1102asmlinkage long sys_setuid(uid_t uid) 1102asmlinkage long sys_setuid(uid_t uid)
1103{ 1103{
1104 int old_euid = current->euid; 1104 int old_euid = current->euid;
1105 int old_ruid, old_suid, new_ruid, new_suid; 1105 int old_ruid, old_suid, new_suid;
1106 int retval; 1106 int retval;
1107 1107
1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); 1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
1109 if (retval) 1109 if (retval)
1110 return retval; 1110 return retval;
1111 1111
1112 old_ruid = new_ruid = current->uid; 1112 old_ruid = current->uid;
1113 old_suid = current->suid; 1113 old_suid = current->suid;
1114 new_suid = old_suid; 1114 new_suid = old_suid;
1115 1115
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6fc5e17086f4..8e9f00fd6d18 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -171,7 +171,7 @@ static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
171static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); 171static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
172static int proc_opensys(struct inode *, struct file *); 172static int proc_opensys(struct inode *, struct file *);
173 173
174struct file_operations proc_sys_file_operations = { 174const struct file_operations proc_sys_file_operations = {
175 .open = proc_opensys, 175 .open = proc_opensys,
176 .read = proc_readsys, 176 .read = proc_readsys,
177 .write = proc_writesys, 177 .write = proc_writesys,
@@ -986,17 +986,6 @@ static ctl_table vm_table[] = {
986 .extra1 = &zero, 986 .extra1 = &zero,
987 }, 987 },
988#endif 988#endif
989#ifdef CONFIG_SWAP
990 {
991 .ctl_name = VM_SWAP_TOKEN_TIMEOUT,
992 .procname = "swap_token_timeout",
993 .data = &swap_token_default_timeout,
994 .maxlen = sizeof(swap_token_default_timeout),
995 .mode = 0644,
996 .proc_handler = &proc_dointvec_jiffies,
997 .strategy = &sysctl_jiffies,
998 },
999#endif
1000#ifdef CONFIG_NUMA 989#ifdef CONFIG_NUMA
1001 { 990 {
1002 .ctl_name = VM_ZONE_RECLAIM_MODE, 991 .ctl_name = VM_ZONE_RECLAIM_MODE,
@@ -1895,7 +1884,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
1895 p = buf; 1884 p = buf;
1896 if (*p == '-' && left > 1) { 1885 if (*p == '-' && left > 1) {
1897 neg = 1; 1886 neg = 1;
1898 left--, p++; 1887 p++;
1899 } 1888 }
1900 if (*p < '0' || *p > '9') 1889 if (*p < '0' || *p > '9')
1901 break; 1890 break;
@@ -2146,7 +2135,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
2146 p = buf; 2135 p = buf;
2147 if (*p == '-' && left > 1) { 2136 if (*p == '-' && left > 1) {
2148 neg = 1; 2137 neg = 1;
2149 left--, p++; 2138 p++;
2150 } 2139 }
2151 if (*p < '0' || *p > '9') 2140 if (*p < '0' || *p > '9')
2152 break; 2141 break;
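The `left--, p++` edits in both parsers look cosmetic but fix a counting bug: after simple_strtoul() the function computes the consumed length as p - buf, which already includes the leading '-', so also decrementing `left` when the sign was consumed counted that byte twice. Presumably that is the motivation here; a self-contained rendition of the double-count:

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		const char buf[] = "-42";
		const char *p = buf;
		size_t left = sizeof(buf) - 1;		/* 3 bytes remaining */
		int neg = 0;

		if (*p == '-' && left > 1) {
			neg = 1;
			/* the buggy variant additionally did: left--; */
			p++;
		}

		char *end;
		unsigned long val = strtoul(p, &end, 10);
		size_t len = end - buf;			/* 3: the '-' already counted */
		left -= len;

		printf("val=%s%lu left=%zu\n", neg ? "-" : "", val, left);
		/* with the extra left-- above, left would wrap to (size_t)-1 */
		return 0;
	}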
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d3d28919d4b4..4c3476fa058d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -34,7 +34,7 @@
34 34
35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; 35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
36static int family_registered; 36static int family_registered;
37kmem_cache_t *taskstats_cache; 37struct kmem_cache *taskstats_cache;
38 38
39static struct genl_family family = { 39static struct genl_family family = {
40 .id = GENL_ID_GENERATE, 40 .id = GENL_ID_GENERATE,
@@ -69,7 +69,7 @@ enum actions {
69}; 69};
70 70
71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
72 void **replyp, size_t size) 72 size_t size)
73{ 73{
74 struct sk_buff *skb; 74 struct sk_buff *skb;
75 void *reply; 75 void *reply;
@@ -94,7 +94,6 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
94 } 94 }
95 95
96 *skbp = skb; 96 *skbp = skb;
97 *replyp = reply;
98 return 0; 97 return 0;
99} 98}
100 99
@@ -119,10 +118,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
119/* 118/*
120 * Send taskstats data in @skb to listeners registered for @cpu's exit data 119 * Send taskstats data in @skb to listeners registered for @cpu's exit data
121 */ 120 */
122static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) 121static void send_cpu_listeners(struct sk_buff *skb,
122 struct listener_list *listeners)
123{ 123{
124 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 124 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
125 struct listener_list *listeners;
126 struct listener *s, *tmp; 125 struct listener *s, *tmp;
127 struct sk_buff *skb_next, *skb_cur = skb; 126 struct sk_buff *skb_next, *skb_cur = skb;
128 void *reply = genlmsg_data(genlhdr); 127 void *reply = genlmsg_data(genlhdr);
@@ -135,7 +134,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
135 } 134 }
136 135
137 rc = 0; 136 rc = 0;
138 listeners = &per_cpu(listener_array, cpu);
139 down_read(&listeners->sem); 137 down_read(&listeners->sem);
140 list_for_each_entry(s, &listeners->list, list) { 138 list_for_each_entry(s, &listeners->list, list) {
141 skb_next = NULL; 139 skb_next = NULL;
@@ -186,6 +184,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
186 } else 184 } else
187 get_task_struct(tsk); 185 get_task_struct(tsk);
188 186
187 memset(stats, 0, sizeof(*stats));
189 /* 188 /*
190 * Each accounting subsystem adds calls to its functions to 189 * Each accounting subsystem adds calls to its functions to
 191 * fill in relevant parts of struct taskstats as follows 190
@@ -228,6 +227,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
228 227
229 if (first->signal->stats) 228 if (first->signal->stats)
230 memcpy(stats, first->signal->stats, sizeof(*stats)); 229 memcpy(stats, first->signal->stats, sizeof(*stats));
230 else
231 memset(stats, 0, sizeof(*stats));
231 232
232 tsk = first; 233 tsk = first;
233 do { 234 do {
@@ -344,14 +345,36 @@ static int parse(struct nlattr *na, cpumask_t *mask)
344 return ret; 345 return ret;
345} 346}
346 347
348static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
349{
350 struct nlattr *na, *ret;
351 int aggr;
352
353 aggr = (type == TASKSTATS_TYPE_PID)
354 ? TASKSTATS_TYPE_AGGR_PID
355 : TASKSTATS_TYPE_AGGR_TGID;
356
357 na = nla_nest_start(skb, aggr);
358 if (!na)
359 goto err;
360 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
361 goto err;
362 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
363 if (!ret)
364 goto err;
365 nla_nest_end(skb, na);
366
367 return nla_data(ret);
368err:
369 return NULL;
370}
371
347static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 372static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
348{ 373{
349 int rc = 0; 374 int rc = 0;
350 struct sk_buff *rep_skb; 375 struct sk_buff *rep_skb;
351 struct taskstats stats; 376 struct taskstats *stats;
352 void *reply;
353 size_t size; 377 size_t size;
354 struct nlattr *na;
355 cpumask_t mask; 378 cpumask_t mask;
356 379
357 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); 380 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
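mk_reply() factors out the nested-attribute boilerplate and flips the construction strategy: instead of filling a struct taskstats on the stack and copying it into the message with NLA_PUT_TYPE(), it reserves the payload inside the skb with nla_reserve() and returns a pointer into it, so fill_pid()/fill_tgid() write the statistics in place. For the PID case the reply it builds looks like this (attribute tree, not literal code):

	/*
	 * TASKSTATS_TYPE_AGGR_PID	nest opened by nla_nest_start()
	 * +- TASKSTATS_TYPE_PID	u32, written by nla_put()
	 * +- TASKSTATS_TYPE_STATS	sizeof(struct taskstats) bytes, reserved
	 *				empty by nla_reserve(); nla_data(ret)
	 *				is returned so the caller fills it
	 *				in place
	 */

Since nothing is committed to the skb until the reservation succeeds, the NLA_PUT_*/genlmsg_cancel() unwinding disappears; the only error action left is nlmsg_free().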
@@ -372,83 +395,71 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
372 size = nla_total_size(sizeof(u32)) + 395 size = nla_total_size(sizeof(u32)) +
373 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 396 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
374 397
375 memset(&stats, 0, sizeof(stats)); 398 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
376 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
377 if (rc < 0) 399 if (rc < 0)
378 return rc; 400 return rc;
379 401
402 rc = -EINVAL;
380 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 403 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
381 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 404 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
382 rc = fill_pid(pid, NULL, &stats); 405 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
383 if (rc < 0) 406 if (!stats)
384 goto err; 407 goto err;
385 408
386 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 409 rc = fill_pid(pid, NULL, stats);
387 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); 410 if (rc < 0)
388 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 411 goto err;
389 stats);
390 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 412 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
391 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 413 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
392 rc = fill_tgid(tgid, NULL, &stats); 414 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
393 if (rc < 0) 415 if (!stats)
394 goto err; 416 goto err;
395 417
396 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 418 rc = fill_tgid(tgid, NULL, stats);
397 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); 419 if (rc < 0)
398 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 420 goto err;
399 stats); 421 } else
400 } else {
401 rc = -EINVAL;
402 goto err; 422 goto err;
403 }
404
405 nla_nest_end(rep_skb, na);
406 423
407 return send_reply(rep_skb, info->snd_pid); 424 return send_reply(rep_skb, info->snd_pid);
408
409nla_put_failure:
410 rc = genlmsg_cancel(rep_skb, reply);
411err: 425err:
412 nlmsg_free(rep_skb); 426 nlmsg_free(rep_skb);
413 return rc; 427 return rc;
414} 428}
415 429
416void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) 430static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
417{ 431{
418 struct listener_list *listeners; 432 struct signal_struct *sig = tsk->signal;
419 struct taskstats *tmp; 433 struct taskstats *stats;
420 /*
421 * This is the cpu on which the task is exiting currently and will
422 * be the one for which the exit event is sent, even if the cpu
423 * on which this function is running changes later.
424 */
425 *mycpu = raw_smp_processor_id();
426 434
427 *ptidstats = NULL; 435 if (sig->stats || thread_group_empty(tsk))
428 tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); 436 goto ret;
429 if (!tmp)
430 return;
431 437
432 listeners = &per_cpu(listener_array, *mycpu); 438 /* No problem if kmem_cache_zalloc() fails */
433 down_read(&listeners->sem); 439 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
434 if (!list_empty(&listeners->list)) { 440
435 *ptidstats = tmp; 441 spin_lock_irq(&tsk->sighand->siglock);
436 tmp = NULL; 442 if (!sig->stats) {
443 sig->stats = stats;
444 stats = NULL;
437 } 445 }
438 up_read(&listeners->sem); 446 spin_unlock_irq(&tsk->sighand->siglock);
439 kfree(tmp); 447
448 if (stats)
449 kmem_cache_free(taskstats_cache, stats);
450ret:
451 return sig->stats;
440} 452}
441 453
442/* Send pid data out on exit */ 454/* Send pid data out on exit */
443void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, 455void taskstats_exit(struct task_struct *tsk, int group_dead)
444 int group_dead, unsigned int mycpu)
445{ 456{
446 int rc; 457 int rc;
458 struct listener_list *listeners;
459 struct taskstats *stats;
447 struct sk_buff *rep_skb; 460 struct sk_buff *rep_skb;
448 void *reply;
449 size_t size; 461 size_t size;
450 int is_thread_group; 462 int is_thread_group;
451 struct nlattr *na;
452 463
453 if (!family_registered) 464 if (!family_registered)
454 return; 465 return;
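taskstats_tgid_alloc() is the classic optimistic-allocation idiom: allocate outside the lock, where GFP_KERNEL may sleep; install the buffer under ->siglock only if nobody raced us; free the loser's copy. Allocation failure is tolerated, as the caller simply sees a NULL sig->stats and falls back to per-task statistics. A generic pthread rendition of the same pattern, names invented for the sketch:

	#include <pthread.h>
	#include <stdlib.h>

	struct stats { long counter[16]; };

	struct group {
		pthread_mutex_t lock;
		struct stats *stats;		/* lazily allocated, shared */
	};

	/* Return the group's stats block, allocating it on first use. */
	static struct stats *group_stats(struct group *g)
	{
		struct stats *s;

		if (g->stats)
			return g->stats;	/* fast path: already installed */

		s = calloc(1, sizeof(*s));	/* may fail; that is fine */

		pthread_mutex_lock(&g->lock);
		if (!g->stats) {
			g->stats = s;		/* we won the race */
			s = NULL;
		}
		pthread_mutex_unlock(&g->lock);

		free(s);			/* loser's copy, or NULL */
		return g->stats;
	}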
@@ -459,7 +470,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
459 size = nla_total_size(sizeof(u32)) + 470 size = nla_total_size(sizeof(u32)) +
460 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 471 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
461 472
462 is_thread_group = (tsk->signal->stats != NULL); 473 is_thread_group = !!taskstats_tgid_alloc(tsk);
463 if (is_thread_group) { 474 if (is_thread_group) {
464 /* PID + STATS + TGID + STATS */ 475 /* PID + STATS + TGID + STATS */
465 size = 2 * size; 476 size = 2 * size;
@@ -467,49 +478,39 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
467 fill_tgid_exit(tsk); 478 fill_tgid_exit(tsk);
468 } 479 }
469 480
470 if (!tidstats) 481 listeners = &__raw_get_cpu_var(listener_array);
482 if (list_empty(&listeners->list))
471 return; 483 return;
472 484
473 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); 485 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
474 if (rc < 0)
475 goto ret;
476
477 rc = fill_pid(tsk->pid, tsk, tidstats);
478 if (rc < 0) 486 if (rc < 0)
479 goto err_skb; 487 return;
480 488
481 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 489 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
482 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); 490 if (!stats)
483 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 491 goto err;
484 *tidstats);
485 nla_nest_end(rep_skb, na);
486 492
487 if (!is_thread_group) 493 rc = fill_pid(tsk->pid, tsk, stats);
488 goto send; 494 if (rc < 0)
495 goto err;
489 496
490 /* 497 /*
491 * Doesn't matter if tsk is the leader or the last group member leaving 498 * Doesn't matter if tsk is the leader or the last group member leaving
492 */ 499 */
493 if (!group_dead) 500 if (!is_thread_group || !group_dead)
494 goto send; 501 goto send;
495 502
496 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 503 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
497 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); 504 if (!stats)
498 /* No locking needed for tsk->signal->stats since group is dead */ 505 goto err;
499 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 506
500 *tsk->signal->stats); 507 memcpy(stats, tsk->signal->stats, sizeof(*stats));
501 nla_nest_end(rep_skb, na);
502 508
503send: 509send:
504 send_cpu_listeners(rep_skb, mycpu); 510 send_cpu_listeners(rep_skb, listeners);
505 return; 511 return;
506 512err:
507nla_put_failure:
508 genlmsg_cancel(rep_skb, reply);
509err_skb:
510 nlmsg_free(rep_skb); 513 nlmsg_free(rep_skb);
511ret:
512 return;
513} 514}
514 515
515static struct genl_ops taskstats_ops = { 516static struct genl_ops taskstats_ops = {
diff --git a/kernel/user.c b/kernel/user.c
index 220e586127a0..4869563080e9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,7 +26,7 @@
26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) 27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
28 28
29static kmem_cache_t *uid_cachep; 29static struct kmem_cache *uid_cachep;
30static struct list_head uidhash_table[UIDHASH_SZ]; 30static struct list_head uidhash_table[UIDHASH_SZ];
31 31
32/* 32/*
@@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid)
132 if (!up) { 132 if (!up) {
133 struct user_struct *new; 133 struct user_struct *new;
134 134
135 new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); 135 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
136 if (!new) 136 if (!new)
137 return NULL; 137 return NULL;
138 new->uid = uid; 138 new->uid = uid;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 17c2f03d2c27..c5257316f4b9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -29,6 +29,9 @@
29#include <linux/kthread.h> 29#include <linux/kthread.h>
30#include <linux/hardirq.h> 30#include <linux/hardirq.h>
31#include <linux/mempolicy.h> 31#include <linux/mempolicy.h>
32#include <linux/freezer.h>
33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h>
32 35
33/* 36/*
34 * The per-CPU workqueue (if single thread, we always use the first 37 * The per-CPU workqueue (if single thread, we always use the first
@@ -55,6 +58,8 @@ struct cpu_workqueue_struct {
55 struct task_struct *thread; 58 struct task_struct *thread;
56 59
57 int run_depth; /* Detect run_workqueue() recursion depth */ 60 int run_depth; /* Detect run_workqueue() recursion depth */
61
62 int freezeable; /* Freeze the thread during suspend */
58} ____cacheline_aligned; 63} ____cacheline_aligned;
59 64
60/* 65/*
@@ -80,6 +85,29 @@ static inline int is_single_threaded(struct workqueue_struct *wq)
80 return list_empty(&wq->list); 85 return list_empty(&wq->list);
81} 86}
82 87
88static inline void set_wq_data(struct work_struct *work, void *wq)
89{
90 unsigned long new, old, res;
91
92 /* assume the pending flag is already set and that the task has already
93 * been queued on this workqueue */
94 new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
95 res = work->management;
96 if (res != new) {
97 do {
98 old = res;
99 new = (unsigned long) wq;
100 new |= (old & WORK_STRUCT_FLAG_MASK);
101 res = cmpxchg(&work->management, old, new);
102 } while (res != old);
103 }
104}
105
106static inline void *get_wq_data(struct work_struct *work)
107{
108 return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK);
109}
110
83/* Preempt must be disabled. */ 111/* Preempt must be disabled. */
84static void __queue_work(struct cpu_workqueue_struct *cwq, 112static void __queue_work(struct cpu_workqueue_struct *cwq,
85 struct work_struct *work) 113 struct work_struct *work)
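set_wq_data()/get_wq_data() replace the old work->wq_data pointer by packing the cpu_workqueue pointer and the status bits into the single work->management word: a suitably aligned pointer has free low bits, the flags live there, WORK_STRUCT_WQ_DATA_MASK strips them off again, and cmpxchg() prevents a concurrent flag update from being lost. A userspace sketch of the tagged-pointer half of that (mask width chosen for the demo):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define FLAG_MASK	3UL		/* low two bits carry flags */
	#define FLAG_PENDING	1UL

	struct queue { int id; } __attribute__((aligned(4)));

	static unsigned long pack(struct queue *q, unsigned long flags)
	{
		assert(((uintptr_t)q & FLAG_MASK) == 0);  /* alignment frees the bits */
		return (uintptr_t)q | flags;
	}

	static struct queue *unpack(unsigned long word)
	{
		return (struct queue *)(word & ~FLAG_MASK);
	}

	int main(void)
	{
		struct queue q = { 7 };
		unsigned long management = pack(&q, FLAG_PENDING);

		printf("queue %d, pending=%lu\n",
		       unpack(management)->id, management & FLAG_PENDING);
		return 0;
	}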
@@ -87,7 +115,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
87 unsigned long flags; 115 unsigned long flags;
88 116
89 spin_lock_irqsave(&cwq->lock, flags); 117 spin_lock_irqsave(&cwq->lock, flags);
90 work->wq_data = cwq; 118 set_wq_data(work, cwq);
91 list_add_tail(&work->entry, &cwq->worklist); 119 list_add_tail(&work->entry, &cwq->worklist);
92 cwq->insert_sequence++; 120 cwq->insert_sequence++;
93 wake_up(&cwq->more_work); 121 wake_up(&cwq->more_work);
@@ -108,7 +136,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
108{ 136{
109 int ret = 0, cpu = get_cpu(); 137 int ret = 0, cpu = get_cpu();
110 138
111 if (!test_and_set_bit(0, &work->pending)) { 139 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
112 if (unlikely(is_single_threaded(wq))) 140 if (unlikely(is_single_threaded(wq)))
113 cpu = singlethread_cpu; 141 cpu = singlethread_cpu;
114 BUG_ON(!list_empty(&work->entry)); 142 BUG_ON(!list_empty(&work->entry));
@@ -122,38 +150,42 @@ EXPORT_SYMBOL_GPL(queue_work);
122 150
123static void delayed_work_timer_fn(unsigned long __data) 151static void delayed_work_timer_fn(unsigned long __data)
124{ 152{
125 struct work_struct *work = (struct work_struct *)__data; 153 struct delayed_work *dwork = (struct delayed_work *)__data;
126 struct workqueue_struct *wq = work->wq_data; 154 struct workqueue_struct *wq = get_wq_data(&dwork->work);
127 int cpu = smp_processor_id(); 155 int cpu = smp_processor_id();
128 156
129 if (unlikely(is_single_threaded(wq))) 157 if (unlikely(is_single_threaded(wq)))
130 cpu = singlethread_cpu; 158 cpu = singlethread_cpu;
131 159
132 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 160 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
133} 161}
134 162
135/** 163/**
136 * queue_delayed_work - queue work on a workqueue after delay 164 * queue_delayed_work - queue work on a workqueue after delay
137 * @wq: workqueue to use 165 * @wq: workqueue to use
138 * @work: work to queue 166 * @work: delayable work to queue
139 * @delay: number of jiffies to wait before queueing 167 * @delay: number of jiffies to wait before queueing
140 * 168 *
141 * Returns 0 if @work was already on a queue, non-zero otherwise. 169 * Returns 0 if @work was already on a queue, non-zero otherwise.
142 */ 170 */
143int fastcall queue_delayed_work(struct workqueue_struct *wq, 171int fastcall queue_delayed_work(struct workqueue_struct *wq,
144 struct work_struct *work, unsigned long delay) 172 struct delayed_work *dwork, unsigned long delay)
145{ 173{
146 int ret = 0; 174 int ret = 0;
147 struct timer_list *timer = &work->timer; 175 struct timer_list *timer = &dwork->timer;
176 struct work_struct *work = &dwork->work;
177
178 if (delay == 0)
179 return queue_work(wq, work);
148 180
149 if (!test_and_set_bit(0, &work->pending)) { 181 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
150 BUG_ON(timer_pending(timer)); 182 BUG_ON(timer_pending(timer));
151 BUG_ON(!list_empty(&work->entry)); 183 BUG_ON(!list_empty(&work->entry));
152 184
153 /* This stores wq for the moment, for the timer_fn */ 185 /* This stores wq for the moment, for the timer_fn */
154 work->wq_data = wq; 186 set_wq_data(work, wq);
155 timer->expires = jiffies + delay; 187 timer->expires = jiffies + delay;
156 timer->data = (unsigned long)work; 188 timer->data = (unsigned long)dwork;
157 timer->function = delayed_work_timer_fn; 189 timer->function = delayed_work_timer_fn;
158 add_timer(timer); 190 add_timer(timer);
159 ret = 1; 191 ret = 1;
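struct delayed_work is simply the pair that every work_struct used to drag around; the timer's data field now carries the delayed_work pointer so delayed_work_timer_fn() can reach both halves. Roughly, from include/linux/workqueue.h of this release:

	struct delayed_work {
		struct work_struct work;
		struct timer_list timer;
	};

The new delay == 0 shortcut also means a zero-delay submission degenerates straight to queue_work() instead of taking a pointless trip through the timer wheel.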
@@ -172,19 +204,20 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
172 * Returns 0 if @work was already on a queue, non-zero otherwise. 204 * Returns 0 if @work was already on a queue, non-zero otherwise.
173 */ 205 */
174int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 206int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
175 struct work_struct *work, unsigned long delay) 207 struct delayed_work *dwork, unsigned long delay)
176{ 208{
177 int ret = 0; 209 int ret = 0;
178 struct timer_list *timer = &work->timer; 210 struct timer_list *timer = &dwork->timer;
211 struct work_struct *work = &dwork->work;
179 212
180 if (!test_and_set_bit(0, &work->pending)) { 213 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
181 BUG_ON(timer_pending(timer)); 214 BUG_ON(timer_pending(timer));
182 BUG_ON(!list_empty(&work->entry)); 215 BUG_ON(!list_empty(&work->entry));
183 216
184 /* This stores wq for the moment, for the timer_fn */ 217 /* This stores wq for the moment, for the timer_fn */
185 work->wq_data = wq; 218 set_wq_data(work, wq);
186 timer->expires = jiffies + delay; 219 timer->expires = jiffies + delay;
187 timer->data = (unsigned long)work; 220 timer->data = (unsigned long)dwork;
188 timer->function = delayed_work_timer_fn; 221 timer->function = delayed_work_timer_fn;
189 add_timer_on(timer, cpu); 222 add_timer_on(timer, cpu);
190 ret = 1; 223 ret = 1;
@@ -212,15 +245,26 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
212 while (!list_empty(&cwq->worklist)) { 245 while (!list_empty(&cwq->worklist)) {
213 struct work_struct *work = list_entry(cwq->worklist.next, 246 struct work_struct *work = list_entry(cwq->worklist.next,
214 struct work_struct, entry); 247 struct work_struct, entry);
215 void (*f) (void *) = work->func; 248 work_func_t f = work->func;
216 void *data = work->data;
217 249
218 list_del_init(cwq->worklist.next); 250 list_del_init(cwq->worklist.next);
219 spin_unlock_irqrestore(&cwq->lock, flags); 251 spin_unlock_irqrestore(&cwq->lock, flags);
220 252
221 BUG_ON(work->wq_data != cwq); 253 BUG_ON(get_wq_data(work) != cwq);
222 clear_bit(0, &work->pending); 254 if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
223 f(data); 255 work_release(work);
256 f(work);
257
258 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
259 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
260 "%s/0x%08x/%d\n",
261 current->comm, preempt_count(),
262 current->pid);
263 printk(KERN_ERR " last function: ");
264 print_symbol("%s\n", (unsigned long)f);
265 debug_show_held_locks(current);
266 dump_stack();
267 }
224 268
225 spin_lock_irqsave(&cwq->lock, flags); 269 spin_lock_irqsave(&cwq->lock, flags);
226 cwq->remove_sequence++; 270 cwq->remove_sequence++;
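Two behavioural points in the reworked run_workqueue() loop: unless the item was created non-auto-releasing (WORK_STRUCT_NOAUTOREL), work_release() clears the pending bit before f(work) runs, so a handler may requeue, or even free, the object its work_struct is embedded in; and after each handler the loop now flags functions that returned while atomic or still holding locks, naming the culprit via print_symbol(), matching the debug_show_held_locks() calls added to schedule() and __might_sleep() earlier in this commit. A handler freeing its own container, legal under the auto-release rule (my_obj/my_work_fn/process() invented):

	static void my_work_fn(struct work_struct *work)
	{
		struct my_obj *obj = container_of(work, struct my_obj, work);

		process(obj);	/* hypothetical payload handling */
		kfree(obj);	/* the work_struct lives inside obj */
	}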
@@ -237,7 +281,8 @@ static int worker_thread(void *__cwq)
237 struct k_sigaction sa; 281 struct k_sigaction sa;
238 sigset_t blocked; 282 sigset_t blocked;
239 283
240 current->flags |= PF_NOFREEZE; 284 if (!cwq->freezeable)
285 current->flags |= PF_NOFREEZE;
241 286
242 set_user_nice(current, -5); 287 set_user_nice(current, -5);
243 288
@@ -260,6 +305,9 @@ static int worker_thread(void *__cwq)
260 305
261 set_current_state(TASK_INTERRUPTIBLE); 306 set_current_state(TASK_INTERRUPTIBLE);
262 while (!kthread_should_stop()) { 307 while (!kthread_should_stop()) {
308 if (cwq->freezeable)
309 try_to_freeze();
310
263 add_wait_queue(&cwq->more_work, &wait); 311 add_wait_queue(&cwq->more_work, &wait);
264 if (list_empty(&cwq->worklist)) 312 if (list_empty(&cwq->worklist))
265 schedule(); 313 schedule();
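The freezeable flag addresses a suspend problem: worker threads used to mark themselves PF_NOFREEZE unconditionally, so the freezer skipped them and they could keep running, and submitting I/O, while a hibernation image was being written. A workqueue created with freezeable set instead lets its workers park in try_to_freeze() at the top of the loop, using the same <linux/freezer.h> API whose includes were added to rtmutex-tester.c, signal.c and sched.c above. A typical freezable kthread loop looks roughly like this (work_available()/do_work() invented):

	while (!kthread_should_stop()) {
		try_to_freeze();		/* park here during suspend */

		set_current_state(TASK_INTERRUPTIBLE);
		if (!work_available())
			schedule();
		__set_current_state(TASK_RUNNING);

		do_work();
	}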
@@ -336,7 +384,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
336EXPORT_SYMBOL_GPL(flush_workqueue); 384EXPORT_SYMBOL_GPL(flush_workqueue);
337 385
338static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 386static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
339 int cpu) 387 int cpu, int freezeable)
340{ 388{
341 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 389 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
342 struct task_struct *p; 390 struct task_struct *p;
@@ -346,6 +394,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
346 cwq->thread = NULL; 394 cwq->thread = NULL;
347 cwq->insert_sequence = 0; 395 cwq->insert_sequence = 0;
348 cwq->remove_sequence = 0; 396 cwq->remove_sequence = 0;
397 cwq->freezeable = freezeable;
349 INIT_LIST_HEAD(&cwq->worklist); 398 INIT_LIST_HEAD(&cwq->worklist);
350 init_waitqueue_head(&cwq->more_work); 399 init_waitqueue_head(&cwq->more_work);
351 init_waitqueue_head(&cwq->work_done); 400 init_waitqueue_head(&cwq->work_done);
@@ -361,7 +410,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
361} 410}
362 411
363struct workqueue_struct *__create_workqueue(const char *name, 412struct workqueue_struct *__create_workqueue(const char *name,
364 int singlethread) 413 int singlethread, int freezeable)
365{ 414{
366 int cpu, destroy = 0; 415 int cpu, destroy = 0;
367 struct workqueue_struct *wq; 416 struct workqueue_struct *wq;
@@ -381,7 +430,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
381 mutex_lock(&workqueue_mutex); 430 mutex_lock(&workqueue_mutex);
382 if (singlethread) { 431 if (singlethread) {
383 INIT_LIST_HEAD(&wq->list); 432 INIT_LIST_HEAD(&wq->list);
384 p = create_workqueue_thread(wq, singlethread_cpu); 433 p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
385 if (!p) 434 if (!p)
386 destroy = 1; 435 destroy = 1;
387 else 436 else
@@ -389,7 +438,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
389 } else { 438 } else {
390 list_add(&wq->list, &workqueues); 439 list_add(&wq->list, &workqueues);
391 for_each_online_cpu(cpu) { 440 for_each_online_cpu(cpu) {
392 p = create_workqueue_thread(wq, cpu); 441 p = create_workqueue_thread(wq, cpu, freezeable);
393 if (p) { 442 if (p) {
394 kthread_bind(p, cpu); 443 kthread_bind(p, cpu);
395 wake_up_process(p); 444 wake_up_process(p);
@@ -468,38 +517,37 @@ EXPORT_SYMBOL(schedule_work);
468 517
469/** 518/**
470 * schedule_delayed_work - put work task in global workqueue after delay 519 * schedule_delayed_work - put work task in global workqueue after delay
471 * @work: job to be done 520 * @dwork: job to be done
472 * @delay: number of jiffies to wait 521 * @delay: number of jiffies to wait or 0 for immediate execution
473 * 522 *
474 * After waiting for a given time this puts a job in the kernel-global 523 * After waiting for a given time this puts a job in the kernel-global
475 * workqueue. 524 * workqueue.
476 */ 525 */
477int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) 526int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
478{ 527{
479 return queue_delayed_work(keventd_wq, work, delay); 528 return queue_delayed_work(keventd_wq, dwork, delay);
480} 529}
481EXPORT_SYMBOL(schedule_delayed_work); 530EXPORT_SYMBOL(schedule_delayed_work);
482 531
483/** 532/**
484 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 533 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
485 * @cpu: cpu to use 534 * @cpu: cpu to use
486 * @work: job to be done 535 * @dwork: job to be done
487 * @delay: number of jiffies to wait 536 * @delay: number of jiffies to wait
488 * 537 *
489 * After waiting for a given time this puts a job in the kernel-global 538 * After waiting for a given time this puts a job in the kernel-global
490 * workqueue on the specified CPU. 539 * workqueue on the specified CPU.
491 */ 540 */
492int schedule_delayed_work_on(int cpu, 541int schedule_delayed_work_on(int cpu,
493 struct work_struct *work, unsigned long delay) 542 struct delayed_work *dwork, unsigned long delay)
494{ 543{
495 return queue_delayed_work_on(cpu, keventd_wq, work, delay); 544 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
496} 545}
497EXPORT_SYMBOL(schedule_delayed_work_on); 546EXPORT_SYMBOL(schedule_delayed_work_on);
498 547
499/** 548/**
500 * schedule_on_each_cpu - call a function on each online CPU from keventd 549 * schedule_on_each_cpu - call a function on each online CPU from keventd
501 * @func: the function to call 550 * @func: the function to call
502 * @info: a pointer to pass to func()
503 * 551 *
504 * Returns zero on success. 552 * Returns zero on success.
505 * Returns -ve errno on failure. 553 * Returns -ve errno on failure.
@@ -508,7 +556,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
508 * 556 *
509 * schedule_on_each_cpu() is very slow. 557 * schedule_on_each_cpu() is very slow.
510 */ 558 */
511int schedule_on_each_cpu(void (*func)(void *info), void *info) 559int schedule_on_each_cpu(work_func_t func)
512{ 560{
513 int cpu; 561 int cpu;
514 struct work_struct *works; 562 struct work_struct *works;
@@ -519,7 +567,7 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info)
519 567
520 mutex_lock(&workqueue_mutex); 568 mutex_lock(&workqueue_mutex);
521 for_each_online_cpu(cpu) { 569 for_each_online_cpu(cpu) {
522 INIT_WORK(per_cpu_ptr(works, cpu), func, info); 570 INIT_WORK(per_cpu_ptr(works, cpu), func);
523 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), 571 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
524 per_cpu_ptr(works, cpu)); 572 per_cpu_ptr(works, cpu));
525 } 573 }
@@ -539,12 +587,12 @@ EXPORT_SYMBOL(flush_scheduled_work);
539 * cancel_rearming_delayed_workqueue - reliably kill off a delayed 587 * cancel_rearming_delayed_workqueue - reliably kill off a delayed
540 * work whose handler rearms the delayed work. 588 * work whose handler rearms the delayed work.
541 * @wq: the controlling workqueue structure 589 * @wq: the controlling workqueue structure
542 * @work: the delayed work struct 590 * @dwork: the delayed work struct
543 */ 591 */
544void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, 592void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
545 struct work_struct *work) 593 struct delayed_work *dwork)
546{ 594{
547 while (!cancel_delayed_work(work)) 595 while (!cancel_delayed_work(dwork))
548 flush_workqueue(wq); 596 flush_workqueue(wq);
549} 597}
550EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); 598EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
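The cancel-then-flush loop exists because cancelling self-rearming work races with its own handler: cancel_delayed_work() only deletes a pending timer, and a handler running at that moment may rearm right after the cancel. flush_workqueue() waits for the in-flight handler to finish (and rearm), and the retry then deletes the freshly armed timer. A self-rearming handler under the converted API might look like this (my_poll/my_poll_fn invented):

	static struct delayed_work my_poll;

	static void my_poll_fn(struct work_struct *work)
	{
		struct delayed_work *dwork =
			container_of(work, struct delayed_work, work);

		/* ... periodic work ... */
		schedule_delayed_work(dwork, HZ);	/* rearm, one second out */
	}

	/* teardown: guaranteed not to leave a timer behind */
	cancel_rearming_delayed_work(&my_poll);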
@@ -552,18 +600,17 @@ EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
552/** 600/**
553 * cancel_rearming_delayed_work - reliably kill off a delayed keventd 601 * cancel_rearming_delayed_work - reliably kill off a delayed keventd
554 * work whose handler rearms the delayed work. 602 * work whose handler rearms the delayed work.
555 * @work: the delayed work struct 603 * @dwork: the delayed work struct
556 */ 604 */
557void cancel_rearming_delayed_work(struct work_struct *work) 605void cancel_rearming_delayed_work(struct delayed_work *dwork)
558{ 606{
559 cancel_rearming_delayed_workqueue(keventd_wq, work); 607 cancel_rearming_delayed_workqueue(keventd_wq, dwork);
560} 608}
561EXPORT_SYMBOL(cancel_rearming_delayed_work); 609EXPORT_SYMBOL(cancel_rearming_delayed_work);
562 610
563/** 611/**
564 * execute_in_process_context - reliably execute the routine with user context 612 * execute_in_process_context - reliably execute the routine with user context
565 * @fn: the function to execute 613 * @fn: the function to execute
566 * @data: data to pass to the function
567 * @ew: guaranteed storage for the execute work structure (must 614 * @ew: guaranteed storage for the execute work structure (must
568 * be available when the work executes) 615 * be available when the work executes)
569 * 616 *
@@ -573,15 +620,14 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work);
573 * Returns: 0 - function was executed 620 * Returns: 0 - function was executed
574 * 1 - function was scheduled for execution 621 * 1 - function was scheduled for execution
575 */ 622 */
576int execute_in_process_context(void (*fn)(void *data), void *data, 623int execute_in_process_context(work_func_t fn, struct execute_work *ew)
577 struct execute_work *ew)
578{ 624{
579 if (!in_interrupt()) { 625 if (!in_interrupt()) {
580 fn(data); 626 fn(&ew->work);
581 return 0; 627 return 0;
582 } 628 }
583 629
584 INIT_WORK(&ew->work, fn, data); 630 INIT_WORK(&ew->work, fn);
585 schedule_work(&ew->work); 631 schedule_work(&ew->work);
586 632
587 return 1; 633 return 1;
@@ -609,7 +655,6 @@ int current_is_keventd(void)
609 655
610} 656}
611 657
612#ifdef CONFIG_HOTPLUG_CPU
613/* Take the work from this (downed) CPU. */ 658/* Take the work from this (downed) CPU. */
614static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 659static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
615{ 660{
@@ -642,7 +687,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
642 mutex_lock(&workqueue_mutex); 687 mutex_lock(&workqueue_mutex);
643 /* Create a new workqueue thread for it. */ 688 /* Create a new workqueue thread for it. */
644 list_for_each_entry(wq, &workqueues, list) { 689 list_for_each_entry(wq, &workqueues, list) {
645 if (!create_workqueue_thread(wq, hotcpu)) { 690 if (!create_workqueue_thread(wq, hotcpu, 0)) {
646 printk("workqueue for %i failed\n", hotcpu); 691 printk("workqueue for %i failed\n", hotcpu);
647 return NOTIFY_BAD; 692 return NOTIFY_BAD;
648 } 693 }
@@ -692,7 +737,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
692 737
693 return NOTIFY_OK; 738 return NOTIFY_OK;
694} 739}
695#endif
696 740
697void init_workqueues(void) 741void init_workqueues(void)
698{ 742{