Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz          |  20
-rw-r--r--  kernel/acct.c              |  29
-rw-r--r--  kernel/audit.c             |   1
-rw-r--r--  kernel/auditfilter.c       |   3
-rw-r--r--  kernel/auditsc.c           |  13
-rw-r--r--  kernel/compat.c            |  33
-rw-r--r--  kernel/configs.c           |   2
-rw-r--r--  kernel/cpu.c               |   8
-rw-r--r--  kernel/cpuset.c            | 126
-rw-r--r--  kernel/delayacct.c         |  19
-rw-r--r--  kernel/dma.c               |   2
-rw-r--r--  kernel/exit.c              |  83
-rw-r--r--  kernel/fork.c              | 132
-rw-r--r--  kernel/futex.c             |  62
-rw-r--r--  kernel/irq/chip.c          |   2
-rw-r--r--  kernel/irq/handle.c        |   6
-rw-r--r--  kernel/irq/manage.c        |   9
-rw-r--r--  kernel/irq/proc.c          |   3
-rw-r--r--  kernel/kallsyms.c          |  33
-rw-r--r--  kernel/kexec.c             |  60
-rw-r--r--  kernel/kmod.c              |  26
-rw-r--r--  kernel/kprobes.c           | 117
-rw-r--r--  kernel/kthread.c           |  13
-rw-r--r--  kernel/latency.c           |   1
-rw-r--r--  kernel/lockdep.c           | 244
-rw-r--r--  kernel/lockdep_internals.h |   2
-rw-r--r--  kernel/lockdep_proc.c      |   6
-rw-r--r--  kernel/module.c            |  75
-rw-r--r--  kernel/mutex-debug.c       |   3
-rw-r--r--  kernel/mutex.c             |   9
-rw-r--r--  kernel/nsproxy.c           |  38
-rw-r--r--  kernel/pid.c               |  77
-rw-r--r--  kernel/posix-timers.c      |   2
-rw-r--r--  kernel/power/Kconfig       |  11
-rw-r--r--  kernel/power/disk.c        | 101
-rw-r--r--  kernel/power/main.c        |  14
-rw-r--r--  kernel/power/power.h       |  32
-rw-r--r--  kernel/power/poweroff.c    |   4
-rw-r--r--  kernel/power/process.c     | 143
-rw-r--r--  kernel/power/snapshot.c    | 860
-rw-r--r--  kernel/power/swap.c        | 347
-rw-r--r--  kernel/power/swsusp.c      |  98
-rw-r--r--  kernel/power/user.c        | 102
-rw-r--r--  kernel/printk.c            |  45
-rw-r--r--  kernel/profile.c           |  47
-rw-r--r--  kernel/rcupdate.c          |   4
-rw-r--r--  kernel/rcutorture.c        |   4
-rw-r--r--  kernel/relay.c             |  20
-rw-r--r--  kernel/resource.c          |   6
-rw-r--r--  kernel/rtmutex-tester.c    |   1
-rw-r--r--  kernel/sched.c             | 554
-rw-r--r--  kernel/signal.c            |  38
-rw-r--r--  kernel/softirq.c           |   2
-rw-r--r--  kernel/spinlock.c          |  21
-rw-r--r--  kernel/sys.c               |  31
-rw-r--r--  kernel/sys_ni.c            |   1
-rw-r--r--  kernel/sysctl.c            | 446
-rw-r--r--  kernel/taskstats.c         | 193
-rw-r--r--  kernel/time/clocksource.c  |   8
-rw-r--r--  kernel/timer.c             | 162
-rw-r--r--  kernel/tsacct.c            |  19
-rw-r--r--  kernel/unwind.c            | 212
-rw-r--r--  kernel/user.c              |  15
-rw-r--r--  kernel/workqueue.c         | 214
64 files changed, 3336 insertions(+), 1678 deletions(-)
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 248e1c396f8b..4af15802ccd4 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -7,7 +7,7 @@ choice
7 default HZ_250 7 default HZ_250
8 help 8 help
9 Allows the configuration of the timer frequency. It is customary 9 Allows the configuration of the timer frequency. It is customary
10 to have the timer interrupt run at 1000 HZ but 100 HZ may be more 10 to have the timer interrupt run at 1000 Hz but 100 Hz may be more
11 beneficial for servers and NUMA systems that do not need to have 11 beneficial for servers and NUMA systems that do not need to have
12 a fast response for user interaction and that may experience bus 12 a fast response for user interaction and that may experience bus
13 contention and cacheline bounces as a result of timer interrupts. 13 contention and cacheline bounces as a result of timer interrupts.
@@ -19,21 +19,30 @@ choice
19 config HZ_100 19 config HZ_100
20 bool "100 HZ" 20 bool "100 HZ"
21 help 21 help
22 100 HZ is a typical choice for servers, SMP and NUMA systems 22 100 Hz is a typical choice for servers, SMP and NUMA systems
23 with lots of processors that may show reduced performance if 23 with lots of processors that may show reduced performance if
24 too many timer interrupts are occurring. 24 too many timer interrupts are occurring.
25 25
26 config HZ_250 26 config HZ_250
27 bool "250 HZ" 27 bool "250 HZ"
28 help 28 help
29 250 HZ is a good compromise choice allowing server performance 29 250 Hz is a good compromise choice allowing server performance
30 while also showing good interactive responsiveness even 30 while also showing good interactive responsiveness even
31 on SMP and NUMA systems. 31 on SMP and NUMA systems. If you are going to be using NTSC video
32 or multimedia, selected 300Hz instead.
33
34 config HZ_300
35 bool "300 HZ"
36 help
37 300 Hz is a good compromise choice allowing server performance
38 while also showing good interactive responsiveness even
39 on SMP and NUMA systems and exactly dividing by both PAL and
40 NTSC frame rates for video and multimedia work.
32 41
33 config HZ_1000 42 config HZ_1000
34 bool "1000 HZ" 43 bool "1000 HZ"
35 help 44 help
36 1000 HZ is the preferred choice for desktop systems and other 45 1000 Hz is the preferred choice for desktop systems and other
37 systems requiring fast interactive responses to events. 46 systems requiring fast interactive responses to events.
38 47
39endchoice 48endchoice
@@ -42,5 +51,6 @@ config HZ
42 int 51 int
43 default 100 if HZ_100 52 default 100 if HZ_100
44 default 250 if HZ_250 53 default 250 if HZ_250
54 default 300 if HZ_300
45 default 1000 if HZ_1000 55 default 1000 if HZ_1000
46 56
diff --git a/kernel/acct.c b/kernel/acct.c
index 0aad5ca36a81..70d0d88e5554 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -89,7 +89,8 @@ struct acct_glbs {
89 struct timer_list timer; 89 struct timer_list timer;
90}; 90};
91 91
92static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; 92static struct acct_glbs acct_globals __cacheline_aligned =
93 {__SPIN_LOCK_UNLOCKED(acct_globals.lock)};
93 94
94/* 95/*
95 * Called whenever the timer says to check the free space. 96 * Called whenever the timer says to check the free space.
@@ -117,7 +118,7 @@ static int check_free_space(struct file *file)
117 spin_unlock(&acct_globals.lock); 118 spin_unlock(&acct_globals.lock);
118 119
119 /* May block */ 120 /* May block */
120 if (vfs_statfs(file->f_dentry, &sbuf)) 121 if (vfs_statfs(file->f_path.dentry, &sbuf))
121 return res; 122 return res;
122 suspend = sbuf.f_blocks * SUSPEND; 123 suspend = sbuf.f_blocks * SUSPEND;
123 resume = sbuf.f_blocks * RESUME; 124 resume = sbuf.f_blocks * RESUME;
@@ -193,7 +194,7 @@ static void acct_file_reopen(struct file *file)
193 add_timer(&acct_globals.timer); 194 add_timer(&acct_globals.timer);
194 } 195 }
195 if (old_acct) { 196 if (old_acct) {
196 mnt_unpin(old_acct->f_vfsmnt); 197 mnt_unpin(old_acct->f_path.mnt);
197 spin_unlock(&acct_globals.lock); 198 spin_unlock(&acct_globals.lock);
198 do_acct_process(old_acct); 199 do_acct_process(old_acct);
199 filp_close(old_acct, NULL); 200 filp_close(old_acct, NULL);
@@ -211,7 +212,7 @@ static int acct_on(char *name)
211 if (IS_ERR(file)) 212 if (IS_ERR(file))
212 return PTR_ERR(file); 213 return PTR_ERR(file);
213 214
214 if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { 215 if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
215 filp_close(file, NULL); 216 filp_close(file, NULL);
216 return -EACCES; 217 return -EACCES;
217 } 218 }
@@ -228,11 +229,11 @@ static int acct_on(char *name)
228 } 229 }
229 230
230 spin_lock(&acct_globals.lock); 231 spin_lock(&acct_globals.lock);
231 mnt_pin(file->f_vfsmnt); 232 mnt_pin(file->f_path.mnt);
232 acct_file_reopen(file); 233 acct_file_reopen(file);
233 spin_unlock(&acct_globals.lock); 234 spin_unlock(&acct_globals.lock);
234 235
235 mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ 236 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
236 237
237 return 0; 238 return 0;
238} 239}
@@ -282,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name)
282void acct_auto_close_mnt(struct vfsmount *m) 283void acct_auto_close_mnt(struct vfsmount *m)
283{ 284{
284 spin_lock(&acct_globals.lock); 285 spin_lock(&acct_globals.lock);
285 if (acct_globals.file && acct_globals.file->f_vfsmnt == m) 286 if (acct_globals.file && acct_globals.file->f_path.mnt == m)
286 acct_file_reopen(NULL); 287 acct_file_reopen(NULL);
287 spin_unlock(&acct_globals.lock); 288 spin_unlock(&acct_globals.lock);
288} 289}
@@ -298,7 +299,7 @@ void acct_auto_close(struct super_block *sb)
298{ 299{
299 spin_lock(&acct_globals.lock); 300 spin_lock(&acct_globals.lock);
300 if (acct_globals.file && 301 if (acct_globals.file &&
301 acct_globals.file->f_vfsmnt->mnt_sb == sb) { 302 acct_globals.file->f_path.mnt->mnt_sb == sb) {
302 acct_file_reopen(NULL); 303 acct_file_reopen(NULL);
303 } 304 }
304 spin_unlock(&acct_globals.lock); 305 spin_unlock(&acct_globals.lock);
@@ -427,6 +428,7 @@ static void do_acct_process(struct file *file)
427 u64 elapsed; 428 u64 elapsed;
428 u64 run_time; 429 u64 run_time;
429 struct timespec uptime; 430 struct timespec uptime;
431 struct tty_struct *tty;
430 432
431 /* 433 /*
432 * First check to see if there is enough free_space to continue 434 * First check to see if there is enough free_space to continue
@@ -483,16 +485,9 @@ static void do_acct_process(struct file *file)
483 ac.ac_ppid = current->parent->tgid; 485 ac.ac_ppid = current->parent->tgid;
484#endif 486#endif
485 487
486 mutex_lock(&tty_mutex);
487 /* FIXME: Whoever is responsible for current->signal locking needs
488 to use the same locking all over the kernel and document it */
489 read_lock(&tasklist_lock);
490 ac.ac_tty = current->signal->tty ?
491 old_encode_dev(tty_devnum(current->signal->tty)) : 0;
492 read_unlock(&tasklist_lock);
493 mutex_unlock(&tty_mutex);
494
495 spin_lock_irq(&current->sighand->siglock); 488 spin_lock_irq(&current->sighand->siglock);
489 tty = current->signal->tty;
490 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
496 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); 491 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
497 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); 492 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
498 ac.ac_flag = pacct->ac_flag; 493 ac.ac_flag = pacct->ac_flag;
diff --git a/kernel/audit.c b/kernel/audit.c
index 98106f6078b0..d9b690ac684b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -57,6 +57,7 @@
57#include <linux/netlink.h> 57#include <linux/netlink.h>
58#include <linux/selinux.h> 58#include <linux/selinux.h>
59#include <linux/inotify.h> 59#include <linux/inotify.h>
60#include <linux/freezer.h>
60 61
61#include "audit.h" 62#include "audit.h"
62 63
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f40d923af8e..2e896f8ae29e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
636 struct audit_rule *rule; 636 struct audit_rule *rule;
637 int i; 637 int i;
638 638
639 rule = kmalloc(sizeof(*rule), GFP_KERNEL); 639 rule = kzalloc(sizeof(*rule), GFP_KERNEL);
640 if (unlikely(!rule)) 640 if (unlikely(!rule))
641 return NULL; 641 return NULL;
642 memset(rule, 0, sizeof(*rule));
643 642
644 rule->flags = krule->flags | krule->listnr; 643 rule->flags = krule->flags | krule->listnr;
645 rule->action = krule->action; 644 rule->action = krule->action;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 42f2f1179711..298897559ca4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -64,6 +64,7 @@
64#include <linux/tty.h> 64#include <linux/tty.h>
65#include <linux/selinux.h> 65#include <linux/selinux.h>
66#include <linux/binfmts.h> 66#include <linux/binfmts.h>
67#include <linux/highmem.h>
67#include <linux/syscalls.h> 68#include <linux/syscalls.h>
68 69
69#include "audit.h" 70#include "audit.h"
@@ -730,7 +731,7 @@ static inline void audit_free_context(struct audit_context *context)
730 printk(KERN_ERR "audit: freed %d contexts\n", count); 731 printk(KERN_ERR "audit: freed %d contexts\n", count);
731} 732}
732 733
733static void audit_log_task_context(struct audit_buffer *ab) 734void audit_log_task_context(struct audit_buffer *ab)
734{ 735{
735 char *ctx = NULL; 736 char *ctx = NULL;
736 ssize_t len = 0; 737 ssize_t len = 0;
@@ -759,6 +760,8 @@ error_path:
759 return; 760 return;
760} 761}
761 762
763EXPORT_SYMBOL(audit_log_task_context);
764
762static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 765static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
763{ 766{
764 char name[sizeof(tsk->comm)]; 767 char name[sizeof(tsk->comm)];
@@ -778,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
778 if ((vma->vm_flags & VM_EXECUTABLE) && 781 if ((vma->vm_flags & VM_EXECUTABLE) &&
779 vma->vm_file) { 782 vma->vm_file) {
780 audit_log_d_path(ab, "exe=", 783 audit_log_d_path(ab, "exe=",
781 vma->vm_file->f_dentry, 784 vma->vm_file->f_path.dentry,
782 vma->vm_file->f_vfsmnt); 785 vma->vm_file->f_path.mnt);
783 break; 786 break;
784 } 787 }
785 vma = vma->vm_next; 788 vma = vma->vm_next;
@@ -823,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
823 context->return_code); 826 context->return_code);
824 827
825 mutex_lock(&tty_mutex); 828 mutex_lock(&tty_mutex);
829 read_lock(&tasklist_lock);
826 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) 830 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
827 tty = tsk->signal->tty->name; 831 tty = tsk->signal->tty->name;
828 else 832 else
829 tty = "(none)"; 833 tty = "(none)";
834 read_unlock(&tasklist_lock);
830 audit_log_format(ab, 835 audit_log_format(ab,
831 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 836 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
832 " ppid=%d pid=%d auid=%u uid=%u gid=%u" 837 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
@@ -1487,6 +1492,8 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
1487 return ctx ? ctx->loginuid : -1; 1492 return ctx ? ctx->loginuid : -1;
1488} 1493}
1489 1494
1495EXPORT_SYMBOL(audit_get_loginuid);
1496
1490/** 1497/**
1491 * __audit_mq_open - record audit data for a POSIX MQ open 1498 * __audit_mq_open - record audit data for a POSIX MQ open
1492 * @oflag: open flag 1499 * @oflag: open flag
diff --git a/kernel/compat.c b/kernel/compat.c
index d4898aad6cfa..6952dd057300 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -982,4 +982,37 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
982 } 982 }
983 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); 983 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
984} 984}
985
986asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
987 compat_ulong_t maxnode,
988 const compat_ulong_t __user *old_nodes,
989 const compat_ulong_t __user *new_nodes)
990{
991 unsigned long __user *old = NULL;
992 unsigned long __user *new = NULL;
993 nodemask_t tmp_mask;
994 unsigned long nr_bits;
995 unsigned long size;
996
997 nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
998 size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
999 if (old_nodes) {
1000 if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
1001 return -EFAULT;
1002 old = compat_alloc_user_space(new_nodes ? size * 2 : size);
1003 if (new_nodes)
1004 new = old + size / sizeof(unsigned long);
1005 if (copy_to_user(old, nodes_addr(tmp_mask), size))
1006 return -EFAULT;
1007 }
1008 if (new_nodes) {
1009 if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
1010 return -EFAULT;
1011 if (new == NULL)
1012 new = compat_alloc_user_space(size);
1013 if (copy_to_user(new, nodes_addr(tmp_mask), size))
1014 return -EFAULT;
1015 }
1016 return sys_migrate_pages(pid, nr_bits + 1, old, new);
1017}
985#endif 1018#endif
diff --git a/kernel/configs.c b/kernel/configs.c
index f9e31974f4ad..8fa1fb28f8a7 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
75 return count; 75 return count;
76} 76}
77 77
78static struct file_operations ikconfig_file_ops = { 78static const struct file_operations ikconfig_file_ops = {
79 .owner = THIS_MODULE, 79 .owner = THIS_MODULE,
80 .read = ikconfig_read_current, 80 .read = ikconfig_read_current,
81}; 81};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 663c920b2234..9124669f4586 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -58,8 +58,8 @@ void unlock_cpu_hotplug(void)
58 recursive_depth--; 58 recursive_depth--;
59 return; 59 return;
60 } 60 }
61 mutex_unlock(&cpu_bitmask_lock);
62 recursive = NULL; 61 recursive = NULL;
62 mutex_unlock(&cpu_bitmask_lock);
63} 63}
64EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 64EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
65 65
@@ -270,11 +270,7 @@ int disable_nonboot_cpus(void)
270 goto out; 270 goto out;
271 } 271 }
272 } 272 }
273 error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu)); 273
274 if (error) {
275 printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
276 goto out;
277 }
278 /* We take down all of the non-boot CPUs in one shot to avoid races 274 /* We take down all of the non-boot CPUs in one shot to avoid races
279 * with the userspace trying to use the CPU hotplug at the same time 275 * with the userspace trying to use the CPU hotplug at the same time
280 */ 276 */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6313c38c930e..232aed2b10f9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = {
413 * 413 *
414 * 414 *
415 * When reading/writing to a file: 415 * When reading/writing to a file:
416 * - the cpuset to use in file->f_dentry->d_parent->d_fsdata 416 * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
417 * - the 'cftype' of the file is file->f_dentry->d_fsdata 417 * - the 'cftype' of the file is file->f_path.dentry->d_fsdata
418 */ 418 */
419 419
420struct cftype { 420struct cftype {
@@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
729 } 729 }
730 730
731 /* Remaining checks don't apply to root cpuset */ 731 /* Remaining checks don't apply to root cpuset */
732 if ((par = cur->parent) == NULL) 732 if (cur == &top_cpuset)
733 return 0; 733 return 0;
734 734
735 par = cur->parent;
736
735 /* We must be a subset of our parent cpuset */ 737 /* We must be a subset of our parent cpuset */
736 if (!is_cpuset_subset(trial, par)) 738 if (!is_cpuset_subset(trial, par))
737 return -EACCES; 739 return -EACCES;
@@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1060 cpu_exclusive_changed = 1062 cpu_exclusive_changed =
1061 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 1063 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
1062 mutex_lock(&callback_mutex); 1064 mutex_lock(&callback_mutex);
1063 if (turning_on) 1065 cs->flags = trialcs.flags;
1064 set_bit(bit, &cs->flags);
1065 else
1066 clear_bit(bit, &cs->flags);
1067 mutex_unlock(&callback_mutex); 1066 mutex_unlock(&callback_mutex);
1068 1067
1069 if (cpu_exclusive_changed) 1068 if (cpu_exclusive_changed)
@@ -1281,18 +1280,19 @@ typedef enum {
1281 FILE_TASKLIST, 1280 FILE_TASKLIST,
1282} cpuset_filetype_t; 1281} cpuset_filetype_t;
1283 1282
1284static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, 1283static ssize_t cpuset_common_file_write(struct file *file,
1284 const char __user *userbuf,
1285 size_t nbytes, loff_t *unused_ppos) 1285 size_t nbytes, loff_t *unused_ppos)
1286{ 1286{
1287 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1287 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1288 struct cftype *cft = __d_cft(file->f_dentry); 1288 struct cftype *cft = __d_cft(file->f_path.dentry);
1289 cpuset_filetype_t type = cft->private; 1289 cpuset_filetype_t type = cft->private;
1290 char *buffer; 1290 char *buffer;
1291 char *pathbuf = NULL; 1291 char *pathbuf = NULL;
1292 int retval = 0; 1292 int retval = 0;
1293 1293
1294 /* Crude upper limit on largest legitimate cpulist user might write. */ 1294 /* Crude upper limit on largest legitimate cpulist user might write. */
1295 if (nbytes > 100 + 6 * NR_CPUS) 1295 if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
1296 return -E2BIG; 1296 return -E2BIG;
1297 1297
1298 /* +1 for nul-terminator */ 1298 /* +1 for nul-terminator */
@@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
1367 size_t nbytes, loff_t *ppos) 1367 size_t nbytes, loff_t *ppos)
1368{ 1368{
1369 ssize_t retval = 0; 1369 ssize_t retval = 0;
1370 struct cftype *cft = __d_cft(file->f_dentry); 1370 struct cftype *cft = __d_cft(file->f_path.dentry);
1371 if (!cft) 1371 if (!cft)
1372 return -ENODEV; 1372 return -ENODEV;
1373 1373
@@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1417static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, 1417static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1418 size_t nbytes, loff_t *ppos) 1418 size_t nbytes, loff_t *ppos)
1419{ 1419{
1420 struct cftype *cft = __d_cft(file->f_dentry); 1420 struct cftype *cft = __d_cft(file->f_path.dentry);
1421 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1421 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1422 cpuset_filetype_t type = cft->private; 1422 cpuset_filetype_t type = cft->private;
1423 char *page; 1423 char *page;
1424 ssize_t retval = 0; 1424 ssize_t retval = 0;
@@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt
1476 loff_t *ppos) 1476 loff_t *ppos)
1477{ 1477{
1478 ssize_t retval = 0; 1478 ssize_t retval = 0;
1479 struct cftype *cft = __d_cft(file->f_dentry); 1479 struct cftype *cft = __d_cft(file->f_path.dentry);
1480 if (!cft) 1480 if (!cft)
1481 return -ENODEV; 1481 return -ENODEV;
1482 1482
@@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
1498 if (err) 1498 if (err)
1499 return err; 1499 return err;
1500 1500
1501 cft = __d_cft(file->f_dentry); 1501 cft = __d_cft(file->f_path.dentry);
1502 if (!cft) 1502 if (!cft)
1503 return -ENODEV; 1503 return -ENODEV;
1504 if (cft->open) 1504 if (cft->open)
@@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
1511 1511
1512static int cpuset_file_release(struct inode *inode, struct file *file) 1512static int cpuset_file_release(struct inode *inode, struct file *file)
1513{ 1513{
1514 struct cftype *cft = __d_cft(file->f_dentry); 1514 struct cftype *cft = __d_cft(file->f_path.dentry);
1515 if (cft->release) 1515 if (cft->release)
1516 return cft->release(inode, file); 1516 return cft->release(inode, file);
1517 return 0; 1517 return 0;
@@ -1532,7 +1532,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
1532 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 1532 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1533} 1533}
1534 1534
1535static struct file_operations cpuset_file_operations = { 1535static const struct file_operations cpuset_file_operations = {
1536 .read = cpuset_file_read, 1536 .read = cpuset_file_read,
1537 .write = cpuset_file_write, 1537 .write = cpuset_file_write,
1538 .llseek = generic_file_llseek, 1538 .llseek = generic_file_llseek,
@@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1700 */ 1700 */
1701static int cpuset_tasks_open(struct inode *unused, struct file *file) 1701static int cpuset_tasks_open(struct inode *unused, struct file *file)
1702{ 1702{
1703 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1703 struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
1704 struct ctr_struct *ctr; 1704 struct ctr_struct *ctr;
1705 pid_t *pidarray; 1705 pid_t *pidarray;
1706 int npids; 1706 int npids;
@@ -2045,7 +2045,6 @@ out:
2045 return err; 2045 return err;
2046} 2046}
2047 2047
2048#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
2049/* 2048/*
2050 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 2049 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
2051 * or memory nodes, we need to walk over the cpuset hierarchy, 2050 * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2109,9 +2108,7 @@ static void common_cpu_mem_hotplug_unplug(void)
2109 mutex_unlock(&callback_mutex); 2108 mutex_unlock(&callback_mutex);
2110 mutex_unlock(&manage_mutex); 2109 mutex_unlock(&manage_mutex);
2111} 2110}
2112#endif
2113 2111
2114#ifdef CONFIG_HOTPLUG_CPU
2115/* 2112/*
2116 * The top_cpuset tracks what CPUs and Memory Nodes are online, 2113 * The top_cpuset tracks what CPUs and Memory Nodes are online,
2117 * period. This is necessary in order to make cpusets transparent 2114 * period. This is necessary in order to make cpusets transparent
@@ -2128,7 +2125,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb,
2128 common_cpu_mem_hotplug_unplug(); 2125 common_cpu_mem_hotplug_unplug();
2129 return 0; 2126 return 0;
2130} 2127}
2131#endif
2132 2128
2133#ifdef CONFIG_MEMORY_HOTPLUG 2129#ifdef CONFIG_MEMORY_HOTPLUG
2134/* 2130/*
@@ -2346,32 +2342,48 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2346} 2342}
2347 2343
2348/** 2344/**
2349 * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? 2345 * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
2350 * @z: is this zone on an allowed node? 2346 * @z: is this zone on an allowed node?
2351 * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) 2347 * @gfp_mask: memory allocation flags
2352 * 2348 *
2353 * If we're in interrupt, yes, we can always allocate. If zone 2349 * If we're in interrupt, yes, we can always allocate. If
2350 * __GFP_THISNODE is set, yes, we can always allocate. If zone
2354 * z's node is in our tasks mems_allowed, yes. If it's not a 2351 * z's node is in our tasks mems_allowed, yes. If it's not a
2355 * __GFP_HARDWALL request and this zone's nodes is in the nearest 2352 * __GFP_HARDWALL request and this zone's nodes is in the nearest
2356 * mem_exclusive cpuset ancestor to this tasks cpuset, yes. 2353 * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
2357 * Otherwise, no. 2354 * Otherwise, no.
2358 * 2355 *
2356 * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
2357 * reduces to cpuset_zone_allowed_hardwall(). Otherwise,
2358 * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
2359 * from an enclosing cpuset.
2360 *
2361 * cpuset_zone_allowed_hardwall() only handles the simpler case of
2362 * hardwall cpusets, and never sleeps.
2363 *
2364 * The __GFP_THISNODE placement logic is really handled elsewhere,
2365 * by forcibly using a zonelist starting at a specified node, and by
2366 * (in get_page_from_freelist()) refusing to consider the zones for
2367 * any node on the zonelist except the first. By the time any such
2368 * calls get to this routine, we should just shut up and say 'yes'.
2369 *
2359 * GFP_USER allocations are marked with the __GFP_HARDWALL bit, 2370 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
2360 * and do not allow allocations outside the current tasks cpuset. 2371 * and do not allow allocations outside the current tasks cpuset.
2361 * GFP_KERNEL allocations are not so marked, so can escape to the 2372 * GFP_KERNEL allocations are not so marked, so can escape to the
2362 * nearest mem_exclusive ancestor cpuset. 2373 * nearest enclosing mem_exclusive ancestor cpuset.
2363 * 2374 *
2364 * Scanning up parent cpusets requires callback_mutex. The __alloc_pages() 2375 * Scanning up parent cpusets requires callback_mutex. The
2365 * routine only calls here with __GFP_HARDWALL bit _not_ set if 2376 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
2366 * it's a GFP_KERNEL allocation, and all nodes in the current tasks 2377 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
2367 * mems_allowed came up empty on the first pass over the zonelist. 2378 * current tasks mems_allowed came up empty on the first pass over
2368 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 2379 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
2369 * short of memory, might require taking the callback_mutex mutex. 2380 * cpuset are short of memory, might require taking the callback_mutex
2381 * mutex.
2370 * 2382 *
2371 * The first call here from mm/page_alloc:get_page_from_freelist() 2383 * The first call here from mm/page_alloc:get_page_from_freelist()
2372 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so 2384 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
2373 * no allocation on a node outside the cpuset is allowed (unless in 2385 * so no allocation on a node outside the cpuset is allowed (unless
2374 * interrupt, of course). 2386 * in interrupt, of course).
2375 * 2387 *
2376 * The second pass through get_page_from_freelist() doesn't even call 2388 * The second pass through get_page_from_freelist() doesn't even call
2377 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() 2389 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
@@ -2384,12 +2396,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
2384 * GFP_USER - only nodes in current tasks mems allowed ok. 2396 * GFP_USER - only nodes in current tasks mems allowed ok.
2385 * 2397 *
2386 * Rule: 2398 * Rule:
2387 * Don't call cpuset_zone_allowed() if you can't sleep, unless you 2399 * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
2388 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables 2400 * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2389 * the code that might scan up ancestor cpusets and sleep. 2401 * the code that might scan up ancestor cpusets and sleep.
2390 **/ 2402 */
2391 2403
2392int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) 2404int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2393{ 2405{
2394 int node; /* node that zone z is on */ 2406 int node; /* node that zone z is on */
2395 const struct cpuset *cs; /* current cpuset ancestors */ 2407 const struct cpuset *cs; /* current cpuset ancestors */
@@ -2419,6 +2431,40 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2419 return allowed; 2431 return allowed;
2420} 2432}
2421 2433
2434/*
2435 * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
2436 * @z: is this zone on an allowed node?
2437 * @gfp_mask: memory allocation flags
2438 *
2439 * If we're in interrupt, yes, we can always allocate.
2440 * If __GFP_THISNODE is set, yes, we can always allocate. If zone
2441 * z's node is in our tasks mems_allowed, yes. Otherwise, no.
2442 *
2443 * The __GFP_THISNODE placement logic is really handled elsewhere,
2444 * by forcibly using a zonelist starting at a specified node, and by
2445 * (in get_page_from_freelist()) refusing to consider the zones for
2446 * any node on the zonelist except the first. By the time any such
2447 * calls get to this routine, we should just shut up and say 'yes'.
2448 *
2449 * Unlike the cpuset_zone_allowed_softwall() variant, above,
2450 * this variant requires that the zone be in the current tasks
2451 * mems_allowed or that we're in interrupt. It does not scan up the
2452 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2453 * It never sleeps.
2454 */
2455
2456int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
2457{
2458 int node; /* node that zone z is on */
2459
2460 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2461 return 1;
2462 node = zone_to_nid(z);
2463 if (node_isset(node, current->mems_allowed))
2464 return 1;
2465 return 0;
2466}
2467
2422/** 2468/**
2423 * cpuset_lock - lock out any changes to cpuset structures 2469 * cpuset_lock - lock out any changes to cpuset structures
2424 * 2470 *
@@ -2610,7 +2656,7 @@ static int cpuset_open(struct inode *inode, struct file *file)
2610 return single_open(file, proc_cpuset_show, pid); 2656 return single_open(file, proc_cpuset_show, pid);
2611} 2657}
2612 2658
2613struct file_operations proc_cpuset_operations = { 2659const struct file_operations proc_cpuset_operations = {
2614 .open = cpuset_open, 2660 .open = cpuset_open,
2615 .read = seq_read, 2661 .read = seq_read,
2616 .llseek = seq_lseek, 2662 .llseek = seq_lseek,
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 36752f124c6a..766d5912b26a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -20,7 +20,7 @@
20#include <linux/delayacct.h> 20#include <linux/delayacct.h>
21 21
22int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ 22int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
23kmem_cache_t *delayacct_cache; 23struct kmem_cache *delayacct_cache;
24 24
25static int __init delayacct_setup_disable(char *str) 25static int __init delayacct_setup_disable(char *str)
26{ 26{
@@ -41,7 +41,7 @@ void delayacct_init(void)
41 41
42void __delayacct_tsk_init(struct task_struct *tsk) 42void __delayacct_tsk_init(struct task_struct *tsk)
43{ 43{
44 tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); 44 tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL);
45 if (tsk->delays) 45 if (tsk->delays)
46 spin_lock_init(&tsk->delays->lock); 46 spin_lock_init(&tsk->delays->lock);
47} 47}
@@ -66,6 +66,7 @@ static void delayacct_end(struct timespec *start, struct timespec *end,
66{ 66{
67 struct timespec ts; 67 struct timespec ts;
68 s64 ns; 68 s64 ns;
69 unsigned long flags;
69 70
70 do_posix_clock_monotonic_gettime(end); 71 do_posix_clock_monotonic_gettime(end);
71 ts = timespec_sub(*end, *start); 72 ts = timespec_sub(*end, *start);
@@ -73,10 +74,10 @@ static void delayacct_end(struct timespec *start, struct timespec *end,
73 if (ns < 0) 74 if (ns < 0)
74 return; 75 return;
75 76
76 spin_lock(&current->delays->lock); 77 spin_lock_irqsave(&current->delays->lock, flags);
77 *total += ns; 78 *total += ns;
78 (*count)++; 79 (*count)++;
79 spin_unlock(&current->delays->lock); 80 spin_unlock_irqrestore(&current->delays->lock, flags);
80} 81}
81 82
82void __delayacct_blkio_start(void) 83void __delayacct_blkio_start(void)
@@ -104,6 +105,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
104 s64 tmp; 105 s64 tmp;
105 struct timespec ts; 106 struct timespec ts;
106 unsigned long t1,t2,t3; 107 unsigned long t1,t2,t3;
108 unsigned long flags;
107 109
108 /* Though tsk->delays accessed later, early exit avoids 110 /* Though tsk->delays accessed later, early exit avoids
109 * unnecessary returning of other data 111 * unnecessary returning of other data
@@ -136,14 +138,14 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
136 138
137 /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ 139 /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
138 140
139 spin_lock(&tsk->delays->lock); 141 spin_lock_irqsave(&tsk->delays->lock, flags);
140 tmp = d->blkio_delay_total + tsk->delays->blkio_delay; 142 tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
141 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; 143 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
142 tmp = d->swapin_delay_total + tsk->delays->swapin_delay; 144 tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
143 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; 145 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
144 d->blkio_count += tsk->delays->blkio_count; 146 d->blkio_count += tsk->delays->blkio_count;
145 d->swapin_count += tsk->delays->swapin_count; 147 d->swapin_count += tsk->delays->swapin_count;
146 spin_unlock(&tsk->delays->lock); 148 spin_unlock_irqrestore(&tsk->delays->lock, flags);
147 149
148done: 150done:
149 return 0; 151 return 0;
@@ -152,11 +154,12 @@ done:
152__u64 __delayacct_blkio_ticks(struct task_struct *tsk) 154__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
153{ 155{
154 __u64 ret; 156 __u64 ret;
157 unsigned long flags;
155 158
156 spin_lock(&tsk->delays->lock); 159 spin_lock_irqsave(&tsk->delays->lock, flags);
157 ret = nsec_to_clock_t(tsk->delays->blkio_delay + 160 ret = nsec_to_clock_t(tsk->delays->blkio_delay +
158 tsk->delays->swapin_delay); 161 tsk->delays->swapin_delay);
159 spin_unlock(&tsk->delays->lock); 162 spin_unlock_irqrestore(&tsk->delays->lock, flags);
160 return ret; 163 return ret;
161} 164}
162 165
diff --git a/kernel/dma.c b/kernel/dma.c
index 2020644c938a..937b13ca33ba 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -140,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file)
140 return single_open(file, proc_dma_show, NULL); 140 return single_open(file, proc_dma_show, NULL);
141} 141}
142 142
143static struct file_operations proc_dma_operations = { 143static const struct file_operations proc_dma_operations = {
144 .open = proc_dma_open, 144 .open = proc_dma_open,
145 .read = seq_read, 145 .read = seq_read,
146 .llseek = seq_lseek, 146 .llseek = seq_lseek,
diff --git a/kernel/exit.c b/kernel/exit.c
index 06de6c4e8ca3..122fadb972fc 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,7 +13,7 @@
13#include <linux/completion.h> 13#include <linux/completion.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/tty.h> 15#include <linux/tty.h>
16#include <linux/namespace.h> 16#include <linux/mnt_namespace.h>
17#include <linux/key.h> 17#include <linux/key.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/cpu.h> 19#include <linux/cpu.h>
@@ -22,6 +22,7 @@
22#include <linux/file.h> 22#include <linux/file.h>
23#include <linux/binfmts.h> 23#include <linux/binfmts.h>
24#include <linux/nsproxy.h> 24#include <linux/nsproxy.h>
25#include <linux/pid_namespace.h>
25#include <linux/ptrace.h> 26#include <linux/ptrace.h>
26#include <linux/profile.h> 27#include <linux/profile.h>
27#include <linux/mount.h> 28#include <linux/mount.h>
@@ -48,7 +49,6 @@
48#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
49 50
50extern void sem_exit (void); 51extern void sem_exit (void);
51extern struct task_struct *child_reaper;
52 52
53static void exit_mm(struct task_struct * tsk); 53static void exit_mm(struct task_struct * tsk);
54 54
@@ -189,21 +189,18 @@ repeat:
189int session_of_pgrp(int pgrp) 189int session_of_pgrp(int pgrp)
190{ 190{
191 struct task_struct *p; 191 struct task_struct *p;
192 int sid = -1; 192 int sid = 0;
193 193
194 read_lock(&tasklist_lock); 194 read_lock(&tasklist_lock);
195 do_each_task_pid(pgrp, PIDTYPE_PGID, p) { 195
196 if (p->signal->session > 0) { 196 p = find_task_by_pid_type(PIDTYPE_PGID, pgrp);
197 sid = p->signal->session; 197 if (p == NULL)
198 goto out; 198 p = find_task_by_pid(pgrp);
199 } 199 if (p != NULL)
200 } while_each_task_pid(pgrp, PIDTYPE_PGID, p); 200 sid = process_session(p);
201 p = find_task_by_pid(pgrp); 201
202 if (p)
203 sid = p->signal->session;
204out:
205 read_unlock(&tasklist_lock); 202 read_unlock(&tasklist_lock);
206 203
207 return sid; 204 return sid;
208} 205}
209 206
@@ -225,8 +222,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
225 || p->exit_state 222 || p->exit_state
226 || is_init(p->real_parent)) 223 || is_init(p->real_parent))
227 continue; 224 continue;
228 if (process_group(p->real_parent) != pgrp 225 if (process_group(p->real_parent) != pgrp &&
229 && p->real_parent->signal->session == p->signal->session) { 226 process_session(p->real_parent) == process_session(p)) {
230 ret = 0; 227 ret = 0;
231 break; 228 break;
232 } 229 }
@@ -260,7 +257,8 @@ static int has_stopped_jobs(int pgrp)
260} 257}
261 258
262/** 259/**
263 * reparent_to_init - Reparent the calling kernel thread to the init task. 260 * reparent_to_init - Reparent the calling kernel thread to the init task
261 * of the pid space that the thread belongs to.
264 * 262 *
265 * If a kernel thread is launched as a result of a system call, or if 263 * If a kernel thread is launched as a result of a system call, or if
266 * it ever exits, it should generally reparent itself to init so that 264 * it ever exits, it should generally reparent itself to init so that
@@ -278,8 +276,8 @@ static void reparent_to_init(void)
278 ptrace_unlink(current); 276 ptrace_unlink(current);
279 /* Reparent to init */ 277 /* Reparent to init */
280 remove_parent(current); 278 remove_parent(current);
281 current->parent = child_reaper; 279 current->parent = child_reaper(current);
282 current->real_parent = child_reaper; 280 current->real_parent = child_reaper(current);
283 add_parent(current); 281 add_parent(current);
284 282
285 /* Set the exit signal to SIGCHLD so we signal init on exit */ 283 /* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -302,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp)
302{ 300{
303 struct task_struct *curr = current->group_leader; 301 struct task_struct *curr = current->group_leader;
304 302
305 if (curr->signal->session != session) { 303 if (process_session(curr) != session) {
306 detach_pid(curr, PIDTYPE_SID); 304 detach_pid(curr, PIDTYPE_SID);
307 curr->signal->session = session; 305 set_signal_session(curr->signal, session);
308 attach_pid(curr, PIDTYPE_SID, session); 306 attach_pid(curr, PIDTYPE_SID, session);
309 } 307 }
310 if (process_group(curr) != pgrp) { 308 if (process_group(curr) != pgrp) {
@@ -314,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp)
314 } 312 }
315} 313}
316 314
317void set_special_pids(pid_t session, pid_t pgrp) 315static void set_special_pids(pid_t session, pid_t pgrp)
318{ 316{
319 write_lock_irq(&tasklist_lock); 317 write_lock_irq(&tasklist_lock);
320 __set_special_pids(session, pgrp); 318 __set_special_pids(session, pgrp);
@@ -384,9 +382,7 @@ void daemonize(const char *name, ...)
384 exit_mm(current); 382 exit_mm(current);
385 383
386 set_special_pids(1, 1); 384 set_special_pids(1, 1);
387 mutex_lock(&tty_mutex); 385 proc_clear_tty(current);
388 current->signal->tty = NULL;
389 mutex_unlock(&tty_mutex);
390 386
391 /* Block and flush all signals */ 387 /* Block and flush all signals */
392 sigfillset(&blocked); 388 sigfillset(&blocked);
@@ -429,7 +425,7 @@ static void close_files(struct files_struct * files)
429 for (;;) { 425 for (;;) {
430 unsigned long set; 426 unsigned long set;
431 i = j * __NFDBITS; 427 i = j * __NFDBITS;
432 if (i >= fdt->max_fdset || i >= fdt->max_fds) 428 if (i >= fdt->max_fds)
433 break; 429 break;
434 set = fdt->open_fds->fds_bits[j++]; 430 set = fdt->open_fds->fds_bits[j++];
435 while (set) { 431 while (set) {
@@ -470,11 +466,9 @@ void fastcall put_files_struct(struct files_struct *files)
470 * you can free files immediately. 466 * you can free files immediately.
471 */ 467 */
472 fdt = files_fdtable(files); 468 fdt = files_fdtable(files);
473 if (fdt == &files->fdtab) 469 if (fdt != &files->fdtab)
474 fdt->free_files = files;
475 else
476 kmem_cache_free(files_cachep, files); 470 kmem_cache_free(files_cachep, files);
477 free_fdtable(fdt); 471 call_rcu(&fdt->rcu, free_fdtable_rcu);
478 } 472 }
479} 473}
480 474
@@ -649,10 +643,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
649 * outside, so the child pgrp is now orphaned. 643 * outside, so the child pgrp is now orphaned.
650 */ 644 */
651 if ((process_group(p) != process_group(father)) && 645 if ((process_group(p) != process_group(father)) &&
652 (p->signal->session == father->signal->session)) { 646 (process_session(p) == process_session(father))) {
653 int pgrp = process_group(p); 647 int pgrp = process_group(p);
654 648
655 if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { 649 if (will_become_orphaned_pgrp(pgrp, NULL) &&
650 has_stopped_jobs(pgrp)) {
656 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); 651 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
657 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); 652 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
658 } 653 }
@@ -663,7 +658,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
663 * When we die, we re-parent all our children. 658 * When we die, we re-parent all our children.
664 * Try to give them to another thread in our thread 659 * Try to give them to another thread in our thread
665 * group, and if no such member exists, give it to 660 * group, and if no such member exists, give it to
666 * the global child reaper process (ie "init") 661 * the child reaper process (ie "init") in our pid
662 * space.
667 */ 663 */
668static void 664static void
669forget_original_parent(struct task_struct *father, struct list_head *to_release) 665forget_original_parent(struct task_struct *father, struct list_head *to_release)
@@ -674,7 +670,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
674 do { 670 do {
675 reaper = next_thread(reaper); 671 reaper = next_thread(reaper);
676 if (reaper == father) { 672 if (reaper == father) {
677 reaper = child_reaper; 673 reaper = child_reaper(father);
678 break; 674 break;
679 } 675 }
680 } while (reaper->exit_state); 676 } while (reaper->exit_state);
@@ -786,7 +782,7 @@ static void exit_notify(struct task_struct *tsk)
786 t = tsk->real_parent; 782 t = tsk->real_parent;
787 783
788 if ((process_group(t) != process_group(tsk)) && 784 if ((process_group(t) != process_group(tsk)) &&
789 (t->signal->session == tsk->signal->session) && 785 (process_session(t) == process_session(tsk)) &&
790 will_become_orphaned_pgrp(process_group(tsk), tsk) && 786 will_become_orphaned_pgrp(process_group(tsk), tsk) &&
791 has_stopped_jobs(process_group(tsk))) { 787 has_stopped_jobs(process_group(tsk))) {
792 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); 788 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
@@ -850,9 +846,7 @@ static void exit_notify(struct task_struct *tsk)
850fastcall NORET_TYPE void do_exit(long code) 846fastcall NORET_TYPE void do_exit(long code)
851{ 847{
852 struct task_struct *tsk = current; 848 struct task_struct *tsk = current;
853 struct taskstats *tidstats;
854 int group_dead; 849 int group_dead;
855 unsigned int mycpu;
856 850
857 profile_task_exit(tsk); 851 profile_task_exit(tsk);
858 852
@@ -862,8 +856,13 @@ fastcall NORET_TYPE void do_exit(long code)
862 panic("Aiee, killing interrupt handler!"); 856 panic("Aiee, killing interrupt handler!");
863 if (unlikely(!tsk->pid)) 857 if (unlikely(!tsk->pid))
864 panic("Attempted to kill the idle task!"); 858 panic("Attempted to kill the idle task!");
865 if (unlikely(tsk == child_reaper)) 859 if (unlikely(tsk == child_reaper(tsk))) {
866 panic("Attempted to kill init!"); 860 if (tsk->nsproxy->pid_ns != &init_pid_ns)
861 tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper;
862 else
863 panic("Attempted to kill init!");
864 }
865
867 866
868 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 867 if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
869 current->ptrace_message = code; 868 current->ptrace_message = code;
@@ -890,8 +889,6 @@ fastcall NORET_TYPE void do_exit(long code)
890 current->comm, current->pid, 889 current->comm, current->pid,
891 preempt_count()); 890 preempt_count());
892 891
893 taskstats_exit_alloc(&tidstats, &mycpu);
894
895 acct_update_integrals(tsk); 892 acct_update_integrals(tsk);
896 if (tsk->mm) { 893 if (tsk->mm) {
897 update_hiwater_rss(tsk->mm); 894 update_hiwater_rss(tsk->mm);
@@ -911,8 +908,8 @@ fastcall NORET_TYPE void do_exit(long code)
911#endif 908#endif
912 if (unlikely(tsk->audit_context)) 909 if (unlikely(tsk->audit_context))
913 audit_free(tsk); 910 audit_free(tsk);
914 taskstats_exit_send(tsk, tidstats, group_dead, mycpu); 911
915 taskstats_exit_free(tidstats); 912 taskstats_exit(tsk, group_dead);
916 913
917 exit_mm(tsk); 914 exit_mm(tsk);
918 915
diff --git a/kernel/fork.c b/kernel/fork.c
index 3da978eec791..fc723e595cd5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -18,7 +18,7 @@
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
20#include <linux/completion.h> 20#include <linux/completion.h>
21#include <linux/namespace.h> 21#include <linux/mnt_namespace.h>
22#include <linux/personality.h> 22#include <linux/personality.h>
23#include <linux/mempolicy.h> 23#include <linux/mempolicy.h>
24#include <linux/sem.h> 24#include <linux/sem.h>
@@ -36,6 +36,7 @@
36#include <linux/syscalls.h> 36#include <linux/syscalls.h>
37#include <linux/jiffies.h> 37#include <linux/jiffies.h>
38#include <linux/futex.h> 38#include <linux/futex.h>
39#include <linux/task_io_accounting_ops.h>
39#include <linux/rcupdate.h> 40#include <linux/rcupdate.h>
40#include <linux/ptrace.h> 41#include <linux/ptrace.h>
41#include <linux/mount.h> 42#include <linux/mount.h>
@@ -82,26 +83,26 @@ int nr_processes(void)
82#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR 83#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
83# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) 84# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
84# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) 85# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
85static kmem_cache_t *task_struct_cachep; 86static struct kmem_cache *task_struct_cachep;
86#endif 87#endif
87 88
88/* SLAB cache for signal_struct structures (tsk->signal) */ 89/* SLAB cache for signal_struct structures (tsk->signal) */
89static kmem_cache_t *signal_cachep; 90static struct kmem_cache *signal_cachep;
90 91
91/* SLAB cache for sighand_struct structures (tsk->sighand) */ 92/* SLAB cache for sighand_struct structures (tsk->sighand) */
92kmem_cache_t *sighand_cachep; 93struct kmem_cache *sighand_cachep;
93 94
94/* SLAB cache for files_struct structures (tsk->files) */ 95/* SLAB cache for files_struct structures (tsk->files) */
95kmem_cache_t *files_cachep; 96struct kmem_cache *files_cachep;
96 97
97/* SLAB cache for fs_struct structures (tsk->fs) */ 98/* SLAB cache for fs_struct structures (tsk->fs) */
98kmem_cache_t *fs_cachep; 99struct kmem_cache *fs_cachep;
99 100
100/* SLAB cache for vm_area_struct structures */ 101/* SLAB cache for vm_area_struct structures */
101kmem_cache_t *vm_area_cachep; 102struct kmem_cache *vm_area_cachep;
102 103
103/* SLAB cache for mm_struct structures (tsk->mm) */ 104/* SLAB cache for mm_struct structures (tsk->mm) */
104static kmem_cache_t *mm_cachep; 105static struct kmem_cache *mm_cachep;
105 106
106void free_task(struct task_struct *tsk) 107void free_task(struct task_struct *tsk)
107{ 108{
@@ -202,7 +203,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
202 struct mempolicy *pol; 203 struct mempolicy *pol;
203 204
204 down_write(&oldmm->mmap_sem); 205 down_write(&oldmm->mmap_sem);
205 flush_cache_mm(oldmm); 206 flush_cache_dup_mm(oldmm);
206 /* 207 /*
207 * Not linked in yet - no deadlock potential: 208 * Not linked in yet - no deadlock potential:
208 */ 209 */
@@ -237,7 +238,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
237 goto fail_nomem; 238 goto fail_nomem;
238 charge = len; 239 charge = len;
239 } 240 }
240 tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 241 tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
241 if (!tmp) 242 if (!tmp)
242 goto fail_nomem; 243 goto fail_nomem;
243 *tmp = *mpnt; 244 *tmp = *mpnt;
@@ -252,7 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
252 anon_vma_link(tmp); 253 anon_vma_link(tmp);
253 file = tmp->vm_file; 254 file = tmp->vm_file;
254 if (file) { 255 if (file) {
255 struct inode *inode = file->f_dentry->d_inode; 256 struct inode *inode = file->f_path.dentry->d_inode;
256 get_file(file); 257 get_file(file);
257 if (tmp->vm_flags & VM_DENYWRITE) 258 if (tmp->vm_flags & VM_DENYWRITE)
258 atomic_dec(&inode->i_writecount); 259 atomic_dec(&inode->i_writecount);
@@ -319,7 +320,7 @@ static inline void mm_free_pgd(struct mm_struct * mm)
319 320
320 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); 321 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
321 322
322#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) 323#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
323#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) 324#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
324 325
325#include <linux/init_task.h> 326#include <linux/init_task.h>
@@ -448,7 +449,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
448 tsk->vfork_done = NULL; 449 tsk->vfork_done = NULL;
449 complete(vfork_done); 450 complete(vfork_done);
450 } 451 }
451 if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { 452
453 /*
454 * If we're exiting normally, clear a user-space tid field if
455 * requested. We leave this alone when dying by signal, to leave
456 * the value intact in a core dump, and to save the unnecessary
457 * trouble otherwise. Userland only wants this done for a sys_exit.
458 */
459 if (tsk->clear_child_tid
460 && !(tsk->flags & PF_SIGNALED)
461 && atomic_read(&mm->mm_users) > 1) {
452 u32 __user * tidptr = tsk->clear_child_tid; 462 u32 __user * tidptr = tsk->clear_child_tid;
453 tsk->clear_child_tid = NULL; 463 tsk->clear_child_tid = NULL;
454 464
@@ -479,6 +489,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
479 489
480 memcpy(mm, oldmm, sizeof(*mm)); 490 memcpy(mm, oldmm, sizeof(*mm));
481 491
492 /* Initializing for Swap token stuff */
493 mm->token_priority = 0;
494 mm->last_interval = 0;
495
482 if (!mm_init(mm)) 496 if (!mm_init(mm))
483 goto fail_nomem; 497 goto fail_nomem;
484 498
@@ -542,6 +556,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
542 goto fail_nomem; 556 goto fail_nomem;
543 557
544good_mm: 558good_mm:
559 /* Initializing for Swap token stuff */
560 mm->token_priority = 0;
561 mm->last_interval = 0;
562
545 tsk->mm = mm; 563 tsk->mm = mm;
546 tsk->active_mm = mm; 564 tsk->active_mm = mm;
547 return 0; 565 return 0;
@@ -596,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
596 614
597static int count_open_files(struct fdtable *fdt) 615static int count_open_files(struct fdtable *fdt)
598{ 616{
599 int size = fdt->max_fdset; 617 int size = fdt->max_fds;
600 int i; 618 int i;
601 619
602 /* Find the last open fd */ 620 /* Find the last open fd */
@@ -613,7 +631,7 @@ static struct files_struct *alloc_files(void)
613 struct files_struct *newf; 631 struct files_struct *newf;
614 struct fdtable *fdt; 632 struct fdtable *fdt;
615 633
616 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); 634 newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
617 if (!newf) 635 if (!newf)
618 goto out; 636 goto out;
619 637
@@ -623,12 +641,10 @@ static struct files_struct *alloc_files(void)
623 newf->next_fd = 0; 641 newf->next_fd = 0;
624 fdt = &newf->fdtab; 642 fdt = &newf->fdtab;
625 fdt->max_fds = NR_OPEN_DEFAULT; 643 fdt->max_fds = NR_OPEN_DEFAULT;
626 fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
627 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; 644 fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
628 fdt->open_fds = (fd_set *)&newf->open_fds_init; 645 fdt->open_fds = (fd_set *)&newf->open_fds_init;
629 fdt->fd = &newf->fd_array[0]; 646 fdt->fd = &newf->fd_array[0];
630 INIT_RCU_HEAD(&fdt->rcu); 647 INIT_RCU_HEAD(&fdt->rcu);
631 fdt->free_files = NULL;
632 fdt->next = NULL; 648 fdt->next = NULL;
633 rcu_assign_pointer(newf->fdt, fdt); 649 rcu_assign_pointer(newf->fdt, fdt);
634out: 650out:
@@ -644,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
644{ 660{
645 struct files_struct *newf; 661 struct files_struct *newf;
646 struct file **old_fds, **new_fds; 662 struct file **old_fds, **new_fds;
647 int open_files, size, i, expand; 663 int open_files, size, i;
648 struct fdtable *old_fdt, *new_fdt; 664 struct fdtable *old_fdt, *new_fdt;
649 665
650 *errorp = -ENOMEM; 666 *errorp = -ENOMEM;
@@ -655,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
655 spin_lock(&oldf->file_lock); 671 spin_lock(&oldf->file_lock);
656 old_fdt = files_fdtable(oldf); 672 old_fdt = files_fdtable(oldf);
657 new_fdt = files_fdtable(newf); 673 new_fdt = files_fdtable(newf);
658 size = old_fdt->max_fdset;
659 open_files = count_open_files(old_fdt); 674 open_files = count_open_files(old_fdt);
660 expand = 0;
661 675
662 /* 676 /*
663 * Check whether we need to allocate a larger fd array or fd set. 677 * Check whether we need to allocate a larger fd array and fd set.
664 * Note: we're not a clone task, so the open count won't change. 678 * Note: we're not a clone task, so the open count won't change.
665 */ 679 */
666 if (open_files > new_fdt->max_fdset) {
667 new_fdt->max_fdset = 0;
668 expand = 1;
669 }
670 if (open_files > new_fdt->max_fds) { 680 if (open_files > new_fdt->max_fds) {
671 new_fdt->max_fds = 0; 681 new_fdt->max_fds = 0;
672 expand = 1;
673 }
674
675 /* if the old fdset gets grown now, we'll only copy up to "size" fds */
676 if (expand) {
677 spin_unlock(&oldf->file_lock); 682 spin_unlock(&oldf->file_lock);
678 spin_lock(&newf->file_lock); 683 spin_lock(&newf->file_lock);
679 *errorp = expand_files(newf, open_files-1); 684 *errorp = expand_files(newf, open_files-1);
@@ -693,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
693 old_fds = old_fdt->fd; 698 old_fds = old_fdt->fd;
694 new_fds = new_fdt->fd; 699 new_fds = new_fdt->fd;
695 700
696 memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); 701 memcpy(new_fdt->open_fds->fds_bits,
697 memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); 702 old_fdt->open_fds->fds_bits, open_files/8);
703 memcpy(new_fdt->close_on_exec->fds_bits,
704 old_fdt->close_on_exec->fds_bits, open_files/8);
698 705
699 for (i = open_files; i != 0; i--) { 706 for (i = open_files; i != 0; i--) {
700 struct file *f = *old_fds++; 707 struct file *f = *old_fds++;
@@ -719,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
719 /* This is long word aligned thus could use a optimized version */ 726 /* This is long word aligned thus could use a optimized version */
720 memset(new_fds, 0, size); 727 memset(new_fds, 0, size);
721 728
722 if (new_fdt->max_fdset > open_files) { 729 if (new_fdt->max_fds > open_files) {
723 int left = (new_fdt->max_fdset-open_files)/8; 730 int left = (new_fdt->max_fds-open_files)/8;
724 int start = open_files / (8 * sizeof(unsigned long)); 731 int start = open_files / (8 * sizeof(unsigned long));
725 732
726 memset(&new_fdt->open_fds->fds_bits[start], 0, left); 733 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
727 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); 734 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
728 } 735 }
729 736
730out:
731 return newf; 737 return newf;
732 738
733out_release: 739out_release:
734 free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
735 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
736 free_fd_array(new_fdt->fd, new_fdt->max_fds);
737 kmem_cache_free(files_cachep, newf); 740 kmem_cache_free(files_cachep, newf);
741out:
738 return NULL; 742 return NULL;
739} 743}
740 744
@@ -830,7 +834,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
830 if (clone_flags & CLONE_THREAD) { 834 if (clone_flags & CLONE_THREAD) {
831 atomic_inc(&current->signal->count); 835 atomic_inc(&current->signal->count);
832 atomic_inc(&current->signal->live); 836 atomic_inc(&current->signal->live);
833 taskstats_tgid_alloc(current);
834 return 0; 837 return 0;
835 } 838 }
836 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 839 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -1039,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1039 p->wchar = 0; /* I/O counter: bytes written */ 1042 p->wchar = 0; /* I/O counter: bytes written */
1040 p->syscr = 0; /* I/O counter: read syscalls */ 1043 p->syscr = 0; /* I/O counter: read syscalls */
1041 p->syscw = 0; /* I/O counter: write syscalls */ 1044 p->syscw = 0; /* I/O counter: write syscalls */
1045 task_io_accounting_init(p);
1042 acct_clear_integrals(p); 1046 acct_clear_integrals(p);
1043 1047
1044 p->it_virt_expires = cputime_zero; 1048 p->it_virt_expires = cputime_zero;
@@ -1243,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1243 if (thread_group_leader(p)) { 1247 if (thread_group_leader(p)) {
1244 p->signal->tty = current->signal->tty; 1248 p->signal->tty = current->signal->tty;
1245 p->signal->pgrp = process_group(current); 1249 p->signal->pgrp = process_group(current);
1246 p->signal->session = current->signal->session; 1250 set_signal_session(p->signal, process_session(current));
1247 attach_pid(p, PIDTYPE_PGID, process_group(p)); 1251 attach_pid(p, PIDTYPE_PGID, process_group(p));
1248 attach_pid(p, PIDTYPE_SID, p->signal->session); 1252 attach_pid(p, PIDTYPE_SID, process_session(p));
1249 1253
1250 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1254 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1251 __get_cpu_var(process_counts)++; 1255 __get_cpu_var(process_counts)++;
@@ -1303,7 +1307,7 @@ fork_out:
1303 return ERR_PTR(retval); 1307 return ERR_PTR(retval);
1304} 1308}
1305 1309
1306struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) 1310noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1307{ 1311{
1308 memset(regs, 0, sizeof(struct pt_regs)); 1312 memset(regs, 0, sizeof(struct pt_regs));
1309 return regs; 1313 return regs;
@@ -1315,9 +1319,8 @@ struct task_struct * __devinit fork_idle(int cpu)
1315 struct pt_regs regs; 1319 struct pt_regs regs;
1316 1320
1317 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0); 1321 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
1318 if (!task) 1322 if (!IS_ERR(task))
1319 return ERR_PTR(-ENOMEM); 1323 init_idle(task, cpu);
1320 init_idle(task, cpu);
1321 1324
1322 return task; 1325 return task;
1323} 1326}
@@ -1414,7 +1417,7 @@ long do_fork(unsigned long clone_flags,
1414#define ARCH_MIN_MMSTRUCT_ALIGN 0 1417#define ARCH_MIN_MMSTRUCT_ALIGN 0
1415#endif 1418#endif
1416 1419
1417static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) 1420static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags)
1418{ 1421{
1419 struct sighand_struct *sighand = data; 1422 struct sighand_struct *sighand = data;
1420 1423
@@ -1510,17 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
1510} 1513}
1511 1514
1512/* 1515/*
1513 * Unshare the namespace structure if it is being shared 1516 * Unshare the mnt_namespace structure if it is being shared
1514 */ 1517 */
1515static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) 1518static int unshare_mnt_namespace(unsigned long unshare_flags,
1519 struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
1516{ 1520{
1517 struct namespace *ns = current->nsproxy->namespace; 1521 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
1518 1522
1519 if ((unshare_flags & CLONE_NEWNS) && ns) { 1523 if ((unshare_flags & CLONE_NEWNS) && ns) {
1520 if (!capable(CAP_SYS_ADMIN)) 1524 if (!capable(CAP_SYS_ADMIN))
1521 return -EPERM; 1525 return -EPERM;
1522 1526
1523 *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); 1527 *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
1524 if (!*new_nsp) 1528 if (!*new_nsp)
1525 return -ENOMEM; 1529 return -ENOMEM;
1526 } 1530 }
@@ -1529,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new
1529} 1533}
1530 1534
1531/* 1535/*
1532 * Unsharing of sighand for tasks created with CLONE_SIGHAND is not 1536 * Unsharing of sighand is not supported yet
1533 * supported yet
1534 */ 1537 */
1535static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) 1538static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
1536{ 1539{
1537 struct sighand_struct *sigh = current->sighand; 1540 struct sighand_struct *sigh = current->sighand;
1538 1541
1539 if ((unshare_flags & CLONE_SIGHAND) && 1542 if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
1540 (sigh && atomic_read(&sigh->count) > 1))
1541 return -EINVAL; 1543 return -EINVAL;
1542 else 1544 else
1543 return 0; 1545 return 0;
@@ -1610,8 +1612,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1610{ 1612{
1611 int err = 0; 1613 int err = 0;
1612 struct fs_struct *fs, *new_fs = NULL; 1614 struct fs_struct *fs, *new_fs = NULL;
1613 struct namespace *ns, *new_ns = NULL; 1615 struct mnt_namespace *ns, *new_ns = NULL;
1614 struct sighand_struct *sigh, *new_sigh = NULL; 1616 struct sighand_struct *new_sigh = NULL;
1615 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1617 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1616 struct files_struct *fd, *new_fd = NULL; 1618 struct files_struct *fd, *new_fd = NULL;
1617 struct sem_undo_list *new_ulist = NULL; 1619 struct sem_undo_list *new_ulist = NULL;
@@ -1632,7 +1634,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1632 goto bad_unshare_out; 1634 goto bad_unshare_out;
1633 if ((err = unshare_fs(unshare_flags, &new_fs))) 1635 if ((err = unshare_fs(unshare_flags, &new_fs)))
1634 goto bad_unshare_cleanup_thread; 1636 goto bad_unshare_cleanup_thread;
1635 if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) 1637 if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
1636 goto bad_unshare_cleanup_fs; 1638 goto bad_unshare_cleanup_fs;
1637 if ((err = unshare_sighand(unshare_flags, &new_sigh))) 1639 if ((err = unshare_sighand(unshare_flags, &new_sigh)))
1638 goto bad_unshare_cleanup_ns; 1640 goto bad_unshare_cleanup_ns;
@@ -1656,7 +1658,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1656 } 1658 }
1657 } 1659 }
1658 1660
1659 if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || 1661 if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
1660 new_uts || new_ipc) { 1662 new_uts || new_ipc) {
1661 1663
1662 task_lock(current); 1664 task_lock(current);
@@ -1673,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1673 } 1675 }
1674 1676
1675 if (new_ns) { 1677 if (new_ns) {
1676 ns = current->nsproxy->namespace; 1678 ns = current->nsproxy->mnt_ns;
1677 current->nsproxy->namespace = new_ns; 1679 current->nsproxy->mnt_ns = new_ns;
1678 new_ns = ns; 1680 new_ns = ns;
1679 } 1681 }
1680 1682
1681 if (new_sigh) {
1682 sigh = current->sighand;
1683 rcu_assign_pointer(current->sighand, new_sigh);
1684 new_sigh = sigh;
1685 }
1686
1687 if (new_mm) { 1683 if (new_mm) {
1688 mm = current->mm; 1684 mm = current->mm;
1689 active_mm = current->active_mm; 1685 active_mm = current->active_mm;
@@ -1741,7 +1737,7 @@ bad_unshare_cleanup_sigh:
1741 1737
1742bad_unshare_cleanup_ns: 1738bad_unshare_cleanup_ns:
1743 if (new_ns) 1739 if (new_ns)
1744 put_namespace(new_ns); 1740 put_mnt_ns(new_ns);
1745 1741
1746bad_unshare_cleanup_fs: 1742bad_unshare_cleanup_fs:
1747 if (new_fs) 1743 if (new_fs)
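
The fork.c hunks above rename the mount-namespace plumbing (struct namespace becomes struct mnt_namespace, unshare_namespace() becomes unshare_mnt_namespace()) while keeping the CAP_SYS_ADMIN check on CLONE_NEWNS. As a rough illustration of the path sys_unshare() services, here is a minimal userspace sketch using the unshare(2) syscall; the tmpfs-over-/tmp mount is purely illustrative and the program must run as root.

#define _GNU_SOURCE
#include <sched.h>
#include <sys/mount.h>
#include <stdio.h>

int main(void)
{
	/* CLONE_NEWNS ends up in unshare_mnt_namespace(); needs CAP_SYS_ADMIN */
	if (unshare(CLONE_NEWNS) != 0) {
		perror("unshare(CLONE_NEWNS)");
		return 1;
	}
	/* tmpfs over /tmp is illustrative only; invisible outside this namespace */
	if (mount("none", "/tmp", "tmpfs", 0, NULL) != 0) {
		perror("mount");
		return 1;
	}
	printf("private /tmp mounted in the unshared mount namespace\n");
	return 0;
}

Mounts made after the unshare are visible only to this process and its children, which is the behaviour the dup_mnt_ns() copy above is providing.
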
diff --git a/kernel/futex.c b/kernel/futex.c
index b364e0026191..5a737de857d3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
166/* 166/*
167 * Get parameters which are the keys for a futex. 167 * Get parameters which are the keys for a futex.
168 * 168 *
169 * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, 169 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
170 * offset_within_page). For private mappings, it's (uaddr, current->mm). 170 * offset_within_page). For private mappings, it's (uaddr, current->mm).
171 * We can usually work out the index without swapping in the page. 171 * We can usually work out the index without swapping in the page.
172 * 172 *
@@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
223 /* 223 /*
224 * Linear file mappings are also simple. 224 * Linear file mappings are also simple.
225 */ 225 */
226 key->shared.inode = vma->vm_file->f_dentry->d_inode; 226 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) 229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
@@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
282{ 282{
283 int ret; 283 int ret;
284 284
285 inc_preempt_count(); 285 pagefault_disable();
286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); 286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
287 dec_preempt_count(); 287 pagefault_enable();
288 288
289 return ret ? -EFAULT : 0; 289 return ret ? -EFAULT : 0;
290} 290}
@@ -324,12 +324,11 @@ static int refill_pi_state_cache(void)
324 if (likely(current->pi_state_cache)) 324 if (likely(current->pi_state_cache))
325 return 0; 325 return 0;
326 326
327 pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); 327 pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
328 328
329 if (!pi_state) 329 if (!pi_state)
330 return -ENOMEM; 330 return -ENOMEM;
331 331
332 memset(pi_state, 0, sizeof(*pi_state));
333 INIT_LIST_HEAD(&pi_state->list); 332 INIT_LIST_HEAD(&pi_state->list);
334 /* pi_mutex gets initialized later */ 333 /* pi_mutex gets initialized later */
335 pi_state->owner = NULL; 334 pi_state->owner = NULL;
@@ -553,7 +552,7 @@ static void wake_futex(struct futex_q *q)
553 * at the end of wake_up_all() does not prevent this store from 552 * at the end of wake_up_all() does not prevent this store from
554 * moving. 553 * moving.
555 */ 554 */
556 wmb(); 555 smp_wmb();
557 q->lock_ptr = NULL; 556 q->lock_ptr = NULL;
558} 557}
559 558
@@ -585,9 +584,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
585 if (!(uval & FUTEX_OWNER_DIED)) { 584 if (!(uval & FUTEX_OWNER_DIED)) {
586 newval = FUTEX_WAITERS | new_owner->pid; 585 newval = FUTEX_WAITERS | new_owner->pid;
587 586
588 inc_preempt_count(); 587 pagefault_disable();
589 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 588 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
590 dec_preempt_count(); 589 pagefault_enable();
591 if (curval == -EFAULT) 590 if (curval == -EFAULT)
592 return -EFAULT; 591 return -EFAULT;
593 if (curval != uval) 592 if (curval != uval)
@@ -618,9 +617,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
618 * There is no waiter, so we unlock the futex. The owner died 617 * There is no waiter, so we unlock the futex. The owner died
619 * bit has not to be preserved here. We are the owner: 618 * bit has not to be preserved here. We are the owner:
620 */ 619 */
621 inc_preempt_count(); 620 pagefault_disable();
622 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); 621 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
623 dec_preempt_count(); 622 pagefault_enable();
624 623
625 if (oldval == -EFAULT) 624 if (oldval == -EFAULT)
626 return oldval; 625 return oldval;
@@ -1158,9 +1157,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1158 */ 1157 */
1159 newval = current->pid; 1158 newval = current->pid;
1160 1159
1161 inc_preempt_count(); 1160 pagefault_disable();
1162 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); 1161 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1163 dec_preempt_count(); 1162 pagefault_enable();
1164 1163
1165 if (unlikely(curval == -EFAULT)) 1164 if (unlikely(curval == -EFAULT))
1166 goto uaddr_faulted; 1165 goto uaddr_faulted;
@@ -1183,9 +1182,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1183 uval = curval; 1182 uval = curval;
1184 newval = uval | FUTEX_WAITERS; 1183 newval = uval | FUTEX_WAITERS;
1185 1184
1186 inc_preempt_count(); 1185 pagefault_disable();
1187 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 1186 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1188 dec_preempt_count(); 1187 pagefault_enable();
1189 1188
1190 if (unlikely(curval == -EFAULT)) 1189 if (unlikely(curval == -EFAULT))
1191 goto uaddr_faulted; 1190 goto uaddr_faulted;
@@ -1215,10 +1214,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1215 newval = current->pid | 1214 newval = current->pid |
1216 FUTEX_OWNER_DIED | FUTEX_WAITERS; 1215 FUTEX_OWNER_DIED | FUTEX_WAITERS;
1217 1216
1218 inc_preempt_count(); 1217 pagefault_disable();
1219 curval = futex_atomic_cmpxchg_inatomic(uaddr, 1218 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1220 uval, newval); 1219 uval, newval);
1221 dec_preempt_count(); 1220 pagefault_enable();
1222 1221
1223 if (unlikely(curval == -EFAULT)) 1222 if (unlikely(curval == -EFAULT))
1224 goto uaddr_faulted; 1223 goto uaddr_faulted;
@@ -1390,9 +1389,9 @@ retry_locked:
1390 * anyone else up: 1389 * anyone else up:
1391 */ 1390 */
1392 if (!(uval & FUTEX_OWNER_DIED)) { 1391 if (!(uval & FUTEX_OWNER_DIED)) {
1393 inc_preempt_count(); 1392 pagefault_disable();
1394 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); 1393 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1395 dec_preempt_count(); 1394 pagefault_enable();
1396 } 1395 }
1397 1396
1398 if (unlikely(uval == -EFAULT)) 1397 if (unlikely(uval == -EFAULT))
@@ -1493,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp,
1493 return ret; 1492 return ret;
1494} 1493}
1495 1494
1496static struct file_operations futex_fops = { 1495static const struct file_operations futex_fops = {
1497 .release = futex_close, 1496 .release = futex_close,
1498 .poll = futex_poll, 1497 .poll = futex_poll,
1499}; 1498};
@@ -1507,6 +1506,13 @@ static int futex_fd(u32 __user *uaddr, int signal)
1507 struct futex_q *q; 1506 struct futex_q *q;
1508 struct file *filp; 1507 struct file *filp;
1509 int ret, err; 1508 int ret, err;
1509 static unsigned long printk_interval;
1510
1511 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
1512 printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
1513 "will be removed from the kernel in June 2007\n",
1514 current->comm);
1515 }
1510 1516
1511 ret = -EINVAL; 1517 ret = -EINVAL;
1512 if (!valid_signal(signal)) 1518 if (!valid_signal(signal))
@@ -1522,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal)
1522 goto out; 1528 goto out;
1523 } 1529 }
1524 filp->f_op = &futex_fops; 1530 filp->f_op = &futex_fops;
1525 filp->f_vfsmnt = mntget(futex_mnt); 1531 filp->f_path.mnt = mntget(futex_mnt);
1526 filp->f_dentry = dget(futex_mnt->mnt_root); 1532 filp->f_path.dentry = dget(futex_mnt->mnt_root);
1527 filp->f_mapping = filp->f_dentry->d_inode->i_mapping; 1533 filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
1528 1534
1529 if (signal) { 1535 if (signal) {
1530 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); 1536 err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
@@ -1851,10 +1857,16 @@ static struct file_system_type futex_fs_type = {
1851 1857
1852static int __init init(void) 1858static int __init init(void)
1853{ 1859{
1854 unsigned int i; 1860 int i = register_filesystem(&futex_fs_type);
1861
1862 if (i)
1863 return i;
1855 1864
1856 register_filesystem(&futex_fs_type);
1857 futex_mnt = kern_mount(&futex_fs_type); 1865 futex_mnt = kern_mount(&futex_fs_type);
1866 if (IS_ERR(futex_mnt)) {
1867 unregister_filesystem(&futex_fs_type);
1868 return PTR_ERR(futex_mnt);
1869 }
1858 1870
1859 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 1871 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
1860 INIT_LIST_HEAD(&futex_queues[i].chain); 1872 INIT_LIST_HEAD(&futex_queues[i].chain);
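
The futex.c hunks swap the open-coded preempt-count bumping around the in-atomic user accesses for pagefault_disable()/pagefault_enable(), switch to kzalloc(), and schedule FUTEX_FD for removal. For readers less familiar with the interface these functions implement, a minimal userspace wait/wake pair (not FUTEX_FD) might look like the sketch below; the raw syscall wrapper and the constants from linux/futex.h are the standard ones.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <pthread.h>
#include <stdio.h>

static int futex_word;			/* the u32 the kernel hashes on */

static long futex(int *uaddr, int op, int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waker(void *unused)
{
	sleep(1);
	__sync_lock_test_and_set(&futex_word, 1);	/* publish the new value */
	futex(&futex_word, FUTEX_WAKE, 1);		/* wake one waiter */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waker, NULL);
	while (futex_word == 0)				/* sleep only while still 0 */
		futex(&futex_word, FUTEX_WAIT, 0);
	pthread_join(t, NULL);
	printf("woken, futex word = %d\n", futex_word);
	return 0;
}

Build with something like "cc futex-demo.c -pthread": the waiter blocks in FUTEX_WAIT only while the futex word still holds the expected value 0, and the second thread publishes 1 and wakes it.
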
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2d0dc3efe813..ebfd24a41858 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -233,6 +233,8 @@ void irq_chip_set_defaults(struct irq_chip *chip)
233 chip->shutdown = chip->disable; 233 chip->shutdown = chip->disable;
234 if (!chip->name) 234 if (!chip->name)
235 chip->name = chip->typename; 235 chip->name = chip->typename;
236 if (!chip->end)
237 chip->end = dummy_irq_chip.end;
236} 238}
237 239
238static inline void mask_ack_irq(struct irq_desc *desc, int irq) 240static inline void mask_ack_irq(struct irq_desc *desc, int irq)
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 42aa6f1a3f0f..aff1f0fabb0d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
54 .chip = &no_irq_chip, 54 .chip = &no_irq_chip,
55 .handle_irq = handle_bad_irq, 55 .handle_irq = handle_bad_irq,
56 .depth = 1, 56 .depth = 1,
57 .lock = SPIN_LOCK_UNLOCKED, 57 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
58#ifdef CONFIG_SMP 58#ifdef CONFIG_SMP
59 .affinity = CPU_MASK_ALL 59 .affinity = CPU_MASK_ALL
60#endif 60#endif
@@ -231,10 +231,10 @@ fastcall unsigned int __do_IRQ(unsigned int irq)
231 spin_unlock(&desc->lock); 231 spin_unlock(&desc->lock);
232 232
233 action_ret = handle_IRQ_event(irq, action); 233 action_ret = handle_IRQ_event(irq, action);
234
235 spin_lock(&desc->lock);
236 if (!noirqdebug) 234 if (!noirqdebug)
237 note_interrupt(irq, desc, action_ret); 235 note_interrupt(irq, desc, action_ret);
236
237 spin_lock(&desc->lock);
238 if (likely(!(desc->status & IRQ_PENDING))) 238 if (likely(!(desc->status & IRQ_PENDING)))
239 break; 239 break;
240 desc->status &= ~IRQ_PENDING; 240 desc->status &= ~IRQ_PENDING;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6879202afe9a..b385878c6e80 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,6 +216,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
216{ 216{
217 struct irq_desc *desc = irq_desc + irq; 217 struct irq_desc *desc = irq_desc + irq;
218 struct irqaction *old, **p; 218 struct irqaction *old, **p;
219 const char *old_name = NULL;
219 unsigned long flags; 220 unsigned long flags;
220 int shared = 0; 221 int shared = 0;
221 222
@@ -255,8 +256,10 @@ int setup_irq(unsigned int irq, struct irqaction *new)
255 * set the trigger type must match. 256 * set the trigger type must match.
256 */ 257 */
257 if (!((old->flags & new->flags) & IRQF_SHARED) || 258 if (!((old->flags & new->flags) & IRQF_SHARED) ||
258 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) 259 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) {
260 old_name = old->name;
259 goto mismatch; 261 goto mismatch;
262 }
260 263
261#if defined(CONFIG_IRQ_PER_CPU) 264#if defined(CONFIG_IRQ_PER_CPU)
262 /* All handlers must agree on per-cpuness */ 265 /* All handlers must agree on per-cpuness */
@@ -322,11 +325,13 @@ int setup_irq(unsigned int irq, struct irqaction *new)
322 return 0; 325 return 0;
323 326
324mismatch: 327mismatch:
325 spin_unlock_irqrestore(&desc->lock, flags);
326 if (!(new->flags & IRQF_PROBE_SHARED)) { 328 if (!(new->flags & IRQF_PROBE_SHARED)) {
327 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); 329 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
330 if (old_name)
331 printk(KERN_ERR "current handler: %s\n", old_name);
328 dump_stack(); 332 dump_stack();
329 } 333 }
334 spin_unlock_irqrestore(&desc->lock, flags);
330 return -EBUSY; 335 return -EBUSY;
331} 336}
332 337
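
The setup_irq() change above keeps desc->lock held until after the mismatch report so it can also print the name of the action already installed on the line. For context, a module-style sketch of a handler that is allowed to share an IRQ follows: both the existing and the new action must pass IRQF_SHARED and agree on the IRQF_TRIGGER_* bits, otherwise the mismatch path fires. This is only a sketch against roughly this kernel's API; MYDEV_IRQ, mydev_cookie and the handler body are placeholders.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/interrupt.h>

#define MYDEV_IRQ	19		/* placeholder: a line shared with another device */

static int mydev_cookie;		/* shared handlers must pass a dev_id */

static irqreturn_t mydev_isr(int irq, void *dev_id)
{
	/* a real handler would check whether its device raised the interrupt */
	return IRQ_NONE;
}

static int __init mydev_init(void)
{
	return request_irq(MYDEV_IRQ, mydev_isr,
			   IRQF_SHARED | IRQF_TRIGGER_RISING,
			   "mydev", &mydev_cookie);
}

static void __exit mydev_exit(void)
{
	free_irq(MYDEV_IRQ, &mydev_cookie);
}

module_init(mydev_init);
module_exit(mydev_exit);
MODULE_LICENSE("GPL");
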
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9a352667007c..61f5c717a8f5 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
54 unsigned int irq = (int)(long)data, full_count = count, err; 54 unsigned int irq = (int)(long)data, full_count = count, err;
55 cpumask_t new_value, tmp; 55 cpumask_t new_value, tmp;
56 56
57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) 57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
58 CHECK_IRQ_PER_CPU(irq_desc[irq].status))
58 return -EIO; 59 return -EIO;
59 60
60 err = cpumask_parse_user(buffer, count, new_value); 61 err = cpumask_parse_user(buffer, count, new_value);
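
The proc write handler above now also refuses affinity writes for per-CPU interrupts (CHECK_IRQ_PER_CPU), returning -EIO just as it already did for chips without a set_affinity hook. A tiny userspace sketch of the operation being gated; IRQ 19 is an arbitrary placeholder and root is required.

#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	const char *path = "/proc/irq/19/smp_affinity";	/* IRQ 19 is a placeholder */
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	/* hex CPU mask: 1 == CPU0; per-CPU IRQs now fail this write with EIO */
	if (fprintf(f, "1\n") < 0 || fclose(f) != 0) {
		fprintf(stderr, "%s: %s\n", path, strerror(errno));
		return 1;
	}
	return 0;
}
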
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index eeac3e313b2b..6f294ff4f9ee 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -20,6 +20,7 @@
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include <linux/sched.h> /* for cond_resched */ 21#include <linux/sched.h> /* for cond_resched */
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/ctype.h>
23 24
24#include <asm/sections.h> 25#include <asm/sections.h>
25 26
@@ -30,14 +31,14 @@
30#endif 31#endif
31 32
32/* These will be re-linked against their real values during the second link stage */ 33/* These will be re-linked against their real values during the second link stage */
33extern unsigned long kallsyms_addresses[] __attribute__((weak)); 34extern const unsigned long kallsyms_addresses[] __attribute__((weak));
34extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); 35extern const unsigned long kallsyms_num_syms __attribute__((weak));
35extern u8 kallsyms_names[] __attribute__((weak)); 36extern const u8 kallsyms_names[] __attribute__((weak));
36 37
37extern u8 kallsyms_token_table[] __attribute__((weak)); 38extern const u8 kallsyms_token_table[] __attribute__((weak));
38extern u16 kallsyms_token_index[] __attribute__((weak)); 39extern const u16 kallsyms_token_index[] __attribute__((weak));
39 40
40extern unsigned long kallsyms_markers[] __attribute__((weak)); 41extern const unsigned long kallsyms_markers[] __attribute__((weak));
41 42
42static inline int is_kernel_inittext(unsigned long addr) 43static inline int is_kernel_inittext(unsigned long addr)
43{ 44{
@@ -83,7 +84,7 @@ static int is_ksym_addr(unsigned long addr)
83static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 84static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
84{ 85{
85 int len, skipped_first = 0; 86 int len, skipped_first = 0;
86 u8 *tptr, *data; 87 const u8 *tptr, *data;
87 88
88 /* get the compressed symbol length from the first symbol byte */ 89 /* get the compressed symbol length from the first symbol byte */
89 data = &kallsyms_names[off]; 90 data = &kallsyms_names[off];
@@ -131,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
131 * kallsyms array */ 132 * kallsyms array */
132static unsigned int get_symbol_offset(unsigned long pos) 133static unsigned int get_symbol_offset(unsigned long pos)
133{ 134{
134 u8 *name; 135 const u8 *name;
135 int i; 136 int i;
136 137
137 /* use the closest marker we have. We have markers every 256 positions, 138 /* use the closest marker we have. We have markers every 256 positions,
@@ -301,13 +302,6 @@ struct kallsym_iter
301 char name[KSYM_NAME_LEN+1]; 302 char name[KSYM_NAME_LEN+1];
302}; 303};
303 304
304/* Only label it "global" if it is exported. */
305static void upcase_if_global(struct kallsym_iter *iter)
306{
307 if (is_exported(iter->name, iter->owner))
308 iter->type += 'A' - 'a';
309}
310
311static int get_ksymbol_mod(struct kallsym_iter *iter) 305static int get_ksymbol_mod(struct kallsym_iter *iter)
312{ 306{
313 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, 307 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
@@ -316,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
316 if (iter->owner == NULL) 310 if (iter->owner == NULL)
317 return 0; 311 return 0;
318 312
319 upcase_if_global(iter); 313 /* Label it "global" if it is exported, "local" if not exported. */
314 iter->type = is_exported(iter->name, iter->owner)
315 ? toupper(iter->type) : tolower(iter->type);
316
320 return 1; 317 return 1;
321} 318}
322 319
@@ -401,7 +398,7 @@ static int s_show(struct seq_file *m, void *p)
401 return 0; 398 return 0;
402} 399}
403 400
404static struct seq_operations kallsyms_op = { 401static const struct seq_operations kallsyms_op = {
405 .start = s_start, 402 .start = s_start,
406 .next = s_next, 403 .next = s_next,
407 .stop = s_stop, 404 .stop = s_stop,
@@ -436,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file)
436 return seq_release(inode, file); 433 return seq_release(inode, file);
437} 434}
438 435
439static struct file_operations kallsyms_operations = { 436static const struct file_operations kallsyms_operations = {
440 .open = kallsyms_open, 437 .open = kallsyms_open,
441 .read = seq_read, 438 .read = seq_read,
442 .llseek = seq_lseek, 439 .llseek = seq_lseek,
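
Besides constifying the symbol tables, the kallsyms hunks replace upcase_if_global() with an explicit toupper()/tolower(), so module symbols are reported lower-case when they are not exported. Below is a userspace sketch that consumes the resulting /proc/kallsyms the same way, keying off the case of the type column; note that for built-in symbols the case still reflects the symbol binding rather than EXPORT_SYMBOL.

#include <stdio.h>
#include <ctype.h>

int main(void)
{
	char addr[32], type[8], name[256];
	unsigned long global = 0, local = 0;
	FILE *f = fopen("/proc/kallsyms", "r");
	int c;

	if (!f) {
		perror("/proc/kallsyms");
		return 1;
	}
	while (fscanf(f, "%31s %7s %255s", addr, type, name) == 3) {
		/* skip the optional trailing "[module]" column */
		while ((c = fgetc(f)) != '\n' && c != EOF)
			;
		if (isupper((unsigned char)type[0]))
			global++;	/* for module symbols: exported */
		else
			local++;	/* for module symbols: not exported */
	}
	fclose(f);
	printf("upper-case (global): %lu, lower-case (local): %lu\n",
	       global, local);
	return 0;
}
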
diff --git a/kernel/kexec.c b/kernel/kexec.c
index fcdd5d2bc3f4..2a59c8a01ae0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -20,6 +20,8 @@
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/ioport.h> 21#include <linux/ioport.h>
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/elf.h>
24#include <linux/elfcore.h>
23 25
24#include <asm/page.h> 26#include <asm/page.h>
25#include <asm/uaccess.h> 27#include <asm/uaccess.h>
@@ -108,11 +110,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
108 110
109 /* Allocate a controlling structure */ 111 /* Allocate a controlling structure */
110 result = -ENOMEM; 112 result = -ENOMEM;
111 image = kmalloc(sizeof(*image), GFP_KERNEL); 113 image = kzalloc(sizeof(*image), GFP_KERNEL);
112 if (!image) 114 if (!image)
113 goto out; 115 goto out;
114 116
115 memset(image, 0, sizeof(*image));
116 image->head = 0; 117 image->head = 0;
117 image->entry = &image->head; 118 image->entry = &image->head;
118 image->last_entry = &image->head; 119 image->last_entry = &image->head;
@@ -851,6 +852,7 @@ static int kimage_load_crash_segment(struct kimage *image,
851 memset(ptr + uchunk, 0, mchunk - uchunk); 852 memset(ptr + uchunk, 0, mchunk - uchunk);
852 } 853 }
853 result = copy_from_user(ptr, buf, uchunk); 854 result = copy_from_user(ptr, buf, uchunk);
855 kexec_flush_icache_page(page);
854 kunmap(page); 856 kunmap(page);
855 if (result) { 857 if (result) {
856 result = (result < 0) ? result : -EIO; 858 result = (result < 0) ? result : -EIO;
@@ -1067,6 +1069,60 @@ void crash_kexec(struct pt_regs *regs)
1067 } 1069 }
1068} 1070}
1069 1071
1072static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1073 size_t data_len)
1074{
1075 struct elf_note note;
1076
1077 note.n_namesz = strlen(name) + 1;
1078 note.n_descsz = data_len;
1079 note.n_type = type;
1080 memcpy(buf, &note, sizeof(note));
1081 buf += (sizeof(note) + 3)/4;
1082 memcpy(buf, name, note.n_namesz);
1083 buf += (note.n_namesz + 3)/4;
1084 memcpy(buf, data, note.n_descsz);
1085 buf += (note.n_descsz + 3)/4;
1086
1087 return buf;
1088}
1089
1090static void final_note(u32 *buf)
1091{
1092 struct elf_note note;
1093
1094 note.n_namesz = 0;
1095 note.n_descsz = 0;
1096 note.n_type = 0;
1097 memcpy(buf, &note, sizeof(note));
1098}
1099
1100void crash_save_cpu(struct pt_regs *regs, int cpu)
1101{
1102 struct elf_prstatus prstatus;
1103 u32 *buf;
1104
1105 if ((cpu < 0) || (cpu >= NR_CPUS))
1106 return;
1107
1108 /* Using ELF notes here is opportunistic.
1109 * I need a well defined structure format
1110 * for the data I pass, and I need tags
1111 * on the data to indicate what information I have
1112 * squirrelled away. ELF notes happen to provide
1113 * all of that, so there is no need to invent something new.
1114 */
1115 buf = (u32*)per_cpu_ptr(crash_notes, cpu);
1116 if (!buf)
1117 return;
1118 memset(&prstatus, 0, sizeof(prstatus));
1119 prstatus.pr_pid = current->pid;
1120 elf_core_copy_regs(&prstatus.pr_reg, regs);
1121 buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
1122 sizeof(prstatus));
1123 final_note(buf);
1124}
1125
1070static int __init crash_notes_memory_init(void) 1126static int __init crash_notes_memory_init(void)
1071{ 1127{
1072 /* Allocate memory for saving cpu registers. */ 1128 /* Allocate memory for saving cpu registers. */
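
crash_save_cpu() above serializes the crashing CPU's registers as an ELF note via append_elf_note(), rounding both the name and the descriptor up to 4-byte units and terminating the buffer with an all-zero header. The userspace sketch below reproduces just that layout, with a local header struct standing in for struct elf_note and a dummy array standing in for elf_prstatus.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct note_hdr {			/* local stand-in for struct elf_note */
	uint32_t n_namesz;
	uint32_t n_descsz;
	uint32_t n_type;
};

static uint32_t *append_note(uint32_t *buf, const char *name,
			     uint32_t type, const void *data, size_t len)
{
	struct note_hdr note = {
		.n_namesz = strlen(name) + 1,
		.n_descsz = len,
		.n_type   = type,
	};

	memcpy(buf, &note, sizeof(note));
	buf += (sizeof(note) + 3) / 4;		/* header: already 3 words */
	memcpy(buf, name, note.n_namesz);
	buf += (note.n_namesz + 3) / 4;		/* name padded to 4-byte units */
	memcpy(buf, data, note.n_descsz);
	buf += (note.n_descsz + 3) / 4;		/* descriptor padded likewise */
	return buf;
}

int main(void)
{
	uint32_t buf[64] = { 0 };
	uint32_t regs[4] = { 1, 2, 3, 4 };	/* dummy stand-in for elf_prstatus */
	uint32_t *end = append_note(buf, "CORE", 1 /* NT_PRSTATUS */, regs,
				    sizeof(regs));

	memset(end, 0, sizeof(struct note_hdr));	/* the empty final note */
	printf("note buffer uses %zu bytes\n",
	       (end - buf) * 4 + sizeof(struct note_hdr));
	return 0;
}
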
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bb4e29d924e4..3a7379aa31ca 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,7 +25,7 @@
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/namespace.h> 28#include <linux/mnt_namespace.h>
29#include <linux/completion.h> 29#include <linux/completion.h>
30#include <linux/file.h> 30#include <linux/file.h>
31#include <linux/workqueue.h> 31#include <linux/workqueue.h>
@@ -114,6 +114,7 @@ EXPORT_SYMBOL(request_module);
114#endif /* CONFIG_KMOD */ 114#endif /* CONFIG_KMOD */
115 115
116struct subprocess_info { 116struct subprocess_info {
117 struct work_struct work;
117 struct completion *complete; 118 struct completion *complete;
118 char *path; 119 char *path;
119 char **argv; 120 char **argv;
@@ -221,9 +222,10 @@ static int wait_for_helper(void *data)
221} 222}
222 223
223/* This is run by khelper thread */ 224/* This is run by khelper thread */
224static void __call_usermodehelper(void *data) 225static void __call_usermodehelper(struct work_struct *work)
225{ 226{
226 struct subprocess_info *sub_info = data; 227 struct subprocess_info *sub_info =
228 container_of(work, struct subprocess_info, work);
227 pid_t pid; 229 pid_t pid;
228 int wait = sub_info->wait; 230 int wait = sub_info->wait;
229 231
@@ -264,6 +266,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
264{ 266{
265 DECLARE_COMPLETION_ONSTACK(done); 267 DECLARE_COMPLETION_ONSTACK(done);
266 struct subprocess_info sub_info = { 268 struct subprocess_info sub_info = {
269 .work = __WORK_INITIALIZER(sub_info.work,
270 __call_usermodehelper),
267 .complete = &done, 271 .complete = &done,
268 .path = path, 272 .path = path,
269 .argv = argv, 273 .argv = argv,
@@ -272,7 +276,6 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
272 .wait = wait, 276 .wait = wait,
273 .retval = 0, 277 .retval = 0,
274 }; 278 };
275 DECLARE_WORK(work, __call_usermodehelper, &sub_info);
276 279
277 if (!khelper_wq) 280 if (!khelper_wq)
278 return -EBUSY; 281 return -EBUSY;
@@ -280,7 +283,7 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
280 if (path[0] == '\0') 283 if (path[0] == '\0')
281 return 0; 284 return 0;
282 285
283 queue_work(khelper_wq, &work); 286 queue_work(khelper_wq, &sub_info.work);
284 wait_for_completion(&done); 287 wait_for_completion(&done);
285 return sub_info.retval; 288 return sub_info.retval;
286} 289}
@@ -291,6 +294,8 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
291{ 294{
292 DECLARE_COMPLETION(done); 295 DECLARE_COMPLETION(done);
293 struct subprocess_info sub_info = { 296 struct subprocess_info sub_info = {
297 .work = __WORK_INITIALIZER(sub_info.work,
298 __call_usermodehelper),
294 .complete = &done, 299 .complete = &done,
295 .path = path, 300 .path = path,
296 .argv = argv, 301 .argv = argv,
@@ -298,7 +303,6 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
298 .retval = 0, 303 .retval = 0,
299 }; 304 };
300 struct file *f; 305 struct file *f;
301 DECLARE_WORK(work, __call_usermodehelper, &sub_info);
302 306
303 if (!khelper_wq) 307 if (!khelper_wq)
304 return -EBUSY; 308 return -EBUSY;
@@ -307,18 +311,18 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
307 return 0; 311 return 0;
308 312
309 f = create_write_pipe(); 313 f = create_write_pipe();
310 if (!f) 314 if (IS_ERR(f))
311 return -ENOMEM; 315 return PTR_ERR(f);
312 *filp = f; 316 *filp = f;
313 317
314 f = create_read_pipe(f); 318 f = create_read_pipe(f);
315 if (!f) { 319 if (IS_ERR(f)) {
316 free_write_pipe(*filp); 320 free_write_pipe(*filp);
317 return -ENOMEM; 321 return PTR_ERR(f);
318 } 322 }
319 sub_info.stdin = f; 323 sub_info.stdin = f;
320 324
321 queue_work(khelper_wq, &work); 325 queue_work(khelper_wq, &sub_info.work);
322 wait_for_completion(&done); 326 wait_for_completion(&done);
323 return sub_info.retval; 327 return sub_info.retval;
324} 328}
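
Both call_usermodehelper variants now embed the work_struct inside struct subprocess_info and hand the workqueue a handler that takes a struct work_struct *, recovering its context with container_of() instead of carrying a separate void * payload (the kthread.c hunks below make the same conversion). Here is a self-contained userspace sketch of that embed-and-recover idiom; the macro and the two structs are local stand-ins, not the kernel definitions.

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct {			/* local stand-in for the kernel type */
	void (*func)(struct work_struct *work);
};

struct subprocess_info {		/* mirrors the layout in the hunk above */
	struct work_struct work;
	const char *path;
	int retval;
};

static void call_usermodehelper_stub(struct work_struct *work)
{
	struct subprocess_info *sub_info =
		container_of(work, struct subprocess_info, work);

	printf("running helper for %s\n", sub_info->path);
	sub_info->retval = 0;
}

int main(void)
{
	struct subprocess_info sub_info = {
		.work = { .func = call_usermodehelper_stub },
		.path = "/sbin/hotplug",	/* illustrative path */
		.retval = -1,
	};

	/* what queue_work()'s worker thread would eventually do: */
	sub_info.work.func(&sub_info.work);
	return sub_info.retval;
}

The reworked workqueue API no longer carries a data pointer in the work_struct itself, so any context has to travel embedded in the queued structure, which is exactly what these hunks arrange.
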
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 610c837ad9e0..17ec4afb0994 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -38,6 +38,7 @@
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <linux/kallsyms.h> 40#include <linux/kallsyms.h>
41#include <linux/freezer.h>
41#include <asm-generic/sections.h> 42#include <asm-generic/sections.h>
42#include <asm/cacheflush.h> 43#include <asm/cacheflush.h>
43#include <asm/errno.h> 44#include <asm/errno.h>
@@ -83,9 +84,36 @@ struct kprobe_insn_page {
83 kprobe_opcode_t *insns; /* Page of instruction slots */ 84 kprobe_opcode_t *insns; /* Page of instruction slots */
84 char slot_used[INSNS_PER_PAGE]; 85 char slot_used[INSNS_PER_PAGE];
85 int nused; 86 int nused;
87 int ngarbage;
86}; 88};
87 89
88static struct hlist_head kprobe_insn_pages; 90static struct hlist_head kprobe_insn_pages;
91static int kprobe_garbage_slots;
92static int collect_garbage_slots(void);
93
94static int __kprobes check_safety(void)
95{
96 int ret = 0;
97#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM)
98 ret = freeze_processes();
99 if (ret == 0) {
100 struct task_struct *p, *q;
101 do_each_thread(p, q) {
102 if (p != current && p->state == TASK_RUNNING &&
103 p->pid != 0) {
104 printk("Check failed: %s is running\n",p->comm);
105 ret = -1;
106 goto loop_end;
107 }
108 } while_each_thread(p, q);
109 }
110loop_end:
111 thaw_processes();
112#else
113 synchronize_sched();
114#endif
115 return ret;
116}
89 117
90/** 118/**
91 * get_insn_slot() - Find a slot on an executable page for an instruction. 119 * get_insn_slot() - Find a slot on an executable page for an instruction.
@@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
96 struct kprobe_insn_page *kip; 124 struct kprobe_insn_page *kip;
97 struct hlist_node *pos; 125 struct hlist_node *pos;
98 126
127 retry:
99 hlist_for_each(pos, &kprobe_insn_pages) { 128 hlist_for_each(pos, &kprobe_insn_pages) {
100 kip = hlist_entry(pos, struct kprobe_insn_page, hlist); 129 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
101 if (kip->nused < INSNS_PER_PAGE) { 130 if (kip->nused < INSNS_PER_PAGE) {
@@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
112 } 141 }
113 } 142 }
114 143
115 /* All out of space. Need to allocate a new page. Use slot 0.*/ 144 /* If there are any garbage slots, collect it and try again. */
145 if (kprobe_garbage_slots && collect_garbage_slots() == 0) {
146 goto retry;
147 }
148 /* All out of space. Need to allocate a new page. Use slot 0. */
116 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); 149 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
117 if (!kip) { 150 if (!kip) {
118 return NULL; 151 return NULL;
@@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
133 memset(kip->slot_used, 0, INSNS_PER_PAGE); 166 memset(kip->slot_used, 0, INSNS_PER_PAGE);
134 kip->slot_used[0] = 1; 167 kip->slot_used[0] = 1;
135 kip->nused = 1; 168 kip->nused = 1;
169 kip->ngarbage = 0;
136 return kip->insns; 170 return kip->insns;
137} 171}
138 172
139void __kprobes free_insn_slot(kprobe_opcode_t *slot) 173/* Return 1 if all garbages are collected, otherwise 0. */
174static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
175{
176 kip->slot_used[idx] = 0;
177 kip->nused--;
178 if (kip->nused == 0) {
179 /*
180 * Page is no longer in use. Free it unless
181 * it's the last one. We keep the last one
182 * so as not to have to set it up again the
183 * next time somebody inserts a probe.
184 */
185 hlist_del(&kip->hlist);
186 if (hlist_empty(&kprobe_insn_pages)) {
187 INIT_HLIST_NODE(&kip->hlist);
188 hlist_add_head(&kip->hlist,
189 &kprobe_insn_pages);
190 } else {
191 module_free(NULL, kip->insns);
192 kfree(kip);
193 }
194 return 1;
195 }
196 return 0;
197}
198
199static int __kprobes collect_garbage_slots(void)
200{
201 struct kprobe_insn_page *kip;
202 struct hlist_node *pos, *next;
203
204 /* Ensure no-one is preepmted on the garbages */
205 if (check_safety() != 0)
206 return -EAGAIN;
207
208 hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
209 int i;
210 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
211 if (kip->ngarbage == 0)
212 continue;
213 kip->ngarbage = 0; /* we will collect all garbages */
214 for (i = 0; i < INSNS_PER_PAGE; i++) {
215 if (kip->slot_used[i] == -1 &&
216 collect_one_slot(kip, i))
217 break;
218 }
219 }
220 kprobe_garbage_slots = 0;
221 return 0;
222}
223
224void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
140{ 225{
141 struct kprobe_insn_page *kip; 226 struct kprobe_insn_page *kip;
142 struct hlist_node *pos; 227 struct hlist_node *pos;
@@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
146 if (kip->insns <= slot && 231 if (kip->insns <= slot &&
147 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 232 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
148 int i = (slot - kip->insns) / MAX_INSN_SIZE; 233 int i = (slot - kip->insns) / MAX_INSN_SIZE;
149 kip->slot_used[i] = 0; 234 if (dirty) {
150 kip->nused--; 235 kip->slot_used[i] = -1;
151 if (kip->nused == 0) { 236 kip->ngarbage++;
152 /* 237 } else {
153 * Page is no longer in use. Free it unless 238 collect_one_slot(kip, i);
154 * it's the last one. We keep the last one
155 * so as not to have to set it up again the
156 * next time somebody inserts a probe.
157 */
158 hlist_del(&kip->hlist);
159 if (hlist_empty(&kprobe_insn_pages)) {
160 INIT_HLIST_NODE(&kip->hlist);
161 hlist_add_head(&kip->hlist,
162 &kprobe_insn_pages);
163 } else {
164 module_free(NULL, kip->insns);
165 kfree(kip);
166 }
167 } 239 }
168 return; 240 break;
169 } 241 }
170 } 242 }
243 if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
244 collect_garbage_slots();
245 }
171} 246}
172#endif 247#endif
173 248
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4f9c60ef95e8..1db8c72d0d38 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -31,6 +31,8 @@ struct kthread_create_info
31 /* Result passed back to kthread_create() from keventd. */ 31 /* Result passed back to kthread_create() from keventd. */
32 struct task_struct *result; 32 struct task_struct *result;
33 struct completion done; 33 struct completion done;
34
35 struct work_struct work;
34}; 36};
35 37
36struct kthread_stop_info 38struct kthread_stop_info
@@ -111,9 +113,10 @@ static int kthread(void *_create)
111} 113}
112 114
113/* We are keventd: create a thread. */ 115/* We are keventd: create a thread. */
114static void keventd_create_kthread(void *_create) 116static void keventd_create_kthread(struct work_struct *work)
115{ 117{
116 struct kthread_create_info *create = _create; 118 struct kthread_create_info *create =
119 container_of(work, struct kthread_create_info, work);
117 int pid; 120 int pid;
118 121
119 /* We want our own signal handler (we take no signals by default). */ 122 /* We want our own signal handler (we take no signals by default). */
@@ -154,20 +157,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
154 ...) 157 ...)
155{ 158{
156 struct kthread_create_info create; 159 struct kthread_create_info create;
157 DECLARE_WORK(work, keventd_create_kthread, &create);
158 160
159 create.threadfn = threadfn; 161 create.threadfn = threadfn;
160 create.data = data; 162 create.data = data;
161 init_completion(&create.started); 163 init_completion(&create.started);
162 init_completion(&create.done); 164 init_completion(&create.done);
165 INIT_WORK(&create.work, keventd_create_kthread);
163 166
164 /* 167 /*
165 * The workqueue needs to start up first: 168 * The workqueue needs to start up first:
166 */ 169 */
167 if (!helper_wq) 170 if (!helper_wq)
168 work.func(work.data); 171 create.work.func(&create.work);
169 else { 172 else {
170 queue_work(helper_wq, &work); 173 queue_work(helper_wq, &create.work);
171 wait_for_completion(&create.done); 174 wait_for_completion(&create.done);
172 } 175 }
173 if (!IS_ERR(create.result)) { 176 if (!IS_ERR(create.result)) {
diff --git a/kernel/latency.c b/kernel/latency.c
index 258f2555abbc..e63fcacb61a7 100644
--- a/kernel/latency.c
+++ b/kernel/latency.c
@@ -36,6 +36,7 @@
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
39#include <linux/jiffies.h>
39#include <asm/atomic.h> 40#include <asm/atomic.h>
40 41
41struct latency_info { 42struct latency_info {
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b739be2a6dc9..01e750559034 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,13 +43,49 @@
43#include "lockdep_internals.h" 43#include "lockdep_internals.h"
44 44
45/* 45/*
46 * hash_lock: protects the lockdep hashes and class/list/hash allocators. 46 * lockdep_lock: protects the lockdep graph, the hashes and the
47 * class/list/hash allocators.
47 * 48 *
48 * This is one of the rare exceptions where it's justified 49 * This is one of the rare exceptions where it's justified
49 * to use a raw spinlock - we really dont want the spinlock 50 * to use a raw spinlock - we really dont want the spinlock
50 * code to recurse back into the lockdep code. 51 * code to recurse back into the lockdep code...
51 */ 52 */
52static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 53static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
54
55static int graph_lock(void)
56{
57 __raw_spin_lock(&lockdep_lock);
58 /*
59 * Make sure that if another CPU detected a bug while
60 * walking the graph we dont change it (while the other
61 * CPU is busy printing out stuff with the graph lock
62 * dropped already)
63 */
64 if (!debug_locks) {
65 __raw_spin_unlock(&lockdep_lock);
66 return 0;
67 }
68 return 1;
69}
70
71static inline int graph_unlock(void)
72{
73 __raw_spin_unlock(&lockdep_lock);
74 return 0;
75}
76
77/*
78 * Turn lock debugging off and return with 0 if it was off already,
79 * and also release the graph lock:
80 */
81static inline int debug_locks_off_graph_unlock(void)
82{
83 int ret = debug_locks_off();
84
85 __raw_spin_unlock(&lockdep_lock);
86
87 return ret;
88}
53 89
54static int lockdep_initialized; 90static int lockdep_initialized;
55 91
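
The graph_lock()/graph_unlock()/debug_locks_off_graph_unlock() helpers introduced in the hunk above encode a convention the rest of the patch relies on: taking the graph lock can fail (return 0) once another CPU has turned lock debugging off mid-report, and every caller is expected to bail out instead of touching the graph. A minimal userspace sketch of that convention, using a pthread spinlock and a plain flag in place of lockdep_lock and debug_locks:

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t graph_spin;	/* stands in for lockdep_lock */
static volatile int debug_locks = 1;

static int graph_lock(void)
{
	pthread_spin_lock(&graph_spin);
	if (!debug_locks) {		/* another thread is already reporting a bug */
		pthread_spin_unlock(&graph_spin);
		return 0;
	}
	return 1;
}

static void graph_unlock(void)
{
	pthread_spin_unlock(&graph_spin);
}

static int add_dependency(void)
{
	if (!graph_lock())
		return 0;		/* validator disabled: leave the graph alone */
	/* ... mutate the dependency graph here ... */
	graph_unlock();
	return 1;
}

int main(void)
{
	pthread_spin_init(&graph_spin, PTHREAD_PROCESS_PRIVATE);
	printf("with debugging on : %d\n", add_dependency());
	debug_locks = 0;
	printf("with debugging off: %d\n", add_dependency());
	pthread_spin_destroy(&graph_spin);
	return 0;
}
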
@@ -57,14 +93,15 @@ unsigned long nr_list_entries;
57static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; 93static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
58 94
59/* 95/*
60 * Allocate a lockdep entry. (assumes hash_lock held, returns 96 * Allocate a lockdep entry. (assumes the graph_lock held, returns
61 * with NULL on failure) 97 * with NULL on failure)
62 */ 98 */
63static struct lock_list *alloc_list_entry(void) 99static struct lock_list *alloc_list_entry(void)
64{ 100{
65 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { 101 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
66 __raw_spin_unlock(&hash_lock); 102 if (!debug_locks_off_graph_unlock())
67 debug_locks_off(); 103 return NULL;
104
68 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); 105 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
69 printk("turning off the locking correctness validator.\n"); 106 printk("turning off the locking correctness validator.\n");
70 return NULL; 107 return NULL;
@@ -140,21 +177,12 @@ void lockdep_on(void)
140 177
141EXPORT_SYMBOL(lockdep_on); 178EXPORT_SYMBOL(lockdep_on);
142 179
143int lockdep_internal(void)
144{
145 return current->lockdep_recursion != 0;
146}
147
148EXPORT_SYMBOL(lockdep_internal);
149
150/* 180/*
151 * Debugging switches: 181 * Debugging switches:
152 */ 182 */
153 183
154#define VERBOSE 0 184#define VERBOSE 0
155#ifdef VERBOSE 185#define VERY_VERBOSE 0
156# define VERY_VERBOSE 0
157#endif
158 186
159#if VERBOSE 187#if VERBOSE
160# define HARDIRQ_VERBOSE 1 188# define HARDIRQ_VERBOSE 1
@@ -179,8 +207,8 @@ static int class_filter(struct lock_class *class)
179 !strcmp(class->name, "&struct->lockfield")) 207 !strcmp(class->name, "&struct->lockfield"))
180 return 1; 208 return 1;
181#endif 209#endif
182 /* Allow everything else. 0 would be filter everything else */ 210 /* Filter everything else. 1 would be to allow everything else */
183 return 1; 211 return 0;
184} 212}
185#endif 213#endif
186 214
@@ -214,7 +242,7 @@ static int softirq_verbose(struct lock_class *class)
214 242
215/* 243/*
216 * Stack-trace: tightly packed array of stack backtrace 244 * Stack-trace: tightly packed array of stack backtrace
217 * addresses. Protected by the hash_lock. 245 * addresses. Protected by the graph_lock.
218 */ 246 */
219unsigned long nr_stack_trace_entries; 247unsigned long nr_stack_trace_entries;
220static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; 248static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
@@ -228,25 +256,20 @@ static int save_trace(struct stack_trace *trace)
228 trace->skip = 3; 256 trace->skip = 3;
229 trace->all_contexts = 0; 257 trace->all_contexts = 0;
230 258
231 /* Make sure to not recurse in case the the unwinder needs to tak
232e locks. */
233 lockdep_off();
234 save_stack_trace(trace, NULL); 259 save_stack_trace(trace, NULL);
235 lockdep_on();
236 260
237 trace->max_entries = trace->nr_entries; 261 trace->max_entries = trace->nr_entries;
238 262
239 nr_stack_trace_entries += trace->nr_entries; 263 nr_stack_trace_entries += trace->nr_entries;
240 if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES))
241 return 0;
242 264
243 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { 265 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
244 __raw_spin_unlock(&hash_lock); 266 if (!debug_locks_off_graph_unlock())
245 if (debug_locks_off()) { 267 return 0;
246 printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); 268
247 printk("turning off the locking correctness validator.\n"); 269 printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
248 dump_stack(); 270 printk("turning off the locking correctness validator.\n");
249 } 271 dump_stack();
272
250 return 0; 273 return 0;
251 } 274 }
252 275
@@ -357,7 +380,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4
357 380
358static void print_lock_name(struct lock_class *class) 381static void print_lock_name(struct lock_class *class)
359{ 382{
360 char str[128], c1, c2, c3, c4; 383 char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4;
361 const char *name; 384 const char *name;
362 385
363 get_usage_chars(class, &c1, &c2, &c3, &c4); 386 get_usage_chars(class, &c1, &c2, &c3, &c4);
@@ -379,7 +402,7 @@ static void print_lock_name(struct lock_class *class)
379static void print_lockdep_cache(struct lockdep_map *lock) 402static void print_lockdep_cache(struct lockdep_map *lock)
380{ 403{
381 const char *name; 404 const char *name;
382 char str[128]; 405 char str[KSYM_NAME_LEN + 1];
383 406
384 name = lock->name; 407 name = lock->name;
385 if (!name) 408 if (!name)
@@ -449,7 +472,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
449 print_lock_class_header(class, depth); 472 print_lock_class_header(class, depth);
450 473
451 list_for_each_entry(entry, &class->locks_after, entry) { 474 list_for_each_entry(entry, &class->locks_after, entry) {
452 DEBUG_LOCKS_WARN_ON(!entry->class); 475 if (DEBUG_LOCKS_WARN_ON(!entry->class))
476 return;
477
453 print_lock_dependencies(entry->class, depth + 1); 478 print_lock_dependencies(entry->class, depth + 1);
454 479
455 printk("%*s ... acquired at:\n",depth,""); 480 printk("%*s ... acquired at:\n",depth,"");
@@ -474,7 +499,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
474 return 0; 499 return 0;
475 500
476 entry->class = this; 501 entry->class = this;
477 save_trace(&entry->trace); 502 if (!save_trace(&entry->trace))
503 return 0;
478 504
479 /* 505 /*
480 * Since we never remove from the dependency list, the list can 506 * Since we never remove from the dependency list, the list can
@@ -532,9 +558,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
532{ 558{
533 struct task_struct *curr = current; 559 struct task_struct *curr = current;
534 560
535 __raw_spin_unlock(&hash_lock); 561 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
536 debug_locks_off();
537 if (debug_locks_silent)
538 return 0; 562 return 0;
539 563
540 printk("\n=======================================================\n"); 564 printk("\n=======================================================\n");
@@ -563,7 +587,9 @@ static noinline int print_circular_bug_tail(void)
563 return 0; 587 return 0;
564 588
565 this.class = check_source->class; 589 this.class = check_source->class;
566 save_trace(&this.trace); 590 if (!save_trace(&this.trace))
591 return 0;
592
567 print_circular_bug_entry(&this, 0); 593 print_circular_bug_entry(&this, 0);
568 594
569 printk("\nother info that might help us debug this:\n\n"); 595 printk("\nother info that might help us debug this:\n\n");
@@ -579,8 +605,10 @@ static noinline int print_circular_bug_tail(void)
579 605
580static int noinline print_infinite_recursion_bug(void) 606static int noinline print_infinite_recursion_bug(void)
581{ 607{
582 __raw_spin_unlock(&hash_lock); 608 if (!debug_locks_off_graph_unlock())
583 DEBUG_LOCKS_WARN_ON(1); 609 return 0;
610
611 WARN_ON(1);
584 612
585 return 0; 613 return 0;
586} 614}
@@ -715,9 +743,7 @@ print_bad_irq_dependency(struct task_struct *curr,
715 enum lock_usage_bit bit2, 743 enum lock_usage_bit bit2,
716 const char *irqclass) 744 const char *irqclass)
717{ 745{
718 __raw_spin_unlock(&hash_lock); 746 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
719 debug_locks_off();
720 if (debug_locks_silent)
721 return 0; 747 return 0;
722 748
723 printk("\n======================================================\n"); 749 printk("\n======================================================\n");
@@ -798,9 +824,7 @@ static int
798print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, 824print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
799 struct held_lock *next) 825 struct held_lock *next)
800{ 826{
801 debug_locks_off(); 827 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
802 __raw_spin_unlock(&hash_lock);
803 if (debug_locks_silent)
804 return 0; 828 return 0;
805 829
806 printk("\n=============================================\n"); 830 printk("\n=============================================\n");
@@ -966,27 +990,24 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
966 &prev->class->locks_after, next->acquire_ip); 990 &prev->class->locks_after, next->acquire_ip);
967 if (!ret) 991 if (!ret)
968 return 0; 992 return 0;
969 /* 993
970 * Return value of 2 signals 'dependency already added',
971 * in that case we dont have to add the backlink either.
972 */
973 if (ret == 2)
974 return 2;
975 ret = add_lock_to_list(next->class, prev->class, 994 ret = add_lock_to_list(next->class, prev->class,
976 &next->class->locks_before, next->acquire_ip); 995 &next->class->locks_before, next->acquire_ip);
996 if (!ret)
997 return 0;
977 998
978 /* 999 /*
979 * Debugging printouts: 1000 * Debugging printouts:
980 */ 1001 */
981 if (verbose(prev->class) || verbose(next->class)) { 1002 if (verbose(prev->class) || verbose(next->class)) {
982 __raw_spin_unlock(&hash_lock); 1003 graph_unlock();
983 printk("\n new dependency: "); 1004 printk("\n new dependency: ");
984 print_lock_name(prev->class); 1005 print_lock_name(prev->class);
985 printk(" => "); 1006 printk(" => ");
986 print_lock_name(next->class); 1007 print_lock_name(next->class);
987 printk("\n"); 1008 printk("\n");
988 dump_stack(); 1009 dump_stack();
989 __raw_spin_lock(&hash_lock); 1010 return graph_lock();
990 } 1011 }
991 return 1; 1012 return 1;
992} 1013}
@@ -1025,7 +1046,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1025 * added: 1046 * added:
1026 */ 1047 */
1027 if (hlock->read != 2) { 1048 if (hlock->read != 2) {
1028 check_prev_add(curr, hlock, next); 1049 if (!check_prev_add(curr, hlock, next))
1050 return 0;
1029 /* 1051 /*
1030 * Stop after the first non-trylock entry, 1052 * Stop after the first non-trylock entry,
1031 * as non-trylock entries have added their 1053 * as non-trylock entries have added their
@@ -1050,8 +1072,10 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1050 } 1072 }
1051 return 1; 1073 return 1;
1052out_bug: 1074out_bug:
1053 __raw_spin_unlock(&hash_lock); 1075 if (!debug_locks_off_graph_unlock())
1054 DEBUG_LOCKS_WARN_ON(1); 1076 return 0;
1077
1078 WARN_ON(1);
1055 1079
1056 return 0; 1080 return 0;
1057} 1081}
@@ -1081,7 +1105,8 @@ static int static_obj(void *obj)
1081 */ 1105 */
1082 for_each_possible_cpu(i) { 1106 for_each_possible_cpu(i) {
1083 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); 1107 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
1084 end = (unsigned long) &__per_cpu_end + per_cpu_offset(i); 1108 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
1109 + per_cpu_offset(i);
1085 1110
1086 if ((addr >= start) && (addr < end)) 1111 if ((addr >= start) && (addr < end))
1087 return 1; 1112 return 1;
@@ -1181,6 +1206,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1181 struct lockdep_subclass_key *key; 1206 struct lockdep_subclass_key *key;
1182 struct list_head *hash_head; 1207 struct list_head *hash_head;
1183 struct lock_class *class; 1208 struct lock_class *class;
1209 unsigned long flags;
1184 1210
1185 class = look_up_lock_class(lock, subclass); 1211 class = look_up_lock_class(lock, subclass);
1186 if (likely(class)) 1212 if (likely(class))
@@ -1202,7 +1228,11 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1202 key = lock->key->subkeys + subclass; 1228 key = lock->key->subkeys + subclass;
1203 hash_head = classhashentry(key); 1229 hash_head = classhashentry(key);
1204 1230
1205 __raw_spin_lock(&hash_lock); 1231 raw_local_irq_save(flags);
1232 if (!graph_lock()) {
1233 raw_local_irq_restore(flags);
1234 return NULL;
1235 }
1206 /* 1236 /*
1207 * We have to do the hash-walk again, to avoid races 1237 * We have to do the hash-walk again, to avoid races
1208 * with another CPU: 1238 * with another CPU:
@@ -1215,8 +1245,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1215 * the hash: 1245 * the hash:
1216 */ 1246 */
1217 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { 1247 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
1218 __raw_spin_unlock(&hash_lock); 1248 if (!debug_locks_off_graph_unlock()) {
1219 debug_locks_off(); 1249 raw_local_irq_restore(flags);
1250 return NULL;
1251 }
1252 raw_local_irq_restore(flags);
1253
1220 printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); 1254 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
1221 printk("turning off the locking correctness validator.\n"); 1255 printk("turning off the locking correctness validator.\n");
1222 return NULL; 1256 return NULL;
@@ -1237,16 +1271,24 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
1237 list_add_tail_rcu(&class->hash_entry, hash_head); 1271 list_add_tail_rcu(&class->hash_entry, hash_head);
1238 1272
1239 if (verbose(class)) { 1273 if (verbose(class)) {
1240 __raw_spin_unlock(&hash_lock); 1274 graph_unlock();
1275 raw_local_irq_restore(flags);
1276
1241 printk("\nnew class %p: %s", class->key, class->name); 1277 printk("\nnew class %p: %s", class->key, class->name);
1242 if (class->name_version > 1) 1278 if (class->name_version > 1)
1243 printk("#%d", class->name_version); 1279 printk("#%d", class->name_version);
1244 printk("\n"); 1280 printk("\n");
1245 dump_stack(); 1281 dump_stack();
1246 __raw_spin_lock(&hash_lock); 1282
1283 raw_local_irq_save(flags);
1284 if (!graph_lock()) {
1285 raw_local_irq_restore(flags);
1286 return NULL;
1287 }
1247 } 1288 }
1248out_unlock_set: 1289out_unlock_set:
1249 __raw_spin_unlock(&hash_lock); 1290 graph_unlock();
1291 raw_local_irq_restore(flags);
1250 1292
1251 if (!subclass || force) 1293 if (!subclass || force)
1252 lock->class_cache = class; 1294 lock->class_cache = class;
@@ -1261,7 +1303,7 @@ out_unlock_set:
1261 * add it and return 0 - in this case the new dependency chain is 1303 * add it and return 0 - in this case the new dependency chain is
1262 * validated. If the key is already hashed, return 1. 1304 * validated. If the key is already hashed, return 1.
1263 */ 1305 */
1264static inline int lookup_chain_cache(u64 chain_key) 1306static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class)
1265{ 1307{
1266 struct list_head *hash_head = chainhashentry(chain_key); 1308 struct list_head *hash_head = chainhashentry(chain_key);
1267 struct lock_chain *chain; 1309 struct lock_chain *chain;
@@ -1275,34 +1317,32 @@ static inline int lookup_chain_cache(u64 chain_key)
1275 if (chain->chain_key == chain_key) { 1317 if (chain->chain_key == chain_key) {
1276cache_hit: 1318cache_hit:
1277 debug_atomic_inc(&chain_lookup_hits); 1319 debug_atomic_inc(&chain_lookup_hits);
1278 /* 1320 if (very_verbose(class))
1279 * In the debugging case, force redundant checking 1321 printk("\nhash chain already cached, key: %016Lx tail class: [%p] %s\n", chain_key, class->key, class->name);
1280 * by returning 1:
1281 */
1282#ifdef CONFIG_DEBUG_LOCKDEP
1283 __raw_spin_lock(&hash_lock);
1284 return 1;
1285#endif
1286 return 0; 1322 return 0;
1287 } 1323 }
1288 } 1324 }
1325 if (very_verbose(class))
1326 printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", chain_key, class->key, class->name);
1289 /* 1327 /*
1290 * Allocate a new chain entry from the static array, and add 1328 * Allocate a new chain entry from the static array, and add
1291 * it to the hash: 1329 * it to the hash:
1292 */ 1330 */
1293 __raw_spin_lock(&hash_lock); 1331 if (!graph_lock())
1332 return 0;
1294 /* 1333 /*
1295 * We have to walk the chain again locked - to avoid duplicates: 1334 * We have to walk the chain again locked - to avoid duplicates:
1296 */ 1335 */
1297 list_for_each_entry(chain, hash_head, entry) { 1336 list_for_each_entry(chain, hash_head, entry) {
1298 if (chain->chain_key == chain_key) { 1337 if (chain->chain_key == chain_key) {
1299 __raw_spin_unlock(&hash_lock); 1338 graph_unlock();
1300 goto cache_hit; 1339 goto cache_hit;
1301 } 1340 }
1302 } 1341 }
1303 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { 1342 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
1304 __raw_spin_unlock(&hash_lock); 1343 if (!debug_locks_off_graph_unlock())
1305 debug_locks_off(); 1344 return 0;
1345
1306 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); 1346 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
1307 printk("turning off the locking correctness validator.\n"); 1347 printk("turning off the locking correctness validator.\n");
1308 return 0; 1348 return 0;
@@ -1378,9 +1418,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1378 struct held_lock *this, int forwards, 1418 struct held_lock *this, int forwards,
1379 const char *irqclass) 1419 const char *irqclass)
1380{ 1420{
1381 __raw_spin_unlock(&hash_lock); 1421 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1382 debug_locks_off();
1383 if (debug_locks_silent)
1384 return 0; 1422 return 0;
1385 1423
1386 printk("\n=========================================================\n"); 1424 printk("\n=========================================================\n");
@@ -1450,7 +1488,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1450 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); 1488 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass);
1451} 1489}
1452 1490
1453static inline void print_irqtrace_events(struct task_struct *curr) 1491void print_irqtrace_events(struct task_struct *curr)
1454{ 1492{
1455 printk("irq event stamp: %u\n", curr->irq_events); 1493 printk("irq event stamp: %u\n", curr->irq_events);
1456 printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); 1494 printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
@@ -1463,19 +1501,13 @@ static inline void print_irqtrace_events(struct task_struct *curr)
1463 print_ip_sym(curr->softirq_disable_ip); 1501 print_ip_sym(curr->softirq_disable_ip);
1464} 1502}
1465 1503
1466#else
1467static inline void print_irqtrace_events(struct task_struct *curr)
1468{
1469}
1470#endif 1504#endif
1471 1505
1472static int 1506static int
1473print_usage_bug(struct task_struct *curr, struct held_lock *this, 1507print_usage_bug(struct task_struct *curr, struct held_lock *this,
1474 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) 1508 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
1475{ 1509{
1476 __raw_spin_unlock(&hash_lock); 1510 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1477 debug_locks_off();
1478 if (debug_locks_silent)
1479 return 0; 1511 return 0;
1480 1512
1481 printk("\n=================================\n"); 1513 printk("\n=================================\n");
@@ -1536,12 +1568,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1536 if (likely(this->class->usage_mask & new_mask)) 1568 if (likely(this->class->usage_mask & new_mask))
1537 return 1; 1569 return 1;
1538 1570
1539 __raw_spin_lock(&hash_lock); 1571 if (!graph_lock())
1572 return 0;
1540 /* 1573 /*
1541 * Make sure we didnt race: 1574 * Make sure we didnt race:
1542 */ 1575 */
1543 if (unlikely(this->class->usage_mask & new_mask)) { 1576 if (unlikely(this->class->usage_mask & new_mask)) {
1544 __raw_spin_unlock(&hash_lock); 1577 graph_unlock();
1545 return 1; 1578 return 1;
1546 } 1579 }
1547 1580
@@ -1727,15 +1760,16 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1727 debug_atomic_dec(&nr_unused_locks); 1760 debug_atomic_dec(&nr_unused_locks);
1728 break; 1761 break;
1729 default: 1762 default:
1730 debug_locks_off(); 1763 if (!debug_locks_off_graph_unlock())
1764 return 0;
1731 WARN_ON(1); 1765 WARN_ON(1);
1732 return 0; 1766 return 0;
1733 } 1767 }
1734 1768
1735 __raw_spin_unlock(&hash_lock); 1769 graph_unlock();
1736 1770
1737 /* 1771 /*
1738 * We must printk outside of the hash_lock: 1772 * We must printk outside of the graph_lock:
1739 */ 1773 */
1740 if (ret == 2) { 1774 if (ret == 2) {
1741 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); 1775 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
@@ -2133,9 +2167,9 @@ out_calc_hash:
2133 * We look up the chain_key and do the O(N^2) check and update of 2167 * We look up the chain_key and do the O(N^2) check and update of
2134 * the dependencies only if this is a new dependency chain. 2168 * the dependencies only if this is a new dependency chain.
2135 * (If lookup_chain_cache() returns with 1 it acquires 2169 * (If lookup_chain_cache() returns with 1 it acquires
2136 * hash_lock for us) 2170 * graph_lock for us)
2137 */ 2171 */
2138 if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) { 2172 if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) {
2139 /* 2173 /*
2140 * Check whether last held lock: 2174 * Check whether last held lock:
2141 * 2175 *
@@ -2166,7 +2200,7 @@ out_calc_hash:
2166 if (!chain_head && ret != 2) 2200 if (!chain_head && ret != 2)
2167 if (!check_prevs_add(curr, hlock)) 2201 if (!check_prevs_add(curr, hlock))
2168 return 0; 2202 return 0;
2169 __raw_spin_unlock(&hash_lock); 2203 graph_unlock();
2170 } 2204 }
2171 curr->lockdep_depth++; 2205 curr->lockdep_depth++;
2172 check_chain_key(curr); 2206 check_chain_key(curr);
@@ -2429,6 +2463,7 @@ EXPORT_SYMBOL_GPL(lock_release);
2429void lockdep_reset(void) 2463void lockdep_reset(void)
2430{ 2464{
2431 unsigned long flags; 2465 unsigned long flags;
2466 int i;
2432 2467
2433 raw_local_irq_save(flags); 2468 raw_local_irq_save(flags);
2434 current->curr_chain_key = 0; 2469 current->curr_chain_key = 0;
@@ -2439,6 +2474,8 @@ void lockdep_reset(void)
2439 nr_softirq_chains = 0; 2474 nr_softirq_chains = 0;
2440 nr_process_chains = 0; 2475 nr_process_chains = 0;
2441 debug_locks = 1; 2476 debug_locks = 1;
2477 for (i = 0; i < CHAINHASH_SIZE; i++)
2478 INIT_LIST_HEAD(chainhash_table + i);
2442 raw_local_irq_restore(flags); 2479 raw_local_irq_restore(flags);
2443} 2480}
2444 2481
@@ -2475,7 +2512,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
2475 int i; 2512 int i;
2476 2513
2477 raw_local_irq_save(flags); 2514 raw_local_irq_save(flags);
2478 __raw_spin_lock(&hash_lock); 2515 graph_lock();
2479 2516
2480 /* 2517 /*
2481 * Unhash all classes that were created by this module: 2518 * Unhash all classes that were created by this module:
@@ -2489,7 +2526,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
2489 zap_class(class); 2526 zap_class(class);
2490 } 2527 }
2491 2528
2492 __raw_spin_unlock(&hash_lock); 2529 graph_unlock();
2493 raw_local_irq_restore(flags); 2530 raw_local_irq_restore(flags);
2494} 2531}
2495 2532
@@ -2517,20 +2554,20 @@ void lockdep_reset_lock(struct lockdep_map *lock)
2517 * Debug check: in the end all mapped classes should 2554 * Debug check: in the end all mapped classes should
2518 * be gone. 2555 * be gone.
2519 */ 2556 */
2520 __raw_spin_lock(&hash_lock); 2557 graph_lock();
2521 for (i = 0; i < CLASSHASH_SIZE; i++) { 2558 for (i = 0; i < CLASSHASH_SIZE; i++) {
2522 head = classhash_table + i; 2559 head = classhash_table + i;
2523 if (list_empty(head)) 2560 if (list_empty(head))
2524 continue; 2561 continue;
2525 list_for_each_entry_safe(class, next, head, hash_entry) { 2562 list_for_each_entry_safe(class, next, head, hash_entry) {
2526 if (unlikely(class == lock->class_cache)) { 2563 if (unlikely(class == lock->class_cache)) {
2527 __raw_spin_unlock(&hash_lock); 2564 if (debug_locks_off_graph_unlock())
2528 DEBUG_LOCKS_WARN_ON(1); 2565 WARN_ON(1);
2529 goto out_restore; 2566 goto out_restore;
2530 } 2567 }
2531 } 2568 }
2532 } 2569 }
2533 __raw_spin_unlock(&hash_lock); 2570 graph_unlock();
2534 2571
2535out_restore: 2572out_restore:
2536 raw_local_irq_restore(flags); 2573 raw_local_irq_restore(flags);
@@ -2644,6 +2681,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
2644 } 2681 }
2645 local_irq_restore(flags); 2682 local_irq_restore(flags);
2646} 2683}
2684EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
2647 2685
2648static void print_held_locks_bug(struct task_struct *curr) 2686static void print_held_locks_bug(struct task_struct *curr)
2649{ 2687{
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index eab043c83bb2..8ce09bc4613d 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -20,7 +20,7 @@
20#define MAX_LOCKDEP_KEYS_BITS 11 20#define MAX_LOCKDEP_KEYS_BITS 11
21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) 21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
22 22
23#define MAX_LOCKDEP_CHAINS_BITS 13 23#define MAX_LOCKDEP_CHAINS_BITS 14
24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
25 25
26/* 26/*
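For reference, bumping MAX_LOCKDEP_CHAINS_BITS from 13 to 14 doubles the static chain table:

	/* before: MAX_LOCKDEP_CHAINS = 1UL << 13 =  8192 chains
	 * after:  MAX_LOCKDEP_CHAINS = 1UL << 14 = 16384 chains */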
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index f6e72eaab3fa..b554b40a4aa6 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v)
113 return 0; 113 return 0;
114} 114}
115 115
116static struct seq_operations lockdep_ops = { 116static const struct seq_operations lockdep_ops = {
117 .start = l_start, 117 .start = l_start,
118 .next = l_next, 118 .next = l_next,
119 .stop = l_stop, 119 .stop = l_stop,
@@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file)
135 return res; 135 return res;
136} 136}
137 137
138static struct file_operations proc_lockdep_operations = { 138static const struct file_operations proc_lockdep_operations = {
139 .open = lockdep_open, 139 .open = lockdep_open,
140 .read = seq_read, 140 .read = seq_read,
141 .llseek = seq_lseek, 141 .llseek = seq_lseek,
@@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file)
319 return single_open(file, lockdep_stats_show, NULL); 319 return single_open(file, lockdep_stats_show, NULL);
320} 320}
321 321
322static struct file_operations proc_lockdep_stats_operations = { 322static const struct file_operations proc_lockdep_stats_operations = {
323 .open = lockdep_stats_open, 323 .open = lockdep_stats_open,
324 .read = seq_read, 324 .read = seq_read,
325 .llseek = seq_lseek, 325 .llseek = seq_lseek,
diff --git a/kernel/module.c b/kernel/module.c
index 5072a943fe35..b565eaeff7e6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -34,10 +34,10 @@
34#include <linux/err.h> 34#include <linux/err.h>
35#include <linux/vermagic.h> 35#include <linux/vermagic.h>
36#include <linux/notifier.h> 36#include <linux/notifier.h>
37#include <linux/sched.h>
37#include <linux/stop_machine.h> 38#include <linux/stop_machine.h>
38#include <linux/device.h> 39#include <linux/device.h>
39#include <linux/string.h> 40#include <linux/string.h>
40#include <linux/sched.h>
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/unwind.h> 42#include <linux/unwind.h>
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -790,6 +790,19 @@ static struct module_attribute refcnt = {
790 .show = show_refcnt, 790 .show = show_refcnt,
791}; 791};
792 792
793void module_put(struct module *module)
794{
795 if (module) {
796 unsigned int cpu = get_cpu();
797 local_dec(&module->ref[cpu].count);
798 /* Maybe they're waiting for us to drop reference? */
799 if (unlikely(!module_is_live(module)))
800 wake_up_process(module->waiter);
801 put_cpu();
802 }
803}
804EXPORT_SYMBOL(module_put);
805
793#else /* !CONFIG_MODULE_UNLOAD */ 806#else /* !CONFIG_MODULE_UNLOAD */
794static void print_unload_info(struct seq_file *m, struct module *mod) 807static void print_unload_info(struct seq_file *m, struct module *mod)
795{ 808{
@@ -811,9 +824,34 @@ static inline void module_unload_init(struct module *mod)
811} 824}
812#endif /* CONFIG_MODULE_UNLOAD */ 825#endif /* CONFIG_MODULE_UNLOAD */
813 826
827static ssize_t show_initstate(struct module_attribute *mattr,
828 struct module *mod, char *buffer)
829{
830 const char *state = "unknown";
831
832 switch (mod->state) {
833 case MODULE_STATE_LIVE:
834 state = "live";
835 break;
836 case MODULE_STATE_COMING:
837 state = "coming";
838 break;
839 case MODULE_STATE_GOING:
840 state = "going";
841 break;
842 }
843 return sprintf(buffer, "%s\n", state);
844}
845
846static struct module_attribute initstate = {
847 .attr = { .name = "initstate", .mode = 0444, .owner = THIS_MODULE },
848 .show = show_initstate,
849};
850
814static struct module_attribute *modinfo_attrs[] = { 851static struct module_attribute *modinfo_attrs[] = {
815 &modinfo_version, 852 &modinfo_version,
816 &modinfo_srcversion, 853 &modinfo_srcversion,
854 &initstate,
817#ifdef CONFIG_MODULE_UNLOAD 855#ifdef CONFIG_MODULE_UNLOAD
818 &refcnt, 856 &refcnt,
819#endif 857#endif
@@ -1086,22 +1124,35 @@ static int mod_sysfs_setup(struct module *mod,
1086 goto out; 1124 goto out;
1087 kobj_set_kset_s(&mod->mkobj, module_subsys); 1125 kobj_set_kset_s(&mod->mkobj, module_subsys);
1088 mod->mkobj.mod = mod; 1126 mod->mkobj.mod = mod;
1089 err = kobject_register(&mod->mkobj.kobj); 1127
1128 /* delay uevent until full sysfs population */
1129 kobject_init(&mod->mkobj.kobj);
1130 err = kobject_add(&mod->mkobj.kobj);
1090 if (err) 1131 if (err)
1091 goto out; 1132 goto out;
1092 1133
1134 mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers");
1135 if (!mod->drivers_dir)
1136 goto out_unreg;
1137
1093 err = module_param_sysfs_setup(mod, kparam, num_params); 1138 err = module_param_sysfs_setup(mod, kparam, num_params);
1094 if (err) 1139 if (err)
1095 goto out_unreg; 1140 goto out_unreg_drivers;
1096 1141
1097 err = module_add_modinfo_attrs(mod); 1142 err = module_add_modinfo_attrs(mod);
1098 if (err) 1143 if (err)
1099 goto out_unreg; 1144 goto out_unreg_param;
1100 1145
1146 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1101 return 0; 1147 return 0;
1102 1148
1149out_unreg_drivers:
1150 kobject_unregister(mod->drivers_dir);
1151out_unreg_param:
1152 module_param_sysfs_remove(mod);
1103out_unreg: 1153out_unreg:
1104 kobject_unregister(&mod->mkobj.kobj); 1154 kobject_del(&mod->mkobj.kobj);
1155 kobject_put(&mod->mkobj.kobj);
1105out: 1156out:
1106 return err; 1157 return err;
1107} 1158}
@@ -1110,6 +1161,7 @@ static void mod_kobject_remove(struct module *mod)
1110{ 1161{
1111 module_remove_modinfo_attrs(mod); 1162 module_remove_modinfo_attrs(mod);
1112 module_param_sysfs_remove(mod); 1163 module_param_sysfs_remove(mod);
1164 kobject_unregister(mod->drivers_dir);
1113 1165
1114 kobject_unregister(&mod->mkobj.kobj); 1166 kobject_unregister(&mod->mkobj.kobj);
1115} 1167}
@@ -1718,7 +1770,7 @@ static struct module *load_module(void __user *umod,
1718 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1770 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1719 1771
1720 if (strcmp(mod->name, "ndiswrapper") == 0) 1772 if (strcmp(mod->name, "ndiswrapper") == 0)
1721 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1773 add_taint(TAINT_PROPRIETARY_MODULE);
1722 if (strcmp(mod->name, "driverloader") == 0) 1774 if (strcmp(mod->name, "driverloader") == 0)
1723 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1775 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1724 1776
@@ -2182,7 +2234,7 @@ static int m_show(struct seq_file *m, void *p)
2182 Where refcount is a number or -, and deps is a comma-separated list 2234 Where refcount is a number or -, and deps is a comma-separated list
2183 of depends or -. 2235 of depends or -.
2184*/ 2236*/
2185struct seq_operations modules_op = { 2237const struct seq_operations modules_op = {
2186 .start = m_start, 2238 .start = m_start,
2187 .next = m_next, 2239 .next = m_next,
2188 .stop = m_stop, 2240 .stop = m_stop,
@@ -2275,11 +2327,14 @@ void print_modules(void)
2275 2327
2276void module_add_driver(struct module *mod, struct device_driver *drv) 2328void module_add_driver(struct module *mod, struct device_driver *drv)
2277{ 2329{
2330 int no_warn;
2331
2278 if (!mod || !drv) 2332 if (!mod || !drv)
2279 return; 2333 return;
2280 2334
2281 /* Don't check return code; this call is idempotent */ 2335 /* Don't check return codes; these calls are idempotent */
2282 sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); 2336 no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module");
2337 no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, drv->name);
2283} 2338}
2284EXPORT_SYMBOL(module_add_driver); 2339EXPORT_SYMBOL(module_add_driver);
2285 2340
@@ -2288,6 +2343,8 @@ void module_remove_driver(struct device_driver *drv)
2288 if (!drv) 2343 if (!drv)
2289 return; 2344 return;
2290 sysfs_remove_link(&drv->kobj, "module"); 2345 sysfs_remove_link(&drv->kobj, "module");
2346 if (drv->owner && drv->owner->drivers_dir)
2347 sysfs_remove_link(drv->owner->drivers_dir, drv->name);
2291} 2348}
2292EXPORT_SYMBOL(module_remove_driver); 2349EXPORT_SYMBOL(module_remove_driver);
2293 2350
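The module.c changes above export a new "initstate" attribute and a per-module "drivers" directory under /sys/module/<name>/, and delay the KOBJ_ADD uevent until both are populated. A minimal userspace sketch for reading the new attribute (the helper program and its error handling are illustrative, not part of the patch):

#include <stdio.h>

int main(int argc, char **argv)
{
	char path[256], state[32];
	FILE *f;

	if (argc < 2)
		return 1;
	snprintf(path, sizeof(path), "/sys/module/%s/initstate", argv[1]);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(state, sizeof(state), f))
		printf("%s: %s", argv[1], state);	/* "live", "coming" or "going" */
	fclose(f);
	return 0;
}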
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 18651641a7b5..841539d72c55 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
77 77
78void debug_mutex_unlock(struct mutex *lock) 78void debug_mutex_unlock(struct mutex *lock)
79{ 79{
80 if (unlikely(!debug_locks))
81 return;
82
80 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 83 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
81 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 84 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 85 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 8c71cf72a497..e7cbbb82765b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
206} 206}
207 207
208EXPORT_SYMBOL_GPL(mutex_lock_nested); 208EXPORT_SYMBOL_GPL(mutex_lock_nested);
209
210int __sched
211mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
212{
213 might_sleep();
214 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass);
215}
216
217EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
209#endif 218#endif
210 219
211/* 220/*
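mutex_lock_interruptible_nested() gives the interruptible variant the same lockdep subclass annotation that mutex_lock_nested() already has. A hedged usage sketch (struct my_obj and the parent/child locking order are hypothetical):

#include <linux/mutex.h>
#include <linux/lockdep.h>
#include <linux/errno.h>

struct my_obj {
	struct mutex lock;
};

static int lock_parent_and_child(struct my_obj *parent, struct my_obj *child)
{
	int err;

	mutex_lock(&parent->lock);
	/* Same lock class, one level of nesting; may return -EINTR. */
	err = mutex_lock_interruptible_nested(&child->lock,
					      SINGLE_DEPTH_NESTING);
	if (err) {
		mutex_unlock(&parent->lock);
		return err;
	}
	/* ... work on both objects ... */
	mutex_unlock(&child->lock);
	mutex_unlock(&parent->lock);
	return 0;
}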
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 674aceb7335a..f5b9ee6f6bbb 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -17,8 +17,9 @@
17#include <linux/version.h> 17#include <linux/version.h>
18#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
19#include <linux/init_task.h> 19#include <linux/init_task.h>
20#include <linux/namespace.h> 20#include <linux/mnt_namespace.h>
21#include <linux/utsname.h> 21#include <linux/utsname.h>
22#include <linux/pid_namespace.h>
22 23
23struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 24struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
24 25
@@ -60,12 +61,14 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
60 struct nsproxy *ns = clone_namespaces(orig); 61 struct nsproxy *ns = clone_namespaces(orig);
61 62
62 if (ns) { 63 if (ns) {
63 if (ns->namespace) 64 if (ns->mnt_ns)
64 get_namespace(ns->namespace); 65 get_mnt_ns(ns->mnt_ns);
65 if (ns->uts_ns) 66 if (ns->uts_ns)
66 get_uts_ns(ns->uts_ns); 67 get_uts_ns(ns->uts_ns);
67 if (ns->ipc_ns) 68 if (ns->ipc_ns)
68 get_ipc_ns(ns->ipc_ns); 69 get_ipc_ns(ns->ipc_ns);
70 if (ns->pid_ns)
71 get_pid_ns(ns->pid_ns);
69 } 72 }
70 73
71 return ns; 74 return ns;
@@ -97,7 +100,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)
97 100
98 tsk->nsproxy = new_ns; 101 tsk->nsproxy = new_ns;
99 102
100 err = copy_namespace(flags, tsk); 103 err = copy_mnt_ns(flags, tsk);
101 if (err) 104 if (err)
102 goto out_ns; 105 goto out_ns;
103 106
@@ -109,16 +112,23 @@ int copy_namespaces(int flags, struct task_struct *tsk)
109 if (err) 112 if (err)
110 goto out_ipc; 113 goto out_ipc;
111 114
115 err = copy_pid_ns(flags, tsk);
116 if (err)
117 goto out_pid;
118
112out: 119out:
113 put_nsproxy(old_ns); 120 put_nsproxy(old_ns);
114 return err; 121 return err;
115 122
123out_pid:
124 if (new_ns->ipc_ns)
125 put_ipc_ns(new_ns->ipc_ns);
116out_ipc: 126out_ipc:
117 if (new_ns->uts_ns) 127 if (new_ns->uts_ns)
118 put_uts_ns(new_ns->uts_ns); 128 put_uts_ns(new_ns->uts_ns);
119out_uts: 129out_uts:
120 if (new_ns->namespace) 130 if (new_ns->mnt_ns)
121 put_namespace(new_ns->namespace); 131 put_mnt_ns(new_ns->mnt_ns);
122out_ns: 132out_ns:
123 tsk->nsproxy = old_ns; 133 tsk->nsproxy = old_ns;
124 kfree(new_ns); 134 kfree(new_ns);
@@ -127,11 +137,13 @@ out_ns:
127 137
128void free_nsproxy(struct nsproxy *ns) 138void free_nsproxy(struct nsproxy *ns)
129{ 139{
130 if (ns->namespace) 140 if (ns->mnt_ns)
131 put_namespace(ns->namespace); 141 put_mnt_ns(ns->mnt_ns);
132 if (ns->uts_ns) 142 if (ns->uts_ns)
133 put_uts_ns(ns->uts_ns); 143 put_uts_ns(ns->uts_ns);
134 if (ns->ipc_ns) 144 if (ns->ipc_ns)
135 put_ipc_ns(ns->ipc_ns); 145 put_ipc_ns(ns->ipc_ns);
136 kfree(ns); 146 if (ns->pid_ns)
147 put_pid_ns(ns->pid_ns);
148 kfree(ns);
137} 149}
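The new pid_ns handling above leans on get_pid_ns()/put_pid_ns() helpers declared in linux/pid_namespace.h (not shown in this diff). They are assumed to be thin kref wrappers along these lines; note that init_pid_ns starts with a refcount of 2, so it never reaches free_pid_ns():

#include <linux/kref.h>

extern void free_pid_ns(struct kref *kref);	/* added in kernel/pid.c below */

static inline void get_pid_ns(struct pid_namespace *ns)
{
	kref_get(&ns->kref);
}

static inline void put_pid_ns(struct pid_namespace *ns)
{
	kref_put(&ns->kref, free_pid_ns);
}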
diff --git a/kernel/pid.c b/kernel/pid.c
index b914392085f9..2efe9d8d367b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -26,12 +26,12 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/hash.h> 28#include <linux/hash.h>
29#include <linux/pspace.h> 29#include <linux/pid_namespace.h>
30 30
31#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) 31#define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
32static struct hlist_head *pid_hash; 32static struct hlist_head *pid_hash;
33static int pidhash_shift; 33static int pidhash_shift;
34static kmem_cache_t *pid_cachep; 34static struct kmem_cache *pid_cachep;
35 35
36int pid_max = PID_MAX_DEFAULT; 36int pid_max = PID_MAX_DEFAULT;
37 37
@@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT;
43#define BITS_PER_PAGE (PAGE_SIZE*8) 43#define BITS_PER_PAGE (PAGE_SIZE*8)
44#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) 44#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
45 45
46static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) 46static inline int mk_pid(struct pid_namespace *pid_ns,
47 struct pidmap *map, int off)
47{ 48{
48 return (map - pspace->pidmap)*BITS_PER_PAGE + off; 49 return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
49} 50}
50 51
51#define find_next_offset(map, off) \ 52#define find_next_offset(map, off) \
@@ -57,11 +58,15 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off)
57 * value does not cause lots of bitmaps to be allocated, but 58 * value does not cause lots of bitmaps to be allocated, but
58 * the scheme scales to up to 4 million PIDs, runtime. 59 * the scheme scales to up to 4 million PIDs, runtime.
59 */ 60 */
60struct pspace init_pspace = { 61struct pid_namespace init_pid_ns = {
62 .kref = {
63 .refcount = ATOMIC_INIT(2),
64 },
61 .pidmap = { 65 .pidmap = {
62 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } 66 [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
63 }, 67 },
64 .last_pid = 0 68 .last_pid = 0,
69 .child_reaper = &init_task
65}; 70};
66 71
67/* 72/*
@@ -80,25 +85,25 @@ struct pspace init_pspace = {
80 85
81static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 86static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
82 87
83static fastcall void free_pidmap(struct pspace *pspace, int pid) 88static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid)
84{ 89{
85 struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; 90 struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE;
86 int offset = pid & BITS_PER_PAGE_MASK; 91 int offset = pid & BITS_PER_PAGE_MASK;
87 92
88 clear_bit(offset, map->page); 93 clear_bit(offset, map->page);
89 atomic_inc(&map->nr_free); 94 atomic_inc(&map->nr_free);
90} 95}
91 96
92static int alloc_pidmap(struct pspace *pspace) 97static int alloc_pidmap(struct pid_namespace *pid_ns)
93{ 98{
94 int i, offset, max_scan, pid, last = pspace->last_pid; 99 int i, offset, max_scan, pid, last = pid_ns->last_pid;
95 struct pidmap *map; 100 struct pidmap *map;
96 101
97 pid = last + 1; 102 pid = last + 1;
98 if (pid >= pid_max) 103 if (pid >= pid_max)
99 pid = RESERVED_PIDS; 104 pid = RESERVED_PIDS;
100 offset = pid & BITS_PER_PAGE_MASK; 105 offset = pid & BITS_PER_PAGE_MASK;
101 map = &pspace->pidmap[pid/BITS_PER_PAGE]; 106 map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
102 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; 107 max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset;
103 for (i = 0; i <= max_scan; ++i) { 108 for (i = 0; i <= max_scan; ++i) {
104 if (unlikely(!map->page)) { 109 if (unlikely(!map->page)) {
@@ -120,11 +125,11 @@ static int alloc_pidmap(struct pspace *pspace)
120 do { 125 do {
121 if (!test_and_set_bit(offset, map->page)) { 126 if (!test_and_set_bit(offset, map->page)) {
122 atomic_dec(&map->nr_free); 127 atomic_dec(&map->nr_free);
123 pspace->last_pid = pid; 128 pid_ns->last_pid = pid;
124 return pid; 129 return pid;
125 } 130 }
126 offset = find_next_offset(map, offset); 131 offset = find_next_offset(map, offset);
127 pid = mk_pid(pspace, map, offset); 132 pid = mk_pid(pid_ns, map, offset);
128 /* 133 /*
129 * find_next_offset() found a bit, the pid from it 134 * find_next_offset() found a bit, the pid from it
130 * is in-bounds, and if we fell back to the last 135 * is in-bounds, and if we fell back to the last
@@ -135,34 +140,34 @@ static int alloc_pidmap(struct pspace *pspace)
135 (i != max_scan || pid < last || 140 (i != max_scan || pid < last ||
136 !((last+1) & BITS_PER_PAGE_MASK))); 141 !((last+1) & BITS_PER_PAGE_MASK)));
137 } 142 }
138 if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 143 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
139 ++map; 144 ++map;
140 offset = 0; 145 offset = 0;
141 } else { 146 } else {
142 map = &pspace->pidmap[0]; 147 map = &pid_ns->pidmap[0];
143 offset = RESERVED_PIDS; 148 offset = RESERVED_PIDS;
144 if (unlikely(last == offset)) 149 if (unlikely(last == offset))
145 break; 150 break;
146 } 151 }
147 pid = mk_pid(pspace, map, offset); 152 pid = mk_pid(pid_ns, map, offset);
148 } 153 }
149 return -1; 154 return -1;
150} 155}
151 156
152static int next_pidmap(struct pspace *pspace, int last) 157static int next_pidmap(struct pid_namespace *pid_ns, int last)
153{ 158{
154 int offset; 159 int offset;
155 struct pidmap *map, *end; 160 struct pidmap *map, *end;
156 161
157 offset = (last + 1) & BITS_PER_PAGE_MASK; 162 offset = (last + 1) & BITS_PER_PAGE_MASK;
158 map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; 163 map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
159 end = &pspace->pidmap[PIDMAP_ENTRIES]; 164 end = &pid_ns->pidmap[PIDMAP_ENTRIES];
160 for (; map < end; map++, offset = 0) { 165 for (; map < end; map++, offset = 0) {
161 if (unlikely(!map->page)) 166 if (unlikely(!map->page))
162 continue; 167 continue;
163 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); 168 offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
164 if (offset < BITS_PER_PAGE) 169 if (offset < BITS_PER_PAGE)
165 return mk_pid(pspace, map, offset); 170 return mk_pid(pid_ns, map, offset);
166 } 171 }
167 return -1; 172 return -1;
168} 173}
@@ -192,7 +197,7 @@ fastcall void free_pid(struct pid *pid)
192 hlist_del_rcu(&pid->pid_chain); 197 hlist_del_rcu(&pid->pid_chain);
193 spin_unlock_irqrestore(&pidmap_lock, flags); 198 spin_unlock_irqrestore(&pidmap_lock, flags);
194 199
195 free_pidmap(&init_pspace, pid->nr); 200 free_pidmap(current->nsproxy->pid_ns, pid->nr);
196 call_rcu(&pid->rcu, delayed_put_pid); 201 call_rcu(&pid->rcu, delayed_put_pid);
197} 202}
198 203
@@ -206,7 +211,7 @@ struct pid *alloc_pid(void)
206 if (!pid) 211 if (!pid)
207 goto out; 212 goto out;
208 213
209 nr = alloc_pidmap(&init_pspace); 214 nr = alloc_pidmap(current->nsproxy->pid_ns);
210 if (nr < 0) 215 if (nr < 0)
211 goto out_free; 216 goto out_free;
212 217
@@ -348,13 +353,33 @@ struct pid *find_ge_pid(int nr)
348 pid = find_pid(nr); 353 pid = find_pid(nr);
349 if (pid) 354 if (pid)
350 break; 355 break;
351 nr = next_pidmap(&init_pspace, nr); 356 nr = next_pidmap(current->nsproxy->pid_ns, nr);
352 } while (nr > 0); 357 } while (nr > 0);
353 358
354 return pid; 359 return pid;
355} 360}
356EXPORT_SYMBOL_GPL(find_get_pid); 361EXPORT_SYMBOL_GPL(find_get_pid);
357 362
363int copy_pid_ns(int flags, struct task_struct *tsk)
364{
365 struct pid_namespace *old_ns = tsk->nsproxy->pid_ns;
366 int err = 0;
367
368 if (!old_ns)
369 return 0;
370
371 get_pid_ns(old_ns);
372 return err;
373}
374
375void free_pid_ns(struct kref *kref)
376{
377 struct pid_namespace *ns;
378
379 ns = container_of(kref, struct pid_namespace, kref);
380 kfree(ns);
381}
382
358/* 383/*
359 * The pid hash table is scaled according to the amount of memory in the 384 * The pid hash table is scaled according to the amount of memory in the
360 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or 385 * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -382,10 +407,10 @@ void __init pidhash_init(void)
382 407
383void __init pidmap_init(void) 408void __init pidmap_init(void)
384{ 409{
385 init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 410 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
386 /* Reserve PID 0. We never call free_pidmap(0) */ 411 /* Reserve PID 0. We never call free_pidmap(0) */
387 set_bit(0, init_pspace.pidmap[0].page); 412 set_bit(0, init_pid_ns.pidmap[0].page);
388 atomic_dec(&init_pspace.pidmap[0].nr_free); 413 atomic_dec(&init_pid_ns.pidmap[0].nr_free);
389 414
390 pid_cachep = kmem_cache_create("pid", sizeof(struct pid), 415 pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
391 __alignof__(struct pid), 416 __alignof__(struct pid),
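For reference, the mk_pid() arithmetic used throughout the pid.c hunks, worked through for a 4 KiB page size:

	/* pid 40000 with PAGE_SIZE = 4096:
	 *   BITS_PER_PAGE         = 4096 * 8             = 32768
	 *   map                   = &pidmap[40000/32768] = &pidmap[1]
	 *   offset                = 40000 & 32767        = 7232
	 *   mk_pid(ns, map, 7232) = 1 * 32768 + 7232     = 40000
	 */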
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 9cbb5d1be06f..5fe87de10ff0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -70,7 +70,7 @@
70/* 70/*
71 * Lets keep our timers in a slab cache :-) 71 * Lets keep our timers in a slab cache :-)
72 */ 72 */
73static kmem_cache_t *posix_timers_cache; 73static struct kmem_cache *posix_timers_cache;
74static struct idr posix_timers_id; 74static struct idr posix_timers_id;
75static DEFINE_SPINLOCK(idr_lock); 75static DEFINE_SPINLOCK(idr_lock);
76 76
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 825068ca3479..ed296225dcd4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -20,13 +20,14 @@ config PM
20 sending the processor to sleep and saving power. 20 sending the processor to sleep and saving power.
21 21
22config PM_LEGACY 22config PM_LEGACY
23 bool "Legacy Power Management API" 23 bool "Legacy Power Management API (DEPRECATED)"
24 depends on PM 24 depends on PM
25 default y 25 default n
26 ---help--- 26 ---help---
27 Support for pm_register() and friends. 27 Support for pm_register() and friends. This old API is obsoleted
28 by the driver model.
28 29
29 If unsure, say Y. 30 If unsure, say N.
30 31
31config PM_DEBUG 32config PM_DEBUG
32 bool "Power Management Debug Support" 33 bool "Power Management Debug Support"
@@ -78,7 +79,7 @@ config PM_SYSFS_DEPRECATED
78 79
79config SOFTWARE_SUSPEND 80config SOFTWARE_SUSPEND
80 bool "Software Suspend" 81 bool "Software Suspend"
81 depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) 82 depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
82 ---help--- 83 ---help---
83 Enable the possibility of suspending the machine. 84 Enable the possibility of suspending the machine.
84 It doesn't need ACPI or APM. 85 It doesn't need ACPI or APM.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index ae6bbc903b7d..88fc5d7ac737 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -20,6 +20,7 @@
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/console.h> 21#include <linux/console.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/freezer.h>
23 24
24#include "power.h" 25#include "power.h"
25 26
@@ -27,6 +28,23 @@
27static int noresume = 0; 28static int noresume = 0;
28char resume_file[256] = CONFIG_PM_STD_PARTITION; 29char resume_file[256] = CONFIG_PM_STD_PARTITION;
29dev_t swsusp_resume_device; 30dev_t swsusp_resume_device;
31sector_t swsusp_resume_block;
32
33/**
34 * platform_prepare - prepare the machine for hibernation using the
35 * platform driver if so configured and return an error code if it fails
36 */
37
38static inline int platform_prepare(void)
39{
40 int error = 0;
41
42 if (pm_disk_mode == PM_DISK_PLATFORM) {
43 if (pm_ops && pm_ops->prepare)
44 error = pm_ops->prepare(PM_SUSPEND_DISK);
45 }
46 return error;
47}
30 48
31/** 49/**
32 * power_down - Shut machine down for hibernate. 50 * power_down - Shut machine down for hibernate.
@@ -40,13 +58,11 @@ dev_t swsusp_resume_device;
40 58
41static void power_down(suspend_disk_method_t mode) 59static void power_down(suspend_disk_method_t mode)
42{ 60{
43 int error = 0;
44
45 switch(mode) { 61 switch(mode) {
46 case PM_DISK_PLATFORM: 62 case PM_DISK_PLATFORM:
47 if (pm_ops && pm_ops->enter) { 63 if (pm_ops && pm_ops->enter) {
48 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 64 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
49 error = pm_ops->enter(PM_SUSPEND_DISK); 65 pm_ops->enter(PM_SUSPEND_DISK);
50 break; 66 break;
51 } 67 }
52 case PM_DISK_SHUTDOWN: 68 case PM_DISK_SHUTDOWN:
@@ -73,7 +89,7 @@ static inline void platform_finish(void)
73 89
74static int prepare_processes(void) 90static int prepare_processes(void)
75{ 91{
76 int error; 92 int error = 0;
77 93
78 pm_prepare_console(); 94 pm_prepare_console();
79 95
@@ -86,12 +102,24 @@ static int prepare_processes(void)
86 goto thaw; 102 goto thaw;
87 } 103 }
88 104
105 if (pm_disk_mode == PM_DISK_TESTPROC) {
106 printk("swsusp debug: Waiting for 5 seconds.\n");
107 mdelay(5000);
108 goto thaw;
109 }
110
111 error = platform_prepare();
112 if (error)
113 goto thaw;
114
89 /* Free memory before shutting down devices. */ 115 /* Free memory before shutting down devices. */
90 if (!(error = swsusp_shrink_memory())) 116 if (!(error = swsusp_shrink_memory()))
91 return 0; 117 return 0;
92thaw: 118
119 platform_finish();
120 thaw:
93 thaw_processes(); 121 thaw_processes();
94enable_cpus: 122 enable_cpus:
95 enable_nonboot_cpus(); 123 enable_nonboot_cpus();
96 pm_restore_console(); 124 pm_restore_console();
97 return error; 125 return error;
@@ -122,13 +150,21 @@ int pm_suspend_disk(void)
122 if (error) 150 if (error)
123 return error; 151 return error;
124 152
153 if (pm_disk_mode == PM_DISK_TESTPROC)
154 return 0;
155
125 suspend_console(); 156 suspend_console();
126 error = device_suspend(PMSG_FREEZE); 157 error = device_suspend(PMSG_FREEZE);
127 if (error) { 158 if (error) {
128 resume_console(); 159 resume_console();
129 printk("Some devices failed to suspend\n"); 160 printk("Some devices failed to suspend\n");
130 unprepare_processes(); 161 goto Thaw;
131 return error; 162 }
163
164 if (pm_disk_mode == PM_DISK_TEST) {
165 printk("swsusp debug: Waiting for 5 seconds.\n");
166 mdelay(5000);
167 goto Done;
132 } 168 }
133 169
134 pr_debug("PM: snapshotting memory.\n"); 170 pr_debug("PM: snapshotting memory.\n");
@@ -145,16 +181,17 @@ int pm_suspend_disk(void)
145 power_down(pm_disk_mode); 181 power_down(pm_disk_mode);
146 else { 182 else {
147 swsusp_free(); 183 swsusp_free();
148 unprepare_processes(); 184 goto Thaw;
149 return error;
150 } 185 }
151 } else 186 } else {
152 pr_debug("PM: Image restored successfully.\n"); 187 pr_debug("PM: Image restored successfully.\n");
188 }
153 189
154 swsusp_free(); 190 swsusp_free();
155 Done: 191 Done:
156 device_resume(); 192 device_resume();
157 resume_console(); 193 resume_console();
194 Thaw:
158 unprepare_processes(); 195 unprepare_processes();
159 return error; 196 return error;
160} 197}
@@ -176,10 +213,10 @@ static int software_resume(void)
176{ 213{
177 int error; 214 int error;
178 215
179 down(&pm_sem); 216 mutex_lock(&pm_mutex);
180 if (!swsusp_resume_device) { 217 if (!swsusp_resume_device) {
181 if (!strlen(resume_file)) { 218 if (!strlen(resume_file)) {
182 up(&pm_sem); 219 mutex_unlock(&pm_mutex);
183 return -ENOENT; 220 return -ENOENT;
184 } 221 }
185 swsusp_resume_device = name_to_dev_t(resume_file); 222 swsusp_resume_device = name_to_dev_t(resume_file);
@@ -194,7 +231,7 @@ static int software_resume(void)
194 * FIXME: If noresume is specified, we need to find the partition 231 * FIXME: If noresume is specified, we need to find the partition
195 * and reset it back to normal swap space. 232 * and reset it back to normal swap space.
196 */ 233 */
197 up(&pm_sem); 234 mutex_unlock(&pm_mutex);
198 return 0; 235 return 0;
199 } 236 }
200 237
@@ -238,7 +275,7 @@ static int software_resume(void)
238 unprepare_processes(); 275 unprepare_processes();
239 Done: 276 Done:
240 /* For success case, the suspend path will release the lock */ 277 /* For success case, the suspend path will release the lock */
241 up(&pm_sem); 278 mutex_unlock(&pm_mutex);
242 pr_debug("PM: Resume from disk failed.\n"); 279 pr_debug("PM: Resume from disk failed.\n");
243 return 0; 280 return 0;
244} 281}
@@ -251,6 +288,8 @@ static const char * const pm_disk_modes[] = {
251 [PM_DISK_PLATFORM] = "platform", 288 [PM_DISK_PLATFORM] = "platform",
252 [PM_DISK_SHUTDOWN] = "shutdown", 289 [PM_DISK_SHUTDOWN] = "shutdown",
253 [PM_DISK_REBOOT] = "reboot", 290 [PM_DISK_REBOOT] = "reboot",
291 [PM_DISK_TEST] = "test",
292 [PM_DISK_TESTPROC] = "testproc",
254}; 293};
255 294
256/** 295/**
@@ -297,7 +336,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
297 p = memchr(buf, '\n', n); 336 p = memchr(buf, '\n', n);
298 len = p ? p - buf : n; 337 len = p ? p - buf : n;
299 338
300 down(&pm_sem); 339 mutex_lock(&pm_mutex);
301 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { 340 for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
302 if (!strncmp(buf, pm_disk_modes[i], len)) { 341 if (!strncmp(buf, pm_disk_modes[i], len)) {
303 mode = i; 342 mode = i;
@@ -305,21 +344,23 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
305 } 344 }
306 } 345 }
307 if (mode) { 346 if (mode) {
308 if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT) 347 if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT ||
348 mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) {
309 pm_disk_mode = mode; 349 pm_disk_mode = mode;
310 else { 350 } else {
311 if (pm_ops && pm_ops->enter && 351 if (pm_ops && pm_ops->enter &&
312 (mode == pm_ops->pm_disk_mode)) 352 (mode == pm_ops->pm_disk_mode))
313 pm_disk_mode = mode; 353 pm_disk_mode = mode;
314 else 354 else
315 error = -EINVAL; 355 error = -EINVAL;
316 } 356 }
317 } else 357 } else {
318 error = -EINVAL; 358 error = -EINVAL;
359 }
319 360
320 pr_debug("PM: suspend-to-disk mode set to '%s'\n", 361 pr_debug("PM: suspend-to-disk mode set to '%s'\n",
321 pm_disk_modes[mode]); 362 pm_disk_modes[mode]);
322 up(&pm_sem); 363 mutex_unlock(&pm_mutex);
323 return error ? error : n; 364 return error ? error : n;
324} 365}
325 366
@@ -344,14 +385,14 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
344 if (maj != MAJOR(res) || min != MINOR(res)) 385 if (maj != MAJOR(res) || min != MINOR(res))
345 goto out; 386 goto out;
346 387
347 down(&pm_sem); 388 mutex_lock(&pm_mutex);
348 swsusp_resume_device = res; 389 swsusp_resume_device = res;
349 up(&pm_sem); 390 mutex_unlock(&pm_mutex);
350 printk("Attempting manual resume\n"); 391 printk("Attempting manual resume\n");
351 noresume = 0; 392 noresume = 0;
352 software_resume(); 393 software_resume();
353 ret = n; 394 ret = n;
354out: 395 out:
355 return ret; 396 return ret;
356} 397}
357 398
@@ -406,6 +447,19 @@ static int __init resume_setup(char *str)
406 return 1; 447 return 1;
407} 448}
408 449
450static int __init resume_offset_setup(char *str)
451{
452 unsigned long long offset;
453
454 if (noresume)
455 return 1;
456
457 if (sscanf(str, "%llu", &offset) == 1)
458 swsusp_resume_block = offset;
459
460 return 1;
461}
462
409static int __init noresume_setup(char *str) 463static int __init noresume_setup(char *str)
410{ 464{
411 noresume = 1; 465 noresume = 1;
@@ -413,4 +467,5 @@ static int __init noresume_setup(char *str)
413} 467}
414 468
415__setup("noresume", noresume_setup); 469__setup("noresume", noresume_setup);
470__setup("resume_offset=", resume_offset_setup);
416__setup("resume=", resume_setup); 471__setup("resume=", resume_setup);
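The disk.c changes add two debugging targets, "test" (freeze tasks and suspend devices, wait five seconds, resume) and "testproc" (freeze tasks only), plus a resume_offset= boot parameter (e.g. booting with resume=<swap device> resume_offset=<offset>) for images stored at an offset inside a swap file. A hypothetical userspace sketch that exercises the "test" mode; it needs root and CONFIG_SOFTWARE_SUSPEND, and uses the standard sysfs files:

#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	if (write_str("/sys/power/disk", "test\n"))	/* select the new test mode */
		return 1;
	return write_str("/sys/power/state", "disk\n") ? 1 : 0;
}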
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1210961a5aa7..ff3a6182f5f0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,6 +8,7 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/module.h>
11#include <linux/suspend.h> 12#include <linux/suspend.h>
12#include <linux/kobject.h> 13#include <linux/kobject.h>
13#include <linux/string.h> 14#include <linux/string.h>
@@ -18,13 +19,14 @@
18#include <linux/console.h> 19#include <linux/console.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/resume-trace.h> 21#include <linux/resume-trace.h>
22#include <linux/freezer.h>
21 23
22#include "power.h" 24#include "power.h"
23 25
24/*This is just an arbitrary number */ 26/*This is just an arbitrary number */
25#define FREE_PAGE_NUMBER (100) 27#define FREE_PAGE_NUMBER (100)
26 28
27DECLARE_MUTEX(pm_sem); 29DEFINE_MUTEX(pm_mutex);
28 30
29struct pm_ops *pm_ops; 31struct pm_ops *pm_ops;
30suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM; 32suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM;
@@ -36,9 +38,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM;
36 38
37void pm_set_ops(struct pm_ops * ops) 39void pm_set_ops(struct pm_ops * ops)
38{ 40{
39 down(&pm_sem); 41 mutex_lock(&pm_mutex);
40 pm_ops = ops; 42 pm_ops = ops;
41 up(&pm_sem); 43 mutex_unlock(&pm_mutex);
42} 44}
43 45
44 46
@@ -182,7 +184,7 @@ static int enter_state(suspend_state_t state)
182 184
183 if (!valid_state(state)) 185 if (!valid_state(state))
184 return -ENODEV; 186 return -ENODEV;
185 if (down_trylock(&pm_sem)) 187 if (!mutex_trylock(&pm_mutex))
186 return -EBUSY; 188 return -EBUSY;
187 189
188 if (state == PM_SUSPEND_DISK) { 190 if (state == PM_SUSPEND_DISK) {
@@ -200,7 +202,7 @@ static int enter_state(suspend_state_t state)
200 pr_debug("PM: Finishing wakeup.\n"); 202 pr_debug("PM: Finishing wakeup.\n");
201 suspend_finish(state); 203 suspend_finish(state);
202 Unlock: 204 Unlock:
203 up(&pm_sem); 205 mutex_unlock(&pm_mutex);
204 return error; 206 return error;
205} 207}
206 208
@@ -229,7 +231,7 @@ int pm_suspend(suspend_state_t state)
229 return -EINVAL; 231 return -EINVAL;
230} 232}
231 233
232 234EXPORT_SYMBOL(pm_suspend);
233 235
234decl_subsys(power,NULL,NULL); 236decl_subsys(power,NULL,NULL);
235 237
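main.c converts the pm_sem semaphore into pm_mutex. The only subtle part of such a conversion is the trylock path, because down_trylock() returns nonzero on failure while mutex_trylock() returns nonzero on success; a generic sketch of the pattern (names are made up):

#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(example_mutex);		/* was: DECLARE_MUTEX(example_sem) */

static int example_enter(void)
{
	if (!mutex_trylock(&example_mutex))	/* was: if (down_trylock(&example_sem)) */
		return -EBUSY;
	/* ... critical section ... */
	mutex_unlock(&example_mutex);		/* was: up(&example_sem) */
	return 0;
}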
diff --git a/kernel/power/power.h b/kernel/power/power.h
index bfe999f7b272..eb461b816bf4 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void)
22 return -EPERM; 22 return -EPERM;
23} 23}
24#endif 24#endif
25extern struct semaphore pm_sem; 25
26extern struct mutex pm_mutex;
27
26#define power_attr(_name) \ 28#define power_attr(_name) \
27static struct subsys_attribute _name##_attr = { \ 29static struct subsys_attribute _name##_attr = { \
28 .attr = { \ 30 .attr = { \
@@ -42,6 +44,7 @@ extern const void __nosave_begin, __nosave_end;
42extern unsigned long image_size; 44extern unsigned long image_size;
43extern int in_suspend; 45extern int in_suspend;
44extern dev_t swsusp_resume_device; 46extern dev_t swsusp_resume_device;
47extern sector_t swsusp_resume_block;
45 48
46extern asmlinkage int swsusp_arch_suspend(void); 49extern asmlinkage int swsusp_arch_suspend(void);
47extern asmlinkage int swsusp_arch_resume(void); 50extern asmlinkage int swsusp_arch_resume(void);
@@ -102,8 +105,18 @@ struct snapshot_handle {
102extern unsigned int snapshot_additional_pages(struct zone *zone); 105extern unsigned int snapshot_additional_pages(struct zone *zone);
103extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 106extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
104extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 107extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
108extern void snapshot_write_finalize(struct snapshot_handle *handle);
105extern int snapshot_image_loaded(struct snapshot_handle *handle); 109extern int snapshot_image_loaded(struct snapshot_handle *handle);
106extern void snapshot_free_unused_memory(struct snapshot_handle *handle); 110
111/*
112 * This structure is used to pass the values needed for the identification
113 * of the resume swap area from a user space to the kernel via the
114 * SNAPSHOT_SET_SWAP_AREA ioctl
115 */
116struct resume_swap_area {
117 loff_t offset;
118 u_int32_t dev;
119} __attribute__((packed));
107 120
108#define SNAPSHOT_IOC_MAGIC '3' 121#define SNAPSHOT_IOC_MAGIC '3'
109#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) 122#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
@@ -117,7 +130,14 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
117#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) 130#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
118#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) 131#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
119#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) 132#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
120#define SNAPSHOT_IOC_MAXNR 11 133#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
134#define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \
135 struct resume_swap_area)
136#define SNAPSHOT_IOC_MAXNR 13
137
138#define PMOPS_PREPARE 1
139#define PMOPS_ENTER 2
140#define PMOPS_FINISH 3
121 141
122/** 142/**
123 * The bitmap is used for tracing allocated swap pages 143 * The bitmap is used for tracing allocated swap pages
@@ -141,7 +161,7 @@ struct bitmap_page {
141 161
142extern void free_bitmap(struct bitmap_page *bitmap); 162extern void free_bitmap(struct bitmap_page *bitmap);
143extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); 163extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
144extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); 164extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap);
145extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); 165extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
146 166
147extern int swsusp_check(void); 167extern int swsusp_check(void);
@@ -153,3 +173,7 @@ extern int swsusp_read(void);
153extern int swsusp_write(void); 173extern int swsusp_write(void);
154extern void swsusp_close(void); 174extern void swsusp_close(void);
155extern int suspend_enter(suspend_state_t state); 175extern int suspend_enter(suspend_state_t state);
176
177struct timeval;
178extern void swsusp_show_speed(struct timeval *, struct timeval *,
179 unsigned int, char *);
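The new SNAPSHOT_SET_SWAP_AREA ioctl lets a userspace suspend tool hand the kernel both the swap device and an offset inside it, instead of a device number alone. A hypothetical userspace sketch (the device path is an example, and real tools such as uswsusp are more careful about how the device number is encoded):

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

struct resume_swap_area {
	loff_t offset;
	u_int32_t dev;
} __attribute__((packed));

#define SNAPSHOT_IOC_MAGIC	'3'
#define SNAPSHOT_SET_SWAP_AREA	_IOW(SNAPSHOT_IOC_MAGIC, 13, struct resume_swap_area)

int main(void)
{
	struct resume_swap_area swap = { .offset = 0 };	/* 0 = start of partition */
	struct stat st;
	int fd;

	if (stat("/dev/sda2", &st))		/* example swap partition */
		return 1;
	swap.dev = st.st_rdev;			/* fine for small major/minor numbers */

	fd = open("/dev/snapshot", O_RDONLY);
	if (fd < 0)
		return 1;
	if (ioctl(fd, SNAPSHOT_SET_SWAP_AREA, &swap))
		perror("SNAPSHOT_SET_SWAP_AREA");
	close(fd);
	return 0;
}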
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index f1f900ac3164..678ec736076b 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -16,12 +16,12 @@
16 * callback we use. 16 * callback we use.
17 */ 17 */
18 18
19static void do_poweroff(void *dummy) 19static void do_poweroff(struct work_struct *dummy)
20{ 20{
21 kernel_power_off(); 21 kernel_power_off();
22} 22}
23 23
24static DECLARE_WORK(poweroff_work, do_poweroff, NULL); 24static DECLARE_WORK(poweroff_work, do_poweroff);
25 25
26static void handle_poweroff(int key, struct tty_struct *tty) 26static void handle_poweroff(int key, struct tty_struct *tty)
27{ 27{
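do_poweroff() is adjusted to the reworked workqueue API: handlers now receive the work_struct itself rather than a void *data cookie, and DECLARE_WORK()/INIT_WORK() lose their third argument. Handlers that need per-instance data recover it with container_of(), roughly like this (struct my_dev is hypothetical):

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct my_dev {
	int event_count;
	struct work_struct work;
};

static void my_dev_work(struct work_struct *work)
{
	struct my_dev *dev = container_of(work, struct my_dev, work);

	dev->event_count++;		/* example: touch the owning object */
}

static void my_dev_init(struct my_dev *dev)
{
	INIT_WORK(&dev->work, my_dev_work);	/* no 'data' argument any more */
}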
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 72e72d2c61e6..6d566bf7085c 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -13,20 +13,22 @@
13#include <linux/suspend.h> 13#include <linux/suspend.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/freezer.h>
16 17
17/* 18/*
18 * Timeout for stopping processes 19 * Timeout for stopping processes
19 */ 20 */
20#define TIMEOUT (20 * HZ) 21#define TIMEOUT (20 * HZ)
21 22
23#define FREEZER_KERNEL_THREADS 0
24#define FREEZER_USER_SPACE 1
22 25
23static inline int freezeable(struct task_struct * p) 26static inline int freezeable(struct task_struct * p)
24{ 27{
25 if ((p == current) || 28 if ((p == current) ||
26 (p->flags & PF_NOFREEZE) || 29 (p->flags & PF_NOFREEZE) ||
27 (p->exit_state == EXIT_ZOMBIE) || 30 (p->exit_state == EXIT_ZOMBIE) ||
28 (p->exit_state == EXIT_DEAD) || 31 (p->exit_state == EXIT_DEAD))
29 (p->state == TASK_STOPPED))
30 return 0; 32 return 0;
31 return 1; 33 return 1;
32} 34}
@@ -39,7 +41,6 @@ void refrigerator(void)
39 long save; 41 long save;
40 save = current->state; 42 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm); 43 pr_debug("%s entered refrigerator\n", current->comm);
42 printk("=");
43 44
44 frozen_process(current); 45 frozen_process(current);
45 spin_lock_irq(&current->sighand->siglock); 46 spin_lock_irq(&current->sighand->siglock);
@@ -59,10 +60,16 @@ static inline void freeze_process(struct task_struct *p)
59 unsigned long flags; 60 unsigned long flags;
60 61
61 if (!freezing(p)) { 62 if (!freezing(p)) {
62 freeze(p); 63 rmb();
63 spin_lock_irqsave(&p->sighand->siglock, flags); 64 if (!frozen(p)) {
64 signal_wake_up(p, 0); 65 if (p->state == TASK_STOPPED)
65 spin_unlock_irqrestore(&p->sighand->siglock, flags); 66 force_sig_specific(SIGSTOP, p);
67
68 freeze(p);
69 spin_lock_irqsave(&p->sighand->siglock, flags);
70 signal_wake_up(p, p->state == TASK_STOPPED);
71 spin_unlock_irqrestore(&p->sighand->siglock, flags);
72 }
66 } 73 }
67} 74}
68 75
@@ -79,96 +86,134 @@ static void cancel_freezing(struct task_struct *p)
79 } 86 }
80} 87}
81 88
82/* 0 = success, else # of processes that we failed to stop */ 89static inline int is_user_space(struct task_struct *p)
83int freeze_processes(void) 90{
91 return p->mm && !(p->flags & PF_BORROWED_MM);
92}
93
94static unsigned int try_to_freeze_tasks(int freeze_user_space)
84{ 95{
85 int todo, nr_user, user_frozen;
86 unsigned long start_time;
87 struct task_struct *g, *p; 96 struct task_struct *g, *p;
97 unsigned long end_time;
98 unsigned int todo;
88 99
89 printk( "Stopping tasks: " ); 100 end_time = jiffies + TIMEOUT;
90 start_time = jiffies;
91 user_frozen = 0;
92 do { 101 do {
93 nr_user = todo = 0; 102 todo = 0;
94 read_lock(&tasklist_lock); 103 read_lock(&tasklist_lock);
95 do_each_thread(g, p) { 104 do_each_thread(g, p) {
96 if (!freezeable(p)) 105 if (!freezeable(p))
97 continue; 106 continue;
107
98 if (frozen(p)) 108 if (frozen(p))
99 continue; 109 continue;
110
100 if (p->state == TASK_TRACED && frozen(p->parent)) { 111 if (p->state == TASK_TRACED && frozen(p->parent)) {
101 cancel_freezing(p); 112 cancel_freezing(p);
102 continue; 113 continue;
103 } 114 }
104 if (p->mm && !(p->flags & PF_BORROWED_MM)) { 115 if (is_user_space(p)) {
105 /* The task is a user-space one. 116 if (!freeze_user_space)
106 * Freeze it unless there's a vfork completion 117 continue;
107 * pending 118
119 /* Freeze the task unless there is a vfork
120 * completion pending
108 */ 121 */
109 if (!p->vfork_done) 122 if (!p->vfork_done)
110 freeze_process(p); 123 freeze_process(p);
111 nr_user++;
112 } else { 124 } else {
113 /* Freeze only if the user space is frozen */ 125 if (freeze_user_space)
114 if (user_frozen) 126 continue;
115 freeze_process(p); 127
116 todo++; 128 freeze_process(p);
117 } 129 }
130 todo++;
118 } while_each_thread(g, p); 131 } while_each_thread(g, p);
119 read_unlock(&tasklist_lock); 132 read_unlock(&tasklist_lock);
120 todo += nr_user;
121 if (!user_frozen && !nr_user) {
122 sys_sync();
123 start_time = jiffies;
124 }
125 user_frozen = !nr_user;
126 yield(); /* Yield is okay here */ 133 yield(); /* Yield is okay here */
127 if (todo && time_after(jiffies, start_time + TIMEOUT)) 134 if (todo && time_after(jiffies, end_time))
128 break; 135 break;
129 } while(todo); 136 } while (todo);
130 137
131 /* This does not unfreeze processes that are already frozen
132 * (we have slightly ugly calling convention in that respect,
133 * and caller must call thaw_processes() if something fails),
134 * but it cleans up leftover PF_FREEZE requests.
135 */
136 if (todo) { 138 if (todo) {
137 printk( "\n" ); 139 /* This does not unfreeze processes that are already frozen
138 printk(KERN_ERR " stopping tasks timed out " 140 * (we have slightly ugly calling convention in that respect,
139 "after %d seconds (%d tasks remaining):\n", 141 * and caller must call thaw_processes() if something fails),
140 TIMEOUT / HZ, todo); 142 * but it cleans up leftover PF_FREEZE requests.
143 */
144 printk("\n");
145 printk(KERN_ERR "Stopping %s timed out after %d seconds "
146 "(%d tasks refusing to freeze):\n",
147 freeze_user_space ? "user space processes" :
148 "kernel threads",
149 TIMEOUT / HZ, todo);
141 read_lock(&tasklist_lock); 150 read_lock(&tasklist_lock);
142 do_each_thread(g, p) { 151 do_each_thread(g, p) {
152 if (is_user_space(p) == !freeze_user_space)
153 continue;
154
143 if (freezeable(p) && !frozen(p)) 155 if (freezeable(p) && !frozen(p))
144 printk(KERN_ERR " %s\n", p->comm); 156 printk(KERN_ERR " %s\n", p->comm);
157
145 cancel_freezing(p); 158 cancel_freezing(p);
146 } while_each_thread(g, p); 159 } while_each_thread(g, p);
147 read_unlock(&tasklist_lock); 160 read_unlock(&tasklist_lock);
148 return todo;
149 } 161 }
150 162
151 printk( "|\n" ); 163 return todo;
164}
165
166/**
167 * freeze_processes - tell processes to enter the refrigerator
168 *
169 * Returns 0 on success, or the number of processes that didn't freeze,
170 * although they were told to.
171 */
172int freeze_processes(void)
173{
174 unsigned int nr_unfrozen;
175
176 printk("Stopping tasks ... ");
177 nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE);
178 if (nr_unfrozen)
179 return nr_unfrozen;
180
181 sys_sync();
182 nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
183 if (nr_unfrozen)
184 return nr_unfrozen;
185
186 printk("done.\n");
152 BUG_ON(in_atomic()); 187 BUG_ON(in_atomic());
153 return 0; 188 return 0;
154} 189}
155 190
156void thaw_processes(void) 191static void thaw_tasks(int thaw_user_space)
157{ 192{
158 struct task_struct *g, *p; 193 struct task_struct *g, *p;
159 194
160 printk( "Restarting tasks..." );
161 read_lock(&tasklist_lock); 195 read_lock(&tasklist_lock);
162 do_each_thread(g, p) { 196 do_each_thread(g, p) {
163 if (!freezeable(p)) 197 if (!freezeable(p))
164 continue; 198 continue;
199
200 if (is_user_space(p) == !thaw_user_space)
201 continue;
202
165 if (!thaw_process(p)) 203 if (!thaw_process(p))
166 printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); 204 printk(KERN_WARNING " Strange, %s not stopped\n",
205 p->comm );
167 } while_each_thread(g, p); 206 } while_each_thread(g, p);
168
169 read_unlock(&tasklist_lock); 207 read_unlock(&tasklist_lock);
208}
209
210void thaw_processes(void)
211{
212 printk("Restarting tasks ... ");
213 thaw_tasks(FREEZER_KERNEL_THREADS);
214 thaw_tasks(FREEZER_USER_SPACE);
170 schedule(); 215 schedule();
171 printk( " done\n" ); 216 printk("done.\n");
172} 217}
173 218
174EXPORT_SYMBOL(refrigerator); 219EXPORT_SYMBOL(refrigerator);
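The refrigerator() exported here is what freezable kernel threads fall into once try_to_freeze() notices the freeze request set by freeze_processes() above. A minimal, illustrative loop for such a thread might look like the sketch below (not part of this patch; the header holding the freezer helpers differs between kernel versions, and example_freezable_thread is a made-up name):

        #include <linux/sched.h>
        #include <linux/kthread.h>
        #include <linux/freezer.h>      /* on older trees the freezer helpers live in <linux/sched.h> */

        static int example_freezable_thread(void *unused)
        {
                /* On kernels where threads are not freezable by default, a call
                 * to set_freezable() would also be needed here. */
                while (!kthread_should_stop()) {
                        try_to_freeze();        /* parks in refrigerator() while freezing(current) is set */
                        /* ... one unit of work ... */
                        schedule_timeout_interruptible(HZ);
                }
                return 0;
        }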
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 99f9b7d177d6..c024606221c4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,15 +1,15 @@
1/* 1/*
2 * linux/kernel/power/snapshot.c 2 * linux/kernel/power/snapshot.c
3 * 3 *
4 * This file provide system snapshot/restore functionality. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 * 8 *
8 * This file is released under the GPLv2, and is based on swsusp.c. 9 * This file is released under the GPLv2.
9 * 10 *
10 */ 11 */
11 12
12
13#include <linux/version.h> 13#include <linux/version.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
@@ -34,137 +34,24 @@
34 34
35#include "power.h" 35#include "power.h"
36 36
37/* List of PBEs used for creating and restoring the suspend image */ 37/* List of PBEs needed for restoring the pages that were allocated before
38 * the suspend and included in the suspend image, but have also been
39 * allocated by the "resume" kernel, so their contents cannot be written
40 * directly to their "original" page frames.
41 */
38struct pbe *restore_pblist; 42struct pbe *restore_pblist;
39 43
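The restore_pblist declared above is consumed at the very end of resume, when the low-level (largely architecture-specific) code copies every image page back into its original frame. A rough sketch of that final walk, assuming (as the later hunks suggest) that pbe->address and pbe->orig_address hold kernel virtual addresses:

        struct pbe *pbe;

        for (pbe = restore_pblist; pbe; pbe = pbe->next)
                /* move the loaded data from its temporary ("safe") location back
                 * into the page frame it occupied before the suspend */
                copy_page(pbe->orig_address, pbe->address);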
40static unsigned int nr_copy_pages; 44/* Pointer to an auxiliary buffer (1 page) */
41static unsigned int nr_meta_pages;
42static void *buffer; 45static void *buffer;
43 46
44#ifdef CONFIG_HIGHMEM
45unsigned int count_highmem_pages(void)
46{
47 struct zone *zone;
48 unsigned long zone_pfn;
49 unsigned int n = 0;
50
51 for_each_zone (zone)
52 if (is_highmem(zone)) {
53 mark_free_pages(zone);
54 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) {
55 struct page *page;
56 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
57 if (!pfn_valid(pfn))
58 continue;
59 page = pfn_to_page(pfn);
60 if (PageReserved(page))
61 continue;
62 if (PageNosaveFree(page))
63 continue;
64 n++;
65 }
66 }
67 return n;
68}
69
70struct highmem_page {
71 char *data;
72 struct page *page;
73 struct highmem_page *next;
74};
75
76static struct highmem_page *highmem_copy;
77
78static int save_highmem_zone(struct zone *zone)
79{
80 unsigned long zone_pfn;
81 mark_free_pages(zone);
82 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
83 struct page *page;
84 struct highmem_page *save;
85 void *kaddr;
86 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
87
88 if (!(pfn%10000))
89 printk(".");
90 if (!pfn_valid(pfn))
91 continue;
92 page = pfn_to_page(pfn);
93 /*
94 * This condition results from rvmalloc() sans vmalloc_32()
95 * and architectural memory reservations. This should be
96 * corrected eventually when the cases giving rise to this
97 * are better understood.
98 */
99 if (PageReserved(page))
100 continue;
101 BUG_ON(PageNosave(page));
102 if (PageNosaveFree(page))
103 continue;
104 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
105 if (!save)
106 return -ENOMEM;
107 save->next = highmem_copy;
108 save->page = page;
109 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
110 if (!save->data) {
111 kfree(save);
112 return -ENOMEM;
113 }
114 kaddr = kmap_atomic(page, KM_USER0);
115 memcpy(save->data, kaddr, PAGE_SIZE);
116 kunmap_atomic(kaddr, KM_USER0);
117 highmem_copy = save;
118 }
119 return 0;
120}
121
122int save_highmem(void)
123{
124 struct zone *zone;
125 int res = 0;
126
127 pr_debug("swsusp: Saving Highmem");
128 drain_local_pages();
129 for_each_zone (zone) {
130 if (is_highmem(zone))
131 res = save_highmem_zone(zone);
132 if (res)
133 return res;
134 }
135 printk("\n");
136 return 0;
137}
138
139int restore_highmem(void)
140{
141 printk("swsusp: Restoring Highmem\n");
142 while (highmem_copy) {
143 struct highmem_page *save = highmem_copy;
144 void *kaddr;
145 highmem_copy = save->next;
146
147 kaddr = kmap_atomic(save->page, KM_USER0);
148 memcpy(kaddr, save->data, PAGE_SIZE);
149 kunmap_atomic(kaddr, KM_USER0);
150 free_page((long) save->data);
151 kfree(save);
152 }
153 return 0;
154}
155#else
156static inline unsigned int count_highmem_pages(void) {return 0;}
157static inline int save_highmem(void) {return 0;}
158static inline int restore_highmem(void) {return 0;}
159#endif
160
161/** 47/**
162 * @safe_needed - on resume, for storing the PBE list and the image, 48 * @safe_needed - on resume, for storing the PBE list and the image,
163 * we can only use memory pages that do not conflict with the pages 49 * we can only use memory pages that do not conflict with the pages
164 * used before suspend. 50 * used before suspend. The unsafe pages have PageNosaveFree set
51 * and we count them using unsafe_pages.
165 * 52 *
166 * The unsafe pages are marked with the PG_nosave_free flag 53 * Each allocated image page is marked as PageNosave and PageNosaveFree
167 * and we count them using unsafe_pages 54 * so that swsusp_free() can release it.
168 */ 55 */
169 56
170#define PG_ANY 0 57#define PG_ANY 0
@@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;}
174 61
175static unsigned int allocated_unsafe_pages; 62static unsigned int allocated_unsafe_pages;
176 63
177static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 64static void *get_image_page(gfp_t gfp_mask, int safe_needed)
178{ 65{
179 void *res; 66 void *res;
180 67
@@ -195,20 +82,39 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
195 82
196unsigned long get_safe_page(gfp_t gfp_mask) 83unsigned long get_safe_page(gfp_t gfp_mask)
197{ 84{
198 return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); 85 return (unsigned long)get_image_page(gfp_mask, PG_SAFE);
86}
87
88static struct page *alloc_image_page(gfp_t gfp_mask)
89{
90 struct page *page;
91
92 page = alloc_page(gfp_mask);
93 if (page) {
94 SetPageNosave(page);
95 SetPageNosaveFree(page);
96 }
97 return page;
199} 98}
200 99
201/** 100/**
202 * free_image_page - free page represented by @addr, allocated with 101 * free_image_page - free page represented by @addr, allocated with
203 * alloc_image_page (page flags set by it must be cleared) 102 * get_image_page (page flags set by it must be cleared)
204 */ 103 */
205 104
206static inline void free_image_page(void *addr, int clear_nosave_free) 105static inline void free_image_page(void *addr, int clear_nosave_free)
207{ 106{
208 ClearPageNosave(virt_to_page(addr)); 107 struct page *page;
108
109 BUG_ON(!virt_addr_valid(addr));
110
111 page = virt_to_page(addr);
112
113 ClearPageNosave(page);
209 if (clear_nosave_free) 114 if (clear_nosave_free)
210 ClearPageNosaveFree(virt_to_page(addr)); 115 ClearPageNosaveFree(page);
211 free_page((unsigned long)addr); 116
117 __free_page(page);
212} 118}
213 119
214/* struct linked_page is used to build chains of pages */ 120/* struct linked_page is used to build chains of pages */
@@ -269,7 +175,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
269 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { 175 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
270 struct linked_page *lp; 176 struct linked_page *lp;
271 177
272 lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); 178 lp = get_image_page(ca->gfp_mask, ca->safe_needed);
273 if (!lp) 179 if (!lp)
274 return NULL; 180 return NULL;
275 181
@@ -446,8 +352,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
446 352
447 /* Compute the number of zones */ 353 /* Compute the number of zones */
448 nr = 0; 354 nr = 0;
449 for_each_zone (zone) 355 for_each_zone(zone)
450 if (populated_zone(zone) && !is_highmem(zone)) 356 if (populated_zone(zone))
451 nr++; 357 nr++;
452 358
453 /* Allocate the list of zones bitmap objects */ 359 /* Allocate the list of zones bitmap objects */
@@ -459,10 +365,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
459 } 365 }
460 366
461 /* Initialize the zone bitmap objects */ 367 /* Initialize the zone bitmap objects */
462 for_each_zone (zone) { 368 for_each_zone(zone) {
463 unsigned long pfn; 369 unsigned long pfn;
464 370
465 if (!populated_zone(zone) || is_highmem(zone)) 371 if (!populated_zone(zone))
466 continue; 372 continue;
467 373
468 zone_bm->start_pfn = zone->zone_start_pfn; 374 zone_bm->start_pfn = zone->zone_start_pfn;
@@ -481,7 +387,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
481 while (bb) { 387 while (bb) {
482 unsigned long *ptr; 388 unsigned long *ptr;
483 389
484 ptr = alloc_image_page(gfp_mask, safe_needed); 390 ptr = get_image_page(gfp_mask, safe_needed);
485 bb->data = ptr; 391 bb->data = ptr;
486 if (!ptr) 392 if (!ptr)
487 goto Free; 393 goto Free;
@@ -505,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
505 memory_bm_position_reset(bm); 411 memory_bm_position_reset(bm);
506 return 0; 412 return 0;
507 413
508Free: 414 Free:
509 bm->p_list = ca.chain; 415 bm->p_list = ca.chain;
510 memory_bm_free(bm, PG_UNSAFE_CLEAR); 416 memory_bm_free(bm, PG_UNSAFE_CLEAR);
511 return -ENOMEM; 417 return -ENOMEM;
@@ -651,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
651 memory_bm_position_reset(bm); 557 memory_bm_position_reset(bm);
652 return BM_END_OF_MAP; 558 return BM_END_OF_MAP;
653 559
654Return_pfn: 560 Return_pfn:
655 bm->cur.chunk = chunk; 561 bm->cur.chunk = chunk;
656 bm->cur.bit = bit; 562 bm->cur.bit = bit;
657 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; 563 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
@@ -669,10 +575,82 @@ unsigned int snapshot_additional_pages(struct zone *zone)
669 575
670 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 576 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
671 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); 577 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
672 return res; 578 return 2 * res;
579}
580
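To put the doubled snapshot_additional_pages() estimate above into numbers, here is a hypothetical example; the constants are assumptions (BM_BITS_PER_BLOCK taken as roughly one page worth of bits, 32768 with 4 KiB pages, and struct bm_block assumed to be a few dozen bytes), and the doubling presumably accounts for both orig_bm and copy_bm now having to cover every populated zone, highmem included:

        zone->spanned_pages                 = 262144   (a 1 GiB zone of 4 KiB pages)
        res  = DIV_ROUND_UP(262144, 32768)  = 8        (bitmap blocks)
        res += DIV_ROUND_UP(8 * sizeof(struct bm_block), PAGE_SIZE) = 8 + 1 = 9
        snapshot_additional_pages(zone)     = 2 * 9    = 18 pages for this zone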
581#ifdef CONFIG_HIGHMEM
582/**
583 * count_free_highmem_pages - compute the total number of free highmem
584 * pages, system-wide.
585 */
586
587static unsigned int count_free_highmem_pages(void)
588{
589 struct zone *zone;
590 unsigned int cnt = 0;
591
592 for_each_zone(zone)
593 if (populated_zone(zone) && is_highmem(zone))
594 cnt += zone->free_pages;
595
596 return cnt;
597}
598
599/**
600 * saveable_highmem_page - Determine whether a highmem page should be
601 * included in the suspend image.
602 *
603 * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
604 * and it isn't a part of a free chunk of pages.
605 */
606
607static struct page *saveable_highmem_page(unsigned long pfn)
608{
609 struct page *page;
610
611 if (!pfn_valid(pfn))
612 return NULL;
613
614 page = pfn_to_page(pfn);
615
616 BUG_ON(!PageHighMem(page));
617
618 if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page))
619 return NULL;
620
621 return page;
673} 622}
674 623
675/** 624/**
625 * count_highmem_pages - compute the total number of saveable highmem
626 * pages.
627 */
628
629unsigned int count_highmem_pages(void)
630{
631 struct zone *zone;
632 unsigned int n = 0;
633
634 for_each_zone(zone) {
635 unsigned long pfn, max_zone_pfn;
636
637 if (!is_highmem(zone))
638 continue;
639
640 mark_free_pages(zone);
641 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
642 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
643 if (saveable_highmem_page(pfn))
644 n++;
645 }
646 return n;
647}
648#else
649static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
650static inline unsigned int count_highmem_pages(void) { return 0; }
651#endif /* CONFIG_HIGHMEM */
652
653/**
676 * pfn_is_nosave - check if given pfn is in the 'nosave' section 654 * pfn_is_nosave - check if given pfn is in the 'nosave' section
677 */ 655 */
678 656
@@ -684,12 +662,12 @@ static inline int pfn_is_nosave(unsigned long pfn)
684} 662}
685 663
686/** 664/**
687 * saveable - Determine whether a page should be cloned or not. 665 * saveable - Determine whether a non-highmem page should be included in
688 * @pfn: The page 666 * the suspend image.
689 * 667 *
690 * We save a page if it isn't Nosave, and is not in the range of pages 668 * We should save the page if it isn't Nosave, and is not in the range
691 * statically defined as 'unsaveable', and it 669 * of pages statically defined as 'unsaveable', and it isn't a part of
692 * isn't a part of a free chunk of pages. 670 * a free chunk of pages.
693 */ 671 */
694 672
695static struct page *saveable_page(unsigned long pfn) 673static struct page *saveable_page(unsigned long pfn)
@@ -701,76 +679,130 @@ static struct page *saveable_page(unsigned long pfn)
701 679
702 page = pfn_to_page(pfn); 680 page = pfn_to_page(pfn);
703 681
704 if (PageNosave(page)) 682 BUG_ON(PageHighMem(page));
683
684 if (PageNosave(page) || PageNosaveFree(page))
705 return NULL; 685 return NULL;
686
706 if (PageReserved(page) && pfn_is_nosave(pfn)) 687 if (PageReserved(page) && pfn_is_nosave(pfn))
707 return NULL; 688 return NULL;
708 if (PageNosaveFree(page))
709 return NULL;
710 689
711 return page; 690 return page;
712} 691}
713 692
693/**
694 * count_data_pages - compute the total number of saveable non-highmem
695 * pages.
696 */
697
714unsigned int count_data_pages(void) 698unsigned int count_data_pages(void)
715{ 699{
716 struct zone *zone; 700 struct zone *zone;
717 unsigned long pfn, max_zone_pfn; 701 unsigned long pfn, max_zone_pfn;
718 unsigned int n = 0; 702 unsigned int n = 0;
719 703
720 for_each_zone (zone) { 704 for_each_zone(zone) {
721 if (is_highmem(zone)) 705 if (is_highmem(zone))
722 continue; 706 continue;
707
723 mark_free_pages(zone); 708 mark_free_pages(zone);
724 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 709 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
725 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 710 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
726 n += !!saveable_page(pfn); 711 if(saveable_page(pfn))
712 n++;
727 } 713 }
728 return n; 714 return n;
729} 715}
730 716
731static inline void copy_data_page(long *dst, long *src) 717/* This is needed, because copy_page and memcpy are not usable for copying
718 * task structs.
719 */
720static inline void do_copy_page(long *dst, long *src)
732{ 721{
733 int n; 722 int n;
734 723
735 /* copy_page and memcpy are not usable for copying task structs. */
736 for (n = PAGE_SIZE / sizeof(long); n; n--) 724 for (n = PAGE_SIZE / sizeof(long); n; n--)
737 *dst++ = *src++; 725 *dst++ = *src++;
738} 726}
739 727
728#ifdef CONFIG_HIGHMEM
729static inline struct page *
730page_is_saveable(struct zone *zone, unsigned long pfn)
731{
732 return is_highmem(zone) ?
733 saveable_highmem_page(pfn) : saveable_page(pfn);
734}
735
736static inline void
737copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
738{
739 struct page *s_page, *d_page;
740 void *src, *dst;
741
742 s_page = pfn_to_page(src_pfn);
743 d_page = pfn_to_page(dst_pfn);
744 if (PageHighMem(s_page)) {
745 src = kmap_atomic(s_page, KM_USER0);
746 dst = kmap_atomic(d_page, KM_USER1);
747 do_copy_page(dst, src);
748 kunmap_atomic(src, KM_USER0);
749 kunmap_atomic(dst, KM_USER1);
750 } else {
751 src = page_address(s_page);
752 if (PageHighMem(d_page)) {
753 /* Page pointed to by src may contain some kernel
754 * data modified by kmap_atomic()
755 */
756 do_copy_page(buffer, src);
757 dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
758 memcpy(dst, buffer, PAGE_SIZE);
759 kunmap_atomic(dst, KM_USER0);
760 } else {
761 dst = page_address(d_page);
762 do_copy_page(dst, src);
763 }
764 }
765}
766#else
767#define page_is_saveable(zone, pfn) saveable_page(pfn)
768
769static inline void
770copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
771{
772 do_copy_page(page_address(pfn_to_page(dst_pfn)),
773 page_address(pfn_to_page(src_pfn)));
774}
775#endif /* CONFIG_HIGHMEM */
776
740static void 777static void
741copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) 778copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
742{ 779{
743 struct zone *zone; 780 struct zone *zone;
744 unsigned long pfn; 781 unsigned long pfn;
745 782
746 for_each_zone (zone) { 783 for_each_zone(zone) {
747 unsigned long max_zone_pfn; 784 unsigned long max_zone_pfn;
748 785
749 if (is_highmem(zone))
750 continue;
751
752 mark_free_pages(zone); 786 mark_free_pages(zone);
753 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 787 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
754 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 788 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
755 if (saveable_page(pfn)) 789 if (page_is_saveable(zone, pfn))
756 memory_bm_set_bit(orig_bm, pfn); 790 memory_bm_set_bit(orig_bm, pfn);
757 } 791 }
758 memory_bm_position_reset(orig_bm); 792 memory_bm_position_reset(orig_bm);
759 memory_bm_position_reset(copy_bm); 793 memory_bm_position_reset(copy_bm);
760 do { 794 do {
761 pfn = memory_bm_next_pfn(orig_bm); 795 pfn = memory_bm_next_pfn(orig_bm);
762 if (likely(pfn != BM_END_OF_MAP)) { 796 if (likely(pfn != BM_END_OF_MAP))
763 struct page *page; 797 copy_data_page(memory_bm_next_pfn(copy_bm), pfn);
764 void *src;
765
766 page = pfn_to_page(pfn);
767 src = page_address(page);
768 page = pfn_to_page(memory_bm_next_pfn(copy_bm));
769 copy_data_page(page_address(page), src);
770 }
771 } while (pfn != BM_END_OF_MAP); 798 } while (pfn != BM_END_OF_MAP);
772} 799}
773 800
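copy_data_page() above leans on the era's two-argument kmap_atomic() API, where each of the two simultaneous mappings needs its own KM_* slot (KM_USER0 for the source, KM_USER1 for the destination). Stripped of the swsusp-specific do_copy_page() (which exists only because copy_page()/memcpy() cannot be used on pages holding task structs), the bare pattern is sketched below; copy_highmem_page_example is a made-up name:

        #include <linux/highmem.h>
        #include <linux/mm.h>
        #include <linux/string.h>

        static void copy_highmem_page_example(struct page *dst, struct page *src)
        {
                void *vsrc, *vdst;

                vsrc = kmap_atomic(src, KM_USER0);      /* each concurrent mapping needs its own slot */
                vdst = kmap_atomic(dst, KM_USER1);
                memcpy(vdst, vsrc, PAGE_SIZE);          /* fine for ordinary data pages */
                kunmap_atomic(vdst, KM_USER1);
                kunmap_atomic(vsrc, KM_USER0);
        }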
801/* Total number of image pages */
802static unsigned int nr_copy_pages;
803/* Number of pages needed for saving the original pfns of the image pages */
804static unsigned int nr_meta_pages;
805
774/** 806/**
775 * swsusp_free - free pages allocated for the suspend. 807 * swsusp_free - free pages allocated for the suspend.
776 * 808 *
@@ -792,7 +824,7 @@ void swsusp_free(void)
792 if (PageNosave(page) && PageNosaveFree(page)) { 824 if (PageNosave(page) && PageNosaveFree(page)) {
793 ClearPageNosave(page); 825 ClearPageNosave(page);
794 ClearPageNosaveFree(page); 826 ClearPageNosaveFree(page);
795 free_page((long) page_address(page)); 827 __free_page(page);
796 } 828 }
797 } 829 }
798 } 830 }
@@ -802,34 +834,108 @@ void swsusp_free(void)
802 buffer = NULL; 834 buffer = NULL;
803} 835}
804 836
837#ifdef CONFIG_HIGHMEM
838/**
839 * count_pages_for_highmem - compute the number of non-highmem pages
840 * that will be necessary for creating copies of highmem pages.
841 */
842
843static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
844{
845 unsigned int free_highmem = count_free_highmem_pages();
846
847 if (free_highmem >= nr_highmem)
848 nr_highmem = 0;
849 else
850 nr_highmem -= free_highmem;
851
852 return nr_highmem;
853}
854#else
855static unsigned int
856count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
857#endif /* CONFIG_HIGHMEM */
805 858
806/** 859/**
807 * enough_free_mem - Make sure we enough free memory to snapshot. 860 * enough_free_mem - Make sure we have enough free memory for the
808 * 861 * snapshot image.
809 * Returns TRUE or FALSE after checking the number of available
810 * free pages.
811 */ 862 */
812 863
813static int enough_free_mem(unsigned int nr_pages) 864static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
814{ 865{
815 struct zone *zone; 866 struct zone *zone;
816 unsigned int free = 0, meta = 0; 867 unsigned int free = 0, meta = 0;
817 868
818 for_each_zone (zone) 869 for_each_zone(zone) {
819 if (!is_highmem(zone)) { 870 meta += snapshot_additional_pages(zone);
871 if (!is_highmem(zone))
820 free += zone->free_pages; 872 free += zone->free_pages;
821 meta += snapshot_additional_pages(zone); 873 }
822 }
823 874
824 pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", 875 nr_pages += count_pages_for_highmem(nr_highmem);
876 pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n",
825 nr_pages, PAGES_FOR_IO, meta, free); 877 nr_pages, PAGES_FOR_IO, meta, free);
826 878
827 return free > nr_pages + PAGES_FOR_IO + meta; 879 return free > nr_pages + PAGES_FOR_IO + meta;
828} 880}
829 881
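For a feel of the enough_free_mem() check above, a worked example with purely hypothetical numbers (PAGES_FOR_IO is assumed to be 1024 here, i.e. 4 MiB of 4 KiB pages):

        nr_pages   (saveable lowmem pages)        = 50000
        nr_highmem (saveable highmem pages)       = 20000
        count_free_highmem_pages()                = 15000
        count_pages_for_highmem(20000)            = 20000 - 15000 = 5000
        meta (sum of snapshot_additional_pages()) =   300
        PAGES_FOR_IO (assumed)                    =  1024
        check: free > 50000 + 5000 + 1024 + 300   = 56324 free lowmem pages required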
882#ifdef CONFIG_HIGHMEM
883/**
884 * get_highmem_buffer - if there are some highmem pages in the suspend
885 * image, we may need the buffer to copy them and/or load their data.
886 */
887
888static inline int get_highmem_buffer(int safe_needed)
889{
890 buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
891 return buffer ? 0 : -ENOMEM;
892}
893
894/**
895 * alloc_highmem_image_pages - allocate some highmem pages for the image.
896 * Try to allocate as many pages as needed, but if the number of free
 897 * highmem pages is smaller than that, allocate them all.

898 */
899
900static inline unsigned int
901alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
902{
903 unsigned int to_alloc = count_free_highmem_pages();
904
905 if (to_alloc > nr_highmem)
906 to_alloc = nr_highmem;
907
908 nr_highmem -= to_alloc;
909 while (to_alloc-- > 0) {
910 struct page *page;
911
912 page = alloc_image_page(__GFP_HIGHMEM);
913 memory_bm_set_bit(bm, page_to_pfn(page));
914 }
915 return nr_highmem;
916}
917#else
918static inline int get_highmem_buffer(int safe_needed) { return 0; }
919
920static inline unsigned int
921alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
922#endif /* CONFIG_HIGHMEM */
923
924/**
925 * swsusp_alloc - allocate memory for the suspend image
926 *
927 * We first try to allocate as many highmem pages as there are
928 * saveable highmem pages in the system. If that fails, we allocate
929 * non-highmem pages for the copies of the remaining highmem ones.
930 *
931 * In this approach it is likely that the copies of highmem pages will
932 * also be located in the high memory, because of the way in which
933 * copy_data_pages() works.
934 */
935
830static int 936static int
831swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 937swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
832 unsigned int nr_pages) 938 unsigned int nr_pages, unsigned int nr_highmem)
833{ 939{
834 int error; 940 int error;
835 941
@@ -841,46 +947,61 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
841 if (error) 947 if (error)
842 goto Free; 948 goto Free;
843 949
950 if (nr_highmem > 0) {
951 error = get_highmem_buffer(PG_ANY);
952 if (error)
953 goto Free;
954
955 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem);
956 }
844 while (nr_pages-- > 0) { 957 while (nr_pages-- > 0) {
845 struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD); 958 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
959
846 if (!page) 960 if (!page)
847 goto Free; 961 goto Free;
848 962
849 SetPageNosave(page);
850 SetPageNosaveFree(page);
851 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 963 memory_bm_set_bit(copy_bm, page_to_pfn(page));
852 } 964 }
853 return 0; 965 return 0;
854 966
855Free: 967 Free:
856 swsusp_free(); 968 swsusp_free();
857 return -ENOMEM; 969 return -ENOMEM;
858} 970}
859 971
860/* Memory bitmap used for marking saveable pages */ 972/* Memory bitmap used for marking saveable pages (during suspend) or the
973 * suspend image pages (during resume)
974 */
861static struct memory_bitmap orig_bm; 975static struct memory_bitmap orig_bm;
862/* Memory bitmap used for marking allocated pages that will contain the copies 976/* Memory bitmap used on suspend for marking allocated pages that will contain
863 * of saveable pages 977 * the copies of saveable pages. During resume it is initially used for
978 * marking the suspend image pages, but then its set bits are duplicated in
979 * @orig_bm and it is released. Next, on systems with high memory, it may be
980 * used for marking "safe" highmem pages, but it has to be reinitialized for
981 * this purpose.
864 */ 982 */
865static struct memory_bitmap copy_bm; 983static struct memory_bitmap copy_bm;
866 984
867asmlinkage int swsusp_save(void) 985asmlinkage int swsusp_save(void)
868{ 986{
869 unsigned int nr_pages; 987 unsigned int nr_pages, nr_highmem;
870 988
871 pr_debug("swsusp: critical section: \n"); 989 printk("swsusp: critical section: \n");
872 990
873 drain_local_pages(); 991 drain_local_pages();
874 nr_pages = count_data_pages(); 992 nr_pages = count_data_pages();
875 printk("swsusp: Need to copy %u pages\n", nr_pages); 993 nr_highmem = count_highmem_pages();
994 printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem);
876 995
877 if (!enough_free_mem(nr_pages)) { 996 if (!enough_free_mem(nr_pages, nr_highmem)) {
878 printk(KERN_ERR "swsusp: Not enough free memory\n"); 997 printk(KERN_ERR "swsusp: Not enough free memory\n");
879 return -ENOMEM; 998 return -ENOMEM;
880 } 999 }
881 1000
882 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages)) 1001 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
1002 printk(KERN_ERR "swsusp: Memory allocation failed\n");
883 return -ENOMEM; 1003 return -ENOMEM;
1004 }
884 1005
885 /* During allocating of suspend pagedir, new cold pages may appear. 1006 /* During allocating of suspend pagedir, new cold pages may appear.
886 * Kill them. 1007 * Kill them.
@@ -894,10 +1015,12 @@ asmlinkage int swsusp_save(void)
894 * touch swap space! Except we must write out our image of course. 1015 * touch swap space! Except we must write out our image of course.
895 */ 1016 */
896 1017
1018 nr_pages += nr_highmem;
897 nr_copy_pages = nr_pages; 1019 nr_copy_pages = nr_pages;
898 nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; 1020 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
899 1021
900 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 1022 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
1023
901 return 0; 1024 return 0;
902} 1025}
903 1026
@@ -960,7 +1083,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
960 1083
961 if (!buffer) { 1084 if (!buffer) {
962 /* This makes the buffer be freed by swsusp_free() */ 1085 /* This makes the buffer be freed by swsusp_free() */
963 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1086 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
964 if (!buffer) 1087 if (!buffer)
965 return -ENOMEM; 1088 return -ENOMEM;
966 } 1089 }
@@ -975,9 +1098,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
975 memset(buffer, 0, PAGE_SIZE); 1098 memset(buffer, 0, PAGE_SIZE);
976 pack_pfns(buffer, &orig_bm); 1099 pack_pfns(buffer, &orig_bm);
977 } else { 1100 } else {
978 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1101 struct page *page;
979 1102
980 handle->buffer = page_address(pfn_to_page(pfn)); 1103 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1104 if (PageHighMem(page)) {
1105 /* Highmem pages are copied to the buffer,
1106 * because we can't return with a kmapped
1107 * highmem page (we may not be called again).
1108 */
1109 void *kaddr;
1110
1111 kaddr = kmap_atomic(page, KM_USER0);
1112 memcpy(buffer, kaddr, PAGE_SIZE);
1113 kunmap_atomic(kaddr, KM_USER0);
1114 handle->buffer = buffer;
1115 } else {
1116 handle->buffer = page_address(page);
1117 }
981 } 1118 }
982 handle->prev = handle->cur; 1119 handle->prev = handle->cur;
983 } 1120 }
@@ -1005,7 +1142,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1005 unsigned long pfn, max_zone_pfn; 1142 unsigned long pfn, max_zone_pfn;
1006 1143
1007 /* Clear page flags */ 1144 /* Clear page flags */
1008 for_each_zone (zone) { 1145 for_each_zone(zone) {
1009 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1146 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1010 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1147 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1011 if (pfn_valid(pfn)) 1148 if (pfn_valid(pfn))
@@ -1101,6 +1238,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1101 } 1238 }
1102} 1239}
1103 1240
1241/* List of "safe" pages that may be used to store data loaded from the suspend
1242 * image
1243 */
1244static struct linked_page *safe_pages_list;
1245
1246#ifdef CONFIG_HIGHMEM
1247/* struct highmem_pbe is used for creating the list of highmem pages that
1248 * should be restored atomically during the resume from disk, because the page
1249 * frames they have occupied before the suspend are in use.
1250 */
1251struct highmem_pbe {
1252 struct page *copy_page; /* data is here now */
1253 struct page *orig_page; /* data was here before the suspend */
1254 struct highmem_pbe *next;
1255};
1256
1257/* List of highmem PBEs needed for restoring the highmem pages that were
1258 * allocated before the suspend and included in the suspend image, but have
1259 * also been allocated by the "resume" kernel, so their contents cannot be
1260 * written directly to their "original" page frames.
1261 */
1262static struct highmem_pbe *highmem_pblist;
1263
1264/**
1265 * count_highmem_image_pages - compute the number of highmem pages in the
1266 * suspend image. The bits in the memory bitmap @bm that correspond to the
1267 * image pages are assumed to be set.
1268 */
1269
1270static unsigned int count_highmem_image_pages(struct memory_bitmap *bm)
1271{
1272 unsigned long pfn;
1273 unsigned int cnt = 0;
1274
1275 memory_bm_position_reset(bm);
1276 pfn = memory_bm_next_pfn(bm);
1277 while (pfn != BM_END_OF_MAP) {
1278 if (PageHighMem(pfn_to_page(pfn)))
1279 cnt++;
1280
1281 pfn = memory_bm_next_pfn(bm);
1282 }
1283 return cnt;
1284}
1285
1286/**
1287 * prepare_highmem_image - try to allocate as many highmem pages as
1288 * there are highmem image pages (@nr_highmem_p points to the variable
1289 * containing the number of highmem image pages). The pages that are
1290 * "safe" (ie. will not be overwritten when the suspend image is
1291 * restored) have the corresponding bits set in @bm (it must be
 1292 * uninitialized).
1293 *
1294 * NOTE: This function should not be called if there are no highmem
1295 * image pages.
1296 */
1297
1298static unsigned int safe_highmem_pages;
1299
1300static struct memory_bitmap *safe_highmem_bm;
1301
1302static int
1303prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1304{
1305 unsigned int to_alloc;
1306
1307 if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE))
1308 return -ENOMEM;
1309
1310 if (get_highmem_buffer(PG_SAFE))
1311 return -ENOMEM;
1312
1313 to_alloc = count_free_highmem_pages();
1314 if (to_alloc > *nr_highmem_p)
1315 to_alloc = *nr_highmem_p;
1316 else
1317 *nr_highmem_p = to_alloc;
1318
1319 safe_highmem_pages = 0;
1320 while (to_alloc-- > 0) {
1321 struct page *page;
1322
1323 page = alloc_page(__GFP_HIGHMEM);
1324 if (!PageNosaveFree(page)) {
 1325 /* The page is "safe", set its bit in the bitmap */
1326 memory_bm_set_bit(bm, page_to_pfn(page));
1327 safe_highmem_pages++;
1328 }
1329 /* Mark the page as allocated */
1330 SetPageNosave(page);
1331 SetPageNosaveFree(page);
1332 }
1333 memory_bm_position_reset(bm);
1334 safe_highmem_bm = bm;
1335 return 0;
1336}
1337
1338/**
1339 * get_highmem_page_buffer - for given highmem image page find the buffer
1340 * that suspend_write_next() should set for its caller to write to.
1341 *
1342 * If the page is to be saved to its "original" page frame or a copy of
1343 * the page is to be made in the highmem, @buffer is returned. Otherwise,
1344 * the copy of the page is to be made in normal memory, so the address of
1345 * the copy is returned.
1346 *
1347 * If @buffer is returned, the caller of suspend_write_next() will write
1348 * the page's contents to @buffer, so they will have to be copied to the
1349 * right location on the next call to suspend_write_next() and it is done
1350 * with the help of copy_last_highmem_page(). For this purpose, if
1351 * @buffer is returned, @last_highmem page is set to the page to which
1352 * the data will have to be copied from @buffer.
1353 */
1354
1355static struct page *last_highmem_page;
1356
1357static void *
1358get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1359{
1360 struct highmem_pbe *pbe;
1361 void *kaddr;
1362
1363 if (PageNosave(page) && PageNosaveFree(page)) {
1364 /* We have allocated the "original" page frame and we can
1365 * use it directly to store the loaded page.
1366 */
1367 last_highmem_page = page;
1368 return buffer;
1369 }
1370 /* The "original" page frame has not been allocated and we have to
1371 * use a "safe" page frame to store the loaded page.
1372 */
1373 pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
1374 if (!pbe) {
1375 swsusp_free();
1376 return NULL;
1377 }
1378 pbe->orig_page = page;
1379 if (safe_highmem_pages > 0) {
1380 struct page *tmp;
1381
1382 /* Copy of the page will be stored in high memory */
1383 kaddr = buffer;
1384 tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
1385 safe_highmem_pages--;
1386 last_highmem_page = tmp;
1387 pbe->copy_page = tmp;
1388 } else {
1389 /* Copy of the page will be stored in normal memory */
1390 kaddr = safe_pages_list;
1391 safe_pages_list = safe_pages_list->next;
1392 pbe->copy_page = virt_to_page(kaddr);
1393 }
1394 pbe->next = highmem_pblist;
1395 highmem_pblist = pbe;
1396 return kaddr;
1397}
1398
1399/**
1400 * copy_last_highmem_page - copy the contents of a highmem image from
 1401 * @buffer, where the caller of snapshot_write_next() has placed them,
 1402 * to the right location represented by @last_highmem_page.
1403 */
1404
1405static void copy_last_highmem_page(void)
1406{
1407 if (last_highmem_page) {
1408 void *dst;
1409
1410 dst = kmap_atomic(last_highmem_page, KM_USER0);
1411 memcpy(dst, buffer, PAGE_SIZE);
1412 kunmap_atomic(dst, KM_USER0);
1413 last_highmem_page = NULL;
1414 }
1415}
1416
1417static inline int last_highmem_page_copied(void)
1418{
1419 return !last_highmem_page;
1420}
1421
1422static inline void free_highmem_data(void)
1423{
1424 if (safe_highmem_bm)
1425 memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
1426
1427 if (buffer)
1428 free_image_page(buffer, PG_UNSAFE_CLEAR);
1429}
1430#else
1431static inline int get_safe_write_buffer(void) { return 0; }
1432
1433static unsigned int
1434count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
1435
1436static inline int
1437prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1438{
1439 return 0;
1440}
1441
1442static inline void *
1443get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1444{
1445 return NULL;
1446}
1447
1448static inline void copy_last_highmem_page(void) {}
1449static inline int last_highmem_page_copied(void) { return 1; }
1450static inline void free_highmem_data(void) {}
1451#endif /* CONFIG_HIGHMEM */
1452
1104/** 1453/**
1105 * prepare_image - use the memory bitmap @bm to mark the pages that will 1454 * prepare_image - use the memory bitmap @bm to mark the pages that will
1106 * be overwritten in the process of restoring the system memory state 1455 * be overwritten in the process of restoring the system memory state
@@ -1110,20 +1459,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1110 * The idea is to allocate a new memory bitmap first and then allocate 1459 * The idea is to allocate a new memory bitmap first and then allocate
1111 * as many pages as needed for the image data, but not to assign these 1460 * as many pages as needed for the image data, but not to assign these
1112 * pages to specific tasks initially. Instead, we just mark them as 1461 * pages to specific tasks initially. Instead, we just mark them as
1113 * allocated and create a list of "safe" pages that will be used later. 1462 * allocated and create a lists of "safe" pages that will be used
1463 * later. On systems with high memory a list of "safe" highmem pages is
1464 * also created.
1114 */ 1465 */
1115 1466
1116#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) 1467#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
1117 1468
1118static struct linked_page *safe_pages_list;
1119
1120static int 1469static int
1121prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) 1470prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1122{ 1471{
1123 unsigned int nr_pages; 1472 unsigned int nr_pages, nr_highmem;
1124 struct linked_page *sp_list, *lp; 1473 struct linked_page *sp_list, *lp;
1125 int error; 1474 int error;
1126 1475
1476 /* If there is no highmem, the buffer will not be necessary */
1477 free_image_page(buffer, PG_UNSAFE_CLEAR);
1478 buffer = NULL;
1479
1480 nr_highmem = count_highmem_image_pages(bm);
1127 error = mark_unsafe_pages(bm); 1481 error = mark_unsafe_pages(bm);
1128 if (error) 1482 if (error)
1129 goto Free; 1483 goto Free;
@@ -1134,6 +1488,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1134 1488
1135 duplicate_memory_bitmap(new_bm, bm); 1489 duplicate_memory_bitmap(new_bm, bm);
1136 memory_bm_free(bm, PG_UNSAFE_KEEP); 1490 memory_bm_free(bm, PG_UNSAFE_KEEP);
1491 if (nr_highmem > 0) {
1492 error = prepare_highmem_image(bm, &nr_highmem);
1493 if (error)
1494 goto Free;
1495 }
1137 /* Reserve some safe pages for potential later use. 1496 /* Reserve some safe pages for potential later use.
1138 * 1497 *
1139 * NOTE: This way we make sure there will be enough safe pages for the 1498 * NOTE: This way we make sure there will be enough safe pages for the
@@ -1142,10 +1501,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1142 */ 1501 */
1143 sp_list = NULL; 1502 sp_list = NULL;
1144 /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ 1503 /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */
1145 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1504 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1146 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); 1505 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
1147 while (nr_pages > 0) { 1506 while (nr_pages > 0) {
1148 lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); 1507 lp = get_image_page(GFP_ATOMIC, PG_SAFE);
1149 if (!lp) { 1508 if (!lp) {
1150 error = -ENOMEM; 1509 error = -ENOMEM;
1151 goto Free; 1510 goto Free;
@@ -1156,7 +1515,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1156 } 1515 }
1157 /* Preallocate memory for the image */ 1516 /* Preallocate memory for the image */
1158 safe_pages_list = NULL; 1517 safe_pages_list = NULL;
1159 nr_pages = nr_copy_pages - allocated_unsafe_pages; 1518 nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
1160 while (nr_pages > 0) { 1519 while (nr_pages > 0) {
1161 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); 1520 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
1162 if (!lp) { 1521 if (!lp) {
@@ -1181,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1181 } 1540 }
1182 return 0; 1541 return 0;
1183 1542
1184Free: 1543 Free:
1185 swsusp_free(); 1544 swsusp_free();
1186 return error; 1545 return error;
1187} 1546}
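The safe-page reservation in prepare_image() above is sized from the PBE bookkeeping; with hypothetical numbers, and assuming struct pbe is three pointers (12 bytes on a 32-bit build) and LINKED_PAGE_DATA_SIZE is a page minus the next pointer, the arithmetic comes out roughly as:

        nr_copy_pages (total image pages)     = 60000
        nr_highmem (highmem image pages)      = 20000
        allocated_unsafe_pages                =   500
        PBEs needing lowmem bookkeeping       = 60000 - 20000 - 500 = 39500
        PBES_PER_LINKED_PAGE                  = (4096 - 4) / 12 = 341   (32-bit example)
        linked pages reserved for PBEs        = DIV_ROUND_UP(39500, 341) = 116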
@@ -1196,6 +1555,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1196 struct pbe *pbe; 1555 struct pbe *pbe;
1197 struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); 1556 struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
1198 1557
1558 if (PageHighMem(page))
1559 return get_highmem_page_buffer(page, ca);
1560
1199 if (PageNosave(page) && PageNosaveFree(page)) 1561 if (PageNosave(page) && PageNosaveFree(page))
1200 /* We have allocated the "original" page frame and we can 1562 /* We have allocated the "original" page frame and we can
1201 * use it directly to store the loaded page. 1563 * use it directly to store the loaded page.
@@ -1210,12 +1572,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1210 swsusp_free(); 1572 swsusp_free();
1211 return NULL; 1573 return NULL;
1212 } 1574 }
1213 pbe->orig_address = (unsigned long)page_address(page); 1575 pbe->orig_address = page_address(page);
1214 pbe->address = (unsigned long)safe_pages_list; 1576 pbe->address = safe_pages_list;
1215 safe_pages_list = safe_pages_list->next; 1577 safe_pages_list = safe_pages_list->next;
1216 pbe->next = restore_pblist; 1578 pbe->next = restore_pblist;
1217 restore_pblist = pbe; 1579 restore_pblist = pbe;
1218 return (void *)pbe->address; 1580 return pbe->address;
1219} 1581}
1220 1582
1221/** 1583/**
@@ -1249,14 +1611,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1249 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1611 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
1250 return 0; 1612 return 0;
1251 1613
1252 if (!buffer) { 1614 if (handle->offset == 0) {
1253 /* This makes the buffer be freed by swsusp_free() */ 1615 if (!buffer)
1254 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); 1616 /* This makes the buffer be freed by swsusp_free() */
1617 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
1618
1255 if (!buffer) 1619 if (!buffer)
1256 return -ENOMEM; 1620 return -ENOMEM;
1257 } 1621
1258 if (!handle->offset)
1259 handle->buffer = buffer; 1622 handle->buffer = buffer;
1623 }
1260 handle->sync_read = 1; 1624 handle->sync_read = 1;
1261 if (handle->prev < handle->cur) { 1625 if (handle->prev < handle->cur) {
1262 if (handle->prev == 0) { 1626 if (handle->prev == 0) {
@@ -1284,8 +1648,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1284 return -ENOMEM; 1648 return -ENOMEM;
1285 } 1649 }
1286 } else { 1650 } else {
1651 copy_last_highmem_page();
1287 handle->buffer = get_buffer(&orig_bm, &ca); 1652 handle->buffer = get_buffer(&orig_bm, &ca);
1288 handle->sync_read = 0; 1653 if (handle->buffer != buffer)
1654 handle->sync_read = 0;
1289 } 1655 }
1290 handle->prev = handle->cur; 1656 handle->prev = handle->cur;
1291 } 1657 }
@@ -1301,15 +1667,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1301 return count; 1667 return count;
1302} 1668}
1303 1669
1670/**
1671 * snapshot_write_finalize - must be called after the last call to
1672 * snapshot_write_next() in case the last page in the image happens
1673 * to be a highmem page and its contents should be stored in the
1674 * highmem. Additionally, it releases the memory that will not be
1675 * used any more.
1676 */
1677
1678void snapshot_write_finalize(struct snapshot_handle *handle)
1679{
1680 copy_last_highmem_page();
1681 /* Free only if we have loaded the image entirely */
1682 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) {
1683 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
1684 free_highmem_data();
1685 }
1686}
1687
1304int snapshot_image_loaded(struct snapshot_handle *handle) 1688int snapshot_image_loaded(struct snapshot_handle *handle)
1305{ 1689{
1306 return !(!nr_copy_pages || 1690 return !(!nr_copy_pages || !last_highmem_page_copied() ||
1307 handle->cur <= nr_meta_pages + nr_copy_pages); 1691 handle->cur <= nr_meta_pages + nr_copy_pages);
1308} 1692}
1309 1693
1310void snapshot_free_unused_memory(struct snapshot_handle *handle) 1694#ifdef CONFIG_HIGHMEM
1695/* Assumes that @buf is ready and points to a "safe" page */
1696static inline void
1697swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
1311{ 1698{
1312 /* Free only if we have loaded the image entirely */ 1699 void *kaddr1, *kaddr2;
1313 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 1700
1314 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 1701 kaddr1 = kmap_atomic(p1, KM_USER0);
1702 kaddr2 = kmap_atomic(p2, KM_USER1);
1703 memcpy(buf, kaddr1, PAGE_SIZE);
1704 memcpy(kaddr1, kaddr2, PAGE_SIZE);
1705 memcpy(kaddr2, buf, PAGE_SIZE);
1706 kunmap_atomic(kaddr1, KM_USER0);
1707 kunmap_atomic(kaddr2, KM_USER1);
1708}
1709
1710/**
1711 * restore_highmem - for each highmem page that was allocated before
1712 * the suspend and included in the suspend image, and also has been
 1713 * allocated by the "resume" kernel, swap its current (ie. "before
1714 * resume") contents with the previous (ie. "before suspend") one.
1715 *
1716 * If the resume eventually fails, we can call this function once
1717 * again and restore the "before resume" highmem state.
1718 */
1719
1720int restore_highmem(void)
1721{
1722 struct highmem_pbe *pbe = highmem_pblist;
1723 void *buf;
1724
1725 if (!pbe)
1726 return 0;
1727
1728 buf = get_image_page(GFP_ATOMIC, PG_SAFE);
1729 if (!buf)
1730 return -ENOMEM;
1731
1732 while (pbe) {
1733 swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
1734 pbe = pbe->next;
1735 }
1736 free_image_page(buf, PG_UNSAFE_CLEAR);
1737 return 0;
1315} 1738}
1739#endif /* CONFIG_HIGHMEM */
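snapshot_write_finalize() and snapshot_image_loaded() above are meant to bracket the loop that feeds image pages into snapshot_write_next() on the resume path. A rough sketch of that calling sequence follows (the real consumer lives in the swap/user I/O code; read_next_image_page is a placeholder and error handling is trimmed):

        struct snapshot_handle handle;
        int ret;

        memset(&handle, 0, sizeof(struct snapshot_handle));
        for (;;) {
                ret = snapshot_write_next(&handle, PAGE_SIZE);
                if (ret <= 0)
                        break;
                /* fill the buffer the handle currently points at with the next
                 * PAGE_SIZE chunk of the image (placeholder reader) */
                ret = read_next_image_page(data_of(handle));
                if (ret)
                        break;
        }
        snapshot_write_finalize(&handle);
        if (!ret && !snapshot_image_loaded(&handle))
                ret = -ENODATA;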
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1a3b0dd2c3fc..f133d4a6d817 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -34,34 +34,123 @@ extern char resume_file[];
34#define SWSUSP_SIG "S1SUSPEND" 34#define SWSUSP_SIG "S1SUSPEND"
35 35
36static struct swsusp_header { 36static struct swsusp_header {
37 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; 37 char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
38 swp_entry_t image; 38 sector_t image;
39 char orig_sig[10]; 39 char orig_sig[10];
40 char sig[10]; 40 char sig[10];
41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; 41} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
42 42
43/* 43/*
44 * Saving part... 44 * General things
45 */ 45 */
46 46
47static unsigned short root_swap = 0xffff; 47static unsigned short root_swap = 0xffff;
48static struct block_device *resume_bdev;
49
50/**
51 * submit - submit BIO request.
52 * @rw: READ or WRITE.
53 * @off physical offset of page.
54 * @page: page we're reading or writing.
55 * @bio_chain: list of pending biod (for async reading)
56 *
57 * Straight from the textbook - allocate and initialize the bio.
58 * If we're reading, make sure the page is marked as dirty.
59 * Then submit it and, if @bio_chain == NULL, wait.
60 */
61static int submit(int rw, pgoff_t page_off, struct page *page,
62 struct bio **bio_chain)
63{
64 struct bio *bio;
65
66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
67 if (!bio)
68 return -ENOMEM;
69 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
70 bio->bi_bdev = resume_bdev;
71 bio->bi_end_io = end_swap_bio_read;
72
73 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
74 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
75 bio_put(bio);
76 return -EFAULT;
77 }
78
79 lock_page(page);
80 bio_get(bio);
81
82 if (bio_chain == NULL) {
83 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
84 wait_on_page_locked(page);
85 if (rw == READ)
86 bio_set_pages_dirty(bio);
87 bio_put(bio);
88 } else {
89 if (rw == READ)
90 get_page(page); /* These pages are freed later */
91 bio->bi_private = *bio_chain;
92 *bio_chain = bio;
93 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
94 }
95 return 0;
96}
97
98static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
99{
100 return submit(READ, page_off, virt_to_page(addr), bio_chain);
101}
102
103static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
104{
105 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
106}
107
108static int wait_on_bio_chain(struct bio **bio_chain)
109{
110 struct bio *bio;
111 struct bio *next_bio;
112 int ret = 0;
113
114 if (bio_chain == NULL)
115 return 0;
116
117 bio = *bio_chain;
118 if (bio == NULL)
119 return 0;
120 while (bio) {
121 struct page *page;
122
123 next_bio = bio->bi_private;
124 page = bio->bi_io_vec[0].bv_page;
125 wait_on_page_locked(page);
126 if (!PageUptodate(page) || PageError(page))
127 ret = -EIO;
128 put_page(page);
129 bio_put(bio);
130 bio = next_bio;
131 }
132 *bio_chain = NULL;
133 return ret;
134}
135
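The @bio_chain argument threaded through the helpers above turns each page write into a fire-and-forget submission that is reaped later by wait_on_bio_chain(). Typical write-path usage looks roughly like the sketch below (assuming a swap_map_handle named handle already set up by get_swap_writer(); more_pages_to_write and next_image_page are placeholders):

        struct bio *bio_chain = NULL;
        int error = 0;

        while (more_pages_to_write() && !error)         /* placeholder loop condition */
                error = swap_write_page(&handle, next_image_page(), &bio_chain);

        if (!error)
                error = wait_on_bio_chain(&bio_chain);  /* reap everything queued above */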
136/*
137 * Saving part
138 */
48 139
49static int mark_swapfiles(swp_entry_t start) 140static int mark_swapfiles(sector_t start)
50{ 141{
51 int error; 142 int error;
52 143
53 rw_swap_page_sync(READ, swp_entry(root_swap, 0), 144 bio_read_page(swsusp_resume_block, &swsusp_header, NULL);
54 virt_to_page((unsigned long)&swsusp_header), NULL);
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 145 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 146 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 147 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 148 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start; 149 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), 150 error = bio_write_page(swsusp_resume_block,
61 virt_to_page((unsigned long)&swsusp_header), 151 &swsusp_header, NULL);
62 NULL);
63 } else { 152 } else {
64 pr_debug("swsusp: Partition is not swap space.\n"); 153 printk(KERN_ERR "swsusp: Swap header not found!\n");
65 error = -ENODEV; 154 error = -ENODEV;
66 } 155 }
67 return error; 156 return error;
@@ -74,12 +163,21 @@ static int mark_swapfiles(swp_entry_t start)
74 163
75static int swsusp_swap_check(void) /* This is called before saving image */ 164static int swsusp_swap_check(void) /* This is called before saving image */
76{ 165{
77 int res = swap_type_of(swsusp_resume_device); 166 int res;
167
168 res = swap_type_of(swsusp_resume_device, swsusp_resume_block);
169 if (res < 0)
170 return res;
171
172 root_swap = res;
173 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_WRITE);
174 if (IS_ERR(resume_bdev))
175 return PTR_ERR(resume_bdev);
176
177 res = set_blocksize(resume_bdev, PAGE_SIZE);
178 if (res < 0)
179 blkdev_put(resume_bdev);
78 180
79 if (res >= 0) {
80 root_swap = res;
81 return 0;
82 }
83 return res; 181 return res;
84} 182}
85 183
@@ -90,36 +188,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */
90 * @bio_chain: Link the next write BIO here 188 * @bio_chain: Link the next write BIO here
91 */ 189 */
92 190
93static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) 191static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
94{ 192{
95 swp_entry_t entry; 193 void *src;
96 int error = -ENOSPC; 194
97 195 if (!offset)
98 if (offset) { 196 return -ENOSPC;
99 struct page *page = virt_to_page(buf); 197
100 198 if (bio_chain) {
101 if (bio_chain) { 199 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
102 /* 200 if (src) {
103 * Whether or not we successfully allocated a copy page, 201 memcpy(src, buf, PAGE_SIZE);
104 * we take a ref on the page here. It gets undone in 202 } else {
105 * wait_on_bio_chain(). 203 WARN_ON_ONCE(1);
106 */ 204 bio_chain = NULL; /* Go synchronous */
107 struct page *page_copy; 205 src = buf;
108 page_copy = alloc_page(GFP_ATOMIC);
109 if (page_copy == NULL) {
110 WARN_ON_ONCE(1);
111 bio_chain = NULL; /* Go synchronous */
112 get_page(page);
113 } else {
114 memcpy(page_address(page_copy),
115 page_address(page), PAGE_SIZE);
116 page = page_copy;
117 }
118 } 206 }
119 entry = swp_entry(root_swap, offset); 207 } else {
120 error = rw_swap_page_sync(WRITE, entry, page, bio_chain); 208 src = buf;
121 } 209 }
122 return error; 210 return bio_write_page(offset, src, bio_chain);
123} 211}
124 212
125/* 213/*
@@ -137,11 +225,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
137 * at a time. 225 * at a time.
138 */ 226 */
139 227
140#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) 228#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
141 229
142struct swap_map_page { 230struct swap_map_page {
143 unsigned long entries[MAP_PAGE_ENTRIES]; 231 sector_t entries[MAP_PAGE_ENTRIES];
144 unsigned long next_swap; 232 sector_t next_swap;
145}; 233};
146 234
147/** 235/**
@@ -151,7 +239,7 @@ struct swap_map_page {
151 239
152struct swap_map_handle { 240struct swap_map_handle {
153 struct swap_map_page *cur; 241 struct swap_map_page *cur;
154 unsigned long cur_swap; 242 sector_t cur_swap;
155 struct bitmap_page *bitmap; 243 struct bitmap_page *bitmap;
156 unsigned int k; 244 unsigned int k;
157}; 245};
@@ -166,26 +254,6 @@ static void release_swap_writer(struct swap_map_handle *handle)
166 handle->bitmap = NULL; 254 handle->bitmap = NULL;
167} 255}
168 256
169static void show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
185 centisecs / 100, centisecs % 100,
186 kps / 1000, (kps % 1000) / 10);
187}
188
189static int get_swap_writer(struct swap_map_handle *handle) 257static int get_swap_writer(struct swap_map_handle *handle)
190{ 258{
191 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 259 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -196,7 +264,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
196 release_swap_writer(handle); 264 release_swap_writer(handle);
197 return -ENOMEM; 265 return -ENOMEM;
198 } 266 }
199 handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); 267 handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap);
200 if (!handle->cur_swap) { 268 if (!handle->cur_swap) {
201 release_swap_writer(handle); 269 release_swap_writer(handle);
202 return -ENOSPC; 270 return -ENOSPC;
@@ -205,43 +273,15 @@ static int get_swap_writer(struct swap_map_handle *handle)
205 return 0; 273 return 0;
206} 274}
207 275
208static int wait_on_bio_chain(struct bio **bio_chain)
209{
210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235
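
wait_on_bio_chain(), removed from this spot (swap_write_page() below still calls it, so the helper presumably survives elsewhere in the file), drains a list of in-flight bios linked through bi_private and reports the first error it finds. The same "chain pending requests, then sweep the chain and collect one error" shape in plain C, with a hypothetical request type rather than the bio API:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical async request, chained the way the bios are via bi_private. */
struct request {
    struct request *next;
    int failed;
};

static int wait_on_chain(struct request **chain)
{
    int ret = 0;

    if (!chain)
        return 0;

    for (struct request *req = *chain; req; ) {
        struct request *next = req->next;

        if (req->failed)            /* first error wins, keep draining */
            ret = -EIO;
        free(req);
        req = next;
    }
    *chain = NULL;
    return ret;
}

int main(void)
{
    struct request *chain = NULL;

    for (int i = 0; i < 3; i++) {
        struct request *req = calloc(1, sizeof(*req));
        req->failed = (i == 1);         /* pretend one request failed */
        req->next = chain;              /* push onto the chain */
        chain = req;
    }
    printf("chain result: %d\n", wait_on_chain(&chain));
    return 0;
}
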
236static int swap_write_page(struct swap_map_handle *handle, void *buf, 276static int swap_write_page(struct swap_map_handle *handle, void *buf,
237 struct bio **bio_chain) 277 struct bio **bio_chain)
238{ 278{
239 int error = 0; 279 int error = 0;
240 unsigned long offset; 280 sector_t offset;
241 281
242 if (!handle->cur) 282 if (!handle->cur)
243 return -EINVAL; 283 return -EINVAL;
244 offset = alloc_swap_page(root_swap, handle->bitmap); 284 offset = alloc_swapdev_block(root_swap, handle->bitmap);
245 error = write_page(buf, offset, bio_chain); 285 error = write_page(buf, offset, bio_chain);
246 if (error) 286 if (error)
247 return error; 287 return error;
@@ -250,7 +290,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
250 error = wait_on_bio_chain(bio_chain); 290 error = wait_on_bio_chain(bio_chain);
251 if (error) 291 if (error)
252 goto out; 292 goto out;
253 offset = alloc_swap_page(root_swap, handle->bitmap); 293 offset = alloc_swapdev_block(root_swap, handle->bitmap);
254 if (!offset) 294 if (!offset)
255 return -ENOSPC; 295 return -ENOSPC;
256 handle->cur->next_swap = offset; 296 handle->cur->next_swap = offset;
@@ -261,7 +301,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
261 handle->cur_swap = offset; 301 handle->cur_swap = offset;
262 handle->k = 0; 302 handle->k = 0;
263 } 303 }
264out: 304 out:
265 return error; 305 return error;
266} 306}
267 307
@@ -315,7 +355,7 @@ static int save_image(struct swap_map_handle *handle,
315 error = err2; 355 error = err2;
316 if (!error) 356 if (!error)
317 printk("\b\b\b\bdone\n"); 357 printk("\b\b\b\bdone\n");
318 show_speed(&start, &stop, nr_to_write, "Wrote"); 358 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
319 return error; 359 return error;
320} 360}
321 361
@@ -350,100 +390,50 @@ int swsusp_write(void)
350 struct swsusp_info *header; 390 struct swsusp_info *header;
351 int error; 391 int error;
352 392
353 if ((error = swsusp_swap_check())) { 393 error = swsusp_swap_check();
394 if (error) {
354 printk(KERN_ERR "swsusp: Cannot find swap device, try " 395 printk(KERN_ERR "swsusp: Cannot find swap device, try "
355 "swapon -a.\n"); 396 "swapon -a.\n");
356 return error; 397 return error;
357 } 398 }
358 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 399 memset(&snapshot, 0, sizeof(struct snapshot_handle));
359 error = snapshot_read_next(&snapshot, PAGE_SIZE); 400 error = snapshot_read_next(&snapshot, PAGE_SIZE);
360 if (error < PAGE_SIZE) 401 if (error < PAGE_SIZE) {
361 return error < 0 ? error : -EFAULT; 402 if (error >= 0)
403 error = -EFAULT;
404
405 goto out;
406 }
362 header = (struct swsusp_info *)data_of(snapshot); 407 header = (struct swsusp_info *)data_of(snapshot);
363 if (!enough_swap(header->pages)) { 408 if (!enough_swap(header->pages)) {
364 printk(KERN_ERR "swsusp: Not enough free swap\n"); 409 printk(KERN_ERR "swsusp: Not enough free swap\n");
365 return -ENOSPC; 410 error = -ENOSPC;
411 goto out;
366 } 412 }
367 error = get_swap_writer(&handle); 413 error = get_swap_writer(&handle);
368 if (!error) { 414 if (!error) {
369 unsigned long start = handle.cur_swap; 415 sector_t start = handle.cur_swap;
416
370 error = swap_write_page(&handle, header, NULL); 417 error = swap_write_page(&handle, header, NULL);
371 if (!error) 418 if (!error)
372 error = save_image(&handle, &snapshot, 419 error = save_image(&handle, &snapshot,
373 header->pages - 1); 420 header->pages - 1);
421
374 if (!error) { 422 if (!error) {
375 flush_swap_writer(&handle); 423 flush_swap_writer(&handle);
376 printk("S"); 424 printk("S");
377 error = mark_swapfiles(swp_entry(root_swap, start)); 425 error = mark_swapfiles(start);
378 printk("|\n"); 426 printk("|\n");
379 } 427 }
380 } 428 }
381 if (error) 429 if (error)
382 free_all_swap_pages(root_swap, handle.bitmap); 430 free_all_swap_pages(root_swap, handle.bitmap);
383 release_swap_writer(&handle); 431 release_swap_writer(&handle);
432 out:
433 swsusp_close();
384 return error; 434 return error;
385} 435}
386 436
387static struct block_device *resume_bdev;
388
389/**
390 * submit - submit BIO request.
391 * @rw: READ or WRITE.
392 * @off physical offset of page.
393 * @page: page we're reading or writing.
394 * @bio_chain: list of pending biod (for async reading)
395 *
396 * Straight from the textbook - allocate and initialize the bio.
397 * If we're reading, make sure the page is marked as dirty.
398 * Then submit it and, if @bio_chain == NULL, wait.
399 */
400static int submit(int rw, pgoff_t page_off, struct page *page,
401 struct bio **bio_chain)
402{
403 struct bio *bio;
404
405 bio = bio_alloc(GFP_ATOMIC, 1);
406 if (!bio)
407 return -ENOMEM;
408 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
409 bio->bi_bdev = resume_bdev;
410 bio->bi_end_io = end_swap_bio_read;
411
412 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
413 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
414 bio_put(bio);
415 return -EFAULT;
416 }
417
418 lock_page(page);
419 bio_get(bio);
420
421 if (bio_chain == NULL) {
422 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
423 wait_on_page_locked(page);
424 if (rw == READ)
425 bio_set_pages_dirty(bio);
426 bio_put(bio);
427 } else {
428 if (rw == READ)
429 get_page(page); /* These pages are freed later */
430 bio->bi_private = *bio_chain;
431 *bio_chain = bio;
432 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
433 }
434 return 0;
435}
436
437static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
438{
439 return submit(READ, page_off, virt_to_page(addr), bio_chain);
440}
441
442static int bio_write_page(pgoff_t page_off, void *addr)
443{
444 return submit(WRITE, page_off, virt_to_page(addr), NULL);
445}
446
447/** 437/**
448 * The following functions allow us to read data using a swap map 438 * The following functions allow us to read data using a swap map
449 * in a file-alike way 439 * in a file-alike way
@@ -456,17 +446,18 @@ static void release_swap_reader(struct swap_map_handle *handle)
456 handle->cur = NULL; 446 handle->cur = NULL;
457} 447}
458 448
459static int get_swap_reader(struct swap_map_handle *handle, 449static int get_swap_reader(struct swap_map_handle *handle, sector_t start)
460 swp_entry_t start)
461{ 450{
462 int error; 451 int error;
463 452
464 if (!swp_offset(start)) 453 if (!start)
465 return -EINVAL; 454 return -EINVAL;
466 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 455
456 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
467 if (!handle->cur) 457 if (!handle->cur)
468 return -ENOMEM; 458 return -ENOMEM;
469 error = bio_read_page(swp_offset(start), handle->cur, NULL); 459
460 error = bio_read_page(start, handle->cur, NULL);
470 if (error) { 461 if (error) {
471 release_swap_reader(handle); 462 release_swap_reader(handle);
472 return error; 463 return error;
@@ -478,7 +469,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
478static int swap_read_page(struct swap_map_handle *handle, void *buf, 469static int swap_read_page(struct swap_map_handle *handle, void *buf,
479 struct bio **bio_chain) 470 struct bio **bio_chain)
480{ 471{
481 unsigned long offset; 472 sector_t offset;
482 int error; 473 int error;
483 474
484 if (!handle->cur) 475 if (!handle->cur)
@@ -547,11 +538,11 @@ static int load_image(struct swap_map_handle *handle,
547 error = err2; 538 error = err2;
548 if (!error) { 539 if (!error) {
549 printk("\b\b\b\bdone\n"); 540 printk("\b\b\b\bdone\n");
550 snapshot_free_unused_memory(snapshot); 541 snapshot_write_finalize(snapshot);
551 if (!snapshot_image_loaded(snapshot)) 542 if (!snapshot_image_loaded(snapshot))
552 error = -ENODATA; 543 error = -ENODATA;
553 } 544 }
554 show_speed(&start, &stop, nr_to_read, "Read"); 545 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
555 return error; 546 return error;
556} 547}
557 548
@@ -600,12 +591,16 @@ int swsusp_check(void)
600 if (!IS_ERR(resume_bdev)) { 591 if (!IS_ERR(resume_bdev)) {
601 set_blocksize(resume_bdev, PAGE_SIZE); 592 set_blocksize(resume_bdev, PAGE_SIZE);
602 memset(&swsusp_header, 0, sizeof(swsusp_header)); 593 memset(&swsusp_header, 0, sizeof(swsusp_header));
603 if ((error = bio_read_page(0, &swsusp_header, NULL))) 594 error = bio_read_page(swsusp_resume_block,
595 &swsusp_header, NULL);
596 if (error)
604 return error; 597 return error;
598
605 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 599 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
606 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 600 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
607 /* Reset swap signature now */ 601 /* Reset swap signature now */
608 error = bio_write_page(0, &swsusp_header); 602 error = bio_write_page(swsusp_resume_block,
603 &swsusp_header, NULL);
609 } else { 604 } else {
610 return -EINVAL; 605 return -EINVAL;
611 } 606 }
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 0b66659dc516..31aa0390c777 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -49,6 +49,7 @@
49#include <linux/bootmem.h> 49#include <linux/bootmem.h>
50#include <linux/syscalls.h> 50#include <linux/syscalls.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/time.h>
52 53
53#include "power.h" 54#include "power.h"
54 55
@@ -64,10 +65,8 @@ int in_suspend __nosavedata = 0;
64 65
65#ifdef CONFIG_HIGHMEM 66#ifdef CONFIG_HIGHMEM
66unsigned int count_highmem_pages(void); 67unsigned int count_highmem_pages(void);
67int save_highmem(void);
68int restore_highmem(void); 68int restore_highmem(void);
69#else 69#else
70static inline int save_highmem(void) { return 0; }
71static inline int restore_highmem(void) { return 0; } 70static inline int restore_highmem(void) { return 0; }
72static inline unsigned int count_highmem_pages(void) { return 0; } 71static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif 72#endif
@@ -134,18 +133,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
134 return 0; 133 return 0;
135} 134}
136 135
137unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) 136sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
138{ 137{
139 unsigned long offset; 138 unsigned long offset;
140 139
141 offset = swp_offset(get_swap_page_of_type(swap)); 140 offset = swp_offset(get_swap_page_of_type(swap));
142 if (offset) { 141 if (offset) {
143 if (bitmap_set(bitmap, offset)) { 142 if (bitmap_set(bitmap, offset))
144 swap_free(swp_entry(swap, offset)); 143 swap_free(swp_entry(swap, offset));
145 offset = 0; 144 else
146 } 145 return swapdev_block(swap, offset);
147 } 146 }
148 return offset; 147 return 0;
149} 148}
150 149
151void free_all_swap_pages(int swap, struct bitmap_page *bitmap) 150void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
@@ -166,6 +165,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
166} 165}
167 166
168/** 167/**
168 * swsusp_show_speed - print the time elapsed between two events represented by
169 * @start and @stop
170 *
171 * @nr_pages - number of pages processed between @start and @stop
172 * @msg - introductory message to print
173 */
174
175void swsusp_show_speed(struct timeval *start, struct timeval *stop,
176 unsigned nr_pages, char *msg)
177{
178 s64 elapsed_centisecs64;
179 int centisecs;
180 int k;
181 int kps;
182
183 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
184 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
185 centisecs = elapsed_centisecs64;
186 if (centisecs == 0)
187 centisecs = 1; /* avoid div-by-zero */
188 k = nr_pages * (PAGE_SIZE / 1024);
189 kps = (k * 100) / centisecs;
190 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
191 centisecs / 100, centisecs % 100,
192 kps / 1000, (kps % 1000) / 10);
193}
194
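
swsusp_show_speed() reduces the elapsed time to centiseconds and derives KB/s from it; for example, 50,000 pages in 2.5 s is 200,000 KB at 80,000 KB/s, printed as 80.00 MB/s. The same arithmetic reproduced standalone with those hypothetical numbers:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE    4096
#define NSEC_PER_SEC 1000000000LL

int main(void)
{
    int64_t elapsed_ns = 2500000000LL;              /* 2.5 s for 50000 pages */
    unsigned nr_pages = 50000;

    int centisecs = elapsed_ns / (NSEC_PER_SEC / 100);
    if (centisecs == 0)
        centisecs = 1;                              /* avoid div-by-zero */
    int k = nr_pages * (PAGE_SIZE / 1024);          /* kilobytes moved */
    int kps = (k * 100) / centisecs;                /* KB per second */

    printf("Wrote %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
           k, centisecs / 100, centisecs % 100,
           kps / 1000, (kps % 1000) / 10);
    return 0;
}
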
195/**
169 * swsusp_shrink_memory - Try to free as much memory as needed 196 * swsusp_shrink_memory - Try to free as much memory as needed
170 * 197 *
171 * ... but do not OOM-kill anyone 198 * ... but do not OOM-kill anyone
@@ -184,23 +211,37 @@ static inline unsigned long __shrink_memory(long tmp)
184 211
185int swsusp_shrink_memory(void) 212int swsusp_shrink_memory(void)
186{ 213{
187 long size, tmp; 214 long tmp;
188 struct zone *zone; 215 struct zone *zone;
189 unsigned long pages = 0; 216 unsigned long pages = 0;
190 unsigned int i = 0; 217 unsigned int i = 0;
191 char *p = "-\\|/"; 218 char *p = "-\\|/";
219 struct timeval start, stop;
192 220
193 printk("Shrinking memory... "); 221 printk("Shrinking memory... ");
222 do_gettimeofday(&start);
194 do { 223 do {
195 size = 2 * count_highmem_pages(); 224 long size, highmem_size;
196 size += size / 50 + count_data_pages() + PAGES_FOR_IO; 225
226 highmem_size = count_highmem_pages();
227 size = count_data_pages() + PAGES_FOR_IO;
197 tmp = size; 228 tmp = size;
229 size += highmem_size;
198 for_each_zone (zone) 230 for_each_zone (zone)
199 if (!is_highmem(zone) && populated_zone(zone)) { 231 if (populated_zone(zone)) {
200 tmp -= zone->free_pages; 232 if (is_highmem(zone)) {
201 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 233 highmem_size -= zone->free_pages;
202 tmp += snapshot_additional_pages(zone); 234 } else {
235 tmp -= zone->free_pages;
236 tmp += zone->lowmem_reserve[ZONE_NORMAL];
237 tmp += snapshot_additional_pages(zone);
238 }
203 } 239 }
240
241 if (highmem_size < 0)
242 highmem_size = 0;
243
244 tmp += highmem_size;
204 if (tmp > 0) { 245 if (tmp > 0) {
205 tmp = __shrink_memory(tmp); 246 tmp = __shrink_memory(tmp);
206 if (!tmp) 247 if (!tmp)
@@ -212,7 +253,9 @@ int swsusp_shrink_memory(void)
212 } 253 }
213 printk("\b%c", p[i++%4]); 254 printk("\b%c", p[i++%4]);
214 } while (tmp > 0); 255 } while (tmp > 0);
256 do_gettimeofday(&stop);
215 printk("\bdone (%lu pages freed)\n", pages); 257 printk("\bdone (%lu pages freed)\n", pages);
258 swsusp_show_speed(&start, &stop, pages, "Freed");
216 259
217 return 0; 260 return 0;
218} 261}
@@ -223,6 +266,7 @@ int swsusp_suspend(void)
223 266
224 if ((error = arch_prepare_suspend())) 267 if ((error = arch_prepare_suspend()))
225 return error; 268 return error;
269
226 local_irq_disable(); 270 local_irq_disable();
227 /* At this point, device_suspend() has been called, but *not* 271 /* At this point, device_suspend() has been called, but *not*
228 * device_power_down(). We *must* device_power_down() now. 272 * device_power_down(). We *must* device_power_down() now.
@@ -235,23 +279,16 @@ int swsusp_suspend(void)
235 goto Enable_irqs; 279 goto Enable_irqs;
236 } 280 }
237 281
238 if ((error = save_highmem())) {
239 printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
240 goto Restore_highmem;
241 }
242
243 save_processor_state(); 282 save_processor_state();
244 if ((error = swsusp_arch_suspend())) 283 if ((error = swsusp_arch_suspend()))
245 printk(KERN_ERR "Error %d suspending\n", error); 284 printk(KERN_ERR "Error %d suspending\n", error);
246 /* Restore control flow magically appears here */ 285 /* Restore control flow magically appears here */
247 restore_processor_state(); 286 restore_processor_state();
248Restore_highmem:
249 restore_highmem();
250 /* NOTE: device_power_up() is just a resume() for devices 287 /* NOTE: device_power_up() is just a resume() for devices
251 * that suspended with irqs off ... no overall powerup. 288 * that suspended with irqs off ... no overall powerup.
252 */ 289 */
253 device_power_up(); 290 device_power_up();
254Enable_irqs: 291 Enable_irqs:
255 local_irq_enable(); 292 local_irq_enable();
256 return error; 293 return error;
257} 294}
@@ -268,18 +305,23 @@ int swsusp_resume(void)
268 printk(KERN_ERR "Some devices failed to power down, very bad\n"); 305 printk(KERN_ERR "Some devices failed to power down, very bad\n");
269 /* We'll ignore saved state, but this gets preempt count (etc) right */ 306 /* We'll ignore saved state, but this gets preempt count (etc) right */
270 save_processor_state(); 307 save_processor_state();
271 error = swsusp_arch_resume(); 308 error = restore_highmem();
272 /* Code below is only ever reached in case of failure. Otherwise 309 if (!error) {
273 * execution continues at place where swsusp_arch_suspend was called 310 error = swsusp_arch_resume();
274 */ 311 /* The code below is only ever reached in case of a failure.
275 BUG_ON(!error); 312 * Otherwise execution continues at place where
313 * swsusp_arch_suspend() was called
314 */
315 BUG_ON(!error);
316 /* This call to restore_highmem() undos the previous one */
317 restore_highmem();
318 }
276 /* The only reason why swsusp_arch_resume() can fail is memory being 319 /* The only reason why swsusp_arch_resume() can fail is memory being
277 * very tight, so we have to free it as soon as we can to avoid 320 * very tight, so we have to free it as soon as we can to avoid
278 * subsequent failures 321 * subsequent failures
279 */ 322 */
280 swsusp_free(); 323 swsusp_free();
281 restore_processor_state(); 324 restore_processor_state();
282 restore_highmem();
283 touch_softlockup_watchdog(); 325 touch_softlockup_watchdog();
284 device_power_up(); 326 device_power_up();
285 local_irq_enable(); 327 local_irq_enable();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index d991d3b0e5a4..89443b85163b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/reboot.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/device.h> 16#include <linux/device.h>
16#include <linux/miscdevice.h> 17#include <linux/miscdevice.h>
@@ -21,6 +22,7 @@
21#include <linux/fs.h> 22#include <linux/fs.h>
22#include <linux/console.h> 23#include <linux/console.h>
23#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h>
24 26
25#include <asm/uaccess.h> 27#include <asm/uaccess.h>
26 28
@@ -54,7 +56,8 @@ static int snapshot_open(struct inode *inode, struct file *filp)
54 filp->private_data = data; 56 filp->private_data = data;
55 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 57 memset(&data->handle, 0, sizeof(struct snapshot_handle));
56 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { 58 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
57 data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; 59 data->swap = swsusp_resume_device ?
60 swap_type_of(swsusp_resume_device, 0) : -1;
58 data->mode = O_RDONLY; 61 data->mode = O_RDONLY;
59 } else { 62 } else {
60 data->swap = -1; 63 data->swap = -1;
@@ -76,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
76 free_all_swap_pages(data->swap, data->bitmap); 79 free_all_swap_pages(data->swap, data->bitmap);
77 free_bitmap(data->bitmap); 80 free_bitmap(data->bitmap);
78 if (data->frozen) { 81 if (data->frozen) {
79 down(&pm_sem); 82 mutex_lock(&pm_mutex);
80 thaw_processes(); 83 thaw_processes();
81 enable_nonboot_cpus(); 84 enable_nonboot_cpus();
82 up(&pm_sem); 85 mutex_unlock(&pm_mutex);
83 } 86 }
84 atomic_inc(&device_available); 87 atomic_inc(&device_available);
85 return 0; 88 return 0;
@@ -124,7 +127,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
124{ 127{
125 int error = 0; 128 int error = 0;
126 struct snapshot_data *data; 129 struct snapshot_data *data;
127 loff_t offset, avail; 130 loff_t avail;
131 sector_t offset;
128 132
129 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) 133 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
130 return -ENOTTY; 134 return -ENOTTY;
@@ -140,7 +144,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
140 case SNAPSHOT_FREEZE: 144 case SNAPSHOT_FREEZE:
141 if (data->frozen) 145 if (data->frozen)
142 break; 146 break;
143 down(&pm_sem); 147 mutex_lock(&pm_mutex);
144 error = disable_nonboot_cpus(); 148 error = disable_nonboot_cpus();
145 if (!error) { 149 if (!error) {
146 error = freeze_processes(); 150 error = freeze_processes();
@@ -150,7 +154,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
150 error = -EBUSY; 154 error = -EBUSY;
151 } 155 }
152 } 156 }
153 up(&pm_sem); 157 mutex_unlock(&pm_mutex);
154 if (!error) 158 if (!error)
155 data->frozen = 1; 159 data->frozen = 1;
156 break; 160 break;
@@ -158,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
158 case SNAPSHOT_UNFREEZE: 162 case SNAPSHOT_UNFREEZE:
159 if (!data->frozen) 163 if (!data->frozen)
160 break; 164 break;
161 down(&pm_sem); 165 mutex_lock(&pm_mutex);
162 thaw_processes(); 166 thaw_processes();
163 enable_nonboot_cpus(); 167 enable_nonboot_cpus();
164 up(&pm_sem); 168 mutex_unlock(&pm_mutex);
165 data->frozen = 0; 169 data->frozen = 0;
166 break; 170 break;
167 171
@@ -170,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
170 error = -EPERM; 174 error = -EPERM;
171 break; 175 break;
172 } 176 }
173 down(&pm_sem); 177 mutex_lock(&pm_mutex);
174 /* Free memory before shutting down devices. */ 178 /* Free memory before shutting down devices. */
175 error = swsusp_shrink_memory(); 179 error = swsusp_shrink_memory();
176 if (!error) { 180 if (!error) {
@@ -183,7 +187,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
183 } 187 }
184 resume_console(); 188 resume_console();
185 } 189 }
186 up(&pm_sem); 190 mutex_unlock(&pm_mutex);
187 if (!error) 191 if (!error)
188 error = put_user(in_suspend, (unsigned int __user *)arg); 192 error = put_user(in_suspend, (unsigned int __user *)arg);
189 if (!error) 193 if (!error)
@@ -191,13 +195,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
191 break; 195 break;
192 196
193 case SNAPSHOT_ATOMIC_RESTORE: 197 case SNAPSHOT_ATOMIC_RESTORE:
198 snapshot_write_finalize(&data->handle);
194 if (data->mode != O_WRONLY || !data->frozen || 199 if (data->mode != O_WRONLY || !data->frozen ||
195 !snapshot_image_loaded(&data->handle)) { 200 !snapshot_image_loaded(&data->handle)) {
196 error = -EPERM; 201 error = -EPERM;
197 break; 202 break;
198 } 203 }
199 snapshot_free_unused_memory(&data->handle); 204 mutex_lock(&pm_mutex);
200 down(&pm_sem);
201 pm_prepare_console(); 205 pm_prepare_console();
202 suspend_console(); 206 suspend_console();
203 error = device_suspend(PMSG_PRETHAW); 207 error = device_suspend(PMSG_PRETHAW);
@@ -207,7 +211,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
207 } 211 }
208 resume_console(); 212 resume_console();
209 pm_restore_console(); 213 pm_restore_console();
210 up(&pm_sem); 214 mutex_unlock(&pm_mutex);
211 break; 215 break;
212 216
213 case SNAPSHOT_FREE: 217 case SNAPSHOT_FREE:
@@ -238,10 +242,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
238 break; 242 break;
239 } 243 }
240 } 244 }
241 offset = alloc_swap_page(data->swap, data->bitmap); 245 offset = alloc_swapdev_block(data->swap, data->bitmap);
242 if (offset) { 246 if (offset) {
243 offset <<= PAGE_SHIFT; 247 offset <<= PAGE_SHIFT;
244 error = put_user(offset, (loff_t __user *)arg); 248 error = put_user(offset, (sector_t __user *)arg);
245 } else { 249 } else {
246 error = -ENOSPC; 250 error = -ENOSPC;
247 } 251 }
@@ -264,7 +268,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
264 * so we need to recode them 268 * so we need to recode them
265 */ 269 */
266 if (old_decode_dev(arg)) { 270 if (old_decode_dev(arg)) {
267 data->swap = swap_type_of(old_decode_dev(arg)); 271 data->swap = swap_type_of(old_decode_dev(arg), 0);
268 if (data->swap < 0) 272 if (data->swap < 0)
269 error = -ENODEV; 273 error = -ENODEV;
270 } else { 274 } else {
@@ -282,7 +286,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
282 break; 286 break;
283 } 287 }
284 288
285 if (down_trylock(&pm_sem)) { 289 if (!mutex_trylock(&pm_mutex)) {
286 error = -EBUSY; 290 error = -EBUSY;
287 break; 291 break;
288 } 292 }

@@ -309,8 +313,66 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
309 if (pm_ops->finish) 313 if (pm_ops->finish)
310 pm_ops->finish(PM_SUSPEND_MEM); 314 pm_ops->finish(PM_SUSPEND_MEM);
311 315
312OutS3: 316 OutS3:
313 up(&pm_sem); 317 mutex_unlock(&pm_mutex);
318 break;
319
320 case SNAPSHOT_PMOPS:
321 switch (arg) {
322
323 case PMOPS_PREPARE:
324 if (pm_ops->prepare) {
325 error = pm_ops->prepare(PM_SUSPEND_DISK);
326 }
327 break;
328
329 case PMOPS_ENTER:
330 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
331 error = pm_ops->enter(PM_SUSPEND_DISK);
332 break;
333
334 case PMOPS_FINISH:
335 if (pm_ops && pm_ops->finish) {
336 pm_ops->finish(PM_SUSPEND_DISK);
337 }
338 break;
339
340 default:
341 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
342 error = -EINVAL;
343
344 }
345 break;
346
347 case SNAPSHOT_SET_SWAP_AREA:
348 if (data->bitmap) {
349 error = -EPERM;
350 } else {
351 struct resume_swap_area swap_area;
352 dev_t swdev;
353
354 error = copy_from_user(&swap_area, (void __user *)arg,
355 sizeof(struct resume_swap_area));
356 if (error) {
357 error = -EFAULT;
358 break;
359 }
360
361 /*
362 * User space encodes device types as two-byte values,
363 * so we need to recode them
364 */
365 swdev = old_decode_dev(swap_area.dev);
366 if (swdev) {
367 offset = swap_area.offset;
368 data->swap = swap_type_of(swdev, offset);
369 if (data->swap < 0)
370 error = -ENODEV;
371 } else {
372 data->swap = -1;
373 error = -EINVAL;
374 }
375 }
314 break; 376 break;
315 377
316 default: 378 default:
@@ -321,7 +383,7 @@ OutS3:
321 return error; 383 return error;
322} 384}
323 385
324static struct file_operations snapshot_fops = { 386static const struct file_operations snapshot_fops = {
325 .open = snapshot_open, 387 .open = snapshot_open,
326 .release = snapshot_release, 388 .release = snapshot_release,
327 .read = snapshot_read, 389 .read = snapshot_read,
diff --git a/kernel/printk.c b/kernel/printk.c
index f7d427ef5038..185bb45eacf7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/bootmem.h> 32#include <linux/bootmem.h>
33#include <linux/syscalls.h> 33#include <linux/syscalls.h>
34#include <linux/jiffies.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
@@ -52,8 +53,6 @@ int console_printk[4] = {
52 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 53 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
53}; 54};
54 55
55EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
56
57/* 56/*
58 * Low lever drivers may need that to know if they can schedule in 57 * Low lever drivers may need that to know if they can schedule in
59 * their unblank() callback or not. So let's export it. 58 * their unblank() callback or not. So let's export it.
@@ -334,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
334 } 333 }
335} 334}
336 335
336static int __read_mostly ignore_loglevel;
337
338int __init ignore_loglevel_setup(char *str)
339{
340 ignore_loglevel = 1;
341 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
342
343 return 1;
344}
345
346__setup("ignore_loglevel", ignore_loglevel_setup);
347
337/* 348/*
338 * Write out chars from start to end - 1 inclusive 349 * Write out chars from start to end - 1 inclusive
339 */ 350 */
340static void _call_console_drivers(unsigned long start, 351static void _call_console_drivers(unsigned long start,
341 unsigned long end, int msg_log_level) 352 unsigned long end, int msg_log_level)
342{ 353{
343 if (msg_log_level < console_loglevel && 354 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
344 console_drivers && start != end) { 355 console_drivers && start != end) {
345 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { 356 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
346 /* wrapped write */ 357 /* wrapped write */
@@ -630,12 +641,7 @@ EXPORT_SYMBOL(vprintk);
630 641
631asmlinkage long sys_syslog(int type, char __user *buf, int len) 642asmlinkage long sys_syslog(int type, char __user *buf, int len)
632{ 643{
633 return 0; 644 return -ENOSYS;
634}
635
636int do_syslog(int type, char __user *buf, int len)
637{
638 return 0;
639} 645}
640 646
641static void call_console_drivers(unsigned long start, unsigned long end) 647static void call_console_drivers(unsigned long start, unsigned long end)
@@ -776,7 +782,6 @@ int is_console_locked(void)
776{ 782{
777 return console_locked; 783 return console_locked;
778} 784}
779EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
780 785
781/** 786/**
782 * release_console_sem - unlock the console system 787 * release_console_sem - unlock the console system
@@ -1101,3 +1106,23 @@ int printk_ratelimit(void)
1101 printk_ratelimit_burst); 1106 printk_ratelimit_burst);
1102} 1107}
1103EXPORT_SYMBOL(printk_ratelimit); 1108EXPORT_SYMBOL(printk_ratelimit);
1109
1110/**
1111 * printk_timed_ratelimit - caller-controlled printk ratelimiting
1112 * @caller_jiffies: pointer to caller's state
1113 * @interval_msecs: minimum interval between prints
1114 *
1115 * printk_timed_ratelimit() returns true if more than @interval_msecs
1116 * milliseconds have elapsed since the last time printk_timed_ratelimit()
1117 * returned true.
1118 */
1119bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1120 unsigned int interval_msecs)
1121{
1122 if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
1123 *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
1124 return true;
1125 }
1126 return false;
1127}
1128EXPORT_SYMBOL(printk_timed_ratelimit);
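
printk_timed_ratelimit() keeps its state in a caller-supplied jiffies variable: it returns true, and arms the next deadline, only when the stored deadline has passed or is still unset. A userspace sketch of the same pattern, with clock_gettime() standing in for jiffies:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Return true at most once per interval_ms; *deadline_ms is caller-owned
 * state initialised to 0, like @caller_jiffies in the kernel helper. */
static bool timed_ratelimit(long long *deadline_ms, unsigned int interval_ms)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    long long now_ms = ts.tv_sec * 1000LL + ts.tv_nsec / 1000000LL;

    if (*deadline_ms == 0 || now_ms > *deadline_ms) {
        *deadline_ms = now_ms + interval_ms;
        return true;
    }
    return false;
}

int main(void)
{
    long long state = 0;
    struct timespec nap = { 0, 100 * 1000 * 1000 };     /* 100 ms */

    for (int i = 0; i < 20; i++) {
        if (timed_ratelimit(&state, 500))
            printf("tick at iteration %d\n", i);        /* ~every 500 ms */
        nanosleep(&nap, NULL);
    }
    return 0;
}
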
diff --git a/kernel/profile.c b/kernel/profile.c
index f940b462eec9..fb5e03d57e9d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -40,7 +40,7 @@ int (*timer_hook)(struct pt_regs *) __read_mostly;
40 40
41static atomic_t *prof_buffer; 41static atomic_t *prof_buffer;
42static unsigned long prof_len, prof_shift; 42static unsigned long prof_len, prof_shift;
43static int prof_on __read_mostly; 43int prof_on __read_mostly;
44static cpumask_t prof_cpu_mask = CPU_MASK_ALL; 44static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
45#ifdef CONFIG_SMP 45#ifdef CONFIG_SMP
46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 46static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
@@ -51,9 +51,19 @@ static DEFINE_MUTEX(profile_flip_mutex);
51static int __init profile_setup(char * str) 51static int __init profile_setup(char * str)
52{ 52{
53 static char __initdata schedstr[] = "schedule"; 53 static char __initdata schedstr[] = "schedule";
54 static char __initdata sleepstr[] = "sleep";
54 int par; 55 int par;
55 56
56 if (!strncmp(str, schedstr, strlen(schedstr))) { 57 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
58 prof_on = SLEEP_PROFILING;
59 if (str[strlen(sleepstr)] == ',')
60 str += strlen(sleepstr) + 1;
61 if (get_option(&str, &par))
62 prof_shift = par;
63 printk(KERN_INFO
64 "kernel sleep profiling enabled (shift: %ld)\n",
65 prof_shift);
 66 } else if (!strncmp(str, schedstr, strlen(schedstr))) {
57 prof_on = SCHED_PROFILING; 67 prof_on = SCHED_PROFILING;
58 if (str[strlen(schedstr)] == ',') 68 if (str[strlen(schedstr)] == ',')
59 str += strlen(schedstr) + 1; 69 str += strlen(schedstr) + 1;
@@ -204,7 +214,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister);
204 * positions to which hits are accounted during short intervals (e.g. 214 * positions to which hits are accounted during short intervals (e.g.
205 * several seconds) is usually very small. Exclusion from buffer 215 * several seconds) is usually very small. Exclusion from buffer
206 * flipping is provided by interrupt disablement (note that for 216 * flipping is provided by interrupt disablement (note that for
207 * SCHED_PROFILING profile_hit() may be called from process context). 217 * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from
218 * process context).
208 * The hash function is meant to be lightweight as opposed to strong, 219 * The hash function is meant to be lightweight as opposed to strong,
209 * and was vaguely inspired by ppc64 firmware-supported inverted 220 * and was vaguely inspired by ppc64 firmware-supported inverted
210 * pagetable hash functions, but uses a full hashtable full of finite 221 * pagetable hash functions, but uses a full hashtable full of finite
@@ -257,7 +268,7 @@ static void profile_discard_flip_buffers(void)
257 mutex_unlock(&profile_flip_mutex); 268 mutex_unlock(&profile_flip_mutex);
258} 269}
259 270
260void profile_hit(int type, void *__pc) 271void profile_hits(int type, void *__pc, unsigned int nr_hits)
261{ 272{
262 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 273 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
263 int i, j, cpu; 274 int i, j, cpu;
@@ -274,21 +285,31 @@ void profile_hit(int type, void *__pc)
274 put_cpu(); 285 put_cpu();
275 return; 286 return;
276 } 287 }
288 /*
289 * We buffer the global profiler buffer into a per-CPU
290 * queue and thus reduce the number of global (and possibly
291 * NUMA-alien) accesses. The write-queue is self-coalescing:
292 */
277 local_irq_save(flags); 293 local_irq_save(flags);
278 do { 294 do {
279 for (j = 0; j < PROFILE_GRPSZ; ++j) { 295 for (j = 0; j < PROFILE_GRPSZ; ++j) {
280 if (hits[i + j].pc == pc) { 296 if (hits[i + j].pc == pc) {
281 hits[i + j].hits++; 297 hits[i + j].hits += nr_hits;
282 goto out; 298 goto out;
283 } else if (!hits[i + j].hits) { 299 } else if (!hits[i + j].hits) {
284 hits[i + j].pc = pc; 300 hits[i + j].pc = pc;
285 hits[i + j].hits = 1; 301 hits[i + j].hits = nr_hits;
286 goto out; 302 goto out;
287 } 303 }
288 } 304 }
289 i = (i + secondary) & (NR_PROFILE_HIT - 1); 305 i = (i + secondary) & (NR_PROFILE_HIT - 1);
290 } while (i != primary); 306 } while (i != primary);
291 atomic_inc(&prof_buffer[pc]); 307
308 /*
309 * Add the current hit(s) and flush the write-queue out
310 * to the global buffer:
311 */
312 atomic_add(nr_hits, &prof_buffer[pc]);
292 for (i = 0; i < NR_PROFILE_HIT; ++i) { 313 for (i = 0; i < NR_PROFILE_HIT; ++i) {
293 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); 314 atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]);
294 hits[i].pc = hits[i].hits = 0; 315 hits[i].pc = hits[i].hits = 0;
@@ -298,7 +319,6 @@ out:
298 put_cpu(); 319 put_cpu();
299} 320}
300 321
301#ifdef CONFIG_HOTPLUG_CPU
302static int __devinit profile_cpu_callback(struct notifier_block *info, 322static int __devinit profile_cpu_callback(struct notifier_block *info,
303 unsigned long action, void *__cpu) 323 unsigned long action, void *__cpu)
304{ 324{
@@ -351,19 +371,19 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
351 } 371 }
352 return NOTIFY_OK; 372 return NOTIFY_OK;
353} 373}
354#endif /* CONFIG_HOTPLUG_CPU */
355#else /* !CONFIG_SMP */ 374#else /* !CONFIG_SMP */
356#define profile_flip_buffers() do { } while (0) 375#define profile_flip_buffers() do { } while (0)
357#define profile_discard_flip_buffers() do { } while (0) 376#define profile_discard_flip_buffers() do { } while (0)
377#define profile_cpu_callback NULL
358 378
359void profile_hit(int type, void *__pc) 379void profile_hits(int type, void *__pc, unsigned int nr_hits)
360{ 380{
361 unsigned long pc; 381 unsigned long pc;
362 382
363 if (prof_on != type || !prof_buffer) 383 if (prof_on != type || !prof_buffer)
364 return; 384 return;
365 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 385 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
366 atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); 386 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
367} 387}
368#endif /* !CONFIG_SMP */ 388#endif /* !CONFIG_SMP */
369 389
@@ -442,7 +462,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
442 read = 0; 462 read = 0;
443 463
444 while (p < sizeof(unsigned int) && count > 0) { 464 while (p < sizeof(unsigned int) && count > 0) {
445 put_user(*((char *)(&sample_step)+p),buf); 465 if (put_user(*((char *)(&sample_step)+p),buf))
466 return -EFAULT;
446 buf++; p++; count--; read++; 467 buf++; p++; count--; read++;
447 } 468 }
448 pnt = (char *)prof_buffer + p - sizeof(atomic_t); 469 pnt = (char *)prof_buffer + p - sizeof(atomic_t);
@@ -480,7 +501,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
480 return count; 501 return count;
481} 502}
482 503
483static struct file_operations proc_profile_operations = { 504static const struct file_operations proc_profile_operations = {
484 .read = read_profile, 505 .read = read_profile,
485 .write = write_profile, 506 .write = write_profile,
486}; 507};
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 26bb5ffe1ef1..3554b76da84c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -235,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
235 235
236 list = rdp->donelist; 236 list = rdp->donelist;
237 while (list) { 237 while (list) {
238 next = rdp->donelist = list->next; 238 next = list->next;
239 prefetch(next);
239 list->func(list); 240 list->func(list);
240 list = next; 241 list = next;
241 if (++count >= rdp->blimit) 242 if (++count >= rdp->blimit)
242 break; 243 break;
243 } 244 }
245 rdp->donelist = list;
244 246
245 local_irq_disable(); 247 local_irq_disable();
246 rdp->qlen -= count; 248 rdp->qlen -= count;
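
The rcu_do_batch() change above does two things: it prefetches the next callback before invoking the current one, and it writes rdp->donelist back once after the loop rather than on every iteration. The same loop shape in a standalone sketch, with a hypothetical callback list and __builtin_prefetch standing in for the kernel's prefetch():

#include <stdio.h>
#include <stdlib.h>

struct cb {
    struct cb *next;
    void (*func)(struct cb *);
};

static void run_batch(struct cb **head, int limit)
{
    struct cb *list = *head;
    int count = 0;

    while (list) {
        struct cb *next = list->next;

        __builtin_prefetch(next);   /* warm the cache for the next node */
        list->func(list);           /* may free 'list'; 'next' is saved */
        list = next;
        if (++count >= limit)
            break;
    }
    *head = list;                   /* single store, as in the patch */
}

static void free_cb(struct cb *c)
{
    free(c);
}

int main(void)
{
    struct cb *head = NULL;

    for (int i = 0; i < 8; i++) {
        struct cb *c = malloc(sizeof(*c));
        c->func = free_cb;
        c->next = head;
        head = c;
    }
    run_batch(&head, 4);            /* leaves 4 callbacks pending */
    printf("pending after batch: %s\n", head ? "yes" : "no");
    return 0;
}
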
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e2bda18f6f42..c52f981ea008 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -401,7 +401,7 @@ static void srcu_torture_cleanup(void)
401 cleanup_srcu_struct(&srcu_ctl); 401 cleanup_srcu_struct(&srcu_ctl);
402} 402}
403 403
404static int srcu_torture_read_lock(void) 404static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
405{ 405{
406 return srcu_read_lock(&srcu_ctl); 406 return srcu_read_lock(&srcu_ctl);
407} 407}
@@ -419,7 +419,7 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
419 schedule_timeout_interruptible(longdelay); 419 schedule_timeout_interruptible(longdelay);
420} 420}
421 421
422static void srcu_torture_read_unlock(int idx) 422static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
423{ 423{
424 srcu_read_unlock(&srcu_ctl, idx); 424 srcu_read_unlock(&srcu_ctl, idx);
425} 425}
diff --git a/kernel/relay.c b/kernel/relay.c
index f04bbdb56ac2..a4701e7ba7d0 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -138,7 +138,7 @@ depopulate:
138 */ 138 */
139struct rchan_buf *relay_create_buf(struct rchan *chan) 139struct rchan_buf *relay_create_buf(struct rchan *chan)
140{ 140{
141 struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL); 141 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
142 if (!buf) 142 if (!buf)
143 return NULL; 143 return NULL;
144 144
@@ -308,9 +308,10 @@ static struct rchan_callbacks default_channel_callbacks = {
308 * reason waking is deferred is that calling directly from write 308 * reason waking is deferred is that calling directly from write
309 * causes problems if you're writing from say the scheduler. 309 * causes problems if you're writing from say the scheduler.
310 */ 310 */
311static void wakeup_readers(void *private) 311static void wakeup_readers(struct work_struct *work)
312{ 312{
313 struct rchan_buf *buf = private; 313 struct rchan_buf *buf =
314 container_of(work, struct rchan_buf, wake_readers.work);
314 wake_up_interruptible(&buf->read_wait); 315 wake_up_interruptible(&buf->read_wait);
315} 316}
316 317
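
The new wakeup_readers() receives a struct work_struct pointer and recovers its rchan_buf with container_of(), matching the delayed-work conversion in the hunks below (INIT_DELAYED_WORK / PREPARE_DELAYED_WORK). A standalone illustration of that container_of recovery, using toy types rather than the workqueue API:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct work { void (*fn)(struct work *); };

struct rchan_buf_like {
    int id;
    struct work wake_readers;       /* embedded, like the kernel's member */
};

static void wakeup_readers(struct work *w)
{
    struct rchan_buf_like *buf =
        container_of(w, struct rchan_buf_like, wake_readers);
    printf("waking readers of buffer %d\n", buf->id);
}

int main(void)
{
    struct rchan_buf_like buf = { .id = 7 };

    buf.wake_readers.fn = wakeup_readers;
    buf.wake_readers.fn(&buf.wake_readers);     /* what the workqueue would do */
    return 0;
}
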
@@ -328,7 +329,7 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
328 if (init) { 329 if (init) {
329 init_waitqueue_head(&buf->read_wait); 330 init_waitqueue_head(&buf->read_wait);
330 kref_init(&buf->kref); 331 kref_init(&buf->kref);
331 INIT_WORK(&buf->wake_readers, NULL, NULL); 332 INIT_DELAYED_WORK(&buf->wake_readers, NULL);
332 } else { 333 } else {
333 cancel_delayed_work(&buf->wake_readers); 334 cancel_delayed_work(&buf->wake_readers);
334 flush_scheduled_work(); 335 flush_scheduled_work();
@@ -478,7 +479,7 @@ struct rchan *relay_open(const char *base_filename,
478 if (!(subbuf_size && n_subbufs)) 479 if (!(subbuf_size && n_subbufs))
479 return NULL; 480 return NULL;
480 481
481 chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL); 482 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
482 if (!chan) 483 if (!chan)
483 return NULL; 484 return NULL;
484 485
@@ -549,7 +550,8 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
549 buf->padding[old_subbuf]; 550 buf->padding[old_subbuf];
550 smp_mb(); 551 smp_mb();
551 if (waitqueue_active(&buf->read_wait)) { 552 if (waitqueue_active(&buf->read_wait)) {
552 PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf); 553 PREPARE_DELAYED_WORK(&buf->wake_readers,
554 wakeup_readers);
553 schedule_delayed_work(&buf->wake_readers, 1); 555 schedule_delayed_work(&buf->wake_readers, 1);
554 } 556 }
555 } 557 }
@@ -957,7 +959,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
957 if (!desc->count) 959 if (!desc->count)
958 return 0; 960 return 0;
959 961
960 mutex_lock(&filp->f_dentry->d_inode->i_mutex); 962 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
961 do { 963 do {
962 if (!relay_file_read_avail(buf, *ppos)) 964 if (!relay_file_read_avail(buf, *ppos))
963 break; 965 break;
@@ -977,7 +979,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp,
977 *ppos = relay_file_read_end_pos(buf, read_start, ret); 979 *ppos = relay_file_read_end_pos(buf, read_start, ret);
978 } 980 }
979 } while (desc->count && ret); 981 } while (desc->count && ret);
980 mutex_unlock(&filp->f_dentry->d_inode->i_mutex); 982 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
981 983
982 return desc->written; 984 return desc->written;
983} 985}
@@ -1011,7 +1013,7 @@ static ssize_t relay_file_sendfile(struct file *filp,
1011 actor, &desc); 1013 actor, &desc);
1012} 1014}
1013 1015
1014struct file_operations relay_file_operations = { 1016const struct file_operations relay_file_operations = {
1015 .open = relay_file_open, 1017 .open = relay_file_open,
1016 .poll = relay_file_poll, 1018 .poll = relay_file_poll,
1017 .mmap = relay_file_mmap, 1019 .mmap = relay_file_mmap,
diff --git a/kernel/resource.c b/kernel/resource.c
index 6de60c12143e..7b9a497419d9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v)
88 return 0; 88 return 0;
89} 89}
90 90
91static struct seq_operations resource_op = { 91static const struct seq_operations resource_op = {
92 .start = r_start, 92 .start = r_start,
93 .next = r_next, 93 .next = r_next,
94 .stop = r_stop, 94 .stop = r_stop,
@@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file)
115 return res; 115 return res;
116} 116}
117 117
118static struct file_operations proc_ioports_operations = { 118static const struct file_operations proc_ioports_operations = {
119 .open = ioports_open, 119 .open = ioports_open,
120 .read = seq_read, 120 .read = seq_read,
121 .llseek = seq_lseek, 121 .llseek = seq_lseek,
122 .release = seq_release, 122 .release = seq_release,
123}; 123};
124 124
125static struct file_operations proc_iomem_operations = { 125static const struct file_operations proc_iomem_operations = {
126 .open = iomem_open, 126 .open = iomem_open,
127 .read = seq_read, 127 .read = seq_read,
128 .llseek = seq_lseek, 128 .llseek = seq_lseek,
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 6dcea9dd8c94..015fc633c96c 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -13,6 +13,7 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/sysdev.h> 14#include <linux/sysdev.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16#include <linux/freezer.h>
16 17
17#include "rtmutex.h" 18#include "rtmutex.h"
18 19
diff --git a/kernel/sched.c b/kernel/sched.c
index 3399701c680e..5cd833bc2173 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -34,7 +34,7 @@
34#include <linux/security.h> 34#include <linux/security.h>
35#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/profile.h> 36#include <linux/profile.h>
37#include <linux/suspend.h> 37#include <linux/freezer.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40#include <linux/delay.h> 40#include <linux/delay.h>
@@ -225,8 +225,10 @@ struct rq {
225 unsigned long nr_uninterruptible; 225 unsigned long nr_uninterruptible;
226 226
227 unsigned long expired_timestamp; 227 unsigned long expired_timestamp;
228 unsigned long long timestamp_last_tick; 228 /* Cached timestamp set by update_cpu_clock() */
229 unsigned long long most_recent_timestamp;
229 struct task_struct *curr, *idle; 230 struct task_struct *curr, *idle;
231 unsigned long next_balance;
230 struct mm_struct *prev_mm; 232 struct mm_struct *prev_mm;
231 struct prio_array *active, *expired, arrays[2]; 233 struct prio_array *active, *expired, arrays[2];
232 int best_expired_prio; 234 int best_expired_prio;
@@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
426 * bump this up when changing the output format or the meaning of an existing 428 * bump this up when changing the output format or the meaning of an existing
427 * format, so that tools can adapt (or abort) 429 * format, so that tools can adapt (or abort)
428 */ 430 */
429#define SCHEDSTAT_VERSION 12 431#define SCHEDSTAT_VERSION 14
430 432
431static int show_schedstat(struct seq_file *seq, void *v) 433static int show_schedstat(struct seq_file *seq, void *v)
432{ 434{
@@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
464 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 466 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
465 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; 467 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
466 itype++) { 468 itype++) {
467 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", 469 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
470 "%lu",
468 sd->lb_cnt[itype], 471 sd->lb_cnt[itype],
469 sd->lb_balanced[itype], 472 sd->lb_balanced[itype],
470 sd->lb_failed[itype], 473 sd->lb_failed[itype],
@@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
474 sd->lb_nobusyq[itype], 477 sd->lb_nobusyq[itype],
475 sd->lb_nobusyg[itype]); 478 sd->lb_nobusyg[itype]);
476 } 479 }
477 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", 480 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
481 " %lu %lu %lu\n",
478 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 482 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
479 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, 483 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
480 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, 484 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
481 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 485 sd->ttwu_wake_remote, sd->ttwu_move_affine,
486 sd->ttwu_move_balance);
482 } 487 }
483 preempt_enable(); 488 preempt_enable();
484#endif 489#endif
@@ -505,7 +510,7 @@ static int schedstat_open(struct inode *inode, struct file *file)
505 return res; 510 return res;
506} 511}
507 512
508struct file_operations proc_schedstat_operations = { 513const struct file_operations proc_schedstat_operations = {
509 .open = schedstat_open, 514 .open = schedstat_open,
510 .read = seq_read, 515 .read = seq_read,
511 .llseek = seq_lseek, 516 .llseek = seq_lseek,
@@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
547#endif 552#endif
548 553
549/* 554/*
550 * rq_lock - lock a given runqueue and disable interrupts. 555 * this_rq_lock - lock this runqueue and disable interrupts.
551 */ 556 */
552static inline struct rq *this_rq_lock(void) 557static inline struct rq *this_rq_lock(void)
553 __acquires(rq->lock) 558 __acquires(rq->lock)
@@ -938,18 +943,31 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
938{ 943{
939 unsigned long long now; 944 unsigned long long now;
940 945
946 if (rt_task(p))
947 goto out;
948
941 now = sched_clock(); 949 now = sched_clock();
942#ifdef CONFIG_SMP 950#ifdef CONFIG_SMP
943 if (!local) { 951 if (!local) {
944 /* Compensate for drifting sched_clock */ 952 /* Compensate for drifting sched_clock */
945 struct rq *this_rq = this_rq(); 953 struct rq *this_rq = this_rq();
946 now = (now - this_rq->timestamp_last_tick) 954 now = (now - this_rq->most_recent_timestamp)
947 + rq->timestamp_last_tick; 955 + rq->most_recent_timestamp;
948 } 956 }
949#endif 957#endif
950 958
951 if (!rt_task(p)) 959 /*
952 p->prio = recalc_task_prio(p, now); 960 * Sleep time is in units of nanosecs, so shift by 20 to get a
961 * milliseconds-range estimation of the amount of time that the task
962 * spent sleeping:
963 */
964 if (unlikely(prof_on == SLEEP_PROFILING)) {
965 if (p->state == TASK_UNINTERRUPTIBLE)
966 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
967 (now - p->timestamp) >> 20);
968 }
969
970 p->prio = recalc_task_prio(p, now);
953 971
954 /* 972 /*
955 * This checks to make sure it's not an uninterruptible task 973 * This checks to make sure it's not an uninterruptible task
@@ -974,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
974 } 992 }
975 } 993 }
976 p->timestamp = now; 994 p->timestamp = now;
977 995out:
978 __activate_task(p, rq); 996 __activate_task(p, rq);
979} 997}
980 998
@@ -1439,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1439 1457
1440 if (this_sd->flags & SD_WAKE_AFFINE) { 1458 if (this_sd->flags & SD_WAKE_AFFINE) {
1441 unsigned long tl = this_load; 1459 unsigned long tl = this_load;
1442 unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); 1460 unsigned long tl_per_task;
1461
1462 tl_per_task = cpu_avg_load_per_task(this_cpu);
1443 1463
1444 /* 1464 /*
1445 * If sync wakeup then subtract the (maximum possible) 1465 * If sync wakeup then subtract the (maximum possible)
@@ -1677,8 +1697,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1677 * Not the local CPU - must adjust timestamp. This should 1697 * Not the local CPU - must adjust timestamp. This should
1678 * get optimised away in the !CONFIG_SMP case. 1698 * get optimised away in the !CONFIG_SMP case.
1679 */ 1699 */
1680 p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) 1700 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
1681 + rq->timestamp_last_tick; 1701 + rq->most_recent_timestamp;
1682 __activate_task(p, rq); 1702 __activate_task(p, rq);
1683 if (TASK_PREEMPTS_CURR(p, rq)) 1703 if (TASK_PREEMPTS_CURR(p, rq))
1684 resched_task(rq->curr); 1704 resched_task(rq->curr);
@@ -1941,6 +1961,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1941 __acquires(rq1->lock) 1961 __acquires(rq1->lock)
1942 __acquires(rq2->lock) 1962 __acquires(rq2->lock)
1943{ 1963{
1964 BUG_ON(!irqs_disabled());
1944 if (rq1 == rq2) { 1965 if (rq1 == rq2) {
1945 spin_lock(&rq1->lock); 1966 spin_lock(&rq1->lock);
1946 __acquire(rq2->lock); /* Fake it out ;) */ 1967 __acquire(rq2->lock); /* Fake it out ;) */
@@ -1980,6 +2001,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1980 __acquires(busiest->lock) 2001 __acquires(busiest->lock)
1981 __acquires(this_rq->lock) 2002 __acquires(this_rq->lock)
1982{ 2003{
2004 if (unlikely(!irqs_disabled())) {
2005 /* printk() doesn't work good under rq->lock */
2006 spin_unlock(&this_rq->lock);
2007 BUG_ON(1);
2008 }
1983 if (unlikely(!spin_trylock(&busiest->lock))) { 2009 if (unlikely(!spin_trylock(&busiest->lock))) {
1984 if (busiest < this_rq) { 2010 if (busiest < this_rq) {
1985 spin_unlock(&this_rq->lock); 2011 spin_unlock(&this_rq->lock);
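
double_lock_balance() (context above) shows the usual recipe for taking a second runqueue lock while already holding one: try it opportunistically, and if that fails, drop the held lock and reacquire both in a fixed order (lowest lock first) so two CPUs can never deadlock against each other. A pthread sketch of the same ordering rule; it compares lock addresses as the scheduler does, which is good enough in practice though not strictly portable C:

#include <pthread.h>
#include <stdio.h>

/* 'held' is already locked by the caller; acquire 'busiest' too without
 * risking an ABBA deadlock against a thread doing the same in reverse. */
static void double_lock(pthread_mutex_t *held, pthread_mutex_t *busiest)
{
    if (pthread_mutex_trylock(busiest) == 0)
        return;                             /* fast path, no ordering issue */

    if (busiest < held) {
        pthread_mutex_unlock(held);         /* wrong order: back off ... */
        pthread_mutex_lock(busiest);        /* ... and retake lowest first */
        pthread_mutex_lock(held);
    } else {
        pthread_mutex_lock(busiest);
    }
}

int main(void)
{
    pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

    pthread_mutex_lock(&a);
    double_lock(&a, &b);
    printf("both locks held\n");
    pthread_mutex_unlock(&b);
    pthread_mutex_unlock(&a);
    return 0;
}
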
@@ -2050,8 +2076,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2050 set_task_cpu(p, this_cpu); 2076 set_task_cpu(p, this_cpu);
2051 inc_nr_running(p, this_rq); 2077 inc_nr_running(p, this_rq);
2052 enqueue_task(p, this_array); 2078 enqueue_task(p, this_array);
2053 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 2079 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2054 + this_rq->timestamp_last_tick; 2080 + this_rq->most_recent_timestamp;
2055 /* 2081 /*
2056 * Note that idle threads have a prio of MAX_PRIO, for this test 2082 * Note that idle threads have a prio of MAX_PRIO, for this test
2057 * to be always true for them. 2083 * to be always true for them.
@@ -2087,10 +2113,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2087 * 2) too many balance attempts have failed. 2113 * 2) too many balance attempts have failed.
2088 */ 2114 */
2089 2115
2090 if (sd->nr_balance_failed > sd->cache_nice_tries) 2116 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2117#ifdef CONFIG_SCHEDSTATS
2118 if (task_hot(p, rq->most_recent_timestamp, sd))
2119 schedstat_inc(sd, lb_hot_gained[idle]);
2120#endif
2091 return 1; 2121 return 1;
2122 }
2092 2123
2093 if (task_hot(p, rq->timestamp_last_tick, sd)) 2124 if (task_hot(p, rq->most_recent_timestamp, sd))
2094 return 0; 2125 return 0;
2095 return 1; 2126 return 1;
2096} 2127}
@@ -2188,11 +2219,6 @@ skip_queue:
2188 goto skip_bitmap; 2219 goto skip_bitmap;
2189 } 2220 }
2190 2221
2191#ifdef CONFIG_SCHEDSTATS
2192 if (task_hot(tmp, busiest->timestamp_last_tick, sd))
2193 schedstat_inc(sd, lb_hot_gained[idle]);
2194#endif
2195
2196 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2222 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
2197 pulled++; 2223 pulled++;
2198 rem_load_move -= tmp->load_weight; 2224 rem_load_move -= tmp->load_weight;
@@ -2230,7 +2256,7 @@ out:
2230static struct sched_group * 2256static struct sched_group *
2231find_busiest_group(struct sched_domain *sd, int this_cpu, 2257find_busiest_group(struct sched_domain *sd, int this_cpu,
2232 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2258 unsigned long *imbalance, enum idle_type idle, int *sd_idle,
2233 cpumask_t *cpus) 2259 cpumask_t *cpus, int *balance)
2234{ 2260{
2235 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2261 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2236 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2262 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2259,10 +2285,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2259 unsigned long load, group_capacity; 2285 unsigned long load, group_capacity;
2260 int local_group; 2286 int local_group;
2261 int i; 2287 int i;
2288 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2262 unsigned long sum_nr_running, sum_weighted_load; 2289 unsigned long sum_nr_running, sum_weighted_load;
2263 2290
2264 local_group = cpu_isset(this_cpu, group->cpumask); 2291 local_group = cpu_isset(this_cpu, group->cpumask);
2265 2292
2293 if (local_group)
2294 balance_cpu = first_cpu(group->cpumask);
2295
2266 /* Tally up the load of all CPUs in the group */ 2296 /* Tally up the load of all CPUs in the group */
2267 sum_weighted_load = sum_nr_running = avg_load = 0; 2297 sum_weighted_load = sum_nr_running = avg_load = 0;
2268 2298
@@ -2278,9 +2308,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2278 *sd_idle = 0; 2308 *sd_idle = 0;
2279 2309
2280 /* Bias balancing toward cpus of our domain */ 2310 /* Bias balancing toward cpus of our domain */
2281 if (local_group) 2311 if (local_group) {
2312 if (idle_cpu(i) && !first_idle_cpu) {
2313 first_idle_cpu = 1;
2314 balance_cpu = i;
2315 }
2316
2282 load = target_load(i, load_idx); 2317 load = target_load(i, load_idx);
2283 else 2318 } else
2284 load = source_load(i, load_idx); 2319 load = source_load(i, load_idx);
2285 2320
2286 avg_load += load; 2321 avg_load += load;
@@ -2288,6 +2323,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2288 sum_weighted_load += rq->raw_weighted_load; 2323 sum_weighted_load += rq->raw_weighted_load;
2289 } 2324 }
2290 2325
2326 /*
2327 * First idle cpu or the first cpu(busiest) in this sched group
2328 * is eligible for doing load balancing at this and above
2329 * domains.
2330 */
2331 if (local_group && balance_cpu != this_cpu && balance) {
2332 *balance = 0;
2333 goto ret;
2334 }
2335
2291 total_load += avg_load; 2336 total_load += avg_load;
2292 total_pwr += group->cpu_power; 2337 total_pwr += group->cpu_power;
2293 2338
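
The hunk above designates a single CPU per sched group to carry out balancing: the first idle CPU seen while tallying the group, or failing that the first CPU of the group; every other CPU in the local group sets *balance = 0 and backs off. A minimal userspace sketch of the same selection rule (illustrative only, not kernel code; the cpu and idle arrays are made up):

#include <stdio.h>

/* Pick the balancer for a group: first idle CPU if any, else the first CPU. */
static int pick_balance_cpu(const int *group_cpus, const int *idle, int n)
{
	int balance_cpu = group_cpus[0];   /* default: first CPU of the group */
	int first_idle_found = 0;
	int i;

	for (i = 0; i < n; i++) {
		if (idle[group_cpus[i]] && !first_idle_found) {
			first_idle_found = 1;
			balance_cpu = group_cpus[i];
		}
	}
	return balance_cpu;
}

int main(void)
{
	int group_cpus[] = { 4, 5, 6, 7 };              /* hypothetical local group */
	int idle[8]      = { 0, 0, 0, 0, 0, 1, 0, 1 };  /* CPUs 5 and 7 are idle */
	int this_cpu = 6;
	int balancer = pick_balance_cpu(group_cpus, idle, 4);

	if (this_cpu != balancer)
		printf("cpu%d skips balancing, cpu%d is responsible\n",
		       this_cpu, balancer);
	return 0;
}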
@@ -2447,18 +2492,21 @@ small_imbalance:
2447 pwr_now /= SCHED_LOAD_SCALE; 2492 pwr_now /= SCHED_LOAD_SCALE;
2448 2493
2449 /* Amount of load we'd subtract */ 2494 /* Amount of load we'd subtract */
2450 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; 2495 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2496 busiest->cpu_power;
2451 if (max_load > tmp) 2497 if (max_load > tmp)
2452 pwr_move += busiest->cpu_power * 2498 pwr_move += busiest->cpu_power *
2453 min(busiest_load_per_task, max_load - tmp); 2499 min(busiest_load_per_task, max_load - tmp);
2454 2500
2455 /* Amount of load we'd add */ 2501 /* Amount of load we'd add */
2456 if (max_load*busiest->cpu_power < 2502 if (max_load * busiest->cpu_power <
2457 busiest_load_per_task*SCHED_LOAD_SCALE) 2503 busiest_load_per_task * SCHED_LOAD_SCALE)
2458 tmp = max_load*busiest->cpu_power/this->cpu_power; 2504 tmp = max_load * busiest->cpu_power / this->cpu_power;
2459 else 2505 else
2460 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; 2506 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2461 pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); 2507 this->cpu_power;
2508 pwr_move += this->cpu_power *
2509 min(this_load_per_task, this_load + tmp);
2462 pwr_move /= SCHED_LOAD_SCALE; 2510 pwr_move /= SCHED_LOAD_SCALE;
2463 2511
2464 /* Move if we gain throughput */ 2512 /* Move if we gain throughput */
@@ -2479,8 +2527,8 @@ out_balanced:
2479 *imbalance = min_load_per_task; 2527 *imbalance = min_load_per_task;
2480 return group_min; 2528 return group_min;
2481 } 2529 }
2482ret:
2483#endif 2530#endif
2531ret:
2484 *imbalance = 0; 2532 *imbalance = 0;
2485 return NULL; 2533 return NULL;
2486} 2534}
@@ -2529,17 +2577,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n)
2529/* 2577/*
2530 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2578 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2531 * tasks if there is an imbalance. 2579 * tasks if there is an imbalance.
2532 *
2533 * Called with this_rq unlocked.
2534 */ 2580 */
2535static int load_balance(int this_cpu, struct rq *this_rq, 2581static int load_balance(int this_cpu, struct rq *this_rq,
2536 struct sched_domain *sd, enum idle_type idle) 2582 struct sched_domain *sd, enum idle_type idle,
2583 int *balance)
2537{ 2584{
2538 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2585 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2539 struct sched_group *group; 2586 struct sched_group *group;
2540 unsigned long imbalance; 2587 unsigned long imbalance;
2541 struct rq *busiest; 2588 struct rq *busiest;
2542 cpumask_t cpus = CPU_MASK_ALL; 2589 cpumask_t cpus = CPU_MASK_ALL;
2590 unsigned long flags;
2543 2591
2544 /* 2592 /*
2545 * When power savings policy is enabled for the parent domain, idle 2593 * When power savings policy is enabled for the parent domain, idle
@@ -2555,7 +2603,11 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2555 2603
2556redo: 2604redo:
2557 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 2605 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2558 &cpus); 2606 &cpus, balance);
2607
2608 if (*balance == 0)
2609 goto out_balanced;
2610
2559 if (!group) { 2611 if (!group) {
2560 schedstat_inc(sd, lb_nobusyg[idle]); 2612 schedstat_inc(sd, lb_nobusyg[idle]);
2561 goto out_balanced; 2613 goto out_balanced;
@@ -2579,11 +2631,13 @@ redo:
2579 * still unbalanced. nr_moved simply stays zero, so it is 2631 * still unbalanced. nr_moved simply stays zero, so it is
2580 * correctly treated as an imbalance. 2632 * correctly treated as an imbalance.
2581 */ 2633 */
2634 local_irq_save(flags);
2582 double_rq_lock(this_rq, busiest); 2635 double_rq_lock(this_rq, busiest);
2583 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2636 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2584 minus_1_or_zero(busiest->nr_running), 2637 minus_1_or_zero(busiest->nr_running),
2585 imbalance, sd, idle, &all_pinned); 2638 imbalance, sd, idle, &all_pinned);
2586 double_rq_unlock(this_rq, busiest); 2639 double_rq_unlock(this_rq, busiest);
2640 local_irq_restore(flags);
2587 2641
2588 /* All tasks on this runqueue were pinned by CPU affinity */ 2642 /* All tasks on this runqueue were pinned by CPU affinity */
2589 if (unlikely(all_pinned)) { 2643 if (unlikely(all_pinned)) {
@@ -2600,13 +2654,13 @@ redo:
2600 2654
2601 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2655 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2602 2656
2603 spin_lock(&busiest->lock); 2657 spin_lock_irqsave(&busiest->lock, flags);
2604 2658
2605 /* don't kick the migration_thread, if the curr 2659 /* don't kick the migration_thread, if the curr
2606 * task on busiest cpu can't be moved to this_cpu 2660 * task on busiest cpu can't be moved to this_cpu
2607 */ 2661 */
2608 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 2662 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2609 spin_unlock(&busiest->lock); 2663 spin_unlock_irqrestore(&busiest->lock, flags);
2610 all_pinned = 1; 2664 all_pinned = 1;
2611 goto out_one_pinned; 2665 goto out_one_pinned;
2612 } 2666 }
@@ -2616,7 +2670,7 @@ redo:
2616 busiest->push_cpu = this_cpu; 2670 busiest->push_cpu = this_cpu;
2617 active_balance = 1; 2671 active_balance = 1;
2618 } 2672 }
2619 spin_unlock(&busiest->lock); 2673 spin_unlock_irqrestore(&busiest->lock, flags);
2620 if (active_balance) 2674 if (active_balance)
2621 wake_up_process(busiest->migration_thread); 2675 wake_up_process(busiest->migration_thread);
2622 2676
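
Because load_balance() is now reached from a softirq rather than directly from the timer tick, interrupts are no longer guaranteed to be disabled on entry, so the runqueue locking in this path switches to the irqsave variants (local_irq_save() around double_rq_lock(), spin_lock_irqsave() on busiest->lock). A kernel-style sketch of that locking pattern (illustrative module code, not part of this patch; the lock and counter names are made up):

#include <linux/module.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);	/* made-up lock for illustration */
static unsigned long example_counter;

/* Safe from process, softirq and hardirq context alike. */
static void example_update(void)
{
	unsigned long flags;

	spin_lock_irqsave(&example_lock, flags);  /* disables irqs locally */
	example_counter++;
	spin_unlock_irqrestore(&example_lock, flags);
}

static int __init example_init(void)
{
	example_update();
	return 0;
}

static void __exit example_exit(void)
{
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");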
@@ -2695,7 +2749,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2695 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2749 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2696redo: 2750redo:
2697 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2751 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
2698 &sd_idle, &cpus); 2752 &sd_idle, &cpus, NULL);
2699 if (!group) { 2753 if (!group) {
2700 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2754 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2701 goto out_balanced; 2755 goto out_balanced;
@@ -2755,14 +2809,28 @@ out_balanced:
2755static void idle_balance(int this_cpu, struct rq *this_rq) 2809static void idle_balance(int this_cpu, struct rq *this_rq)
2756{ 2810{
2757 struct sched_domain *sd; 2811 struct sched_domain *sd;
2812 int pulled_task = 0;
2813 unsigned long next_balance = jiffies + 60 * HZ;
2758 2814
2759 for_each_domain(this_cpu, sd) { 2815 for_each_domain(this_cpu, sd) {
2760 if (sd->flags & SD_BALANCE_NEWIDLE) { 2816 if (sd->flags & SD_BALANCE_NEWIDLE) {
2761 /* If we've pulled tasks over stop searching: */ 2817 /* If we've pulled tasks over stop searching: */
2762 if (load_balance_newidle(this_cpu, this_rq, sd)) 2818 pulled_task = load_balance_newidle(this_cpu,
2819 this_rq, sd);
2820 if (time_after(next_balance,
2821 sd->last_balance + sd->balance_interval))
2822 next_balance = sd->last_balance
2823 + sd->balance_interval;
2824 if (pulled_task)
2763 break; 2825 break;
2764 } 2826 }
2765 } 2827 }
2828 if (!pulled_task)
2829 /*
2830 * We are going idle. next_balance may be set based on
2831 * a busy processor. So reset next_balance.
2832 */
2833 this_rq->next_balance = next_balance;
2766} 2834}
2767 2835
2768/* 2836/*
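
idle_balance() now also records the earliest point at which any of its domains will want balancing again, using the wraparound-safe jiffies comparison. The comparison is just a signed subtraction; a self-contained sketch (the macro mirrors the kernel's time_after(), the tick values are invented):

#include <stdio.h>

/* Wraparound-safe "a is later than b", in the spirit of the kernel's time_after(). */
#define time_after(a, b)  ((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long jiffies = (unsigned long)-10;  /* counter about to wrap */
	unsigned long next_balance = jiffies + 60;   /* wraps past zero */
	unsigned long candidate = jiffies + 5;       /* an earlier deadline */

	/* Keep the earliest deadline, exactly like the idle_balance() loop. */
	if (time_after(next_balance, candidate))
		next_balance = candidate;

	printf("next balance due at %lu (jiffies now %lu)\n",
	       next_balance, jiffies);
	return 0;
}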
@@ -2815,26 +2883,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2815 spin_unlock(&target_rq->lock); 2883 spin_unlock(&target_rq->lock);
2816} 2884}
2817 2885
2818/* 2886static void update_load(struct rq *this_rq)
2819 * rebalance_tick will get called every timer tick, on every CPU.
2820 *
2821 * It checks each scheduling domain to see if it is due to be balanced,
2822 * and initiates a balancing operation if so.
2823 *
2824 * Balancing parameters are set up in arch_init_sched_domains.
2825 */
2826
2827/* Don't have all balancing operations going off at once: */
2828static inline unsigned long cpu_offset(int cpu)
2829{ 2887{
2830 return jiffies + cpu * HZ / NR_CPUS; 2888 unsigned long this_load;
2831}
2832
2833static void
2834rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2835{
2836 unsigned long this_load, interval, j = cpu_offset(this_cpu);
2837 struct sched_domain *sd;
2838 int i, scale; 2889 int i, scale;
2839 2890
2840 this_load = this_rq->raw_weighted_load; 2891 this_load = this_rq->raw_weighted_load;
@@ -2854,6 +2905,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2854 new_load += scale-1; 2905 new_load += scale-1;
2855 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; 2906 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2856 } 2907 }
2908}
2909
2910/*
2911 * run_rebalance_domains is triggered when needed from the scheduler tick.
2912 *
2913 * It checks each scheduling domain to see if it is due to be balanced,
2914 * and initiates a balancing operation if so.
2915 *
2916 * Balancing parameters are set up in arch_init_sched_domains.
2917 */
2918static DEFINE_SPINLOCK(balancing);
2919
2920static void run_rebalance_domains(struct softirq_action *h)
2921{
2922 int this_cpu = smp_processor_id(), balance = 1;
2923 struct rq *this_rq = cpu_rq(this_cpu);
2924 unsigned long interval;
2925 struct sched_domain *sd;
2926 /*
2927 * We are idle if there are no processes running. This
2928 * is valid even if we are the idle process (SMT).
2929 */
2930 enum idle_type idle = !this_rq->nr_running ?
2931 SCHED_IDLE : NOT_IDLE;
2932 /* Earliest time when we have to call run_rebalance_domains again */
2933 unsigned long next_balance = jiffies + 60*HZ;
2857 2934
2858 for_each_domain(this_cpu, sd) { 2935 for_each_domain(this_cpu, sd) {
2859 if (!(sd->flags & SD_LOAD_BALANCE)) 2936 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -2868,8 +2945,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2868 if (unlikely(!interval)) 2945 if (unlikely(!interval))
2869 interval = 1; 2946 interval = 1;
2870 2947
2871 if (j - sd->last_balance >= interval) { 2948 if (sd->flags & SD_SERIALIZE) {
2872 if (load_balance(this_cpu, this_rq, sd, idle)) { 2949 if (!spin_trylock(&balancing))
2950 goto out;
2951 }
2952
2953 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2954 if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
2873 /* 2955 /*
2874 * We've pulled tasks over so either we're no 2956 * We've pulled tasks over so either we're no
2875 * longer idle, or one of our SMT siblings is 2957 * longer idle, or one of our SMT siblings is
@@ -2877,39 +2959,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2877 */ 2959 */
2878 idle = NOT_IDLE; 2960 idle = NOT_IDLE;
2879 } 2961 }
2880 sd->last_balance += interval; 2962 sd->last_balance = jiffies;
2881 } 2963 }
2964 if (sd->flags & SD_SERIALIZE)
2965 spin_unlock(&balancing);
2966out:
2967 if (time_after(next_balance, sd->last_balance + interval))
2968 next_balance = sd->last_balance + interval;
2969
2970 /*
2971 * Stop the load balance at this level. There is another
2972 * CPU in our sched group which is doing load balancing more
2973 * actively.
2974 */
2975 if (!balance)
2976 break;
2882 } 2977 }
2978 this_rq->next_balance = next_balance;
2883} 2979}
2884#else 2980#else
2885/* 2981/*
2886 * on UP we do not need to balance between CPUs: 2982 * on UP we do not need to balance between CPUs:
2887 */ 2983 */
2888static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
2889{
2890}
2891static inline void idle_balance(int cpu, struct rq *rq) 2984static inline void idle_balance(int cpu, struct rq *rq)
2892{ 2985{
2893} 2986}
2894#endif 2987#endif
2895 2988
2896static inline int wake_priority_sleeper(struct rq *rq) 2989static inline void wake_priority_sleeper(struct rq *rq)
2897{ 2990{
2898 int ret = 0;
2899
2900#ifdef CONFIG_SCHED_SMT 2991#ifdef CONFIG_SCHED_SMT
2992 if (!rq->nr_running)
2993 return;
2994
2901 spin_lock(&rq->lock); 2995 spin_lock(&rq->lock);
2902 /* 2996 /*
2903 * If an SMT sibling task has been put to sleep for priority 2997 * If an SMT sibling task has been put to sleep for priority
2904 * reasons reschedule the idle task to see if it can now run. 2998 * reasons reschedule the idle task to see if it can now run.
2905 */ 2999 */
2906 if (rq->nr_running) { 3000 if (rq->nr_running)
2907 resched_task(rq->idle); 3001 resched_task(rq->idle);
2908 ret = 1;
2909 }
2910 spin_unlock(&rq->lock); 3002 spin_unlock(&rq->lock);
2911#endif 3003#endif
2912 return ret;
2913} 3004}
2914 3005
2915DEFINE_PER_CPU(struct kernel_stat, kstat); 3006DEFINE_PER_CPU(struct kernel_stat, kstat);
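
For domains flagged SD_SERIALIZE (typically the large NUMA-level domains) the rebalance pass is guarded by a trylock on the static "balancing" lock, so at most one CPU at a time walks those domains and everyone else simply skips this round. The same skip-if-busy shape in portable C (a pthread sketch of the idea, not the kernel code; the function names are made up):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t balancing = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical expensive, cluster-wide rebalance step. */
static void rebalance_serialized_domain(int cpu)
{
	printf("cpu%d: doing the serialized rebalance\n", cpu);
}

static void rebalance_tick(int cpu)
{
	/* Only one caller runs the serialized part; others just return. */
	if (pthread_mutex_trylock(&balancing) != 0) {
		printf("cpu%d: someone else is balancing, skip this round\n", cpu);
		return;
	}
	rebalance_serialized_domain(cpu);
	pthread_mutex_unlock(&balancing);
}

int main(void)
{
	rebalance_tick(0);
	rebalance_tick(1);
	return 0;
}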
@@ -2923,7 +3014,8 @@ EXPORT_PER_CPU_SYMBOL(kstat);
2923static inline void 3014static inline void
2924update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) 3015update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
2925{ 3016{
2926 p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); 3017 p->sched_time += now - p->last_ran;
3018 p->last_ran = rq->most_recent_timestamp = now;
2927} 3019}
2928 3020
2929/* 3021/*
@@ -2936,8 +3028,7 @@ unsigned long long current_sched_time(const struct task_struct *p)
2936 unsigned long flags; 3028 unsigned long flags;
2937 3029
2938 local_irq_save(flags); 3030 local_irq_save(flags);
2939 ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); 3031 ns = p->sched_time + sched_clock() - p->last_ran;
2940 ns = p->sched_time + sched_clock() - ns;
2941 local_irq_restore(flags); 3032 local_irq_restore(flags);
2942 3033
2943 return ns; 3034 return ns;
@@ -3037,35 +3128,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3037 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3128 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3038} 3129}
3039 3130
3040/* 3131static void task_running_tick(struct rq *rq, struct task_struct *p)
3041 * This function gets called by the timer code, with HZ frequency.
3042 * We call it with interrupts disabled.
3043 *
3044 * It also gets called by the fork code, when changing the parent's
3045 * timeslices.
3046 */
3047void scheduler_tick(void)
3048{ 3132{
3049 unsigned long long now = sched_clock();
3050 struct task_struct *p = current;
3051 int cpu = smp_processor_id();
3052 struct rq *rq = cpu_rq(cpu);
3053
3054 update_cpu_clock(p, rq, now);
3055
3056 rq->timestamp_last_tick = now;
3057
3058 if (p == rq->idle) {
3059 if (wake_priority_sleeper(rq))
3060 goto out;
3061 rebalance_tick(cpu, rq, SCHED_IDLE);
3062 return;
3063 }
3064
3065 /* Task might have expired already, but not scheduled off yet */
3066 if (p->array != rq->active) { 3133 if (p->array != rq->active) {
3134 /* Task has expired but was not scheduled yet */
3067 set_tsk_need_resched(p); 3135 set_tsk_need_resched(p);
3068 goto out; 3136 return;
3069 } 3137 }
3070 spin_lock(&rq->lock); 3138 spin_lock(&rq->lock);
3071 /* 3139 /*
@@ -3133,8 +3201,34 @@ void scheduler_tick(void)
3133 } 3201 }
3134out_unlock: 3202out_unlock:
3135 spin_unlock(&rq->lock); 3203 spin_unlock(&rq->lock);
3136out: 3204}
3137 rebalance_tick(cpu, rq, NOT_IDLE); 3205
3206/*
3207 * This function gets called by the timer code, with HZ frequency.
3208 * We call it with interrupts disabled.
3209 *
3210 * It also gets called by the fork code, when changing the parent's
3211 * timeslices.
3212 */
3213void scheduler_tick(void)
3214{
3215 unsigned long long now = sched_clock();
3216 struct task_struct *p = current;
3217 int cpu = smp_processor_id();
3218 struct rq *rq = cpu_rq(cpu);
3219
3220 update_cpu_clock(p, rq, now);
3221
3222 if (p == rq->idle)
3223 /* Task on the idle queue */
3224 wake_priority_sleeper(rq);
3225 else
3226 task_running_tick(rq, p);
3227#ifdef CONFIG_SMP
3228 update_load(rq);
3229 if (time_after_eq(jiffies, rq->next_balance))
3230 raise_softirq(SCHED_SOFTIRQ);
3231#endif
3138} 3232}
3139 3233
3140#ifdef CONFIG_SCHED_SMT 3234#ifdef CONFIG_SCHED_SMT
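
scheduler_tick() itself is reduced to stamping the clock, running the per-task tick work, and, once rq->next_balance is due, raising SCHED_SOFTIRQ; the heavy domain walk happens later in run_rebalance_domains(). A minimal sketch of splitting work between a cheap tick and a deferred handler (plain C with a pending flag, purely illustrative; in the kernel the deferral is done with open_softirq()/raise_softirq()):

#include <stdio.h>

static volatile int rebalance_pending;   /* stand-in for a raised softirq */
static unsigned long jiffies, next_balance = 10;

static void run_rebalance_domains(void)  /* the deferred, heavier half */
{
	printf("rebalancing at tick %lu\n", jiffies);
	next_balance = jiffies + 10;         /* hypothetical interval */
}

static void scheduler_tick(void)         /* the cheap, per-tick half */
{
	jiffies++;
	if (jiffies >= next_balance)
		rebalance_pending = 1;       /* just flag it, do no real work */
}

int main(void)
{
	int i;

	for (i = 0; i < 25; i++) {
		scheduler_tick();
		if (rebalance_pending) {     /* the "softirq" runs after the tick */
			rebalance_pending = 0;
			run_rebalance_domains();
		}
	}
	return 0;
}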
@@ -3280,7 +3374,8 @@ void fastcall add_preempt_count(int val)
3280 /* 3374 /*
3281 * Spinlock count overflowing soon? 3375 * Spinlock count overflowing soon?
3282 */ 3376 */
3283 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); 3377 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3378 PREEMPT_MASK - 10);
3284} 3379}
3285EXPORT_SYMBOL(add_preempt_count); 3380EXPORT_SYMBOL(add_preempt_count);
3286 3381
@@ -3333,6 +3428,9 @@ asmlinkage void __sched schedule(void)
3333 printk(KERN_ERR "BUG: scheduling while atomic: " 3428 printk(KERN_ERR "BUG: scheduling while atomic: "
3334 "%s/0x%08x/%d\n", 3429 "%s/0x%08x/%d\n",
3335 current->comm, preempt_count(), current->pid); 3430 current->comm, preempt_count(), current->pid);
3431 debug_show_held_locks(current);
3432 if (irqs_disabled())
3433 print_irqtrace_events(current);
3336 dump_stack(); 3434 dump_stack();
3337 } 3435 }
3338 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3436 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4804,18 +4902,18 @@ static void show_task(struct task_struct *p)
4804 show_stack(p, NULL); 4902 show_stack(p, NULL);
4805} 4903}
4806 4904
4807void show_state(void) 4905void show_state_filter(unsigned long state_filter)
4808{ 4906{
4809 struct task_struct *g, *p; 4907 struct task_struct *g, *p;
4810 4908
4811#if (BITS_PER_LONG == 32) 4909#if (BITS_PER_LONG == 32)
4812 printk("\n" 4910 printk("\n"
4813 " sibling\n"); 4911 " free sibling\n");
4814 printk(" task PC pid father child younger older\n"); 4912 printk(" task PC stack pid father child younger older\n");
4815#else 4913#else
4816 printk("\n" 4914 printk("\n"
4817 " sibling\n"); 4915 " free sibling\n");
4818 printk(" task PC pid father child younger older\n"); 4916 printk(" task PC stack pid father child younger older\n");
4819#endif 4917#endif
4820 read_lock(&tasklist_lock); 4918 read_lock(&tasklist_lock);
4821 do_each_thread(g, p) { 4919 do_each_thread(g, p) {
@@ -4824,11 +4922,16 @@ void show_state(void)
4824 * console might take alot of time: 4922 * console might take alot of time:
4825 */ 4923 */
4826 touch_nmi_watchdog(); 4924 touch_nmi_watchdog();
4827 show_task(p); 4925 if (p->state & state_filter)
4926 show_task(p);
4828 } while_each_thread(g, p); 4927 } while_each_thread(g, p);
4829 4928
4830 read_unlock(&tasklist_lock); 4929 read_unlock(&tasklist_lock);
4831 debug_show_all_locks(); 4930 /*
4931 * Only show locks if all tasks are dumped:
4932 */
4933 if (state_filter == -1)
4934 debug_show_all_locks();
4832} 4935}
4833 4936
4834/** 4937/**
@@ -4973,8 +5076,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4973 * afterwards, and pretending it was a local activate. 5076 * afterwards, and pretending it was a local activate.
4974 * This way is cleaner and logically correct. 5077 * This way is cleaner and logically correct.
4975 */ 5078 */
4976 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 5079 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
4977 + rq_dest->timestamp_last_tick; 5080 + rq_dest->most_recent_timestamp;
4978 deactivate_task(p, rq_src); 5081 deactivate_task(p, rq_src);
4979 __activate_task(p, rq_dest); 5082 __activate_task(p, rq_dest);
4980 if (TASK_PREEMPTS_CURR(p, rq_dest)) 5083 if (TASK_PREEMPTS_CURR(p, rq_dest))
@@ -5050,7 +5153,10 @@ wait_to_die:
5050} 5153}
5051 5154
5052#ifdef CONFIG_HOTPLUG_CPU 5155#ifdef CONFIG_HOTPLUG_CPU
5053/* Figure out where task on dead CPU should go, use force if necessary. */ 5156/*
 5157 * Figure out where task on dead CPU should go, use force if necessary.
5158 * NOTE: interrupts should be disabled by the caller
5159 */
5054static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5160static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5055{ 5161{
5056 unsigned long flags; 5162 unsigned long flags;
@@ -5170,6 +5276,7 @@ void idle_task_exit(void)
5170 mmdrop(mm); 5276 mmdrop(mm);
5171} 5277}
5172 5278
5279/* called under rq->lock with disabled interrupts */
5173static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5280static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5174{ 5281{
5175 struct rq *rq = cpu_rq(dead_cpu); 5282 struct rq *rq = cpu_rq(dead_cpu);
@@ -5186,10 +5293,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5186 * Drop lock around migration; if someone else moves it, 5293 * Drop lock around migration; if someone else moves it,
5187 * that's OK. No task can be added to this CPU, so iteration is 5294 * that's OK. No task can be added to this CPU, so iteration is
5188 * fine. 5295 * fine.
5296 * NOTE: interrupts should be left disabled --dev@
5189 */ 5297 */
5190 spin_unlock_irq(&rq->lock); 5298 spin_unlock(&rq->lock);
5191 move_task_off_dead_cpu(dead_cpu, p); 5299 move_task_off_dead_cpu(dead_cpu, p);
5192 spin_lock_irq(&rq->lock); 5300 spin_lock(&rq->lock);
5193 5301
5194 put_task_struct(p); 5302 put_task_struct(p);
5195} 5303}
@@ -5342,16 +5450,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5342 if (!(sd->flags & SD_LOAD_BALANCE)) { 5450 if (!(sd->flags & SD_LOAD_BALANCE)) {
5343 printk("does not load-balance\n"); 5451 printk("does not load-balance\n");
5344 if (sd->parent) 5452 if (sd->parent)
5345 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); 5453 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5454 " has parent");
5346 break; 5455 break;
5347 } 5456 }
5348 5457
5349 printk("span %s\n", str); 5458 printk("span %s\n", str);
5350 5459
5351 if (!cpu_isset(cpu, sd->span)) 5460 if (!cpu_isset(cpu, sd->span))
5352 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); 5461 printk(KERN_ERR "ERROR: domain->span does not contain "
5462 "CPU%d\n", cpu);
5353 if (!cpu_isset(cpu, group->cpumask)) 5463 if (!cpu_isset(cpu, group->cpumask))
5354 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); 5464 printk(KERN_ERR "ERROR: domain->groups does not contain"
5465 " CPU%d\n", cpu);
5355 5466
5356 printk(KERN_DEBUG); 5467 printk(KERN_DEBUG);
5357 for (i = 0; i < level + 2; i++) 5468 for (i = 0; i < level + 2; i++)
@@ -5366,7 +5477,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5366 5477
5367 if (!group->cpu_power) { 5478 if (!group->cpu_power) {
5368 printk("\n"); 5479 printk("\n");
5369 printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); 5480 printk(KERN_ERR "ERROR: domain->cpu_power not "
5481 "set\n");
5370 } 5482 }
5371 5483
5372 if (!cpus_weight(group->cpumask)) { 5484 if (!cpus_weight(group->cpumask)) {
@@ -5389,15 +5501,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5389 printk("\n"); 5501 printk("\n");
5390 5502
5391 if (!cpus_equal(sd->span, groupmask)) 5503 if (!cpus_equal(sd->span, groupmask))
5392 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5504 printk(KERN_ERR "ERROR: groups don't span "
5505 "domain->span\n");
5393 5506
5394 level++; 5507 level++;
5395 sd = sd->parent; 5508 sd = sd->parent;
5509 if (!sd)
5510 continue;
5396 5511
5397 if (sd) { 5512 if (!cpus_subset(groupmask, sd->span))
5398 if (!cpus_subset(groupmask, sd->span)) 5513 printk(KERN_ERR "ERROR: parent span is not a superset "
5399 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); 5514 "of domain->span\n");
5400 }
5401 5515
5402 } while (sd); 5516 } while (sd);
5403} 5517}
@@ -5511,28 +5625,27 @@ static int __init isolated_cpu_setup(char *str)
5511__setup ("isolcpus=", isolated_cpu_setup); 5625__setup ("isolcpus=", isolated_cpu_setup);
5512 5626
5513/* 5627/*
5514 * init_sched_build_groups takes an array of groups, the cpumask we wish 5628 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5515 * to span, and a pointer to a function which identifies what group a CPU 5629 * to a function which identifies what group (along with sched group) a CPU
5516 * belongs to. The return value of group_fn must be a valid index into the 5630 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5517 * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we 5631 * (due to the fact that we keep track of groups covered with a cpumask_t).
5518 * keep track of groups covered with a cpumask_t).
5519 * 5632 *
5520 * init_sched_build_groups will build a circular linked list of the groups 5633 * init_sched_build_groups will build a circular linked list of the groups
5521 * covered by the given span, and will set each group's ->cpumask correctly, 5634 * covered by the given span, and will set each group's ->cpumask correctly,
5522 * and ->cpu_power to 0. 5635 * and ->cpu_power to 0.
5523 */ 5636 */
5524static void 5637static void
5525init_sched_build_groups(struct sched_group groups[], cpumask_t span, 5638init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5526 const cpumask_t *cpu_map, 5639 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5527 int (*group_fn)(int cpu, const cpumask_t *cpu_map)) 5640 struct sched_group **sg))
5528{ 5641{
5529 struct sched_group *first = NULL, *last = NULL; 5642 struct sched_group *first = NULL, *last = NULL;
5530 cpumask_t covered = CPU_MASK_NONE; 5643 cpumask_t covered = CPU_MASK_NONE;
5531 int i; 5644 int i;
5532 5645
5533 for_each_cpu_mask(i, span) { 5646 for_each_cpu_mask(i, span) {
5534 int group = group_fn(i, cpu_map); 5647 struct sched_group *sg;
5535 struct sched_group *sg = &groups[group]; 5648 int group = group_fn(i, cpu_map, &sg);
5536 int j; 5649 int j;
5537 5650
5538 if (cpu_isset(i, covered)) 5651 if (cpu_isset(i, covered))
@@ -5542,7 +5655,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span,
5542 sg->cpu_power = 0; 5655 sg->cpu_power = 0;
5543 5656
5544 for_each_cpu_mask(j, span) { 5657 for_each_cpu_mask(j, span) {
5545 if (group_fn(j, cpu_map) != group) 5658 if (group_fn(j, cpu_map, NULL) != group)
5546 continue; 5659 continue;
5547 5660
5548 cpu_set(j, covered); 5661 cpu_set(j, covered);
@@ -5716,8 +5829,9 @@ __setup("max_cache_size=", setup_max_cache_size);
5716 */ 5829 */
5717static void touch_cache(void *__cache, unsigned long __size) 5830static void touch_cache(void *__cache, unsigned long __size)
5718{ 5831{
5719 unsigned long size = __size/sizeof(long), chunk1 = size/3, 5832 unsigned long size = __size / sizeof(long);
5720 chunk2 = 2*size/3; 5833 unsigned long chunk1 = size / 3;
5834 unsigned long chunk2 = 2 * size / 3;
5721 unsigned long *cache = __cache; 5835 unsigned long *cache = __cache;
5722 int i; 5836 int i;
5723 5837
@@ -5826,11 +5940,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5826 */ 5940 */
5827 measure_one(cache, size, cpu1, cpu2); 5941 measure_one(cache, size, cpu1, cpu2);
5828 for (i = 0; i < ITERATIONS; i++) 5942 for (i = 0; i < ITERATIONS; i++)
5829 cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); 5943 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
5830 5944
5831 measure_one(cache, size, cpu2, cpu1); 5945 measure_one(cache, size, cpu2, cpu1);
5832 for (i = 0; i < ITERATIONS; i++) 5946 for (i = 0; i < ITERATIONS; i++)
5833 cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); 5947 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
5834 5948
5835 /* 5949 /*
5836 * (We measure the non-migrating [cached] cost on both 5950 * (We measure the non-migrating [cached] cost on both
@@ -5840,17 +5954,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5840 5954
5841 measure_one(cache, size, cpu1, cpu1); 5955 measure_one(cache, size, cpu1, cpu1);
5842 for (i = 0; i < ITERATIONS; i++) 5956 for (i = 0; i < ITERATIONS; i++)
5843 cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); 5957 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
5844 5958
5845 measure_one(cache, size, cpu2, cpu2); 5959 measure_one(cache, size, cpu2, cpu2);
5846 for (i = 0; i < ITERATIONS; i++) 5960 for (i = 0; i < ITERATIONS; i++)
5847 cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); 5961 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
5848 5962
5849 /* 5963 /*
5850 * Get the per-iteration migration cost: 5964 * Get the per-iteration migration cost:
5851 */ 5965 */
5852 do_div(cost1, 2*ITERATIONS); 5966 do_div(cost1, 2 * ITERATIONS);
5853 do_div(cost2, 2*ITERATIONS); 5967 do_div(cost2, 2 * ITERATIONS);
5854 5968
5855 return cost1 - cost2; 5969 return cost1 - cost2;
5856} 5970}
@@ -5888,7 +6002,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5888 */ 6002 */
5889 cache = vmalloc(max_size); 6003 cache = vmalloc(max_size);
5890 if (!cache) { 6004 if (!cache) {
5891 printk("could not vmalloc %d bytes for cache!\n", 2*max_size); 6005 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
5892 return 1000000; /* return 1 msec on very small boxen */ 6006 return 1000000; /* return 1 msec on very small boxen */
5893 } 6007 }
5894 6008
@@ -5913,7 +6027,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5913 avg_fluct = (avg_fluct + fluct)/2; 6027 avg_fluct = (avg_fluct + fluct)/2;
5914 6028
5915 if (migration_debug) 6029 if (migration_debug)
5916 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", 6030 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
6031 "(%8Ld %8Ld)\n",
5917 cpu1, cpu2, size, 6032 cpu1, cpu2, size,
5918 (long)cost / 1000000, 6033 (long)cost / 1000000,
5919 ((long)cost / 100000) % 10, 6034 ((long)cost / 100000) % 10,
@@ -6008,20 +6123,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
6008 -1 6123 -1
6009#endif 6124#endif
6010 ); 6125 );
6011 if (system_state == SYSTEM_BOOTING) { 6126 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
6012 if (num_online_cpus() > 1) { 6127 printk("migration_cost=");
6013 printk("migration_cost="); 6128 for (distance = 0; distance <= max_distance; distance++) {
6014 for (distance = 0; distance <= max_distance; distance++) { 6129 if (distance)
6015 if (distance) 6130 printk(",");
6016 printk(","); 6131 printk("%ld", (long)migration_cost[distance] / 1000);
6017 printk("%ld", (long)migration_cost[distance] / 1000);
6018 }
6019 printk("\n");
6020 } 6132 }
6133 printk("\n");
6021 } 6134 }
6022 j1 = jiffies; 6135 j1 = jiffies;
6023 if (migration_debug) 6136 if (migration_debug)
6024 printk("migration: %ld seconds\n", (j1-j0)/HZ); 6137 printk("migration: %ld seconds\n", (j1-j0) / HZ);
6025 6138
6026 /* 6139 /*
6027 * Move back to the original CPU. NUMA-Q gets confused 6140 * Move back to the original CPU. NUMA-Q gets confused
@@ -6118,10 +6231,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6118 */ 6231 */
6119#ifdef CONFIG_SCHED_SMT 6232#ifdef CONFIG_SCHED_SMT
6120static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6233static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6121static struct sched_group sched_group_cpus[NR_CPUS]; 6234static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6122 6235
6123static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) 6236static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
6237 struct sched_group **sg)
6124{ 6238{
6239 if (sg)
6240 *sg = &per_cpu(sched_group_cpus, cpu);
6125 return cpu; 6241 return cpu;
6126} 6242}
6127#endif 6243#endif
@@ -6131,39 +6247,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
6131 */ 6247 */
6132#ifdef CONFIG_SCHED_MC 6248#ifdef CONFIG_SCHED_MC
6133static DEFINE_PER_CPU(struct sched_domain, core_domains); 6249static DEFINE_PER_CPU(struct sched_domain, core_domains);
6134static struct sched_group sched_group_core[NR_CPUS]; 6250static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6135#endif 6251#endif
6136 6252
6137#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6253#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6138static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) 6254static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6255 struct sched_group **sg)
6139{ 6256{
6257 int group;
6140 cpumask_t mask = cpu_sibling_map[cpu]; 6258 cpumask_t mask = cpu_sibling_map[cpu];
6141 cpus_and(mask, mask, *cpu_map); 6259 cpus_and(mask, mask, *cpu_map);
6142 return first_cpu(mask); 6260 group = first_cpu(mask);
6261 if (sg)
6262 *sg = &per_cpu(sched_group_core, group);
6263 return group;
6143} 6264}
6144#elif defined(CONFIG_SCHED_MC) 6265#elif defined(CONFIG_SCHED_MC)
6145static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) 6266static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6267 struct sched_group **sg)
6146{ 6268{
6269 if (sg)
6270 *sg = &per_cpu(sched_group_core, cpu);
6147 return cpu; 6271 return cpu;
6148} 6272}
6149#endif 6273#endif
6150 6274
6151static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6275static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6152static struct sched_group sched_group_phys[NR_CPUS]; 6276static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6153 6277
6154static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) 6278static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
6279 struct sched_group **sg)
6155{ 6280{
6281 int group;
6156#ifdef CONFIG_SCHED_MC 6282#ifdef CONFIG_SCHED_MC
6157 cpumask_t mask = cpu_coregroup_map(cpu); 6283 cpumask_t mask = cpu_coregroup_map(cpu);
6158 cpus_and(mask, mask, *cpu_map); 6284 cpus_and(mask, mask, *cpu_map);
6159 return first_cpu(mask); 6285 group = first_cpu(mask);
6160#elif defined(CONFIG_SCHED_SMT) 6286#elif defined(CONFIG_SCHED_SMT)
6161 cpumask_t mask = cpu_sibling_map[cpu]; 6287 cpumask_t mask = cpu_sibling_map[cpu];
6162 cpus_and(mask, mask, *cpu_map); 6288 cpus_and(mask, mask, *cpu_map);
6163 return first_cpu(mask); 6289 group = first_cpu(mask);
6164#else 6290#else
6165 return cpu; 6291 group = cpu;
6166#endif 6292#endif
6293 if (sg)
6294 *sg = &per_cpu(sched_group_phys, group);
6295 return group;
6167} 6296}
6168 6297
6169#ifdef CONFIG_NUMA 6298#ifdef CONFIG_NUMA
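
Here the static struct sched_group arrays sized by NR_CPUS become DEFINE_PER_CPU variables, so each group descriptor lives in the owning CPU's per-CPU area and is reached through per_cpu(). A kernel-style sketch of that idiom (illustrative module code; the structure and variable names are made up):

#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>

struct example_group {                      /* made-up stand-in structure */
	unsigned int power;
};

static DEFINE_PER_CPU(struct example_group, example_groups);

static int __init example_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)              /* touch each CPU's copy */
		per_cpu(example_groups, cpu).power = 1;
	return 0;
}

static void __exit example_exit(void)
{
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");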
@@ -6176,12 +6305,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
6176static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 6305static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
6177 6306
6178static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 6307static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6179static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; 6308static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6180 6309
6181static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) 6310static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6311 struct sched_group **sg)
6182{ 6312{
6183 return cpu_to_node(cpu); 6313 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6314 int group;
6315
6316 cpus_and(nodemask, nodemask, *cpu_map);
6317 group = first_cpu(nodemask);
6318
6319 if (sg)
6320 *sg = &per_cpu(sched_group_allnodes, group);
6321 return group;
6184} 6322}
6323
6185static void init_numa_sched_groups_power(struct sched_group *group_head) 6324static void init_numa_sched_groups_power(struct sched_group *group_head)
6186{ 6325{
6187 struct sched_group *sg = group_head; 6326 struct sched_group *sg = group_head;
@@ -6217,16 +6356,9 @@ static void free_sched_groups(const cpumask_t *cpu_map)
6217 int cpu, i; 6356 int cpu, i;
6218 6357
6219 for_each_cpu_mask(cpu, *cpu_map) { 6358 for_each_cpu_mask(cpu, *cpu_map) {
6220 struct sched_group *sched_group_allnodes
6221 = sched_group_allnodes_bycpu[cpu];
6222 struct sched_group **sched_group_nodes 6359 struct sched_group **sched_group_nodes
6223 = sched_group_nodes_bycpu[cpu]; 6360 = sched_group_nodes_bycpu[cpu];
6224 6361
6225 if (sched_group_allnodes) {
6226 kfree(sched_group_allnodes);
6227 sched_group_allnodes_bycpu[cpu] = NULL;
6228 }
6229
6230 if (!sched_group_nodes) 6362 if (!sched_group_nodes)
6231 continue; 6363 continue;
6232 6364
@@ -6320,7 +6452,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6320 struct sched_domain *sd; 6452 struct sched_domain *sd;
6321#ifdef CONFIG_NUMA 6453#ifdef CONFIG_NUMA
6322 struct sched_group **sched_group_nodes = NULL; 6454 struct sched_group **sched_group_nodes = NULL;
6323 struct sched_group *sched_group_allnodes = NULL; 6455 int sd_allnodes = 0;
6324 6456
6325 /* 6457 /*
6326 * Allocate the per-node list of sched groups 6458 * Allocate the per-node list of sched groups
@@ -6338,7 +6470,6 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6338 * Set up domains for cpus specified by the cpu_map. 6470 * Set up domains for cpus specified by the cpu_map.
6339 */ 6471 */
6340 for_each_cpu_mask(i, *cpu_map) { 6472 for_each_cpu_mask(i, *cpu_map) {
6341 int group;
6342 struct sched_domain *sd = NULL, *p; 6473 struct sched_domain *sd = NULL, *p;
6343 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 6474 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6344 6475
@@ -6347,26 +6478,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6347#ifdef CONFIG_NUMA 6478#ifdef CONFIG_NUMA
6348 if (cpus_weight(*cpu_map) 6479 if (cpus_weight(*cpu_map)
6349 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 6480 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6350 if (!sched_group_allnodes) {
6351 sched_group_allnodes
6352 = kmalloc_node(sizeof(struct sched_group)
6353 * MAX_NUMNODES,
6354 GFP_KERNEL,
6355 cpu_to_node(i));
6356 if (!sched_group_allnodes) {
6357 printk(KERN_WARNING
6358 "Can not alloc allnodes sched group\n");
6359 goto error;
6360 }
6361 sched_group_allnodes_bycpu[i]
6362 = sched_group_allnodes;
6363 }
6364 sd = &per_cpu(allnodes_domains, i); 6481 sd = &per_cpu(allnodes_domains, i);
6365 *sd = SD_ALLNODES_INIT; 6482 *sd = SD_ALLNODES_INIT;
6366 sd->span = *cpu_map; 6483 sd->span = *cpu_map;
6367 group = cpu_to_allnodes_group(i, cpu_map); 6484 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6368 sd->groups = &sched_group_allnodes[group];
6369 p = sd; 6485 p = sd;
6486 sd_allnodes = 1;
6370 } else 6487 } else
6371 p = NULL; 6488 p = NULL;
6372 6489
@@ -6381,36 +6498,33 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6381 6498
6382 p = sd; 6499 p = sd;
6383 sd = &per_cpu(phys_domains, i); 6500 sd = &per_cpu(phys_domains, i);
6384 group = cpu_to_phys_group(i, cpu_map);
6385 *sd = SD_CPU_INIT; 6501 *sd = SD_CPU_INIT;
6386 sd->span = nodemask; 6502 sd->span = nodemask;
6387 sd->parent = p; 6503 sd->parent = p;
6388 if (p) 6504 if (p)
6389 p->child = sd; 6505 p->child = sd;
6390 sd->groups = &sched_group_phys[group]; 6506 cpu_to_phys_group(i, cpu_map, &sd->groups);
6391 6507
6392#ifdef CONFIG_SCHED_MC 6508#ifdef CONFIG_SCHED_MC
6393 p = sd; 6509 p = sd;
6394 sd = &per_cpu(core_domains, i); 6510 sd = &per_cpu(core_domains, i);
6395 group = cpu_to_core_group(i, cpu_map);
6396 *sd = SD_MC_INIT; 6511 *sd = SD_MC_INIT;
6397 sd->span = cpu_coregroup_map(i); 6512 sd->span = cpu_coregroup_map(i);
6398 cpus_and(sd->span, sd->span, *cpu_map); 6513 cpus_and(sd->span, sd->span, *cpu_map);
6399 sd->parent = p; 6514 sd->parent = p;
6400 p->child = sd; 6515 p->child = sd;
6401 sd->groups = &sched_group_core[group]; 6516 cpu_to_core_group(i, cpu_map, &sd->groups);
6402#endif 6517#endif
6403 6518
6404#ifdef CONFIG_SCHED_SMT 6519#ifdef CONFIG_SCHED_SMT
6405 p = sd; 6520 p = sd;
6406 sd = &per_cpu(cpu_domains, i); 6521 sd = &per_cpu(cpu_domains, i);
6407 group = cpu_to_cpu_group(i, cpu_map);
6408 *sd = SD_SIBLING_INIT; 6522 *sd = SD_SIBLING_INIT;
6409 sd->span = cpu_sibling_map[i]; 6523 sd->span = cpu_sibling_map[i];
6410 cpus_and(sd->span, sd->span, *cpu_map); 6524 cpus_and(sd->span, sd->span, *cpu_map);
6411 sd->parent = p; 6525 sd->parent = p;
6412 p->child = sd; 6526 p->child = sd;
6413 sd->groups = &sched_group_cpus[group]; 6527 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6414#endif 6528#endif
6415 } 6529 }
6416 6530
@@ -6422,8 +6536,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6422 if (i != first_cpu(this_sibling_map)) 6536 if (i != first_cpu(this_sibling_map))
6423 continue; 6537 continue;
6424 6538
6425 init_sched_build_groups(sched_group_cpus, this_sibling_map, 6539 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
6426 cpu_map, &cpu_to_cpu_group);
6427 } 6540 }
6428#endif 6541#endif
6429 6542
@@ -6434,8 +6547,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6434 cpus_and(this_core_map, this_core_map, *cpu_map); 6547 cpus_and(this_core_map, this_core_map, *cpu_map);
6435 if (i != first_cpu(this_core_map)) 6548 if (i != first_cpu(this_core_map))
6436 continue; 6549 continue;
6437 init_sched_build_groups(sched_group_core, this_core_map, 6550 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
6438 cpu_map, &cpu_to_core_group);
6439 } 6551 }
6440#endif 6552#endif
6441 6553
@@ -6448,15 +6560,13 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6448 if (cpus_empty(nodemask)) 6560 if (cpus_empty(nodemask))
6449 continue; 6561 continue;
6450 6562
6451 init_sched_build_groups(sched_group_phys, nodemask, 6563 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6452 cpu_map, &cpu_to_phys_group);
6453 } 6564 }
6454 6565
6455#ifdef CONFIG_NUMA 6566#ifdef CONFIG_NUMA
6456 /* Set up node groups */ 6567 /* Set up node groups */
6457 if (sched_group_allnodes) 6568 if (sd_allnodes)
6458 init_sched_build_groups(sched_group_allnodes, *cpu_map, 6569 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
6459 cpu_map, &cpu_to_allnodes_group);
6460 6570
6461 for (i = 0; i < MAX_NUMNODES; i++) { 6571 for (i = 0; i < MAX_NUMNODES; i++) {
6462 /* Set up node groups */ 6572 /* Set up node groups */
@@ -6548,10 +6658,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6548 for (i = 0; i < MAX_NUMNODES; i++) 6658 for (i = 0; i < MAX_NUMNODES; i++)
6549 init_numa_sched_groups_power(sched_group_nodes[i]); 6659 init_numa_sched_groups_power(sched_group_nodes[i]);
6550 6660
6551 if (sched_group_allnodes) { 6661 if (sd_allnodes) {
6552 int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); 6662 struct sched_group *sg;
6553 struct sched_group *sg = &sched_group_allnodes[group];
6554 6663
6664 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6555 init_numa_sched_groups_power(sg); 6665 init_numa_sched_groups_power(sg);
6556 } 6666 }
6557#endif 6667#endif
@@ -6723,8 +6833,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6723 sched_smt_power_savings_store); 6833 sched_smt_power_savings_store);
6724#endif 6834#endif
6725 6835
6726
6727#ifdef CONFIG_HOTPLUG_CPU
6728/* 6836/*
6729 * Force a reinitialization of the sched domains hierarchy. The domains 6837 * Force a reinitialization of the sched domains hierarchy. The domains
6730 * and groups cannot be updated in place without racing with the balancing 6838 * and groups cannot be updated in place without racing with the balancing
@@ -6757,7 +6865,6 @@ static int update_sched_domains(struct notifier_block *nfb,
6757 6865
6758 return NOTIFY_OK; 6866 return NOTIFY_OK;
6759} 6867}
6760#endif
6761 6868
6762void __init sched_init_smp(void) 6869void __init sched_init_smp(void)
6763{ 6870{
@@ -6833,6 +6940,10 @@ void __init sched_init(void)
6833 6940
6834 set_load_weight(&init_task); 6941 set_load_weight(&init_task);
6835 6942
6943#ifdef CONFIG_SMP
6944 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6945#endif
6946
6836#ifdef CONFIG_RT_MUTEXES 6947#ifdef CONFIG_RT_MUTEXES
6837 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 6948 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6838#endif 6949#endif
@@ -6867,6 +6978,9 @@ void __might_sleep(char *file, int line)
6867 " context at %s:%d\n", file, line); 6978 " context at %s:%d\n", file, line);
6868 printk("in_atomic():%d, irqs_disabled():%d\n", 6979 printk("in_atomic():%d, irqs_disabled():%d\n",
6869 in_atomic(), irqs_disabled()); 6980 in_atomic(), irqs_disabled());
6981 debug_show_held_locks(current);
6982 if (irqs_disabled())
6983 print_irqtrace_events(current);
6870 dump_stack(); 6984 dump_stack();
6871 } 6985 }
6872#endif 6986#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index 7ed8d5304bec..5630255d2e2a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,6 +23,10 @@
23#include <linux/ptrace.h> 23#include <linux/ptrace.h>
24#include <linux/signal.h> 24#include <linux/signal.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/freezer.h>
27#include <linux/pid_namespace.h>
28#include <linux/nsproxy.h>
29
26#include <asm/param.h> 30#include <asm/param.h>
27#include <asm/uaccess.h> 31#include <asm/uaccess.h>
28#include <asm/unistd.h> 32#include <asm/unistd.h>
@@ -33,7 +37,7 @@
33 * SLAB caches for signal bits. 37 * SLAB caches for signal bits.
34 */ 38 */
35 39
36static kmem_cache_t *sigqueue_cachep; 40static struct kmem_cache *sigqueue_cachep;
37 41
38/* 42/*
39 * In POSIX a signal is sent either to a specific thread (Linux task) 43 * In POSIX a signal is sent either to a specific thread (Linux task)
@@ -267,18 +271,25 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
267 int override_rlimit) 271 int override_rlimit)
268{ 272{
269 struct sigqueue *q = NULL; 273 struct sigqueue *q = NULL;
274 struct user_struct *user;
270 275
271 atomic_inc(&t->user->sigpending); 276 /*
277 * In order to avoid problems with "switch_user()", we want to make
278 * sure that the compiler doesn't re-load "t->user"
279 */
280 user = t->user;
281 barrier();
282 atomic_inc(&user->sigpending);
272 if (override_rlimit || 283 if (override_rlimit ||
273 atomic_read(&t->user->sigpending) <= 284 atomic_read(&user->sigpending) <=
274 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) 285 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
275 q = kmem_cache_alloc(sigqueue_cachep, flags); 286 q = kmem_cache_alloc(sigqueue_cachep, flags);
276 if (unlikely(q == NULL)) { 287 if (unlikely(q == NULL)) {
277 atomic_dec(&t->user->sigpending); 288 atomic_dec(&user->sigpending);
278 } else { 289 } else {
279 INIT_LIST_HEAD(&q->list); 290 INIT_LIST_HEAD(&q->list);
280 q->flags = 0; 291 q->flags = 0;
281 q->user = get_uid(t->user); 292 q->user = get_uid(user);
282 } 293 }
283 return(q); 294 return(q);
284} 295}
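
The allocation path now reads t->user once into a local variable, with a compiler barrier so the increment, the rlimit check, the error-path decrement and get_uid() all act on the same user_struct even if t->user is switched concurrently. The shape of that idiom in plain C (illustrative only; barrier() is the usual GCC memory clobber, the structures are invented stand-ins):

#include <stdio.h>

#define barrier() __asm__ __volatile__("" ::: "memory")

struct user {                      /* made-up stand-in for user_struct */
	int sigpending;
};

struct task {                      /* made-up stand-in for task_struct */
	struct user *user;         /* may be switched by another path */
};

static int charge_signal(struct task *t, int limit)
{
	struct user *user = t->user;   /* read the pointer exactly once */

	barrier();                     /* keep the compiler from re-reading t->user */
	user->sigpending++;
	if (user->sigpending > limit) {
		user->sigpending--;    /* undo against the same object we charged */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct user u = { 0 };
	struct task t = { &u };

	printf("charge: %d, pending now %d\n", charge_signal(&t, 4), u.sigpending);
	return 0;
}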
@@ -575,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
575 error = -EPERM; 586 error = -EPERM;
576 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) 587 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
577 && ((sig != SIGCONT) || 588 && ((sig != SIGCONT) ||
578 (current->signal->session != t->signal->session)) 589 (process_session(current) != process_session(t)))
579 && (current->euid ^ t->suid) && (current->euid ^ t->uid) 590 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
580 && (current->uid ^ t->suid) && (current->uid ^ t->uid) 591 && (current->uid ^ t->suid) && (current->uid ^ t->uid)
581 && !capable(CAP_KILL)) 592 && !capable(CAP_KILL))
@@ -1126,8 +1137,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
1126 return error; 1137 return error;
1127} 1138}
1128 1139
1129int 1140static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1130kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1131{ 1141{
1132 int error; 1142 int error;
1133 rcu_read_lock(); 1143 rcu_read_lock();
@@ -1695,7 +1705,9 @@ finish_stop(int stop_count)
1695 read_unlock(&tasklist_lock); 1705 read_unlock(&tasklist_lock);
1696 } 1706 }
1697 1707
1698 schedule(); 1708 do {
1709 schedule();
1710 } while (try_to_freeze());
1699 /* 1711 /*
1700 * Now we don't run again until continued. 1712 * Now we don't run again until continued.
1701 */ 1713 */
@@ -1870,8 +1882,12 @@ relock:
1870 if (sig_kernel_ignore(signr)) /* Default is nothing. */ 1882 if (sig_kernel_ignore(signr)) /* Default is nothing. */
1871 continue; 1883 continue;
1872 1884
1873 /* Init gets no signals it doesn't want. */ 1885 /*
1874 if (current == child_reaper) 1886 * Init of a pid space gets no signals it doesn't want from
1887 * within that pid space. It can of course get signals from
1888 * its parent pid space.
1889 */
1890 if (current == child_reaper(current))
1875 continue; 1891 continue;
1876 1892
1877 if (sig_kernel_stop(signr)) { 1893 if (sig_kernel_stop(signr)) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bf25015dce16..918e52df090e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
574 574
575 switch (action) { 575 switch (action) {
576 case CPU_UP_PREPARE: 576 case CPU_UP_PREPARE:
577 BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
578 BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
579 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 577 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
580 if (IS_ERR(p)) { 578 if (IS_ERR(p)) {
581 printk("ksoftirqd for %i failed\n", hotcpu); 579 printk("ksoftirqd for %i failed\n", hotcpu);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 476c3741511b..2c6c2bf85514 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -293,6 +293,27 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
293} 293}
294 294
295EXPORT_SYMBOL(_spin_lock_nested); 295EXPORT_SYMBOL(_spin_lock_nested);
296unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
297{
298 unsigned long flags;
299
300 local_irq_save(flags);
301 preempt_disable();
302 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
303 /*
 304 * On lockdep we don't want the hand-coded irq-enable of
305 * _raw_spin_lock_flags() code, because lockdep assumes
306 * that interrupts are not re-enabled during lock-acquire:
307 */
308#ifdef CONFIG_PROVE_SPIN_LOCKING
309 _raw_spin_lock(lock);
310#else
311 _raw_spin_lock_flags(lock, &flags);
312#endif
313 return flags;
314}
315
316EXPORT_SYMBOL(_spin_lock_irqsave_nested);
296 317
297#endif 318#endif
298 319
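
The new _spin_lock_irqsave_nested() gives callers that take two locks of the same lock class a way to tell lockdep that the acquisition belongs to a distinct subclass rather than being a self-deadlock, while also saving interrupt flags. A hedged kernel-style sketch of the general subclass idea, using spin_lock_nested() for the second lock once interrupts are already off (the structures are invented; the irqsave-nested variant is for callers where the nested acquisition itself must disable interrupts):

#include <linux/spinlock.h>
#include <linux/lockdep.h>

struct queue {                          /* made-up object carrying a lock */
	spinlock_t lock;
	int items;
};

/*
 * Take both queue locks. They belong to the same lock class, so without an
 * annotation lockdep would flag the second acquisition as a possible
 * recursive deadlock; SINGLE_DEPTH_NESTING marks the nesting as intended.
 */
static void double_lock(struct queue *a, struct queue *b, unsigned long *flags)
{
	if (a < b) {                     /* lock in address order */
		spin_lock_irqsave(&a->lock, *flags);
		spin_lock_nested(&b->lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock_irqsave(&b->lock, *flags);
		spin_lock_nested(&a->lock, SINGLE_DEPTH_NESTING);
	}
}

static void double_unlock(struct queue *a, struct queue *b, unsigned long flags)
{
	spin_unlock(&b->lock);
	spin_unlock_irqrestore(&a->lock, flags);  /* irqs stay off until both dropped */
}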
diff --git a/kernel/sys.c b/kernel/sys.c
index 98489d82801b..c7675c1bfdf2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -880,7 +880,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
880 return 0; 880 return 0;
881} 881}
882 882
883static void deferred_cad(void *dummy) 883static void deferred_cad(struct work_struct *dummy)
884{ 884{
885 kernel_restart(NULL); 885 kernel_restart(NULL);
886} 886}
@@ -892,7 +892,7 @@ static void deferred_cad(void *dummy)
892 */ 892 */
893void ctrl_alt_del(void) 893void ctrl_alt_del(void)
894{ 894{
895 static DECLARE_WORK(cad_work, deferred_cad, NULL); 895 static DECLARE_WORK(cad_work, deferred_cad);
896 896
897 if (C_A_D) 897 if (C_A_D)
898 schedule_work(&cad_work); 898 schedule_work(&cad_work);
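
deferred_cad() is converted to the reworked workqueue convention: handlers receive the struct work_struct pointer itself, DECLARE_WORK()/INIT_WORK() no longer carry a data argument, and per-item data is recovered with container_of() from the embedding structure. A kernel-style sketch of the new-style pattern (illustrative module code; the item and handler names are made up):

#include <linux/module.h>
#include <linux/workqueue.h>

struct example_item {
	int value;
	struct work_struct work;        /* embedded, no more void *data */
};

static struct example_item item = { .value = 42 };

static void example_handler(struct work_struct *work)
{
	/* Recover the enclosing object from the work pointer. */
	struct example_item *it = container_of(work, struct example_item, work);

	printk(KERN_INFO "handled item with value %d\n", it->value);
}

static int __init example_init(void)
{
	INIT_WORK(&item.work, example_handler);  /* two arguments now */
	schedule_work(&item.work);
	return 0;
}

static void __exit example_exit(void)
{
	flush_scheduled_work();
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");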
@@ -1102,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
1102asmlinkage long sys_setuid(uid_t uid) 1102asmlinkage long sys_setuid(uid_t uid)
1103{ 1103{
1104 int old_euid = current->euid; 1104 int old_euid = current->euid;
1105 int old_ruid, old_suid, new_ruid, new_suid; 1105 int old_ruid, old_suid, new_suid;
1106 int retval; 1106 int retval;
1107 1107
1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); 1108 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
1109 if (retval) 1109 if (retval)
1110 return retval; 1110 return retval;
1111 1111
1112 old_ruid = new_ruid = current->uid; 1112 old_ruid = current->uid;
1113 old_suid = current->suid; 1113 old_suid = current->suid;
1114 new_suid = old_suid; 1114 new_suid = old_suid;
1115 1115
@@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1381 1381
1382 if (p->real_parent == group_leader) { 1382 if (p->real_parent == group_leader) {
1383 err = -EPERM; 1383 err = -EPERM;
1384 if (p->signal->session != group_leader->signal->session) 1384 if (process_session(p) != process_session(group_leader))
1385 goto out; 1385 goto out;
1386 err = -EACCES; 1386 err = -EACCES;
1387 if (p->did_exec) 1387 if (p->did_exec)
@@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
1397 goto out; 1397 goto out;
1398 1398
1399 if (pgid != pid) { 1399 if (pgid != pid) {
1400 struct task_struct *p; 1400 struct task_struct *g =
1401 find_task_by_pid_type(PIDTYPE_PGID, pgid);
1401 1402
1402 do_each_task_pid(pgid, PIDTYPE_PGID, p) { 1403 if (!g || process_session(g) != process_session(group_leader))
1403 if (p->signal->session == group_leader->signal->session) 1404 goto out;
1404 goto ok_pgid;
1405 } while_each_task_pid(pgid, PIDTYPE_PGID, p);
1406 goto out;
1407 } 1405 }
1408 1406
1409ok_pgid:
1410 err = security_task_setpgid(p, pgid); 1407 err = security_task_setpgid(p, pgid);
1411 if (err) 1408 if (err)
1412 goto out; 1409 goto out;
@@ -1459,7 +1456,7 @@ asmlinkage long sys_getpgrp(void)
1459asmlinkage long sys_getsid(pid_t pid) 1456asmlinkage long sys_getsid(pid_t pid)
1460{ 1457{
1461 if (!pid) 1458 if (!pid)
1462 return current->signal->session; 1459 return process_session(current);
1463 else { 1460 else {
1464 int retval; 1461 int retval;
1465 struct task_struct *p; 1462 struct task_struct *p;
@@ -1471,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid)
1471 if (p) { 1468 if (p) {
1472 retval = security_task_getsid(p); 1469 retval = security_task_getsid(p);
1473 if (!retval) 1470 if (!retval)
1474 retval = p->signal->session; 1471 retval = process_session(p);
1475 } 1472 }
1476 read_unlock(&tasklist_lock); 1473 read_unlock(&tasklist_lock);
1477 return retval; 1474 return retval;
@@ -1484,7 +1481,6 @@ asmlinkage long sys_setsid(void)
1484 pid_t session; 1481 pid_t session;
1485 int err = -EPERM; 1482 int err = -EPERM;
1486 1483
1487 mutex_lock(&tty_mutex);
1488 write_lock_irq(&tasklist_lock); 1484 write_lock_irq(&tasklist_lock);
1489 1485
1490 /* Fail if I am already a session leader */ 1486 /* Fail if I am already a session leader */
@@ -1504,12 +1500,15 @@ asmlinkage long sys_setsid(void)
1504 1500
1505 group_leader->signal->leader = 1; 1501 group_leader->signal->leader = 1;
1506 __set_special_pids(session, session); 1502 __set_special_pids(session, session);
1503
1504 spin_lock(&group_leader->sighand->siglock);
1507 group_leader->signal->tty = NULL; 1505 group_leader->signal->tty = NULL;
1508 group_leader->signal->tty_old_pgrp = 0; 1506 group_leader->signal->tty_old_pgrp = 0;
1507 spin_unlock(&group_leader->sighand->siglock);
1508
1509 err = process_group(group_leader); 1509 err = process_group(group_leader);
1510out: 1510out:
1511 write_unlock_irq(&tasklist_lock); 1511 write_unlock_irq(&tasklist_lock);
1512 mutex_unlock(&tty_mutex);
1513 return err; 1512 return err;
1514} 1513}
1515 1514
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0e53314b14de..d7306d0f3dfc 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -135,6 +135,7 @@ cond_syscall(sys_madvise);
135cond_syscall(sys_mremap); 135cond_syscall(sys_mremap);
136cond_syscall(sys_remap_file_pages); 136cond_syscall(sys_remap_file_pages);
137cond_syscall(compat_sys_move_pages); 137cond_syscall(compat_sys_move_pages);
138cond_syscall(compat_sys_migrate_pages);
138 139
139/* block-layer dependent */ 140/* block-layer dependent */
140cond_syscall(sys_bdflush); 141cond_syscall(sys_bdflush);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8bff2c18fb5a..600b33358ded 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -54,6 +54,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
54 54
55#ifdef CONFIG_X86 55#ifdef CONFIG_X86
56#include <asm/nmi.h> 56#include <asm/nmi.h>
57#include <asm/stacktrace.h>
57#endif 58#endif
58 59
59#if defined(CONFIG_SYSCTL) 60#if defined(CONFIG_SYSCTL)
@@ -64,7 +65,6 @@ extern int sysctl_overcommit_memory;
64extern int sysctl_overcommit_ratio; 65extern int sysctl_overcommit_ratio;
65extern int sysctl_panic_on_oom; 66extern int sysctl_panic_on_oom;
66extern int max_threads; 67extern int max_threads;
67extern int sysrq_enabled;
68extern int core_uses_pid; 68extern int core_uses_pid;
69extern int suid_dumpable; 69extern int suid_dumpable;
70extern char core_pattern[]; 70extern char core_pattern[];
@@ -91,7 +91,9 @@ extern char modprobe_path[];
91extern int sg_big_buff; 91extern int sg_big_buff;
92#endif 92#endif
93#ifdef CONFIG_SYSVIPC 93#ifdef CONFIG_SYSVIPC
94static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, 94static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
95 void __user *buffer, size_t *lenp, loff_t *ppos);
96static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
95 void __user *buffer, size_t *lenp, loff_t *ppos); 97 void __user *buffer, size_t *lenp, loff_t *ppos);
96#endif 98#endif
97 99
@@ -130,12 +132,22 @@ extern int max_lock_depth;
130 132
131#ifdef CONFIG_SYSCTL_SYSCALL 133#ifdef CONFIG_SYSCTL_SYSCALL
132static int parse_table(int __user *, int, void __user *, size_t __user *, 134static int parse_table(int __user *, int, void __user *, size_t __user *,
133 void __user *, size_t, ctl_table *, void **); 135 void __user *, size_t, ctl_table *);
134#endif 136#endif
135 137
136static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 138static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
137 void __user *buffer, size_t *lenp, loff_t *ppos); 139 void __user *buffer, size_t *lenp, loff_t *ppos);
138 140
141static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
142 void __user *oldval, size_t __user *oldlenp,
143 void __user *newval, size_t newlen);
144
145#ifdef CONFIG_SYSVIPC
146static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
147 void __user *oldval, size_t __user *oldlenp,
148 void __user *newval, size_t newlen);
149#endif
150
139#ifdef CONFIG_PROC_SYSCTL 151#ifdef CONFIG_PROC_SYSCTL
140static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 152static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
141 void __user *buffer, size_t *lenp, loff_t *ppos); 153 void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -162,6 +174,40 @@ extern ctl_table inotify_table[];
162int sysctl_legacy_va_layout; 174int sysctl_legacy_va_layout;
163#endif 175#endif
164 176
177static void *get_uts(ctl_table *table, int write)
178{
179 char *which = table->data;
180#ifdef CONFIG_UTS_NS
181 struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
182 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
183#endif
184 if (!write)
185 down_read(&uts_sem);
186 else
187 down_write(&uts_sem);
188 return which;
189}
190
191static void put_uts(ctl_table *table, int write, void *which)
192{
193 if (!write)
194 up_read(&uts_sem);
195 else
196 up_write(&uts_sem);
197}
198
199#ifdef CONFIG_SYSVIPC
200static void *get_ipc(ctl_table *table, int write)
201{
202 char *which = table->data;
203 struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
204 which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns;
205 return which;
206}
207#else
208#define get_ipc(T,W) ((T)->data)
209#endif
210
165/* /proc declarations: */ 211/* /proc declarations: */
166 212
167#ifdef CONFIG_PROC_SYSCTL 213#ifdef CONFIG_PROC_SYSCTL
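
The get_uts()/get_ipc() helpers added above rebase a ctl_table's data pointer from the global init_uts_ns/init_ipc_ns instance onto the current task's namespace copy by preserving the field's byte offset. A standalone sketch of that offset arithmetic (function and parameter names are illustrative):

    #include <stddef.h>

    /* Rebase a pointer to a field of init_obj onto the same field of
     * ns_obj, the per-namespace copy, by keeping the byte offset. */
    static void *ns_rebase(void *init_obj, void *init_field, void *ns_obj)
    {
            size_t offset = (char *)init_field - (char *)init_obj;

            return (char *)ns_obj + offset;
    }
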
@@ -170,7 +216,7 @@ static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
170static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); 216static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
171static int proc_opensys(struct inode *, struct file *); 217static int proc_opensys(struct inode *, struct file *);
172 218
173struct file_operations proc_sys_file_operations = { 219const struct file_operations proc_sys_file_operations = {
174 .open = proc_opensys, 220 .open = proc_opensys,
175 .read = proc_readsys, 221 .read = proc_readsys,
176 .write = proc_writesys, 222 .write = proc_writesys,
@@ -228,7 +274,6 @@ static ctl_table root_table[] = {
228}; 274};
229 275
230static ctl_table kern_table[] = { 276static ctl_table kern_table[] = {
231#ifndef CONFIG_UTS_NS
232 { 277 {
233 .ctl_name = KERN_OSTYPE, 278 .ctl_name = KERN_OSTYPE,
234 .procname = "ostype", 279 .procname = "ostype",
@@ -236,7 +281,7 @@ static ctl_table kern_table[] = {
236 .maxlen = sizeof(init_uts_ns.name.sysname), 281 .maxlen = sizeof(init_uts_ns.name.sysname),
237 .mode = 0444, 282 .mode = 0444,
238 .proc_handler = &proc_do_uts_string, 283 .proc_handler = &proc_do_uts_string,
239 .strategy = &sysctl_string, 284 .strategy = &sysctl_uts_string,
240 }, 285 },
241 { 286 {
242 .ctl_name = KERN_OSRELEASE, 287 .ctl_name = KERN_OSRELEASE,
@@ -245,7 +290,7 @@ static ctl_table kern_table[] = {
245 .maxlen = sizeof(init_uts_ns.name.release), 290 .maxlen = sizeof(init_uts_ns.name.release),
246 .mode = 0444, 291 .mode = 0444,
247 .proc_handler = &proc_do_uts_string, 292 .proc_handler = &proc_do_uts_string,
248 .strategy = &sysctl_string, 293 .strategy = &sysctl_uts_string,
249 }, 294 },
250 { 295 {
251 .ctl_name = KERN_VERSION, 296 .ctl_name = KERN_VERSION,
@@ -254,7 +299,7 @@ static ctl_table kern_table[] = {
254 .maxlen = sizeof(init_uts_ns.name.version), 299 .maxlen = sizeof(init_uts_ns.name.version),
255 .mode = 0444, 300 .mode = 0444,
256 .proc_handler = &proc_do_uts_string, 301 .proc_handler = &proc_do_uts_string,
257 .strategy = &sysctl_string, 302 .strategy = &sysctl_uts_string,
258 }, 303 },
259 { 304 {
260 .ctl_name = KERN_NODENAME, 305 .ctl_name = KERN_NODENAME,
@@ -263,7 +308,7 @@ static ctl_table kern_table[] = {
263 .maxlen = sizeof(init_uts_ns.name.nodename), 308 .maxlen = sizeof(init_uts_ns.name.nodename),
264 .mode = 0644, 309 .mode = 0644,
265 .proc_handler = &proc_do_uts_string, 310 .proc_handler = &proc_do_uts_string,
266 .strategy = &sysctl_string, 311 .strategy = &sysctl_uts_string,
267 }, 312 },
268 { 313 {
269 .ctl_name = KERN_DOMAINNAME, 314 .ctl_name = KERN_DOMAINNAME,
@@ -272,56 +317,8 @@ static ctl_table kern_table[] = {
272 .maxlen = sizeof(init_uts_ns.name.domainname), 317 .maxlen = sizeof(init_uts_ns.name.domainname),
273 .mode = 0644, 318 .mode = 0644,
274 .proc_handler = &proc_do_uts_string, 319 .proc_handler = &proc_do_uts_string,
275 .strategy = &sysctl_string, 320 .strategy = &sysctl_uts_string,
276 },
277#else /* !CONFIG_UTS_NS */
278 {
279 .ctl_name = KERN_OSTYPE,
280 .procname = "ostype",
281 .data = NULL,
282 /* could maybe use __NEW_UTS_LEN here? */
283 .maxlen = FIELD_SIZEOF(struct new_utsname, sysname),
284 .mode = 0444,
285 .proc_handler = &proc_do_uts_string,
286 .strategy = &sysctl_string,
287 },
288 {
289 .ctl_name = KERN_OSRELEASE,
290 .procname = "osrelease",
291 .data = NULL,
292 .maxlen = FIELD_SIZEOF(struct new_utsname, release),
293 .mode = 0444,
294 .proc_handler = &proc_do_uts_string,
295 .strategy = &sysctl_string,
296 },
297 {
298 .ctl_name = KERN_VERSION,
299 .procname = "version",
300 .data = NULL,
301 .maxlen = FIELD_SIZEOF(struct new_utsname, version),
302 .mode = 0444,
303 .proc_handler = &proc_do_uts_string,
304 .strategy = &sysctl_string,
305 },
306 {
307 .ctl_name = KERN_NODENAME,
308 .procname = "hostname",
309 .data = NULL,
310 .maxlen = FIELD_SIZEOF(struct new_utsname, nodename),
311 .mode = 0644,
312 .proc_handler = &proc_do_uts_string,
313 .strategy = &sysctl_string,
314 },
315 {
316 .ctl_name = KERN_DOMAINNAME,
317 .procname = "domainname",
318 .data = NULL,
319 .maxlen = FIELD_SIZEOF(struct new_utsname, domainname),
320 .mode = 0644,
321 .proc_handler = &proc_do_uts_string,
322 .strategy = &sysctl_string,
323 }, 321 },
324#endif /* !CONFIG_UTS_NS */
325 { 322 {
326 .ctl_name = KERN_PANIC, 323 .ctl_name = KERN_PANIC,
327 .procname = "panic", 324 .procname = "panic",
@@ -480,65 +477,72 @@ static ctl_table kern_table[] = {
480 { 477 {
481 .ctl_name = KERN_SHMMAX, 478 .ctl_name = KERN_SHMMAX,
482 .procname = "shmmax", 479 .procname = "shmmax",
483 .data = NULL, 480 .data = &init_ipc_ns.shm_ctlmax,
484 .maxlen = sizeof (size_t), 481 .maxlen = sizeof (init_ipc_ns.shm_ctlmax),
485 .mode = 0644, 482 .mode = 0644,
486 .proc_handler = &proc_do_ipc_string, 483 .proc_handler = &proc_ipc_doulongvec_minmax,
484 .strategy = sysctl_ipc_data,
487 }, 485 },
488 { 486 {
489 .ctl_name = KERN_SHMALL, 487 .ctl_name = KERN_SHMALL,
490 .procname = "shmall", 488 .procname = "shmall",
491 .data = NULL, 489 .data = &init_ipc_ns.shm_ctlall,
492 .maxlen = sizeof (size_t), 490 .maxlen = sizeof (init_ipc_ns.shm_ctlall),
493 .mode = 0644, 491 .mode = 0644,
494 .proc_handler = &proc_do_ipc_string, 492 .proc_handler = &proc_ipc_doulongvec_minmax,
493 .strategy = sysctl_ipc_data,
495 }, 494 },
496 { 495 {
497 .ctl_name = KERN_SHMMNI, 496 .ctl_name = KERN_SHMMNI,
498 .procname = "shmmni", 497 .procname = "shmmni",
499 .data = NULL, 498 .data = &init_ipc_ns.shm_ctlmni,
500 .maxlen = sizeof (int), 499 .maxlen = sizeof (init_ipc_ns.shm_ctlmni),
501 .mode = 0644, 500 .mode = 0644,
502 .proc_handler = &proc_do_ipc_string, 501 .proc_handler = &proc_ipc_dointvec,
502 .strategy = sysctl_ipc_data,
503 }, 503 },
504 { 504 {
505 .ctl_name = KERN_MSGMAX, 505 .ctl_name = KERN_MSGMAX,
506 .procname = "msgmax", 506 .procname = "msgmax",
507 .data = NULL, 507 .data = &init_ipc_ns.msg_ctlmax,
508 .maxlen = sizeof (int), 508 .maxlen = sizeof (init_ipc_ns.msg_ctlmax),
509 .mode = 0644, 509 .mode = 0644,
510 .proc_handler = &proc_do_ipc_string, 510 .proc_handler = &proc_ipc_dointvec,
511 .strategy = sysctl_ipc_data,
511 }, 512 },
512 { 513 {
513 .ctl_name = KERN_MSGMNI, 514 .ctl_name = KERN_MSGMNI,
514 .procname = "msgmni", 515 .procname = "msgmni",
515 .data = NULL, 516 .data = &init_ipc_ns.msg_ctlmni,
516 .maxlen = sizeof (int), 517 .maxlen = sizeof (init_ipc_ns.msg_ctlmni),
517 .mode = 0644, 518 .mode = 0644,
518 .proc_handler = &proc_do_ipc_string, 519 .proc_handler = &proc_ipc_dointvec,
520 .strategy = sysctl_ipc_data,
519 }, 521 },
520 { 522 {
521 .ctl_name = KERN_MSGMNB, 523 .ctl_name = KERN_MSGMNB,
522 .procname = "msgmnb", 524 .procname = "msgmnb",
523 .data = NULL, 525 .data = &init_ipc_ns.msg_ctlmnb,
524 .maxlen = sizeof (int), 526 .maxlen = sizeof (init_ipc_ns.msg_ctlmnb),
525 .mode = 0644, 527 .mode = 0644,
526 .proc_handler = &proc_do_ipc_string, 528 .proc_handler = &proc_ipc_dointvec,
529 .strategy = sysctl_ipc_data,
527 }, 530 },
528 { 531 {
529 .ctl_name = KERN_SEM, 532 .ctl_name = KERN_SEM,
530 .procname = "sem", 533 .procname = "sem",
531 .data = NULL, 534 .data = &init_ipc_ns.sem_ctls,
532 .maxlen = 4*sizeof (int), 535 .maxlen = 4*sizeof (int),
533 .mode = 0644, 536 .mode = 0644,
534 .proc_handler = &proc_do_ipc_string, 537 .proc_handler = &proc_ipc_dointvec,
538 .strategy = sysctl_ipc_data,
535 }, 539 },
536#endif 540#endif
537#ifdef CONFIG_MAGIC_SYSRQ 541#ifdef CONFIG_MAGIC_SYSRQ
538 { 542 {
539 .ctl_name = KERN_SYSRQ, 543 .ctl_name = KERN_SYSRQ,
540 .procname = "sysrq", 544 .procname = "sysrq",
541 .data = &sysrq_enabled, 545 .data = &__sysrq_enabled,
542 .maxlen = sizeof (int), 546 .maxlen = sizeof (int),
543 .mode = 0644, 547 .mode = 0644,
544 .proc_handler = &proc_dointvec, 548 .proc_handler = &proc_dointvec,
@@ -707,6 +711,14 @@ static ctl_table kern_table[] = {
707 .mode = 0444, 711 .mode = 0444,
708 .proc_handler = &proc_dointvec, 712 .proc_handler = &proc_dointvec,
709 }, 713 },
714 {
715 .ctl_name = CTL_UNNUMBERED,
716 .procname = "kstack_depth_to_print",
717 .data = &kstack_depth_to_print,
718 .maxlen = sizeof(int),
719 .mode = 0644,
720 .proc_handler = &proc_dointvec,
721 },
710#endif 722#endif
711#if defined(CONFIG_MMU) 723#if defined(CONFIG_MMU)
712 { 724 {
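
The new CTL_UNNUMBERED entry above exposes kstack_depth_to_print under /proc/sys/kernel/. A hedged userspace sketch of tuning it through procfs (the value 32 is only an example):

    #include <stdio.h>

    int main(void)
    {
            /* Path follows from kern_table plus the .procname above. */
            FILE *f = fopen("/proc/sys/kernel/kstack_depth_to_print", "w");

            if (!f)
                    return 1;
            fprintf(f, "%d\n", 32);   /* stack words shown in dumps */
            return fclose(f) ? 1 : 0;
    }
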
@@ -977,17 +989,6 @@ static ctl_table vm_table[] = {
977 .extra1 = &zero, 989 .extra1 = &zero,
978 }, 990 },
979#endif 991#endif
980#ifdef CONFIG_SWAP
981 {
982 .ctl_name = VM_SWAP_TOKEN_TIMEOUT,
983 .procname = "swap_token_timeout",
984 .data = &swap_token_default_timeout,
985 .maxlen = sizeof(swap_token_default_timeout),
986 .mode = 0644,
987 .proc_handler = &proc_dointvec_jiffies,
988 .strategy = &sysctl_jiffies,
989 },
990#endif
991#ifdef CONFIG_NUMA 992#ifdef CONFIG_NUMA
992 { 993 {
993 .ctl_name = VM_ZONE_RECLAIM_MODE, 994 .ctl_name = VM_ZONE_RECLAIM_MODE,
@@ -1241,7 +1242,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1241 do { 1242 do {
1242 struct ctl_table_header *head = 1243 struct ctl_table_header *head =
1243 list_entry(tmp, struct ctl_table_header, ctl_entry); 1244 list_entry(tmp, struct ctl_table_header, ctl_entry);
1244 void *context = NULL;
1245 1245
1246 if (!use_table(head)) 1246 if (!use_table(head))
1247 continue; 1247 continue;
@@ -1249,9 +1249,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1249 spin_unlock(&sysctl_lock); 1249 spin_unlock(&sysctl_lock);
1250 1250
1251 error = parse_table(name, nlen, oldval, oldlenp, 1251 error = parse_table(name, nlen, oldval, oldlenp,
1252 newval, newlen, head->ctl_table, 1252 newval, newlen, head->ctl_table);
1253 &context);
1254 kfree(context);
1255 1253
1256 spin_lock(&sysctl_lock); 1254 spin_lock(&sysctl_lock);
1257 unuse_table(head); 1255 unuse_table(head);
@@ -1307,7 +1305,7 @@ static inline int ctl_perm(ctl_table *table, int op)
1307static int parse_table(int __user *name, int nlen, 1305static int parse_table(int __user *name, int nlen,
1308 void __user *oldval, size_t __user *oldlenp, 1306 void __user *oldval, size_t __user *oldlenp,
1309 void __user *newval, size_t newlen, 1307 void __user *newval, size_t newlen,
1310 ctl_table *table, void **context) 1308 ctl_table *table)
1311{ 1309{
1312 int n; 1310 int n;
1313repeat: 1311repeat:
@@ -1315,7 +1313,9 @@ repeat:
1315 return -ENOTDIR; 1313 return -ENOTDIR;
1316 if (get_user(n, name)) 1314 if (get_user(n, name))
1317 return -EFAULT; 1315 return -EFAULT;
1318 for ( ; table->ctl_name; table++) { 1316 for ( ; table->ctl_name || table->procname; table++) {
1317 if (!table->ctl_name)
1318 continue;
1319 if (n == table->ctl_name || table->ctl_name == CTL_ANY) { 1319 if (n == table->ctl_name || table->ctl_name == CTL_ANY) {
1320 int error; 1320 int error;
1321 if (table->child) { 1321 if (table->child) {
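
The loop changes here and in the register/unregister helpers below allow table entries that carry a procname but no binary ctl_name: they no longer terminate the table, are skipped by the binary sys_sysctl path, and stay reachable through /proc/sys. A sketch of such an entry, with illustrative names:

    static int example_knob;

    static ctl_table example_table[] = {
            {
                    /* .ctl_name left 0: no binary sysctl number, so
                     * parse_table() above skips it; procfs-only. */
                    .procname     = "example_knob",
                    .data         = &example_knob,
                    .maxlen       = sizeof(int),
                    .mode         = 0644,
                    .proc_handler = &proc_dointvec,
            },
            { .ctl_name = 0 }       /* terminator: no name, no procname */
    };
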
@@ -1325,7 +1325,7 @@ repeat:
1325 error = table->strategy( 1325 error = table->strategy(
1326 table, name, nlen, 1326 table, name, nlen,
1327 oldval, oldlenp, 1327 oldval, oldlenp,
1328 newval, newlen, context); 1328 newval, newlen);
1329 if (error) 1329 if (error)
1330 return error; 1330 return error;
1331 } 1331 }
@@ -1336,7 +1336,7 @@ repeat:
1336 } 1336 }
1337 error = do_sysctl_strategy(table, name, nlen, 1337 error = do_sysctl_strategy(table, name, nlen,
1338 oldval, oldlenp, 1338 oldval, oldlenp,
1339 newval, newlen, context); 1339 newval, newlen);
1340 return error; 1340 return error;
1341 } 1341 }
1342 } 1342 }
@@ -1347,7 +1347,7 @@ repeat:
1347int do_sysctl_strategy (ctl_table *table, 1347int do_sysctl_strategy (ctl_table *table,
1348 int __user *name, int nlen, 1348 int __user *name, int nlen,
1349 void __user *oldval, size_t __user *oldlenp, 1349 void __user *oldval, size_t __user *oldlenp,
1350 void __user *newval, size_t newlen, void **context) 1350 void __user *newval, size_t newlen)
1351{ 1351{
1352 int op = 0, rc; 1352 int op = 0, rc;
1353 size_t len; 1353 size_t len;
@@ -1361,7 +1361,7 @@ int do_sysctl_strategy (ctl_table *table,
1361 1361
1362 if (table->strategy) { 1362 if (table->strategy) {
1363 rc = table->strategy(table, name, nlen, oldval, oldlenp, 1363 rc = table->strategy(table, name, nlen, oldval, oldlenp,
1364 newval, newlen, context); 1364 newval, newlen);
1365 if (rc < 0) 1365 if (rc < 0)
1366 return rc; 1366 return rc;
1367 if (rc > 0) 1367 if (rc > 0)
@@ -1532,7 +1532,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root,
1532 int len; 1532 int len;
1533 mode_t mode; 1533 mode_t mode;
1534 1534
1535 for (; table->ctl_name; table++) { 1535 for (; table->ctl_name || table->procname; table++) {
1536 /* Can't do anything without a proc name. */ 1536 /* Can't do anything without a proc name. */
1537 if (!table->procname) 1537 if (!table->procname)
1538 continue; 1538 continue;
@@ -1579,7 +1579,7 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root,
1579static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root) 1579static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root)
1580{ 1580{
1581 struct proc_dir_entry *de; 1581 struct proc_dir_entry *de;
1582 for (; table->ctl_name; table++) { 1582 for (; table->ctl_name || table->procname; table++) {
1583 if (!(de = table->de)) 1583 if (!(de = table->de))
1584 continue; 1584 continue;
1585 if (de->mode & S_IFDIR) { 1585 if (de->mode & S_IFDIR) {
@@ -1614,7 +1614,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
1614 size_t count, loff_t *ppos) 1614 size_t count, loff_t *ppos)
1615{ 1615{
1616 int op; 1616 int op;
1617 struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); 1617 struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode);
1618 struct ctl_table *table; 1618 struct ctl_table *table;
1619 size_t res; 1619 size_t res;
1620 ssize_t error = -ENOTDIR; 1620 ssize_t error = -ENOTDIR;
@@ -1753,66 +1753,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp,
1753 * Special case of dostring for the UTS structure. This has locks 1753 * Special case of dostring for the UTS structure. This has locks
1754 * to observe. Should this be in kernel/sys.c ???? 1754 * to observe. Should this be in kernel/sys.c ????
1755 */ 1755 */
1756
1757#ifndef CONFIG_UTS_NS
1758static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1759 void __user *buffer, size_t *lenp, loff_t *ppos)
1760{
1761 int r;
1762 1756
1763 if (!write) {
1764 down_read(&uts_sem);
1765 r=proc_dostring(table,0,filp,buffer,lenp, ppos);
1766 up_read(&uts_sem);
1767 } else {
1768 down_write(&uts_sem);
1769 r=proc_dostring(table,1,filp,buffer,lenp, ppos);
1770 up_write(&uts_sem);
1771 }
1772 return r;
1773}
1774#else /* !CONFIG_UTS_NS */
1775static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 1757static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
1776 void __user *buffer, size_t *lenp, loff_t *ppos) 1758 void __user *buffer, size_t *lenp, loff_t *ppos)
1777{ 1759{
1778 int r; 1760 int r;
1779 struct uts_namespace* uts_ns = current->nsproxy->uts_ns; 1761 void *which;
1780 char* which; 1762 which = get_uts(table, write);
1781 1763 r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos);
1782 switch (table->ctl_name) { 1764 put_uts(table, write, which);
1783 case KERN_OSTYPE:
1784 which = uts_ns->name.sysname;
1785 break;
1786 case KERN_NODENAME:
1787 which = uts_ns->name.nodename;
1788 break;
1789 case KERN_OSRELEASE:
1790 which = uts_ns->name.release;
1791 break;
1792 case KERN_VERSION:
1793 which = uts_ns->name.version;
1794 break;
1795 case KERN_DOMAINNAME:
1796 which = uts_ns->name.domainname;
1797 break;
1798 default:
1799 r = -EINVAL;
1800 goto out;
1801 }
1802
1803 if (!write) {
1804 down_read(&uts_sem);
1805 r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos);
1806 up_read(&uts_sem);
1807 } else {
1808 down_write(&uts_sem);
1809 r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos);
1810 up_write(&uts_sem);
1811 }
1812 out:
1813 return r; 1765 return r;
1814} 1766}
1815#endif /* !CONFIG_UTS_NS */
1816 1767
1817static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 1768static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
1818 int *valp, 1769 int *valp,
@@ -1884,7 +1835,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table,
1884 p = buf; 1835 p = buf;
1885 if (*p == '-' && left > 1) { 1836 if (*p == '-' && left > 1) {
1886 neg = 1; 1837 neg = 1;
1887 left--, p++; 1838 p++;
1888 } 1839 }
1889 if (*p < '0' || *p > '9') 1840 if (*p < '0' || *p > '9')
1890 break; 1841 break;
@@ -1976,9 +1927,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp,
1976 1927
1977#define OP_SET 0 1928#define OP_SET 0
1978#define OP_AND 1 1929#define OP_AND 1
1979#define OP_OR 2
1980#define OP_MAX 3
1981#define OP_MIN 4
1982 1930
1983static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, 1931static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1984 int *valp, 1932 int *valp,
@@ -1990,13 +1938,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
1990 switch(op) { 1938 switch(op) {
1991 case OP_SET: *valp = val; break; 1939 case OP_SET: *valp = val; break;
1992 case OP_AND: *valp &= val; break; 1940 case OP_AND: *valp &= val; break;
1993 case OP_OR: *valp |= val; break;
1994 case OP_MAX: if(*valp < val)
1995 *valp = val;
1996 break;
1997 case OP_MIN: if(*valp > val)
1998 *valp = val;
1999 break;
2000 } 1941 }
2001 } else { 1942 } else {
2002 int val = *valp; 1943 int val = *valp;
@@ -2135,7 +2076,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write,
2135 p = buf; 2076 p = buf;
2136 if (*p == '-' && left > 1) { 2077 if (*p == '-' && left > 1) {
2137 neg = 1; 2078 neg = 1;
2138 left--, p++; 2079 p++;
2139 } 2080 }
2140 if (*p < '0' || *p > '9') 2081 if (*p < '0' || *p > '9')
2141 break; 2082 break;
@@ -2391,46 +2332,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2391} 2332}
2392 2333
2393#ifdef CONFIG_SYSVIPC 2334#ifdef CONFIG_SYSVIPC
2394static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, 2335static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2395 void __user *buffer, size_t *lenp, loff_t *ppos) 2336 void __user *buffer, size_t *lenp, loff_t *ppos)
2396{ 2337{
2397 void *data; 2338 void *which;
2398 struct ipc_namespace *ns; 2339 which = get_ipc(table, write);
2399 2340 return __do_proc_dointvec(which, table, write, filp, buffer,
2400 ns = current->nsproxy->ipc_ns;
2401
2402 switch (table->ctl_name) {
2403 case KERN_SHMMAX:
2404 data = &ns->shm_ctlmax;
2405 goto proc_minmax;
2406 case KERN_SHMALL:
2407 data = &ns->shm_ctlall;
2408 goto proc_minmax;
2409 case KERN_SHMMNI:
2410 data = &ns->shm_ctlmni;
2411 break;
2412 case KERN_MSGMAX:
2413 data = &ns->msg_ctlmax;
2414 break;
2415 case KERN_MSGMNI:
2416 data = &ns->msg_ctlmni;
2417 break;
2418 case KERN_MSGMNB:
2419 data = &ns->msg_ctlmnb;
2420 break;
2421 case KERN_SEM:
2422 data = &ns->sem_ctls;
2423 break;
2424 default:
2425 return -EINVAL;
2426 }
2427
2428 return __do_proc_dointvec(data, table, write, filp, buffer,
2429 lenp, ppos, NULL, NULL); 2341 lenp, ppos, NULL, NULL);
2430proc_minmax: 2342}
2431 return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, 2343
2344static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2345 struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos)
2346{
2347 void *which;
2348 which = get_ipc(table, write);
2349 return __do_proc_doulongvec_minmax(which, table, write, filp, buffer,
2432 lenp, ppos, 1l, 1l); 2350 lenp, ppos, 1l, 1l);
2433} 2351}
2352
2434#endif 2353#endif
2435 2354
2436static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, 2355static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp,
@@ -2475,6 +2394,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp,
2475{ 2394{
2476 return -ENOSYS; 2395 return -ENOSYS;
2477} 2396}
2397static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
2398 void __user *buffer, size_t *lenp, loff_t *ppos)
2399{
2400 return -ENOSYS;
2401}
2402static int proc_ipc_doulongvec_minmax(ctl_table *table, int write,
2403 struct file *filp, void __user *buffer,
2404 size_t *lenp, loff_t *ppos)
2405{
2406 return -ENOSYS;
2407}
2478#endif 2408#endif
2479 2409
2480int proc_dointvec(ctl_table *table, int write, struct file *filp, 2410int proc_dointvec(ctl_table *table, int write, struct file *filp,
@@ -2539,7 +2469,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2539/* The generic string strategy routine: */ 2469/* The generic string strategy routine: */
2540int sysctl_string(ctl_table *table, int __user *name, int nlen, 2470int sysctl_string(ctl_table *table, int __user *name, int nlen,
2541 void __user *oldval, size_t __user *oldlenp, 2471 void __user *oldval, size_t __user *oldlenp,
2542 void __user *newval, size_t newlen, void **context) 2472 void __user *newval, size_t newlen)
2543{ 2473{
2544 if (!table->data || !table->maxlen) 2474 if (!table->data || !table->maxlen)
2545 return -ENOTDIR; 2475 return -ENOTDIR;
@@ -2585,7 +2515,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen,
2585 */ 2515 */
2586int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2516int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2587 void __user *oldval, size_t __user *oldlenp, 2517 void __user *oldval, size_t __user *oldlenp,
2588 void __user *newval, size_t newlen, void **context) 2518 void __user *newval, size_t newlen)
2589{ 2519{
2590 2520
2591 if (newval && newlen) { 2521 if (newval && newlen) {
@@ -2621,7 +2551,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2621/* Strategy function to convert jiffies to seconds */ 2551/* Strategy function to convert jiffies to seconds */
2622int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2552int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2623 void __user *oldval, size_t __user *oldlenp, 2553 void __user *oldval, size_t __user *oldlenp,
2624 void __user *newval, size_t newlen, void **context) 2554 void __user *newval, size_t newlen)
2625{ 2555{
2626 if (oldval) { 2556 if (oldval) {
2627 size_t olen; 2557 size_t olen;
@@ -2649,7 +2579,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2649/* Strategy function to convert jiffies to seconds */ 2579/* Strategy function to convert jiffies to seconds */
2650int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2580int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2651 void __user *oldval, size_t __user *oldlenp, 2581 void __user *oldval, size_t __user *oldlenp,
2652 void __user *newval, size_t newlen, void **context) 2582 void __user *newval, size_t newlen)
2653{ 2583{
2654 if (oldval) { 2584 if (oldval) {
2655 size_t olen; 2585 size_t olen;
@@ -2674,50 +2604,140 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2674 return 1; 2604 return 1;
2675} 2605}
2676 2606
2607
2608/* The generic string strategy routine: */
2609static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2610 void __user *oldval, size_t __user *oldlenp,
2611 void __user *newval, size_t newlen)
2612{
2613 struct ctl_table uts_table;
2614 int r, write;
2615 write = newval && newlen;
2616 memcpy(&uts_table, table, sizeof(uts_table));
2617 uts_table.data = get_uts(table, write);
2618 r = sysctl_string(&uts_table, name, nlen,
2619 oldval, oldlenp, newval, newlen);
2620 put_uts(table, write, uts_table.data);
2621 return r;
2622}
2623
2624#ifdef CONFIG_SYSVIPC
2625/* The generic sysctl ipc data routine. */
2626static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2627 void __user *oldval, size_t __user *oldlenp,
2628 void __user *newval, size_t newlen)
2629{
2630 size_t len;
2631 void *data;
2632
2633 /* Get out of I don't have a variable */
2634 if (!table->data || !table->maxlen)
2635 return -ENOTDIR;
2636
2637 data = get_ipc(table, 1);
2638 if (!data)
2639 return -ENOTDIR;
2640
2641 if (oldval && oldlenp) {
2642 if (get_user(len, oldlenp))
2643 return -EFAULT;
2644 if (len) {
2645 if (len > table->maxlen)
2646 len = table->maxlen;
2647 if (copy_to_user(oldval, data, len))
2648 return -EFAULT;
2649 if (put_user(len, oldlenp))
2650 return -EFAULT;
2651 }
2652 }
2653
2654 if (newval && newlen) {
2655 if (newlen > table->maxlen)
2656 newlen = table->maxlen;
2657
2658 if (copy_from_user(data, newval, newlen))
2659 return -EFAULT;
2660 }
2661 return 1;
2662}
2663#endif
2664
2677#else /* CONFIG_SYSCTL_SYSCALL */ 2665#else /* CONFIG_SYSCTL_SYSCALL */
2678 2666
2679 2667
2680asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 2668asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2681{ 2669{
2682 static int msg_count; 2670 static int msg_count;
2671 struct __sysctl_args tmp;
2672 int name[CTL_MAXNAME];
2673 int i;
2674
2675 /* Read in the sysctl name for better debug message logging */
2676 if (copy_from_user(&tmp, args, sizeof(tmp)))
2677 return -EFAULT;
2678 if (tmp.nlen <= 0 || tmp.nlen >= CTL_MAXNAME)
2679 return -ENOTDIR;
2680 for (i = 0; i < tmp.nlen; i++)
2681 if (get_user(name[i], tmp.name + i))
2682 return -EFAULT;
2683
2684 /* Ignore accesses to kernel.version */
2685 if ((tmp.nlen == 2) && (name[0] == CTL_KERN) && (name[1] == KERN_VERSION))
2686 goto out;
2683 2687
2684 if (msg_count < 5) { 2688 if (msg_count < 5) {
2685 msg_count++; 2689 msg_count++;
2686 printk(KERN_INFO 2690 printk(KERN_INFO
2687 "warning: process `%s' used the removed sysctl " 2691 "warning: process `%s' used the removed sysctl "
2688 "system call\n", current->comm); 2692 "system call with ", current->comm);
2693 for (i = 0; i < tmp.nlen; i++)
2694 printk("%d.", name[i]);
2695 printk("\n");
2689 } 2696 }
2697out:
2690 return -ENOSYS; 2698 return -ENOSYS;
2691} 2699}
2692 2700
2693int sysctl_string(ctl_table *table, int __user *name, int nlen, 2701int sysctl_string(ctl_table *table, int __user *name, int nlen,
2694 void __user *oldval, size_t __user *oldlenp, 2702 void __user *oldval, size_t __user *oldlenp,
2695 void __user *newval, size_t newlen, void **context) 2703 void __user *newval, size_t newlen)
2696{ 2704{
2697 return -ENOSYS; 2705 return -ENOSYS;
2698} 2706}
2699 2707
2700int sysctl_intvec(ctl_table *table, int __user *name, int nlen, 2708int sysctl_intvec(ctl_table *table, int __user *name, int nlen,
2701 void __user *oldval, size_t __user *oldlenp, 2709 void __user *oldval, size_t __user *oldlenp,
2702 void __user *newval, size_t newlen, void **context) 2710 void __user *newval, size_t newlen)
2703{ 2711{
2704 return -ENOSYS; 2712 return -ENOSYS;
2705} 2713}
2706 2714
2707int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, 2715int sysctl_jiffies(ctl_table *table, int __user *name, int nlen,
2708 void __user *oldval, size_t __user *oldlenp, 2716 void __user *oldval, size_t __user *oldlenp,
2709 void __user *newval, size_t newlen, void **context) 2717 void __user *newval, size_t newlen)
2710{ 2718{
2711 return -ENOSYS; 2719 return -ENOSYS;
2712} 2720}
2713 2721
2714int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, 2722int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2715 void __user *oldval, size_t __user *oldlenp, 2723 void __user *oldval, size_t __user *oldlenp,
2716 void __user *newval, size_t newlen, void **context) 2724 void __user *newval, size_t newlen)
2717{ 2725{
2718 return -ENOSYS; 2726 return -ENOSYS;
2719} 2727}
2720 2728
2729static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
2730 void __user *oldval, size_t __user *oldlenp,
2731 void __user *newval, size_t newlen)
2732{
2733 return -ENOSYS;
2734}
2735static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen,
2736 void __user *oldval, size_t __user *oldlenp,
2737 void __user *newval, size_t newlen)
2738{
2739 return -ENOSYS;
2740}
2721#endif /* CONFIG_SYSCTL_SYSCALL */ 2741#endif /* CONFIG_SYSCTL_SYSCALL */
2722 2742
2723/* 2743/*
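
A hedged userspace sketch of the binary sysctl(2) path that the sysctl_ipc_data() strategy above serves: reading kernel.shmmax for the caller's IPC namespace via the raw, long-deprecated _sysctl syscall. Error handling is minimal, and the call only works where the architecture provides __NR__sysctl and CONFIG_SYSCTL_SYSCALL is enabled.

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/sysctl.h>

    int main(void)
    {
            int name[] = { CTL_KERN, KERN_SHMMAX };
            unsigned long shmmax = 0;
            size_t len = sizeof(shmmax);
            struct __sysctl_args args;

            memset(&args, 0, sizeof(args));
            args.name    = name;
            args.nlen    = 2;
            args.oldval  = &shmmax;
            args.oldlenp = &len;

            if (syscall(SYS__sysctl, &args) < 0)
                    return 1;
            printf("shmmax = %lu\n", shmmax);
            return 0;
    }
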
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2039585ec5e1..4c3476fa058d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -34,7 +34,7 @@
34 34
35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; 35static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
36static int family_registered; 36static int family_registered;
37kmem_cache_t *taskstats_cache; 37struct kmem_cache *taskstats_cache;
38 38
39static struct genl_family family = { 39static struct genl_family family = {
40 .id = GENL_ID_GENERATE, 40 .id = GENL_ID_GENERATE,
@@ -69,7 +69,7 @@ enum actions {
69}; 69};
70 70
71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, 71static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
72 void **replyp, size_t size) 72 size_t size)
73{ 73{
74 struct sk_buff *skb; 74 struct sk_buff *skb;
75 void *reply; 75 void *reply;
@@ -77,8 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
77 /* 77 /*
78 * If new attributes are added, please revisit this allocation 78 * If new attributes are added, please revisit this allocation
79 */ 79 */
80 size = nlmsg_total_size(genlmsg_total_size(size)); 80 skb = genlmsg_new(size, GFP_KERNEL);
81 skb = nlmsg_new(size, GFP_KERNEL);
82 if (!skb) 81 if (!skb)
83 return -ENOMEM; 82 return -ENOMEM;
84 83
@@ -86,20 +85,15 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
86 int seq = get_cpu_var(taskstats_seqnum)++; 85 int seq = get_cpu_var(taskstats_seqnum)++;
87 put_cpu_var(taskstats_seqnum); 86 put_cpu_var(taskstats_seqnum);
88 87
89 reply = genlmsg_put(skb, 0, seq, 88 reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
90 family.id, 0, 0,
91 cmd, family.version);
92 } else 89 } else
93 reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, 90 reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
94 family.id, 0, 0,
95 cmd, family.version);
96 if (reply == NULL) { 91 if (reply == NULL) {
97 nlmsg_free(skb); 92 nlmsg_free(skb);
98 return -EINVAL; 93 return -EINVAL;
99 } 94 }
100 95
101 *skbp = skb; 96 *skbp = skb;
102 *replyp = reply;
103 return 0; 97 return 0;
104} 98}
105 99
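
With genlmsg_new() above, callers hand over only the attribute payload size and the helper is expected to add the netlink and genetlink header space itself. A sketch of how the callers in this file compute that payload (mirroring the expressions used later in the patch):

    size_t size;

    size = nla_total_size(sizeof(u32))                /* PID or TGID   */
         + nla_total_size(sizeof(struct taskstats))   /* STATS payload */
         + nla_total_size(0);                         /* AGGR nest hdr */
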
@@ -124,10 +118,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
124/* 118/*
125 * Send taskstats data in @skb to listeners registered for @cpu's exit data 119 * Send taskstats data in @skb to listeners registered for @cpu's exit data
126 */ 120 */
127static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) 121static void send_cpu_listeners(struct sk_buff *skb,
122 struct listener_list *listeners)
128{ 123{
129 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); 124 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
130 struct listener_list *listeners;
131 struct listener *s, *tmp; 125 struct listener *s, *tmp;
132 struct sk_buff *skb_next, *skb_cur = skb; 126 struct sk_buff *skb_next, *skb_cur = skb;
133 void *reply = genlmsg_data(genlhdr); 127 void *reply = genlmsg_data(genlhdr);
@@ -140,7 +134,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
140 } 134 }
141 135
142 rc = 0; 136 rc = 0;
143 listeners = &per_cpu(listener_array, cpu);
144 down_read(&listeners->sem); 137 down_read(&listeners->sem);
145 list_for_each_entry(s, &listeners->list, list) { 138 list_for_each_entry(s, &listeners->list, list) {
146 skb_next = NULL; 139 skb_next = NULL;
@@ -191,6 +184,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
191 } else 184 } else
192 get_task_struct(tsk); 185 get_task_struct(tsk);
193 186
187 memset(stats, 0, sizeof(*stats));
194 /* 188 /*
195 * Each accounting subsystem adds calls to its functions to 189 * Each accounting subsystem adds calls to its functions to
196 * fill in relevant parts of struct taskstsats as follows 190 * fill in relevant parts of struct taskstsats as follows
@@ -233,6 +227,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
233 227
234 if (first->signal->stats) 228 if (first->signal->stats)
235 memcpy(stats, first->signal->stats, sizeof(*stats)); 229 memcpy(stats, first->signal->stats, sizeof(*stats));
230 else
231 memset(stats, 0, sizeof(*stats));
236 232
237 tsk = first; 233 tsk = first;
238 do { 234 do {
@@ -349,14 +345,36 @@ static int parse(struct nlattr *na, cpumask_t *mask)
349 return ret; 345 return ret;
350} 346}
351 347
348static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
349{
350 struct nlattr *na, *ret;
351 int aggr;
352
353 aggr = (type == TASKSTATS_TYPE_PID)
354 ? TASKSTATS_TYPE_AGGR_PID
355 : TASKSTATS_TYPE_AGGR_TGID;
356
357 na = nla_nest_start(skb, aggr);
358 if (!na)
359 goto err;
360 if (nla_put(skb, type, sizeof(pid), &pid) < 0)
361 goto err;
362 ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
363 if (!ret)
364 goto err;
365 nla_nest_end(skb, na);
366
367 return nla_data(ret);
368err:
369 return NULL;
370}
371
352static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 372static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
353{ 373{
354 int rc = 0; 374 int rc = 0;
355 struct sk_buff *rep_skb; 375 struct sk_buff *rep_skb;
356 struct taskstats stats; 376 struct taskstats *stats;
357 void *reply;
358 size_t size; 377 size_t size;
359 struct nlattr *na;
360 cpumask_t mask; 378 cpumask_t mask;
361 379
362 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); 380 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
@@ -377,141 +395,122 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
377 size = nla_total_size(sizeof(u32)) + 395 size = nla_total_size(sizeof(u32)) +
378 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 396 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
379 397
380 memset(&stats, 0, sizeof(stats)); 398 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
381 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
382 if (rc < 0) 399 if (rc < 0)
383 return rc; 400 return rc;
384 401
402 rc = -EINVAL;
385 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { 403 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
386 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); 404 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
387 rc = fill_pid(pid, NULL, &stats); 405 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
388 if (rc < 0) 406 if (!stats)
389 goto err; 407 goto err;
390 408
391 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 409 rc = fill_pid(pid, NULL, stats);
392 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); 410 if (rc < 0)
393 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 411 goto err;
394 stats);
395 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { 412 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
396 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); 413 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
397 rc = fill_tgid(tgid, NULL, &stats); 414 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
398 if (rc < 0) 415 if (!stats)
399 goto err; 416 goto err;
400 417
401 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 418 rc = fill_tgid(tgid, NULL, stats);
402 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); 419 if (rc < 0)
403 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 420 goto err;
404 stats); 421 } else
405 } else {
406 rc = -EINVAL;
407 goto err; 422 goto err;
408 }
409
410 nla_nest_end(rep_skb, na);
411 423
412 return send_reply(rep_skb, info->snd_pid); 424 return send_reply(rep_skb, info->snd_pid);
413
414nla_put_failure:
415 rc = genlmsg_cancel(rep_skb, reply);
416err: 425err:
417 nlmsg_free(rep_skb); 426 nlmsg_free(rep_skb);
418 return rc; 427 return rc;
419} 428}
420 429
421void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) 430static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
422{ 431{
423 struct listener_list *listeners; 432 struct signal_struct *sig = tsk->signal;
424 struct taskstats *tmp; 433 struct taskstats *stats;
425 /*
426 * This is the cpu on which the task is exiting currently and will
427 * be the one for which the exit event is sent, even if the cpu
428 * on which this function is running changes later.
429 */
430 *mycpu = raw_smp_processor_id();
431 434
432 *ptidstats = NULL; 435 if (sig->stats || thread_group_empty(tsk))
433 tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); 436 goto ret;
434 if (!tmp)
435 return;
436 437
437 listeners = &per_cpu(listener_array, *mycpu); 438 /* No problem if kmem_cache_zalloc() fails */
438 down_read(&listeners->sem); 439 stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);
439 if (!list_empty(&listeners->list)) { 440
440 *ptidstats = tmp; 441 spin_lock_irq(&tsk->sighand->siglock);
441 tmp = NULL; 442 if (!sig->stats) {
443 sig->stats = stats;
444 stats = NULL;
442 } 445 }
443 up_read(&listeners->sem); 446 spin_unlock_irq(&tsk->sighand->siglock);
444 kfree(tmp); 447
448 if (stats)
449 kmem_cache_free(taskstats_cache, stats);
450ret:
451 return sig->stats;
445} 452}
446 453
447/* Send pid data out on exit */ 454/* Send pid data out on exit */
448void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, 455void taskstats_exit(struct task_struct *tsk, int group_dead)
449 int group_dead, unsigned int mycpu)
450{ 456{
451 int rc; 457 int rc;
458 struct listener_list *listeners;
459 struct taskstats *stats;
452 struct sk_buff *rep_skb; 460 struct sk_buff *rep_skb;
453 void *reply;
454 size_t size; 461 size_t size;
455 int is_thread_group; 462 int is_thread_group;
456 struct nlattr *na;
457 463
458 if (!family_registered || !tidstats) 464 if (!family_registered)
459 return; 465 return;
460 466
461 rc = 0;
462 /* 467 /*
463 * Size includes space for nested attributes 468 * Size includes space for nested attributes
464 */ 469 */
465 size = nla_total_size(sizeof(u32)) + 470 size = nla_total_size(sizeof(u32)) +
466 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); 471 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
467 472
468 is_thread_group = (tsk->signal->stats != NULL); 473 is_thread_group = !!taskstats_tgid_alloc(tsk);
469 if (is_thread_group) 474 if (is_thread_group) {
470 size = 2 * size; /* PID + STATS + TGID + STATS */ 475 /* PID + STATS + TGID + STATS */
476 size = 2 * size;
477 /* fill the tsk->signal->stats structure */
478 fill_tgid_exit(tsk);
479 }
471 480
472 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); 481 listeners = &__raw_get_cpu_var(listener_array);
473 if (rc < 0) 482 if (list_empty(&listeners->list))
474 goto ret; 483 return;
475 484
476 rc = fill_pid(tsk->pid, tsk, tidstats); 485 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
477 if (rc < 0) 486 if (rc < 0)
478 goto err_skb; 487 return;
479 488
480 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); 489 stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
481 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); 490 if (!stats)
482 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 491 goto err;
483 *tidstats);
484 nla_nest_end(rep_skb, na);
485 492
486 if (!is_thread_group) 493 rc = fill_pid(tsk->pid, tsk, stats);
487 goto send; 494 if (rc < 0)
495 goto err;
488 496
489 /* 497 /*
490 * tsk has/had a thread group so fill the tsk->signal->stats structure
491 * Doesn't matter if tsk is the leader or the last group member leaving 498 * Doesn't matter if tsk is the leader or the last group member leaving
492 */ 499 */
493 500 if (!is_thread_group || !group_dead)
494 fill_tgid_exit(tsk);
495 if (!group_dead)
496 goto send; 501 goto send;
497 502
498 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); 503 stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
499 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); 504 if (!stats)
500 /* No locking needed for tsk->signal->stats since group is dead */ 505 goto err;
501 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, 506
502 *tsk->signal->stats); 507 memcpy(stats, tsk->signal->stats, sizeof(*stats));
503 nla_nest_end(rep_skb, na);
504 508
505send: 509send:
506 send_cpu_listeners(rep_skb, mycpu); 510 send_cpu_listeners(rep_skb, listeners);
507 return; 511 return;
508 512err:
509nla_put_failure:
510 genlmsg_cancel(rep_skb, reply);
511err_skb:
512 nlmsg_free(rep_skb); 513 nlmsg_free(rep_skb);
513ret:
514 return;
515} 514}
516 515
517static struct genl_ops taskstats_ops = { 516static struct genl_ops taskstats_ops = {
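
taskstats_tgid_alloc() above uses a lazy, race-tolerant allocation: allocate outside the lock, install under sighand->siglock only if nobody else already did, and free the spare copy otherwise. A generic sketch of that pattern with illustrative types and names:

    #include <linux/slab.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct my_stats { u64 counters[8]; };

    struct owner {
            spinlock_t lock;
            struct my_stats *stats;
    };

    static struct my_stats *lazy_install(struct owner *o)
    {
            struct my_stats *new = kzalloc(sizeof(*new), GFP_KERNEL);

            spin_lock(&o->lock);
            if (!o->stats) {
                    o->stats = new;    /* won the race (new may be NULL) */
                    new = NULL;
            }
            spin_unlock(&o->lock);

            kfree(new);                /* lost the race: drop the spare */
            return o->stats;
    }
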
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 74eca5939bd9..22504afc0d34 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c)
156 /* check if clocksource is already registered */ 156 /* check if clocksource is already registered */
157 if (is_registered_source(c)) { 157 if (is_registered_source(c)) {
158 printk("register_clocksource: Cannot register %s. " 158 printk("register_clocksource: Cannot register %s. "
159 "Already registered!", c->name); 159 "Already registered!", c->name);
160 ret = -EBUSY; 160 ret = -EBUSY;
161 } else { 161 } else {
162 /* register it */ 162 /* register it */
@@ -186,6 +186,7 @@ void clocksource_reselect(void)
186} 186}
187EXPORT_SYMBOL(clocksource_reselect); 187EXPORT_SYMBOL(clocksource_reselect);
188 188
189#ifdef CONFIG_SYSFS
189/** 190/**
190 * sysfs_show_current_clocksources - sysfs interface for current clocksource 191 * sysfs_show_current_clocksources - sysfs interface for current clocksource
191 * @dev: unused 192 * @dev: unused
@@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
275 * Sysfs setup bits: 276 * Sysfs setup bits:
276 */ 277 */
277static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, 278static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
278 sysfs_override_clocksource); 279 sysfs_override_clocksource);
279 280
280static SYSDEV_ATTR(available_clocksource, 0600, 281static SYSDEV_ATTR(available_clocksource, 0600,
281 sysfs_show_available_clocksources, NULL); 282 sysfs_show_available_clocksources, NULL);
282 283
283static struct sysdev_class clocksource_sysclass = { 284static struct sysdev_class clocksource_sysclass = {
284 set_kset_name("clocksource"), 285 set_kset_name("clocksource"),
@@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void)
307} 308}
308 309
309device_initcall(init_clocksource_sysfs); 310device_initcall(init_clocksource_sysfs);
311#endif /* CONFIG_SYSFS */
310 312
311/** 313/**
312 * boot_override_clocksource - boot clock override 314 * boot_override_clocksource - boot clock override
diff --git a/kernel/timer.c b/kernel/timer.c
index c1c7fbcffec1..feddf817baa5 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases;
80EXPORT_SYMBOL(boot_tvec_bases); 80EXPORT_SYMBOL(boot_tvec_bases);
81static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; 81static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
82 82
83/**
84 * __round_jiffies - function to round jiffies to a full second
85 * @j: the time in (absolute) jiffies that should be rounded
86 * @cpu: the processor number on which the timeout will happen
87 *
88 * __round_jiffies rounds an absolute time in the future (in jiffies)
89 * up or down to (approximately) full seconds. This is useful for timers
90 * for which the exact time they fire does not matter too much, as long as
91 * they fire approximately every X seconds.
92 *
93 * By rounding these timers to whole seconds, all such timers will fire
94 * at the same time, rather than at various times spread out. The goal
95 * of this is to have the CPU wake up less, which saves power.
96 *
97 * The exact rounding is skewed for each processor to avoid all
98 * processors firing at the exact same time, which could lead
99 * to lock contention or spurious cache line bouncing.
100 *
101 * The return value is the rounded version of the "j" parameter.
102 */
103unsigned long __round_jiffies(unsigned long j, int cpu)
104{
105 int rem;
106 unsigned long original = j;
107
108 /*
109 * We don't want all cpus firing their timers at once hitting the
110 * same lock or cachelines, so we skew each extra cpu with an extra
111 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
112 * already did this.
113 * The skew is done by adding 3*cpunr, then round, then subtract this
114 * extra offset again.
115 */
116 j += cpu * 3;
117
118 rem = j % HZ;
119
120 /*
121 * If the target jiffie is just after a whole second (which can happen
122 * due to delays of the timer irq, long irq off times etc etc) then
123 * we should round down to the whole second, not up. Use 1/4th second
124 * as cutoff for this rounding as an extreme upper bound for this.
125 */
126 if (rem < HZ/4) /* round down */
127 j = j - rem;
128 else /* round up */
129 j = j - rem + HZ;
130
131 /* now that we have rounded, subtract the extra skew again */
132 j -= cpu * 3;
133
134 if (j <= jiffies) /* rounding ate our timeout entirely; */
135 return original;
136 return j;
137}
138EXPORT_SYMBOL_GPL(__round_jiffies);
139
140/**
141 * __round_jiffies_relative - function to round jiffies to a full second
142 * @j: the time in (relative) jiffies that should be rounded
143 * @cpu: the processor number on which the timeout will happen
144 *
145 * __round_jiffies_relative rounds a time delta in the future (in jiffies)
146 * up or down to (approximately) full seconds. This is useful for timers
147 * for which the exact time they fire does not matter too much, as long as
148 * they fire approximately every X seconds.
149 *
150 * By rounding these timers to whole seconds, all such timers will fire
151 * at the same time, rather than at various times spread out. The goal
152 * of this is to have the CPU wake up less, which saves power.
153 *
154 * The exact rounding is skewed for each processor to avoid all
155 * processors firing at the exact same time, which could lead
156 * to lock contention or spurious cache line bouncing.
157 *
158 * The return value is the rounded version of the "j" parameter.
159 */
160unsigned long __round_jiffies_relative(unsigned long j, int cpu)
161{
162 /*
163 * In theory the following code can skip a jiffy in case jiffies
164 * increments right between the addition and the later subtraction.
165 * However since the entire point of this function is to use approximate
166 * timeouts, it's entirely ok to not handle that.
167 */
168 return __round_jiffies(j + jiffies, cpu) - jiffies;
169}
170EXPORT_SYMBOL_GPL(__round_jiffies_relative);
171
172/**
173 * round_jiffies - function to round jiffies to a full second
174 * @j: the time in (absolute) jiffies that should be rounded
175 *
176 * round_jiffies rounds an absolute time in the future (in jiffies)
177 * up or down to (approximately) full seconds. This is useful for timers
178 * for which the exact time they fire does not matter too much, as long as
179 * they fire approximately every X seconds.
180 *
181 * By rounding these timers to whole seconds, all such timers will fire
182 * at the same time, rather than at various times spread out. The goal
183 * of this is to have the CPU wake up less, which saves power.
184 *
185 * The return value is the rounded version of the "j" parameter.
186 */
187unsigned long round_jiffies(unsigned long j)
188{
189 return __round_jiffies(j, raw_smp_processor_id());
190}
191EXPORT_SYMBOL_GPL(round_jiffies);
192
193/**
194 * round_jiffies_relative - function to round jiffies to a full second
195 * @j: the time in (relative) jiffies that should be rounded
196 *
197 * round_jiffies_relative rounds a time delta in the future (in jiffies)
198 * up or down to (approximately) full seconds. This is useful for timers
199 * for which the exact time they fire does not matter too much, as long as
200 * they fire approximately every X seconds.
201 *
202 * By rounding these timers to whole seconds, all such timers will fire
203 * at the same time, rather than at various times spread out. The goal
204 * of this is to have the CPU wake up less, which saves power.
205 *
206 * The return value is the rounded version of the "j" parameter.
207 */
208unsigned long round_jiffies_relative(unsigned long j)
209{
210 return __round_jiffies_relative(j, raw_smp_processor_id());
211}
212EXPORT_SYMBOL_GPL(round_jiffies_relative);
213
214
83static inline void set_running_timer(tvec_base_t *base, 215static inline void set_running_timer(tvec_base_t *base,
84 struct timer_list *timer) 216 struct timer_list *timer)
85{ 217{
@@ -714,7 +846,7 @@ static int change_clocksource(void)
714 clock = new; 846 clock = new;
715 clock->cycle_last = now; 847 clock->cycle_last = now;
716 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 848 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
717 clock->name); 849 clock->name);
718 return 1; 850 return 1;
719 } else if (clock->update_callback) { 851 } else if (clock->update_callback) {
720 return clock->update_callback(); 852 return clock->update_callback();
@@ -722,7 +854,10 @@ static int change_clocksource(void)
722 return 0; 854 return 0;
723} 855}
724#else 856#else
725#define change_clocksource() (0) 857static inline int change_clocksource(void)
858{
859 return 0;
860}
726#endif 861#endif
727 862
728/** 863/**
@@ -820,7 +955,8 @@ device_initcall(timekeeping_init_device);
820 * If the error is already larger, we look ahead even further 955 * If the error is already larger, we look ahead even further
821 * to compensate for late or lost adjustments. 956 * to compensate for late or lost adjustments.
822 */ 957 */
823static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) 958static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
959 s64 *offset)
824{ 960{
825 s64 tick_error, i; 961 s64 tick_error, i;
826 u32 look_ahead, adj; 962 u32 look_ahead, adj;
@@ -844,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *
844 * Now calculate the error in (1 << look_ahead) ticks, but first 980 * Now calculate the error in (1 << look_ahead) ticks, but first
845 * remove the single look ahead already included in the error. 981 * remove the single look ahead already included in the error.
846 */ 982 */
847 tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); 983 tick_error = current_tick_length() >>
984 (TICK_LENGTH_SHIFT - clock->shift + 1);
848 tick_error -= clock->xtime_interval >> 1; 985 tick_error -= clock->xtime_interval >> 1;
849 error = ((error - tick_error) >> look_ahead) + tick_error; 986 error = ((error - tick_error) >> look_ahead) + tick_error;
850 987
@@ -896,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
896 clock->mult += adj; 1033 clock->mult += adj;
897 clock->xtime_interval += interval; 1034 clock->xtime_interval += interval;
898 clock->xtime_nsec -= offset; 1035 clock->xtime_nsec -= offset;
899 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); 1036 clock->error -= (interval - offset) <<
1037 (TICK_LENGTH_SHIFT - clock->shift);
900} 1038}
901 1039
902/** 1040/**
@@ -1008,11 +1146,15 @@ static inline void calc_load(unsigned long ticks)
1008 unsigned long active_tasks; /* fixed-point */ 1146 unsigned long active_tasks; /* fixed-point */
1009 static int count = LOAD_FREQ; 1147 static int count = LOAD_FREQ;
1010 1148
1011 active_tasks = count_active_tasks(); 1149 count -= ticks;
1012 for (count -= ticks; count < 0; count += LOAD_FREQ) { 1150 if (unlikely(count < 0)) {
1013 CALC_LOAD(avenrun[0], EXP_1, active_tasks); 1151 active_tasks = count_active_tasks();
1014 CALC_LOAD(avenrun[1], EXP_5, active_tasks); 1152 do {
1015 CALC_LOAD(avenrun[2], EXP_15, active_tasks); 1153 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
1154 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
1155 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
1156 count += LOAD_FREQ;
1157 } while (count < 0);
1016 } 1158 }
1017} 1159}
1018 1160
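
The calc_load() rework above still applies the same fixed-point exponential decay once per elapsed LOAD_FREQ interval; it merely skips the now-common case where no interval has elapsed. A sketch of a single decay step, using stand-in constants instead of the kernel's FSHIFT/FIXED_1/EXP_* macros:

    #define MY_FSHIFT   11                    /* fixed-point precision */
    #define MY_FIXED_1  (1UL << MY_FSHIFT)    /* 1.0 in fixed point    */

    /* One EWMA step: load' = load*exp + active*(1 - exp), everything
     * scaled by MY_FIXED_1, matching the shape of CALC_LOAD() above. */
    static unsigned long ewma_step(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
            load *= exp;
            load += active * (MY_FIXED_1 - exp);
            return load >> MY_FSHIFT;
    }
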
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 65a5036a3d95..baacc3691415 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -80,18 +80,31 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
80 */ 80 */
81void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) 81void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
82{ 82{
83 struct mm_struct *mm;
84
83 /* convert pages-jiffies to Mbyte-usec */ 85 /* convert pages-jiffies to Mbyte-usec */
84 stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; 86 stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
85 stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; 87 stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
86 if (p->mm) { 88 mm = get_task_mm(p);
89 if (mm) {
87 /* adjust to KB unit */ 90 /* adjust to KB unit */
88 stats->hiwater_rss = p->mm->hiwater_rss * PAGE_SIZE / KB; 91 stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB;
89 stats->hiwater_vm = p->mm->hiwater_vm * PAGE_SIZE / KB; 92 stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB;
93 mmput(mm);
90 } 94 }
91 stats->read_char = p->rchar; 95 stats->read_char = p->rchar;
92 stats->write_char = p->wchar; 96 stats->write_char = p->wchar;
93 stats->read_syscalls = p->syscr; 97 stats->read_syscalls = p->syscr;
94 stats->write_syscalls = p->syscw; 98 stats->write_syscalls = p->syscw;
99#ifdef CONFIG_TASK_IO_ACCOUNTING
100 stats->read_bytes = p->ioac.read_bytes;
101 stats->write_bytes = p->ioac.write_bytes;
102 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes;
103#else
104 stats->read_bytes = 0;
105 stats->write_bytes = 0;
106 stats->cancelled_write_bytes = 0;
107#endif
95} 108}
96#undef KB 109#undef KB
97#undef MB 110#undef MB
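
The xacct_add_tsk() hunk above replaces direct p->mm access with get_task_mm()/mmput(), taking a reference so the mm cannot be torn down while its hiwater fields are read. A minimal sketch of the pattern (the helper name is illustrative):

    #include <linux/mm.h>
    #include <linux/sched.h>

    static unsigned long task_hiwater_rss_kb(struct task_struct *p)
    {
            struct mm_struct *mm = get_task_mm(p);   /* takes a reference */
            unsigned long kb = 0;

            if (mm) {
                    kb = mm->hiwater_rss * PAGE_SIZE / 1024;
                    mmput(mm);                        /* drops the reference */
            }
            return kb;
    }
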
diff --git a/kernel/unwind.c b/kernel/unwind.c
index f7e50d16dbf6..09c261329249 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -14,11 +14,12 @@
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/sort.h> 15#include <linux/sort.h>
16#include <linux/stop_machine.h> 16#include <linux/stop_machine.h>
17#include <linux/uaccess.h>
17#include <asm/sections.h> 18#include <asm/sections.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19#include <asm/unaligned.h> 20#include <asm/unaligned.h>
20 21
21extern char __start_unwind[], __end_unwind[]; 22extern const char __start_unwind[], __end_unwind[];
22extern const u8 __start_unwind_hdr[], __end_unwind_hdr[]; 23extern const u8 __start_unwind_hdr[], __end_unwind_hdr[];
23 24
24#define MAX_STACK_DEPTH 8 25#define MAX_STACK_DEPTH 8
@@ -94,6 +95,7 @@ static const struct {
94 95
95typedef unsigned long uleb128_t; 96typedef unsigned long uleb128_t;
96typedef signed long sleb128_t; 97typedef signed long sleb128_t;
98#define sleb128abs __builtin_labs
97 99
98static struct unwind_table { 100static struct unwind_table {
99 struct { 101 struct {
@@ -135,6 +137,17 @@ struct unwind_state {
135 137
136static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; 138static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
137 139
140static unsigned unwind_debug;
141static int __init unwind_debug_setup(char *s)
142{
143 unwind_debug = simple_strtoul(s, NULL, 0);
144 return 1;
145}
146__setup("unwind_debug=", unwind_debug_setup);
147#define dprintk(lvl, fmt, args...) \
148 ((void)(lvl > unwind_debug \
149 || printk(KERN_DEBUG "unwind: " fmt "\n", ##args)))
150
138static struct unwind_table *find_table(unsigned long pc) 151static struct unwind_table *find_table(unsigned long pc)
139{ 152{
140 struct unwind_table *table; 153 struct unwind_table *table;
@@ -151,7 +164,9 @@ static struct unwind_table *find_table(unsigned long pc)
151 164
152static unsigned long read_pointer(const u8 **pLoc, 165static unsigned long read_pointer(const u8 **pLoc,
153 const void *end, 166 const void *end,
154 signed ptrType); 167 signed ptrType,
168 unsigned long text_base,
169 unsigned long data_base);
155 170
156static void init_unwind_table(struct unwind_table *table, 171static void init_unwind_table(struct unwind_table *table,
157 const char *name, 172 const char *name,
@@ -176,10 +191,13 @@ static void init_unwind_table(struct unwind_table *table,
176 /* See if the linker provided table looks valid. */ 191 /* See if the linker provided table looks valid. */
177 if (header_size <= 4 192 if (header_size <= 4
178 || header_start[0] != 1 193 || header_start[0] != 1
179 || (void *)read_pointer(&ptr, end, header_start[1]) != table_start 194 || (void *)read_pointer(&ptr, end, header_start[1], 0, 0)
180 || header_start[2] == DW_EH_PE_omit 195 != table_start
181 || read_pointer(&ptr, end, header_start[2]) <= 0 196 || !read_pointer(&ptr, end, header_start[2], 0, 0)
182 || header_start[3] == DW_EH_PE_omit) 197 || !read_pointer(&ptr, end, header_start[3], 0,
198 (unsigned long)header_start)
199 || !read_pointer(&ptr, end, header_start[3], 0,
200 (unsigned long)header_start))
183 header_start = NULL; 201 header_start = NULL;
184 table->hdrsz = header_size; 202 table->hdrsz = header_size;
185 smp_wmb(); 203 smp_wmb();
@@ -269,7 +287,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
269 ptr = (const u8 *)(fde + 2); 287 ptr = (const u8 *)(fde + 2);
270 if (!read_pointer(&ptr, 288 if (!read_pointer(&ptr,
271 (const u8 *)(fde + 1) + *fde, 289 (const u8 *)(fde + 1) + *fde,
272 ptrType)) 290 ptrType, 0, 0))
273 return; 291 return;
274 ++n; 292 ++n;
275 } 293 }
@@ -279,6 +297,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
279 297
280 hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int) 298 hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int)
281 + 2 * n * sizeof(unsigned long); 299 + 2 * n * sizeof(unsigned long);
300 dprintk(2, "Binary lookup table size for %s: %lu bytes", table->name, hdrSize);
282 header = alloc(hdrSize); 301 header = alloc(hdrSize);
283 if (!header) 302 if (!header)
284 return; 303 return;
@@ -303,7 +322,7 @@ static void __init setup_unwind_table(struct unwind_table *table,
303 ptr = (const u8 *)(fde + 2); 322 ptr = (const u8 *)(fde + 2);
304 header->table[n].start = read_pointer(&ptr, 323 header->table[n].start = read_pointer(&ptr,
305 (const u8 *)(fde + 1) + *fde, 324 (const u8 *)(fde + 1) + *fde,
306 fde_pointer_type(cie)); 325 fde_pointer_type(cie), 0, 0);
307 header->table[n].fde = (unsigned long)fde; 326 header->table[n].fde = (unsigned long)fde;
308 ++n; 327 ++n;
309 } 328 }
@@ -486,7 +505,9 @@ static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table)
486 505
487static unsigned long read_pointer(const u8 **pLoc, 506static unsigned long read_pointer(const u8 **pLoc,
488 const void *end, 507 const void *end,
489 signed ptrType) 508 signed ptrType,
509 unsigned long text_base,
510 unsigned long data_base)
490{ 511{
491 unsigned long value = 0; 512 unsigned long value = 0;
492 union { 513 union {
@@ -498,13 +519,17 @@ static unsigned long read_pointer(const u8 **pLoc,
498 const unsigned long *pul; 519 const unsigned long *pul;
499 } ptr; 520 } ptr;
500 521
501 if (ptrType < 0 || ptrType == DW_EH_PE_omit) 522 if (ptrType < 0 || ptrType == DW_EH_PE_omit) {
523 dprintk(1, "Invalid pointer encoding %02X (%p,%p).", ptrType, *pLoc, end);
502 return 0; 524 return 0;
525 }
503 ptr.p8 = *pLoc; 526 ptr.p8 = *pLoc;
504 switch(ptrType & DW_EH_PE_FORM) { 527 switch(ptrType & DW_EH_PE_FORM) {
505 case DW_EH_PE_data2: 528 case DW_EH_PE_data2:
506 if (end < (const void *)(ptr.p16u + 1)) 529 if (end < (const void *)(ptr.p16u + 1)) {
530 dprintk(1, "Data16 overrun (%p,%p).", ptr.p8, end);
507 return 0; 531 return 0;
532 }
508 if(ptrType & DW_EH_PE_signed) 533 if(ptrType & DW_EH_PE_signed)
509 value = get_unaligned(ptr.p16s++); 534 value = get_unaligned(ptr.p16s++);
510 else 535 else
@@ -512,8 +537,10 @@ static unsigned long read_pointer(const u8 **pLoc,
512 break; 537 break;
513 case DW_EH_PE_data4: 538 case DW_EH_PE_data4:
514#ifdef CONFIG_64BIT 539#ifdef CONFIG_64BIT
515 if (end < (const void *)(ptr.p32u + 1)) 540 if (end < (const void *)(ptr.p32u + 1)) {
541 dprintk(1, "Data32 overrun (%p,%p).", ptr.p8, end);
516 return 0; 542 return 0;
543 }
517 if(ptrType & DW_EH_PE_signed) 544 if(ptrType & DW_EH_PE_signed)
518 value = get_unaligned(ptr.p32s++); 545 value = get_unaligned(ptr.p32s++);
519 else 546 else
@@ -525,8 +552,10 @@ static unsigned long read_pointer(const u8 **pLoc,
525 BUILD_BUG_ON(sizeof(u32) != sizeof(value)); 552 BUILD_BUG_ON(sizeof(u32) != sizeof(value));
526#endif 553#endif
527 case DW_EH_PE_native: 554 case DW_EH_PE_native:
528 if (end < (const void *)(ptr.pul + 1)) 555 if (end < (const void *)(ptr.pul + 1)) {
556 dprintk(1, "DataUL overrun (%p,%p).", ptr.p8, end);
529 return 0; 557 return 0;
558 }
530 value = get_unaligned(ptr.pul++); 559 value = get_unaligned(ptr.pul++);
531 break; 560 break;
532 case DW_EH_PE_leb128: 561 case DW_EH_PE_leb128:
@@ -534,10 +563,14 @@ static unsigned long read_pointer(const u8 **pLoc,
534 value = ptrType & DW_EH_PE_signed 563 value = ptrType & DW_EH_PE_signed
535 ? get_sleb128(&ptr.p8, end) 564 ? get_sleb128(&ptr.p8, end)
536 : get_uleb128(&ptr.p8, end); 565 : get_uleb128(&ptr.p8, end);
537 if ((const void *)ptr.p8 > end) 566 if ((const void *)ptr.p8 > end) {
567 dprintk(1, "DataLEB overrun (%p,%p).", ptr.p8, end);
538 return 0; 568 return 0;
569 }
539 break; 570 break;
540 default: 571 default:
572 dprintk(2, "Cannot decode pointer type %02X (%p,%p).",
573 ptrType, ptr.p8, end);
541 return 0; 574 return 0;
542 } 575 }
543 switch(ptrType & DW_EH_PE_ADJUST) { 576 switch(ptrType & DW_EH_PE_ADJUST) {
@@ -546,12 +579,33 @@ static unsigned long read_pointer(const u8 **pLoc,
546 case DW_EH_PE_pcrel: 579 case DW_EH_PE_pcrel:
547 value += (unsigned long)*pLoc; 580 value += (unsigned long)*pLoc;
548 break; 581 break;
582 case DW_EH_PE_textrel:
583 if (likely(text_base)) {
584 value += text_base;
585 break;
586 }
587 dprintk(2, "Text-relative encoding %02X (%p,%p), but zero text base.",
588 ptrType, *pLoc, end);
589 return 0;
590 case DW_EH_PE_datarel:
591 if (likely(data_base)) {
592 value += data_base;
593 break;
594 }
595 dprintk(2, "Data-relative encoding %02X (%p,%p), but zero data base.",
596 ptrType, *pLoc, end);
597 return 0;
549 default: 598 default:
599 dprintk(2, "Cannot adjust pointer type %02X (%p,%p).",
600 ptrType, *pLoc, end);
550 return 0; 601 return 0;
551 } 602 }
552 if ((ptrType & DW_EH_PE_indirect) 603 if ((ptrType & DW_EH_PE_indirect)
553 && __get_user(value, (unsigned long *)value)) 604 && probe_kernel_address((unsigned long *)value, value)) {
605 dprintk(1, "Cannot read indirect value %lx (%p,%p).",
606 value, *pLoc, end);
554 return 0; 607 return 0;
608 }
555 *pLoc = ptr.p8; 609 *pLoc = ptr.p8;
556 610
557 return value; 611 return value;
@@ -594,7 +648,8 @@ static signed fde_pointer_type(const u32 *cie)
594 case 'P': { 648 case 'P': {
595 signed ptrType = *ptr++; 649 signed ptrType = *ptr++;
596 650
597 if (!read_pointer(&ptr, end, ptrType) || ptr > end) 651 if (!read_pointer(&ptr, end, ptrType, 0, 0)
652 || ptr > end)
598 return -1; 653 return -1;
599 } 654 }
600 break; 655 break;
@@ -654,7 +709,8 @@ static int processCFI(const u8 *start,
654 case DW_CFA_nop: 709 case DW_CFA_nop:
655 break; 710 break;
656 case DW_CFA_set_loc: 711 case DW_CFA_set_loc:
657 if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) 712 state->loc = read_pointer(&ptr.p8, end, ptrType, 0, 0);
713 if (state->loc == 0)
658 result = 0; 714 result = 0;
659 break; 715 break;
660 case DW_CFA_advance_loc1: 716 case DW_CFA_advance_loc1:
@@ -700,8 +756,10 @@ static int processCFI(const u8 *start,
700 state->label = NULL; 756 state->label = NULL;
701 return 1; 757 return 1;
702 } 758 }
703 if (state->stackDepth >= MAX_STACK_DEPTH) 759 if (state->stackDepth >= MAX_STACK_DEPTH) {
760 dprintk(1, "State stack overflow (%p,%p).", ptr.p8, end);
704 return 0; 761 return 0;
762 }
705 state->stack[state->stackDepth++] = ptr.p8; 763 state->stack[state->stackDepth++] = ptr.p8;
706 break; 764 break;
707 case DW_CFA_restore_state: 765 case DW_CFA_restore_state:
@@ -716,8 +774,10 @@ static int processCFI(const u8 *start,
716 result = processCFI(start, end, 0, ptrType, state); 774 result = processCFI(start, end, 0, ptrType, state);
717 state->loc = loc; 775 state->loc = loc;
718 state->label = label; 776 state->label = label;
719 } else 777 } else {
778 dprintk(1, "State stack underflow (%p,%p).", ptr.p8, end);
720 return 0; 779 return 0;
780 }
721 break; 781 break;
722 case DW_CFA_def_cfa: 782 case DW_CFA_def_cfa:
723 state->cfa.reg = get_uleb128(&ptr.p8, end); 783 state->cfa.reg = get_uleb128(&ptr.p8, end);
@@ -749,6 +809,7 @@ static int processCFI(const u8 *start,
749 break; 809 break;
750 case DW_CFA_GNU_window_save: 810 case DW_CFA_GNU_window_save:
751 default: 811 default:
812 dprintk(1, "Unrecognized CFI op %02X (%p,%p).", ptr.p8[-1], ptr.p8 - 1, end);
752 result = 0; 813 result = 0;
753 break; 814 break;
754 } 815 }
@@ -764,12 +825,17 @@ static int processCFI(const u8 *start,
764 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); 825 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
765 break; 826 break;
766 } 827 }
767 if (ptr.p8 > end) 828 if (ptr.p8 > end) {
829 dprintk(1, "Data overrun (%p,%p).", ptr.p8, end);
768 result = 0; 830 result = 0;
831 }
769 if (result && targetLoc != 0 && targetLoc < state->loc) 832 if (result && targetLoc != 0 && targetLoc < state->loc)
770 return 1; 833 return 1;
771 } 834 }
772 835
836 if (result && ptr.p8 < end)
837 dprintk(1, "Data underrun (%p,%p).", ptr.p8, end);
838
773 return result 839 return result
774 && ptr.p8 == end 840 && ptr.p8 == end
775 && (targetLoc == 0 841 && (targetLoc == 0
@@ -786,7 +852,7 @@ int unwind(struct unwind_frame_info *frame)
786#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) 852#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
787 const u32 *fde = NULL, *cie = NULL; 853 const u32 *fde = NULL, *cie = NULL;
788 const u8 *ptr = NULL, *end = NULL; 854 const u8 *ptr = NULL, *end = NULL;
789 unsigned long pc = UNW_PC(frame) - frame->call_frame; 855 unsigned long pc = UNW_PC(frame) - frame->call_frame, sp;
790 unsigned long startLoc = 0, endLoc = 0, cfa; 856 unsigned long startLoc = 0, endLoc = 0, cfa;
791 unsigned i; 857 unsigned i;
792 signed ptrType = -1; 858 signed ptrType = -1;
@@ -813,9 +879,9 @@ int unwind(struct unwind_frame_info *frame)
813 ptr = hdr + 4; 879 ptr = hdr + 4;
814 end = hdr + table->hdrsz; 880 end = hdr + table->hdrsz;
815 if (tableSize 881 if (tableSize
816 && read_pointer(&ptr, end, hdr[1]) 882 && read_pointer(&ptr, end, hdr[1], 0, 0)
817 == (unsigned long)table->address 883 == (unsigned long)table->address
818 && (i = read_pointer(&ptr, end, hdr[2])) > 0 884 && (i = read_pointer(&ptr, end, hdr[2], 0, 0)) > 0
819 && i == (end - ptr) / (2 * tableSize) 885 && i == (end - ptr) / (2 * tableSize)
820 && !((end - ptr) % (2 * tableSize))) { 886 && !((end - ptr) % (2 * tableSize))) {
821 do { 887 do {
@@ -823,7 +889,8 @@ int unwind(struct unwind_frame_info *frame)
823 889
824 startLoc = read_pointer(&cur, 890 startLoc = read_pointer(&cur,
825 cur + tableSize, 891 cur + tableSize,
826 hdr[3]); 892 hdr[3], 0,
893 (unsigned long)hdr);
827 if (pc < startLoc) 894 if (pc < startLoc)
828 i /= 2; 895 i /= 2;
829 else { 896 else {
@@ -834,13 +901,17 @@ int unwind(struct unwind_frame_info *frame)
834 if (i == 1 901 if (i == 1
835 && (startLoc = read_pointer(&ptr, 902 && (startLoc = read_pointer(&ptr,
836 ptr + tableSize, 903 ptr + tableSize,
837 hdr[3])) != 0 904 hdr[3], 0,
905 (unsigned long)hdr)) != 0
838 && pc >= startLoc) 906 && pc >= startLoc)
839 fde = (void *)read_pointer(&ptr, 907 fde = (void *)read_pointer(&ptr,
840 ptr + tableSize, 908 ptr + tableSize,
841 hdr[3]); 909 hdr[3], 0,
910 (unsigned long)hdr);
842 } 911 }
843 } 912 }
913 if(hdr && !fde)
914 dprintk(3, "Binary lookup for %lx failed.", pc);
844 915
845 if (fde != NULL) { 916 if (fde != NULL) {
846 cie = cie_for_fde(fde, table); 917 cie = cie_for_fde(fde, table);
@@ -851,17 +922,19 @@ int unwind(struct unwind_frame_info *frame)
851 && (ptrType = fde_pointer_type(cie)) >= 0 922 && (ptrType = fde_pointer_type(cie)) >= 0
852 && read_pointer(&ptr, 923 && read_pointer(&ptr,
853 (const u8 *)(fde + 1) + *fde, 924 (const u8 *)(fde + 1) + *fde,
854 ptrType) == startLoc) { 925 ptrType, 0, 0) == startLoc) {
855 if (!(ptrType & DW_EH_PE_indirect)) 926 if (!(ptrType & DW_EH_PE_indirect))
856 ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; 927 ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed;
857 endLoc = startLoc 928 endLoc = startLoc
858 + read_pointer(&ptr, 929 + read_pointer(&ptr,
859 (const u8 *)(fde + 1) + *fde, 930 (const u8 *)(fde + 1) + *fde,
860 ptrType); 931 ptrType, 0, 0);
861 if(pc >= endLoc) 932 if(pc >= endLoc)
862 fde = NULL; 933 fde = NULL;
863 } else 934 } else
864 fde = NULL; 935 fde = NULL;
936 if(!fde)
937 dprintk(1, "Binary lookup result for %lx discarded.", pc);
865 } 938 }
866 if (fde == NULL) { 939 if (fde == NULL) {
867 for (fde = table->address, tableSize = table->size; 940 for (fde = table->address, tableSize = table->size;
@@ -881,7 +954,7 @@ int unwind(struct unwind_frame_info *frame)
881 ptr = (const u8 *)(fde + 2); 954 ptr = (const u8 *)(fde + 2);
882 startLoc = read_pointer(&ptr, 955 startLoc = read_pointer(&ptr,
883 (const u8 *)(fde + 1) + *fde, 956 (const u8 *)(fde + 1) + *fde,
884 ptrType); 957 ptrType, 0, 0);
885 if (!startLoc) 958 if (!startLoc)
886 continue; 959 continue;
887 if (!(ptrType & DW_EH_PE_indirect)) 960 if (!(ptrType & DW_EH_PE_indirect))
@@ -889,10 +962,12 @@ int unwind(struct unwind_frame_info *frame)
889 endLoc = startLoc 962 endLoc = startLoc
890 + read_pointer(&ptr, 963 + read_pointer(&ptr,
891 (const u8 *)(fde + 1) + *fde, 964 (const u8 *)(fde + 1) + *fde,
892 ptrType); 965 ptrType, 0, 0);
893 if (pc >= startLoc && pc < endLoc) 966 if (pc >= startLoc && pc < endLoc)
894 break; 967 break;
895 } 968 }
969 if(!fde)
970 dprintk(3, "Linear lookup for %lx failed.", pc);
896 } 971 }
897 } 972 }
898 if (cie != NULL) { 973 if (cie != NULL) {
@@ -926,6 +1001,8 @@ int unwind(struct unwind_frame_info *frame)
926 if (ptr >= end || *ptr) 1001 if (ptr >= end || *ptr)
927 cie = NULL; 1002 cie = NULL;
928 } 1003 }
1004 if(!cie)
1005 dprintk(1, "CIE unusable (%p,%p).", ptr, end);
929 ++ptr; 1006 ++ptr;
930 } 1007 }
931 if (cie != NULL) { 1008 if (cie != NULL) {
@@ -935,17 +1012,27 @@ int unwind(struct unwind_frame_info *frame)
935 state.dataAlign = get_sleb128(&ptr, end); 1012 state.dataAlign = get_sleb128(&ptr, end);
936 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) 1013 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
937 cie = NULL; 1014 cie = NULL;
938 else { 1015 else if (UNW_PC(frame) % state.codeAlign
1016 || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
1017 dprintk(1, "Input pointer(s) misaligned (%lx,%lx).",
1018 UNW_PC(frame), UNW_SP(frame));
1019 return -EPERM;
1020 } else {
939 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); 1021 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
940 /* skip augmentation */ 1022 /* skip augmentation */
941 if (((const char *)(cie + 2))[1] == 'z') 1023 if (((const char *)(cie + 2))[1] == 'z') {
942 ptr += get_uleb128(&ptr, end); 1024 uleb128_t augSize = get_uleb128(&ptr, end);
1025
1026 ptr += augSize;
1027 }
943 if (ptr > end 1028 if (ptr > end
944 || retAddrReg >= ARRAY_SIZE(reg_info) 1029 || retAddrReg >= ARRAY_SIZE(reg_info)
945 || REG_INVALID(retAddrReg) 1030 || REG_INVALID(retAddrReg)
946 || reg_info[retAddrReg].width != sizeof(unsigned long)) 1031 || reg_info[retAddrReg].width != sizeof(unsigned long))
947 cie = NULL; 1032 cie = NULL;
948 } 1033 }
1034 if(!cie)
1035 dprintk(1, "CIE validation failed (%p,%p).", ptr, end);
949 } 1036 }
950 if (cie != NULL) { 1037 if (cie != NULL) {
951 state.cieStart = ptr; 1038 state.cieStart = ptr;
@@ -959,13 +1046,15 @@ int unwind(struct unwind_frame_info *frame)
959 if ((ptr += augSize) > end) 1046 if ((ptr += augSize) > end)
960 fde = NULL; 1047 fde = NULL;
961 } 1048 }
1049 if(!fde)
1050 dprintk(1, "FDE validation failed (%p,%p).", ptr, end);
962 } 1051 }
963 if (cie == NULL || fde == NULL) { 1052 if (cie == NULL || fde == NULL) {
964#ifdef CONFIG_FRAME_POINTER 1053#ifdef CONFIG_FRAME_POINTER
965 unsigned long top, bottom; 1054 unsigned long top, bottom;
966#endif
967 1055
968#ifdef CONFIG_FRAME_POINTER 1056 if ((UNW_SP(frame) | UNW_FP(frame)) % sizeof(unsigned long))
1057 return -EPERM;
969 top = STACK_TOP(frame->task); 1058 top = STACK_TOP(frame->task);
970 bottom = STACK_BOTTOM(frame->task); 1059 bottom = STACK_BOTTOM(frame->task);
971# if FRAME_RETADDR_OFFSET < 0 1060# if FRAME_RETADDR_OFFSET < 0
@@ -981,18 +1070,19 @@ int unwind(struct unwind_frame_info *frame)
981 & (sizeof(unsigned long) - 1))) { 1070 & (sizeof(unsigned long) - 1))) {
982 unsigned long link; 1071 unsigned long link;
983 1072
984 if (!__get_user(link, 1073 if (!probe_kernel_address(
985 (unsigned long *)(UNW_FP(frame) 1074 (unsigned long *)(UNW_FP(frame)
986 + FRAME_LINK_OFFSET)) 1075 + FRAME_LINK_OFFSET),
1076 link)
987# if FRAME_RETADDR_OFFSET < 0 1077# if FRAME_RETADDR_OFFSET < 0
988 && link > bottom && link < UNW_FP(frame) 1078 && link > bottom && link < UNW_FP(frame)
989# else 1079# else
990 && link > UNW_FP(frame) && link < bottom 1080 && link > UNW_FP(frame) && link < bottom
991# endif 1081# endif
992 && !(link & (sizeof(link) - 1)) 1082 && !(link & (sizeof(link) - 1))
993 && !__get_user(UNW_PC(frame), 1083 && !probe_kernel_address(
994 (unsigned long *)(UNW_FP(frame) 1084 (unsigned long *)(UNW_FP(frame)
995 + FRAME_RETADDR_OFFSET))) { 1085 + FRAME_RETADDR_OFFSET), UNW_PC(frame))) {
996 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET 1086 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
997# if FRAME_RETADDR_OFFSET < 0 1087# if FRAME_RETADDR_OFFSET < 0
998 - 1088 -
@@ -1015,8 +1105,11 @@ int unwind(struct unwind_frame_info *frame)
1015 || state.regs[retAddrReg].where == Nowhere 1105 || state.regs[retAddrReg].where == Nowhere
1016 || state.cfa.reg >= ARRAY_SIZE(reg_info) 1106 || state.cfa.reg >= ARRAY_SIZE(reg_info)
1017 || reg_info[state.cfa.reg].width != sizeof(unsigned long) 1107 || reg_info[state.cfa.reg].width != sizeof(unsigned long)
1018 || state.cfa.offs % sizeof(unsigned long)) 1108 || FRAME_REG(state.cfa.reg, unsigned long) % sizeof(unsigned long)
1109 || state.cfa.offs % sizeof(unsigned long)) {
1110 dprintk(1, "Unusable unwind info (%p,%p).", ptr, end);
1019 return -EIO; 1111 return -EIO;
1112 }
1020 /* update frame */ 1113 /* update frame */
1021#ifndef CONFIG_AS_CFI_SIGNAL_FRAME 1114#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
1022 if(frame->call_frame 1115 if(frame->call_frame
@@ -1035,10 +1128,14 @@ int unwind(struct unwind_frame_info *frame)
1035#else 1128#else
1036# define CASES CASE(8); CASE(16); CASE(32); CASE(64) 1129# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
1037#endif 1130#endif
1131 pc = UNW_PC(frame);
1132 sp = UNW_SP(frame);
1038 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { 1133 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
1039 if (REG_INVALID(i)) { 1134 if (REG_INVALID(i)) {
1040 if (state.regs[i].where == Nowhere) 1135 if (state.regs[i].where == Nowhere)
1041 continue; 1136 continue;
1137 dprintk(1, "Cannot restore register %u (%d).",
1138 i, state.regs[i].where);
1042 return -EIO; 1139 return -EIO;
1043 } 1140 }
1044 switch(state.regs[i].where) { 1141 switch(state.regs[i].where) {
@@ -1047,8 +1144,11 @@ int unwind(struct unwind_frame_info *frame)
1047 case Register: 1144 case Register:
1048 if (state.regs[i].value >= ARRAY_SIZE(reg_info) 1145 if (state.regs[i].value >= ARRAY_SIZE(reg_info)
1049 || REG_INVALID(state.regs[i].value) 1146 || REG_INVALID(state.regs[i].value)
1050 || reg_info[i].width > reg_info[state.regs[i].value].width) 1147 || reg_info[i].width > reg_info[state.regs[i].value].width) {
1148 dprintk(1, "Cannot restore register %u from register %lu.",
1149 i, state.regs[i].value);
1051 return -EIO; 1150 return -EIO;
1151 }
1052 switch(reg_info[state.regs[i].value].width) { 1152 switch(reg_info[state.regs[i].value].width) {
1053#define CASE(n) \ 1153#define CASE(n) \
1054 case sizeof(u##n): \ 1154 case sizeof(u##n): \
@@ -1058,6 +1158,9 @@ int unwind(struct unwind_frame_info *frame)
1058 CASES; 1158 CASES;
1059#undef CASE 1159#undef CASE
1060 default: 1160 default:
1161 dprintk(1, "Unsupported register size %u (%lu).",
1162 reg_info[state.regs[i].value].width,
1163 state.regs[i].value);
1061 return -EIO; 1164 return -EIO;
1062 } 1165 }
1063 break; 1166 break;
@@ -1082,12 +1185,17 @@ int unwind(struct unwind_frame_info *frame)
1082 CASES; 1185 CASES;
1083#undef CASE 1186#undef CASE
1084 default: 1187 default:
1188 dprintk(1, "Unsupported register size %u (%u).",
1189 reg_info[i].width, i);
1085 return -EIO; 1190 return -EIO;
1086 } 1191 }
1087 break; 1192 break;
1088 case Value: 1193 case Value:
1089 if (reg_info[i].width != sizeof(unsigned long)) 1194 if (reg_info[i].width != sizeof(unsigned long)) {
1195 dprintk(1, "Unsupported value size %u (%u).",
1196 reg_info[i].width, i);
1090 return -EIO; 1197 return -EIO;
1198 }
1091 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value 1199 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
1092 * state.dataAlign; 1200 * state.dataAlign;
1093 break; 1201 break;
@@ -1099,15 +1207,20 @@ int unwind(struct unwind_frame_info *frame)
1099 % sizeof(unsigned long) 1207 % sizeof(unsigned long)
1100 || addr < startLoc 1208 || addr < startLoc
1101 || addr + sizeof(unsigned long) < addr 1209 || addr + sizeof(unsigned long) < addr
1102 || addr + sizeof(unsigned long) > endLoc) 1210 || addr + sizeof(unsigned long) > endLoc) {
1211 dprintk(1, "Bad memory location %lx (%lx).",
1212 addr, state.regs[i].value);
1103 return -EIO; 1213 return -EIO;
1214 }
1104 switch(reg_info[i].width) { 1215 switch(reg_info[i].width) {
1105#define CASE(n) case sizeof(u##n): \ 1216#define CASE(n) case sizeof(u##n): \
1106 __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ 1217 probe_kernel_address((u##n *)addr, FRAME_REG(i, u##n)); \
1107 break 1218 break
1108 CASES; 1219 CASES;
1109#undef CASE 1220#undef CASE
1110 default: 1221 default:
1222 dprintk(1, "Unsupported memory size %u (%u).",
1223 reg_info[i].width, i);
1111 return -EIO; 1224 return -EIO;
1112 } 1225 }
1113 } 1226 }
@@ -1115,6 +1228,17 @@ int unwind(struct unwind_frame_info *frame)
1115 } 1228 }
1116 } 1229 }
1117 1230
1231 if (UNW_PC(frame) % state.codeAlign
1232 || UNW_SP(frame) % sleb128abs(state.dataAlign)) {
1233 dprintk(1, "Output pointer(s) misaligned (%lx,%lx).",
1234 UNW_PC(frame), UNW_SP(frame));
1235 return -EIO;
1236 }
1237 if (pc == UNW_PC(frame) && sp == UNW_SP(frame)) {
1238 dprintk(1, "No progress (%lx,%lx).", pc, sp);
1239 return -EIO;
1240 }
1241
1118 return 0; 1242 return 0;
1119#undef CASES 1243#undef CASES
1120#undef FRAME_REG 1244#undef FRAME_REG
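
Two themes run through the unwind.c changes: read_pointer() gains text_base/data_base arguments so DW_EH_PE_textrel/datarel encodings can be resolved (or rejected with a dprintk), and every raw __get_user() on a possibly bogus address becomes probe_kernel_address(), which reads with page faults disabled and fails gracefully. A hedged sketch of the latter, relying only on the probe_kernel_address() helper from <linux/uaccess.h> that this patch starts including:

	unsigned long val;

	/* Returns 0 on success, non-zero if 'addr' is not a readable kernel
	 * address; the unwinder treats failure as "no usable value" rather
	 * than oopsing on a corrupt frame. */
	if (probe_kernel_address((unsigned long *)addr, val))
		return 0;
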
diff --git a/kernel/user.c b/kernel/user.c
index 6408c0424291..4869563080e9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,7 +26,7 @@
26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) 26#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) 27#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
28 28
29static kmem_cache_t *uid_cachep; 29static struct kmem_cache *uid_cachep;
30static struct list_head uidhash_table[UIDHASH_SZ]; 30static struct list_head uidhash_table[UIDHASH_SZ];
31 31
32/* 32/*
@@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid)
132 if (!up) { 132 if (!up) {
133 struct user_struct *new; 133 struct user_struct *new;
134 134
135 new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); 135 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
136 if (!new) 136 if (!new)
137 return NULL; 137 return NULL;
138 new->uid = uid; 138 new->uid = uid;
@@ -187,6 +187,17 @@ void switch_uid(struct user_struct *new_user)
187 atomic_dec(&old_user->processes); 187 atomic_dec(&old_user->processes);
188 switch_uid_keyring(new_user); 188 switch_uid_keyring(new_user);
189 current->user = new_user; 189 current->user = new_user;
190
191 /*
192 * We need to synchronize with __sigqueue_alloc()
193 * doing a get_uid(p->user).. If that saw the old
194 * user value, we need to wait until it has exited
195 * its critical region before we can free the old
196 * structure.
197 */
198 smp_mb();
199 spin_unlock_wait(&current->sighand->siglock);
200
190 free_uid(old_user); 201 free_uid(old_user);
191 suid_keys(current); 202 suid_keys(current);
192} 203}
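
The new barrier in switch_uid() is an instance of a publish-then-drain pattern. A generic sketch with hypothetical names (shared_ptr, reader_lock, put_obj are illustrative, not the signal-path code itself); it assumes readers take reader_lock around reading the pointer and taking their reference:

	struct obj *old = shared_ptr;

	shared_ptr = new;			/* 1. publish the replacement      */
	smp_mb();				/* 2. order the store before step 3 */
	spin_unlock_wait(&reader_lock);		/* 3. wait out any reader still
						 *    inside its locked section
						 *    that may have seen 'old'    */
	put_obj(old);				/* 4. only now drop the old ref    */
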
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 17c2f03d2c27..db49886bfae1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -29,6 +29,9 @@
29#include <linux/kthread.h> 29#include <linux/kthread.h>
30#include <linux/hardirq.h> 30#include <linux/hardirq.h>
31#include <linux/mempolicy.h> 31#include <linux/mempolicy.h>
32#include <linux/freezer.h>
33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h>
32 35
33/* 36/*
34 * The per-CPU workqueue (if single thread, we always use the first 37 * The per-CPU workqueue (if single thread, we always use the first
@@ -55,6 +58,8 @@ struct cpu_workqueue_struct {
55 struct task_struct *thread; 58 struct task_struct *thread;
56 59
57 int run_depth; /* Detect run_workqueue() recursion depth */ 60 int run_depth; /* Detect run_workqueue() recursion depth */
61
62 int freezeable; /* Freeze the thread during suspend */
58} ____cacheline_aligned; 63} ____cacheline_aligned;
59 64
60/* 65/*
@@ -80,6 +85,99 @@ static inline int is_single_threaded(struct workqueue_struct *wq)
80 return list_empty(&wq->list); 85 return list_empty(&wq->list);
81} 86}
82 87
88/*
89 * Set the workqueue on which a work item is to be run
90 * - Must *only* be called if the pending flag is set
91 */
92static inline void set_wq_data(struct work_struct *work, void *wq)
93{
94 unsigned long new;
95
96 BUG_ON(!work_pending(work));
97
98 new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING);
99 new |= work->management & WORK_STRUCT_FLAG_MASK;
100 work->management = new;
101}
102
103static inline void *get_wq_data(struct work_struct *work)
104{
105 return (void *) (work->management & WORK_STRUCT_WQ_DATA_MASK);
106}
107
108static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work)
109{
110 int ret = 0;
111 unsigned long flags;
112
113 spin_lock_irqsave(&cwq->lock, flags);
114 /*
115 * We need to re-validate the work info after we've gotten
116 * the cpu_workqueue lock. We can run the work now iff:
117 *
118 * - the wq_data still matches the cpu_workqueue_struct
119 * - AND the work is still marked pending
120 * - AND the work is still on a list (which will be this
121 * workqueue_struct list)
122 *
123 * All these conditions are important, because we
124 * need to protect against the work being run right
125 * now on another CPU (all but the last one might be
126 * true if it's currently running and has not been
127 * released yet, for example).
128 */
129 if (get_wq_data(work) == cwq
130 && work_pending(work)
131 && !list_empty(&work->entry)) {
132 work_func_t f = work->func;
133 list_del_init(&work->entry);
134 spin_unlock_irqrestore(&cwq->lock, flags);
135
136 if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
137 work_release(work);
138 f(work);
139
140 spin_lock_irqsave(&cwq->lock, flags);
141 cwq->remove_sequence++;
142 wake_up(&cwq->work_done);
143 ret = 1;
144 }
145 spin_unlock_irqrestore(&cwq->lock, flags);
146 return ret;
147}
148
149/**
150 * run_scheduled_work - run scheduled work synchronously
151 * @work: work to run
152 *
153 * This checks if the work was pending, and runs it
154 * synchronously if so. It returns a boolean to indicate
155 * whether it had any scheduled work to run or not.
156 *
157 * NOTE! This _only_ works for normal work_structs. You
158 * CANNOT use this for delayed work, because the wq data
159 * for delayed work will not point properly to the per-
160 * CPU workqueue struct, but will change!
161 */
162int fastcall run_scheduled_work(struct work_struct *work)
163{
164 for (;;) {
165 struct cpu_workqueue_struct *cwq;
166
167 if (!work_pending(work))
168 return 0;
169 if (list_empty(&work->entry))
170 return 0;
171 /* NOTE! This depends intimately on __queue_work! */
172 cwq = get_wq_data(work);
173 if (!cwq)
174 return 0;
175 if (__run_work(cwq, work))
176 return 1;
177 }
178}
179EXPORT_SYMBOL(run_scheduled_work);
180
83/* Preempt must be disabled. */ 181/* Preempt must be disabled. */
84static void __queue_work(struct cpu_workqueue_struct *cwq, 182static void __queue_work(struct cpu_workqueue_struct *cwq,
85 struct work_struct *work) 183 struct work_struct *work)
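
set_wq_data()/get_wq_data() above rely on the reworked struct work_struct, whose first word carries both the per-CPU workqueue pointer and the flag bits (possible because the pointer is at least word-aligned). Paraphrased companion declarations from include/linux/workqueue.h in this series; the masks match those used above, while the exact bit numbers are shown only for illustration:

	typedef void (*work_func_t)(struct work_struct *work);

	struct work_struct {
		unsigned long management;	/* cwq pointer | flag bits */
	#define WORK_STRUCT_PENDING	0	/* bit: item pending execution */
	#define WORK_STRUCT_NOAUTOREL	1	/* bit: no auto-release before run */
	#define WORK_STRUCT_FLAG_MASK	(3UL)
	#define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK)
		struct list_head entry;
		work_func_t func;
	};
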
@@ -87,7 +185,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
87 unsigned long flags; 185 unsigned long flags;
88 186
89 spin_lock_irqsave(&cwq->lock, flags); 187 spin_lock_irqsave(&cwq->lock, flags);
90 work->wq_data = cwq; 188 set_wq_data(work, cwq);
91 list_add_tail(&work->entry, &cwq->worklist); 189 list_add_tail(&work->entry, &cwq->worklist);
92 cwq->insert_sequence++; 190 cwq->insert_sequence++;
93 wake_up(&cwq->more_work); 191 wake_up(&cwq->more_work);
@@ -108,7 +206,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
108{ 206{
109 int ret = 0, cpu = get_cpu(); 207 int ret = 0, cpu = get_cpu();
110 208
111 if (!test_and_set_bit(0, &work->pending)) { 209 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
112 if (unlikely(is_single_threaded(wq))) 210 if (unlikely(is_single_threaded(wq)))
113 cpu = singlethread_cpu; 211 cpu = singlethread_cpu;
114 BUG_ON(!list_empty(&work->entry)); 212 BUG_ON(!list_empty(&work->entry));
@@ -122,38 +220,42 @@ EXPORT_SYMBOL_GPL(queue_work);
122 220
123static void delayed_work_timer_fn(unsigned long __data) 221static void delayed_work_timer_fn(unsigned long __data)
124{ 222{
125 struct work_struct *work = (struct work_struct *)__data; 223 struct delayed_work *dwork = (struct delayed_work *)__data;
126 struct workqueue_struct *wq = work->wq_data; 224 struct workqueue_struct *wq = get_wq_data(&dwork->work);
127 int cpu = smp_processor_id(); 225 int cpu = smp_processor_id();
128 226
129 if (unlikely(is_single_threaded(wq))) 227 if (unlikely(is_single_threaded(wq)))
130 cpu = singlethread_cpu; 228 cpu = singlethread_cpu;
131 229
132 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 230 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
133} 231}
134 232
135/** 233/**
136 * queue_delayed_work - queue work on a workqueue after delay 234 * queue_delayed_work - queue work on a workqueue after delay
137 * @wq: workqueue to use 235 * @wq: workqueue to use
138 * @work: work to queue 236 * @work: delayable work to queue
139 * @delay: number of jiffies to wait before queueing 237 * @delay: number of jiffies to wait before queueing
140 * 238 *
141 * Returns 0 if @work was already on a queue, non-zero otherwise. 239 * Returns 0 if @work was already on a queue, non-zero otherwise.
142 */ 240 */
143int fastcall queue_delayed_work(struct workqueue_struct *wq, 241int fastcall queue_delayed_work(struct workqueue_struct *wq,
144 struct work_struct *work, unsigned long delay) 242 struct delayed_work *dwork, unsigned long delay)
145{ 243{
146 int ret = 0; 244 int ret = 0;
147 struct timer_list *timer = &work->timer; 245 struct timer_list *timer = &dwork->timer;
246 struct work_struct *work = &dwork->work;
247
248 if (delay == 0)
249 return queue_work(wq, work);
148 250
149 if (!test_and_set_bit(0, &work->pending)) { 251 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
150 BUG_ON(timer_pending(timer)); 252 BUG_ON(timer_pending(timer));
151 BUG_ON(!list_empty(&work->entry)); 253 BUG_ON(!list_empty(&work->entry));
152 254
153 /* This stores wq for the moment, for the timer_fn */ 255 /* This stores wq for the moment, for the timer_fn */
154 work->wq_data = wq; 256 set_wq_data(work, wq);
155 timer->expires = jiffies + delay; 257 timer->expires = jiffies + delay;
156 timer->data = (unsigned long)work; 258 timer->data = (unsigned long)dwork;
157 timer->function = delayed_work_timer_fn; 259 timer->function = delayed_work_timer_fn;
158 add_timer(timer); 260 add_timer(timer);
159 ret = 1; 261 ret = 1;
@@ -172,19 +274,20 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);
172 * Returns 0 if @work was already on a queue, non-zero otherwise. 274 * Returns 0 if @work was already on a queue, non-zero otherwise.
173 */ 275 */
174int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 276int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
175 struct work_struct *work, unsigned long delay) 277 struct delayed_work *dwork, unsigned long delay)
176{ 278{
177 int ret = 0; 279 int ret = 0;
178 struct timer_list *timer = &work->timer; 280 struct timer_list *timer = &dwork->timer;
281 struct work_struct *work = &dwork->work;
179 282
180 if (!test_and_set_bit(0, &work->pending)) { 283 if (!test_and_set_bit(WORK_STRUCT_PENDING, &work->management)) {
181 BUG_ON(timer_pending(timer)); 284 BUG_ON(timer_pending(timer));
182 BUG_ON(!list_empty(&work->entry)); 285 BUG_ON(!list_empty(&work->entry));
183 286
184 /* This stores wq for the moment, for the timer_fn */ 287 /* This stores wq for the moment, for the timer_fn */
185 work->wq_data = wq; 288 set_wq_data(work, wq);
186 timer->expires = jiffies + delay; 289 timer->expires = jiffies + delay;
187 timer->data = (unsigned long)work; 290 timer->data = (unsigned long)dwork;
188 timer->function = delayed_work_timer_fn; 291 timer->function = delayed_work_timer_fn;
189 add_timer_on(timer, cpu); 292 add_timer_on(timer, cpu);
190 ret = 1; 293 ret = 1;
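
With queue_delayed_work()/queue_delayed_work_on() now taking a struct delayed_work (a work_struct plus its timer), callers embed the delayed_work in their own object and recover it in the handler via container_of(). A sketch of the resulting caller-side idiom; my_dev and my_poll are hypothetical, only the workqueue calls shown are from this series:

	struct my_dev {
		struct delayed_work	poll_work;
		/* ... */
	};

	static void my_poll(struct work_struct *work)
	{
		struct my_dev *dev =
			container_of(work, struct my_dev, poll_work.work);

		/* ... do the periodic job ... */
		schedule_delayed_work(&dev->poll_work, HZ);	/* re-arm in 1s */
	}

	/* setup, e.g. at probe time: */
	INIT_DELAYED_WORK(&dev->poll_work, my_poll);
	schedule_delayed_work(&dev->poll_work, HZ);

Note that delay == 0 now short-circuits straight into queue_work(), so a zero delay no longer bounces through the timer.
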
@@ -212,15 +315,26 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
212 while (!list_empty(&cwq->worklist)) { 315 while (!list_empty(&cwq->worklist)) {
213 struct work_struct *work = list_entry(cwq->worklist.next, 316 struct work_struct *work = list_entry(cwq->worklist.next,
214 struct work_struct, entry); 317 struct work_struct, entry);
215 void (*f) (void *) = work->func; 318 work_func_t f = work->func;
216 void *data = work->data;
217 319
218 list_del_init(cwq->worklist.next); 320 list_del_init(cwq->worklist.next);
219 spin_unlock_irqrestore(&cwq->lock, flags); 321 spin_unlock_irqrestore(&cwq->lock, flags);
220 322
221 BUG_ON(work->wq_data != cwq); 323 BUG_ON(get_wq_data(work) != cwq);
222 clear_bit(0, &work->pending); 324 if (!test_bit(WORK_STRUCT_NOAUTOREL, &work->management))
223 f(data); 325 work_release(work);
326 f(work);
327
328 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
329 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
330 "%s/0x%08x/%d\n",
331 current->comm, preempt_count(),
332 current->pid);
333 printk(KERN_ERR " last function: ");
334 print_symbol("%s\n", (unsigned long)f);
335 debug_show_held_locks(current);
336 dump_stack();
337 }
224 338
225 spin_lock_irqsave(&cwq->lock, flags); 339 spin_lock_irqsave(&cwq->lock, flags);
226 cwq->remove_sequence++; 340 cwq->remove_sequence++;
@@ -237,7 +351,8 @@ static int worker_thread(void *__cwq)
237 struct k_sigaction sa; 351 struct k_sigaction sa;
238 sigset_t blocked; 352 sigset_t blocked;
239 353
240 current->flags |= PF_NOFREEZE; 354 if (!cwq->freezeable)
355 current->flags |= PF_NOFREEZE;
241 356
242 set_user_nice(current, -5); 357 set_user_nice(current, -5);
243 358
@@ -260,6 +375,9 @@ static int worker_thread(void *__cwq)
260 375
261 set_current_state(TASK_INTERRUPTIBLE); 376 set_current_state(TASK_INTERRUPTIBLE);
262 while (!kthread_should_stop()) { 377 while (!kthread_should_stop()) {
378 if (cwq->freezeable)
379 try_to_freeze();
380
263 add_wait_queue(&cwq->more_work, &wait); 381 add_wait_queue(&cwq->more_work, &wait);
264 if (list_empty(&cwq->worklist)) 382 if (list_empty(&cwq->worklist))
265 schedule(); 383 schedule();
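
A freezeable worker must poll the freezer itself; otherwise the task freezer would time out waiting for it during suspend, whereas PF_NOFREEZE simply exempted the thread altogether. A generic sketch of the loop shape (nothing_to_do() and do_pending_work() are hypothetical stand-ins for the worklist checks above):

	while (!kthread_should_stop()) {
		try_to_freeze();		/* park here if the freezer asks */

		set_current_state(TASK_INTERRUPTIBLE);
		if (nothing_to_do())
			schedule();
		__set_current_state(TASK_RUNNING);

		do_pending_work();
	}
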
@@ -336,7 +454,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
336EXPORT_SYMBOL_GPL(flush_workqueue); 454EXPORT_SYMBOL_GPL(flush_workqueue);
337 455
338static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 456static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
339 int cpu) 457 int cpu, int freezeable)
340{ 458{
341 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 459 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
342 struct task_struct *p; 460 struct task_struct *p;
@@ -346,6 +464,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
346 cwq->thread = NULL; 464 cwq->thread = NULL;
347 cwq->insert_sequence = 0; 465 cwq->insert_sequence = 0;
348 cwq->remove_sequence = 0; 466 cwq->remove_sequence = 0;
467 cwq->freezeable = freezeable;
349 INIT_LIST_HEAD(&cwq->worklist); 468 INIT_LIST_HEAD(&cwq->worklist);
350 init_waitqueue_head(&cwq->more_work); 469 init_waitqueue_head(&cwq->more_work);
351 init_waitqueue_head(&cwq->work_done); 470 init_waitqueue_head(&cwq->work_done);
@@ -361,7 +480,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
361} 480}
362 481
363struct workqueue_struct *__create_workqueue(const char *name, 482struct workqueue_struct *__create_workqueue(const char *name,
364 int singlethread) 483 int singlethread, int freezeable)
365{ 484{
366 int cpu, destroy = 0; 485 int cpu, destroy = 0;
367 struct workqueue_struct *wq; 486 struct workqueue_struct *wq;
@@ -381,7 +500,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
381 mutex_lock(&workqueue_mutex); 500 mutex_lock(&workqueue_mutex);
382 if (singlethread) { 501 if (singlethread) {
383 INIT_LIST_HEAD(&wq->list); 502 INIT_LIST_HEAD(&wq->list);
384 p = create_workqueue_thread(wq, singlethread_cpu); 503 p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
385 if (!p) 504 if (!p)
386 destroy = 1; 505 destroy = 1;
387 else 506 else
@@ -389,7 +508,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
389 } else { 508 } else {
390 list_add(&wq->list, &workqueues); 509 list_add(&wq->list, &workqueues);
391 for_each_online_cpu(cpu) { 510 for_each_online_cpu(cpu) {
392 p = create_workqueue_thread(wq, cpu); 511 p = create_workqueue_thread(wq, cpu, freezeable);
393 if (p) { 512 if (p) {
394 kthread_bind(p, cpu); 513 kthread_bind(p, cpu);
395 wake_up_process(p); 514 wake_up_process(p);
@@ -468,38 +587,37 @@ EXPORT_SYMBOL(schedule_work);
468 587
469/** 588/**
470 * schedule_delayed_work - put work task in global workqueue after delay 589 * schedule_delayed_work - put work task in global workqueue after delay
471 * @work: job to be done 590 * @dwork: job to be done
472 * @delay: number of jiffies to wait 591 * @delay: number of jiffies to wait or 0 for immediate execution
473 * 592 *
474 * After waiting for a given time this puts a job in the kernel-global 593 * After waiting for a given time this puts a job in the kernel-global
475 * workqueue. 594 * workqueue.
476 */ 595 */
477int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) 596int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
478{ 597{
479 return queue_delayed_work(keventd_wq, work, delay); 598 return queue_delayed_work(keventd_wq, dwork, delay);
480} 599}
481EXPORT_SYMBOL(schedule_delayed_work); 600EXPORT_SYMBOL(schedule_delayed_work);
482 601
483/** 602/**
484 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 603 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
485 * @cpu: cpu to use 604 * @cpu: cpu to use
486 * @work: job to be done 605 * @dwork: job to be done
487 * @delay: number of jiffies to wait 606 * @delay: number of jiffies to wait
488 * 607 *
489 * After waiting for a given time this puts a job in the kernel-global 608 * After waiting for a given time this puts a job in the kernel-global
490 * workqueue on the specified CPU. 609 * workqueue on the specified CPU.
491 */ 610 */
492int schedule_delayed_work_on(int cpu, 611int schedule_delayed_work_on(int cpu,
493 struct work_struct *work, unsigned long delay) 612 struct delayed_work *dwork, unsigned long delay)
494{ 613{
495 return queue_delayed_work_on(cpu, keventd_wq, work, delay); 614 return queue_delayed_work_on(cpu, keventd_wq, dwork, delay);
496} 615}
497EXPORT_SYMBOL(schedule_delayed_work_on); 616EXPORT_SYMBOL(schedule_delayed_work_on);
498 617
499/** 618/**
500 * schedule_on_each_cpu - call a function on each online CPU from keventd 619 * schedule_on_each_cpu - call a function on each online CPU from keventd
501 * @func: the function to call 620 * @func: the function to call
502 * @info: a pointer to pass to func()
503 * 621 *
504 * Returns zero on success. 622 * Returns zero on success.
505 * Returns -ve errno on failure. 623 * Returns -ve errno on failure.
@@ -508,7 +626,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
508 * 626 *
509 * schedule_on_each_cpu() is very slow. 627 * schedule_on_each_cpu() is very slow.
510 */ 628 */
511int schedule_on_each_cpu(void (*func)(void *info), void *info) 629int schedule_on_each_cpu(work_func_t func)
512{ 630{
513 int cpu; 631 int cpu;
514 struct work_struct *works; 632 struct work_struct *works;
@@ -519,7 +637,7 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info)
519 637
520 mutex_lock(&workqueue_mutex); 638 mutex_lock(&workqueue_mutex);
521 for_each_online_cpu(cpu) { 639 for_each_online_cpu(cpu) {
522 INIT_WORK(per_cpu_ptr(works, cpu), func, info); 640 INIT_WORK(per_cpu_ptr(works, cpu), func);
523 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), 641 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
524 per_cpu_ptr(works, cpu)); 642 per_cpu_ptr(works, cpu));
525 } 643 }
@@ -539,12 +657,12 @@ EXPORT_SYMBOL(flush_scheduled_work);
539 * cancel_rearming_delayed_workqueue - reliably kill off a delayed 657 * cancel_rearming_delayed_workqueue - reliably kill off a delayed
540 * work whose handler rearms the delayed work. 658 * work whose handler rearms the delayed work.
541 * @wq: the controlling workqueue structure 659 * @wq: the controlling workqueue structure
542 * @work: the delayed work struct 660 * @dwork: the delayed work struct
543 */ 661 */
544void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, 662void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
545 struct work_struct *work) 663 struct delayed_work *dwork)
546{ 664{
547 while (!cancel_delayed_work(work)) 665 while (!cancel_delayed_work(dwork))
548 flush_workqueue(wq); 666 flush_workqueue(wq);
549} 667}
550EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); 668EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
@@ -552,18 +670,17 @@ EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
552/** 670/**
553 * cancel_rearming_delayed_work - reliably kill off a delayed keventd 671 * cancel_rearming_delayed_work - reliably kill off a delayed keventd
554 * work whose handler rearms the delayed work. 672 * work whose handler rearms the delayed work.
555 * @work: the delayed work struct 673 * @dwork: the delayed work struct
556 */ 674 */
557void cancel_rearming_delayed_work(struct work_struct *work) 675void cancel_rearming_delayed_work(struct delayed_work *dwork)
558{ 676{
559 cancel_rearming_delayed_workqueue(keventd_wq, work); 677 cancel_rearming_delayed_workqueue(keventd_wq, dwork);
560} 678}
561EXPORT_SYMBOL(cancel_rearming_delayed_work); 679EXPORT_SYMBOL(cancel_rearming_delayed_work);
562 680
563/** 681/**
564 * execute_in_process_context - reliably execute the routine with user context 682 * execute_in_process_context - reliably execute the routine with user context
565 * @fn: the function to execute 683 * @fn: the function to execute
566 * @data: data to pass to the function
567 * @ew: guaranteed storage for the execute work structure (must 684 * @ew: guaranteed storage for the execute work structure (must
568 * be available when the work executes) 685 * be available when the work executes)
569 * 686 *
@@ -573,15 +690,14 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work);
573 * Returns: 0 - function was executed 690 * Returns: 0 - function was executed
574 * 1 - function was scheduled for execution 691 * 1 - function was scheduled for execution
575 */ 692 */
576int execute_in_process_context(void (*fn)(void *data), void *data, 693int execute_in_process_context(work_func_t fn, struct execute_work *ew)
577 struct execute_work *ew)
578{ 694{
579 if (!in_interrupt()) { 695 if (!in_interrupt()) {
580 fn(data); 696 fn(&ew->work);
581 return 0; 697 return 0;
582 } 698 }
583 699
584 INIT_WORK(&ew->work, fn, data); 700 INIT_WORK(&ew->work, fn);
585 schedule_work(&ew->work); 701 schedule_work(&ew->work);
586 702
587 return 1; 703 return 1;
@@ -609,7 +725,6 @@ int current_is_keventd(void)
609 725
610} 726}
611 727
612#ifdef CONFIG_HOTPLUG_CPU
613/* Take the work from this (downed) CPU. */ 728/* Take the work from this (downed) CPU. */
614static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 729static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
615{ 730{
@@ -642,7 +757,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
642 mutex_lock(&workqueue_mutex); 757 mutex_lock(&workqueue_mutex);
643 /* Create a new workqueue thread for it. */ 758 /* Create a new workqueue thread for it. */
644 list_for_each_entry(wq, &workqueues, list) { 759 list_for_each_entry(wq, &workqueues, list) {
645 if (!create_workqueue_thread(wq, hotcpu)) { 760 if (!create_workqueue_thread(wq, hotcpu, 0)) {
646 printk("workqueue for %i failed\n", hotcpu); 761 printk("workqueue for %i failed\n", hotcpu);
647 return NOTIFY_BAD; 762 return NOTIFY_BAD;
648 } 763 }
@@ -692,7 +807,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
692 807
693 return NOTIFY_OK; 808 return NOTIFY_OK;
694} 809}
695#endif
696 810
697void init_workqueues(void) 811void init_workqueues(void)
698{ 812{