Diffstat (limited to 'kernel')
 kernel/acct.c             |    6
 kernel/audit.c            |   14
 kernel/auditfilter.c      |    2
 kernel/auditsc.c          |   11
 kernel/capability.c       |    2
 kernel/compat.c           |   33
 kernel/cpu.c              |  138
 kernel/cpuset.c           |  113
 kernel/exit.c             |   39
 kernel/fork.c             |   20
 kernel/futex.c            |   10
 kernel/hrtimer.c          |   20
 kernel/irq/chip.c         |    6
 kernel/irq/handle.c       |    2
 kernel/kexec.c            |    8
 kernel/kfifo.c            |   28
 kernel/kmod.c             |   12
 kernel/lockdep.c          |   26
 kernel/module.c           |   32
 kernel/panic.c            |   12
 kernel/params.c           |   15
 kernel/pid.c              |   12
 kernel/posix-cpu-timers.c |  101
 kernel/posix-timers.c     |   21
 kernel/power/Kconfig      |   22
 kernel/power/Makefile     |    2
 kernel/power/disk.c       |   11
 kernel/power/main.c       |   40
 kernel/power/power.h      |   59
 kernel/power/smp.c        |   62
 kernel/power/snapshot.c   | 1155
 kernel/power/swap.c       |  270
 kernel/power/swsusp.c     |   14
 kernel/power/user.c       |   17
 kernel/printk.c           |    3
 kernel/profile.c          |   16
 kernel/ptrace.c           |   55
 kernel/rcutorture.c       |    8
 kernel/relay.c            |   38
 kernel/resource.c         |   32
 kernel/rtmutex.c          |   51
 kernel/sched.c            |  128
 kernel/signal.c           |   11
 kernel/softirq.c          |    4
 kernel/softlockup.c       |    3
 kernel/spinlock.c         |   16
 kernel/stop_machine.c     |    3
 kernel/sys.c              |   32
 kernel/sysctl.c           |  154
 kernel/taskstats.c        |    2
 kernel/timer.c            |   53
 kernel/unwind.c           |   39
 52 files changed, 2002 insertions(+), 981 deletions(-)
diff --git a/kernel/acct.c b/kernel/acct.c
index 2a7c933651c7..f4330acead46 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -483,10 +483,14 @@ static void do_acct_process(struct file *file)
 	ac.ac_ppid = current->parent->tgid;
 #endif
 
-	read_lock(&tasklist_lock);	/* pin current->signal */
+	mutex_lock(&tty_mutex);
+	/* FIXME: Whoever is responsible for current->signal locking needs
+	   to use the same locking all over the kernel and document it */
+	read_lock(&tasklist_lock);
 	ac.ac_tty = current->signal->tty ?
 		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 	read_unlock(&tasklist_lock);
+	mutex_unlock(&tty_mutex);
 
 	spin_lock_irq(&current->sighand->siglock);
 	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
diff --git a/kernel/audit.c b/kernel/audit.c
index 963fd15c9621..f9889ee77825 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -244,7 +244,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
 	char *ctx = NULL;
 	u32 len;
 	int rc;
-	if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+	if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 		return rc;
 	else
 		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -267,7 +267,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
 	char *ctx = NULL;
 	u32 len;
 	int rc;
-	if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+	if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 		return rc;
 	else
 		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -293,7 +293,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
 	char *ctx = NULL;
 	u32 len;
 	int rc;
-	if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+	if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 		return rc;
 	else
 		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -321,7 +321,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
 	char *ctx = NULL;
 	u32 len;
 	int rc;
-	if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+	if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
 		return rc;
 	else
 		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -538,7 +538,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int old = audit_pid;
 			if (sid) {
-				if ((err = selinux_ctxid_to_string(
+				if ((err = selinux_sid_to_string(
 						sid, &ctx, &len)))
 					return err;
 				else
@@ -576,7 +576,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				"user pid=%d uid=%u auid=%u",
 				pid, uid, loginuid);
 			if (sid) {
-				if (selinux_ctxid_to_string(
+				if (selinux_sid_to_string(
 						sid, &ctx, &len)) {
 					audit_log_format(ab,
 						" ssid=%u", sid);
@@ -614,7 +614,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 					loginuid, sid);
 		break;
 	case AUDIT_SIGNAL_INFO:
-		err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
+		err = selinux_sid_to_string(audit_sig_sid, &ctx, &len);
 		if (err)
 			return err;
 		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a44879b0c72f..1a58a81fb09d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1398,7 +1398,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
 	if (sid) {
 		char *ctx = NULL;
 		u32 len;
-		if (selinux_ctxid_to_string(sid, &ctx, &len))
+		if (selinux_sid_to_string(sid, &ctx, &len))
 			audit_log_format(ab, " ssid=%u", sid);
 		else
 			audit_log_format(ab, " subj=%s", ctx);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1bd8827a0102..105147631753 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -385,7 +385,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 			   logged upon error */
 		if (f->se_rule) {
 			if (need_sid) {
-				selinux_task_ctxid(tsk, &sid);
+				selinux_get_task_sid(tsk, &sid);
 				need_sid = 0;
 			}
 			result = selinux_audit_rule_match(sid, f->type,
@@ -817,6 +817,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		audit_log_format(ab, " success=%s exit=%ld",
 			(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
 			context->return_code);
+
+	mutex_lock(&tty_mutex);
 	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
 		tty = tsk->signal->tty->name;
 	else
@@ -838,6 +840,9 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		  context->gid,
 		  context->euid, context->suid, context->fsuid,
 		  context->egid, context->sgid, context->fsgid, tty);
+
+	mutex_unlock(&tty_mutex);
+
 	audit_log_task_info(ab, tsk);
 	if (context->filterkey) {
 		audit_log_format(ab, " key=");
@@ -898,7 +903,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			if (axi->osid != 0) {
 				char *ctx = NULL;
 				u32 len;
-				if (selinux_ctxid_to_string(
+				if (selinux_sid_to_string(
 						axi->osid, &ctx, &len)) {
 					audit_log_format(ab, " osid=%u",
 							axi->osid);
@@ -1005,7 +1010,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		if (n->osid != 0) {
 			char *ctx = NULL;
 			u32 len;
-			if (selinux_ctxid_to_string(
+			if (selinux_sid_to_string(
 				n->osid, &ctx, &len)) {
 				audit_log_format(ab, " osid=%u", n->osid);
 				call_panic = 2;
diff --git a/kernel/capability.c b/kernel/capability.c
index c7685ad00a97..edb845a6e84a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -133,7 +133,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
 	int found = 0;
 
 	do_each_thread(g, target) {
-		if (target == current || target->pid == 1)
+		if (target == current || is_init(target))
 			continue;
 		found = 1;
 		if (security_capset_check(target, effective, inheritable,
diff --git a/kernel/compat.c b/kernel/compat.c
index 126dee9530aa..75573e5d27b0 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -22,6 +22,7 @@
 #include <linux/security.h>
 #include <linux/timex.h>
 #include <linux/migrate.h>
+#include <linux/posix-timers.h>
 
 #include <asm/uaccess.h>
 
@@ -601,6 +602,30 @@ long compat_sys_clock_getres(clockid_t which_clock,
 	return err;
 }
 
+static long compat_clock_nanosleep_restart(struct restart_block *restart)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct timespec tu;
+	struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1);
+
+	restart->arg1 = (unsigned long) &tu;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = clock_nanosleep_restart(restart);
+	set_fs(oldfs);
+
+	if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+	    put_compat_timespec(&tu, rmtp))
+		return -EFAULT;
+
+	if (err == -ERESTART_RESTARTBLOCK) {
+		restart->fn = compat_clock_nanosleep_restart;
+		restart->arg1 = (unsigned long) rmtp;
+	}
+	return err;
+}
+
 long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
 			    struct compat_timespec __user *rqtp,
 			    struct compat_timespec __user *rmtp)
@@ -608,6 +633,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
 	long err;
 	mm_segment_t oldfs;
 	struct timespec in, out;
+	struct restart_block *restart;
 
 	if (get_compat_timespec(&in, rqtp))
 		return -EFAULT;
@@ -618,9 +644,16 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
 			      (struct timespec __user *) &in,
 			      (struct timespec __user *) &out);
 	set_fs(oldfs);
+
 	if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
 	    put_compat_timespec(&out, rmtp))
 		return -EFAULT;
+
+	if (err == -ERESTART_RESTARTBLOCK) {
+		restart = &current_thread_info()->restart_block;
+		restart->fn = compat_clock_nanosleep_restart;
+		restart->arg1 = (unsigned long) rmtp;
+	}
 	return err;
 }
 
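
The compat_clock_nanosleep_restart() helper re-runs clock_nanosleep_restart() under KERNEL_DS and converts the unslept time back to a compat_timespec, so an interrupted 32-bit sleep restarts with correct bookkeeping. A minimal user-space sketch (not part of the patch; the harness is invented for illustration) of the contract being preserved, namely that an interrupted clock_nanosleep() reports the remaining time through rmtp:

#include <stdio.h>
#include <time.h>
#include <errno.h>

int main(void)
{
	struct timespec req = { .tv_sec = 2, .tv_nsec = 0 }, rem = { 0, 0 };
	/* clock_nanosleep() returns the error number directly, not -1/errno */
	int err = clock_nanosleep(CLOCK_MONOTONIC, 0, &req, &rem);

	if (err == EINTR)	/* interrupted: rem holds the unslept time */
		printf("remaining: %ld.%09lds\n",
		       (long)rem.tv_sec, rem.tv_nsec);
	return 0;
}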
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f230f9ae01c2..32c96628463e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -21,6 +21,11 @@ static DEFINE_MUTEX(cpu_bitmask_lock);
 
 static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
 
+/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+ * Should always be manipulated under cpu_add_remove_lock
+ */
+static int cpu_hotplug_disabled;
+
 #ifdef CONFIG_HOTPLUG_CPU
 
 /* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
@@ -108,30 +113,25 @@ static int take_cpu_down(void *unused)
 	return 0;
 }
 
-int cpu_down(unsigned int cpu)
+/* Requires cpu_add_remove_lock to be held */
+static int _cpu_down(unsigned int cpu)
 {
 	int err;
 	struct task_struct *p;
 	cpumask_t old_allowed, tmp;
 
-	mutex_lock(&cpu_add_remove_lock);
-	if (num_online_cpus() == 1) {
-		err = -EBUSY;
-		goto out;
-	}
+	if (num_online_cpus() == 1)
+		return -EBUSY;
 
-	if (!cpu_online(cpu)) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (!cpu_online(cpu))
+		return -EINVAL;
 
 	err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
 						(void *)(long)cpu);
 	if (err == NOTIFY_BAD) {
 		printk("%s: attempt to take down CPU %u failed\n",
 				__FUNCTION__, cpu);
-		err = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 
 	/* Ensure that we are not runnable on dying cpu */
@@ -179,22 +179,32 @@ out_thread:
 	err = kthread_stop(p);
 out_allowed:
 	set_cpus_allowed(current, old_allowed);
-out:
+	return err;
+}
+
+int cpu_down(unsigned int cpu)
+{
+	int err = 0;
+
+	mutex_lock(&cpu_add_remove_lock);
+	if (cpu_hotplug_disabled)
+		err = -EBUSY;
+	else
+		err = _cpu_down(cpu);
+
 	mutex_unlock(&cpu_add_remove_lock);
 	return err;
 }
 #endif /*CONFIG_HOTPLUG_CPU*/
 
-int __devinit cpu_up(unsigned int cpu)
+/* Requires cpu_add_remove_lock to be held */
+static int __devinit _cpu_up(unsigned int cpu)
 {
 	int ret;
 	void *hcpu = (void *)(long)cpu;
 
-	mutex_lock(&cpu_add_remove_lock);
-	if (cpu_online(cpu) || !cpu_present(cpu)) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (cpu_online(cpu) || !cpu_present(cpu))
+		return -EINVAL;
 
 	ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
 	if (ret == NOTIFY_BAD) {
@@ -219,7 +229,95 @@ out_notify:
 	if (ret != 0)
 		blocking_notifier_call_chain(&cpu_chain,
 				CPU_UP_CANCELED, hcpu);
+
+	return ret;
+}
+
+int __devinit cpu_up(unsigned int cpu)
+{
+	int err = 0;
+
+	mutex_lock(&cpu_add_remove_lock);
+	if (cpu_hotplug_disabled)
+		err = -EBUSY;
+	else
+		err = _cpu_up(cpu);
+
+	mutex_unlock(&cpu_add_remove_lock);
+	return err;
+}
+
+#ifdef CONFIG_SUSPEND_SMP
+static cpumask_t frozen_cpus;
+
+int disable_nonboot_cpus(void)
+{
+	int cpu, first_cpu, error;
+
+	mutex_lock(&cpu_add_remove_lock);
+	first_cpu = first_cpu(cpu_present_map);
+	if (!cpu_online(first_cpu)) {
+		error = _cpu_up(first_cpu);
+		if (error) {
+			printk(KERN_ERR "Could not bring CPU%d up.\n",
+				first_cpu);
+			goto out;
+		}
+	}
+	error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu));
+	if (error) {
+		printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
+		goto out;
+	}
+	/* We take down all of the non-boot CPUs in one shot to avoid races
+	 * with the userspace trying to use the CPU hotplug at the same time
+	 */
+	cpus_clear(frozen_cpus);
+	printk("Disabling non-boot CPUs ...\n");
+	for_each_online_cpu(cpu) {
+		if (cpu == first_cpu)
+			continue;
+		error = _cpu_down(cpu);
+		if (!error) {
+			cpu_set(cpu, frozen_cpus);
+			printk("CPU%d is down\n", cpu);
+		} else {
+			printk(KERN_ERR "Error taking CPU%d down: %d\n",
+				cpu, error);
+			break;
+		}
+	}
+	if (!error) {
+		BUG_ON(num_online_cpus() > 1);
+		/* Make sure the CPUs won't be enabled by someone else */
+		cpu_hotplug_disabled = 1;
+	} else {
+		printk(KERN_ERR "Non-boot CPUs are not disabled");
+	}
 out:
 	mutex_unlock(&cpu_add_remove_lock);
-	return ret;
+	return error;
+}
+
+void enable_nonboot_cpus(void)
+{
+	int cpu, error;
+
+	/* Allow everyone to use the CPU hotplug again */
+	mutex_lock(&cpu_add_remove_lock);
+	cpu_hotplug_disabled = 0;
+	mutex_unlock(&cpu_add_remove_lock);
+
+	printk("Enabling non-boot CPUs ...\n");
+	for_each_cpu_mask(cpu, frozen_cpus) {
+		error = cpu_up(cpu);
+		if (!error) {
+			printk("CPU%d is up\n", cpu);
+			continue;
+		}
+		printk(KERN_WARNING "Error taking CPU%d up: %d\n",
+			cpu, error);
+	}
+	cpus_clear(frozen_cpus);
 }
+#endif
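
disable_nonboot_cpus()/enable_nonboot_cpus() bracket the suspend path and use cpu_hotplug_disabled to lock out concurrent cpu_up()/cpu_down() callers. A self-contained sketch of that -EBUSY gate (cpu_down_sketch and the stdio harness are invented for illustration; the real code holds cpu_add_remove_lock around the check):

#include <stdio.h>

#define EBUSY 16

static int cpu_hotplug_disabled;	/* manipulated under cpu_add_remove_lock */

static int cpu_down_sketch(unsigned int cpu)
{
	if (cpu_hotplug_disabled)	/* set by disable_nonboot_cpus() */
		return -EBUSY;
	printf("CPU%u is down\n", cpu);
	return 0;
}

int main(void)
{
	cpu_down_sketch(1);		/* succeeds */
	cpu_hotplug_disabled = 1;	/* suspend in progress */
	if (cpu_down_sketch(1) == -EBUSY)
		printf("hotplug refused during suspend\n");
	return 0;
}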
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4ea6f0dc2fc5..8c3c400cce91 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -240,7 +240,7 @@ static struct super_block *cpuset_sb;
  * A cpuset can only be deleted if both its 'count' of using tasks
  * is zero, and its list of 'children' cpusets is empty.  Since all
  * tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * least one task in the system (init), therefore, top_cpuset
  * always has either children cpusets and/or using tasks.  So we don't
  * need a special hack to ensure that top_cpuset cannot be deleted.
  *
@@ -289,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode)
 	inode->i_mode = mode;
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
@@ -913,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	int fudge;
 	int retval;
 
+	/* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = nodelist_parse(buf, trialcs.mems_allowed);
 	if (retval < 0)
@@ -1222,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 
 	task_lock(tsk);
 	oldcs = tsk->cpuset;
-	if (!oldcs) {
+	/*
+	 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
+	 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
+	 * then fail this attach_task(), to avoid breaking top_cpuset.count.
+	 */
+	if (tsk->flags & PF_EXITING) {
 		task_unlock(tsk);
 		mutex_unlock(&callback_mutex);
 		put_task_struct(tsk);
@@ -2037,33 +2045,104 @@ out:
 	return err;
 }
 
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
 /*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period.  This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
- *
- * This handles CPU hotplug (cpuhp) events.  If someday Memory
- * Nodes can be hotplugged (dynamically changing node_online_map)
- * then we should handle that too, perhaps in a similar way.
+ * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets.  If this removes the
+ * last CPU or node from a cpuset, then the guarantee_online_cpus()
+ * or guarantee_online_mems() code will use that emptied cpusets
+ * parent online CPUs or nodes.  Cpusets that were already empty of
+ * CPUs or nodes are left empty.
+ *
+ * This routine is intentionally inefficient in a couple of regards.
+ * It will check all cpusets in a subtree even if the top cpuset of
+ * the subtree has no offline CPUs or nodes.  It checks both CPUs and
+ * nodes, even though the caller could have been coded to know that
+ * only one of CPUs or nodes needed to be checked on a given call.
+ * This was done to minimize text size rather than cpu cycles.
+ *
+ * Call with both manage_mutex and callback_mutex held.
+ *
+ * Recursive, on depth of cpuset subtree.
  */
 
-#ifdef CONFIG_HOTPLUG_CPU
-static int cpuset_handle_cpuhp(struct notifier_block *nb,
-				unsigned long phase, void *cpu)
+static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+{
+	struct cpuset *c;
+
+	/* Each of our child cpusets mems must be online */
+	list_for_each_entry(c, &cur->children, sibling) {
+		guarantee_online_cpus_mems_in_subtree(c);
+		if (!cpus_empty(c->cpus_allowed))
+			guarantee_online_cpus(c, &c->cpus_allowed);
+		if (!nodes_empty(c->mems_allowed))
+			guarantee_online_mems(c, &c->mems_allowed);
+	}
+}
+
+/*
+ * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
+ * cpu_online_map and node_online_map.  Force the top cpuset to track
+ * whats online after any CPU or memory node hotplug or unplug event.
+ *
+ * To ensure that we don't remove a CPU or node from the top cpuset
+ * that is currently in use by a child cpuset (which would violate
+ * the rule that cpusets must be subsets of their parent), we first
+ * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ *
+ * Since there are two callers of this routine, one for CPU hotplug
+ * events and one for memory node hotplug events, we could have coded
+ * two separate routines here.  We code it as a single common routine
+ * in order to minimize text size.
+ */
+
+static void common_cpu_mem_hotplug_unplug(void)
 {
 	mutex_lock(&manage_mutex);
 	mutex_lock(&callback_mutex);
 
+	guarantee_online_cpus_mems_in_subtree(&top_cpuset);
 	top_cpuset.cpus_allowed = cpu_online_map;
+	top_cpuset.mems_allowed = node_online_map;
 
 	mutex_unlock(&callback_mutex);
 	mutex_unlock(&manage_mutex);
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period.  This is necessary in order to make cpusets transparent
+ * (of no affect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
+ */
 
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+				unsigned long phase, void *cpu)
+{
+	common_cpu_mem_hotplug_unplug();
 	return 0;
 }
 #endif
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Keep top_cpuset.mems_allowed tracking node_online_map.
+ * Call this routine anytime after you change node_online_map.
+ * See also the previous routine cpuset_handle_cpuhp().
+ */
+
+void cpuset_track_online_nodes()
+{
+	common_cpu_mem_hotplug_unplug();
+}
+#endif
+
 /**
  * cpuset_init_smp - initialize cpus_allowed
  *
@@ -2245,7 +2324,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	int i;
 
 	for (i = 0; zl->zones[i]; i++) {
-		int nid = zl->zones[i]->zone_pgdat->node_id;
+		int nid = zone_to_nid(zl->zones[i]);
 
 		if (node_isset(nid, current->mems_allowed))
 			return 1;
@@ -2316,9 +2395,9 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 	const struct cpuset *cs;	/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
 
-	if (in_interrupt())
+	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	node = z->zone_pgdat->node_id;
+	node = zone_to_nid(z);
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
diff --git a/kernel/exit.c b/kernel/exit.c
index d891883420f7..2e4c13cba95a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -219,7 +219,7 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
 	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
 		if (p == ignored_task
 				|| p->exit_state
-				|| p->real_parent->pid == 1)
+				|| is_init(p->real_parent))
 			continue;
 		if (process_group(p->real_parent) != pgrp
 			    && p->real_parent->signal->session == p->signal->session) {
@@ -249,17 +249,6 @@ static int has_stopped_jobs(int pgrp)
 	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
 		if (p->state != TASK_STOPPED)
 			continue;
-
-		/* If p is stopped by a debugger on a signal that won't
-		   stop it, then don't count p as stopped.  This isn't
-		   perfect but it's a good approximation.  */
-		if (unlikely (p->ptrace)
-		    && p->exit_code != SIGSTOP
-		    && p->exit_code != SIGTSTP
-		    && p->exit_code != SIGTTOU
-		    && p->exit_code != SIGTTIN)
-			continue;
-
 		retval = 1;
 		break;
 	} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
@@ -292,9 +281,7 @@ static void reparent_to_init(void)
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
 
-	if ((current->policy == SCHED_NORMAL ||
-			current->policy == SCHED_BATCH)
-				&& (task_nice(current) < 0))
+	if (!has_rt_policy(current) && (task_nice(current) < 0))
 		set_user_nice(current, 0);
 	/* cpus_allowed? */
 	/* rt_priority? */
@@ -487,6 +474,18 @@ void fastcall put_files_struct(struct files_struct *files)
 
 EXPORT_SYMBOL(put_files_struct);
 
+void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
+{
+	struct files_struct *old;
+
+	old = tsk->files;
+	task_lock(tsk);
+	tsk->files = files;
+	task_unlock(tsk);
+	put_files_struct(old);
+}
+EXPORT_SYMBOL(reset_files_struct);
+
 static inline void __exit_files(struct task_struct *tsk)
 {
 	struct files_struct * files = tsk->files;
@@ -954,15 +953,15 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
-	/* PF_DEAD causes final put_task_struct after we schedule. */
 	preempt_disable();
-	BUG_ON(tsk->flags & PF_DEAD);
-	tsk->flags |= PF_DEAD;
+	/* causes final put_task_struct in finish_task_switch(). */
+	tsk->state = TASK_DEAD;
 
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return".  */
-	for (;;) ;
+	for (;;)
+		cpu_relax();	/* For when BUG is null */
 }
 
 EXPORT_SYMBOL_GPL(do_exit);
@@ -971,7 +970,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
 {
 	if (comp)
 		complete(comp);
-	
+
 	do_exit(code);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index f9b014e3e700..1c999f3e0b47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -45,6 +45,7 @@
 #include <linux/cn_proc.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
+#include <linux/random.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -175,10 +176,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	tsk->thread_info = ti;
 	setup_thread_stack(tsk, orig);
 
+#ifdef CONFIG_CC_STACKPROTECTOR
+	tsk->stack_canary = get_random_int();
+#endif
+
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
 	atomic_set(&tsk->fs_excl, 0);
+#ifdef CONFIG_BLK_DEV_IO_TRACE
 	tsk->btrace_seq = 0;
+#endif
 	tsk->splice_pipe = NULL;
 	return tsk;
 }
@@ -1056,7 +1063,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	p->hardirqs_enabled = 1;
+#else
 	p->hardirqs_enabled = 0;
+#endif
 	p->hardirq_enable_ip = 0;
 	p->hardirq_enable_event = 0;
 	p->hardirq_disable_ip = _THIS_IP_;
@@ -1139,7 +1150,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
-
 	p->parent_exec_id = p->self_exec_id;
 
 	/* ok, now we should be set up.. */
@@ -1162,6 +1172,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
+	/* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
+	p->ioprio = current->ioprio;
+
 	/*
 	 * The task hasn't been attached yet, so its cpus_allowed mask will
 	 * not be changed, nor will its assigned CPU.
@@ -1221,11 +1234,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		}
 	}
 
-	/*
-	 * inherit ioprio
-	 */
-	p->ioprio = current->ioprio;
-
 	if (likely(p->pid)) {
 		add_parent(p);
 		if (unlikely(p->ptrace & PT_PTRACED))
diff --git a/kernel/futex.c b/kernel/futex.c
index 9d260e838cff..4b6770e9806d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -389,7 +389,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 {
 	struct task_struct *p;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	p = find_task_by_pid(pid);
 	if (!p)
 		goto out_unlock;
@@ -403,7 +403,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 	}
 	get_task_struct(p);
 out_unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	return p;
 }
@@ -1624,7 +1624,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
 		struct task_struct *p;
 
 		ret = -ESRCH;
-		read_lock(&tasklist_lock);
+		rcu_read_lock();
 		p = find_task_by_pid(pid);
 		if (!p)
 			goto err_unlock;
@@ -1633,7 +1633,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
 				!capable(CAP_SYS_PTRACE))
 			goto err_unlock;
 		head = p->robust_list;
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 	}
 
 	if (put_user(sizeof(*head), len_ptr))
@@ -1641,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
 	return put_user(head, head_ptr);
 
 err_unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	return ret;
 }
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 21c38a7e666b..d0ba190dfeb6 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -693,7 +693,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 	return t->task == NULL;
 }
 
-static long __sched nanosleep_restart(struct restart_block *restart)
+long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
 {
 	struct hrtimer_sleeper t;
 	struct timespec __user *rmtp;
@@ -702,13 +702,13 @@ static long __sched nanosleep_restart(struct restart_block *restart)
 
 	restart->fn = do_no_restart_syscall;
 
-	hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS);
-	t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
+	hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS);
+	t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
 
 	if (do_nanosleep(&t, HRTIMER_ABS))
 		return 0;
 
-	rmtp = (struct timespec __user *) restart->arg2;
+	rmtp = (struct timespec __user *) restart->arg1;
 	if (rmtp) {
 		time = ktime_sub(t.timer.expires, t.timer.base->get_time());
 		if (time.tv64 <= 0)
@@ -718,7 +718,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
 			return -EFAULT;
 	}
 
-	restart->fn = nanosleep_restart;
+	restart->fn = hrtimer_nanosleep_restart;
 
 	/* The other values in restart are already filled in */
 	return -ERESTART_RESTARTBLOCK;
@@ -751,11 +751,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	}
 
 	restart = &current_thread_info()->restart_block;
-	restart->fn = nanosleep_restart;
-	restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF;
-	restart->arg1 = t.timer.expires.tv64 >> 32;
-	restart->arg2 = (unsigned long) rmtp;
-	restart->arg3 = (unsigned long) t.timer.base->index;
+	restart->fn = hrtimer_nanosleep_restart;
+	restart->arg0 = (unsigned long) t.timer.base->index;
+	restart->arg1 = (unsigned long) rmtp;
+	restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF;
+	restart->arg3 = t.timer.expires.tv64 >> 32;
 
 	return -ERESTART_RESTARTBLOCK;
 }
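
After this reshuffle the restart_block args carry: arg0 = clock base index, arg1 = the rmtp pointer, arg2/arg3 = low/high 32 bits of the absolute expiry. This is the layout compat_clock_nanosleep_restart() in kernel/compat.c relies on when it swaps arg1. A stand-alone sketch of the 64-bit split, which only round-trips when the halves are recombined in this order:

#include <stdint.h>
#include <assert.h>

int main(void)
{
	uint64_t expires = 0x0123456789abcdefULL;
	unsigned long arg2 = expires & 0xFFFFFFFF;	/* low 32 bits */
	unsigned long arg3 = expires >> 32;		/* high 32 bits */
	uint64_t again = ((uint64_t)arg3 << 32) | (uint64_t)arg2;

	assert(again == expires);	/* lossless even if long is 32-bit */
	return 0;
}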
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ac1f850d4937..736cb0bd498f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -40,10 +40,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
 	spin_lock_irqsave(&desc->lock, flags);
 	irq_chip_set_defaults(chip);
 	desc->chip = chip;
-	/*
-	 * For compatibility only:
-	 */
-	desc->chip = chip;
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	return 0;
@@ -146,7 +142,7 @@ static void default_disable(unsigned int irq)
 	struct irq_desc *desc = irq_desc + irq;
 
 	if (!(desc->status & IRQ_DELAYED_DISABLE))
-		irq_desc[irq].chip->mask(irq);
+		desc->chip->mask(irq);
 }
 
 /*
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 48a53f68af96..4c6cdbaed661 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -154,6 +154,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
 	return retval;
 }
 
+#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
 /**
  * __do_IRQ - original all in one highlevel IRQ handler
  * @irq: the interrupt number
@@ -253,6 +254,7 @@ out:
 
 	return 1;
 }
+#endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 50087ecf337e..fcdd5d2bc3f4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -40,7 +40,7 @@ struct resource crashk_res = {
 
 int kexec_should_crash(struct task_struct *p)
 {
-	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+	if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops)
 		return 1;
 	return 0;
 }
@@ -995,7 +995,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 		image = xchg(dest_image, image);
 
 out:
-	xchg(&kexec_lock, 0); /* Release the mutex */
+	locked = xchg(&kexec_lock, 0); /* Release the mutex */
+	BUG_ON(!locked);
 	kimage_free(image);
 
 	return result;
@@ -1061,7 +1062,8 @@ void crash_kexec(struct pt_regs *regs)
 			machine_crash_shutdown(&fixed_regs);
 			machine_kexec(kexec_crash_image);
 		}
-		xchg(&kexec_lock, 0);
+		locked = xchg(&kexec_lock, 0);
+		BUG_ON(!locked);
 	}
 }
 
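
kexec_lock is a plain int used as a trylock via xchg(); the new BUG_ON(!locked) asserts that the unlocking path really did hold the lock. A user-space sketch of the same exchange-based protocol (names invented; a GCC __atomic builtin stands in for the kernel's xchg()):

#include <assert.h>

static int kexec_lock_sketch;	/* 0 = free, 1 = held */

static int try_lock(void)
{
	/* __atomic_exchange_n returns the previous value, like xchg() */
	return __atomic_exchange_n(&kexec_lock_sketch, 1, __ATOMIC_ACQUIRE) == 0;
}

static void unlock(void)
{
	int locked = __atomic_exchange_n(&kexec_lock_sketch, 0, __ATOMIC_RELEASE);
	assert(locked);		/* mirrors BUG_ON(!locked) above */
}

int main(void)
{
	assert(try_lock());	/* acquired */
	assert(!try_lock());	/* contended attempt fails without blocking */
	unlock();
	return 0;
}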
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 64ab045c3d9d..5d1d907378a2 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -122,6 +122,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
 
 	len = min(len, fifo->size - fifo->in + fifo->out);
 
+	/*
+	 * Ensure that we sample the fifo->out index -before- we
+	 * start putting bytes into the kfifo.
+	 */
+
+	smp_mb();
+
 	/* first put the data starting from fifo->in to buffer end */
 	l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
 	memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
@@ -129,6 +136,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
 	/* then put the rest (if any) at the beginning of the buffer */
 	memcpy(fifo->buffer, buffer + l, len - l);
 
+	/*
+	 * Ensure that we add the bytes to the kfifo -before-
+	 * we update the fifo->in index.
+	 */
+
+	smp_wmb();
+
 	fifo->in += len;
 
 	return len;
@@ -154,6 +168,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
 
 	len = min(len, fifo->in - fifo->out);
 
+	/*
+	 * Ensure that we sample the fifo->in index -before- we
+	 * start removing bytes from the kfifo.
+	 */
+
+	smp_rmb();
+
 	/* first get the data from fifo->out until the end of the buffer */
 	l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
 	memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
@@ -161,6 +182,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
 	/* then get the rest (if any) from the beginning of the buffer */
 	memcpy(buffer + l, fifo->buffer, len - l);
 
+	/*
+	 * Ensure that we remove the bytes from the kfifo -before-
+	 * we update the fifo->out index.
+	 */
+
+	smp_mb();
+
 	fifo->out += len;
 
 	return len;
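
These barriers make the lockless single-producer/single-consumer use of __kfifo_put()/__kfifo_get() safe: the producer must publish the bytes before advancing fifo->in, and the consumer must read fifo->in before reading the bytes (mirror-image for fifo->out). A hedged restatement with C11 atomics, where release/acquire pairs play the role of the smp_wmb()/smp_rmb()/smp_mb() calls above (the ring type and harness are invented for illustration):

#include <stdatomic.h>

#define RING_SIZE 16	/* power of two, as kfifo requires */

struct ring {
	unsigned char buf[RING_SIZE];
	atomic_uint in;		/* advanced only by the producer */
	atomic_uint out;	/* advanced only by the consumer */
};

static int ring_put(struct ring *r, unsigned char c)
{
	unsigned int in = atomic_load_explicit(&r->in, memory_order_relaxed);
	unsigned int out = atomic_load_explicit(&r->out, memory_order_acquire);

	if (in - out == RING_SIZE)
		return 0;			/* full */
	r->buf[in & (RING_SIZE - 1)] = c;	/* write the data first... */
	/* ...then publish the index (release pairs with the smp_wmb()) */
	atomic_store_explicit(&r->in, in + 1, memory_order_release);
	return 1;
}

static int ring_get(struct ring *r, unsigned char *c)
{
	unsigned int out = atomic_load_explicit(&r->out, memory_order_relaxed);
	unsigned int in = atomic_load_explicit(&r->in, memory_order_acquire);

	if (in == out)
		return 0;			/* empty */
	*c = r->buf[out & (RING_SIZE - 1)];	/* read data after the index */
	atomic_store_explicit(&r->out, out + 1, memory_order_release);
	return 1;
}

int main(void)
{
	static struct ring r;
	unsigned char c;

	ring_put(&r, 'x');
	return (ring_get(&r, &c) && c == 'x') ? 0 : 1;
}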
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5c470c57fb57..842f8015d7fd 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -176,6 +176,8 @@ static int wait_for_helper(void *data)
 	if (pid < 0) {
 		sub_info->retval = pid;
 	} else {
+		int ret;
+
 		/*
 		 * Normally it is bogus to call wait4() from in-kernel because
 		 * wait4() wants to write the exit code to a userspace address.
@@ -185,7 +187,15 @@ static int wait_for_helper(void *data)
 		 *
 		 * Thus the __user pointer cast is valid here.
 		 */
-		sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL);
+		sys_wait4(pid, (int __user *)&ret, 0, NULL);
+
+		/*
+		 * If ret is 0, either ____call_usermodehelper failed and the
+		 * real error code is already in sub_info->retval or
+		 * sub_info->retval is 0 anyway, so don't mess with it then.
+		 */
+		if (ret)
+			sub_info->retval = ret;
 	}
 
 	complete(sub_info->complete);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9bad17884513..e596525669ed 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -36,6 +36,7 @@
 #include <linux/stacktrace.h>
 #include <linux/debug_locks.h>
 #include <linux/irqflags.h>
+#include <linux/utsname.h>
 
 #include <asm/sections.h>
 
@@ -121,8 +122,8 @@ static struct list_head chainhash_table[CHAINHASH_SIZE];
  * unique.
  */
 #define iterate_chain_key(key1, key2) \
-	(((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \
-	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \
+	(((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
+	((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
 	(key2))
 
 void lockdep_off(void)
@@ -224,7 +225,14 @@ static int save_trace(struct stack_trace *trace)
 	trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
 	trace->entries = stack_trace + nr_stack_trace_entries;
 
-	save_stack_trace(trace, NULL, 0, 3);
+	trace->skip = 3;
+	trace->all_contexts = 0;
+
+	/* Make sure to not recurse in case the unwinder needs to take
+	   locks. */
+	lockdep_off();
+	save_stack_trace(trace, NULL);
+	lockdep_on();
 
 	trace->max_entries = trace->nr_entries;
 
@@ -508,6 +516,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
 	return 0;
 }
 
+static void print_kernel_version(void)
+{
+	printk("%s %.*s\n", system_utsname.release,
+		(int)strcspn(system_utsname.version, " "),
+		system_utsname.version);
+}
+
 /*
  * When a circular dependency is detected, print the
  * header first:
@@ -524,6 +539,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
 
 	printk("\n=======================================================\n");
 	printk(  "[ INFO: possible circular locking dependency detected ]\n");
+	print_kernel_version();
 	printk(  "-------------------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, curr->pid);
@@ -705,6 +721,7 @@ print_bad_irq_dependency(struct task_struct *curr,
 	printk("\n======================================================\n");
 	printk(  "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
 		irqclass, irqclass);
+	print_kernel_version();
 	printk(  "------------------------------------------------------\n");
 	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
 		curr->comm, curr->pid,
@@ -786,6 +803,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
 
 	printk("\n=============================================\n");
 	printk(  "[ INFO: possible recursive locking detected ]\n");
+	print_kernel_version();
 	printk(  "---------------------------------------------\n");
 	printk("%s/%d is trying to acquire lock:\n",
 		curr->comm, curr->pid);
@@ -1368,6 +1386,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
 
 	printk("\n=========================================================\n");
 	printk(  "[ INFO: possible irq lock inversion dependency detected ]\n");
+	print_kernel_version();
 	printk(  "---------------------------------------------------------\n");
 	printk("%s/%d just changed the state of lock:\n",
 		curr->comm, curr->pid);
@@ -1462,6 +1481,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
 
 	printk("\n=================================\n");
 	printk(  "[ INFO: inconsistent lock state ]\n");
+	print_kernel_version();
 	printk(  "---------------------------------\n");
 
 	printk("inconsistent {%s} -> {%s} usage.\n",
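
The iterate_chain_key() fix doubles the rotate distance, so each class key folded into the 64-bit chain key is spread across the full word instead of only half of it. A stand-alone sketch of the mix (the MAX_LOCKDEP_KEYS_BITS value is assumed here for illustration):

#include <stdint.h>
#include <stdio.h>

#define MAX_LOCKDEP_KEYS_BITS 13	/* assumed, per lockdep.h of the era */

static uint64_t iterate_chain_key(uint64_t key1, uint64_t key2)
{
	/* rotate key1 left by MAX_LOCKDEP_KEYS_BITS, then xor in key2 */
	return ((key1 << MAX_LOCKDEP_KEYS_BITS) ^
		(key1 >> (64 - MAX_LOCKDEP_KEYS_BITS)) ^
		key2);
}

int main(void)
{
	uint64_t chain = 0;

	/* folding two lock-class keys into a chain hash, as lockdep does */
	chain = iterate_chain_key(chain, 0x1111);
	chain = iterate_chain_key(chain, 0x2222);
	printf("chain key: %#llx\n", (unsigned long long)chain);
	return 0;
}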
diff --git a/kernel/module.c b/kernel/module.c
index 2a19cd47c046..05625d5dc758 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -933,6 +933,15 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
 	return sprintf(buf, "0x%lx\n", sattr->address);
 }
 
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+	int section;
+
+	for (section = 0; section < sect_attrs->nsections; section++)
+		kfree(sect_attrs->attrs[section].name);
+	kfree(sect_attrs);
+}
+
 static void add_sect_attrs(struct module *mod, unsigned int nsect,
 		char *secstrings, Elf_Shdr *sechdrs)
 {
@@ -949,21 +958,26 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 			+ nloaded * sizeof(sect_attrs->attrs[0]),
 			sizeof(sect_attrs->grp.attrs[0]));
 	size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
-	if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL)))
+	sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+	if (sect_attrs == NULL)
 		return;
 
 	/* Setup section attributes. */
 	sect_attrs->grp.name = "sections";
 	sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
 
+	sect_attrs->nsections = 0;
 	sattr = &sect_attrs->attrs[0];
 	gattr = &sect_attrs->grp.attrs[0];
 	for (i = 0; i < nsect; i++) {
 		if (! (sechdrs[i].sh_flags & SHF_ALLOC))
 			continue;
 		sattr->address = sechdrs[i].sh_addr;
-		strlcpy(sattr->name, secstrings + sechdrs[i].sh_name,
-			MODULE_SECT_NAME_LEN);
+		sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
+					GFP_KERNEL);
+		if (sattr->name == NULL)
+			goto out;
+		sect_attrs->nsections++;
 		sattr->mattr.show = module_sect_show;
 		sattr->mattr.store = NULL;
 		sattr->mattr.attr.name = sattr->name;
@@ -979,7 +993,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 	mod->sect_attrs = sect_attrs;
 	return;
   out:
-	kfree(sect_attrs);
+	free_sect_attrs(sect_attrs);
 }
 
 static void remove_sect_attrs(struct module *mod)
@@ -989,13 +1003,13 @@ static void remove_sect_attrs(struct module *mod)
 			   &mod->sect_attrs->grp);
 		/* We are positive that no one is using any sect attrs
 		 * at this point.  Deallocate immediately. */
-		kfree(mod->sect_attrs);
+		free_sect_attrs(mod->sect_attrs);
 		mod->sect_attrs = NULL;
 	}
 }
 
-
 #else
+
 static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
 		char *sectstrings, Elf_Shdr *sechdrs)
 {
@@ -1054,6 +1068,12 @@ static int mod_sysfs_setup(struct module *mod,
 {
 	int err;
 
+	if (!module_subsys.kset.subsys) {
+		printk(KERN_ERR "%s: module_subsys not initialized\n",
+		       mod->name);
+		err = -EINVAL;
+		goto out;
+	}
 	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
 	err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
 	if (err)
diff --git a/kernel/panic.c b/kernel/panic.c
index 8010b9b17aca..525e365f7239 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -270,3 +270,15 @@ void oops_exit(void)
 {
 	do_oops_enter_exit();
 }
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+/*
+ * Called when gcc's -fstack-protector feature is used, and
+ * gcc detects corruption of the on-stack canary value
+ */
+void __stack_chk_fail(void)
+{
+	panic("stack-protector: Kernel stack is corrupted");
+}
+EXPORT_SYMBOL(__stack_chk_fail);
+#endif
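
The compiler emits the call: with -fstack-protector, gcc plants a canary in each protected frame and calls __stack_chk_fail() when the epilogue check fails; panic.c only has to supply that hook. A hedged sketch of what the instrumentation conceptually expands to (the stack layout is illustrative, and the names carry a _sketch suffix to avoid clashing with the real runtime symbols):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned long guard_sketch = 0xdeadbeefUL;	/* canary value */

static void stack_chk_fail_sketch(void)
{
	fprintf(stderr, "stack-protector: stack is corrupted\n");
	abort();	/* the kernel's hook calls panic() instead */
}

static void protected_function_as_compiled(const char *arg)
{
	unsigned long canary = guard_sketch;	/* prologue: stash canary */
	char buf[8];

	strcpy(buf, arg);			/* potential overflow */

	if (canary != guard_sketch)		/* epilogue: verify */
		stack_chk_fail_sketch();	/* never returns */
}

int main(void)
{
	protected_function_as_compiled("ok");
	return 0;
}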
diff --git a/kernel/params.c b/kernel/params.c
index 91aea7aa532e..f406655d6653 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -547,6 +547,7 @@ static void __init kernel_param_sysfs_setup(const char *name,
 					     unsigned int name_skip)
 {
 	struct module_kobject *mk;
+	int ret;
 
 	mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
 	BUG_ON(!mk);
@@ -554,7 +555,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
 	mk->mod = THIS_MODULE;
 	kobj_set_kset_s(mk, module_subsys);
 	kobject_set_name(&mk->kobj, name);
-	kobject_register(&mk->kobj);
+	ret = kobject_register(&mk->kobj);
+	BUG_ON(ret < 0);
 
 	/* no need to keep the kobject if no parameter is exported */
 	if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
@@ -684,13 +686,20 @@ decl_subsys(module, &module_ktype, NULL);
  */
 static int __init param_sysfs_init(void)
 {
-	subsystem_register(&module_subsys);
+	int ret;
+
+	ret = subsystem_register(&module_subsys);
+	if (ret < 0) {
+		printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n",
+			__FILE__, __LINE__, ret);
+		return ret;
+	}
 
 	param_sysfs_builtin();
 
 	return 0;
 }
-__initcall(param_sysfs_init);
+subsys_initcall(param_sysfs_init);
 
 EXPORT_SYMBOL(param_set_byte);
 EXPORT_SYMBOL(param_get_byte);
diff --git a/kernel/pid.c b/kernel/pid.c
index 93e212f20671..8387e8c68193 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -223,9 +223,6 @@ int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
223 struct pid_link *link; 223 struct pid_link *link;
224 struct pid *pid; 224 struct pid *pid;
225 225
226 WARN_ON(!task->pid); /* to be removed soon */
227 WARN_ON(!nr); /* to be removed soon */
228
229 link = &task->pids[type]; 226 link = &task->pids[type];
230 link->pid = pid = find_pid(nr); 227 link->pid = pid = find_pid(nr);
231 hlist_add_head_rcu(&link->node, &pid->tasks[type]); 228 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
@@ -252,6 +249,15 @@ void fastcall detach_pid(struct task_struct *task, enum pid_type type)
252 free_pid(pid); 249 free_pid(pid);
253} 250}
254 251
252/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
253void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
254 enum pid_type type)
255{
256 new->pids[type].pid = old->pids[type].pid;
257 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
258 old->pids[type].pid = NULL;
259}
260
255struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) 261struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
256{ 262{
257 struct task_struct *result = NULL; 263 struct task_struct *result = NULL;
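
transfer_pid() replaces an attach_pid(new) + detach_pid(old) pair with a single in-place node swap via hlist_replace_rcu(), so the hash chain is never observed without an entry. A minimal userspace model of the replace-in-place idea, using a plain doubly linked list instead of the kernel's RCU-protected hlist (all names are illustrative):

#include <stdio.h>

struct node {
        struct node *prev, *next;
        const char *owner;
};

/* Splice 'new' into the position currently held by 'old'. */
static void list_replace(struct node *old, struct node *new)
{
        new->prev = old->prev;
        new->next = old->next;
        if (new->prev)
                new->prev->next = new;
        if (new->next)
                new->next->prev = new;
        old->prev = old->next = NULL;   /* old node is no longer linked */
}

int main(void)
{
        struct node head = { .owner = "head" };
        struct node old  = { .owner = "old task" };
        struct node new  = { .owner = "new task" };

        head.next = &old;
        old.prev = &head;

        list_replace(&old, &new);
        printf("head now points at: %s\n", head.next->owner);
        return 0;
}
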
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index d38d9ec3276c..479b16b44f79 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1393,25 +1393,13 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1393 } 1393 }
1394} 1394}
1395 1395
1396static long posix_cpu_clock_nanosleep_restart(struct restart_block *); 1396static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1397 1397 struct timespec *rqtp, struct itimerspec *it)
1398int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1399 struct timespec *rqtp, struct timespec __user *rmtp)
1400{ 1398{
1401 struct restart_block *restart_block =
1402 &current_thread_info()->restart_block;
1403 struct k_itimer timer; 1399 struct k_itimer timer;
1404 int error; 1400 int error;
1405 1401
1406 /* 1402 /*
1407 * Diagnose required errors first.
1408 */
1409 if (CPUCLOCK_PERTHREAD(which_clock) &&
1410 (CPUCLOCK_PID(which_clock) == 0 ||
1411 CPUCLOCK_PID(which_clock) == current->pid))
1412 return -EINVAL;
1413
1414 /*
1415 * Set up a temporary timer and then wait for it to go off. 1403 * Set up a temporary timer and then wait for it to go off.
1416 */ 1404 */
1417 memset(&timer, 0, sizeof timer); 1405 memset(&timer, 0, sizeof timer);
@@ -1422,11 +1410,12 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1422 timer.it_process = current; 1410 timer.it_process = current;
1423 if (!error) { 1411 if (!error) {
1424 static struct itimerspec zero_it; 1412 static struct itimerspec zero_it;
1425 struct itimerspec it = { .it_value = *rqtp, 1413
1426 .it_interval = {} }; 1414 memset(it, 0, sizeof *it);
1415 it->it_value = *rqtp;
1427 1416
1428 spin_lock_irq(&timer.it_lock); 1417 spin_lock_irq(&timer.it_lock);
1429 error = posix_cpu_timer_set(&timer, flags, &it, NULL); 1418 error = posix_cpu_timer_set(&timer, flags, it, NULL);
1430 if (error) { 1419 if (error) {
1431 spin_unlock_irq(&timer.it_lock); 1420 spin_unlock_irq(&timer.it_lock);
1432 return error; 1421 return error;
@@ -1454,49 +1443,89 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1454 * We were interrupted by a signal. 1443 * We were interrupted by a signal.
1455 */ 1444 */
1456 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); 1445 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
1457 posix_cpu_timer_set(&timer, 0, &zero_it, &it); 1446 posix_cpu_timer_set(&timer, 0, &zero_it, it);
1458 spin_unlock_irq(&timer.it_lock); 1447 spin_unlock_irq(&timer.it_lock);
1459 1448
1460 if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { 1449 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
1461 /* 1450 /*
1462 * It actually did fire already. 1451 * It actually did fire already.
1463 */ 1452 */
1464 return 0; 1453 return 0;
1465 } 1454 }
1466 1455
1456 error = -ERESTART_RESTARTBLOCK;
1457 }
1458
1459 return error;
1460}
1461
1462int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1463 struct timespec *rqtp, struct timespec __user *rmtp)
1464{
1465 struct restart_block *restart_block =
1466 &current_thread_info()->restart_block;
1467 struct itimerspec it;
1468 int error;
1469
1470 /*
1471 * Diagnose required errors first.
1472 */
1473 if (CPUCLOCK_PERTHREAD(which_clock) &&
1474 (CPUCLOCK_PID(which_clock) == 0 ||
1475 CPUCLOCK_PID(which_clock) == current->pid))
1476 return -EINVAL;
1477
1478 error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
1479
1480 if (error == -ERESTART_RESTARTBLOCK) {
1481
1482 if (flags & TIMER_ABSTIME)
1483 return -ERESTARTNOHAND;
1467 /* 1484 /*
1468 * Report back to the user the time still remaining. 1485 * Report back to the user the time still remaining.
1469 */ 1486 */
1470 if (rmtp != NULL && !(flags & TIMER_ABSTIME) && 1487 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1471 copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1472 return -EFAULT; 1488 return -EFAULT;
1473 1489
1474 restart_block->fn = posix_cpu_clock_nanosleep_restart; 1490 restart_block->fn = posix_cpu_nsleep_restart;
1475 /* Caller already set restart_block->arg1 */
1476 restart_block->arg0 = which_clock; 1491 restart_block->arg0 = which_clock;
1477 restart_block->arg1 = (unsigned long) rmtp; 1492 restart_block->arg1 = (unsigned long) rmtp;
1478 restart_block->arg2 = rqtp->tv_sec; 1493 restart_block->arg2 = rqtp->tv_sec;
1479 restart_block->arg3 = rqtp->tv_nsec; 1494 restart_block->arg3 = rqtp->tv_nsec;
1480
1481 error = -ERESTART_RESTARTBLOCK;
1482 } 1495 }
1483
1484 return error; 1496 return error;
1485} 1497}
1486 1498
1487static long 1499long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1488posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
1489{ 1500{
1490 clockid_t which_clock = restart_block->arg0; 1501 clockid_t which_clock = restart_block->arg0;
1491 struct timespec __user *rmtp; 1502 struct timespec __user *rmtp;
1492 struct timespec t; 1503 struct timespec t;
1504 struct itimerspec it;
1505 int error;
1493 1506
1494 rmtp = (struct timespec __user *) restart_block->arg1; 1507 rmtp = (struct timespec __user *) restart_block->arg1;
1495 t.tv_sec = restart_block->arg2; 1508 t.tv_sec = restart_block->arg2;
1496 t.tv_nsec = restart_block->arg3; 1509 t.tv_nsec = restart_block->arg3;
1497 1510
1498 restart_block->fn = do_no_restart_syscall; 1511 restart_block->fn = do_no_restart_syscall;
1499 return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp); 1512 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1513
1514 if (error == -ERESTART_RESTARTBLOCK) {
1515 /*
1516 * Report back to the user the time still remaining.
1517 */
1518 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1519 return -EFAULT;
1520
1521 restart_block->fn = posix_cpu_nsleep_restart;
1522 restart_block->arg0 = which_clock;
1523 restart_block->arg1 = (unsigned long) rmtp;
1524 restart_block->arg2 = t.tv_sec;
1525 restart_block->arg3 = t.tv_nsec;
1526 }
1527 return error;
1528
1500} 1529}
1501 1530
1502 1531
@@ -1524,6 +1553,10 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1524{ 1553{
1525 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); 1554 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
1526} 1555}
1556static long process_cpu_nsleep_restart(struct restart_block *restart_block)
1557{
1558 return -EINVAL;
1559}
1527static int thread_cpu_clock_getres(const clockid_t which_clock, 1560static int thread_cpu_clock_getres(const clockid_t which_clock,
1528 struct timespec *tp) 1561 struct timespec *tp)
1529{ 1562{
@@ -1544,6 +1577,10 @@ static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
1544{ 1577{
1545 return -EINVAL; 1578 return -EINVAL;
1546} 1579}
1580static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
1581{
1582 return -EINVAL;
1583}
1547 1584
1548static __init int init_posix_cpu_timers(void) 1585static __init int init_posix_cpu_timers(void)
1549{ 1586{
@@ -1553,6 +1590,7 @@ static __init int init_posix_cpu_timers(void)
1553 .clock_set = do_posix_clock_nosettime, 1590 .clock_set = do_posix_clock_nosettime,
1554 .timer_create = process_cpu_timer_create, 1591 .timer_create = process_cpu_timer_create,
1555 .nsleep = process_cpu_nsleep, 1592 .nsleep = process_cpu_nsleep,
1593 .nsleep_restart = process_cpu_nsleep_restart,
1556 }; 1594 };
1557 struct k_clock thread = { 1595 struct k_clock thread = {
1558 .clock_getres = thread_cpu_clock_getres, 1596 .clock_getres = thread_cpu_clock_getres,
@@ -1560,6 +1598,7 @@ static __init int init_posix_cpu_timers(void)
1560 .clock_set = do_posix_clock_nosettime, 1598 .clock_set = do_posix_clock_nosettime,
1561 .timer_create = thread_cpu_timer_create, 1599 .timer_create = thread_cpu_timer_create,
1562 .nsleep = thread_cpu_nsleep, 1600 .nsleep = thread_cpu_nsleep,
1601 .nsleep_restart = thread_cpu_nsleep_restart,
1563 }; 1602 };
1564 1603
1565 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1604 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
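
The restructuring above separates the sleep itself (do_cpu_nanosleep) from the restart bookkeeping, and a restarted sleep always resumes against an absolute deadline so no time is lost across signals. A hedged userspace sketch of that restart pattern using the POSIX clock_nanosleep() interface; struct restart_info is a made-up stand-in for the kernel's restart_block:

#include <errno.h>
#include <stdio.h>
#include <time.h>

struct restart_info {
        long (*fn)(struct restart_info *);
        struct timespec deadline;       /* absolute wake-up time */
};

static long sleep_until(struct restart_info *ri)
{
        /* TIMER_ABSTIME makes the restart idempotent: re-sleeping to
         * the same absolute deadline cannot oversleep. */
        return clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
                               &ri->deadline, NULL);
}

int main(void)
{
        struct restart_info ri;

        clock_gettime(CLOCK_MONOTONIC, &ri.deadline);
        ri.deadline.tv_sec += 1;        /* 1s relative -> absolute */
        ri.fn = sleep_until;

        while (ri.fn(&ri) == EINTR)     /* the restart-block loop */
                ;
        puts("deadline reached");
        return 0;
}
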
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ac6dc8744429..e5ebcc1ec3a0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -973,3 +973,24 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags,
973 return CLOCK_DISPATCH(which_clock, nsleep, 973 return CLOCK_DISPATCH(which_clock, nsleep,
974 (which_clock, flags, &t, rmtp)); 974 (which_clock, flags, &t, rmtp));
975} 975}
976
977/*
978 * nanosleep_restart for monotonic and realtime clocks
979 */
980static int common_nsleep_restart(struct restart_block *restart_block)
981{
982 return hrtimer_nanosleep_restart(restart_block);
983}
984
985/*
986 * This will restart clock_nanosleep. This is required only by
987 * compat_clock_nanosleep_restart for now.
988 */
989long
990clock_nanosleep_restart(struct restart_block *restart_block)
991{
992 clockid_t which_clock = restart_block->arg0;
993
994 return CLOCK_DISPATCH(which_clock, nsleep_restart,
995 (restart_block));
996}
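
clock_nanosleep_restart() is a thin dispatcher: the clock id saved in the restart block selects that clock's nsleep_restart hook, which is exactly the slot the posix-cpu-timers hunks above populate. A toy model of such a dispatch table, with names loosely modeled on struct k_clock:

#include <errno.h>
#include <stdio.h>

struct restart_blk { int clock_id; };

struct clock_ops {
        long (*nsleep_restart)(struct restart_blk *);
};

static long monotonic_restart(struct restart_blk *rb)
{
        printf("restarting sleep on clock %d\n", rb->clock_id);
        return 0;
}

static long no_restart(struct restart_blk *rb)
{
        (void)rb;
        return -EINVAL;                 /* this clock cannot be restarted */
}

static struct clock_ops clock_table[] = {
        { .nsleep_restart = monotonic_restart },
        { .nsleep_restart = no_restart },
};

static long clock_nanosleep_restart_sketch(struct restart_blk *rb)
{
        return clock_table[rb->clock_id].nsleep_restart(rb);
}

int main(void)
{
        struct restart_blk rb = { .clock_id = 0 };
        return (int)clock_nanosleep_restart_sketch(&rb);
}
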
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 619ecabf7c58..825068ca3479 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,6 +36,17 @@ config PM_DEBUG
36 code. This is helpful when debugging and reporting various PM bugs, 36 code. This is helpful when debugging and reporting various PM bugs,
37 like suspend support. 37 like suspend support.
38 38
39config DISABLE_CONSOLE_SUSPEND
40 bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
41 depends on PM && PM_DEBUG
42 default n
43 ---help---
44 This option turns off the console suspend mechanism that prevents
45 debug messages from reaching the console during the suspend/resume
46 operations. This may be helpful when debugging device drivers'
47 suspend/resume routines, but may itself lead to problems, for example
48 if netconsole is used.
49
39config PM_TRACE 50config PM_TRACE
40 bool "Suspend/resume event tracing" 51 bool "Suspend/resume event tracing"
41 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL 52 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
@@ -53,6 +64,17 @@ config PM_TRACE
53 CAUTION: this option will cause your machine's real-time clock to be 64 CAUTION: this option will cause your machine's real-time clock to be
54 set to an invalid time after a resume. 65 set to an invalid time after a resume.
55 66
67config PM_SYSFS_DEPRECATED
68 bool "Driver model /sys/devices/.../power/state files (DEPRECATED)"
69 depends on PM && SYSFS
70 default n
71 help
72 The driver model started out with a sysfs file intended to provide
73 a userspace hook for device power management. This feature has never
74 worked very well, except for limited testing purposes, and so it will
75 be removed. It's not clear that a generic mechanism could really
76 handle the wide variability of device power states; any replacements
77 are likely to be bus or driver specific.
56 78
57config SOFTWARE_SUSPEND 79config SOFTWARE_SUSPEND
58 bool "Software Suspend" 80 bool "Software Suspend"
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 8d0af3d37a4b..38725f526afc 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -7,6 +7,4 @@ obj-y := main.o process.o console.o
7obj-$(CONFIG_PM_LEGACY) += pm.o 7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o 8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
9 9
10obj-$(CONFIG_SUSPEND_SMP) += smp.o
11
12obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 10obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index e13e74067845..d72234942798 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -18,6 +18,7 @@
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/cpu.h>
21 22
22#include "power.h" 23#include "power.h"
23 24
@@ -72,7 +73,10 @@ static int prepare_processes(void)
72 int error; 73 int error;
73 74
74 pm_prepare_console(); 75 pm_prepare_console();
75 disable_nonboot_cpus(); 76
77 error = disable_nonboot_cpus();
78 if (error)
79 goto enable_cpus;
76 80
77 if (freeze_processes()) { 81 if (freeze_processes()) {
78 error = -EBUSY; 82 error = -EBUSY;
@@ -84,6 +88,7 @@ static int prepare_processes(void)
84 return 0; 88 return 0;
85thaw: 89thaw:
86 thaw_processes(); 90 thaw_processes();
91enable_cpus:
87 enable_nonboot_cpus(); 92 enable_nonboot_cpus();
88 pm_restore_console(); 93 pm_restore_console();
89 return error; 94 return error;
@@ -98,7 +103,7 @@ static void unprepare_processes(void)
98} 103}
99 104
100/** 105/**
101 * pm_suspend_disk - The granpappy of power management. 106 * pm_suspend_disk - The granpappy of hibernation power management.
102 * 107 *
103 * If we're going through the firmware, then get it over with quickly. 108 * If we're going through the firmware, then get it over with quickly.
104 * 109 *
@@ -207,7 +212,7 @@ static int software_resume(void)
207 212
208 pr_debug("PM: Preparing devices for restore.\n"); 213 pr_debug("PM: Preparing devices for restore.\n");
209 214
210 if ((error = device_suspend(PMSG_FREEZE))) { 215 if ((error = device_suspend(PMSG_PRETHAW))) {
211 printk("Some devices failed to suspend\n"); 216 printk("Some devices failed to suspend\n");
212 swsusp_free(); 217 swsusp_free();
213 goto Thaw; 218 goto Thaw;
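
Because disable_nonboot_cpus() can now fail, prepare_processes() gains a second unwind label, and each completed step is undone in reverse order on the error path. A generic sketch of this goto-unwind idiom (the step names are placeholders):

#include <stdio.h>

static int step(const char *name, int fail)
{
        printf("%s\n", name);
        return fail ? -1 : 0;
}

static int prepare(void)
{
        int error;

        error = step("disable nonboot cpus", 0);
        if (error)
                goto enable_cpus;               /* nothing else to undo */

        error = step("freeze processes", 1);    /* simulate a failure */
        if (error)
                goto thaw;

        return 0;
thaw:
        step("thaw processes", 0);
enable_cpus:
        step("enable nonboot cpus", 0);
        return error;
}

int main(void)
{
        return prepare() ? 1 : 0;
}
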
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6d295c776794..873228c71dab 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -16,6 +16,8 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/pm.h> 17#include <linux/pm.h>
18#include <linux/console.h> 18#include <linux/console.h>
19#include <linux/cpu.h>
20#include <linux/resume-trace.h>
19 21
20#include "power.h" 22#include "power.h"
21 23
@@ -51,7 +53,7 @@ void pm_set_ops(struct pm_ops * ops)
51 53
52static int suspend_prepare(suspend_state_t state) 54static int suspend_prepare(suspend_state_t state)
53{ 55{
54 int error = 0; 56 int error;
55 unsigned int free_pages; 57 unsigned int free_pages;
56 58
57 if (!pm_ops || !pm_ops->enter) 59 if (!pm_ops || !pm_ops->enter)
@@ -59,12 +61,9 @@ static int suspend_prepare(suspend_state_t state)
59 61
60 pm_prepare_console(); 62 pm_prepare_console();
61 63
62 disable_nonboot_cpus(); 64 error = disable_nonboot_cpus();
63 65 if (error)
64 if (num_online_cpus() != 1) {
65 error = -EPERM;
66 goto Enable_cpu; 66 goto Enable_cpu;
67 }
68 67
69 if (freeze_processes()) { 68 if (freeze_processes()) {
70 error = -EAGAIN; 69 error = -EAGAIN;
@@ -283,10 +282,39 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
283 282
284power_attr(state); 283power_attr(state);
285 284
285#ifdef CONFIG_PM_TRACE
286int pm_trace_enabled;
287
288static ssize_t pm_trace_show(struct subsystem * subsys, char * buf)
289{
290 return sprintf(buf, "%d\n", pm_trace_enabled);
291}
292
293static ssize_t
294pm_trace_store(struct subsystem * subsys, const char * buf, size_t n)
295{
296 int val;
297
298 if (sscanf(buf, "%d", &val) == 1) {
299 pm_trace_enabled = !!val;
300 return n;
301 }
302 return -EINVAL;
303}
304
305power_attr(pm_trace);
306
307static struct attribute * g[] = {
308 &state_attr.attr,
309 &pm_trace_attr.attr,
310 NULL,
311};
312#else
286static struct attribute * g[] = { 313static struct attribute * g[] = {
287 &state_attr.attr, 314 &state_attr.attr,
288 NULL, 315 NULL,
289}; 316};
317#endif /* CONFIG_PM_TRACE */
290 318
291static struct attribute_group attr_group = { 319static struct attribute_group attr_group = {
292 .attrs = g, 320 .attrs = g,
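
pm_trace_store() above normalizes any nonzero write to 1 with the !!val idiom and returns the byte count on success, -EINVAL otherwise. A userspace model of the same parse-and-clamp step (store_flag is a hypothetical stand-in, not the kernel function):

#include <errno.h>
#include <stdio.h>

static int trace_enabled;

/* Parse an integer, clamp it to 0/1, report the bytes consumed. */
static long store_flag(const char *buf, long n)
{
        int val;

        if (sscanf(buf, "%d", &val) == 1) {
                trace_enabled = !!val;  /* any nonzero value becomes 1 */
                return n;
        }
        return -EINVAL;
}

int main(void)
{
        store_flag("7\n", 2);
        printf("trace_enabled = %d\n", trace_enabled);
        return 0;
}
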
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 57a792982fb9..bfe999f7b272 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,8 +38,6 @@ extern struct subsystem power_subsys;
38/* References to section boundaries */ 38/* References to section boundaries */
39extern const void __nosave_begin, __nosave_end; 39extern const void __nosave_begin, __nosave_end;
40 40
41extern struct pbe *pagedir_nosave;
42
43/* Preferred image size in bytes (default 500 MB) */ 41/* Preferred image size in bytes (default 500 MB) */
44extern unsigned long image_size; 42extern unsigned long image_size;
45extern int in_suspend; 43extern int in_suspend;
@@ -50,21 +48,62 @@ extern asmlinkage int swsusp_arch_resume(void);
50 48
51extern unsigned int count_data_pages(void); 49extern unsigned int count_data_pages(void);
52 50
51/**
52 * Auxiliary structure used for reading the snapshot image data and
53 * metadata from and writing them to the list of page backup entries
54 * (PBEs) which is the main data structure of swsusp.
55 *
56 * Using struct snapshot_handle we can transfer the image, including its
57 * metadata, as a continuous sequence of bytes with the help of
58 * snapshot_read_next() and snapshot_write_next().
59 *
60 * The code that writes the image to a storage or transfers it to
61 * the user land is required to use snapshot_read_next() for this
62 * purpose and it should not make any assumptions regarding the internal
63 * structure of the image. Similarly, the code that reads the image from
64 * a storage or transfers it from the user land is required to use
65 * snapshot_write_next().
66 *
67 * This may allow us to change the internal structure of the image
68 * in the future with considerably less effort.
69 */
70
53struct snapshot_handle { 71struct snapshot_handle {
54 loff_t offset; 72 loff_t offset; /* number of the last byte ready for reading
55 unsigned int page; 73 * or writing in the sequence
56 unsigned int page_offset; 74 */
57 unsigned int prev; 75 unsigned int cur; /* number of the block of PAGE_SIZE bytes the
58 struct pbe *pbe, *last_pbe; 76 * next operation will refer to (ie. current)
59 void *buffer; 77 */
60 unsigned int buf_offset; 78 unsigned int cur_offset; /* offset with respect to the current
79 * block (for the next operation)
80 */
81 unsigned int prev; /* number of the block of PAGE_SIZE bytes that
82 * was the current one previously
83 */
84 void *buffer; /* address of the block to read from
85 * or write to
86 */
87 unsigned int buf_offset; /* location to read from or write to,
88 * given as a displacement from 'buffer'
89 */
90 int sync_read; /* Set to one to notify the caller of
91 * snapshot_write_next() that it may
92 * need to call wait_on_bio_chain()
93 */
61}; 94};
62 95
96/* This macro returns the address from/to which the caller of
97 * snapshot_read_next()/snapshot_write_next() is allowed to
98 * read/write data after the function returns
99 */
63#define data_of(handle) ((handle).buffer + (handle).buf_offset) 100#define data_of(handle) ((handle).buffer + (handle).buf_offset)
64 101
102extern unsigned int snapshot_additional_pages(struct zone *zone);
65extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 103extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
66extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 104extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
67int snapshot_image_loaded(struct snapshot_handle *handle); 105extern int snapshot_image_loaded(struct snapshot_handle *handle);
106extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
68 107
69#define SNAPSHOT_IOC_MAGIC '3' 108#define SNAPSHOT_IOC_MAGIC '3'
70#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) 109#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
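
Callers of snapshot_read_next()/snapshot_write_next() are meant to treat the image as an opaque byte stream while the handle tracks the current block and offset. A userspace sketch of that position arithmetic, assuming a 4096-byte page; the field names mirror struct snapshot_handle, but the kernel version also swaps the data buffer per block:

#include <stdio.h>

#define PAGE_SIZE 4096

struct handle {
        long offset;                    /* bytes handed out so far */
        unsigned int cur;               /* current PAGE_SIZE block */
        unsigned int cur_offset;        /* offset inside that block */
};

/* Clamp 'count' to the current block and advance the position,
 * mimicking the arithmetic snapshot_read_next() uses further down. */
static long read_next(struct handle *h, long count)
{
        if (h->cur_offset + count >= PAGE_SIZE) {
                count = PAGE_SIZE - h->cur_offset;
                h->cur_offset = 0;
                h->cur++;
        } else {
                h->cur_offset += count;
        }
        h->offset += count;
        return count;
}

int main(void)
{
        struct handle h = { 0 };
        long total = 0;

        while (h.cur < 2)               /* drain two "pages" */
                total += read_next(&h, 1000);
        printf("consumed %ld bytes in chunks of <= %d\n", total, PAGE_SIZE);
        return 0;
}
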
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
deleted file mode 100644
index 5957312b2d68..000000000000
--- a/kernel/power/smp.c
+++ /dev/null
@@ -1,62 +0,0 @@
1/*
2 * drivers/power/smp.c - Functions for stopping other CPUs.
3 *
4 * Copyright 2004 Pavel Machek <pavel@suse.cz>
5 * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#undef DEBUG
11
12#include <linux/smp_lock.h>
13#include <linux/interrupt.h>
14#include <linux/suspend.h>
15#include <linux/module.h>
16#include <linux/cpu.h>
17#include <asm/atomic.h>
18#include <asm/tlbflush.h>
19
20/* This is protected by pm_sem semaphore */
21static cpumask_t frozen_cpus;
22
23void disable_nonboot_cpus(void)
24{
25 int cpu, error;
26
27 error = 0;
28 cpus_clear(frozen_cpus);
29 printk("Freezing cpus ...\n");
30 for_each_online_cpu(cpu) {
31 if (cpu == 0)
32 continue;
33 error = cpu_down(cpu);
34 if (!error) {
35 cpu_set(cpu, frozen_cpus);
36 printk("CPU%d is down\n", cpu);
37 continue;
38 }
39 printk("Error taking cpu %d down: %d\n", cpu, error);
40 }
41 BUG_ON(raw_smp_processor_id() != 0);
42 if (error)
43 panic("cpus not sleeping");
44}
45
46void enable_nonboot_cpus(void)
47{
48 int cpu, error;
49
50 printk("Thawing cpus ...\n");
51 for_each_cpu_mask(cpu, frozen_cpus) {
52 error = cpu_up(cpu);
53 if (!error) {
54 printk("CPU%d is up\n", cpu);
55 continue;
56 }
57 printk("Error taking cpu %d up: %d\n", cpu, error);
58 panic("Not enough cpus");
59 }
60 cpus_clear(frozen_cpus);
61}
62
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 75d4886e648e..1b84313cbab5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -34,10 +34,12 @@
34 34
35#include "power.h" 35#include "power.h"
36 36
37struct pbe *pagedir_nosave; 37/* List of PBEs used for creating and restoring the suspend image */
38struct pbe *restore_pblist;
39
38static unsigned int nr_copy_pages; 40static unsigned int nr_copy_pages;
39static unsigned int nr_meta_pages; 41static unsigned int nr_meta_pages;
40static unsigned long *buffer; 42static void *buffer;
41 43
42#ifdef CONFIG_HIGHMEM 44#ifdef CONFIG_HIGHMEM
43unsigned int count_highmem_pages(void) 45unsigned int count_highmem_pages(void)
@@ -156,240 +158,637 @@ static inline int save_highmem(void) {return 0;}
156static inline int restore_highmem(void) {return 0;} 158static inline int restore_highmem(void) {return 0;}
157#endif 159#endif
158 160
159static int pfn_is_nosave(unsigned long pfn) 161/**
162 * @safe_needed - on resume, for storing the PBE list and the image,
163 * we can only use memory pages that do not conflict with the pages
164 * used before suspend.
165 *
166 * The unsafe pages are marked with the PG_nosave_free flag
167 * and we count them using unsafe_pages
168 */
169
170#define PG_ANY 0
171#define PG_SAFE 1
172#define PG_UNSAFE_CLEAR 1
173#define PG_UNSAFE_KEEP 0
174
175static unsigned int allocated_unsafe_pages;
176
177static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
160{ 178{
161 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; 179 void *res;
162 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; 180
163 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); 181 res = (void *)get_zeroed_page(gfp_mask);
182 if (safe_needed)
183 while (res && PageNosaveFree(virt_to_page(res))) {
184 /* The page is unsafe, mark it for swsusp_free() */
185 SetPageNosave(virt_to_page(res));
186 allocated_unsafe_pages++;
187 res = (void *)get_zeroed_page(gfp_mask);
188 }
189 if (res) {
190 SetPageNosave(virt_to_page(res));
191 SetPageNosaveFree(virt_to_page(res));
192 }
193 return res;
194}
195
196unsigned long get_safe_page(gfp_t gfp_mask)
197{
198 return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE);
164} 199}
165 200
166/** 201/**
167 * saveable - Determine whether a page should be cloned or not. 202 * free_image_page - free page represented by @addr, allocated with
168 * @pfn: The page 203 * alloc_image_page (page flags set by it must be cleared)
169 *
170 * We save a page if it's Reserved, and not in the range of pages
171 * statically defined as 'unsaveable', or if it isn't reserved, and
172 * isn't part of a free chunk of pages.
173 */ 204 */
174 205
175static int saveable(struct zone *zone, unsigned long *zone_pfn) 206static inline void free_image_page(void *addr, int clear_nosave_free)
176{ 207{
177 unsigned long pfn = *zone_pfn + zone->zone_start_pfn; 208 ClearPageNosave(virt_to_page(addr));
178 struct page *page; 209 if (clear_nosave_free)
210 ClearPageNosaveFree(virt_to_page(addr));
211 free_page((unsigned long)addr);
212}
179 213
180 if (!pfn_valid(pfn)) 214/* struct linked_page is used to build chains of pages */
181 return 0;
182 215
183 page = pfn_to_page(pfn); 216#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
184 BUG_ON(PageReserved(page) && PageNosave(page));
185 if (PageNosave(page))
186 return 0;
187 if (PageReserved(page) && pfn_is_nosave(pfn))
188 return 0;
189 if (PageNosaveFree(page))
190 return 0;
191 217
192 return 1; 218struct linked_page {
193} 219 struct linked_page *next;
220 char data[LINKED_PAGE_DATA_SIZE];
221} __attribute__((packed));
194 222
195unsigned int count_data_pages(void) 223static inline void
224free_list_of_pages(struct linked_page *list, int clear_page_nosave)
196{ 225{
197 struct zone *zone; 226 while (list) {
198 unsigned long zone_pfn; 227 struct linked_page *lp = list->next;
199 unsigned int n = 0;
200 228
201 for_each_zone (zone) { 229 free_image_page(list, clear_page_nosave);
202 if (is_highmem(zone)) 230 list = lp;
203 continue;
204 mark_free_pages(zone);
205 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
206 n += saveable(zone, &zone_pfn);
207 } 231 }
208 return n;
209} 232}
210 233
211static void copy_data_pages(struct pbe *pblist) 234/**
235 * struct chain_allocator is used for allocating small objects out of
236 * a linked list of pages called 'the chain'.
237 *
238 * The chain grows each time when there is no room for a new object in
239 * the current page. The allocated objects cannot be freed individually.
240 * It is only possible to free them all at once, by freeing the entire
241 * chain.
242 *
243 * NOTE: The chain allocator may be inefficient if the allocated objects
244 * are not much smaller than PAGE_SIZE.
245 */
246
247struct chain_allocator {
248 struct linked_page *chain; /* the chain */
249 unsigned int used_space; /* total size of objects allocated out
250 * of the current page
251 */
252 gfp_t gfp_mask; /* mask for allocating pages */
253 int safe_needed; /* if set, only "safe" pages are allocated */
254};
255
256static void
257chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
212{ 258{
213 struct zone *zone; 259 ca->chain = NULL;
214 unsigned long zone_pfn; 260 ca->used_space = LINKED_PAGE_DATA_SIZE;
215 struct pbe *pbe, *p; 261 ca->gfp_mask = gfp_mask;
262 ca->safe_needed = safe_needed;
263}
216 264
217 pbe = pblist; 265static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
218 for_each_zone (zone) { 266{
219 if (is_highmem(zone)) 267 void *ret;
220 continue; 268
221 mark_free_pages(zone); 269 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
222 /* This is necessary for swsusp_free() */ 270 struct linked_page *lp;
223 for_each_pb_page (p, pblist) 271
224 SetPageNosaveFree(virt_to_page(p)); 272 lp = alloc_image_page(ca->gfp_mask, ca->safe_needed);
225 for_each_pbe (p, pblist) 273 if (!lp)
226 SetPageNosaveFree(virt_to_page(p->address)); 274 return NULL;
227 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { 275
228 if (saveable(zone, &zone_pfn)) { 276 lp->next = ca->chain;
229 struct page *page; 277 ca->chain = lp;
230 long *src, *dst; 278 ca->used_space = 0;
231 int n;
232
233 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
234 BUG_ON(!pbe);
235 pbe->orig_address = (unsigned long)page_address(page);
236 /* copy_page and memcpy are not usable for copying task structs. */
237 dst = (long *)pbe->address;
238 src = (long *)pbe->orig_address;
239 for (n = PAGE_SIZE / sizeof(long); n; n--)
240 *dst++ = *src++;
241 pbe = pbe->next;
242 }
243 }
244 } 279 }
245 BUG_ON(pbe); 280 ret = ca->chain->data + ca->used_space;
281 ca->used_space += size;
282 return ret;
246} 283}
247 284
285static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
286{
287 free_list_of_pages(ca->chain, clear_page_nosave);
288 memset(ca, 0, sizeof(struct chain_allocator));
289}
248 290
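
A userspace model of the chain allocator introduced above: a bump allocator over a singly linked chain of fixed-size pages, in which objects can only be freed all at once by releasing the whole chain. Sizes and names are illustrative:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_DATA 4088                  /* "PAGE_SIZE" minus the link */

struct linked_page {
        struct linked_page *next;
        char data[PAGE_DATA];
};

struct chain_allocator {
        struct linked_page *chain;
        size_t used;                    /* bytes used in current page */
};

static void *chain_alloc(struct chain_allocator *ca, size_t size)
{
        if (PAGE_DATA - ca->used < size) {
                struct linked_page *lp = calloc(1, sizeof(*lp));

                if (!lp)
                        return NULL;
                lp->next = ca->chain;   /* grow the chain */
                ca->chain = lp;
                ca->used = 0;
        }
        void *ret = ca->chain->data + ca->used;
        ca->used += size;
        return ret;
}

static void chain_free_all(struct chain_allocator *ca)
{
        while (ca->chain) {
                struct linked_page *next = ca->chain->next;

                free(ca->chain);
                ca->chain = next;
        }
        ca->used = 0;
}

int main(void)
{
        /* used = PAGE_DATA forces the first page, as chain_init() does */
        struct chain_allocator ca = { .chain = NULL, .used = PAGE_DATA };
        int *a = chain_alloc(&ca, sizeof(int));
        int *b = chain_alloc(&ca, sizeof(int));

        *a = 1; *b = 2;
        printf("%d %d\n", *a, *b);
        chain_free_all(&ca);
        return 0;
}
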
249/** 291/**
250 * free_pagedir - free pages allocated with alloc_pagedir() 292 * Data types related to memory bitmaps.
293 *
294 * A memory bitmap is a structure consisting of many linked lists of
295 * objects. The main list's elements are of type struct zone_bitmap
296 * and each of them corresponds to one zone. For each zone bitmap
297 * object there is a list of objects of type struct bm_block that
298 * represent the blocks of bit chunks in which information is
299 * stored.
300 *
301 * struct memory_bitmap contains a pointer to the main list of zone
302 * bitmap objects, a struct bm_position used for browsing the bitmap,
303 * and a pointer to the list of pages used for allocating all of the
304 * zone bitmap objects and bitmap block objects.
305 *
306 * NOTE: It has to be possible to lay out the bitmap in memory
307 * using only allocations of order 0. Additionally, the bitmap is
308 * designed to work with an arbitrary number of zones (this is over the
309 * top for now, but let's avoid making unnecessary assumptions ;-).
310 *
311 * struct zone_bitmap contains a pointer to a list of bitmap block
312 * objects and a pointer to the bitmap block object that has been
313 * most recently used for setting bits. Additionally, it contains the
314 * pfns that correspond to the start and end of the represented zone.
315 *
316 * struct bm_block contains a pointer to the memory page in which
317 * information is stored (in the form of a block of bit chunks
318 * of type unsigned long each). It also contains the pfns that
319 * correspond to the start and end of the represented memory area and
320 * the number of bit chunks in the block.
321 *
322 * NOTE: Memory bitmaps are used for two types of operations only:
323 * "set a bit" and "find the next bit set". Moreover, the searching
324 * is always carried out after all of the "set a bit" operations
325 * on a given bitmap.
251 */ 326 */
252 327
253static void free_pagedir(struct pbe *pblist, int clear_nosave_free) 328#define BM_END_OF_MAP (~0UL)
329
330#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long))
331#define BM_BITS_PER_CHUNK (sizeof(long) << 3)
332#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
333
334struct bm_block {
335 struct bm_block *next; /* next element of the list */
336 unsigned long start_pfn; /* pfn represented by the first bit */
337 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
338 unsigned int size; /* number of bit chunks */
339 unsigned long *data; /* chunks of bits representing pages */
340};
341
342struct zone_bitmap {
343 struct zone_bitmap *next; /* next element of the list */
344 unsigned long start_pfn; /* minimal pfn in this zone */
345 unsigned long end_pfn; /* maximal pfn in this zone plus 1 */
346 struct bm_block *bm_blocks; /* list of bitmap blocks */
347 struct bm_block *cur_block; /* recently used bitmap block */
348};
349
350/* struct bm_position is used for browsing memory bitmaps */
351
352struct bm_position {
353 struct zone_bitmap *zone_bm;
354 struct bm_block *block;
355 int chunk;
356 int bit;
357};
358
359struct memory_bitmap {
360 struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */
361 struct linked_page *p_list; /* list of pages used to store zone
362 * bitmap objects and bitmap block
363 * objects
364 */
365 struct bm_position cur; /* most recently used bit position */
366};
367
368/* Functions that operate on memory bitmaps */
369
370static inline void memory_bm_reset_chunk(struct memory_bitmap *bm)
254{ 371{
255 struct pbe *pbe; 372 bm->cur.chunk = 0;
373 bm->cur.bit = -1;
374}
256 375
257 while (pblist) { 376static void memory_bm_position_reset(struct memory_bitmap *bm)
258 pbe = (pblist + PB_PAGE_SKIP)->next; 377{
259 ClearPageNosave(virt_to_page(pblist)); 378 struct zone_bitmap *zone_bm;
260 if (clear_nosave_free) 379
261 ClearPageNosaveFree(virt_to_page(pblist)); 380 zone_bm = bm->zone_bm_list;
262 free_page((unsigned long)pblist); 381 bm->cur.zone_bm = zone_bm;
263 pblist = pbe; 382 bm->cur.block = zone_bm->bm_blocks;
264 } 383 memory_bm_reset_chunk(bm);
265} 384}
266 385
386static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
387
267/** 388/**
268 * fill_pb_page - Create a list of PBEs on a given memory page 389 * create_bm_block_list - create a list of block bitmap objects
269 */ 390 */
270 391
271static inline void fill_pb_page(struct pbe *pbpage) 392static inline struct bm_block *
393create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca)
272{ 394{
273 struct pbe *p; 395 struct bm_block *bblist = NULL;
396
397 while (nr_blocks-- > 0) {
398 struct bm_block *bb;
274 399
275 p = pbpage; 400 bb = chain_alloc(ca, sizeof(struct bm_block));
276 pbpage += PB_PAGE_SKIP; 401 if (!bb)
277 do 402 return NULL;
278 p->next = p + 1; 403
279 while (++p < pbpage); 404 bb->next = bblist;
405 bblist = bb;
406 }
407 return bblist;
280} 408}
281 409
282/** 410/**
283 * create_pbe_list - Create a list of PBEs on top of a given chain 411 * create_zone_bm_list - create a list of zone bitmap objects
284 * of memory pages allocated with alloc_pagedir()
285 */ 412 */
286 413
287static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) 414static inline struct zone_bitmap *
415create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca)
288{ 416{
289 struct pbe *pbpage, *p; 417 struct zone_bitmap *zbmlist = NULL;
290 unsigned int num = PBES_PER_PAGE;
291 418
292 for_each_pb_page (pbpage, pblist) { 419 while (nr_zones-- > 0) {
293 if (num >= nr_pages) 420 struct zone_bitmap *zbm;
294 break; 421
422 zbm = chain_alloc(ca, sizeof(struct zone_bitmap));
423 if (!zbm)
424 return NULL;
425
426 zbm->next = zbmlist;
427 zbmlist = zbm;
428 }
429 return zbmlist;
430}
431
432/**
433 * memory_bm_create - allocate memory for a memory bitmap
434 */
435
436static int
437memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
438{
439 struct chain_allocator ca;
440 struct zone *zone;
441 struct zone_bitmap *zone_bm;
442 struct bm_block *bb;
443 unsigned int nr;
444
445 chain_init(&ca, gfp_mask, safe_needed);
295 446
296 fill_pb_page(pbpage); 447 /* Compute the number of zones */
297 num += PBES_PER_PAGE; 448 nr = 0;
449 for_each_zone (zone)
450 if (populated_zone(zone) && !is_highmem(zone))
451 nr++;
452
453 /* Allocate the list of zones bitmap objects */
454 zone_bm = create_zone_bm_list(nr, &ca);
455 bm->zone_bm_list = zone_bm;
456 if (!zone_bm) {
457 chain_free(&ca, PG_UNSAFE_CLEAR);
458 return -ENOMEM;
298 } 459 }
299 if (pbpage) { 460
300 for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) 461 /* Initialize the zone bitmap objects */
301 p->next = p + 1; 462 for_each_zone (zone) {
302 p->next = NULL; 463 unsigned long pfn;
464
465 if (!populated_zone(zone) || is_highmem(zone))
466 continue;
467
468 zone_bm->start_pfn = zone->zone_start_pfn;
469 zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
470 /* Allocate the list of bitmap block objects */
471 nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
472 bb = create_bm_block_list(nr, &ca);
473 zone_bm->bm_blocks = bb;
474 zone_bm->cur_block = bb;
475 if (!bb)
476 goto Free;
477
478 nr = zone->spanned_pages;
479 pfn = zone->zone_start_pfn;
480 /* Initialize the bitmap block objects */
481 while (bb) {
482 unsigned long *ptr;
483
484 ptr = alloc_image_page(gfp_mask, safe_needed);
485 bb->data = ptr;
486 if (!ptr)
487 goto Free;
488
489 bb->start_pfn = pfn;
490 if (nr >= BM_BITS_PER_BLOCK) {
491 pfn += BM_BITS_PER_BLOCK;
492 bb->size = BM_CHUNKS_PER_BLOCK;
493 nr -= BM_BITS_PER_BLOCK;
494 } else {
495 /* This is executed only once in the loop */
496 pfn += nr;
497 bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK);
498 }
499 bb->end_pfn = pfn;
500 bb = bb->next;
501 }
502 zone_bm = zone_bm->next;
303 } 503 }
504 bm->p_list = ca.chain;
505 memory_bm_position_reset(bm);
506 return 0;
507
508Free:
509 bm->p_list = ca.chain;
510 memory_bm_free(bm, PG_UNSAFE_CLEAR);
511 return -ENOMEM;
304} 512}
305 513
306static unsigned int unsafe_pages; 514/**
515 * memory_bm_free - free memory occupied by the memory bitmap @bm
516 */
517
518static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
519{
520 struct zone_bitmap *zone_bm;
521
522 /* Free the list of bit blocks for each zone_bitmap object */
523 zone_bm = bm->zone_bm_list;
524 while (zone_bm) {
525 struct bm_block *bb;
526
527 bb = zone_bm->bm_blocks;
528 while (bb) {
529 if (bb->data)
530 free_image_page(bb->data, clear_nosave_free);
531 bb = bb->next;
532 }
533 zone_bm = zone_bm->next;
534 }
535 free_list_of_pages(bm->p_list, clear_nosave_free);
536 bm->zone_bm_list = NULL;
537}
307 538
308/** 539/**
309 * @safe_needed - on resume, for storing the PBE list and the image, 540 * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds
310 * we can only use memory pages that do not conflict with the pages 541 * to given pfn. The cur_zone_bm member of @bm and the cur_block member
311 * used before suspend. 542 * of @bm->cur_zone_bm are updated.
312 * 543 *
313 * The unsafe pages are marked with the PG_nosave_free flag 544 * If the bit cannot be set, the function returns -EINVAL .
314 * and we count them using unsafe_pages
315 */ 545 */
316 546
317static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 547static int
548memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
318{ 549{
319 void *res; 550 struct zone_bitmap *zone_bm;
320 551 struct bm_block *bb;
321 res = (void *)get_zeroed_page(gfp_mask); 552
322 if (safe_needed) 553 /* Check if the pfn is from the current zone */
323 while (res && PageNosaveFree(virt_to_page(res))) { 554 zone_bm = bm->cur.zone_bm;
324 /* The page is unsafe, mark it for swsusp_free() */ 555 if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
325 SetPageNosave(virt_to_page(res)); 556 zone_bm = bm->zone_bm_list;
326 unsafe_pages++; 557 /* We don't assume that the zones are sorted by pfns */
327 res = (void *)get_zeroed_page(gfp_mask); 558 while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
559 zone_bm = zone_bm->next;
560 if (unlikely(!zone_bm))
561 return -EINVAL;
328 } 562 }
329 if (res) { 563 bm->cur.zone_bm = zone_bm;
330 SetPageNosave(virt_to_page(res));
331 SetPageNosaveFree(virt_to_page(res));
332 } 564 }
333 return res; 565 /* Check if the pfn corresponds to the current bitmap block */
566 bb = zone_bm->cur_block;
567 if (pfn < bb->start_pfn)
568 bb = zone_bm->bm_blocks;
569
570 while (pfn >= bb->end_pfn) {
571 bb = bb->next;
572 if (unlikely(!bb))
573 return -EINVAL;
574 }
575 zone_bm->cur_block = bb;
576 pfn -= bb->start_pfn;
577 set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK);
578 return 0;
334} 579}
335 580
336unsigned long get_safe_page(gfp_t gfp_mask) 581/* Two auxiliary functions for memory_bm_next_pfn */
582
583/* Find the first set bit in the given chunk, if there is one */
584
585static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p)
337{ 586{
338 return (unsigned long)alloc_image_page(gfp_mask, 1); 587 bit++;
588 while (bit < BM_BITS_PER_CHUNK) {
589 if (test_bit(bit, chunk_p))
590 return bit;
591
592 bit++;
593 }
594 return -1;
595}
596
597/* Find a chunk containing some bits set in given block of bits */
598
599static inline int next_chunk_in_block(int n, struct bm_block *bb)
600{
601 n++;
602 while (n < bb->size) {
603 if (bb->data[n])
604 return n;
605
606 n++;
607 }
608 return -1;
339} 609}
340 610
341/** 611/**
342 * alloc_pagedir - Allocate the page directory. 612 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
343 * 613 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
344 * First, determine exactly how many pages we need and 614 * returned.
345 * allocate them.
346 * 615 *
347 * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE 616 * It is required to run memory_bm_position_reset() before the first call to
348 * struct pbe elements (pbes) and the last element in the page points 617 * this function.
349 * to the next page. 618 */
619
620static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
621{
622 struct zone_bitmap *zone_bm;
623 struct bm_block *bb;
624 int chunk;
625 int bit;
626
627 do {
628 bb = bm->cur.block;
629 do {
630 chunk = bm->cur.chunk;
631 bit = bm->cur.bit;
632 do {
633 bit = next_bit_in_chunk(bit, bb->data + chunk);
634 if (bit >= 0)
635 goto Return_pfn;
636
637 chunk = next_chunk_in_block(chunk, bb);
638 bit = -1;
639 } while (chunk >= 0);
640 bb = bb->next;
641 bm->cur.block = bb;
642 memory_bm_reset_chunk(bm);
643 } while (bb);
644 zone_bm = bm->cur.zone_bm->next;
645 if (zone_bm) {
646 bm->cur.zone_bm = zone_bm;
647 bm->cur.block = zone_bm->bm_blocks;
648 memory_bm_reset_chunk(bm);
649 }
650 } while (zone_bm);
651 memory_bm_position_reset(bm);
652 return BM_END_OF_MAP;
653
654Return_pfn:
655 bm->cur.chunk = chunk;
656 bm->cur.bit = bit;
657 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
658}
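
Stripped of the zone and block lists, memory_bm_next_pfn() is a resumable next-set-bit scan: skip empty word-sized chunks, probe bits within a chunk, and remember the position between calls. A flat, single-bitmap model of that walk:

#include <stdio.h>

#define NCHUNKS 4
#define BITS_PER_CHUNK (8 * sizeof(unsigned long))
#define END_OF_MAP (~0UL)

struct pos { int chunk; int bit; };

static unsigned long next_set_bit(const unsigned long *map, struct pos *p)
{
        for (int c = p->chunk; c < NCHUNKS; c++, p->bit = -1) {
                if (!map[c])
                        continue;       /* skip empty chunks quickly */
                for (int b = p->bit + 1; b < (int)BITS_PER_CHUNK; b++)
                        if (map[c] & (1UL << b)) {
                                p->chunk = c;   /* save the position */
                                p->bit = b;
                                return c * BITS_PER_CHUNK + b;
                        }
        }
        return END_OF_MAP;
}

int main(void)
{
        unsigned long map[NCHUNKS] = { 0x5, 0, 1UL << 3, 0 };
        struct pos p = { .chunk = 0, .bit = -1 };
        unsigned long n;

        while ((n = next_set_bit(map, &p)) != END_OF_MAP)
                printf("bit %lu set\n", n);
        return 0;
}
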
659
660/**
661 * snapshot_additional_pages - estimate the number of additional pages
662 * needed for setting up the suspend image data structures for the given
663 * zone (usually the returned value is greater than the exact number)
664 */
665
666unsigned int snapshot_additional_pages(struct zone *zone)
667{
668 unsigned int res;
669
670 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
671 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
672 return res;
673}
674
675/**
676 * pfn_is_nosave - check if given pfn is in the 'nosave' section
677 */
678
679static inline int pfn_is_nosave(unsigned long pfn)
680{
681 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
682 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
683 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
684}
685
686/**
687 * saveable - Determine whether a page should be cloned or not.
688 * @pfn: The page
350 * 689 *
351 * On each page we set up a list of struct_pbe elements. 690 * We save a page if it isn't Nosave, and is not in the range of pages
691 * statically defined as 'unsaveable', and it
692 * isn't a part of a free chunk of pages.
352 */ 693 */
353 694
354static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, 695static struct page *saveable_page(unsigned long pfn)
355 int safe_needed)
356{ 696{
357 unsigned int num; 697 struct page *page;
358 struct pbe *pblist, *pbe; 698
699 if (!pfn_valid(pfn))
700 return NULL;
359 701
360 if (!nr_pages) 702 page = pfn_to_page(pfn);
703
704 if (PageNosave(page))
705 return NULL;
706 if (PageReserved(page) && pfn_is_nosave(pfn))
361 return NULL; 707 return NULL;
708 if (PageNosaveFree(page))
709 return NULL;
710
711 return page;
712}
713
714unsigned int count_data_pages(void)
715{
716 struct zone *zone;
717 unsigned long pfn, max_zone_pfn;
718 unsigned int n = 0;
362 719
363 pblist = alloc_image_page(gfp_mask, safe_needed); 720 for_each_zone (zone) {
364 /* FIXME: rewrite this ugly loop */ 721 if (is_highmem(zone))
365 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; 722 continue;
366 pbe = pbe->next, num += PBES_PER_PAGE) { 723 mark_free_pages(zone);
367 pbe += PB_PAGE_SKIP; 724 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
368 pbe->next = alloc_image_page(gfp_mask, safe_needed); 725 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
726 n += !!saveable_page(pfn);
369 } 727 }
370 if (!pbe) { /* get_zeroed_page() failed */ 728 return n;
371 free_pagedir(pblist, 1); 729}
372 pblist = NULL; 730
373 } else 731static inline void copy_data_page(long *dst, long *src)
374 create_pbe_list(pblist, nr_pages); 732{
375 return pblist; 733 int n;
734
735 /* copy_page and memcpy are not usable for copying task structs. */
736 for (n = PAGE_SIZE / sizeof(long); n; n--)
737 *dst++ = *src++;
738}
739
740static void
741copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
742{
743 struct zone *zone;
744 unsigned long pfn;
745
746 for_each_zone (zone) {
747 unsigned long max_zone_pfn;
748
749 if (is_highmem(zone))
750 continue;
751
752 mark_free_pages(zone);
753 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
754 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
755 if (saveable_page(pfn))
756 memory_bm_set_bit(orig_bm, pfn);
757 }
758 memory_bm_position_reset(orig_bm);
759 memory_bm_position_reset(copy_bm);
760 do {
761 pfn = memory_bm_next_pfn(orig_bm);
762 if (likely(pfn != BM_END_OF_MAP)) {
763 struct page *page;
764 void *src;
765
766 page = pfn_to_page(pfn);
767 src = page_address(page);
768 page = pfn_to_page(memory_bm_next_pfn(copy_bm));
769 copy_data_page(page_address(page), src);
770 }
771 } while (pfn != BM_END_OF_MAP);
376} 772}
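
Note how copy_data_pages() now pairs the two bitmaps positionally: the n-th set bit of orig_bm names the n-th source page and the n-th set bit of copy_bm its destination. A toy illustration with precomputed pfn sequences standing in for memory_bm_next_pfn():

#include <stdio.h>

#define END (~0UL)

/* Stand-ins for walking two pre-filled bitmaps in lockstep. */
static unsigned long orig_pfns[] = { 3, 7, 9, END };
static unsigned long copy_pfns[] = { 100, 101, 102, END };

int main(void)
{
        for (int i = 0; orig_pfns[i] != END; i++)
                printf("copy page %lu -> page %lu\n",
                       orig_pfns[i], copy_pfns[i]);
        return 0;
}
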
377 773
378/** 774/**
379 * Free pages we allocated for suspend. Suspend pages are alocated 775 * swsusp_free - free pages allocated for the suspend.
380 * before atomic copy, so we need to free them after resume. 776 *
777 * Suspend pages are alocated before the atomic copy is made, so we
778 * need to release them after the resume.
381 */ 779 */
382 780
383void swsusp_free(void) 781void swsusp_free(void)
384{ 782{
385 struct zone *zone; 783 struct zone *zone;
386 unsigned long zone_pfn; 784 unsigned long pfn, max_zone_pfn;
387 785
388 for_each_zone(zone) { 786 for_each_zone(zone) {
389 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 787 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
390 if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { 788 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
391 struct page *page; 789 if (pfn_valid(pfn)) {
392 page = pfn_to_page(zone_pfn + zone->zone_start_pfn); 790 struct page *page = pfn_to_page(pfn);
791
393 if (PageNosave(page) && PageNosaveFree(page)) { 792 if (PageNosave(page) && PageNosaveFree(page)) {
394 ClearPageNosave(page); 793 ClearPageNosave(page);
395 ClearPageNosaveFree(page); 794 ClearPageNosaveFree(page);
@@ -399,7 +798,7 @@ void swsusp_free(void)
399 } 798 }
400 nr_copy_pages = 0; 799 nr_copy_pages = 0;
401 nr_meta_pages = 0; 800 nr_meta_pages = 0;
402 pagedir_nosave = NULL; 801 restore_pblist = NULL;
403 buffer = NULL; 802 buffer = NULL;
404} 803}
405 804
@@ -414,46 +813,57 @@ void swsusp_free(void)
414static int enough_free_mem(unsigned int nr_pages) 813static int enough_free_mem(unsigned int nr_pages)
415{ 814{
416 struct zone *zone; 815 struct zone *zone;
417 unsigned int n = 0; 816 unsigned int free = 0, meta = 0;
418 817
419 for_each_zone (zone) 818 for_each_zone (zone)
420 if (!is_highmem(zone)) 819 if (!is_highmem(zone)) {
421 n += zone->free_pages; 820 free += zone->free_pages;
422 pr_debug("swsusp: available memory: %u pages\n", n); 821 meta += snapshot_additional_pages(zone);
423 return n > (nr_pages + PAGES_FOR_IO + 822 }
424 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
425}
426 823
427static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) 824 pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n",
428{ 825 nr_pages, PAGES_FOR_IO, meta, free);
429 struct pbe *p;
430 826
431 for_each_pbe (p, pblist) { 827 return free > nr_pages + PAGES_FOR_IO + meta;
432 p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
433 if (!p->address)
434 return -ENOMEM;
435 }
436 return 0;
437} 828}
438 829
439static struct pbe *swsusp_alloc(unsigned int nr_pages) 830static int
831swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
832 unsigned int nr_pages)
440{ 833{
441 struct pbe *pblist; 834 int error;
442 835
443 if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) { 836 error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
444 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 837 if (error)
445 return NULL; 838 goto Free;
446 }
447 839
448 if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { 840 error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
449 printk(KERN_ERR "suspend: Allocating image pages failed.\n"); 841 if (error)
450 swsusp_free(); 842 goto Free;
451 return NULL; 843
844 while (nr_pages-- > 0) {
845 struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD);
846 if (!page)
847 goto Free;
848
849 SetPageNosave(page);
850 SetPageNosaveFree(page);
851 memory_bm_set_bit(copy_bm, page_to_pfn(page));
452 } 852 }
853 return 0;
453 854
454 return pblist; 855Free:
856 swsusp_free();
857 return -ENOMEM;
455} 858}
456 859
860/* Memory bitmap used for marking saveable pages */
861static struct memory_bitmap orig_bm;
862/* Memory bitmap used for marking allocated pages that will contain the copies
863 * of saveable pages
864 */
865static struct memory_bitmap copy_bm;
866
457asmlinkage int swsusp_save(void) 867asmlinkage int swsusp_save(void)
458{ 868{
459 unsigned int nr_pages; 869 unsigned int nr_pages;
@@ -464,25 +874,19 @@ asmlinkage int swsusp_save(void)
464 nr_pages = count_data_pages(); 874 nr_pages = count_data_pages();
465 printk("swsusp: Need to copy %u pages\n", nr_pages); 875 printk("swsusp: Need to copy %u pages\n", nr_pages);
466 876
467 pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
468 nr_pages,
469 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
470 PAGES_FOR_IO, nr_free_pages());
471
472 if (!enough_free_mem(nr_pages)) { 877 if (!enough_free_mem(nr_pages)) {
473 printk(KERN_ERR "swsusp: Not enough free memory\n"); 878 printk(KERN_ERR "swsusp: Not enough free memory\n");
474 return -ENOMEM; 879 return -ENOMEM;
475 } 880 }
476 881
477 pagedir_nosave = swsusp_alloc(nr_pages); 882 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages))
478 if (!pagedir_nosave)
479 return -ENOMEM; 883 return -ENOMEM;
480 884
481 /* During allocating of suspend pagedir, new cold pages may appear. 885 /* During allocating of suspend pagedir, new cold pages may appear.
482 * Kill them. 886 * Kill them.
483 */ 887 */
484 drain_local_pages(); 888 drain_local_pages();
485 copy_data_pages(pagedir_nosave); 889 copy_data_pages(&copy_bm, &orig_bm);
486 890
487 /* 891 /*
488 * End of critical section. From now on, we can write to memory, 892 * End of critical section. From now on, we can write to memory,
@@ -511,22 +915,20 @@ static void init_header(struct swsusp_info *info)
511} 915}
512 916
513/** 917/**
514 * pack_orig_addresses - the .orig_address fields of the PBEs from the 918 * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
515 * list starting at @pbe are stored in the array @buf[] (1 page) 919 * are stored in the array @buf[] (1 page at a time)
516 */ 920 */
517 921
518static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe) 922static inline void
923pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
519{ 924{
520 int j; 925 int j;
521 926
522 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { 927 for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
523 buf[j] = pbe->orig_address; 928 buf[j] = memory_bm_next_pfn(bm);
524 pbe = pbe->next; 929 if (unlikely(buf[j] == BM_END_OF_MAP))
930 break;
525 } 931 }
526 if (!pbe)
527 for (; j < PAGE_SIZE / sizeof(long); j++)
528 buf[j] = 0;
529 return pbe;
530} 932}
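
pack_pfns() relies on the caller having zeroed the buffer (snapshot_read_next() does the memset) and writes pfns until the bitmap runs out, so a BM_END_OF_MAP entry terminates the page early; unpack_orig_pfns() later stops at the same marker. A userspace round trip under those assumptions:

#include <stdio.h>

#define SLOTS 8                         /* PAGE_SIZE / sizeof(long) */
#define END_OF_MAP (~0UL)

static unsigned long source[] = { 5, 6, 42, END_OF_MAP };

static void pack(unsigned long *buf, int *idx)
{
        for (int j = 0; j < SLOTS; j++) {
                buf[j] = source[*idx];
                if (buf[j] == END_OF_MAP)
                        break;          /* rest of buf stays zeroed */
                (*idx)++;
        }
}

static void unpack(const unsigned long *buf)
{
        for (int j = 0; j < SLOTS; j++) {
                if (buf[j] == END_OF_MAP)
                        break;
                printf("restore pfn %lu\n", buf[j]);
        }
}

int main(void)
{
        unsigned long buf[SLOTS] = { 0 };
        int idx = 0;

        pack(buf, &idx);
        unpack(buf);
        return 0;
}
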
531 933
532/** 934/**
@@ -553,37 +955,39 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb
553 955
554int snapshot_read_next(struct snapshot_handle *handle, size_t count) 956int snapshot_read_next(struct snapshot_handle *handle, size_t count)
555{ 957{
556 if (handle->page > nr_meta_pages + nr_copy_pages) 958 if (handle->cur > nr_meta_pages + nr_copy_pages)
557 return 0; 959 return 0;
960
558 if (!buffer) { 961 if (!buffer) {
559 /* This makes the buffer be freed by swsusp_free() */ 962 /* This makes the buffer be freed by swsusp_free() */
560 buffer = alloc_image_page(GFP_ATOMIC, 0); 963 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
561 if (!buffer) 964 if (!buffer)
562 return -ENOMEM; 965 return -ENOMEM;
563 } 966 }
564 if (!handle->offset) { 967 if (!handle->offset) {
565 init_header((struct swsusp_info *)buffer); 968 init_header((struct swsusp_info *)buffer);
566 handle->buffer = buffer; 969 handle->buffer = buffer;
567 handle->pbe = pagedir_nosave; 970 memory_bm_position_reset(&orig_bm);
971 memory_bm_position_reset(&copy_bm);
568 } 972 }
569 if (handle->prev < handle->page) { 973 if (handle->prev < handle->cur) {
570 if (handle->page <= nr_meta_pages) { 974 if (handle->cur <= nr_meta_pages) {
571 handle->pbe = pack_orig_addresses(buffer, handle->pbe); 975 memset(buffer, 0, PAGE_SIZE);
572 if (!handle->pbe) 976 pack_pfns(buffer, &orig_bm);
573 handle->pbe = pagedir_nosave;
574 } else { 977 } else {
575 handle->buffer = (void *)handle->pbe->address; 978 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
576 handle->pbe = handle->pbe->next; 979
980 handle->buffer = page_address(pfn_to_page(pfn));
577 } 981 }
578 handle->prev = handle->page; 982 handle->prev = handle->cur;
579 } 983 }
580 handle->buf_offset = handle->page_offset; 984 handle->buf_offset = handle->cur_offset;
581 if (handle->page_offset + count >= PAGE_SIZE) { 985 if (handle->cur_offset + count >= PAGE_SIZE) {
582 count = PAGE_SIZE - handle->page_offset; 986 count = PAGE_SIZE - handle->cur_offset;
583 handle->page_offset = 0; 987 handle->cur_offset = 0;
584 handle->page++; 988 handle->cur++;
585 } else { 989 } else {
586 handle->page_offset += count; 990 handle->cur_offset += count;
587 } 991 }
588 handle->offset += count; 992 handle->offset += count;
589 return count; 993 return count;
@@ -595,47 +999,50 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
595 * had been used before suspend 999 * had been used before suspend
596 */ 1000 */
597 1001
598static int mark_unsafe_pages(struct pbe *pblist) 1002static int mark_unsafe_pages(struct memory_bitmap *bm)
599{ 1003{
600 struct zone *zone; 1004 struct zone *zone;
601 unsigned long zone_pfn; 1005 unsigned long pfn, max_zone_pfn;
602 struct pbe *p;
603
604 if (!pblist) /* a sanity check */
605 return -EINVAL;
606 1006
607 /* Clear page flags */ 1007 /* Clear page flags */
608 for_each_zone (zone) { 1008 for_each_zone (zone) {
609 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 1009 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
610 if (pfn_valid(zone_pfn + zone->zone_start_pfn)) 1010 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
611 ClearPageNosaveFree(pfn_to_page(zone_pfn + 1011 if (pfn_valid(pfn))
612 zone->zone_start_pfn)); 1012 ClearPageNosaveFree(pfn_to_page(pfn));
613 } 1013 }
614 1014
615 /* Mark orig addresses */ 1015 /* Mark pages that correspond to the "original" pfns as "unsafe" */
616 for_each_pbe (p, pblist) { 1016 memory_bm_position_reset(bm);
617 if (virt_addr_valid(p->orig_address)) 1017 do {
618 SetPageNosaveFree(virt_to_page(p->orig_address)); 1018 pfn = memory_bm_next_pfn(bm);
619 else 1019 if (likely(pfn != BM_END_OF_MAP)) {
620 return -EFAULT; 1020 if (likely(pfn_valid(pfn)))
621 } 1021 SetPageNosaveFree(pfn_to_page(pfn));
1022 else
1023 return -EFAULT;
1024 }
1025 } while (pfn != BM_END_OF_MAP);
622 1026
623 unsafe_pages = 0; 1027 allocated_unsafe_pages = 0;
624 1028
625 return 0; 1029 return 0;
626} 1030}
627 1031
628static void copy_page_backup_list(struct pbe *dst, struct pbe *src) 1032static void
1033duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
629{ 1034{
630 /* We assume both lists contain the same number of elements */ 1035 unsigned long pfn;
631 while (src) { 1036
632 dst->orig_address = src->orig_address; 1037 memory_bm_position_reset(src);
633 dst = dst->next; 1038 pfn = memory_bm_next_pfn(src);
634 src = src->next; 1039 while (pfn != BM_END_OF_MAP) {
1040 memory_bm_set_bit(dst, pfn);
1041 pfn = memory_bm_next_pfn(src);
635 } 1042 }
636} 1043}
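
duplicate_memory_bitmap() above doubles as the canonical iteration idiom for the new bitmaps: reset the position, then pull pfns until BM_END_OF_MAP comes back. A hypothetical helper (not part of the patch) that counts the populated bits would follow the same pattern:

/* hypothetical helper, shown only to illustrate the iteration idiom */
static unsigned int memory_bm_count_bits(struct memory_bitmap *bm)
{
	unsigned long pfn;
	unsigned int n = 0;

	memory_bm_position_reset(bm);
	for (pfn = memory_bm_next_pfn(bm); pfn != BM_END_OF_MAP;
	     pfn = memory_bm_next_pfn(bm))
		n++;

	return n;
}
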
637 1044
638static int check_header(struct swsusp_info *info) 1045static inline int check_header(struct swsusp_info *info)
639{ 1046{
640 char *reason = NULL; 1047 char *reason = NULL;
641 1048
@@ -662,19 +1069,14 @@ static int check_header(struct swsusp_info *info)
662 * load header - check the image header and copy data from it 1069 * load header - check the image header and copy data from it
663 */ 1070 */
664 1071
665static int load_header(struct snapshot_handle *handle, 1072static int
666 struct swsusp_info *info) 1073load_header(struct swsusp_info *info)
667{ 1074{
668 int error; 1075 int error;
669 struct pbe *pblist;
670 1076
1077 restore_pblist = NULL;
671 error = check_header(info); 1078 error = check_header(info);
672 if (!error) { 1079 if (!error) {
673 pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
674 if (!pblist)
675 return -ENOMEM;
676 pagedir_nosave = pblist;
677 handle->pbe = pblist;
678 nr_copy_pages = info->image_pages; 1080 nr_copy_pages = info->image_pages;
679 nr_meta_pages = info->pages - info->image_pages - 1; 1081 nr_meta_pages = info->pages - info->image_pages - 1;
680 } 1082 }
@@ -682,113 +1084,137 @@ static int load_header(struct snapshot_handle *handle,
682} 1084}
683 1085
684/** 1086/**
685 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to 1087 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
686 * the PBEs in the list starting at @pbe 1088 * the corresponding bit in the memory bitmap @bm
687 */ 1089 */
688 1090
689static inline struct pbe *unpack_orig_addresses(unsigned long *buf, 1091static inline void
690 struct pbe *pbe) 1092unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
691{ 1093{
692 int j; 1094 int j;
693 1095
694 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { 1096 for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
695 pbe->orig_address = buf[j]; 1097 if (unlikely(buf[j] == BM_END_OF_MAP))
696 pbe = pbe->next; 1098 break;
1099
1100 memory_bm_set_bit(bm, buf[j]);
697 } 1101 }
698 return pbe;
699} 1102}
700 1103
701/** 1104/**
702 * prepare_image - use metadata contained in the PBE list 1105 * prepare_image - use the memory bitmap @bm to mark the pages that will
703 * pointed to by pagedir_nosave to mark the pages that will 1106 * be overwritten in the process of restoring the system memory state
704 * be overwritten in the process of restoring the system 1107 * from the suspend image ("unsafe" pages) and allocate memory for the
705 * memory state from the image ("unsafe" pages) and allocate 1108 * image.
706 * memory for the image
707 * 1109 *
708 * The idea is to allocate the PBE list first and then 1110 * The idea is to allocate a new memory bitmap first and then allocate
709 * allocate as many pages as it's needed for the image data, 1111 * as many pages as needed for the image data, but not to assign these
710 * but not to assign these pages to the PBEs initially. 1112 * pages to specific tasks initially. Instead, we just mark them as
711 * Instead, we just mark them as allocated and create a list 1113 * allocated and create a list of "safe" pages that will be used later.
712 * of "safe" which will be used later
713 */ 1114 */
714 1115
715struct safe_page { 1116#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
716 struct safe_page *next;
717 char padding[PAGE_SIZE - sizeof(void *)];
718};
719 1117
720static struct safe_page *safe_pages; 1118static struct linked_page *safe_pages_list;
721 1119
722static int prepare_image(struct snapshot_handle *handle) 1120static int
1121prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
723{ 1122{
724 int error = 0; 1123 unsigned int nr_pages;
725 unsigned int nr_pages = nr_copy_pages; 1124 struct linked_page *sp_list, *lp;
726 struct pbe *p, *pblist = NULL; 1125 int error;
727 1126
728 p = pagedir_nosave; 1127 error = mark_unsafe_pages(bm);
729 error = mark_unsafe_pages(p); 1128 if (error)
730 if (!error) { 1129 goto Free;
731 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); 1130
732 if (pblist) 1131 error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
733 copy_page_backup_list(pblist, p); 1132 if (error)
734 free_pagedir(p, 0); 1133 goto Free;
735 if (!pblist) 1134
1135 duplicate_memory_bitmap(new_bm, bm);
1136 memory_bm_free(bm, PG_UNSAFE_KEEP);
1137 /* Reserve some safe pages for potential later use.
1138 *
1139 * NOTE: This way we make sure there will be enough safe pages for the
1140 * chain_alloc() in get_buffer(). It is a bit wasteful, but
1141 * nr_copy_pages cannot be greater than 50% of the memory anyway.
1142 */
1143 sp_list = NULL;
 1144 /* nr_copy_pages cannot be less than allocated_unsafe_pages */
1145 nr_pages = nr_copy_pages - allocated_unsafe_pages;
1146 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
1147 while (nr_pages > 0) {
1148 lp = alloc_image_page(GFP_ATOMIC, PG_SAFE);
1149 if (!lp) {
736 error = -ENOMEM; 1150 error = -ENOMEM;
1151 goto Free;
1152 }
1153 lp->next = sp_list;
1154 sp_list = lp;
1155 nr_pages--;
737 } 1156 }
738 safe_pages = NULL; 1157 /* Preallocate memory for the image */
739 if (!error && nr_pages > unsafe_pages) { 1158 safe_pages_list = NULL;
740 nr_pages -= unsafe_pages; 1159 nr_pages = nr_copy_pages - allocated_unsafe_pages;
741 while (nr_pages--) { 1160 while (nr_pages > 0) {
742 struct safe_page *ptr; 1161 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
743 1162 if (!lp) {
744 ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); 1163 error = -ENOMEM;
745 if (!ptr) { 1164 goto Free;
746 error = -ENOMEM; 1165 }
747 break; 1166 if (!PageNosaveFree(virt_to_page(lp))) {
748 } 1167 /* The page is "safe", add it to the list */
749 if (!PageNosaveFree(virt_to_page(ptr))) { 1168 lp->next = safe_pages_list;
750 /* The page is "safe", add it to the list */ 1169 safe_pages_list = lp;
751 ptr->next = safe_pages;
752 safe_pages = ptr;
753 }
754 /* Mark the page as allocated */
755 SetPageNosave(virt_to_page(ptr));
756 SetPageNosaveFree(virt_to_page(ptr));
757 } 1170 }
1171 /* Mark the page as allocated */
1172 SetPageNosave(virt_to_page(lp));
1173 SetPageNosaveFree(virt_to_page(lp));
1174 nr_pages--;
758 } 1175 }
759 if (!error) { 1176 /* Free the reserved safe pages so that chain_alloc() can use them */
760 pagedir_nosave = pblist; 1177 while (sp_list) {
761 } else { 1178 lp = sp_list->next;
762 handle->pbe = NULL; 1179 free_image_page(sp_list, PG_UNSAFE_CLEAR);
763 swsusp_free(); 1180 sp_list = lp;
764 } 1181 }
1182 return 0;
1183
1184Free:
1185 swsusp_free();
765 return error; 1186 return error;
766} 1187}
767 1188
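
To put numbers on the reservation in prepare_image(): assume 4 KiB pages and a 32-bit build where struct pbe is three words, so LINKED_PAGE_DATA_SIZE = PAGE_SIZE - sizeof(void *) = 4092 bytes and PBES_PER_LINKED_PAGE = 4092 / 12 = 341 (illustrative figures, not values fixed by the patch). For nr_copy_pages = 25000 with allocated_unsafe_pages = 1000, the first loop reserves DIV_ROUND_UP(24000, 341) = 71 safe pages for the future PBE chain, the second preallocates the 24000 frames for the image data, and the reserved pages are then freed again so that chain_alloc() can pick them up on demand.
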
768static void *get_buffer(struct snapshot_handle *handle) 1189/**
1190 * get_buffer - compute the address that snapshot_write_next() should
1191 * set for its caller to write to.
1192 */
1193
1194static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
769{ 1195{
770 struct pbe *pbe = handle->pbe, *last = handle->last_pbe; 1196 struct pbe *pbe;
771 struct page *page = virt_to_page(pbe->orig_address); 1197 struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
772 1198
773 if (PageNosave(page) && PageNosaveFree(page)) { 1199 if (PageNosave(page) && PageNosaveFree(page))
774 /* 1200 /* We have allocated the "original" page frame and we can
775 * We have allocated the "original" page frame and we can 1201 * use it directly to store the loaded page.
776 * use it directly to store the read page
777 */ 1202 */
778 pbe->address = 0; 1203 return page_address(page);
779 if (last && last->next) 1204
780 last->next = NULL; 1205 /* The "original" page frame has not been allocated and we have to
781 return (void *)pbe->orig_address; 1206 * use a "safe" page frame to store the loaded page.
782 }
783 /*
784 * The "original" page frame has not been allocated and we have to
785 * use a "safe" page frame to store the read page
786 */ 1207 */
787 pbe->address = (unsigned long)safe_pages; 1208 pbe = chain_alloc(ca, sizeof(struct pbe));
788 safe_pages = safe_pages->next; 1209 if (!pbe) {
789 if (last) 1210 swsusp_free();
790 last->next = pbe; 1211 return NULL;
791 handle->last_pbe = pbe; 1212 }
1213 pbe->orig_address = (unsigned long)page_address(page);
1214 pbe->address = (unsigned long)safe_pages_list;
1215 safe_pages_list = safe_pages_list->next;
1216 pbe->next = restore_pblist;
1217 restore_pblist = pbe;
792 return (void *)pbe->address; 1218 return (void *)pbe->address;
793} 1219}
794 1220
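
Each PBE that get_buffer() chains onto restore_pblist records one pending relocation: a loaded page parked in a "safe" frame plus the original address it must end up at. After the image is fully loaded, the architecture-specific resume code walks the list and performs the copies; in C terms the walk is roughly (a sketch, the real thing is usually done in assembler):

	struct pbe *pbe;

	for (pbe = restore_pblist; pbe; pbe = pbe->next)
		copy_page((void *)pbe->orig_address,	/* destination frame */
			  (void *)pbe->address);	/* "safe" staging copy */
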
@@ -816,46 +1242,60 @@ static void *get_buffer(struct snapshot_handle *handle)
816 1242
817int snapshot_write_next(struct snapshot_handle *handle, size_t count) 1243int snapshot_write_next(struct snapshot_handle *handle, size_t count)
818{ 1244{
1245 static struct chain_allocator ca;
819 int error = 0; 1246 int error = 0;
820 1247
821 if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages) 1248 /* Check if we have already loaded the entire image */
1249 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
822 return 0; 1250 return 0;
1251
823 if (!buffer) { 1252 if (!buffer) {
824 /* This makes the buffer be freed by swsusp_free() */ 1253 /* This makes the buffer be freed by swsusp_free() */
825 buffer = alloc_image_page(GFP_ATOMIC, 0); 1254 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
826 if (!buffer) 1255 if (!buffer)
827 return -ENOMEM; 1256 return -ENOMEM;
828 } 1257 }
829 if (!handle->offset) 1258 if (!handle->offset)
830 handle->buffer = buffer; 1259 handle->buffer = buffer;
831 if (handle->prev < handle->page) { 1260 handle->sync_read = 1;
832 if (!handle->prev) { 1261 if (handle->prev < handle->cur) {
833 error = load_header(handle, (struct swsusp_info *)buffer); 1262 if (handle->prev == 0) {
1263 error = load_header(buffer);
834 if (error) 1264 if (error)
835 return error; 1265 return error;
1266
1267 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
1268 if (error)
1269 return error;
1270
836 } else if (handle->prev <= nr_meta_pages) { 1271 } else if (handle->prev <= nr_meta_pages) {
837 handle->pbe = unpack_orig_addresses(buffer, handle->pbe); 1272 unpack_orig_pfns(buffer, &copy_bm);
838 if (!handle->pbe) { 1273 if (handle->prev == nr_meta_pages) {
839 error = prepare_image(handle); 1274 error = prepare_image(&orig_bm, &copy_bm);
840 if (error) 1275 if (error)
841 return error; 1276 return error;
842 handle->pbe = pagedir_nosave; 1277
843 handle->last_pbe = NULL; 1278 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
844 handle->buffer = get_buffer(handle); 1279 memory_bm_position_reset(&orig_bm);
1280 restore_pblist = NULL;
1281 handle->buffer = get_buffer(&orig_bm, &ca);
1282 handle->sync_read = 0;
1283 if (!handle->buffer)
1284 return -ENOMEM;
845 } 1285 }
846 } else { 1286 } else {
847 handle->pbe = handle->pbe->next; 1287 handle->buffer = get_buffer(&orig_bm, &ca);
848 handle->buffer = get_buffer(handle); 1288 handle->sync_read = 0;
849 } 1289 }
850 handle->prev = handle->page; 1290 handle->prev = handle->cur;
851 } 1291 }
852 handle->buf_offset = handle->page_offset; 1292 handle->buf_offset = handle->cur_offset;
853 if (handle->page_offset + count >= PAGE_SIZE) { 1293 if (handle->cur_offset + count >= PAGE_SIZE) {
854 count = PAGE_SIZE - handle->page_offset; 1294 count = PAGE_SIZE - handle->cur_offset;
855 handle->page_offset = 0; 1295 handle->cur_offset = 0;
856 handle->page++; 1296 handle->cur++;
857 } else { 1297 } else {
858 handle->page_offset += count; 1298 handle->cur_offset += count;
859 } 1299 }
860 handle->offset += count; 1300 handle->offset += count;
861 return count; 1301 return count;
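
Both snapshot_read_next() and snapshot_write_next() share the same handle bookkeeping: cur counts whole image pages, cur_offset tracks the position inside the current page, and offset is the running byte total, so callers may pass arbitrary counts and the handle slices them at page boundaries. The page-at-a-time consumer used by the swap code reduces to the loop below, where read_one_page() is a hypothetical data source standing in for swap_read_page():

	int ret, error = 0;

	while ((ret = snapshot_write_next(&snapshot, PAGE_SIZE)) > 0) {
		error = read_one_page(data_of(snapshot));
		if (error)
			break;
	}
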
@@ -863,6 +1303,13 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
863 1303
864int snapshot_image_loaded(struct snapshot_handle *handle) 1304int snapshot_image_loaded(struct snapshot_handle *handle)
865{ 1305{
866 return !(!handle->pbe || handle->pbe->next || !nr_copy_pages || 1306 return !(!nr_copy_pages ||
867 handle->page <= nr_meta_pages + nr_copy_pages); 1307 handle->cur <= nr_meta_pages + nr_copy_pages);
1308}
1309
1310void snapshot_free_unused_memory(struct snapshot_handle *handle)
1311{
1312 /* Free only if we have loaded the image entirely */
1313 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
1314 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
868} 1315}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index f1dd146bd64d..9b2ee5344dee 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -22,6 +22,7 @@
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/bio.h> 24#include <linux/bio.h>
25#include <linux/blkdev.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/swapops.h> 27#include <linux/swapops.h>
27#include <linux/pm.h> 28#include <linux/pm.h>
@@ -49,18 +50,16 @@ static int mark_swapfiles(swp_entry_t start)
49{ 50{
50 int error; 51 int error;
51 52
52 rw_swap_page_sync(READ, 53 rw_swap_page_sync(READ, swp_entry(root_swap, 0),
53 swp_entry(root_swap, 0), 54 virt_to_page((unsigned long)&swsusp_header), NULL);
54 virt_to_page((unsigned long)&swsusp_header));
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start; 59 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE, 60 error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0),
61 swp_entry(root_swap, 0), 61 virt_to_page((unsigned long)&swsusp_header),
62 virt_to_page((unsigned long) 62 NULL);
63 &swsusp_header));
64 } else { 63 } else {
65 pr_debug("swsusp: Partition is not swap space.\n"); 64 pr_debug("swsusp: Partition is not swap space.\n");
66 error = -ENODEV; 65 error = -ENODEV;
@@ -88,16 +87,37 @@ static int swsusp_swap_check(void) /* This is called before saving image */
88 * write_page - Write one page to given swap location. 87 * write_page - Write one page to given swap location.
89 * @buf: Address we're writing. 88 * @buf: Address we're writing.
90 * @offset: Offset of the swap page we're writing to. 89 * @offset: Offset of the swap page we're writing to.
90 * @bio_chain: Link the next write BIO here
91 */ 91 */
92 92
93static int write_page(void *buf, unsigned long offset) 93static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
94{ 94{
95 swp_entry_t entry; 95 swp_entry_t entry;
96 int error = -ENOSPC; 96 int error = -ENOSPC;
97 97
98 if (offset) { 98 if (offset) {
99 struct page *page = virt_to_page(buf);
100
101 if (bio_chain) {
102 /*
103 * Whether or not we successfully allocated a copy page,
104 * we take a ref on the page here. It gets undone in
105 * wait_on_bio_chain().
106 */
107 struct page *page_copy;
108 page_copy = alloc_page(GFP_ATOMIC);
109 if (page_copy == NULL) {
110 WARN_ON_ONCE(1);
111 bio_chain = NULL; /* Go synchronous */
112 get_page(page);
113 } else {
114 memcpy(page_address(page_copy),
115 page_address(page), PAGE_SIZE);
116 page = page_copy;
117 }
118 }
99 entry = swp_entry(root_swap, offset); 119 entry = swp_entry(root_swap, offset);
100 error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf)); 120 error = rw_swap_page_sync(WRITE, entry, page, bio_chain);
101 } 121 }
102 return error; 122 return error;
103} 123}
@@ -146,6 +166,26 @@ static void release_swap_writer(struct swap_map_handle *handle)
146 handle->bitmap = NULL; 166 handle->bitmap = NULL;
147} 167}
148 168
169static void show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
185 centisecs / 100, centisecs % 100,
186 kps / 1000, (kps % 1000) / 10);
187}
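
To make show_speed()'s arithmetic concrete: writing 25000 pages of 4 KiB gives k = 25000 * 4 = 100000 kbytes; if that takes 20 seconds, elapsed_centisecs64 is 2000, kps = (100000 * 100) / 2000 = 5000, and the message reads "Wrote 100000 kbytes in 20.00 seconds (5.00 MB/s)". The floor of one centisecond only guards against dividing by zero; transfers shorter than 10 ms simply report an inflated rate.
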
188
149static int get_swap_writer(struct swap_map_handle *handle) 189static int get_swap_writer(struct swap_map_handle *handle)
150{ 190{
151 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 191 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -165,37 +205,70 @@ static int get_swap_writer(struct swap_map_handle *handle)
165 return 0; 205 return 0;
166} 206}
167 207
168static int swap_write_page(struct swap_map_handle *handle, void *buf) 208static int wait_on_bio_chain(struct bio **bio_chain)
169{ 209{
170 int error; 210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235
236static int swap_write_page(struct swap_map_handle *handle, void *buf,
237 struct bio **bio_chain)
238{
239 int error = 0;
171 unsigned long offset; 240 unsigned long offset;
172 241
173 if (!handle->cur) 242 if (!handle->cur)
174 return -EINVAL; 243 return -EINVAL;
175 offset = alloc_swap_page(root_swap, handle->bitmap); 244 offset = alloc_swap_page(root_swap, handle->bitmap);
176 error = write_page(buf, offset); 245 error = write_page(buf, offset, bio_chain);
177 if (error) 246 if (error)
178 return error; 247 return error;
179 handle->cur->entries[handle->k++] = offset; 248 handle->cur->entries[handle->k++] = offset;
180 if (handle->k >= MAP_PAGE_ENTRIES) { 249 if (handle->k >= MAP_PAGE_ENTRIES) {
250 error = wait_on_bio_chain(bio_chain);
251 if (error)
252 goto out;
181 offset = alloc_swap_page(root_swap, handle->bitmap); 253 offset = alloc_swap_page(root_swap, handle->bitmap);
182 if (!offset) 254 if (!offset)
183 return -ENOSPC; 255 return -ENOSPC;
184 handle->cur->next_swap = offset; 256 handle->cur->next_swap = offset;
185 error = write_page(handle->cur, handle->cur_swap); 257 error = write_page(handle->cur, handle->cur_swap, NULL);
186 if (error) 258 if (error)
187 return error; 259 goto out;
188 memset(handle->cur, 0, PAGE_SIZE); 260 memset(handle->cur, 0, PAGE_SIZE);
189 handle->cur_swap = offset; 261 handle->cur_swap = offset;
190 handle->k = 0; 262 handle->k = 0;
191 } 263 }
192 return 0; 264out:
265 return error;
193} 266}
194 267
195static int flush_swap_writer(struct swap_map_handle *handle) 268static int flush_swap_writer(struct swap_map_handle *handle)
196{ 269{
197 if (handle->cur && handle->cur_swap) 270 if (handle->cur && handle->cur_swap)
198 return write_page(handle->cur, handle->cur_swap); 271 return write_page(handle->cur, handle->cur_swap, NULL);
199 else 272 else
200 return -EINVAL; 273 return -EINVAL;
201} 274}
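
For reference, the on-disk index that swap_write_page() fills is a singly linked list of map pages: each one holds MAP_PAGE_ENTRIES swap offsets plus the offset of the next map page. The declaration lives in kernel/power/power.h and, at this point in the series, should look roughly like this (a sketch inferred from the surrounding code, not text from this patch):

#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(long) - 1)

struct swap_map_page {
	unsigned long	entries[MAP_PAGE_ENTRIES];
	unsigned long	next_swap;
};
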
@@ -206,21 +279,29 @@ static int flush_swap_writer(struct swap_map_handle *handle)
206 279
207static int save_image(struct swap_map_handle *handle, 280static int save_image(struct swap_map_handle *handle,
208 struct snapshot_handle *snapshot, 281 struct snapshot_handle *snapshot,
209 unsigned int nr_pages) 282 unsigned int nr_to_write)
210{ 283{
211 unsigned int m; 284 unsigned int m;
212 int ret; 285 int ret;
213 int error = 0; 286 int error = 0;
287 int nr_pages;
288 int err2;
289 struct bio *bio;
290 struct timeval start;
291 struct timeval stop;
214 292
215 printk("Saving image data pages (%u pages) ... ", nr_pages); 293 printk("Saving image data pages (%u pages) ... ", nr_to_write);
216 m = nr_pages / 100; 294 m = nr_to_write / 100;
217 if (!m) 295 if (!m)
218 m = 1; 296 m = 1;
219 nr_pages = 0; 297 nr_pages = 0;
298 bio = NULL;
299 do_gettimeofday(&start);
220 do { 300 do {
221 ret = snapshot_read_next(snapshot, PAGE_SIZE); 301 ret = snapshot_read_next(snapshot, PAGE_SIZE);
222 if (ret > 0) { 302 if (ret > 0) {
223 error = swap_write_page(handle, data_of(*snapshot)); 303 error = swap_write_page(handle, data_of(*snapshot),
304 &bio);
224 if (error) 305 if (error)
225 break; 306 break;
226 if (!(nr_pages % m)) 307 if (!(nr_pages % m))
@@ -228,8 +309,13 @@ static int save_image(struct swap_map_handle *handle,
228 nr_pages++; 309 nr_pages++;
229 } 310 }
230 } while (ret > 0); 311 } while (ret > 0);
312 err2 = wait_on_bio_chain(&bio);
313 do_gettimeofday(&stop);
314 if (!error)
315 error = err2;
231 if (!error) 316 if (!error)
232 printk("\b\b\b\bdone\n"); 317 printk("\b\b\b\bdone\n");
318 show_speed(&start, &stop, nr_to_write, "Wrote");
233 return error; 319 return error;
234} 320}
235 321
@@ -245,8 +331,7 @@ static int enough_swap(unsigned int nr_pages)
245 unsigned int free_swap = count_swap_pages(root_swap, 1); 331 unsigned int free_swap = count_swap_pages(root_swap, 1);
246 332
247 pr_debug("swsusp: free swap pages: %u\n", free_swap); 333 pr_debug("swsusp: free swap pages: %u\n", free_swap);
248 return free_swap > (nr_pages + PAGES_FOR_IO + 334 return free_swap > nr_pages + PAGES_FOR_IO;
249 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
250} 335}
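
The term dropped from enough_swap() is the swap needed for the old on-disk PBE metadata, ceil(nr_pages / PBES_PER_PAGE) pages. The pfns now travel in the meta pages already counted in the image header, so the estimate simplifies to nr_pages + PAGES_FOR_IO; for a 25000-page image with PBES_PER_PAGE = 341 (an illustrative 32-bit figure) that is 74 pages of PBE metadata that no longer need to be reserved up front.
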
251 336
252/** 337/**
@@ -266,7 +351,8 @@ int swsusp_write(void)
266 int error; 351 int error;
267 352
268 if ((error = swsusp_swap_check())) { 353 if ((error = swsusp_swap_check())) {
269 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); 354 printk(KERN_ERR "swsusp: Cannot find swap device, try "
355 "swapon -a.\n");
270 return error; 356 return error;
271 } 357 }
272 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 358 memset(&snapshot, 0, sizeof(struct snapshot_handle));
@@ -281,7 +367,7 @@ int swsusp_write(void)
281 error = get_swap_writer(&handle); 367 error = get_swap_writer(&handle);
282 if (!error) { 368 if (!error) {
283 unsigned long start = handle.cur_swap; 369 unsigned long start = handle.cur_swap;
284 error = swap_write_page(&handle, header); 370 error = swap_write_page(&handle, header, NULL);
285 if (!error) 371 if (!error)
286 error = save_image(&handle, &snapshot, 372 error = save_image(&handle, &snapshot,
287 header->pages - 1); 373 header->pages - 1);
@@ -298,27 +384,6 @@ int swsusp_write(void)
298 return error; 384 return error;
299} 385}
300 386
301/*
302 * Using bio to read from swap.
303 * This code requires a bit more work than just using buffer heads
304 * but, it is the recommended way for 2.5/2.6.
305 * The following are to signal the beginning and end of I/O. Bios
306 * finish asynchronously, while we want them to happen synchronously.
307 * A simple atomic_t, and a wait loop take care of this problem.
308 */
309
310static atomic_t io_done = ATOMIC_INIT(0);
311
312static int end_io(struct bio *bio, unsigned int num, int err)
313{
314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
315 printk(KERN_ERR "I/O error reading swsusp image.\n");
316 return -EIO;
317 }
318 atomic_set(&io_done, 0);
319 return 0;
320}
321
322static struct block_device *resume_bdev; 387static struct block_device *resume_bdev;
323 388
324/** 389/**
@@ -326,15 +391,15 @@ static struct block_device *resume_bdev;
326 * @rw: READ or WRITE. 391 * @rw: READ or WRITE.
327 * @off physical offset of page. 392 * @off physical offset of page.
328 * @page: page we're reading or writing. 393 * @page: page we're reading or writing.
 394 * @bio_chain: list of pending bios (for async reading)
329 * 395 *
330 * Straight from the textbook - allocate and initialize the bio. 396 * Straight from the textbook - allocate and initialize the bio.
331 * If we're writing, make sure the page is marked as dirty. 397 * If we're reading, make sure the page is marked as dirty.
332 * Then submit it and wait. 398 * Then submit it and, if @bio_chain == NULL, wait.
333 */ 399 */
334 400static int submit(int rw, pgoff_t page_off, struct page *page,
335static int submit(int rw, pgoff_t page_off, void *page) 401 struct bio **bio_chain)
336{ 402{
337 int error = 0;
338 struct bio *bio; 403 struct bio *bio;
339 404
340 bio = bio_alloc(GFP_ATOMIC, 1); 405 bio = bio_alloc(GFP_ATOMIC, 1);
@@ -342,33 +407,40 @@ static int submit(int rw, pgoff_t page_off, void *page)
342 return -ENOMEM; 407 return -ENOMEM;
343 bio->bi_sector = page_off * (PAGE_SIZE >> 9); 408 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
344 bio->bi_bdev = resume_bdev; 409 bio->bi_bdev = resume_bdev;
345 bio->bi_end_io = end_io; 410 bio->bi_end_io = end_swap_bio_read;
346 411
347 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { 412 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
348 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); 413 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
349 error = -EFAULT; 414 bio_put(bio);
350 goto Done; 415 return -EFAULT;
351 } 416 }
352 417
353 atomic_set(&io_done, 1); 418 lock_page(page);
354 submit_bio(rw | (1 << BIO_RW_SYNC), bio); 419 bio_get(bio);
355 while (atomic_read(&io_done)) 420
356 yield(); 421 if (bio_chain == NULL) {
357 if (rw == READ) 422 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
358 bio_set_pages_dirty(bio); 423 wait_on_page_locked(page);
359 Done: 424 if (rw == READ)
360 bio_put(bio); 425 bio_set_pages_dirty(bio);
361 return error; 426 bio_put(bio);
427 } else {
428 get_page(page);
429 bio->bi_private = *bio_chain;
430 *bio_chain = bio;
431 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
432 }
433 return 0;
362} 434}
363 435
364static int bio_read_page(pgoff_t page_off, void *page) 436static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
365{ 437{
366 return submit(READ, page_off, page); 438 return submit(READ, page_off, virt_to_page(addr), bio_chain);
367} 439}
368 440
369static int bio_write_page(pgoff_t page_off, void *page) 441static int bio_write_page(pgoff_t page_off, void *addr)
370{ 442{
371 return submit(WRITE, page_off, page); 443 return submit(WRITE, page_off, virt_to_page(addr), NULL);
372} 444}
373 445
374/** 446/**
@@ -393,7 +465,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
393 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 465 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
394 if (!handle->cur) 466 if (!handle->cur)
395 return -ENOMEM; 467 return -ENOMEM;
396 error = bio_read_page(swp_offset(start), handle->cur); 468 error = bio_read_page(swp_offset(start), handle->cur, NULL);
397 if (error) { 469 if (error) {
398 release_swap_reader(handle); 470 release_swap_reader(handle);
399 return error; 471 return error;
@@ -402,7 +474,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
402 return 0; 474 return 0;
403} 475}
404 476
405static int swap_read_page(struct swap_map_handle *handle, void *buf) 477static int swap_read_page(struct swap_map_handle *handle, void *buf,
478 struct bio **bio_chain)
406{ 479{
407 unsigned long offset; 480 unsigned long offset;
408 int error; 481 int error;
@@ -412,16 +485,17 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
412 offset = handle->cur->entries[handle->k]; 485 offset = handle->cur->entries[handle->k];
413 if (!offset) 486 if (!offset)
414 return -EFAULT; 487 return -EFAULT;
415 error = bio_read_page(offset, buf); 488 error = bio_read_page(offset, buf, bio_chain);
416 if (error) 489 if (error)
417 return error; 490 return error;
418 if (++handle->k >= MAP_PAGE_ENTRIES) { 491 if (++handle->k >= MAP_PAGE_ENTRIES) {
492 error = wait_on_bio_chain(bio_chain);
419 handle->k = 0; 493 handle->k = 0;
420 offset = handle->cur->next_swap; 494 offset = handle->cur->next_swap;
421 if (!offset) 495 if (!offset)
422 release_swap_reader(handle); 496 release_swap_reader(handle);
423 else 497 else if (!error)
424 error = bio_read_page(offset, handle->cur); 498 error = bio_read_page(offset, handle->cur, NULL);
425 } 499 }
426 return error; 500 return error;
427} 501}
@@ -434,33 +508,49 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
434 508
435static int load_image(struct swap_map_handle *handle, 509static int load_image(struct swap_map_handle *handle,
436 struct snapshot_handle *snapshot, 510 struct snapshot_handle *snapshot,
437 unsigned int nr_pages) 511 unsigned int nr_to_read)
438{ 512{
439 unsigned int m; 513 unsigned int m;
440 int ret;
441 int error = 0; 514 int error = 0;
515 struct timeval start;
516 struct timeval stop;
517 struct bio *bio;
518 int err2;
519 unsigned nr_pages;
442 520
443 printk("Loading image data pages (%u pages) ... ", nr_pages); 521 printk("Loading image data pages (%u pages) ... ", nr_to_read);
444 m = nr_pages / 100; 522 m = nr_to_read / 100;
445 if (!m) 523 if (!m)
446 m = 1; 524 m = 1;
447 nr_pages = 0; 525 nr_pages = 0;
448 do { 526 bio = NULL;
449 ret = snapshot_write_next(snapshot, PAGE_SIZE); 527 do_gettimeofday(&start);
450 if (ret > 0) { 528 for ( ; ; ) {
451 error = swap_read_page(handle, data_of(*snapshot)); 529 error = snapshot_write_next(snapshot, PAGE_SIZE);
452 if (error) 530 if (error <= 0)
453 break; 531 break;
454 if (!(nr_pages % m)) 532 error = swap_read_page(handle, data_of(*snapshot), &bio);
455 printk("\b\b\b\b%3d%%", nr_pages / m); 533 if (error)
456 nr_pages++; 534 break;
457 } 535 if (snapshot->sync_read)
458 } while (ret > 0); 536 error = wait_on_bio_chain(&bio);
537 if (error)
538 break;
539 if (!(nr_pages % m))
540 printk("\b\b\b\b%3d%%", nr_pages / m);
541 nr_pages++;
542 }
543 err2 = wait_on_bio_chain(&bio);
544 do_gettimeofday(&stop);
545 if (!error)
546 error = err2;
459 if (!error) { 547 if (!error) {
460 printk("\b\b\b\bdone\n"); 548 printk("\b\b\b\bdone\n");
549 snapshot_free_unused_memory(snapshot);
461 if (!snapshot_image_loaded(snapshot)) 550 if (!snapshot_image_loaded(snapshot))
462 error = -ENODATA; 551 error = -ENODATA;
463 } 552 }
553 show_speed(&start, &stop, nr_to_read, "Read");
464 return error; 554 return error;
465} 555}
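
The sync_read flag is the handshake that makes this loop safe: snapshot_write_next() leaves it set whenever the page just handed out is consumed internally on the very next call (the header feeding load_header() and the pfn pages feeding unpack_orig_pfns()), forcing load_image() to drain the BIO chain first. Plain data pages are only touched at resume time, so their reads may stay in flight until the next map-page boundary or the final wait_on_bio_chain().
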
466 556
@@ -483,7 +573,7 @@ int swsusp_read(void)
483 header = (struct swsusp_info *)data_of(snapshot); 573 header = (struct swsusp_info *)data_of(snapshot);
484 error = get_swap_reader(&handle, swsusp_header.image); 574 error = get_swap_reader(&handle, swsusp_header.image);
485 if (!error) 575 if (!error)
486 error = swap_read_page(&handle, header); 576 error = swap_read_page(&handle, header, NULL);
487 if (!error) 577 if (!error)
488 error = load_image(&handle, &snapshot, header->pages - 1); 578 error = load_image(&handle, &snapshot, header->pages - 1);
489 release_swap_reader(&handle); 579 release_swap_reader(&handle);
@@ -509,7 +599,7 @@ int swsusp_check(void)
509 if (!IS_ERR(resume_bdev)) { 599 if (!IS_ERR(resume_bdev)) {
510 set_blocksize(resume_bdev, PAGE_SIZE); 600 set_blocksize(resume_bdev, PAGE_SIZE);
511 memset(&swsusp_header, 0, sizeof(swsusp_header)); 601 memset(&swsusp_header, 0, sizeof(swsusp_header));
512 if ((error = bio_read_page(0, &swsusp_header))) 602 if ((error = bio_read_page(0, &swsusp_header, NULL)))
513 return error; 603 return error;
514 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 604 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
515 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 605 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 17f669c83012..0b66659dc516 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -193,14 +193,13 @@ int swsusp_shrink_memory(void)
193 printk("Shrinking memory... "); 193 printk("Shrinking memory... ");
194 do { 194 do {
195 size = 2 * count_highmem_pages(); 195 size = 2 * count_highmem_pages();
196 size += size / 50 + count_data_pages(); 196 size += size / 50 + count_data_pages() + PAGES_FOR_IO;
197 size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
198 PAGES_FOR_IO;
199 tmp = size; 197 tmp = size;
200 for_each_zone (zone) 198 for_each_zone (zone)
201 if (!is_highmem(zone) && populated_zone(zone)) { 199 if (!is_highmem(zone) && populated_zone(zone)) {
202 tmp -= zone->free_pages; 200 tmp -= zone->free_pages;
203 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 201 tmp += zone->lowmem_reserve[ZONE_NORMAL];
202 tmp += snapshot_additional_pages(zone);
204 } 203 }
205 if (tmp > 0) { 204 if (tmp > 0) {
206 tmp = __shrink_memory(tmp); 205 tmp = __shrink_memory(tmp);
@@ -248,6 +247,9 @@ int swsusp_suspend(void)
248 restore_processor_state(); 247 restore_processor_state();
249Restore_highmem: 248Restore_highmem:
250 restore_highmem(); 249 restore_highmem();
250 /* NOTE: device_power_up() is just a resume() for devices
251 * that suspended with irqs off ... no overall powerup.
252 */
251 device_power_up(); 253 device_power_up();
252Enable_irqs: 254Enable_irqs:
253 local_irq_enable(); 255 local_irq_enable();
@@ -257,8 +259,12 @@ Enable_irqs:
257int swsusp_resume(void) 259int swsusp_resume(void)
258{ 260{
259 int error; 261 int error;
262
260 local_irq_disable(); 263 local_irq_disable();
261 if (device_power_down(PMSG_FREEZE)) 264 /* NOTE: device_power_down() is just a suspend() with irqs off;
265 * it has no special "power things down" semantics
266 */
267 if (device_power_down(PMSG_PRETHAW))
262 printk(KERN_ERR "Some devices failed to power down, very bad\n"); 268 printk(KERN_ERR "Some devices failed to power down, very bad\n");
263 /* We'll ignore saved state, but this gets preempt count (etc) right */ 269 /* We'll ignore saved state, but this gets preempt count (etc) right */
264 save_processor_state(); 270 save_processor_state();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3f1539fbe48a..72825c853cd7 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -19,6 +19,7 @@
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/cpu.h>
22 23
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24 25
@@ -139,12 +140,15 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
139 if (data->frozen) 140 if (data->frozen)
140 break; 141 break;
141 down(&pm_sem); 142 down(&pm_sem);
142 disable_nonboot_cpus(); 143 error = disable_nonboot_cpus();
143 if (freeze_processes()) { 144 if (!error) {
144 thaw_processes(); 145 error = freeze_processes();
145 enable_nonboot_cpus(); 146 if (error) {
146 error = -EBUSY; 147 thaw_processes();
148 error = -EBUSY;
149 }
147 } 150 }
151 enable_nonboot_cpus();
148 up(&pm_sem); 152 up(&pm_sem);
149 if (!error) 153 if (!error)
150 data->frozen = 1; 154 data->frozen = 1;
@@ -189,9 +193,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
189 error = -EPERM; 193 error = -EPERM;
190 break; 194 break;
191 } 195 }
196 snapshot_free_unused_memory(&data->handle);
192 down(&pm_sem); 197 down(&pm_sem);
193 pm_prepare_console(); 198 pm_prepare_console();
194 error = device_suspend(PMSG_FREEZE); 199 error = device_suspend(PMSG_PRETHAW);
195 if (!error) { 200 if (!error) {
196 error = swsusp_resume(); 201 error = swsusp_resume();
197 device_resume(); 202 device_resume();
diff --git a/kernel/printk.c b/kernel/printk.c
index 1149365e989e..771f5e861bcd 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -721,6 +721,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
721 return 0; 721 return 0;
722} 722}
723 723
724#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
724/** 725/**
725 * suspend_console - suspend the console subsystem 726 * suspend_console - suspend the console subsystem
726 * 727 *
@@ -728,6 +729,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
728 */ 729 */
729void suspend_console(void) 730void suspend_console(void)
730{ 731{
732 printk("Suspending console(s)\n");
731 acquire_console_sem(); 733 acquire_console_sem();
732 console_suspended = 1; 734 console_suspended = 1;
733} 735}
@@ -737,6 +739,7 @@ void resume_console(void)
737 console_suspended = 0; 739 console_suspended = 0;
738 release_console_sem(); 740 release_console_sem();
739} 741}
742#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */
740 743
741/** 744/**
742 * acquire_console_sem - lock the console system for exclusive use. 745 * acquire_console_sem - lock the console system for exclusive use.
diff --git a/kernel/profile.c b/kernel/profile.c
index d5bd75e7501c..fb660c7d35ba 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -309,13 +309,17 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
309 node = cpu_to_node(cpu); 309 node = cpu_to_node(cpu);
310 per_cpu(cpu_profile_flip, cpu) = 0; 310 per_cpu(cpu_profile_flip, cpu) = 0;
311 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 311 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
312 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 312 page = alloc_pages_node(node,
313 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
314 0);
313 if (!page) 315 if (!page)
314 return NOTIFY_BAD; 316 return NOTIFY_BAD;
315 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 317 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
316 } 318 }
317 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 319 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
318 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 320 page = alloc_pages_node(node,
321 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
322 0);
319 if (!page) 323 if (!page)
320 goto out_free; 324 goto out_free;
321 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); 325 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
@@ -491,12 +495,16 @@ static int __init create_hash_tables(void)
491 int node = cpu_to_node(cpu); 495 int node = cpu_to_node(cpu);
492 struct page *page; 496 struct page *page;
493 497
494 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 498 page = alloc_pages_node(node,
499 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
500 0);
495 if (!page) 501 if (!page)
496 goto out_cleanup; 502 goto out_cleanup;
497 per_cpu(cpu_profile_hits, cpu)[1] 503 per_cpu(cpu_profile_hits, cpu)[1]
498 = (struct profile_hit *)page_address(page); 504 = (struct profile_hit *)page_address(page);
499 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 505 page = alloc_pages_node(node,
506 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
507 0);
500 if (!page) 508 if (!page)
501 goto out_cleanup; 509 goto out_cleanup;
502 per_cpu(cpu_profile_hits, cpu)[0] 510 per_cpu(cpu_profile_hits, cpu)[0]
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 9a111f70145c..4d50e06fd745 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -241,60 +241,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
241 return 0; 241 return 0;
242} 242}
243 243
244/*
245 * Access another process' address space.
246 * Source/target buffer must be kernel space,
247 * Do not walk the page table directly, use get_user_pages
248 */
249
250int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
251{
252 struct mm_struct *mm;
253 struct vm_area_struct *vma;
254 struct page *page;
255 void *old_buf = buf;
256
257 mm = get_task_mm(tsk);
258 if (!mm)
259 return 0;
260
261 down_read(&mm->mmap_sem);
262	/* ignore errors, just check how much was successfully transferred */
263 while (len) {
264 int bytes, ret, offset;
265 void *maddr;
266
267 ret = get_user_pages(tsk, mm, addr, 1,
268 write, 1, &page, &vma);
269 if (ret <= 0)
270 break;
271
272 bytes = len;
273 offset = addr & (PAGE_SIZE-1);
274 if (bytes > PAGE_SIZE-offset)
275 bytes = PAGE_SIZE-offset;
276
277 maddr = kmap(page);
278 if (write) {
279 copy_to_user_page(vma, page, addr,
280 maddr + offset, buf, bytes);
281 set_page_dirty_lock(page);
282 } else {
283 copy_from_user_page(vma, page, addr,
284 buf, maddr + offset, bytes);
285 }
286 kunmap(page);
287 page_cache_release(page);
288 len -= bytes;
289 buf += bytes;
290 addr += bytes;
291 }
292 up_read(&mm->mmap_sem);
293 mmput(mm);
294
295 return buf - old_buf;
296}
297
298int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 244int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
299{ 245{
300 int copied = 0; 246 int copied = 0;
@@ -494,6 +440,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
494 child = find_task_by_pid(pid); 440 child = find_task_by_pid(pid);
495 if (child) 441 if (child)
496 get_task_struct(child); 442 get_task_struct(child);
443
497 read_unlock(&tasklist_lock); 444 read_unlock(&tasklist_lock);
498 if (!child) 445 if (!child)
499 return ERR_PTR(-ESRCH); 446 return ERR_PTR(-ESRCH);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 4d1c3d247127..4f2c4272d59c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -192,13 +192,13 @@ static struct rcu_torture_ops *cur_ops = NULL;
192 * Definitions for rcu torture testing. 192 * Definitions for rcu torture testing.
193 */ 193 */
194 194
195static int rcu_torture_read_lock(void) 195static int rcu_torture_read_lock(void) __acquires(RCU)
196{ 196{
197 rcu_read_lock(); 197 rcu_read_lock();
198 return 0; 198 return 0;
199} 199}
200 200
201static void rcu_torture_read_unlock(int idx) 201static void rcu_torture_read_unlock(int idx) __releases(RCU)
202{ 202{
203 rcu_read_unlock(); 203 rcu_read_unlock();
204} 204}
@@ -250,13 +250,13 @@ static struct rcu_torture_ops rcu_ops = {
250 * Definitions for rcu_bh torture testing. 250 * Definitions for rcu_bh torture testing.
251 */ 251 */
252 252
253static int rcu_bh_torture_read_lock(void) 253static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
254{ 254{
255 rcu_read_lock_bh(); 255 rcu_read_lock_bh();
256 return 0; 256 return 0;
257} 257}
258 258
259static void rcu_bh_torture_read_unlock(int idx) 259static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
260{ 260{
261 rcu_read_unlock_bh(); 261 rcu_read_unlock_bh();
262} 262}
diff --git a/kernel/relay.c b/kernel/relay.c
index 33345e73485c..1d63ecddfa70 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -95,7 +95,7 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
95 * @buf: the buffer struct 95 * @buf: the buffer struct
96 * @size: total size of the buffer 96 * @size: total size of the buffer
97 * 97 *
98 * Returns a pointer to the resulting buffer, NULL if unsuccessful. The 98 * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
99 * passed in size will get page aligned, if it isn't already. 99 * passed in size will get page aligned, if it isn't already.
100 */ 100 */
101static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) 101static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
@@ -132,10 +132,9 @@ depopulate:
132 132
133/** 133/**
134 * relay_create_buf - allocate and initialize a channel buffer 134 * relay_create_buf - allocate and initialize a channel buffer
135 * @alloc_size: size of the buffer to allocate 135 * @chan: the relay channel
136 * @n_subbufs: number of sub-buffers in the channel
137 * 136 *
138 * Returns channel buffer if successful, NULL otherwise 137 * Returns channel buffer if successful, %NULL otherwise.
139 */ 138 */
140struct rchan_buf *relay_create_buf(struct rchan *chan) 139struct rchan_buf *relay_create_buf(struct rchan *chan)
141{ 140{
@@ -163,6 +162,7 @@ free_buf:
163 162
164/** 163/**
165 * relay_destroy_channel - free the channel struct 164 * relay_destroy_channel - free the channel struct
165 * @kref: target kernel reference that contains the relay channel
166 * 166 *
167 * Should only be called from kref_put(). 167 * Should only be called from kref_put().
168 */ 168 */
@@ -194,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf)
194 194
195/** 195/**
196 * relay_remove_buf - remove a channel buffer 196 * relay_remove_buf - remove a channel buffer
197 * @kref: target kernel reference that contains the relay buffer
197 * 198 *
198 * Removes the file from the filesystem, which also frees the 199 * Removes the file from the filesystem, which also frees the
199 * rchan_buf_struct and the channel buffer. Should only be called from 200 * rchan_buf_struct and the channel buffer. Should only be called from
@@ -374,7 +375,7 @@ void relay_reset(struct rchan *chan)
374} 375}
375EXPORT_SYMBOL_GPL(relay_reset); 376EXPORT_SYMBOL_GPL(relay_reset);
376 377
377/** 378/*
378 * relay_open_buf - create a new relay channel buffer 379 * relay_open_buf - create a new relay channel buffer
379 * 380 *
380 * Internal - used by relay_open(). 381 * Internal - used by relay_open().
@@ -448,12 +449,12 @@ static inline void setup_callbacks(struct rchan *chan,
448/** 449/**
449 * relay_open - create a new relay channel 450 * relay_open - create a new relay channel
450 * @base_filename: base name of files to create 451 * @base_filename: base name of files to create
451 * @parent: dentry of parent directory, NULL for root directory 452 * @parent: dentry of parent directory, %NULL for root directory
452 * @subbuf_size: size of sub-buffers 453 * @subbuf_size: size of sub-buffers
453 * @n_subbufs: number of sub-buffers 454 * @n_subbufs: number of sub-buffers
454 * @cb: client callback functions 455 * @cb: client callback functions
455 * 456 *
456 * Returns channel pointer if successful, NULL otherwise. 457 * Returns channel pointer if successful, %NULL otherwise.
457 * 458 *
458 * Creates a channel buffer for each cpu using the sizes and 459 * Creates a channel buffer for each cpu using the sizes and
459 * attributes specified. The created channel buffer files 460 * attributes specified. The created channel buffer files
@@ -585,7 +586,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf);
585 * subbufs_consumed should be the number of sub-buffers newly consumed, 586 * subbufs_consumed should be the number of sub-buffers newly consumed,
586 * not the total consumed. 587 * not the total consumed.
587 * 588 *
588 * NOTE: kernel clients don't need to call this function if the channel 589 * NOTE: Kernel clients don't need to call this function if the channel
589 * mode is 'overwrite'. 590 * mode is 'overwrite'.
590 */ 591 */
591void relay_subbufs_consumed(struct rchan *chan, 592void relay_subbufs_consumed(struct rchan *chan,
@@ -641,7 +642,7 @@ EXPORT_SYMBOL_GPL(relay_close);
641 * relay_flush - close the channel 642 * relay_flush - close the channel
642 * @chan: the channel 643 * @chan: the channel
643 * 644 *
644 * Flushes all channel buffers i.e. forces buffer switch. 645 * Flushes all channel buffers, i.e. forces buffer switch.
645 */ 646 */
646void relay_flush(struct rchan *chan) 647void relay_flush(struct rchan *chan)
647{ 648{
@@ -669,7 +670,7 @@ EXPORT_SYMBOL_GPL(relay_flush);
669 */ 670 */
670static int relay_file_open(struct inode *inode, struct file *filp) 671static int relay_file_open(struct inode *inode, struct file *filp)
671{ 672{
672 struct rchan_buf *buf = inode->u.generic_ip; 673 struct rchan_buf *buf = inode->i_private;
673 kref_get(&buf->kref); 674 kref_get(&buf->kref);
674 filp->private_data = buf; 675 filp->private_data = buf;
675 676
@@ -729,7 +730,7 @@ static int relay_file_release(struct inode *inode, struct file *filp)
729 return 0; 730 return 0;
730} 731}
731 732
732/** 733/*
733 * relay_file_read_consume - update the consumed count for the buffer 734 * relay_file_read_consume - update the consumed count for the buffer
734 */ 735 */
735static void relay_file_read_consume(struct rchan_buf *buf, 736static void relay_file_read_consume(struct rchan_buf *buf,
@@ -756,7 +757,7 @@ static void relay_file_read_consume(struct rchan_buf *buf,
756 } 757 }
757} 758}
758 759
759/** 760/*
760 * relay_file_read_avail - boolean, are there unconsumed bytes available? 761 * relay_file_read_avail - boolean, are there unconsumed bytes available?
761 */ 762 */
762static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) 763static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
@@ -793,6 +794,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
793 794
794/** 795/**
795 * relay_file_read_subbuf_avail - return bytes available in sub-buffer 796 * relay_file_read_subbuf_avail - return bytes available in sub-buffer
797 * @read_pos: file read position
798 * @buf: relay channel buffer
796 */ 799 */
797static size_t relay_file_read_subbuf_avail(size_t read_pos, 800static size_t relay_file_read_subbuf_avail(size_t read_pos,
798 struct rchan_buf *buf) 801 struct rchan_buf *buf)
@@ -818,6 +821,8 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
818 821
819/** 822/**
820 * relay_file_read_start_pos - find the first available byte to read 823 * relay_file_read_start_pos - find the first available byte to read
824 * @read_pos: file read position
825 * @buf: relay channel buffer
821 * 826 *
822 * If the read_pos is in the middle of padding, return the 827 * If the read_pos is in the middle of padding, return the
823 * position of the first actually available byte, otherwise 828 * position of the first actually available byte, otherwise
@@ -844,6 +849,9 @@ static size_t relay_file_read_start_pos(size_t read_pos,
844 849
845/** 850/**
846 * relay_file_read_end_pos - return the new read position 851 * relay_file_read_end_pos - return the new read position
852 * @read_pos: file read position
853 * @buf: relay channel buffer
854 * @count: number of bytes to be read
847 */ 855 */
848static size_t relay_file_read_end_pos(struct rchan_buf *buf, 856static size_t relay_file_read_end_pos(struct rchan_buf *buf,
849 size_t read_pos, 857 size_t read_pos,
@@ -865,7 +873,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
865 return end_pos; 873 return end_pos;
866} 874}
867 875
868/** 876/*
869 * subbuf_read_actor - read up to one subbuf's worth of data 877 * subbuf_read_actor - read up to one subbuf's worth of data
870 */ 878 */
871static int subbuf_read_actor(size_t read_start, 879static int subbuf_read_actor(size_t read_start,
@@ -890,7 +898,7 @@ static int subbuf_read_actor(size_t read_start,
890 return ret; 898 return ret;
891} 899}
892 900
893/** 901/*
894 * subbuf_send_actor - send up to one subbuf's worth of data 902 * subbuf_send_actor - send up to one subbuf's worth of data
895 */ 903 */
896static int subbuf_send_actor(size_t read_start, 904static int subbuf_send_actor(size_t read_start,
@@ -933,7 +941,7 @@ typedef int (*subbuf_actor_t) (size_t read_start,
933 read_descriptor_t *desc, 941 read_descriptor_t *desc,
934 read_actor_t actor); 942 read_actor_t actor);
935 943
936/** 944/*
937 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries 945 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
938 */ 946 */
939static inline ssize_t relay_file_read_subbufs(struct file *filp, 947static inline ssize_t relay_file_read_subbufs(struct file *filp,
diff --git a/kernel/resource.c b/kernel/resource.c
index 46286434af80..9db38a1a7520 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -344,12 +344,11 @@ EXPORT_SYMBOL(allocate_resource);
344 * 344 *
345 * Returns 0 on success, -EBUSY if the resource can't be inserted. 345 * Returns 0 on success, -EBUSY if the resource can't be inserted.
346 * 346 *
347 * This function is equivalent of request_resource when no conflict 347 * This function is equivalent to request_resource when no conflict
348 * happens. If a conflict happens, and the conflicting resources 348 * happens. If a conflict happens, and the conflicting resources
349 * entirely fit within the range of the new resource, then the new 349 * entirely fit within the range of the new resource, then the new
350 * resource is inserted and the conflicting resources become childs of 350 * resource is inserted and the conflicting resources become children of
351 * the new resource. Otherwise the new resource becomes the child of 351 * the new resource.
352 * the conflicting resource
353 */ 352 */
354int insert_resource(struct resource *parent, struct resource *new) 353int insert_resource(struct resource *parent, struct resource *new)
355{ 354{
@@ -357,20 +356,21 @@ int insert_resource(struct resource *parent, struct resource *new)
357 struct resource *first, *next; 356 struct resource *first, *next;
358 357
359 write_lock(&resource_lock); 358 write_lock(&resource_lock);
360 begin:
361 result = 0;
362 first = __request_resource(parent, new);
363 if (!first)
364 goto out;
365 359
366 result = -EBUSY; 360 for (;; parent = first) {
367 if (first == parent) 361 result = 0;
368 goto out; 362 first = __request_resource(parent, new);
363 if (!first)
364 goto out;
369 365
370 /* Resource fully contained by the clashing resource? Recurse into it */ 366 result = -EBUSY;
371 if (first->start <= new->start && first->end >= new->end) { 367 if (first == parent)
372 parent = first; 368 goto out;
373 goto begin; 369
370 if ((first->start > new->start) || (first->end < new->end))
371 break;
372 if ((first->start == new->start) && (first->end == new->end))
373 break;
374 } 374 }
375 375
376 for (next = first; ; next = next->sibling) { 376 for (next = first; ; next = next->sibling) {
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 3e13a1e5856f..4ab17da46fd8 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -251,6 +251,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
251 251
252 /* Grab the next task */ 252 /* Grab the next task */
253 task = rt_mutex_owner(lock); 253 task = rt_mutex_owner(lock);
254 get_task_struct(task);
254 spin_lock_irqsave(&task->pi_lock, flags); 255 spin_lock_irqsave(&task->pi_lock, flags);
255 256
256 if (waiter == rt_mutex_top_waiter(lock)) { 257 if (waiter == rt_mutex_top_waiter(lock)) {
@@ -269,7 +270,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
269 __rt_mutex_adjust_prio(task); 270 __rt_mutex_adjust_prio(task);
270 } 271 }
271 272
272 get_task_struct(task);
273 spin_unlock_irqrestore(&task->pi_lock, flags); 273 spin_unlock_irqrestore(&task->pi_lock, flags);
274 274
275 top_waiter = rt_mutex_top_waiter(lock); 275 top_waiter = rt_mutex_top_waiter(lock);
@@ -409,7 +409,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
409 struct task_struct *owner = rt_mutex_owner(lock); 409 struct task_struct *owner = rt_mutex_owner(lock);
410 struct rt_mutex_waiter *top_waiter = waiter; 410 struct rt_mutex_waiter *top_waiter = waiter;
411 unsigned long flags; 411 unsigned long flags;
412 int boost = 0, res; 412 int chain_walk = 0, res;
413 413
414 spin_lock_irqsave(&current->pi_lock, flags); 414 spin_lock_irqsave(&current->pi_lock, flags);
415 __rt_mutex_adjust_prio(current); 415 __rt_mutex_adjust_prio(current);
@@ -433,25 +433,23 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
433 plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 433 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
434 434
435 __rt_mutex_adjust_prio(owner); 435 __rt_mutex_adjust_prio(owner);
436 if (owner->pi_blocked_on) { 436 if (owner->pi_blocked_on)
437 boost = 1; 437 chain_walk = 1;
438 /* gets dropped in rt_mutex_adjust_prio_chain()! */
439 get_task_struct(owner);
440 }
441 spin_unlock_irqrestore(&owner->pi_lock, flags);
442 }
443 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
444 spin_lock_irqsave(&owner->pi_lock, flags);
445 if (owner->pi_blocked_on) {
446 boost = 1;
447 /* gets dropped in rt_mutex_adjust_prio_chain()! */
448 get_task_struct(owner);
449 }
450 spin_unlock_irqrestore(&owner->pi_lock, flags); 438 spin_unlock_irqrestore(&owner->pi_lock, flags);
451 } 439 }
452 if (!boost) 440 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
441 chain_walk = 1;
442
443 if (!chain_walk)
453 return 0; 444 return 0;
454 445
446 /*
447 * The owner can't disappear while holding a lock,
448 * so the owner struct is protected by wait_lock.
449 * Gets dropped in rt_mutex_adjust_prio_chain()!
450 */
451 get_task_struct(owner);
452
455 spin_unlock(&lock->wait_lock); 453 spin_unlock(&lock->wait_lock);
456 454
457 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 455 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
@@ -532,7 +530,7 @@ static void remove_waiter(struct rt_mutex *lock,
532 int first = (waiter == rt_mutex_top_waiter(lock)); 530 int first = (waiter == rt_mutex_top_waiter(lock));
533 struct task_struct *owner = rt_mutex_owner(lock); 531 struct task_struct *owner = rt_mutex_owner(lock);
534 unsigned long flags; 532 unsigned long flags;
535 int boost = 0; 533 int chain_walk = 0;
536 534
537 spin_lock_irqsave(&current->pi_lock, flags); 535 spin_lock_irqsave(&current->pi_lock, flags);
538 plist_del(&waiter->list_entry, &lock->wait_list); 536 plist_del(&waiter->list_entry, &lock->wait_list);
@@ -554,19 +552,20 @@ static void remove_waiter(struct rt_mutex *lock,
554 } 552 }
555 __rt_mutex_adjust_prio(owner); 553 __rt_mutex_adjust_prio(owner);
556 554
557 if (owner->pi_blocked_on) { 555 if (owner->pi_blocked_on)
558 boost = 1; 556 chain_walk = 1;
559 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 557
560 get_task_struct(owner);
561 }
562 spin_unlock_irqrestore(&owner->pi_lock, flags); 558 spin_unlock_irqrestore(&owner->pi_lock, flags);
563 } 559 }
564 560
565 WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 561 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
566 562
567 if (!boost) 563 if (!chain_walk)
568 return; 564 return;
569 565
566 /* gets dropped in rt_mutex_adjust_prio_chain()! */
567 get_task_struct(owner);
568
570 spin_unlock(&lock->wait_lock); 569 spin_unlock(&lock->wait_lock);
571 570
572 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); 571 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
@@ -592,10 +591,10 @@ void rt_mutex_adjust_pi(struct task_struct *task)
592 return; 591 return;
593 } 592 }
594 593
595 /* gets dropped in rt_mutex_adjust_prio_chain()! */
596 get_task_struct(task);
597 spin_unlock_irqrestore(&task->pi_lock, flags); 594 spin_unlock_irqrestore(&task->pi_lock, flags);
598 595
596 /* gets dropped in rt_mutex_adjust_prio_chain()! */
597 get_task_struct(task);
599 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); 598 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
600} 599}
601 600
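All of the rtmutex.c hunks make one ordering change: the `get_task_struct()` that keeps the owner alive across `rt_mutex_adjust_prio_chain()` is now taken after `pi_lock` is dropped but while `wait_lock` is still held, which is safe because a lock owner cannot exit while it still holds the lock. As a hedged userspace analogy of the underlying idiom (pin with a reference while a lock still protects the pointer, then unlock), using pthreads and C11 atomics rather than any kernel API:

/* Userspace analogy, illustrative only: after get_pinned() returns, the
 * refcount, not table_lock, keeps the object alive across the unlocked
 * region, just as the owner reference does across the chain walk. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	atomic_int refs;
	int data;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *table_slot;		/* protected by table_lock */

static struct obj *get_pinned(void)
{
	struct obj *o;

	pthread_mutex_lock(&table_lock);
	o = table_slot;
	if (o)
		atomic_fetch_add(&o->refs, 1);	/* pin before unlock */
	pthread_mutex_unlock(&table_lock);
	return o;	/* usable without table_lock from here on */
}

static void put_obj(struct obj *o)
{
	if (atomic_fetch_sub(&o->refs, 1) == 1)
		free(o);			/* last reference */
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	atomic_init(&o->refs, 1);
	o->data = 42;
	table_slot = o;

	struct obj *pinned = get_pinned();
	printf("%d\n", pinned->data);
	put_obj(pinned);
	put_obj(o);		/* drop the table's original reference */
	return 0;
}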
diff --git a/kernel/sched.c b/kernel/sched.c
index a234fbee1238..74f169ac0773 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -238,6 +238,7 @@ struct rq {
238 /* For active balancing */ 238 /* For active balancing */
239 int active_balance; 239 int active_balance;
240 int push_cpu; 240 int push_cpu;
241 int cpu; /* cpu of this runqueue */
241 242
242 struct task_struct *migration_thread; 243 struct task_struct *migration_thread;
243 struct list_head migration_queue; 244 struct list_head migration_queue;
@@ -267,6 +268,15 @@ struct rq {
267 268
268static DEFINE_PER_CPU(struct rq, runqueues); 269static DEFINE_PER_CPU(struct rq, runqueues);
269 270
271static inline int cpu_of(struct rq *rq)
272{
273#ifdef CONFIG_SMP
274 return rq->cpu;
275#else
276 return 0;
277#endif
278}
279
270/* 280/*
271 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 281 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
272 * See detach_destroy_domains: synchronize_sched for details. 282 * See detach_destroy_domains: synchronize_sched for details.
@@ -1745,27 +1755,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1745 __releases(rq->lock) 1755 __releases(rq->lock)
1746{ 1756{
1747 struct mm_struct *mm = rq->prev_mm; 1757 struct mm_struct *mm = rq->prev_mm;
1748 unsigned long prev_task_flags; 1758 long prev_state;
1749 1759
1750 rq->prev_mm = NULL; 1760 rq->prev_mm = NULL;
1751 1761
1752 /* 1762 /*
1753 * A task struct has one reference for the use as "current". 1763 * A task struct has one reference for the use as "current".
1754 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and 1764 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1755 * calls schedule one last time. The schedule call will never return, 1765 * schedule one last time. The schedule call will never return, and
1756 * and the scheduled task must drop that reference. 1766 * the scheduled task must drop that reference.
1757 * The test for EXIT_ZOMBIE must occur while the runqueue locks are 1767 * The test for TASK_DEAD must occur while the runqueue locks are
1758 * still held, otherwise prev could be scheduled on another cpu, die 1768 * still held, otherwise prev could be scheduled on another cpu, die
1759 * there before we look at prev->state, and then the reference would 1769 * there before we look at prev->state, and then the reference would
1760 * be dropped twice. 1770 * be dropped twice.
1761 * Manfred Spraul <manfred@colorfullife.com> 1771 * Manfred Spraul <manfred@colorfullife.com>
1762 */ 1772 */
1763 prev_task_flags = prev->flags; 1773 prev_state = prev->state;
1764 finish_arch_switch(prev); 1774 finish_arch_switch(prev);
1765 finish_lock_switch(rq, prev); 1775 finish_lock_switch(rq, prev);
1766 if (mm) 1776 if (mm)
1767 mmdrop(mm); 1777 mmdrop(mm);
1768 if (unlikely(prev_task_flags & PF_DEAD)) { 1778 if (unlikely(prev_state == TASK_DEAD)) {
1769 /* 1779 /*
1770 * Remove function-return probe instances associated with this 1780 * Remove function-return probe instances associated with this
1771 * task and put them back on the free list. 1781 * task and put them back on the free list.
@@ -2211,7 +2221,8 @@ out:
2211 */ 2221 */
2212static struct sched_group * 2222static struct sched_group *
2213find_busiest_group(struct sched_domain *sd, int this_cpu, 2223find_busiest_group(struct sched_domain *sd, int this_cpu,
2214 unsigned long *imbalance, enum idle_type idle, int *sd_idle) 2224 unsigned long *imbalance, enum idle_type idle, int *sd_idle,
2225 cpumask_t *cpus)
2215{ 2226{
2216 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2227 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2217 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2228 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2248,7 +2259,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2248 sum_weighted_load = sum_nr_running = avg_load = 0; 2259 sum_weighted_load = sum_nr_running = avg_load = 0;
2249 2260
2250 for_each_cpu_mask(i, group->cpumask) { 2261 for_each_cpu_mask(i, group->cpumask) {
2251 struct rq *rq = cpu_rq(i); 2262 struct rq *rq;
2263
2264 if (!cpu_isset(i, *cpus))
2265 continue;
2266
2267 rq = cpu_rq(i);
2252 2268
2253 if (*sd_idle && !idle_cpu(i)) 2269 if (*sd_idle && !idle_cpu(i))
2254 *sd_idle = 0; 2270 *sd_idle = 0;
@@ -2466,13 +2482,17 @@ ret:
2466 */ 2482 */
2467static struct rq * 2483static struct rq *
2468find_busiest_queue(struct sched_group *group, enum idle_type idle, 2484find_busiest_queue(struct sched_group *group, enum idle_type idle,
2469 unsigned long imbalance) 2485 unsigned long imbalance, cpumask_t *cpus)
2470{ 2486{
2471 struct rq *busiest = NULL, *rq; 2487 struct rq *busiest = NULL, *rq;
2472 unsigned long max_load = 0; 2488 unsigned long max_load = 0;
2473 int i; 2489 int i;
2474 2490
2475 for_each_cpu_mask(i, group->cpumask) { 2491 for_each_cpu_mask(i, group->cpumask) {
2492
2493 if (!cpu_isset(i, *cpus))
2494 continue;
2495
2476 rq = cpu_rq(i); 2496 rq = cpu_rq(i);
2477 2497
2478 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2498 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
@@ -2511,6 +2531,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2511 struct sched_group *group; 2531 struct sched_group *group;
2512 unsigned long imbalance; 2532 unsigned long imbalance;
2513 struct rq *busiest; 2533 struct rq *busiest;
2534 cpumask_t cpus = CPU_MASK_ALL;
2514 2535
2515 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2536 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2516 !sched_smt_power_savings) 2537 !sched_smt_power_savings)
@@ -2518,13 +2539,15 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2518 2539
2519 schedstat_inc(sd, lb_cnt[idle]); 2540 schedstat_inc(sd, lb_cnt[idle]);
2520 2541
2521 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); 2542redo:
2543 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2544 &cpus);
2522 if (!group) { 2545 if (!group) {
2523 schedstat_inc(sd, lb_nobusyg[idle]); 2546 schedstat_inc(sd, lb_nobusyg[idle]);
2524 goto out_balanced; 2547 goto out_balanced;
2525 } 2548 }
2526 2549
2527 busiest = find_busiest_queue(group, idle, imbalance); 2550 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2528 if (!busiest) { 2551 if (!busiest) {
2529 schedstat_inc(sd, lb_nobusyq[idle]); 2552 schedstat_inc(sd, lb_nobusyq[idle]);
2530 goto out_balanced; 2553 goto out_balanced;
@@ -2549,8 +2572,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
2549 double_rq_unlock(this_rq, busiest); 2572 double_rq_unlock(this_rq, busiest);
2550 2573
2551 /* All tasks on this runqueue were pinned by CPU affinity */ 2574 /* All tasks on this runqueue were pinned by CPU affinity */
2552 if (unlikely(all_pinned)) 2575 if (unlikely(all_pinned)) {
2576 cpu_clear(cpu_of(busiest), cpus);
2577 if (!cpus_empty(cpus))
2578 goto redo;
2553 goto out_balanced; 2579 goto out_balanced;
2580 }
2554 } 2581 }
2555 2582
2556 if (!nr_moved) { 2583 if (!nr_moved) {
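The `redo:` path added above (and mirrored in `load_balance_newidle()` below) shrinks a candidate mask and retries instead of giving up when every task on the busiest queue is pinned. A standalone sketch of that retry shape with a plain bitmask standing in for `cpumask_t`; the busiest/pinned rules are toy assumptions:

/* Standalone sketch of the retry loop: keep a candidate mask, drop a
 * CPU that turned out to be unusable, and retry until a donor is found
 * or the mask is empty. Toy data, not the kernel API. */
#include <stdio.h>

#define NCPUS 4

static int busiest_in(unsigned mask)
{
	/* toy rule: CPU 2 is busiest whenever it is still a candidate */
	if (mask & (1u << 2))
		return 2;
	return mask ? __builtin_ctz(mask) : -1;	/* GCC/Clang builtin */
}

static int all_tasks_pinned(int cpu)
{
	return cpu == 2;	/* toy rule: CPU 2's tasks cannot migrate */
}

int main(void)
{
	unsigned cpus = (1u << NCPUS) - 1;	/* CPU_MASK_ALL analogue */
	int cpu;

redo:
	cpu = busiest_in(cpus);
	if (cpu < 0) {
		puts("balanced: no candidates left");
		return 0;
	}
	if (all_tasks_pinned(cpu)) {
		cpus &= ~(1u << cpu);		/* cpu_clear() analogue */
		if (cpus)
			goto redo;		/* same shape as the patch */
		puts("balanced: every queue was pinned");
		return 0;
	}
	printf("pull from CPU %d\n", cpu);
	return 0;
}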
@@ -2639,18 +2666,22 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2639 unsigned long imbalance; 2666 unsigned long imbalance;
2640 int nr_moved = 0; 2667 int nr_moved = 0;
2641 int sd_idle = 0; 2668 int sd_idle = 0;
2669 cpumask_t cpus = CPU_MASK_ALL;
2642 2670
2643 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) 2671 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2644 sd_idle = 1; 2672 sd_idle = 1;
2645 2673
2646 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2674 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2647 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); 2675redo:
2676 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
2677 &sd_idle, &cpus);
2648 if (!group) { 2678 if (!group) {
2649 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2679 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2650 goto out_balanced; 2680 goto out_balanced;
2651 } 2681 }
2652 2682
2653 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); 2683 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
2684 &cpus);
2654 if (!busiest) { 2685 if (!busiest) {
2655 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2686 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2656 goto out_balanced; 2687 goto out_balanced;
@@ -2668,6 +2699,12 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2668 minus_1_or_zero(busiest->nr_running), 2699 minus_1_or_zero(busiest->nr_running),
2669 imbalance, sd, NEWLY_IDLE, NULL); 2700 imbalance, sd, NEWLY_IDLE, NULL);
2670 spin_unlock(&busiest->lock); 2701 spin_unlock(&busiest->lock);
2702
2703 if (!nr_moved) {
2704 cpu_clear(cpu_of(busiest), cpus);
2705 if (!cpus_empty(cpus))
2706 goto redo;
2707 }
2671 } 2708 }
2672 2709
2673 if (!nr_moved) { 2710 if (!nr_moved) {
@@ -3311,9 +3348,6 @@ need_resched_nonpreemptible:
3311 3348
3312 spin_lock_irq(&rq->lock); 3349 spin_lock_irq(&rq->lock);
3313 3350
3314 if (unlikely(prev->flags & PF_DEAD))
3315 prev->state = EXIT_DEAD;
3316
3317 switch_count = &prev->nivcsw; 3351 switch_count = &prev->nivcsw;
3318 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3352 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3319 switch_count = &prev->nvcsw; 3353 switch_count = &prev->nvcsw;
@@ -4043,6 +4077,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
4043 * @p: the task in question. 4077 * @p: the task in question.
4044 * @policy: new policy. 4078 * @policy: new policy.
4045 * @param: structure containing the new RT priority. 4079 * @param: structure containing the new RT priority.
4080 *
 4081 * NOTE: the task may already be dead
4046 */ 4082 */
4047int sched_setscheduler(struct task_struct *p, int policy, 4083int sched_setscheduler(struct task_struct *p, int policy,
4048 struct sched_param *param) 4084 struct sched_param *param)
@@ -4070,28 +4106,32 @@ recheck:
4070 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4106 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4071 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4107 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4072 return -EINVAL; 4108 return -EINVAL;
4073 if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) 4109 if (is_rt_policy(policy) != (param->sched_priority != 0))
4074 != (param->sched_priority == 0))
4075 return -EINVAL; 4110 return -EINVAL;
4076 4111
4077 /* 4112 /*
4078 * Allow unprivileged RT tasks to decrease priority: 4113 * Allow unprivileged RT tasks to decrease priority:
4079 */ 4114 */
4080 if (!capable(CAP_SYS_NICE)) { 4115 if (!capable(CAP_SYS_NICE)) {
4081 /* 4116 if (is_rt_policy(policy)) {
4082 * can't change policy, except between SCHED_NORMAL 4117 unsigned long rlim_rtprio;
4083 * and SCHED_BATCH: 4118 unsigned long flags;
4084 */ 4119
4085 if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && 4120 if (!lock_task_sighand(p, &flags))
4086 (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && 4121 return -ESRCH;
4087 !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 4122 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4088 return -EPERM; 4123 unlock_task_sighand(p, &flags);
4089 /* can't increase priority */ 4124
4090 if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && 4125 /* can't set/change the rt policy */
4091 param->sched_priority > p->rt_priority && 4126 if (policy != p->policy && !rlim_rtprio)
4092 param->sched_priority > 4127 return -EPERM;
4093 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 4128
4094 return -EPERM; 4129 /* can't increase priority */
4130 if (param->sched_priority > p->rt_priority &&
4131 param->sched_priority > rlim_rtprio)
4132 return -EPERM;
4133 }
4134
4095 /* can't change other user's priorities */ 4135 /* can't change other user's priorities */
4096 if ((current->euid != p->euid) && 4136 if ((current->euid != p->euid) &&
4097 (current->euid != p->uid)) 4137 (current->euid != p->uid))
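The rewritten permission check snapshots `RLIMIT_RTPRIO` under the target's sighand lock and compares the requested priority against it. The same ceiling is visible from userspace via getrlimit(2); a small probe, assuming a Linux system where `RLIMIT_RTPRIO` is defined:

/* Query the RLIMIT_RTPRIO ceiling that the rewritten check enforces
 * for unprivileged sched_setscheduler() callers. */
#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_RTPRIO, &rl))
		return perror("getrlimit"), 1;

	/* rlim_cur == 0 means: may not acquire or raise an RT priority */
	printf("RLIMIT_RTPRIO soft=%llu hard=%llu\n",
	       (unsigned long long)rl.rlim_cur,
	       (unsigned long long)rl.rlim_max);
	return 0;
}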
@@ -4156,14 +4196,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4156 return -EINVAL; 4196 return -EINVAL;
4157 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4197 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4158 return -EFAULT; 4198 return -EFAULT;
4159 read_lock_irq(&tasklist_lock); 4199
4200 rcu_read_lock();
4201 retval = -ESRCH;
4160 p = find_process_by_pid(pid); 4202 p = find_process_by_pid(pid);
4161 if (!p) { 4203 if (p != NULL)
4162 read_unlock_irq(&tasklist_lock); 4204 retval = sched_setscheduler(p, policy, &lparam);
4163 return -ESRCH; 4205 rcu_read_unlock();
4164 }
4165 retval = sched_setscheduler(p, policy, &lparam);
4166 read_unlock_irq(&tasklist_lock);
4167 4206
4168 return retval; 4207 return retval;
4169} 4208}
@@ -5114,7 +5153,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5114 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); 5153 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5115 5154
5116 /* Cannot have done final schedule yet: would have vanished. */ 5155 /* Cannot have done final schedule yet: would have vanished. */
5117 BUG_ON(p->flags & PF_DEAD); 5156 BUG_ON(p->state == TASK_DEAD);
5118 5157
5119 get_task_struct(p); 5158 get_task_struct(p);
5120 5159
@@ -5235,9 +5274,11 @@ static struct notifier_block __cpuinitdata migration_notifier = {
5235int __init migration_init(void) 5274int __init migration_init(void)
5236{ 5275{
5237 void *cpu = (void *)(long)smp_processor_id(); 5276 void *cpu = (void *)(long)smp_processor_id();
5277 int err;
5238 5278
5239 /* Start one for the boot CPU: */ 5279 /* Start one for the boot CPU: */
5240 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5280 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5281 BUG_ON(err == NOTIFY_BAD);
5241 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5282 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5242 register_cpu_notifier(&migration_notifier); 5283 register_cpu_notifier(&migration_notifier);
5243 5284
@@ -6747,6 +6788,7 @@ void __init sched_init(void)
6747 rq->cpu_load[j] = 0; 6788 rq->cpu_load[j] = 0;
6748 rq->active_balance = 0; 6789 rq->active_balance = 0;
6749 rq->push_cpu = 0; 6790 rq->push_cpu = 0;
6791 rq->cpu = i;
6750 rq->migration_thread = NULL; 6792 rq->migration_thread = NULL;
6751 INIT_LIST_HEAD(&rq->migration_queue); 6793 INIT_LIST_HEAD(&rq->migration_queue);
6752#endif 6794#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index bfdb5686fa3e..fb5da6d19f14 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -417,9 +417,8 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
417static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, 417static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
418 siginfo_t *info) 418 siginfo_t *info)
419{ 419{
420 int sig = 0; 420 int sig = next_signal(pending, mask);
421 421
422 sig = next_signal(pending, mask);
423 if (sig) { 422 if (sig) {
424 if (current->notifier) { 423 if (current->notifier) {
425 if (sigismember(current->notifier_mask, sig)) { 424 if (sigismember(current->notifier_mask, sig)) {
@@ -432,9 +431,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
432 431
433 if (!collect_signal(sig, pending, info)) 432 if (!collect_signal(sig, pending, info))
434 sig = 0; 433 sig = 0;
435
436 } 434 }
437 recalc_sigpending();
438 435
439 return sig; 436 return sig;
440} 437}
@@ -451,6 +448,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
451 if (!signr) 448 if (!signr)
452 signr = __dequeue_signal(&tsk->signal->shared_pending, 449 signr = __dequeue_signal(&tsk->signal->shared_pending,
453 mask, info); 450 mask, info);
451 recalc_sigpending_tsk(tsk);
454 if (signr && unlikely(sig_kernel_stop(signr))) { 452 if (signr && unlikely(sig_kernel_stop(signr))) {
455 /* 453 /*
456 * Set a marker that we have dequeued a stop signal. Our 454 * Set a marker that we have dequeued a stop signal. Our
@@ -2577,6 +2575,11 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
2577} 2575}
2578#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 2576#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
2579 2577
2578__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
2579{
2580 return NULL;
2581}
2582
2580void __init signals_init(void) 2583void __init signals_init(void)
2581{ 2584{
2582 sigqueue_cachep = 2585 sigqueue_cachep =
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3789ca98197c..bf25015dce16 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -612,7 +612,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
612__init int spawn_ksoftirqd(void) 612__init int spawn_ksoftirqd(void)
613{ 613{
614 void *cpu = (void *)(long)smp_processor_id(); 614 void *cpu = (void *)(long)smp_processor_id();
615 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 615 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
616
617 BUG_ON(err == NOTIFY_BAD);
616 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 618 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
617 register_cpu_notifier(&cpu_nfb); 619 register_cpu_notifier(&cpu_nfb);
618 return 0; 620 return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 03e6a2b0b787..50afeb813305 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -149,8 +149,9 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
149__init void spawn_softlockup_task(void) 149__init void spawn_softlockup_task(void)
150{ 150{
151 void *cpu = (void *)(long)smp_processor_id(); 151 void *cpu = (void *)(long)smp_processor_id();
152 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
152 153
153 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 154 BUG_ON(err == NOTIFY_BAD);
154 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 155 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
155 register_cpu_notifier(&cpu_nfb); 156 register_cpu_notifier(&cpu_nfb);
156 157
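This hunk and the softirq.c one above, like `migration_init()` and `init_timers()` elsewhere in the patch, stop discarding the `CPU_UP_PREPARE` result: a refusal at boot becomes a hard `BUG_ON(err == NOTIFY_BAD)`. A userspace rendition of the shape, with a stubbed callback instead of the real notifier chain (the NOTIFY_* values are copied from include/linux/notifier.h):

/* Userspace rendition of the repeated pattern: run the bring-up
 * callback for the boot CPU and treat a refusal as fatal instead of
 * ignoring it. Stubbed callback; the real code uses notifier chains. */
#include <assert.h>
#include <stdio.h>

#define NOTIFY_OK	0x0001
#define NOTIFY_BAD	0x8002	/* NOTIFY_STOP_MASK | 0x0002 */

enum { CPU_UP_PREPARE, CPU_ONLINE };

static int cpu_callback(unsigned long action, void *cpu)
{
	/* stub: pretend per-CPU setup (e.g. kthread creation) succeeded */
	return NOTIFY_OK;
}

int main(void)
{
	void *cpu = (void *)0L;		/* boot CPU */
	int err = cpu_callback(CPU_UP_PREPARE, cpu);

	assert(err != NOTIFY_BAD);	/* the added BUG_ON() analogue */
	cpu_callback(CPU_ONLINE, cpu);
	puts("boot CPU callbacks ran");
	return 0;
}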
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index fb524b009eef..d48143eafbfd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -7,6 +7,11 @@
7 * 7 *
8 * This file contains the spinlock/rwlock implementations for the 8 * This file contains the spinlock/rwlock implementations for the
9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) 9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
10 *
11 * Note that some architectures have special knowledge about the
12 * stack frames of these functions in their profile_pc. If you
13 * change anything significant here that could change the stack
 14 * frame, contact the architecture maintainers.
10 */ 15 */
11 16
12#include <linux/linkage.h> 17#include <linux/linkage.h>
@@ -16,17 +21,6 @@
16#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
17#include <linux/module.h> 22#include <linux/module.h>
18 23
19/*
20 * Generic declaration of the raw read_trylock() function,
21 * architectures are supposed to optimize this:
22 */
23int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
24{
25 __raw_read_lock(lock);
26 return 1;
27}
28EXPORT_SYMBOL(generic__raw_read_trylock);
29
30int __lockfunc _spin_trylock(spinlock_t *lock) 24int __lockfunc _spin_trylock(spinlock_t *lock)
31{ 25{
32 preempt_disable(); 26 preempt_disable();
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 51cacd111dbd..12458040e665 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,3 +1,6 @@
1/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
2 * GPL v2 and any later version.
3 */
1#include <linux/stop_machine.h> 4#include <linux/stop_machine.h>
2#include <linux/kthread.h> 5#include <linux/kthread.h>
3#include <linux/sched.h> 6#include <linux/sched.h>
diff --git a/kernel/sys.c b/kernel/sys.c
index e236f98f7ec5..b88806c66244 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
28#include <linux/tty.h> 28#include <linux/tty.h>
29#include <linux/signal.h> 29#include <linux/signal.h>
30#include <linux/cn_proc.h> 30#include <linux/cn_proc.h>
31#include <linux/getcpu.h>
31 32
32#include <linux/compat.h> 33#include <linux/compat.h>
33#include <linux/syscalls.h> 34#include <linux/syscalls.h>
@@ -611,7 +612,6 @@ void kernel_restart(char *cmd)
611 } else { 612 } else {
612 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 613 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
613 } 614 }
614 printk(".\n");
615 machine_restart(cmd); 615 machine_restart(cmd);
616} 616}
617EXPORT_SYMBOL_GPL(kernel_restart); 617EXPORT_SYMBOL_GPL(kernel_restart);
@@ -2062,3 +2062,33 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2062 } 2062 }
2063 return error; 2063 return error;
2064} 2064}
2065
2066asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
2067 struct getcpu_cache __user *cache)
2068{
2069 int err = 0;
2070 int cpu = raw_smp_processor_id();
2071 if (cpup)
2072 err |= put_user(cpu, cpup);
2073 if (nodep)
2074 err |= put_user(cpu_to_node(cpu), nodep);
2075 if (cache) {
2076 /*
2077 * The cache is not needed for this implementation,
2078 * but make sure user programs pass something
2079 * valid. vsyscall implementations can instead make
2080 * good use of the cache. Only use t0 and t1 because
2081 * these are available in both 32bit and 64bit ABI (no
2082 * need for a compat_getcpu). 32bit has enough
 2083 * padding.
2084 */
2085 unsigned long t0, t1;
2086 get_user(t0, &cache->blob[0]);
2087 get_user(t1, &cache->blob[1]);
2088 t0++;
2089 t1++;
2090 put_user(t0, &cache->blob[0]);
2091 put_user(t1, &cache->blob[1]);
2092 }
2093 return err ? -EFAULT : 0;
2094}
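The new sys_getcpu() returns the current CPU and NUMA node and only touches the cache argument to make sure callers pass a valid pointer. A hedged usage sketch, assuming a kernel exposing `__NR_getcpu` and no glibc wrapper yet, so the call goes through syscall(2); the result is advisory, since the task may migrate right after the call returns:

/* Exercise sys_getcpu() raw via syscall(2); SYS_getcpu must exist in
 * the installed kernel headers. */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	unsigned cpu, node;

	/* NULL cache: the kernel only validates it, so skip it here */
	if (syscall(SYS_getcpu, &cpu, &node, NULL))
		return perror("getcpu"), 1;

	printf("running on cpu %u, node %u\n", cpu, node);
	return 0;
}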
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 362a0cc37138..c57c4532e296 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -52,6 +52,10 @@
52extern int proc_nr_files(ctl_table *table, int write, struct file *filp, 52extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
53 void __user *buffer, size_t *lenp, loff_t *ppos); 53 void __user *buffer, size_t *lenp, loff_t *ppos);
54 54
55#ifdef CONFIG_X86
56#include <asm/nmi.h>
57#endif
58
55#if defined(CONFIG_SYSCTL) 59#if defined(CONFIG_SYSCTL)
56 60
57/* External variables not in a header file. */ 61/* External variables not in a header file. */
@@ -74,12 +78,6 @@ extern int sysctl_drop_caches;
74extern int percpu_pagelist_fraction; 78extern int percpu_pagelist_fraction;
75extern int compat_log; 79extern int compat_log;
76 80
77#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
78int unknown_nmi_panic;
79extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
80 void __user *, size_t *, loff_t *);
81#endif
82
83/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 81/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
84static int maxolduid = 65535; 82static int maxolduid = 65535;
85static int minolduid; 83static int minolduid;
@@ -136,8 +134,11 @@ extern int no_unaligned_warning;
136extern int max_lock_depth; 134extern int max_lock_depth;
137#endif 135#endif
138 136
139static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, 137#ifdef CONFIG_SYSCTL_SYSCALL
140 ctl_table *, void **); 138static int parse_table(int __user *, int, void __user *, size_t __user *,
139 void __user *, size_t, ctl_table *, void **);
140#endif
141
141static int proc_doutsstring(ctl_table *table, int write, struct file *filp, 142static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
142 void __user *buffer, size_t *lenp, loff_t *ppos); 143 void __user *buffer, size_t *lenp, loff_t *ppos);
143 144
@@ -164,7 +165,7 @@ int sysctl_legacy_va_layout;
164 165
165/* /proc declarations: */ 166/* /proc declarations: */
166 167
167#ifdef CONFIG_PROC_FS 168#ifdef CONFIG_PROC_SYSCTL
168 169
169static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); 170static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
170static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); 171static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
@@ -628,11 +629,27 @@ static ctl_table kern_table[] = {
628 .data = &unknown_nmi_panic, 629 .data = &unknown_nmi_panic,
629 .maxlen = sizeof (int), 630 .maxlen = sizeof (int),
630 .mode = 0644, 631 .mode = 0644,
631 .proc_handler = &proc_unknown_nmi_panic, 632 .proc_handler = &proc_dointvec,
633 },
634 {
635 .ctl_name = KERN_NMI_WATCHDOG,
636 .procname = "nmi_watchdog",
637 .data = &nmi_watchdog_enabled,
638 .maxlen = sizeof (int),
639 .mode = 0644,
640 .proc_handler = &proc_nmi_enabled,
632 }, 641 },
633#endif 642#endif
634#if defined(CONFIG_X86) 643#if defined(CONFIG_X86)
635 { 644 {
645 .ctl_name = KERN_PANIC_ON_NMI,
646 .procname = "panic_on_unrecovered_nmi",
647 .data = &panic_on_unrecovered_nmi,
648 .maxlen = sizeof(int),
649 .mode = 0644,
650 .proc_handler = &proc_dointvec,
651 },
652 {
636 .ctl_name = KERN_BOOTLOADER_TYPE, 653 .ctl_name = KERN_BOOTLOADER_TYPE,
637 .procname = "bootloader_type", 654 .procname = "bootloader_type",
638 .data = &bootloader_type, 655 .data = &bootloader_type,
@@ -943,6 +960,17 @@ static ctl_table vm_table[] = {
943 .extra1 = &zero, 960 .extra1 = &zero,
944 .extra2 = &one_hundred, 961 .extra2 = &one_hundred,
945 }, 962 },
963 {
964 .ctl_name = VM_MIN_SLAB,
965 .procname = "min_slab_ratio",
966 .data = &sysctl_min_slab_ratio,
967 .maxlen = sizeof(sysctl_min_slab_ratio),
968 .mode = 0644,
969 .proc_handler = &sysctl_min_slab_ratio_sysctl_handler,
970 .strategy = &sysctl_intvec,
971 .extra1 = &zero,
972 .extra2 = &one_hundred,
973 },
946#endif 974#endif
947#ifdef CONFIG_X86_32 975#ifdef CONFIG_X86_32
948 { 976 {
@@ -1138,12 +1166,13 @@ static void start_unregistering(struct ctl_table_header *p)
1138 1166
1139void __init sysctl_init(void) 1167void __init sysctl_init(void)
1140{ 1168{
1141#ifdef CONFIG_PROC_FS 1169#ifdef CONFIG_PROC_SYSCTL
1142 register_proc_table(root_table, proc_sys_root, &root_table_header); 1170 register_proc_table(root_table, proc_sys_root, &root_table_header);
1143 init_irq_proc(); 1171 init_irq_proc();
1144#endif 1172#endif
1145} 1173}
1146 1174
1175#ifdef CONFIG_SYSCTL_SYSCALL
1147int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, 1176int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1148 void __user *newval, size_t newlen) 1177 void __user *newval, size_t newlen)
1149{ 1178{
@@ -1197,6 +1226,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
1197 unlock_kernel(); 1226 unlock_kernel();
1198 return error; 1227 return error;
1199} 1228}
1229#endif /* CONFIG_SYSCTL_SYSCALL */
1200 1230
1201/* 1231/*
1202 * ctl_perm does NOT grant the superuser all rights automatically, because 1232 * ctl_perm does NOT grant the superuser all rights automatically, because
@@ -1223,6 +1253,7 @@ static inline int ctl_perm(ctl_table *table, int op)
1223 return test_perm(table->mode, op); 1253 return test_perm(table->mode, op);
1224} 1254}
1225 1255
1256#ifdef CONFIG_SYSCTL_SYSCALL
1226static int parse_table(int __user *name, int nlen, 1257static int parse_table(int __user *name, int nlen,
1227 void __user *oldval, size_t __user *oldlenp, 1258 void __user *oldval, size_t __user *oldlenp,
1228 void __user *newval, size_t newlen, 1259 void __user *newval, size_t newlen,
@@ -1312,6 +1343,7 @@ int do_sysctl_strategy (ctl_table *table,
1312 } 1343 }
1313 return 0; 1344 return 0;
1314} 1345}
1346#endif /* CONFIG_SYSCTL_SYSCALL */
1315 1347
1316/** 1348/**
1317 * register_sysctl_table - register a sysctl hierarchy 1349 * register_sysctl_table - register a sysctl hierarchy
@@ -1399,7 +1431,7 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
1399 else 1431 else
1400 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); 1432 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
1401 spin_unlock(&sysctl_lock); 1433 spin_unlock(&sysctl_lock);
1402#ifdef CONFIG_PROC_FS 1434#ifdef CONFIG_PROC_SYSCTL
1403 register_proc_table(table, proc_sys_root, tmp); 1435 register_proc_table(table, proc_sys_root, tmp);
1404#endif 1436#endif
1405 return tmp; 1437 return tmp;
@@ -1417,18 +1449,31 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1417 might_sleep(); 1449 might_sleep();
1418 spin_lock(&sysctl_lock); 1450 spin_lock(&sysctl_lock);
1419 start_unregistering(header); 1451 start_unregistering(header);
1420#ifdef CONFIG_PROC_FS 1452#ifdef CONFIG_PROC_SYSCTL
1421 unregister_proc_table(header->ctl_table, proc_sys_root); 1453 unregister_proc_table(header->ctl_table, proc_sys_root);
1422#endif 1454#endif
1423 spin_unlock(&sysctl_lock); 1455 spin_unlock(&sysctl_lock);
1424 kfree(header); 1456 kfree(header);
1425} 1457}
1426 1458
1459#else /* !CONFIG_SYSCTL */
1460struct ctl_table_header * register_sysctl_table(ctl_table * table,
1461 int insert_at_head)
1462{
1463 return NULL;
1464}
1465
1466void unregister_sysctl_table(struct ctl_table_header * table)
1467{
1468}
1469
1470#endif /* CONFIG_SYSCTL */
1471
1427/* 1472/*
1428 * /proc/sys support 1473 * /proc/sys support
1429 */ 1474 */
1430 1475
1431#ifdef CONFIG_PROC_FS 1476#ifdef CONFIG_PROC_SYSCTL
1432 1477
1433/* Scan the sysctl entries in table and add them all into /proc */ 1478/* Scan the sysctl entries in table and add them all into /proc */
1434static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) 1479static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
@@ -1867,7 +1912,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
1867 return -EPERM; 1912 return -EPERM;
1868 } 1913 }
1869 1914
1870 op = (current->pid == 1) ? OP_SET : OP_AND; 1915 op = is_init(current) ? OP_SET : OP_AND;
1871 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 1916 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
1872 do_proc_dointvec_bset_conv,&op); 1917 do_proc_dointvec_bset_conv,&op);
1873} 1918}
@@ -2290,6 +2335,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2290#endif /* CONFIG_PROC_FS */ 2335#endif /* CONFIG_PROC_FS */
2291 2336
2292 2337
2338#ifdef CONFIG_SYSCTL_SYSCALL
2293/* 2339/*
2294 * General sysctl support routines 2340 * General sysctl support routines
2295 */ 2341 */
@@ -2432,11 +2478,19 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2432 return 1; 2478 return 1;
2433} 2479}
2434 2480
2435#else /* CONFIG_SYSCTL */ 2481#else /* CONFIG_SYSCTL_SYSCALL */
2436 2482
2437 2483
2438asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 2484asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2439{ 2485{
2486 static int msg_count;
2487
2488 if (msg_count < 5) {
2489 msg_count++;
2490 printk(KERN_INFO
2491 "warning: process `%s' used the removed sysctl "
2492 "system call\n", current->comm);
2493 }
2440 return -ENOSYS; 2494 return -ENOSYS;
2441} 2495}
2442 2496
@@ -2468,73 +2522,7 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2468 return -ENOSYS; 2522 return -ENOSYS;
2469} 2523}
2470 2524
2471int proc_dostring(ctl_table *table, int write, struct file *filp, 2525#endif /* CONFIG_SYSCTL_SYSCALL */
2472 void __user *buffer, size_t *lenp, loff_t *ppos)
2473{
2474 return -ENOSYS;
2475}
2476
2477int proc_dointvec(ctl_table *table, int write, struct file *filp,
2478 void __user *buffer, size_t *lenp, loff_t *ppos)
2479{
2480 return -ENOSYS;
2481}
2482
2483int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
2484 void __user *buffer, size_t *lenp, loff_t *ppos)
2485{
2486 return -ENOSYS;
2487}
2488
2489int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
2490 void __user *buffer, size_t *lenp, loff_t *ppos)
2491{
2492 return -ENOSYS;
2493}
2494
2495int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
2496 void __user *buffer, size_t *lenp, loff_t *ppos)
2497{
2498 return -ENOSYS;
2499}
2500
2501int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
2502 void __user *buffer, size_t *lenp, loff_t *ppos)
2503{
2504 return -ENOSYS;
2505}
2506
2507int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2508 void __user *buffer, size_t *lenp, loff_t *ppos)
2509{
2510 return -ENOSYS;
2511}
2512
2513int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
2514 void __user *buffer, size_t *lenp, loff_t *ppos)
2515{
2516 return -ENOSYS;
2517}
2518
2519int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2520 struct file *filp,
2521 void __user *buffer,
2522 size_t *lenp, loff_t *ppos)
2523{
2524 return -ENOSYS;
2525}
2526
2527struct ctl_table_header * register_sysctl_table(ctl_table * table,
2528 int insert_at_head)
2529{
2530 return NULL;
2531}
2532
2533void unregister_sysctl_table(struct ctl_table_header * table)
2534{
2535}
2536
2537#endif /* CONFIG_SYSCTL */
2538 2526
2539/* 2527/*
2540 * No sense putting this after each symbol definition, twice, 2528 * No sense putting this after each symbol definition, twice,
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e78187657330..2ed4040d0dc5 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -75,7 +75,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
75 /* 75 /*
76 * If new attributes are added, please revisit this allocation 76 * If new attributes are added, please revisit this allocation
77 */ 77 */
78 skb = nlmsg_new(size); 78 skb = nlmsg_new(size, GFP_KERNEL);
79 if (!skb) 79 if (!skb)
80 return -ENOMEM; 80 return -ENOMEM;
81 81
diff --git a/kernel/timer.c b/kernel/timer.c
index 1d7dd6267c2d..4f55622b0d38 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -136,7 +136,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
136 list_add_tail(&timer->entry, vec); 136 list_add_tail(&timer->entry, vec);
137} 137}
138 138
139/*** 139/**
140 * init_timer - initialize a timer. 140 * init_timer - initialize a timer.
141 * @timer: the timer to be initialized 141 * @timer: the timer to be initialized
142 * 142 *
@@ -175,6 +175,7 @@ static inline void detach_timer(struct timer_list *timer,
175 */ 175 */
176static tvec_base_t *lock_timer_base(struct timer_list *timer, 176static tvec_base_t *lock_timer_base(struct timer_list *timer,
177 unsigned long *flags) 177 unsigned long *flags)
178 __acquires(timer->base->lock)
178{ 179{
179 tvec_base_t *base; 180 tvec_base_t *base;
180 181
@@ -235,7 +236,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
235 236
236EXPORT_SYMBOL(__mod_timer); 237EXPORT_SYMBOL(__mod_timer);
237 238
238/*** 239/**
239 * add_timer_on - start a timer on a particular CPU 240 * add_timer_on - start a timer on a particular CPU
240 * @timer: the timer to be added 241 * @timer: the timer to be added
241 * @cpu: the CPU to start it on 242 * @cpu: the CPU to start it on
@@ -255,9 +256,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
255} 256}
256 257
257 258
258/*** 259/**
259 * mod_timer - modify a timer's timeout 260 * mod_timer - modify a timer's timeout
260 * @timer: the timer to be modified 261 * @timer: the timer to be modified
262 * @expires: new timeout in jiffies
261 * 263 *
262 * mod_timer is a more efficient way to update the expire field of an 264 * mod_timer is a more efficient way to update the expire field of an
263 * active timer (if the timer is inactive it will be activated) 265 * active timer (if the timer is inactive it will be activated)
@@ -291,7 +293,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
291 293
292EXPORT_SYMBOL(mod_timer); 294EXPORT_SYMBOL(mod_timer);
293 295
294/*** 296/**
 295 * del_timer - deactivate a timer. 297 * del_timer - deactivate a timer.
296 * @timer: the timer to be deactivated 298 * @timer: the timer to be deactivated
297 * 299 *
@@ -323,7 +325,10 @@ int del_timer(struct timer_list *timer)
323EXPORT_SYMBOL(del_timer); 325EXPORT_SYMBOL(del_timer);
324 326
325#ifdef CONFIG_SMP 327#ifdef CONFIG_SMP
326/* 328/**
329 * try_to_del_timer_sync - Try to deactivate a timer
 330 * @timer: the timer to deactivate
331 *
327 * This function tries to deactivate a timer. Upon successful (ret >= 0) 332 * This function tries to deactivate a timer. Upon successful (ret >= 0)
328 * exit the timer is not queued and the handler is not running on any CPU. 333 * exit the timer is not queued and the handler is not running on any CPU.
329 * 334 *
@@ -351,7 +356,7 @@ out:
351 return ret; 356 return ret;
352} 357}
353 358
354/*** 359/**
355 * del_timer_sync - deactivate a timer and wait for the handler to finish. 360 * del_timer_sync - deactivate a timer and wait for the handler to finish.
356 * @timer: the timer to be deactivated 361 * @timer: the timer to be deactivated
357 * 362 *
@@ -401,15 +406,15 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
401 return index; 406 return index;
402} 407}
403 408
404/*** 409#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
410
411/**
405 * __run_timers - run all expired timers (if any) on this CPU. 412 * __run_timers - run all expired timers (if any) on this CPU.
406 * @base: the timer vector to be processed. 413 * @base: the timer vector to be processed.
407 * 414 *
408 * This function cascades all vectors and executes all expired timer 415 * This function cascades all vectors and executes all expired timer
409 * vectors. 416 * vectors.
410 */ 417 */
411#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
412
413static inline void __run_timers(tvec_base_t *base) 418static inline void __run_timers(tvec_base_t *base)
414{ 419{
415 struct timer_list *timer; 420 struct timer_list *timer;
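`INDEX(N)` selects the slot in cascade level N from successive bit fields of `timer_jiffies`: the bottom `TVR_BITS` index tv1, and each further level consumes `TVN_BITS` more. A worked example with the stock constants from kernel/timer.c, using a file-scope variable where the kernel reads `base->timer_jiffies`:

/* Worked example of the INDEX() slot math moved above __run_timers().
 * Constants copied from kernel/timer.c (non-BASE_SMALL configuration). */
#include <stdio.h>

#define TVN_BITS	6
#define TVR_BITS	8
#define TVN_MASK	((1 << TVN_BITS) - 1)
#define TVR_MASK	((1 << TVR_BITS) - 1)

static unsigned long timer_jiffies = 0x12345678UL;

#define INDEX(N) ((timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)

int main(void)
{
	printf("tv1 slot: %lu\n", timer_jiffies & TVR_MASK);
	for (int n = 0; n < 4; n++)
		printf("tv%d slot: %lu\n", n + 2, INDEX(n));
	return 0;
}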
@@ -970,7 +975,7 @@ void __init timekeeping_init(void)
970 975
971 976
972static int timekeeping_suspended; 977static int timekeeping_suspended;
973/* 978/**
974 * timekeeping_resume - Resumes the generic timekeeping subsystem. 979 * timekeeping_resume - Resumes the generic timekeeping subsystem.
975 * @dev: unused 980 * @dev: unused
976 * 981 *
@@ -1106,7 +1111,7 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
1106 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); 1111 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
1107} 1112}
1108 1113
1109/* 1114/**
1110 * update_wall_time - Uses the current clocksource to increment the wall time 1115 * update_wall_time - Uses the current clocksource to increment the wall time
1111 * 1116 *
1112 * Called from the timer interrupt, must hold a write on xtime_lock. 1117 * Called from the timer interrupt, must hold a write on xtime_lock.
@@ -1217,10 +1222,8 @@ static inline void calc_load(unsigned long ticks)
1217 unsigned long active_tasks; /* fixed-point */ 1222 unsigned long active_tasks; /* fixed-point */
1218 static int count = LOAD_FREQ; 1223 static int count = LOAD_FREQ;
1219 1224
1220 count -= ticks; 1225 active_tasks = count_active_tasks();
1221 if (count < 0) { 1226 for (count -= ticks; count < 0; count += LOAD_FREQ) {
1222 count += LOAD_FREQ;
1223 active_tasks = count_active_tasks();
1224 CALC_LOAD(avenrun[0], EXP_1, active_tasks); 1227 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
1225 CALC_LOAD(avenrun[1], EXP_5, active_tasks); 1228 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
1226 CALC_LOAD(avenrun[2], EXP_15, active_tasks); 1229 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
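The rewritten `calc_load()` samples the active-task count once and then folds one decay step per missed `LOAD_FREQ` window, where the old code folded at most one step per call. A runnable fixed-point demo of the catch-up loop; the FSHIFT/EXP_* constants are copied from the kernel, and the tick gap is invented:

/* Demo of the catch-up loop: one fixed-point decay fold per missed
 * LOAD_FREQ window (1/5/15-minute averages). */
#include <stdio.h>

#define HZ		100
#define FSHIFT		11
#define FIXED_1		(1 << FSHIFT)
#define LOAD_FREQ	(5 * HZ)
#define EXP_1		1884
#define EXP_5		2014
#define EXP_15		2037

#define CALC_LOAD(load, exp, n) \
	load = ((load) * (exp) + (n) * (FIXED_1 - (exp))) >> FSHIFT

static unsigned long avenrun[3];

static void calc_load(long ticks, unsigned long active_tasks)
{
	static int count = LOAD_FREQ;

	for (count -= ticks; count < 0; count += LOAD_FREQ) {
		CALC_LOAD(avenrun[0], EXP_1, active_tasks);
		CALC_LOAD(avenrun[1], EXP_5, active_tasks);
		CALC_LOAD(avenrun[2], EXP_15, active_tasks);
	}
}

int main(void)
{
	/* pretend 3 runnable tasks and a 20-window gap of lost ticks */
	calc_load(20L * LOAD_FREQ, 3 * FIXED_1);
	printf("load: %lu.%02lu %lu.%02lu %lu.%02lu\n",
	       avenrun[0] >> FSHIFT, ((avenrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT,
	       avenrun[1] >> FSHIFT, ((avenrun[1] & (FIXED_1 - 1)) * 100) >> FSHIFT,
	       avenrun[2] >> FSHIFT, ((avenrun[2] & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}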
@@ -1265,11 +1268,8 @@ void run_local_timers(void)
1265 * Called by the timer interrupt. xtime_lock must already be taken 1268 * Called by the timer interrupt. xtime_lock must already be taken
1266 * by the timer IRQ! 1269 * by the timer IRQ!
1267 */ 1270 */
1268static inline void update_times(void) 1271static inline void update_times(unsigned long ticks)
1269{ 1272{
1270 unsigned long ticks;
1271
1272 ticks = jiffies - wall_jiffies;
1273 wall_jiffies += ticks; 1273 wall_jiffies += ticks;
1274 update_wall_time(); 1274 update_wall_time();
1275 calc_load(ticks); 1275 calc_load(ticks);
@@ -1281,12 +1281,10 @@ static inline void update_times(void)
1281 * jiffies is defined in the linker script... 1281 * jiffies is defined in the linker script...
1282 */ 1282 */
1283 1283
1284void do_timer(struct pt_regs *regs) 1284void do_timer(unsigned long ticks)
1285{ 1285{
1286 jiffies_64++; 1286 jiffies_64 += ticks;
1287 /* prevent loading jiffies before storing new jiffies_64 value. */ 1287 update_times(ticks);
1288 barrier();
1289 update_times();
1290} 1288}
1291 1289
1292#ifdef __ARCH_WANT_SYS_ALARM 1290#ifdef __ARCH_WANT_SYS_ALARM
@@ -1470,8 +1468,9 @@ asmlinkage long sys_gettid(void)
1470 return current->pid; 1468 return current->pid;
1471} 1469}
1472 1470
1473/* 1471/**
1474 * sys_sysinfo - fill in sysinfo struct 1472 * sys_sysinfo - fill in sysinfo struct
1473 * @info: pointer to buffer to fill
1475 */ 1474 */
1476asmlinkage long sys_sysinfo(struct sysinfo __user *info) 1475asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1477{ 1476{
@@ -1688,8 +1687,10 @@ static struct notifier_block __cpuinitdata timers_nb = {
1688 1687
1689void __init init_timers(void) 1688void __init init_timers(void)
1690{ 1689{
1691 timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1690 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1692 (void *)(long)smp_processor_id()); 1691 (void *)(long)smp_processor_id());
1692
1693 BUG_ON(err == NOTIFY_BAD);
1693 register_cpu_notifier(&timers_nb); 1694 register_cpu_notifier(&timers_nb);
1694 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1695 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
1695} 1696}
diff --git a/kernel/unwind.c b/kernel/unwind.c
index f69c804c8e62..2e2368607aab 100644
--- a/kernel/unwind.c
+++ b/kernel/unwind.c
@@ -102,7 +102,7 @@ static struct unwind_table {
102 unsigned long size; 102 unsigned long size;
103 struct unwind_table *link; 103 struct unwind_table *link;
104 const char *name; 104 const char *name;
105} root_table, *last_table; 105} root_table;
106 106
107struct unwind_item { 107struct unwind_item {
108 enum item_location { 108 enum item_location {
@@ -174,6 +174,8 @@ void __init unwind_init(void)
174 174
175#ifdef CONFIG_MODULES 175#ifdef CONFIG_MODULES
176 176
177static struct unwind_table *last_table;
178
177/* Must be called with module_mutex held. */ 179/* Must be called with module_mutex held. */
178void *unwind_add_table(struct module *module, 180void *unwind_add_table(struct module *module,
179 const void *table_start, 181 const void *table_start,
@@ -603,6 +605,7 @@ int unwind(struct unwind_frame_info *frame)
603#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) 605#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
604 const u32 *fde = NULL, *cie = NULL; 606 const u32 *fde = NULL, *cie = NULL;
605 const u8 *ptr = NULL, *end = NULL; 607 const u8 *ptr = NULL, *end = NULL;
608 unsigned long pc = UNW_PC(frame) - frame->call_frame;
606 unsigned long startLoc = 0, endLoc = 0, cfa; 609 unsigned long startLoc = 0, endLoc = 0, cfa;
607 unsigned i; 610 unsigned i;
608 signed ptrType = -1; 611 signed ptrType = -1;
@@ -612,7 +615,7 @@ int unwind(struct unwind_frame_info *frame)
612 615
613 if (UNW_PC(frame) == 0) 616 if (UNW_PC(frame) == 0)
614 return -EINVAL; 617 return -EINVAL;
615 if ((table = find_table(UNW_PC(frame))) != NULL 618 if ((table = find_table(pc)) != NULL
616 && !(table->size & (sizeof(*fde) - 1))) { 619 && !(table->size & (sizeof(*fde) - 1))) {
617 unsigned long tableSize = table->size; 620 unsigned long tableSize = table->size;
618 621
@@ -647,7 +650,7 @@ int unwind(struct unwind_frame_info *frame)
647 ptrType & DW_EH_PE_indirect 650 ptrType & DW_EH_PE_indirect
648 ? ptrType 651 ? ptrType
649 : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); 652 : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
650 if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc) 653 if (pc >= startLoc && pc < endLoc)
651 break; 654 break;
652 cie = NULL; 655 cie = NULL;
653 } 656 }
@@ -657,16 +660,28 @@ int unwind(struct unwind_frame_info *frame)
657 state.cieEnd = ptr; /* keep here temporarily */ 660 state.cieEnd = ptr; /* keep here temporarily */
658 ptr = (const u8 *)(cie + 2); 661 ptr = (const u8 *)(cie + 2);
659 end = (const u8 *)(cie + 1) + *cie; 662 end = (const u8 *)(cie + 1) + *cie;
663 frame->call_frame = 1;
660 if ((state.version = *ptr) != 1) 664 if ((state.version = *ptr) != 1)
661 cie = NULL; /* unsupported version */ 665 cie = NULL; /* unsupported version */
662 else if (*++ptr) { 666 else if (*++ptr) {
663 /* check if augmentation size is first (and thus present) */ 667 /* check if augmentation size is first (and thus present) */
664 if (*ptr == 'z') { 668 if (*ptr == 'z') {
665 /* check for ignorable (or already handled) 669 while (++ptr < end && *ptr) {
666 * nul-terminated augmentation string */ 670 switch(*ptr) {
667 while (++ptr < end && *ptr) 671 /* check for ignorable (or already handled)
668 if (strchr("LPR", *ptr) == NULL) 672 * nul-terminated augmentation string */
673 case 'L':
674 case 'P':
675 case 'R':
676 continue;
677 case 'S':
678 frame->call_frame = 0;
679 continue;
680 default:
669 break; 681 break;
682 }
683 break;
684 }
670 } 685 }
671 if (ptr >= end || *ptr) 686 if (ptr >= end || *ptr)
672 cie = NULL; 687 cie = NULL;
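The new augmentation scan still skips the 'L', 'P' and 'R' entries of a 'z' augmentation string, but now recognizes 'S', which marks a signal frame, so `frame->call_frame` is cleared and the lookup PC is not biased back by one. A standalone parser following the same convention; the function name and return convention here are illustrative:

/* Standalone rendition of the CIE augmentation scan: a string starting
 * with 'z' may contain L/P/R (ignorable here) and 'S', which marks a
 * signal frame. Anything else aborts the scan. */
#include <stdio.h>

static int parse_augmentation(const char *aug, int *signal_frame)
{
	*signal_frame = 0;
	if (*aug != 'z')
		return *aug == '\0';	/* empty augmentation is fine too */

	while (*++aug) {
		switch (*aug) {
		case 'L':	/* LSDA encoding: handled elsewhere */
		case 'P':	/* personality routine: handled elsewhere */
		case 'R':	/* FDE pointer encoding: handled elsewhere */
			continue;
		case 'S':	/* signal frame: do not bias the PC back */
			*signal_frame = 1;
			continue;
		default:
			return 0;	/* unknown augmentation: give up */
		}
	}
	return 1;
}

int main(void)
{
	int sig;

	printf("zRS ok=%d signal=%d\n", parse_augmentation("zRS", &sig), sig);
	printf("zX  ok=%d\n", parse_augmentation("zX", &sig));
	return 0;
}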
@@ -755,7 +770,7 @@ int unwind(struct unwind_frame_info *frame)
755 state.org = startLoc; 770 state.org = startLoc;
756 memcpy(&state.cfa, &badCFA, sizeof(state.cfa)); 771 memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
757 /* process instructions */ 772 /* process instructions */
758 if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state) 773 if (!processCFI(ptr, end, pc, ptrType, &state)
759 || state.loc > endLoc 774 || state.loc > endLoc
760 || state.regs[retAddrReg].where == Nowhere 775 || state.regs[retAddrReg].where == Nowhere
761 || state.cfa.reg >= ARRAY_SIZE(reg_info) 776 || state.cfa.reg >= ARRAY_SIZE(reg_info)
@@ -763,6 +778,11 @@ int unwind(struct unwind_frame_info *frame)
763 || state.cfa.offs % sizeof(unsigned long)) 778 || state.cfa.offs % sizeof(unsigned long))
764 return -EIO; 779 return -EIO;
765 /* update frame */ 780 /* update frame */
781#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
782 if(frame->call_frame
783 && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign))
784 frame->call_frame = 0;
785#endif
766 cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs; 786 cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
767 startLoc = min((unsigned long)UNW_SP(frame), cfa); 787 startLoc = min((unsigned long)UNW_SP(frame), cfa);
768 endLoc = max((unsigned long)UNW_SP(frame), cfa); 788 endLoc = max((unsigned long)UNW_SP(frame), cfa);
@@ -866,6 +886,7 @@ int unwind_init_frame_info(struct unwind_frame_info *info,
866 /*const*/ struct pt_regs *regs) 886 /*const*/ struct pt_regs *regs)
867{ 887{
868 info->task = tsk; 888 info->task = tsk;
889 info->call_frame = 0;
869 arch_unw_init_frame_info(info, regs); 890 arch_unw_init_frame_info(info, regs);
870 891
871 return 0; 892 return 0;
@@ -879,6 +900,7 @@ int unwind_init_blocked(struct unwind_frame_info *info,
879 struct task_struct *tsk) 900 struct task_struct *tsk)
880{ 901{
881 info->task = tsk; 902 info->task = tsk;
903 info->call_frame = 0;
882 arch_unw_init_blocked(info); 904 arch_unw_init_blocked(info);
883 905
884 return 0; 906 return 0;
@@ -894,6 +916,7 @@ int unwind_init_running(struct unwind_frame_info *info,
894 void *arg) 916 void *arg)
895{ 917{
896 info->task = current; 918 info->task = current;
919 info->call_frame = 0;
897 920
898 return arch_unwind_init_running(info, callback, arg); 921 return arch_unwind_init_running(info, callback, arg);
899} 922}