21 files changed, 676 insertions, 396 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 2e3f4a47e7..6312d6bd43 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -54,6 +54,7 @@
 #include <linux/jiffies.h>
 #include <linux/times.h>
 #include <linux/syscalls.h>
+#include <linux/mount.h>
 #include <asm/uaccess.h>
 #include <asm/div64.h>
 #include <linux/blkdev.h> /* sector_div */
@@ -192,6 +193,7 @@ static void acct_file_reopen(struct file *file)
                add_timer(&acct_globals.timer);
        }
        if (old_acct) {
+                mnt_unpin(old_acct->f_vfsmnt);
                spin_unlock(&acct_globals.lock);
                do_acct_process(0, old_acct);
                filp_close(old_acct, NULL);
@@ -199,6 +201,42 @@ static void acct_file_reopen(struct file *file)
        }
 }
+static int acct_on(char *name)
+{
+        struct file *file;
+        int error;
+        /* Difference from BSD - they don't do O_APPEND */
+        file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+        if (IS_ERR(file))
+                return PTR_ERR(file);
+        if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
+                filp_close(file, NULL);
+                return -EACCES;
+        }
+        if (!file->f_op->write) {
+                filp_close(file, NULL);
+                return -EIO;
+        }
+        error = security_acct(file);
+        if (error) {
+                filp_close(file, NULL);
+                return error;
+        }
+        spin_lock(&acct_globals.lock);
+        mnt_pin(file->f_vfsmnt);
+        acct_file_reopen(file);
+        spin_unlock(&acct_globals.lock);
+        mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */
+        return 0;
+}
 /**
 * sys_acct - enable/disable process accounting
 * @name: file name for accounting records or NULL to shutdown accounting
@@ -212,47 +250,41 @@ static void acct_file_reopen(struct file *file)
 */
 asmlinkage long sys_acct(const char __user *name)
 {
-        struct file *file = NULL;
-        char *tmp;
        int error;
        if (!capable(CAP_SYS_PACCT))
                return -EPERM;
        if (name) {
-                tmp = getname(name);
+                char *tmp = getname(name);
-                if (IS_ERR(tmp)) {
+                if (IS_ERR(tmp))
                        return (PTR_ERR(tmp));
-                }
+                error = acct_on(tmp);
-                /* Difference from BSD - they don't do O_APPEND */
-                file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
                putname(tmp);
-                if (IS_ERR(file)) {
+        } else {
-                        return (PTR_ERR(file));
+                error = security_acct(NULL);
-                }
+                if (!error) {
-                if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
+                        spin_lock(&acct_globals.lock);
-                        filp_close(file, NULL);
+                        acct_file_reopen(NULL);
-                        return (-EACCES);
+                        spin_unlock(&acct_globals.lock);
-                }
-                if (!file->f_op->write) {
-                        filp_close(file, NULL);
-                        return (-EIO);
                }
        }
+        return error;
+}
-        error = security_acct(file);
+/**
-        if (error) {
+ * acct_auto_close_mnt - turn off a filesystem's accounting if it is on
-                if (file)
+ * @m: vfsmount being shut down
-                        filp_close(file, NULL);
+ *
-                return error;
+ * If the accounting is turned on for a file in the subtree pointed to
-        }
+ * by m, turn accounting off.  Done when m is about to die.
+ */
+void acct_auto_close_mnt(struct vfsmount *m)
+{
        spin_lock(&acct_globals.lock);
-        acct_file_reopen(file);
+        if (acct_globals.file && acct_globals.file->f_vfsmnt == m)
+                acct_file_reopen(NULL);
        spin_unlock(&acct_globals.lock);
-        return (0);
 }
 /**
@@ -266,8 +298,8 @@ void acct_auto_close(struct super_block *sb)
 {
        spin_lock(&acct_globals.lock);
        if (acct_globals.file &&
-            acct_globals.file->f_dentry->d_inode->i_sb == sb) {
+            acct_globals.file->f_vfsmnt->mnt_sb == sb) {
-                acct_file_reopen((struct file *)NULL);
+                acct_file_reopen(NULL);
        }
        spin_unlock(&acct_globals.lock);
 }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3619e93918..d61ba88f34 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -21,6 +21,24 @@ EXPORT_SYMBOL_GPL(cpucontrol);
 static struct notifier_block *cpu_chain;
+/*
+ * Used by callers to check whether they need to acquire cpucontrol
+ * to protect a cpu from being removed. It is sometimes required to
+ * call these functions both for normal operations, and in response to
+ * a cpu being added/removed. If the context of the call is in the same
+ * thread context as a CPU hotplug thread, we don't need to take the lock
+ * since it is already protected.
+ * Check drivers/cpufreq/cpufreq.c for its usage - Ashok Raj
+ */
+int current_in_cpu_hotplug(void)
+{
+        return (current->flags & PF_HOTPLUG_CPU);
+}
+EXPORT_SYMBOL_GPL(current_in_cpu_hotplug);
 /* Need to know about CPUs going up/down? */
 int register_cpu_notifier(struct notifier_block *nb)
 {
@@ -94,6 +112,13 @@ int cpu_down(unsigned int cpu)
                goto out;
        }
+        /*
+         * Leave a trace in current->flags indicating we are already in
+         * the process of performing CPU hotplug. Callers can check whether
+         * cpucontrol is already held by the current thread and, if so,
+         * avoid a deadlock by not acquiring the lock again.
+         */
+        current->flags |= PF_HOTPLUG_CPU;
        err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
                                                (void *)(long)cpu);
        if (err == NOTIFY_BAD) {
@@ -146,6 +171,7 @@ out_thread:
 out_allowed:
        set_cpus_allowed(current, old_allowed);
 out:
+        current->flags &= ~PF_HOTPLUG_CPU;
        unlock_cpu_hotplug();
        return err;
 }
@@ -163,6 +189,12 @@ int __devinit cpu_up(unsigned int cpu)
                ret = -EINVAL;
                goto out;
        }
+        /*
+         * Leave a trace in current->flags indicating we are already in
+         * the process of performing CPU hotplug.
+         */
+        current->flags |= PF_HOTPLUG_CPU;
        ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
        if (ret == NOTIFY_BAD) {
                printk("%s: attempt to bring up CPU %u failed\n",
@@ -185,6 +217,7 @@ out_notify:
        if (ret != 0)
                notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
 out:
+        current->flags &= ~PF_HOTPLUG_CPU;
        up(&cpucontrol);
        return ret;
 }
diff --git a/kernel/exit.c b/kernel/exit.c
index 537394b25e..452a1d1161 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -28,6 +28,7 @@
 #include <linux/cpuset.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
+#include <linux/cn_proc.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -863,6 +864,7 @@ fastcall NORET_TYPE void do_exit(long code)
                module_put(tsk->binfmt->module);
        tsk->exit_code = code;
+        proc_exit_connector(tsk);
        exit_notify(tsk);
 #ifdef CONFIG_NUMA
        mpol_free(tsk->mempolicy);
diff --git a/kernel/fork.c b/kernel/fork.c
index 8a069612ea..158710d225 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -42,6 +42,7 @@
 #include <linux/profile.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
+#include <linux/cn_proc.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -469,13 +470,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
        if (clone_flags & CLONE_VM) {
                atomic_inc(&oldmm->mm_users);
                mm = oldmm;
-                /*
-                 * There are cases where the PTL is held to ensure no
-                 * new threads start up in user mode using an mm, which
-                 * allows optimizing out ipis; the tlb_gather_mmu code
-                 * is an example.
-                 */
-                spin_unlock_wait(&oldmm->page_table_lock);
                goto good_mm;
        }
@@ -1143,6 +1137,7 @@ static task_t *copy_process(unsigned long clone_flags,
                        __get_cpu_var(process_counts)++;
        }
+        proc_fork_connector(p);
        if (!current->signal->tty && p->signal->tty)
                p->signal->tty = NULL;
diff --git a/kernel/futex.c b/kernel/futex.c
index 3b4d5ad44c..aca8d10704 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -365,6 +365,11 @@ retry:
                if (bh1 != bh2)
                        spin_unlock(&bh2->lock);
+                if (unlikely(op_ret != -EFAULT)) {
+                        ret = op_ret;
+                        goto out;
+                }
                /* futex_atomic_op_inuser needs to both read and write
                 * *(int __user *)uaddr2, but we can't modify it
                 * non-atomically.  Therefore, if get_user below is not
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1cfdb08ddf..3bd7226d15 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -24,6 +24,7 @@ cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
 /**
 *      synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ *      @irq: interrupt number to wait for
 *
 *      This function waits for any pending IRQ handlers for this interrupt
 *      to complete before returning. If you use this function while
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ce4915dd68..5beda378cc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -32,7 +32,6 @@
 *              <prasanna@in.ibm.com> added function-return probes.
 */
 #include <linux/kprobes.h>
-#include <linux/spinlock.h>
 #include <linux/hash.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -49,9 +48,9 @@
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
-unsigned int kprobe_cpu = NR_CPUS;
+static DEFINE_SPINLOCK(kprobe_lock);    /* Protects kprobe_table */
-static DEFINE_SPINLOCK(kprobe_lock);
+DEFINE_SPINLOCK(kretprobe_lock);        /* Protects kretprobe_inst_table */
-static struct kprobe *curr_kprobe;
+static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 /*
 * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -153,50 +152,31 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
        }
 }
-/* Locks kprobe: irqs must be disabled */
+/* We have preemption disabled, so it is safe to use the __ versions */
-void __kprobes lock_kprobes(void)
+static inline void set_kprobe_instance(struct kprobe *kp)
 {
-        unsigned long flags = 0;
+        __get_cpu_var(kprobe_instance) = kp;
-        /* Avoiding local interrupts to happen right after we take the kprobe_lock
-         * and before we get a chance to update kprobe_cpu, this to prevent
-         * deadlock when we have a kprobe on ISR routine and a kprobe on task
-         * routine
-         */
-        local_irq_save(flags);
-        spin_lock(&kprobe_lock);
-        kprobe_cpu = smp_processor_id();
-        local_irq_restore(flags);
 }
-void __kprobes unlock_kprobes(void)
+static inline void reset_kprobe_instance(void)
 {
-        unsigned long flags = 0;
+        __get_cpu_var(kprobe_instance) = NULL;
-        /* Avoiding local interrupts to happen right after we update
-         * kprobe_cpu and before we get a a chance to release kprobe_lock,
-         * this to prevent deadlock when we have a kprobe on ISR routine and
-         * a kprobe on task routine
-         */
-        local_irq_save(flags);
-        kprobe_cpu = NR_CPUS;
-        spin_unlock(&kprobe_lock);
-        local_irq_restore(flags);
 }
-/* You have to be holding the kprobe_lock */
+/*
+ * This routine is called either:
+ *      - under the kprobe_lock spinlock - during kprobe_[un]register()
+ *                              OR
+ *      - with preemption disabled - from arch/xxx/kernel/kprobes.c
+ */
 struct kprobe __kprobes *get_kprobe(void *addr)
 {
        struct hlist_head *head;
        struct hlist_node *node;
+        struct kprobe *p;
        head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
-        hlist_for_each(node, head) {
+        hlist_for_each_entry_rcu(p, node, head, hlist) {
-                struct kprobe *p = hlist_entry(node, struct kprobe, hlist);
                if (p->addr == addr)
                        return p;
        }
@@ -211,13 +191,13 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
        struct kprobe *kp;
-        list_for_each_entry(kp, &p->list, list) {
+        list_for_each_entry_rcu(kp, &p->list, list) {
                if (kp->pre_handler) {
-                        curr_kprobe = kp;
+                        set_kprobe_instance(kp);
                        if (kp->pre_handler(kp, regs))
                                return 1;
                }
-                curr_kprobe = NULL;
+                reset_kprobe_instance();
        }
        return 0;
 }
@@ -227,11 +207,11 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 {
        struct kprobe *kp;
-        list_for_each_entry(kp, &p->list, list) {
+        list_for_each_entry_rcu(kp, &p->list, list) {
                if (kp->post_handler) {
-                        curr_kprobe = kp;
+                        set_kprobe_instance(kp);
                        kp->post_handler(kp, regs, flags);
-                        curr_kprobe = NULL;
+                        reset_kprobe_instance();
                }
        }
        return;
@@ -240,12 +220,14 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
                                        int trapnr)
 {
+        struct kprobe *cur = __get_cpu_var(kprobe_instance);
        /*
         * if we faulted "during" the execution of a user specified
         * probe handler, invoke just that probe's fault handler
         */
-        if (curr_kprobe && curr_kprobe->fault_handler) {
+        if (cur && cur->fault_handler) {
-                if (curr_kprobe->fault_handler(curr_kprobe, regs, trapnr))
+                if (cur->fault_handler(cur, regs, trapnr))
                        return 1;
        }
        return 0;
@@ -253,17 +235,18 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
 static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
-        struct kprobe *kp = curr_kprobe;
+        struct kprobe *cur = __get_cpu_var(kprobe_instance);
-        if (curr_kprobe && kp->break_handler) {
+        int ret = 0;
-                if (kp->break_handler(kp, regs)) {
-                        curr_kprobe = NULL;
+        if (cur && cur->break_handler) {
-                        return 1;
+                if (cur->break_handler(cur, regs))
-                }
+                        ret = 1;
        }
-        curr_kprobe = NULL;
+        reset_kprobe_instance();
-        return 0;
+        return ret;
 }
+/* Called with kretprobe_lock held */
 struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
 {
        struct hlist_node *node;
@@ -273,6 +256,7 @@ struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
        return NULL;
 }
+/* Called with kretprobe_lock held */
 static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
                                                              *rp)
 {
@@ -283,6 +267,7 @@ static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
        return NULL;
 }
+/* Called with kretprobe_lock held */
 void __kprobes add_rp_inst(struct kretprobe_instance *ri)
 {
        /*
@@ -301,6 +286,7 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri)
        hlist_add_head(&ri->uflist, &ri->rp->used_instances);
 }
+/* Called with kretprobe_lock held */
 void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
 {
        /* remove rp inst off the rprobe_inst_table */
@@ -334,13 +320,13 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
        struct hlist_node *node, *tmp;
        unsigned long flags = 0;
-        spin_lock_irqsave(&kprobe_lock, flags);
+        spin_lock_irqsave(&kretprobe_lock, flags);
        head = kretprobe_inst_table_head(current);
        hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
                if (ri->task == tk)
                        recycle_rp_inst(ri);
        }
-        spin_unlock_irqrestore(&kprobe_lock, flags);
+        spin_unlock_irqrestore(&kretprobe_lock, flags);
 }
 /*
@@ -351,9 +337,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
                                           struct pt_regs *regs)
 {
        struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+        unsigned long flags = 0;
        /*TODO: consider to only swap the RA after the last pre_handler fired */
+        spin_lock_irqsave(&kretprobe_lock, flags);
        arch_prepare_kretprobe(rp, regs);
+        spin_unlock_irqrestore(&kretprobe_lock, flags);
        return 0;
 }
@@ -384,13 +373,13 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
        struct kprobe *kp;
        if (p->break_handler) {
-                list_for_each_entry(kp, &old_p->list, list) {
+                list_for_each_entry_rcu(kp, &old_p->list, list) {
                        if (kp->break_handler)
                                return -EEXIST;
                }
-                list_add_tail(&p->list, &old_p->list);
+                list_add_tail_rcu(&p->list, &old_p->list);
        } else
-                list_add(&p->list, &old_p->list);
+                list_add_rcu(&p->list, &old_p->list);
        return 0;
 }
@@ -408,18 +397,18 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
        ap->break_handler = aggr_break_handler;
        INIT_LIST_HEAD(&ap->list);
-        list_add(&p->list, &ap->list);
+        list_add_rcu(&p->list, &ap->list);
        INIT_HLIST_NODE(&ap->hlist);
-        hlist_del(&p->hlist);
+        hlist_del_rcu(&p->hlist);
-        hlist_add_head(&ap->hlist,
+        hlist_add_head_rcu(&ap->hlist,
                &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]);
 }
 /*
 * This is the second or subsequent kprobe at the address - handle
 * the intricacies
- * TODO: Move kcalloc outside the spinlock
+ * TODO: Move kcalloc outside the spin_lock
 */
 static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
                                          struct kprobe *p)
@@ -445,7 +434,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
 {
        arch_disarm_kprobe(p);
-        hlist_del(&p->hlist);
+        hlist_del_rcu(&p->hlist);
        spin_unlock_irqrestore(&kprobe_lock, flags);
        arch_remove_kprobe(p);
 }
@@ -453,11 +442,10 @@ static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
 static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
                struct kprobe *p, unsigned long flags)
 {
-        list_del(&p->list);
+        list_del_rcu(&p->list);
-        if (list_empty(&old_p->list)) {
+        if (list_empty(&old_p->list))
                cleanup_kprobe(old_p, flags);
-                kfree(old_p);
+        else
-        } else
                spin_unlock_irqrestore(&kprobe_lock, flags);
 }
@@ -480,9 +468,9 @@ int __kprobes register_kprobe(struct kprobe *p)
        if ((ret = arch_prepare_kprobe(p)) != 0)
                goto rm_kprobe;
+        p->nmissed = 0;
        spin_lock_irqsave(&kprobe_lock, flags);
        old_p = get_kprobe(p->addr);
-        p->nmissed = 0;
        if (old_p) {
                ret = register_aggr_kprobe(old_p, p);
                goto out;
@@ -490,7 +478,7 @@ int __kprobes register_kprobe(struct kprobe *p)
        arch_copy_kprobe(p);
        INIT_HLIST_NODE(&p->hlist);
-        hlist_add_head(&p->hlist,
+        hlist_add_head_rcu(&p->hlist,
                       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
        arch_arm_kprobe(p);
@@ -511,10 +499,16 @@ void __kprobes unregister_kprobe(struct kprobe *p)
        spin_lock_irqsave(&kprobe_lock, flags);
        old_p = get_kprobe(p->addr);
        if (old_p) {
+                /* cleanup_*_kprobe() does the spin_unlock_irqrestore */
                if (old_p->pre_handler == aggr_pre_handler)
                        cleanup_aggr_kprobe(old_p, p, flags);
                else
                        cleanup_kprobe(p, flags);
+                synchronize_sched();
+                if (old_p->pre_handler == aggr_pre_handler &&
+                                list_empty(&old_p->list))
+                        kfree(old_p);
        } else
                spin_unlock_irqrestore(&kprobe_lock, flags);
 }
@@ -591,13 +585,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp)
        unregister_kprobe(&rp->kp);
        /* No race here */
-        spin_lock_irqsave(&kprobe_lock, flags);
+        spin_lock_irqsave(&kretprobe_lock, flags);
        free_rp_inst(rp);
        while ((ri = get_used_rp_inst(rp)) != NULL) {
                ri->rp = NULL;
                hlist_del(&ri->uflist);
        }
-        spin_unlock_irqrestore(&kprobe_lock, flags);
+        spin_unlock_irqrestore(&kretprobe_lock, flags);
 }
 static int __init init_kprobes(void)
diff --git a/kernel/module.c b/kernel/module.c
index ff5c500ab6..2ea929d51a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -37,6 +37,7 @@
 #include <linux/stop_machine.h>
 #include <linux/device.h>
 #include <linux/string.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 #include <asm/cacheflush.h>
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 91a8942649..84af54c39e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -497,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p,
                left = cputime_div(cputime_sub(expires.cpu, val.cpu),
                                   nthreads);
                do {
-                        if (!unlikely(t->flags & PF_EXITING)) {
+                        if (likely(!(t->flags & PF_EXITING))) {
                                ticks = cputime_add(prof_ticks(t), left);
                                if (cputime_eq(t->it_prof_expires,
                                               cputime_zero) ||
@@ -512,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p,
                left = cputime_div(cputime_sub(expires.cpu, val.cpu),
                                   nthreads);
                do {
-                        if (!unlikely(t->flags & PF_EXITING)) {
+                        if (likely(!(t->flags & PF_EXITING))) {
                                ticks = cputime_add(virt_ticks(t), left);
                                if (cputime_eq(t->it_virt_expires,
                                               cputime_zero) ||
@@ -527,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p,
                nsleft = expires.sched - val.sched;
                do_div(nsleft, nthreads);
                do {
-                        if (!unlikely(t->flags & PF_EXITING)) {
+                        if (likely(!(t->flags & PF_EXITING))) {
                                ns = t->sched_time + nsleft;
                                if (t->it_sched_expires == 0 ||
                                    t->it_sched_expires > ns) {
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 18d7d693fb..6ee2cad530 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -167,7 +167,7 @@ static int enter_state(suspend_state_t state)
 {
        int error;
-        if (pm_ops->valid && !pm_ops->valid(state))
+        if (pm_ops && pm_ops->valid && !pm_ops->valid(state))
                return -ENODEV;
        if (down_trylock(&pm_sem))
                return -EBUSY;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index d4fd96a135..6c042b5ee1 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -65,8 +65,8 @@ extern suspend_pagedir_t *pagedir_save;
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
-extern int restore_highmem(void);
+extern void free_pagedir(struct pbe *pblist);
-extern struct pbe * alloc_pagedir(unsigned nr_pages);
+extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
 extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
 extern void swsusp_free(void);
-extern int enough_swap(unsigned nr_pages);
+extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 42a6287043..4a6dbcefd3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -88,8 +88,7 @@ static int save_highmem_zone(struct zone *zone)
        return 0;
 }
+int save_highmem(void)
-static int save_highmem(void)
 {
        struct zone *zone;
        int res = 0;
@@ -120,11 +119,7 @@ int restore_highmem(void)
        }
        return 0;
 }
-#else
+#endif
-static int save_highmem(void) { return 0; }
-int restore_highmem(void) { return 0; }
-#endif /* CONFIG_HIGHMEM */
 static int pfn_is_nosave(unsigned long pfn)
 {
@@ -168,9 +163,8 @@ static unsigned count_data_pages(void)
 {
        struct zone *zone;
        unsigned long zone_pfn;
-        unsigned n;
+        unsigned int n = 0;
-        n = 0;
        for_each_zone (zone) {
                if (is_highmem(zone))
                        continue;
@@ -217,7 +211,7 @@ static void copy_data_pages(struct pbe *pblist)
 *      free_pagedir - free pages allocated with alloc_pagedir()
 */
-static void free_pagedir(struct pbe *pblist)
+void free_pagedir(struct pbe *pblist)
 {
        struct pbe *pbe;
@@ -250,10 +244,10 @@ static inline void fill_pb_page(struct pbe *pbpage)
 *      of memory pages allocated with alloc_pagedir()
 */
-void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
+void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 {
        struct pbe *pbpage, *p;
-        unsigned num = PBES_PER_PAGE;
+        unsigned int num = PBES_PER_PAGE;
        for_each_pb_page (pbpage, pblist) {
                if (num >= nr_pages)
@@ -270,9 +264,30 @@ void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
        pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
 }
-static void *alloc_image_page(void)
+/**
+ *      @safe_needed - on resume, for storing the PBE list and the image,
+ *      we can only use memory pages that do not conflict with the pages
+ *      which had been used before suspend.
+ *
+ *      The unsafe pages are marked with the PG_nosave_free flag
+ *
+ *      Allocated but unusable (i.e. eaten) memory pages should be marked
+ *      so that swsusp_free() can release them
+ */
+static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 {
-        void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
+        void *res;
+        if (safe_needed)
+                do {
+                        res = (void *)get_zeroed_page(gfp_mask);
+                        if (res && PageNosaveFree(virt_to_page(res)))
+                                /* This is for swsusp_free() */
+                                SetPageNosave(virt_to_page(res));
+                } while (res && PageNosaveFree(virt_to_page(res)));
+        else
+                res = (void *)get_zeroed_page(gfp_mask);
        if (res) {
                SetPageNosave(virt_to_page(res));
                SetPageNosaveFree(virt_to_page(res));
@@ -280,6 +295,11 @@ static void *alloc_image_page(void)
        return res;
 }
+unsigned long get_safe_page(gfp_t gfp_mask)
+{
+        return (unsigned long)alloc_image_page(gfp_mask, 1);
+}
 /**
 *      alloc_pagedir - Allocate the page directory.
 *
@@ -293,21 +313,21 @@ static void *alloc_image_page(void)
 *      On each page we set up a list of struct_pbe elements.
 */
-struct pbe *alloc_pagedir(unsigned nr_pages)
+struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed)
 {
-        unsigned num;
+        unsigned int num;
        struct pbe *pblist, *pbe;
        if (!nr_pages)
                return NULL;
        pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
-        pblist = alloc_image_page();
+        pblist = alloc_image_page(gfp_mask, safe_needed);
        /* FIXME: rewrite this ugly loop */
        for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
                        pbe = pbe->next, num += PBES_PER_PAGE) {
                pbe += PB_PAGE_SKIP;
-                pbe->next = alloc_image_page();
+                pbe->next = alloc_image_page(gfp_mask, safe_needed);
        }
        if (!pbe) { /* get_zeroed_page() failed */
                free_pagedir(pblist);
@@ -329,7 +349,7 @@ void swsusp_free(void)
        for_each_zone(zone) {
                for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
                        if (pfn_valid(zone_pfn + zone->zone_start_pfn)) {
-                                struct page * page;
+                                struct page *page;
                                page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
                                if (PageNosave(page) && PageNosaveFree(page)) {
                                        ClearPageNosave(page);
@@ -348,31 +368,39 @@ void swsusp_free(void)
 *      free pages.
 */
-static int enough_free_mem(unsigned nr_pages)
+static int enough_free_mem(unsigned int nr_pages)
 {
        pr_debug("swsusp: available memory: %u pages\n", nr_free_pages());
        return nr_free_pages() > (nr_pages + PAGES_FOR_IO +
                (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
+int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
+{
+        struct pbe *p;
-static struct pbe *swsusp_alloc(unsigned nr_pages)
+        for_each_pbe (p, pblist) {
+                p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
+                if (!p->address)
+                        return -ENOMEM;
+        }
+        return 0;
+}
+static struct pbe *swsusp_alloc(unsigned int nr_pages)
 {
-        struct pbe *pblist, *p;
+        struct pbe *pblist;
-        if (!(pblist = alloc_pagedir(nr_pages))) {
+        if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) {
                printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
                return NULL;
        }
        create_pbe_list(pblist, nr_pages);
-        for_each_pbe (p, pblist) {
+        if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
-                p->address = (unsigned long)alloc_image_page();
+                printk(KERN_ERR "suspend: Allocating image pages failed.\n");
-                if (!p->address) {
+                swsusp_free();
-                        printk(KERN_ERR "suspend: Allocating image pages failed.\n");
+                return NULL;
-                        swsusp_free();
-                        return NULL;
-                }
        }
        return pblist;
@@ -380,14 +408,9 @@ static struct pbe *swsusp_alloc(unsigned nr_pages)
 asmlinkage int swsusp_save(void)
 {
-        unsigned nr_pages;
+        unsigned int nr_pages;
        pr_debug("swsusp: critical section: \n");
-        if (save_highmem()) {
-                printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n");
-                restore_highmem();
-                return -ENOMEM;
-        }
        drain_local_pages();
        nr_pages = count_data_pages();
@@ -407,11 +430,6 @@ asmlinkage int swsusp_save(void)
                return -ENOMEM;
        }
-        if (!enough_swap(nr_pages)) {
-                printk(KERN_ERR "swsusp: Not enough free swap\n");
-                return -ENOSPC;
-        }
        pagedir_nosave = swsusp_alloc(nr_pages);
        if (!pagedir_nosave)
                return -ENOMEM;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 12db1d2ad6..c05f46e734 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -73,6 +73,14 @@
 #include "power.h"
+#ifdef CONFIG_HIGHMEM
+int save_highmem(void);
+int restore_highmem(void);
+#else
+static int save_highmem(void) { return 0; }
+static int restore_highmem(void) { return 0; }
+#endif
 #define CIPHER "aes"
 #define MAXKEY 32
 #define MAXIV  32
@@ -85,18 +93,11 @@ unsigned int nr_copy_pages __nosavedata = 0;
 /* Suspend pagedir is allocated before final copy, therefore it
   must be freed after resume
-   Warning: this is evil. There are actually two pagedirs at time of
-   resume. One is "pagedir_save", which is empty frame allocated at
-   time of suspend, that must be freed. Second is "pagedir_nosave",
-   allocated at time of resume, that travels through memory not to
-   collide with anything.
   Warning: this is even more evil than it seems. Pagedirs this file
   talks about are completely different from page directories used by
   MMU hardware.
 */
 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
-suspend_pagedir_t *pagedir_save;
 #define SWSUSP_SIG      "S1SUSPEND"
@@ -122,8 +123,8 @@ static struct swsusp_info swsusp_info;
 static unsigned short swapfile_used[MAX_SWAPFILES];
 static unsigned short root_swap;
-static int write_page(unsigned long addr, swp_entry_t * loc);
+static int write_page(unsigned long addr, swp_entry_t *loc);
-static int bio_read_page(pgoff_t page_off, void * page);
+static int bio_read_page(pgoff_t page_off, void *page);
 static u8 key_iv[MAXKEY+MAXIV];
@@ -355,7 +356,7 @@ static void lock_swapdevices(void)
 *      This is a partial improvement, since we will at least return other
 *      errors, though we need to eventually fix the damn code.
 */
-static int write_page(unsigned long addr, swp_entry_t * loc)
+static int write_page(unsigned long addr, swp_entry_t *loc)
 {
        swp_entry_t entry;
        int error = 0;
@@ -383,9 +384,9 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
 static void data_free(void)
 {
        swp_entry_t entry;
-        struct pbe * p;
+        struct pbe *p;
-        for_each_pbe(p, pagedir_nosave) {
+        for_each_pbe (p, pagedir_nosave) {
                entry = p->swap_address;
                if (entry.val)
                        swap_free(entry);
@@ -492,8 +493,8 @@ static void free_pagedir_entries(void)
 static int write_pagedir(void)
 {
        int error = 0;
-        unsigned n = 0;
+        unsigned int n = 0;
-        struct pbe * pbe;
+        struct pbe *pbe;
        printk( "Writing pagedir...");
        for_each_pb_page (pbe, pagedir_nosave) {
@@ -507,6 +508,26 @@ static int write_pagedir(void)
 }
 /**
+ *      enough_swap - Make sure we have enough swap to save the image.
+ *
+ *      Returns TRUE or FALSE after checking the total amount of swap
+ *      space available.
+ *
+ *      FIXME: si_swapinfo(&i) returns all swap devices information.
+ *      We should only consider resume_device.
+ */
+static int enough_swap(unsigned int nr_pages)
+{
+        struct sysinfo i;
+        si_swapinfo(&i);
+        pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
+        return i.freeswap > (nr_pages + PAGES_FOR_IO +
+                (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
+}
+/**
 *      write_suspend_image - Write entire image and metadata.
 *
 */
@@ -514,6 +535,11 @@ static int write_suspend_image(void)
 {
        int error;
+        if (!enough_swap(nr_copy_pages)) {
+                printk(KERN_ERR "swsusp: Not enough free swap\n");
+                return -ENOSPC;
+        }
        init_header();
        if ((error = data_write()))
                goto FreeData;
@@ -533,27 +559,6 @@ static int write_suspend_image(void)
        goto Done;
 }
-/**
- *      enough_swap - Make sure we have enough swap to save the image.
- *
- *      Returns TRUE or FALSE after checking the total amount of swap
- *      space avaiable.
- *
- *      FIXME: si_swapinfo(&i) returns all swap devices information.
- *      We should only consider resume_device.
- */
-int enough_swap(unsigned nr_pages)
-{
-        struct sysinfo i;
-        si_swapinfo(&i);
-        pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
-        return i.freeswap > (nr_pages + PAGES_FOR_IO +
-                (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
-}
 /* It is important _NOT_ to umount filesystems at this point. We want
 * them synced (in case something goes wrong) but we DO not want to mark
 * filesystem clean: it is not. (And it does not matter, if we resume
@@ -563,12 +568,15 @@ int swsusp_write(void)
 {
        int error;
+        if ((error = swsusp_swap_check())) {
+                printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
+                return error;
+        }
        lock_swapdevices();
        error = write_suspend_image();
        /* This will unlock ignored swap devices since writing is finished */
        lock_swapdevices();
        return error;
 }
@@ -576,6 +584,7 @@ int swsusp_write(void)
 int swsusp_suspend(void)
 {
        int error;
        if ((error = arch_prepare_suspend()))
                return error;
        local_irq_disable();
@@ -587,15 +596,12 @@ int swsusp_suspend(void)
         */
        if ((error = device_power_down(PMSG_FREEZE))) {
                printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
-                local_irq_enable();
+                goto Enable_irqs;
-                return error;
        }
-        if ((error = swsusp_swap_check())) {
+        if ((error = save_highmem())) {
-                printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
+                printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
-                device_power_up();
+                goto Restore_highmem;
-                local_irq_enable();
-                return error;
        }
        save_processor_state();
@@ -603,8 +609,10 @@ int swsusp_suspend(void)
                printk(KERN_ERR "Error %d suspending\n", error);
        /* Restore control flow magically appears here */
        restore_processor_state();
+Restore_highmem:
        restore_highmem();
        device_power_up();
+Enable_irqs:
        local_irq_enable();
        return error;
 }
@@ -636,127 +644,43 @@ int swsusp_resume(void)
 }
 /**
- *      On resume, for storing the PBE list and the image,
+ *      mark_unsafe_pages - mark the pages that cannot be used for storing
- *      we can only use memory pages that do not conflict with the pages
+ *      the image during resume, because they conflict with the pages that
- *      which had been used before suspend.
+ *      had been used before suspend
- *
- *      We don't know which pages are usable until we allocate them.
- *
- *      Allocated but unusable (ie eaten) memory pages are marked so that
- *      swsusp_free() can release them
- */
-unsigned long get_safe_page(gfp_t gfp_mask)
-{
-        unsigned long m;
-        do {
-                m = get_zeroed_page(gfp_mask);
-                if (m && PageNosaveFree(virt_to_page(m)))
-                        /* This is for swsusp_free() */
-                        SetPageNosave(virt_to_page(m));
-        } while (m && PageNosaveFree(virt_to_page(m)));
-        if (m) {
-                /* This is for swsusp_free() */
-                SetPageNosave(virt_to_page(m));
-                SetPageNosaveFree(virt_to_page(m));
-        }
-        return m;
-}
-/**
- *      check_pagedir - We ensure here that pages that the PBEs point to
- *      won't collide with pages where we're going to restore from the loaded
- *      pages later
- */
-static int check_pagedir(struct pbe *pblist)
-{
-        struct pbe *p;
-        /* This is necessary, so that we can free allocated pages
-         * in case of failure
-         */
-        for_each_pbe (p, pblist)
-                p->address = 0UL;
-        for_each_pbe (p, pblist) {
-                p->address = get_safe_page(GFP_ATOMIC);
-                if (!p->address)
-                        return -ENOMEM;
-        }
-        return 0;
-}
-/**
- *      swsusp_pagedir_relocate - It is possible, that some memory pages
- *      occupied by the list of PBEs collide with pages where we're going to
- *      restore from the loaded pages later.  We relocate them here.
 */
-static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
+static void mark_unsafe_pages(struct pbe *pblist)
 {
        struct zone *zone;
        unsigned long zone_pfn;
-        struct pbe *pbpage, *tail, *p;
+        struct pbe *p;
-        void *m;
-        int rel = 0;
        if (!pblist) /* a sanity check */
-                return NULL;
+                return;
-        pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n",
-                        swsusp_info.pagedir_pages);
        /* Clear page flags */
        for_each_zone (zone) {
-                for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+                for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-                        if (pfn_valid(zone_pfn + zone->zone_start_pfn))
+                        if (pfn_valid(zone_pfn + zone->zone_start_pfn))
-                                ClearPageNosaveFree(pfn_to_page(zone_pfn +
+                                ClearPageNosaveFree(pfn_to_page(zone_pfn +
                                        zone->zone_start_pfn));
        }
        /* Mark orig addresses */
        for_each_pbe (p, pblist)
                SetPageNosaveFree(virt_to_page(p->orig_address));
-        tail = pblist + PB_PAGE_SKIP;
+}
-        /* Relocate colliding pages */
-        for_each_pb_page (pbpage, pblist) {
-                if (PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
-                        m = (void *)get_safe_page(GFP_ATOMIC | __GFP_COLD);
-                        if (!m)
-                                return NULL;
-                        memcpy(m, (void *)pbpage, PAGE_SIZE);
-                        if (pbpage == pblist)
-                                pblist = (struct pbe *)m;
-                        else
-                                tail->next = (struct pbe *)m;
-                        pbpage = (struct pbe *)m;
-                        /* We have to link the PBEs again */
-                        for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++)
-                                if (p->next) /* needed to save the end */
-                                        p->next = p + 1;
-                        rel++;
-                }
-                tail = pbpage + PB_PAGE_SKIP;
-        }
-        /* This is for swsusp_free() */
+static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
-        for_each_pb_page (pbpage, pblist) {
+{
-                SetPageNosave(virt_to_page(pbpage));
+        /* We assume both lists contain the same number of elements */
-                SetPageNosaveFree(virt_to_page(pbpage));
+        while (src) {
+                dst->orig_address = src->orig_address;
+                dst->swap_address = src->swap_address;
+                dst = dst->next;
+                src = src->next;
        }
-        printk("swsusp: Relocated %d pages\n", rel);
-        return pblist;
 }
 /*
@@ -770,7 +694,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
 static atomic_t io_done = ATOMIC_INIT(0);
-static int end_io(struct bio * bio, unsigned int num, int err)
+static int end_io(struct bio *bio, unsigned int num, int err)
 {
        if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
                panic("I/O error reading memory image");
@@ -778,7 +702,7 @@ static int end_io(struct bio * bio, unsigned int num, int err)
        return 0;
 }
-static struct block_device * resume_bdev;
+static struct block_device *resume_bdev;
 /**
 *      submit - submit BIO request.
@@ -791,10 +715,10 @@ static struct block_device * resume_bdev;
 *      Then submit it and wait.
 */
-static int submit(int rw, pgoff_t page_off, void * page)
+static int submit(int rw, pgoff_t page_off, void *page)
 {
        int error = 0;
-        struct bio * bio;
+        struct bio *bio;
        bio = bio_alloc(GFP_ATOMIC, 1);
        if (!bio)
@@ -823,12 +747,12 @@ static int submit(int rw, pgoff_t page_off, void * page)
        return error;
 }
-static int bio_read_page(pgoff_t page_off, void * page)
+static int bio_read_page(pgoff_t page_off, void *page)
 {
        return submit(READ, page_off, page);
 }
-static int bio_write_page(pgoff_t page_off, void * page)
+static int bio_write_page(pgoff_t page_off, void *page)
 {
        return submit(WRITE, page_off, page);
 }
@@ -838,7 +762,7 @@ static int bio_write_page(pgoff_t page_off, void * page)
 * I really don't think that it's foolproof but more than nothing..
 */
-static const char * sanity_check(void)
+static const char *sanity_check(void)
 {
        dump_info();
        if (swsusp_info.version_code != LINUX_VERSION_CODE)
@@ -864,7 +788,7 @@ static const char * sanity_check(void)
 static int check_header(void)
 {
-        const char * reason = NULL;
+        const char *reason = NULL;
        int error;
        if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info)))
@@ -895,7 +819,7 @@ static int check_sig(void)
                 * Reset swap signature now.
                 */
                error = bio_write_page(0, &swsusp_header);
-        } else { 
+        } else {
                return -EINVAL;
        }
        if (!error)
@@ -912,7 +836,7 @@ static int check_sig(void)
 static int data_read(struct pbe *pblist)
 {
-        struct pbe * p;
+        struct pbe *p;
        int error = 0;
        int i = 0;
        int mod = swsusp_info.image_pages / 100;
@@ -950,7 +874,7 @@ static int data_read(struct pbe *pblist)
 static int read_pagedir(struct pbe *pblist)
 {
        struct pbe *pbpage, *p;
-        unsigned i = 0;
+        unsigned int i = 0;
        int error;
        if (!pblist)
@@ -997,20 +921,25 @@ static int read_suspend_image(void)
        int error = 0;
        struct pbe *p;
-        if (!(p = alloc_pagedir(nr_copy_pages)))
+        if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0)))
                return -ENOMEM;
        if ((error = read_pagedir(p)))
                return error;
        create_pbe_list(p, nr_copy_pages);
+        mark_unsafe_pages(p);
-        if (!(pagedir_nosave = swsusp_pagedir_relocate(p)))
+        pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
+        if (pagedir_nosave) {
+                create_pbe_list(pagedir_nosave, nr_copy_pages);
+                copy_page_backup_list(pagedir_nosave, p);
+        }
+        free_pagedir(p);
+        if (!pagedir_nosave)
                return -ENOMEM;
        /* Allocate memory for the image and read the data from swap */
-        error = check_pagedir(pagedir_nosave);
+        error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1);
        if (!error)
                error = data_read(pagedir_nosave);
diff --git a/kernel/printk.c b/kernel/printk.c
index 3cb9708209..e9be027bc9 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -806,7 +806,6 @@ void console_unblank(void)
                        c->unblank();
        release_console_sem();
 }
-EXPORT_SYMBOL(console_unblank);
 /*
 * Return the console tty driver structure and its associated index
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 863eee8bff..b88d4186cd 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -155,7 +155,7 @@ int ptrace_attach(struct task_struct *task)
        retval = -EPERM;
        if (task->pid <= 1)
                goto bad;
-        if (task == current)
+        if (task->tgid == current->tgid)
                goto bad;
        /* the same process cannot be attached many times */
        if (task->ptrace & PT_PTRACED)
@@ -406,3 +406,85 @@ int ptrace_request(struct task_struct *child, long request,
        return ret;
 }
+#ifndef __ARCH_SYS_PTRACE
+/*
+ * ptrace_get_task_struct - resolve the target task of a ptrace request.
+ * @request: ptrace request code
+ * @pid:     pid of the task to operate on (ignored for PTRACE_TRACEME)
+ * @childp:  out parameter; set to the target task, or NULL if there is none
+ *
+ * PTRACE_TRACEME has no target task: current is flagged PT_PTRACED,
+ * *childp stays NULL and 0 is returned (-EPERM if current is already
+ * traced or security_ptrace() refuses).
+ *
+ * For every other request the task is looked up by pid under
+ * tasklist_lock and a reference is taken with get_task_struct(); the
+ * caller is responsible for the matching put_task_struct().
+ *
+ * Returns 0 on success, -EPERM for pid 1, -ESRCH if no task has that pid.
+ */
+static int ptrace_get_task_struct(long request, long pid,
+                struct task_struct **childp)
+{
+        struct task_struct *child;
+        int ret;
+        /*
+         * Callers use child == NULL as an indication to exit early even
+         * when the return value is 0 (the PTRACE_TRACEME case), so make
+         * sure it starts out NULL here.
+         */
+        *childp = NULL;
+        if (request == PTRACE_TRACEME) {
+                /*
+                 * Are we already being traced?
+                 */
+                if (current->ptrace & PT_PTRACED)
+                        return -EPERM;
+                /* Any security module refusal is reported as -EPERM. */
+                ret = security_ptrace(current->parent, current);
+                if (ret)
+                        return -EPERM;
+                /*
+                 * Set the ptrace bit in the process ptrace flags.
+                 */
+                current->ptrace |= PT_PTRACED;
+                return 0;
+        }
+        /*
+         * You may not mess with init
+         */
+        if (pid == 1)
+                return -EPERM;
+        ret = -ESRCH;
+        read_lock(&tasklist_lock);
+        child = find_task_by_pid(pid);
+        /* Take the reference while still holding tasklist_lock. */
+        if (child)
+                get_task_struct(child);
+        read_unlock(&tasklist_lock);
+        if (!child)
+                return -ESRCH;
+        *childp = child;
+        return 0;
+}
+/*
+ * sys_ptrace - generic ptrace system call entry point.
+ * @request: ptrace request code
+ * @pid:     pid of the traced task (unused for PTRACE_TRACEME)
+ * @addr:    request-specific address argument, passed to arch_ptrace()
+ * @data:    request-specific data argument, passed to arch_ptrace()
+ *
+ * Looks up the target task, handles PTRACE_ATTACH directly and hands all
+ * other requests to arch_ptrace() once ptrace_check_attach() has accepted
+ * the child.  Returns the result of whichever step ran last.
+ *
+ * ptrace_get_task_struct() hands back the child with a reference held,
+ * so every path that got a non-NULL child must leave through
+ * out_put_task_struct.
+ */
+asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
+{
+        struct task_struct *child;
+        long ret;
+        /*
+         * This lock_kernel fixes a subtle race with suid exec
+         */
+        lock_kernel();
+        ret = ptrace_get_task_struct(request, pid, &child);
+        /* NULL child: either an error or PTRACE_TRACEME, nothing to put. */
+        if (!child)
+                goto out;
+        if (request == PTRACE_ATTACH) {
+                ret = ptrace_attach(child);
+                /* Drop the lookup reference here too, or it is leaked. */
+                goto out_put_task_struct;
+        }
+        ret = ptrace_check_attach(child, request == PTRACE_KILL);
+        if (ret < 0)
+                goto out_put_task_struct;
+        ret = arch_ptrace(child, request, addr, data);
+        if (ret < 0)
+                goto out_put_task_struct;
+ out_put_task_struct:
+        put_task_struct(child);
+ out:
+        unlock_kernel();
+        return ret;
+}
+#endif /* __ARCH_SYS_PTRACE */
diff --git a/kernel/sched.c b/kernel/sched.c
index 340dd238c1..b6506671b2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -206,6 +206,7 @@ struct runqueue {
         */
        unsigned long nr_running;
 #ifdef CONFIG_SMP
+        unsigned long prio_bias;
        unsigned long cpu_load[3];
 #endif
        unsigned long long nr_switches;
@@ -659,13 +660,68 @@ static int effective_prio(task_t *p)
        return prio;
 }
+#ifdef CONFIG_SMP
+static inline void inc_prio_bias(runqueue_t *rq, int prio)
+{
+        rq->prio_bias += MAX_PRIO - prio;
+}
+static inline void dec_prio_bias(runqueue_t *rq, int prio)
+{
+        rq->prio_bias -= MAX_PRIO - prio;
+}
+/*
+ * inc_nr_running - account for task @p being added to runqueue @rq.
+ *
+ * Bumps rq->nr_running and adds the task's weight to rq->prio_bias:
+ * real-time tasks are weighted by p->prio, all others by p->static_prio.
+ */
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+        rq->nr_running++;
+        if (rt_task(p)) {
+                if (p != rq->migration_thread)
+                        /*
+                         * The migration thread does the actual balancing. Do
+                         * not bias by its priority as the ultra high priority
+                         * will skew balancing adversely.
+                         */
+                        inc_prio_bias(rq, p->prio);
+        } else
+                inc_prio_bias(rq, p->static_prio);
+}
+/*
+ * dec_nr_running - account for task @p being removed from runqueue @rq.
+ *
+ * Drops rq->nr_running and subtracts the task's weight from
+ * rq->prio_bias, using the same choice of priority as inc_nr_running():
+ * p->prio for real-time tasks, p->static_prio otherwise.
+ */
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+        rq->nr_running--;
+        if (rt_task(p)) {
+                /* The migration thread was never added to the bias. */
+                if (p != rq->migration_thread)
+                        dec_prio_bias(rq, p->prio);
+        } else
+                dec_prio_bias(rq, p->static_prio);
+}
+#else
+static inline void inc_prio_bias(runqueue_t *rq, int prio)
+{
+}
+static inline void dec_prio_bias(runqueue_t *rq, int prio)
+{
+}
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+        rq->nr_running++;
+}
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+        rq->nr_running--;
+}
+#endif
 /*
 * __activate_task - move a task to the runqueue.
 */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
        enqueue_task(p, rq->active);
-        rq->nr_running++;
+        inc_nr_running(p, rq);
 }
 /*
@@ -674,7 +730,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
        enqueue_task_head(p, rq->active);
-        rq->nr_running++;
+        inc_nr_running(p, rq);
 }
 static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -759,7 +815,8 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
        }
 #endif
-        p->prio = recalc_task_prio(p, now);
+        if (!rt_task(p))
+                p->prio = recalc_task_prio(p, now);
        /*
         * This checks to make sure it's not an uninterruptible task
@@ -793,7 +850,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
 */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-        rq->nr_running--;
+        dec_nr_running(p, rq);
        dequeue_task(p, p->array);
        p->array = NULL;
 }
@@ -808,21 +865,28 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 #ifdef CONFIG_SMP
 static void resched_task(task_t *p)
 {
-        int need_resched, nrpolling;
+        int cpu;
        assert_spin_locked(&task_rq(p)->lock);
-        /* minimise the chance of sending an interrupt to poll_idle() */
+        if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
-        nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
+                return;
-        need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
-        nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
+        set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+        cpu = task_cpu(p);
+        if (cpu == smp_processor_id())
+                return;
-        if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
+        /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */
-                smp_send_reschedule(task_cpu(p));
+        smp_mb();
+        if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG))
+                smp_send_reschedule(cpu);
 }
 #else
 static inline void resched_task(task_t *p)
 {
+        assert_spin_locked(&task_rq(p)->lock);
        set_tsk_need_resched(p);
 }
 #endif
@@ -930,27 +994,61 @@ void kick_process(task_t *p)
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
-static inline unsigned long source_load(int cpu, int type)
+static inline unsigned long __source_load(int cpu, int type, enum idle_type idle)
 {
        runqueue_t *rq = cpu_rq(cpu);
-        unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+        unsigned long running = rq->nr_running;
+        unsigned long source_load, load_now = running * SCHED_LOAD_SCALE;
        if (type == 0)
-                return load_now;
+                source_load = load_now;
+        else
+                /*
+                 * Only index cpu_load[] here: with type == 0 the index
+                 * type-1 would be -1, i.e. a read before the array.
+                 */
+                source_load = min(rq->cpu_load[type-1], load_now);
+        if (running > 1 || (idle == NOT_IDLE && running))
+                /*
+                 * If we are busy rebalancing the load is biased by
+                 * priority to create 'nice' support across cpus. When
+                 * idle rebalancing we should only bias the source_load if
+                 * there is more than one task running on that queue to
+                 * prevent idle rebalance from trying to pull tasks from a
+                 * queue with only one running task.
+                 */
+                source_load = source_load * rq->prio_bias / running;
+        return source_load;
+}
-        return min(rq->cpu_load[type-1], load_now);
+/* source_load() keeps the old interface and always biases as NOT_IDLE. */
+static inline unsigned long source_load(int cpu, int type)
+{
+        return __source_load(cpu, type, NOT_IDLE);
 }
 /*
 * Return a high guess at the load of a migration-target cpu
 */
-static inline unsigned long target_load(int cpu, int type)
+static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
 {
        runqueue_t *rq = cpu_rq(cpu);
-        unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+        unsigned long running = rq->nr_running;
+        unsigned long target_load, load_now = running * SCHED_LOAD_SCALE;
        if (type == 0)
-                return load_now;
+                target_load = load_now;
+        else
+                /*
+                 * Only index cpu_load[] here: with type == 0 the index
+                 * type-1 would be -1, i.e. a read before the array.
+                 */
+                target_load = max(rq->cpu_load[type-1], load_now);
+        /*
+         * Scale by the average priority bias per running task; the
+         * condition guarantees running != 0 before the division.
+         */
+        if (running > 1 || (idle == NOT_IDLE && running))
+                target_load = target_load * rq->prio_bias / running;
+        return target_load;
+}
-        return max(rq->cpu_load[type-1], load_now);
+/* target_load() keeps the old interface and always biases as NOT_IDLE. */
+static inline unsigned long target_load(int cpu, int type)
+{
+        return __target_load(cpu, type, NOT_IDLE);
 }
 /*
@@ -1411,7 +1509,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
                                list_add_tail(&p->run_list, &current->run_list);
                                p->array = current->array;
                                p->array->nr_active++;
-                                rq->nr_running++;
+                                inc_nr_running(p, rq);
                        }
                        set_need_resched();
                } else
@@ -1468,7 +1566,7 @@ void fastcall sched_exit(task_t *p)
         * the sleep_avg of the parent as well.
         */
        rq = task_rq_lock(p->parent, &flags);
-        if (p->first_time_slice) {
+        if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
                p->parent->time_slice += p->time_slice;
                if (unlikely(p->parent->time_slice > task_timeslice(p)))
                        p->parent->time_slice = task_timeslice(p);
@@ -1756,9 +1854,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
               runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
        dequeue_task(p, src_array);
-        src_rq->nr_running--;
+        dec_nr_running(p, src_rq);
        set_task_cpu(p, this_cpu);
-        this_rq->nr_running++;
+        inc_nr_running(p, this_rq);
        enqueue_task(p, this_array);
        p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
                                + this_rq->timestamp_last_tick;
@@ -1937,9 +2035,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
-                                load = target_load(i, load_idx);
+                                load = __target_load(i, load_idx, idle);
                        else
-                                load = source_load(i, load_idx);
+                                load = __source_load(i, load_idx, idle);
                        avg_load += load;
                }
@@ -2044,14 +2142,15 @@ out_balanced:
 /*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
-static runqueue_t *find_busiest_queue(struct sched_group *group)
+static runqueue_t *find_busiest_queue(struct sched_group *group,
+        enum idle_type idle)
 {
        unsigned long load, max_load = 0;
        runqueue_t *busiest = NULL;
        int i;
        for_each_cpu_mask(i, group->cpumask) {
-                load = source_load(i, 0);
+                load = __source_load(i, 0, idle);
                if (load > max_load) {
                        max_load = load;
@@ -2095,7 +2194,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                goto out_balanced;
        }
-        busiest = find_busiest_queue(group);
+        busiest = find_busiest_queue(group, idle);
        if (!busiest) {
                schedstat_inc(sd, lb_nobusyq[idle]);
                goto out_balanced;
@@ -2218,7 +2317,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
                goto out_balanced;
        }
-        busiest = find_busiest_queue(group);
+        busiest = find_busiest_queue(group, NEWLY_IDLE);
        if (!busiest) {
                schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
                goto out_balanced;
@@ -3451,8 +3550,10 @@ void set_user_nice(task_t *p, long nice)
                goto out_unlock;
        }
        array = p->array;
-        if (array)
+        if (array) {
                dequeue_task(p, array);
+                dec_prio_bias(rq, p->static_prio);
+        }
        old_prio = p->prio;
        new_prio = NICE_TO_PRIO(nice);
@@ -3462,6 +3563,7 @@ void set_user_nice(task_t *p, long nice)
        if (array) {
                enqueue_task(p, array);
+                inc_prio_bias(rq, p->static_prio);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -3563,8 +3665,6 @@ int idle_cpu(int cpu)
        return cpu_curr(cpu) == cpu_rq(cpu)->idle;
 }
-EXPORT_SYMBOL_GPL(idle_cpu);
 /**
 * idle_task - return the idle task for a given cpu.
 * @cpu: the processor in question.
@@ -4680,7 +4780,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
 #ifdef CONFIG_HOTPLUG_CPU
        case CPU_UP_CANCELED:
                /* Unbind it from offline cpu so it can run.  Fall thru. */
-                kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
+                kthread_bind(cpu_rq(cpu)->migration_thread,
+                             any_online_cpu(cpu_online_map));
                kthread_stop(cpu_rq(cpu)->migration_thread);
                cpu_rq(cpu)->migration_thread = NULL;
                break;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f766b2fc48..ad3295cdde 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -470,7 +470,8 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 #ifdef CONFIG_HOTPLUG_CPU
        case CPU_UP_CANCELED:
                /* Unbind so it can run.  Fall thru. */
-                kthread_bind(per_cpu(ksoftirqd, hotcpu), smp_processor_id());
+                kthread_bind(per_cpu(ksoftirqd, hotcpu),
+                             any_online_cpu(cpu_online_map));
        case CPU_DEAD:
                p = per_cpu(ksoftirqd, hotcpu);
                per_cpu(ksoftirqd, hotcpu) = NULL;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 75976209ce..c67189a25d 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -73,9 +73,6 @@ void softlockup_tick(struct pt_regs *regs)
 static int watchdog(void * __bind_cpu)
 {
        struct sched_param param = { .sched_priority = 99 };
-        int this_cpu = (long) __bind_cpu;
-        printk("softlockup thread %d started up.\n", this_cpu);
        sched_setscheduler(current, SCHED_FIFO, &param);
        current->flags |= PF_NOFREEZE;
@@ -123,7 +120,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 #ifdef CONFIG_HOTPLUG_CPU
        case CPU_UP_CANCELED:
                /* Unbind so it can run.  Fall thru. */
-                kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
+                kthread_bind(per_cpu(watchdog_task, hotcpu),
+                             any_online_cpu(cpu_online_map));
        case CPU_DEAD:
                p = per_cpu(watchdog_task, hotcpu);
                per_cpu(watchdog_task, hotcpu) = NULL;
diff --git a/kernel/sys.c b/kernel/sys.c
index 2fa1ed1812..c43b3e22bb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
 #include <linux/suspend.h>
 #include <linux/tty.h>
 #include <linux/signal.h>
+#include <linux/cn_proc.h>
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -375,18 +376,21 @@ void emergency_restart(void)
 }
 EXPORT_SYMBOL_GPL(emergency_restart);
-/**
- *      kernel_restart - reboot the system
- *
- *      Shutdown everything and perform a clean reboot.
- *      This is not safe to call in interrupt context.
- */
 void kernel_restart_prepare(char *cmd)
 {
        notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
        system_state = SYSTEM_RESTART;
        device_shutdown();
 }
+/**
+ *      kernel_restart - reboot the system
+ *      @cmd: pointer to buffer containing command to execute for restart
+ *              or %NULL
+ *
+ *      Shutdown everything and perform a clean reboot.
+ *      This is not safe to call in interrupt context.
+ */
 void kernel_restart(char *cmd)
 {
        kernel_restart_prepare(cmd);
@@ -623,6 +627,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
        current->egid = new_egid;
        current->gid = new_rgid;
        key_fsgid_changed(current);
+        proc_id_connector(current, PROC_EVENT_GID);
        return 0;
 }
@@ -662,6 +667,7 @@ asmlinkage long sys_setgid(gid_t gid)
                return -EPERM;
        key_fsgid_changed(current);
+        proc_id_connector(current, PROC_EVENT_GID);
        return 0;
 }
  
@@ -751,6 +757,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
        current->fsuid = current->euid;
        key_fsuid_changed(current);
+        proc_id_connector(current, PROC_EVENT_UID);
        return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
 }
@@ -798,6 +805,7 @@ asmlinkage long sys_setuid(uid_t uid)
        current->suid = new_suid;
        key_fsuid_changed(current);
+        proc_id_connector(current, PROC_EVENT_UID);
        return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
 }
@@ -846,6 +854,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
                current->suid = suid;
        key_fsuid_changed(current);
+        proc_id_connector(current, PROC_EVENT_UID);
        return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
 }
@@ -898,6 +907,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
                current->sgid = sgid;
        key_fsgid_changed(current);
+        proc_id_connector(current, PROC_EVENT_GID);
        return 0;
 }
@@ -940,6 +950,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
        }
        key_fsuid_changed(current);
+        proc_id_connector(current, PROC_EVENT_UID);
        security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
@@ -968,6 +979,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
                }
                current->fsgid = gid;
                key_fsgid_changed(current);
+                proc_id_connector(current, PROC_EVENT_GID);
        }
        return old_fsgid;
 }
@@ -1485,8 +1497,6 @@ EXPORT_SYMBOL(in_egroup_p);
 DECLARE_RWSEM(uts_sem);
-EXPORT_SYMBOL(uts_sem);
 asmlinkage long sys_newuname(struct new_utsname __user * name)
 {
        int errno = 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8e56e24955..9990e10192 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -169,7 +169,7 @@ struct file_operations proc_sys_file_operations = {
 extern struct proc_dir_entry *proc_sys_root;
-static void register_proc_table(ctl_table *, struct proc_dir_entry *);
+static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *);
 static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
 #endif
@@ -952,7 +952,7 @@ static ctl_table fs_table[] = {
                .data           = &aio_nr,
                .maxlen         = sizeof(aio_nr),
                .mode           = 0444,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = &proc_doulongvec_minmax,
        },
        {
                .ctl_name       = FS_AIO_MAX_NR,
@@ -960,7 +960,7 @@ static ctl_table fs_table[] = {
                .data           = &aio_max_nr,
                .maxlen         = sizeof(aio_max_nr),
                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
+                .proc_handler   = &proc_doulongvec_minmax,
        },
 #ifdef CONFIG_INOTIFY
        {
@@ -992,10 +992,51 @@ static ctl_table dev_table[] = {
 extern void init_irq_proc (void);
+static DEFINE_SPINLOCK(sysctl_lock);
+/*
+ * Try to take a use reference on a sysctl table header.
+ *
+ * Returns 1 after incrementing p->used, or 0 (leaving the count alone)
+ * when p->unregistering is already set, i.e. start_unregistering() is
+ * waiting for the existing users to go away.  Every successful call
+ * must be paired with unuse_table().
+ *
+ * called under sysctl_lock
+ */
+static int use_table(struct ctl_table_header *p)
+{
+        if (unlikely(p->unregistering))
+                return 0;       /* removal in progress: refuse new users */
+        p->used++;
+        return 1;
+}
+/*
+ * Drop a use reference previously taken with use_table().
+ *
+ * When the count reaches zero and p->unregistering points at a
+ * completion, signal it so that the wait_for_completion() in
+ * start_unregistering() can return.
+ *
+ * called under sysctl_lock
+ */
+static void unuse_table(struct ctl_table_header *p)
+{
+        if (!--p->used)         /* last user gone? */
+                if (unlikely(p->unregistering))
+                        complete(p->unregistering);
+}
+/*
+ * Begin removal of a sysctl table header: wait until nobody is using it,
+ * then unlink it from the list it sits on via ctl_entry.
+ *
+ * Setting p->unregistering makes use_table() refuse new users; the last
+ * unuse_table() completes the on-stack completion we publish there.
+ *
+ * called under sysctl_lock, will reacquire if has to wait
+ */
+static void start_unregistering(struct ctl_table_header *p)
+{
+        /*
+         * if p->used is 0, nobody will ever touch that entry again;
+         * we'll eliminate all paths to it before dropping sysctl_lock
+         */
+        if (unlikely(p->used)) {
+                struct completion wait; /* lives on this stack frame only */
+                init_completion(&wait);
+                p->unregistering = &wait;
+                /* drop the spinlock around the wait, retake it afterwards */
+                spin_unlock(&sysctl_lock);
+                wait_for_completion(&wait);
+                spin_lock(&sysctl_lock);
+        }
+        /*
+         * do not remove from the list until nobody holds it; walking the
+         * list in do_sysctl() relies on that.
+         */
+        list_del_init(&p->ctl_entry);
+}
 void __init sysctl_init(void)
 {
 #ifdef CONFIG_PROC_FS
-        register_proc_table(root_table, proc_sys_root);
+        register_proc_table(root_table, proc_sys_root, &root_table_header);
        init_irq_proc();
 #endif
 }
@@ -1004,6 +1045,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
               void __user *newval, size_t newlen)
 {
        struct list_head *tmp;
+        int error = -ENOTDIR;
        if (nlen <= 0 || nlen >= CTL_MAXNAME)
                return -ENOTDIR;
@@ -1012,20 +1054,30 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
                if (!oldlenp || get_user(old_len, oldlenp))
                        return -EFAULT;
        }
+        spin_lock(&sysctl_lock);
        tmp = &root_table_header.ctl_entry;
        do {
                struct ctl_table_header *head =
                        list_entry(tmp, struct ctl_table_header, ctl_entry);
                void *context = NULL;
-                int error = parse_table(name, nlen, oldval, oldlenp, 
+                if (!use_table(head))
+                        continue;
+                spin_unlock(&sysctl_lock);
+                error = parse_table(name, nlen, oldval, oldlenp, 
                                        newval, newlen, head->ctl_table,
                                        &context);
                kfree(context);
+                spin_lock(&sysctl_lock);
+                unuse_table(head);
                if (error != -ENOTDIR)
-                        return error;
+                        break;
-                tmp = tmp->next;
+        } while ((tmp = tmp->next) != &root_table_header.ctl_entry);
-        } while (tmp != &root_table_header.ctl_entry);
+        spin_unlock(&sysctl_lock);
-        return -ENOTDIR;
+        return error;
 }
 asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
@@ -1236,12 +1288,16 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
                return NULL;
        tmp->ctl_table = table;
        INIT_LIST_HEAD(&tmp->ctl_entry);
+        tmp->used = 0;
+        tmp->unregistering = NULL;
+        spin_lock(&sysctl_lock);
        if (insert_at_head)
                list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
        else
                list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
+        spin_unlock(&sysctl_lock);
 #ifdef CONFIG_PROC_FS
-        register_proc_table(table, proc_sys_root);
+        register_proc_table(table, proc_sys_root, tmp);
 #endif
        return tmp;
 }
@@ -1255,10 +1311,13 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
 */
 void unregister_sysctl_table(struct ctl_table_header * header)
 {
-        list_del(&header->ctl_entry);
+        might_sleep();
+        spin_lock(&sysctl_lock);
+        start_unregistering(header);
 #ifdef CONFIG_PROC_FS
        unregister_proc_table(header->ctl_table, proc_sys_root);
 #endif
+        spin_unlock(&sysctl_lock);
        kfree(header);
 }
@@ -1269,7 +1328,7 @@ void unregister_sysctl_table(struct ctl_table_header * header)
 #ifdef CONFIG_PROC_FS
 /* Scan the sysctl entries in table and add them all into /proc */
-static void register_proc_table(ctl_table * table, struct proc_dir_entry *root)
+static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
 {
        struct proc_dir_entry *de;
        int len;
@@ -1305,13 +1364,14 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root)
                        de = create_proc_entry(table->procname, mode, root);
                        if (!de)
                                continue;
+                        de->set = set;
                        de->data = (void *) table;
                        if (table->proc_handler)
                                de->proc_fops = &proc_sys_file_operations;
                }
                table->de = de;
                if (de->mode & S_IFDIR)
-                        register_proc_table(table->child, de);
+                        register_proc_table(table->child, de, set);
        }
 }
@@ -1336,6 +1396,13 @@ static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root
                                continue;
                }
+                /*
+                 * In any case, mark the entry as a goner; we'll keep it
+                 * around if it's busy, but we'll know to do nothing with
+                 * its fields.  We are under sysctl_lock here.
+                 */
+                de->data = NULL;
                /* Don't unregister proc entries that are still being used.. */
                if (atomic_read(&de->count))
                        continue;
@@ -1349,27 +1416,38 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
                          size_t count, loff_t *ppos)
 {
        int op;
-        struct proc_dir_entry *de;
+        struct proc_dir_entry *de = PDE(file->f_dentry->d_inode);
        struct ctl_table *table;
        size_t res;
-        ssize_t error;
+        ssize_t error = -ENOTDIR;
-        
-        de = PDE(file->f_dentry->d_inode);
-        if (!de || !de->data)
-                return -ENOTDIR;
-        table = (struct ctl_table *) de->data;
-        if (!table || !table->proc_handler)
-                return -ENOTDIR;
-        op = (write ? 002 : 004);
-        if (ctl_perm(table, op))
-                return -EPERM;
        
-        res = count;
+        spin_lock(&sysctl_lock);
+        if (de && de->data && use_table(de->set)) {
-        error = (*table->proc_handler) (table, write, file, buf, &res, ppos);
+                /*
-        if (error)
+                 * at that point we know that sysctl was not unregistered
-                return error;
+                 * and won't be until we finish
-        return res;
+                 */
+                spin_unlock(&sysctl_lock);
+                table = (struct ctl_table *) de->data;
+                if (!table || !table->proc_handler)
+                        goto out;
+                error = -EPERM;
+                op = (write ? 002 : 004);
+                if (ctl_perm(table, op))
+                        goto out;
+                
+                /* careful: calling conventions are nasty here */
+                res = count;
+                error = (*table->proc_handler)(table, write, file,
+                                                buf, &res, ppos);
+                if (!error)
+                        error = res;
+        out:
+                spin_lock(&sysctl_lock);
+                unuse_table(de->set);
+        }
+        spin_unlock(&sysctl_lock);
+        return error;
 }
 static int proc_opensys(struct inode *inode, struct file *file)
@@ -1997,6 +2075,7 @@ int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
 * @filp: the file structure
 * @buffer: the user buffer
 * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
 *
 * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
 * values from/to the user buffer, treated as an ASCII string. 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7cee222231..42df83d7fa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -524,7 +524,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
                list_for_each_entry(wq, &workqueues, list) {
                        /* Unbind so it can run. */
                        kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
-                                     smp_processor_id());
+                                     any_online_cpu(cpu_online_map));
                        cleanup_workqueue_thread(wq, hotcpu);
                }
                break;