author     Len Brown <len.brown@intel.com>    2005-12-06 17:31:30 -0500
committer  Len Brown <len.brown@intel.com>    2005-12-06 17:31:30 -0500
commit     3d5271f9883cba7b54762bc4fe027d4172f06db7 (patch)
tree       ab8a881a14478598a0c8bda0d26c62cdccfffd6d /kernel
parent     378b2556f4e09fa6f87ff0cb5c4395ff28257d02 (diff)
parent     9115a6c787596e687df03010d97fccc5e0762506 (diff)
Pull release into acpica branch
Diffstat (limited to 'kernel')
43 files changed, 2573 insertions, 1620 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ff4dc02ce170..4f5a1453093a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_KEXEC) += kexec.o | |||
22 | obj-$(CONFIG_COMPAT) += compat.o | 22 | obj-$(CONFIG_COMPAT) += compat.o |
23 | obj-$(CONFIG_CPUSETS) += cpuset.o | 23 | obj-$(CONFIG_CPUSETS) += cpuset.o |
24 | obj-$(CONFIG_IKCONFIG) += configs.o | 24 | obj-$(CONFIG_IKCONFIG) += configs.o |
25 | obj-$(CONFIG_IKCONFIG_PROC) += configs.o | ||
26 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o | 25 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o |
27 | obj-$(CONFIG_AUDIT) += audit.o | 26 | obj-$(CONFIG_AUDIT) += audit.o |
28 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | 27 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
@@ -32,6 +31,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | |||
32 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 31 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
33 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 32 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
34 | obj-$(CONFIG_SECCOMP) += seccomp.o | 33 | obj-$(CONFIG_SECCOMP) += seccomp.o |
34 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | ||
35 | 35 | ||
36 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 36 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
37 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 37 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/acct.c b/kernel/acct.c
index b756f527497e..6312d6bd43e3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -54,6 +54,7 @@ | |||
54 | #include <linux/jiffies.h> | 54 | #include <linux/jiffies.h> |
55 | #include <linux/times.h> | 55 | #include <linux/times.h> |
56 | #include <linux/syscalls.h> | 56 | #include <linux/syscalls.h> |
57 | #include <linux/mount.h> | ||
57 | #include <asm/uaccess.h> | 58 | #include <asm/uaccess.h> |
58 | #include <asm/div64.h> | 59 | #include <asm/div64.h> |
59 | #include <linux/blkdev.h> /* sector_div */ | 60 | #include <linux/blkdev.h> /* sector_div */ |
@@ -192,6 +193,7 @@ static void acct_file_reopen(struct file *file) | |||
192 | add_timer(&acct_globals.timer); | 193 | add_timer(&acct_globals.timer); |
193 | } | 194 | } |
194 | if (old_acct) { | 195 | if (old_acct) { |
196 | mnt_unpin(old_acct->f_vfsmnt); | ||
195 | spin_unlock(&acct_globals.lock); | 197 | spin_unlock(&acct_globals.lock); |
196 | do_acct_process(0, old_acct); | 198 | do_acct_process(0, old_acct); |
197 | filp_close(old_acct, NULL); | 199 | filp_close(old_acct, NULL); |
@@ -199,6 +201,42 @@ static void acct_file_reopen(struct file *file) | |||
199 | } | 201 | } |
200 | } | 202 | } |
201 | 203 | ||
204 | static int acct_on(char *name) | ||
205 | { | ||
206 | struct file *file; | ||
207 | int error; | ||
208 | |||
209 | /* Difference from BSD - they don't do O_APPEND */ | ||
210 | file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); | ||
211 | if (IS_ERR(file)) | ||
212 | return PTR_ERR(file); | ||
213 | |||
214 | if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { | ||
215 | filp_close(file, NULL); | ||
216 | return -EACCES; | ||
217 | } | ||
218 | |||
219 | if (!file->f_op->write) { | ||
220 | filp_close(file, NULL); | ||
221 | return -EIO; | ||
222 | } | ||
223 | |||
224 | error = security_acct(file); | ||
225 | if (error) { | ||
226 | filp_close(file, NULL); | ||
227 | return error; | ||
228 | } | ||
229 | |||
230 | spin_lock(&acct_globals.lock); | ||
231 | mnt_pin(file->f_vfsmnt); | ||
232 | acct_file_reopen(file); | ||
233 | spin_unlock(&acct_globals.lock); | ||
234 | |||
235 | mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ | ||
236 | |||
237 | return 0; | ||
238 | } | ||
239 | |||
202 | /** | 240 | /** |
203 | * sys_acct - enable/disable process accounting | 241 | * sys_acct - enable/disable process accounting |
204 | * @name: file name for accounting records or NULL to shutdown accounting | 242 | * @name: file name for accounting records or NULL to shutdown accounting |
@@ -212,47 +250,41 @@ static void acct_file_reopen(struct file *file) | |||
212 | */ | 250 | */ |
213 | asmlinkage long sys_acct(const char __user *name) | 251 | asmlinkage long sys_acct(const char __user *name) |
214 | { | 252 | { |
215 | struct file *file = NULL; | ||
216 | char *tmp; | ||
217 | int error; | 253 | int error; |
218 | 254 | ||
219 | if (!capable(CAP_SYS_PACCT)) | 255 | if (!capable(CAP_SYS_PACCT)) |
220 | return -EPERM; | 256 | return -EPERM; |
221 | 257 | ||
222 | if (name) { | 258 | if (name) { |
223 | tmp = getname(name); | 259 | char *tmp = getname(name); |
224 | if (IS_ERR(tmp)) { | 260 | if (IS_ERR(tmp)) |
225 | return (PTR_ERR(tmp)); | 261 | return (PTR_ERR(tmp)); |
226 | } | 262 | error = acct_on(tmp); |
227 | /* Difference from BSD - they don't do O_APPEND */ | ||
228 | file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0); | ||
229 | putname(tmp); | 263 | putname(tmp); |
230 | if (IS_ERR(file)) { | 264 | } else { |
231 | return (PTR_ERR(file)); | 265 | error = security_acct(NULL); |
232 | } | 266 | if (!error) { |
233 | if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { | 267 | spin_lock(&acct_globals.lock); |
234 | filp_close(file, NULL); | 268 | acct_file_reopen(NULL); |
235 | return (-EACCES); | 269 | spin_unlock(&acct_globals.lock); |
236 | } | ||
237 | |||
238 | if (!file->f_op->write) { | ||
239 | filp_close(file, NULL); | ||
240 | return (-EIO); | ||
241 | } | 270 | } |
242 | } | 271 | } |
272 | return error; | ||
273 | } | ||
243 | 274 | ||
244 | error = security_acct(file); | 275 | /** |
245 | if (error) { | 276 | * acct_auto_close - turn off a filesystem's accounting if it is on |
246 | if (file) | 277 | * @m: vfsmount being shut down |
247 | filp_close(file, NULL); | 278 | * |
248 | return error; | 279 | * If the accounting is turned on for a file in the subtree pointed to |
249 | } | 280 | * by m, turn accounting off. Done when m is about to die. |
250 | 281 | */ | |
282 | void acct_auto_close_mnt(struct vfsmount *m) | ||
283 | { | ||
251 | spin_lock(&acct_globals.lock); | 284 | spin_lock(&acct_globals.lock); |
252 | acct_file_reopen(file); | 285 | if (acct_globals.file && acct_globals.file->f_vfsmnt == m) |
286 | acct_file_reopen(NULL); | ||
253 | spin_unlock(&acct_globals.lock); | 287 | spin_unlock(&acct_globals.lock); |
254 | |||
255 | return (0); | ||
256 | } | 288 | } |
257 | 289 | ||
258 | /** | 290 | /** |
@@ -266,8 +298,8 @@ void acct_auto_close(struct super_block *sb) | |||
266 | { | 298 | { |
267 | spin_lock(&acct_globals.lock); | 299 | spin_lock(&acct_globals.lock); |
268 | if (acct_globals.file && | 300 | if (acct_globals.file && |
269 | acct_globals.file->f_dentry->d_inode->i_sb == sb) { | 301 | acct_globals.file->f_vfsmnt->mnt_sb == sb) { |
270 | acct_file_reopen((struct file *)NULL); | 302 | acct_file_reopen(NULL); |
271 | } | 303 | } |
272 | spin_unlock(&acct_globals.lock); | 304 | spin_unlock(&acct_globals.lock); |
273 | } | 305 | } |
@@ -553,7 +585,7 @@ void acct_update_integrals(struct task_struct *tsk) | |||
553 | if (delta == 0) | 585 | if (delta == 0) |
554 | return; | 586 | return; |
555 | tsk->acct_stimexpd = tsk->stime; | 587 | tsk->acct_stimexpd = tsk->stime; |
556 | tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); | 588 | tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); |
557 | tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; | 589 | tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; |
558 | } | 590 | } |
559 | } | 591 | } |
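
The acct.c hunks above factor the file-validation half of sys_acct() into a new acct_on() helper and switch the "keep the accounting file's mount alive" logic from holding an active reference to pinning the vfsmount (mnt_pin()/mnt_unpin()), with acct_auto_close_mnt() shutting accounting down when the pinned mount goes away. A rough userspace sketch of the open-validate-register shape acct_on() now has; fake_register_acct_file() is an invented stand-in for the security_acct()/acct_file_reopen() handoff, not a real API:

    /* Userspace sketch only -- not kernel code. */
    #include <errno.h>
    #include <fcntl.h>
    #include <sys/stat.h>
    #include <unistd.h>

    static int fake_register_acct_file(int fd) { (void)fd; return 0; }

    int acct_on_sketch(const char *name)
    {
        /* Difference from BSD noted in the patch: the log is opened O_APPEND. */
        int fd = open(name, O_WRONLY | O_APPEND);
        if (fd < 0)
            return -errno;

        struct stat st;
        if (fstat(fd, &st) < 0 || !S_ISREG(st.st_mode)) {
            close(fd);
            return -EACCES;            /* accounting only writes to regular files */
        }

        int err = fake_register_acct_file(fd);
        if (err) {
            close(fd);                 /* unwind on failure, as acct_on() does */
            return err;
        }
        return 0;                      /* on success the accounting core owns fd */
    }
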
diff --git a/kernel/audit.c b/kernel/audit.c
index 83096b67510a..0c56320d38dc 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -133,7 +133,7 @@ struct audit_buffer { | |||
133 | struct list_head list; | 133 | struct list_head list; |
134 | struct sk_buff *skb; /* formatted skb ready to send */ | 134 | struct sk_buff *skb; /* formatted skb ready to send */ |
135 | struct audit_context *ctx; /* NULL or associated context */ | 135 | struct audit_context *ctx; /* NULL or associated context */ |
136 | int gfp_mask; | 136 | gfp_t gfp_mask; |
137 | }; | 137 | }; |
138 | 138 | ||
139 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) | 139 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) |
@@ -560,7 +560,7 @@ static void audit_buffer_free(struct audit_buffer *ab) | |||
560 | } | 560 | } |
561 | 561 | ||
562 | static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, | 562 | static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, |
563 | unsigned int __nocast gfp_mask, int type) | 563 | gfp_t gfp_mask, int type) |
564 | { | 564 | { |
565 | unsigned long flags; | 565 | unsigned long flags; |
566 | struct audit_buffer *ab = NULL; | 566 | struct audit_buffer *ab = NULL; |
@@ -647,7 +647,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, | |||
647 | * will be written at syscall exit. If there is no associated task, tsk | 647 | * will be written at syscall exit. If there is no associated task, tsk |
648 | * should be NULL. */ | 648 | * should be NULL. */ |
649 | 649 | ||
650 | struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, | 650 | struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, |
651 | int type) | 651 | int type) |
652 | { | 652 | { |
653 | struct audit_buffer *ab = NULL; | 653 | struct audit_buffer *ab = NULL; |
@@ -879,7 +879,7 @@ void audit_log_end(struct audit_buffer *ab) | |||
879 | /* Log an audit record. This is a convenience function that calls | 879 | /* Log an audit record. This is a convenience function that calls |
880 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be | 880 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be |
881 | * called in any context. */ | 881 | * called in any context. */ |
882 | void audit_log(struct audit_context *ctx, int gfp_mask, int type, | 882 | void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, |
883 | const char *fmt, ...) | 883 | const char *fmt, ...) |
884 | { | 884 | { |
885 | struct audit_buffer *ab; | 885 | struct audit_buffer *ab; |
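
The audit.c hunks are a pure type cleanup: allocation-flag parameters move from int / unsigned int __nocast to gfp_t, which the kernel defines as a sparse __bitwise type so the checker can flag code mixing allocation flags with ordinary integers. Illustration only -- alloc_flags_t, AF_*, and buf_alloc() below are invented, and the wrapper struct is a userspace analogue; in the kernel the checking comes from sparse, not the C compiler:

    #include <stdlib.h>

    typedef struct { unsigned bits; } alloc_flags_t;      /* analogue of gfp_t */
    #define AF_ATOMIC  ((alloc_flags_t){ 1u << 0 })
    #define AF_NOWARN  ((alloc_flags_t){ 1u << 1 })

    static void *buf_alloc(size_t n, alloc_flags_t flags)
    {
        (void)flags;                   /* a real allocator would honour these */
        return malloc(n);
    }

    int main(void)
    {
        void *p = buf_alloc(64, AF_ATOMIC);                /* compiles */
        /* void *q = buf_alloc(64, 1); */                  /* rejected: int is not alloc_flags_t */
        free(p);
        return 0;
    }
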
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 88696f639aab..d8a68509e729 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -803,7 +803,7 @@ static void audit_log_task_info(struct audit_buffer *ab) | |||
803 | up_read(&mm->mmap_sem); | 803 | up_read(&mm->mmap_sem); |
804 | } | 804 | } |
805 | 805 | ||
806 | static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask) | 806 | static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) |
807 | { | 807 | { |
808 | int i; | 808 | int i; |
809 | struct audit_buffer *ab; | 809 | struct audit_buffer *ab; |
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 53d8263ae12e..e882c6babf41 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -16,28 +16,76 @@ | |||
16 | #include <asm/semaphore.h> | 16 | #include <asm/semaphore.h> |
17 | 17 | ||
18 | /* This protects CPUs going up and down... */ | 18 | /* This protects CPUs going up and down... */ |
19 | DECLARE_MUTEX(cpucontrol); | 19 | static DECLARE_MUTEX(cpucontrol); |
20 | 20 | ||
21 | static struct notifier_block *cpu_chain; | 21 | static struct notifier_block *cpu_chain; |
22 | 22 | ||
23 | #ifdef CONFIG_HOTPLUG_CPU | ||
24 | static struct task_struct *lock_cpu_hotplug_owner; | ||
25 | static int lock_cpu_hotplug_depth; | ||
26 | |||
27 | static int __lock_cpu_hotplug(int interruptible) | ||
28 | { | ||
29 | int ret = 0; | ||
30 | |||
31 | if (lock_cpu_hotplug_owner != current) { | ||
32 | if (interruptible) | ||
33 | ret = down_interruptible(&cpucontrol); | ||
34 | else | ||
35 | down(&cpucontrol); | ||
36 | } | ||
37 | |||
38 | /* | ||
39 | * Set only if we succeed in locking | ||
40 | */ | ||
41 | if (!ret) { | ||
42 | lock_cpu_hotplug_depth++; | ||
43 | lock_cpu_hotplug_owner = current; | ||
44 | } | ||
45 | |||
46 | return ret; | ||
47 | } | ||
48 | |||
49 | void lock_cpu_hotplug(void) | ||
50 | { | ||
51 | __lock_cpu_hotplug(0); | ||
52 | } | ||
53 | EXPORT_SYMBOL_GPL(lock_cpu_hotplug); | ||
54 | |||
55 | void unlock_cpu_hotplug(void) | ||
56 | { | ||
57 | if (--lock_cpu_hotplug_depth == 0) { | ||
58 | lock_cpu_hotplug_owner = NULL; | ||
59 | up(&cpucontrol); | ||
60 | } | ||
61 | } | ||
62 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); | ||
63 | |||
64 | int lock_cpu_hotplug_interruptible(void) | ||
65 | { | ||
66 | return __lock_cpu_hotplug(1); | ||
67 | } | ||
68 | EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); | ||
69 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
70 | |||
23 | /* Need to know about CPUs going up/down? */ | 71 | /* Need to know about CPUs going up/down? */ |
24 | int register_cpu_notifier(struct notifier_block *nb) | 72 | int register_cpu_notifier(struct notifier_block *nb) |
25 | { | 73 | { |
26 | int ret; | 74 | int ret; |
27 | 75 | ||
28 | if ((ret = down_interruptible(&cpucontrol)) != 0) | 76 | if ((ret = lock_cpu_hotplug_interruptible()) != 0) |
29 | return ret; | 77 | return ret; |
30 | ret = notifier_chain_register(&cpu_chain, nb); | 78 | ret = notifier_chain_register(&cpu_chain, nb); |
31 | up(&cpucontrol); | 79 | unlock_cpu_hotplug(); |
32 | return ret; | 80 | return ret; |
33 | } | 81 | } |
34 | EXPORT_SYMBOL(register_cpu_notifier); | 82 | EXPORT_SYMBOL(register_cpu_notifier); |
35 | 83 | ||
36 | void unregister_cpu_notifier(struct notifier_block *nb) | 84 | void unregister_cpu_notifier(struct notifier_block *nb) |
37 | { | 85 | { |
38 | down(&cpucontrol); | 86 | lock_cpu_hotplug(); |
39 | notifier_chain_unregister(&cpu_chain, nb); | 87 | notifier_chain_unregister(&cpu_chain, nb); |
40 | up(&cpucontrol); | 88 | unlock_cpu_hotplug(); |
41 | } | 89 | } |
42 | EXPORT_SYMBOL(unregister_cpu_notifier); | 90 | EXPORT_SYMBOL(unregister_cpu_notifier); |
43 | 91 | ||
@@ -155,13 +203,14 @@ int __devinit cpu_up(unsigned int cpu) | |||
155 | int ret; | 203 | int ret; |
156 | void *hcpu = (void *)(long)cpu; | 204 | void *hcpu = (void *)(long)cpu; |
157 | 205 | ||
158 | if ((ret = down_interruptible(&cpucontrol)) != 0) | 206 | if ((ret = lock_cpu_hotplug_interruptible()) != 0) |
159 | return ret; | 207 | return ret; |
160 | 208 | ||
161 | if (cpu_online(cpu) || !cpu_present(cpu)) { | 209 | if (cpu_online(cpu) || !cpu_present(cpu)) { |
162 | ret = -EINVAL; | 210 | ret = -EINVAL; |
163 | goto out; | 211 | goto out; |
164 | } | 212 | } |
213 | |||
165 | ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); | 214 | ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); |
166 | if (ret == NOTIFY_BAD) { | 215 | if (ret == NOTIFY_BAD) { |
167 | printk("%s: attempt to bring up CPU %u failed\n", | 216 | printk("%s: attempt to bring up CPU %u failed\n", |
@@ -184,6 +233,6 @@ out_notify: | |||
184 | if (ret != 0) | 233 | if (ret != 0) |
185 | notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); | 234 | notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); |
186 | out: | 235 | out: |
187 | up(&cpucontrol); | 236 | unlock_cpu_hotplug(); |
188 | return ret; | 237 | return ret; |
189 | } | 238 | } |
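
The cpu.c change wraps the cpucontrol semaphore in lock_cpu_hotplug()/unlock_cpu_hotplug(), recording the owning task and a nesting depth so a path that already holds the lock can take it again without deadlocking (kernel semaphores, unlike a PTHREAD_MUTEX_RECURSIVE mutex, have no built-in recursion). A minimal userspace sketch of the same owner/depth bookkeeping; reading owner without the lock is safe for the same reason as in the patch, because only the current holder ever stores its own identity there:

    #include <pthread.h>

    static pthread_mutex_t ctrl = PTHREAD_MUTEX_INITIALIZER;
    static pthread_t owner;
    static int owner_valid;
    static int depth;

    void ctrl_lock(void)
    {
        if (!owner_valid || !pthread_equal(owner, pthread_self())) {
            pthread_mutex_lock(&ctrl);     /* first acquisition by this thread */
            owner = pthread_self();
            owner_valid = 1;
        }
        depth++;                           /* nested acquisition: just count it */
    }

    void ctrl_unlock(void)
    {
        if (--depth == 0) {
            owner_valid = 0;
            pthread_mutex_unlock(&ctrl);   /* outermost unlock drops the mutex */
        }
    }
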
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 79866bc6b3a1..7430640f9816 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/kernel.h> | 32 | #include <linux/kernel.h> |
33 | #include <linux/kmod.h> | 33 | #include <linux/kmod.h> |
34 | #include <linux/list.h> | 34 | #include <linux/list.h> |
35 | #include <linux/mempolicy.h> | ||
35 | #include <linux/mm.h> | 36 | #include <linux/mm.h> |
36 | #include <linux/module.h> | 37 | #include <linux/module.h> |
37 | #include <linux/mount.h> | 38 | #include <linux/mount.h> |
@@ -60,6 +61,9 @@ struct cpuset { | |||
60 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 61 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
61 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | 62 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ |
62 | 63 | ||
64 | /* | ||
65 | * Count is atomic so can incr (fork) or decr (exit) without a lock. | ||
66 | */ | ||
63 | atomic_t count; /* count tasks using this cpuset */ | 67 | atomic_t count; /* count tasks using this cpuset */ |
64 | 68 | ||
65 | /* | 69 | /* |
@@ -142,80 +146,91 @@ static struct vfsmount *cpuset_mount; | |||
142 | static struct super_block *cpuset_sb = NULL; | 146 | static struct super_block *cpuset_sb = NULL; |
143 | 147 | ||
144 | /* | 148 | /* |
145 | * cpuset_sem should be held by anyone who is depending on the children | 149 | * We have two global cpuset semaphores below. They can nest. |
146 | * or sibling lists of any cpuset, or performing non-atomic operations | 150 | * It is ok to first take manage_sem, then nest callback_sem. We also |
147 | * on the flags or *_allowed values of a cpuset, such as raising the | 151 | * require taking task_lock() when dereferencing a tasks cpuset pointer. |
148 | * CS_REMOVED flag bit iff it is not already raised, or reading and | 152 | * See "The task_lock() exception", at the end of this comment. |
149 | * conditionally modifying the *_allowed values. One kernel global | 153 | * |
150 | * cpuset semaphore should be sufficient - these things don't change | 154 | * A task must hold both semaphores to modify cpusets. If a task |
151 | * that much. | 155 | * holds manage_sem, then it blocks others wanting that semaphore, |
152 | * | 156 | * ensuring that it is the only task able to also acquire callback_sem |
153 | * The code that modifies cpusets holds cpuset_sem across the entire | 157 | * and be able to modify cpusets. It can perform various checks on |
154 | * operation, from cpuset_common_file_write() down, single threading | 158 | * the cpuset structure first, knowing nothing will change. It can |
155 | * all cpuset modifications (except for counter manipulations from | 159 | * also allocate memory while just holding manage_sem. While it is |
156 | * fork and exit) across the system. This presumes that cpuset | 160 | * performing these checks, various callback routines can briefly |
157 | * modifications are rare - better kept simple and safe, even if slow. | 161 | * acquire callback_sem to query cpusets. Once it is ready to make |
158 | * | 162 | * the changes, it takes callback_sem, blocking everyone else. |
159 | * The code that reads cpusets, such as in cpuset_common_file_read() | 163 | * |
160 | * and below, only holds cpuset_sem across small pieces of code, such | 164 | * Calls to the kernel memory allocator can not be made while holding |
161 | * as when reading out possibly multi-word cpumasks and nodemasks, as | 165 | * callback_sem, as that would risk double tripping on callback_sem |
162 | * the risks are less, and the desire for performance a little greater. | 166 | * from one of the callbacks into the cpuset code from within |
163 | * The proc_cpuset_show() routine needs to hold cpuset_sem to insure | 167 | * __alloc_pages(). |
164 | * that no cs->dentry is NULL, as it walks up the cpuset tree to root. | 168 | * |
165 | * | 169 | * If a task is only holding callback_sem, then it has read-only |
166 | * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't | 170 | * access to cpusets. |
167 | * (usually) grab cpuset_sem. These are the two most performance | 171 | * |
168 | * critical pieces of code here. The exception occurs on exit(), | 172 | * The task_struct fields mems_allowed and mems_generation may only |
169 | * when a task in a notify_on_release cpuset exits. Then cpuset_sem | 173 | * be accessed in the context of that task, so require no locks. |
174 | * | ||
175 | * Any task can increment and decrement the count field without lock. | ||
176 | * So in general, code holding manage_sem or callback_sem can't rely | ||
177 | * on the count field not changing. However, if the count goes to | ||
178 | * zero, then only attach_task(), which holds both semaphores, can | ||
179 | * increment it again. Because a count of zero means that no tasks | ||
180 | * are currently attached, therefore there is no way a task attached | ||
181 | * to that cpuset can fork (the other way to increment the count). | ||
182 | * So code holding manage_sem or callback_sem can safely assume that | ||
183 | * if the count is zero, it will stay zero. Similarly, if a task | ||
184 | * holds manage_sem or callback_sem on a cpuset with zero count, it | ||
185 | * knows that the cpuset won't be removed, as cpuset_rmdir() needs | ||
186 | * both of those semaphores. | ||
187 | * | ||
188 | * A possible optimization to improve parallelism would be to make | ||
189 | * callback_sem a R/W semaphore (rwsem), allowing the callback routines | ||
190 | * to proceed in parallel, with read access, until the holder of | ||
191 | * manage_sem needed to take this rwsem for exclusive write access | ||
192 | * and modify some cpusets. | ||
193 | * | ||
194 | * The cpuset_common_file_write handler for operations that modify | ||
195 | * the cpuset hierarchy holds manage_sem across the entire operation, | ||
196 | * single threading all such cpuset modifications across the system. | ||
197 | * | ||
198 | * The cpuset_common_file_read() handlers only hold callback_sem across | ||
199 | * small pieces of code, such as when reading out possibly multi-word | ||
200 | * cpumasks and nodemasks. | ||
201 | * | ||
202 | * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't | ||
203 | * (usually) take either semaphore. These are the two most performance | ||
204 | * critical pieces of code here. The exception occurs on cpuset_exit(), | ||
205 | * when a task in a notify_on_release cpuset exits. Then manage_sem | ||
170 | * is taken, and if the cpuset count is zero, a usermode call made | 206 | * is taken, and if the cpuset count is zero, a usermode call made |
171 | * to /sbin/cpuset_release_agent with the name of the cpuset (path | 207 | * to /sbin/cpuset_release_agent with the name of the cpuset (path |
172 | * relative to the root of cpuset file system) as the argument. | 208 | * relative to the root of cpuset file system) as the argument. |
173 | * | 209 | * |
174 | * A cpuset can only be deleted if both its 'count' of using tasks is | 210 | * A cpuset can only be deleted if both its 'count' of using tasks |
175 | * zero, and its list of 'children' cpusets is empty. Since all tasks | 211 | * is zero, and its list of 'children' cpusets is empty. Since all |
176 | * in the system use _some_ cpuset, and since there is always at least | 212 | * tasks in the system use _some_ cpuset, and since there is always at |
177 | * one task in the system (init, pid == 1), therefore, top_cpuset | 213 | * least one task in the system (init, pid == 1), therefore, top_cpuset |
178 | * always has either children cpusets and/or using tasks. So no need | 214 | * always has either children cpusets and/or using tasks. So we don't |
179 | * for any special hack to ensure that top_cpuset cannot be deleted. | 215 | * need a special hack to ensure that top_cpuset cannot be deleted. |
216 | * | ||
217 | * The above "Tale of Two Semaphores" would be complete, but for: | ||
218 | * | ||
219 | * The task_lock() exception | ||
220 | * | ||
221 | * The need for this exception arises from the action of attach_task(), | ||
222 | * which overwrites one tasks cpuset pointer with another. It does | ||
223 | * so using both semaphores, however there are several performance | ||
224 | * critical places that need to reference task->cpuset without the | ||
225 | * expense of grabbing a system global semaphore. Therefore except as | ||
226 | * noted below, when dereferencing or, as in attach_task(), modifying | ||
227 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock | ||
228 | * (task->alloc_lock) already in the task_struct routinely used for | ||
229 | * such matters. | ||
180 | */ | 230 | */ |
181 | 231 | ||
182 | static DECLARE_MUTEX(cpuset_sem); | 232 | static DECLARE_MUTEX(manage_sem); |
183 | static struct task_struct *cpuset_sem_owner; | 233 | static DECLARE_MUTEX(callback_sem); |
184 | static int cpuset_sem_depth; | ||
185 | |||
186 | /* | ||
187 | * The global cpuset semaphore cpuset_sem can be needed by the | ||
188 | * memory allocator to update a tasks mems_allowed (see the calls | ||
189 | * to cpuset_update_current_mems_allowed()) or to walk up the | ||
190 | * cpuset hierarchy to find a mem_exclusive cpuset see the calls | ||
191 | * to cpuset_excl_nodes_overlap()). | ||
192 | * | ||
193 | * But if the memory allocation is being done by cpuset.c code, it | ||
194 | * usually already holds cpuset_sem. Double tripping on a kernel | ||
195 | * semaphore deadlocks the current task, and any other task that | ||
196 | * subsequently tries to obtain the lock. | ||
197 | * | ||
198 | * Run all up's and down's on cpuset_sem through the following | ||
199 | * wrappers, which will detect this nested locking, and avoid | ||
200 | * deadlocking. | ||
201 | */ | ||
202 | |||
203 | static inline void cpuset_down(struct semaphore *psem) | ||
204 | { | ||
205 | if (cpuset_sem_owner != current) { | ||
206 | down(psem); | ||
207 | cpuset_sem_owner = current; | ||
208 | } | ||
209 | cpuset_sem_depth++; | ||
210 | } | ||
211 | |||
212 | static inline void cpuset_up(struct semaphore *psem) | ||
213 | { | ||
214 | if (--cpuset_sem_depth == 0) { | ||
215 | cpuset_sem_owner = NULL; | ||
216 | up(psem); | ||
217 | } | ||
218 | } | ||
219 | 234 | ||
220 | /* | 235 | /* |
221 | * A couple of forward declarations required, due to cyclic reference loop: | 236 | * A couple of forward declarations required, due to cyclic reference loop: |
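
The block comment above describes the manage_sem/callback_sem split that replaces the single cpuset_sem: a modifier takes manage_sem for the whole operation and may allocate memory there, then takes callback_sem only around the actual update, while allocator callbacks take only callback_sem and so can never deadlock against an allocation made by the modifier. A compressed userspace sketch of that discipline; manage, callback, cfg, modify_cfg() and read_cfg() are invented names standing in for the two semaphores and the cpuset data:

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    static pthread_mutex_t manage   = PTHREAD_MUTEX_INITIALIZER;  /* outer: one modifier at a time */
    static pthread_mutex_t callback = PTHREAD_MUTEX_INITIALIZER;  /* inner: guards readers of cfg  */
    static char cfg[64] = "initial";

    void modify_cfg(const char *new_value)
    {
        pthread_mutex_lock(&manage);
        char *scratch = strdup(new_value);   /* allocate while only the outer lock is held */
        if (scratch) {
            pthread_mutex_lock(&callback);   /* brief exclusive window for the update itself */
            strncpy(cfg, scratch, sizeof(cfg) - 1);
            pthread_mutex_unlock(&callback);
            free(scratch);
        }
        pthread_mutex_unlock(&manage);
    }

    void read_cfg(char *out, size_t len)
    {
        if (!len)
            return;
        pthread_mutex_lock(&callback);       /* readers never need the outer lock */
        strncpy(out, cfg, len - 1);
        out[len - 1] = '\0';
        pthread_mutex_unlock(&callback);
    }
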
@@ -390,7 +405,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) | |||
390 | } | 405 | } |
391 | 406 | ||
392 | /* | 407 | /* |
393 | * Call with cpuset_sem held. Writes path of cpuset into buf. | 408 | * Call with manage_sem held. Writes path of cpuset into buf. |
394 | * Returns 0 on success, -errno on error. | 409 | * Returns 0 on success, -errno on error. |
395 | */ | 410 | */ |
396 | 411 | ||
@@ -442,10 +457,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) | |||
442 | * status of the /sbin/cpuset_release_agent task, so no sense holding | 457 | * status of the /sbin/cpuset_release_agent task, so no sense holding |
443 | * our caller up for that. | 458 | * our caller up for that. |
444 | * | 459 | * |
445 | * The simple act of forking that task might require more memory, | 460 | * When we had only one cpuset semaphore, we had to call this |
446 | * which might need cpuset_sem. So this routine must be called while | 461 | * without holding it, to avoid deadlock when call_usermodehelper() |
447 | * cpuset_sem is not held, to avoid a possible deadlock. See also | 462 | * allocated memory. With two locks, we could now call this while |
448 | * comments for check_for_release(), below. | 463 | * holding manage_sem, but we still don't, so as to minimize |
464 | * the time manage_sem is held. | ||
449 | */ | 465 | */ |
450 | 466 | ||
451 | static void cpuset_release_agent(const char *pathbuf) | 467 | static void cpuset_release_agent(const char *pathbuf) |
@@ -477,15 +493,15 @@ static void cpuset_release_agent(const char *pathbuf) | |||
477 | * cs is notify_on_release() and now both the user count is zero and | 493 | * cs is notify_on_release() and now both the user count is zero and |
478 | * the list of children is empty, prepare cpuset path in a kmalloc'd | 494 | * the list of children is empty, prepare cpuset path in a kmalloc'd |
479 | * buffer, to be returned via ppathbuf, so that the caller can invoke | 495 | * buffer, to be returned via ppathbuf, so that the caller can invoke |
480 | * cpuset_release_agent() with it later on, once cpuset_sem is dropped. | 496 | * cpuset_release_agent() with it later on, once manage_sem is dropped. |
481 | * Call here with cpuset_sem held. | 497 | * Call here with manage_sem held. |
482 | * | 498 | * |
483 | * This check_for_release() routine is responsible for kmalloc'ing | 499 | * This check_for_release() routine is responsible for kmalloc'ing |
484 | * pathbuf. The above cpuset_release_agent() is responsible for | 500 | * pathbuf. The above cpuset_release_agent() is responsible for |
485 | * kfree'ing pathbuf. The caller of these routines is responsible | 501 | * kfree'ing pathbuf. The caller of these routines is responsible |
486 | * for providing a pathbuf pointer, initialized to NULL, then | 502 | * for providing a pathbuf pointer, initialized to NULL, then |
487 | * calling check_for_release() with cpuset_sem held and the address | 503 | * calling check_for_release() with manage_sem held and the address |
488 | * of the pathbuf pointer, then dropping cpuset_sem, then calling | 504 | * of the pathbuf pointer, then dropping manage_sem, then calling |
489 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). | 505 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). |
490 | */ | 506 | */ |
491 | 507 | ||
@@ -516,7 +532,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) | |||
516 | * One way or another, we guarantee to return some non-empty subset | 532 | * One way or another, we guarantee to return some non-empty subset |
517 | * of cpu_online_map. | 533 | * of cpu_online_map. |
518 | * | 534 | * |
519 | * Call with cpuset_sem held. | 535 | * Call with callback_sem held. |
520 | */ | 536 | */ |
521 | 537 | ||
522 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | 538 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) |
@@ -540,7 +556,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | |||
540 | * One way or another, we guarantee to return some non-empty subset | 556 | * One way or another, we guarantee to return some non-empty subset |
541 | * of node_online_map. | 557 | * of node_online_map. |
542 | * | 558 | * |
543 | * Call with cpuset_sem held. | 559 | * Call with callback_sem held. |
544 | */ | 560 | */ |
545 | 561 | ||
546 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 562 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
@@ -555,22 +571,47 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
555 | } | 571 | } |
556 | 572 | ||
557 | /* | 573 | /* |
558 | * Refresh current tasks mems_allowed and mems_generation from | 574 | * Refresh current tasks mems_allowed and mems_generation from current |
559 | * current tasks cpuset. Call with cpuset_sem held. | 575 | * tasks cpuset. |
576 | * | ||
577 | * Call without callback_sem or task_lock() held. May be called with | ||
578 | * or without manage_sem held. Will acquire task_lock() and might | ||
579 | * acquire callback_sem during call. | ||
580 | * | ||
581 | * The task_lock() is required to dereference current->cpuset safely. | ||
582 | * Without it, we could pick up the pointer value of current->cpuset | ||
583 | * in one instruction, and then attach_task could give us a different | ||
584 | * cpuset, and then the cpuset we had could be removed and freed, | ||
585 | * and then on our next instruction, we could dereference a no longer | ||
586 | * valid cpuset pointer to get its mems_generation field. | ||
560 | * | 587 | * |
561 | * This routine is needed to update the per-task mems_allowed | 588 | * This routine is needed to update the per-task mems_allowed data, |
562 | * data, within the tasks context, when it is trying to allocate | 589 | * within the tasks context, when it is trying to allocate memory |
563 | * memory (in various mm/mempolicy.c routines) and notices | 590 | * (in various mm/mempolicy.c routines) and notices that some other |
564 | * that some other task has been modifying its cpuset. | 591 | * task has been modifying its cpuset. |
565 | */ | 592 | */ |
566 | 593 | ||
567 | static void refresh_mems(void) | 594 | static void refresh_mems(void) |
568 | { | 595 | { |
569 | struct cpuset *cs = current->cpuset; | 596 | int my_cpusets_mem_gen; |
570 | 597 | ||
571 | if (current->cpuset_mems_generation != cs->mems_generation) { | 598 | task_lock(current); |
599 | my_cpusets_mem_gen = current->cpuset->mems_generation; | ||
600 | task_unlock(current); | ||
601 | |||
602 | if (current->cpuset_mems_generation != my_cpusets_mem_gen) { | ||
603 | struct cpuset *cs; | ||
604 | nodemask_t oldmem = current->mems_allowed; | ||
605 | |||
606 | down(&callback_sem); | ||
607 | task_lock(current); | ||
608 | cs = current->cpuset; | ||
572 | guarantee_online_mems(cs, ¤t->mems_allowed); | 609 | guarantee_online_mems(cs, ¤t->mems_allowed); |
573 | current->cpuset_mems_generation = cs->mems_generation; | 610 | current->cpuset_mems_generation = cs->mems_generation; |
611 | task_unlock(current); | ||
612 | up(&callback_sem); | ||
613 | if (!nodes_equal(oldmem, current->mems_allowed)) | ||
614 | numa_policy_rebind(&oldmem, ¤t->mems_allowed); | ||
574 | } | 615 | } |
575 | } | 616 | } |
576 | 617 | ||
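
refresh_mems() above is driven by a generation counter: writers bump cpuset_mems_generation whenever a cpuset's memory placement changes, and each task compares its cached cpuset_mems_generation against the cpuset's value on the allocation path, taking locks only when the numbers disagree. The idea in a small userspace sketch; policy_generation, shared_policy and read_shared_policy() are stand-ins, not kernel interfaces:

    #include <stdatomic.h>

    static atomic_int policy_generation = 1;   /* bumped whenever the shared policy changes */
    static int shared_policy;                  /* the data the generation number tracks */

    static _Thread_local int my_generation;    /* what this thread last synchronized against */
    static _Thread_local int my_policy;        /* thread-private copy used on hot paths */

    static int read_shared_policy(void)        /* the kernel would take callback_sem here */
    {
        return shared_policy;
    }

    void refresh_policy_if_stale(void)
    {
        int gen = atomic_load(&policy_generation);
        if (gen != my_generation) {            /* cheap comparison on the fast path */
            my_policy = read_shared_policy();  /* slow path: re-copy the shared policy */
            my_generation = gen;
        }
    }
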
@@ -579,7 +620,7 @@ static void refresh_mems(void) | |||
579 | * | 620 | * |
580 | * One cpuset is a subset of another if all its allowed CPUs and | 621 | * One cpuset is a subset of another if all its allowed CPUs and |
581 | * Memory Nodes are a subset of the other, and its exclusive flags | 622 | * Memory Nodes are a subset of the other, and its exclusive flags |
582 | * are only set if the other's are set. | 623 | * are only set if the other's are set. Call holding manage_sem. |
583 | */ | 624 | */ |
584 | 625 | ||
585 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 626 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
@@ -597,7 +638,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
597 | * If we replaced the flag and mask values of the current cpuset | 638 | * If we replaced the flag and mask values of the current cpuset |
598 | * (cur) with those values in the trial cpuset (trial), would | 639 | * (cur) with those values in the trial cpuset (trial), would |
599 | * our various subset and exclusive rules still be valid? Presumes | 640 | * our various subset and exclusive rules still be valid? Presumes |
600 | * cpuset_sem held. | 641 | * manage_sem held. |
601 | * | 642 | * |
602 | * 'cur' is the address of an actual, in-use cpuset. Operations | 643 | * 'cur' is the address of an actual, in-use cpuset. Operations |
603 | * such as list traversal that depend on the actual address of the | 644 | * such as list traversal that depend on the actual address of the |
@@ -651,7 +692,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
651 | * exclusive child cpusets | 692 | * exclusive child cpusets |
652 | * Build these two partitions by calling partition_sched_domains | 693 | * Build these two partitions by calling partition_sched_domains |
653 | * | 694 | * |
654 | * Call with cpuset_sem held. May nest a call to the | 695 | * Call with manage_sem held. May nest a call to the |
655 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 696 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. |
656 | */ | 697 | */ |
657 | 698 | ||
@@ -696,6 +737,10 @@ static void update_cpu_domains(struct cpuset *cur) | |||
696 | unlock_cpu_hotplug(); | 737 | unlock_cpu_hotplug(); |
697 | } | 738 | } |
698 | 739 | ||
740 | /* | ||
741 | * Call with manage_sem held. May take callback_sem during call. | ||
742 | */ | ||
743 | |||
699 | static int update_cpumask(struct cpuset *cs, char *buf) | 744 | static int update_cpumask(struct cpuset *cs, char *buf) |
700 | { | 745 | { |
701 | struct cpuset trialcs; | 746 | struct cpuset trialcs; |
@@ -712,12 +757,18 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
712 | if (retval < 0) | 757 | if (retval < 0) |
713 | return retval; | 758 | return retval; |
714 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | 759 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); |
760 | down(&callback_sem); | ||
715 | cs->cpus_allowed = trialcs.cpus_allowed; | 761 | cs->cpus_allowed = trialcs.cpus_allowed; |
762 | up(&callback_sem); | ||
716 | if (is_cpu_exclusive(cs) && !cpus_unchanged) | 763 | if (is_cpu_exclusive(cs) && !cpus_unchanged) |
717 | update_cpu_domains(cs); | 764 | update_cpu_domains(cs); |
718 | return 0; | 765 | return 0; |
719 | } | 766 | } |
720 | 767 | ||
768 | /* | ||
769 | * Call with manage_sem held. May take callback_sem during call. | ||
770 | */ | ||
771 | |||
721 | static int update_nodemask(struct cpuset *cs, char *buf) | 772 | static int update_nodemask(struct cpuset *cs, char *buf) |
722 | { | 773 | { |
723 | struct cpuset trialcs; | 774 | struct cpuset trialcs; |
@@ -732,9 +783,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
732 | return -ENOSPC; | 783 | return -ENOSPC; |
733 | retval = validate_change(cs, &trialcs); | 784 | retval = validate_change(cs, &trialcs); |
734 | if (retval == 0) { | 785 | if (retval == 0) { |
786 | down(&callback_sem); | ||
735 | cs->mems_allowed = trialcs.mems_allowed; | 787 | cs->mems_allowed = trialcs.mems_allowed; |
736 | atomic_inc(&cpuset_mems_generation); | 788 | atomic_inc(&cpuset_mems_generation); |
737 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 789 | cs->mems_generation = atomic_read(&cpuset_mems_generation); |
790 | up(&callback_sem); | ||
738 | } | 791 | } |
739 | return retval; | 792 | return retval; |
740 | } | 793 | } |
@@ -745,6 +798,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
745 | * CS_NOTIFY_ON_RELEASE) | 798 | * CS_NOTIFY_ON_RELEASE) |
746 | * cs: the cpuset to update | 799 | * cs: the cpuset to update |
747 | * buf: the buffer where we read the 0 or 1 | 800 | * buf: the buffer where we read the 0 or 1 |
801 | * | ||
802 | * Call with manage_sem held. | ||
748 | */ | 803 | */ |
749 | 804 | ||
750 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | 805 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) |
@@ -766,16 +821,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
766 | return err; | 821 | return err; |
767 | cpu_exclusive_changed = | 822 | cpu_exclusive_changed = |
768 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); | 823 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
824 | down(&callback_sem); | ||
769 | if (turning_on) | 825 | if (turning_on) |
770 | set_bit(bit, &cs->flags); | 826 | set_bit(bit, &cs->flags); |
771 | else | 827 | else |
772 | clear_bit(bit, &cs->flags); | 828 | clear_bit(bit, &cs->flags); |
829 | up(&callback_sem); | ||
773 | 830 | ||
774 | if (cpu_exclusive_changed) | 831 | if (cpu_exclusive_changed) |
775 | update_cpu_domains(cs); | 832 | update_cpu_domains(cs); |
776 | return 0; | 833 | return 0; |
777 | } | 834 | } |
778 | 835 | ||
836 | /* | ||
837 | * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly | ||
838 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be | ||
839 | * notified on release. | ||
840 | * | ||
841 | * Call holding manage_sem. May take callback_sem and task_lock of | ||
842 | * the task 'pid' during call. | ||
843 | */ | ||
844 | |||
779 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | 845 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) |
780 | { | 846 | { |
781 | pid_t pid; | 847 | pid_t pid; |
@@ -792,7 +858,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
792 | read_lock(&tasklist_lock); | 858 | read_lock(&tasklist_lock); |
793 | 859 | ||
794 | tsk = find_task_by_pid(pid); | 860 | tsk = find_task_by_pid(pid); |
795 | if (!tsk) { | 861 | if (!tsk || tsk->flags & PF_EXITING) { |
796 | read_unlock(&tasklist_lock); | 862 | read_unlock(&tasklist_lock); |
797 | return -ESRCH; | 863 | return -ESRCH; |
798 | } | 864 | } |
@@ -810,10 +876,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
810 | get_task_struct(tsk); | 876 | get_task_struct(tsk); |
811 | } | 877 | } |
812 | 878 | ||
879 | down(&callback_sem); | ||
880 | |||
813 | task_lock(tsk); | 881 | task_lock(tsk); |
814 | oldcs = tsk->cpuset; | 882 | oldcs = tsk->cpuset; |
815 | if (!oldcs) { | 883 | if (!oldcs) { |
816 | task_unlock(tsk); | 884 | task_unlock(tsk); |
885 | up(&callback_sem); | ||
817 | put_task_struct(tsk); | 886 | put_task_struct(tsk); |
818 | return -ESRCH; | 887 | return -ESRCH; |
819 | } | 888 | } |
@@ -824,6 +893,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
824 | guarantee_online_cpus(cs, &cpus); | 893 | guarantee_online_cpus(cs, &cpus); |
825 | set_cpus_allowed(tsk, cpus); | 894 | set_cpus_allowed(tsk, cpus); |
826 | 895 | ||
896 | up(&callback_sem); | ||
827 | put_task_struct(tsk); | 897 | put_task_struct(tsk); |
828 | if (atomic_dec_and_test(&oldcs->count)) | 898 | if (atomic_dec_and_test(&oldcs->count)) |
829 | check_for_release(oldcs, ppathbuf); | 899 | check_for_release(oldcs, ppathbuf); |
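
attach_task() re-points tsk->cpuset while holding callback_sem plus task_lock(), takes the new cpuset's count before the swap, and drops the old one afterwards, deferring check_for_release() until the locks are gone. A stripped-down userspace sketch of that handoff; struct group, struct task and move_task() are invented for the illustration:

    #include <pthread.h>
    #include <stdatomic.h>

    struct group {
        atomic_int count;                  /* how many tasks are attached */
    };

    struct task {
        pthread_mutex_t lock;              /* analogue of task_lock() */
        struct group *grp;                 /* analogue of tsk->cpuset */
    };

    /* Returns the old group if we were its last user (so the caller can release
     * it after dropping its locks), otherwise NULL -- the same deferral that
     * attach_task() applies to check_for_release(). */
    struct group *move_task(struct task *t, struct group *newgrp)
    {
        atomic_fetch_add(&newgrp->count, 1);   /* new group gains a user first */

        pthread_mutex_lock(&t->lock);
        struct group *old = t->grp;
        t->grp = newgrp;                       /* readers under t->lock see old or new, never junk */
        pthread_mutex_unlock(&t->lock);

        if (atomic_fetch_sub(&old->count, 1) == 1)
            return old;                        /* count hit zero: last user gone */
        return NULL;
    }
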
@@ -867,7 +937,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
867 | } | 937 | } |
868 | buffer[nbytes] = 0; /* nul-terminate */ | 938 | buffer[nbytes] = 0; /* nul-terminate */ |
869 | 939 | ||
870 | cpuset_down(&cpuset_sem); | 940 | down(&manage_sem); |
871 | 941 | ||
872 | if (is_removed(cs)) { | 942 | if (is_removed(cs)) { |
873 | retval = -ENODEV; | 943 | retval = -ENODEV; |
@@ -901,7 +971,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
901 | if (retval == 0) | 971 | if (retval == 0) |
902 | retval = nbytes; | 972 | retval = nbytes; |
903 | out2: | 973 | out2: |
904 | cpuset_up(&cpuset_sem); | 974 | up(&manage_sem); |
905 | cpuset_release_agent(pathbuf); | 975 | cpuset_release_agent(pathbuf); |
906 | out1: | 976 | out1: |
907 | kfree(buffer); | 977 | kfree(buffer); |
@@ -941,9 +1011,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | |||
941 | { | 1011 | { |
942 | cpumask_t mask; | 1012 | cpumask_t mask; |
943 | 1013 | ||
944 | cpuset_down(&cpuset_sem); | 1014 | down(&callback_sem); |
945 | mask = cs->cpus_allowed; | 1015 | mask = cs->cpus_allowed; |
946 | cpuset_up(&cpuset_sem); | 1016 | up(&callback_sem); |
947 | 1017 | ||
948 | return cpulist_scnprintf(page, PAGE_SIZE, mask); | 1018 | return cpulist_scnprintf(page, PAGE_SIZE, mask); |
949 | } | 1019 | } |
@@ -952,9 +1022,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
952 | { | 1022 | { |
953 | nodemask_t mask; | 1023 | nodemask_t mask; |
954 | 1024 | ||
955 | cpuset_down(&cpuset_sem); | 1025 | down(&callback_sem); |
956 | mask = cs->mems_allowed; | 1026 | mask = cs->mems_allowed; |
957 | cpuset_up(&cpuset_sem); | 1027 | up(&callback_sem); |
958 | 1028 | ||
959 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | 1029 | return nodelist_scnprintf(page, PAGE_SIZE, mask); |
960 | } | 1030 | } |
@@ -968,8 +1038,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
968 | char *page; | 1038 | char *page; |
969 | ssize_t retval = 0; | 1039 | ssize_t retval = 0; |
970 | char *s; | 1040 | char *s; |
971 | char *start; | ||
972 | size_t n; | ||
973 | 1041 | ||
974 | if (!(page = (char *)__get_free_page(GFP_KERNEL))) | 1042 | if (!(page = (char *)__get_free_page(GFP_KERNEL))) |
975 | return -ENOMEM; | 1043 | return -ENOMEM; |
@@ -997,16 +1065,8 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
997 | goto out; | 1065 | goto out; |
998 | } | 1066 | } |
999 | *s++ = '\n'; | 1067 | *s++ = '\n'; |
1000 | *s = '\0'; | ||
1001 | 1068 | ||
1002 | /* Do nothing if *ppos is at the eof or beyond the eof. */ | 1069 | retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); |
1003 | if (s - page <= *ppos) | ||
1004 | return 0; | ||
1005 | |||
1006 | start = page + *ppos; | ||
1007 | n = s - start; | ||
1008 | retval = n - copy_to_user(buf, start, min(n, nbytes)); | ||
1009 | *ppos += retval; | ||
1010 | out: | 1070 | out: |
1011 | free_page((unsigned long)page); | 1071 | free_page((unsigned long)page); |
1012 | return retval; | 1072 | return retval; |
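
The read-side hunk above drops the hand-rolled *ppos/copy_to_user bookkeeping in favour of simple_read_from_buffer(), which clamps the request to what is left after *ppos, copies it, and advances *ppos. Roughly, in userspace terms (the kernel version copies with copy_to_user() and uses errno-style returns; the exact error conventions differ):

    #include <string.h>
    #include <sys/types.h>

    ssize_t read_from_buffer(char *to, size_t count, off_t *ppos,
                             const char *from, size_t available)
    {
        off_t pos = *ppos;

        if (pos < 0)
            return -1;                           /* defensive: reject negative offsets */
        if ((size_t)pos >= available)
            return 0;                            /* at or past EOF: nothing to read */
        if (count > available - (size_t)pos)
            count = available - (size_t)pos;     /* clamp to what is left */

        memcpy(to, from + pos, count);
        *ppos = pos + count;                     /* advance the file position */
        return (ssize_t)count;
    }
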
@@ -1057,6 +1117,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file) | |||
1057 | return 0; | 1117 | return 0; |
1058 | } | 1118 | } |
1059 | 1119 | ||
1120 | /* | ||
1121 | * cpuset_rename - Only allow simple rename of directories in place. | ||
1122 | */ | ||
1123 | static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, | ||
1124 | struct inode *new_dir, struct dentry *new_dentry) | ||
1125 | { | ||
1126 | if (!S_ISDIR(old_dentry->d_inode->i_mode)) | ||
1127 | return -ENOTDIR; | ||
1128 | if (new_dentry->d_inode) | ||
1129 | return -EEXIST; | ||
1130 | if (old_dir != new_dir) | ||
1131 | return -EIO; | ||
1132 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | ||
1133 | } | ||
1134 | |||
1060 | static struct file_operations cpuset_file_operations = { | 1135 | static struct file_operations cpuset_file_operations = { |
1061 | .read = cpuset_file_read, | 1136 | .read = cpuset_file_read, |
1062 | .write = cpuset_file_write, | 1137 | .write = cpuset_file_write, |
@@ -1069,6 +1144,7 @@ static struct inode_operations cpuset_dir_inode_operations = { | |||
1069 | .lookup = simple_lookup, | 1144 | .lookup = simple_lookup, |
1070 | .mkdir = cpuset_mkdir, | 1145 | .mkdir = cpuset_mkdir, |
1071 | .rmdir = cpuset_rmdir, | 1146 | .rmdir = cpuset_rmdir, |
1147 | .rename = cpuset_rename, | ||
1072 | }; | 1148 | }; |
1073 | 1149 | ||
1074 | static int cpuset_create_file(struct dentry *dentry, int mode) | 1150 | static int cpuset_create_file(struct dentry *dentry, int mode) |
@@ -1172,7 +1248,9 @@ struct ctr_struct { | |||
1172 | 1248 | ||
1173 | /* | 1249 | /* |
1174 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. | 1250 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. |
1175 | * Return actual number of pids loaded. | 1251 | * Return actual number of pids loaded. No need to task_lock(p) |
1252 | * when reading out p->cpuset, as we don't really care if it changes | ||
1253 | * on the next cycle, and we are not going to try to dereference it. | ||
1176 | */ | 1254 | */ |
1177 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) | 1255 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) |
1178 | { | 1256 | { |
@@ -1214,6 +1292,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | |||
1214 | return cnt; | 1292 | return cnt; |
1215 | } | 1293 | } |
1216 | 1294 | ||
1295 | /* | ||
1296 | * Handle an open on 'tasks' file. Prepare a buffer listing the | ||
1297 | * process id's of tasks currently attached to the cpuset being opened. | ||
1298 | * | ||
1299 | * Does not require any specific cpuset semaphores, and does not take any. | ||
1300 | */ | ||
1217 | static int cpuset_tasks_open(struct inode *unused, struct file *file) | 1301 | static int cpuset_tasks_open(struct inode *unused, struct file *file) |
1218 | { | 1302 | { |
1219 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1303 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); |
@@ -1361,7 +1445,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1361 | if (!cs) | 1445 | if (!cs) |
1362 | return -ENOMEM; | 1446 | return -ENOMEM; |
1363 | 1447 | ||
1364 | cpuset_down(&cpuset_sem); | 1448 | down(&manage_sem); |
1449 | refresh_mems(); | ||
1365 | cs->flags = 0; | 1450 | cs->flags = 0; |
1366 | if (notify_on_release(parent)) | 1451 | if (notify_on_release(parent)) |
1367 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 1452 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
@@ -1375,25 +1460,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1375 | 1460 | ||
1376 | cs->parent = parent; | 1461 | cs->parent = parent; |
1377 | 1462 | ||
1463 | down(&callback_sem); | ||
1378 | list_add(&cs->sibling, &cs->parent->children); | 1464 | list_add(&cs->sibling, &cs->parent->children); |
1465 | up(&callback_sem); | ||
1379 | 1466 | ||
1380 | err = cpuset_create_dir(cs, name, mode); | 1467 | err = cpuset_create_dir(cs, name, mode); |
1381 | if (err < 0) | 1468 | if (err < 0) |
1382 | goto err; | 1469 | goto err; |
1383 | 1470 | ||
1384 | /* | 1471 | /* |
1385 | * Release cpuset_sem before cpuset_populate_dir() because it | 1472 | * Release manage_sem before cpuset_populate_dir() because it |
1386 | * will down() this new directory's i_sem and if we race with | 1473 | * will down() this new directory's i_sem and if we race with |
1387 | * another mkdir, we might deadlock. | 1474 | * another mkdir, we might deadlock. |
1388 | */ | 1475 | */ |
1389 | cpuset_up(&cpuset_sem); | 1476 | up(&manage_sem); |
1390 | 1477 | ||
1391 | err = cpuset_populate_dir(cs->dentry); | 1478 | err = cpuset_populate_dir(cs->dentry); |
1392 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 1479 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
1393 | return 0; | 1480 | return 0; |
1394 | err: | 1481 | err: |
1395 | list_del(&cs->sibling); | 1482 | list_del(&cs->sibling); |
1396 | cpuset_up(&cpuset_sem); | 1483 | up(&manage_sem); |
1397 | kfree(cs); | 1484 | kfree(cs); |
1398 | return err; | 1485 | return err; |
1399 | } | 1486 | } |
@@ -1415,29 +1502,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1415 | 1502 | ||
1416 | /* the vfs holds both inode->i_sem already */ | 1503 | /* the vfs holds both inode->i_sem already */ |
1417 | 1504 | ||
1418 | cpuset_down(&cpuset_sem); | 1505 | down(&manage_sem); |
1506 | refresh_mems(); | ||
1419 | if (atomic_read(&cs->count) > 0) { | 1507 | if (atomic_read(&cs->count) > 0) { |
1420 | cpuset_up(&cpuset_sem); | 1508 | up(&manage_sem); |
1421 | return -EBUSY; | 1509 | return -EBUSY; |
1422 | } | 1510 | } |
1423 | if (!list_empty(&cs->children)) { | 1511 | if (!list_empty(&cs->children)) { |
1424 | cpuset_up(&cpuset_sem); | 1512 | up(&manage_sem); |
1425 | return -EBUSY; | 1513 | return -EBUSY; |
1426 | } | 1514 | } |
1427 | parent = cs->parent; | 1515 | parent = cs->parent; |
1516 | down(&callback_sem); | ||
1428 | set_bit(CS_REMOVED, &cs->flags); | 1517 | set_bit(CS_REMOVED, &cs->flags); |
1429 | if (is_cpu_exclusive(cs)) | 1518 | if (is_cpu_exclusive(cs)) |
1430 | update_cpu_domains(cs); | 1519 | update_cpu_domains(cs); |
1431 | list_del(&cs->sibling); /* delete my sibling from parent->children */ | 1520 | list_del(&cs->sibling); /* delete my sibling from parent->children */ |
1432 | if (list_empty(&parent->children)) | ||
1433 | check_for_release(parent, &pathbuf); | ||
1434 | spin_lock(&cs->dentry->d_lock); | 1521 | spin_lock(&cs->dentry->d_lock); |
1435 | d = dget(cs->dentry); | 1522 | d = dget(cs->dentry); |
1436 | cs->dentry = NULL; | 1523 | cs->dentry = NULL; |
1437 | spin_unlock(&d->d_lock); | 1524 | spin_unlock(&d->d_lock); |
1438 | cpuset_d_remove_dir(d); | 1525 | cpuset_d_remove_dir(d); |
1439 | dput(d); | 1526 | dput(d); |
1440 | cpuset_up(&cpuset_sem); | 1527 | up(&callback_sem); |
1528 | if (list_empty(&parent->children)) | ||
1529 | check_for_release(parent, &pathbuf); | ||
1530 | up(&manage_sem); | ||
1441 | cpuset_release_agent(pathbuf); | 1531 | cpuset_release_agent(pathbuf); |
1442 | return 0; | 1532 | return 0; |
1443 | } | 1533 | } |
@@ -1497,16 +1587,26 @@ void __init cpuset_init_smp(void) | |||
1497 | * cpuset_fork - attach newly forked task to its parents cpuset. | 1587 | * cpuset_fork - attach newly forked task to its parents cpuset. |
1498 | * @tsk: pointer to task_struct of forking parent process. | 1588 | * @tsk: pointer to task_struct of forking parent process. |
1499 | * | 1589 | * |
1500 | * Description: By default, on fork, a task inherits its | 1590 | * Description: A task inherits its parent's cpuset at fork(). |
1501 | * parent's cpuset. The pointer to the shared cpuset is | 1591 | * |
1502 | * automatically copied in fork.c by dup_task_struct(). | 1592 | * A pointer to the shared cpuset was automatically copied in fork.c |
1503 | * This cpuset_fork() routine need only increment the usage | 1593 | * by dup_task_struct(). However, we ignore that copy, since it was |
1504 | * counter in that cpuset. | 1594 | * not made under the protection of task_lock(), so might no longer be |
1595 | * a valid cpuset pointer. attach_task() might have already changed | ||
1596 | * current->cpuset, allowing the previously referenced cpuset to | ||
1597 | * be removed and freed. Instead, we task_lock(current) and copy | ||
1598 | * its present value of current->cpuset for our freshly forked child. | ||
1599 | * | ||
1600 | * At the point that cpuset_fork() is called, 'current' is the parent | ||
1601 | * task, and the passed argument 'child' points to the child task. | ||
1505 | **/ | 1602 | **/ |
1506 | 1603 | ||
1507 | void cpuset_fork(struct task_struct *tsk) | 1604 | void cpuset_fork(struct task_struct *child) |
1508 | { | 1605 | { |
1509 | atomic_inc(&tsk->cpuset->count); | 1606 | task_lock(current); |
1607 | child->cpuset = current->cpuset; | ||
1608 | atomic_inc(&child->cpuset->count); | ||
1609 | task_unlock(current); | ||
1510 | } | 1610 | } |
1511 | 1611 | ||
1512 | /** | 1612 | /** |
@@ -1515,35 +1615,42 @@ void cpuset_fork(struct task_struct *tsk) | |||
1515 | * | 1615 | * |
1516 | * Description: Detach cpuset from @tsk and release it. | 1616 | * Description: Detach cpuset from @tsk and release it. |
1517 | * | 1617 | * |
1518 | * Note that cpusets marked notify_on_release force every task | 1618 | * Note that cpusets marked notify_on_release force every task in |
1519 | * in them to take the global cpuset_sem semaphore when exiting. | 1619 | * them to take the global manage_sem semaphore when exiting. |
1520 | * This could impact scaling on very large systems. Be reluctant | 1620 | * This could impact scaling on very large systems. Be reluctant to |
1521 | * to use notify_on_release cpusets where very high task exit | 1621 | * use notify_on_release cpusets where very high task exit scaling |
1522 | * scaling is required on large systems. | 1622 | * is required on large systems. |
1523 | * | 1623 | * |
1524 | * Don't even think about dereferencing 'cs' after the cpuset use | 1624 | * Don't even think about dereferencing 'cs' after the cpuset use count |
1525 | * count goes to zero, except inside a critical section guarded | 1625 | * goes to zero, except inside a critical section guarded by manage_sem |
1526 | * by the cpuset_sem semaphore. If you don't hold cpuset_sem, | 1626 | * or callback_sem. Otherwise a zero cpuset use count is a license to |
1527 | * then a zero cpuset use count is a license to any other task to | 1627 | * any other task to nuke the cpuset immediately, via cpuset_rmdir(). |
1528 | * nuke the cpuset immediately. | 1628 | * |
1629 | * This routine has to take manage_sem, not callback_sem, because | ||
1630 | * it is holding that semaphore while calling check_for_release(), | ||
1631 | * which calls kmalloc(), so can't be called holding callback_sem. | ||
1632 | * | ||
1633 | * We don't need to task_lock() this reference to tsk->cpuset, | ||
1634 | * because tsk is already marked PF_EXITING, so attach_task() won't | ||
1635 | * mess with it. | ||
1529 | **/ | 1636 | **/ |
1530 | 1637 | ||
1531 | void cpuset_exit(struct task_struct *tsk) | 1638 | void cpuset_exit(struct task_struct *tsk) |
1532 | { | 1639 | { |
1533 | struct cpuset *cs; | 1640 | struct cpuset *cs; |
1534 | 1641 | ||
1535 | task_lock(tsk); | 1642 | BUG_ON(!(tsk->flags & PF_EXITING)); |
1643 | |||
1536 | cs = tsk->cpuset; | 1644 | cs = tsk->cpuset; |
1537 | tsk->cpuset = NULL; | 1645 | tsk->cpuset = NULL; |
1538 | task_unlock(tsk); | ||
1539 | 1646 | ||
1540 | if (notify_on_release(cs)) { | 1647 | if (notify_on_release(cs)) { |
1541 | char *pathbuf = NULL; | 1648 | char *pathbuf = NULL; |
1542 | 1649 | ||
1543 | cpuset_down(&cpuset_sem); | 1650 | down(&manage_sem); |
1544 | if (atomic_dec_and_test(&cs->count)) | 1651 | if (atomic_dec_and_test(&cs->count)) |
1545 | check_for_release(cs, &pathbuf); | 1652 | check_for_release(cs, &pathbuf); |
1546 | cpuset_up(&cpuset_sem); | 1653 | up(&manage_sem); |
1547 | cpuset_release_agent(pathbuf); | 1654 | cpuset_release_agent(pathbuf); |
1548 | } else { | 1655 | } else { |
1549 | atomic_dec(&cs->count); | 1656 | atomic_dec(&cs->count); |
@@ -1564,11 +1671,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | |||
1564 | { | 1671 | { |
1565 | cpumask_t mask; | 1672 | cpumask_t mask; |
1566 | 1673 | ||
1567 | cpuset_down(&cpuset_sem); | 1674 | down(&callback_sem); |
1568 | task_lock((struct task_struct *)tsk); | 1675 | task_lock((struct task_struct *)tsk); |
1569 | guarantee_online_cpus(tsk->cpuset, &mask); | 1676 | guarantee_online_cpus(tsk->cpuset, &mask); |
1570 | task_unlock((struct task_struct *)tsk); | 1677 | task_unlock((struct task_struct *)tsk); |
1571 | cpuset_up(&cpuset_sem); | 1678 | up(&callback_sem); |
1572 | 1679 | ||
1573 | return mask; | 1680 | return mask; |
1574 | } | 1681 | } |
@@ -1584,19 +1691,28 @@ void cpuset_init_current_mems_allowed(void) | |||
1584 | * If the current tasks cpusets mems_allowed changed behind our backs, | 1691 | * If the current tasks cpusets mems_allowed changed behind our backs, |
1585 | * update current->mems_allowed and mems_generation to the new value. | 1692 | * update current->mems_allowed and mems_generation to the new value. |
1586 | * Do not call this routine if in_interrupt(). | 1693 | * Do not call this routine if in_interrupt(). |
1694 | * | ||
1695 | * Call without callback_sem or task_lock() held. May be called | ||
1696 | * with or without manage_sem held. Unless exiting, it will acquire | ||
1697 | * task_lock(). Also might acquire callback_sem during call to | ||
1698 | * refresh_mems(). | ||
1587 | */ | 1699 | */ |
1588 | 1700 | ||
1589 | void cpuset_update_current_mems_allowed(void) | 1701 | void cpuset_update_current_mems_allowed(void) |
1590 | { | 1702 | { |
1591 | struct cpuset *cs = current->cpuset; | 1703 | struct cpuset *cs; |
1704 | int need_to_refresh = 0; | ||
1592 | 1705 | ||
1706 | task_lock(current); | ||
1707 | cs = current->cpuset; | ||
1593 | if (!cs) | 1708 | if (!cs) |
1594 | return; /* task is exiting */ | 1709 | goto done; |
1595 | if (current->cpuset_mems_generation != cs->mems_generation) { | 1710 | if (current->cpuset_mems_generation != cs->mems_generation) |
1596 | cpuset_down(&cpuset_sem); | 1711 | need_to_refresh = 1; |
1712 | done: | ||
1713 | task_unlock(current); | ||
1714 | if (need_to_refresh) | ||
1597 | refresh_mems(); | 1715 | refresh_mems(); |
1598 | cpuset_up(&cpuset_sem); | ||
1599 | } | ||
1600 | } | 1716 | } |
1601 | 1717 | ||
1602 | /** | 1718 | /** |
@@ -1630,7 +1746,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | |||
1630 | 1746 | ||
1631 | /* | 1747 | /* |
1632 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive | 1748 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive |
1633 | * ancestor to the specified cpuset. Call while holding cpuset_sem. | 1749 | * ancestor to the specified cpuset. Call holding callback_sem. |
1634 | * If no ancestor is mem_exclusive (an unusual configuration), then | 1750 | * If no ancestor is mem_exclusive (an unusual configuration), then |
1635 | * returns the root cpuset. | 1751 | * returns the root cpuset. |
1636 | */ | 1752 | */ |
@@ -1657,12 +1773,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
1657 | * GFP_KERNEL allocations are not so marked, so can escape to the | 1773 | * GFP_KERNEL allocations are not so marked, so can escape to the |
1658 | * nearest mem_exclusive ancestor cpuset. | 1774 | * nearest mem_exclusive ancestor cpuset. |
1659 | * | 1775 | * |
1660 | * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() | 1776 | * Scanning up parent cpusets requires callback_sem. The __alloc_pages() |
1661 | * routine only calls here with __GFP_HARDWALL bit _not_ set if | 1777 | * routine only calls here with __GFP_HARDWALL bit _not_ set if |
1662 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks | 1778 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks |
1663 | * mems_allowed came up empty on the first pass over the zonelist. | 1779 | * mems_allowed came up empty on the first pass over the zonelist. |
1664 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are | 1780 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are |
1665 | * short of memory, might require taking the cpuset_sem semaphore. | 1781 | * short of memory, might require taking the callback_sem semaphore. |
1666 | * | 1782 | * |
1667 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() | 1783 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() |
1668 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing | 1784 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing |
@@ -1679,7 +1795,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
1679 | * GFP_USER - only nodes in current tasks mems allowed ok. | 1795 | * GFP_USER - only nodes in current tasks mems allowed ok. |
1680 | **/ | 1796 | **/ |
1681 | 1797 | ||
1682 | int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) | 1798 | int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) |
1683 | { | 1799 | { |
1684 | int node; /* node that zone z is on */ | 1800 | int node; /* node that zone z is on */ |
1685 | const struct cpuset *cs; /* current cpuset ancestors */ | 1801 | const struct cpuset *cs; /* current cpuset ancestors */ |
@@ -1693,15 +1809,18 @@ int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) | |||
1693 | if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ | 1809 | if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ |
1694 | return 0; | 1810 | return 0; |
1695 | 1811 | ||
1812 | if (current->flags & PF_EXITING) /* Let dying task have memory */ | ||
1813 | return 1; | ||
1814 | |||
1696 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | 1815 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
1697 | cpuset_down(&cpuset_sem); | 1816 | down(&callback_sem); |
1698 | cs = current->cpuset; | 1817 | |
1699 | if (!cs) | 1818 | task_lock(current); |
1700 | goto done; /* current task exiting */ | 1819 | cs = nearest_exclusive_ancestor(current->cpuset); |
1701 | cs = nearest_exclusive_ancestor(cs); | 1820 | task_unlock(current); |
1821 | |||
1702 | allowed = node_isset(node, cs->mems_allowed); | 1822 | allowed = node_isset(node, cs->mems_allowed); |
1703 | done: | 1823 | up(&callback_sem); |
1704 | cpuset_up(&cpuset_sem); | ||
1705 | return allowed; | 1824 | return allowed; |
1706 | } | 1825 | } |
1707 | 1826 | ||
@@ -1714,7 +1833,7 @@ done: | |||
1714 | * determine if task @p's memory usage might impact the memory | 1833 | * determine if task @p's memory usage might impact the memory |
1715 | * available to the current task. | 1834 | * available to the current task. |
1716 | * | 1835 | * |
1717 | * Acquires cpuset_sem - not suitable for calling from a fast path. | 1836 | * Acquires callback_sem - not suitable for calling from a fast path. |
1718 | **/ | 1837 | **/ |
1719 | 1838 | ||
1720 | int cpuset_excl_nodes_overlap(const struct task_struct *p) | 1839 | int cpuset_excl_nodes_overlap(const struct task_struct *p) |
@@ -1722,18 +1841,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) | |||
1722 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ | 1841 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ |
1723 | int overlap = 0; /* do cpusets overlap? */ | 1842 | int overlap = 0; /* do cpusets overlap? */ |
1724 | 1843 | ||
1725 | cpuset_down(&cpuset_sem); | 1844 | down(&callback_sem); |
1726 | cs1 = current->cpuset; | 1845 | |
1727 | if (!cs1) | 1846 | task_lock(current); |
1728 | goto done; /* current task exiting */ | 1847 | if (current->flags & PF_EXITING) { |
1729 | cs2 = p->cpuset; | 1848 | task_unlock(current); |
1730 | if (!cs2) | 1849 | goto done; |
1731 | goto done; /* task p is exiting */ | 1850 | } |
1732 | cs1 = nearest_exclusive_ancestor(cs1); | 1851 | cs1 = nearest_exclusive_ancestor(current->cpuset); |
1733 | cs2 = nearest_exclusive_ancestor(cs2); | 1852 | task_unlock(current); |
1853 | |||
1854 | task_lock((struct task_struct *)p); | ||
1855 | if (p->flags & PF_EXITING) { | ||
1856 | task_unlock((struct task_struct *)p); | ||
1857 | goto done; | ||
1858 | } | ||
1859 | cs2 = nearest_exclusive_ancestor(p->cpuset); | ||
1860 | task_unlock((struct task_struct *)p); | ||
1861 | |||
1734 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); | 1862 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); |
1735 | done: | 1863 | done: |
1736 | cpuset_up(&cpuset_sem); | 1864 | up(&callback_sem); |
1737 | 1865 | ||
1738 | return overlap; | 1866 | return overlap; |
1739 | } | 1867 | } |
@@ -1742,6 +1870,10 @@ done: | |||
1742 | * proc_cpuset_show() | 1870 | * proc_cpuset_show() |
1743 | * - Print tasks cpuset path into seq_file. | 1871 | * - Print tasks cpuset path into seq_file. |
1744 | * - Used for /proc/<pid>/cpuset. | 1872 | * - Used for /proc/<pid>/cpuset. |
1873 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | ||
1874 | * doesn't really matter if tsk->cpuset changes after we read it, | ||
1875 | * and we take manage_sem, keeping attach_task() from changing it | ||
1876 | * anyway. | ||
1745 | */ | 1877 | */ |
1746 | 1878 | ||
1747 | static int proc_cpuset_show(struct seq_file *m, void *v) | 1879 | static int proc_cpuset_show(struct seq_file *m, void *v) |
@@ -1756,10 +1888,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1756 | return -ENOMEM; | 1888 | return -ENOMEM; |
1757 | 1889 | ||
1758 | tsk = m->private; | 1890 | tsk = m->private; |
1759 | cpuset_down(&cpuset_sem); | 1891 | down(&manage_sem); |
1760 | task_lock(tsk); | ||
1761 | cs = tsk->cpuset; | 1892 | cs = tsk->cpuset; |
1762 | task_unlock(tsk); | ||
1763 | if (!cs) { | 1893 | if (!cs) { |
1764 | retval = -EINVAL; | 1894 | retval = -EINVAL; |
1765 | goto out; | 1895 | goto out; |
@@ -1771,7 +1901,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1771 | seq_puts(m, buf); | 1901 | seq_puts(m, buf); |
1772 | seq_putc(m, '\n'); | 1902 | seq_putc(m, '\n'); |
1773 | out: | 1903 | out: |
1774 | cpuset_up(&cpuset_sem); | 1904 | up(&manage_sem); |
1775 | kfree(buf); | 1905 | kfree(buf); |
1776 | return retval; | 1906 | return retval; |
1777 | } | 1907 | } |
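The cpuset hunks above split the old single cpuset_sem into two semaphores: manage_sem, held across long operations that may call kmalloc() or the release agent, and callback_sem, a leaf lock taken briefly on hot paths such as cpuset_zone_allowed(). Below is a minimal user-space analogue of that two-level pattern, using pthread mutexes; the names manage_lock, callback_lock and mems_generation are illustrative only, not the kernel's API, and the sketch only shows the fixed nesting order (manage outer, callback inner) that the comments in the patch rely on.

    #include <pthread.h>
    #include <stdio.h>

    /* Writers take both locks in a fixed order (manage, then callback);
     * fast-path readers take only the inner callback lock. */
    static pthread_mutex_t manage_lock   = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t callback_lock = PTHREAD_MUTEX_INITIALIZER;

    static int mems_generation;             /* stands in for cs->mems_generation */

    static void update_cpuset(void)         /* "manage" path: slow, may allocate */
    {
        pthread_mutex_lock(&manage_lock);
        /* ... long work, allocations, notifications ... */
        pthread_mutex_lock(&callback_lock); /* publish the change briefly */
        mems_generation++;
        pthread_mutex_unlock(&callback_lock);
        pthread_mutex_unlock(&manage_lock);
    }

    static int read_generation(void)        /* "callback" path: short, hot */
    {
        int g;
        pthread_mutex_lock(&callback_lock);
        g = mems_generation;
        pthread_mutex_unlock(&callback_lock);
        return g;
    }

    int main(void)
    {
        update_cpuset();
        printf("generation now %d\n", read_generation());
        return 0;
    }

The split keeps kmalloc() and other sleeping work off the lock that the allocator and exit paths themselves need, which is exactly why cpuset_exit() above takes manage_sem while cpuset_zone_allowed() takes only callback_sem.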
diff --git a/kernel/exit.c b/kernel/exit.c index ee6d8b8abef5..ee515683b92d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/cpuset.h> | 28 | #include <linux/cpuset.h> |
29 | #include <linux/syscalls.h> | 29 | #include <linux/syscalls.h> |
30 | #include <linux/signal.h> | 30 | #include <linux/signal.h> |
31 | #include <linux/cn_proc.h> | ||
31 | 32 | ||
32 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
33 | #include <asm/unistd.h> | 34 | #include <asm/unistd.h> |
@@ -547,7 +548,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) | |||
547 | 548 | ||
548 | if (p->pdeath_signal) | 549 | if (p->pdeath_signal) |
549 | /* We already hold the tasklist_lock here. */ | 550 | /* We already hold the tasklist_lock here. */ |
550 | group_send_sig_info(p->pdeath_signal, (void *) 0, p); | 551 | group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); |
551 | 552 | ||
552 | /* Move the child from its dying parent to the new one. */ | 553 | /* Move the child from its dying parent to the new one. */ |
553 | if (unlikely(traced)) { | 554 | if (unlikely(traced)) { |
@@ -591,8 +592,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) | |||
591 | int pgrp = process_group(p); | 592 | int pgrp = process_group(p); |
592 | 593 | ||
593 | if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { | 594 | if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { |
594 | __kill_pg_info(SIGHUP, (void *)1, pgrp); | 595 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); |
595 | __kill_pg_info(SIGCONT, (void *)1, pgrp); | 596 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); |
596 | } | 597 | } |
597 | } | 598 | } |
598 | } | 599 | } |
@@ -727,8 +728,8 @@ static void exit_notify(struct task_struct *tsk) | |||
727 | (t->signal->session == tsk->signal->session) && | 728 | (t->signal->session == tsk->signal->session) && |
728 | will_become_orphaned_pgrp(process_group(tsk), tsk) && | 729 | will_become_orphaned_pgrp(process_group(tsk), tsk) && |
729 | has_stopped_jobs(process_group(tsk))) { | 730 | has_stopped_jobs(process_group(tsk))) { |
730 | __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); | 731 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); |
731 | __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); | 732 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk)); |
732 | } | 733 | } |
733 | 734 | ||
734 | /* Let father know we died | 735 | /* Let father know we died |
@@ -783,10 +784,6 @@ static void exit_notify(struct task_struct *tsk) | |||
783 | /* If the process is dead, release it - nobody will wait for it */ | 784 | /* If the process is dead, release it - nobody will wait for it */ |
784 | if (state == EXIT_DEAD) | 785 | if (state == EXIT_DEAD) |
785 | release_task(tsk); | 786 | release_task(tsk); |
786 | |||
787 | /* PF_DEAD causes final put_task_struct after we schedule. */ | ||
788 | preempt_disable(); | ||
789 | tsk->flags |= PF_DEAD; | ||
790 | } | 787 | } |
791 | 788 | ||
792 | fastcall NORET_TYPE void do_exit(long code) | 789 | fastcall NORET_TYPE void do_exit(long code) |
@@ -839,10 +836,14 @@ fastcall NORET_TYPE void do_exit(long code) | |||
839 | preempt_count()); | 836 | preempt_count()); |
840 | 837 | ||
841 | acct_update_integrals(tsk); | 838 | acct_update_integrals(tsk); |
842 | update_mem_hiwater(tsk); | 839 | if (tsk->mm) { |
840 | update_hiwater_rss(tsk->mm); | ||
841 | update_hiwater_vm(tsk->mm); | ||
842 | } | ||
843 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 843 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
844 | if (group_dead) { | 844 | if (group_dead) { |
845 | del_timer_sync(&tsk->signal->real_timer); | 845 | del_timer_sync(&tsk->signal->real_timer); |
846 | exit_itimers(tsk->signal); | ||
846 | acct_process(code); | 847 | acct_process(code); |
847 | } | 848 | } |
848 | exit_mm(tsk); | 849 | exit_mm(tsk); |
@@ -858,18 +859,23 @@ fastcall NORET_TYPE void do_exit(long code) | |||
858 | if (group_dead && tsk->signal->leader) | 859 | if (group_dead && tsk->signal->leader) |
859 | disassociate_ctty(1); | 860 | disassociate_ctty(1); |
860 | 861 | ||
861 | module_put(tsk->thread_info->exec_domain->module); | 862 | module_put(task_thread_info(tsk)->exec_domain->module); |
862 | if (tsk->binfmt) | 863 | if (tsk->binfmt) |
863 | module_put(tsk->binfmt->module); | 864 | module_put(tsk->binfmt->module); |
864 | 865 | ||
865 | tsk->exit_code = code; | 866 | tsk->exit_code = code; |
867 | proc_exit_connector(tsk); | ||
866 | exit_notify(tsk); | 868 | exit_notify(tsk); |
867 | #ifdef CONFIG_NUMA | 869 | #ifdef CONFIG_NUMA |
868 | mpol_free(tsk->mempolicy); | 870 | mpol_free(tsk->mempolicy); |
869 | tsk->mempolicy = NULL; | 871 | tsk->mempolicy = NULL; |
870 | #endif | 872 | #endif |
871 | 873 | ||
872 | BUG_ON(!(current->flags & PF_DEAD)); | 874 | /* PF_DEAD causes final put_task_struct after we schedule. */ |
875 | preempt_disable(); | ||
876 | BUG_ON(tsk->flags & PF_DEAD); | ||
877 | tsk->flags |= PF_DEAD; | ||
878 | |||
873 | schedule(); | 879 | schedule(); |
874 | BUG(); | 880 | BUG(); |
875 | /* Avoid "noreturn function does return". */ | 881 | /* Avoid "noreturn function does return". */ |
@@ -1203,7 +1209,7 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, | |||
1203 | 1209 | ||
1204 | exit_code = p->exit_code; | 1210 | exit_code = p->exit_code; |
1205 | if (unlikely(!exit_code) || | 1211 | if (unlikely(!exit_code) || |
1206 | unlikely(p->state > TASK_STOPPED)) | 1212 | unlikely(p->state & TASK_TRACED)) |
1207 | goto bail_ref; | 1213 | goto bail_ref; |
1208 | return wait_noreap_copyout(p, pid, uid, | 1214 | return wait_noreap_copyout(p, pid, uid, |
1209 | why, (exit_code << 8) | 0x7f, | 1215 | why, (exit_code << 8) | 0x7f, |
@@ -1379,6 +1385,15 @@ repeat: | |||
1379 | 1385 | ||
1380 | switch (p->state) { | 1386 | switch (p->state) { |
1381 | case TASK_TRACED: | 1387 | case TASK_TRACED: |
1388 | /* | ||
1389 | * When we hit the race with PTRACE_ATTACH, | ||
1390 | * we will not report this child. But the | ||
1391 | * race means it has not yet been moved to | ||
1392 | * our ptrace_children list, so we need to | ||
1393 | * set the flag here to avoid a spurious ECHILD | ||
1394 | * when the race happens with the only child. | ||
1395 | */ | ||
1396 | flag = 1; | ||
1382 | if (!my_ptrace_child(p)) | 1397 | if (!my_ptrace_child(p)) |
1383 | continue; | 1398 | continue; |
1384 | /*FALLTHROUGH*/ | 1399 | /*FALLTHROUGH*/ |
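One detail in the exit.c hunks: wait_task_stopped() now tests `p->state & TASK_TRACED` instead of `p->state > TASK_STOPPED`, i.e. it checks the traced bit directly rather than relying on the numeric ordering of state values. A self-contained illustration of why a bitmask test is more robust than an ordering test once states are bit flags (the constants below are illustrative values, not necessarily the kernel's):

    #include <stdio.h>

    /* Illustrative state bits, one per flag. */
    #define ST_RUNNING  0x00
    #define ST_STOPPED  0x04
    #define ST_TRACED   0x08

    static int is_traced_by_order(long state) { return state > ST_STOPPED; }
    static int is_traced_by_flag(long state)  { return (state & ST_TRACED) != 0; }

    int main(void)
    {
        long state = ST_STOPPED | 0x10;   /* stopped plus an unrelated higher bit */

        /* The ordering test misfires as soon as higher-valued bits exist... */
        printf("by order: %d\n", is_traced_by_order(state));  /* prints 1 (wrong) */
        /* ...while the explicit flag test stays correct. */
        printf("by flag:  %d\n", is_traced_by_flag(state));   /* prints 0 (right) */
        return 0;
    }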
diff --git a/kernel/fork.c b/kernel/fork.c index 533ce27f4b2c..fb8572a42297 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include <linux/profile.h> | 42 | #include <linux/profile.h> |
43 | #include <linux/rmap.h> | 43 | #include <linux/rmap.h> |
44 | #include <linux/acct.h> | 44 | #include <linux/acct.h> |
45 | #include <linux/cn_proc.h> | ||
45 | 46 | ||
46 | #include <asm/pgtable.h> | 47 | #include <asm/pgtable.h> |
47 | #include <asm/pgalloc.h> | 48 | #include <asm/pgalloc.h> |
@@ -170,10 +171,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
170 | return NULL; | 171 | return NULL; |
171 | } | 172 | } |
172 | 173 | ||
173 | *ti = *orig->thread_info; | ||
174 | *tsk = *orig; | 174 | *tsk = *orig; |
175 | tsk->thread_info = ti; | 175 | tsk->thread_info = ti; |
176 | ti->task = tsk; | 176 | setup_thread_stack(tsk, orig); |
177 | 177 | ||
178 | /* One for us, one for whoever does the "release_task()" (usually parent) */ | 178 | /* One for us, one for whoever does the "release_task()" (usually parent) */ |
179 | atomic_set(&tsk->usage,2); | 179 | atomic_set(&tsk->usage,2); |
@@ -182,37 +182,37 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
182 | } | 182 | } |
183 | 183 | ||
184 | #ifdef CONFIG_MMU | 184 | #ifdef CONFIG_MMU |
185 | static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | 185 | static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
186 | { | 186 | { |
187 | struct vm_area_struct * mpnt, *tmp, **pprev; | 187 | struct vm_area_struct *mpnt, *tmp, **pprev; |
188 | struct rb_node **rb_link, *rb_parent; | 188 | struct rb_node **rb_link, *rb_parent; |
189 | int retval; | 189 | int retval; |
190 | unsigned long charge; | 190 | unsigned long charge; |
191 | struct mempolicy *pol; | 191 | struct mempolicy *pol; |
192 | 192 | ||
193 | down_write(&oldmm->mmap_sem); | 193 | down_write(&oldmm->mmap_sem); |
194 | flush_cache_mm(current->mm); | 194 | flush_cache_mm(oldmm); |
195 | down_write(&mm->mmap_sem); | ||
196 | |||
195 | mm->locked_vm = 0; | 197 | mm->locked_vm = 0; |
196 | mm->mmap = NULL; | 198 | mm->mmap = NULL; |
197 | mm->mmap_cache = NULL; | 199 | mm->mmap_cache = NULL; |
198 | mm->free_area_cache = oldmm->mmap_base; | 200 | mm->free_area_cache = oldmm->mmap_base; |
199 | mm->cached_hole_size = ~0UL; | 201 | mm->cached_hole_size = ~0UL; |
200 | mm->map_count = 0; | 202 | mm->map_count = 0; |
201 | set_mm_counter(mm, rss, 0); | ||
202 | set_mm_counter(mm, anon_rss, 0); | ||
203 | cpus_clear(mm->cpu_vm_mask); | 203 | cpus_clear(mm->cpu_vm_mask); |
204 | mm->mm_rb = RB_ROOT; | 204 | mm->mm_rb = RB_ROOT; |
205 | rb_link = &mm->mm_rb.rb_node; | 205 | rb_link = &mm->mm_rb.rb_node; |
206 | rb_parent = NULL; | 206 | rb_parent = NULL; |
207 | pprev = &mm->mmap; | 207 | pprev = &mm->mmap; |
208 | 208 | ||
209 | for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { | 209 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { |
210 | struct file *file; | 210 | struct file *file; |
211 | 211 | ||
212 | if (mpnt->vm_flags & VM_DONTCOPY) { | 212 | if (mpnt->vm_flags & VM_DONTCOPY) { |
213 | long pages = vma_pages(mpnt); | 213 | long pages = vma_pages(mpnt); |
214 | mm->total_vm -= pages; | 214 | mm->total_vm -= pages; |
215 | __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, | 215 | vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, |
216 | -pages); | 216 | -pages); |
217 | continue; | 217 | continue; |
218 | } | 218 | } |
@@ -253,12 +253,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
253 | } | 253 | } |
254 | 254 | ||
255 | /* | 255 | /* |
256 | * Link in the new vma and copy the page table entries: | 256 | * Link in the new vma and copy the page table entries. |
257 | * link in first so that swapoff can see swap entries. | ||
258 | * Note that, exceptionally, here the vma is inserted | ||
259 | * without holding mm->mmap_sem. | ||
260 | */ | 257 | */ |
261 | spin_lock(&mm->page_table_lock); | ||
262 | *pprev = tmp; | 258 | *pprev = tmp; |
263 | pprev = &tmp->vm_next; | 259 | pprev = &tmp->vm_next; |
264 | 260 | ||
@@ -267,8 +263,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
267 | rb_parent = &tmp->vm_rb; | 263 | rb_parent = &tmp->vm_rb; |
268 | 264 | ||
269 | mm->map_count++; | 265 | mm->map_count++; |
270 | retval = copy_page_range(mm, current->mm, tmp); | 266 | retval = copy_page_range(mm, oldmm, mpnt); |
271 | spin_unlock(&mm->page_table_lock); | ||
272 | 267 | ||
273 | if (tmp->vm_ops && tmp->vm_ops->open) | 268 | if (tmp->vm_ops && tmp->vm_ops->open) |
274 | tmp->vm_ops->open(tmp); | 269 | tmp->vm_ops->open(tmp); |
@@ -277,9 +272,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
277 | goto out; | 272 | goto out; |
278 | } | 273 | } |
279 | retval = 0; | 274 | retval = 0; |
280 | |||
281 | out: | 275 | out: |
282 | flush_tlb_mm(current->mm); | 276 | up_write(&mm->mmap_sem); |
277 | flush_tlb_mm(oldmm); | ||
283 | up_write(&oldmm->mmap_sem); | 278 | up_write(&oldmm->mmap_sem); |
284 | return retval; | 279 | return retval; |
285 | fail_nomem_policy: | 280 | fail_nomem_policy: |
@@ -323,10 +318,11 @@ static struct mm_struct * mm_init(struct mm_struct * mm) | |||
323 | INIT_LIST_HEAD(&mm->mmlist); | 318 | INIT_LIST_HEAD(&mm->mmlist); |
324 | mm->core_waiters = 0; | 319 | mm->core_waiters = 0; |
325 | mm->nr_ptes = 0; | 320 | mm->nr_ptes = 0; |
321 | set_mm_counter(mm, file_rss, 0); | ||
322 | set_mm_counter(mm, anon_rss, 0); | ||
326 | spin_lock_init(&mm->page_table_lock); | 323 | spin_lock_init(&mm->page_table_lock); |
327 | rwlock_init(&mm->ioctx_list_lock); | 324 | rwlock_init(&mm->ioctx_list_lock); |
328 | mm->ioctx_list = NULL; | 325 | mm->ioctx_list = NULL; |
329 | mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); | ||
330 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 326 | mm->free_area_cache = TASK_UNMAPPED_BASE; |
331 | mm->cached_hole_size = ~0UL; | 327 | mm->cached_hole_size = ~0UL; |
332 | 328 | ||
@@ -472,13 +468,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) | |||
472 | if (clone_flags & CLONE_VM) { | 468 | if (clone_flags & CLONE_VM) { |
473 | atomic_inc(&oldmm->mm_users); | 469 | atomic_inc(&oldmm->mm_users); |
474 | mm = oldmm; | 470 | mm = oldmm; |
475 | /* | ||
476 | * There are cases where the PTL is held to ensure no | ||
477 | * new threads start up in user mode using an mm, which | ||
478 | * allows optimizing out ipis; the tlb_gather_mmu code | ||
479 | * is an example. | ||
480 | */ | ||
481 | spin_unlock_wait(&oldmm->page_table_lock); | ||
482 | goto good_mm; | 471 | goto good_mm; |
483 | } | 472 | } |
484 | 473 | ||
@@ -499,7 +488,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) | |||
499 | if (retval) | 488 | if (retval) |
500 | goto free_pt; | 489 | goto free_pt; |
501 | 490 | ||
502 | mm->hiwater_rss = get_mm_counter(mm,rss); | 491 | mm->hiwater_rss = get_mm_rss(mm); |
503 | mm->hiwater_vm = mm->total_vm; | 492 | mm->hiwater_vm = mm->total_vm; |
504 | 493 | ||
505 | good_mm: | 494 | good_mm: |
@@ -848,7 +837,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) | |||
848 | { | 837 | { |
849 | unsigned long new_flags = p->flags; | 838 | unsigned long new_flags = p->flags; |
850 | 839 | ||
851 | new_flags &= ~PF_SUPERPRIV; | 840 | new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); |
852 | new_flags |= PF_FORKNOEXEC; | 841 | new_flags |= PF_FORKNOEXEC; |
853 | if (!(clone_flags & CLONE_PTRACE)) | 842 | if (!(clone_flags & CLONE_PTRACE)) |
854 | p->ptrace = 0; | 843 | p->ptrace = 0; |
@@ -928,7 +917,7 @@ static task_t *copy_process(unsigned long clone_flags, | |||
928 | if (nr_threads >= max_threads) | 917 | if (nr_threads >= max_threads) |
929 | goto bad_fork_cleanup_count; | 918 | goto bad_fork_cleanup_count; |
930 | 919 | ||
931 | if (!try_module_get(p->thread_info->exec_domain->module)) | 920 | if (!try_module_get(task_thread_info(p)->exec_domain->module)) |
932 | goto bad_fork_cleanup_count; | 921 | goto bad_fork_cleanup_count; |
933 | 922 | ||
934 | if (p->binfmt && !try_module_get(p->binfmt->module)) | 923 | if (p->binfmt && !try_module_get(p->binfmt->module)) |
@@ -1135,8 +1124,6 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1135 | if (unlikely(p->ptrace & PT_PTRACED)) | 1124 | if (unlikely(p->ptrace & PT_PTRACED)) |
1136 | __ptrace_link(p, current->parent); | 1125 | __ptrace_link(p, current->parent); |
1137 | 1126 | ||
1138 | cpuset_fork(p); | ||
1139 | |||
1140 | attach_pid(p, PIDTYPE_PID, p->pid); | 1127 | attach_pid(p, PIDTYPE_PID, p->pid); |
1141 | attach_pid(p, PIDTYPE_TGID, p->tgid); | 1128 | attach_pid(p, PIDTYPE_TGID, p->tgid); |
1142 | if (thread_group_leader(p)) { | 1129 | if (thread_group_leader(p)) { |
@@ -1152,6 +1139,8 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1152 | nr_threads++; | 1139 | nr_threads++; |
1153 | total_forks++; | 1140 | total_forks++; |
1154 | write_unlock_irq(&tasklist_lock); | 1141 | write_unlock_irq(&tasklist_lock); |
1142 | proc_fork_connector(p); | ||
1143 | cpuset_fork(p); | ||
1155 | retval = 0; | 1144 | retval = 0; |
1156 | 1145 | ||
1157 | fork_out: | 1146 | fork_out: |
@@ -1188,7 +1177,7 @@ bad_fork_cleanup: | |||
1188 | if (p->binfmt) | 1177 | if (p->binfmt) |
1189 | module_put(p->binfmt->module); | 1178 | module_put(p->binfmt->module); |
1190 | bad_fork_cleanup_put_domain: | 1179 | bad_fork_cleanup_put_domain: |
1191 | module_put(p->thread_info->exec_domain->module); | 1180 | module_put(task_thread_info(p)->exec_domain->module); |
1192 | bad_fork_cleanup_count: | 1181 | bad_fork_cleanup_count: |
1193 | put_group_info(p->group_info); | 1182 | put_group_info(p->group_info); |
1194 | atomic_dec(&p->user->processes); | 1183 | atomic_dec(&p->user->processes); |
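In the fork.c hunks, dup_mmap() now write-locks the child's mmap_sem as well as the parent's (old mm first, then new) and no longer links the new vma under page_table_lock. When two related locks must be held together, taking them in one fixed order is what keeps the path deadlock-free. A user-space sketch of that fixed-order rule with two rwlocks; parent_sem, child_sem and dup_address_space() are illustrative names, not kernel functions:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t parent_sem = PTHREAD_RWLOCK_INITIALIZER;
    static pthread_rwlock_t child_sem  = PTHREAD_RWLOCK_INITIALIZER;

    /* Copy "mappings" from parent to child while both address spaces are
     * stable.  The fixed order (parent first, child second) mirrors the
     * oldmm->mmap_sem / mm->mmap_sem ordering in the hunk above. */
    static void dup_address_space(int *parent, int *child, int n)
    {
        pthread_rwlock_wrlock(&parent_sem);
        pthread_rwlock_wrlock(&child_sem);
        for (int i = 0; i < n; i++)
            child[i] = parent[i];
        pthread_rwlock_unlock(&child_sem);
        pthread_rwlock_unlock(&parent_sem);
    }

    int main(void)
    {
        int parent[4] = { 1, 2, 3, 4 }, child[4] = { 0 };
        dup_address_space(parent, child, 4);
        printf("child[3] = %d\n", child[3]);
        return 0;
    }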
diff --git a/kernel/futex.c b/kernel/futex.c index ca05fe6a70b2..5872e3507f35 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -201,23 +201,6 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
201 | * from swap. But that's a lot of code to duplicate here | 201 | * from swap. But that's a lot of code to duplicate here |
202 | * for a rare case, so we simply fetch the page. | 202 | * for a rare case, so we simply fetch the page. |
203 | */ | 203 | */ |
204 | |||
205 | /* | ||
206 | * Do a quick atomic lookup first - this is the fastpath. | ||
207 | */ | ||
208 | spin_lock(¤t->mm->page_table_lock); | ||
209 | page = follow_page(mm, uaddr, 0); | ||
210 | if (likely(page != NULL)) { | ||
211 | key->shared.pgoff = | ||
212 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
213 | spin_unlock(¤t->mm->page_table_lock); | ||
214 | return 0; | ||
215 | } | ||
216 | spin_unlock(¤t->mm->page_table_lock); | ||
217 | |||
218 | /* | ||
219 | * Do it the general way. | ||
220 | */ | ||
221 | err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); | 204 | err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); |
222 | if (err >= 0) { | 205 | if (err >= 0) { |
223 | key->shared.pgoff = | 206 | key->shared.pgoff = |
@@ -367,6 +350,11 @@ retry: | |||
367 | if (bh1 != bh2) | 350 | if (bh1 != bh2) |
368 | spin_unlock(&bh2->lock); | 351 | spin_unlock(&bh2->lock); |
369 | 352 | ||
353 | if (unlikely(op_ret != -EFAULT)) { | ||
354 | ret = op_ret; | ||
355 | goto out; | ||
356 | } | ||
357 | |||
370 | /* futex_atomic_op_inuser needs to both read and write | 358 | /* futex_atomic_op_inuser needs to both read and write |
371 | * *(int __user *)uaddr2, but we can't modify it | 359 | * *(int __user *)uaddr2, but we can't modify it |
372 | * non-atomically. Therefore, if get_user below is not | 360 | * non-atomically. Therefore, if get_user below is not |
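The futex.c change adds an early check on op_ret: any result other than -EFAULT is returned to the caller immediately, and only -EFAULT falls through to the get_user() fault-in and retry. In isolation, that "retry only the recoverable error" shape looks roughly like the sketch below; do_op() and fault_in() are placeholders standing in for futex_atomic_op_inuser() and the get_user() fault-in, not kernel functions:

    #include <errno.h>
    #include <stdio.h>

    static int do_op(int attempt)   /* placeholder: fails with EFAULT once */
    {
        return attempt == 0 ? -EFAULT : 0;
    }

    static int fault_in(void)       /* placeholder for the get_user() fault-in */
    {
        return 0;
    }

    static int wake_op(void)
    {
        int attempt = 0;

        for (;;) {
            int ret = do_op(attempt++);

            if (ret == 0)
                return 0;
            if (ret != -EFAULT)     /* unexpected error: report it, don't loop */
                return ret;
            if (fault_in() != 0)    /* only -EFAULT earns another try */
                return -EFAULT;
        }
    }

    int main(void)
    {
        printf("wake_op() = %d\n", wake_op());
        return 0;
    }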
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3ff7b925c387..51df337b37db 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -117,14 +117,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | |||
117 | /* | 117 | /* |
118 | * No locking required for CPU-local interrupts: | 118 | * No locking required for CPU-local interrupts: |
119 | */ | 119 | */ |
120 | desc->handler->ack(irq); | 120 | if (desc->handler->ack) |
121 | desc->handler->ack(irq); | ||
121 | action_ret = handle_IRQ_event(irq, regs, desc->action); | 122 | action_ret = handle_IRQ_event(irq, regs, desc->action); |
122 | desc->handler->end(irq); | 123 | desc->handler->end(irq); |
123 | return 1; | 124 | return 1; |
124 | } | 125 | } |
125 | 126 | ||
126 | spin_lock(&desc->lock); | 127 | spin_lock(&desc->lock); |
127 | desc->handler->ack(irq); | 128 | if (desc->handler->ack) |
129 | desc->handler->ack(irq); | ||
128 | /* | 130 | /* |
129 | * REPLAY is when Linux resends an IRQ that was dropped earlier | 131 | * REPLAY is when Linux resends an IRQ that was dropped earlier |
130 | * WAITING is used by probe to mark irqs that are being tested | 132 | * WAITING is used by probe to mark irqs that are being tested |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1cfdb08ddf20..81c49a4d679e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -24,6 +24,7 @@ cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; | |||
24 | 24 | ||
25 | /** | 25 | /** |
26 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) | 26 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) |
27 | * @irq: interrupt number to wait for | ||
27 | * | 28 | * |
28 | * This function waits for any pending IRQ handlers for this interrupt | 29 | * This function waits for any pending IRQ handlers for this interrupt |
29 | * to complete before returning. If you use this function while | 30 | * to complete before returning. If you use this function while |
@@ -35,6 +36,9 @@ void synchronize_irq(unsigned int irq) | |||
35 | { | 36 | { |
36 | struct irq_desc *desc = irq_desc + irq; | 37 | struct irq_desc *desc = irq_desc + irq; |
37 | 38 | ||
39 | if (irq >= NR_IRQS) | ||
40 | return; | ||
41 | |||
38 | while (desc->status & IRQ_INPROGRESS) | 42 | while (desc->status & IRQ_INPROGRESS) |
39 | cpu_relax(); | 43 | cpu_relax(); |
40 | } | 44 | } |
@@ -59,6 +63,9 @@ void disable_irq_nosync(unsigned int irq) | |||
59 | irq_desc_t *desc = irq_desc + irq; | 63 | irq_desc_t *desc = irq_desc + irq; |
60 | unsigned long flags; | 64 | unsigned long flags; |
61 | 65 | ||
66 | if (irq >= NR_IRQS) | ||
67 | return; | ||
68 | |||
62 | spin_lock_irqsave(&desc->lock, flags); | 69 | spin_lock_irqsave(&desc->lock, flags); |
63 | if (!desc->depth++) { | 70 | if (!desc->depth++) { |
64 | desc->status |= IRQ_DISABLED; | 71 | desc->status |= IRQ_DISABLED; |
@@ -85,6 +92,9 @@ void disable_irq(unsigned int irq) | |||
85 | { | 92 | { |
86 | irq_desc_t *desc = irq_desc + irq; | 93 | irq_desc_t *desc = irq_desc + irq; |
87 | 94 | ||
95 | if (irq >= NR_IRQS) | ||
96 | return; | ||
97 | |||
88 | disable_irq_nosync(irq); | 98 | disable_irq_nosync(irq); |
89 | if (desc->action) | 99 | if (desc->action) |
90 | synchronize_irq(irq); | 100 | synchronize_irq(irq); |
@@ -107,6 +117,9 @@ void enable_irq(unsigned int irq) | |||
107 | irq_desc_t *desc = irq_desc + irq; | 117 | irq_desc_t *desc = irq_desc + irq; |
108 | unsigned long flags; | 118 | unsigned long flags; |
109 | 119 | ||
120 | if (irq >= NR_IRQS) | ||
121 | return; | ||
122 | |||
110 | spin_lock_irqsave(&desc->lock, flags); | 123 | spin_lock_irqsave(&desc->lock, flags); |
111 | switch (desc->depth) { | 124 | switch (desc->depth) { |
112 | case 0: | 125 | case 0: |
@@ -162,6 +175,9 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
162 | unsigned long flags; | 175 | unsigned long flags; |
163 | int shared = 0; | 176 | int shared = 0; |
164 | 177 | ||
178 | if (irq >= NR_IRQS) | ||
179 | return -EINVAL; | ||
180 | |||
165 | if (desc->handler == &no_irq_type) | 181 | if (desc->handler == &no_irq_type) |
166 | return -ENOSYS; | 182 | return -ENOSYS; |
167 | /* | 183 | /* |
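The irq/manage.c hunks add an `if (irq >= NR_IRQS) return;` (or `return -EINVAL;`) guard to synchronize_irq(), disable_irq_nosync(), disable_irq(), enable_irq() and setup_irq(), so a bogus interrupt number is rejected before `irq_desc + irq` is dereferenced. The same guard-before-index shape in a self-contained form; the table, limit and function names below are invented for the example:

    #include <stdio.h>

    #define NR_SLOTS 16
    static int depth[NR_SLOTS];          /* stands in for per-irq descriptor state */

    static void disable_slot(unsigned int i)
    {
        if (i >= NR_SLOTS)               /* validate before indexing */
            return;
        depth[i]++;
    }

    static int slot_depth(unsigned int i)
    {
        return i < NR_SLOTS ? depth[i] : -1;
    }

    int main(void)
    {
        disable_slot(3);
        disable_slot(1000);              /* silently rejected, no overrun */
        printf("slot 3 depth = %d, slot 1000 depth = %d\n",
               slot_depth(3), slot_depth(1000));
        return 0;
    }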
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 13bcec151b57..39277dd6bf90 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
19 | #include <linux/err.h> | 19 | #include <linux/err.h> |
20 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
21 | #include <linux/sched.h> /* for cond_resched */ | ||
21 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
22 | 23 | ||
23 | #include <asm/sections.h> | 24 | #include <asm/sections.h> |
diff --git a/kernel/kexec.c b/kernel/kexec.c index cdd4dcd8fb63..2c95848fbce8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -90,7 +90,7 @@ int kexec_should_crash(struct task_struct *p) | |||
90 | static int kimage_is_destination_range(struct kimage *image, | 90 | static int kimage_is_destination_range(struct kimage *image, |
91 | unsigned long start, unsigned long end); | 91 | unsigned long start, unsigned long end); |
92 | static struct page *kimage_alloc_page(struct kimage *image, | 92 | static struct page *kimage_alloc_page(struct kimage *image, |
93 | unsigned int gfp_mask, | 93 | gfp_t gfp_mask, |
94 | unsigned long dest); | 94 | unsigned long dest); |
95 | 95 | ||
96 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | 96 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, |
@@ -326,8 +326,7 @@ static int kimage_is_destination_range(struct kimage *image, | |||
326 | return 0; | 326 | return 0; |
327 | } | 327 | } |
328 | 328 | ||
329 | static struct page *kimage_alloc_pages(unsigned int gfp_mask, | 329 | static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) |
330 | unsigned int order) | ||
331 | { | 330 | { |
332 | struct page *pages; | 331 | struct page *pages; |
333 | 332 | ||
@@ -335,7 +334,7 @@ static struct page *kimage_alloc_pages(unsigned int gfp_mask, | |||
335 | if (pages) { | 334 | if (pages) { |
336 | unsigned int count, i; | 335 | unsigned int count, i; |
337 | pages->mapping = NULL; | 336 | pages->mapping = NULL; |
338 | pages->private = order; | 337 | set_page_private(pages, order); |
339 | count = 1 << order; | 338 | count = 1 << order; |
340 | for (i = 0; i < count; i++) | 339 | for (i = 0; i < count; i++) |
341 | SetPageReserved(pages + i); | 340 | SetPageReserved(pages + i); |
@@ -348,7 +347,7 @@ static void kimage_free_pages(struct page *page) | |||
348 | { | 347 | { |
349 | unsigned int order, count, i; | 348 | unsigned int order, count, i; |
350 | 349 | ||
351 | order = page->private; | 350 | order = page_private(page); |
352 | count = 1 << order; | 351 | count = 1 << order; |
353 | for (i = 0; i < count; i++) | 352 | for (i = 0; i < count; i++) |
354 | ClearPageReserved(page + i); | 353 | ClearPageReserved(page + i); |
@@ -654,7 +653,7 @@ static kimage_entry_t *kimage_dst_used(struct kimage *image, | |||
654 | } | 653 | } |
655 | 654 | ||
656 | static struct page *kimage_alloc_page(struct kimage *image, | 655 | static struct page *kimage_alloc_page(struct kimage *image, |
657 | unsigned int gfp_mask, | 656 | gfp_t gfp_mask, |
658 | unsigned long destination) | 657 | unsigned long destination) |
659 | { | 658 | { |
660 | /* | 659 | /* |
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 179baafcdd96..64ab045c3d9d 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
@@ -36,7 +36,7 @@ | |||
36 | * struct kfifo with kfree(). | 36 | * struct kfifo with kfree(). |
37 | */ | 37 | */ |
38 | struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, | 38 | struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, |
39 | unsigned int __nocast gfp_mask, spinlock_t *lock) | 39 | gfp_t gfp_mask, spinlock_t *lock) |
40 | { | 40 | { |
41 | struct kfifo *fifo; | 41 | struct kfifo *fifo; |
42 | 42 | ||
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(kfifo_init); | |||
64 | * | 64 | * |
65 | * The size will be rounded-up to a power of 2. | 65 | * The size will be rounded-up to a power of 2. |
66 | */ | 66 | */ |
67 | struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock) | 67 | struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) |
68 | { | 68 | { |
69 | unsigned char *buffer; | 69 | unsigned char *buffer; |
70 | struct kfifo *ret; | 70 | struct kfifo *ret; |
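The kfifo.c hunks only retype gfp_mask as gfp_t, but the surrounding context notes that kfifo_alloc() rounds the requested size up to a power of two: with a power-of-two capacity, the free-running in/out counters reduce to buffer indices with a single mask instead of a modulo. A user-space sketch of that rounding and masking; this is an illustration of the idea, not the kernel's kfifo implementation:

    #include <stdio.h>

    static unsigned int roundup_pow_of_two(unsigned int x)
    {
        unsigned int p = 1;
        while (p < x)
            p <<= 1;
        return p;
    }

    struct fifo {
        unsigned char buf[64];           /* fixed here; kmalloc'ed in the kernel */
        unsigned int size;               /* always a power of two */
        unsigned int in, out;            /* free-running counters */
    };

    static void fifo_put(struct fifo *f, unsigned char c)
    {
        f->buf[f->in++ & (f->size - 1)] = c;   /* mask instead of modulo */
    }

    static unsigned char fifo_get(struct fifo *f)
    {
        return f->buf[f->out++ & (f->size - 1)];
    }

    int main(void)
    {
        struct fifo f = { .size = roundup_pow_of_two(40), .in = 0, .out = 0 };

        printf("requested 40, allocated %u\n", f.size);   /* prints 64 */
        fifo_put(&f, 'x');
        printf("got '%c'\n", fifo_get(&f));
        return 0;
    }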
diff --git a/kernel/kmod.c b/kernel/kmod.c index 44166e3bb8af..51a892063aaa 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -131,14 +131,14 @@ struct subprocess_info { | |||
131 | static int ____call_usermodehelper(void *data) | 131 | static int ____call_usermodehelper(void *data) |
132 | { | 132 | { |
133 | struct subprocess_info *sub_info = data; | 133 | struct subprocess_info *sub_info = data; |
134 | struct key *old_session; | 134 | struct key *new_session, *old_session; |
135 | int retval; | 135 | int retval; |
136 | 136 | ||
137 | /* Unblock all signals and set the session keyring. */ | 137 | /* Unblock all signals and set the session keyring. */ |
138 | key_get(sub_info->ring); | 138 | new_session = key_get(sub_info->ring); |
139 | flush_signals(current); | 139 | flush_signals(current); |
140 | spin_lock_irq(¤t->sighand->siglock); | 140 | spin_lock_irq(¤t->sighand->siglock); |
141 | old_session = __install_session_keyring(current, sub_info->ring); | 141 | old_session = __install_session_keyring(current, new_session); |
142 | flush_signal_handlers(current, 1); | 142 | flush_signal_handlers(current, 1); |
143 | sigemptyset(¤t->blocked); | 143 | sigemptyset(¤t->blocked); |
144 | recalc_sigpending(); | 144 | recalc_sigpending(); |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f3ea492ab44d..5beda378cc75 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -32,9 +32,9 @@ | |||
32 | * <prasanna@in.ibm.com> added function-return probes. | 32 | * <prasanna@in.ibm.com> added function-return probes. |
33 | */ | 33 | */ |
34 | #include <linux/kprobes.h> | 34 | #include <linux/kprobes.h> |
35 | #include <linux/spinlock.h> | ||
36 | #include <linux/hash.h> | 35 | #include <linux/hash.h> |
37 | #include <linux/init.h> | 36 | #include <linux/init.h> |
37 | #include <linux/slab.h> | ||
38 | #include <linux/module.h> | 38 | #include <linux/module.h> |
39 | #include <linux/moduleloader.h> | 39 | #include <linux/moduleloader.h> |
40 | #include <asm-generic/sections.h> | 40 | #include <asm-generic/sections.h> |
@@ -48,9 +48,9 @@ | |||
48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; |
50 | 50 | ||
51 | unsigned int kprobe_cpu = NR_CPUS; | 51 | static DEFINE_SPINLOCK(kprobe_lock); /* Protects kprobe_table */ |
52 | static DEFINE_SPINLOCK(kprobe_lock); | 52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
53 | static struct kprobe *curr_kprobe; | 53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
54 | 54 | ||
55 | /* | 55 | /* |
56 | * kprobe->ainsn.insn points to the copy of the instruction to be | 56 | * kprobe->ainsn.insn points to the copy of the instruction to be |
@@ -152,50 +152,31 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot) | |||
152 | } | 152 | } |
153 | } | 153 | } |
154 | 154 | ||
155 | /* Locks kprobe: irqs must be disabled */ | 155 | /* We have preemption disabled.. so it is safe to use __ versions */ |
156 | void __kprobes lock_kprobes(void) | 156 | static inline void set_kprobe_instance(struct kprobe *kp) |
157 | { | 157 | { |
158 | unsigned long flags = 0; | 158 | __get_cpu_var(kprobe_instance) = kp; |
159 | |||
160 | /* Avoiding local interrupts to happen right after we take the kprobe_lock | ||
161 | * and before we get a chance to update kprobe_cpu, this to prevent | ||
162 | * deadlock when we have a kprobe on ISR routine and a kprobe on task | ||
163 | * routine | ||
164 | */ | ||
165 | local_irq_save(flags); | ||
166 | |||
167 | spin_lock(&kprobe_lock); | ||
168 | kprobe_cpu = smp_processor_id(); | ||
169 | |||
170 | local_irq_restore(flags); | ||
171 | } | 159 | } |
172 | 160 | ||
173 | void __kprobes unlock_kprobes(void) | 161 | static inline void reset_kprobe_instance(void) |
174 | { | 162 | { |
175 | unsigned long flags = 0; | 163 | __get_cpu_var(kprobe_instance) = NULL; |
176 | |||
177 | /* Avoiding local interrupts to happen right after we update | ||
178 | * kprobe_cpu and before we get a chance to release kprobe_lock, | ||
179 | * this to prevent deadlock when we have a kprobe on ISR routine and | ||
180 | * a kprobe on task routine | ||
181 | */ | ||
182 | local_irq_save(flags); | ||
183 | |||
184 | kprobe_cpu = NR_CPUS; | ||
185 | spin_unlock(&kprobe_lock); | ||
186 | |||
187 | local_irq_restore(flags); | ||
188 | } | 164 | } |
189 | 165 | ||
190 | /* You have to be holding the kprobe_lock */ | 166 | /* |
167 | * This routine is called either: | ||
168 | * - under the kprobe_lock spinlock - during kprobe_[un]register() | ||
169 | * OR | ||
170 | * - with preemption disabled - from arch/xxx/kernel/kprobes.c | ||
171 | */ | ||
191 | struct kprobe __kprobes *get_kprobe(void *addr) | 172 | struct kprobe __kprobes *get_kprobe(void *addr) |
192 | { | 173 | { |
193 | struct hlist_head *head; | 174 | struct hlist_head *head; |
194 | struct hlist_node *node; | 175 | struct hlist_node *node; |
176 | struct kprobe *p; | ||
195 | 177 | ||
196 | head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; | 178 | head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; |
197 | hlist_for_each(node, head) { | 179 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
198 | struct kprobe *p = hlist_entry(node, struct kprobe, hlist); | ||
199 | if (p->addr == addr) | 180 | if (p->addr == addr) |
200 | return p; | 181 | return p; |
201 | } | 182 | } |
@@ -210,13 +191,13 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) | |||
210 | { | 191 | { |
211 | struct kprobe *kp; | 192 | struct kprobe *kp; |
212 | 193 | ||
213 | list_for_each_entry(kp, &p->list, list) { | 194 | list_for_each_entry_rcu(kp, &p->list, list) { |
214 | if (kp->pre_handler) { | 195 | if (kp->pre_handler) { |
215 | curr_kprobe = kp; | 196 | set_kprobe_instance(kp); |
216 | if (kp->pre_handler(kp, regs)) | 197 | if (kp->pre_handler(kp, regs)) |
217 | return 1; | 198 | return 1; |
218 | } | 199 | } |
219 | curr_kprobe = NULL; | 200 | reset_kprobe_instance(); |
220 | } | 201 | } |
221 | return 0; | 202 | return 0; |
222 | } | 203 | } |
@@ -226,11 +207,11 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
226 | { | 207 | { |
227 | struct kprobe *kp; | 208 | struct kprobe *kp; |
228 | 209 | ||
229 | list_for_each_entry(kp, &p->list, list) { | 210 | list_for_each_entry_rcu(kp, &p->list, list) { |
230 | if (kp->post_handler) { | 211 | if (kp->post_handler) { |
231 | curr_kprobe = kp; | 212 | set_kprobe_instance(kp); |
232 | kp->post_handler(kp, regs, flags); | 213 | kp->post_handler(kp, regs, flags); |
233 | curr_kprobe = NULL; | 214 | reset_kprobe_instance(); |
234 | } | 215 | } |
235 | } | 216 | } |
236 | return; | 217 | return; |
@@ -239,12 +220,14 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
239 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | 220 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, |
240 | int trapnr) | 221 | int trapnr) |
241 | { | 222 | { |
223 | struct kprobe *cur = __get_cpu_var(kprobe_instance); | ||
224 | |||
242 | /* | 225 | /* |
243 | * if we faulted "during" the execution of a user specified | 226 | * if we faulted "during" the execution of a user specified |
244 | * probe handler, invoke just that probe's fault handler | 227 | * probe handler, invoke just that probe's fault handler |
245 | */ | 228 | */ |
246 | if (curr_kprobe && curr_kprobe->fault_handler) { | 229 | if (cur && cur->fault_handler) { |
247 | if (curr_kprobe->fault_handler(curr_kprobe, regs, trapnr)) | 230 | if (cur->fault_handler(cur, regs, trapnr)) |
248 | return 1; | 231 | return 1; |
249 | } | 232 | } |
250 | return 0; | 233 | return 0; |
@@ -252,17 +235,18 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | |||
252 | 235 | ||
253 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | 236 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) |
254 | { | 237 | { |
255 | struct kprobe *kp = curr_kprobe; | 238 | struct kprobe *cur = __get_cpu_var(kprobe_instance); |
256 | if (curr_kprobe && kp->break_handler) { | 239 | int ret = 0; |
257 | if (kp->break_handler(kp, regs)) { | 240 | |
258 | curr_kprobe = NULL; | 241 | if (cur && cur->break_handler) { |
259 | return 1; | 242 | if (cur->break_handler(cur, regs)) |
260 | } | 243 | ret = 1; |
261 | } | 244 | } |
262 | curr_kprobe = NULL; | 245 | reset_kprobe_instance(); |
263 | return 0; | 246 | return ret; |
264 | } | 247 | } |
265 | 248 | ||
249 | /* Called with kretprobe_lock held */ | ||
266 | struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) | 250 | struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) |
267 | { | 251 | { |
268 | struct hlist_node *node; | 252 | struct hlist_node *node; |
@@ -272,6 +256,7 @@ struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) | |||
272 | return NULL; | 256 | return NULL; |
273 | } | 257 | } |
274 | 258 | ||
259 | /* Called with kretprobe_lock held */ | ||
275 | static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe | 260 | static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe |
276 | *rp) | 261 | *rp) |
277 | { | 262 | { |
@@ -282,6 +267,7 @@ static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe | |||
282 | return NULL; | 267 | return NULL; |
283 | } | 268 | } |
284 | 269 | ||
270 | /* Called with kretprobe_lock held */ | ||
285 | void __kprobes add_rp_inst(struct kretprobe_instance *ri) | 271 | void __kprobes add_rp_inst(struct kretprobe_instance *ri) |
286 | { | 272 | { |
287 | /* | 273 | /* |
@@ -300,6 +286,7 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri) | |||
300 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); | 286 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); |
301 | } | 287 | } |
302 | 288 | ||
289 | /* Called with kretprobe_lock held */ | ||
303 | void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) | 290 | void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) |
304 | { | 291 | { |
305 | /* remove rp inst off the rprobe_inst_table */ | 292 | /* remove rp inst off the rprobe_inst_table */ |
@@ -333,13 +320,13 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) | |||
333 | struct hlist_node *node, *tmp; | 320 | struct hlist_node *node, *tmp; |
334 | unsigned long flags = 0; | 321 | unsigned long flags = 0; |
335 | 322 | ||
336 | spin_lock_irqsave(&kprobe_lock, flags); | 323 | spin_lock_irqsave(&kretprobe_lock, flags); |
337 | head = kretprobe_inst_table_head(current); | 324 | head = kretprobe_inst_table_head(current); |
338 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { | 325 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { |
339 | if (ri->task == tk) | 326 | if (ri->task == tk) |
340 | recycle_rp_inst(ri); | 327 | recycle_rp_inst(ri); |
341 | } | 328 | } |
342 | spin_unlock_irqrestore(&kprobe_lock, flags); | 329 | spin_unlock_irqrestore(&kretprobe_lock, flags); |
343 | } | 330 | } |
344 | 331 | ||
345 | /* | 332 | /* |
@@ -350,9 +337,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
350 | struct pt_regs *regs) | 337 | struct pt_regs *regs) |
351 | { | 338 | { |
352 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); | 339 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); |
340 | unsigned long flags = 0; | ||
353 | 341 | ||
354 | /*TODO: consider to only swap the RA after the last pre_handler fired */ | 342 | /*TODO: consider to only swap the RA after the last pre_handler fired */ |
343 | spin_lock_irqsave(&kretprobe_lock, flags); | ||
355 | arch_prepare_kretprobe(rp, regs); | 344 | arch_prepare_kretprobe(rp, regs); |
345 | spin_unlock_irqrestore(&kretprobe_lock, flags); | ||
356 | return 0; | 346 | return 0; |
357 | } | 347 | } |
358 | 348 | ||
@@ -383,13 +373,13 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) | |||
383 | struct kprobe *kp; | 373 | struct kprobe *kp; |
384 | 374 | ||
385 | if (p->break_handler) { | 375 | if (p->break_handler) { |
386 | list_for_each_entry(kp, &old_p->list, list) { | 376 | list_for_each_entry_rcu(kp, &old_p->list, list) { |
387 | if (kp->break_handler) | 377 | if (kp->break_handler) |
388 | return -EEXIST; | 378 | return -EEXIST; |
389 | } | 379 | } |
390 | list_add_tail(&p->list, &old_p->list); | 380 | list_add_tail_rcu(&p->list, &old_p->list); |
391 | } else | 381 | } else |
392 | list_add(&p->list, &old_p->list); | 382 | list_add_rcu(&p->list, &old_p->list); |
393 | return 0; | 383 | return 0; |
394 | } | 384 | } |
395 | 385 | ||
@@ -407,18 +397,18 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
407 | ap->break_handler = aggr_break_handler; | 397 | ap->break_handler = aggr_break_handler; |
408 | 398 | ||
409 | INIT_LIST_HEAD(&ap->list); | 399 | INIT_LIST_HEAD(&ap->list); |
410 | list_add(&p->list, &ap->list); | 400 | list_add_rcu(&p->list, &ap->list); |
411 | 401 | ||
412 | INIT_HLIST_NODE(&ap->hlist); | 402 | INIT_HLIST_NODE(&ap->hlist); |
413 | hlist_del(&p->hlist); | 403 | hlist_del_rcu(&p->hlist); |
414 | hlist_add_head(&ap->hlist, | 404 | hlist_add_head_rcu(&ap->hlist, |
415 | &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]); | 405 | &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]); |
416 | } | 406 | } |
417 | 407 | ||
418 | /* | 408 | /* |
419 | * This is the second or subsequent kprobe at the address - handle | 409 | * This is the second or subsequent kprobe at the address - handle |
420 | * the intricacies | 410 | * the intricacies |
421 | * TODO: Move kcalloc outside the spinlock | 411 | * TODO: Move kcalloc outside the spin_lock |
422 | */ | 412 | */ |
423 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | 413 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, |
424 | struct kprobe *p) | 414 | struct kprobe *p) |
@@ -444,7 +434,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
444 | static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) | 434 | static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) |
445 | { | 435 | { |
446 | arch_disarm_kprobe(p); | 436 | arch_disarm_kprobe(p); |
447 | hlist_del(&p->hlist); | 437 | hlist_del_rcu(&p->hlist); |
448 | spin_unlock_irqrestore(&kprobe_lock, flags); | 438 | spin_unlock_irqrestore(&kprobe_lock, flags); |
449 | arch_remove_kprobe(p); | 439 | arch_remove_kprobe(p); |
450 | } | 440 | } |
@@ -452,11 +442,10 @@ static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) | |||
452 | static inline void cleanup_aggr_kprobe(struct kprobe *old_p, | 442 | static inline void cleanup_aggr_kprobe(struct kprobe *old_p, |
453 | struct kprobe *p, unsigned long flags) | 443 | struct kprobe *p, unsigned long flags) |
454 | { | 444 | { |
455 | list_del(&p->list); | 445 | list_del_rcu(&p->list); |
456 | if (list_empty(&old_p->list)) { | 446 | if (list_empty(&old_p->list)) |
457 | cleanup_kprobe(old_p, flags); | 447 | cleanup_kprobe(old_p, flags); |
458 | kfree(old_p); | 448 | else |
459 | } else | ||
460 | spin_unlock_irqrestore(&kprobe_lock, flags); | 449 | spin_unlock_irqrestore(&kprobe_lock, flags); |
461 | } | 450 | } |
462 | 451 | ||
@@ -479,9 +468,9 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
479 | if ((ret = arch_prepare_kprobe(p)) != 0) | 468 | if ((ret = arch_prepare_kprobe(p)) != 0) |
480 | goto rm_kprobe; | 469 | goto rm_kprobe; |
481 | 470 | ||
471 | p->nmissed = 0; | ||
482 | spin_lock_irqsave(&kprobe_lock, flags); | 472 | spin_lock_irqsave(&kprobe_lock, flags); |
483 | old_p = get_kprobe(p->addr); | 473 | old_p = get_kprobe(p->addr); |
484 | p->nmissed = 0; | ||
485 | if (old_p) { | 474 | if (old_p) { |
486 | ret = register_aggr_kprobe(old_p, p); | 475 | ret = register_aggr_kprobe(old_p, p); |
487 | goto out; | 476 | goto out; |
@@ -489,7 +478,7 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
489 | 478 | ||
490 | arch_copy_kprobe(p); | 479 | arch_copy_kprobe(p); |
491 | INIT_HLIST_NODE(&p->hlist); | 480 | INIT_HLIST_NODE(&p->hlist); |
492 | hlist_add_head(&p->hlist, | 481 | hlist_add_head_rcu(&p->hlist, |
493 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 482 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
494 | 483 | ||
495 | arch_arm_kprobe(p); | 484 | arch_arm_kprobe(p); |
@@ -510,10 +499,16 @@ void __kprobes unregister_kprobe(struct kprobe *p) | |||
510 | spin_lock_irqsave(&kprobe_lock, flags); | 499 | spin_lock_irqsave(&kprobe_lock, flags); |
511 | old_p = get_kprobe(p->addr); | 500 | old_p = get_kprobe(p->addr); |
512 | if (old_p) { | 501 | if (old_p) { |
502 | /* cleanup_*_kprobe() does the spin_unlock_irqrestore */ | ||
513 | if (old_p->pre_handler == aggr_pre_handler) | 503 | if (old_p->pre_handler == aggr_pre_handler) |
514 | cleanup_aggr_kprobe(old_p, p, flags); | 504 | cleanup_aggr_kprobe(old_p, p, flags); |
515 | else | 505 | else |
516 | cleanup_kprobe(p, flags); | 506 | cleanup_kprobe(p, flags); |
507 | |||
508 | synchronize_sched(); | ||
509 | if (old_p->pre_handler == aggr_pre_handler && | ||
510 | list_empty(&old_p->list)) | ||
511 | kfree(old_p); | ||
517 | } else | 512 | } else |
518 | spin_unlock_irqrestore(&kprobe_lock, flags); | 513 | spin_unlock_irqrestore(&kprobe_lock, flags); |
519 | } | 514 | } |
@@ -590,13 +585,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp) | |||
590 | 585 | ||
591 | unregister_kprobe(&rp->kp); | 586 | unregister_kprobe(&rp->kp); |
592 | /* No race here */ | 587 | /* No race here */ |
593 | spin_lock_irqsave(&kprobe_lock, flags); | 588 | spin_lock_irqsave(&kretprobe_lock, flags); |
594 | free_rp_inst(rp); | 589 | free_rp_inst(rp); |
595 | while ((ri = get_used_rp_inst(rp)) != NULL) { | 590 | while ((ri = get_used_rp_inst(rp)) != NULL) { |
596 | ri->rp = NULL; | 591 | ri->rp = NULL; |
597 | hlist_del(&ri->uflist); | 592 | hlist_del(&ri->uflist); |
598 | } | 593 | } |
599 | spin_unlock_irqrestore(&kprobe_lock, flags); | 594 | spin_unlock_irqrestore(&kretprobe_lock, flags); |
600 | } | 595 | } |
601 | 596 | ||
602 | static int __init init_kprobes(void) | 597 | static int __init init_kprobes(void) |
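The kprobes rework above drops the single global curr_kprobe and the kprobe_cpu/lock_kprobes() scheme in favour of a per-CPU kprobe_instance pointer, RCU list traversal, and a separate kretprobe_lock, so handlers on different CPUs no longer serialize on one lock. In user space the closest analogue of a per-CPU variable is a thread-local one; the sketch below shows the set/reset-around-the-handler pattern, assuming GCC-style __thread, with purely illustrative names:

    #include <pthread.h>
    #include <stdio.h>

    struct probe { const char *name; };

    /* Analogue of the per-CPU kprobe_instance: each thread records which
     * probe handler it is currently running. */
    static __thread struct probe *probe_instance;

    static void run_handler(struct probe *p)
    {
        probe_instance = p;              /* set_kprobe_instance() analogue */
        /* ... handler body; a fault handler here could consult probe_instance ... */
        printf("thread %lu running %s\n",
               (unsigned long)pthread_self(), p->name);
        probe_instance = NULL;           /* reset_kprobe_instance() analogue */
    }

    static void *worker(void *arg)
    {
        run_handler(arg);
        return NULL;
    }

    int main(void)
    {
        struct probe a = { "probe-a" }, b = { "probe-b" };
        pthread_t t1, t2;

        pthread_create(&t1, NULL, worker, &a);
        pthread_create(&t2, NULL, worker, &b);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return 0;
    }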
diff --git a/kernel/kthread.c b/kernel/kthread.c index f50f174e92da..e75950a1092c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -165,6 +165,12 @@ EXPORT_SYMBOL(kthread_bind); | |||
165 | 165 | ||
166 | int kthread_stop(struct task_struct *k) | 166 | int kthread_stop(struct task_struct *k) |
167 | { | 167 | { |
168 | return kthread_stop_sem(k, NULL); | ||
169 | } | ||
170 | EXPORT_SYMBOL(kthread_stop); | ||
171 | |||
172 | int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | ||
173 | { | ||
168 | int ret; | 174 | int ret; |
169 | 175 | ||
170 | down(&kthread_stop_lock); | 176 | down(&kthread_stop_lock); |
@@ -178,7 +184,10 @@ int kthread_stop(struct task_struct *k) | |||
178 | 184 | ||
179 | /* Now set kthread_should_stop() to true, and wake it up. */ | 185 | /* Now set kthread_should_stop() to true, and wake it up. */ |
180 | kthread_stop_info.k = k; | 186 | kthread_stop_info.k = k; |
181 | wake_up_process(k); | 187 | if (s) |
188 | up(s); | ||
189 | else | ||
190 | wake_up_process(k); | ||
182 | put_task_struct(k); | 191 | put_task_struct(k); |
183 | 192 | ||
184 | /* Once it dies, reset stop ptr, gather result and we're done. */ | 193 | /* Once it dies, reset stop ptr, gather result and we're done. */ |
@@ -189,7 +198,7 @@ int kthread_stop(struct task_struct *k) | |||
189 | 198 | ||
190 | return ret; | 199 | return ret; |
191 | } | 200 | } |
192 | EXPORT_SYMBOL(kthread_stop); | 201 | EXPORT_SYMBOL(kthread_stop_sem); |
193 | 202 | ||
194 | static __init int helper_init(void) | 203 | static __init int helper_init(void) |
195 | { | 204 | { |
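kthread_stop() is now a thin wrapper around the new kthread_stop_sem(), which handles workers that sleep on a semaphore: the stop flag is set first, then the semaphore is up()ed (with a NULL semaphore the old wake_up_process() path is taken). A hedged usage sketch; the worker, its semaphore and stop_worker() are made-up names, only the kthread_* and semaphore calls are the real API:

#include <linux/kthread.h>
#include <asm/semaphore.h>

static DECLARE_MUTEX_LOCKED(work_sem);	/* worker sleeps here until kicked */
static struct task_struct *worker;	/* assumed created with kthread_run() */

static int worker_fn(void *unused)
{
	while (!kthread_should_stop()) {
		if (down_interruptible(&work_sem))
			continue;
		/* ... handle one queued unit of work ... */
	}
	return 0;
}

static void stop_worker(void)
{
	/* Sets the stop flag, then up()s work_sem, so worker_fn() returns
	 * from down_interruptible() and sees kthread_should_stop(). */
	kthread_stop_sem(worker, &work_sem);
}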
diff --git a/kernel/module.c b/kernel/module.c index ff5c500ab625..2ea929d51ad0 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/stop_machine.h> | 37 | #include <linux/stop_machine.h> |
38 | #include <linux/device.h> | 38 | #include <linux/device.h> |
39 | #include <linux/string.h> | 39 | #include <linux/string.h> |
40 | #include <linux/sched.h> | ||
40 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
41 | #include <asm/semaphore.h> | 42 | #include <asm/semaphore.h> |
42 | #include <asm/cacheflush.h> | 43 | #include <asm/cacheflush.h> |
diff --git a/kernel/params.c b/kernel/params.c index fbf173215fd2..47ba69547945 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/module.h> | 23 | #include <linux/module.h> |
24 | #include <linux/device.h> | 24 | #include <linux/device.h> |
25 | #include <linux/err.h> | 25 | #include <linux/err.h> |
26 | #include <linux/slab.h> | ||
26 | 27 | ||
27 | #if 0 | 28 | #if 0 |
28 | #define DEBUGP printk | 29 | #define DEBUGP printk |
@@ -80,8 +81,6 @@ static char *next_arg(char *args, char **param, char **val) | |||
80 | int in_quote = 0, quoted = 0; | 81 | int in_quote = 0, quoted = 0; |
81 | char *next; | 82 | char *next; |
82 | 83 | ||
83 | /* Chew any extra spaces */ | ||
84 | while (*args == ' ') args++; | ||
85 | if (*args == '"') { | 84 | if (*args == '"') { |
86 | args++; | 85 | args++; |
87 | in_quote = 1; | 86 | in_quote = 1; |
@@ -121,6 +120,10 @@ static char *next_arg(char *args, char **param, char **val) | |||
121 | next = args + i + 1; | 120 | next = args + i + 1; |
122 | } else | 121 | } else |
123 | next = args + i; | 122 | next = args + i; |
123 | |||
124 | /* Chew up trailing spaces. */ | ||
125 | while (*next == ' ') | ||
126 | next++; | ||
124 | return next; | 127 | return next; |
125 | } | 128 | } |
126 | 129 | ||
@@ -135,6 +138,10 @@ int parse_args(const char *name, | |||
135 | 138 | ||
136 | DEBUGP("Parsing ARGS: %s\n", args); | 139 | DEBUGP("Parsing ARGS: %s\n", args); |
137 | 140 | ||
141 | /* Chew leading spaces */ | ||
142 | while (*args == ' ') | ||
143 | args++; | ||
144 | |||
138 | while (*args) { | 145 | while (*args) { |
139 | int ret; | 146 | int ret; |
140 | 147 | ||
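With the inner space-chewing gone from next_arg(), the contract is: parse_args() strips leading blanks once up front, and next_arg() leaves the cursor past any blanks that follow the token it just consumed, so the top of the while loop always starts on a token. A stand-alone illustration of that skip-then-tokenise order (plain user-space C, ignoring the quoting the real parser also handles):

#include <stdio.h>

static const char *skip_spaces(const char *s)
{
	while (*s == ' ')
		s++;
	return s;
}

int main(void)
{
	const char *args = "  foo=bar   baz ";

	args = skip_spaces(args);		/* what parse_args() now does once */
	while (*args) {
		const char *start = args;

		while (*args && *args != ' ')
			args++;
		printf("token: '%.*s'\n", (int)(args - start), start);
		args = skip_spaces(args);	/* what next_arg() now does at the end */
	}
	return 0;
}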
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index ad85d3f0dcc4..cae4f5728997 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -36,7 +36,7 @@ timespec_to_sample(clockid_t which_clock, const struct timespec *tp) | |||
36 | union cpu_time_count ret; | 36 | union cpu_time_count ret; |
37 | ret.sched = 0; /* high half always zero when .cpu used */ | 37 | ret.sched = 0; /* high half always zero when .cpu used */ |
38 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | 38 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { |
39 | ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; | 39 | ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; |
40 | } else { | 40 | } else { |
41 | ret.cpu = timespec_to_cputime(tp); | 41 | ret.cpu = timespec_to_cputime(tp); |
42 | } | 42 | } |
@@ -91,7 +91,7 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, | |||
91 | * Update expiry time from increment, and increase overrun count, | 91 | * Update expiry time from increment, and increase overrun count, |
92 | * given the current clock sample. | 92 | * given the current clock sample. |
93 | */ | 93 | */ |
94 | static inline void bump_cpu_timer(struct k_itimer *timer, | 94 | static void bump_cpu_timer(struct k_itimer *timer, |
95 | union cpu_time_count now) | 95 | union cpu_time_count now) |
96 | { | 96 | { |
97 | int i; | 97 | int i; |
@@ -110,7 +110,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer, | |||
110 | for (i = 0; incr < delta - incr; i++) | 110 | for (i = 0; incr < delta - incr; i++) |
111 | incr = incr << 1; | 111 | incr = incr << 1; |
112 | for (; i >= 0; incr >>= 1, i--) { | 112 | for (; i >= 0; incr >>= 1, i--) { |
113 | if (delta <= incr) | 113 | if (delta < incr) |
114 | continue; | 114 | continue; |
115 | timer->it.cpu.expires.sched += incr; | 115 | timer->it.cpu.expires.sched += incr; |
116 | timer->it_overrun += 1 << i; | 116 | timer->it_overrun += 1 << i; |
@@ -128,7 +128,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer, | |||
128 | for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) | 128 | for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) |
129 | incr = cputime_add(incr, incr); | 129 | incr = cputime_add(incr, incr); |
130 | for (; i >= 0; incr = cputime_halve(incr), i--) { | 130 | for (; i >= 0; incr = cputime_halve(incr), i--) { |
131 | if (cputime_le(delta, incr)) | 131 | if (cputime_lt(delta, incr)) |
132 | continue; | 132 | continue; |
133 | timer->it.cpu.expires.cpu = | 133 | timer->it.cpu.expires.cpu = |
134 | cputime_add(timer->it.cpu.expires.cpu, incr); | 134 | cputime_add(timer->it.cpu.expires.cpu, incr); |
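The strict comparisons (delta < incr, cputime_lt()) matter because the loop is a greedy binary decomposition of the elapsed time: with the old non-strict tests the walk stopped one step short whenever the remaining delta exactly equalled the increment, leaving the expiry at, rather than after, the current sample and undercounting the overrun by one. The same walk on plain integers, as a sketch rather than the cputime_t version:

/* Advance *expires past 'now' in whole multiples of 'incr', returning the
 * number of periods skipped - the shape of bump_cpu_timer()'s sched case. */
static unsigned long long bump(unsigned long long *expires,
			       unsigned long long now,
			       unsigned long long incr)
{
	unsigned long long delta, overruns = 0;
	int i;

	if (now < *expires || !incr)
		return 0;
	delta = now - *expires + incr;
	/* find the largest power-of-two multiple of incr that still fits */
	for (i = 0; incr < delta - incr; i++)
		incr <<= 1;
	for (; i >= 0; incr >>= 1, i--) {
		if (delta < incr)	/* strictly less: the fixed test */
			continue;
		*expires += incr;
		overruns += 1ULL << i;
		delta -= incr;
	}
	return overruns;
}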
@@ -380,14 +380,9 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
380 | int posix_cpu_timer_del(struct k_itimer *timer) | 380 | int posix_cpu_timer_del(struct k_itimer *timer) |
381 | { | 381 | { |
382 | struct task_struct *p = timer->it.cpu.task; | 382 | struct task_struct *p = timer->it.cpu.task; |
383 | int ret = 0; | ||
383 | 384 | ||
384 | if (timer->it.cpu.firing) | 385 | if (likely(p != NULL)) { |
385 | return TIMER_RETRY; | ||
386 | |||
387 | if (unlikely(p == NULL)) | ||
388 | return 0; | ||
389 | |||
390 | if (!list_empty(&timer->it.cpu.entry)) { | ||
391 | read_lock(&tasklist_lock); | 386 | read_lock(&tasklist_lock); |
392 | if (unlikely(p->signal == NULL)) { | 387 | if (unlikely(p->signal == NULL)) { |
393 | /* | 388 | /* |
@@ -396,18 +391,20 @@ int posix_cpu_timer_del(struct k_itimer *timer) | |||
396 | */ | 391 | */ |
397 | BUG_ON(!list_empty(&timer->it.cpu.entry)); | 392 | BUG_ON(!list_empty(&timer->it.cpu.entry)); |
398 | } else { | 393 | } else { |
399 | /* | ||
400 | * Take us off the task's timer list. | ||
401 | */ | ||
402 | spin_lock(&p->sighand->siglock); | 394 | spin_lock(&p->sighand->siglock); |
403 | list_del(&timer->it.cpu.entry); | 395 | if (timer->it.cpu.firing) |
396 | ret = TIMER_RETRY; | ||
397 | else | ||
398 | list_del(&timer->it.cpu.entry); | ||
404 | spin_unlock(&p->sighand->siglock); | 399 | spin_unlock(&p->sighand->siglock); |
405 | } | 400 | } |
406 | read_unlock(&tasklist_lock); | 401 | read_unlock(&tasklist_lock); |
402 | |||
403 | if (!ret) | ||
404 | put_task_struct(p); | ||
407 | } | 405 | } |
408 | put_task_struct(p); | ||
409 | 406 | ||
410 | return 0; | 407 | return ret; |
411 | } | 408 | } |
412 | 409 | ||
413 | /* | 410 | /* |
@@ -424,7 +421,6 @@ static void cleanup_timers(struct list_head *head, | |||
424 | cputime_t ptime = cputime_add(utime, stime); | 421 | cputime_t ptime = cputime_add(utime, stime); |
425 | 422 | ||
426 | list_for_each_entry_safe(timer, next, head, entry) { | 423 | list_for_each_entry_safe(timer, next, head, entry) { |
427 | timer->task = NULL; | ||
428 | list_del_init(&timer->entry); | 424 | list_del_init(&timer->entry); |
429 | if (cputime_lt(timer->expires.cpu, ptime)) { | 425 | if (cputime_lt(timer->expires.cpu, ptime)) { |
430 | timer->expires.cpu = cputime_zero; | 426 | timer->expires.cpu = cputime_zero; |
@@ -436,7 +432,6 @@ static void cleanup_timers(struct list_head *head, | |||
436 | 432 | ||
437 | ++head; | 433 | ++head; |
438 | list_for_each_entry_safe(timer, next, head, entry) { | 434 | list_for_each_entry_safe(timer, next, head, entry) { |
439 | timer->task = NULL; | ||
440 | list_del_init(&timer->entry); | 435 | list_del_init(&timer->entry); |
441 | if (cputime_lt(timer->expires.cpu, utime)) { | 436 | if (cputime_lt(timer->expires.cpu, utime)) { |
442 | timer->expires.cpu = cputime_zero; | 437 | timer->expires.cpu = cputime_zero; |
@@ -448,7 +443,6 @@ static void cleanup_timers(struct list_head *head, | |||
448 | 443 | ||
449 | ++head; | 444 | ++head; |
450 | list_for_each_entry_safe(timer, next, head, entry) { | 445 | list_for_each_entry_safe(timer, next, head, entry) { |
451 | timer->task = NULL; | ||
452 | list_del_init(&timer->entry); | 446 | list_del_init(&timer->entry); |
453 | if (timer->expires.sched < sched_time) { | 447 | if (timer->expires.sched < sched_time) { |
454 | timer->expires.sched = 0; | 448 | timer->expires.sched = 0; |
@@ -492,6 +486,9 @@ static void process_timer_rebalance(struct task_struct *p, | |||
492 | struct task_struct *t = p; | 486 | struct task_struct *t = p; |
493 | unsigned int nthreads = atomic_read(&p->signal->live); | 487 | unsigned int nthreads = atomic_read(&p->signal->live); |
494 | 488 | ||
489 | if (!nthreads) | ||
490 | return; | ||
491 | |||
495 | switch (clock_idx) { | 492 | switch (clock_idx) { |
496 | default: | 493 | default: |
497 | BUG(); | 494 | BUG(); |
@@ -500,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p, | |||
500 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), | 497 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), |
501 | nthreads); | 498 | nthreads); |
502 | do { | 499 | do { |
503 | if (!unlikely(t->exit_state)) { | 500 | if (likely(!(t->flags & PF_EXITING))) { |
504 | ticks = cputime_add(prof_ticks(t), left); | 501 | ticks = cputime_add(prof_ticks(t), left); |
505 | if (cputime_eq(t->it_prof_expires, | 502 | if (cputime_eq(t->it_prof_expires, |
506 | cputime_zero) || | 503 | cputime_zero) || |
@@ -515,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p, | |||
515 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), | 512 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), |
516 | nthreads); | 513 | nthreads); |
517 | do { | 514 | do { |
518 | if (!unlikely(t->exit_state)) { | 515 | if (likely(!(t->flags & PF_EXITING))) { |
519 | ticks = cputime_add(virt_ticks(t), left); | 516 | ticks = cputime_add(virt_ticks(t), left); |
520 | if (cputime_eq(t->it_virt_expires, | 517 | if (cputime_eq(t->it_virt_expires, |
521 | cputime_zero) || | 518 | cputime_zero) || |
@@ -530,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p, | |||
530 | nsleft = expires.sched - val.sched; | 527 | nsleft = expires.sched - val.sched; |
531 | do_div(nsleft, nthreads); | 528 | do_div(nsleft, nthreads); |
532 | do { | 529 | do { |
533 | if (!unlikely(t->exit_state)) { | 530 | if (likely(!(t->flags & PF_EXITING))) { |
534 | ns = t->sched_time + nsleft; | 531 | ns = t->sched_time + nsleft; |
535 | if (t->it_sched_expires == 0 || | 532 | if (t->it_sched_expires == 0 || |
536 | t->it_sched_expires > ns) { | 533 | t->it_sched_expires > ns) { |
@@ -569,6 +566,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) | |||
569 | struct cpu_timer_list *next; | 566 | struct cpu_timer_list *next; |
570 | unsigned long i; | 567 | unsigned long i; |
571 | 568 | ||
569 | if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING)) | ||
570 | return; | ||
571 | |||
572 | head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? | 572 | head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? |
573 | p->cpu_timers : p->signal->cpu_timers); | 573 | p->cpu_timers : p->signal->cpu_timers); |
574 | head += CPUCLOCK_WHICH(timer->it_clock); | 574 | head += CPUCLOCK_WHICH(timer->it_clock); |
@@ -579,17 +579,15 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) | |||
579 | listpos = head; | 579 | listpos = head; |
580 | if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { | 580 | if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { |
581 | list_for_each_entry(next, head, entry) { | 581 | list_for_each_entry(next, head, entry) { |
582 | if (next->expires.sched > nt->expires.sched) { | 582 | if (next->expires.sched > nt->expires.sched) |
583 | listpos = &next->entry; | ||
584 | break; | 583 | break; |
585 | } | 584 | listpos = &next->entry; |
586 | } | 585 | } |
587 | } else { | 586 | } else { |
588 | list_for_each_entry(next, head, entry) { | 587 | list_for_each_entry(next, head, entry) { |
589 | if (cputime_gt(next->expires.cpu, nt->expires.cpu)) { | 588 | if (cputime_gt(next->expires.cpu, nt->expires.cpu)) |
590 | listpos = &next->entry; | ||
591 | break; | 589 | break; |
592 | } | 590 | listpos = &next->entry; |
593 | } | 591 | } |
594 | } | 592 | } |
595 | list_add(&nt->entry, listpos); | 593 | list_add(&nt->entry, listpos); |
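The rewritten loops keep the timer list sorted by ascending expiry: listpos now ends up at the last entry that does not expire later than the new timer, so list_add() drops the new entry right after it; the old code pointed listpos at the first later entry (or left it at the head), and list_add() then placed the new timer out of order. A self-contained version of the corrected insert using the generic list helpers (the struct is illustrative):

#include <linux/list.h>

struct toy_timer {
	struct list_head entry;
	unsigned long long expires;
};

/* Insert 't' so the list stays ordered by ascending 'expires'. */
static void insert_sorted(struct list_head *head, struct toy_timer *t)
{
	struct list_head *listpos = head;
	struct toy_timer *next;

	list_for_each_entry(next, head, entry) {
		if (next->expires > t->expires)
			break;			/* everything from here on fires later */
		listpos = &next->entry;		/* last earlier-or-equal entry so far */
	}
	list_add(&t->entry, listpos);		/* lands immediately after listpos */
}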
@@ -733,9 +731,15 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
733 | * Disarm any old timer after extracting its expiry time. | 731 | * Disarm any old timer after extracting its expiry time. |
734 | */ | 732 | */ |
735 | BUG_ON(!irqs_disabled()); | 733 | BUG_ON(!irqs_disabled()); |
734 | |||
735 | ret = 0; | ||
736 | spin_lock(&p->sighand->siglock); | 736 | spin_lock(&p->sighand->siglock); |
737 | old_expires = timer->it.cpu.expires; | 737 | old_expires = timer->it.cpu.expires; |
738 | list_del_init(&timer->it.cpu.entry); | 738 | if (unlikely(timer->it.cpu.firing)) { |
739 | timer->it.cpu.firing = -1; | ||
740 | ret = TIMER_RETRY; | ||
741 | } else | ||
742 | list_del_init(&timer->it.cpu.entry); | ||
739 | spin_unlock(&p->sighand->siglock); | 743 | spin_unlock(&p->sighand->siglock); |
740 | 744 | ||
741 | /* | 745 | /* |
@@ -783,7 +787,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
783 | } | 787 | } |
784 | } | 788 | } |
785 | 789 | ||
786 | if (unlikely(timer->it.cpu.firing)) { | 790 | if (unlikely(ret)) { |
787 | /* | 791 | /* |
788 | * We are colliding with the timer actually firing. | 792 | * We are colliding with the timer actually firing. |
789 | * Punt after filling in the timer's old value, and | 793 | * Punt after filling in the timer's old value, and |
@@ -791,8 +795,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
791 | * it as an overrun (thanks to bump_cpu_timer above). | 795 | * it as an overrun (thanks to bump_cpu_timer above). |
792 | */ | 796 | */ |
793 | read_unlock(&tasklist_lock); | 797 | read_unlock(&tasklist_lock); |
794 | timer->it.cpu.firing = -1; | ||
795 | ret = TIMER_RETRY; | ||
796 | goto out; | 798 | goto out; |
797 | } | 799 | } |
798 | 800 | ||
@@ -958,14 +960,16 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
958 | static void check_thread_timers(struct task_struct *tsk, | 960 | static void check_thread_timers(struct task_struct *tsk, |
959 | struct list_head *firing) | 961 | struct list_head *firing) |
960 | { | 962 | { |
963 | int maxfire; | ||
961 | struct list_head *timers = tsk->cpu_timers; | 964 | struct list_head *timers = tsk->cpu_timers; |
962 | 965 | ||
966 | maxfire = 20; | ||
963 | tsk->it_prof_expires = cputime_zero; | 967 | tsk->it_prof_expires = cputime_zero; |
964 | while (!list_empty(timers)) { | 968 | while (!list_empty(timers)) { |
965 | struct cpu_timer_list *t = list_entry(timers->next, | 969 | struct cpu_timer_list *t = list_entry(timers->next, |
966 | struct cpu_timer_list, | 970 | struct cpu_timer_list, |
967 | entry); | 971 | entry); |
968 | if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) { | 972 | if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { |
969 | tsk->it_prof_expires = t->expires.cpu; | 973 | tsk->it_prof_expires = t->expires.cpu; |
970 | break; | 974 | break; |
971 | } | 975 | } |
@@ -974,12 +978,13 @@ static void check_thread_timers(struct task_struct *tsk, | |||
974 | } | 978 | } |
975 | 979 | ||
976 | ++timers; | 980 | ++timers; |
981 | maxfire = 20; | ||
977 | tsk->it_virt_expires = cputime_zero; | 982 | tsk->it_virt_expires = cputime_zero; |
978 | while (!list_empty(timers)) { | 983 | while (!list_empty(timers)) { |
979 | struct cpu_timer_list *t = list_entry(timers->next, | 984 | struct cpu_timer_list *t = list_entry(timers->next, |
980 | struct cpu_timer_list, | 985 | struct cpu_timer_list, |
981 | entry); | 986 | entry); |
982 | if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) { | 987 | if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { |
983 | tsk->it_virt_expires = t->expires.cpu; | 988 | tsk->it_virt_expires = t->expires.cpu; |
984 | break; | 989 | break; |
985 | } | 990 | } |
@@ -988,12 +993,13 @@ static void check_thread_timers(struct task_struct *tsk, | |||
988 | } | 993 | } |
989 | 994 | ||
990 | ++timers; | 995 | ++timers; |
996 | maxfire = 20; | ||
991 | tsk->it_sched_expires = 0; | 997 | tsk->it_sched_expires = 0; |
992 | while (!list_empty(timers)) { | 998 | while (!list_empty(timers)) { |
993 | struct cpu_timer_list *t = list_entry(timers->next, | 999 | struct cpu_timer_list *t = list_entry(timers->next, |
994 | struct cpu_timer_list, | 1000 | struct cpu_timer_list, |
995 | entry); | 1001 | entry); |
996 | if (tsk->sched_time < t->expires.sched) { | 1002 | if (!--maxfire || tsk->sched_time < t->expires.sched) { |
997 | tsk->it_sched_expires = t->expires.sched; | 1003 | tsk->it_sched_expires = t->expires.sched; |
998 | break; | 1004 | break; |
999 | } | 1005 | } |
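Each of the three scans above (and the matching ones in check_process_timers() below) now starts a countdown at 20 and bails out through !--maxfire, so a single run never walks an unbounded backlog of expired timers; the entry it stops on simply supplies the next expiry and the rest are picked up on a later tick. Reduced to one self-contained loop, with toy types but the same constant the patch uses:

#include <linux/list.h>

struct toy_timer {
	struct list_head entry;
	unsigned long long expires;
};

/* Scan 'timers' (sorted by expiry), moving due entries to 'firing' until
 * the countdown runs out or an entry is not yet due; return that entry's
 * expiry so the caller can re-arm. */
static unsigned long long drain_expired(struct list_head *timers,
					struct list_head *firing,
					unsigned long long now)
{
	int maxfire = 20;
	unsigned long long next_expiry = 0;

	while (!list_empty(timers)) {
		struct toy_timer *t = list_entry(timers->next,
						 struct toy_timer, entry);

		if (!--maxfire || now < t->expires) {
			next_expiry = t->expires;
			break;
		}
		list_del_init(&t->entry);
		list_add_tail(&t->entry, firing);
	}
	return next_expiry;
}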
@@ -1010,6 +1016,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
1010 | static void check_process_timers(struct task_struct *tsk, | 1016 | static void check_process_timers(struct task_struct *tsk, |
1011 | struct list_head *firing) | 1017 | struct list_head *firing) |
1012 | { | 1018 | { |
1019 | int maxfire; | ||
1013 | struct signal_struct *const sig = tsk->signal; | 1020 | struct signal_struct *const sig = tsk->signal; |
1014 | cputime_t utime, stime, ptime, virt_expires, prof_expires; | 1021 | cputime_t utime, stime, ptime, virt_expires, prof_expires; |
1015 | unsigned long long sched_time, sched_expires; | 1022 | unsigned long long sched_time, sched_expires; |
@@ -1042,12 +1049,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
1042 | } while (t != tsk); | 1049 | } while (t != tsk); |
1043 | ptime = cputime_add(utime, stime); | 1050 | ptime = cputime_add(utime, stime); |
1044 | 1051 | ||
1052 | maxfire = 20; | ||
1045 | prof_expires = cputime_zero; | 1053 | prof_expires = cputime_zero; |
1046 | while (!list_empty(timers)) { | 1054 | while (!list_empty(timers)) { |
1047 | struct cpu_timer_list *t = list_entry(timers->next, | 1055 | struct cpu_timer_list *t = list_entry(timers->next, |
1048 | struct cpu_timer_list, | 1056 | struct cpu_timer_list, |
1049 | entry); | 1057 | entry); |
1050 | if (cputime_lt(ptime, t->expires.cpu)) { | 1058 | if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { |
1051 | prof_expires = t->expires.cpu; | 1059 | prof_expires = t->expires.cpu; |
1052 | break; | 1060 | break; |
1053 | } | 1061 | } |
@@ -1056,12 +1064,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
1056 | } | 1064 | } |
1057 | 1065 | ||
1058 | ++timers; | 1066 | ++timers; |
1067 | maxfire = 20; | ||
1059 | virt_expires = cputime_zero; | 1068 | virt_expires = cputime_zero; |
1060 | while (!list_empty(timers)) { | 1069 | while (!list_empty(timers)) { |
1061 | struct cpu_timer_list *t = list_entry(timers->next, | 1070 | struct cpu_timer_list *t = list_entry(timers->next, |
1062 | struct cpu_timer_list, | 1071 | struct cpu_timer_list, |
1063 | entry); | 1072 | entry); |
1064 | if (cputime_lt(utime, t->expires.cpu)) { | 1073 | if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { |
1065 | virt_expires = t->expires.cpu; | 1074 | virt_expires = t->expires.cpu; |
1066 | break; | 1075 | break; |
1067 | } | 1076 | } |
@@ -1070,12 +1079,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
1070 | } | 1079 | } |
1071 | 1080 | ||
1072 | ++timers; | 1081 | ++timers; |
1082 | maxfire = 20; | ||
1073 | sched_expires = 0; | 1083 | sched_expires = 0; |
1074 | while (!list_empty(timers)) { | 1084 | while (!list_empty(timers)) { |
1075 | struct cpu_timer_list *t = list_entry(timers->next, | 1085 | struct cpu_timer_list *t = list_entry(timers->next, |
1076 | struct cpu_timer_list, | 1086 | struct cpu_timer_list, |
1077 | entry); | 1087 | entry); |
1078 | if (sched_time < t->expires.sched) { | 1088 | if (!--maxfire || sched_time < t->expires.sched) { |
1079 | sched_expires = t->expires.sched; | 1089 | sched_expires = t->expires.sched; |
1080 | break; | 1090 | break; |
1081 | } | 1091 | } |
@@ -1158,6 +1168,9 @@ static void check_process_timers(struct task_struct *tsk, | |||
1158 | unsigned long long sched_left, sched; | 1168 | unsigned long long sched_left, sched; |
1159 | const unsigned int nthreads = atomic_read(&sig->live); | 1169 | const unsigned int nthreads = atomic_read(&sig->live); |
1160 | 1170 | ||
1171 | if (!nthreads) | ||
1172 | return; | ||
1173 | |||
1161 | prof_left = cputime_sub(prof_expires, utime); | 1174 | prof_left = cputime_sub(prof_expires, utime); |
1162 | prof_left = cputime_sub(prof_left, stime); | 1175 | prof_left = cputime_sub(prof_left, stime); |
1163 | prof_left = cputime_div(prof_left, nthreads); | 1176 | prof_left = cputime_div(prof_left, nthreads); |
@@ -1194,7 +1207,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1194 | 1207 | ||
1195 | do { | 1208 | do { |
1196 | t = next_thread(t); | 1209 | t = next_thread(t); |
1197 | } while (unlikely(t->exit_state)); | 1210 | } while (unlikely(t->flags & PF_EXITING)); |
1198 | } while (t != tsk); | 1211 | } while (t != tsk); |
1199 | } | 1212 | } |
1200 | } | 1213 | } |
@@ -1212,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1212 | /* | 1225 | /* |
1213 | * The task was cleaned up already, no future firings. | 1226 | * The task was cleaned up already, no future firings. |
1214 | */ | 1227 | */ |
1215 | return; | 1228 | goto out; |
1216 | 1229 | ||
1217 | /* | 1230 | /* |
1218 | * Fetch the current sample and update the timer's expiry time. | 1231 | * Fetch the current sample and update the timer's expiry time. |
@@ -1222,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1222 | bump_cpu_timer(timer, now); | 1235 | bump_cpu_timer(timer, now); |
1223 | if (unlikely(p->exit_state)) { | 1236 | if (unlikely(p->exit_state)) { |
1224 | clear_dead_task(timer, now); | 1237 | clear_dead_task(timer, now); |
1225 | return; | 1238 | goto out; |
1226 | } | 1239 | } |
1227 | read_lock(&tasklist_lock); /* arm_timer needs it. */ | 1240 | read_lock(&tasklist_lock); /* arm_timer needs it. */ |
1228 | } else { | 1241 | } else { |
@@ -1235,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1235 | put_task_struct(p); | 1248 | put_task_struct(p); |
1236 | timer->it.cpu.task = p = NULL; | 1249 | timer->it.cpu.task = p = NULL; |
1237 | timer->it.cpu.expires.sched = 0; | 1250 | timer->it.cpu.expires.sched = 0; |
1238 | read_unlock(&tasklist_lock); | 1251 | goto out_unlock; |
1239 | return; | ||
1240 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { | 1252 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { |
1241 | /* | 1253 | /* |
1242 | * We've noticed that the thread is dead, but | 1254 | * We've noticed that the thread is dead, but |
@@ -1244,8 +1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1244 | * drop our task ref. | 1256 | * drop our task ref. |
1245 | */ | 1257 | */ |
1246 | clear_dead_task(timer, now); | 1258 | clear_dead_task(timer, now); |
1247 | read_unlock(&tasklist_lock); | 1259 | goto out_unlock; |
1248 | return; | ||
1249 | } | 1260 | } |
1250 | cpu_clock_sample_group(timer->it_clock, p, &now); | 1261 | cpu_clock_sample_group(timer->it_clock, p, &now); |
1251 | bump_cpu_timer(timer, now); | 1262 | bump_cpu_timer(timer, now); |
@@ -1257,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1257 | */ | 1268 | */ |
1258 | arm_timer(timer, now); | 1269 | arm_timer(timer, now); |
1259 | 1270 | ||
1271 | out_unlock: | ||
1260 | read_unlock(&tasklist_lock); | 1272 | read_unlock(&tasklist_lock); |
1273 | |||
1274 | out: | ||
1275 | timer->it_overrun_last = timer->it_overrun; | ||
1276 | timer->it_overrun = -1; | ||
1277 | ++timer->it_requeue_pending; | ||
1261 | } | 1278 | } |
1262 | 1279 | ||
1263 | /* | 1280 | /* |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index b7b532acd9fc..5870efb3e200 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -270,7 +270,7 @@ static void tstojiffie(struct timespec *tp, int res, u64 *jiff) | |||
270 | long sec = tp->tv_sec; | 270 | long sec = tp->tv_sec; |
271 | long nsec = tp->tv_nsec + res - 1; | 271 | long nsec = tp->tv_nsec + res - 1; |
272 | 272 | ||
273 | if (nsec > NSEC_PER_SEC) { | 273 | if (nsec >= NSEC_PER_SEC) { |
274 | sec++; | 274 | sec++; |
275 | nsec -= NSEC_PER_SEC; | 275 | nsec -= NSEC_PER_SEC; |
276 | } | 276 | } |
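The >= makes the carry fire in the boundary case as well: tv_nsec has already had res - 1 added for round-up, so the sum can land exactly on NSEC_PER_SEC, which the old > left uncarried. A tiny stand-alone illustration of that boundary (user-space C, made-up resolution value):

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

int main(void)
{
	long res = 1000000;				/* e.g. a 1 ms clock resolution */
	long sec = 4, nsec = 999000001 + res - 1;	/* sums to exactly 1e9 */

	if (nsec >= NSEC_PER_SEC) {			/* '>' would skip this case */
		sec++;
		nsec -= NSEC_PER_SEC;
	}
	printf("%ld.%09ld\n", sec, nsec);		/* prints 5.000000000 */
	return 0;
}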
@@ -1157,7 +1157,7 @@ retry_delete: | |||
1157 | } | 1157 | } |
1158 | 1158 | ||
1159 | /* | 1159 | /* |
1160 | * This is called by __exit_signal, only when there are no more | 1160 | * This is called by do_exit or de_thread, only when there are no more |
1161 | * references to the shared signal_struct. | 1161 | * references to the shared signal_struct. |
1162 | */ | 1162 | */ |
1163 | void exit_itimers(struct signal_struct *sig) | 1163 | void exit_itimers(struct signal_struct *sig) |
@@ -1209,13 +1209,9 @@ static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp) | |||
1209 | 1209 | ||
1210 | do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono); | 1210 | do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono); |
1211 | 1211 | ||
1212 | tp->tv_sec += wall_to_mono.tv_sec; | 1212 | set_normalized_timespec(tp, tp->tv_sec + wall_to_mono.tv_sec, |
1213 | tp->tv_nsec += wall_to_mono.tv_nsec; | 1213 | tp->tv_nsec + wall_to_mono.tv_nsec); |
1214 | 1214 | ||
1215 | if ((tp->tv_nsec - NSEC_PER_SEC) > 0) { | ||
1216 | tp->tv_nsec -= NSEC_PER_SEC; | ||
1217 | tp->tv_sec++; | ||
1218 | } | ||
1219 | return 0; | 1215 | return 0; |
1220 | } | 1216 | } |
1221 | 1217 | ||
@@ -1295,13 +1291,6 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) | |||
1295 | return error; | 1291 | return error; |
1296 | } | 1292 | } |
1297 | 1293 | ||
1298 | static void nanosleep_wake_up(unsigned long __data) | ||
1299 | { | ||
1300 | struct task_struct *p = (struct task_struct *) __data; | ||
1301 | |||
1302 | wake_up_process(p); | ||
1303 | } | ||
1304 | |||
1305 | /* | 1294 | /* |
1306 | * The standard says that an absolute nanosleep call MUST wake up at | 1295 | * The standard says that an absolute nanosleep call MUST wake up at |
1307 | * the requested time in spite of clock settings. Here is what we do: | 1296 | * the requested time in spite of clock settings. Here is what we do: |
@@ -1442,7 +1431,6 @@ static int common_nsleep(clockid_t which_clock, | |||
1442 | int flags, struct timespec *tsave) | 1431 | int flags, struct timespec *tsave) |
1443 | { | 1432 | { |
1444 | struct timespec t, dum; | 1433 | struct timespec t, dum; |
1445 | struct timer_list new_timer; | ||
1446 | DECLARE_WAITQUEUE(abs_wqueue, current); | 1434 | DECLARE_WAITQUEUE(abs_wqueue, current); |
1447 | u64 rq_time = (u64)0; | 1435 | u64 rq_time = (u64)0; |
1448 | s64 left; | 1436 | s64 left; |
@@ -1451,10 +1439,6 @@ static int common_nsleep(clockid_t which_clock, | |||
1451 | &current_thread_info()->restart_block; | 1439 | &current_thread_info()->restart_block; |
1452 | 1440 | ||
1453 | abs_wqueue.flags = 0; | 1441 | abs_wqueue.flags = 0; |
1454 | init_timer(&new_timer); | ||
1455 | new_timer.expires = 0; | ||
1456 | new_timer.data = (unsigned long) current; | ||
1457 | new_timer.function = nanosleep_wake_up; | ||
1458 | abs = flags & TIMER_ABSTIME; | 1442 | abs = flags & TIMER_ABSTIME; |
1459 | 1443 | ||
1460 | if (restart_block->fn == clock_nanosleep_restart) { | 1444 | if (restart_block->fn == clock_nanosleep_restart) { |
@@ -1490,13 +1474,8 @@ static int common_nsleep(clockid_t which_clock, | |||
1490 | if (left < (s64)0) | 1474 | if (left < (s64)0) |
1491 | break; | 1475 | break; |
1492 | 1476 | ||
1493 | new_timer.expires = jiffies + left; | 1477 | schedule_timeout_interruptible(left); |
1494 | __set_current_state(TASK_INTERRUPTIBLE); | ||
1495 | add_timer(&new_timer); | ||
1496 | |||
1497 | schedule(); | ||
1498 | 1478 | ||
1499 | del_timer_sync(&new_timer); | ||
1500 | left = rq_time - get_jiffies_64(); | 1479 | left = rq_time - get_jiffies_64(); |
1501 | } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); | 1480 | } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); |
1502 | 1481 | ||
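Dropping nanosleep_wake_up() and the private timer works because schedule_timeout_interruptible() already bundles the same steps: it marks the task TASK_INTERRUPTIBLE and calls schedule_timeout(), which arms a one-shot timer, schedules, and tears the timer down before returning. A sketch of the replacement, with the removed plumbing kept as a comment for comparison (the wrapper name is made up):

#include <linux/sched.h>

/* Sleep for up to 'left' jiffies, waking early on a signal. */
static void nanosleep_wait(signed long left)
{
	/* Equivalent open-coded form, as deleted above:
	 *	new_timer.expires = jiffies + left;
	 *	__set_current_state(TASK_INTERRUPTIBLE);
	 *	add_timer(&new_timer);
	 *	schedule();
	 *	del_timer_sync(&new_timer);
	 */
	schedule_timeout_interruptible(left);
}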
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 396c7873e804..5ec248cb7f4a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -19,6 +19,15 @@ config PM | |||
19 | will issue the hlt instruction if nothing is to be done, thereby | 19 | will issue the hlt instruction if nothing is to be done, thereby |
20 | sending the processor to sleep and saving power. | 20 | sending the processor to sleep and saving power. |
21 | 21 | ||
22 | config PM_LEGACY | ||
23 | bool "Legacy Power Management API" | ||
24 | depends on PM | ||
25 | default y | ||
26 | ---help--- | ||
27 | Support for pm_register() and friends. | ||
28 | |||
29 | If unsure, say Y. | ||
30 | |||
22 | config PM_DEBUG | 31 | config PM_DEBUG |
23 | bool "Power Management Debug Support" | 32 | bool "Power Management Debug Support" |
24 | depends on PM | 33 | depends on PM |
@@ -29,7 +38,7 @@ config PM_DEBUG | |||
29 | 38 | ||
30 | config SOFTWARE_SUSPEND | 39 | config SOFTWARE_SUSPEND |
31 | bool "Software Suspend" | 40 | bool "Software Suspend" |
32 | depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP)) | 41 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FVR || PPC32) && !SMP) |
33 | ---help--- | 42 | ---help--- |
34 | Enable the possibility of suspending the machine. | 43 | Enable the possibility of suspending the machine. |
35 | It doesn't need APM. | 44 | It doesn't need APM. |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 2f438d0eaa13..04be7d0d96a7 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -3,8 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y) | |||
3 | EXTRA_CFLAGS += -DDEBUG | 3 | EXTRA_CFLAGS += -DDEBUG |
4 | endif | 4 | endif |
5 | 5 | ||
6 | obj-y := main.o process.o console.o pm.o | 6 | obj-y := main.o process.o console.o |
7 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o | 7 | obj-$(CONFIG_PM_LEGACY) += pm.o |
8 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o | ||
8 | 9 | ||
9 | obj-$(CONFIG_SUSPEND_SMP) += smp.o | 10 | obj-$(CONFIG_SUSPEND_SMP) += smp.o |
10 | 11 | ||
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 2d8bf054d036..027322a564f4 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -17,12 +17,12 @@ | |||
17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
19 | #include <linux/mount.h> | 19 | #include <linux/mount.h> |
20 | #include <linux/pm.h> | ||
20 | 21 | ||
21 | #include "power.h" | 22 | #include "power.h" |
22 | 23 | ||
23 | 24 | ||
24 | extern suspend_disk_method_t pm_disk_mode; | 25 | extern suspend_disk_method_t pm_disk_mode; |
25 | extern struct pm_ops * pm_ops; | ||
26 | 26 | ||
27 | extern int swsusp_suspend(void); | 27 | extern int swsusp_suspend(void); |
28 | extern int swsusp_write(void); | 28 | extern int swsusp_write(void); |
@@ -30,7 +30,6 @@ extern int swsusp_check(void); | |||
30 | extern int swsusp_read(void); | 30 | extern int swsusp_read(void); |
31 | extern void swsusp_close(void); | 31 | extern void swsusp_close(void); |
32 | extern int swsusp_resume(void); | 32 | extern int swsusp_resume(void); |
33 | extern int swsusp_free(void); | ||
34 | 33 | ||
35 | 34 | ||
36 | static int noresume = 0; | 35 | static int noresume = 0; |
@@ -49,13 +48,11 @@ dev_t swsusp_resume_device; | |||
49 | 48 | ||
50 | static void power_down(suspend_disk_method_t mode) | 49 | static void power_down(suspend_disk_method_t mode) |
51 | { | 50 | { |
52 | unsigned long flags; | ||
53 | int error = 0; | 51 | int error = 0; |
54 | 52 | ||
55 | local_irq_save(flags); | ||
56 | switch(mode) { | 53 | switch(mode) { |
57 | case PM_DISK_PLATFORM: | 54 | case PM_DISK_PLATFORM: |
58 | device_shutdown(); | 55 | kernel_power_off_prepare(); |
59 | error = pm_ops->enter(PM_SUSPEND_DISK); | 56 | error = pm_ops->enter(PM_SUSPEND_DISK); |
60 | break; | 57 | break; |
61 | case PM_DISK_SHUTDOWN: | 58 | case PM_DISK_SHUTDOWN: |
@@ -95,10 +92,7 @@ static void free_some_memory(void) | |||
95 | printk("Freeing memory... "); | 92 | printk("Freeing memory... "); |
96 | while ((tmp = shrink_all_memory(10000))) { | 93 | while ((tmp = shrink_all_memory(10000))) { |
97 | pages += tmp; | 94 | pages += tmp; |
98 | printk("\b%c", p[i]); | 95 | printk("\b%c", p[i++ % 4]); |
99 | i++; | ||
100 | if (i > 3) | ||
101 | i = 0; | ||
102 | } | 96 | } |
103 | printk("\bdone (%li pages freed)\n", pages); | 97 | printk("\bdone (%li pages freed)\n", pages); |
104 | } | 98 | } |
@@ -180,13 +174,12 @@ int pm_suspend_disk(void) | |||
180 | goto Done; | 174 | goto Done; |
181 | 175 | ||
182 | if (in_suspend) { | 176 | if (in_suspend) { |
177 | device_resume(); | ||
183 | pr_debug("PM: writing image.\n"); | 178 | pr_debug("PM: writing image.\n"); |
184 | error = swsusp_write(); | 179 | error = swsusp_write(); |
185 | if (!error) | 180 | if (!error) |
186 | power_down(pm_disk_mode); | 181 | power_down(pm_disk_mode); |
187 | else { | 182 | else { |
188 | /* swsusp_write can not fail in device_resume, | ||
189 | no need to do second device_resume */ | ||
190 | swsusp_free(); | 183 | swsusp_free(); |
191 | unprepare_processes(); | 184 | unprepare_processes(); |
192 | return error; | 185 | return error; |
@@ -254,14 +247,17 @@ static int software_resume(void) | |||
254 | 247 | ||
255 | pr_debug("PM: Reading swsusp image.\n"); | 248 | pr_debug("PM: Reading swsusp image.\n"); |
256 | 249 | ||
257 | if ((error = swsusp_read())) | 250 | if ((error = swsusp_read())) { |
258 | goto Cleanup; | 251 | swsusp_free(); |
252 | goto Thaw; | ||
253 | } | ||
259 | 254 | ||
260 | pr_debug("PM: Preparing devices for restore.\n"); | 255 | pr_debug("PM: Preparing devices for restore.\n"); |
261 | 256 | ||
262 | if ((error = device_suspend(PMSG_FREEZE))) { | 257 | if ((error = device_suspend(PMSG_FREEZE))) { |
263 | printk("Some devices failed to suspend\n"); | 258 | printk("Some devices failed to suspend\n"); |
264 | goto Free; | 259 | swsusp_free(); |
260 | goto Thaw; | ||
265 | } | 261 | } |
266 | 262 | ||
267 | mb(); | 263 | mb(); |
@@ -270,9 +266,7 @@ static int software_resume(void) | |||
270 | swsusp_resume(); | 266 | swsusp_resume(); |
271 | pr_debug("PM: Restore failed, recovering.\n"); | 267 | pr_debug("PM: Restore failed, recovering.\n"); |
272 | device_resume(); | 268 | device_resume(); |
273 | Free: | 269 | Thaw: |
274 | swsusp_free(); | ||
275 | Cleanup: | ||
276 | unprepare_processes(); | 270 | unprepare_processes(); |
277 | Done: | 271 | Done: |
278 | /* For success case, the suspend path will release the lock */ | 272 | /* For success case, the suspend path will release the lock */ |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 22bdc93cc038..d253f3ae2fa5 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -24,7 +24,7 @@ | |||
24 | 24 | ||
25 | DECLARE_MUTEX(pm_sem); | 25 | DECLARE_MUTEX(pm_sem); |
26 | 26 | ||
27 | struct pm_ops * pm_ops = NULL; | 27 | struct pm_ops *pm_ops; |
28 | suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; | 28 | suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; |
29 | 29 | ||
30 | /** | 30 | /** |
@@ -151,6 +151,18 @@ static char *pm_states[PM_SUSPEND_MAX] = { | |||
151 | #endif | 151 | #endif |
152 | }; | 152 | }; |
153 | 153 | ||
154 | static inline int valid_state(suspend_state_t state) | ||
155 | { | ||
156 | /* Suspend-to-disk does not really need low-level support. | ||
157 | * It can work with reboot if needed. */ | ||
158 | if (state == PM_SUSPEND_DISK) | ||
159 | return 1; | ||
160 | |||
161 | if (pm_ops && pm_ops->valid && !pm_ops->valid(state)) | ||
162 | return 0; | ||
163 | return 1; | ||
164 | } | ||
165 | |||
154 | 166 | ||
155 | /** | 167 | /** |
156 | * enter_state - Do common work of entering low-power state. | 168 | * enter_state - Do common work of entering low-power state. |
@@ -167,6 +179,8 @@ static int enter_state(suspend_state_t state) | |||
167 | { | 179 | { |
168 | int error; | 180 | int error; |
169 | 181 | ||
182 | if (!valid_state(state)) | ||
183 | return -ENODEV; | ||
170 | if (down_trylock(&pm_sem)) | 184 | if (down_trylock(&pm_sem)) |
171 | return -EBUSY; | 185 | return -EBUSY; |
172 | 186 | ||
@@ -236,8 +250,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) | |||
236 | char * s = buf; | 250 | char * s = buf; |
237 | 251 | ||
238 | for (i = 0; i < PM_SUSPEND_MAX; i++) { | 252 | for (i = 0; i < PM_SUSPEND_MAX; i++) { |
239 | if (pm_states[i]) | 253 | if (pm_states[i] && valid_state(i)) |
240 | s += sprintf(s,"%s ",pm_states[i]); | 254 | s += sprintf(s,"%s ", pm_states[i]); |
241 | } | 255 | } |
242 | s += sprintf(s,"\n"); | 256 | s += sprintf(s,"\n"); |
243 | return (s - buf); | 257 | return (s - buf); |
diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 159149321b3c..33c508e857dd 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/mm.h> | 23 | #include <linux/mm.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/pm.h> | 25 | #include <linux/pm.h> |
26 | #include <linux/pm_legacy.h> | ||
26 | #include <linux/interrupt.h> | 27 | #include <linux/interrupt.h> |
27 | 28 | ||
28 | int pm_active; | 29 | int pm_active; |
diff --git a/kernel/power/power.h b/kernel/power/power.h index cd6a3493cc0d..6c042b5ee14b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -1,7 +1,7 @@ | |||
1 | #include <linux/suspend.h> | 1 | #include <linux/suspend.h> |
2 | #include <linux/utsname.h> | 2 | #include <linux/utsname.h> |
3 | 3 | ||
4 | /* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but | 4 | /* With SUSPEND_CONSOLE defined suspend looks *really* cool, but |
5 | we probably do not take enough locks for switching consoles, etc, | 5 | we probably do not take enough locks for switching consoles, etc, |
6 | so bad things might happen. | 6 | so bad things might happen. |
7 | */ | 7 | */ |
@@ -9,6 +9,9 @@ | |||
9 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) | 9 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) |
10 | #endif | 10 | #endif |
11 | 11 | ||
12 | #define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \ | ||
13 | - 4 - 3*sizeof(unsigned long) - sizeof(int) \ | ||
14 | - sizeof(void *)) / sizeof(swp_entry_t)) | ||
12 | 15 | ||
13 | struct swsusp_info { | 16 | struct swsusp_info { |
14 | struct new_utsname uts; | 17 | struct new_utsname uts; |
@@ -18,7 +21,7 @@ struct swsusp_info { | |||
18 | unsigned long image_pages; | 21 | unsigned long image_pages; |
19 | unsigned long pagedir_pages; | 22 | unsigned long pagedir_pages; |
20 | suspend_pagedir_t * suspend_pagedir; | 23 | suspend_pagedir_t * suspend_pagedir; |
21 | swp_entry_t pagedir[768]; | 24 | swp_entry_t pagedir[MAX_PBES]; |
22 | } __attribute__((aligned(PAGE_SIZE))); | 25 | } __attribute__((aligned(PAGE_SIZE))); |
23 | 26 | ||
24 | 27 | ||
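MAX_PBES is sized so that the page-aligned struct swsusp_info still fits in exactly one page: from PAGE_SIZE it subtracts sizeof(struct new_utsname), a literal 4, three unsigned longs, an int and a pointer - the header fields ahead of the array - and divides the remainder by sizeof(swp_entry_t). A rough worked number, assuming a 4 KB page and 32-bit sizes; illustrative only, since the real value depends on the architecture:

#include <stdio.h>

int main(void)
{
	/* Assumed 32-bit sizes; the kernel macro uses the real types. */
	unsigned long page_size  = 4096;
	unsigned long utsname_sz = 6 * 65;		/* six 65-byte name fields */
	unsigned long fixed_tail = 4 + 3 * 4 + 4 + 4;	/* literal 4 + 3 longs + int + ptr */
	unsigned long swp_entry  = 4;			/* assumed sizeof(swp_entry_t) */

	printf("MAX_PBES ~ %lu\n",
	       (page_size - utsname_sz - fixed_tail) / swp_entry);
	return 0;
}

Under those assumptions the array holds roughly 920 swap entries, versus the fixed 768 it replaces, and the macro shrinks the array automatically on configurations where the header fields are larger.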
@@ -50,3 +53,20 @@ extern void thaw_processes(void); | |||
50 | 53 | ||
51 | extern int pm_prepare_console(void); | 54 | extern int pm_prepare_console(void); |
52 | extern void pm_restore_console(void); | 55 | extern void pm_restore_console(void); |
56 | |||
57 | |||
58 | /* References to section boundaries */ | ||
59 | extern const void __nosave_begin, __nosave_end; | ||
60 | |||
61 | extern unsigned int nr_copy_pages; | ||
62 | extern suspend_pagedir_t *pagedir_nosave; | ||
63 | extern suspend_pagedir_t *pagedir_save; | ||
64 | |||
65 | extern asmlinkage int swsusp_arch_suspend(void); | ||
66 | extern asmlinkage int swsusp_arch_resume(void); | ||
67 | |||
68 | extern void free_pagedir(struct pbe *pblist); | ||
69 | extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); | ||
70 | extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); | ||
71 | extern void swsusp_free(void); | ||
72 | extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); | ||
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c new file mode 100644 index 000000000000..4a6dbcefd378 --- /dev/null +++ b/kernel/power/snapshot.c | |||
@@ -0,0 +1,453 @@ | |||
1 | /* | ||
2 | * linux/kernel/power/snapshot.c | ||
3 | * | ||
4 | * This file provides system snapshot/restore functionality. | ||
5 | * | ||
6 | * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> | ||
7 | * | ||
8 | * This file is released under the GPLv2, and is based on swsusp.c. | ||
9 | * | ||
10 | */ | ||
11 | |||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/mm.h> | ||
15 | #include <linux/suspend.h> | ||
16 | #include <linux/smp_lock.h> | ||
17 | #include <linux/delay.h> | ||
18 | #include <linux/bitops.h> | ||
19 | #include <linux/spinlock.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/pm.h> | ||
22 | #include <linux/device.h> | ||
23 | #include <linux/bootmem.h> | ||
24 | #include <linux/syscalls.h> | ||
25 | #include <linux/console.h> | ||
26 | #include <linux/highmem.h> | ||
27 | |||
28 | #include <asm/uaccess.h> | ||
29 | #include <asm/mmu_context.h> | ||
30 | #include <asm/pgtable.h> | ||
31 | #include <asm/tlbflush.h> | ||
32 | #include <asm/io.h> | ||
33 | |||
34 | #include "power.h" | ||
35 | |||
36 | #ifdef CONFIG_HIGHMEM | ||
37 | struct highmem_page { | ||
38 | char *data; | ||
39 | struct page *page; | ||
40 | struct highmem_page *next; | ||
41 | }; | ||
42 | |||
43 | static struct highmem_page *highmem_copy; | ||
44 | |||
45 | static int save_highmem_zone(struct zone *zone) | ||
46 | { | ||
47 | unsigned long zone_pfn; | ||
48 | mark_free_pages(zone); | ||
49 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
50 | struct page *page; | ||
51 | struct highmem_page *save; | ||
52 | void *kaddr; | ||
53 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
54 | |||
55 | if (!(pfn%1000)) | ||
56 | printk("."); | ||
57 | if (!pfn_valid(pfn)) | ||
58 | continue; | ||
59 | page = pfn_to_page(pfn); | ||
60 | /* | ||
61 | * This condition results from rvmalloc() sans vmalloc_32() | ||
62 | * and architectural memory reservations. This should be | ||
63 | * corrected eventually when the cases giving rise to this | ||
64 | * are better understood. | ||
65 | */ | ||
66 | if (PageReserved(page)) { | ||
67 | printk("highmem reserved page?!\n"); | ||
68 | continue; | ||
69 | } | ||
70 | BUG_ON(PageNosave(page)); | ||
71 | if (PageNosaveFree(page)) | ||
72 | continue; | ||
73 | save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); | ||
74 | if (!save) | ||
75 | return -ENOMEM; | ||
76 | save->next = highmem_copy; | ||
77 | save->page = page; | ||
78 | save->data = (void *) get_zeroed_page(GFP_ATOMIC); | ||
79 | if (!save->data) { | ||
80 | kfree(save); | ||
81 | return -ENOMEM; | ||
82 | } | ||
83 | kaddr = kmap_atomic(page, KM_USER0); | ||
84 | memcpy(save->data, kaddr, PAGE_SIZE); | ||
85 | kunmap_atomic(kaddr, KM_USER0); | ||
86 | highmem_copy = save; | ||
87 | } | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | int save_highmem(void) | ||
92 | { | ||
93 | struct zone *zone; | ||
94 | int res = 0; | ||
95 | |||
96 | pr_debug("swsusp: Saving Highmem\n"); | ||
97 | for_each_zone (zone) { | ||
98 | if (is_highmem(zone)) | ||
99 | res = save_highmem_zone(zone); | ||
100 | if (res) | ||
101 | return res; | ||
102 | } | ||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | int restore_highmem(void) | ||
107 | { | ||
108 | printk("swsusp: Restoring Highmem\n"); | ||
109 | while (highmem_copy) { | ||
110 | struct highmem_page *save = highmem_copy; | ||
111 | void *kaddr; | ||
112 | highmem_copy = save->next; | ||
113 | |||
114 | kaddr = kmap_atomic(save->page, KM_USER0); | ||
115 | memcpy(kaddr, save->data, PAGE_SIZE); | ||
116 | kunmap_atomic(kaddr, KM_USER0); | ||
117 | free_page((long) save->data); | ||
118 | kfree(save); | ||
119 | } | ||
120 | return 0; | ||
121 | } | ||
122 | #endif | ||
123 | |||
124 | static int pfn_is_nosave(unsigned long pfn) | ||
125 | { | ||
126 | unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; | ||
127 | unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; | ||
128 | return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); | ||
129 | } | ||
130 | |||
131 | /** | ||
132 | * saveable - Determine whether a page should be cloned or not. | ||
133 | * @pfn: The page | ||
134 | * | ||
135 | * We save a page if it's Reserved, and not in the range of pages | ||
136 | * statically defined as 'unsaveable', or if it isn't reserved, and | ||
137 | * isn't part of a free chunk of pages. | ||
138 | */ | ||
139 | |||
140 | static int saveable(struct zone *zone, unsigned long *zone_pfn) | ||
141 | { | ||
142 | unsigned long pfn = *zone_pfn + zone->zone_start_pfn; | ||
143 | struct page *page; | ||
144 | |||
145 | if (!pfn_valid(pfn)) | ||
146 | return 0; | ||
147 | |||
148 | page = pfn_to_page(pfn); | ||
149 | BUG_ON(PageReserved(page) && PageNosave(page)); | ||
150 | if (PageNosave(page)) | ||
151 | return 0; | ||
152 | if (PageReserved(page) && pfn_is_nosave(pfn)) { | ||
153 | pr_debug("[nosave pfn 0x%lx]", pfn); | ||
154 | return 0; | ||
155 | } | ||
156 | if (PageNosaveFree(page)) | ||
157 | return 0; | ||
158 | |||
159 | return 1; | ||
160 | } | ||
161 | |||
162 | static unsigned count_data_pages(void) | ||
163 | { | ||
164 | struct zone *zone; | ||
165 | unsigned long zone_pfn; | ||
166 | unsigned int n = 0; | ||
167 | |||
168 | for_each_zone (zone) { | ||
169 | if (is_highmem(zone)) | ||
170 | continue; | ||
171 | mark_free_pages(zone); | ||
172 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
173 | n += saveable(zone, &zone_pfn); | ||
174 | } | ||
175 | return n; | ||
176 | } | ||
177 | |||
178 | static void copy_data_pages(struct pbe *pblist) | ||
179 | { | ||
180 | struct zone *zone; | ||
181 | unsigned long zone_pfn; | ||
182 | struct pbe *pbe, *p; | ||
183 | |||
184 | pbe = pblist; | ||
185 | for_each_zone (zone) { | ||
186 | if (is_highmem(zone)) | ||
187 | continue; | ||
188 | mark_free_pages(zone); | ||
189 | /* This is necessary for swsusp_free() */ | ||
190 | for_each_pb_page (p, pblist) | ||
191 | SetPageNosaveFree(virt_to_page(p)); | ||
192 | for_each_pbe (p, pblist) | ||
193 | SetPageNosaveFree(virt_to_page(p->address)); | ||
194 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
195 | if (saveable(zone, &zone_pfn)) { | ||
196 | struct page *page; | ||
197 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | ||
198 | BUG_ON(!pbe); | ||
199 | pbe->orig_address = (unsigned long)page_address(page); | ||
200 | /* copy_page is not usable for copying task structs. */ | ||
201 | memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); | ||
202 | pbe = pbe->next; | ||
203 | } | ||
204 | } | ||
205 | } | ||
206 | BUG_ON(pbe); | ||
207 | } | ||
208 | |||
209 | |||
210 | /** | ||
211 | * free_pagedir - free pages allocated with alloc_pagedir() | ||
212 | */ | ||
213 | |||
214 | void free_pagedir(struct pbe *pblist) | ||
215 | { | ||
216 | struct pbe *pbe; | ||
217 | |||
218 | while (pblist) { | ||
219 | pbe = (pblist + PB_PAGE_SKIP)->next; | ||
220 | ClearPageNosave(virt_to_page(pblist)); | ||
221 | ClearPageNosaveFree(virt_to_page(pblist)); | ||
222 | free_page((unsigned long)pblist); | ||
223 | pblist = pbe; | ||
224 | } | ||
225 | } | ||
226 | |||
227 | /** | ||
228 | * fill_pb_page - Create a list of PBEs on a given memory page | ||
229 | */ | ||
230 | |||
231 | static inline void fill_pb_page(struct pbe *pbpage) | ||
232 | { | ||
233 | struct pbe *p; | ||
234 | |||
235 | p = pbpage; | ||
236 | pbpage += PB_PAGE_SKIP; | ||
237 | do | ||
238 | p->next = p + 1; | ||
239 | while (++p < pbpage); | ||
240 | } | ||
241 | |||
242 | /** | ||
243 | * create_pbe_list - Create a list of PBEs on top of a given chain | ||
244 | * of memory pages allocated with alloc_pagedir() | ||
245 | */ | ||
246 | |||
247 | void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) | ||
248 | { | ||
249 | struct pbe *pbpage, *p; | ||
250 | unsigned int num = PBES_PER_PAGE; | ||
251 | |||
252 | for_each_pb_page (pbpage, pblist) { | ||
253 | if (num >= nr_pages) | ||
254 | break; | ||
255 | |||
256 | fill_pb_page(pbpage); | ||
257 | num += PBES_PER_PAGE; | ||
258 | } | ||
259 | if (pbpage) { | ||
260 | for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) | ||
261 | p->next = p + 1; | ||
262 | p->next = NULL; | ||
263 | } | ||
264 | pr_debug("create_pbe_list(): initialized %d PBEs\n", num); | ||
265 | } | ||
266 | |||
267 | /** | ||
268 | * @safe_needed - on resume, for storing the PBE list and the image, | ||
269 | * we can only use memory pages that do not conflict with the pages | ||
270 | * which had been used before suspend. | ||
271 | * | ||
272 | * The unsafe pages are marked with the PG_nosave_free flag | ||
273 | * | ||
274 | * Allocated but unusable (ie eaten) memory pages should be marked | ||
275 | * so that swsusp_free() can release them | ||
276 | */ | ||
277 | |||
278 | static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) | ||
279 | { | ||
280 | void *res; | ||
281 | |||
282 | if (safe_needed) | ||
283 | do { | ||
284 | res = (void *)get_zeroed_page(gfp_mask); | ||
285 | if (res && PageNosaveFree(virt_to_page(res))) | ||
286 | /* This is for swsusp_free() */ | ||
287 | SetPageNosave(virt_to_page(res)); | ||
288 | } while (res && PageNosaveFree(virt_to_page(res))); | ||
289 | else | ||
290 | res = (void *)get_zeroed_page(gfp_mask); | ||
291 | if (res) { | ||
292 | SetPageNosave(virt_to_page(res)); | ||
293 | SetPageNosaveFree(virt_to_page(res)); | ||
294 | } | ||
295 | return res; | ||
296 | } | ||
297 | |||
298 | unsigned long get_safe_page(gfp_t gfp_mask) | ||
299 | { | ||
300 | return (unsigned long)alloc_image_page(gfp_mask, 1); | ||
301 | } | ||
302 | |||
303 | /** | ||
304 | * alloc_pagedir - Allocate the page directory. | ||
305 | * | ||
306 | * First, determine exactly how many pages we need and | ||
307 | * allocate them. | ||
308 | * | ||
309 | * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE | ||
310 | * struct pbe elements (pbes) and the last element in the page points | ||
311 | * to the next page. | ||
312 | * | ||
313 | * On each page we set up a list of struct pbe elements. | ||
314 | */ | ||
315 | |||
316 | struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) | ||
317 | { | ||
318 | unsigned int num; | ||
319 | struct pbe *pblist, *pbe; | ||
320 | |||
321 | if (!nr_pages) | ||
322 | return NULL; | ||
323 | |||
324 | pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); | ||
325 | pblist = alloc_image_page(gfp_mask, safe_needed); | ||
326 | /* FIXME: rewrite this ugly loop */ | ||
327 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; | ||
328 | pbe = pbe->next, num += PBES_PER_PAGE) { | ||
329 | pbe += PB_PAGE_SKIP; | ||
330 | pbe->next = alloc_image_page(gfp_mask, safe_needed); | ||
331 | } | ||
332 | if (!pbe) { /* get_zeroed_page() failed */ | ||
333 | free_pagedir(pblist); | ||
334 | pblist = NULL; | ||
335 | } | ||
336 | return pblist; | ||
337 | } | ||
338 | |||
339 | /** | ||
340 | * Free pages we allocated for suspend. Suspend pages are allocated | ||
341 | * before atomic copy, so we need to free them after resume. | ||
342 | */ | ||
343 | |||
344 | void swsusp_free(void) | ||
345 | { | ||
346 | struct zone *zone; | ||
347 | unsigned long zone_pfn; | ||
348 | |||
349 | for_each_zone(zone) { | ||
350 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
351 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { | ||
352 | struct page *page; | ||
353 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | ||
354 | if (PageNosave(page) && PageNosaveFree(page)) { | ||
355 | ClearPageNosave(page); | ||
356 | ClearPageNosaveFree(page); | ||
357 | free_page((long) page_address(page)); | ||
358 | } | ||
359 | } | ||
360 | } | ||
361 | } | ||
362 | |||
363 | |||
364 | /** | ||
365 | * enough_free_mem - Make sure we have enough free memory to snapshot. | ||
366 | * | ||
367 | * Returns TRUE or FALSE after checking the number of available | ||
368 | * free pages. | ||
369 | */ | ||
370 | |||
371 | static int enough_free_mem(unsigned int nr_pages) | ||
372 | { | ||
373 | pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); | ||
374 | return nr_free_pages() > (nr_pages + PAGES_FOR_IO + | ||
375 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | ||
376 | } | ||
377 | |||
378 | int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) | ||
379 | { | ||
380 | struct pbe *p; | ||
381 | |||
382 | for_each_pbe (p, pblist) { | ||
383 | p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed); | ||
384 | if (!p->address) | ||
385 | return -ENOMEM; | ||
386 | } | ||
387 | return 0; | ||
388 | } | ||
389 | |||
390 | static struct pbe *swsusp_alloc(unsigned int nr_pages) | ||
391 | { | ||
392 | struct pbe *pblist; | ||
393 | |||
394 | if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) { | ||
395 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | ||
396 | return NULL; | ||
397 | } | ||
398 | create_pbe_list(pblist, nr_pages); | ||
399 | |||
400 | if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { | ||
401 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); | ||
402 | swsusp_free(); | ||
403 | return NULL; | ||
404 | } | ||
405 | |||
406 | return pblist; | ||
407 | } | ||
408 | |||
409 | asmlinkage int swsusp_save(void) | ||
410 | { | ||
411 | unsigned int nr_pages; | ||
412 | |||
413 | pr_debug("swsusp: critical section: \n"); | ||
414 | |||
415 | drain_local_pages(); | ||
416 | nr_pages = count_data_pages(); | ||
417 | printk("swsusp: Need to copy %u pages\n", nr_pages); | ||
418 | |||
419 | pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", | ||
420 | nr_pages, | ||
421 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, | ||
422 | PAGES_FOR_IO, nr_free_pages()); | ||
423 | |||
424 | /* This is needed because of the fixed size of swsusp_info */ | ||
425 | if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) | ||
426 | return -ENOSPC; | ||
427 | |||
428 | if (!enough_free_mem(nr_pages)) { | ||
429 | printk(KERN_ERR "swsusp: Not enough free memory\n"); | ||
430 | return -ENOMEM; | ||
431 | } | ||
432 | |||
433 | pagedir_nosave = swsusp_alloc(nr_pages); | ||
434 | if (!pagedir_nosave) | ||
435 | return -ENOMEM; | ||
436 | |||
437 | /* During allocating of suspend pagedir, new cold pages may appear. | ||
438 | * Kill them. | ||
439 | */ | ||
440 | drain_local_pages(); | ||
441 | copy_data_pages(pagedir_nosave); | ||
442 | |||
443 | /* | ||
444 | * End of critical section. From now on, we can write to memory, | ||
445 | * but we should not touch disk. This especially means we must _not_ | ||
446 | * touch swap space! Except we must write out our image of course. | ||
447 | */ | ||
448 | |||
449 | nr_copy_pages = nr_pages; | ||
450 | |||
451 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); | ||
452 | return 0; | ||
453 | } | ||
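alloc_pagedir() and create_pbe_list() above build the page-backed directory that copy_data_pages() then fills: each allocated page is an array of PBES_PER_PAGE struct pbe slots, one designated slot per page (PB_PAGE_SKIP in the code above) links to the next page, and fill_pb_page() chains the slots inside a page. A toy user-space model of just the page-to-page chaining; the toy_* names are illustrative, the in-page links are omitted, and the toy uses the last slot as the link:

#include <stdio.h>
#include <stdlib.h>

struct toy_pbe {
	unsigned long orig_address;	/* where the saved page lived */
	unsigned long address;		/* where its copy lives */
	struct toy_pbe *next;
};

#define TOY_PAGE_SIZE     4096UL
#define TOY_PBES_PER_PAGE (TOY_PAGE_SIZE / sizeof(struct toy_pbe))

/* Allocate whole pages of slots until nr_pages entries fit, chaining the
 * pages through the last slot of each one (the toy version leaks on
 * failure; the real code walks back and frees what it already got). */
static struct toy_pbe *toy_alloc_pagedir(unsigned int nr_pages)
{
	struct toy_pbe *pblist = NULL, *page, *tail_slot = NULL;
	unsigned long num;

	for (num = 0; num < nr_pages; num += TOY_PBES_PER_PAGE) {
		page = calloc(1, TOY_PAGE_SIZE);
		if (!page)
			return NULL;
		if (tail_slot)
			tail_slot->next = page;
		else
			pblist = page;
		tail_slot = page + TOY_PBES_PER_PAGE - 1;
	}
	return pblist;
}

int main(void)
{
	printf("%lu pbes per %lu-byte page\n",
	       (unsigned long)TOY_PBES_PER_PAGE, TOY_PAGE_SIZE);
	return toy_alloc_pagedir(1000) ? 0 : 1;
}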
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index d967e875ee82..c05f46e7348f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
@@ -1,11 +1,10 @@ | |||
1 | /* | 1 | /* |
2 | * linux/kernel/power/swsusp.c | 2 | * linux/kernel/power/swsusp.c |
3 | * | 3 | * |
4 | * This file is to realize architecture-independent | 4 | * This file provides code to write suspend image to swap and read it back. |
5 | * machine suspend feature using pretty near only high-level routines | ||
6 | * | 5 | * |
7 | * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> | 6 | * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> |
8 | * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> |
9 | * | 8 | * |
10 | * This file is released under the GPLv2. | 9 | * This file is released under the GPLv2. |
11 | * | 10 | * |
@@ -47,11 +46,7 @@ | |||
47 | #include <linux/utsname.h> | 46 | #include <linux/utsname.h> |
48 | #include <linux/version.h> | 47 | #include <linux/version.h> |
49 | #include <linux/delay.h> | 48 | #include <linux/delay.h> |
50 | #include <linux/reboot.h> | ||
51 | #include <linux/bitops.h> | 49 | #include <linux/bitops.h> |
52 | #include <linux/vt_kern.h> | ||
53 | #include <linux/kbd_kern.h> | ||
54 | #include <linux/keyboard.h> | ||
55 | #include <linux/spinlock.h> | 50 | #include <linux/spinlock.h> |
56 | #include <linux/genhd.h> | 51 | #include <linux/genhd.h> |
57 | #include <linux/kernel.h> | 52 | #include <linux/kernel.h> |
@@ -63,10 +58,8 @@ | |||
63 | #include <linux/swapops.h> | 58 | #include <linux/swapops.h> |
64 | #include <linux/bootmem.h> | 59 | #include <linux/bootmem.h> |
65 | #include <linux/syscalls.h> | 60 | #include <linux/syscalls.h> |
66 | #include <linux/console.h> | ||
67 | #include <linux/highmem.h> | 61 | #include <linux/highmem.h> |
68 | #include <linux/bio.h> | 62 | #include <linux/bio.h> |
69 | #include <linux/mount.h> | ||
70 | 63 | ||
71 | #include <asm/uaccess.h> | 64 | #include <asm/uaccess.h> |
72 | #include <asm/mmu_context.h> | 65 | #include <asm/mmu_context.h> |
@@ -80,36 +73,31 @@ | |||
80 | 73 | ||
81 | #include "power.h" | 74 | #include "power.h" |
82 | 75 | ||
76 | #ifdef CONFIG_HIGHMEM | ||
77 | int save_highmem(void); | ||
78 | int restore_highmem(void); | ||
79 | #else | ||
80 | static int save_highmem(void) { return 0; } | ||
81 | static int restore_highmem(void) { return 0; } | ||
82 | #endif | ||
83 | |||
83 | #define CIPHER "aes" | 84 | #define CIPHER "aes" |
84 | #define MAXKEY 32 | 85 | #define MAXKEY 32 |
85 | #define MAXIV 32 | 86 | #define MAXIV 32 |
86 | 87 | ||
87 | /* References to section boundaries */ | ||
88 | extern const void __nosave_begin, __nosave_end; | ||
89 | |||
90 | /* Variables to be preserved over suspend */ | ||
91 | static int nr_copy_pages_check; | ||
92 | |||
93 | extern char resume_file[]; | 88 | extern char resume_file[]; |
94 | 89 | ||
95 | /* Local variables that should not be affected by save */ | 90 | /* Local variables that should not be affected by save */ |
96 | static unsigned int nr_copy_pages __nosavedata = 0; | 91 | unsigned int nr_copy_pages __nosavedata = 0; |
97 | 92 | ||
98 | /* Suspend pagedir is allocated before final copy, therefore it | 93 | /* Suspend pagedir is allocated before final copy, therefore it |
99 | must be freed after resume | 94 | must be freed after resume |
100 | 95 | ||
101 | Warning: this is evil. There are actually two pagedirs at time of | ||
102 | resume. One is "pagedir_save", which is empty frame allocated at | ||
103 | time of suspend, that must be freed. Second is "pagedir_nosave", | ||
104 | allocated at time of resume, that travels through memory not to | ||
105 | collide with anything. | ||
106 | |||
107 | Warning: this is even more evil than it seems. Pagedirs this file | 96 | Warning: this is even more evil than it seems. Pagedirs this file |
108 | talks about are completely different from page directories used by | 97 | talks about are completely different from page directories used by |
109 | MMU hardware. | 98 | MMU hardware. |
110 | */ | 99 | */ |
111 | suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; | 100 | suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; |
112 | static suspend_pagedir_t *pagedir_save; | ||
113 | 101 | ||
114 | #define SWSUSP_SIG "S1SUSPEND" | 102 | #define SWSUSP_SIG "S1SUSPEND" |
115 | 103 | ||
@@ -124,12 +112,6 @@ static struct swsusp_header { | |||
124 | static struct swsusp_info swsusp_info; | 112 | static struct swsusp_info swsusp_info; |
125 | 113 | ||
126 | /* | 114 | /* |
127 | * XXX: We try to keep some more pages free so that I/O operations succeed | ||
128 | * without paging. Might this be more? | ||
129 | */ | ||
130 | #define PAGES_FOR_IO 512 | ||
131 | |||
132 | /* | ||
133 | * Saving part... | 115 | * Saving part... |
134 | */ | 116 | */ |
135 | 117 | ||
@@ -141,8 +123,8 @@ static struct swsusp_info swsusp_info; | |||
141 | static unsigned short swapfile_used[MAX_SWAPFILES]; | 123 | static unsigned short swapfile_used[MAX_SWAPFILES]; |
142 | static unsigned short root_swap; | 124 | static unsigned short root_swap; |
143 | 125 | ||
144 | static int write_page(unsigned long addr, swp_entry_t * loc); | 126 | static int write_page(unsigned long addr, swp_entry_t *loc); |
145 | static int bio_read_page(pgoff_t page_off, void * page); | 127 | static int bio_read_page(pgoff_t page_off, void *page); |
146 | 128 | ||
147 | static u8 key_iv[MAXKEY+MAXIV]; | 129 | static u8 key_iv[MAXKEY+MAXIV]; |
148 | 130 | ||
@@ -363,7 +345,7 @@ static void lock_swapdevices(void) | |||
363 | } | 345 | } |
364 | 346 | ||
365 | /** | 347 | /** |
366 | * write_swap_page - Write one page to a fresh swap location. | 348 | * write_page - Write one page to a fresh swap location. |
367 | * @addr: Address we're writing. | 349 | * @addr: Address we're writing. |
368 | * @loc: Place to store the entry we used. | 350 | * @loc: Place to store the entry we used. |
369 | * | 351 | * |
@@ -374,7 +356,7 @@ static void lock_swapdevices(void) | |||
374 | * This is a partial improvement, since we will at least return other | 356 | * This is a partial improvement, since we will at least return other |
375 | * errors, though we need to eventually fix the damn code. | 357 | * errors, though we need to eventually fix the damn code. |
376 | */ | 358 | */ |
377 | static int write_page(unsigned long addr, swp_entry_t * loc) | 359 | static int write_page(unsigned long addr, swp_entry_t *loc) |
378 | { | 360 | { |
379 | swp_entry_t entry; | 361 | swp_entry_t entry; |
380 | int error = 0; | 362 | int error = 0; |
@@ -402,15 +384,14 @@ static int write_page(unsigned long addr, swp_entry_t * loc) | |||
402 | static void data_free(void) | 384 | static void data_free(void) |
403 | { | 385 | { |
404 | swp_entry_t entry; | 386 | swp_entry_t entry; |
405 | int i; | 387 | struct pbe *p; |
406 | 388 | ||
407 | for (i = 0; i < nr_copy_pages; i++) { | 389 | for_each_pbe (p, pagedir_nosave) { |
408 | entry = (pagedir_nosave + i)->swap_address; | 390 | entry = p->swap_address; |
409 | if (entry.val) | 391 | if (entry.val) |
410 | swap_free(entry); | 392 | swap_free(entry); |
411 | else | 393 | else |
412 | break; | 394 | break; |
413 | (pagedir_nosave + i)->swap_address = (swp_entry_t){0}; | ||
414 | } | 395 | } |
415 | } | 396 | } |
416 | 397 | ||
@@ -512,8 +493,8 @@ static void free_pagedir_entries(void) | |||
512 | static int write_pagedir(void) | 493 | static int write_pagedir(void) |
513 | { | 494 | { |
514 | int error = 0; | 495 | int error = 0; |
515 | unsigned n = 0; | 496 | unsigned int n = 0; |
516 | struct pbe * pbe; | 497 | struct pbe *pbe; |
517 | 498 | ||
518 | printk( "Writing pagedir..."); | 499 | printk( "Writing pagedir..."); |
519 | for_each_pb_page (pbe, pagedir_nosave) { | 500 | for_each_pb_page (pbe, pagedir_nosave) { |
@@ -527,6 +508,26 @@ static int write_pagedir(void) | |||
527 | } | 508 | } |
528 | 509 | ||
529 | /** | 510 | /** |
511 | * enough_swap - Make sure we have enough swap to save the image. | ||
512 | * | ||
513 | * Returns TRUE or FALSE after checking the total amount of swap | ||
514 | * space available. | ||
515 | * | ||
516 | * FIXME: si_swapinfo(&i) returns all swap devices information. | ||
517 | * We should only consider resume_device. | ||
518 | */ | ||
519 | |||
520 | static int enough_swap(unsigned int nr_pages) | ||
521 | { | ||
522 | struct sysinfo i; | ||
523 | |||
524 | si_swapinfo(&i); | ||
525 | pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); | ||
526 | return i.freeswap > (nr_pages + PAGES_FOR_IO + | ||
527 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | ||
528 | } | ||
529 | |||
530 | /** | ||
530 | * write_suspend_image - Write entire image and metadata. | 531 | * write_suspend_image - Write entire image and metadata. |
531 | * | 532 | * |
532 | */ | 533 | */ |
@@ -534,6 +535,11 @@ static int write_suspend_image(void) | |||
534 | { | 535 | { |
535 | int error; | 536 | int error; |
536 | 537 | ||
538 | if (!enough_swap(nr_copy_pages)) { | ||
539 | printk(KERN_ERR "swsusp: Not enough free swap\n"); | ||
540 | return -ENOSPC; | ||
541 | } | ||
542 | |||
537 | init_header(); | 543 | init_header(); |
538 | if ((error = data_write())) | 544 | if ((error = data_write())) |
539 | goto FreeData; | 545 | goto FreeData; |
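The enough_swap() check added in this hunk requires free swap for three things at once: the image data pages, the pagedir pages that describe them, and a PAGES_FOR_IO reserve so the write-out itself never has to page. A sketch of the same inequality as a standalone helper; free_swap stands in for the freeswap field that si_swapinfo() fills in, and both constants are assumed example values:

#define PAGES_FOR_IO	512	/* assumed; mirrors the old in-file reserve */
#define PBES_PER_PAGE	128	/* assumed value for the example */

/* Non-zero when free_swap pages of swap can hold nr_pages of image
 * data plus the pagedir describing it plus the I/O reserve. */
static int swap_is_enough(unsigned long free_swap, unsigned int nr_pages)
{
	unsigned int pagedir = (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE;

	return free_swap > nr_pages + PAGES_FOR_IO + pagedir;
}

Doing this check up front in write_suspend_image(), before any page is written, lets the failure surface as a clean -ENOSPC instead of a half-written image.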
@@ -553,433 +559,6 @@ static int write_suspend_image(void) | |||
553 | goto Done; | 559 | goto Done; |
554 | } | 560 | } |
555 | 561 | ||
556 | |||
557 | #ifdef CONFIG_HIGHMEM | ||
558 | struct highmem_page { | ||
559 | char *data; | ||
560 | struct page *page; | ||
561 | struct highmem_page *next; | ||
562 | }; | ||
563 | |||
564 | static struct highmem_page *highmem_copy; | ||
565 | |||
566 | static int save_highmem_zone(struct zone *zone) | ||
567 | { | ||
568 | unsigned long zone_pfn; | ||
569 | mark_free_pages(zone); | ||
570 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
571 | struct page *page; | ||
572 | struct highmem_page *save; | ||
573 | void *kaddr; | ||
574 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
575 | |||
576 | if (!(pfn%1000)) | ||
577 | printk("."); | ||
578 | if (!pfn_valid(pfn)) | ||
579 | continue; | ||
580 | page = pfn_to_page(pfn); | ||
581 | /* | ||
582 | * This condition results from rvmalloc() sans vmalloc_32() | ||
583 | * and architectural memory reservations. This should be | ||
584 | * corrected eventually when the cases giving rise to this | ||
585 | * are better understood. | ||
586 | */ | ||
587 | if (PageReserved(page)) { | ||
588 | printk("highmem reserved page?!\n"); | ||
589 | continue; | ||
590 | } | ||
591 | BUG_ON(PageNosave(page)); | ||
592 | if (PageNosaveFree(page)) | ||
593 | continue; | ||
594 | save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); | ||
595 | if (!save) | ||
596 | return -ENOMEM; | ||
597 | save->next = highmem_copy; | ||
598 | save->page = page; | ||
599 | save->data = (void *) get_zeroed_page(GFP_ATOMIC); | ||
600 | if (!save->data) { | ||
601 | kfree(save); | ||
602 | return -ENOMEM; | ||
603 | } | ||
604 | kaddr = kmap_atomic(page, KM_USER0); | ||
605 | memcpy(save->data, kaddr, PAGE_SIZE); | ||
606 | kunmap_atomic(kaddr, KM_USER0); | ||
607 | highmem_copy = save; | ||
608 | } | ||
609 | return 0; | ||
610 | } | ||
611 | #endif /* CONFIG_HIGHMEM */ | ||
612 | |||
613 | |||
614 | static int save_highmem(void) | ||
615 | { | ||
616 | #ifdef CONFIG_HIGHMEM | ||
617 | struct zone *zone; | ||
618 | int res = 0; | ||
619 | |||
620 | pr_debug("swsusp: Saving Highmem\n"); | ||
621 | for_each_zone (zone) { | ||
622 | if (is_highmem(zone)) | ||
623 | res = save_highmem_zone(zone); | ||
624 | if (res) | ||
625 | return res; | ||
626 | } | ||
627 | #endif | ||
628 | return 0; | ||
629 | } | ||
630 | |||
631 | static int restore_highmem(void) | ||
632 | { | ||
633 | #ifdef CONFIG_HIGHMEM | ||
634 | printk("swsusp: Restoring Highmem\n"); | ||
635 | while (highmem_copy) { | ||
636 | struct highmem_page *save = highmem_copy; | ||
637 | void *kaddr; | ||
638 | highmem_copy = save->next; | ||
639 | |||
640 | kaddr = kmap_atomic(save->page, KM_USER0); | ||
641 | memcpy(kaddr, save->data, PAGE_SIZE); | ||
642 | kunmap_atomic(kaddr, KM_USER0); | ||
643 | free_page((long) save->data); | ||
644 | kfree(save); | ||
645 | } | ||
646 | #endif | ||
647 | return 0; | ||
648 | } | ||
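The highmem code removed here (it now sits behind the CONFIG_HIGHMEM prototypes added near the top of the file) cannot address highmem pages directly, so each page is copied through a short-lived atomic mapping. A condensed sketch of that one step; copy_highmem_page() is a hypothetical helper name:

/* Hypothetical helper: copy one highmem page into a lowmem buffer via
 * a temporary kernel mapping, the same pattern save_highmem_zone()
 * uses above and restore_highmem() uses in the other direction. */
static void copy_highmem_page(void *lowmem_buf, struct page *page)
{
	void *kaddr = kmap_atomic(page, KM_USER0);	/* map briefly */

	memcpy(lowmem_buf, kaddr, PAGE_SIZE);
	kunmap_atomic(kaddr, KM_USER0);			/* unmap right away */
}

Atomic kmaps must not sleep between the map and the unmap, which fits the atomic context this code runs in.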
649 | |||
650 | |||
651 | static int pfn_is_nosave(unsigned long pfn) | ||
652 | { | ||
653 | unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; | ||
654 | unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; | ||
655 | return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); | ||
656 | } | ||
657 | |||
658 | /** | ||
659 | * saveable - Determine whether a page should be cloned or not. | ||
660 | * @pfn: The page | ||
661 | * | ||
662 | * We save a page if it's Reserved, and not in the range of pages | ||
663 | * statically defined as 'unsaveable', or if it isn't reserved, and | ||
664 | * isn't part of a free chunk of pages. | ||
665 | */ | ||
666 | |||
667 | static int saveable(struct zone * zone, unsigned long * zone_pfn) | ||
668 | { | ||
669 | unsigned long pfn = *zone_pfn + zone->zone_start_pfn; | ||
670 | struct page * page; | ||
671 | |||
672 | if (!pfn_valid(pfn)) | ||
673 | return 0; | ||
674 | |||
675 | page = pfn_to_page(pfn); | ||
676 | BUG_ON(PageReserved(page) && PageNosave(page)); | ||
677 | if (PageNosave(page)) | ||
678 | return 0; | ||
679 | if (PageReserved(page) && pfn_is_nosave(pfn)) { | ||
680 | pr_debug("[nosave pfn 0x%lx]", pfn); | ||
681 | return 0; | ||
682 | } | ||
683 | if (PageNosaveFree(page)) | ||
684 | return 0; | ||
685 | |||
686 | return 1; | ||
687 | } | ||
688 | |||
689 | static void count_data_pages(void) | ||
690 | { | ||
691 | struct zone *zone; | ||
692 | unsigned long zone_pfn; | ||
693 | |||
694 | nr_copy_pages = 0; | ||
695 | |||
696 | for_each_zone (zone) { | ||
697 | if (is_highmem(zone)) | ||
698 | continue; | ||
699 | mark_free_pages(zone); | ||
700 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
701 | nr_copy_pages += saveable(zone, &zone_pfn); | ||
702 | } | ||
703 | } | ||
704 | |||
705 | |||
706 | static void copy_data_pages(void) | ||
707 | { | ||
708 | struct zone *zone; | ||
709 | unsigned long zone_pfn; | ||
710 | struct pbe * pbe = pagedir_nosave; | ||
711 | |||
712 | pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); | ||
713 | for_each_zone (zone) { | ||
714 | if (is_highmem(zone)) | ||
715 | continue; | ||
716 | mark_free_pages(zone); | ||
717 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
718 | if (saveable(zone, &zone_pfn)) { | ||
719 | struct page * page; | ||
720 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | ||
721 | BUG_ON(!pbe); | ||
722 | pbe->orig_address = (long) page_address(page); | ||
723 | /* copy_page is not usable for copying task structs. */ | ||
724 | memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); | ||
725 | pbe = pbe->next; | ||
726 | } | ||
727 | } | ||
728 | } | ||
729 | BUG_ON(pbe); | ||
730 | } | ||
731 | |||
732 | |||
733 | /** | ||
734 | * calc_nr - Determine the number of pages needed for a pbe list. | ||
735 | */ | ||
736 | |||
737 | static int calc_nr(int nr_copy) | ||
738 | { | ||
739 | return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1); | ||
740 | } | ||
741 | |||
742 | /** | ||
743 | * free_pagedir - free pages allocated with alloc_pagedir() | ||
744 | */ | ||
745 | |||
746 | static inline void free_pagedir(struct pbe *pblist) | ||
747 | { | ||
748 | struct pbe *pbe; | ||
749 | |||
750 | while (pblist) { | ||
751 | pbe = (pblist + PB_PAGE_SKIP)->next; | ||
752 | free_page((unsigned long)pblist); | ||
753 | pblist = pbe; | ||
754 | } | ||
755 | } | ||
756 | |||
757 | /** | ||
758 | * fill_pb_page - Create a list of PBEs on a given memory page | ||
759 | */ | ||
760 | |||
761 | static inline void fill_pb_page(struct pbe *pbpage) | ||
762 | { | ||
763 | struct pbe *p; | ||
764 | |||
765 | p = pbpage; | ||
766 | pbpage += PB_PAGE_SKIP; | ||
767 | do | ||
768 | p->next = p + 1; | ||
769 | while (++p < pbpage); | ||
770 | } | ||
771 | |||
772 | /** | ||
773 | * create_pbe_list - Create a list of PBEs on top of a given chain | ||
774 | * of memory pages allocated with alloc_pagedir() | ||
775 | */ | ||
776 | |||
777 | static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) | ||
778 | { | ||
779 | struct pbe *pbpage, *p; | ||
780 | unsigned num = PBES_PER_PAGE; | ||
781 | |||
782 | for_each_pb_page (pbpage, pblist) { | ||
783 | if (num >= nr_pages) | ||
784 | break; | ||
785 | |||
786 | fill_pb_page(pbpage); | ||
787 | num += PBES_PER_PAGE; | ||
788 | } | ||
789 | if (pbpage) { | ||
790 | for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) | ||
791 | p->next = p + 1; | ||
792 | p->next = NULL; | ||
793 | } | ||
794 | pr_debug("create_pbe_list(): initialized %d PBEs\n", num); | ||
795 | } | ||
796 | |||
797 | /** | ||
798 | * alloc_pagedir - Allocate the page directory. | ||
799 | * | ||
800 | * First, determine exactly how many pages we need and | ||
801 | * allocate them. | ||
802 | * | ||
803 | * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE | ||
804 | * struct pbe elements (pbes) and the last element in the page points | ||
805 | * to the next page. | ||
806 | * | ||
807 | * On each page we set up a list of struct_pbe elements. | ||
808 | */ | ||
809 | |||
810 | static struct pbe * alloc_pagedir(unsigned nr_pages) | ||
811 | { | ||
812 | unsigned num; | ||
813 | struct pbe *pblist, *pbe; | ||
814 | |||
815 | if (!nr_pages) | ||
816 | return NULL; | ||
817 | |||
818 | pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); | ||
819 | pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
820 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; | ||
821 | pbe = pbe->next, num += PBES_PER_PAGE) { | ||
822 | pbe += PB_PAGE_SKIP; | ||
823 | pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
824 | } | ||
825 | if (!pbe) { /* get_zeroed_page() failed */ | ||
826 | free_pagedir(pblist); | ||
827 | pblist = NULL; | ||
828 | } | ||
829 | return pblist; | ||
830 | } | ||
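alloc_pagedir() above builds the pagedir as a chain of whole pages: each page holds PBES_PER_PAGE struct pbe slots, and the slot at offset PB_PAGE_SKIP carries the link to the next page; the for_each_pb_page and for_each_pbe helpers used throughout this file walk exactly that layout. A small sketch of walking the chain by hand; count_pbe_pages() is a hypothetical name:

/* Hypothetical walker: count the pages in a pagedir chain by following
 * the link held in the PB_PAGE_SKIP slot of each page, just as
 * free_pagedir() above does when it tears the chain down. */
static unsigned int count_pbe_pages(struct pbe *pblist)
{
	unsigned int n = 0;

	while (pblist) {
		n++;
		pblist = (pblist + PB_PAGE_SKIP)->next;	/* hop to next page */
	}
	return n;
}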
831 | |||
832 | /** | ||
833 | * free_image_pages - Free pages allocated for snapshot | ||
834 | */ | ||
835 | |||
836 | static void free_image_pages(void) | ||
837 | { | ||
838 | struct pbe * p; | ||
839 | |||
840 | for_each_pbe (p, pagedir_save) { | ||
841 | if (p->address) { | ||
842 | ClearPageNosave(virt_to_page(p->address)); | ||
843 | free_page(p->address); | ||
844 | p->address = 0; | ||
845 | } | ||
846 | } | ||
847 | } | ||
848 | |||
849 | /** | ||
850 | * alloc_image_pages - Allocate pages for the snapshot. | ||
851 | */ | ||
852 | |||
853 | static int alloc_image_pages(void) | ||
854 | { | ||
855 | struct pbe * p; | ||
856 | |||
857 | for_each_pbe (p, pagedir_save) { | ||
858 | p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
859 | if (!p->address) | ||
860 | return -ENOMEM; | ||
861 | SetPageNosave(virt_to_page(p->address)); | ||
862 | } | ||
863 | return 0; | ||
864 | } | ||
865 | |||
866 | void swsusp_free(void) | ||
867 | { | ||
868 | BUG_ON(PageNosave(virt_to_page(pagedir_save))); | ||
869 | BUG_ON(PageNosaveFree(virt_to_page(pagedir_save))); | ||
870 | free_image_pages(); | ||
871 | free_pagedir(pagedir_save); | ||
872 | } | ||
873 | |||
874 | |||
875 | /** | ||
876 | * enough_free_mem - Make sure we enough free memory to snapshot. | ||
877 | * | ||
878 | * Returns TRUE or FALSE after checking the number of available | ||
879 | * free pages. | ||
880 | */ | ||
881 | |||
882 | static int enough_free_mem(void) | ||
883 | { | ||
884 | if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) { | ||
885 | pr_debug("swsusp: Not enough free pages: Have %d\n", | ||
886 | nr_free_pages()); | ||
887 | return 0; | ||
888 | } | ||
889 | return 1; | ||
890 | } | ||
891 | |||
892 | |||
893 | /** | ||
894 | * enough_swap - Make sure we have enough swap to save the image. | ||
895 | * | ||
896 | * Returns TRUE or FALSE after checking the total amount of swap | ||
897 | * space avaiable. | ||
898 | * | ||
899 | * FIXME: si_swapinfo(&i) returns all swap devices information. | ||
900 | * We should only consider resume_device. | ||
901 | */ | ||
902 | |||
903 | static int enough_swap(void) | ||
904 | { | ||
905 | struct sysinfo i; | ||
906 | |||
907 | si_swapinfo(&i); | ||
908 | if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) { | ||
909 | pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap); | ||
910 | return 0; | ||
911 | } | ||
912 | return 1; | ||
913 | } | ||
914 | |||
915 | static int swsusp_alloc(void) | ||
916 | { | ||
917 | int error; | ||
918 | |||
919 | pagedir_nosave = NULL; | ||
920 | nr_copy_pages = calc_nr(nr_copy_pages); | ||
921 | |||
922 | pr_debug("suspend: (pages needed: %d + %d free: %d)\n", | ||
923 | nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); | ||
924 | |||
925 | if (!enough_free_mem()) | ||
926 | return -ENOMEM; | ||
927 | |||
928 | if (!enough_swap()) | ||
929 | return -ENOSPC; | ||
930 | |||
931 | if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { | ||
932 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | ||
933 | return -ENOMEM; | ||
934 | } | ||
935 | create_pbe_list(pagedir_save, nr_copy_pages); | ||
936 | pagedir_nosave = pagedir_save; | ||
937 | if ((error = alloc_image_pages())) { | ||
938 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); | ||
939 | swsusp_free(); | ||
940 | return error; | ||
941 | } | ||
942 | |||
943 | nr_copy_pages_check = nr_copy_pages; | ||
944 | return 0; | ||
945 | } | ||
946 | |||
947 | static int suspend_prepare_image(void) | ||
948 | { | ||
949 | int error; | ||
950 | |||
951 | pr_debug("swsusp: critical section: \n"); | ||
952 | if (save_highmem()) { | ||
953 | printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n"); | ||
954 | restore_highmem(); | ||
955 | return -ENOMEM; | ||
956 | } | ||
957 | |||
958 | drain_local_pages(); | ||
959 | count_data_pages(); | ||
960 | printk("swsusp: Need to copy %u pages\n", nr_copy_pages); | ||
961 | |||
962 | error = swsusp_alloc(); | ||
963 | if (error) | ||
964 | return error; | ||
965 | |||
966 | /* During allocating of suspend pagedir, new cold pages may appear. | ||
967 | * Kill them. | ||
968 | */ | ||
969 | drain_local_pages(); | ||
970 | copy_data_pages(); | ||
971 | |||
972 | /* | ||
973 | * End of critical section. From now on, we can write to memory, | ||
974 | * but we should not touch disk. This specially means we must _not_ | ||
975 | * touch swap space! Except we must write out our image of course. | ||
976 | */ | ||
977 | |||
978 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); | ||
979 | return 0; | ||
980 | } | ||
981 | |||
982 | |||
983 | /* It is important _NOT_ to umount filesystems at this point. We want | 562 | /* It is important _NOT_ to umount filesystems at this point. We want |
984 | * them synced (in case something goes wrong) but we DO not want to mark | 563 | * them synced (in case something goes wrong) but we DO not want to mark |
985 | * filesystem clean: it is not. (And it does not matter, if we resume | 564 | * filesystem clean: it is not. (And it does not matter, if we resume |
@@ -988,28 +567,24 @@ static int suspend_prepare_image(void) | |||
988 | int swsusp_write(void) | 567 | int swsusp_write(void) |
989 | { | 568 | { |
990 | int error; | 569 | int error; |
991 | device_resume(); | 570 | |
571 | if ((error = swsusp_swap_check())) { | ||
572 | printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); | ||
573 | return error; | ||
574 | } | ||
992 | lock_swapdevices(); | 575 | lock_swapdevices(); |
993 | error = write_suspend_image(); | 576 | error = write_suspend_image(); |
994 | /* This will unlock ignored swap devices since writing is finished */ | 577 | /* This will unlock ignored swap devices since writing is finished */ |
995 | lock_swapdevices(); | 578 | lock_swapdevices(); |
996 | return error; | 579 | return error; |
997 | |||
998 | } | 580 | } |
999 | 581 | ||
1000 | 582 | ||
1001 | extern asmlinkage int swsusp_arch_suspend(void); | ||
1002 | extern asmlinkage int swsusp_arch_resume(void); | ||
1003 | |||
1004 | |||
1005 | asmlinkage int swsusp_save(void) | ||
1006 | { | ||
1007 | return suspend_prepare_image(); | ||
1008 | } | ||
1009 | 583 | ||
1010 | int swsusp_suspend(void) | 584 | int swsusp_suspend(void) |
1011 | { | 585 | { |
1012 | int error; | 586 | int error; |
587 | |||
1013 | if ((error = arch_prepare_suspend())) | 588 | if ((error = arch_prepare_suspend())) |
1014 | return error; | 589 | return error; |
1015 | local_irq_disable(); | 590 | local_irq_disable(); |
@@ -1021,15 +596,12 @@ int swsusp_suspend(void) | |||
1021 | */ | 596 | */ |
1022 | if ((error = device_power_down(PMSG_FREEZE))) { | 597 | if ((error = device_power_down(PMSG_FREEZE))) { |
1023 | printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); | 598 | printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); |
1024 | local_irq_enable(); | 599 | goto Enable_irqs; |
1025 | return error; | ||
1026 | } | 600 | } |
1027 | 601 | ||
1028 | if ((error = swsusp_swap_check())) { | 602 | if ((error = save_highmem())) { |
1029 | printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); | 603 | printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); |
1030 | device_power_up(); | 604 | goto Restore_highmem; |
1031 | local_irq_enable(); | ||
1032 | return error; | ||
1033 | } | 605 | } |
1034 | 606 | ||
1035 | save_processor_state(); | 607 | save_processor_state(); |
@@ -1037,9 +609,10 @@ int swsusp_suspend(void) | |||
1037 | printk(KERN_ERR "Error %d suspending\n", error); | 609 | printk(KERN_ERR "Error %d suspending\n", error); |
1038 | /* Restore control flow magically appears here */ | 610 | /* Restore control flow magically appears here */ |
1039 | restore_processor_state(); | 611 | restore_processor_state(); |
1040 | BUG_ON (nr_copy_pages_check != nr_copy_pages); | 612 | Restore_highmem: |
1041 | restore_highmem(); | 613 | restore_highmem(); |
1042 | device_power_up(); | 614 | device_power_up(); |
615 | Enable_irqs: | ||
1043 | local_irq_enable(); | 616 | local_irq_enable(); |
1044 | return error; | 617 | return error; |
1045 | } | 618 | } |
@@ -1057,6 +630,11 @@ int swsusp_resume(void) | |||
1057 | * execution continues at place where swsusp_arch_suspend was called | 630 | * execution continues at place where swsusp_arch_suspend was called |
1058 | */ | 631 | */ |
1059 | BUG_ON(!error); | 632 | BUG_ON(!error); |
633 | /* The only reason why swsusp_arch_resume() can fail is memory being | ||
634 | * very tight, so we have to free it as soon as we can to avoid | ||
635 | * subsequent failures | ||
636 | */ | ||
637 | swsusp_free(); | ||
1060 | restore_processor_state(); | 638 | restore_processor_state(); |
1061 | restore_highmem(); | 639 | restore_highmem(); |
1062 | touch_softlockup_watchdog(); | 640 | touch_softlockup_watchdog(); |
@@ -1066,158 +644,43 @@ int swsusp_resume(void) | |||
1066 | } | 644 | } |
1067 | 645 | ||
1068 | /** | 646 | /** |
1069 | * On resume, for storing the PBE list and the image, | 647 | * mark_unsafe_pages - mark the pages that cannot be used for storing |
1070 | * we can only use memory pages that do not conflict with the pages | 648 | * the image during resume, because they conflict with the pages that |
1071 | * which had been used before suspend. | 649 | * had been used before suspend |
1072 | * | ||
1073 | * We don't know which pages are usable until we allocate them. | ||
1074 | * | ||
1075 | * Allocated but unusable (ie eaten) memory pages are linked together | ||
1076 | * to create a list, so that we can free them easily | ||
1077 | * | ||
1078 | * We could have used a type other than (void *) | ||
1079 | * for this purpose, but ... | ||
1080 | */ | 650 | */ |
1081 | static void **eaten_memory = NULL; | ||
1082 | |||
1083 | static inline void eat_page(void *page) | ||
1084 | { | ||
1085 | void **c; | ||
1086 | 651 | ||
1087 | c = eaten_memory; | 652 | static void mark_unsafe_pages(struct pbe *pblist) |
1088 | eaten_memory = page; | ||
1089 | *eaten_memory = c; | ||
1090 | } | ||
1091 | |||
1092 | static unsigned long get_usable_page(unsigned gfp_mask) | ||
1093 | { | ||
1094 | unsigned long m; | ||
1095 | |||
1096 | m = get_zeroed_page(gfp_mask); | ||
1097 | while (!PageNosaveFree(virt_to_page(m))) { | ||
1098 | eat_page((void *)m); | ||
1099 | m = get_zeroed_page(gfp_mask); | ||
1100 | if (!m) | ||
1101 | break; | ||
1102 | } | ||
1103 | return m; | ||
1104 | } | ||
1105 | |||
1106 | static void free_eaten_memory(void) | ||
1107 | { | ||
1108 | unsigned long m; | ||
1109 | void **c; | ||
1110 | int i = 0; | ||
1111 | |||
1112 | c = eaten_memory; | ||
1113 | while (c) { | ||
1114 | m = (unsigned long)c; | ||
1115 | c = *c; | ||
1116 | free_page(m); | ||
1117 | i++; | ||
1118 | } | ||
1119 | eaten_memory = NULL; | ||
1120 | pr_debug("swsusp: %d unused pages freed\n", i); | ||
1121 | } | ||
1122 | |||
1123 | /** | ||
1124 | * check_pagedir - We ensure here that pages that the PBEs point to | ||
1125 | * won't collide with pages where we're going to restore from the loaded | ||
1126 | * pages later | ||
1127 | */ | ||
1128 | |||
1129 | static int check_pagedir(struct pbe *pblist) | ||
1130 | { | ||
1131 | struct pbe *p; | ||
1132 | |||
1133 | /* This is necessary, so that we can free allocated pages | ||
1134 | * in case of failure | ||
1135 | */ | ||
1136 | for_each_pbe (p, pblist) | ||
1137 | p->address = 0UL; | ||
1138 | |||
1139 | for_each_pbe (p, pblist) { | ||
1140 | p->address = get_usable_page(GFP_ATOMIC); | ||
1141 | if (!p->address) | ||
1142 | return -ENOMEM; | ||
1143 | } | ||
1144 | return 0; | ||
1145 | } | ||
1146 | |||
1147 | /** | ||
1148 | * swsusp_pagedir_relocate - It is possible, that some memory pages | ||
1149 | * occupied by the list of PBEs collide with pages where we're going to | ||
1150 | * restore from the loaded pages later. We relocate them here. | ||
1151 | */ | ||
1152 | |||
1153 | static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | ||
1154 | { | 653 | { |
1155 | struct zone *zone; | 654 | struct zone *zone; |
1156 | unsigned long zone_pfn; | 655 | unsigned long zone_pfn; |
1157 | struct pbe *pbpage, *tail, *p; | 656 | struct pbe *p; |
1158 | void *m; | ||
1159 | int rel = 0, error = 0; | ||
1160 | 657 | ||
1161 | if (!pblist) /* a sanity check */ | 658 | if (!pblist) /* a sanity check */ |
1162 | return NULL; | 659 | return; |
1163 | |||
1164 | pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", | ||
1165 | swsusp_info.pagedir_pages); | ||
1166 | |||
1167 | /* Set page flags */ | ||
1168 | 660 | ||
661 | /* Clear page flags */ | ||
1169 | for_each_zone (zone) { | 662 | for_each_zone (zone) { |
1170 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | 663 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) |
1171 | SetPageNosaveFree(pfn_to_page(zone_pfn + | 664 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) |
665 | ClearPageNosaveFree(pfn_to_page(zone_pfn + | ||
1172 | zone->zone_start_pfn)); | 666 | zone->zone_start_pfn)); |
1173 | } | 667 | } |
1174 | 668 | ||
1175 | /* Clear orig addresses */ | 669 | /* Mark orig addresses */ |
1176 | |||
1177 | for_each_pbe (p, pblist) | 670 | for_each_pbe (p, pblist) |
1178 | ClearPageNosaveFree(virt_to_page(p->orig_address)); | 671 | SetPageNosaveFree(virt_to_page(p->orig_address)); |
1179 | |||
1180 | tail = pblist + PB_PAGE_SKIP; | ||
1181 | |||
1182 | /* Relocate colliding pages */ | ||
1183 | |||
1184 | for_each_pb_page (pbpage, pblist) { | ||
1185 | if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { | ||
1186 | m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); | ||
1187 | if (!m) { | ||
1188 | error = -ENOMEM; | ||
1189 | break; | ||
1190 | } | ||
1191 | memcpy(m, (void *)pbpage, PAGE_SIZE); | ||
1192 | if (pbpage == pblist) | ||
1193 | pblist = (struct pbe *)m; | ||
1194 | else | ||
1195 | tail->next = (struct pbe *)m; | ||
1196 | |||
1197 | eat_page((void *)pbpage); | ||
1198 | pbpage = (struct pbe *)m; | ||
1199 | |||
1200 | /* We have to link the PBEs again */ | ||
1201 | 672 | ||
1202 | for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) | 673 | } |
1203 | if (p->next) /* needed to save the end */ | ||
1204 | p->next = p + 1; | ||
1205 | |||
1206 | rel++; | ||
1207 | } | ||
1208 | tail = pbpage + PB_PAGE_SKIP; | ||
1209 | } | ||
1210 | 674 | ||
1211 | if (error) { | 675 | static void copy_page_backup_list(struct pbe *dst, struct pbe *src) |
1212 | printk("\nswsusp: Out of memory\n\n"); | 676 | { |
1213 | free_pagedir(pblist); | 677 | /* We assume both lists contain the same number of elements */ |
1214 | free_eaten_memory(); | 678 | while (src) { |
1215 | pblist = NULL; | 679 | dst->orig_address = src->orig_address; |
680 | dst->swap_address = src->swap_address; | ||
681 | dst = dst->next; | ||
682 | src = src->next; | ||
1216 | } | 683 | } |
1217 | else | ||
1218 | printk("swsusp: Relocated %d pages\n", rel); | ||
1219 | |||
1220 | return pblist; | ||
1221 | } | 684 | } |
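With mark_unsafe_pages() flagging every page frame that held image data before suspend, the safe_needed allocations on the resume path (alloc_pagedir(..., 1) and alloc_data_pages(..., GFP_ATOMIC, 1)) must return only unflagged pages. The actual implementation lives in snapshot.c and is not part of this hunk; the removed get_usable_page()/eat_page() pair above shows the underlying idea, which the sketch below restates under the new marking convention (PageNosaveFree set means the frame was in use before suspend). get_safe_page() and the scratch list are hypothetical names:

/* Hypothetical sketch of a collision-avoiding allocator: allocate,
 * and if the page was in use before suspend, park it on a scratch
 * list (so it cannot be handed out again) and retry.  The parked
 * pages would be freed afterwards, as free_eaten_memory() did. */
static void *rejected_pages;		/* simple linked scratch list */

static unsigned long get_safe_page(gfp_t gfp_mask)
{
	unsigned long m = get_zeroed_page(gfp_mask);

	while (m && PageNosaveFree(virt_to_page(m))) {
		*(void **)m = rejected_pages;	/* link the rejected page */
		rejected_pages = (void *)m;
		m = get_zeroed_page(gfp_mask);
	}
	return m;
}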
1222 | 685 | ||
1223 | /* | 686 | /* |
@@ -1231,7 +694,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | |||
1231 | 694 | ||
1232 | static atomic_t io_done = ATOMIC_INIT(0); | 695 | static atomic_t io_done = ATOMIC_INIT(0); |
1233 | 696 | ||
1234 | static int end_io(struct bio * bio, unsigned int num, int err) | 697 | static int end_io(struct bio *bio, unsigned int num, int err) |
1235 | { | 698 | { |
1236 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | 699 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) |
1237 | panic("I/O error reading memory image"); | 700 | panic("I/O error reading memory image"); |
@@ -1239,7 +702,7 @@ static int end_io(struct bio * bio, unsigned int num, int err) | |||
1239 | return 0; | 702 | return 0; |
1240 | } | 703 | } |
1241 | 704 | ||
1242 | static struct block_device * resume_bdev; | 705 | static struct block_device *resume_bdev; |
1243 | 706 | ||
1244 | /** | 707 | /** |
1245 | * submit - submit BIO request. | 708 | * submit - submit BIO request. |
@@ -1252,10 +715,10 @@ static struct block_device * resume_bdev; | |||
1252 | * Then submit it and wait. | 715 | * Then submit it and wait. |
1253 | */ | 716 | */ |
1254 | 717 | ||
1255 | static int submit(int rw, pgoff_t page_off, void * page) | 718 | static int submit(int rw, pgoff_t page_off, void *page) |
1256 | { | 719 | { |
1257 | int error = 0; | 720 | int error = 0; |
1258 | struct bio * bio; | 721 | struct bio *bio; |
1259 | 722 | ||
1260 | bio = bio_alloc(GFP_ATOMIC, 1); | 723 | bio = bio_alloc(GFP_ATOMIC, 1); |
1261 | if (!bio) | 724 | if (!bio) |
@@ -1284,12 +747,12 @@ static int submit(int rw, pgoff_t page_off, void * page) | |||
1284 | return error; | 747 | return error; |
1285 | } | 748 | } |
1286 | 749 | ||
1287 | static int bio_read_page(pgoff_t page_off, void * page) | 750 | static int bio_read_page(pgoff_t page_off, void *page) |
1288 | { | 751 | { |
1289 | return submit(READ, page_off, page); | 752 | return submit(READ, page_off, page); |
1290 | } | 753 | } |
1291 | 754 | ||
1292 | static int bio_write_page(pgoff_t page_off, void * page) | 755 | static int bio_write_page(pgoff_t page_off, void *page) |
1293 | { | 756 | { |
1294 | return submit(WRITE, page_off, page); | 757 | return submit(WRITE, page_off, page); |
1295 | } | 758 | } |
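submit(), bio_read_page() and bio_write_page() implement fully synchronous single-page I/O against resume_bdev: build one bio for one page, fire it off, and spin on the io_done flag that end_io() clears. A trimmed sketch of that pattern under the 2.6-era bio API used in this file; the error handling is reduced, and the BIO_RW_SYNC hint is an assumption about how the real submit() asks for immediate dispatch:

/* Sketch only: synchronously transfer one page at page_off (counted
 * in PAGE_SIZE units) to or from resume_bdev.  end_io() is assumed
 * to do atomic_set(&io_done, 0) when the request completes. */
static int submit_one_page(int rw, pgoff_t page_off, void *page)
{
	struct bio *bio = bio_alloc(GFP_ATOMIC, 1);	/* one segment */

	if (!bio)
		return -ENOMEM;
	bio->bi_sector = page_off * (PAGE_SIZE >> 9);	/* 512-byte sectors */
	bio->bi_bdev = resume_bdev;
	bio->bi_end_io = end_io;

	if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) {
		bio_put(bio);
		return -EFAULT;
	}

	atomic_set(&io_done, 1);
	submit_bio(rw | (1 << BIO_RW_SYNC), bio);	/* assumed sync hint */
	while (atomic_read(&io_done))			/* wait for end_io() */
		yield();
	bio_put(bio);
	return 0;
}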
@@ -1299,7 +762,7 @@ static int bio_write_page(pgoff_t page_off, void * page) | |||
1299 | * I really don't think that it's foolproof but more than nothing.. | 762 | * I really don't think that it's foolproof but more than nothing.. |
1300 | */ | 763 | */ |
1301 | 764 | ||
1302 | static const char * sanity_check(void) | 765 | static const char *sanity_check(void) |
1303 | { | 766 | { |
1304 | dump_info(); | 767 | dump_info(); |
1305 | if (swsusp_info.version_code != LINUX_VERSION_CODE) | 768 | if (swsusp_info.version_code != LINUX_VERSION_CODE) |
@@ -1325,7 +788,7 @@ static const char * sanity_check(void) | |||
1325 | 788 | ||
1326 | static int check_header(void) | 789 | static int check_header(void) |
1327 | { | 790 | { |
1328 | const char * reason = NULL; | 791 | const char *reason = NULL; |
1329 | int error; | 792 | int error; |
1330 | 793 | ||
1331 | if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) | 794 | if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) |
@@ -1356,7 +819,7 @@ static int check_sig(void) | |||
1356 | * Reset swap signature now. | 819 | * Reset swap signature now. |
1357 | */ | 820 | */ |
1358 | error = bio_write_page(0, &swsusp_header); | 821 | error = bio_write_page(0, &swsusp_header); |
1359 | } else { | 822 | } else { |
1360 | return -EINVAL; | 823 | return -EINVAL; |
1361 | } | 824 | } |
1362 | if (!error) | 825 | if (!error) |
@@ -1373,7 +836,7 @@ static int check_sig(void) | |||
1373 | 836 | ||
1374 | static int data_read(struct pbe *pblist) | 837 | static int data_read(struct pbe *pblist) |
1375 | { | 838 | { |
1376 | struct pbe * p; | 839 | struct pbe *p; |
1377 | int error = 0; | 840 | int error = 0; |
1378 | int i = 0; | 841 | int i = 0; |
1379 | int mod = swsusp_info.image_pages / 100; | 842 | int mod = swsusp_info.image_pages / 100; |
@@ -1411,7 +874,7 @@ static int data_read(struct pbe *pblist) | |||
1411 | static int read_pagedir(struct pbe *pblist) | 874 | static int read_pagedir(struct pbe *pblist) |
1412 | { | 875 | { |
1413 | struct pbe *pbpage, *p; | 876 | struct pbe *pbpage, *p; |
1414 | unsigned i = 0; | 877 | unsigned int i = 0; |
1415 | int error; | 878 | int error; |
1416 | 879 | ||
1417 | if (!pblist) | 880 | if (!pblist) |
@@ -1433,10 +896,8 @@ static int read_pagedir(struct pbe *pblist) | |||
1433 | break; | 896 | break; |
1434 | } | 897 | } |
1435 | 898 | ||
1436 | if (error) | 899 | if (!error) |
1437 | free_page((unsigned long)pblist); | 900 | BUG_ON(i != swsusp_info.pagedir_pages); |
1438 | |||
1439 | BUG_ON(i != swsusp_info.pagedir_pages); | ||
1440 | 901 | ||
1441 | return error; | 902 | return error; |
1442 | } | 903 | } |
@@ -1460,32 +921,29 @@ static int read_suspend_image(void) | |||
1460 | int error = 0; | 921 | int error = 0; |
1461 | struct pbe *p; | 922 | struct pbe *p; |
1462 | 923 | ||
1463 | if (!(p = alloc_pagedir(nr_copy_pages))) | 924 | if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0))) |
1464 | return -ENOMEM; | 925 | return -ENOMEM; |
1465 | 926 | ||
1466 | if ((error = read_pagedir(p))) | 927 | if ((error = read_pagedir(p))) |
1467 | return error; | 928 | return error; |
1468 | |||
1469 | create_pbe_list(p, nr_copy_pages); | 929 | create_pbe_list(p, nr_copy_pages); |
1470 | 930 | mark_unsafe_pages(p); | |
1471 | if (!(pagedir_nosave = swsusp_pagedir_relocate(p))) | 931 | pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); |
932 | if (pagedir_nosave) { | ||
933 | create_pbe_list(pagedir_nosave, nr_copy_pages); | ||
934 | copy_page_backup_list(pagedir_nosave, p); | ||
935 | } | ||
936 | free_pagedir(p); | ||
937 | if (!pagedir_nosave) | ||
1472 | return -ENOMEM; | 938 | return -ENOMEM; |
1473 | 939 | ||
1474 | /* Allocate memory for the image and read the data from swap */ | 940 | /* Allocate memory for the image and read the data from swap */ |
1475 | 941 | ||
1476 | error = check_pagedir(pagedir_nosave); | 942 | error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1); |
1477 | free_eaten_memory(); | 943 | |
1478 | if (!error) | 944 | if (!error) |
1479 | error = data_read(pagedir_nosave); | 945 | error = data_read(pagedir_nosave); |
1480 | 946 | ||
1481 | if (error) { /* We fail cleanly */ | ||
1482 | for_each_pbe (p, pagedir_nosave) | ||
1483 | if (p->address) { | ||
1484 | free_page(p->address); | ||
1485 | p->address = 0UL; | ||
1486 | } | ||
1487 | free_pagedir(pagedir_nosave); | ||
1488 | } | ||
1489 | return error; | 947 | return error; |
1490 | } | 948 | } |
1491 | 949 | ||
diff --git a/kernel/printk.c b/kernel/printk.c index 4b8f0f9230a4..5287be83e3e7 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -10,7 +10,7 @@ | |||
10 | * elsewhere, in preparation for a serial line console (someday). | 10 | * elsewhere, in preparation for a serial line console (someday). |
11 | * Ted Ts'o, 2/11/93. | 11 | * Ted Ts'o, 2/11/93. |
12 | * Modified for sysctl support, 1/8/97, Chris Horn. | 12 | * Modified for sysctl support, 1/8/97, Chris Horn. |
13 | * Fixed SMP synchronization, 08/08/99, Manfred Spraul | 13 | * Fixed SMP synchronization, 08/08/99, Manfred Spraul |
14 | * manfreds@colorfullife.com | 14 | * manfreds@colorfullife.com |
15 | * Rewrote bits to get rid of console_lock | 15 | * Rewrote bits to get rid of console_lock |
16 | * 01Mar01 Andrew Morton <andrewm@uow.edu.au> | 16 | * 01Mar01 Andrew Morton <andrewm@uow.edu.au> |
@@ -148,7 +148,7 @@ static int __init console_setup(char *str) | |||
148 | if (!strcmp(str, "ttyb")) | 148 | if (!strcmp(str, "ttyb")) |
149 | strcpy(name, "ttyS1"); | 149 | strcpy(name, "ttyS1"); |
150 | #endif | 150 | #endif |
151 | for(s = name; *s; s++) | 151 | for (s = name; *s; s++) |
152 | if ((*s >= '0' && *s <= '9') || *s == ',') | 152 | if ((*s >= '0' && *s <= '9') || *s == ',') |
153 | break; | 153 | break; |
154 | idx = simple_strtoul(s, NULL, 10); | 154 | idx = simple_strtoul(s, NULL, 10); |
@@ -169,11 +169,11 @@ static int __init log_buf_len_setup(char *str) | |||
169 | size = roundup_pow_of_two(size); | 169 | size = roundup_pow_of_two(size); |
170 | if (size > log_buf_len) { | 170 | if (size > log_buf_len) { |
171 | unsigned long start, dest_idx, offset; | 171 | unsigned long start, dest_idx, offset; |
172 | char * new_log_buf; | 172 | char *new_log_buf; |
173 | 173 | ||
174 | new_log_buf = alloc_bootmem(size); | 174 | new_log_buf = alloc_bootmem(size); |
175 | if (!new_log_buf) { | 175 | if (!new_log_buf) { |
176 | printk("log_buf_len: allocation failed\n"); | 176 | printk(KERN_WARNING "log_buf_len: allocation failed\n"); |
177 | goto out; | 177 | goto out; |
178 | } | 178 | } |
179 | 179 | ||
@@ -193,10 +193,9 @@ static int __init log_buf_len_setup(char *str) | |||
193 | log_end -= offset; | 193 | log_end -= offset; |
194 | spin_unlock_irqrestore(&logbuf_lock, flags); | 194 | spin_unlock_irqrestore(&logbuf_lock, flags); |
195 | 195 | ||
196 | printk("log_buf_len: %d\n", log_buf_len); | 196 | printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); |
197 | } | 197 | } |
198 | out: | 198 | out: |
199 | |||
200 | return 1; | 199 | return 1; |
201 | } | 200 | } |
202 | 201 | ||
@@ -217,7 +216,7 @@ __setup("log_buf_len=", log_buf_len_setup); | |||
217 | * 9 -- Return number of unread characters in the log buffer | 216 | * 9 -- Return number of unread characters in the log buffer |
218 | * 10 -- Return size of the log buffer | 217 | * 10 -- Return size of the log buffer |
219 | */ | 218 | */ |
220 | int do_syslog(int type, char __user * buf, int len) | 219 | int do_syslog(int type, char __user *buf, int len) |
221 | { | 220 | { |
222 | unsigned long i, j, limit, count; | 221 | unsigned long i, j, limit, count; |
223 | int do_clear = 0; | 222 | int do_clear = 0; |
@@ -244,7 +243,8 @@ int do_syslog(int type, char __user * buf, int len) | |||
244 | error = -EFAULT; | 243 | error = -EFAULT; |
245 | goto out; | 244 | goto out; |
246 | } | 245 | } |
247 | error = wait_event_interruptible(log_wait, (log_start - log_end)); | 246 | error = wait_event_interruptible(log_wait, |
247 | (log_start - log_end)); | ||
248 | if (error) | 248 | if (error) |
249 | goto out; | 249 | goto out; |
250 | i = 0; | 250 | i = 0; |
@@ -264,7 +264,7 @@ int do_syslog(int type, char __user * buf, int len) | |||
264 | error = i; | 264 | error = i; |
265 | break; | 265 | break; |
266 | case 4: /* Read/clear last kernel messages */ | 266 | case 4: /* Read/clear last kernel messages */ |
267 | do_clear = 1; | 267 | do_clear = 1; |
268 | /* FALL THRU */ | 268 | /* FALL THRU */ |
269 | case 3: /* Read last kernel messages */ | 269 | case 3: /* Read last kernel messages */ |
270 | error = -EINVAL; | 270 | error = -EINVAL; |
@@ -288,11 +288,11 @@ int do_syslog(int type, char __user * buf, int len) | |||
288 | limit = log_end; | 288 | limit = log_end; |
289 | /* | 289 | /* |
290 | * __put_user() could sleep, and while we sleep | 290 | * __put_user() could sleep, and while we sleep |
291 | * printk() could overwrite the messages | 291 | * printk() could overwrite the messages |
292 | * we try to copy to user space. Therefore | 292 | * we try to copy to user space. Therefore |
293 | * the messages are copied in reverse. <manfreds> | 293 | * the messages are copied in reverse. <manfreds> |
294 | */ | 294 | */ |
295 | for(i = 0; i < count && !error; i++) { | 295 | for (i = 0; i < count && !error; i++) { |
296 | j = limit-1-i; | 296 | j = limit-1-i; |
297 | if (j + log_buf_len < log_end) | 297 | if (j + log_buf_len < log_end) |
298 | break; | 298 | break; |
@@ -306,10 +306,10 @@ int do_syslog(int type, char __user * buf, int len) | |||
306 | if (error) | 306 | if (error) |
307 | break; | 307 | break; |
308 | error = i; | 308 | error = i; |
309 | if(i != count) { | 309 | if (i != count) { |
310 | int offset = count-error; | 310 | int offset = count-error; |
311 | /* buffer overflow during copy, correct user buffer. */ | 311 | /* buffer overflow during copy, correct user buffer. */ |
312 | for(i=0;i<error;i++) { | 312 | for (i = 0; i < error; i++) { |
313 | if (__get_user(c,&buf[i+offset]) || | 313 | if (__get_user(c,&buf[i+offset]) || |
314 | __put_user(c,&buf[i])) { | 314 | __put_user(c,&buf[i])) { |
315 | error = -EFAULT; | 315 | error = -EFAULT; |
@@ -351,7 +351,7 @@ out: | |||
351 | return error; | 351 | return error; |
352 | } | 352 | } |
353 | 353 | ||
354 | asmlinkage long sys_syslog(int type, char __user * buf, int len) | 354 | asmlinkage long sys_syslog(int type, char __user *buf, int len) |
355 | { | 355 | { |
356 | return do_syslog(type, buf, len); | 356 | return do_syslog(type, buf, len); |
357 | } | 357 | } |
@@ -404,21 +404,19 @@ static void call_console_drivers(unsigned long start, unsigned long end) | |||
404 | cur_index = start; | 404 | cur_index = start; |
405 | start_print = start; | 405 | start_print = start; |
406 | while (cur_index != end) { | 406 | while (cur_index != end) { |
407 | if ( msg_level < 0 && | 407 | if (msg_level < 0 && ((end - cur_index) > 2) && |
408 | ((end - cur_index) > 2) && | 408 | LOG_BUF(cur_index + 0) == '<' && |
409 | LOG_BUF(cur_index + 0) == '<' && | 409 | LOG_BUF(cur_index + 1) >= '0' && |
410 | LOG_BUF(cur_index + 1) >= '0' && | 410 | LOG_BUF(cur_index + 1) <= '7' && |
411 | LOG_BUF(cur_index + 1) <= '7' && | 411 | LOG_BUF(cur_index + 2) == '>') { |
412 | LOG_BUF(cur_index + 2) == '>') | ||
413 | { | ||
414 | msg_level = LOG_BUF(cur_index + 1) - '0'; | 412 | msg_level = LOG_BUF(cur_index + 1) - '0'; |
415 | cur_index += 3; | 413 | cur_index += 3; |
416 | start_print = cur_index; | 414 | start_print = cur_index; |
417 | } | 415 | } |
418 | while (cur_index != end) { | 416 | while (cur_index != end) { |
419 | char c = LOG_BUF(cur_index); | 417 | char c = LOG_BUF(cur_index); |
420 | cur_index++; | ||
421 | 418 | ||
419 | cur_index++; | ||
422 | if (c == '\n') { | 420 | if (c == '\n') { |
423 | if (msg_level < 0) { | 421 | if (msg_level < 0) { |
424 | /* | 422 | /* |
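The reindented block above is the part of call_console_drivers() that recognises an optional "<N>" loglevel prefix (N between '0' and '7') at the start of a buffered record and strips it before the text reaches the consoles. The same test as a tiny standalone helper with a hypothetical name:

/* Hypothetical helper mirroring the test above: return the loglevel
 * encoded as "<N>" at the start of buf (len bytes long), or -1 if
 * the record carries no such prefix. */
static int parse_loglevel_prefix(const char *buf, int len)
{
	if (len > 2 && buf[0] == '<' &&
	    buf[1] >= '0' && buf[1] <= '7' && buf[2] == '>')
		return buf[1] - '0';
	return -1;
}

The msg_level < 0 guard above ensures the prefix is consumed at most once per record.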
@@ -461,7 +459,7 @@ static void zap_locks(void) | |||
461 | static unsigned long oops_timestamp; | 459 | static unsigned long oops_timestamp; |
462 | 460 | ||
463 | if (time_after_eq(jiffies, oops_timestamp) && | 461 | if (time_after_eq(jiffies, oops_timestamp) && |
464 | !time_after(jiffies, oops_timestamp + 30*HZ)) | 462 | !time_after(jiffies, oops_timestamp + 30 * HZ)) |
465 | return; | 463 | return; |
466 | 464 | ||
467 | oops_timestamp = jiffies; | 465 | oops_timestamp = jiffies; |
@@ -493,9 +491,12 @@ __attribute__((weak)) unsigned long long printk_clock(void) | |||
493 | return sched_clock(); | 491 | return sched_clock(); |
494 | } | 492 | } |
495 | 493 | ||
496 | /* | 494 | /** |
495 | * printk - print a kernel message | ||
496 | * @fmt: format string | ||
497 | * | ||
497 | * This is printk. It can be called from any context. We want it to work. | 498 | * This is printk. It can be called from any context. We want it to work. |
498 | * | 499 | * |
499 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and | 500 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and |
500 | * call the console drivers. If we fail to get the semaphore we place the output | 501 | * call the console drivers. If we fail to get the semaphore we place the output |
501 | * into the log buffer and return. The current holder of the console_sem will | 502 | * into the log buffer and return. The current holder of the console_sem will |
@@ -505,6 +506,9 @@ __attribute__((weak)) unsigned long long printk_clock(void) | |||
505 | * One effect of this deferred printing is that code which calls printk() and | 506 | * One effect of this deferred printing is that code which calls printk() and |
506 | * then changes console_loglevel may break. This is because console_loglevel | 507 | * then changes console_loglevel may break. This is because console_loglevel |
507 | * is inspected when the actual printing occurs. | 508 | * is inspected when the actual printing occurs. |
509 | * | ||
510 | * See also: | ||
511 | * printf(3) | ||
508 | */ | 512 | */ |
509 | 513 | ||
510 | asmlinkage int printk(const char *fmt, ...) | 514 | asmlinkage int printk(const char *fmt, ...) |
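The comment block above describes printk()'s central trick: the text is always appended to the log buffer first, and the caller then only tries to take console_sem; if the semaphore is busy, the current holder prints the new text when it calls release_console_sem(). A rough sketch of that hand-off; emit_to_log_buf() is a placeholder for the locked ring-buffer append, and console_sem is assumed to be the semaphore this file declares:

/* Sketch of the deferred-printing hand-off described above. */
extern struct semaphore console_sem;		/* assumed, declared in printk.c */
static void emit_to_log_buf(const char *text);	/* placeholder: append under
						 * logbuf_lock */

static void printk_like(const char *text)
{
	emit_to_log_buf(text);			/* always buffer first */

	if (!down_trylock(&console_sem)) {
		/* We got the console: release_console_sem() flushes
		 * everything buffered so far to the console drivers. */
		release_console_sem();
	}
	/* else: the current holder of console_sem will notice our text
	 * and flush it when it calls release_console_sem(). */
}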
@@ -639,18 +643,27 @@ EXPORT_SYMBOL(vprintk); | |||
639 | 643 | ||
640 | #else | 644 | #else |
641 | 645 | ||
642 | asmlinkage long sys_syslog(int type, char __user * buf, int len) | 646 | asmlinkage long sys_syslog(int type, char __user *buf, int len) |
643 | { | 647 | { |
644 | return 0; | 648 | return 0; |
645 | } | 649 | } |
646 | 650 | ||
647 | int do_syslog(int type, char __user * buf, int len) { return 0; } | 651 | int do_syslog(int type, char __user *buf, int len) |
648 | static void call_console_drivers(unsigned long start, unsigned long end) {} | 652 | { |
653 | return 0; | ||
654 | } | ||
655 | |||
656 | static void call_console_drivers(unsigned long start, unsigned long end) | ||
657 | { | ||
658 | } | ||
649 | 659 | ||
650 | #endif | 660 | #endif |
651 | 661 | ||
652 | /** | 662 | /** |
653 | * add_preferred_console - add a device to the list of preferred consoles. | 663 | * add_preferred_console - add a device to the list of preferred consoles. |
664 | * @name: device name | ||
665 | * @idx: device index | ||
666 | * @options: options for this console | ||
654 | * | 667 | * |
655 | * The last preferred console added will be used for kernel messages | 668 | * The last preferred console added will be used for kernel messages |
656 | * and stdin/out/err for init. Normally this is used by console_setup | 669 | * and stdin/out/err for init. Normally this is used by console_setup |
@@ -760,7 +773,8 @@ void release_console_sem(void) | |||
760 | } | 773 | } |
761 | EXPORT_SYMBOL(release_console_sem); | 774 | EXPORT_SYMBOL(release_console_sem); |
762 | 775 | ||
763 | /** console_conditional_schedule - yield the CPU if required | 776 | /** |
777 | * console_conditional_schedule - yield the CPU if required | ||
764 | * | 778 | * |
765 | * If the console code is currently allowed to sleep, and | 779 | * If the console code is currently allowed to sleep, and |
766 | * if this CPU should yield the CPU to another task, do | 780 | * if this CPU should yield the CPU to another task, do |
@@ -802,7 +816,6 @@ void console_unblank(void) | |||
802 | c->unblank(); | 816 | c->unblank(); |
803 | release_console_sem(); | 817 | release_console_sem(); |
804 | } | 818 | } |
805 | EXPORT_SYMBOL(console_unblank); | ||
806 | 819 | ||
807 | /* | 820 | /* |
808 | * Return the console tty driver structure and its associated index | 821 | * Return the console tty driver structure and its associated index |
@@ -851,9 +864,9 @@ EXPORT_SYMBOL(console_start); | |||
851 | * print any messages that were printed by the kernel before the | 864 | * print any messages that were printed by the kernel before the |
852 | * console driver was initialized. | 865 | * console driver was initialized. |
853 | */ | 866 | */ |
854 | void register_console(struct console * console) | 867 | void register_console(struct console *console) |
855 | { | 868 | { |
856 | int i; | 869 | int i; |
857 | unsigned long flags; | 870 | unsigned long flags; |
858 | 871 | ||
859 | if (preferred_console < 0) | 872 | if (preferred_console < 0) |
@@ -878,7 +891,8 @@ void register_console(struct console * console) | |||
878 | * See if this console matches one we selected on | 891 | * See if this console matches one we selected on |
879 | * the command line. | 892 | * the command line. |
880 | */ | 893 | */ |
881 | for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { | 894 | for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; |
895 | i++) { | ||
882 | if (strcmp(console_cmdline[i].name, console->name) != 0) | 896 | if (strcmp(console_cmdline[i].name, console->name) != 0) |
883 | continue; | 897 | continue; |
884 | if (console->index >= 0 && | 898 | if (console->index >= 0 && |
@@ -933,26 +947,26 @@ void register_console(struct console * console) | |||
933 | } | 947 | } |
934 | EXPORT_SYMBOL(register_console); | 948 | EXPORT_SYMBOL(register_console); |
935 | 949 | ||
936 | int unregister_console(struct console * console) | 950 | int unregister_console(struct console *console) |
937 | { | 951 | { |
938 | struct console *a,*b; | 952 | struct console *a, *b; |
939 | int res = 1; | 953 | int res = 1; |
940 | 954 | ||
941 | acquire_console_sem(); | 955 | acquire_console_sem(); |
942 | if (console_drivers == console) { | 956 | if (console_drivers == console) { |
943 | console_drivers=console->next; | 957 | console_drivers=console->next; |
944 | res = 0; | 958 | res = 0; |
945 | } else { | 959 | } else if (console_drivers) { |
946 | for (a=console_drivers->next, b=console_drivers ; | 960 | for (a=console_drivers->next, b=console_drivers ; |
947 | a; b=a, a=b->next) { | 961 | a; b=a, a=b->next) { |
948 | if (a == console) { | 962 | if (a == console) { |
949 | b->next = a->next; | 963 | b->next = a->next; |
950 | res = 0; | 964 | res = 0; |
951 | break; | 965 | break; |
952 | } | 966 | } |
953 | } | 967 | } |
954 | } | 968 | } |
955 | 969 | ||
956 | /* If last console is removed, we re-enable picking the first | 970 | /* If last console is removed, we re-enable picking the first |
957 | * one that gets registered. Without that, pmac early boot console | 971 | * one that gets registered. Without that, pmac early boot console |
958 | * would prevent fbcon from taking over. | 972 | * would prevent fbcon from taking over. |
@@ -972,6 +986,8 @@ EXPORT_SYMBOL(unregister_console); | |||
972 | 986 | ||
973 | /** | 987 | /** |
974 | * tty_write_message - write a message to a certain tty, not just the console. | 988 | * tty_write_message - write a message to a certain tty, not just the console. |
989 | * @tty: the destination tty_struct | ||
990 | * @msg: the message to write | ||
975 | * | 991 | * |
976 | * This is used for messages that need to be redirected to a specific tty. | 992 | * This is used for messages that need to be redirected to a specific tty. |
977 | * We don't put it into the syslog queue right now maybe in the future if | 993 | * We don't put it into the syslog queue right now maybe in the future if |
@@ -994,7 +1010,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) | |||
994 | int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | 1010 | int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) |
995 | { | 1011 | { |
996 | static DEFINE_SPINLOCK(ratelimit_lock); | 1012 | static DEFINE_SPINLOCK(ratelimit_lock); |
997 | static unsigned long toks = 10*5*HZ; | 1013 | static unsigned long toks = 10 * 5 * HZ; |
998 | static unsigned long last_msg; | 1014 | static unsigned long last_msg; |
999 | static int missed; | 1015 | static int missed; |
1000 | unsigned long flags; | 1016 | unsigned long flags; |
@@ -1007,6 +1023,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | |||
1007 | toks = ratelimit_burst * ratelimit_jiffies; | 1023 | toks = ratelimit_burst * ratelimit_jiffies; |
1008 | if (toks >= ratelimit_jiffies) { | 1024 | if (toks >= ratelimit_jiffies) { |
1009 | int lost = missed; | 1025 | int lost = missed; |
1026 | |||
1010 | missed = 0; | 1027 | missed = 0; |
1011 | toks -= ratelimit_jiffies; | 1028 | toks -= ratelimit_jiffies; |
1012 | spin_unlock_irqrestore(&ratelimit_lock, flags); | 1029 | spin_unlock_irqrestore(&ratelimit_lock, flags); |
@@ -1021,7 +1038,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | |||
1021 | EXPORT_SYMBOL(__printk_ratelimit); | 1038 | EXPORT_SYMBOL(__printk_ratelimit); |
1022 | 1039 | ||
1023 | /* minimum time in jiffies between messages */ | 1040 | /* minimum time in jiffies between messages */ |
1024 | int printk_ratelimit_jiffies = 5*HZ; | 1041 | int printk_ratelimit_jiffies = 5 * HZ; |
1025 | 1042 | ||
1026 | /* number of messages we send before ratelimiting */ | 1043 | /* number of messages we send before ratelimiting */ |
1027 | int printk_ratelimit_burst = 10; | 1044 | int printk_ratelimit_burst = 10; |
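__printk_ratelimit() above is a token-bucket limiter: credit accumulates with the jiffies elapsed since the last call, is capped at ratelimit_burst * ratelimit_jiffies, and one interval's worth of credit is spent per message allowed. With the defaults shown (5 * HZ and a burst of 10) that means at most ten messages back to back, then one every five seconds. A self-contained sketch of the same bookkeeping with the locking stripped out:

/* Token-bucket sketch mirroring __printk_ratelimit(); "now" plays the
 * role of jiffies. */
struct ratelimit {
	unsigned long interval;	/* ticks required per message */
	unsigned long burst;	/* messages allowed in a burst */
	unsigned long toks;	/* accumulated credit, in ticks */
	unsigned long last;	/* tick count at the previous call */
};

static int ratelimit_ok(struct ratelimit *r, unsigned long now)
{
	r->toks += now - r->last;
	r->last = now;
	if (r->toks > r->burst * r->interval)
		r->toks = r->burst * r->interval;	/* cap the burst */
	if (r->toks >= r->interval) {
		r->toks -= r->interval;			/* spend one message */
		return 1;				/* caller may print */
	}
	return 0;					/* suppressed */
}

In printk.c the bucket starts full (toks is initialised to 10 * 5 * HZ), so the very first burst is never delayed.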
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 019e04ec065a..656476eedb1b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -56,6 +56,10 @@ void ptrace_untrace(task_t *child) | |||
56 | signal_wake_up(child, 1); | 56 | signal_wake_up(child, 1); |
57 | } | 57 | } |
58 | } | 58 | } |
59 | if (child->signal->flags & SIGNAL_GROUP_EXIT) { | ||
60 | sigaddset(&child->pending.signal, SIGKILL); | ||
61 | signal_wake_up(child, 1); | ||
62 | } | ||
59 | spin_unlock(&child->sighand->siglock); | 63 | spin_unlock(&child->sighand->siglock); |
60 | } | 64 | } |
61 | 65 | ||
@@ -77,8 +81,7 @@ void __ptrace_unlink(task_t *child) | |||
77 | SET_LINKS(child); | 81 | SET_LINKS(child); |
78 | } | 82 | } |
79 | 83 | ||
80 | if (child->state == TASK_TRACED) | 84 | ptrace_untrace(child); |
81 | ptrace_untrace(child); | ||
82 | } | 85 | } |
83 | 86 | ||
84 | /* | 87 | /* |
@@ -152,7 +155,7 @@ int ptrace_attach(struct task_struct *task) | |||
152 | retval = -EPERM; | 155 | retval = -EPERM; |
153 | if (task->pid <= 1) | 156 | if (task->pid <= 1) |
154 | goto bad; | 157 | goto bad; |
155 | if (task == current) | 158 | if (task->tgid == current->tgid) |
156 | goto bad; | 159 | goto bad; |
157 | /* the same process cannot be attached many times */ | 160 | /* the same process cannot be attached many times */ |
158 | if (task->ptrace & PT_PTRACED) | 161 | if (task->ptrace & PT_PTRACED) |
@@ -238,7 +241,8 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
238 | if (write) { | 241 | if (write) { |
239 | copy_to_user_page(vma, page, addr, | 242 | copy_to_user_page(vma, page, addr, |
240 | maddr + offset, buf, bytes); | 243 | maddr + offset, buf, bytes); |
241 | set_page_dirty_lock(page); | 244 | if (!PageCompound(page)) |
245 | set_page_dirty_lock(page); | ||
242 | } else { | 246 | } else { |
243 | copy_from_user_page(vma, page, addr, | 247 | copy_from_user_page(vma, page, addr, |
244 | buf, maddr + offset, bytes); | 248 | buf, maddr + offset, bytes); |
@@ -403,3 +407,85 @@ int ptrace_request(struct task_struct *child, long request, | |||
403 | 407 | ||
404 | return ret; | 408 | return ret; |
405 | } | 409 | } |
410 | |||
411 | #ifndef __ARCH_SYS_PTRACE | ||
412 | static int ptrace_get_task_struct(long request, long pid, | ||
413 | struct task_struct **childp) | ||
414 | { | ||
415 | struct task_struct *child; | ||
416 | int ret; | ||
417 | |||
418 | /* | ||
419 | * Callers use child == NULL as an indication to exit early even | ||
420 | * when the return value is 0, so make sure it is non-NULL here. | ||
421 | */ | ||
422 | *childp = NULL; | ||
423 | |||
424 | if (request == PTRACE_TRACEME) { | ||
425 | /* | ||
426 | * Are we already being traced? | ||
427 | */ | ||
428 | if (current->ptrace & PT_PTRACED) | ||
429 | return -EPERM; | ||
430 | ret = security_ptrace(current->parent, current); | ||
431 | if (ret) | ||
432 | return -EPERM; | ||
433 | /* | ||
434 | * Set the ptrace bit in the process ptrace flags. | ||
435 | */ | ||
436 | current->ptrace |= PT_PTRACED; | ||
437 | return 0; | ||
438 | } | ||
439 | |||
440 | /* | ||
441 | * You may not mess with init | ||
442 | */ | ||
443 | if (pid == 1) | ||
444 | return -EPERM; | ||
445 | |||
446 | ret = -ESRCH; | ||
447 | read_lock(&tasklist_lock); | ||
448 | child = find_task_by_pid(pid); | ||
449 | if (child) | ||
450 | get_task_struct(child); | ||
451 | read_unlock(&tasklist_lock); | ||
452 | if (!child) | ||
453 | return -ESRCH; | ||
454 | |||
455 | *childp = child; | ||
456 | return 0; | ||
457 | } | ||
458 | |||
459 | asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | ||
460 | { | ||
461 | struct task_struct *child; | ||
462 | long ret; | ||
463 | |||
464 | /* | ||
465 | * This lock_kernel fixes a subtle race with suid exec | ||
466 | */ | ||
467 | lock_kernel(); | ||
468 | ret = ptrace_get_task_struct(request, pid, &child); | ||
469 | if (!child) | ||
470 | goto out; | ||
471 | |||
472 | if (request == PTRACE_ATTACH) { | ||
473 | ret = ptrace_attach(child); | ||
474 | goto out_put_task_struct; | ||
475 | } | ||
476 | |||
477 | ret = ptrace_check_attach(child, request == PTRACE_KILL); | ||
478 | if (ret < 0) | ||
479 | goto out_put_task_struct; | ||
480 | |||
481 | ret = arch_ptrace(child, request, addr, data); | ||
482 | if (ret < 0) | ||
483 | goto out_put_task_struct; | ||
484 | |||
485 | out_put_task_struct: | ||
486 | put_task_struct(child); | ||
487 | out: | ||
488 | unlock_kernel(); | ||
489 | return ret; | ||
490 | } | ||
491 | #endif /* __ARCH_SYS_PTRACE */ | ||
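The new generic sys_ptrace() above looks up the tracee, handles PTRACE_ATTACH (and PTRACE_TRACEME via ptrace_get_task_struct()) itself, and hands everything else to arch_ptrace() once ptrace_check_attach() succeeds. From user space this path is exercised through the ordinary ptrace(2) syscall; a minimal sketch, independent of this patch and with error handling kept to a minimum:

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
        pid_t pid;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }
        pid = (pid_t)atoi(argv[1]);

        /* PTRACE_ATTACH takes the ptrace_attach() branch of sys_ptrace(). */
        if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) {
                perror("PTRACE_ATTACH");
                return 1;
        }
        waitpid(pid, NULL, 0);        /* the tracee stops before we poke at it */

        /* Subsequent requests pass ptrace_check_attach() and then reach
         * arch_ptrace(); PTRACE_DETACH lets the tracee run again. */
        if (ptrace(PTRACE_DETACH, pid, NULL, NULL) == -1)
                perror("PTRACE_DETACH");
        return 0;
}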
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index bef3b6901b76..c4d159a21e04 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -71,7 +71,7 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | |||
71 | 71 | ||
72 | /* Fake initialization required by compiler */ | 72 | /* Fake initialization required by compiler */ |
73 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | 73 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; |
74 | static int maxbatch = 10; | 74 | static int maxbatch = 10000; |
75 | 75 | ||
76 | #ifndef __HAVE_ARCH_CMPXCHG | 76 | #ifndef __HAVE_ARCH_CMPXCHG |
77 | /* | 77 | /* |
@@ -109,6 +109,10 @@ void fastcall call_rcu(struct rcu_head *head, | |||
109 | rdp = &__get_cpu_var(rcu_data); | 109 | rdp = &__get_cpu_var(rcu_data); |
110 | *rdp->nxttail = head; | 110 | *rdp->nxttail = head; |
111 | rdp->nxttail = &head->next; | 111 | rdp->nxttail = &head->next; |
112 | |||
113 | if (unlikely(++rdp->count > 10000)) | ||
114 | set_need_resched(); | ||
115 | |||
112 | local_irq_restore(flags); | 116 | local_irq_restore(flags); |
113 | } | 117 | } |
114 | 118 | ||
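The new rdp->count bookkeeping above tracks how many callbacks a CPU has queued but not yet invoked, so a flood of call_rcu() calls can nudge the CPU into rescheduling and hence into quiescent states sooner. For reference, the usual call_rcu() pattern that feeds this counter, with struct foo standing in for any RCU-protected object:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        int data;
        struct rcu_head rcu;
};

static void foo_reclaim(struct rcu_head *head)
{
        struct foo *fp = container_of(head, struct foo, rcu);

        kfree(fp);
}

static void foo_release(struct foo *fp)
{
        /* Readers may still hold references under rcu_read_lock(); defer
         * the actual free until a grace period has elapsed. */
        call_rcu(&fp->rcu, foo_reclaim);
}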
@@ -140,10 +144,25 @@ void fastcall call_rcu_bh(struct rcu_head *head, | |||
140 | rdp = &__get_cpu_var(rcu_bh_data); | 144 | rdp = &__get_cpu_var(rcu_bh_data); |
141 | *rdp->nxttail = head; | 145 | *rdp->nxttail = head; |
142 | rdp->nxttail = &head->next; | 146 | rdp->nxttail = &head->next; |
147 | rdp->count++; | ||
148 | /* | ||
149 | * Should we directly call rcu_do_batch() here ? | ||
150 | * if (unlikely(rdp->count > 10000)) | ||
151 | * rcu_do_batch(rdp); | ||
152 | */ | ||
143 | local_irq_restore(flags); | 153 | local_irq_restore(flags); |
144 | } | 154 | } |
145 | 155 | ||
146 | /* | 156 | /* |
157 | * Return the number of RCU batches processed thus far. Useful | ||
158 | * for debug and statistics. | ||
159 | */ | ||
160 | long rcu_batches_completed(void) | ||
161 | { | ||
162 | return rcu_ctrlblk.completed; | ||
163 | } | ||
164 | |||
165 | /* | ||
147 | * Invoke the completed RCU callbacks. They are expected to be in | 166 | * Invoke the completed RCU callbacks. They are expected to be in |
148 | * a per-cpu list. | 167 | * a per-cpu list. |
149 | */ | 168 | */ |
@@ -157,6 +176,7 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
157 | next = rdp->donelist = list->next; | 176 | next = rdp->donelist = list->next; |
158 | list->func(list); | 177 | list->func(list); |
159 | list = next; | 178 | list = next; |
179 | rdp->count--; | ||
160 | if (++count >= maxbatch) | 180 | if (++count >= maxbatch) |
161 | break; | 181 | break; |
162 | } | 182 | } |
@@ -490,6 +510,7 @@ void synchronize_kernel(void) | |||
490 | } | 510 | } |
491 | 511 | ||
492 | module_param(maxbatch, int, 0); | 512 | module_param(maxbatch, int, 0); |
513 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
493 | EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ | 514 | EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ |
494 | EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ | 515 | EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ |
495 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 516 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
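rcu_batches_completed() simply exposes the grace-period counter; rcutorture below uses it to decide whether the writer should bother allocating a new element. A hypothetical in-kernel helper showing the counter's semantics (the helper name is made up, and real code wanting to wait for a grace period would just call synchronize_rcu()):

#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Hypothetical helper: sleep until at least one RCU batch has completed
 * since the snapshot was taken.  Illustration of the counter only. */
static void wait_for_one_rcu_batch(void)
{
        long snap = rcu_batches_completed();

        while (rcu_batches_completed() == snap)
                schedule_timeout_uninterruptible(1);
}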
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c new file mode 100644 index 000000000000..88c28d476550 --- /dev/null +++ b/kernel/rcutorture.c | |||
@@ -0,0 +1,514 @@ | |||
1 | /* | ||
2 | * Read-Copy Update /proc-based torture test facility | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2005 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | ||
21 | * | ||
22 | * See also: Documentation/RCU/torture.txt | ||
23 | */ | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/init.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/kthread.h> | ||
29 | #include <linux/err.h> | ||
30 | #include <linux/spinlock.h> | ||
31 | #include <linux/smp.h> | ||
32 | #include <linux/rcupdate.h> | ||
33 | #include <linux/interrupt.h> | ||
34 | #include <linux/sched.h> | ||
35 | #include <asm/atomic.h> | ||
36 | #include <linux/bitops.h> | ||
37 | #include <linux/module.h> | ||
38 | #include <linux/completion.h> | ||
39 | #include <linux/moduleparam.h> | ||
40 | #include <linux/percpu.h> | ||
41 | #include <linux/notifier.h> | ||
42 | #include <linux/rcuref.h> | ||
43 | #include <linux/cpu.h> | ||
44 | #include <linux/random.h> | ||
45 | #include <linux/delay.h> | ||
46 | #include <linux/byteorder/swabb.h> | ||
47 | #include <linux/stat.h> | ||
48 | |||
49 | MODULE_LICENSE("GPL"); | ||
50 | |||
51 | static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ | ||
52 | static int stat_interval = 0; /* Interval between stats, in seconds. */ | ||
53 | /* Defaults to "only at end of test". */ | ||
54 | static int verbose = 0; /* Print more debug info. */ | ||
55 | |||
56 | MODULE_PARM(nreaders, "i"); | ||
57 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | ||
58 | MODULE_PARM(stat_interval, "i"); | ||
59 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | ||
60 | MODULE_PARM(verbose, "i"); | ||
61 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | ||
62 | #define TORTURE_FLAG "rcutorture: " | ||
63 | #define PRINTK_STRING(s) \ | ||
64 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | ||
65 | #define VERBOSE_PRINTK_STRING(s) \ | ||
66 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | ||
67 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | ||
68 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) | ||
69 | |||
70 | static char printk_buf[4096]; | ||
71 | |||
72 | static int nrealreaders; | ||
73 | static struct task_struct *writer_task; | ||
74 | static struct task_struct **reader_tasks; | ||
75 | static struct task_struct *stats_task; | ||
76 | |||
77 | #define RCU_TORTURE_PIPE_LEN 10 | ||
78 | |||
79 | struct rcu_torture { | ||
80 | struct rcu_head rtort_rcu; | ||
81 | int rtort_pipe_count; | ||
82 | struct list_head rtort_free; | ||
83 | int rtort_mbtest; | ||
84 | }; | ||
85 | |||
86 | static int fullstop = 0; /* stop generating callbacks at test end. */ | ||
87 | static LIST_HEAD(rcu_torture_freelist); | ||
88 | static struct rcu_torture *rcu_torture_current = NULL; | ||
89 | static long rcu_torture_current_version = 0; | ||
90 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | ||
91 | static DEFINE_SPINLOCK(rcu_torture_lock); | ||
92 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = | ||
93 | { 0 }; | ||
94 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = | ||
95 | { 0 }; | ||
96 | static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; | ||
97 | atomic_t n_rcu_torture_alloc; | ||
98 | atomic_t n_rcu_torture_alloc_fail; | ||
99 | atomic_t n_rcu_torture_free; | ||
100 | atomic_t n_rcu_torture_mberror; | ||
101 | atomic_t n_rcu_torture_error; | ||
102 | |||
103 | /* | ||
104 | * Allocate an element from the rcu_tortures pool. | ||
105 | */ | ||
106 | struct rcu_torture * | ||
107 | rcu_torture_alloc(void) | ||
108 | { | ||
109 | struct list_head *p; | ||
110 | |||
111 | spin_lock(&rcu_torture_lock); | ||
112 | if (list_empty(&rcu_torture_freelist)) { | ||
113 | atomic_inc(&n_rcu_torture_alloc_fail); | ||
114 | spin_unlock(&rcu_torture_lock); | ||
115 | return NULL; | ||
116 | } | ||
117 | atomic_inc(&n_rcu_torture_alloc); | ||
118 | p = rcu_torture_freelist.next; | ||
119 | list_del_init(p); | ||
120 | spin_unlock(&rcu_torture_lock); | ||
121 | return container_of(p, struct rcu_torture, rtort_free); | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Free an element to the rcu_tortures pool. | ||
126 | */ | ||
127 | static void | ||
128 | rcu_torture_free(struct rcu_torture *p) | ||
129 | { | ||
130 | atomic_inc(&n_rcu_torture_free); | ||
131 | spin_lock(&rcu_torture_lock); | ||
132 | list_add_tail(&p->rtort_free, &rcu_torture_freelist); | ||
133 | spin_unlock(&rcu_torture_lock); | ||
134 | } | ||
135 | |||
136 | static void | ||
137 | rcu_torture_cb(struct rcu_head *p) | ||
138 | { | ||
139 | int i; | ||
140 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
141 | |||
142 | if (fullstop) { | ||
143 | /* Test is ending, just drop callbacks on the floor. */ | ||
144 | /* The next initialization will pick up the pieces. */ | ||
145 | return; | ||
146 | } | ||
147 | i = rp->rtort_pipe_count; | ||
148 | if (i > RCU_TORTURE_PIPE_LEN) | ||
149 | i = RCU_TORTURE_PIPE_LEN; | ||
150 | atomic_inc(&rcu_torture_wcount[i]); | ||
151 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
152 | rp->rtort_mbtest = 0; | ||
153 | rcu_torture_free(rp); | ||
154 | } else | ||
155 | call_rcu(p, rcu_torture_cb); | ||
156 | } | ||
157 | |||
158 | struct rcu_random_state { | ||
159 | unsigned long rrs_state; | ||
160 | unsigned long rrs_count; | ||
161 | }; | ||
162 | |||
163 | #define RCU_RANDOM_MULT 39916801 /* prime */ | ||
164 | #define RCU_RANDOM_ADD 479001701 /* prime */ | ||
165 | #define RCU_RANDOM_REFRESH 10000 | ||
166 | |||
167 | #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } | ||
168 | |||
169 | /* | ||
170 | * Crude but fast random-number generator. Uses a linear congruential | ||
171 | * generator, with occasional help from get_random_bytes(). | ||
172 | */ | ||
173 | static long | ||
174 | rcu_random(struct rcu_random_state *rrsp) | ||
175 | { | ||
176 | long refresh; | ||
177 | |||
178 | if (--rrsp->rrs_count < 0) { | ||
179 | get_random_bytes(&refresh, sizeof(refresh)); | ||
180 | rrsp->rrs_state += refresh; | ||
181 | rrsp->rrs_count = RCU_RANDOM_REFRESH; | ||
182 | } | ||
183 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; | ||
184 | return swahw32(rrsp->rrs_state); | ||
185 | } | ||
186 | |||
187 | /* | ||
188 | * RCU torture writer kthread. Repeatedly substitutes a new structure | ||
189 | * for that pointed to by rcu_torture_current, freeing the old structure | ||
190 | * after a series of grace periods (the "pipeline"). | ||
191 | */ | ||
192 | static int | ||
193 | rcu_torture_writer(void *arg) | ||
194 | { | ||
195 | int i; | ||
196 | long oldbatch = rcu_batches_completed(); | ||
197 | struct rcu_torture *rp; | ||
198 | struct rcu_torture *old_rp; | ||
199 | static DEFINE_RCU_RANDOM(rand); | ||
200 | |||
201 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); | ||
202 | set_user_nice(current, 19); | ||
203 | |||
204 | do { | ||
205 | schedule_timeout_uninterruptible(1); | ||
206 | if (rcu_batches_completed() == oldbatch) | ||
207 | continue; | ||
208 | if ((rp = rcu_torture_alloc()) == NULL) | ||
209 | continue; | ||
210 | rp->rtort_pipe_count = 0; | ||
211 | udelay(rcu_random(&rand) & 0x3ff); | ||
212 | old_rp = rcu_torture_current; | ||
213 | rp->rtort_mbtest = 1; | ||
214 | rcu_assign_pointer(rcu_torture_current, rp); | ||
215 | smp_wmb(); | ||
216 | if (old_rp != NULL) { | ||
217 | i = old_rp->rtort_pipe_count; | ||
218 | if (i > RCU_TORTURE_PIPE_LEN) | ||
219 | i = RCU_TORTURE_PIPE_LEN; | ||
220 | atomic_inc(&rcu_torture_wcount[i]); | ||
221 | old_rp->rtort_pipe_count++; | ||
222 | call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); | ||
223 | } | ||
224 | rcu_torture_current_version++; | ||
225 | oldbatch = rcu_batches_completed(); | ||
226 | } while (!kthread_should_stop() && !fullstop); | ||
227 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | ||
228 | while (!kthread_should_stop()) | ||
229 | schedule_timeout_uninterruptible(1); | ||
230 | return 0; | ||
231 | } | ||
232 | |||
233 | /* | ||
234 | * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, | ||
235 | * incrementing the corresponding element of the pipeline array. The | ||
236 | * counter in the element should never be greater than 1, otherwise, the | ||
237 | * RCU implementation is broken. | ||
238 | */ | ||
239 | static int | ||
240 | rcu_torture_reader(void *arg) | ||
241 | { | ||
242 | int completed; | ||
243 | DEFINE_RCU_RANDOM(rand); | ||
244 | struct rcu_torture *p; | ||
245 | int pipe_count; | ||
246 | |||
247 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); | ||
248 | set_user_nice(current, 19); | ||
249 | |||
250 | do { | ||
251 | rcu_read_lock(); | ||
252 | completed = rcu_batches_completed(); | ||
253 | p = rcu_dereference(rcu_torture_current); | ||
254 | if (p == NULL) { | ||
255 | /* Wait for rcu_torture_writer to get underway */ | ||
256 | rcu_read_unlock(); | ||
257 | schedule_timeout_interruptible(HZ); | ||
258 | continue; | ||
259 | } | ||
260 | if (p->rtort_mbtest == 0) | ||
261 | atomic_inc(&n_rcu_torture_mberror); | ||
262 | udelay(rcu_random(&rand) & 0x7f); | ||
263 | preempt_disable(); | ||
264 | pipe_count = p->rtort_pipe_count; | ||
265 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { | ||
266 | /* Should not happen, but... */ | ||
267 | pipe_count = RCU_TORTURE_PIPE_LEN; | ||
268 | } | ||
269 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; | ||
270 | completed = rcu_batches_completed() - completed; | ||
271 | if (completed > RCU_TORTURE_PIPE_LEN) { | ||
272 | /* Should not happen, but... */ | ||
273 | completed = RCU_TORTURE_PIPE_LEN; | ||
274 | } | ||
275 | ++__get_cpu_var(rcu_torture_batch)[completed]; | ||
276 | preempt_enable(); | ||
277 | rcu_read_unlock(); | ||
278 | schedule(); | ||
279 | } while (!kthread_should_stop() && !fullstop); | ||
280 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | ||
281 | while (!kthread_should_stop()) | ||
282 | schedule_timeout_uninterruptible(1); | ||
283 | return 0; | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * Create an RCU-torture statistics message in the specified buffer. | ||
288 | */ | ||
289 | static int | ||
290 | rcu_torture_printk(char *page) | ||
291 | { | ||
292 | int cnt = 0; | ||
293 | int cpu; | ||
294 | int i; | ||
295 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | ||
296 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | ||
297 | |||
298 | for_each_cpu(cpu) { | ||
299 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
300 | pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; | ||
301 | batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; | ||
302 | } | ||
303 | } | ||
304 | for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { | ||
305 | if (pipesummary[i] != 0) | ||
306 | break; | ||
307 | } | ||
308 | cnt += sprintf(&page[cnt], "rcutorture: "); | ||
309 | cnt += sprintf(&page[cnt], | ||
310 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | ||
311 | "rtmbe: %d", | ||
312 | rcu_torture_current, | ||
313 | rcu_torture_current_version, | ||
314 | list_empty(&rcu_torture_freelist), | ||
315 | atomic_read(&n_rcu_torture_alloc), | ||
316 | atomic_read(&n_rcu_torture_alloc_fail), | ||
317 | atomic_read(&n_rcu_torture_free), | ||
318 | atomic_read(&n_rcu_torture_mberror)); | ||
319 | if (atomic_read(&n_rcu_torture_mberror) != 0) | ||
320 | cnt += sprintf(&page[cnt], " !!!"); | ||
321 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | ||
322 | if (i > 1) { | ||
323 | cnt += sprintf(&page[cnt], "!!! "); | ||
324 | atomic_inc(&n_rcu_torture_error); | ||
325 | } | ||
326 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | ||
327 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | ||
328 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | ||
329 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | ||
330 | cnt += sprintf(&page[cnt], "Reader Batch: "); | ||
331 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | ||
332 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | ||
333 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | ||
334 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | ||
335 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
336 | cnt += sprintf(&page[cnt], " %d", | ||
337 | atomic_read(&rcu_torture_wcount[i])); | ||
338 | } | ||
339 | cnt += sprintf(&page[cnt], "\n"); | ||
340 | return cnt; | ||
341 | } | ||
342 | |||
343 | /* | ||
344 | * Print torture statistics. Caller must ensure that there is only | ||
345 | * one call to this function at a given time!!! This is normally | ||
346 | * accomplished by relying on the module system to only have one copy | ||
347 | * of the module loaded, and then by giving the rcu_torture_stats | ||
348 | * kthread full control (or the init/cleanup functions when rcu_torture_stats | ||
349 | * thread is not running). | ||
350 | */ | ||
351 | static void | ||
352 | rcu_torture_stats_print(void) | ||
353 | { | ||
354 | int cnt; | ||
355 | |||
356 | cnt = rcu_torture_printk(printk_buf); | ||
357 | printk(KERN_ALERT "%s", printk_buf); | ||
358 | } | ||
359 | |||
360 | /* | ||
361 | * Periodically prints torture statistics, if periodic statistics printing | ||
362 | * was specified via the stat_interval module parameter. | ||
363 | * | ||
364 | * No need to worry about fullstop here, since this one doesn't reference | ||
365 | * volatile state or register callbacks. | ||
366 | */ | ||
367 | static int | ||
368 | rcu_torture_stats(void *arg) | ||
369 | { | ||
370 | VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); | ||
371 | do { | ||
372 | schedule_timeout_interruptible(stat_interval * HZ); | ||
373 | rcu_torture_stats_print(); | ||
374 | } while (!kthread_should_stop()); | ||
375 | VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); | ||
376 | return 0; | ||
377 | } | ||
378 | |||
379 | static void | ||
380 | rcu_torture_cleanup(void) | ||
381 | { | ||
382 | int i; | ||
383 | |||
384 | fullstop = 1; | ||
385 | if (writer_task != NULL) { | ||
386 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); | ||
387 | kthread_stop(writer_task); | ||
388 | } | ||
389 | writer_task = NULL; | ||
390 | |||
391 | if (reader_tasks != NULL) { | ||
392 | for (i = 0; i < nrealreaders; i++) { | ||
393 | if (reader_tasks[i] != NULL) { | ||
394 | VERBOSE_PRINTK_STRING( | ||
395 | "Stopping rcu_torture_reader task"); | ||
396 | kthread_stop(reader_tasks[i]); | ||
397 | } | ||
398 | reader_tasks[i] = NULL; | ||
399 | } | ||
400 | kfree(reader_tasks); | ||
401 | reader_tasks = NULL; | ||
402 | } | ||
403 | rcu_torture_current = NULL; | ||
404 | |||
405 | if (stats_task != NULL) { | ||
406 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); | ||
407 | kthread_stop(stats_task); | ||
408 | } | ||
409 | stats_task = NULL; | ||
410 | |||
411 | /* Wait for all RCU callbacks to fire. */ | ||
412 | |||
413 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | ||
414 | synchronize_rcu(); | ||
415 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | ||
416 | printk(KERN_ALERT TORTURE_FLAG | ||
417 | "--- End of test: %s\n", | ||
418 | atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE"); | ||
419 | } | ||
420 | |||
421 | static int | ||
422 | rcu_torture_init(void) | ||
423 | { | ||
424 | int i; | ||
425 | int cpu; | ||
426 | int firsterr = 0; | ||
427 | |||
428 | /* Process args and tell the world that the torturer is on the job. */ | ||
429 | |||
430 | if (nreaders >= 0) | ||
431 | nrealreaders = nreaders; | ||
432 | else | ||
433 | nrealreaders = 2 * num_online_cpus(); | ||
434 | printk(KERN_ALERT TORTURE_FLAG | ||
435 | "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", | ||
436 | nrealreaders, stat_interval, verbose); | ||
437 | fullstop = 0; | ||
438 | |||
439 | /* Set up the freelist. */ | ||
440 | |||
441 | INIT_LIST_HEAD(&rcu_torture_freelist); | ||
442 | for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { | ||
443 | rcu_tortures[i].rtort_mbtest = 0; | ||
444 | list_add_tail(&rcu_tortures[i].rtort_free, | ||
445 | &rcu_torture_freelist); | ||
446 | } | ||
447 | |||
448 | /* Initialize the statistics so that each run gets its own numbers. */ | ||
449 | |||
450 | rcu_torture_current = NULL; | ||
451 | rcu_torture_current_version = 0; | ||
452 | atomic_set(&n_rcu_torture_alloc, 0); | ||
453 | atomic_set(&n_rcu_torture_alloc_fail, 0); | ||
454 | atomic_set(&n_rcu_torture_free, 0); | ||
455 | atomic_set(&n_rcu_torture_mberror, 0); | ||
456 | atomic_set(&n_rcu_torture_error, 0); | ||
457 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | ||
458 | atomic_set(&rcu_torture_wcount[i], 0); | ||
459 | for_each_cpu(cpu) { | ||
460 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
461 | per_cpu(rcu_torture_count, cpu)[i] = 0; | ||
462 | per_cpu(rcu_torture_batch, cpu)[i] = 0; | ||
463 | } | ||
464 | } | ||
465 | |||
466 | /* Start up the kthreads. */ | ||
467 | |||
468 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); | ||
469 | writer_task = kthread_run(rcu_torture_writer, NULL, | ||
470 | "rcu_torture_writer"); | ||
471 | if (IS_ERR(writer_task)) { | ||
472 | firsterr = PTR_ERR(writer_task); | ||
473 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); | ||
474 | writer_task = NULL; | ||
475 | goto unwind; | ||
476 | } | ||
477 | reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), | ||
478 | GFP_KERNEL); | ||
479 | if (reader_tasks == NULL) { | ||
480 | VERBOSE_PRINTK_ERRSTRING("out of memory"); | ||
481 | firsterr = -ENOMEM; | ||
482 | goto unwind; | ||
483 | } | ||
484 | for (i = 0; i < nrealreaders; i++) { | ||
485 | VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); | ||
486 | reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, | ||
487 | "rcu_torture_reader"); | ||
488 | if (IS_ERR(reader_tasks[i])) { | ||
489 | firsterr = PTR_ERR(reader_tasks[i]); | ||
490 | VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); | ||
491 | reader_tasks[i] = NULL; | ||
492 | goto unwind; | ||
493 | } | ||
494 | } | ||
495 | if (stat_interval > 0) { | ||
496 | VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); | ||
497 | stats_task = kthread_run(rcu_torture_stats, NULL, | ||
498 | "rcu_torture_stats"); | ||
499 | if (IS_ERR(stats_task)) { | ||
500 | firsterr = PTR_ERR(stats_task); | ||
501 | VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); | ||
502 | stats_task = NULL; | ||
503 | goto unwind; | ||
504 | } | ||
505 | } | ||
506 | return 0; | ||
507 | |||
508 | unwind: | ||
509 | rcu_torture_cleanup(); | ||
510 | return firsterr; | ||
511 | } | ||
512 | |||
513 | module_init(rcu_torture_init); | ||
514 | module_exit(rcu_torture_cleanup); | ||
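rcu_random() above is a plain linear congruential generator that reseeds itself from get_random_bytes() every RCU_RANDOM_REFRESH calls; the swahw32() word swap only shuffles which bits end up low-order for the `& 0x3ff`-style masks in the writer and reader loops. A stand-alone sketch of the same generator, with rand() standing in for get_random_bytes() and the word swap dropped:

#include <stdio.h>
#include <stdlib.h>

#define RCU_RANDOM_MULT     39916801UL    /* prime */
#define RCU_RANDOM_ADD     479001701UL    /* prime */
#define RCU_RANDOM_REFRESH      10000

struct rcu_random_state {
        unsigned long state;
        long count;
};

static unsigned long rcu_random_sketch(struct rcu_random_state *s)
{
        if (--s->count < 0) {
                s->state += (unsigned long)rand();  /* stand-in for get_random_bytes() */
                s->count = RCU_RANDOM_REFRESH;
        }
        s->state = s->state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
        return s->state;
}

int main(void)
{
        struct rcu_random_state rs = { 0, 0 };
        int i;

        for (i = 0; i < 5; i++)
                printf("delay of %lu us\n", rcu_random_sketch(&rs) & 0x3ff);
        return 0;
}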
diff --git a/kernel/sched.c b/kernel/sched.c index 1f31a528fdba..6f46c94cc29e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -206,6 +206,7 @@ struct runqueue { | |||
206 | */ | 206 | */ |
207 | unsigned long nr_running; | 207 | unsigned long nr_running; |
208 | #ifdef CONFIG_SMP | 208 | #ifdef CONFIG_SMP |
209 | unsigned long prio_bias; | ||
209 | unsigned long cpu_load[3]; | 210 | unsigned long cpu_load[3]; |
210 | #endif | 211 | #endif |
211 | unsigned long long nr_switches; | 212 | unsigned long long nr_switches; |
@@ -659,13 +660,68 @@ static int effective_prio(task_t *p) | |||
659 | return prio; | 660 | return prio; |
660 | } | 661 | } |
661 | 662 | ||
663 | #ifdef CONFIG_SMP | ||
664 | static inline void inc_prio_bias(runqueue_t *rq, int prio) | ||
665 | { | ||
666 | rq->prio_bias += MAX_PRIO - prio; | ||
667 | } | ||
668 | |||
669 | static inline void dec_prio_bias(runqueue_t *rq, int prio) | ||
670 | { | ||
671 | rq->prio_bias -= MAX_PRIO - prio; | ||
672 | } | ||
673 | |||
674 | static inline void inc_nr_running(task_t *p, runqueue_t *rq) | ||
675 | { | ||
676 | rq->nr_running++; | ||
677 | if (rt_task(p)) { | ||
678 | if (p != rq->migration_thread) | ||
679 | /* | ||
680 | * The migration thread does the actual balancing. Do | ||
681 | * not bias by its priority as the ultra high priority | ||
682 | * will skew balancing adversely. | ||
683 | */ | ||
684 | inc_prio_bias(rq, p->prio); | ||
685 | } else | ||
686 | inc_prio_bias(rq, p->static_prio); | ||
687 | } | ||
688 | |||
689 | static inline void dec_nr_running(task_t *p, runqueue_t *rq) | ||
690 | { | ||
691 | rq->nr_running--; | ||
692 | if (rt_task(p)) { | ||
693 | if (p != rq->migration_thread) | ||
694 | dec_prio_bias(rq, p->prio); | ||
695 | } else | ||
696 | dec_prio_bias(rq, p->static_prio); | ||
697 | } | ||
698 | #else | ||
699 | static inline void inc_prio_bias(runqueue_t *rq, int prio) | ||
700 | { | ||
701 | } | ||
702 | |||
703 | static inline void dec_prio_bias(runqueue_t *rq, int prio) | ||
704 | { | ||
705 | } | ||
706 | |||
707 | static inline void inc_nr_running(task_t *p, runqueue_t *rq) | ||
708 | { | ||
709 | rq->nr_running++; | ||
710 | } | ||
711 | |||
712 | static inline void dec_nr_running(task_t *p, runqueue_t *rq) | ||
713 | { | ||
714 | rq->nr_running--; | ||
715 | } | ||
716 | #endif | ||
717 | |||
662 | /* | 718 | /* |
663 | * __activate_task - move a task to the runqueue. | 719 | * __activate_task - move a task to the runqueue. |
664 | */ | 720 | */ |
665 | static inline void __activate_task(task_t *p, runqueue_t *rq) | 721 | static inline void __activate_task(task_t *p, runqueue_t *rq) |
666 | { | 722 | { |
667 | enqueue_task(p, rq->active); | 723 | enqueue_task(p, rq->active); |
668 | rq->nr_running++; | 724 | inc_nr_running(p, rq); |
669 | } | 725 | } |
670 | 726 | ||
671 | /* | 727 | /* |
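The inc_prio_bias()/dec_prio_bias() helpers added in the hunk above weight each queued task by MAX_PRIO - prio, so a runqueue full of heavily niced tasks looks lighter to the balancer than one full of nice -20 tasks. With the usual 2.6 values (MAX_PRIO of 140, static priorities 100..139 for nice -20..+19, and the standard NICE_TO_PRIO mapping, reproduced here as an assumption of the sketch), the per-task contributions work out as follows:

#include <stdio.h>

#define MAX_PRIO 140                       /* MAX_RT_PRIO + 40 */
#define NICE_TO_PRIO(nice) (120 + (nice))  /* MAX_RT_PRIO + (nice) + 20 */

int main(void)
{
        int nice;

        for (nice = -20; nice <= 19; nice += 13)
                printf("nice %3d -> static_prio %3d -> prio_bias contribution %3d\n",
                       nice, NICE_TO_PRIO(nice), MAX_PRIO - NICE_TO_PRIO(nice));
        return 0;
}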
@@ -674,7 +730,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) | |||
674 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | 730 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) |
675 | { | 731 | { |
676 | enqueue_task_head(p, rq->active); | 732 | enqueue_task_head(p, rq->active); |
677 | rq->nr_running++; | 733 | inc_nr_running(p, rq); |
678 | } | 734 | } |
679 | 735 | ||
680 | static int recalc_task_prio(task_t *p, unsigned long long now) | 736 | static int recalc_task_prio(task_t *p, unsigned long long now) |
@@ -759,7 +815,8 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
759 | } | 815 | } |
760 | #endif | 816 | #endif |
761 | 817 | ||
762 | p->prio = recalc_task_prio(p, now); | 818 | if (!rt_task(p)) |
819 | p->prio = recalc_task_prio(p, now); | ||
763 | 820 | ||
764 | /* | 821 | /* |
765 | * This checks to make sure it's not an uninterruptible task | 822 | * This checks to make sure it's not an uninterruptible task |
@@ -793,7 +850,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
793 | */ | 850 | */ |
794 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) | 851 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) |
795 | { | 852 | { |
796 | rq->nr_running--; | 853 | dec_nr_running(p, rq); |
797 | dequeue_task(p, p->array); | 854 | dequeue_task(p, p->array); |
798 | p->array = NULL; | 855 | p->array = NULL; |
799 | } | 856 | } |
@@ -808,21 +865,28 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) | |||
808 | #ifdef CONFIG_SMP | 865 | #ifdef CONFIG_SMP |
809 | static void resched_task(task_t *p) | 866 | static void resched_task(task_t *p) |
810 | { | 867 | { |
811 | int need_resched, nrpolling; | 868 | int cpu; |
812 | 869 | ||
813 | assert_spin_locked(&task_rq(p)->lock); | 870 | assert_spin_locked(&task_rq(p)->lock); |
814 | 871 | ||
815 | /* minimise the chance of sending an interrupt to poll_idle() */ | 872 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) |
816 | nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); | 873 | return; |
817 | need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); | 874 | |
818 | nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); | 875 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); |
876 | |||
877 | cpu = task_cpu(p); | ||
878 | if (cpu == smp_processor_id()) | ||
879 | return; | ||
819 | 880 | ||
820 | if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) | 881 | /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ |
821 | smp_send_reschedule(task_cpu(p)); | 882 | smp_mb(); |
883 | if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) | ||
884 | smp_send_reschedule(cpu); | ||
822 | } | 885 | } |
823 | #else | 886 | #else |
824 | static inline void resched_task(task_t *p) | 887 | static inline void resched_task(task_t *p) |
825 | { | 888 | { |
889 | assert_spin_locked(&task_rq(p)->lock); | ||
826 | set_tsk_need_resched(p); | 890 | set_tsk_need_resched(p); |
827 | } | 891 | } |
828 | #endif | 892 | #endif |
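The rewritten resched_task() above depends on ordering: the NEED_RESCHED store must be globally visible before POLLING_NRFLAG is tested, otherwise a polling idle CPU could miss the flag while this CPU skips the IPI. A compact user-space analogue of that store-fence-load shape, written with C11 atomics rather than the kernel's primitives (so the names and fence are only an analogy):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool need_resched;   /* analogue of TIF_NEED_RESCHED */
static atomic_bool polling;        /* analogue of TIF_POLLING_NRFLAG */

/* Return true if a cross-CPU "interrupt" would still be required. */
static bool must_send_ipi(void)
{
        atomic_store_explicit(&need_resched, true, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);   /* plays the role of smp_mb() */
        return !atomic_load_explicit(&polling, memory_order_relaxed);
}

int main(void)
{
        atomic_store(&polling, true);   /* pretend the target CPU is in poll_idle() */
        printf("IPI needed: %s\n", must_send_ipi() ? "yes" : "no");
        return 0;
}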
@@ -930,27 +994,61 @@ void kick_process(task_t *p) | |||
930 | * We want to under-estimate the load of migration sources, to | 994 | * We want to under-estimate the load of migration sources, to |
931 | * balance conservatively. | 995 | * balance conservatively. |
932 | */ | 996 | */ |
933 | static inline unsigned long source_load(int cpu, int type) | 997 | static inline unsigned long __source_load(int cpu, int type, enum idle_type idle) |
934 | { | 998 | { |
935 | runqueue_t *rq = cpu_rq(cpu); | 999 | runqueue_t *rq = cpu_rq(cpu); |
936 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1000 | unsigned long running = rq->nr_running; |
1001 | unsigned long source_load, cpu_load = rq->cpu_load[type-1], | ||
1002 | load_now = running * SCHED_LOAD_SCALE; | ||
1003 | |||
937 | if (type == 0) | 1004 | if (type == 0) |
938 | return load_now; | 1005 | source_load = load_now; |
1006 | else | ||
1007 | source_load = min(cpu_load, load_now); | ||
1008 | |||
1009 | if (running > 1 || (idle == NOT_IDLE && running)) | ||
1010 | /* | ||
1011 | * If we are busy rebalancing the load is biased by | ||
1012 | * priority to create 'nice' support across cpus. When | ||
1013 | * idle rebalancing we should only bias the source_load if | ||
1014 | * there is more than one task running on that queue to | ||
1015 | * prevent idle rebalance from trying to pull tasks from a | ||
1016 | * queue with only one running task. | ||
1017 | */ | ||
1018 | source_load = source_load * rq->prio_bias / running; | ||
1019 | |||
1020 | return source_load; | ||
1021 | } | ||
939 | 1022 | ||
940 | return min(rq->cpu_load[type-1], load_now); | 1023 | static inline unsigned long source_load(int cpu, int type) |
1024 | { | ||
1025 | return __source_load(cpu, type, NOT_IDLE); | ||
941 | } | 1026 | } |
942 | 1027 | ||
943 | /* | 1028 | /* |
944 | * Return a high guess at the load of a migration-target cpu | 1029 | * Return a high guess at the load of a migration-target cpu |
945 | */ | 1030 | */ |
946 | static inline unsigned long target_load(int cpu, int type) | 1031 | static inline unsigned long __target_load(int cpu, int type, enum idle_type idle) |
947 | { | 1032 | { |
948 | runqueue_t *rq = cpu_rq(cpu); | 1033 | runqueue_t *rq = cpu_rq(cpu); |
949 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1034 | unsigned long running = rq->nr_running; |
1035 | unsigned long target_load, cpu_load = rq->cpu_load[type-1], | ||
1036 | load_now = running * SCHED_LOAD_SCALE; | ||
1037 | |||
950 | if (type == 0) | 1038 | if (type == 0) |
951 | return load_now; | 1039 | target_load = load_now; |
1040 | else | ||
1041 | target_load = max(cpu_load, load_now); | ||
1042 | |||
1043 | if (running > 1 || (idle == NOT_IDLE && running)) | ||
1044 | target_load = target_load * rq->prio_bias / running; | ||
1045 | |||
1046 | return target_load; | ||
1047 | } | ||
952 | 1048 | ||
953 | return max(rq->cpu_load[type-1], load_now); | 1049 | static inline unsigned long target_load(int cpu, int type) |
1050 | { | ||
1051 | return __target_load(cpu, type, NOT_IDLE); | ||
954 | } | 1052 | } |
955 | 1053 | ||
956 | /* | 1054 | /* |
@@ -1339,7 +1437,7 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
1339 | #endif | 1437 | #endif |
1340 | #ifdef CONFIG_PREEMPT | 1438 | #ifdef CONFIG_PREEMPT |
1341 | /* Want to start with kernel preemption disabled. */ | 1439 | /* Want to start with kernel preemption disabled. */ |
1342 | p->thread_info->preempt_count = 1; | 1440 | task_thread_info(p)->preempt_count = 1; |
1343 | #endif | 1441 | #endif |
1344 | /* | 1442 | /* |
1345 | * Share the timeslice between parent and child, thus the | 1443 | * Share the timeslice between parent and child, thus the |
@@ -1411,7 +1509,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
1411 | list_add_tail(&p->run_list, ¤t->run_list); | 1509 | list_add_tail(&p->run_list, ¤t->run_list); |
1412 | p->array = current->array; | 1510 | p->array = current->array; |
1413 | p->array->nr_active++; | 1511 | p->array->nr_active++; |
1414 | rq->nr_running++; | 1512 | inc_nr_running(p, rq); |
1415 | } | 1513 | } |
1416 | set_need_resched(); | 1514 | set_need_resched(); |
1417 | } else | 1515 | } else |
@@ -1468,7 +1566,7 @@ void fastcall sched_exit(task_t *p) | |||
1468 | * the sleep_avg of the parent as well. | 1566 | * the sleep_avg of the parent as well. |
1469 | */ | 1567 | */ |
1470 | rq = task_rq_lock(p->parent, &flags); | 1568 | rq = task_rq_lock(p->parent, &flags); |
1471 | if (p->first_time_slice) { | 1569 | if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { |
1472 | p->parent->time_slice += p->time_slice; | 1570 | p->parent->time_slice += p->time_slice; |
1473 | if (unlikely(p->parent->time_slice > task_timeslice(p))) | 1571 | if (unlikely(p->parent->time_slice > task_timeslice(p))) |
1474 | p->parent->time_slice = task_timeslice(p); | 1572 | p->parent->time_slice = task_timeslice(p); |
@@ -1756,9 +1854,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
1756 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | 1854 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) |
1757 | { | 1855 | { |
1758 | dequeue_task(p, src_array); | 1856 | dequeue_task(p, src_array); |
1759 | src_rq->nr_running--; | 1857 | dec_nr_running(p, src_rq); |
1760 | set_task_cpu(p, this_cpu); | 1858 | set_task_cpu(p, this_cpu); |
1761 | this_rq->nr_running++; | 1859 | inc_nr_running(p, this_rq); |
1762 | enqueue_task(p, this_array); | 1860 | enqueue_task(p, this_array); |
1763 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 1861 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) |
1764 | + this_rq->timestamp_last_tick; | 1862 | + this_rq->timestamp_last_tick; |
@@ -1937,9 +2035,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1937 | 2035 | ||
1938 | /* Bias balancing toward cpus of our domain */ | 2036 | /* Bias balancing toward cpus of our domain */ |
1939 | if (local_group) | 2037 | if (local_group) |
1940 | load = target_load(i, load_idx); | 2038 | load = __target_load(i, load_idx, idle); |
1941 | else | 2039 | else |
1942 | load = source_load(i, load_idx); | 2040 | load = __source_load(i, load_idx, idle); |
1943 | 2041 | ||
1944 | avg_load += load; | 2042 | avg_load += load; |
1945 | } | 2043 | } |
@@ -2044,14 +2142,15 @@ out_balanced: | |||
2044 | /* | 2142 | /* |
2045 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2143 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2046 | */ | 2144 | */ |
2047 | static runqueue_t *find_busiest_queue(struct sched_group *group) | 2145 | static runqueue_t *find_busiest_queue(struct sched_group *group, |
2146 | enum idle_type idle) | ||
2048 | { | 2147 | { |
2049 | unsigned long load, max_load = 0; | 2148 | unsigned long load, max_load = 0; |
2050 | runqueue_t *busiest = NULL; | 2149 | runqueue_t *busiest = NULL; |
2051 | int i; | 2150 | int i; |
2052 | 2151 | ||
2053 | for_each_cpu_mask(i, group->cpumask) { | 2152 | for_each_cpu_mask(i, group->cpumask) { |
2054 | load = source_load(i, 0); | 2153 | load = __source_load(i, 0, idle); |
2055 | 2154 | ||
2056 | if (load > max_load) { | 2155 | if (load > max_load) { |
2057 | max_load = load; | 2156 | max_load = load; |
@@ -2095,7 +2194,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2095 | goto out_balanced; | 2194 | goto out_balanced; |
2096 | } | 2195 | } |
2097 | 2196 | ||
2098 | busiest = find_busiest_queue(group); | 2197 | busiest = find_busiest_queue(group, idle); |
2099 | if (!busiest) { | 2198 | if (!busiest) { |
2100 | schedstat_inc(sd, lb_nobusyq[idle]); | 2199 | schedstat_inc(sd, lb_nobusyq[idle]); |
2101 | goto out_balanced; | 2200 | goto out_balanced; |
@@ -2218,7 +2317,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2218 | goto out_balanced; | 2317 | goto out_balanced; |
2219 | } | 2318 | } |
2220 | 2319 | ||
2221 | busiest = find_busiest_queue(group); | 2320 | busiest = find_busiest_queue(group, NEWLY_IDLE); |
2222 | if (!busiest) { | 2321 | if (!busiest) { |
2223 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2322 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
2224 | goto out_balanced; | 2323 | goto out_balanced; |
@@ -2511,8 +2610,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
2511 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 2610 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
2512 | /* Account for system time used */ | 2611 | /* Account for system time used */ |
2513 | acct_update_integrals(p); | 2612 | acct_update_integrals(p); |
2514 | /* Update rss highwater mark */ | ||
2515 | update_mem_hiwater(p); | ||
2516 | } | 2613 | } |
2517 | 2614 | ||
2518 | /* | 2615 | /* |
@@ -3453,8 +3550,10 @@ void set_user_nice(task_t *p, long nice) | |||
3453 | goto out_unlock; | 3550 | goto out_unlock; |
3454 | } | 3551 | } |
3455 | array = p->array; | 3552 | array = p->array; |
3456 | if (array) | 3553 | if (array) { |
3457 | dequeue_task(p, array); | 3554 | dequeue_task(p, array); |
3555 | dec_prio_bias(rq, p->static_prio); | ||
3556 | } | ||
3458 | 3557 | ||
3459 | old_prio = p->prio; | 3558 | old_prio = p->prio; |
3460 | new_prio = NICE_TO_PRIO(nice); | 3559 | new_prio = NICE_TO_PRIO(nice); |
@@ -3464,6 +3563,7 @@ void set_user_nice(task_t *p, long nice) | |||
3464 | 3563 | ||
3465 | if (array) { | 3564 | if (array) { |
3466 | enqueue_task(p, array); | 3565 | enqueue_task(p, array); |
3566 | inc_prio_bias(rq, p->static_prio); | ||
3467 | /* | 3567 | /* |
3468 | * If the task increased its priority or is running and | 3568 | * If the task increased its priority or is running and |
3469 | * lowered its priority, then reschedule its CPU: | 3569 | * lowered its priority, then reschedule its CPU: |
@@ -3565,8 +3665,6 @@ int idle_cpu(int cpu) | |||
3565 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | 3665 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; |
3566 | } | 3666 | } |
3567 | 3667 | ||
3568 | EXPORT_SYMBOL_GPL(idle_cpu); | ||
3569 | |||
3570 | /** | 3668 | /** |
3571 | * idle_task - return the idle task for a given cpu. | 3669 | * idle_task - return the idle task for a given cpu. |
3572 | * @cpu: the processor in question. | 3670 | * @cpu: the processor in question. |
@@ -4229,10 +4327,10 @@ static void show_task(task_t *p) | |||
4229 | #endif | 4327 | #endif |
4230 | #ifdef CONFIG_DEBUG_STACK_USAGE | 4328 | #ifdef CONFIG_DEBUG_STACK_USAGE |
4231 | { | 4329 | { |
4232 | unsigned long *n = (unsigned long *) (p->thread_info+1); | 4330 | unsigned long *n = end_of_stack(p); |
4233 | while (!*n) | 4331 | while (!*n) |
4234 | n++; | 4332 | n++; |
4235 | free = (unsigned long) n - (unsigned long)(p->thread_info+1); | 4333 | free = (unsigned long)n - (unsigned long)end_of_stack(p); |
4236 | } | 4334 | } |
4237 | #endif | 4335 | #endif |
4238 | printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); | 4336 | printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); |
@@ -4312,9 +4410,9 @@ void __devinit init_idle(task_t *idle, int cpu) | |||
4312 | 4410 | ||
4313 | /* Set the preempt count _outside_ the spinlocks! */ | 4411 | /* Set the preempt count _outside_ the spinlocks! */ |
4314 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) | 4412 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) |
4315 | idle->thread_info->preempt_count = (idle->lock_depth >= 0); | 4413 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); |
4316 | #else | 4414 | #else |
4317 | idle->thread_info->preempt_count = 0; | 4415 | task_thread_info(idle)->preempt_count = 0; |
4318 | #endif | 4416 | #endif |
4319 | } | 4417 | } |
4320 | 4418 | ||
@@ -4682,7 +4780,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4682 | #ifdef CONFIG_HOTPLUG_CPU | 4780 | #ifdef CONFIG_HOTPLUG_CPU |
4683 | case CPU_UP_CANCELED: | 4781 | case CPU_UP_CANCELED: |
4684 | /* Unbind it from offline cpu so it can run. Fall thru. */ | 4782 | /* Unbind it from offline cpu so it can run. Fall thru. */ |
4685 | kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id()); | 4783 | kthread_bind(cpu_rq(cpu)->migration_thread, |
4784 | any_online_cpu(cpu_online_map)); | ||
4686 | kthread_stop(cpu_rq(cpu)->migration_thread); | 4785 | kthread_stop(cpu_rq(cpu)->migration_thread); |
4687 | cpu_rq(cpu)->migration_thread = NULL; | 4786 | cpu_rq(cpu)->migration_thread = NULL; |
4688 | break; | 4787 | break; |
diff --git a/kernel/signal.c b/kernel/signal.c index b92c3c9f8b9a..d7611f189ef7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -262,7 +262,7 @@ next_signal(struct sigpending *pending, sigset_t *mask) | |||
262 | return sig; | 262 | return sig; |
263 | } | 263 | } |
264 | 264 | ||
265 | static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags, | 265 | static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, |
266 | int override_rlimit) | 266 | int override_rlimit) |
267 | { | 267 | { |
268 | struct sigqueue *q = NULL; | 268 | struct sigqueue *q = NULL; |
@@ -277,7 +277,6 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __n | |||
277 | } else { | 277 | } else { |
278 | INIT_LIST_HEAD(&q->list); | 278 | INIT_LIST_HEAD(&q->list); |
279 | q->flags = 0; | 279 | q->flags = 0; |
280 | q->lock = NULL; | ||
281 | q->user = get_uid(t->user); | 280 | q->user = get_uid(t->user); |
282 | } | 281 | } |
283 | return(q); | 282 | return(q); |
@@ -397,20 +396,8 @@ void __exit_signal(struct task_struct *tsk) | |||
397 | flush_sigqueue(&tsk->pending); | 396 | flush_sigqueue(&tsk->pending); |
398 | if (sig) { | 397 | if (sig) { |
399 | /* | 398 | /* |
400 | * We are cleaning up the signal_struct here. We delayed | 399 | * We are cleaning up the signal_struct here. |
401 | * calling exit_itimers until after flush_sigqueue, just in | ||
402 | * case our thread-local pending queue contained a queued | ||
403 | * timer signal that would have been cleared in | ||
404 | * exit_itimers. When that called sigqueue_free, it would | ||
405 | * attempt to re-take the tasklist_lock and deadlock. This | ||
406 | * can never happen if we ensure that all queues the | ||
407 | * timer's signal might be queued on have been flushed | ||
408 | * first. The shared_pending queue, and our own pending | ||
409 | * queue are the only queues the timer could be on, since | ||
410 | * there are no other threads left in the group and timer | ||
411 | * signals are constrained to threads inside the group. | ||
412 | */ | 400 | */ |
413 | exit_itimers(sig); | ||
414 | exit_thread_group_keys(sig); | 401 | exit_thread_group_keys(sig); |
415 | kmem_cache_free(signal_cachep, sig); | 402 | kmem_cache_free(signal_cachep, sig); |
416 | } | 403 | } |
@@ -418,6 +405,8 @@ void __exit_signal(struct task_struct *tsk) | |||
418 | 405 | ||
419 | void exit_signal(struct task_struct *tsk) | 406 | void exit_signal(struct task_struct *tsk) |
420 | { | 407 | { |
408 | atomic_dec(&tsk->signal->live); | ||
409 | |||
421 | write_lock_irq(&tasklist_lock); | 410 | write_lock_irq(&tasklist_lock); |
422 | __exit_signal(tsk); | 411 | __exit_signal(tsk); |
423 | write_unlock_irq(&tasklist_lock); | 412 | write_unlock_irq(&tasklist_lock); |
@@ -524,16 +513,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, | |||
524 | { | 513 | { |
525 | int sig = 0; | 514 | int sig = 0; |
526 | 515 | ||
527 | /* SIGKILL must have priority, otherwise it is quite easy | 516 | sig = next_signal(pending, mask); |
528 | * to create an unkillable process, sending sig < SIGKILL | ||
529 | * to self */ | ||
530 | if (unlikely(sigismember(&pending->signal, SIGKILL))) { | ||
531 | if (!sigismember(mask, SIGKILL)) | ||
532 | sig = SIGKILL; | ||
533 | } | ||
534 | |||
535 | if (likely(!sig)) | ||
536 | sig = next_signal(pending, mask); | ||
537 | if (sig) { | 517 | if (sig) { |
538 | if (current->notifier) { | 518 | if (current->notifier) { |
539 | if (sigismember(current->notifier_mask, sig)) { | 519 | if (sigismember(current->notifier_mask, sig)) { |
@@ -578,7 +558,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
578 | * is to alert stop-signal processing code when another | 558 | * is to alert stop-signal processing code when another |
579 | * processor has come along and cleared the flag. | 559 | * processor has come along and cleared the flag. |
580 | */ | 560 | */ |
581 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | 561 | if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) |
562 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | ||
582 | } | 563 | } |
583 | if ( signr && | 564 | if ( signr && |
584 | ((info->si_code & __SI_MASK) == __SI_TIMER) && | 565 | ((info->si_code & __SI_MASK) == __SI_TIMER) && |
@@ -661,8 +642,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
661 | if (!valid_signal(sig)) | 642 | if (!valid_signal(sig)) |
662 | return error; | 643 | return error; |
663 | error = -EPERM; | 644 | error = -EPERM; |
664 | if ((!info || ((unsigned long)info != 1 && | 645 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) |
665 | (unsigned long)info != 2 && SI_FROMUSER(info))) | ||
666 | && ((sig != SIGCONT) || | 646 | && ((sig != SIGCONT) || |
667 | (current->signal->session != t->signal->session)) | 647 | (current->signal->session != t->signal->session)) |
668 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) | 648 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) |
@@ -799,7 +779,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
799 | * fast-pathed signals for kernel-internal things like SIGSTOP | 779 | * fast-pathed signals for kernel-internal things like SIGSTOP |
800 | * or SIGKILL. | 780 | * or SIGKILL. |
801 | */ | 781 | */ |
802 | if ((unsigned long)info == 2) | 782 | if (info == SEND_SIG_FORCED) |
803 | goto out_set; | 783 | goto out_set; |
804 | 784 | ||
805 | /* Real-time signals must be queued if sent by sigqueue, or | 785 | /* Real-time signals must be queued if sent by sigqueue, or |
@@ -811,19 +791,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
811 | pass on the info struct. */ | 791 | pass on the info struct. */ |
812 | 792 | ||
813 | q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && | 793 | q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && |
814 | ((unsigned long) info < 2 || | 794 | (is_si_special(info) || |
815 | info->si_code >= 0))); | 795 | info->si_code >= 0))); |
816 | if (q) { | 796 | if (q) { |
817 | list_add_tail(&q->list, &signals->list); | 797 | list_add_tail(&q->list, &signals->list); |
818 | switch ((unsigned long) info) { | 798 | switch ((unsigned long) info) { |
819 | case 0: | 799 | case (unsigned long) SEND_SIG_NOINFO: |
820 | q->info.si_signo = sig; | 800 | q->info.si_signo = sig; |
821 | q->info.si_errno = 0; | 801 | q->info.si_errno = 0; |
822 | q->info.si_code = SI_USER; | 802 | q->info.si_code = SI_USER; |
823 | q->info.si_pid = current->pid; | 803 | q->info.si_pid = current->pid; |
824 | q->info.si_uid = current->uid; | 804 | q->info.si_uid = current->uid; |
825 | break; | 805 | break; |
826 | case 1: | 806 | case (unsigned long) SEND_SIG_PRIV: |
827 | q->info.si_signo = sig; | 807 | q->info.si_signo = sig; |
828 | q->info.si_errno = 0; | 808 | q->info.si_errno = 0; |
829 | q->info.si_code = SI_KERNEL; | 809 | q->info.si_code = SI_KERNEL; |
@@ -834,20 +814,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
834 | copy_siginfo(&q->info, info); | 814 | copy_siginfo(&q->info, info); |
835 | break; | 815 | break; |
836 | } | 816 | } |
837 | } else { | 817 | } else if (!is_si_special(info)) { |
838 | if (sig >= SIGRTMIN && info && (unsigned long)info != 1 | 818 | if (sig >= SIGRTMIN && info->si_code != SI_USER) |
839 | && info->si_code != SI_USER) | ||
840 | /* | 819 | /* |
841 | * Queue overflow, abort. We may abort if the signal was rt | 820 | * Queue overflow, abort. We may abort if the signal was rt |
842 | * and sent by user using something other than kill(). | 821 | * and sent by user using something other than kill(). |
843 | */ | 822 | */ |
844 | return -EAGAIN; | 823 | return -EAGAIN; |
845 | if (((unsigned long)info > 1) && (info->si_code == SI_TIMER)) | ||
846 | /* | ||
847 | * Set up a return to indicate that we dropped | ||
848 | * the signal. | ||
849 | */ | ||
850 | ret = info->si_sys_private; | ||
851 | } | 824 | } |
852 | 825 | ||
853 | out_set: | 826 | out_set: |
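The SEND_SIG_NOINFO / SEND_SIG_PRIV / SEND_SIG_FORCED names replace the bare 0/1/2 casts that used to be scattered through signal.c. They are small sentinel pointer values defined in the header part of this series (not shown in this diff), so is_si_special() is just a bounds check rather than a real siginfo dereference. The sketch below reproduces them from memory; treat the exact definitions as an assumption and refer to include/linux/sched.h in this series for the authoritative ones.

#include <linux/signal.h>

/* Sketch of the sentinels used above; the real definitions live in the
 * header changes of this patch series and may differ slightly. */
#define SEND_SIG_NOINFO ((struct siginfo *) 0)   /* sent from user space, no info */
#define SEND_SIG_PRIV   ((struct siginfo *) 1)   /* sent by the kernel */
#define SEND_SIG_FORCED ((struct siginfo *) 2)   /* forced, bypasses queueing */

static inline int is_si_special(const struct siginfo *info)
{
        return info <= SEND_SIG_FORCED;
}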
@@ -868,12 +841,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
868 | BUG(); | 841 | BUG(); |
869 | assert_spin_locked(&t->sighand->siglock); | 842 | assert_spin_locked(&t->sighand->siglock); |
870 | 843 | ||
871 | if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) | ||
872 | /* | ||
873 | * Set up a return to indicate that we dropped the signal. | ||
874 | */ | ||
875 | ret = info->si_sys_private; | ||
876 | |||
877 | /* Short-circuit ignored signals. */ | 844 | /* Short-circuit ignored signals. */ |
878 | if (sig_ignored(t, sig)) | 845 | if (sig_ignored(t, sig)) |
879 | goto out; | 846 | goto out; |
@@ -903,11 +870,13 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
903 | int ret; | 870 | int ret; |
904 | 871 | ||
905 | spin_lock_irqsave(&t->sighand->siglock, flags); | 872 | spin_lock_irqsave(&t->sighand->siglock, flags); |
906 | if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { | 873 | if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { |
907 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; | 874 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; |
875 | } | ||
876 | if (sigismember(&t->blocked, sig)) { | ||
908 | sigdelset(&t->blocked, sig); | 877 | sigdelset(&t->blocked, sig); |
909 | recalc_sigpending_tsk(t); | ||
910 | } | 878 | } |
879 | recalc_sigpending_tsk(t); | ||
911 | ret = specific_send_sig_info(sig, info, t); | 880 | ret = specific_send_sig_info(sig, info, t); |
912 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | 881 | spin_unlock_irqrestore(&t->sighand->siglock, flags); |
913 | 882 | ||
@@ -917,15 +886,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
917 | void | 886 | void |
918 | force_sig_specific(int sig, struct task_struct *t) | 887 | force_sig_specific(int sig, struct task_struct *t) |
919 | { | 888 | { |
920 | unsigned long int flags; | 889 | force_sig_info(sig, SEND_SIG_FORCED, t); |
921 | |||
922 | spin_lock_irqsave(&t->sighand->siglock, flags); | ||
923 | if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) | ||
924 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; | ||
925 | sigdelset(&t->blocked, sig); | ||
926 | recalc_sigpending_tsk(t); | ||
927 | specific_send_sig_info(sig, (void *)2, t); | ||
928 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | ||
929 | } | 890 | } |
930 | 891 | ||
931 | /* | 892 | /* |
@@ -936,34 +897,31 @@ force_sig_specific(int sig, struct task_struct *t) | |||
936 | * as soon as they're available, so putting the signal on the shared queue | 897 | * as soon as they're available, so putting the signal on the shared queue |
937 | * will be equivalent to sending it to one such thread. | 898 | * will be equivalent to sending it to one such thread. |
938 | */ | 899 | */ |
939 | #define wants_signal(sig, p, mask) \ | 900 | static inline int wants_signal(int sig, struct task_struct *p) |
940 | (!sigismember(&(p)->blocked, sig) \ | 901 | { |
941 | && !((p)->state & mask) \ | 902 | if (sigismember(&p->blocked, sig)) |
942 | && !((p)->flags & PF_EXITING) \ | 903 | return 0; |
943 | && (task_curr(p) || !signal_pending(p))) | 904 | if (p->flags & PF_EXITING) |
944 | 905 | return 0; | |
906 | if (sig == SIGKILL) | ||
907 | return 1; | ||
908 | if (p->state & (TASK_STOPPED | TASK_TRACED)) | ||
909 | return 0; | ||
910 | return task_curr(p) || !signal_pending(p); | ||
911 | } | ||
945 | 912 | ||
946 | static void | 913 | static void |
947 | __group_complete_signal(int sig, struct task_struct *p) | 914 | __group_complete_signal(int sig, struct task_struct *p) |
948 | { | 915 | { |
949 | unsigned int mask; | ||
950 | struct task_struct *t; | 916 | struct task_struct *t; |
951 | 917 | ||
952 | /* | 918 | /* |
953 | * Don't bother traced and stopped tasks (but | ||
954 | * SIGKILL will punch through that). | ||
955 | */ | ||
956 | mask = TASK_STOPPED | TASK_TRACED; | ||
957 | if (sig == SIGKILL) | ||
958 | mask = 0; | ||
959 | |||
960 | /* | ||
961 | * Now find a thread we can wake up to take the signal off the queue. | 919 | * Now find a thread we can wake up to take the signal off the queue. |
962 | * | 920 | * |
963 | * If the main thread wants the signal, it gets first crack. | 921 | * If the main thread wants the signal, it gets first crack. |
964 | * Probably the least surprising to the average bear. | 922 | * Probably the least surprising to the average bear. |
965 | */ | 923 | */ |
966 | if (wants_signal(sig, p, mask)) | 924 | if (wants_signal(sig, p)) |
967 | t = p; | 925 | t = p; |
968 | else if (thread_group_empty(p)) | 926 | else if (thread_group_empty(p)) |
969 | /* | 927 | /* |
@@ -981,7 +939,7 @@ __group_complete_signal(int sig, struct task_struct *p) | |||
981 | t = p->signal->curr_target = p; | 939 | t = p->signal->curr_target = p; |
982 | BUG_ON(t->tgid != p->tgid); | 940 | BUG_ON(t->tgid != p->tgid); |
983 | 941 | ||
984 | while (!wants_signal(sig, t, mask)) { | 942 | while (!wants_signal(sig, t)) { |
985 | t = next_thread(t); | 943 | t = next_thread(t); |
986 | if (t == p->signal->curr_target) | 944 | if (t == p->signal->curr_target) |
987 | /* | 945 | /* |
@@ -1063,12 +1021,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | |||
1063 | assert_spin_locked(&p->sighand->siglock); | 1021 | assert_spin_locked(&p->sighand->siglock); |
1064 | handle_stop_signal(sig, p); | 1022 | handle_stop_signal(sig, p); |
1065 | 1023 | ||
1066 | if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) | ||
1067 | /* | ||
1068 | * Set up a return to indicate that we dropped the signal. | ||
1069 | */ | ||
1070 | ret = info->si_sys_private; | ||
1071 | |||
1072 | /* Short-circuit ignored signals. */ | 1024 | /* Short-circuit ignored signals. */ |
1073 | if (sig_ignored(p, sig)) | 1025 | if (sig_ignored(p, sig)) |
1074 | return ret; | 1026 | return ret; |
@@ -1121,8 +1073,8 @@ void zap_other_threads(struct task_struct *p) | |||
1121 | if (t != p->group_leader) | 1073 | if (t != p->group_leader) |
1122 | t->exit_signal = -1; | 1074 | t->exit_signal = -1; |
1123 | 1075 | ||
1076 | /* SIGKILL will be handled before any pending SIGSTOP */ | ||
1124 | sigaddset(&t->pending.signal, SIGKILL); | 1077 | sigaddset(&t->pending.signal, SIGKILL); |
1125 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | ||
1126 | signal_wake_up(t, 1); | 1078 | signal_wake_up(t, 1); |
1127 | } | 1079 | } |
1128 | } | 1080 | } |
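
A small user-space illustration (not part of the patch) of the rule the new wants_signal() helper above encodes: a process-directed signal is completed by a thread that does not have it blocked. Main blocks SIGUSR1 before spawning a worker, the worker unblocks it, and kill(getpid(), SIGUSR1) ends up handled in the worker. The sleep() is only a crude way to order the two threads in this sketch; build with cc -pthread.

#define _GNU_SOURCE
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t got_signal;

static void handler(int sig)
{
        (void)sig;
        got_signal = 1;
}

static void *worker(void *arg)
{
        sigset_t set;

        (void)arg;
        sigemptyset(&set);
        sigaddset(&set, SIGUSR1);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);  /* this thread "wants" the signal */

        while (!got_signal)
                pause();
        printf("worker handled SIGUSR1\n");
        return NULL;
}

int main(void)
{
        struct sigaction sa;
        sigset_t set;
        pthread_t t;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = handler;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGUSR1, &sa, NULL);

        sigemptyset(&set);
        sigaddset(&set, SIGUSR1);
        pthread_sigmask(SIG_BLOCK, &set, NULL);    /* main keeps it blocked */

        pthread_create(&t, NULL, worker, NULL);
        sleep(1);                        /* crude ordering, good enough for a demo */
        kill(getpid(), SIGUSR1);         /* process-directed signal */
        pthread_join(t, NULL);
        return 0;
}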
@@ -1195,6 +1147,40 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) | |||
1195 | return error; | 1147 | return error; |
1196 | } | 1148 | } |
1197 | 1149 | ||
1150 | /* like kill_proc_info(), but doesn't use uid/euid of "current" */ | ||
1151 | int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, | ||
1152 | uid_t uid, uid_t euid) | ||
1153 | { | ||
1154 | int ret = -EINVAL; | ||
1155 | struct task_struct *p; | ||
1156 | |||
1157 | if (!valid_signal(sig)) | ||
1158 | return ret; | ||
1159 | |||
1160 | read_lock(&tasklist_lock); | ||
1161 | p = find_task_by_pid(pid); | ||
1162 | if (!p) { | ||
1163 | ret = -ESRCH; | ||
1164 | goto out_unlock; | ||
1165 | } | ||
1166 | if ((!info || ((unsigned long)info != 1 && | ||
1167 | (unsigned long)info != 2 && SI_FROMUSER(info))) | ||
1168 | && (euid != p->suid) && (euid != p->uid) | ||
1169 | && (uid != p->suid) && (uid != p->uid)) { | ||
1170 | ret = -EPERM; | ||
1171 | goto out_unlock; | ||
1172 | } | ||
1173 | if (sig && p->sighand) { | ||
1174 | unsigned long flags; | ||
1175 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1176 | ret = __group_send_sig_info(sig, info, p); | ||
1177 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
1178 | } | ||
1179 | out_unlock: | ||
1180 | read_unlock(&tasklist_lock); | ||
1181 | return ret; | ||
1182 | } | ||
1183 | EXPORT_SYMBOL_GPL(kill_proc_info_as_uid); | ||
1198 | 1184 | ||
1199 | /* | 1185 | /* |
1200 | * kill_something_info() interprets pid in interesting ways just like kill(2). | 1186 | * kill_something_info() interprets pid in interesting ways just like kill(2). |
@@ -1264,10 +1250,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | |||
1264 | return ret; | 1250 | return ret; |
1265 | } | 1251 | } |
1266 | 1252 | ||
1253 | #define __si_special(priv) \ | ||
1254 | ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) | ||
1255 | |||
1267 | int | 1256 | int |
1268 | send_sig(int sig, struct task_struct *p, int priv) | 1257 | send_sig(int sig, struct task_struct *p, int priv) |
1269 | { | 1258 | { |
1270 | return send_sig_info(sig, (void*)(long)(priv != 0), p); | 1259 | return send_sig_info(sig, __si_special(priv), p); |
1271 | } | 1260 | } |
1272 | 1261 | ||
1273 | /* | 1262 | /* |
@@ -1287,7 +1276,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) | |||
1287 | void | 1276 | void |
1288 | force_sig(int sig, struct task_struct *p) | 1277 | force_sig(int sig, struct task_struct *p) |
1289 | { | 1278 | { |
1290 | force_sig_info(sig, (void*)1L, p); | 1279 | force_sig_info(sig, SEND_SIG_PRIV, p); |
1291 | } | 1280 | } |
1292 | 1281 | ||
1293 | /* | 1282 | /* |
@@ -1312,13 +1301,13 @@ force_sigsegv(int sig, struct task_struct *p) | |||
1312 | int | 1301 | int |
1313 | kill_pg(pid_t pgrp, int sig, int priv) | 1302 | kill_pg(pid_t pgrp, int sig, int priv) |
1314 | { | 1303 | { |
1315 | return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); | 1304 | return kill_pg_info(sig, __si_special(priv), pgrp); |
1316 | } | 1305 | } |
1317 | 1306 | ||
1318 | int | 1307 | int |
1319 | kill_proc(pid_t pid, int sig, int priv) | 1308 | kill_proc(pid_t pid, int sig, int priv) |
1320 | { | 1309 | { |
1321 | return kill_proc_info(sig, (void *)(long)(priv != 0), pid); | 1310 | return kill_proc_info(sig, __si_special(priv), pid); |
1322 | } | 1311 | } |
1323 | 1312 | ||
1324 | /* | 1313 | /* |
@@ -1349,11 +1338,12 @@ void sigqueue_free(struct sigqueue *q) | |||
1349 | * pending queue. | 1338 | * pending queue. |
1350 | */ | 1339 | */ |
1351 | if (unlikely(!list_empty(&q->list))) { | 1340 | if (unlikely(!list_empty(&q->list))) { |
1352 | read_lock(&tasklist_lock); | 1341 | spinlock_t *lock = &current->sighand->siglock;
1353 | spin_lock_irqsave(q->lock, flags); | 1342 | read_lock(&tasklist_lock); |
1343 | spin_lock_irqsave(lock, flags); | ||
1354 | if (!list_empty(&q->list)) | 1344 | if (!list_empty(&q->list)) |
1355 | list_del_init(&q->list); | 1345 | list_del_init(&q->list); |
1356 | spin_unlock_irqrestore(q->lock, flags); | 1346 | spin_unlock_irqrestore(lock, flags); |
1357 | read_unlock(&tasklist_lock); | 1347 | read_unlock(&tasklist_lock); |
1358 | } | 1348 | } |
1359 | q->flags &= ~SIGQUEUE_PREALLOC; | 1349 | q->flags &= ~SIGQUEUE_PREALLOC; |
@@ -1392,7 +1382,6 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1392 | goto out; | 1382 | goto out; |
1393 | } | 1383 | } |
1394 | 1384 | ||
1395 | q->lock = &p->sighand->siglock; | ||
1396 | list_add_tail(&q->list, &p->pending.list); | 1385 | list_add_tail(&q->list, &p->pending.list); |
1397 | sigaddset(&p->pending.signal, sig); | 1386 | sigaddset(&p->pending.signal, sig); |
1398 | if (!sigismember(&p->blocked, sig)) | 1387 | if (!sigismember(&p->blocked, sig)) |
@@ -1440,7 +1429,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1440 | * We always use the shared queue for process-wide signals, | 1429 | * We always use the shared queue for process-wide signals, |
1441 | * to avoid several races. | 1430 | * to avoid several races. |
1442 | */ | 1431 | */ |
1443 | q->lock = &p->sighand->siglock; | ||
1444 | list_add_tail(&q->list, &p->signal->shared_pending.list); | 1432 | list_add_tail(&q->list, &p->signal->shared_pending.list); |
1445 | sigaddset(&p->signal->shared_pending.signal, sig); | 1433 | sigaddset(&p->signal->shared_pending.signal, sig); |
1446 | 1434 | ||
@@ -1502,7 +1490,7 @@ void do_notify_parent(struct task_struct *tsk, int sig) | |||
1502 | 1490 | ||
1503 | psig = tsk->parent->sighand; | 1491 | psig = tsk->parent->sighand; |
1504 | spin_lock_irqsave(&psig->siglock, flags); | 1492 | spin_lock_irqsave(&psig->siglock, flags); |
1505 | if (sig == SIGCHLD && | 1493 | if (!tsk->ptrace && sig == SIGCHLD && |
1506 | (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || | 1494 | (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || |
1507 | (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { | 1495 | (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { |
1508 | /* | 1496 | /* |
@@ -1766,7 +1754,8 @@ do_signal_stop(int signr) | |||
1766 | * stop is always done with the siglock held, | 1754 | * stop is always done with the siglock held, |
1767 | * so this check has no races. | 1755 | * so this check has no races. |
1768 | */ | 1756 | */ |
1769 | if (t->state < TASK_STOPPED) { | 1757 | if (!t->exit_state && |
1758 | !(t->state & (TASK_STOPPED|TASK_TRACED))) { | ||
1770 | stop_count++; | 1759 | stop_count++; |
1771 | signal_wake_up(t, 0); | 1760 | signal_wake_up(t, 0); |
1772 | } | 1761 | } |
@@ -1858,9 +1847,9 @@ relock: | |||
1858 | /* Let the debugger run. */ | 1847 | /* Let the debugger run. */ |
1859 | ptrace_stop(signr, signr, info); | 1848 | ptrace_stop(signr, signr, info); |
1860 | 1849 | ||
1861 | /* We're back. Did the debugger cancel the sig? */ | 1850 | /* We're back. Did the debugger cancel the sig or group_exit? */ |
1862 | signr = current->exit_code; | 1851 | signr = current->exit_code; |
1863 | if (signr == 0) | 1852 | if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) |
1864 | continue; | 1853 | continue; |
1865 | 1854 | ||
1866 | current->exit_code = 0; | 1855 | current->exit_code = 0; |
@@ -2262,26 +2251,13 @@ sys_kill(int pid, int sig) | |||
2262 | return kill_something_info(sig, &info, pid); | 2251 | return kill_something_info(sig, &info, pid); |
2263 | } | 2252 | } |
2264 | 2253 | ||
2265 | /** | 2254 | static int do_tkill(int tgid, int pid, int sig) |
2266 | * sys_tgkill - send signal to one specific thread | ||
2267 | * @tgid: the thread group ID of the thread | ||
2268 | * @pid: the PID of the thread | ||
2269 | * @sig: signal to be sent | ||
2270 | * | ||
2271 | * This syscall also checks the tgid and returns -ESRCH even if the PID | ||
2272 | * exists but it's not belonging to the target process anymore. This | ||
2273 | * method solves the problem of threads exiting and PIDs getting reused. | ||
2274 | */ | ||
2275 | asmlinkage long sys_tgkill(int tgid, int pid, int sig) | ||
2276 | { | 2255 | { |
2277 | struct siginfo info; | ||
2278 | int error; | 2256 | int error; |
2257 | struct siginfo info; | ||
2279 | struct task_struct *p; | 2258 | struct task_struct *p; |
2280 | 2259 | ||
2281 | /* This is only valid for single tasks */ | 2260 | error = -ESRCH; |
2282 | if (pid <= 0 || tgid <= 0) | ||
2283 | return -EINVAL; | ||
2284 | |||
2285 | info.si_signo = sig; | 2261 | info.si_signo = sig; |
2286 | info.si_errno = 0; | 2262 | info.si_errno = 0; |
2287 | info.si_code = SI_TKILL; | 2263 | info.si_code = SI_TKILL; |
@@ -2290,8 +2266,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) | |||
2290 | 2266 | ||
2291 | read_lock(&tasklist_lock); | 2267 | read_lock(&tasklist_lock); |
2292 | p = find_task_by_pid(pid); | 2268 | p = find_task_by_pid(pid); |
2293 | error = -ESRCH; | 2269 | if (p && (tgid <= 0 || p->tgid == tgid)) { |
2294 | if (p && (p->tgid == tgid)) { | ||
2295 | error = check_kill_permission(sig, &info, p); | 2270 | error = check_kill_permission(sig, &info, p); |
2296 | /* | 2271 | /* |
2297 | * The null signal is a permissions and process existence | 2272 | * The null signal is a permissions and process existence |
@@ -2305,47 +2280,40 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) | |||
2305 | } | 2280 | } |
2306 | } | 2281 | } |
2307 | read_unlock(&tasklist_lock); | 2282 | read_unlock(&tasklist_lock); |
2283 | |||
2308 | return error; | 2284 | return error; |
2309 | } | 2285 | } |
2310 | 2286 | ||
2287 | /** | ||
2288 | * sys_tgkill - send signal to one specific thread | ||
2289 | * @tgid: the thread group ID of the thread | ||
2290 | * @pid: the PID of the thread | ||
2291 | * @sig: signal to be sent | ||
2292 | * | ||
2293 | * This syscall also checks the tgid and returns -ESRCH even if the PID | ||
2294 | * exists but it's not belonging to the target process anymore. This | ||
2295 | * method solves the problem of threads exiting and PIDs getting reused. | ||
2296 | */ | ||
2297 | asmlinkage long sys_tgkill(int tgid, int pid, int sig) | ||
2298 | { | ||
2299 | /* This is only valid for single tasks */ | ||
2300 | if (pid <= 0 || tgid <= 0) | ||
2301 | return -EINVAL; | ||
2302 | |||
2303 | return do_tkill(tgid, pid, sig); | ||
2304 | } | ||
2305 | |||
2311 | /* | 2306 | /* |
2312 | * Send a signal to only one task, even if it's a CLONE_THREAD task. | 2307 | * Send a signal to only one task, even if it's a CLONE_THREAD task. |
2313 | */ | 2308 | */ |
2314 | asmlinkage long | 2309 | asmlinkage long |
2315 | sys_tkill(int pid, int sig) | 2310 | sys_tkill(int pid, int sig) |
2316 | { | 2311 | { |
2317 | struct siginfo info; | ||
2318 | int error; | ||
2319 | struct task_struct *p; | ||
2320 | |||
2321 | /* This is only valid for single tasks */ | 2312 | /* This is only valid for single tasks */ |
2322 | if (pid <= 0) | 2313 | if (pid <= 0) |
2323 | return -EINVAL; | 2314 | return -EINVAL; |
2324 | 2315 | ||
2325 | info.si_signo = sig; | 2316 | return do_tkill(0, pid, sig); |
2326 | info.si_errno = 0; | ||
2327 | info.si_code = SI_TKILL; | ||
2328 | info.si_pid = current->tgid; | ||
2329 | info.si_uid = current->uid; | ||
2330 | |||
2331 | read_lock(&tasklist_lock); | ||
2332 | p = find_task_by_pid(pid); | ||
2333 | error = -ESRCH; | ||
2334 | if (p) { | ||
2335 | error = check_kill_permission(sig, &info, p); | ||
2336 | /* | ||
2337 | * The null signal is a permissions and process existence | ||
2338 | * probe. No signal is actually delivered. | ||
2339 | */ | ||
2340 | if (!error && sig && p->sighand) { | ||
2341 | spin_lock_irq(&p->sighand->siglock); | ||
2342 | handle_stop_signal(sig, p); | ||
2343 | error = specific_send_sig_info(sig, &info, p); | ||
2344 | spin_unlock_irq(&p->sighand->siglock); | ||
2345 | } | ||
2346 | } | ||
2347 | read_unlock(&tasklist_lock); | ||
2348 | return error; | ||
2349 | } | 2317 | } |
2350 | 2318 | ||
2351 | asmlinkage long | 2319 | asmlinkage long |
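
The do_tkill() consolidation above preserves the behaviour documented in the sys_tgkill() comment: the null signal is purely a permissions and existence probe, and a PID that exists but does not belong to the given thread group yields -ESRCH. A user-space sketch of that check (not part of the patch; the +12345 offset is just an arbitrary bogus tgid chosen for illustration):

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

int main(void)
{
        pid_t tid = syscall(SYS_gettid);
        long ok, bad;

        /* Null signal: permissions/existence probe only, nothing is delivered. */
        ok = syscall(SYS_tgkill, getpid(), tid, 0);
        printf("matching tgid:   %ld\n", ok);

        /* The thread exists, but the tgid does not match, so expect ESRCH. */
        bad = syscall(SYS_tgkill, getpid() + 12345, tid, 0);
        printf("mismatched tgid: %ld (%s)\n", bad, strerror(errno));
        return 0;
}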
diff --git a/kernel/softirq.c b/kernel/softirq.c index f766b2fc48be..ad3295cdded5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -470,7 +470,8 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
470 | #ifdef CONFIG_HOTPLUG_CPU | 470 | #ifdef CONFIG_HOTPLUG_CPU |
471 | case CPU_UP_CANCELED: | 471 | case CPU_UP_CANCELED: |
472 | /* Unbind so it can run. Fall thru. */ | 472 | /* Unbind so it can run. Fall thru. */ |
473 | kthread_bind(per_cpu(ksoftirqd, hotcpu), smp_processor_id()); | 473 | kthread_bind(per_cpu(ksoftirqd, hotcpu), |
474 | any_online_cpu(cpu_online_map)); | ||
474 | case CPU_DEAD: | 475 | case CPU_DEAD: |
475 | p = per_cpu(ksoftirqd, hotcpu); | 476 | p = per_cpu(ksoftirqd, hotcpu); |
476 | per_cpu(ksoftirqd, hotcpu) = NULL; | 477 | per_cpu(ksoftirqd, hotcpu) = NULL; |
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 75976209cea7..c67189a25d52 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -73,9 +73,6 @@ void softlockup_tick(struct pt_regs *regs) | |||
73 | static int watchdog(void * __bind_cpu) | 73 | static int watchdog(void * __bind_cpu) |
74 | { | 74 | { |
75 | struct sched_param param = { .sched_priority = 99 }; | 75 | struct sched_param param = { .sched_priority = 99 }; |
76 | int this_cpu = (long) __bind_cpu; | ||
77 | |||
78 | printk("softlockup thread %d started up.\n", this_cpu); | ||
79 | 76 | ||
80 | sched_setscheduler(current, SCHED_FIFO, &param); | 77 | sched_setscheduler(current, SCHED_FIFO, &param);
81 | current->flags |= PF_NOFREEZE; | 78 | current->flags |= PF_NOFREEZE; |
@@ -123,7 +120,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
123 | #ifdef CONFIG_HOTPLUG_CPU | 120 | #ifdef CONFIG_HOTPLUG_CPU |
124 | case CPU_UP_CANCELED: | 121 | case CPU_UP_CANCELED: |
125 | /* Unbind so it can run. Fall thru. */ | 122 | /* Unbind so it can run. Fall thru. */ |
126 | kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id()); | 123 | kthread_bind(per_cpu(watchdog_task, hotcpu), |
124 | any_online_cpu(cpu_online_map)); | ||
127 | case CPU_DEAD: | 125 | case CPU_DEAD: |
128 | p = per_cpu(watchdog_task, hotcpu); | 126 | p = per_cpu(watchdog_task, hotcpu); |
129 | per_cpu(watchdog_task, hotcpu) = NULL; | 127 | per_cpu(watchdog_task, hotcpu) = NULL; |
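
Both hotplug hunks above (ksoftirqd and the softlockup watchdog) rebind the per-CPU thread to any_online_cpu(cpu_online_map) rather than smp_processor_id(), since on CPU_UP_CANCELED the thread must be moved somewhere it can actually run. A rough user-space analogue, for illustration only, with the process affinity mask standing in for the online map:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;
        int cpu;

        if (sched_getaffinity(0, sizeof(set), &set) != 0)
                return 1;

        /* First usable CPU in the mask, in the spirit of any_online_cpu(). */
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, &set))
                        break;

        printf("would rebind helper thread to CPU %d\n", cpu);
        return 0;
}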
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 84a9d18aa8da..b3d4dc858e35 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -119,13 +119,12 @@ static int stop_machine(void) | |||
119 | return ret; | 119 | return ret; |
120 | } | 120 | } |
121 | 121 | ||
122 | /* Don't schedule us away at this point, please. */ | ||
123 | local_irq_disable(); | ||
124 | |||
125 | /* Now they are all started, make them hold the CPUs, ready. */ | 122 | /* Now they are all started, make them hold the CPUs, ready. */ |
123 | preempt_disable(); | ||
126 | stopmachine_set_state(STOPMACHINE_PREPARE); | 124 | stopmachine_set_state(STOPMACHINE_PREPARE); |
127 | 125 | ||
128 | /* Make them disable irqs. */ | 126 | /* Make them disable irqs. */ |
127 | local_irq_disable(); | ||
129 | stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); | 128 | stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); |
130 | 129 | ||
131 | return 0; | 130 | return 0; |
@@ -135,6 +134,7 @@ static void restart_machine(void) | |||
135 | { | 134 | { |
136 | stopmachine_set_state(STOPMACHINE_EXIT); | 135 | stopmachine_set_state(STOPMACHINE_EXIT); |
137 | local_irq_enable(); | 136 | local_irq_enable(); |
137 | preempt_enable_no_resched(); | ||
138 | } | 138 | } |
139 | 139 | ||
140 | struct stop_machine_data | 140 | struct stop_machine_data |
diff --git a/kernel/sys.c b/kernel/sys.c index f723522e6986..bce933ebb29f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/suspend.h> | 28 | #include <linux/suspend.h> |
29 | #include <linux/tty.h> | 29 | #include <linux/tty.h> |
30 | #include <linux/signal.h> | 30 | #include <linux/signal.h> |
31 | #include <linux/cn_proc.h> | ||
31 | 32 | ||
32 | #include <linux/compat.h> | 33 | #include <linux/compat.h> |
33 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
@@ -361,17 +362,38 @@ out_unlock: | |||
361 | return retval; | 362 | return retval; |
362 | } | 363 | } |
363 | 364 | ||
365 | /** | ||
366 | * emergency_restart - reboot the system | ||
367 | * | ||
368 | * Without shutting down any hardware or taking any locks | ||
369 | * reboot the system. This is called when we know we are in | ||
370 | * trouble so this is our best effort to reboot. This is | ||
371 | * safe to call in interrupt context. | ||
372 | */ | ||
364 | void emergency_restart(void) | 373 | void emergency_restart(void) |
365 | { | 374 | { |
366 | machine_emergency_restart(); | 375 | machine_emergency_restart(); |
367 | } | 376 | } |
368 | EXPORT_SYMBOL_GPL(emergency_restart); | 377 | EXPORT_SYMBOL_GPL(emergency_restart); |
369 | 378 | ||
370 | void kernel_restart(char *cmd) | 379 | void kernel_restart_prepare(char *cmd) |
371 | { | 380 | { |
372 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | 381 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); |
373 | system_state = SYSTEM_RESTART; | 382 | system_state = SYSTEM_RESTART; |
374 | device_shutdown(); | 383 | device_shutdown(); |
384 | } | ||
385 | |||
386 | /** | ||
387 | * kernel_restart - reboot the system | ||
388 | * @cmd: pointer to buffer containing command to execute for restart | ||
389 | * or %NULL | ||
390 | * | ||
391 | * Shutdown everything and perform a clean reboot. | ||
392 | * This is not safe to call in interrupt context. | ||
393 | */ | ||
394 | void kernel_restart(char *cmd) | ||
395 | { | ||
396 | kernel_restart_prepare(cmd); | ||
375 | if (!cmd) { | 397 | if (!cmd) { |
376 | printk(KERN_EMERG "Restarting system.\n"); | 398 | printk(KERN_EMERG "Restarting system.\n"); |
377 | } else { | 399 | } else { |
@@ -382,6 +404,12 @@ void kernel_restart(char *cmd) | |||
382 | } | 404 | } |
383 | EXPORT_SYMBOL_GPL(kernel_restart); | 405 | EXPORT_SYMBOL_GPL(kernel_restart); |
384 | 406 | ||
407 | /** | ||
408 | * kernel_kexec - reboot the system | ||
409 | * | ||
410 | * Move into place and start executing a preloaded standalone | ||
411 | * executable. If nothing was preloaded return an error. | ||
412 | */ | ||
385 | void kernel_kexec(void) | 413 | void kernel_kexec(void) |
386 | { | 414 | { |
387 | #ifdef CONFIG_KEXEC | 415 | #ifdef CONFIG_KEXEC |
@@ -390,9 +418,7 @@ void kernel_kexec(void) | |||
390 | if (!image) { | 418 | if (!image) { |
391 | return; | 419 | return; |
392 | } | 420 | } |
393 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); | 421 | kernel_restart_prepare(NULL); |
394 | system_state = SYSTEM_RESTART; | ||
395 | device_shutdown(); | ||
396 | printk(KERN_EMERG "Starting new kernel\n"); | 422 | printk(KERN_EMERG "Starting new kernel\n"); |
397 | machine_shutdown(); | 423 | machine_shutdown(); |
398 | machine_kexec(image); | 424 | machine_kexec(image); |
@@ -400,21 +426,39 @@ void kernel_kexec(void) | |||
400 | } | 426 | } |
401 | EXPORT_SYMBOL_GPL(kernel_kexec); | 427 | EXPORT_SYMBOL_GPL(kernel_kexec); |
402 | 428 | ||
403 | void kernel_halt(void) | 429 | /** |
430 | * kernel_halt - halt the system | ||
431 | * | ||
432 | * Shutdown everything and perform a clean system halt. | ||
433 | */ | ||
434 | void kernel_halt_prepare(void) | ||
404 | { | 435 | { |
405 | notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); | 436 | notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); |
406 | system_state = SYSTEM_HALT; | 437 | system_state = SYSTEM_HALT; |
407 | device_shutdown(); | 438 | device_shutdown(); |
439 | } | ||
440 | void kernel_halt(void) | ||
441 | { | ||
442 | kernel_halt_prepare(); | ||
408 | printk(KERN_EMERG "System halted.\n"); | 443 | printk(KERN_EMERG "System halted.\n"); |
409 | machine_halt(); | 444 | machine_halt(); |
410 | } | 445 | } |
411 | EXPORT_SYMBOL_GPL(kernel_halt); | 446 | EXPORT_SYMBOL_GPL(kernel_halt); |
412 | 447 | ||
413 | void kernel_power_off(void) | 448 | /** |
449 | * kernel_power_off - power_off the system | ||
450 | * | ||
451 | * Shutdown everything and perform a clean system power_off. | ||
452 | */ | ||
453 | void kernel_power_off_prepare(void) | ||
414 | { | 454 | { |
415 | notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); | 455 | notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); |
416 | system_state = SYSTEM_POWER_OFF; | 456 | system_state = SYSTEM_POWER_OFF; |
417 | device_shutdown(); | 457 | device_shutdown(); |
458 | } | ||
459 | void kernel_power_off(void) | ||
460 | { | ||
461 | kernel_power_off_prepare(); | ||
418 | printk(KERN_EMERG "Power down.\n"); | 462 | printk(KERN_EMERG "Power down.\n"); |
419 | machine_power_off(); | 463 | machine_power_off(); |
420 | } | 464 | } |
@@ -583,6 +627,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) | |||
583 | current->egid = new_egid; | 627 | current->egid = new_egid; |
584 | current->gid = new_rgid; | 628 | current->gid = new_rgid; |
585 | key_fsgid_changed(current); | 629 | key_fsgid_changed(current); |
630 | proc_id_connector(current, PROC_EVENT_GID); | ||
586 | return 0; | 631 | return 0; |
587 | } | 632 | } |
588 | 633 | ||
@@ -622,6 +667,7 @@ asmlinkage long sys_setgid(gid_t gid) | |||
622 | return -EPERM; | 667 | return -EPERM; |
623 | 668 | ||
624 | key_fsgid_changed(current); | 669 | key_fsgid_changed(current); |
670 | proc_id_connector(current, PROC_EVENT_GID); | ||
625 | return 0; | 671 | return 0; |
626 | } | 672 | } |
627 | 673 | ||
@@ -711,6 +757,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) | |||
711 | current->fsuid = current->euid; | 757 | current->fsuid = current->euid; |
712 | 758 | ||
713 | key_fsuid_changed(current); | 759 | key_fsuid_changed(current); |
760 | proc_id_connector(current, PROC_EVENT_UID); | ||
714 | 761 | ||
715 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); | 762 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); |
716 | } | 763 | } |
@@ -758,6 +805,7 @@ asmlinkage long sys_setuid(uid_t uid) | |||
758 | current->suid = new_suid; | 805 | current->suid = new_suid; |
759 | 806 | ||
760 | key_fsuid_changed(current); | 807 | key_fsuid_changed(current); |
808 | proc_id_connector(current, PROC_EVENT_UID); | ||
761 | 809 | ||
762 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); | 810 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); |
763 | } | 811 | } |
@@ -806,6 +854,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) | |||
806 | current->suid = suid; | 854 | current->suid = suid; |
807 | 855 | ||
808 | key_fsuid_changed(current); | 856 | key_fsuid_changed(current); |
857 | proc_id_connector(current, PROC_EVENT_UID); | ||
809 | 858 | ||
810 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); | 859 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); |
811 | } | 860 | } |
@@ -858,6 +907,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) | |||
858 | current->sgid = sgid; | 907 | current->sgid = sgid; |
859 | 908 | ||
860 | key_fsgid_changed(current); | 909 | key_fsgid_changed(current); |
910 | proc_id_connector(current, PROC_EVENT_GID); | ||
861 | return 0; | 911 | return 0; |
862 | } | 912 | } |
863 | 913 | ||
@@ -900,6 +950,7 @@ asmlinkage long sys_setfsuid(uid_t uid) | |||
900 | } | 950 | } |
901 | 951 | ||
902 | key_fsuid_changed(current); | 952 | key_fsuid_changed(current); |
953 | proc_id_connector(current, PROC_EVENT_UID); | ||
903 | 954 | ||
904 | security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); | 955 | security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); |
905 | 956 | ||
@@ -928,6 +979,7 @@ asmlinkage long sys_setfsgid(gid_t gid) | |||
928 | } | 979 | } |
929 | current->fsgid = gid; | 980 | current->fsgid = gid; |
930 | key_fsgid_changed(current); | 981 | key_fsgid_changed(current); |
982 | proc_id_connector(current, PROC_EVENT_GID); | ||
931 | } | 983 | } |
932 | return old_fsgid; | 984 | return old_fsgid; |
933 | } | 985 | } |
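
The sys.c hunks introduce kernel_restart_prepare() / kernel_halt_prepare() / kernel_power_off_prepare() so that kernel_kexec() can reuse the notifier-chain and device-shutdown steps instead of open-coding them. A minimal sketch of that prepare/commit split in plain C, with stand-in function names rather than real kernel APIs:

#include <stdio.h>

/* Stand-ins for notifier_call_chain() / device_shutdown() in the patch. */
static void notify_reboot(const char *cmd) { printf("notify: %s\n", cmd ? cmd : "(none)"); }
static void shut_down_devices(void)        { puts("devices down"); }

static void restart_prepare(const char *cmd)
{
        notify_reboot(cmd);
        shut_down_devices();
}

static void restart(const char *cmd)
{
        restart_prepare(cmd);
        puts("restarting");
}

static void kexec(void)
{
        /* Reuses the shared preparation path instead of duplicating it. */
        restart_prepare(NULL);
        puts("starting new kernel image");
}

int main(void)
{
        restart("now");
        kexec();
        return 0;
}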
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8e56e2495542..9990e10192e8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -169,7 +169,7 @@ struct file_operations proc_sys_file_operations = { | |||
169 | 169 | ||
170 | extern struct proc_dir_entry *proc_sys_root; | 170 | extern struct proc_dir_entry *proc_sys_root; |
171 | 171 | ||
172 | static void register_proc_table(ctl_table *, struct proc_dir_entry *); | 172 | static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *); |
173 | static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); | 173 | static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); |
174 | #endif | 174 | #endif |
175 | 175 | ||
@@ -952,7 +952,7 @@ static ctl_table fs_table[] = { | |||
952 | .data = &aio_nr, | 952 | .data = &aio_nr, |
953 | .maxlen = sizeof(aio_nr), | 953 | .maxlen = sizeof(aio_nr), |
954 | .mode = 0444, | 954 | .mode = 0444, |
955 | .proc_handler = &proc_dointvec, | 955 | .proc_handler = &proc_doulongvec_minmax, |
956 | }, | 956 | }, |
957 | { | 957 | { |
958 | .ctl_name = FS_AIO_MAX_NR, | 958 | .ctl_name = FS_AIO_MAX_NR, |
@@ -960,7 +960,7 @@ static ctl_table fs_table[] = { | |||
960 | .data = &aio_max_nr, | 960 | .data = &aio_max_nr, |
961 | .maxlen = sizeof(aio_max_nr), | 961 | .maxlen = sizeof(aio_max_nr), |
962 | .mode = 0644, | 962 | .mode = 0644, |
963 | .proc_handler = &proc_dointvec, | 963 | .proc_handler = &proc_doulongvec_minmax, |
964 | }, | 964 | }, |
965 | #ifdef CONFIG_INOTIFY | 965 | #ifdef CONFIG_INOTIFY |
966 | { | 966 | { |
@@ -992,10 +992,51 @@ static ctl_table dev_table[] = { | |||
992 | 992 | ||
993 | extern void init_irq_proc (void); | 993 | extern void init_irq_proc (void); |
994 | 994 | ||
995 | static DEFINE_SPINLOCK(sysctl_lock); | ||
996 | |||
997 | /* called under sysctl_lock */ | ||
998 | static int use_table(struct ctl_table_header *p) | ||
999 | { | ||
1000 | if (unlikely(p->unregistering)) | ||
1001 | return 0; | ||
1002 | p->used++; | ||
1003 | return 1; | ||
1004 | } | ||
1005 | |||
1006 | /* called under sysctl_lock */ | ||
1007 | static void unuse_table(struct ctl_table_header *p) | ||
1008 | { | ||
1009 | if (!--p->used) | ||
1010 | if (unlikely(p->unregistering)) | ||
1011 | complete(p->unregistering); | ||
1012 | } | ||
1013 | |||
1014 | /* called under sysctl_lock, will reacquire if has to wait */ | ||
1015 | static void start_unregistering(struct ctl_table_header *p) | ||
1016 | { | ||
1017 | /* | ||
1018 | * if p->used is 0, nobody will ever touch that entry again; | ||
1019 | * we'll eliminate all paths to it before dropping sysctl_lock | ||
1020 | */ | ||
1021 | if (unlikely(p->used)) { | ||
1022 | struct completion wait; | ||
1023 | init_completion(&wait); | ||
1024 | p->unregistering = &wait; | ||
1025 | spin_unlock(&sysctl_lock); | ||
1026 | wait_for_completion(&wait); | ||
1027 | spin_lock(&sysctl_lock); | ||
1028 | } | ||
1029 | /* | ||
1030 | * do not remove from the list until nobody holds it; walking the | ||
1031 | * list in do_sysctl() relies on that. | ||
1032 | */ | ||
1033 | list_del_init(&p->ctl_entry); | ||
1034 | } | ||
1035 | |||
995 | void __init sysctl_init(void) | 1036 | void __init sysctl_init(void) |
996 | { | 1037 | { |
997 | #ifdef CONFIG_PROC_FS | 1038 | #ifdef CONFIG_PROC_FS |
998 | register_proc_table(root_table, proc_sys_root); | 1039 | register_proc_table(root_table, proc_sys_root, &root_table_header); |
999 | init_irq_proc(); | 1040 | init_irq_proc(); |
1000 | #endif | 1041 | #endif |
1001 | } | 1042 | } |
@@ -1004,6 +1045,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
1004 | void __user *newval, size_t newlen) | 1045 | void __user *newval, size_t newlen) |
1005 | { | 1046 | { |
1006 | struct list_head *tmp; | 1047 | struct list_head *tmp; |
1048 | int error = -ENOTDIR; | ||
1007 | 1049 | ||
1008 | if (nlen <= 0 || nlen >= CTL_MAXNAME) | 1050 | if (nlen <= 0 || nlen >= CTL_MAXNAME) |
1009 | return -ENOTDIR; | 1051 | return -ENOTDIR; |
@@ -1012,20 +1054,30 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
1012 | if (!oldlenp || get_user(old_len, oldlenp)) | 1054 | if (!oldlenp || get_user(old_len, oldlenp)) |
1013 | return -EFAULT; | 1055 | return -EFAULT; |
1014 | } | 1056 | } |
1057 | spin_lock(&sysctl_lock); | ||
1015 | tmp = &root_table_header.ctl_entry; | 1058 | tmp = &root_table_header.ctl_entry; |
1016 | do { | 1059 | do { |
1017 | struct ctl_table_header *head = | 1060 | struct ctl_table_header *head = |
1018 | list_entry(tmp, struct ctl_table_header, ctl_entry); | 1061 | list_entry(tmp, struct ctl_table_header, ctl_entry); |
1019 | void *context = NULL; | 1062 | void *context = NULL; |
1020 | int error = parse_table(name, nlen, oldval, oldlenp, | 1063 | |
1064 | if (!use_table(head)) | ||
1065 | continue; | ||
1066 | |||
1067 | spin_unlock(&sysctl_lock); | ||
1068 | |||
1069 | error = parse_table(name, nlen, oldval, oldlenp, | ||
1021 | newval, newlen, head->ctl_table, | 1070 | newval, newlen, head->ctl_table, |
1022 | &context); | 1071 | &context); |
1023 | kfree(context); | 1072 | kfree(context); |
1073 | |||
1074 | spin_lock(&sysctl_lock); | ||
1075 | unuse_table(head); | ||
1024 | if (error != -ENOTDIR) | 1076 | if (error != -ENOTDIR) |
1025 | return error; | 1077 | break; |
1026 | tmp = tmp->next; | 1078 | } while ((tmp = tmp->next) != &root_table_header.ctl_entry); |
1027 | } while (tmp != &root_table_header.ctl_entry); | 1079 | spin_unlock(&sysctl_lock); |
1028 | return -ENOTDIR; | 1080 | return error; |
1029 | } | 1081 | } |
1030 | 1082 | ||
1031 | asmlinkage long sys_sysctl(struct __sysctl_args __user *args) | 1083 | asmlinkage long sys_sysctl(struct __sysctl_args __user *args) |
@@ -1236,12 +1288,16 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, | |||
1236 | return NULL; | 1288 | return NULL; |
1237 | tmp->ctl_table = table; | 1289 | tmp->ctl_table = table; |
1238 | INIT_LIST_HEAD(&tmp->ctl_entry); | 1290 | INIT_LIST_HEAD(&tmp->ctl_entry); |
1291 | tmp->used = 0; | ||
1292 | tmp->unregistering = NULL; | ||
1293 | spin_lock(&sysctl_lock); | ||
1239 | if (insert_at_head) | 1294 | if (insert_at_head) |
1240 | list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); | 1295 | list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); |
1241 | else | 1296 | else |
1242 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); | 1297 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); |
1298 | spin_unlock(&sysctl_lock); | ||
1243 | #ifdef CONFIG_PROC_FS | 1299 | #ifdef CONFIG_PROC_FS |
1244 | register_proc_table(table, proc_sys_root); | 1300 | register_proc_table(table, proc_sys_root, tmp); |
1245 | #endif | 1301 | #endif |
1246 | return tmp; | 1302 | return tmp; |
1247 | } | 1303 | } |
@@ -1255,10 +1311,13 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, | |||
1255 | */ | 1311 | */ |
1256 | void unregister_sysctl_table(struct ctl_table_header * header) | 1312 | void unregister_sysctl_table(struct ctl_table_header * header) |
1257 | { | 1313 | { |
1258 | list_del(&header->ctl_entry); | 1314 | might_sleep(); |
1315 | spin_lock(&sysctl_lock); | ||
1316 | start_unregistering(header); | ||
1259 | #ifdef CONFIG_PROC_FS | 1317 | #ifdef CONFIG_PROC_FS |
1260 | unregister_proc_table(header->ctl_table, proc_sys_root); | 1318 | unregister_proc_table(header->ctl_table, proc_sys_root); |
1261 | #endif | 1319 | #endif |
1320 | spin_unlock(&sysctl_lock); | ||
1262 | kfree(header); | 1321 | kfree(header); |
1263 | } | 1322 | } |
1264 | 1323 | ||
@@ -1269,7 +1328,7 @@ void unregister_sysctl_table(struct ctl_table_header * header) | |||
1269 | #ifdef CONFIG_PROC_FS | 1328 | #ifdef CONFIG_PROC_FS |
1270 | 1329 | ||
1271 | /* Scan the sysctl entries in table and add them all into /proc */ | 1330 | /* Scan the sysctl entries in table and add them all into /proc */ |
1272 | static void register_proc_table(ctl_table * table, struct proc_dir_entry *root) | 1331 | static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) |
1273 | { | 1332 | { |
1274 | struct proc_dir_entry *de; | 1333 | struct proc_dir_entry *de; |
1275 | int len; | 1334 | int len; |
@@ -1305,13 +1364,14 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root) | |||
1305 | de = create_proc_entry(table->procname, mode, root); | 1364 | de = create_proc_entry(table->procname, mode, root); |
1306 | if (!de) | 1365 | if (!de) |
1307 | continue; | 1366 | continue; |
1367 | de->set = set; | ||
1308 | de->data = (void *) table; | 1368 | de->data = (void *) table; |
1309 | if (table->proc_handler) | 1369 | if (table->proc_handler) |
1310 | de->proc_fops = &proc_sys_file_operations; | 1370 | de->proc_fops = &proc_sys_file_operations; |
1311 | } | 1371 | } |
1312 | table->de = de; | 1372 | table->de = de; |
1313 | if (de->mode & S_IFDIR) | 1373 | if (de->mode & S_IFDIR) |
1314 | register_proc_table(table->child, de); | 1374 | register_proc_table(table->child, de, set); |
1315 | } | 1375 | } |
1316 | } | 1376 | } |
1317 | 1377 | ||
@@ -1336,6 +1396,13 @@ static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root | |||
1336 | continue; | 1396 | continue; |
1337 | } | 1397 | } |
1338 | 1398 | ||
1399 | /* | ||
1400 | * In any case, mark the entry as goner; we'll keep it | ||
1401 | * around if it's busy, but we'll know to do nothing with | ||
1402 | * its fields. We are under sysctl_lock here. | ||
1403 | */ | ||
1404 | de->data = NULL; | ||
1405 | |||
1339 | /* Don't unregister proc entries that are still being used.. */ | 1406 | /* Don't unregister proc entries that are still being used.. */ |
1340 | if (atomic_read(&de->count)) | 1407 | if (atomic_read(&de->count)) |
1341 | continue; | 1408 | continue; |
@@ -1349,27 +1416,38 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, | |||
1349 | size_t count, loff_t *ppos) | 1416 | size_t count, loff_t *ppos) |
1350 | { | 1417 | { |
1351 | int op; | 1418 | int op; |
1352 | struct proc_dir_entry *de; | 1419 | struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); |
1353 | struct ctl_table *table; | 1420 | struct ctl_table *table; |
1354 | size_t res; | 1421 | size_t res; |
1355 | ssize_t error; | 1422 | ssize_t error = -ENOTDIR; |
1356 | |||
1357 | de = PDE(file->f_dentry->d_inode); | ||
1358 | if (!de || !de->data) | ||
1359 | return -ENOTDIR; | ||
1360 | table = (struct ctl_table *) de->data; | ||
1361 | if (!table || !table->proc_handler) | ||
1362 | return -ENOTDIR; | ||
1363 | op = (write ? 002 : 004); | ||
1364 | if (ctl_perm(table, op)) | ||
1365 | return -EPERM; | ||
1366 | 1423 | ||
1367 | res = count; | 1424 | spin_lock(&sysctl_lock); |
1368 | 1425 | if (de && de->data && use_table(de->set)) { | |
1369 | error = (*table->proc_handler) (table, write, file, buf, &res, ppos); | 1426 | /* |
1370 | if (error) | 1427 | * at that point we know that sysctl was not unregistered |
1371 | return error; | 1428 | * and won't be until we finish |
1372 | return res; | 1429 | */ |
1430 | spin_unlock(&sysctl_lock); | ||
1431 | table = (struct ctl_table *) de->data; | ||
1432 | if (!table || !table->proc_handler) | ||
1433 | goto out; | ||
1434 | error = -EPERM; | ||
1435 | op = (write ? 002 : 004); | ||
1436 | if (ctl_perm(table, op)) | ||
1437 | goto out; | ||
1438 | |||
1439 | /* careful: calling conventions are nasty here */ | ||
1440 | res = count; | ||
1441 | error = (*table->proc_handler)(table, write, file, | ||
1442 | buf, &res, ppos); | ||
1443 | if (!error) | ||
1444 | error = res; | ||
1445 | out: | ||
1446 | spin_lock(&sysctl_lock); | ||
1447 | unuse_table(de->set); | ||
1448 | } | ||
1449 | spin_unlock(&sysctl_lock); | ||
1450 | return error; | ||
1373 | } | 1451 | } |
1374 | 1452 | ||
1375 | static int proc_opensys(struct inode *inode, struct file *file) | 1453 | static int proc_opensys(struct inode *inode, struct file *file) |
@@ -1997,6 +2075,7 @@ int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, | |||
1997 | * @filp: the file structure | 2075 | * @filp: the file structure |
1998 | * @buffer: the user buffer | 2076 | * @buffer: the user buffer |
1999 | * @lenp: the size of the user buffer | 2077 | * @lenp: the size of the user buffer |
2078 | * @ppos: pointer to the file position | ||
2000 | * | 2079 | * |
2001 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer | 2080 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer |
2002 | * values from/to the user buffer, treated as an ASCII string. | 2081 | * values from/to the user buffer, treated as an ASCII string. |
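
The sysctl changes above add a per-header use count plus an unregistering completion so that unregister_sysctl_table() can wait for in-flight readers instead of racing with them. A user-space sketch of the same counting pattern, using a pthread mutex and condition variable where the kernel uses sysctl_lock and a struct completion (illustration only; build with cc -pthread):

#include <pthread.h>
#include <stdio.h>

struct entry {
        pthread_mutex_t lock;
        pthread_cond_t  done;
        int             used;
        int             unregistering;
};

static int use_entry(struct entry *e)
{
        int ok = 0;

        pthread_mutex_lock(&e->lock);
        if (!e->unregistering) {        /* like use_table(): refuse once doomed */
                e->used++;
                ok = 1;
        }
        pthread_mutex_unlock(&e->lock);
        return ok;
}

static void unuse_entry(struct entry *e)
{
        pthread_mutex_lock(&e->lock);
        if (!--e->used && e->unregistering)
                pthread_cond_signal(&e->done);  /* complete() analogue */
        pthread_mutex_unlock(&e->lock);
}

static void start_unregistering(struct entry *e)
{
        pthread_mutex_lock(&e->lock);
        e->unregistering = 1;
        while (e->used)                 /* wait_for_completion() analogue */
                pthread_cond_wait(&e->done, &e->lock);
        pthread_mutex_unlock(&e->lock);
}

int main(void)
{
        struct entry e = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0 };

        if (use_entry(&e)) {
                puts("entry in use");
                unuse_entry(&e);
        }
        start_unregistering(&e);
        puts("safe to free");
        return 0;
}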
diff --git a/kernel/time.c b/kernel/time.c index dd5ae1162a8f..245d595a13cb 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -338,30 +338,20 @@ int do_adjtimex(struct timex *txc) | |||
338 | if (mtemp >= MINSEC) { | 338 | if (mtemp >= MINSEC) { |
339 | ltemp = (time_offset / mtemp) << (SHIFT_USEC - | 339 | ltemp = (time_offset / mtemp) << (SHIFT_USEC - |
340 | SHIFT_UPDATE); | 340 | SHIFT_UPDATE); |
341 | if (ltemp < 0) | 341 | time_freq += shift_right(ltemp, SHIFT_KH); |
342 | time_freq -= -ltemp >> SHIFT_KH; | ||
343 | else | ||
344 | time_freq += ltemp >> SHIFT_KH; | ||
345 | } else /* calibration interval too short (p. 12) */ | 342 | } else /* calibration interval too short (p. 12) */ |
346 | result = TIME_ERROR; | 343 | result = TIME_ERROR; |
347 | } else { /* PLL mode */ | 344 | } else { /* PLL mode */ |
348 | if (mtemp < MAXSEC) { | 345 | if (mtemp < MAXSEC) { |
349 | ltemp *= mtemp; | 346 | ltemp *= mtemp; |
350 | if (ltemp < 0) | 347 | time_freq += shift_right(ltemp,(time_constant + |
351 | time_freq -= -ltemp >> (time_constant + | ||
352 | time_constant + | ||
353 | SHIFT_KF - SHIFT_USEC); | ||
354 | else | ||
355 | time_freq += ltemp >> (time_constant + | ||
356 | time_constant + | 348 | time_constant + |
357 | SHIFT_KF - SHIFT_USEC); | 349 | SHIFT_KF - SHIFT_USEC)); |
358 | } else /* calibration interval too long (p. 12) */ | 350 | } else /* calibration interval too long (p. 12) */ |
359 | result = TIME_ERROR; | 351 | result = TIME_ERROR; |
360 | } | 352 | } |
361 | if (time_freq > time_tolerance) | 353 | time_freq = min(time_freq, time_tolerance); |
362 | time_freq = time_tolerance; | 354 | time_freq = max(time_freq, -time_tolerance); |
363 | else if (time_freq < -time_tolerance) | ||
364 | time_freq = -time_tolerance; | ||
365 | } /* STA_PLL || STA_PPSTIME */ | 355 | } /* STA_PLL || STA_PPSTIME */ |
366 | } /* txc->modes & ADJ_OFFSET */ | 356 | } /* txc->modes & ADJ_OFFSET */ |
367 | if (txc->modes & ADJ_TICK) { | 357 | if (txc->modes & ADJ_TICK) { |
@@ -384,10 +374,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 | |||
384 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) | 374 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) |
385 | txc->offset = save_adjust; | 375 | txc->offset = save_adjust; |
386 | else { | 376 | else { |
387 | if (time_offset < 0) | 377 | txc->offset = shift_right(time_offset, SHIFT_UPDATE); |
388 | txc->offset = -(-time_offset >> SHIFT_UPDATE); | ||
389 | else | ||
390 | txc->offset = time_offset >> SHIFT_UPDATE; | ||
391 | } | 378 | } |
392 | txc->freq = time_freq + pps_freq; | 379 | txc->freq = time_freq + pps_freq; |
393 | txc->maxerror = time_maxerror; | 380 | txc->maxerror = time_maxerror; |
@@ -532,6 +519,7 @@ int do_settimeofday (struct timespec *tv) | |||
532 | clock_was_set(); | 519 | clock_was_set(); |
533 | return 0; | 520 | return 0; |
534 | } | 521 | } |
522 | EXPORT_SYMBOL(do_settimeofday); | ||
535 | 523 | ||
536 | void do_gettimeofday (struct timeval *tv) | 524 | void do_gettimeofday (struct timeval *tv) |
537 | { | 525 | { |
@@ -570,6 +558,7 @@ void getnstimeofday(struct timespec *tv) | |||
570 | tv->tv_sec = x.tv_sec; | 558 | tv->tv_sec = x.tv_sec; |
571 | tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; | 559 | tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; |
572 | } | 560 | } |
561 | EXPORT_SYMBOL_GPL(getnstimeofday); | ||
573 | #endif | 562 | #endif |
574 | 563 | ||
575 | #if (BITS_PER_LONG < 64) | 564 | #if (BITS_PER_LONG < 64) |
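
Several hunks in kernel/time.c (and kernel/timer.c below) replace open-coded sign tests with shift_right() and min()/max(). The branches being removed suggest a helper that shifts the magnitude and restores the sign, so negative values round toward zero rather than toward minus infinity; a stand-alone sketch, not necessarily the kernel's exact definition:

#include <stdio.h>

static long shift_right(long x, unsigned int s)
{
        /* shift the magnitude, keep the sign: rounds toward zero */
        return x < 0 ? -(-x >> s) : x >> s;
}

int main(void)
{
        printf("shift_right(-5, 1) = %ld, (-5 >> 1) = %ld\n",
               shift_right(-5, 1), (long)(-5L >> 1));
        printf("shift_right(5, 1)  = %ld\n", shift_right(5, 1));
        return 0;
}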
diff --git a/kernel/timer.c b/kernel/timer.c index 3ba10fa35b60..fd74268d8663 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -46,6 +46,10 @@ static void time_interpolator_update(long delta_nsec); | |||
46 | #define time_interpolator_update(x) | 46 | #define time_interpolator_update(x) |
47 | #endif | 47 | #endif |
48 | 48 | ||
49 | u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; | ||
50 | |||
51 | EXPORT_SYMBOL(jiffies_64); | ||
52 | |||
49 | /* | 53 | /* |
50 | * per-CPU timer vector definitions: | 54 | * per-CPU timer vector definitions: |
51 | */ | 55 | */ |
@@ -91,30 +95,6 @@ static inline void set_running_timer(tvec_base_t *base, | |||
91 | #endif | 95 | #endif |
92 | } | 96 | } |
93 | 97 | ||
94 | static void check_timer_failed(struct timer_list *timer) | ||
95 | { | ||
96 | static int whine_count; | ||
97 | if (whine_count < 16) { | ||
98 | whine_count++; | ||
99 | printk("Uninitialised timer!\n"); | ||
100 | printk("This is just a warning. Your computer is OK\n"); | ||
101 | printk("function=0x%p, data=0x%lx\n", | ||
102 | timer->function, timer->data); | ||
103 | dump_stack(); | ||
104 | } | ||
105 | /* | ||
106 | * Now fix it up | ||
107 | */ | ||
108 | timer->magic = TIMER_MAGIC; | ||
109 | } | ||
110 | |||
111 | static inline void check_timer(struct timer_list *timer) | ||
112 | { | ||
113 | if (timer->magic != TIMER_MAGIC) | ||
114 | check_timer_failed(timer); | ||
115 | } | ||
116 | |||
117 | |||
118 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | 98 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) |
119 | { | 99 | { |
120 | unsigned long expires = timer->expires; | 100 | unsigned long expires = timer->expires; |
@@ -177,7 +157,6 @@ void fastcall init_timer(struct timer_list *timer) | |||
177 | { | 157 | { |
178 | timer->entry.next = NULL; | 158 | timer->entry.next = NULL; |
179 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; | 159 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; |
180 | timer->magic = TIMER_MAGIC; | ||
181 | } | 160 | } |
182 | EXPORT_SYMBOL(init_timer); | 161 | EXPORT_SYMBOL(init_timer); |
183 | 162 | ||
@@ -230,7 +209,6 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) | |||
230 | int ret = 0; | 209 | int ret = 0; |
231 | 210 | ||
232 | BUG_ON(!timer->function); | 211 | BUG_ON(!timer->function); |
233 | check_timer(timer); | ||
234 | 212 | ||
235 | base = lock_timer_base(timer, &flags); | 213 | base = lock_timer_base(timer, &flags); |
236 | 214 | ||
@@ -283,9 +261,6 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
283 | unsigned long flags; | 261 | unsigned long flags; |
284 | 262 | ||
285 | BUG_ON(timer_pending(timer) || !timer->function); | 263 | BUG_ON(timer_pending(timer) || !timer->function); |
286 | |||
287 | check_timer(timer); | ||
288 | |||
289 | spin_lock_irqsave(&base->t_base.lock, flags); | 264 | spin_lock_irqsave(&base->t_base.lock, flags); |
290 | timer->base = &base->t_base; | 265 | timer->base = &base->t_base; |
291 | internal_add_timer(base, timer); | 266 | internal_add_timer(base, timer); |
@@ -316,8 +291,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) | |||
316 | { | 291 | { |
317 | BUG_ON(!timer->function); | 292 | BUG_ON(!timer->function); |
318 | 293 | ||
319 | check_timer(timer); | ||
320 | |||
321 | /* | 294 | /* |
322 | * This is a common optimization triggered by the | 295 | * This is a common optimization triggered by the |
323 | * networking code - if the timer is re-modified | 296 | * networking code - if the timer is re-modified |
@@ -348,8 +321,6 @@ int del_timer(struct timer_list *timer) | |||
348 | unsigned long flags; | 321 | unsigned long flags; |
349 | int ret = 0; | 322 | int ret = 0; |
350 | 323 | ||
351 | check_timer(timer); | ||
352 | |||
353 | if (timer_pending(timer)) { | 324 | if (timer_pending(timer)) { |
354 | base = lock_timer_base(timer, &flags); | 325 | base = lock_timer_base(timer, &flags); |
355 | if (timer_pending(timer)) { | 326 | if (timer_pending(timer)) { |
@@ -412,8 +383,6 @@ out: | |||
412 | */ | 383 | */ |
413 | int del_timer_sync(struct timer_list *timer) | 384 | int del_timer_sync(struct timer_list *timer) |
414 | { | 385 | { |
415 | check_timer(timer); | ||
416 | |||
417 | for (;;) { | 386 | for (;;) { |
418 | int ret = try_to_del_timer_sync(timer); | 387 | int ret = try_to_del_timer_sync(timer); |
419 | if (ret >= 0) | 388 | if (ret >= 0) |
@@ -632,134 +601,118 @@ long time_next_adjust; | |||
632 | */ | 601 | */ |
633 | static void second_overflow(void) | 602 | static void second_overflow(void) |
634 | { | 603 | { |
635 | long ltemp; | 604 | long ltemp; |
636 | 605 | ||
637 | /* Bump the maxerror field */ | 606 | /* Bump the maxerror field */ |
638 | time_maxerror += time_tolerance >> SHIFT_USEC; | 607 | time_maxerror += time_tolerance >> SHIFT_USEC; |
639 | if ( time_maxerror > NTP_PHASE_LIMIT ) { | 608 | if (time_maxerror > NTP_PHASE_LIMIT) { |
640 | time_maxerror = NTP_PHASE_LIMIT; | 609 | time_maxerror = NTP_PHASE_LIMIT; |
641 | time_status |= STA_UNSYNC; | 610 | time_status |= STA_UNSYNC; |
642 | } | ||
643 | |||
644 | /* | ||
645 | * Leap second processing. If in leap-insert state at | ||
646 | * the end of the day, the system clock is set back one | ||
647 | * second; if in leap-delete state, the system clock is | ||
648 | * set ahead one second. The microtime() routine or | ||
649 | * external clock driver will insure that reported time | ||
650 | * is always monotonic. The ugly divides should be | ||
651 | * replaced. | ||
652 | */ | ||
653 | switch (time_state) { | ||
654 | |||
655 | case TIME_OK: | ||
656 | if (time_status & STA_INS) | ||
657 | time_state = TIME_INS; | ||
658 | else if (time_status & STA_DEL) | ||
659 | time_state = TIME_DEL; | ||
660 | break; | ||
661 | |||
662 | case TIME_INS: | ||
663 | if (xtime.tv_sec % 86400 == 0) { | ||
664 | xtime.tv_sec--; | ||
665 | wall_to_monotonic.tv_sec++; | ||
666 | /* The timer interpolator will make time change gradually instead | ||
667 | * of an immediate jump by one second. | ||
668 | */ | ||
669 | time_interpolator_update(-NSEC_PER_SEC); | ||
670 | time_state = TIME_OOP; | ||
671 | clock_was_set(); | ||
672 | printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); | ||
673 | } | 611 | } |
674 | break; | 612 | |
675 | 613 | /* | |
676 | case TIME_DEL: | 614 | * Leap second processing. If in leap-insert state at the end of the |
677 | if ((xtime.tv_sec + 1) % 86400 == 0) { | 615 | * day, the system clock is set back one second; if in leap-delete |
678 | xtime.tv_sec++; | 616 | * state, the system clock is set ahead one second. The microtime() |
679 | wall_to_monotonic.tv_sec--; | 617 | * routine or external clock driver will insure that reported time is |
680 | /* Use of time interpolator for a gradual change of time */ | 618 | * always monotonic. The ugly divides should be replaced. |
681 | time_interpolator_update(NSEC_PER_SEC); | 619 | */ |
682 | time_state = TIME_WAIT; | 620 | switch (time_state) { |
683 | clock_was_set(); | 621 | case TIME_OK: |
684 | printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); | 622 | if (time_status & STA_INS) |
623 | time_state = TIME_INS; | ||
624 | else if (time_status & STA_DEL) | ||
625 | time_state = TIME_DEL; | ||
626 | break; | ||
627 | case TIME_INS: | ||
628 | if (xtime.tv_sec % 86400 == 0) { | ||
629 | xtime.tv_sec--; | ||
630 | wall_to_monotonic.tv_sec++; | ||
631 | /* | ||
632 | * The timer interpolator will make time change | ||
633 | * gradually instead of an immediate jump by one second | ||
634 | */ | ||
635 | time_interpolator_update(-NSEC_PER_SEC); | ||
636 | time_state = TIME_OOP; | ||
637 | clock_was_set(); | ||
638 | printk(KERN_NOTICE "Clock: inserting leap second " | ||
639 | "23:59:60 UTC\n"); | ||
640 | } | ||
641 | break; | ||
642 | case TIME_DEL: | ||
643 | if ((xtime.tv_sec + 1) % 86400 == 0) { | ||
644 | xtime.tv_sec++; | ||
645 | wall_to_monotonic.tv_sec--; | ||
646 | /* | ||
647 | * Use of time interpolator for a gradual change of | ||
648 | * time | ||
649 | */ | ||
650 | time_interpolator_update(NSEC_PER_SEC); | ||
651 | time_state = TIME_WAIT; | ||
652 | clock_was_set(); | ||
653 | printk(KERN_NOTICE "Clock: deleting leap second " | ||
654 | "23:59:59 UTC\n"); | ||
655 | } | ||
656 | break; | ||
657 | case TIME_OOP: | ||
658 | time_state = TIME_WAIT; | ||
659 | break; | ||
660 | case TIME_WAIT: | ||
661 | if (!(time_status & (STA_INS | STA_DEL))) | ||
662 | time_state = TIME_OK; | ||
685 | } | 663 | } |
686 | break; | 664 | |
687 | 665 | /* | |
688 | case TIME_OOP: | 666 | * Compute the phase adjustment for the next second. In PLL mode, the |
689 | time_state = TIME_WAIT; | 667 | * offset is reduced by a fixed factor times the time constant. In FLL |
690 | break; | 668 | * mode the offset is used directly. In either mode, the maximum phase |
691 | 669 | * adjustment for each second is clamped so as to spread the adjustment | |
692 | case TIME_WAIT: | 670 | * over not more than the number of seconds between updates. |
693 | if (!(time_status & (STA_INS | STA_DEL))) | 671 | */ |
694 | time_state = TIME_OK; | ||
695 | } | ||
696 | |||
697 | /* | ||
698 | * Compute the phase adjustment for the next second. In | ||
699 | * PLL mode, the offset is reduced by a fixed factor | ||
700 | * times the time constant. In FLL mode the offset is | ||
701 | * used directly. In either mode, the maximum phase | ||
702 | * adjustment for each second is clamped so as to spread | ||
703 | * the adjustment over not more than the number of | ||
704 | * seconds between updates. | ||
705 | */ | ||
706 | if (time_offset < 0) { | ||
707 | ltemp = -time_offset; | ||
708 | if (!(time_status & STA_FLL)) | ||
709 | ltemp >>= SHIFT_KG + time_constant; | ||
710 | if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) | ||
711 | ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; | ||
712 | time_offset += ltemp; | ||
713 | time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); | ||
714 | } else { | ||
715 | ltemp = time_offset; | 672 | ltemp = time_offset; |
716 | if (!(time_status & STA_FLL)) | 673 | if (!(time_status & STA_FLL)) |
717 | ltemp >>= SHIFT_KG + time_constant; | 674 | ltemp = shift_right(ltemp, SHIFT_KG + time_constant); |
718 | if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) | 675 | ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); |
719 | ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; | 676 | ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); |
720 | time_offset -= ltemp; | 677 | time_offset -= ltemp; |
721 | time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); | 678 | time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); |
722 | } | 679 | |
723 | 680 | /* | |
724 | /* | 681 | * Compute the frequency estimate and additional phase adjustment due |
725 | * Compute the frequency estimate and additional phase | 682 | * to frequency error for the next second. When the PPS signal is |
726 | * adjustment due to frequency error for the next | 683 | * engaged, gnaw on the watchdog counter and update the frequency |
727 | * second. When the PPS signal is engaged, gnaw on the | 684 | * computed by the pll and the PPS signal. |
728 | * watchdog counter and update the frequency computed by | 685 | */ |
729 | * the pll and the PPS signal. | 686 | pps_valid++; |
730 | */ | 687 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ |
731 | pps_valid++; | 688 | pps_jitter = MAXTIME; |
732 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ | 689 | pps_stabil = MAXFREQ; |
733 | pps_jitter = MAXTIME; | 690 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | |
734 | pps_stabil = MAXFREQ; | 691 | STA_PPSWANDER | STA_PPSERROR); |
735 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | | 692 | } |
736 | STA_PPSWANDER | STA_PPSERROR); | 693 | ltemp = time_freq + pps_freq; |
737 | } | 694 | time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); |
738 | ltemp = time_freq + pps_freq; | ||
739 | if (ltemp < 0) | ||
740 | time_adj -= -ltemp >> | ||
741 | (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); | ||
742 | else | ||
743 | time_adj += ltemp >> | ||
744 | (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); | ||
745 | 695 | ||
746 | #if HZ == 100 | 696 | #if HZ == 100 |
747 | /* Compensate for (HZ==100) != (1 << SHIFT_HZ). | 697 | /* |
748 | * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) | 698 | * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to |
749 | */ | 699 | * get 128.125; => only 0.125% error (p. 14) |
750 | if (time_adj < 0) | 700 | */ |
751 | time_adj -= (-time_adj >> 2) + (-time_adj >> 5); | 701 | time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); |
752 | else | 702 | #endif |
753 | time_adj += (time_adj >> 2) + (time_adj >> 5); | 703 | #if HZ == 250 |
704 | /* | ||
705 | * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and | ||
706 | * 0.78125% to get 255.85938; => only 0.05% error (p. 14) | ||
707 | */ | ||
708 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); | ||
754 | #endif | 709 | #endif |
755 | #if HZ == 1000 | 710 | #if HZ == 1000 |
756 | /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). | 711 | /* |
757 | * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) | 712 | * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and |
758 | */ | 713 | * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) |
759 | if (time_adj < 0) | 714 | */ |
760 | time_adj -= (-time_adj >> 6) + (-time_adj >> 7); | 715 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); |
761 | else | ||
762 | time_adj += (time_adj >> 6) + (time_adj >> 7); | ||
763 | #endif | 716 | #endif |
764 | } | 717 | } |
765 | 718 | ||
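
The hunk above replaces each sign-dependent shift-and-clamp pair with the shift_right() helper plus min()/max(). The point of shift_right() is rounding: a plain >> on a negative signed value (on the usual arithmetic-shift machines) rounds toward negative infinity, whereas the old code negated first, shifted, and negated back, which rounds toward zero. Below is a minimal userspace sketch of that difference; the macro is an illustrative stand-in, not the kernel's definition.

    #include <stdio.h>

    /* Hypothetical stand-in for the kernel's shift_right() helper: shift
     * negative values by negating first, so the result rounds toward zero
     * instead of toward negative infinity. */
    #define shift_right(x, s) ((x) < 0 ? -(-(x) >> (s)) : (x) >> (s))

    int main(void)
    {
            long time_offset = -5;

            /* Plain arithmetic shift on the usual machines: -5 >> 1 == -3 */
            printf("%ld\n", time_offset >> 1);
            /* shift_right keeps the old two-branch behaviour: -(5 >> 1) == -2 */
            printf("%ld\n", shift_right(time_offset, 1));
            return 0;
    }

Keeping the rounding direction identical is what lets the single shift_right() line stand in for both branches of the old if/else without changing the computed time_adj.
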
@@ -768,23 +721,20 @@ static void update_wall_time_one_tick(void) | |||
768 | { | 721 | { |
769 | long time_adjust_step, delta_nsec; | 722 | long time_adjust_step, delta_nsec; |
770 | 723 | ||
771 | if ( (time_adjust_step = time_adjust) != 0 ) { | 724 | if ((time_adjust_step = time_adjust) != 0 ) { |
772 | /* We are doing an adjtime thing. | 725 | /* |
773 | * | 726 | * We are doing an adjtime thing. Prepare time_adjust_step to |
774 | * Prepare time_adjust_step to be within bounds. | 727 | * be within bounds. Note that a positive time_adjust means we |
775 | * Note that a positive time_adjust means we want the clock | 728 | * want the clock to run faster. |
776 | * to run faster. | 729 | * |
777 | * | 730 | * Limit the amount of the step to be in the range |
778 | * Limit the amount of the step to be in the range | 731 | * -tickadj .. +tickadj |
779 | * -tickadj .. +tickadj | 732 | */ |
780 | */ | 733 | time_adjust_step = min(time_adjust_step, (long)tickadj); |
781 | if (time_adjust > tickadj) | 734 | time_adjust_step = max(time_adjust_step, (long)-tickadj); |
782 | time_adjust_step = tickadj; | 735 | |
783 | else if (time_adjust < -tickadj) | 736 | /* Reduce by this step the amount of time left */ |
784 | time_adjust_step = -tickadj; | 737 | time_adjust -= time_adjust_step; |
785 | |||
786 | /* Reduce by this step the amount of time left */ | ||
787 | time_adjust -= time_adjust_step; | ||
788 | } | 738 | } |
789 | delta_nsec = tick_nsec + time_adjust_step * 1000; | 739 | delta_nsec = tick_nsec + time_adjust_step * 1000; |
790 | /* | 740 | /* |
@@ -792,13 +742,8 @@ static void update_wall_time_one_tick(void) | |||
792 | * advance the tick more. | 742 | * advance the tick more. |
793 | */ | 743 | */ |
794 | time_phase += time_adj; | 744 | time_phase += time_adj; |
795 | if (time_phase <= -FINENSEC) { | 745 | if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { |
796 | long ltemp = -time_phase >> (SHIFT_SCALE - 10); | 746 | long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); |
797 | time_phase += ltemp << (SHIFT_SCALE - 10); | ||
798 | delta_nsec -= ltemp; | ||
799 | } | ||
800 | else if (time_phase >= FINENSEC) { | ||
801 | long ltemp = time_phase >> (SHIFT_SCALE - 10); | ||
802 | time_phase -= ltemp << (SHIFT_SCALE - 10); | 747 | time_phase -= ltemp << (SHIFT_SCALE - 10); |
803 | delta_nsec += ltemp; | 748 | delta_nsec += ltemp; |
804 | } | 749 | } |
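
Both update_wall_time_one_tick() hunks apply the same simplification: the explicit if/else ladders become a min()/max() clamp and a single shift_right(). The kernel's min()/max() macros are type-checked, which is why the new code casts tickadj to long to match time_adjust_step. A small userspace sketch of the clamp pattern follows; the plain macros and the value of tickadj are illustrative only (the kernel versions also enforce matching types).

    #include <stdio.h>

    #define min(a, b) ((a) < (b) ? (a) : (b))
    #define max(a, b) ((a) > (b) ? (a) : (b))

    int main(void)
    {
            long tickadj = 500;                     /* assumed bound, as in the hunk */
            long samples[] = { -900, -200, 0, 300, 1200 };
            int i;

            for (i = 0; i < 5; i++) {
                    long step = samples[i];
                    /* Clamp step into -tickadj .. +tickadj, as the new code does. */
                    step = min(step, tickadj);
                    step = max(step, -tickadj);
                    printf("%ld -> %ld\n", samples[i], step);
            }
            return 0;
    }
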
@@ -1128,8 +1073,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) | |||
1128 | if (timeout < 0) | 1073 | if (timeout < 0) |
1129 | { | 1074 | { |
1130 | printk(KERN_ERR "schedule_timeout: wrong timeout " | 1075 | printk(KERN_ERR "schedule_timeout: wrong timeout " |
1131 | "value %lx from %p\n", timeout, | 1076 | "value %lx from %p\n", timeout, |
1132 | __builtin_return_address(0)); | 1077 | __builtin_return_address(0)); |
1133 | current->state = TASK_RUNNING; | 1078 | current->state = TASK_RUNNING; |
1134 | goto out; | 1079 | goto out; |
1135 | } | 1080 | } |
@@ -1137,12 +1082,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) | |||
1137 | 1082 | ||
1138 | expire = timeout + jiffies; | 1083 | expire = timeout + jiffies; |
1139 | 1084 | ||
1140 | init_timer(&timer); | 1085 | setup_timer(&timer, process_timeout, (unsigned long)current); |
1141 | timer.expires = expire; | 1086 | __mod_timer(&timer, expire); |
1142 | timer.data = (unsigned long) current; | ||
1143 | timer.function = process_timeout; | ||
1144 | |||
1145 | add_timer(&timer); | ||
1146 | schedule(); | 1087 | schedule(); |
1147 | del_singleshot_timer_sync(&timer); | 1088 | del_singleshot_timer_sync(&timer); |
1148 | 1089 | ||
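
The schedule_timeout() change collapses the open-coded init_timer()/field-assignment/add_timer() sequence into setup_timer() plus __mod_timer(). A hedged sketch of what a setup_timer()-style helper amounts to, written against the older API shown on the left-hand side of the hunk; the real helper lives in <linux/timer.h>, so treat the exact body here as an assumption.

    #include <linux/timer.h>

    /* Sketch only: initialise a timer and fill in its callback and cookie in
     * one call, mirroring the three assignments the hunk deletes. */
    static inline void setup_timer_sketch(struct timer_list *timer,
                                          void (*function)(unsigned long),
                                          unsigned long data)
    {
            timer->function = function;
            timer->data = data;
            init_timer(timer);
    }

With a helper of that shape, the remaining difference in the hunk is that __mod_timer(&timer, expire) both sets the expiry and queues the timer, replacing the separate timer.expires assignment and add_timer() call.
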
@@ -1159,15 +1100,15 @@ EXPORT_SYMBOL(schedule_timeout); | |||
1159 | */ | 1100 | */ |
1160 | signed long __sched schedule_timeout_interruptible(signed long timeout) | 1101 | signed long __sched schedule_timeout_interruptible(signed long timeout) |
1161 | { | 1102 | { |
1162 | __set_current_state(TASK_INTERRUPTIBLE); | 1103 | __set_current_state(TASK_INTERRUPTIBLE); |
1163 | return schedule_timeout(timeout); | 1104 | return schedule_timeout(timeout); |
1164 | } | 1105 | } |
1165 | EXPORT_SYMBOL(schedule_timeout_interruptible); | 1106 | EXPORT_SYMBOL(schedule_timeout_interruptible); |
1166 | 1107 | ||
1167 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) | 1108 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) |
1168 | { | 1109 | { |
1169 | __set_current_state(TASK_UNINTERRUPTIBLE); | 1110 | __set_current_state(TASK_UNINTERRUPTIBLE); |
1170 | return schedule_timeout(timeout); | 1111 | return schedule_timeout(timeout); |
1171 | } | 1112 | } |
1172 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1113 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
1173 | 1114 | ||
@@ -1507,16 +1448,18 @@ static void time_interpolator_update(long delta_nsec) | |||
1507 | if (!time_interpolator) | 1448 | if (!time_interpolator) |
1508 | return; | 1449 | return; |
1509 | 1450 | ||
1510 | /* The interpolator compensates for late ticks by accumulating | 1451 | /* |
1511 | * the late time in time_interpolator->offset. A tick earlier than | 1452 | * The interpolator compensates for late ticks by accumulating the late |
1512 | * expected will lead to a reset of the offset and a corresponding | 1453 | * time in time_interpolator->offset. A tick earlier than expected will |
1513 | * jump of the clock forward. Again this only works if the | 1454 | * lead to a reset of the offset and a corresponding jump of the clock |
1514 | * interpolator clock is running slightly slower than the regular clock | 1455 | * forward. Again this only works if the interpolator clock is running |
1515 | * and the tuning logic insures that. | 1456 | * slightly slower than the regular clock and the tuning logic insures |
1516 | */ | 1457 | * that. |
1458 | */ | ||
1517 | 1459 | ||
1518 | counter = time_interpolator_get_counter(1); | 1460 | counter = time_interpolator_get_counter(1); |
1519 | offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); | 1461 | offset = time_interpolator->offset + |
1462 | GET_TI_NSECS(counter, time_interpolator); | ||
1520 | 1463 | ||
1521 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) | 1464 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) |
1522 | time_interpolator->offset = offset - delta_nsec; | 1465 | time_interpolator->offset = offset - delta_nsec; |
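
The reflowed comment describes how a late tick is carried forward: the nanoseconds already accumulated in ->offset plus the nanoseconds read from the interpolator clock are compared against delta_nsec, and whatever this tick did not account for stays in ->offset. A userspace arithmetic sketch with made-up numbers (one tick assumed to be 1,000,000 ns):

    #include <stdio.h>

    int main(void)
    {
            /* Illustrative numbers only. */
            unsigned long old_offset = 40;        /* late time already accumulated */
            unsigned long elapsed    = 1000060;   /* nanoseconds read from the interpolator */
            long delta_nsec          = 1000000;   /* time accounted to this tick */

            unsigned long offset = old_offset + elapsed;    /* as in the hunk */
            if (delta_nsec < 0 || (unsigned long)delta_nsec < offset)
                    offset = offset - delta_nsec;           /* 100 ns carried forward */

            printf("new offset = %lu ns\n", offset);
            return 0;
    }
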
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 91bacb13a7e2..2bd5aee1c736 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -12,6 +12,8 @@ | |||
12 | * Andrew Morton <andrewm@uow.edu.au> | 12 | * Andrew Morton <andrewm@uow.edu.au> |
13 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> | 13 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> |
14 | * Theodore Ts'o <tytso@mit.edu> | 14 | * Theodore Ts'o <tytso@mit.edu> |
15 | * | ||
16 | * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>. | ||
15 | */ | 17 | */ |
16 | 18 | ||
17 | #include <linux/module.h> | 19 | #include <linux/module.h> |
@@ -57,7 +59,7 @@ struct cpu_workqueue_struct { | |||
57 | * per-CPU workqueues: | 59 | * per-CPU workqueues: |
58 | */ | 60 | */ |
59 | struct workqueue_struct { | 61 | struct workqueue_struct { |
60 | struct cpu_workqueue_struct cpu_wq[NR_CPUS]; | 62 | struct cpu_workqueue_struct *cpu_wq; |
61 | const char *name; | 63 | const char *name; |
62 | struct list_head list; /* Empty if single thread */ | 64 | struct list_head list; /* Empty if single thread */ |
63 | }; | 65 | }; |
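
This is the central change to workqueue.c: the fixed cpu_wq[NR_CPUS] array becomes a dynamically sized per-CPU allocation, and every later hunk rewrites wq->cpu_wq + cpu as per_cpu_ptr(wq->cpu_wq, cpu). A hedged kernel-style sketch of that allocation pattern; struct counter_sketch and the init/exit function names are invented for illustration, and the error handling is not taken from the diff.

    #include <linux/percpu.h>
    #include <linux/cpumask.h>
    #include <linux/errno.h>

    struct counter_sketch {
            long hits;
    };

    static struct counter_sketch *counters;

    static int counters_init(void)
    {
            int cpu;

            counters = alloc_percpu(struct counter_sketch);  /* one copy per possible CPU */
            if (!counters)
                    return -ENOMEM;

            for_each_online_cpu(cpu)
                    per_cpu_ptr(counters, cpu)->hits = 0;     /* address that CPU's copy */
            return 0;
    }

    static void counters_exit(void)
    {
            free_percpu(counters);                            /* pairs with alloc_percpu() */
    }
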
@@ -100,9 +102,9 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | |||
100 | 102 | ||
101 | if (!test_and_set_bit(0, &work->pending)) { | 103 | if (!test_and_set_bit(0, &work->pending)) { |
102 | if (unlikely(is_single_threaded(wq))) | 104 | if (unlikely(is_single_threaded(wq))) |
103 | cpu = 0; | 105 | cpu = any_online_cpu(cpu_online_map); |
104 | BUG_ON(!list_empty(&work->entry)); | 106 | BUG_ON(!list_empty(&work->entry)); |
105 | __queue_work(wq->cpu_wq + cpu, work); | 107 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
106 | ret = 1; | 108 | ret = 1; |
107 | } | 109 | } |
108 | put_cpu(); | 110 | put_cpu(); |
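
For single-threaded workqueues the hard-coded CPU 0 becomes any_online_cpu(cpu_online_map); the queue should live on a CPU that is actually online, and CPU 0 being online is not a given on every configuration. Based on its use here, any_online_cpu() evaluates to the first CPU set in the supplied mask; the sketch below shows the selection pattern, with a fallback branch that is illustrative only and not part of the diff.

    #include <linux/cpumask.h>
    #include <linux/smp.h>

    /* Sketch: choose the queue CPU the way the hunk does, instead of
     * hard-coding 0. cpu_online_map always has at least one bit set, so the
     * fallback should never trigger. */
    static int single_thread_cpu_sketch(void)
    {
            int cpu = any_online_cpu(cpu_online_map);

            if (cpu >= NR_CPUS)
                    cpu = smp_processor_id();
            return cpu;
    }
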
@@ -116,9 +118,9 @@ static void delayed_work_timer_fn(unsigned long __data) | |||
116 | int cpu = smp_processor_id(); | 118 | int cpu = smp_processor_id(); |
117 | 119 | ||
118 | if (unlikely(is_single_threaded(wq))) | 120 | if (unlikely(is_single_threaded(wq))) |
119 | cpu = 0; | 121 | cpu = any_online_cpu(cpu_online_map); |
120 | 122 | ||
121 | __queue_work(wq->cpu_wq + cpu, work); | 123 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
122 | } | 124 | } |
123 | 125 | ||
124 | int fastcall queue_delayed_work(struct workqueue_struct *wq, | 126 | int fastcall queue_delayed_work(struct workqueue_struct *wq, |
@@ -264,14 +266,14 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
264 | might_sleep(); | 266 | might_sleep(); |
265 | 267 | ||
266 | if (is_single_threaded(wq)) { | 268 | if (is_single_threaded(wq)) { |
267 | /* Always use cpu 0's area. */ | 269 | /* Always use first cpu's area. */ |
268 | flush_cpu_workqueue(wq->cpu_wq + 0); | 270 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, any_online_cpu(cpu_online_map))); |
269 | } else { | 271 | } else { |
270 | int cpu; | 272 | int cpu; |
271 | 273 | ||
272 | lock_cpu_hotplug(); | 274 | lock_cpu_hotplug(); |
273 | for_each_online_cpu(cpu) | 275 | for_each_online_cpu(cpu) |
274 | flush_cpu_workqueue(wq->cpu_wq + cpu); | 276 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); |
275 | unlock_cpu_hotplug(); | 277 | unlock_cpu_hotplug(); |
276 | } | 278 | } |
277 | } | 279 | } |
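
flush_workqueue() now walks per_cpu_ptr() areas, and the for_each_online_cpu() loop stays bracketed by lock_cpu_hotplug()/unlock_cpu_hotplug() so the set of online CPUs cannot change mid-walk. A sketch of that iteration pattern; the my_stats type and the visit() callback are invented for illustration.

    #include <linux/cpu.h>
    #include <linux/percpu.h>

    struct my_stats {
            unsigned long flushed;
    };

    /* Visit every online CPU's copy of a per-CPU object while the online
     * map is held stable, as the new flush_workqueue() does. */
    static void visit_all_online(struct my_stats *pcpu_area,
                                 void (*visit)(struct my_stats *))
    {
            int cpu;

            lock_cpu_hotplug();                 /* keep cpu_online_map from changing */
            for_each_online_cpu(cpu)
                    visit(per_cpu_ptr(pcpu_area, cpu));
            unlock_cpu_hotplug();
    }
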
@@ -279,7 +281,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
279 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, | 281 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, |
280 | int cpu) | 282 | int cpu) |
281 | { | 283 | { |
282 | struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; | 284 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
283 | struct task_struct *p; | 285 | struct task_struct *p; |
284 | 286 | ||
285 | spin_lock_init(&cwq->lock); | 287 | spin_lock_init(&cwq->lock); |
@@ -312,12 +314,13 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
312 | if (!wq) | 314 | if (!wq) |
313 | return NULL; | 315 | return NULL; |
314 | 316 | ||
317 | wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); | ||
315 | wq->name = name; | 318 | wq->name = name; |
316 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 319 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
317 | lock_cpu_hotplug(); | 320 | lock_cpu_hotplug(); |
318 | if (singlethread) { | 321 | if (singlethread) { |
319 | INIT_LIST_HEAD(&wq->list); | 322 | INIT_LIST_HEAD(&wq->list); |
320 | p = create_workqueue_thread(wq, 0); | 323 | p = create_workqueue_thread(wq, any_online_cpu(cpu_online_map)); |
321 | if (!p) | 324 | if (!p) |
322 | destroy = 1; | 325 | destroy = 1; |
323 | else | 326 | else |
@@ -353,7 +356,7 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) | |||
353 | unsigned long flags; | 356 | unsigned long flags; |
354 | struct task_struct *p; | 357 | struct task_struct *p; |
355 | 358 | ||
356 | cwq = wq->cpu_wq + cpu; | 359 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
357 | spin_lock_irqsave(&cwq->lock, flags); | 360 | spin_lock_irqsave(&cwq->lock, flags); |
358 | p = cwq->thread; | 361 | p = cwq->thread; |
359 | cwq->thread = NULL; | 362 | cwq->thread = NULL; |
@@ -371,7 +374,7 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
371 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 374 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
372 | lock_cpu_hotplug(); | 375 | lock_cpu_hotplug(); |
373 | if (is_single_threaded(wq)) | 376 | if (is_single_threaded(wq)) |
374 | cleanup_workqueue_thread(wq, 0); | 377 | cleanup_workqueue_thread(wq, any_online_cpu(cpu_online_map)); |
375 | else { | 378 | else { |
376 | for_each_online_cpu(cpu) | 379 | for_each_online_cpu(cpu) |
377 | cleanup_workqueue_thread(wq, cpu); | 380 | cleanup_workqueue_thread(wq, cpu); |
@@ -380,6 +383,7 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
380 | spin_unlock(&workqueue_lock); | 383 | spin_unlock(&workqueue_lock); |
381 | } | 384 | } |
382 | unlock_cpu_hotplug(); | 385 | unlock_cpu_hotplug(); |
386 | free_percpu(wq->cpu_wq); | ||
383 | kfree(wq); | 387 | kfree(wq); |
384 | } | 388 | } |
385 | 389 | ||
@@ -458,7 +462,7 @@ int current_is_keventd(void) | |||
458 | 462 | ||
459 | BUG_ON(!keventd_wq); | 463 | BUG_ON(!keventd_wq); |
460 | 464 | ||
461 | cwq = keventd_wq->cpu_wq + cpu; | 465 | cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); |
462 | if (current == cwq->thread) | 466 | if (current == cwq->thread) |
463 | ret = 1; | 467 | ret = 1; |
464 | 468 | ||
@@ -470,7 +474,7 @@ int current_is_keventd(void) | |||
470 | /* Take the work from this (downed) CPU. */ | 474 | /* Take the work from this (downed) CPU. */ |
471 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | 475 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) |
472 | { | 476 | { |
473 | struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; | 477 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
474 | LIST_HEAD(list); | 478 | LIST_HEAD(list); |
475 | struct work_struct *work; | 479 | struct work_struct *work; |
476 | 480 | ||
@@ -481,7 +485,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | |||
481 | printk("Taking work for %s\n", wq->name); | 485 | printk("Taking work for %s\n", wq->name); |
482 | work = list_entry(list.next,struct work_struct,entry); | 486 | work = list_entry(list.next,struct work_struct,entry); |
483 | list_del(&work->entry); | 487 | list_del(&work->entry); |
484 | __queue_work(wq->cpu_wq + smp_processor_id(), work); | 488 | __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work); |
485 | } | 489 | } |
486 | spin_unlock_irq(&cwq->lock); | 490 | spin_unlock_irq(&cwq->lock); |
487 | } | 491 | } |
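
take_over_work() drains the downed CPU's queue and requeues each entry on the CPU running the callback; the hunk only changes how the per-CPU queue is addressed. A generic sketch of that splice-and-requeue pattern follows; the item/queue types and the requeue() callback are invented, while the list_entry()/list_del() steps mirror the loop visible above.

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct item {
            struct list_head entry;
    };

    struct queue {
            spinlock_t lock;
            struct list_head items;
    };

    /* Splice the source queue onto a private list under the source lock,
     * then hand each entry to the destination one at a time. */
    static void take_over_sketch(struct queue *src,
                                 void (*requeue)(struct item *))
    {
            LIST_HEAD(list);
            struct item *it;

            spin_lock_irq(&src->lock);
            list_splice_init(&src->items, &list);    /* empty src in one step */
            while (!list_empty(&list)) {
                    it = list_entry(list.next, struct item, entry);
                    list_del(&it->entry);
                    requeue(it);                     /* e.g. queue on this CPU */
            }
            spin_unlock_irq(&src->lock);
    }
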
@@ -508,16 +512,19 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
508 | case CPU_ONLINE: | 512 | case CPU_ONLINE: |
509 | /* Kick off worker threads. */ | 513 | /* Kick off worker threads. */ |
510 | list_for_each_entry(wq, &workqueues, list) { | 514 | list_for_each_entry(wq, &workqueues, list) { |
511 | kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu); | 515 | struct cpu_workqueue_struct *cwq; |
512 | wake_up_process(wq->cpu_wq[hotcpu].thread); | 516 | |
517 | cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); | ||
518 | kthread_bind(cwq->thread, hotcpu); | ||
519 | wake_up_process(cwq->thread); | ||
513 | } | 520 | } |
514 | break; | 521 | break; |
515 | 522 | ||
516 | case CPU_UP_CANCELED: | 523 | case CPU_UP_CANCELED: |
517 | list_for_each_entry(wq, &workqueues, list) { | 524 | list_for_each_entry(wq, &workqueues, list) { |
518 | /* Unbind so it can run. */ | 525 | /* Unbind so it can run. */ |
519 | kthread_bind(wq->cpu_wq[hotcpu].thread, | 526 | kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, |
520 | smp_processor_id()); | 527 | any_online_cpu(cpu_online_map)); |
521 | cleanup_workqueue_thread(wq, hotcpu); | 528 | cleanup_workqueue_thread(wq, hotcpu); |
522 | } | 529 | } |
523 | break; | 530 | break; |
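
The CPU_ONLINE and CPU_UP_CANCELED cases above follow the usual hotplug-notifier shape: a callback receives an action plus the CPU number and touches only that CPU's state. A minimal, hedged skeleton of such a notifier; the callback and notifier names are invented, and the exact set of actions handled is an assumption rather than a copy of workqueue_cpu_callback().

    #include <linux/cpu.h>
    #include <linux/notifier.h>

    static int my_cpu_callback(struct notifier_block *nfb,
                               unsigned long action, void *hcpu)
    {
            unsigned int cpu = (unsigned long)hcpu;

            switch (action) {
            case CPU_UP_PREPARE:
                    /* allocate or prepare per-CPU state for 'cpu' */
                    break;
            case CPU_ONLINE:
                    /* bind and wake the worker for 'cpu', as the hunk does */
                    break;
            case CPU_UP_CANCELED:
            case CPU_DEAD:
                    /* tear the per-CPU state for 'cpu' back down */
                    break;
            }
            return NOTIFY_OK;
    }

    static struct notifier_block my_cpu_nb = {
            .notifier_call = my_cpu_callback,
    };

Such a block is typically handed to register_cpu_notifier() (or the hotcpu_notifier() convenience wrapper) at init time, so per-CPU worker threads follow CPUs as they come and go.
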